pyobo 0.12.0__py3-none-any.whl → 0.12.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/api/properties.py +8 -12
- pyobo/api/xrefs.py +1 -2
- pyobo/cli/database.py +30 -2
- pyobo/cli/database_utils.py +5 -11
- pyobo/getters.py +20 -79
- pyobo/gilda_utils.py +3 -80
- pyobo/identifier_utils/__init__.py +3 -10
- pyobo/identifier_utils/api.py +21 -12
- pyobo/obographs.py +11 -2
- pyobo/reader.py +13 -17
- pyobo/sources/cgnc.py +9 -1
- pyobo/sources/credit.py +17 -6
- pyobo/sources/flybase.py +5 -5
- pyobo/sources/omim_ps.py +4 -4
- pyobo/sources/pharmgkb/pharmgkb_gene.py +1 -1
- pyobo/struct/functional/ontology.py +3 -1
- pyobo/struct/reference.py +4 -4
- pyobo/struct/struct.py +112 -55
- pyobo/utils/cache.py +3 -4
- pyobo/utils/io.py +38 -14
- pyobo/utils/path.py +16 -19
- pyobo/version.py +1 -1
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/METADATA +67 -118
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/RECORD +164 -169
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/WHEEL +1 -1
- pyobo/identifier_utils/model.py +0 -130
- pyobo/identifier_utils/preprocessing.json +0 -812
- pyobo/identifier_utils/preprocessing.py +0 -61
- pyobo/resources/goc.py +0 -75
- pyobo/resources/goc.tsv +0 -188
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/licenses/LICENSE +0 -0
pyobo/.DS_Store
CHANGED

Binary file
pyobo/api/properties.py
CHANGED

@@ -113,18 +113,14 @@ def get_properties_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame
     :param prefix: the resource to load
     :returns: A dataframe with the properties
     """
-    … (the old cached-getter body is truncated in this view)
-    )
-    …
-        use_tqdm=check_should_use_tqdm(kwargs)
-    )
-    …
-    return _df_getter()
+    df1 = get_literal_properties_df(prefix, **kwargs)
+    df2 = get_object_properties_df(prefix, **kwargs)
+    df = pd.concat([df1[["source", "predicate", "target"]], df2])
+    ll = len(prefix) + 1
+    df[f"{prefix}_id"] = df["source"].map(lambda x: x[ll:])
+    df = df.rename(columns={"predicate": "property", "target": "value"})
+    del df["source"]
+    return df[[f"{prefix}_id", "property", "value"]]
 
 
 @wrap_norm_prefix
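The rewrite composes two public getters instead of a bespoke cached loader. A minimal usage sketch, assuming get_properties_df is re-exported at the package level and that the illustrative "chebi" ontology can be loaded locally:

import pyobo

df = pyobo.get_properties_df("chebi")
# Per the hunk above, the frame is normalized to exactly three columns
print(df.columns.tolist())  # ["chebi_id", "property", "value"]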
pyobo/api/xrefs.py
CHANGED

@@ -81,8 +81,7 @@ get_xrefs = get_filtered_xrefs
 def get_xrefs_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame:
     """Get all xrefs."""
     warnings.warn(
-        …
-        f"Not using cache artifact path to {CacheArtifact.xrefs}",
+        "use pyobo.get_mappings_df instead of pyobo.get_xrefs_df.",
         DeprecationWarning,
         stacklevel=2,
     )
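The deprecation message is now actionable. A migration sketch following the wording of the new warning (the "doid" prefix is illustrative and assumes local data):

import pyobo

# Before (still works, but now warns with a concrete replacement):
# df = pyobo.get_xrefs_df("doid")

# After, as the warning suggests:
df = pyobo.get_mappings_df("doid")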
pyobo/cli/database.py
CHANGED

@@ -2,8 +2,10 @@
 
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
 
+import bioregistry
 import click
 from more_click import verbose_option
 from tqdm.contrib.logging import logging_redirect_tqdm

@@ -11,11 +13,11 @@ from typing_extensions import Unpack
 from zenodo_client import update_zenodo
 
 from .database_utils import (
+    IterHelperHelperDict,
     _iter_alts,
     _iter_definitions,
     _iter_edges,
     _iter_mappings,
-    _iter_metadata,
     _iter_names,
     _iter_properties,
     _iter_relations,

@@ -23,6 +25,7 @@ from .database_utils import (
     _iter_synonyms,
     _iter_typedefs,
     _iter_xrefs,
+    iter_helper_helper,
 )
 from .utils import (
     Clickable,

@@ -44,12 +47,14 @@ from ..constants import (
     TYPEDEFS_RECORD,
     DatabaseKwargs,
 )
-from ..getters import db_output_helper
+from ..getters import db_output_helper, get_ontology
 
 __all__ = [
     "main",
 ]
 
+logger = logging.getLogger(__name__)
+
 
 @click.group(name="database")
 def main():

@@ -129,9 +134,32 @@ def build(ctx: click.Context, **kwargs: Unpack[DatabaseKwargs]) -> None:
     ctx.invoke(species, **updated_kwargs)
 
 
+@database_annotate
+def cache(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
+    """Cache all things."""
+    if zenodo:
+        click.echo("no zenodo for caching")
+
+    kwargs["force_process"] = True
+    with logging_redirect_tqdm():
+        for _ in iter_helper_helper(get_ontology, **kwargs):
+            # this pass intentional to consume the iterable
+            pass
+
+
 @database_annotate
 def metadata(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
     """Make the prefix-metadata dump."""
+    from ..api import get_metadata
+
+    def _iter_metadata(
+        **kwargs: Unpack[IterHelperHelperDict],
+    ) -> Iterable[tuple[str, str, str, bool]]:
+        for prefix, data in iter_helper_helper(get_metadata, **kwargs):
+            version = data["version"]
+            logger.debug(f"[{prefix}] using version {version}")
+            yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
+
     it = _iter_metadata(**kwargs)
     db_output_helper(
         it,
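The new cache command drains the iterator purely for its side effects (populating local caches). An equivalent generic idiom, shown for contrast; the consume helper below is not part of pyobo:

from collections import deque

def consume(iterable) -> None:
    """Drain an iterable for its side effects without keeping any items."""
    deque(iterable, maxlen=0)

deque with maxlen=0 discards everything at C speed; the for/pass loop in the hunk is the more explicit spelling of the same thing.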
pyobo/cli/database_utils.py
CHANGED

@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from functools import partial
 from typing import cast
 
-import bioregistry
 from tqdm.auto import tqdm
 from typing_extensions import Unpack

@@ -21,7 +20,6 @@ from ..api import (
     get_id_synonyms_mapping,
     get_id_to_alts,
     get_mappings_df,
-    get_metadata,
     get_properties_df,
     get_relations_df,
     get_typedef_df,

@@ -40,19 +38,12 @@ def _iter_ncbigene(left: int, right: int) -> Iterable[tuple[str, str, str]]:
     with gzip.open(ncbi_path, "rt") as file:
         next(file)  # throw away the header
         for line in tqdm(
-            file, desc=f"…
+            file, desc=f"[{ncbigene.PREFIX}] extracting names", unit_scale=True, total=56_700_000
         ):
             parts = line.strip().split("\t")
             yield ncbigene.PREFIX, parts[left], parts[right]
 
 
-def _iter_metadata(**kwargs: Unpack[IterHelperHelperDict]):
-    for prefix, data in iter_helper_helper(get_metadata, **kwargs):
-        version = data["version"]
-        logger.debug(f"[{prefix}] using version {version}")
-        yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
-
-
 def _iter_names(leave: bool = False, **kwargs) -> Iterable[tuple[str, str, str]]:
     """Iterate over all prefix-identifier-name triples we can get.

@@ -60,11 +51,14 @@ def _iter_names(leave: bool = False, **kwargs) -> Iterable[tuple[str, str, str]]
     """
     yield from iter_helper(get_id_name_mapping, leave=leave, **kwargs)
     yield from _iter_ncbigene(1, 2)
+    yield from _iter_pubchem_compound()
+
 
+def _iter_pubchem_compound():
     pcc_path = pubchem._ensure_cid_name_path()
     with gzip.open(pcc_path, mode="rt", encoding="ISO-8859-1") as file:
         for line in tqdm(
-            file, desc=f"…
+            file, desc=f"[{pubchem.PREFIX}] extracting names", unit_scale=True, total=119_000_000
         ):
             identifier, name = line.strip().split("\t", 1)
             yield pubchem.PREFIX, identifier, name
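The hard-coded totals (56_700_000 and 119_000_000) exist because a gzipped stream cannot report its line count up front; tqdm only uses the estimate for the progress bar and ETA. The pattern in isolation, with hypothetical names (iter_gzipped_tsv, approximate_rows are not pyobo names):

import gzip

from tqdm.auto import tqdm

def iter_gzipped_tsv(path: str, approximate_rows: int):
    """Yield rows from a gzipped TSV with a progress bar based on an estimate."""
    with gzip.open(path, mode="rt") as file:
        next(file)  # skip the header row
        for line in tqdm(file, unit_scale=True, total=approximate_rows):
            yield line.rstrip("\n").split("\t")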
pyobo/getters.py
CHANGED

@@ -3,7 +3,6 @@
 
 from __future__ import annotations
 
 import datetime
-import gzip
 import json
 import logging
 import pathlib

@@ -16,17 +15,17 @@ from collections import Counter
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from pathlib import Path
 from textwrap import indent
-from typing import TypeVar
+from typing import Any, TypeVar
 
 import bioregistry
 import click
 import pystow.utils
-from bioontologies import robot
 from tabulate import tabulate
 from tqdm.auto import tqdm
 from typing_extensions import Unpack
 
 from .constants import (
+    BUILD_SUBDIRECTORY_NAME,
     DATABASE_DIRECTORY,
     GetOntologyKwargs,
     IterHelperHelperDict,

@@ -36,7 +35,7 @@ from .identifier_utils import ParseError, wrap_norm_prefix
 from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
 from .reader import from_obo_path, from_obonet
 from .struct import Obo
-from .utils.io import …
+from .utils.io import safe_open_writer
 from .utils.path import ensure_path, prefix_directory_join
 from .version import get_git_hash, get_version

@@ -119,19 +118,21 @@ def get_ontology(
     logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
     strict = False
 
-    if …
+    if force_process:
+        obonet_json_gz_path = None
+    elif not cache:
         logger.debug("[%s] caching was turned off, so dont look for an obonet file", prefix)
         obonet_json_gz_path = None
     else:
         obonet_json_gz_path = prefix_directory_join(
-            prefix, name=f"{prefix}.obonet.json.gz", …
+            prefix, BUILD_SUBDIRECTORY_NAME, name=f"{prefix}.obonet.json.gz", version=version
         )
         logger.debug(
             "[%s] caching is turned on, so look for an obonet file at %s",
             prefix,
             obonet_json_gz_path,
         )
-        if obonet_json_gz_path.…
+        if obonet_json_gz_path.is_file() and not force:
             from .utils.cache import get_gzipped_graph
 
             logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)

@@ -158,6 +159,8 @@ def get_ontology(
     elif ontology_format == "obo":
         pass  # all gucci
     elif ontology_format == "owl":
+        from bioontologies import robot
+
         _converted_obo_path = path.with_suffix(".obo")
        if prefix in REQUIRES_NO_ROBOT_CHECK:
            robot_check = False
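Per the first get_ontology hunk, force_process (part of the kwargs this function accepts, set by the new database cache command) now bypasses the obonet cache lookup entirely. A usage sketch; the prefix is illustrative and assumes the source can be downloaded and parsed:

import pyobo

# Re-process from the raw source even when a cached obonet artifact exists
ontology = pyobo.get_ontology("doid", force_process=True)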
@@ -198,64 +201,6 @@ def _ensure_ontology_path(
     return None, None
 
 
-#: Obonet/Pronto can't parse these (consider converting to OBO with ROBOT?)
-CANT_PARSE = {
-    "agro", "aro", "bco", "caro", "cco", "chmo", "cido", "covoc", "cto",
-    "cvdo", "dicom", "dinto", "emap", "epso", "eupath", "fbbi", "fma",
-    "fobi", "foodon", "genepio", "hancestro", "hom", "hso",
-    "htn",  # Unknown string format: creation: 16MAY2017
-    "ico", "idocovid19", "labo", "mamo", "mfmo", "mfo", "mfomd", "miapa",
-    "mo", "oae",
-    "ogms",  # Unknown string format: creation: 16MAY2017
-    "ohd", "ons", "oostt", "opmi", "ornaseq", "orth", "pdro", "probonto",
-    "psdo", "reo", "rex", "rnao", "sepio", "sio", "spd", "sweetrealm",
-    "txpo", "vido", "vt", "xl",
-}
 SKIP = {
     "ncbigene": "too big, refs acquired from other dbs",
     "pubchem.compound": "top big, can't deal with this now",

@@ -276,11 +221,12 @@ SKIP = {
     "kegg.genes": "needs fix",  # FIXME
     "kegg.genome": "needs fix",  # FIXME
     "kegg.pathway": "needs fix",  # FIXME
-    "ensemblglossary": "…
+    "ensemblglossary": "URI is self-referential to data in OLS, extract from there",
     "epio": "content from fraunhofer is unreliable",
     "epso": "content from fraunhofer is unreliable",
     "gwascentral.phenotype": "website is down? or API changed?",  # FIXME
     "gwascentral.study": "website is down? or API changed?",  # FIXME
+    "snomedct": "dead source",
 }
 
 X = TypeVar("X")

@@ -412,7 +358,7 @@ def iter_helper_helper(
     except ValueError as e:
         if _is_xml(e):
             # this means that it tried doing parsing on an xml page
-            logger.…
+            logger.warning(
                 "no resource available for %s. See http://www.obofoundry.org/ontology/%s",
                 prefix,
                 prefix,

@@ -452,7 +398,7 @@ def _prep_dir(directory: None | str | pathlib.Path) -> pathlib.Path:
 
 
 def db_output_helper(
-    it: Iterable[tuple[…
+    it: Iterable[tuple[Any, ...]],
     db_name: str,
     columns: Sequence[str],
     *,

@@ -497,13 +443,10 @@ def db_output_helper(
     logger.info("writing %s to %s", db_name, db_path)
     logger.info("writing %s sample to %s", db_name, db_sample_path)
     sample_rows = []
-    with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
-        writer = get_writer(gzipped_file)
 
+    with safe_open_writer(db_path) as writer:
         # for the first 10 rows, put it in a sample file too
-        with …
-            sample_writer = get_writer(sample_file)
-
+        with safe_open_writer(db_sample_path) as sample_writer:
             # write header
             writer.writerow(columns)
             sample_writer.writerow(columns)

@@ -523,15 +466,13 @@ def db_output_helper(
             c_detailed[tuple(row[i] for i in summary_detailed)] += 1
             writer.writerow(row)
 
-    with …
-        writer.writerows(c.most_common())
+    with safe_open_writer(db_summary_path) as summary_writer:
+        summary_writer.writerows(c.most_common())
 
     if summary_detailed is not None:
         logger.info(f"writing {db_name} detailed summary to {db_summary_detailed_path}")
-        with …
-            writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
+        with safe_open_writer(db_summary_detailed_path) as detailed_summary_writer:
+            detailed_summary_writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
         rv.append(("Summary (Detailed)", db_summary_detailed_path))
 
     with open(db_metadata_path, "w") as file:
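The repeated "with safe_open_writer(...) as writer" call sites suggest a context manager that opens a (possibly gzipped) file and yields a CSV writer. The actual helper lives in pyobo/utils/io.py, which this release also touches; the following is only an inferred sketch, not the real implementation:

import csv
import gzip
from contextlib import contextmanager
from pathlib import Path

@contextmanager
def safe_open_writer_sketch(path: Path, delimiter: str = "\t"):
    """Open plain or gzipped text based on the suffix and yield a csv writer."""
    opener = gzip.open if path.suffix == ".gz" else open
    with opener(path, mode="wt") as file:
        yield csv.writer(file, delimiter=delimiter)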
pyobo/gilda_utils.py
CHANGED

@@ -2,20 +2,15 @@
 
 from __future__ import annotations
 
-import logging
 import warnings
 from collections.abc import Iterable, Sequence
 from typing import TYPE_CHECKING, Any, cast
 
-import bioregistry
 import ssslm
-from ssslm import …
-from tqdm.auto import tqdm
+from ssslm import literal_mappings_to_gilda
 from typing_extensions import Unpack
 
 from pyobo.api import (
-    get_id_name_mapping,
-    get_ids,
     get_literal_mappings,
     get_literal_mappings_subset,
 )

@@ -26,83 +21,11 @@ if TYPE_CHECKING:
     import gilda
 
 __all__ = [
+    "get_gilda_term_subset",
+    "get_gilda_terms",
     "get_grounder",
-    "iter_gilda_prediction_tuples",
 ]
 
-logger = logging.getLogger(__name__)
-
-
-# TODO the only place this is used is in Biomappings -
-# might be better to directly move it there
-def iter_gilda_prediction_tuples(
-    prefix: str,
-    relation: str = "skos:exactMatch",
-    *,
-    grounder: gilda.Grounder | None = None,
-    identifiers_are_names: bool = False,
-    strict: bool = False,
-) -> Iterable[tuple[str, str, str, str, str, str, str, str, float]]:
-    """Iterate over prediction tuples for a given prefix."""
-    if grounder is None:
-        import gilda.api
-
-        grounder = gilda.api.grounder
-    grounder_ = GildaGrounder(grounder)
-    id_name_mapping = get_id_name_mapping(prefix, strict=strict)
-    it = tqdm(
-        id_name_mapping.items(), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="name"
-    )
-    for identifier, name in it:
-        norm_identifier = _normalize_identifier(prefix, identifier)
-        for scored_match in grounder_.get_matches(name):
-            yield (
-                prefix,
-                norm_identifier,
-                name,
-                relation,
-                scored_match.prefix,
-                _normalize_identifier(scored_match.prefix, scored_match.identifier),
-                name,
-                "semapv:LexicalMatching",
-                round(scored_match.score, 3),
-            )
-
-    if identifiers_are_names:
-        it = tqdm(get_ids(prefix), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="id")
-        for identifier in it:
-            norm_identifier = _normalize_identifier(prefix, identifier)
-            for scored_match in grounder_.get_matches(identifier):
-                yield (
-                    prefix,
-                    norm_identifier,
-                    identifier,
-                    relation,
-                    scored_match.prefix,
-                    _normalize_identifier(scored_match.prefix, scored_match.identifier),
-                    identifier,
-                    "semapv:LexicalMatching",
-                    scored_match.score,
-                )
-
-
-def _normalize_identifier(prefix: str, identifier: str) -> str:
-    """Normalize the identifier."""
-    resource = bioregistry.get_resource(prefix)
-    if resource is None:
-        raise KeyError
-    return resource.miriam_standardize_identifier(identifier) or identifier
-
-
-def normalize_identifier(prefix: str, identifier: str) -> str:
-    """Normalize the identifier."""
-    warnings.warn(
-        "normalization to MIRIAM is deprecated, please update to using Bioregistry standard identifiers",
-        DeprecationWarning,
-        stacklevel=2,
-    )
-    return _normalize_identifier(prefix, identifier)
-
 
 def get_grounder(*args: Any, **kwargs: Any) -> gilda.Grounder:
     """Get a grounder."""
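With iter_gilda_prediction_tuples removed (the deleted TODO notes its only consumer was Biomappings), the module's public surface is the three names in __all__. A usage sketch; it assumes gilda is installed and the illustrative "mesh" prefix's literal mappings are available locally, and ground() is gilda's own matching API:

from pyobo.gilda_utils import get_grounder

grounder = get_grounder("mesh")
scored_matches = grounder.ground("neurodegeneration")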
pyobo/identifier_utils/__init__.py
CHANGED

@@ -1,7 +1,8 @@
 """Extract registry information."""
 
+from curies_processing import get_rules
+
 from .api import (
-    BlacklistedError,
     DefaultCoercionError,
     EmptyStringError,
     NotCURIEError,

@@ -14,15 +15,9 @@ from .api import (
     standardize_ec,
     wrap_norm_prefix,
 )
-from .preprocessing import (
-    remap_full,
-    remap_prefix,
-    str_is_blacklisted,
-)
 from .relations import ground_relation
 
 __all__ = [
-    "BlacklistedError",
     "DefaultCoercionError",
     "EmptyStringError",
     "NotCURIEError",

@@ -32,10 +27,8 @@ __all__ = [
     "UnregisteredPrefixError",
     "_is_valid_identifier",
     "_parse_str_or_curie_or_uri_helper",
+    "get_rules",
     "ground_relation",
-    "remap_full",
-    "remap_prefix",
     "standardize_ec",
-    "str_is_blacklisted",
     "wrap_norm_prefix",
 ]
pyobo/identifier_utils/api.py
CHANGED

@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
 import logging
-from functools import wraps
+from functools import lru_cache, wraps
 from typing import Annotated, ClassVar
 
 import bioregistry

@@ -11,14 +11,14 @@ import click
 from bioregistry import NormalizedNamableReference as Reference
 from bioregistry.constants import FailureReturnType
 from curies import ReferenceTuple
+from curies.preprocessing import BlocklistError, PreprocessingConverter
+from curies_processing import get_rules
 from pydantic import ValidationError
 from typing_extensions import Doc
 
-from .preprocessing import remap_full, remap_prefix, str_is_blacklisted
 from .relations import ground_relation
 
 __all__ = [
-    "BlacklistedError",
     "DefaultCoercionError",
     "EmptyStringError",
     "NotCURIEError",

@@ -34,10 +34,6 @@ __all__ = [
 logger = logging.getLogger(__name__)
 
 
-class BlacklistedError(ValueError):
-    """A sentinel for blacklisted strings."""
-
-
 Line = Annotated[str | None, Doc("""The OBO line where the parsing happened""")]

@@ -138,6 +134,15 @@ def _preclean_uri(s: str) -> str:
     return s
 
 
+@lru_cache(1)
+def _get_converter() -> PreprocessingConverter:
+    return PreprocessingConverter(
+        converter=bioregistry.manager.converter,
+        rules=get_rules(),
+        preclean=_preclean_uri,
+    )
+
+
 def _parse_str_or_curie_or_uri_helper(
     str_or_curie_or_uri: str,
     *,

@@ -148,7 +153,7 @@ def _parse_str_or_curie_or_uri_helper(
     line: str | None = None,
     name: str | None = None,
     context: str | None = None,
-) -> Reference | ParseError | …
+) -> Reference | ParseError | BlocklistError:
     """Parse a string that looks like a CURIE.
 
     :param str_or_curie_or_uri: A compact uniform resource identifier (CURIE)

@@ -171,19 +176,23 @@ def _parse_str_or_curie_or_uri_helper(
         context=context,
     )
 
+    rules = get_rules()
+
     if upgrade:
         # Remap the curie with the full list
-        if r1 := remap_full(…
+        if r1 := rules.remap_full(
+            str_or_curie_or_uri, reference_cls=Reference, context=ontology_prefix
+        ):
             return r1
 
         # Remap node's prefix (if necessary)
-        str_or_curie_or_uri = remap_prefix(str_or_curie_or_uri, …
+        str_or_curie_or_uri = rules.remap_prefix(str_or_curie_or_uri, context=ontology_prefix)
 
     if r2 := ground_relation(str_or_curie_or_uri):
         return r2
 
-    if …
-        return …
+    if rules.str_is_blocked(str_or_curie_or_uri, context=ontology_prefix):
+        return BlocklistError()
 
     if _is_uri(str_or_curie_or_uri):
         rt = bioregistry.parse_iri(
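The new _get_converter uses lru_cache(1) as a lazy singleton: the converter is built on first use and reused afterwards, which matters because constructing it pulls in the full Bioregistry converter plus the curies_processing rules. The pattern in isolation, with a hypothetical stand-in payload (_get_resource is not a pyobo name):

from functools import lru_cache

@lru_cache(1)
def _get_resource() -> dict:
    print("building")  # printed once, on the first call only
    return {"ready": True}

_get_resource()  # builds and caches
_get_resource()  # returns the cached object; no rebuild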
pyobo/obographs.py
CHANGED

@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import logging
 from collections.abc import Iterable
+from typing import TYPE_CHECKING
 
 import bioregistry
 from bioontologies.obograph import (

@@ -16,12 +17,14 @@ from bioontologies.obograph import (
     Synonym,
     Xref,
 )
-from bioontologies.robot import ParseResults
 from tqdm import tqdm
 
 from pyobo.struct import Obo, OBOLiteral, Reference, Term
 from pyobo.struct.typedef import definition_source, is_a
 
+if TYPE_CHECKING:
+    from bioontologies.robot import ParseResults
+
 __all__ = [
     "graph_from_obo",
     "parse_results_from_obo",

@@ -33,6 +36,8 @@ logger = logging.getLogger(__name__)
 def parse_results_from_obo(obo: Obo) -> ParseResults:
     """Get parse results from an OBO graph."""
     graph = graph_from_obo(obo)
+    from bioontologies.robot import ParseResults
+
     return ParseResults(graph_document=GraphDocument(graphs=[graph]))
 
 
@@ -41,7 +46,11 @@ def graph_from_obo(obo: Obo, use_tqdm: bool = True) -> Graph:
     nodes: list[Node] = []
     edges: list[Edge] = []
     for term in tqdm(
-        obo,
+        obo,
+        disable=not use_tqdm,
+        unit="term",
+        unit_scale=True,
+        desc=f"[{obo._prefix_version}] to OBO Graph JSON",
     ):
         nodes.append(_get_class_node(term))
         edges.extend(_iter_edges(term))