pyobo 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/api/properties.py +8 -12
- pyobo/api/xrefs.py +1 -2
- pyobo/cli/database.py +30 -2
- pyobo/cli/database_utils.py +5 -11
- pyobo/getters.py +18 -78
- pyobo/gilda_utils.py +3 -80
- pyobo/identifier_utils/__init__.py +2 -10
- pyobo/identifier_utils/api.py +21 -12
- pyobo/identifier_utils/preprocessing.json +74 -13
- pyobo/identifier_utils/preprocessing.py +5 -39
- pyobo/obographs.py +5 -1
- pyobo/reader.py +13 -17
- pyobo/sources/cgnc.py +9 -1
- pyobo/sources/flybase.py +5 -5
- pyobo/sources/omim_ps.py +4 -4
- pyobo/sources/pharmgkb/pharmgkb_gene.py +1 -1
- pyobo/struct/functional/ontology.py +3 -1
- pyobo/struct/reference.py +4 -4
- pyobo/struct/struct.py +112 -55
- pyobo/utils/cache.py +3 -4
- pyobo/utils/io.py +38 -14
- pyobo/utils/path.py +16 -19
- pyobo/version.py +1 -1
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/METADATA +71 -110
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/RECORD +29 -30
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/WHEEL +1 -1
- pyobo/identifier_utils/model.py +0 -130
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.0.dist-info → pyobo-0.12.1.dist-info}/licenses/LICENSE +0 -0
pyobo/.DS_Store
CHANGED

Binary file
pyobo/api/properties.py
CHANGED

@@ -113,18 +113,14 @@ def get_properties_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.Da
     :param prefix: the resource to load
     :returns: A dataframe with the properties
     """
-
-
-
-
-
-    )
-
-
-        use_tqdm=check_should_use_tqdm(kwargs)
-    )
-
-    return _df_getter()
+    df1 = get_literal_properties_df(prefix, **kwargs)
+    df2 = get_object_properties_df(prefix, **kwargs)
+    df = pd.concat([df1[["source", "predicate", "target"]], df2])
+    ll = len(prefix) + 1
+    df[f"{prefix}_id"] = df["source"].map(lambda x: x[ll:])
+    df = df.rename(columns={"predicate": "property", "target": "value"})
+    del df["source"]
+    return df[[f"{prefix}_id", "property", "value"]]
 
 
 @wrap_norm_prefix
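Note: `get_properties_df` no longer maintains its own TSV cache; it now concatenates the literal- and object-property dataframes and reshapes them. A minimal sketch of the resulting frame, assuming the top-level re-export and using `chebi` only as an example prefix:

```python
import pyobo

# One three-column frame: <prefix>_id, property, value
df = pyobo.get_properties_df("chebi")
print(df.columns.tolist())  # ['chebi_id', 'property', 'value']
```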
pyobo/api/xrefs.py
CHANGED

@@ -81,8 +81,7 @@ get_xrefs = get_filtered_xrefs
 def get_xrefs_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame:
     """Get all xrefs."""
     warnings.warn(
-
-        f"Not using cache artifact path to {CacheArtifact.xrefs}",
+        "use pyobo.get_mappings_df instead of pyobo.get_xrefs_df.",
         DeprecationWarning,
         stacklevel=2,
     )
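Note: the deprecation message now points at a concrete replacement. Migrating is a one-line change; `doid` is an illustrative prefix:

```python
import pyobo

df = pyobo.get_mappings_df("doid")  # replaces pyobo.get_xrefs_df("doid")
```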
pyobo/cli/database.py
CHANGED

@@ -2,8 +2,10 @@
 
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
 
+import bioregistry
 import click
 from more_click import verbose_option
 from tqdm.contrib.logging import logging_redirect_tqdm

@@ -11,11 +13,11 @@ from typing_extensions import Unpack
 from zenodo_client import update_zenodo
 
 from .database_utils import (
+    IterHelperHelperDict,
     _iter_alts,
     _iter_definitions,
     _iter_edges,
     _iter_mappings,
-    _iter_metadata,
     _iter_names,
     _iter_properties,
     _iter_relations,

@@ -23,6 +25,7 @@ from .database_utils import (
     _iter_synonyms,
     _iter_typedefs,
     _iter_xrefs,
+    iter_helper_helper,
 )
 from .utils import (
     Clickable,

@@ -44,12 +47,14 @@ from ..constants import (
     TYPEDEFS_RECORD,
     DatabaseKwargs,
 )
-from ..getters import db_output_helper
+from ..getters import db_output_helper, get_ontology
 
 __all__ = [
     "main",
 ]
 
+logger = logging.getLogger(__name__)
+
 
 @click.group(name="database")
 def main():

@@ -129,9 +134,32 @@ def build(ctx: click.Context, **kwargs: Unpack[DatabaseKwargs]) -> None:
     ctx.invoke(species, **updated_kwargs)
 
 
+@database_annotate
+def cache(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
+    """Cache all things."""
+    if zenodo:
+        click.echo("no zenodo for caching")
+
+    kwargs["force_process"] = True
+    with logging_redirect_tqdm():
+        for _ in iter_helper_helper(get_ontology, **kwargs):
+            # this pass is intentional, to consume the iterable
+            pass
+
+
 @database_annotate
 def metadata(zenodo: bool, directory: Path, **kwargs: Unpack[DatabaseKwargs]) -> None:
     """Make the prefix-metadata dump."""
+    from ..api import get_metadata
+
+    def _iter_metadata(
+        **kwargs: Unpack[IterHelperHelperDict],
+    ) -> Iterable[tuple[str, str, str, bool]]:
+        for prefix, data in iter_helper_helper(get_metadata, **kwargs):
+            version = data["version"]
+            logger.debug(f"[{prefix}] using version {version}")
+            yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
+
     it = _iter_metadata(**kwargs)
     db_output_helper(
         it,
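Note: the new `cache` subcommand forces reprocessing by draining the ontology iterator. A sketch of the same effect from Python, assuming `force_process` is passed through the shared kwargs exactly as the command sets it:

```python
from pyobo.getters import get_ontology, iter_helper_helper

# Consuming the iterator triggers parsing and caching;
# the yielded ontologies themselves are discarded.
for _ in iter_helper_helper(get_ontology, force_process=True):
    pass
```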
pyobo/cli/database_utils.py
CHANGED

@@ -9,7 +9,6 @@ from collections.abc import Iterable
 from functools import partial
 from typing import cast
 
-import bioregistry
 from tqdm.auto import tqdm
 from typing_extensions import Unpack
 

@@ -21,7 +20,6 @@ from ..api import (
     get_id_synonyms_mapping,
     get_id_to_alts,
     get_mappings_df,
-    get_metadata,
     get_properties_df,
     get_relations_df,
     get_typedef_df,

@@ -40,19 +38,12 @@ def _iter_ncbigene(left: int, right: int) -> Iterable[tuple[str, str, str]]:
     with gzip.open(ncbi_path, "rt") as file:
         next(file)  # throw away the header
         for line in tqdm(
-            file, desc=f"
+            file, desc=f"[{ncbigene.PREFIX}] extracting names", unit_scale=True, total=56_700_000
         ):
             parts = line.strip().split("\t")
             yield ncbigene.PREFIX, parts[left], parts[right]
 
 
-def _iter_metadata(**kwargs: Unpack[IterHelperHelperDict]):
-    for prefix, data in iter_helper_helper(get_metadata, **kwargs):
-        version = data["version"]
-        logger.debug(f"[{prefix}] using version {version}")
-        yield prefix, version, data["date"], bioregistry.is_deprecated(prefix)
-
-
 def _iter_names(leave: bool = False, **kwargs) -> Iterable[tuple[str, str, str]]:
     """Iterate over all prefix-identifier-name triples we can get.
 

@@ -60,11 +51,14 @@ def _iter_names(leave: bool = False, **kwargs) -> Iterable[tuple[str, str, str]]
     """
     yield from iter_helper(get_id_name_mapping, leave=leave, **kwargs)
     yield from _iter_ncbigene(1, 2)
+    yield from _iter_pubchem_compound()
+
 
+def _iter_pubchem_compound():
     pcc_path = pubchem._ensure_cid_name_path()
     with gzip.open(pcc_path, mode="rt", encoding="ISO-8859-1") as file:
         for line in tqdm(
-            file, desc=f"
+            file, desc=f"[{pubchem.PREFIX}] extracting names", unit_scale=True, total=119_000_000
         ):
             identifier, name = line.strip().split("\t", 1)
             yield pubchem.PREFIX, identifier, name
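Note: `_iter_metadata` moved up into `pyobo/cli/database.py` (see above), and the inline PubChem loop was extracted into `_iter_pubchem_compound`. Both name iterators share the same streaming pattern, sketched here with a hypothetical path argument:

```python
import gzip

from tqdm.auto import tqdm


def iter_names(path: str, prefix: str, total: int | None = None):
    """Stream (prefix, identifier, name) rows from a gzipped TSV."""
    with gzip.open(path, mode="rt", encoding="ISO-8859-1") as file:
        for line in tqdm(file, desc=f"[{prefix}] extracting names", unit_scale=True, total=total):
            identifier, name = line.strip().split("\t", 1)
            yield prefix, identifier, name
```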
pyobo/getters.py
CHANGED

@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import datetime
-import gzip
 import json
 import logging
 import pathlib

@@ -16,7 +15,7 @@ from collections import Counter
 from collections.abc import Callable, Iterable, Mapping, Sequence
 from pathlib import Path
 from textwrap import indent
-from typing import TypeVar
+from typing import Any, TypeVar
 
 import bioregistry
 import click

@@ -27,6 +26,7 @@ from tqdm.auto import tqdm
 from typing_extensions import Unpack
 
 from .constants import (
+    BUILD_SUBDIRECTORY_NAME,
     DATABASE_DIRECTORY,
     GetOntologyKwargs,
     IterHelperHelperDict,

@@ -36,7 +36,7 @@ from .identifier_utils import ParseError, wrap_norm_prefix
 from .plugins import has_nomenclature_plugin, run_nomenclature_plugin
 from .reader import from_obo_path, from_obonet
 from .struct import Obo
-from .utils.io import
+from .utils.io import safe_open_writer
 from .utils.path import ensure_path, prefix_directory_join
 from .version import get_git_hash, get_version
 

@@ -119,19 +119,21 @@ def get_ontology(
         logger.info("UBERON has so much garbage in it that defaulting to non-strict parsing")
         strict = False
 
-    if
+    if force_process:
+        obonet_json_gz_path = None
+    elif not cache:
         logger.debug("[%s] caching was turned off, so dont look for an obonet file", prefix)
         obonet_json_gz_path = None
     else:
         obonet_json_gz_path = prefix_directory_join(
-            prefix, name=f"{prefix}.obonet.json.gz",
+            prefix, BUILD_SUBDIRECTORY_NAME, name=f"{prefix}.obonet.json.gz", version=version
         )
         logger.debug(
             "[%s] caching is turned on, so look for an obonet file at %s",
             prefix,
             obonet_json_gz_path,
         )
-        if obonet_json_gz_path.
+        if obonet_json_gz_path.is_file() and not force:
            from .utils.cache import get_gzipped_graph
 
             logger.debug("[%s] using obonet cache at %s", prefix, obonet_json_gz_path)
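Note: `get_ontology` now bypasses the obonet cache when `force_process` is set, and the cached artifact moves into a versioned build subdirectory. A sketch of the new location, with an illustrative prefix and version:

```python
from pyobo.constants import BUILD_SUBDIRECTORY_NAME
from pyobo.utils.path import prefix_directory_join

path = prefix_directory_join(
    "go", BUILD_SUBDIRECTORY_NAME, name="go.obonet.json.gz", version="2024-01-01"
)
```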
@@ -198,64 +200,6 @@ def _ensure_ontology_path(
|
|
|
198
200
|
return None, None
|
|
199
201
|
|
|
200
202
|
|
|
201
|
-
#: Obonet/Pronto can't parse these (consider converting to OBO with ROBOT?)
|
|
202
|
-
CANT_PARSE = {
|
|
203
|
-
"agro",
|
|
204
|
-
"aro",
|
|
205
|
-
"bco",
|
|
206
|
-
"caro",
|
|
207
|
-
"cco",
|
|
208
|
-
"chmo",
|
|
209
|
-
"cido",
|
|
210
|
-
"covoc",
|
|
211
|
-
"cto",
|
|
212
|
-
"cvdo",
|
|
213
|
-
"dicom",
|
|
214
|
-
"dinto",
|
|
215
|
-
"emap",
|
|
216
|
-
"epso",
|
|
217
|
-
"eupath",
|
|
218
|
-
"fbbi",
|
|
219
|
-
"fma",
|
|
220
|
-
"fobi",
|
|
221
|
-
"foodon",
|
|
222
|
-
"genepio",
|
|
223
|
-
"hancestro",
|
|
224
|
-
"hom",
|
|
225
|
-
"hso",
|
|
226
|
-
"htn", # Unknown string format: creation: 16MAY2017
|
|
227
|
-
"ico",
|
|
228
|
-
"idocovid19",
|
|
229
|
-
"labo",
|
|
230
|
-
"mamo",
|
|
231
|
-
"mfmo",
|
|
232
|
-
"mfo",
|
|
233
|
-
"mfomd",
|
|
234
|
-
"miapa",
|
|
235
|
-
"mo",
|
|
236
|
-
"oae",
|
|
237
|
-
"ogms", # Unknown string format: creation: 16MAY2017
|
|
238
|
-
"ohd",
|
|
239
|
-
"ons",
|
|
240
|
-
"oostt",
|
|
241
|
-
"opmi",
|
|
242
|
-
"ornaseq",
|
|
243
|
-
"orth",
|
|
244
|
-
"pdro",
|
|
245
|
-
"probonto",
|
|
246
|
-
"psdo",
|
|
247
|
-
"reo",
|
|
248
|
-
"rex",
|
|
249
|
-
"rnao",
|
|
250
|
-
"sepio",
|
|
251
|
-
"sio",
|
|
252
|
-
"spd",
|
|
253
|
-
"sweetrealm",
|
|
254
|
-
"txpo",
|
|
255
|
-
"vido",
|
|
256
|
-
"vt",
|
|
257
|
-
"xl",
|
|
258
|
-
}
|
|
259
203
|
SKIP = {
|
|
260
204
|
"ncbigene": "too big, refs acquired from other dbs",
|
|
261
205
|
"pubchem.compound": "top big, can't deal with this now",
|
|
@@ -276,11 +220,12 @@ SKIP = {
|
|
|
276
220
|
"kegg.genes": "needs fix", # FIXME
|
|
277
221
|
"kegg.genome": "needs fix", # FIXME
|
|
278
222
|
"kegg.pathway": "needs fix", # FIXME
|
|
279
|
-
"ensemblglossary": "
|
|
223
|
+
"ensemblglossary": "URI is self-referential to data in OLS, extract from there",
|
|
280
224
|
"epio": "content from fraunhofer is unreliable",
|
|
281
225
|
"epso": "content from fraunhofer is unreliable",
|
|
282
226
|
"gwascentral.phenotype": "website is down? or API changed?", # FIXME
|
|
283
227
|
"gwascentral.study": "website is down? or API changed?", # FIXME
|
|
228
|
+
"snomedct": "dead source",
|
|
284
229
|
}
|
|
285
230
|
|
|
286
231
|
X = TypeVar("X")
|
|
@@ -412,7 +357,7 @@ def iter_helper_helper(
|
|
|
412
357
|
except ValueError as e:
|
|
413
358
|
if _is_xml(e):
|
|
414
359
|
# this means that it tried doing parsing on an xml page
|
|
415
|
-
logger.
|
|
360
|
+
logger.warning(
|
|
416
361
|
"no resource available for %s. See http://www.obofoundry.org/ontology/%s",
|
|
417
362
|
prefix,
|
|
418
363
|
prefix,
|
|
@@ -452,7 +397,7 @@ def _prep_dir(directory: None | str | pathlib.Path) -> pathlib.Path:
|
|
|
452
397
|
|
|
453
398
|
|
|
454
399
|
def db_output_helper(
|
|
455
|
-
it: Iterable[tuple[
|
|
400
|
+
it: Iterable[tuple[Any, ...]],
|
|
456
401
|
db_name: str,
|
|
457
402
|
columns: Sequence[str],
|
|
458
403
|
*,
|
|
@@ -497,13 +442,10 @@ def db_output_helper(
|
|
|
497
442
|
logger.info("writing %s to %s", db_name, db_path)
|
|
498
443
|
logger.info("writing %s sample to %s", db_name, db_sample_path)
|
|
499
444
|
sample_rows = []
|
|
500
|
-
with gzip.open(db_path, mode="wt") if use_gzip else open(db_path, "w") as gzipped_file:
|
|
501
|
-
writer = get_writer(gzipped_file)
|
|
502
445
|
|
|
446
|
+
with safe_open_writer(db_path) as writer:
|
|
503
447
|
# for the first 10 rows, put it in a sample file too
|
|
504
|
-
with
|
|
505
|
-
sample_writer = get_writer(sample_file)
|
|
506
|
-
|
|
448
|
+
with safe_open_writer(db_sample_path) as sample_writer:
|
|
507
449
|
# write header
|
|
508
450
|
writer.writerow(columns)
|
|
509
451
|
sample_writer.writerow(columns)
|
|
@@ -523,15 +465,13 @@ def db_output_helper(
|
|
|
523
465
|
c_detailed[tuple(row[i] for i in summary_detailed)] += 1
|
|
524
466
|
writer.writerow(row)
|
|
525
467
|
|
|
526
|
-
with
|
|
527
|
-
|
|
528
|
-
writer.writerows(c.most_common())
|
|
468
|
+
with safe_open_writer(db_summary_path) as summary_writer:
|
|
469
|
+
summary_writer.writerows(c.most_common())
|
|
529
470
|
|
|
530
471
|
if summary_detailed is not None:
|
|
531
472
|
logger.info(f"writing {db_name} detailed summary to {db_summary_detailed_path}")
|
|
532
|
-
with
|
|
533
|
-
|
|
534
|
-
writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
|
|
473
|
+
with safe_open_writer(db_summary_detailed_path) as detailed_summary_writer:
|
|
474
|
+
detailed_summary_writer.writerows((*keys, v) for keys, v in c_detailed.most_common())
|
|
535
475
|
rv.append(("Summary (Detailed)", db_summary_detailed_path))
|
|
536
476
|
|
|
537
477
|
with open(db_metadata_path, "w") as file:
|
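Note: the hand-rolled gzip-plus-CSV-writer blocks in `db_output_helper` collapse onto `safe_open_writer` from `pyobo.utils.io`. A minimal sketch of the pattern; the filename is illustrative and gzip handling is assumed to be inferred from the suffix:

```python
from pyobo.utils.io import safe_open_writer

with safe_open_writer("names.tsv.gz") as writer:
    writer.writerow(("prefix", "identifier", "name"))
```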
pyobo/gilda_utils.py
CHANGED

@@ -2,20 +2,15 @@
 
 from __future__ import annotations
 
-import logging
 import warnings
 from collections.abc import Iterable, Sequence
 from typing import TYPE_CHECKING, Any, cast
 
-import bioregistry
 import ssslm
-from ssslm import
-from tqdm.auto import tqdm
+from ssslm import literal_mappings_to_gilda
 from typing_extensions import Unpack
 
 from pyobo.api import (
-    get_id_name_mapping,
-    get_ids,
     get_literal_mappings,
     get_literal_mappings_subset,
 )

@@ -26,83 +21,11 @@ if TYPE_CHECKING:
     import gilda
 
 __all__ = [
+    "get_gilda_term_subset",
+    "get_gilda_terms",
     "get_grounder",
-    "iter_gilda_prediction_tuples",
 ]
 
-logger = logging.getLogger(__name__)
-
-
-# TODO the only place this is used is in Biomappings -
-# might be better to directly move it there
-def iter_gilda_prediction_tuples(
-    prefix: str,
-    relation: str = "skos:exactMatch",
-    *,
-    grounder: gilda.Grounder | None = None,
-    identifiers_are_names: bool = False,
-    strict: bool = False,
-) -> Iterable[tuple[str, str, str, str, str, str, str, str, float]]:
-    """Iterate over prediction tuples for a given prefix."""
-    if grounder is None:
-        import gilda.api
-
-        grounder = gilda.api.grounder
-    grounder_ = GildaGrounder(grounder)
-    id_name_mapping = get_id_name_mapping(prefix, strict=strict)
-    it = tqdm(
-        id_name_mapping.items(), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="name"
-    )
-    for identifier, name in it:
-        norm_identifier = _normalize_identifier(prefix, identifier)
-        for scored_match in grounder_.get_matches(name):
-            yield (
-                prefix,
-                norm_identifier,
-                name,
-                relation,
-                scored_match.prefix,
-                _normalize_identifier(scored_match.prefix, scored_match.identifier),
-                name,
-                "semapv:LexicalMatching",
-                round(scored_match.score, 3),
-            )
-
-    if identifiers_are_names:
-        it = tqdm(get_ids(prefix), desc=f"[{prefix}] gilda tuples", unit_scale=True, unit="id")
-        for identifier in it:
-            norm_identifier = _normalize_identifier(prefix, identifier)
-            for scored_match in grounder_.get_matches(identifier):
-                yield (
-                    prefix,
-                    norm_identifier,
-                    identifier,
-                    relation,
-                    scored_match.prefix,
-                    _normalize_identifier(scored_match.prefix, scored_match.identifier),
-                    identifier,
-                    "semapv:LexicalMatching",
-                    scored_match.score,
-                )
-
-
-def _normalize_identifier(prefix: str, identifier: str) -> str:
-    """Normalize the identifier."""
-    resource = bioregistry.get_resource(prefix)
-    if resource is None:
-        raise KeyError
-    return resource.miriam_standardize_identifier(identifier) or identifier
-
-
-def normalize_identifier(prefix: str, identifier: str) -> str:
-    """Normalize the identifier."""
-    warnings.warn(
-        "normalization to MIRIAM is deprecated, please update to using Bioregistry standard identifiers",
-        DeprecationWarning,
-        stacklevel=2,
-    )
-    return _normalize_identifier(prefix, identifier)
-
 
 def get_grounder(*args: Any, **kwargs: Any) -> gilda.Grounder:
     """Get a grounder."""
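Note: `iter_gilda_prediction_tuples` and the MIRIAM-normalization helpers are gone; lexical matching now flows through literal mappings and `ssslm`. A sketch of the surviving entry point, assuming gilda's standard `ScoredMatch` fields and using illustrative inputs:

```python
from pyobo.gilda_utils import get_grounder

grounder = get_grounder("mesh")
for match in grounder.ground("diabetes"):
    print(match.term.db, match.term.id, match.score)
```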
pyobo/identifier_utils/__init__.py
CHANGED

@@ -1,7 +1,6 @@
 """Extract registry information."""
 
 from .api import (
-    BlacklistedError,
     DefaultCoercionError,
     EmptyStringError,
     NotCURIEError,

@@ -14,15 +13,10 @@ from .api import (
     standardize_ec,
     wrap_norm_prefix,
 )
-from .preprocessing import
-    remap_full,
-    remap_prefix,
-    str_is_blacklisted,
-)
+from .preprocessing import get_rules
 from .relations import ground_relation
 
 __all__ = [
-    "BlacklistedError",
     "DefaultCoercionError",
     "EmptyStringError",
     "NotCURIEError",

@@ -32,10 +26,8 @@ __all__ = [
     "UnregisteredPrefixError",
     "_is_valid_identifier",
     "_parse_str_or_curie_or_uri_helper",
+    "get_rules",
     "ground_relation",
-    "remap_full",
-    "remap_prefix",
     "standardize_ec",
-    "str_is_blacklisted",
     "wrap_norm_prefix",
 ]
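Note: the module-level `remap_full`, `remap_prefix`, and `str_is_blacklisted` helpers are replaced by a single rules object. A sketch of the new access pattern, using only the calls that appear in this diff (inputs are illustrative):

```python
from pyobo.identifier_utils import get_rules

rules = get_rules()
s = rules.remap_prefix("GO_0008150", context="go")
if rules.str_is_blocked(s, context="go"):
    print("blocked")
```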
pyobo/identifier_utils/api.py
CHANGED

@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import logging
-from functools import wraps
+from functools import lru_cache, wraps
 from typing import Annotated, ClassVar
 
 import bioregistry

@@ -11,14 +11,14 @@ import click
 from bioregistry import NormalizedNamableReference as Reference
 from bioregistry.constants import FailureReturnType
 from curies import ReferenceTuple
+from curies.preprocessing import BlocklistError, PreprocessingConverter
 from pydantic import ValidationError
 from typing_extensions import Doc
 
-from .preprocessing import
+from .preprocessing import get_rules
 from .relations import ground_relation
 
 __all__ = [
-    "BlacklistedError",
     "DefaultCoercionError",
     "EmptyStringError",
     "NotCURIEError",

@@ -34,10 +34,6 @@ __all__ = [
 logger = logging.getLogger(__name__)
 
 
-class BlacklistedError(ValueError):
-    """A sentinel for blacklisted strings."""
-
-
 Line = Annotated[str | None, Doc("""The OBO line where the parsing happened""")]
 
 

@@ -138,6 +134,15 @@ def _preclean_uri(s: str) -> str:
     return s
 
 
+@lru_cache(1)
+def _get_converter() -> PreprocessingConverter:
+    return PreprocessingConverter(
+        converter=bioregistry.manager.converter,
+        rules=get_rules(),
+        preclean=_preclean_uri,
+    )
+
+
 def _parse_str_or_curie_or_uri_helper(
     str_or_curie_or_uri: str,
     *,

@@ -148,7 +153,7 @@ def _parse_str_or_curie_or_uri_helper(
     line: str | None = None,
     name: str | None = None,
     context: str | None = None,
-) -> Reference | ParseError |
+) -> Reference | ParseError | BlocklistError:
     """Parse a string that looks like a CURIE.
 
     :param str_or_curie_or_uri: A compact uniform resource identifier (CURIE)

@@ -171,19 +176,23 @@ def _parse_str_or_curie_or_uri_helper(
         context=context,
     )
 
+    rules = get_rules()
+
     if upgrade:
         # Remap the curie with the full list
-        if r1 := remap_full(
+        if r1 := rules.remap_full(
+            str_or_curie_or_uri, reference_cls=Reference, context=ontology_prefix
+        ):
             return r1
 
         # Remap node's prefix (if necessary)
-        str_or_curie_or_uri = remap_prefix(str_or_curie_or_uri,
+        str_or_curie_or_uri = rules.remap_prefix(str_or_curie_or_uri, context=ontology_prefix)
 
     if r2 := ground_relation(str_or_curie_or_uri):
         return r2
 
-    if
-    return
+    if rules.str_is_blocked(str_or_curie_or_uri, context=ontology_prefix):
+        return BlocklistError()
 
     if _is_uri(str_or_curie_or_uri):
         rt = bioregistry.parse_iri(