pyobo 0.12.4__py3-none-any.whl → 0.12.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +6 -0
- pyobo/api/__init__.py +3 -0
- pyobo/api/embedding.py +118 -0
- pyobo/api/utils.py +0 -10
- pyobo/cli/cli.py +1 -6
- pyobo/cli/database.py +7 -1
- pyobo/constants.py +23 -0
- pyobo/getters.py +52 -35
- pyobo/identifier_utils/api.py +3 -1
- pyobo/sources/__init__.py +14 -1
- pyobo/sources/chembl/__init__.py +6 -0
- pyobo/sources/chembl/chembl_cell.py +94 -0
- pyobo/sources/chembl/chembl_mechanism.py +81 -0
- pyobo/sources/chembl/chembl_tissue.py +70 -0
- pyobo/sources/clinicaltrials.py +32 -33
- pyobo/sources/complexportal.py +5 -1
- pyobo/sources/drugcentral.py +2 -1
- pyobo/sources/hgnc/hgnc.py +13 -6
- pyobo/sources/iana_media_type.py +100 -0
- pyobo/sources/mesh.py +82 -29
- pyobo/sources/reactome.py +10 -3
- pyobo/sources/spdx.py +89 -0
- pyobo/sources/uniprot/uniprot.py +2 -2
- pyobo/sources/wikipathways.py +92 -7
- pyobo/struct/__init__.py +2 -0
- pyobo/struct/functional/dsl.py +10 -1
- pyobo/struct/functional/ontology.py +3 -3
- pyobo/struct/obo/reader.py +17 -53
- pyobo/struct/obograph/export.py +2 -2
- pyobo/struct/struct.py +125 -8
- pyobo/struct/struct_utils.py +10 -0
- pyobo/struct/typedef.py +15 -3
- pyobo/struct/vocabulary.py +8 -0
- pyobo/utils/cache.py +4 -3
- pyobo/utils/io.py +18 -56
- pyobo/utils/misc.py +142 -1
- pyobo/utils/path.py +34 -2
- pyobo/version.py +1 -1
- {pyobo-0.12.4.dist-info → pyobo-0.12.6.dist-info}/METADATA +11 -7
- {pyobo-0.12.4.dist-info → pyobo-0.12.6.dist-info}/RECORD +44 -38
- {pyobo-0.12.4.dist-info → pyobo-0.12.6.dist-info}/WHEEL +0 -0
- {pyobo-0.12.4.dist-info → pyobo-0.12.6.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.4.dist-info → pyobo-0.12.6.dist-info}/licenses/LICENSE +0 -0
pyobo/sources/chembl/chembl_mechanism.py
ADDED
@@ -0,0 +1,81 @@
+"""Converter for ChEMBL mechanisms."""
+
+import logging
+from collections.abc import Iterable
+
+import chembl_downloader
+
+from pyobo.struct import CHARLIE_TERM, PYOBO_INJECTED, Obo, Term
+from pyobo.struct.typedef import exact_match
+
+__all__ = [
+    "ChEMBLMechanismGetter",
+]
+
+logger = logging.getLogger(__name__)
+
+PREFIX = "chembl.mechanism"
+QUERY = "SELECT * from ACTION_TYPE"
+
+ROOT = (
+    Term.default(PREFIX, "mechanism", name="mechanism")
+    .append_contributor(CHARLIE_TERM)
+    .append_comment(PYOBO_INJECTED)
+)
+
+
+class ChEMBLMechanismGetter(Obo):
+    """An ontology representation of ChEMBL mechanisms."""
+
+    ontology = PREFIX
+    bioversions_key = "chembl"
+    typedefs = [exact_match]
+    root_terms = [ROOT.reference]
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms(version=self._version_or_raise)
+
+
+def normalize_chembl_mechanism(name: str) -> str:
+    """Normalize a ChEMBL mechanism name into an identifier."""
+    return name.lower().replace(" ", "-")
+
+
+def _norm_name(name: str) -> str:
+    return name.lower().replace("rnai ", "RNAi ")
+
+
+def get_pattern(version: str | None = None) -> str:
+    """Get a pattern."""
+    df = chembl_downloader.query("SELECT action_type from ACTION_TYPE", version=version)
+    parts = "|".join(sorted(normalize_chembl_mechanism(name) for (name,) in df.values))
+    return f"^[{parts}]$"
+
+
+def iter_terms(version: str) -> Iterable[Term]:
+    """Iterate over ChEMBL mechanisms."""
+    df = chembl_downloader.query(QUERY, version=version)
+    terms = {}
+    parents = {}
+    for name, _description, parent in df.values:
+        identifier = normalize_chembl_mechanism(name)
+        terms[name] = Term.from_triple(prefix=PREFIX, identifier=identifier, name=_norm_name(name))
+        if name != parent:  # protect against "other" which is a child of itself
+            parents[name] = parent
+    for child, parent in parents.items():
+        terms[child].append_parent(terms[parent])
+
+    # these are the three top-level things in the hierarchy, which
+    # we annotate onto a dummy parent term
+    for name in [
+        "POSITIVE MODULATOR",
+        "NEGATIVE MODULATOR",
+        "OTHER",
+    ]:
+        terms[name].append_parent(ROOT)
+    yield from terms.values()
+
+
+if __name__ == "__main__":
+    ChEMBLMechanismGetter.cli()
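The new getter builds its local identifiers directly from the ACTION_TYPE names. A minimal standalone sketch of that normalization, mirroring normalize_chembl_mechanism above (the example action-type names are illustrative, not queried from a ChEMBL release):

def normalize_chembl_mechanism(name: str) -> str:
    # Mirror of the helper in pyobo/sources/chembl/chembl_mechanism.py:
    # lowercase the action type and replace spaces with hyphens.
    return name.lower().replace(" ", "-")


# Assumed example names, printed as CURIEs in the new "chembl.mechanism" prefix
for name in ["POSITIVE MODULATOR", "CROSS-LINKING AGENT", "OTHER"]:
    print(f"{name!r} -> chembl.mechanism:{normalize_chembl_mechanism(name)}")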
pyobo/sources/chembl/chembl_tissue.py
ADDED
@@ -0,0 +1,70 @@
+"""Converter for ChEMBL tissues."""
+
+import logging
+from collections.abc import Iterable
+
+import chembl_downloader
+
+from pyobo.struct import Obo, Reference, Term
+from pyobo.struct.typedef import exact_match
+
+__all__ = [
+    "ChEMBLTissueGetter",
+]
+
+logger = logging.getLogger(__name__)
+
+PREFIX = "chembl.tissue"
+QUERY = """\
+SELECT
+    CHEMBL_ID,
+    PREF_NAME,
+    UBERON_ID,
+    EFO_ID,
+    BTO_ID,
+    CALOHA_ID
+FROM TISSUE_DICTIONARY
+"""
+
+
+class ChEMBLTissueGetter(Obo):
+    """An ontology representation of ChEMBL tissues."""
+
+    ontology = PREFIX
+    bioversions_key = "chembl"
+    typedefs = [exact_match]
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms(version=self._version_or_raise)
+
+
+def iter_terms(version: str | None = None) -> Iterable[Term]:
+    """Iterate over ChEMBL tissue terms."""
+    with chembl_downloader.cursor(version=version) as cursor:
+        cursor.execute(QUERY)
+        for chembl_id, name, uberon, efo, bto, caloha in cursor.fetchall():
+            term = Term(
+                reference=Reference(prefix=PREFIX, identifier=chembl_id, name=name),
+            )
+            if uberon:
+                term.append_exact_match(
+                    Reference(prefix="uberon", identifier=uberon.removeprefix("UBERON:"))
+                )
+            if efo:
+                term.append_exact_match(
+                    Reference(
+                        prefix="efo", identifier=efo.removeprefix("EFO:").removeprefix("EFO;")
+                    )
+                )
+            if bto:
+                term.append_exact_match(
+                    Reference(prefix="bto", identifier=bto.removeprefix("BTO:"))
+                )
+            if caloha:
+                term.append_exact_match(Reference(prefix="caloha", identifier=caloha))
+            yield term
+
+
+if __name__ == "__main__":
+    ChEMBLTissueGetter.cli()
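Each row of TISSUE_DICTIONARY carries optional UBERON/EFO/BTO/CALOHA cross-references whose CURIE prefixes are stripped before they become exact matches. A small standalone sketch of that cleanup (the example cell values are assumed, shaped like the columns above):

def split_tissue_xref(prefix: str, raw: str) -> tuple[str, str]:
    # Mirrors the removeprefix() calls above, e.g. "UBERON:0002107" -> ("uberon", "0002107");
    # the EFO column sometimes uses "EFO;" instead of "EFO:", hence two candidates.
    for junk in (f"{prefix.upper()}:", f"{prefix.upper()};"):
        raw = raw.removeprefix(junk)
    return prefix, raw


print(split_tissue_xref("uberon", "UBERON:0002107"))  # ('uberon', '0002107')
print(split_tissue_xref("efo", "EFO:0000853"))        # ('efo', '0000853')
print(split_tissue_xref("bto", "BTO:0000759"))        # ('bto', '0000759')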
pyobo/sources/clinicaltrials.py
CHANGED
@@ -27,49 +27,48 @@ HAS_INTERVENTION = TypeDef(
     is_metadata_tag=True,
 )
 
-
+INVESTIGATION_TERM = Term(
+    reference=Reference(prefix="obi", identifier="0000066", name="investigation")
+)
 
-
-    reference=
-).append_parent(
+OBSERVATIONAL_INVESTIGATION_TERM = Term(
+    reference=Reference(prefix="obi", identifier="0003693", name="observational investigation")
+).append_parent(INVESTIGATION_TERM)
 
-
-    reference=
-
-
-
+CLINICAL_INVESTIGATION_TERM = Term(
+    reference=Reference(prefix="obi", identifier="0003697", name="clinical investigation")
+).append_parent(INVESTIGATION_TERM)
+
+CLINICAL_TRIAL_TERM = Term(
+    reference=Reference(prefix="obi", identifier="0003699", name="clinical trial")
+).append_parent(CLINICAL_INVESTIGATION_TERM)
 
 RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term(
-    reference=
-
-    "
-    name="randomized
+    reference=Reference(
+        prefix="obi",
+        identifier="0004001",
+        name="randomized clinical trial",
     )
-).append_parent(
+).append_parent(CLINICAL_TRIAL_TERM)
 
 NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term(
-    reference=
-
-    "
-    name="non-randomized
-    )
-).append_parent(INTERVENTIONAL_CLINICAL_TRIAL_TERM)
-
-OBSERVATIONAL_CLINICAL_TRIAL_TERM = Term(
-    reference=default_reference(
-        PREFIX, "observational-clinical-trial", name="observational clinical trial"
+    reference=Reference(
+        prefix="obi",
+        identifier="0004002",
+        name="non-randomized clinical trial",
    )
 ).append_parent(CLINICAL_TRIAL_TERM)
 
+# TODO request OBI term
 EXPANDED_ACCESS_STUDY_TERM = Term(
     reference=default_reference(PREFIX, "expanded-access-study", name="expanded access study")
-).append_parent(
+).append_parent(INVESTIGATION_TERM)
 
 TERMS = [
-
+    INVESTIGATION_TERM,
+    CLINICAL_INVESTIGATION_TERM,
+    OBSERVATIONAL_INVESTIGATION_TERM,
     CLINICAL_TRIAL_TERM,
-    OBSERVATIONAL_CLINICAL_TRIAL_TERM,
-    INTERVENTIONAL_CLINICAL_TRIAL_TERM,
     EXPANDED_ACCESS_STUDY_TERM,
     RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
     NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
@@ -79,13 +78,13 @@ TERMS = [
 # types in ClinicalTrials.gov. See summary script at
 # https://gist.github.com/cthoyt/12a3cb3c63ad68d73fe5a2f0d506526f
 PARENTS: dict[tuple[str | None, str | None], Term] = {
-    ("INTERVENTIONAL", None):
-    ("INTERVENTIONAL", "NA"):
+    ("INTERVENTIONAL", None): CLINICAL_TRIAL_TERM,
+    ("INTERVENTIONAL", "NA"): CLINICAL_TRIAL_TERM,
     ("INTERVENTIONAL", "RANDOMIZED"): RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
     ("INTERVENTIONAL", "NON_RANDOMIZED"): NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
-    ("OBSERVATIONAL", None):
+    ("OBSERVATIONAL", None): OBSERVATIONAL_INVESTIGATION_TERM,
     ("EXPANDED_ACCESS", None): EXPANDED_ACCESS_STUDY_TERM,
-    (None, None):
+    (None, None): INVESTIGATION_TERM,
 }
 
 
@@ -95,7 +94,7 @@ class ClinicalTrialsGetter(Obo):
     ontology = PREFIX
     dynamic_version = True
     typedefs = [has_contributor, INVESTIGATES_CONDITION, HAS_INTERVENTION]
-    root_terms = [
+    root_terms = [INVESTIGATION_TERM.reference]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms for studies."""
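The reworked PARENTS table keys each study on its (study_type, allocation) pair and now resolves to OBI classes rather than ontology-local terms. A rough standalone sketch of that lookup with plain CURIE strings; the fallback to the generic investigation root is an assumption for illustration, and the expanded-access entry (which stays an ontology-local term) is omitted:

PARENT_CURIES: dict[tuple[str | None, str | None], str] = {
    # CURIEs copied from the diff above
    ("INTERVENTIONAL", None): "obi:0003699",              # clinical trial
    ("INTERVENTIONAL", "NA"): "obi:0003699",
    ("INTERVENTIONAL", "RANDOMIZED"): "obi:0004001",      # randomized clinical trial
    ("INTERVENTIONAL", "NON_RANDOMIZED"): "obi:0004002",  # non-randomized clinical trial
    ("OBSERVATIONAL", None): "obi:0003693",               # observational investigation
    (None, None): "obi:0000066",                          # investigation
}


def get_parent_curie(study_type: str | None, allocation: str | None) -> str:
    # Unmapped pairs fall back to the generic "investigation" root
    return PARENT_CURIES.get((study_type, allocation), PARENT_CURIES[(None, None)])


print(get_parent_curie("INTERVENTIONAL", "RANDOMIZED"))  # obi:0004001
print(get_parent_curie("OBSERVATIONAL", None))           # obi:0003693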
pyobo/sources/complexportal.py
CHANGED
@@ -57,6 +57,7 @@ SPECIES = [
 DTYPE = {
     "taxonomy_id": str,
 }
+ROOT = Reference(prefix="go", identifier="0032991", name="macromolecular complex")
 
 
 def _parse_members(s) -> list[tuple[Reference, str]]:
@@ -157,10 +158,12 @@ class ComplexPortalGetter(Obo):
 
     bioversions_key = ontology = PREFIX
     typedefs = [from_species, has_part, has_citation]
+    root_terms = [ROOT]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
-
+        yield Term(reference=ROOT)
+        yield from get_terms(version=self._version_or_raise)
 
 
 def get_df(version: str, force: bool = False) -> pd.DataFrame:
@@ -232,6 +235,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
             definition=definition.strip() if pd.notna(definition) else None,
             synonyms=[Synonym(name=alias) for alias in aliases],
         )
+        term.append_parent(ROOT)
        for reference, note in xrefs:
            if note == "identity":
                term.append_xref(reference)
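ComplexPortal terms are now rooted under a single GO class: the root is emitted once as its own term and every complex gets it as a parent. A minimal sketch of that pattern with pyobo's structures, using the same calls that appear in the diff (the CPX accession is an arbitrary example):

from pyobo.struct import Reference, Term

# Same root reference as introduced above
ROOT = Reference(prefix="go", identifier="0032991", name="macromolecular complex")

root_term = Term(reference=ROOT)  # yielded once by iter_terms()
complex_term = Term(reference=Reference(prefix="complexportal", identifier="CPX-1"))
complex_term.append_parent(ROOT)  # mirrors term.append_parent(ROOT) in get_terms()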
pyobo/sources/drugcentral.py
CHANGED
@@ -6,7 +6,6 @@ from collections.abc import Iterable
 from contextlib import closing
 
 import bioregistry
-import psycopg2
 from pydantic import ValidationError
 from tqdm.auto import tqdm
 
@@ -42,6 +41,8 @@ class DrugCentralGetter(Obo):
 
 def iter_terms() -> Iterable[Term]:
     """Iterate over DrugCentral terms."""
+    import psycopg2
+
     with closing(psycopg2.connect(**PARAMS)) as conn:
         with closing(conn.cursor()) as cur:
             cur.execute(
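Moving import psycopg2 from module level into iter_terms() makes the PostgreSQL driver a soft dependency: the module can still be imported (for example, when pyobo enumerates its sources) without psycopg2 installed, and the import only runs when DrugCentral terms are actually requested. A generic sketch of this deferred-import pattern; the function name, connection details, and query are illustrative:

from collections.abc import Iterable


def iter_rows_sketch() -> Iterable[tuple]:
    # Deferred import: the driver is resolved only when the generator runs,
    # so merely importing this module does not require psycopg2.
    import psycopg2

    with psycopg2.connect(dbname="drugcentral") as conn:  # assumed connection parameters
        with conn.cursor() as cur:
            cur.execute("SELECT id, name FROM structures")  # illustrative query
            yield from cur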
pyobo/sources/hgnc/hgnc.py
CHANGED
@@ -7,6 +7,7 @@ import typing
 from collections import Counter, defaultdict
 from collections.abc import Iterable
 
+import pydantic
 from tabulate import tabulate
 from tqdm.auto import tqdm
 
@@ -280,7 +281,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
                continue  # only add concrete annotations
            term.append_relationship(
                gene_product_member_of,
-                Reference(prefix="ec", identifier=ec_code),
+                Reference(prefix="ec", identifier=ec_code.strip()),
            )
        for rna_central_ids in entry.pop("rna_central_id", []):
            for rna_central_id in rna_central_ids.split(","):
@@ -314,7 +315,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
            )
        for mgi_curie in entry.pop("mgd_id", []):
            if not mgi_curie.startswith("MGI:"):
-                tqdm.write(f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
+                tqdm.write(f"[hgnc:{identifier}] had bad MGI CURIE: {mgi_curie}")
                continue
            mgi_id = mgi_curie[len("MGI:") :]
            if not mgi_id:
@@ -335,7 +336,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
                Reference(prefix="iuphar.ligand", identifier=iuphar[len("ligandId:") :])
            )
        else:
-            tqdm.write(f"unhandled IUPHAR: {iuphar}")
+            tqdm.write(f"[hgnc:{identifier}] unhandled IUPHAR: {iuphar}")
 
        for lrg_info in entry.pop("lsdb", []):
            if lrg_info.startswith("LRG_"):
@@ -360,9 +361,15 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
        xref_identifiers = [i.strip(".") for i in xref_identifiers]
 
        if len(xref_identifiers) == 1:
-
-                Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
-
+            try:
+                xref = Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
+            except pydantic.ValidationError:
+                tqdm.write(
+                    f"[hgnc:{identifier}] had bad {key} xref: {xref_prefix}:{xref_identifiers[0]}"
+                )
+                continue
+            else:
+                term.append_exact_match(xref)
        else:
            for xref_identifier in xref_identifiers:
                term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
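The xref block now wraps Reference construction in try/except so that a malformed identifier is logged and skipped instead of aborting the HGNC export. A self-contained sketch of the try/except/else pattern; the model and its validation rule are invented for illustration and are not pyobo's Reference:

from pydantic import BaseModel, ValidationError, field_validator


class XrefSketch(BaseModel):
    prefix: str
    identifier: str

    @field_validator("identifier")
    @classmethod
    def _no_whitespace(cls, value: str) -> str:
        if not value or " " in value:
            raise ValueError("identifier must be non-empty and contain no spaces")
        return value


for raw in ["601739", "not a valid id"]:
    try:
        xref = XrefSketch(prefix="omim", identifier=raw)
    except ValidationError:
        print(f"skipping bad xref: omim:{raw}")  # analogous to the tqdm.write() above
    else:
        print(f"keeping {xref.prefix}:{xref.identifier}")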
pyobo/sources/iana_media_type.py
ADDED
@@ -0,0 +1,100 @@
+"""An ontology representation of IANA media types (i.e. MIME types).
+
+.. seealso:: https://www.iana.org/assignments/media-types/media-types.xhtml
+"""
+
+from collections.abc import Iterable
+
+from pyobo import Obo, Reference, Term, default_reference
+from pyobo.struct.typedef import term_replaced_by
+from pyobo.utils.path import ensure_df
+
+__all__ = ["IANAGetter"]
+
+PREFIX = "iana.mediatype"
+ROOT = Term.from_triple(prefix="dcterms", identifier="MediaType", name="media type")
+
+#: The top-level types listed on https://www.iana.org/assignments/media-types/media-types.xhtml
+MEDIA_TYPE_GROUPS = [
+    "application",
+    "audio",
+    "font",
+    "haptics",
+    "image",
+    "message",
+    "model",
+    "multipart",
+    "text",
+    "video",
+]
+
+GROUP_TO_CSV = {
+    media_type_group: (
+        f"https://www.iana.org/assignments/media-types/{media_type_group}.csv",
+        Term(reference=default_reference(PREFIX, media_type_group, media_type_group)).append_parent(
+            ROOT
+        ),
+    )
+    for media_type_group in MEDIA_TYPE_GROUPS
+}
+
+
+class IANAGetter(Obo):
+    """An ontology representation of IANA media types (i.e. MIME types)."""
+
+    ontology = bioregistry_key = PREFIX
+    name = "IANA Media Types"
+    dynamic_version = True
+    root_terms = [t.reference for _, (_, t) in sorted(GROUP_TO_CSV.items())]
+    typedefs = [
+        term_replaced_by,
+    ]
+
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return get_terms()
+
+
+def get_terms() -> list[Term]:
+    """Get IANA Media Type terms."""
+    terms: dict[str, Term] = {}
+    forwards: dict[Term, str] = {}
+    for key, (url, parent) in GROUP_TO_CSV.items():
+        df = ensure_df(PREFIX, url=url, sep=",")
+        terms[key] = parent
+        for name, identifier, references in df.values:
+            if "OBSOLE" in name or "DEPRECATED" in name:
+                is_obsolete = True
+            else:
+                is_obsolete = None
+            term = Term(
+                reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
+                is_obsolete=is_obsolete,
+            ).append_parent(parent)
+            for reference in _process_references(references):
+                term.append_see_also_uri(reference)
+            terms[identifier.casefold()] = term
+
+            if "in favor of" in name:
+                _, _, new = name.partition("in favor of ")
+                forwards[term] = new.casefold().strip().rstrip(")").strip()
+
+    for old, new in forwards.items():
+        if new == "vnd.afpc.afplinedata":
+            new = "application/vnd.afpc.afplinedata"
+        old.append_replaced_by(terms[new].reference)
+
+    return list(terms.values())
+
+
+def _process_references(cell: str) -> list[str]:
+    rv = []
+    for part in cell.split("]["):
+        part = part.strip("[").strip("]")
+        if part.startswith("RFC"):
+            rv.append(f"https://www.iana.org/go/rfc{part.removeprefix('RFC')}")
+    return rv
+
+
+if __name__ == "__main__":
+    IANAGetter.cli()
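IANA's per-group CSVs pack citations like [RFC2046][RFC6657] into a single reference cell; _process_references keeps the RFC entries and rewrites them as iana.org redirect URLs that become see-also links on each term. A standalone mirror of that parsing (the example cell is assumed):

def process_references(cell: str) -> list[str]:
    # Mirrors _process_references() above: split the bracketed citations
    # and keep only RFCs, rewritten as https://www.iana.org/go/rfcNNNN URLs.
    rv = []
    for part in cell.split("]["):
        part = part.strip("[").strip("]")
        if part.startswith("RFC"):
            rv.append(f"https://www.iana.org/go/rfc{part.removeprefix('RFC')}")
    return rv


print(process_references("[RFC2046][RFC6657]"))
# ['https://www.iana.org/go/rfc2046', 'https://www.iana.org/go/rfc6657']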
pyobo/sources/mesh.py
CHANGED
@@ -10,12 +10,12 @@ from pathlib import Path
 from typing import Any
 from xml.etree.ElementTree import Element
 
+import bioversions
 from lxml import etree
 from tqdm.auto import tqdm
 
-from pyobo.api.utils import safe_get_version
 from pyobo.identifier_utils import standardize_ec
-from pyobo.struct import Obo, Reference, Synonym, Term
+from pyobo.struct import Obo, Reference, Synonym, Term, default_reference
 from pyobo.utils.cache import cached_json, cached_mapping
 from pyobo.utils.path import ensure_path, prefix_directory_join
 
@@ -31,6 +31,37 @@ PREFIX = "mesh"
 NOW_YEAR = str(datetime.datetime.now().year)
 CAS_RE = re.compile(r"^\d{1,7}\-\d{2}\-\d$")
 UNII_RE = re.compile(r"[0-9A-Za-z]{10}$")
+SUPPLEMENT_PARENT = default_reference(
+    prefix=PREFIX, identifier="supplemental-record", name="supplemental records"
+)
+
+#: A mapping from tree header letters to labels
+#:
+#: .. seealso:: https://meshb-prev.nlm.nih.gov/treeView
+TREE_HEADER_TO_NAME = {
+    "A": "Anatomy",
+    "B": "Organisms",
+    "C": "Diseases",
+    "D": "Chemicals and Drugs",
+    "E": "Analytical, Diagnostic and Therapeutic Techniques, and Equipment",
+    "F": "Psychiatry and Psychology",
+    "G": "Phenomena and Processes",
+    "H": "Disciplines and Occupations",
+    "I": "Anthropology, Education, Sociology, and Social Phenomena",
+    "J": "Technology, Industry, and Agriculture",
+    "K": "Humanities",
+    "L": "Information Science",
+    "M": "Named Groups",
+    "N": "Health Care",
+    "V": "Publication Characteristics",
+    "Z": "Geographicals",
+}
+
+#: A mapping from tree header letters to term objects
+TREE_HEADERS: dict[str, Reference] = {
+    letter: default_reference(prefix=PREFIX, identifier=letter, name=name)
+    for letter, name in TREE_HEADER_TO_NAME.items()
+}
 
 
 def _get_xml_root(path: Path) -> Element:
@@ -46,13 +77,20 @@ class MeSHGetter(Obo):
     """An ontology representation of the Medical Subject Headings."""
 
     ontology = bioversions_key = PREFIX
+    root_terms = [
+        SUPPLEMENT_PARENT,
+        *TREE_HEADERS.values(),
+    ]
 
     def _get_version(self) -> str | None:
         return NOW_YEAR
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
-
+        yield Term(reference=SUPPLEMENT_PARENT)
+        for x in TREE_HEADERS.values():
+            yield Term(reference=x)
+        yield from get_terms(version=self._version_or_raise, force=force)
 
 
 def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
@@ -74,21 +112,21 @@ def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
     return _inner()
 
 
-def get_terms(version: str, force: bool = False) -> Iterable[Term]:
+def get_terms(version: str, *, force: bool = False) -> Iterable[Term]:
     """Get MeSH OBO terms."""
     mesh_id_to_term: dict[str, Term] = {}
 
-
+    descriptor_records = ensure_mesh_descriptors(version=version, force=force)
     supplemental_records = ensure_mesh_supplemental_records(version=version, force=force)
 
-    for
-        identifier =
-        name =
-        definition =
+    for descriptor_record in itt.chain(descriptor_records, supplemental_records):
+        identifier = descriptor_record["identifier"]
+        name = descriptor_record["name"]
+        definition = descriptor_record.get("scope_note")
 
         xrefs: list[Reference] = []
         synonyms: set[str] = set()
-        for concept in
+        for concept in descriptor_record["concepts"]:
            synonyms.add(concept["name"])
            for term in concept["terms"]:
                synonyms.add(term["name"])
@@ -102,11 +140,23 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
            xrefs=xrefs,
        )
 
-    for
-        mesh_id_to_term[
-
-
-
+    for descriptor_record in descriptor_records:
+        term = mesh_id_to_term[descriptor_record["identifier"]]
+        for parent_descriptor_id in descriptor_record["parents"]:
+            term.append_parent(mesh_id_to_term[parent_descriptor_id])
+
+        # This takes care of terms that don't have any parents like
+        # Body Regions (https://meshb.nlm.nih.gov/record/ui?ui=D001829),
+        # which have the tree code A01 and need to point to a made-up
+        # term for "A"
+        for top_level_letter in descriptor_record["top_levels"]:
+            term.append_parent(TREE_HEADERS[top_level_letter])
+
+    # MeSH supplementary records' identifiers start with "C"
+    # and do not have a hierarchy assigned to them
+    for supplemental_record in supplemental_records:
+        term = mesh_id_to_term[supplemental_record["identifier"]]
+        term.append_parent(SUPPLEMENT_PARENT)
 
     return mesh_id_to_term.values()
 
@@ -153,7 +203,7 @@ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> list[
     return _inner()  # type:ignore
 
 
-def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict[str, Any]]:
+def get_descriptor_records(element: Element, id_key: str, name_key: str) -> list[dict[str, Any]]:
     """Get MeSH descriptor records."""
     logger.info("extract MeSH descriptors, concepts, and terms")
 
@@ -164,7 +214,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
     logger.debug(f"got {len(rv)} descriptors")
 
     # cache tree numbers
-    tree_number_to_descriptor_ui = {
+    tree_number_to_descriptor_ui: dict[str, str] = {
        tree_number: descriptor["identifier"]
        for descriptor in rv
        for tree_number in descriptor["tree_numbers"]
@@ -173,26 +223,29 @@
 
     # add in parents to each descriptor based on their tree numbers
     for descriptor in rv:
+        top_levels = set()
        parents_descriptor_uis = set()
        for tree_number in descriptor["tree_numbers"]:
            try:
                parent_tn, _self_tn = tree_number.rsplit(".", 1)
            except ValueError:
-
-
-
-                parent_descriptor_ui = tree_number_to_descriptor_ui.get(parent_tn)
-                if parent_descriptor_ui is not None:
-                    parents_descriptor_uis.add(parent_descriptor_ui)
+                # e.g., this happens for A01 (Body Regions)
+                # https://meshb.nlm.nih.gov/record/ui?ui=D001829
+                top_levels.add(tree_number[0])
            else:
-
+                parent_descriptor_ui = tree_number_to_descriptor_ui.get(parent_tn)
+                if parent_descriptor_ui is not None:
+                    parents_descriptor_uis.add(parent_descriptor_ui)
+                else:
+                    tqdm.write(f"missing tree number: {parent_tn}")
 
-        descriptor["parents"] =
+        descriptor["parents"] = sorted(parents_descriptor_uis)
+        descriptor["top_levels"] = sorted(top_levels)
 
     return rv
 
 
-def get_scope_note(descriptor_record) -> str | None:
+def get_scope_note(descriptor_record: Mapping[str, Any] | list[Mapping[str, Any]]) -> str | None:
     """Get the scope note from the preferred concept in a term's record."""
     if isinstance(descriptor_record, dict):
        # necessary for pre-2023 data
@@ -221,7 +274,7 @@ def get_descriptor_record(
     """
     concepts = get_concept_records(element)
     scope_note = get_scope_note(concepts)
-    rv = {
+    rv: dict[str, Any] = {
        "identifier": element.findtext(id_key),
        "name": element.findtext(name_key),
        "tree_numbers": sorted(
@@ -298,7 +351,7 @@ def get_term_records(element: Element) -> list[Mapping[str, Any]]:
     return [get_term_record(term) for term in element.findall("TermList/Term")]
 
 
-def get_term_record(element) -> Mapping[str, Any]:
+def get_term_record(element: Element) -> Mapping[str, Any]:
     """Get a single MeSH term record."""
     return {
        "term_ui": element.findtext("TermUI"),
@@ -363,7 +416,7 @@ def get_mesh_category_references(
     https://meshb.nlm.nih.gov/treeView
     """
     if version is None:
-        version =
+        version = bioversions.get_version("mesh", strict=True)
     tree_to_mesh = get_tree_to_mesh_id(version=version)
     rv = []
     for i in range(1, 100):