pyobo 0.12.4__py3-none-any.whl → 0.12.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +6 -0
  3. pyobo/api/__init__.py +3 -0
  4. pyobo/api/embedding.py +118 -0
  5. pyobo/api/utils.py +0 -10
  6. pyobo/cli/cli.py +1 -6
  7. pyobo/constants.py +23 -0
  8. pyobo/getters.py +52 -35
  9. pyobo/sources/__init__.py +14 -1
  10. pyobo/sources/chembl/__init__.py +6 -0
  11. pyobo/sources/chembl/chembl_cell.py +94 -0
  12. pyobo/sources/chembl/chembl_mechanism.py +81 -0
  13. pyobo/sources/chembl/chembl_tissue.py +70 -0
  14. pyobo/sources/clinicaltrials.py +32 -33
  15. pyobo/sources/complexportal.py +5 -1
  16. pyobo/sources/hgnc/hgnc.py +13 -6
  17. pyobo/sources/iana_media_type.py +100 -0
  18. pyobo/sources/mesh.py +82 -29
  19. pyobo/sources/reactome.py +10 -3
  20. pyobo/sources/spdx.py +85 -0
  21. pyobo/sources/uniprot/uniprot.py +2 -2
  22. pyobo/sources/wikipathways.py +92 -7
  23. pyobo/struct/__init__.py +2 -0
  24. pyobo/struct/functional/dsl.py +10 -1
  25. pyobo/struct/functional/ontology.py +3 -3
  26. pyobo/struct/obo/reader.py +17 -53
  27. pyobo/struct/obograph/export.py +2 -2
  28. pyobo/struct/struct.py +115 -8
  29. pyobo/struct/struct_utils.py +10 -0
  30. pyobo/struct/typedef.py +15 -3
  31. pyobo/struct/vocabulary.py +8 -0
  32. pyobo/utils/cache.py +4 -3
  33. pyobo/utils/io.py +18 -56
  34. pyobo/utils/misc.py +135 -1
  35. pyobo/utils/path.py +34 -2
  36. pyobo/version.py +1 -1
  37. {pyobo-0.12.4.dist-info → pyobo-0.12.5.dist-info}/METADATA +5 -5
  38. {pyobo-0.12.4.dist-info → pyobo-0.12.5.dist-info}/RECORD +41 -35
  39. {pyobo-0.12.4.dist-info → pyobo-0.12.5.dist-info}/WHEEL +0 -0
  40. {pyobo-0.12.4.dist-info → pyobo-0.12.5.dist-info}/entry_points.txt +0 -0
  41. {pyobo-0.12.4.dist-info → pyobo-0.12.5.dist-info}/licenses/LICENSE +0 -0
pyobo/sources/chembl/chembl_tissue.py ADDED
@@ -0,0 +1,70 @@
+ """Converter for ChEMBL tissues."""
+ 
+ import logging
+ from collections.abc import Iterable
+ 
+ import chembl_downloader
+ 
+ from pyobo.struct import Obo, Reference, Term
+ from pyobo.struct.typedef import exact_match
+ 
+ __all__ = [
+     "ChEMBLTissueGetter",
+ ]
+ 
+ logger = logging.getLogger(__name__)
+ 
+ PREFIX = "chembl.tissue"
+ QUERY = """\
+ SELECT
+     CHEMBL_ID,
+     PREF_NAME,
+     UBERON_ID,
+     EFO_ID,
+     BTO_ID,
+     CALOHA_ID
+ FROM TISSUE_DICTIONARY
+ """
+ 
+ 
+ class ChEMBLTissueGetter(Obo):
+     """An ontology representation of ChEMBL tissues."""
+ 
+     ontology = PREFIX
+     bioversions_key = "chembl"
+     typedefs = [exact_match]
+ 
+     def iter_terms(self, force: bool = False) -> Iterable[Term]:
+         """Iterate over terms in the ontology."""
+         return iter_terms(version=self._version_or_raise)
+ 
+ 
+ def iter_terms(version: str | None = None) -> Iterable[Term]:
+     """Iterate over ChEMBL tissue terms."""
+     with chembl_downloader.cursor(version=version) as cursor:
+         cursor.execute(QUERY)
+         for chembl_id, name, uberon, efo, bto, caloha in cursor.fetchall():
+             term = Term(
+                 reference=Reference(prefix=PREFIX, identifier=chembl_id, name=name),
+             )
+             if uberon:
+                 term.append_exact_match(
+                     Reference(prefix="uberon", identifier=uberon.removeprefix("UBERON:"))
+                 )
+             if efo:
+                 term.append_exact_match(
+                     Reference(
+                         prefix="efo", identifier=efo.removeprefix("EFO:").removeprefix("EFO;")
+                     )
+                 )
+             if bto:
+                 term.append_exact_match(
+                     Reference(prefix="bto", identifier=bto.removeprefix("BTO:"))
+                 )
+             if caloha:
+                 term.append_exact_match(Reference(prefix="caloha", identifier=caloha))
+             yield term
+ 
+ 
+ if __name__ == "__main__":
+     ChEMBLTissueGetter.cli()
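For orientation, here is a minimal sketch of how the new getter might be exercised from Python once this release is installed; the module path follows the file above, and the snippet assumes chembl_downloader can fetch or reuse a local ChEMBL SQLite dump. The CLI entrypoint at the bottom of the file is the supported route.

from pyobo.sources.chembl.chembl_tissue import ChEMBLTissueGetter

# instantiating the Obo subclass resolves the ChEMBL version via bioversions
ontology = ChEMBLTissueGetter()
# iterate the terms built from the TISSUE_DICTIONARY query above
for term in ontology.iter_terms():
    print(term.reference.curie, term.reference.name)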
pyobo/sources/clinicaltrials.py CHANGED
@@ -27,49 +27,48 @@ HAS_INTERVENTION = TypeDef(
      is_metadata_tag=True,
  )
  
- STUDY_TERM = Term(reference=default_reference(PREFIX, "study", name="study"))
+ INVESTIGATION_TERM = Term(
+     reference=Reference(prefix="obi", identifier="0000066", name="investigation")
+ )
  
- CLINICAL_TRIAL_TERM = Term(
-     reference=default_reference(PREFIX, "clinical-trial", name="clinical trial")
- ).append_parent(STUDY_TERM)
+ OBSERVATIONAL_INVESTIGATION_TERM = Term(
+     reference=Reference(prefix="obi", identifier="0003693", name="observational investigation")
+ ).append_parent(INVESTIGATION_TERM)
  
- INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term(
-     reference=default_reference(
-         PREFIX, "interventional-clinical-trial", name="interventional clinical trial"
-     )
- ).append_parent(CLINICAL_TRIAL_TERM)
+ CLINICAL_INVESTIGATION_TERM = Term(
+     reference=Reference(prefix="obi", identifier="0003697", name="clinical investigation")
+ ).append_parent(INVESTIGATION_TERM)
+ 
+ CLINICAL_TRIAL_TERM = Term(
+     reference=Reference(prefix="obi", identifier="0003699", name="clinical trial")
+ ).append_parent(CLINICAL_INVESTIGATION_TERM)
  
  RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term(
-     reference=default_reference(
-         PREFIX,
-         "randomized-interventional-clinical-trial",
-         name="randomized interventional clinical trial",
+     reference=Reference(
+         prefix="obi",
+         identifier="0004001",
+         name="randomized clinical trial",
      )
- ).append_parent(INTERVENTIONAL_CLINICAL_TRIAL_TERM)
+ ).append_parent(CLINICAL_TRIAL_TERM)
  
  NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM = Term(
-     reference=default_reference(
-         PREFIX,
-         "non-randomized-interventional-clinical-trial",
-         name="non-randomized interventional clinical trial",
-     )
- ).append_parent(INTERVENTIONAL_CLINICAL_TRIAL_TERM)
- 
- OBSERVATIONAL_CLINICAL_TRIAL_TERM = Term(
-     reference=default_reference(
-         PREFIX, "observational-clinical-trial", name="observational clinical trial"
+     reference=Reference(
+         prefix="obi",
+         identifier="0004002",
+         name="non-randomized clinical trial",
      )
  ).append_parent(CLINICAL_TRIAL_TERM)
  
+ # TODO request OBI term
  EXPANDED_ACCESS_STUDY_TERM = Term(
      reference=default_reference(PREFIX, "expanded-access-study", name="expanded access study")
- ).append_parent(STUDY_TERM)
+ ).append_parent(INVESTIGATION_TERM)
  
  TERMS = [
-     STUDY_TERM,
+     INVESTIGATION_TERM,
+     CLINICAL_INVESTIGATION_TERM,
+     OBSERVATIONAL_INVESTIGATION_TERM,
      CLINICAL_TRIAL_TERM,
-     OBSERVATIONAL_CLINICAL_TRIAL_TERM,
-     INTERVENTIONAL_CLINICAL_TRIAL_TERM,
      EXPANDED_ACCESS_STUDY_TERM,
      RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
      NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
@@ -79,13 +78,13 @@ TERMS = [
  # types in ClinicalTrials.gov. See summary script at
  # https://gist.github.com/cthoyt/12a3cb3c63ad68d73fe5a2f0d506526f
  PARENTS: dict[tuple[str | None, str | None], Term] = {
-     ("INTERVENTIONAL", None): INTERVENTIONAL_CLINICAL_TRIAL_TERM,
-     ("INTERVENTIONAL", "NA"): INTERVENTIONAL_CLINICAL_TRIAL_TERM,
+     ("INTERVENTIONAL", None): CLINICAL_TRIAL_TERM,
+     ("INTERVENTIONAL", "NA"): CLINICAL_TRIAL_TERM,
      ("INTERVENTIONAL", "RANDOMIZED"): RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
      ("INTERVENTIONAL", "NON_RANDOMIZED"): NON_RANDOMIZED_INTERVENTIONAL_CLINICAL_TRIAL_TERM,
-     ("OBSERVATIONAL", None): OBSERVATIONAL_CLINICAL_TRIAL_TERM,
+     ("OBSERVATIONAL", None): OBSERVATIONAL_INVESTIGATION_TERM,
      ("EXPANDED_ACCESS", None): EXPANDED_ACCESS_STUDY_TERM,
-     (None, None): STUDY_TERM,
+     (None, None): INVESTIGATION_TERM,
  }
  
  
@@ -95,7 +94,7 @@ class ClinicalTrialsGetter(Obo):
      ontology = PREFIX
      dynamic_version = True
      typedefs = [has_contributor, INVESTIGATES_CONDITION, HAS_INTERVENTION]
-     root_terms = [STUDY_TERM.reference]
+     root_terms = [INVESTIGATION_TERM.reference]
  
      def iter_terms(self, force: bool = False) -> Iterable[Term]:
          """Iterate over terms for studies."""
pyobo/sources/complexportal.py CHANGED
@@ -57,6 +57,7 @@ SPECIES = [
  DTYPE = {
      "taxonomy_id": str,
  }
+ ROOT = Reference(prefix="go", identifier="0032991", name="macromolecular complex")
  
  
  def _parse_members(s) -> list[tuple[Reference, str]]:
@@ -157,10 +158,12 @@ class ComplexPortalGetter(Obo):
  
      bioversions_key = ontology = PREFIX
      typedefs = [from_species, has_part, has_citation]
+     root_terms = [ROOT]
  
      def iter_terms(self, force: bool = False) -> Iterable[Term]:
          """Iterate over terms in the ontology."""
-         return get_terms(version=self._version_or_raise)
+         yield Term(reference=ROOT)
+         yield from get_terms(version=self._version_or_raise)
  
  
  def get_df(version: str, force: bool = False) -> pd.DataFrame:
@@ -232,6 +235,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
              definition=definition.strip() if pd.notna(definition) else None,
              synonyms=[Synonym(name=alias) for alias in aliases],
          )
+         term.append_parent(ROOT)
          for reference, note in xrefs:
              if note == "identity":
                  term.append_xref(reference)
pyobo/sources/hgnc/hgnc.py CHANGED
@@ -7,6 +7,7 @@ import typing
  from collections import Counter, defaultdict
  from collections.abc import Iterable
  
+ import pydantic
  from tabulate import tabulate
  from tqdm.auto import tqdm
  
@@ -280,7 +281,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
                      continue  # only add concrete annotations
                  term.append_relationship(
                      gene_product_member_of,
-                     Reference(prefix="ec", identifier=ec_code),
+                     Reference(prefix="ec", identifier=ec_code.strip()),
                  )
          for rna_central_ids in entry.pop("rna_central_id", []):
              for rna_central_id in rna_central_ids.split(","):
@@ -314,7 +315,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
          )
          for mgi_curie in entry.pop("mgd_id", []):
              if not mgi_curie.startswith("MGI:"):
-                 tqdm.write(f"hgnc:{identifier} had bad MGI CURIE: {mgi_curie}")
+                 tqdm.write(f"[hgnc:{identifier}] had bad MGI CURIE: {mgi_curie}")
                  continue
              mgi_id = mgi_curie[len("MGI:") :]
              if not mgi_id:
@@ -335,7 +336,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
                      Reference(prefix="iuphar.ligand", identifier=iuphar[len("ligandId:") :])
                  )
              else:
-                 tqdm.write(f"unhandled IUPHAR: {iuphar}")
+                 tqdm.write(f"[hgnc:{identifier}] unhandled IUPHAR: {iuphar}")
  
          for lrg_info in entry.pop("lsdb", []):
              if lrg_info.startswith("LRG_"):
@@ -360,9 +361,15 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
              xref_identifiers = [i.strip(".") for i in xref_identifiers]
  
              if len(xref_identifiers) == 1:
-                 term.append_exact_match(
-                     Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
-                 )
+                 try:
+                     xref = Reference(prefix=xref_prefix, identifier=str(xref_identifiers[0]))
+                 except pydantic.ValidationError:
+                     tqdm.write(
+                         f"[hgnc:{identifier}] had bad {key} xref: {xref_prefix}:{xref_identifiers[0]}"
+                     )
+                     continue
+                 else:
+                     term.append_exact_match(xref)
              else:
                  for xref_identifier in xref_identifiers:
                      term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
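The new try/except relies on the fact that constructing a pyobo Reference runs pydantic validation, so a malformed xref coming out of the HGNC dump is logged and skipped rather than aborting the whole build. A standalone illustration of the same guard pattern; the helper name is hypothetical, and which concrete inputs actually fail depends on the validators pyobo installs for each prefix.

import pydantic

from pyobo.struct import Reference

def safe_reference(prefix: str, identifier: str) -> Reference | None:
    """Return a Reference, or None if pyobo's pydantic validators reject the input."""
    try:
        return Reference(prefix=prefix, identifier=identifier)
    except pydantic.ValidationError as exc:
        print(f"skipping bad xref {prefix}:{identifier}: {exc}")
        return None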
pyobo/sources/iana_media_type.py ADDED
@@ -0,0 +1,100 @@
+ """An ontology representation of IANA media types (i.e. MIME types).
+ 
+ .. seealso:: https://www.iana.org/assignments/media-types/media-types.xhtml
+ """
+ 
+ from collections.abc import Iterable
+ 
+ from pyobo import Obo, Reference, Term, default_reference
+ from pyobo.struct.typedef import term_replaced_by
+ from pyobo.utils.path import ensure_df
+ 
+ __all__ = ["IANAGetter"]
+ 
+ PREFIX = "iana.mediatype"
+ ROOT = Term.from_triple(prefix="dcterms", identifier="MediaType", name="media type")
+ 
+ #: The top-level types listed on https://www.iana.org/assignments/media-types/media-types.xhtml
+ MEDIA_TYPE_GROUPS = [
+     "application",
+     "audio",
+     "font",
+     "haptics",
+     "image",
+     "message",
+     "model",
+     "multipart",
+     "text",
+     "video",
+ ]
+ 
+ GROUP_TO_CSV = {
+     media_type_group: (
+         f"https://www.iana.org/assignments/media-types/{media_type_group}.csv",
+         Term(reference=default_reference(PREFIX, media_type_group, media_type_group)).append_parent(
+             ROOT
+         ),
+     )
+     for media_type_group in MEDIA_TYPE_GROUPS
+ }
+ 
+ 
+ class IANAGetter(Obo):
+     """An ontology representation of IANA media types (i.e. MIME types)."""
+ 
+     ontology = bioregistry_key = PREFIX
+     name = "IANA Media Types"
+     dynamic_version = True
+     root_terms = [t.reference for _, (_, t) in sorted(GROUP_TO_CSV.items())]
+     typedefs = [
+         term_replaced_by,
+     ]
+ 
+     def iter_terms(self, force: bool = False) -> Iterable[Term]:
+         """Iterate over terms in the ontology."""
+         return get_terms()
+ 
+ 
+ def get_terms() -> list[Term]:
+     """Get IANA Media Type terms."""
+     terms: dict[str, Term] = {}
+     forwards: dict[Term, str] = {}
+     for key, (url, parent) in GROUP_TO_CSV.items():
+         df = ensure_df(PREFIX, url=url, sep=",")
+         terms[key] = parent
+         for name, identifier, references in df.values:
+             if "OBSOLE" in name or "DEPRECATED" in name:
+                 is_obsolete = True
+             else:
+                 is_obsolete = None
+             term = Term(
+                 reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
+                 is_obsolete=is_obsolete,
+             ).append_parent(parent)
+             for reference in _process_references(references):
+                 term.append_see_also_uri(reference)
+             terms[identifier.casefold()] = term
+ 
+             if "in favor of" in name:
+                 _, _, new = name.partition("in favor of ")
+                 forwards[term] = new.casefold().strip().rstrip(")").strip()
+ 
+     for old, new in forwards.items():
+         if new == "vnd.afpc.afplinedata":
+             new = "application/vnd.afpc.afplinedata"
+         old.append_replaced_by(terms[new].reference)
+ 
+     return list(terms.values())
+ 
+ 
+ def _process_references(cell: str) -> list[str]:
+     rv = []
+     for part in cell.split("]["):
+         part = part.strip("[").strip("]")
+         if part.startswith("RFC"):
+             rv.append(f"https://www.iana.org/go/rfc{part.removeprefix('RFC')}")
+     return rv
+ 
+ 
+ if __name__ == "__main__":
+     IANAGetter.cli()
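A worked example of the reference parsing above, assuming the IANA CSV stores its Reference column as bracketed tokens such as "[RFC1866][RFC2854]"; only RFC tokens are turned into iana.org redirect URLs, and anything else is dropped.

assert _process_references("[RFC1866][RFC2854]") == [
    "https://www.iana.org/go/rfc1866",
    "https://www.iana.org/go/rfc2854",
]
assert _process_references("[W3C][RFC7303]") == ["https://www.iana.org/go/rfc7303"]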
pyobo/sources/mesh.py CHANGED
@@ -10,12 +10,12 @@ from pathlib import Path
  from typing import Any
  from xml.etree.ElementTree import Element
  
+ import bioversions
  from lxml import etree
  from tqdm.auto import tqdm
  
- from pyobo.api.utils import safe_get_version
  from pyobo.identifier_utils import standardize_ec
- from pyobo.struct import Obo, Reference, Synonym, Term
+ from pyobo.struct import Obo, Reference, Synonym, Term, default_reference
  from pyobo.utils.cache import cached_json, cached_mapping
  from pyobo.utils.path import ensure_path, prefix_directory_join
  
@@ -31,6 +31,37 @@ PREFIX = "mesh"
  NOW_YEAR = str(datetime.datetime.now().year)
  CAS_RE = re.compile(r"^\d{1,7}\-\d{2}\-\d$")
  UNII_RE = re.compile(r"[0-9A-Za-z]{10}$")
+ SUPPLEMENT_PARENT = default_reference(
+     prefix=PREFIX, identifier="supplemental-record", name="supplemental records"
+ )
+ 
+ #: A mapping from tree header letters to labels
+ #:
+ #: .. seealso:: https://meshb-prev.nlm.nih.gov/treeView
+ TREE_HEADER_TO_NAME = {
+     "A": "Anatomy",
+     "B": "Organisms",
+     "C": "Diseases",
+     "D": "Chemicals and Drugs",
+     "E": "Analytical, Diagnostic and Therapeutic Techniques, and Equipment",
+     "F": "Psychiatry and Psychology",
+     "G": "Phenomena and Processes",
+     "H": "Disciplines and Occupations",
+     "I": "Anthropology, Education, Sociology, and Social Phenomena",
+     "J": "Technology, Industry, and Agriculture",
+     "K": "Humanities",
+     "L": "Information Science",
+     "M": "Named Groups",
+     "N": "Health Care",
+     "V": "Publication Characteristics",
+     "Z": "Geographicals",
+ }
+ 
+ #: A mapping from tree header letters to term objects
+ TREE_HEADERS: dict[str, Reference] = {
+     letter: default_reference(prefix=PREFIX, identifier=letter, name=name)
+     for letter, name in TREE_HEADER_TO_NAME.items()
+ }
  
  
  def _get_xml_root(path: Path) -> Element:
@@ -46,13 +77,20 @@ class MeSHGetter(Obo):
      """An ontology representation of the Medical Subject Headings."""
  
      ontology = bioversions_key = PREFIX
+     root_terms = [
+         SUPPLEMENT_PARENT,
+         *TREE_HEADERS.values(),
+     ]
  
      def _get_version(self) -> str | None:
          return NOW_YEAR
  
      def iter_terms(self, force: bool = False) -> Iterable[Term]:
          """Iterate over terms in the ontology."""
-         return get_terms(version=self._version_or_raise, force=force)
+         yield Term(reference=SUPPLEMENT_PARENT)
+         for x in TREE_HEADERS.values():
+             yield Term(reference=x)
+         yield from get_terms(version=self._version_or_raise, force=force)
  
  
  def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
@@ -74,21 +112,21 @@ def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
      return _inner()
  
  
- def get_terms(version: str, force: bool = False) -> Iterable[Term]:
+ def get_terms(version: str, *, force: bool = False) -> Iterable[Term]:
      """Get MeSH OBO terms."""
      mesh_id_to_term: dict[str, Term] = {}
  
-     descriptors = ensure_mesh_descriptors(version=version, force=force)
+     descriptor_records = ensure_mesh_descriptors(version=version, force=force)
      supplemental_records = ensure_mesh_supplemental_records(version=version, force=force)
  
-     for entry in itt.chain(descriptors, supplemental_records):
-         identifier = entry["identifier"]
-         name = entry["name"]
-         definition = entry.get("scope_note")
+     for descriptor_record in itt.chain(descriptor_records, supplemental_records):
+         identifier = descriptor_record["identifier"]
+         name = descriptor_record["name"]
+         definition = descriptor_record.get("scope_note")
  
          xrefs: list[Reference] = []
          synonyms: set[str] = set()
-         for concept in entry["concepts"]:
+         for concept in descriptor_record["concepts"]:
              synonyms.add(concept["name"])
              for term in concept["terms"]:
                  synonyms.add(term["name"])
@@ -102,11 +140,23 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
              xrefs=xrefs,
          )
  
-     for entry in descriptors:
-         mesh_id_to_term[entry["identifier"]].parents = [
-             mesh_id_to_term[parent_descriptor_id].reference
-             for parent_descriptor_id in entry["parents"]
-         ]
+     for descriptor_record in descriptor_records:
+         term = mesh_id_to_term[descriptor_record["identifier"]]
+         for parent_descriptor_id in descriptor_record["parents"]:
+             term.append_parent(mesh_id_to_term[parent_descriptor_id])
+ 
+         # This takes care of terms that don't have any parents like
+         # Body Regions (https://meshb.nlm.nih.gov/record/ui?ui=D001829),
+         # which have the tree code A01 and need to point to a made-up
+         # term for "A"
+         for top_level_letter in descriptor_record["top_levels"]:
+             term.append_parent(TREE_HEADERS[top_level_letter])
+ 
+     # MeSH supplementary records' identifiers start with "C"
+     # and do not have a hierarchy assigned to them
+     for supplemental_record in supplemental_records:
+         term = mesh_id_to_term[supplemental_record["identifier"]]
+         term.append_parent(SUPPLEMENT_PARENT)
  
      return mesh_id_to_term.values()
  
@@ -153,7 +203,7 @@ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> list[
      return _inner()  # type:ignore
  
  
- def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict[str, Any]]:
+ def get_descriptor_records(element: Element, id_key: str, name_key: str) -> list[dict[str, Any]]:
      """Get MeSH descriptor records."""
      logger.info("extract MeSH descriptors, concepts, and terms")
  
@@ -164,7 +214,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
      logger.debug(f"got {len(rv)} descriptors")
  
      # cache tree numbers
-     tree_number_to_descriptor_ui = {
+     tree_number_to_descriptor_ui: dict[str, str] = {
          tree_number: descriptor["identifier"]
          for descriptor in rv
          for tree_number in descriptor["tree_numbers"]
@@ -173,26 +223,29 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
  
      # add in parents to each descriptor based on their tree numbers
      for descriptor in rv:
+         top_levels = set()
          parents_descriptor_uis = set()
          for tree_number in descriptor["tree_numbers"]:
              try:
                  parent_tn, _self_tn = tree_number.rsplit(".", 1)
              except ValueError:
-                 logger.debug("No dot for %s", tree_number)
-                 continue
- 
-             parent_descriptor_ui = tree_number_to_descriptor_ui.get(parent_tn)
-             if parent_descriptor_ui is not None:
-                 parents_descriptor_uis.add(parent_descriptor_ui)
+                 # e.g., this happens for A01 (Body Regions)
+                 # https://meshb.nlm.nih.gov/record/ui?ui=D001829
+                 top_levels.add(tree_number[0])
              else:
-                 logger.debug("missing tree number: %s", parent_tn)
+                 parent_descriptor_ui = tree_number_to_descriptor_ui.get(parent_tn)
+                 if parent_descriptor_ui is not None:
+                     parents_descriptor_uis.add(parent_descriptor_ui)
+                 else:
+                     tqdm.write(f"missing tree number: {parent_tn}")
  
-         descriptor["parents"] = list(parents_descriptor_uis)
+         descriptor["parents"] = sorted(parents_descriptor_uis)
+         descriptor["top_levels"] = sorted(top_levels)
  
      return rv
  
  
- def get_scope_note(descriptor_record) -> str | None:
+ def get_scope_note(descriptor_record: Mapping[str, Any] | list[Mapping[str, Any]]) -> str | None:
      """Get the scope note from the preferred concept in a term's record."""
      if isinstance(descriptor_record, dict):
          # necessary for pre-2023 data
@@ -221,7 +274,7 @@ def get_descriptor_record(
      """
      concepts = get_concept_records(element)
      scope_note = get_scope_note(concepts)
-     rv = {
+     rv: dict[str, Any] = {
          "identifier": element.findtext(id_key),
          "name": element.findtext(name_key),
          "tree_numbers": sorted(
@@ -298,7 +351,7 @@ def get_term_records(element: Element) -> list[Mapping[str, Any]]:
      return [get_term_record(term) for term in element.findall("TermList/Term")]
  
  
- def get_term_record(element) -> Mapping[str, Any]:
+ def get_term_record(element: Element) -> Mapping[str, Any]:
      """Get a single MeSH term record."""
      return {
          "term_ui": element.findtext("TermUI"),
@@ -363,7 +416,7 @@ def get_mesh_category_references(
      https://meshb.nlm.nih.gov/treeView
      """
      if version is None:
-         version = safe_get_version("mesh")
+         version = bioversions.get_version("mesh", strict=True)
      tree_to_mesh = get_tree_to_mesh_id(version=version)
      rv = []
      for i in range(1, 100):
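To make the new parent logic concrete: a MeSH tree number containing a dot still resolves to the descriptor owning everything before the last dot, while a dotless top-level code now falls back to the synthetic letter header. A small illustration; the mapping below is a made-up one-entry slice, since the real data comes from the descriptor XML.

tree_number_to_descriptor_ui = {"A01": "D001829"}  # hypothetical slice of the real mapping

def describe_parent(tree_number: str) -> str:
    try:
        parent_tn, _ = tree_number.rsplit(".", 1)
    except ValueError:
        # dotless codes such as "A01" hang off the letter header, e.g. "A" (Anatomy)
        return f"tree header {tree_number[0]!r}"
    return f"descriptor {tree_number_to_descriptor_ui.get(parent_tn, '?')}"

print(describe_parent("A01.111"))  # descriptor D001829 (Body Regions)
print(describe_parent("A01"))      # tree header 'A'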
pyobo/sources/reactome.py CHANGED
@@ -22,6 +22,7 @@ __all__ = [
  logger = logging.getLogger(__name__)
  
  PREFIX = "reactome"
+ ROOT = Reference(prefix="pw", identifier="0000001", name="pathway")
  
  
  # TODO alt ids https://reactome.org/download/current/reactome_stable_ids.txt
@@ -32,10 +33,12 @@ class ReactomeGetter(Obo):
  
      ontology = bioversions_key = PREFIX
      typedefs = [from_species, has_participant, has_citation]
+     root_terms = [ROOT]
  
      def iter_terms(self, force: bool = False) -> Iterable[Term]:
          """Iterate over terms in the ontology."""
-         return iter_terms(version=self._version_or_raise, force=force)
+         yield Term(reference=ROOT)
+         yield from iter_terms(version=self._version_or_raise, force=force)
  
  
  def ensure_participant_df(version: str, force: bool = False) -> pd.DataFrame:
@@ -87,6 +90,10 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
      for parent_id, child_id in hierarchy_df.values:
          terms[child_id].append_parent(terms[parent_id])
  
+     for term in terms.values():
+         if not term.parents:
+             term.append_parent(ROOT)
+ 
      uniprot_pathway_df = ensure_participant_df(version=version, force=force)
      for uniprot_id, reactome_id in tqdm(
          uniprot_pathway_df.values,
@@ -102,7 +109,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
              reference = Reference(prefix="uniprot.isoform", identifier=uniprot_id)
          else:
              reference = Reference(prefix="uniprot", identifier=uniprot_id)
-         terms[reactome_id].append_relationship(has_participant, reference)
+         terms[reactome_id].annotate_object(has_participant, reference)
  
      chebi_pathway_url = f"https://reactome.org/download/{version}/ChEBI2Reactome_All_Levels.txt"
      chebi_pathway_df = ensure_df(
@@ -122,7 +129,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
          if reactome_id not in terms:
              tqdm.write(f"{reactome_id} appears in chebi participants file but not pathways file")
              continue
-         terms[reactome_id].append_relationship(
+         terms[reactome_id].annotate_object(
              has_participant, Reference(prefix="chebi", identifier=chebi_id)
          )