pyobo 0.11.2__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (228)
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -117
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +107 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +210 -160
  20. pyobo/cli/database_utils.py +155 -0
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +209 -191
  25. pyobo/gilda_utils.py +52 -250
  26. pyobo/identifier_utils/__init__.py +33 -0
  27. pyobo/identifier_utils/api.py +305 -0
  28. pyobo/identifier_utils/preprocessing.json +873 -0
  29. pyobo/identifier_utils/preprocessing.py +27 -0
  30. pyobo/identifier_utils/relations/__init__.py +8 -0
  31. pyobo/identifier_utils/relations/api.py +162 -0
  32. pyobo/identifier_utils/relations/data.json +5824 -0
  33. pyobo/identifier_utils/relations/data_owl.json +57 -0
  34. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  35. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  36. pyobo/mocks.py +9 -6
  37. pyobo/ner/__init__.py +9 -0
  38. pyobo/ner/api.py +72 -0
  39. pyobo/ner/normalizer.py +33 -0
  40. pyobo/obographs.py +48 -40
  41. pyobo/plugins.py +5 -4
  42. pyobo/py.typed +0 -0
  43. pyobo/reader.py +1354 -395
  44. pyobo/reader_utils.py +155 -0
  45. pyobo/resource_utils.py +42 -22
  46. pyobo/resources/__init__.py +0 -0
  47. pyobo/resources/goc.py +75 -0
  48. pyobo/resources/goc.tsv +188 -0
  49. pyobo/resources/ncbitaxon.py +4 -5
  50. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  51. pyobo/resources/ro.py +3 -2
  52. pyobo/resources/ro.tsv +0 -0
  53. pyobo/resources/so.py +0 -0
  54. pyobo/resources/so.tsv +0 -0
  55. pyobo/sources/README.md +12 -8
  56. pyobo/sources/__init__.py +52 -29
  57. pyobo/sources/agrovoc.py +0 -0
  58. pyobo/sources/antibodyregistry.py +11 -12
  59. pyobo/sources/bigg/__init__.py +13 -0
  60. pyobo/sources/bigg/bigg_compartment.py +81 -0
  61. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  62. pyobo/sources/bigg/bigg_model.py +46 -0
  63. pyobo/sources/bigg/bigg_reaction.py +77 -0
  64. pyobo/sources/biogrid.py +1 -2
  65. pyobo/sources/ccle.py +7 -12
  66. pyobo/sources/cgnc.py +9 -6
  67. pyobo/sources/chebi.py +1 -1
  68. pyobo/sources/chembl/__init__.py +9 -0
  69. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  70. pyobo/sources/chembl/chembl_target.py +160 -0
  71. pyobo/sources/civic_gene.py +55 -15
  72. pyobo/sources/clinicaltrials.py +160 -0
  73. pyobo/sources/complexportal.py +24 -24
  74. pyobo/sources/conso.py +14 -22
  75. pyobo/sources/cpt.py +0 -0
  76. pyobo/sources/credit.py +1 -9
  77. pyobo/sources/cvx.py +27 -5
  78. pyobo/sources/depmap.py +9 -12
  79. pyobo/sources/dictybase_gene.py +2 -7
  80. pyobo/sources/drugbank/__init__.py +9 -0
  81. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  82. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  83. pyobo/sources/drugcentral.py +17 -13
  84. pyobo/sources/expasy.py +31 -34
  85. pyobo/sources/famplex.py +13 -18
  86. pyobo/sources/flybase.py +8 -13
  87. pyobo/sources/gard.py +62 -0
  88. pyobo/sources/geonames/__init__.py +9 -0
  89. pyobo/sources/geonames/features.py +28 -0
  90. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  91. pyobo/sources/geonames/utils.py +115 -0
  92. pyobo/sources/gmt_utils.py +6 -7
  93. pyobo/sources/go.py +20 -13
  94. pyobo/sources/gtdb.py +154 -0
  95. pyobo/sources/gwascentral/__init__.py +9 -0
  96. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  97. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  98. pyobo/sources/hgnc/__init__.py +9 -0
  99. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  100. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  101. pyobo/sources/icd/__init__.py +9 -0
  102. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  103. pyobo/sources/icd/icd11.py +148 -0
  104. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  105. pyobo/sources/interpro.py +4 -9
  106. pyobo/sources/itis.py +0 -5
  107. pyobo/sources/kegg/__init__.py +0 -0
  108. pyobo/sources/kegg/api.py +16 -38
  109. pyobo/sources/kegg/genes.py +9 -20
  110. pyobo/sources/kegg/genome.py +1 -7
  111. pyobo/sources/kegg/pathway.py +9 -21
  112. pyobo/sources/mesh.py +58 -24
  113. pyobo/sources/mgi.py +3 -10
  114. pyobo/sources/mirbase/__init__.py +11 -0
  115. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  116. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  117. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  118. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  119. pyobo/sources/msigdb.py +74 -39
  120. pyobo/sources/ncbi/__init__.py +9 -0
  121. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  122. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  123. pyobo/sources/nih_reporter.py +60 -0
  124. pyobo/sources/nlm/__init__.py +9 -0
  125. pyobo/sources/nlm/nlm_catalog.py +48 -0
  126. pyobo/sources/nlm/nlm_publisher.py +36 -0
  127. pyobo/sources/nlm/utils.py +116 -0
  128. pyobo/sources/npass.py +6 -8
  129. pyobo/sources/omim_ps.py +11 -4
  130. pyobo/sources/pathbank.py +4 -8
  131. pyobo/sources/pfam/__init__.py +9 -0
  132. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  133. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  134. pyobo/sources/pharmgkb/__init__.py +15 -0
  135. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  136. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  137. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  138. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  139. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  140. pyobo/sources/pharmgkb/utils.py +86 -0
  141. pyobo/sources/pid.py +1 -6
  142. pyobo/sources/pombase.py +6 -10
  143. pyobo/sources/pubchem.py +4 -9
  144. pyobo/sources/reactome.py +5 -11
  145. pyobo/sources/rgd.py +11 -16
  146. pyobo/sources/rhea.py +37 -36
  147. pyobo/sources/ror.py +69 -42
  148. pyobo/sources/selventa/__init__.py +0 -0
  149. pyobo/sources/selventa/schem.py +4 -7
  150. pyobo/sources/selventa/scomp.py +1 -6
  151. pyobo/sources/selventa/sdis.py +4 -7
  152. pyobo/sources/selventa/sfam.py +1 -6
  153. pyobo/sources/sgd.py +6 -11
  154. pyobo/sources/signor/__init__.py +7 -0
  155. pyobo/sources/signor/download.py +41 -0
  156. pyobo/sources/signor/signor_complexes.py +105 -0
  157. pyobo/sources/slm.py +12 -15
  158. pyobo/sources/umls/__init__.py +7 -1
  159. pyobo/sources/umls/__main__.py +0 -0
  160. pyobo/sources/umls/get_synonym_types.py +20 -4
  161. pyobo/sources/umls/sty.py +57 -0
  162. pyobo/sources/umls/synonym_types.tsv +1 -1
  163. pyobo/sources/umls/umls.py +18 -22
  164. pyobo/sources/unimod.py +46 -0
  165. pyobo/sources/uniprot/__init__.py +1 -1
  166. pyobo/sources/uniprot/uniprot.py +40 -32
  167. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  168. pyobo/sources/utils.py +3 -2
  169. pyobo/sources/wikipathways.py +7 -10
  170. pyobo/sources/zfin.py +5 -10
  171. pyobo/ssg/__init__.py +12 -16
  172. pyobo/ssg/base.html +0 -0
  173. pyobo/ssg/index.html +26 -13
  174. pyobo/ssg/term.html +12 -2
  175. pyobo/ssg/typedef.html +0 -0
  176. pyobo/struct/__init__.py +54 -8
  177. pyobo/struct/functional/__init__.py +1 -0
  178. pyobo/struct/functional/dsl.py +2572 -0
  179. pyobo/struct/functional/macros.py +423 -0
  180. pyobo/struct/functional/obo_to_functional.py +385 -0
  181. pyobo/struct/functional/ontology.py +272 -0
  182. pyobo/struct/functional/utils.py +112 -0
  183. pyobo/struct/reference.py +331 -136
  184. pyobo/struct/struct.py +1484 -657
  185. pyobo/struct/struct_utils.py +1078 -0
  186. pyobo/struct/typedef.py +162 -210
  187. pyobo/struct/utils.py +12 -5
  188. pyobo/struct/vocabulary.py +138 -0
  189. pyobo/utils/__init__.py +0 -0
  190. pyobo/utils/cache.py +16 -15
  191. pyobo/utils/io.py +51 -41
  192. pyobo/utils/iter.py +5 -5
  193. pyobo/utils/misc.py +41 -53
  194. pyobo/utils/ndex_utils.py +0 -0
  195. pyobo/utils/path.py +73 -70
  196. pyobo/version.py +3 -3
  197. pyobo-0.12.1.dist-info/METADATA +671 -0
  198. pyobo-0.12.1.dist-info/RECORD +201 -0
  199. pyobo-0.12.1.dist-info/WHEEL +4 -0
  200. {pyobo-0.11.2.dist-info → pyobo-0.12.1.dist-info}/entry_points.txt +1 -0
  201. pyobo-0.12.1.dist-info/licenses/LICENSE +21 -0
  202. pyobo/aws.py +0 -162
  203. pyobo/cli/aws.py +0 -47
  204. pyobo/identifier_utils.py +0 -142
  205. pyobo/normalizer.py +0 -232
  206. pyobo/registries/__init__.py +0 -16
  207. pyobo/registries/metaregistry.json +0 -507
  208. pyobo/registries/metaregistry.py +0 -135
  209. pyobo/sources/icd11.py +0 -105
  210. pyobo/xrefdb/__init__.py +0 -1
  211. pyobo/xrefdb/canonicalizer.py +0 -214
  212. pyobo/xrefdb/priority.py +0 -59
  213. pyobo/xrefdb/sources/__init__.py +0 -60
  214. pyobo/xrefdb/sources/biomappings.py +0 -36
  215. pyobo/xrefdb/sources/cbms2019.py +0 -91
  216. pyobo/xrefdb/sources/chembl.py +0 -83
  217. pyobo/xrefdb/sources/compath.py +0 -82
  218. pyobo/xrefdb/sources/famplex.py +0 -64
  219. pyobo/xrefdb/sources/gilda.py +0 -50
  220. pyobo/xrefdb/sources/intact.py +0 -113
  221. pyobo/xrefdb/sources/ncit.py +0 -133
  222. pyobo/xrefdb/sources/pubchem.py +0 -27
  223. pyobo/xrefdb/sources/wikidata.py +0 -116
  224. pyobo/xrefdb/xrefs_pipeline.py +0 -180
  225. pyobo-0.11.2.dist-info/METADATA +0 -711
  226. pyobo-0.11.2.dist-info/RECORD +0 -157
  227. pyobo-0.11.2.dist-info/WHEEL +0 -5
  228. pyobo-0.11.2.dist-info/top_level.txt +0 -1
pyobo/reader.py CHANGED
@@ -1,33 +1,59 @@
 """OBO Readers."""
 
+from __future__ import annotations
+
 import logging
+import typing as t
+from collections import Counter
 from collections.abc import Iterable, Mapping
 from datetime import datetime
+from io import StringIO
 from pathlib import Path
-from typing import Any, Optional, Union
+from textwrap import dedent
+from typing import Any
 
 import bioregistry
 import networkx as nx
+from curies import ReferenceTuple
+from curies.preprocessing import BlocklistError
+from curies.vocabulary import SynonymScope
 from more_itertools import pairwise
 from tqdm.auto import tqdm
 
 from .constants import DATE_FORMAT, PROVENANCE_PREFIXES
-from .identifier_utils import MissingPrefixError, normalize_curie
-from .registries import curie_has_blacklisted_prefix, curie_is_blacklisted, remap_prefix
+from .identifier_utils import (
+    NotCURIEError,
+    ParseError,
+    UnparsableIRIError,
+    _is_valid_identifier,
+    _parse_str_or_curie_or_uri_helper,
+    get_rules,
+)
+from .reader_utils import (
+    _chomp_axioms,
+    _chomp_references,
+    _chomp_specificity,
+    _chomp_typedef,
+    _parse_provenance_list,
+)
 from .struct import (
     Obo,
     Reference,
     Synonym,
-    SynonymSpecificities,
-    SynonymSpecificity,
     SynonymTypeDef,
     Term,
     TypeDef,
+    default_reference,
     make_ad_hoc_ontology,
 )
-from .struct.struct import DEFAULT_SYNONYM_TYPE
-from .struct.typedef import default_typedefs, develops_from, has_part, part_of
-from .utils.misc import cleanup_version
+from .struct import vocabulary as v
+from .struct.reference import OBOLiteral, _obo_parse_identifier
+from .struct.struct_utils import Annotation, Stanza
+from .struct.typedef import comment as has_comment
+from .struct.typedef import default_typedefs, has_ontology_root_term
+from .utils.cache import write_gzipped_graph
+from .utils.io import safe_open
+from .utils.misc import STATIC_VERSION_REWRITES, cleanup_version
 
 __all__ = [
     "from_obo_path",
@@ -36,369 +62,1026 @@ __all__ = [
36
62
 
37
63
  logger = logging.getLogger(__name__)
38
64
 
39
- # FIXME use bioontologies
40
- # RELATION_REMAPPINGS: Mapping[str, Tuple[str, str]] = bioontologies.upgrade.load()
41
- RELATION_REMAPPINGS: Mapping[str, tuple[str, str]] = {
42
- "part_of": part_of.pair,
43
- "has_part": has_part.pair,
44
- "develops_from": develops_from.pair,
45
- "seeAlso": ("rdf", "seeAlso"),
46
- "dc-contributor": ("dc", "contributor"),
47
- "dc-creator": ("dc", "creator"),
48
- }
49
-
50
65
 
51
66
  def from_obo_path(
52
- path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True, **kwargs
67
+ path: str | Path,
68
+ prefix: str | None = None,
69
+ *,
70
+ strict: bool = False,
71
+ version: str | None,
72
+ upgrade: bool = True,
73
+ use_tqdm: bool = False,
74
+ ignore_obsolete: bool = False,
75
+ _cache_path: Path | None = None,
53
76
  ) -> Obo:
54
77
  """Get the OBO graph from a path."""
55
- import obonet
56
-
57
- logger.info("[%s] parsing with obonet from %s", prefix or "", path)
58
- with open(path) as file:
59
- graph = obonet.read_obo(
60
- tqdm(
61
- file,
62
- unit_scale=True,
63
- desc=f'[{prefix or ""}] parsing obo',
64
- disable=None,
65
- leave=False,
66
- )
67
- )
78
+ path = Path(path).expanduser().resolve()
79
+ if path.suffix.endswith(".zip"):
80
+ import io
81
+ import zipfile
82
+
83
+ logger.info("[%s] parsing zipped OBO with obonet from %s", prefix or "<unknown>", path)
84
+ with zipfile.ZipFile(path) as zf:
85
+ with zf.open(path.name.removesuffix(".zip"), "r") as file:
86
+ content = file.read().decode("utf-8")
87
+ graph = _read_obo(
88
+ io.StringIO(content), prefix, ignore_obsolete=ignore_obsolete, use_tqdm=use_tqdm
89
+ )
90
+ else:
91
+ logger.info("[%s] parsing OBO with obonet from %s", prefix or "<unknown>", path)
92
+ with safe_open(path, read=True) as file:
93
+ graph = _read_obo(file, prefix, ignore_obsolete=ignore_obsolete, use_tqdm=use_tqdm)
68
94
 
69
95
  if prefix:
70
96
  # Make sure the graph is named properly
71
97
  _clean_graph_ontology(graph, prefix)
72
98
 
99
+ if _cache_path:
100
+ logger.info("[%s] writing obonet cache to %s", prefix, _cache_path)
101
+ write_gzipped_graph(path=_cache_path, graph=graph)
102
+
73
103
  # Convert to an Obo instance and return
74
- return from_obonet(graph, strict=strict, **kwargs)
104
+ return from_obonet(graph, strict=strict, version=version, upgrade=upgrade, use_tqdm=use_tqdm)
105
+
106
+
107
+ def _read_obo(
108
+ filelike, prefix: str | None, ignore_obsolete: bool, use_tqdm: bool = True
109
+ ) -> nx.MultiDiGraph:
110
+ import obonet
111
+
112
+ return obonet.read_obo(
113
+ tqdm(
114
+ filelike,
115
+ unit_scale=True,
116
+ desc=f"[{prefix or ''}] parsing OBO",
117
+ disable=not use_tqdm,
118
+ leave=True,
119
+ ),
120
+ ignore_obsolete=ignore_obsolete,
121
+ )
75
122
 
76
123
 
77
- def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> "Obo":
124
+ def _normalize_prefix_strict(prefix: str) -> str:
125
+ n = bioregistry.normalize_prefix(prefix)
126
+ if n is None:
127
+ raise ValueError(f"unknown prefix: {prefix}")
128
+ return n
129
+
130
+
131
+ def from_str(
132
+ text: str,
133
+ *,
134
+ strict: bool = False,
135
+ version: str | None = None,
136
+ upgrade: bool = True,
137
+ ignore_obsolete: bool = False,
138
+ use_tqdm: bool = False,
139
+ ) -> Obo:
140
+ """Read an ontology from a string representation."""
141
+ import obonet
142
+
143
+ text = dedent(text).strip()
144
+ io = StringIO()
145
+ io.write(text)
146
+ io.seek(0)
147
+ graph = obonet.read_obo(io, ignore_obsolete=ignore_obsolete)
148
+ return from_obonet(graph, strict=strict, version=version, upgrade=upgrade, use_tqdm=use_tqdm)
149
+
150
+
151
+ def from_obonet(
152
+ graph: nx.MultiDiGraph,
153
+ *,
154
+ strict: bool = False,
155
+ version: str | None = None,
156
+ upgrade: bool = True,
157
+ use_tqdm: bool = False,
158
+ ) -> Obo:
78
159
  """Get all of the terms from a OBO graph."""
79
- _ontology = graph.graph["ontology"]
80
- ontology = bioregistry.normalize_prefix(_ontology) # probably always okay
81
- if ontology is None:
82
- raise ValueError(f"unknown prefix: {_ontology}")
83
- logger.info("[%s] extracting OBO using obonet", ontology)
84
-
85
- date = _get_date(graph=graph, ontology=ontology)
86
- name = _get_name(graph=graph, ontology=ontology)
87
-
88
- data_version = graph.graph.get("data-version")
89
- if not data_version:
90
- if date is not None:
91
- data_version = date.strftime("%Y-%m-%d")
92
- logger.info(
93
- "[%s] does not report a version. falling back to date: %s",
94
- ontology,
95
- data_version,
96
- )
97
- else:
98
- logger.warning("[%s] does not report a version nor a date", ontology)
99
- else:
100
- data_version = cleanup_version(data_version=data_version, prefix=ontology)
101
- if data_version is not None:
102
- logger.info("[%s] using version %s", ontology, data_version)
103
- elif date is not None:
104
- logger.info(
105
- "[%s] unrecognized version format, falling back to date: %s",
106
- ontology,
107
- data_version,
108
- )
109
- data_version = date.strftime("%Y-%m-%d")
110
- else:
111
- logger.warning(
112
- "[%s] UNRECOGNIZED VERSION FORMAT AND MISSING DATE: %s", ontology, data_version
113
- )
160
+ ontology_prefix_raw = graph.graph["ontology"]
161
+ ontology_prefix = _normalize_prefix_strict(ontology_prefix_raw)
162
+ logger.info("[%s] extracting OBO using obonet", ontology_prefix)
163
+
164
+ date = _get_date(graph=graph, ontology_prefix=ontology_prefix)
165
+ name = _get_name(graph=graph, ontology_prefix=ontology_prefix)
166
+ imports = graph.graph.get("import")
114
167
 
168
+ macro_config = MacroConfig(graph.graph, strict=strict, ontology_prefix=ontology_prefix)
169
+
170
+ data_version = _clean_graph_version(
171
+ graph, ontology_prefix=ontology_prefix, version=version, date=date
172
+ )
115
173
  if data_version and "/" in data_version:
116
- raise ValueError(f"[{ontology}] will not accept slash in data version: {data_version}")
117
-
118
- #: Parsed CURIEs to references (even external ones)
119
- reference_it = (
120
- Reference(
121
- prefix=prefix,
122
- identifier=bioregistry.standardize_identifier(prefix, identifier),
123
- # if name isn't available, it means its external to this ontology
124
- name=data.get("name"),
174
+ raise ValueError(
175
+ f"[{ontology_prefix}] slashes not allowed in data versions because of filesystem usage: {data_version}"
125
176
  )
126
- for prefix, identifier, data in _iter_obo_graph(graph=graph, strict=strict)
127
- )
128
- references: Mapping[tuple[str, str], Reference] = {
129
- reference.pair: reference for reference in reference_it
130
- }
177
+
178
+ missing_typedefs: set[ReferenceTuple] = set()
179
+
180
+ subset_typedefs = _get_subsetdefs(graph.graph, ontology_prefix=ontology_prefix)
181
+
182
+ root_terms: list[Reference] = []
183
+ property_values: list[Annotation] = []
184
+ for ann in iterate_node_properties(
185
+ graph.graph,
186
+ ontology_prefix=ontology_prefix,
187
+ upgrade=upgrade,
188
+ node=Reference(prefix="obo", identifier=ontology_prefix),
189
+ strict=strict,
190
+ context="graph property",
191
+ ):
192
+ if ann.predicate.pair == has_ontology_root_term.pair:
193
+ match ann.value:
194
+ case OBOLiteral():
195
+ logger.warning(
196
+ "[%s] tried to use a literal as an ontology root: %s",
197
+ ontology_prefix,
198
+ ann.value.value,
199
+ )
200
+ continue
201
+ case Reference():
202
+ root_terms.append(ann.value)
203
+ else:
204
+ property_values.append(ann)
205
+
206
+ for remark in graph.graph.get("remark", []):
207
+ property_values.append(Annotation(has_comment.reference, OBOLiteral.string(remark)))
208
+
209
+ idspaces: dict[str, str] = {}
210
+ for x in graph.graph.get("idspace", []):
211
+ prefix, uri_prefix, *_ = (y.strip() for y in x.split(" ", 2))
212
+ idspaces[prefix] = uri_prefix
131
213
 
132
214
  #: CURIEs to typedefs
133
- typedefs: Mapping[tuple[str, str], TypeDef] = {
134
- typedef.pair: typedef for typedef in iterate_graph_typedefs(graph, ontology)
215
+ typedefs: Mapping[ReferenceTuple, TypeDef] = {
216
+ typedef.pair: typedef
217
+ for typedef in iterate_typedefs(
218
+ graph,
219
+ ontology_prefix=ontology_prefix,
220
+ strict=strict,
221
+ upgrade=upgrade,
222
+ macro_config=macro_config,
223
+ )
135
224
  }
136
225
 
137
- synonym_typedefs: Mapping[str, SynonymTypeDef] = {
138
- synonym_typedef.curie: synonym_typedef
139
- for synonym_typedef in iterate_graph_synonym_typedefs(graph, ontology=ontology)
226
+ synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] = {
227
+ synonym_typedef.pair: synonym_typedef
228
+ for synonym_typedef in iterate_graph_synonym_typedefs(
229
+ graph,
230
+ ontology_prefix=ontology_prefix,
231
+ strict=strict,
232
+ upgrade=upgrade,
233
+ )
140
234
  }
141
235
 
142
- missing_typedefs = set()
236
+ terms = _get_terms(
237
+ graph,
238
+ strict=strict,
239
+ ontology_prefix=ontology_prefix,
240
+ upgrade=upgrade,
241
+ typedefs=typedefs,
242
+ missing_typedefs=missing_typedefs,
243
+ synonym_typedefs=synonym_typedefs,
244
+ subset_typedefs=subset_typedefs,
245
+ macro_config=macro_config,
246
+ use_tqdm=use_tqdm,
247
+ )
248
+
249
+ return make_ad_hoc_ontology(
250
+ _ontology=ontology_prefix,
251
+ _name=name,
252
+ _auto_generated_by=graph.graph.get("auto-generated-by"),
253
+ _typedefs=list(typedefs.values()),
254
+ _synonym_typedefs=list(synonym_typedefs.values()),
255
+ _date=date,
256
+ _data_version=data_version,
257
+ _root_terms=root_terms,
258
+ terms=terms,
259
+ _property_values=property_values,
260
+ _subsetdefs=subset_typedefs,
261
+ _imports=imports,
262
+ _idspaces=idspaces,
263
+ )
264
+
265
+
266
+ def _get_terms(
267
+ graph,
268
+ *,
269
+ strict: bool,
270
+ ontology_prefix: str,
271
+ upgrade: bool,
272
+ typedefs: Mapping[ReferenceTuple, TypeDef],
273
+ synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
274
+ subset_typedefs,
275
+ missing_typedefs: set[ReferenceTuple],
276
+ macro_config: MacroConfig,
277
+ use_tqdm: bool = False,
278
+ ) -> list[Term]:
143
279
  terms = []
144
- n_alt_ids, n_parents, n_synonyms, n_relations, n_properties, n_xrefs = 0, 0, 0, 0, 0, 0
145
- for prefix, identifier, data in _iter_obo_graph(graph=graph, strict=strict):
146
- if prefix != ontology or not data:
280
+ for reference, data in _iter_obo_graph(
281
+ graph=graph,
282
+ strict=strict,
283
+ ontology_prefix=ontology_prefix,
284
+ use_tqdm=use_tqdm,
285
+ upgrade=upgrade,
286
+ ):
287
+ if reference.prefix != ontology_prefix:
288
+ continue
289
+ if not data:
290
+ # this allows us to skip anything that isn't really defined
291
+ # caveat: this misses terms that are just defined with an ID
147
292
  continue
148
293
 
149
- identifier = bioregistry.standardize_identifier(prefix, identifier)
150
- reference = references[ontology, identifier]
151
-
152
- try:
153
- node_xrefs = list(iterate_node_xrefs(prefix=prefix, data=data, strict=strict))
154
- except MissingPrefixError as e:
155
- e.reference = reference
156
- raise e
157
- xrefs, provenance = [], []
158
- for node_xref in node_xrefs:
159
- if node_xref.prefix in PROVENANCE_PREFIXES:
160
- provenance.append(node_xref)
161
- else:
162
- xrefs.append(node_xref)
163
- n_xrefs += len(xrefs)
294
+ term = Term(
295
+ reference=reference,
296
+ builtin=_get_boolean(data, "builtin"),
297
+ is_anonymous=_get_boolean(data, "is_anonymous"),
298
+ is_obsolete=_get_boolean(data, "is_obsolete"),
299
+ namespace=data.get("namespace"),
300
+ )
164
301
 
165
- definition, definition_references = get_definition(
166
- data, prefix=prefix, identifier=identifier
302
+ _process_alts(term, data, ontology_prefix=ontology_prefix, strict=strict)
303
+ _process_parents(term, data, ontology_prefix=ontology_prefix, strict=strict)
304
+ _process_synonyms(
305
+ term,
306
+ data,
307
+ ontology_prefix=ontology_prefix,
308
+ strict=strict,
309
+ upgrade=upgrade,
310
+ synonym_typedefs=synonym_typedefs,
311
+ )
312
+ _process_xrefs(
313
+ term,
314
+ data,
315
+ ontology_prefix=ontology_prefix,
316
+ strict=strict,
317
+ macro_config=macro_config,
318
+ upgrade=upgrade,
319
+ )
320
+ _process_properties(
321
+ term,
322
+ data,
323
+ ontology_prefix=ontology_prefix,
324
+ strict=strict,
325
+ upgrade=upgrade,
326
+ typedefs=typedefs,
167
327
  )
168
- if definition_references:
169
- provenance.extend(definition_references)
328
+ _process_relations(
329
+ term,
330
+ data,
331
+ ontology_prefix=ontology_prefix,
332
+ strict=strict,
333
+ upgrade=upgrade,
334
+ typedefs=typedefs,
335
+ missing_typedefs=missing_typedefs,
336
+ )
337
+ _process_replaced_by(term, data, ontology_prefix=ontology_prefix, strict=strict)
338
+ _process_subsets(term, data, ontology_prefix=ontology_prefix, strict=strict)
339
+ _process_intersection_of(term, data, ontology_prefix=ontology_prefix, strict=strict)
340
+ _process_union_of(term, data, ontology_prefix=ontology_prefix, strict=strict)
341
+ _process_equivalent_to(term, data, ontology_prefix=ontology_prefix, strict=strict)
342
+ _process_disjoint_from(term, data, ontology_prefix=ontology_prefix, strict=strict)
343
+ _process_consider(term, data, ontology_prefix=ontology_prefix, strict=strict)
344
+ _process_comment(term, data, ontology_prefix=ontology_prefix, strict=strict)
345
+ _process_description(term, data, ontology_prefix=ontology_prefix, strict=strict)
346
+ _process_creation_date(term, data)
170
347
 
171
- try:
172
- alt_ids = list(iterate_node_alt_ids(data, strict=strict))
173
- except MissingPrefixError as e:
174
- e.reference = reference
175
- raise e
176
- n_alt_ids += len(alt_ids)
348
+ terms.append(term)
349
+ return terms
177
350
 
178
- try:
179
- parents = list(
180
- iterate_node_parents(
181
- data,
182
- prefix=prefix,
183
- identifier=identifier,
184
- strict=strict,
185
- )
186
- )
187
- except MissingPrefixError as e:
188
- e.reference = reference
189
- raise e
190
- n_parents += len(parents)
191
351
 
192
- synonyms = list(
193
- iterate_node_synonyms(
194
- data,
195
- synonym_typedefs,
196
- prefix=prefix,
197
- identifier=identifier,
198
- strict=strict,
352
+ def _process_description(term: Stanza, data, *, ontology_prefix: str, strict: bool):
353
+ definition, definition_references = get_definition(
354
+ data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
355
+ )
356
+ term.definition = definition
357
+ if term.definition:
358
+ for definition_reference in definition_references:
359
+ term._append_annotation(
360
+ v.has_description,
361
+ OBOLiteral.string(term.definition),
362
+ Annotation(v.has_dbxref, definition_reference),
199
363
  )
364
+
365
+
366
+ def _process_comment(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
367
+ if comment := data.get("comment"):
368
+ term.append_comment(comment)
369
+
370
+
371
+ def _process_creation_date(term: Stanza, data) -> None:
372
+ date_str = data.get("creation_date")
373
+ if not date_str:
374
+ return
375
+ if isinstance(date_str, list):
376
+ date_str = date_str[0]
377
+ try:
378
+ term.append_creation_date(date_str)
379
+ except ValueError:
380
+ logger.warning("[%s] failed to parse creation_date: %s", term.reference.curie, date_str)
381
+
382
+
383
+ def _process_union_of(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
384
+ for reference in iterate_node_reference_tag(
385
+ "union_of", data=data, ontology_prefix=ontology_prefix, strict=strict, node=term.reference
386
+ ):
387
+ term.append_union_of(reference)
388
+
389
+
390
+ def _process_equivalent_to(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
391
+ for reference in iterate_node_reference_tag(
392
+ "equivalent_to",
393
+ data=data,
394
+ ontology_prefix=ontology_prefix,
395
+ strict=strict,
396
+ node=term.reference,
397
+ ):
398
+ term.append_equivalent_to(reference)
399
+
400
+
401
+ def _process_disjoint_from(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
402
+ for reference in iterate_node_reference_tag(
403
+ "disjoint_from",
404
+ data=data,
405
+ ontology_prefix=ontology_prefix,
406
+ strict=strict,
407
+ node=term.reference,
408
+ ):
409
+ term.append_disjoint_from(reference)
410
+
411
+
412
+ def _process_alts(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
413
+ for alt_reference in iterate_node_reference_tag(
414
+ "alt_id", data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
415
+ ):
416
+ term.append_alt(alt_reference)
417
+
418
+
419
+ def _process_parents(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
420
+ for tag in ["is_a", "instance_of"]:
421
+ for parent in iterate_node_reference_tag(
422
+ tag, data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
423
+ ):
424
+ term.append_parent(parent)
425
+
426
+
427
+ def _process_synonyms(
428
+ term: Stanza,
429
+ data,
430
+ *,
431
+ ontology_prefix: str,
432
+ strict: bool,
433
+ upgrade: bool,
434
+ synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
435
+ ) -> None:
436
+ synonyms = list(
437
+ iterate_node_synonyms(
438
+ data,
439
+ synonym_typedefs,
440
+ node=term.reference,
441
+ strict=strict,
442
+ ontology_prefix=ontology_prefix,
443
+ upgrade=upgrade,
200
444
  )
201
- n_synonyms += len(synonyms)
445
+ )
446
+ for synonym in synonyms:
447
+ term.append_synonym(synonym)
202
448
 
203
- term = Term(
204
- reference=reference,
205
- definition=definition,
206
- parents=parents,
207
- synonyms=synonyms,
208
- xrefs=xrefs,
209
- provenance=provenance,
210
- alt_ids=alt_ids,
449
+
450
+ def _process_xrefs(
451
+ term: Stanza,
452
+ data,
453
+ *,
454
+ ontology_prefix: str,
455
+ strict: bool,
456
+ macro_config: MacroConfig,
457
+ upgrade: bool,
458
+ ) -> None:
459
+ for reference, provenance in iterate_node_xrefs(
460
+ data=data,
461
+ strict=strict,
462
+ ontology_prefix=ontology_prefix,
463
+ node=term.reference,
464
+ upgrade=upgrade,
465
+ ):
466
+ _handle_xref(term, reference, provenance=provenance, macro_config=macro_config)
467
+
468
+
469
+ def _process_properties(
470
+ term: Stanza, data, *, ontology_prefix: str, strict: bool, upgrade: bool, typedefs
471
+ ) -> None:
472
+ for ann in iterate_node_properties(
473
+ data,
474
+ node=term.reference,
475
+ strict=strict,
476
+ ontology_prefix=ontology_prefix,
477
+ upgrade=upgrade,
478
+ context="stanza property",
479
+ ):
480
+ # TODO parse axioms
481
+ term.append_property(ann)
482
+
483
+
484
+ def _process_relations(
485
+ term: Stanza,
486
+ data,
487
+ *,
488
+ ontology_prefix: str,
489
+ strict: bool,
490
+ upgrade: bool,
491
+ typedefs: Mapping[ReferenceTuple, TypeDef],
492
+ missing_typedefs: set[ReferenceTuple],
493
+ ) -> None:
494
+ relations_references = list(
495
+ iterate_node_relationships(
496
+ data,
497
+ node=term.reference,
498
+ strict=strict,
499
+ ontology_prefix=ontology_prefix,
500
+ upgrade=upgrade,
211
501
  )
502
+ )
503
+ for relation, reference in relations_references:
504
+ if (
505
+ relation.pair not in typedefs
506
+ and relation.pair not in default_typedefs
507
+ and relation.pair not in missing_typedefs
508
+ ):
509
+ missing_typedefs.add(relation.pair)
510
+ logger.warning("[%s] has no typedef for %s", ontology_prefix, relation.curie)
511
+ logger.debug("[%s] available typedefs: %s", ontology_prefix, set(typedefs))
512
+ # TODO parse axioms
513
+ term.append_relationship(relation, reference)
514
+
515
+
516
+ def _process_replaced_by(stanza: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
517
+ for reference in iterate_node_reference_tag(
518
+ "replaced_by", data, node=stanza.reference, strict=strict, ontology_prefix=ontology_prefix
519
+ ):
520
+ stanza.append_replaced_by(reference)
521
+
522
+
523
+ def _process_subsets(stanza: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
524
+ for reference in iterate_node_reference_tag(
525
+ "subset",
526
+ data,
527
+ node=stanza.reference,
528
+ strict=strict,
529
+ ontology_prefix=ontology_prefix,
530
+ counter=SUBSET_ERROR_COUNTER,
531
+ ):
532
+ stanza.append_subset(reference)
533
+
534
+
535
+ def _get_boolean(data: Mapping[str, Any], tag: str) -> bool | None:
536
+ value = data.get(tag)
537
+ if value is None:
538
+ return None
539
+ if isinstance(value, list):
540
+ value = value[0]
541
+ if value == "false":
542
+ return False
543
+ if value == "true":
544
+ return True
545
+ raise ValueError(value)
546
+
547
+
548
+ def _get_reference(
549
+ data: Mapping[str, Any], tag: str, *, ontology_prefix: str, strict: bool, **kwargs
550
+ ) -> Reference | None:
551
+ value = data.get(tag)
552
+ if value is None:
553
+ return None
554
+ if isinstance(value, list):
555
+ value = value[0]
556
+ return _obo_parse_identifier(
557
+ value, ontology_prefix=ontology_prefix, strict=strict, context=tag, **kwargs
558
+ )
212
559
 
213
- try:
214
- relations_references = list(
215
- iterate_node_relationships(
216
- data,
217
- prefix=ontology,
218
- identifier=identifier,
219
- strict=strict,
560
+
561
+ class MacroConfig:
562
+ """A configuration data class for reader macros."""
563
+
564
+ def __init__(
565
+ self, data: Mapping[str, list[str]] | None = None, *, strict: bool, ontology_prefix: str
566
+ ):
567
+ """Instantiate the configuration from obonet graph metadata."""
568
+ if data is None:
569
+ data = {}
570
+
571
+ self.treat_xrefs_as_equivalent: set[str] = set()
572
+ for prefix in data.get("treat-xrefs-as-equivalent", []):
573
+ prefix_norm = bioregistry.normalize_prefix(prefix)
574
+ if prefix_norm is None:
575
+ continue
576
+ self.treat_xrefs_as_equivalent.add(prefix_norm)
577
+
578
+ self.treat_xrefs_as_genus_differentia: dict[str, tuple[Reference, Reference]] = {}
579
+ for line in data.get("treat-xrefs-as-genus-differentia", []):
580
+ try:
581
+ gd_prefix, gd_predicate, gd_target = line.split()
582
+ except ValueError:
583
+ # this happens in `plana`, where there's an incorrectly written
584
+ # line `CARO part_of NCBITaxon:79327; CL part_of NCBITaxon:79327`
585
+ tqdm.write(
586
+ f"[{ontology_prefix}] failed to parse treat-xrefs-as-genus-differentia: {line}"
220
587
  )
588
+ continue
589
+
590
+ gd_prefix_norm = bioregistry.normalize_prefix(gd_prefix)
591
+ if gd_prefix_norm is None:
592
+ continue
593
+ gd_predicate_re = _obo_parse_identifier(
594
+ gd_predicate, ontology_prefix=ontology_prefix, strict=strict
221
595
  )
222
- except MissingPrefixError as e:
223
- e.reference = reference
224
- raise e
225
- for relation, reference in relations_references:
226
- if (relation.prefix, relation.identifier) in typedefs:
227
- typedef = typedefs[relation.prefix, relation.identifier]
228
- elif (relation.prefix, relation.identifier) in default_typedefs:
229
- typedef = default_typedefs[relation.prefix, relation.identifier]
230
- else:
231
- if (relation.prefix, relation.identifier) not in missing_typedefs:
232
- missing_typedefs.add((relation.prefix, relation.identifier))
233
- logger.warning("[%s] has no typedef for %s", ontology, relation)
234
- logger.debug("[%s] available typedefs: %s", ontology, set(typedefs))
596
+ if gd_predicate_re is None:
597
+ continue
598
+ gd_target_re = _obo_parse_identifier(
599
+ gd_target, ontology_prefix=ontology_prefix, strict=strict
600
+ )
601
+ if gd_target_re is None:
602
+ continue
603
+ self.treat_xrefs_as_genus_differentia[gd_prefix_norm] = (gd_predicate_re, gd_target_re)
604
+
605
+ self.treat_xrefs_as_relationship: dict[str, Reference] = {}
606
+ for line in data.get("treat-xrefs-as-relationship", []):
607
+ try:
608
+ gd_prefix, gd_predicate = line.split()
609
+ except ValueError:
610
+ tqdm.write(
611
+ f"[{ontology_prefix}] failed to parse treat-xrefs-as-relationship: {line}"
612
+ )
235
613
  continue
236
- n_relations += 1
237
- term.append_relationship(typedef, reference)
238
- for prop, value in iterate_node_properties(data, term=term):
239
- n_properties += 1
240
- term.append_property(prop, value)
241
- terms.append(term)
242
614
 
243
- logger.info(
244
- f"[{ontology}] got {len(references):,} references, {len(typedefs):,} typedefs, {len(terms):,} terms,"
245
- f" {n_alt_ids:,} alt ids, {n_parents:,} parents, {n_synonyms:,} synonyms, {n_xrefs:,} xrefs,"
246
- f" {n_relations:,} relations, and {n_properties:,} properties",
247
- )
615
+ gd_prefix_norm = bioregistry.normalize_prefix(gd_prefix)
616
+ if gd_prefix_norm is None:
617
+ continue
618
+ gd_predicate_re = _obo_parse_identifier(
619
+ gd_predicate, ontology_prefix=ontology_prefix, strict=strict
620
+ )
621
+ if gd_predicate_re is None:
622
+ continue
623
+ self.treat_xrefs_as_relationship[gd_prefix_norm] = gd_predicate_re
248
624
 
249
- return make_ad_hoc_ontology(
250
- _ontology=ontology,
251
- _name=name,
252
- _auto_generated_by=graph.graph.get("auto-generated-by"),
253
- _format_version=graph.graph.get("format-version"),
254
- _typedefs=list(typedefs.values()),
255
- _synonym_typedefs=list(synonym_typedefs.values()),
256
- _date=date,
257
- _data_version=data_version,
258
- terms=terms,
259
- )
625
+ self.treat_xrefs_as_is_a: set[str] = set()
626
+ for prefix in data.get("treat-xrefs-as-is_a", []):
627
+ gd_prefix_norm = bioregistry.normalize_prefix(prefix)
628
+ if gd_prefix_norm is None:
629
+ continue
630
+ self.treat_xrefs_as_is_a.add(gd_prefix_norm)
631
+
632
+
633
+ def _handle_xref(
634
+ term: Stanza,
635
+ xref: Reference,
636
+ *,
637
+ provenance: list[Reference | OBOLiteral],
638
+ macro_config: MacroConfig | None = None,
639
+ ) -> Stanza:
640
+ annotations = [Annotation(v.has_dbxref, p) for p in provenance]
641
+
642
+ if macro_config is not None:
643
+ if xref.prefix in macro_config.treat_xrefs_as_equivalent:
644
+ return term.append_equivalent(xref, annotations=annotations)
645
+ elif object_property := macro_config.treat_xrefs_as_genus_differentia.get(xref.prefix):
646
+ # TODO how to add annotations here?
647
+ if annotations:
648
+ logger.warning(
649
+ "[%s] unable to add provenance to xref upgraded to intersection_of: %s",
650
+ term.reference.curie,
651
+ xref,
652
+ )
653
+ return term.append_intersection_of(xref).append_intersection_of(object_property)
654
+ elif predicate := macro_config.treat_xrefs_as_relationship.get(xref.prefix):
655
+ return term.append_relationship(predicate, xref, annotations=annotations)
656
+ elif xref.prefix in macro_config.treat_xrefs_as_is_a:
657
+ return term.append_parent(xref, annotations=annotations)
658
+
659
+ # TODO this is not what spec calls for, maybe
660
+ # need a flag in macro config for this
661
+ if xref.prefix in PROVENANCE_PREFIXES:
662
+ return term.append_provenance(xref, annotations=annotations)
663
+
664
+ return term.append_xref(xref, annotations=annotations)
665
+
666
+
667
+ SUBSET_ERROR_COUNTER: Counter[tuple[str, str]] = Counter()
668
+
669
+
670
+ def _get_subsetdefs(graph: nx.MultiDiGraph, ontology_prefix: str) -> list[tuple[Reference, str]]:
671
+ rv = []
672
+ for subsetdef in graph.get("subsetdef", []):
673
+ left, _, right = subsetdef.partition(" ")
674
+ if not right:
675
+ logger.warning("[%s] subsetdef did not have two parts", ontology_prefix, subsetdef)
676
+ continue
677
+ left_ref = _obo_parse_identifier(
678
+ left,
679
+ ontology_prefix=ontology_prefix,
680
+ name=right,
681
+ line=subsetdef,
682
+ counter=SUBSET_ERROR_COUNTER,
683
+ )
684
+ if left_ref is None:
685
+ continue
686
+ right = right.strip('"')
687
+ rv.append((left_ref, right))
688
+ return rv
260
689
 
261
690
 
262
691
  def _clean_graph_ontology(graph, prefix: str) -> None:
263
692
  """Update the ontology entry in the graph's metadata, if necessary."""
264
693
  if "ontology" not in graph.graph:
265
- logger.warning('[%s] missing "ontology" key', prefix)
694
+ logger.debug('[%s] missing "ontology" key', prefix)
266
695
  graph.graph["ontology"] = prefix
267
696
  elif not graph.graph["ontology"].isalpha():
268
- logger.warning(
269
- "[%s] ontology=%s has a strange format. replacing with prefix",
697
+ logger.debug(
698
+ "[%s] ontology prefix `%s` has a strange format. replacing with prefix",
270
699
  prefix,
271
700
  graph.graph["ontology"],
272
701
  )
273
702
  graph.graph["ontology"] = prefix
274
703
 
275
704
 
705
+ def _clean_graph_version(
706
+ graph, ontology_prefix: str, version: str | None, date: datetime | None
707
+ ) -> str | None:
708
+ if ontology_prefix in STATIC_VERSION_REWRITES:
709
+ return STATIC_VERSION_REWRITES[ontology_prefix]
710
+
711
+ data_version: str | None = graph.graph.get("data-version") or None
712
+ if version:
713
+ clean_injected_version = cleanup_version(version, prefix=ontology_prefix)
714
+ if not data_version:
715
+ logger.debug(
716
+ "[%s] did not have a version, overriding with %s",
717
+ ontology_prefix,
718
+ clean_injected_version,
719
+ )
720
+ return clean_injected_version
721
+
722
+ clean_data_version = cleanup_version(data_version, prefix=ontology_prefix)
723
+ if clean_data_version != clean_injected_version:
724
+ # in this case, we're going to trust the one that's passed
725
+ # through explicitly more than the graph's content
726
+ logger.debug(
727
+ "[%s] had version %s, overriding with %s", ontology_prefix, data_version, version
728
+ )
729
+ return clean_injected_version
730
+
731
+ if data_version:
732
+ clean_data_version = cleanup_version(data_version, prefix=ontology_prefix)
733
+ logger.debug("[%s] using version %s", ontology_prefix, clean_data_version)
734
+ return clean_data_version
735
+
736
+ if date is not None:
737
+ derived_date_version = date.strftime("%Y-%m-%d")
738
+ logger.debug(
739
+ "[%s] does not report a version. falling back to date: %s",
740
+ ontology_prefix,
741
+ derived_date_version,
742
+ )
743
+ return derived_date_version
744
+
745
+ logger.debug("[%s] does not report a version nor a date", ontology_prefix)
746
+ return None
747
+
748
+
276
749
  def _iter_obo_graph(
277
750
  graph: nx.MultiDiGraph,
278
751
  *,
279
- strict: bool = True,
280
- ) -> Iterable[tuple[str, str, Mapping[str, Any]]]:
752
+ strict: bool = False,
753
+ ontology_prefix: str,
754
+ use_tqdm: bool = False,
755
+ upgrade: bool,
756
+ ) -> Iterable[tuple[Reference, Mapping[str, Any]]]:
281
757
  """Iterate over the nodes in the graph with the prefix stripped (if it's there)."""
282
- for node, data in graph.nodes(data=True):
283
- prefix, identifier = normalize_curie(node, strict=strict)
284
- if prefix is None or identifier is None:
285
- continue
286
- yield prefix, identifier, data
287
-
288
-
289
- def _get_date(graph, ontology: str) -> Optional[datetime]:
758
+ for node, data in tqdm(
759
+ graph.nodes(data=True), disable=not use_tqdm, unit_scale=True, desc=f"[{ontology_prefix}]"
760
+ ):
761
+ name = data.get("name")
762
+ match _parse_str_or_curie_or_uri_helper(
763
+ node,
764
+ ontology_prefix=ontology_prefix,
765
+ name=name,
766
+ upgrade=upgrade,
767
+ context="stanza ID",
768
+ ):
769
+ case Reference() as reference:
770
+ yield reference, data
771
+ case NotCURIEError() as exc:
772
+ if _is_valid_identifier(node):
773
+ yield default_reference(ontology_prefix, node, name=name), data
774
+ elif strict:
775
+ raise exc
776
+ else:
777
+ logger.warning(str(exc))
778
+ case ParseError() as exc:
779
+ if strict:
780
+ raise exc
781
+ else:
782
+ logger.warning(str(exc))
783
+ # if blacklisted, just skip it with no warning
784
+
785
+
786
+ def _get_date(graph, ontology_prefix: str) -> datetime | None:
290
787
  try:
291
788
  rv = datetime.strptime(graph.graph["date"], DATE_FORMAT)
292
789
  except KeyError:
293
- logger.info("[%s] does not report a date", ontology)
790
+ logger.info("[%s] does not report a date", ontology_prefix)
294
791
  return None
295
792
  except ValueError:
296
- logger.info("[%s] reports a date that can't be parsed: %s", ontology, graph.graph["date"])
793
+ logger.info(
794
+ "[%s] reports a date that can't be parsed: %s", ontology_prefix, graph.graph["date"]
795
+ )
297
796
  return None
298
797
  else:
299
798
  return rv
300
799
 
301
800
 
302
- def _get_name(graph, ontology: str) -> str:
801
+ def _get_name(graph, ontology_prefix: str) -> str:
303
802
  try:
304
803
  rv = graph.graph["name"]
305
804
  except KeyError:
306
- logger.info("[%s] does not report a name", ontology)
307
- rv = ontology
805
+ logger.info("[%s] does not report a name", ontology_prefix)
806
+ rv = ontology_prefix
308
807
  return rv
309
808
 
310
809
 
311
810
  def iterate_graph_synonym_typedefs(
312
- graph: nx.MultiDiGraph, *, ontology: str, strict: bool = False
811
+ graph: nx.MultiDiGraph, *, ontology_prefix: str, strict: bool = False, upgrade: bool
313
812
  ) -> Iterable[SynonymTypeDef]:
314
813
  """Get synonym type definitions from an :mod:`obonet` graph."""
315
- for s in graph.graph.get("synonymtypedef", []):
316
- sid, name = s.split(" ", 1)
317
- name = name.strip().strip('"')
318
- if sid.startswith("http://") or sid.startswith("https://"):
319
- reference = Reference.from_iri(sid, name=name)
320
- elif ":" not in sid: # assume it's ad-hoc
321
- reference = Reference(prefix=ontology, identifier=sid, name=name)
322
- else: # assume it's a curie
323
- reference = Reference.from_curie(sid, name=name, strict=strict)
324
-
325
- if reference is None:
814
+ for line in graph.graph.get("synonymtypedef", []):
815
+ # TODO handle trailing comments
816
+ line, _, specificity = (x.strip() for x in line.rpartition('"'))
817
+ specificity = specificity.upper()
818
+ if not specificity:
819
+ specificity = None
820
+ elif specificity not in t.get_args(SynonymScope):
326
821
  if strict:
327
- raise ValueError(f"Could not parse {sid}")
328
- else:
329
- continue
822
+ raise ValueError(f"invalid synonym specificty: {specificity}")
823
+ logger.warning("[%s] invalid synonym specificty: %s", ontology_prefix, specificity)
824
+ specificity = None
330
825
 
331
- yield SynonymTypeDef(reference=reference)
826
+ curie, name = line.split(" ", 1)
827
+ # the name should be in quotes, so strip them out
828
+ name = name.strip().strip('"')
829
+ # TODO unquote the string?
830
+ reference = _obo_parse_identifier(
831
+ curie,
832
+ ontology_prefix=ontology_prefix,
833
+ name=name,
834
+ upgrade=upgrade,
835
+ strict=strict,
836
+ )
837
+ if reference is None:
838
+ logger.warning("[%s] unable to parse synonym typedef ID %s", ontology_prefix, curie)
839
+ continue
840
+ yield SynonymTypeDef(reference=reference, specificity=specificity)
332
841
 
333
842
 
334
- def iterate_graph_typedefs(
335
- graph: nx.MultiDiGraph, default_prefix: str, *, strict: bool = True
843
+ def iterate_typedefs(
844
+ graph: nx.MultiDiGraph,
845
+ *,
846
+ ontology_prefix: str,
847
+ strict: bool = False,
848
+ upgrade: bool,
849
+ macro_config: MacroConfig | None = None,
336
850
  ) -> Iterable[TypeDef]:
337
851
  """Get type definitions from an :mod:`obonet` graph."""
338
- for typedef in graph.graph.get("typedefs", []):
339
- if "id" in typedef:
340
- curie = typedef["id"]
341
- elif "identifier" in typedef:
342
- curie = typedef["identifier"]
852
+ if macro_config is None:
853
+ macro_config = MacroConfig(strict=strict, ontology_prefix=ontology_prefix)
854
+ # can't really have a pre-defined set of synonym typedefs here!
855
+ synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] = {}
856
+ typedefs: Mapping[ReferenceTuple, TypeDef] = {}
857
+ missing_typedefs: set[ReferenceTuple] = set()
858
+ for data in graph.graph.get("typedefs", []):
859
+ if "id" in data:
860
+ typedef_id = data["id"]
861
+ elif "identifier" in data:
862
+ typedef_id = data["identifier"]
343
863
  else:
344
- raise KeyError
864
+ raise KeyError("typedef is missing an `id`")
345
865
 
346
- name = typedef.get("name")
866
+ name = data.get("name")
347
867
  if name is None:
348
- logger.debug("[%s] typedef %s is missing a name", graph.graph["ontology"], curie)
868
+ logger.debug("[%s] typedef %s is missing a name", ontology_prefix, typedef_id)
349
869
 
350
- if ":" in curie:
351
- reference = Reference.from_curie(curie, name=name, strict=strict)
352
- else:
353
- reference = Reference(prefix=graph.graph["ontology"], identifier=curie, name=name)
870
+ reference = _obo_parse_identifier(
871
+ typedef_id, strict=strict, ontology_prefix=ontology_prefix, name=name, upgrade=upgrade
872
+ )
354
873
  if reference is None:
355
- logger.warning("[%s] unable to parse typedef CURIE %s", graph.graph["ontology"], curie)
874
+ logger.warning("[%s] unable to parse typedef ID %s", ontology_prefix, typedef_id)
356
875
  continue
357
876
 
358
- xrefs = []
359
- for curie in typedef.get("xref", []):
360
- _xref = Reference.from_curie(curie, strict=strict)
361
- if _xref:
362
- xrefs.append(_xref)
363
- yield TypeDef(reference=reference, xrefs=xrefs)
877
+ typedef = TypeDef(
878
+ reference=reference,
879
+ namespace=data.get("namespace"),
880
+ is_metadata_tag=_get_boolean(data, "is_metadata_tag"),
881
+ is_class_level=_get_boolean(data, "is_class_level"),
882
+ builtin=_get_boolean(data, "builtin"),
883
+ is_obsolete=_get_boolean(data, "is_obsolete"),
884
+ is_anonymous=_get_boolean(data, "is_anonymous"),
885
+ is_anti_symmetric=_get_boolean(data, "is_anti_symmetric"),
886
+ is_symmetric=_get_boolean(data, "is_symmetric"),
887
+ is_reflexive=_get_boolean(data, "is_reflexive"),
888
+ is_cyclic=_get_boolean(data, "is_cyclic"),
889
+ is_transitive=_get_boolean(data, "is_transitive"),
890
+ is_functional=_get_boolean(data, "is_functional"),
891
+ is_inverse_functional=_get_boolean(data, "is_inverse_functional"),
892
+ domain=_get_reference(data, "domain", ontology_prefix=ontology_prefix, strict=strict),
893
+ range=_get_reference(data, "range", ontology_prefix=ontology_prefix, strict=strict),
894
+ inverse=_get_reference(
895
+ data, "inverse_of", ontology_prefix=ontology_prefix, strict=strict
896
+ ),
897
+ )
898
+ _process_alts(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
899
+ _process_parents(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
900
+ _process_synonyms(
901
+ typedef,
902
+ data,
903
+ ontology_prefix=ontology_prefix,
904
+ strict=strict,
905
+ upgrade=upgrade,
906
+ synonym_typedefs=synonym_typedefs,
907
+ )
908
+ _process_xrefs(
909
+ typedef,
910
+ data,
911
+ ontology_prefix=ontology_prefix,
912
+ strict=strict,
913
+ macro_config=macro_config,
914
+ upgrade=upgrade,
915
+ )
916
+ _process_properties(
917
+ typedef,
918
+ data,
919
+ ontology_prefix=ontology_prefix,
920
+ strict=strict,
921
+ upgrade=upgrade,
922
+ typedefs=typedefs,
923
+ )
924
+ _process_relations(
925
+ typedef,
926
+ data,
927
+ ontology_prefix=ontology_prefix,
928
+ strict=strict,
929
+ upgrade=upgrade,
930
+ typedefs=typedefs,
931
+ missing_typedefs=missing_typedefs,
932
+ )
933
+ _process_replaced_by(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
934
+ _process_subsets(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
935
+ _process_intersection_of(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
936
+ _process_union_of(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
937
+ _process_equivalent_to(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
938
+ _process_disjoint_from(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
939
+ _process_consider(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
940
+ _process_comment(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
941
+ _process_description(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
942
+ _process_creation_date(typedef, data)
943
+
944
+ # the next 4 are typedef-specific
945
+ _process_equivalent_to_chain(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
946
+ _process_holds_over_chain(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
947
+ typedef.disjoint_over.extend(
948
+ iterate_node_reference_tag(
949
+ "disjoint_over",
950
+ data,
951
+ node=typedef.reference,
952
+ ontology_prefix=ontology_prefix,
953
+ strict=strict,
954
+ )
955
+ )
956
+ typedef.transitive_over.extend(
957
+ iterate_node_reference_tag(
958
+ "transitive_over",
959
+ data,
960
+ node=typedef.reference,
961
+ ontology_prefix=ontology_prefix,
962
+ strict=strict,
963
+ )
964
+ )
965
+
966
+ yield typedef
967
+
968
+
969
+ def _process_consider(stanza: Stanza, data, *, ontology_prefix: str, strict: bool = False):
970
+ for reference in iterate_node_reference_tag(
971
+ "consider",
972
+ data,
973
+ node=stanza.reference,
974
+ ontology_prefix=ontology_prefix,
975
+ strict=strict,
976
+ ):
977
+ stanza.append_see_also(reference)
978
+
979
+
980
+ def _process_equivalent_to_chain(
981
+ typedef: TypeDef, data, *, ontology_prefix: str, strict: bool = False
982
+ ) -> None:
983
+ for chain in _iterate_chain(
984
+ "equivalent_to_chain", typedef, data, ontology_prefix=ontology_prefix, strict=strict
985
+ ):
986
+ typedef.equivalent_to_chain.append(chain)
987
+
988
+
989
+ def _process_holds_over_chain(
990
+ typedef: TypeDef, data, *, ontology_prefix: str, strict: bool = False
991
+ ) -> None:
992
+ for chain in _iterate_chain(
993
+ "holds_over_chain", typedef, data, ontology_prefix=ontology_prefix, strict=strict
994
+ ):
995
+ typedef.holds_over_chain.append(chain)
996
+
997
+
998
+ def _iterate_chain(
999
+ tag: str, typedef: TypeDef, data, *, ontology_prefix: str, strict: bool = False
1000
+ ) -> Iterable[list[Reference]]:
1001
+ for chain in data.get(tag, []):
1002
+ # chain is a list of CURIEs
1003
+ predicate_chain = _process_chain_helper(typedef, chain, ontology_prefix=ontology_prefix)
1004
+ if predicate_chain is None:
1005
+ logger.warning(
1006
+ "[%s - %s] could not parse line: %s: %s",
1007
+ ontology_prefix,
1008
+ typedef.curie,
1009
+ tag,
1010
+ chain,
1011
+ )
1012
+ else:
1013
+ yield predicate_chain
1014
+
1015
+
1016
+ def _process_chain_helper(
1017
+ term: Stanza, chain: str, ontology_prefix: str, strict: bool = False
1018
+ ) -> list[Reference] | None:
1019
+ rv = []
1020
+ for curie in chain.split():
1021
+ curie = curie.strip()
1022
+ r = _obo_parse_identifier(
1023
+ curie, ontology_prefix=ontology_prefix, strict=strict, node=term.reference
1024
+ )
1025
+ if r is None:
1026
+ return None
1027
+ rv.append(r)
1028
+ return rv
364
1029
 
365
1030
 
 def get_definition(
-     data, *, prefix: str, identifier: str
- ) -> Union[tuple[None, None], tuple[str, list[Reference]]]:
+     data, *, node: Reference, ontology_prefix: str, strict: bool = False
+ ) -> tuple[None | str, list[Reference | OBOLiteral]]:
     """Extract the definition from the data."""
     definition = data.get("def")  # it's allowed not to have a definition
     if not definition:
-         return None, None
-     return _extract_definition(definition, prefix=prefix, identifier=identifier)
+         return None, []
+     return _extract_definition(
+         definition, node=node, strict=strict, ontology_prefix=ontology_prefix
+     )
 
 
 def _extract_definition(
     s: str,
     *,
-     prefix: str,
-     identifier: str,
+     node: Reference,
     strict: bool = False,
- ) -> Union[tuple[None, None], tuple[str, list[Reference]]]:
+     ontology_prefix: str,
+ ) -> tuple[None | str, list[Reference | OBOLiteral]]:
     """Extract the definitions."""
     if not s.startswith('"'):
-         raise ValueError("definition does not start with a quote")
+         logger.warning(f"[{node.curie}] definition does not start with a quote")
+         return None, []
 
     try:
         definition, rest = _quote_split(s)
-     except ValueError:
-         logger.warning("[%s:%s] could not parse definition: %s", prefix, identifier, s)
-         return None, None
+     except ValueError as e:
+         logger.warning("[%s] failed to parse definition quotes: %s", node.curie, str(e))
+         return None, []
 
-     if not rest.startswith("[") or not rest.endswith("]"):
-         logger.warning("[%s:%s] problem with definition: %s", prefix, identifier, s)
+     if not rest.startswith("["):
+         logger.debug("[%s] no square brackets for provenance on line: %s", node.curie, s)
         provenance = []
     else:
-         provenance = _parse_trailing_ref_list(rest, strict=strict)
-     return definition, provenance
+         rest = rest.lstrip("[").rstrip("]")  # FIXME this doesn't account for trailing annotations
+         provenance = _parse_provenance_list(
+             rest,
+             node=node,
+             ontology_prefix=ontology_prefix,
+             counter=DEFINITION_PROVENANCE_COUNTER,
+             scope_text="definition provenance",
+             line=s,
+             strict=strict,
+         )
+     return definition or None, provenance
 
 
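Concretely, a `def` line pairs one quoted string with an optional bracketed provenance list. A standalone sketch of that shape (hypothetical helper; unlike the real `_quote_split`, it ignores escaped quotes):

```python
# Hypothetical sketch, not pyobo's API: split an OBO `def` line into its
# quoted text and its optional [comma-separated] provenance references.
def split_def(line: str) -> tuple[str, list[str]]:
    text, _, rest = line.removeprefix('"').partition('"')
    rest = rest.strip()
    if rest.startswith("[") and rest.endswith("]"):
        refs = [r.strip() for r in rest[1:-1].split(",") if r.strip()]
    else:
        refs = []
    return text, refs

assert split_def('"A kind of disease." [PMID:123, doi:10.1000/xyz]') == (
    "A kind of disease.",
    ["PMID:123", "doi:10.1000/xyz"],
)
```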
- def _get_first_nonquoted(s: str) -> Optional[int]:
+ def get_first_nonescaped_quote(s: str) -> int | None:
+     """Get the position of the first non-escaped quote, or None if there is none."""
+     if not s:
+         return None
+     if s[0] == '"':
+         # special case for the first position, since pairwise() starts at index 1
+         return 0
     for i, (a, b) in enumerate(pairwise(s), start=1):
         if b == '"' and a != "\\":
             return i
@@ -406,10 +1089,12 @@ def _get_first_nonquoted(s: str) -> Optional[int]:
 
 
 def _quote_split(s: str) -> tuple[str, str]:
-     s = s.lstrip('"')
-     i = _get_first_nonquoted(s)
+     if not s.startswith('"'):
+         raise ValueError(f"'{s}' does not start with a quote")
+     s = s.removeprefix('"')
+     i = get_first_nonescaped_quote(s)
     if i is None:
-         raise ValueError
+         raise ValueError(f"no closing quote found in `{s}`")
     return _clean_definition(s[:i].strip()), s[i + 1 :].strip()
 
 
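The `pairwise` scan only treats a quote as closing when the preceding character is not a backslash, which is why index 0 needs the special case above. A few illustrative calls (values are made up):

```python
from itertools import pairwise

def first_nonescaped_quote(s: str) -> int | None:
    # mirrors the scan above: pairwise() yields (previous, current) pairs
    # from index 1 onward, so a leading quote must be special-cased
    if not s:
        return None
    if s[0] == '"':
        return 0
    for i, (a, b) in enumerate(pairwise(s), start=1):
        if b == '"' and a != "\\":
            return i
    return None

assert first_nonescaped_quote('abc" rest') == 3
assert first_nonescaped_quote('a\\"bc" rest') == 5  # the escaped quote is skipped
assert first_nonescaped_quote("no quotes here") is None
```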
@@ -421,78 +1106,64 @@ def _clean_definition(s: str) -> str:
 
 def _extract_synonym(
     s: str,
-     synonym_typedefs: Mapping[str, SynonymTypeDef],
+     synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
     *,
-     prefix: str,
-     identifier: str,
-     strict: bool = True,
- ) -> Optional[Synonym]:
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
+ ) -> Synonym | None:
     # TODO check if the synonym is written like a CURIE... it shouldn't but I've seen it happen
     try:
         name, rest = _quote_split(s)
     except ValueError:
-         logger.warning("[%s:%s] invalid synonym: %s", prefix, identifier, s)
+         logger.warning("[%s] invalid synonym: %s", node.curie, s)
         return None
 
-     specificity: Optional[SynonymSpecificity] = None
-     for _specificity in SynonymSpecificities:
-         if rest.startswith(_specificity):
-             specificity = _specificity
-             rest = rest[len(_specificity) :].strip()
-             break
-
-     stype: Optional[SynonymTypeDef] = None
-     for _stype in synonym_typedefs.values():
-         # Since there aren't a lot of carefully defined synonym definitions, it
-         # can appear as a string or curie. Therefore, we might see temporary prefixes
-         # get added, so we should check against full curies as well as local unique
-         # identifiers
-         if rest.startswith(_stype.curie):
-             rest = rest[len(_stype.curie) :].strip()
-             stype = _stype
-             break
-         elif rest.startswith(_stype.preferred_curie):
-             rest = rest[len(_stype.preferred_curie) :].strip()
-             stype = _stype
-             break
-         elif rest.startswith(_stype.identifier):
-             rest = rest[len(_stype.identifier) :].strip()
-             stype = _stype
-             break
-
-     if not rest.startswith("[") or not rest.endswith("]"):
-         logger.warning("[%s:%s] problem with synonym: %s", prefix, identifier, s)
-         return None
+     specificity, rest = _chomp_specificity(rest)
+     synonym_typedef, rest = _chomp_typedef(
+         rest,
+         synonym_typedefs=synonym_typedefs,
+         strict=strict,
+         node=node,
+         ontology_prefix=ontology_prefix,
+         upgrade=upgrade,
+     )
+     provenance, rest = _chomp_references(
+         rest,
+         strict=strict,
+         node=node,
+         ontology_prefix=ontology_prefix,
+         line=s,
+     )
+     annotations = _chomp_axioms(rest, node=node, strict=strict)
 
-     provenance = _parse_trailing_ref_list(rest, strict=strict)
     return Synonym(
         name=name,
-         specificity=specificity or "EXACT",
-         type=stype or DEFAULT_SYNONYM_TYPE,
-         provenance=provenance,
+         specificity=specificity,
+         type=synonym_typedef.reference if synonym_typedef else None,
+         provenance=list(provenance or []),
+         annotations=annotations,
     )
 
 
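After the quoted name, the remainder of a synonym line is consumed left to right: an optional specificity, an optional synonym type, the bracketed provenance, and finally any trailing axioms. A hypothetical walk-through of that chomp order on a representative line:

```python
# Hypothetical walk-through (not pyobo's API) of the chomp order used above.
line = '"LTEC I" EXACT [Orphanet:93938,DOI:xxxx]'
name, rest = line[1:].split('"', 1)                  # quoted name -> 'LTEC I'
specificity, _, rest = rest.strip().partition(" ")   # chomp specificity -> 'EXACT'
provenance = rest.strip("[]").split(",")             # chomp references
print(name, specificity, provenance)
# LTEC I EXACT ['Orphanet:93938', 'DOI:xxxx']
```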
- def _parse_trailing_ref_list(rest, *, strict: bool = True):
-     rest = rest.lstrip("[").rstrip("]")
-     return [
-         Reference.from_curie(curie.strip(), strict=strict)
-         for curie in rest.split(",")
-         if curie.strip()
-     ]
+ #: A counter for errors in parsing provenance
+ DEFINITION_PROVENANCE_COUNTER: Counter[tuple[str, str]] = Counter()
 
 
 def iterate_node_synonyms(
     data: Mapping[str, Any],
-     synonym_typedefs: Mapping[str, SynonymTypeDef],
+     synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
     *,
-     prefix: str,
-     identifier: str,
+     node: Reference,
     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
 ) -> Iterable[Synonym]:
     """Extract synonyms from a :mod:`obonet` node's data.
 
-     Example strings:
+     Example strings
+
     - "LTEC I" EXACT [Orphanet:93938,DOI:xxxx]
     - "LTEC I" EXACT [Orphanet:93938]
     - "LTEC I" [Orphanet:93938]
@@ -500,121 +1171,409 @@ def iterate_node_synonyms(
     """
     for s in data.get("synonym", []):
         s = _extract_synonym(
-             s, synonym_typedefs, prefix=prefix, identifier=identifier, strict=strict
+             s,
+             synonym_typedefs,
+             node=node,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
         )
         if s is not None:
             yield s
 
 
- HANDLED_PROPERTY_TYPES = {
-     "xsd:string": str,
-     "xsd:dateTime": datetime,
- }
-
-
 def iterate_node_properties(
-     data: Mapping[str, Any], *, property_prefix: Optional[str] = None, term=None
- ) -> Iterable[tuple[str, str]]:
+     data: Mapping[str, Any],
+     *,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
+     context: str,
+ ) -> Iterable[Annotation]:
     """Extract properties from a :mod:`obonet` node's data."""
     for prop_value_type in data.get("property_value", []):
-         try:
-             prop, value_type = prop_value_type.split(" ", 1)
-         except ValueError:
-             logger.info("malformed property: %s on %s", prop_value_type, term and term.curie)
-             continue
-         if property_prefix is not None and prop.startswith(property_prefix):
-             prop = prop[len(property_prefix) :]
+         if yv := _handle_prop(
+             prop_value_type,
+             node=node,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+             context=context,
+         ):
+             yield yv
+
+
+ #: Keep track of property-value pairs for which the value couldn't be parsed,
+ #: such as `dc:conformsTo autoimmune:inflammation.yaml` in MONDO
+ UNHANDLED_PROP_OBJECTS: Counter[tuple[str, str]] = Counter()
+
+ UNHANDLED_PROPS: Counter[tuple[str, str]] = Counter()
+
+
+ def _handle_prop(
+     prop_value_type: str,
+     *,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
+     context: str | None,
+ ) -> Annotation | None:
+     try:
+         prop, value_type = prop_value_type.split(" ", 1)
+     except ValueError:
+         logger.warning("[%s] property_value is missing a space: %s", node.curie, prop_value_type)
+         return None
+
+     prop_reference = _get_prop(
+         prop,
+         node=node,
+         strict=strict,
+         ontology_prefix=ontology_prefix,
+         upgrade=upgrade,
+         line=prop_value_type,
+         counter=UNHANDLED_PROPS,
+         context=context,
+     )
+     if prop_reference is None:
+         return None
+
+     value_type = value_type.strip()
+     datatype: Reference | None
+     if " " not in value_type:
+         value, datatype = value_type, None
+     else:
+         value, datatype_raw = (s.strip() for s in value_type.rsplit(" ", 1))
+         match _parse_str_or_curie_or_uri_helper(
+             datatype_raw,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             predicate=prop_reference,
+             line=prop_value_type,
+             upgrade=upgrade,
+             context="property datatype",
+         ):
+             case Reference() as datatype_:
+                 datatype = datatype_
+             case BlocklistError():
+                 return None
+             case ParseError() as exc:
+                 if strict:
+                     raise exc
+                 else:
+                     logger.warning(str(exc))
+                 return None
+
+     # if it's an empty string, like the ones removed in
+     # https://github.com/oborel/obo-relations/pull/830, just quit
+     if value == '""':
+         return None
+
+     quoted = value.startswith('"') and value.endswith('"')
+     value = value.strip('"').strip()
 
+     # first, special case datetimes. Whether it's quoted or not,
+     # we always deal with this first
+     if datatype and datatype.curie == "xsd:dateTime":
         try:
-             value, _ = value_type.rsplit(" ", 1)  # second entry is the value type
+             obo_literal = OBOLiteral.datetime(value)
         except ValueError:
-             # logger.debug(f'property missing datatype. defaulting to string - {prop_value_type}')
-             value = value_type  # could assign type to be 'xsd:string' by default
-         value = value.strip('"')
-         yield prop, value
+             logger.warning(
+                 "[%s - %s] could not parse date: %s", node.curie, prop_reference.curie, value
+             )
+             return None
+         else:
+             return Annotation(prop_reference, obo_literal)
+
+     if datatype and datatype.curie == "xsd:anyURI":
+         match _parse_str_or_curie_or_uri_helper(
+             value,
+             node=node,
+             predicate=prop_reference,
+             ontology_prefix=ontology_prefix,
+             line=prop_value_type,
+             upgrade=upgrade,
+             context="property object",
+         ):
+             case Reference() as obj_reference:
+                 return Annotation(prop_reference, obj_reference)
+             case BlocklistError():
+                 return None
+             case UnparsableIRIError():
+                 return Annotation(prop_reference, OBOLiteral.uri(value))
+             case ParseError() as exc:
+                 if strict:
+                     raise exc
+                 else:
+                     logger.warning(str(exc))
+                 return None
+
+     # if it's quoted and there's a datatype, try parsing as a CURIE/URI anyway
+     # (this is a bit aggressive, but more useful than the spec)
+     if quoted:
+         # give parsing a try anyway, just in case ;)
+         match _parse_str_or_curie_or_uri_helper(
+             value,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             line=prop_value_type,
+             upgrade=upgrade,
+             predicate=prop_reference,
+             context="property object",
+         ):
+             case Reference() as obj_reference:
+                 return Annotation(prop_reference, obj_reference)
+             case BlocklistError():
+                 return None
+             case ParseError():
+                 if datatype:
+                     return Annotation(prop_reference, OBOLiteral(value, datatype, None))
+                 else:
+                     return Annotation(prop_reference, OBOLiteral.string(value))
+     else:
+         if datatype:
+             logger.debug(
+                 "[%s] throwing away datatype since no quotes were used: %s", node.curie, value_type
+             )
+
+         # if it wasn't quoted and there was no datatype, go for parsing as an object
+         match _obo_parse_identifier(
+             value,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             predicate=prop_reference,
+             line=prop_value_type,
+             context="property object",
+             counter=UNHANDLED_PROP_OBJECTS,
+         ):
+             case Reference() as obj_reference:
+                 return Annotation(prop_reference, obj_reference)
+             case None:
+                 return None
+
+
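For reference, these are the `property_value` shapes the branches above distinguish; the lines are illustrative and not taken from any particular ontology:

```python
# Illustrative `property_value` shapes (hypothetical lines, not from the source):
lines = [
    "seeAlso GO:0005737",                                # unquoted object, parsed as a reference
    'dcterms:date "2024-01-01T00:00:00Z" xsd:dateTime',  # datetime literal, always handled first
    'IAO:0000117 "https://example.org/x" xsd:anyURI',    # URI object
    'dcterms:description "free text" xsd:string',        # quoted literal with a datatype
]
for line in lines:
    prop, _, value_type = line.partition(" ")  # same first split as _handle_prop
    print(prop, "->", value_type)
```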
+ def _get_prop(
+     property_id: str,
+     *,
+     node: Reference,
+     strict: bool,
+     ontology_prefix: str,
+     upgrade: bool,
+     line: str,
+     counter: Counter[tuple[str, str]] | None = None,
+     context: str | None = None,
+ ) -> Reference | None:
+     if rv := _parse_default_prop(property_id, ontology_prefix):
+         return rv
+     return _obo_parse_identifier(
+         property_id,
+         strict=strict,
+         node=node,
+         ontology_prefix=ontology_prefix,
+         upgrade=upgrade,
+         counter=counter,
+         context=context,
+         line=line,
+     )
+
+
+ def _parse_default_prop(property_id, ontology_prefix) -> Reference | None:
+     for delim in "#/":
+         sw = f"http://purl.obolibrary.org/obo/{ontology_prefix}{delim}"
+         if property_id.startswith(sw):
+             identifier = property_id.removeprefix(sw)
+             return default_reference(ontology_prefix, identifier)
+     return None
 
 
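`_parse_default_prop` recognizes ontology-local PURLs by stripping the OBO PURL base plus either delimiter; with hypothetical values, the effect looks like:

```python
# Hypothetical values showing the PURL prefix-stripping above: an ontology-local
# property URI is reduced to its local identifier in that ontology's namespace.
uri = "http://purl.obolibrary.org/obo/go#systematic_synonym"
for delim in "#/":
    sw = f"http://purl.obolibrary.org/obo/go{delim}"
    if uri.startswith(sw):
        print(uri.removeprefix(sw))  # -> systematic_synonym
```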
- def iterate_node_parents(
+ def iterate_node_reference_tag(
+     tag: str,
     data: Mapping[str, Any],
     *,
-     prefix: str,
-     identifier: str,
-     strict: bool = True,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool = True,
+     counter: Counter[tuple[str, str]] | None = None,
 ) -> Iterable[Reference]:
-     """Extract parents from a :mod:`obonet` node's data."""
-     for parent_curie in data.get("is_a", []):
-         reference = Reference.from_curie(parent_curie, strict=strict)
+     """Extract references from the data for the given tag."""
+     for identifier in data.get(tag, []):
+         reference = _obo_parse_identifier(
+             identifier,
+             strict=strict,
+             node=node,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+             counter=counter,
+         )
         if reference is None:
             logger.warning(
-                 "[%s:%s] could not parse parent curie: %s", prefix, identifier, parent_curie
+                 "[%s] %s - could not parse identifier: %s", ontology_prefix, tag, identifier
             )
-             continue
-         yield reference
+         else:
+             yield reference
 
 
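With the tag name as a parameter, the per-tag iterators (the removed `iterate_node_parents` above and `iterate_node_alt_ids` just below) collapse into this single helper. A sketch of the obonet-style node data it consumes (values are made up):

```python
# Hypothetical obonet-style node data consumed above (values are made up):
data = {
    "is_a": ["GO:0005737", "GO:0005575"],
    "alt_id": ["GO:0000000"],
}
# e.g., iterate_node_reference_tag("is_a", data, ...) would parse and yield
# a Reference for each entry under the "is_a" key.
for tag in ("is_a", "alt_id"):
    for curie in data.get(tag, []):
        print(tag, curie)
```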
- def iterate_node_alt_ids(data: Mapping[str, Any], *, strict: bool = True) -> Iterable[Reference]:
-     """Extract alternate identifiers from a :mod:`obonet` node's data."""
-     for curie in data.get("alt_id", []):
-         reference = Reference.from_curie(curie, strict=strict)
-         if reference is not None:
-             yield reference
+ def _process_intersection_of(
+     term: Stanza,
+     data: Mapping[str, Any],
+     *,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool = True,
+ ) -> None:
+     """Process the intersection_of tags in the data."""
+     for line in data.get("intersection_of", []):
+         predicate_id, _, target_id = line.partition(" ")
+         predicate = _obo_parse_identifier(
+             predicate_id,
+             strict=strict,
+             node=term.reference,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+         )
+         if predicate is None:
+             logger.warning("[%s] could not parse intersection_of: %s", ontology_prefix, line)
+             continue
+
+         if target_id:
+             # this means that there's a second part, so let's try parsing it
+             target = _obo_parse_identifier(
+                 target_id,
+                 strict=strict,
+                 node=term.reference,
+                 predicate=predicate,
+                 ontology_prefix=ontology_prefix,
+                 upgrade=upgrade,
+             )
+             if target is None:
+                 logger.warning(
+                     "[%s] could not parse intersection_of target: %s", ontology_prefix, line
+                 )
+                 continue
+             term.append_intersection_of(predicate, target)
+         else:
+             term.append_intersection_of(predicate)
 
 
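An `intersection_of` line either names a class alone or pairs a relation with a filler, which is exactly what the `str.partition` above distinguishes; with illustrative values:

```python
# Illustrative `intersection_of` shapes (hypothetical values): a bare class,
# or a relation plus a filler, split by the same str.partition(" ") as above.
for line in ["GO:0005575", "part_of GO:0005737"]:
    predicate_id, _, target_id = line.partition(" ")
    print((predicate_id, target_id or None))
# -> ('GO:0005575', None)
# -> ('part_of', 'GO:0005737')
```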
 def iterate_node_relationships(
     data: Mapping[str, Any],
     *,
-     prefix: str,
-     identifier: str,
-     strict: bool = True,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
 ) -> Iterable[tuple[Reference, Reference]]:
     """Extract relationships from a :mod:`obonet` node's data."""
-     for s in data.get("relationship", []):
-         relation_curie, target_curie = s.split(" ")
-         relation_prefix: Optional[str]
-         relation_identifier: Optional[str]
-         if relation_curie in RELATION_REMAPPINGS:
-             relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie]
-         else:
-             relation_prefix, relation_identifier = normalize_curie(relation_curie, strict=strict)
-         if relation_prefix is not None and relation_identifier is not None:
-             relation = Reference(prefix=relation_prefix, identifier=relation_identifier)
-         elif prefix is not None:
-             relation = Reference(prefix=prefix, identifier=relation_curie)
-         else:
-             logger.debug("unhandled relation: %s", relation_curie)
-             relation = Reference(prefix="obo", identifier=relation_curie)
-
-         # TODO replace with omni-parser from :mod:`curies`
-         target = Reference.from_curie(target_curie, strict=strict)
-         if target is None:
-             logger.warning(
-                 "[%s:%s] %s could not parse target %s", prefix, identifier, relation, target_curie
-             )
-             continue
+     for line in data.get("relationship", []):
+         relation_curie, target_curie = line.split(" ")
+
+         predicate = _obo_parse_identifier(
+             relation_curie,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             upgrade=upgrade,
+             line=line,
+             context="relationship predicate",
+         )
+         match predicate:
+             # TODO extend with other exception handling
+             case None:
+                 logger.warning("[%s] could not parse relation %s", node.curie, relation_curie)
+                 continue
 
-         yield relation, target
+         match _parse_str_or_curie_or_uri_helper(
+             target_curie,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             predicate=predicate,
+             line=line,
+             context="relationship target",
+             upgrade=upgrade,
+         ):
+             case Reference() as target:
+                 yield predicate, target
+             case ParseError() as exc:
+                 if strict:
+                     raise exc
+                 else:
+                     logger.warning(str(exc))
 
 
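Note that `line.split(" ")` expects exactly two tokens, so a relationship line with trailing content would raise an unpacking `ValueError` in this code as written. An illustrative line (values are made up):

```python
# Illustrative `relationship` line (hypothetical values): exactly two tokens
# are expected by the unpacking above.
relation_curie, target_curie = "part_of GO:0005737".split(" ")
print(relation_curie, target_curie)  # part_of GO:0005737
```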
 def iterate_node_xrefs(
-     *, prefix: str, data: Mapping[str, Any], strict: bool = True
- ) -> Iterable[Reference]:
+     *,
+     data: Mapping[str, Any],
+     strict: bool = False,
+     ontology_prefix: str,
+     node: Reference,
+     upgrade: bool,
+ ) -> Iterable[tuple[Reference, list[Reference | OBOLiteral]]]:
     """Extract xrefs from a :mod:`obonet` node's data."""
-     for xref in data.get("xref", []):
-         xref = xref.strip()
-
-         if curie_has_blacklisted_prefix(xref) or curie_is_blacklisted(xref) or ":" not in xref:
-             continue  # sometimes xref to self... weird
+     for line in data.get("xref", []):
+         line = line.strip()
+         if pair := _parse_xref_line(
+             line,
+             strict=strict,
+             node=node,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+         ):
+             yield pair
+
+
+ def _parse_xref_line(
+     line: str, *, strict: bool = False, ontology_prefix: str, node: Reference, upgrade: bool
+ ) -> tuple[Reference, list[Reference | OBOLiteral]] | None:
+     xref, _, rest = line.partition(" [")
+
+     rules = get_rules()
+
+     if rules.str_is_blocked(xref, context=ontology_prefix) or ":" not in xref:
+         return None  # sometimes xref to self... weird
+
+     xref = rules.remap_prefix(xref, context=ontology_prefix)
+
+     split_space = " " in xref
+     if split_space:
+         _xref_split = xref.split(" ", 1)
+         if _xref_split[1][0] not in {'"', "("}:
+             logger.debug("[%s] Problem with space in xref %s", node.curie, xref)
+             return None
+         xref = _xref_split[0]
+
+     xref_ref = _parse_str_or_curie_or_uri_helper(
+         xref, ontology_prefix=ontology_prefix, node=node, line=line, context="xref", upgrade=upgrade
+     )
+     match xref_ref:
+         case BlocklistError():
+             return None
+         case ParseError() as exc:
+             if strict:
+                 raise exc
+             else:
+                 if not XREF_PROVENANCE_COUNTER[ontology_prefix, xref]:
+                     logger.warning(str(exc))
+                 XREF_PROVENANCE_COUNTER[ontology_prefix, xref] += 1
+                 return None
+
+     if rest:
+         rest_front, _, _rest_rest = rest.partition("]")
+         provenance = _parse_provenance_list(
+             rest_front,
+             node=node,
+             ontology_prefix=ontology_prefix,
+             counter=XREF_PROVENANCE_COUNTER,
+             scope_text="xref provenance",
+             line=line,
+             strict=strict,
+         )
+     else:
+         provenance = []
 
-         xref = remap_prefix(xref)
+     return xref_ref, provenance
 
-         split_space = " " in xref
-         if split_space:
-             _xref_split = xref.split(" ", 1)
-             if _xref_split[1][0] not in {'"', "("}:
-                 logger.debug("[%s] Problem with space in xref %s", prefix, xref)
-                 continue
-             xref = _xref_split[0]
 
-         yv = Reference.from_curie(xref, strict=strict)
-         if yv is not None:
-             yield yv
+ XREF_PROVENANCE_COUNTER: Counter[tuple[str, str]] = Counter()
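Since an xref line optionally carries a bracketed provenance list after the identifier, the parse starts by partitioning on `" ["`. Illustrative lines (values are made up):

```python
# Illustrative xref shapes (hypothetical values) showing the " [" partition above:
for line in ["ICD10:G20", "ICD10:G20 [PMID:123, PMID:456]"]:
    xref, _, rest = line.partition(" [")
    refs = rest.partition("]")[0].split(",") if rest else []
    print(xref, [r.strip() for r in refs])
# -> ICD10:G20 []
# -> ICD10:G20 ['PMID:123', 'PMID:456']
```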