pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (242)
  1. pyobo/.DS_Store +0 -0
  2. pyobo/__init__.py +95 -20
  3. pyobo/__main__.py +0 -0
  4. pyobo/api/__init__.py +81 -10
  5. pyobo/api/alts.py +52 -42
  6. pyobo/api/combine.py +39 -0
  7. pyobo/api/edges.py +68 -0
  8. pyobo/api/hierarchy.py +231 -203
  9. pyobo/api/metadata.py +14 -19
  10. pyobo/api/names.py +207 -127
  11. pyobo/api/properties.py +117 -113
  12. pyobo/api/relations.py +68 -94
  13. pyobo/api/species.py +24 -21
  14. pyobo/api/typedefs.py +11 -11
  15. pyobo/api/utils.py +66 -13
  16. pyobo/api/xrefs.py +108 -114
  17. pyobo/cli/__init__.py +0 -0
  18. pyobo/cli/cli.py +35 -50
  19. pyobo/cli/database.py +183 -161
  20. pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
  21. pyobo/cli/lookup.py +163 -195
  22. pyobo/cli/utils.py +19 -6
  23. pyobo/constants.py +102 -3
  24. pyobo/getters.py +196 -118
  25. pyobo/gilda_utils.py +79 -200
  26. pyobo/identifier_utils/__init__.py +41 -0
  27. pyobo/identifier_utils/api.py +296 -0
  28. pyobo/identifier_utils/model.py +130 -0
  29. pyobo/identifier_utils/preprocessing.json +812 -0
  30. pyobo/identifier_utils/preprocessing.py +61 -0
  31. pyobo/identifier_utils/relations/__init__.py +8 -0
  32. pyobo/identifier_utils/relations/api.py +162 -0
  33. pyobo/identifier_utils/relations/data.json +5824 -0
  34. pyobo/identifier_utils/relations/data_owl.json +57 -0
  35. pyobo/identifier_utils/relations/data_rdf.json +1 -0
  36. pyobo/identifier_utils/relations/data_rdfs.json +7 -0
  37. pyobo/mocks.py +9 -6
  38. pyobo/ner/__init__.py +9 -0
  39. pyobo/ner/api.py +72 -0
  40. pyobo/ner/normalizer.py +33 -0
  41. pyobo/obographs.py +43 -39
  42. pyobo/plugins.py +5 -4
  43. pyobo/py.typed +0 -0
  44. pyobo/reader.py +1358 -395
  45. pyobo/reader_utils.py +155 -0
  46. pyobo/resource_utils.py +42 -22
  47. pyobo/resources/__init__.py +0 -0
  48. pyobo/resources/goc.py +75 -0
  49. pyobo/resources/goc.tsv +188 -0
  50. pyobo/resources/ncbitaxon.py +4 -5
  51. pyobo/resources/ncbitaxon.tsv.gz +0 -0
  52. pyobo/resources/ro.py +3 -2
  53. pyobo/resources/ro.tsv +0 -0
  54. pyobo/resources/so.py +0 -0
  55. pyobo/resources/so.tsv +0 -0
  56. pyobo/sources/README.md +12 -8
  57. pyobo/sources/__init__.py +52 -29
  58. pyobo/sources/agrovoc.py +0 -0
  59. pyobo/sources/antibodyregistry.py +11 -12
  60. pyobo/sources/bigg/__init__.py +13 -0
  61. pyobo/sources/bigg/bigg_compartment.py +81 -0
  62. pyobo/sources/bigg/bigg_metabolite.py +229 -0
  63. pyobo/sources/bigg/bigg_model.py +46 -0
  64. pyobo/sources/bigg/bigg_reaction.py +77 -0
  65. pyobo/sources/biogrid.py +1 -2
  66. pyobo/sources/ccle.py +7 -12
  67. pyobo/sources/cgnc.py +0 -5
  68. pyobo/sources/chebi.py +1 -1
  69. pyobo/sources/chembl/__init__.py +9 -0
  70. pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
  71. pyobo/sources/chembl/chembl_target.py +160 -0
  72. pyobo/sources/civic_gene.py +55 -15
  73. pyobo/sources/clinicaltrials.py +160 -0
  74. pyobo/sources/complexportal.py +24 -24
  75. pyobo/sources/conso.py +14 -22
  76. pyobo/sources/cpt.py +0 -0
  77. pyobo/sources/credit.py +1 -9
  78. pyobo/sources/cvx.py +27 -5
  79. pyobo/sources/depmap.py +9 -12
  80. pyobo/sources/dictybase_gene.py +2 -7
  81. pyobo/sources/drugbank/__init__.py +9 -0
  82. pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
  83. pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
  84. pyobo/sources/drugcentral.py +17 -13
  85. pyobo/sources/expasy.py +31 -34
  86. pyobo/sources/famplex.py +13 -18
  87. pyobo/sources/flybase.py +3 -8
  88. pyobo/sources/gard.py +62 -0
  89. pyobo/sources/geonames/__init__.py +9 -0
  90. pyobo/sources/geonames/features.py +28 -0
  91. pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
  92. pyobo/sources/geonames/utils.py +115 -0
  93. pyobo/sources/gmt_utils.py +6 -7
  94. pyobo/sources/go.py +20 -13
  95. pyobo/sources/gtdb.py +154 -0
  96. pyobo/sources/gwascentral/__init__.py +9 -0
  97. pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
  98. pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
  99. pyobo/sources/hgnc/__init__.py +9 -0
  100. pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
  101. pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
  102. pyobo/sources/icd/__init__.py +9 -0
  103. pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
  104. pyobo/sources/icd/icd11.py +148 -0
  105. pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
  106. pyobo/sources/interpro.py +4 -9
  107. pyobo/sources/itis.py +0 -5
  108. pyobo/sources/kegg/__init__.py +0 -0
  109. pyobo/sources/kegg/api.py +16 -38
  110. pyobo/sources/kegg/genes.py +9 -20
  111. pyobo/sources/kegg/genome.py +1 -7
  112. pyobo/sources/kegg/pathway.py +9 -21
  113. pyobo/sources/mesh.py +58 -24
  114. pyobo/sources/mgi.py +3 -10
  115. pyobo/sources/mirbase/__init__.py +11 -0
  116. pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
  117. pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
  118. pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
  119. pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
  120. pyobo/sources/msigdb.py +74 -39
  121. pyobo/sources/ncbi/__init__.py +9 -0
  122. pyobo/sources/ncbi/ncbi_gc.py +162 -0
  123. pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
  124. pyobo/sources/nih_reporter.py +60 -0
  125. pyobo/sources/nlm/__init__.py +9 -0
  126. pyobo/sources/nlm/nlm_catalog.py +48 -0
  127. pyobo/sources/nlm/nlm_publisher.py +36 -0
  128. pyobo/sources/nlm/utils.py +116 -0
  129. pyobo/sources/npass.py +6 -8
  130. pyobo/sources/omim_ps.py +10 -3
  131. pyobo/sources/pathbank.py +4 -8
  132. pyobo/sources/pfam/__init__.py +9 -0
  133. pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
  134. pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
  135. pyobo/sources/pharmgkb/__init__.py +15 -0
  136. pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
  137. pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
  138. pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
  139. pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
  140. pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
  141. pyobo/sources/pharmgkb/utils.py +86 -0
  142. pyobo/sources/pid.py +1 -6
  143. pyobo/sources/pombase.py +6 -10
  144. pyobo/sources/pubchem.py +4 -9
  145. pyobo/sources/reactome.py +5 -11
  146. pyobo/sources/rgd.py +11 -16
  147. pyobo/sources/rhea.py +37 -36
  148. pyobo/sources/ror.py +69 -42
  149. pyobo/sources/selventa/__init__.py +0 -0
  150. pyobo/sources/selventa/schem.py +4 -7
  151. pyobo/sources/selventa/scomp.py +1 -6
  152. pyobo/sources/selventa/sdis.py +4 -7
  153. pyobo/sources/selventa/sfam.py +1 -6
  154. pyobo/sources/sgd.py +6 -11
  155. pyobo/sources/signor/__init__.py +7 -0
  156. pyobo/sources/signor/download.py +41 -0
  157. pyobo/sources/signor/signor_complexes.py +105 -0
  158. pyobo/sources/slm.py +12 -15
  159. pyobo/sources/umls/__init__.py +7 -1
  160. pyobo/sources/umls/__main__.py +0 -0
  161. pyobo/sources/umls/get_synonym_types.py +20 -4
  162. pyobo/sources/umls/sty.py +57 -0
  163. pyobo/sources/umls/synonym_types.tsv +1 -1
  164. pyobo/sources/umls/umls.py +18 -22
  165. pyobo/sources/unimod.py +46 -0
  166. pyobo/sources/uniprot/__init__.py +1 -1
  167. pyobo/sources/uniprot/uniprot.py +40 -32
  168. pyobo/sources/uniprot/uniprot_ptm.py +4 -34
  169. pyobo/sources/utils.py +3 -2
  170. pyobo/sources/wikipathways.py +7 -10
  171. pyobo/sources/zfin.py +5 -10
  172. pyobo/ssg/__init__.py +12 -16
  173. pyobo/ssg/base.html +0 -0
  174. pyobo/ssg/index.html +26 -13
  175. pyobo/ssg/term.html +12 -2
  176. pyobo/ssg/typedef.html +0 -0
  177. pyobo/struct/__init__.py +54 -8
  178. pyobo/struct/functional/__init__.py +1 -0
  179. pyobo/struct/functional/dsl.py +2572 -0
  180. pyobo/struct/functional/macros.py +423 -0
  181. pyobo/struct/functional/obo_to_functional.py +385 -0
  182. pyobo/struct/functional/ontology.py +270 -0
  183. pyobo/struct/functional/utils.py +112 -0
  184. pyobo/struct/reference.py +331 -136
  185. pyobo/struct/struct.py +1413 -643
  186. pyobo/struct/struct_utils.py +1078 -0
  187. pyobo/struct/typedef.py +162 -210
  188. pyobo/struct/utils.py +12 -5
  189. pyobo/struct/vocabulary.py +138 -0
  190. pyobo/utils/__init__.py +0 -0
  191. pyobo/utils/cache.py +13 -11
  192. pyobo/utils/io.py +17 -31
  193. pyobo/utils/iter.py +5 -5
  194. pyobo/utils/misc.py +41 -53
  195. pyobo/utils/ndex_utils.py +0 -0
  196. pyobo/utils/path.py +76 -70
  197. pyobo/version.py +3 -3
  198. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
  199. pyobo-0.12.0.dist-info/RECORD +202 -0
  200. pyobo-0.12.0.dist-info/WHEEL +4 -0
  201. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
  202. {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
  203. pyobo/apps/__init__.py +0 -3
  204. pyobo/apps/cli.py +0 -24
  205. pyobo/apps/gilda/__init__.py +0 -3
  206. pyobo/apps/gilda/__main__.py +0 -8
  207. pyobo/apps/gilda/app.py +0 -48
  208. pyobo/apps/gilda/cli.py +0 -36
  209. pyobo/apps/gilda/templates/base.html +0 -33
  210. pyobo/apps/gilda/templates/home.html +0 -11
  211. pyobo/apps/gilda/templates/matches.html +0 -32
  212. pyobo/apps/mapper/__init__.py +0 -3
  213. pyobo/apps/mapper/__main__.py +0 -11
  214. pyobo/apps/mapper/cli.py +0 -37
  215. pyobo/apps/mapper/mapper.py +0 -187
  216. pyobo/apps/mapper/templates/base.html +0 -35
  217. pyobo/apps/mapper/templates/mapper_home.html +0 -64
  218. pyobo/aws.py +0 -162
  219. pyobo/cli/aws.py +0 -47
  220. pyobo/identifier_utils.py +0 -142
  221. pyobo/normalizer.py +0 -232
  222. pyobo/registries/__init__.py +0 -16
  223. pyobo/registries/metaregistry.json +0 -507
  224. pyobo/registries/metaregistry.py +0 -135
  225. pyobo/sources/icd11.py +0 -105
  226. pyobo/xrefdb/__init__.py +0 -1
  227. pyobo/xrefdb/canonicalizer.py +0 -214
  228. pyobo/xrefdb/priority.py +0 -59
  229. pyobo/xrefdb/sources/__init__.py +0 -60
  230. pyobo/xrefdb/sources/biomappings.py +0 -36
  231. pyobo/xrefdb/sources/cbms2019.py +0 -91
  232. pyobo/xrefdb/sources/chembl.py +0 -83
  233. pyobo/xrefdb/sources/compath.py +0 -82
  234. pyobo/xrefdb/sources/famplex.py +0 -64
  235. pyobo/xrefdb/sources/gilda.py +0 -50
  236. pyobo/xrefdb/sources/intact.py +0 -113
  237. pyobo/xrefdb/sources/ncit.py +0 -133
  238. pyobo/xrefdb/sources/pubchem.py +0 -27
  239. pyobo/xrefdb/sources/wikidata.py +0 -116
  240. pyobo-0.11.1.dist-info/RECORD +0 -173
  241. pyobo-0.11.1.dist-info/WHEEL +0 -5
  242. pyobo-0.11.1.dist-info/top_level.txt +0 -1
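The bulk of this release is a rewrite of pyobo/reader.py, shown below. Among the additions visible in the diff is a from_str() entrypoint that reads an ontology directly from an in-memory OBO document. The following is a minimal sketch against the 0.12.0 signatures in the diff; the term content is illustrative, and the "ontology" header must normalize to a Bioregistry prefix for from_obonet() to accept it:

    from pyobo.reader import from_str

    # from_str() dedents the text and parses it with obonet.read_obo under the hood
    obo = from_str(
        """
        format-version: 1.2
        ontology: go

        [Term]
        id: GO:0000001
        name: mitochondrion inheritance
        """
    )

Note that strict now defaults to False (it defaulted to True in 0.11.1), so unparsable identifiers are logged as warnings rather than raising.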
pyobo/reader.py CHANGED
@@ -1,33 +1,59 @@
 """OBO Readers."""

+from __future__ import annotations
+
 import logging
+import typing as t
+from collections import Counter
 from collections.abc import Iterable, Mapping
 from datetime import datetime
+from io import StringIO
 from pathlib import Path
-from typing import Any, Optional, Union
+from textwrap import dedent
+from typing import Any

 import bioregistry
 import networkx as nx
+from curies import ReferenceTuple
+from curies.vocabulary import SynonymScope
 from more_itertools import pairwise
 from tqdm.auto import tqdm

 from .constants import DATE_FORMAT, PROVENANCE_PREFIXES
-from .identifier_utils import MissingPrefixError, normalize_curie
-from .registries import curie_has_blacklisted_prefix, curie_is_blacklisted, remap_prefix
+from .identifier_utils import (
+    BlacklistedError,
+    NotCURIEError,
+    ParseError,
+    UnparsableIRIError,
+    _is_valid_identifier,
+    _parse_str_or_curie_or_uri_helper,
+    remap_prefix,
+    str_is_blacklisted,
+)
+from .reader_utils import (
+    _chomp_axioms,
+    _chomp_references,
+    _chomp_specificity,
+    _chomp_typedef,
+    _parse_provenance_list,
+)
 from .struct import (
     Obo,
     Reference,
     Synonym,
-    SynonymSpecificities,
-    SynonymSpecificity,
     SynonymTypeDef,
     Term,
     TypeDef,
+    default_reference,
     make_ad_hoc_ontology,
 )
-from .struct.struct import DEFAULT_SYNONYM_TYPE
-from .struct.typedef import default_typedefs, develops_from, has_part, part_of
-from .utils.misc import cleanup_version
+from .struct import vocabulary as v
+from .struct.reference import OBOLiteral, _obo_parse_identifier
+from .struct.struct_utils import Annotation, Stanza
+from .struct.typedef import comment as has_comment
+from .struct.typedef import default_typedefs, has_ontology_root_term
+from .utils.cache import write_gzipped_graph
+from .utils.misc import STATIC_VERSION_REWRITES, cleanup_version

 __all__ = [
     "from_obo_path",
@@ -36,369 +62,1032 @@ __all__ = [

 logger = logging.getLogger(__name__)

-# FIXME use bioontologies
-# RELATION_REMAPPINGS: Mapping[str, Tuple[str, str]] = bioontologies.upgrade.load()
-RELATION_REMAPPINGS: Mapping[str, tuple[str, str]] = {
-    "part_of": part_of.pair,
-    "has_part": has_part.pair,
-    "develops_from": develops_from.pair,
-    "seeAlso": ("rdf", "seeAlso"),
-    "dc-contributor": ("dc", "contributor"),
-    "dc-creator": ("dc", "creator"),
-}
-

 def from_obo_path(
-    path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True, **kwargs
+    path: str | Path,
+    prefix: str | None = None,
+    *,
+    strict: bool = False,
+    version: str | None,
+    upgrade: bool = True,
+    use_tqdm: bool = False,
+    ignore_obsolete: bool = False,
+    _cache_path: Path | None = None,
 ) -> Obo:
     """Get the OBO graph from a path."""
-    import obonet
-
-    logger.info("[%s] parsing with obonet from %s", prefix or "", path)
-    with open(path) as file:
-        graph = obonet.read_obo(
-            tqdm(
-                file,
-                unit_scale=True,
-                desc=f'[{prefix or ""}] parsing obo',
-                disable=None,
-                leave=False,
-            )
-        )
+    path = Path(path).expanduser().resolve()
+    if path.suffix.endswith(".gz"):
+        import gzip
+
+        logger.info("[%s] parsing gzipped OBO with obonet from %s", prefix or "<unknown>", path)
+        with gzip.open(path, "rt") as file:
+            graph = _read_obo(file, prefix, ignore_obsolete=ignore_obsolete, use_tqdm=use_tqdm)
+    elif path.suffix.endswith(".zip"):
+        import io
+        import zipfile
+
+        logger.info("[%s] parsing zipped OBO with obonet from %s", prefix or "<unknown>", path)
+        with zipfile.ZipFile(path) as zf:
+            with zf.open(path.name.removesuffix(".zip"), "r") as file:
+                content = file.read().decode("utf-8")
+                graph = _read_obo(
+                    io.StringIO(content), prefix, ignore_obsolete=ignore_obsolete, use_tqdm=use_tqdm
+                )
+    else:
+        logger.info("[%s] parsing OBO with obonet from %s", prefix or "<unknown>", path)
+        with open(path) as file:
+            graph = _read_obo(file, prefix, ignore_obsolete=ignore_obsolete, use_tqdm=use_tqdm)

     if prefix:
         # Make sure the graph is named properly
         _clean_graph_ontology(graph, prefix)

+    if _cache_path:
+        logger.info("[%s] writing obonet cache to %s", prefix, _cache_path)
+        write_gzipped_graph(path=_cache_path, graph=graph)
+
     # Convert to an Obo instance and return
-    return from_obonet(graph, strict=strict, **kwargs)
+    return from_obonet(graph, strict=strict, version=version, upgrade=upgrade, use_tqdm=use_tqdm)
+
+
+def _read_obo(
+    filelike, prefix: str | None, ignore_obsolete: bool, use_tqdm: bool = True
+) -> nx.MultiDiGraph:
+    import obonet
+
+    return obonet.read_obo(
+        tqdm(
+            filelike,
+            unit_scale=True,
+            desc=f"[{prefix or ''}] parsing OBO",
+            disable=not use_tqdm,
+            leave=True,
+        ),
+        ignore_obsolete=ignore_obsolete,
+    )


-def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> "Obo":
+def _normalize_prefix_strict(prefix: str) -> str:
+    n = bioregistry.normalize_prefix(prefix)
+    if n is None:
+        raise ValueError(f"unknown prefix: {prefix}")
+    return n
+
+
+def from_str(
+    text: str,
+    *,
+    strict: bool = False,
+    version: str | None = None,
+    upgrade: bool = True,
+    ignore_obsolete: bool = False,
+    use_tqdm: bool = False,
+) -> Obo:
+    """Read an ontology from a string representation."""
+    import obonet
+
+    text = dedent(text).strip()
+    io = StringIO()
+    io.write(text)
+    io.seek(0)
+    graph = obonet.read_obo(io, ignore_obsolete=ignore_obsolete)
+    return from_obonet(graph, strict=strict, version=version, upgrade=upgrade, use_tqdm=use_tqdm)
+
+
+def from_obonet(
+    graph: nx.MultiDiGraph,
+    *,
+    strict: bool = False,
+    version: str | None = None,
+    upgrade: bool = True,
+    use_tqdm: bool = False,
+) -> Obo:
     """Get all of the terms from a OBO graph."""
-    _ontology = graph.graph["ontology"]
-    ontology = bioregistry.normalize_prefix(_ontology)  # probably always okay
-    if ontology is None:
-        raise ValueError(f"unknown prefix: {_ontology}")
-    logger.info("[%s] extracting OBO using obonet", ontology)
-
-    date = _get_date(graph=graph, ontology=ontology)
-    name = _get_name(graph=graph, ontology=ontology)
-
-    data_version = graph.graph.get("data-version")
-    if not data_version:
-        if date is not None:
-            data_version = date.strftime("%Y-%m-%d")
-            logger.info(
-                "[%s] does not report a version. falling back to date: %s",
-                ontology,
-                data_version,
-            )
-        else:
-            logger.warning("[%s] does not report a version nor a date", ontology)
-    else:
-        data_version = cleanup_version(data_version=data_version, prefix=ontology)
-        if data_version is not None:
-            logger.info("[%s] using version %s", ontology, data_version)
-        elif date is not None:
-            logger.info(
-                "[%s] unrecognized version format, falling back to date: %s",
-                ontology,
-                data_version,
-            )
-            data_version = date.strftime("%Y-%m-%d")
-        else:
-            logger.warning(
-                "[%s] UNRECOGNIZED VERSION FORMAT AND MISSING DATE: %s", ontology, data_version
-            )
+    ontology_prefix_raw = graph.graph["ontology"]
+    ontology_prefix = _normalize_prefix_strict(ontology_prefix_raw)
+    logger.info("[%s] extracting OBO using obonet", ontology_prefix)
+
+    date = _get_date(graph=graph, ontology_prefix=ontology_prefix)
+    name = _get_name(graph=graph, ontology_prefix=ontology_prefix)
+    imports = graph.graph.get("import")

+    macro_config = MacroConfig(graph.graph, strict=strict, ontology_prefix=ontology_prefix)
+
+    data_version = _clean_graph_version(
+        graph, ontology_prefix=ontology_prefix, version=version, date=date
+    )
     if data_version and "/" in data_version:
-        raise ValueError(f"[{ontology}] will not accept slash in data version: {data_version}")
-
-    #: Parsed CURIEs to references (even external ones)
-    reference_it = (
-        Reference(
-            prefix=prefix,
-            identifier=bioregistry.standardize_identifier(prefix, identifier),
-            # if name isn't available, it means its external to this ontology
-            name=data.get("name"),
+        raise ValueError(
+            f"[{ontology_prefix}] slashes not allowed in data versions because of filesystem usage: {data_version}"
         )
-        for prefix, identifier, data in _iter_obo_graph(graph=graph, strict=strict)
-    )
-    references: Mapping[tuple[str, str], Reference] = {
-        reference.pair: reference for reference in reference_it
-    }
+
+    missing_typedefs: set[ReferenceTuple] = set()
+
+    subset_typedefs = _get_subsetdefs(graph.graph, ontology_prefix=ontology_prefix)
+
+    root_terms: list[Reference] = []
+    property_values: list[Annotation] = []
+    for ann in iterate_node_properties(
+        graph.graph,
+        ontology_prefix=ontology_prefix,
+        upgrade=upgrade,
+        node=Reference(prefix="obo", identifier=ontology_prefix),
+        strict=strict,
+        context="graph property",
+    ):
+        if ann.predicate.pair == has_ontology_root_term.pair:
+            match ann.value:
+                case OBOLiteral():
+                    logger.warning(
+                        "[%s] tried to use a literal as an ontology root: %s",
+                        ontology_prefix,
+                        ann.value.value,
+                    )
+                    continue
+                case Reference():
+                    root_terms.append(ann.value)
+        else:
+            property_values.append(ann)
+
+    for remark in graph.graph.get("remark", []):
+        property_values.append(Annotation(has_comment.reference, OBOLiteral.string(remark)))
+
+    idspaces: dict[str, str] = {}
+    for x in graph.graph.get("idspace", []):
+        prefix, uri_prefix, *_ = (y.strip() for y in x.split(" ", 2))
+        idspaces[prefix] = uri_prefix

     #: CURIEs to typedefs
-    typedefs: Mapping[tuple[str, str], TypeDef] = {
-        typedef.pair: typedef for typedef in iterate_graph_typedefs(graph, ontology)
+    typedefs: Mapping[ReferenceTuple, TypeDef] = {
+        typedef.pair: typedef
+        for typedef in iterate_typedefs(
+            graph,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+            macro_config=macro_config,
+        )
     }

-    synonym_typedefs: Mapping[str, SynonymTypeDef] = {
-        synonym_typedef.curie: synonym_typedef
-        for synonym_typedef in iterate_graph_synonym_typedefs(graph, ontology=ontology)
+    synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] = {
+        synonym_typedef.pair: synonym_typedef
+        for synonym_typedef in iterate_graph_synonym_typedefs(
+            graph,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+        )
     }

-    missing_typedefs = set()
+    terms = _get_terms(
+        graph,
+        strict=strict,
+        ontology_prefix=ontology_prefix,
+        upgrade=upgrade,
+        typedefs=typedefs,
+        missing_typedefs=missing_typedefs,
+        synonym_typedefs=synonym_typedefs,
+        subset_typedefs=subset_typedefs,
+        macro_config=macro_config,
+        use_tqdm=use_tqdm,
+    )
+
+    return make_ad_hoc_ontology(
+        _ontology=ontology_prefix,
+        _name=name,
+        _auto_generated_by=graph.graph.get("auto-generated-by"),
+        _typedefs=list(typedefs.values()),
+        _synonym_typedefs=list(synonym_typedefs.values()),
+        _date=date,
+        _data_version=data_version,
+        _root_terms=root_terms,
+        terms=terms,
+        _property_values=property_values,
+        _subsetdefs=subset_typedefs,
+        _imports=imports,
+        _idspaces=idspaces,
+    )
+
+
+def _get_terms(
+    graph,
+    *,
+    strict: bool,
+    ontology_prefix: str,
+    upgrade: bool,
+    typedefs: Mapping[ReferenceTuple, TypeDef],
+    synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
+    subset_typedefs,
+    missing_typedefs: set[ReferenceTuple],
+    macro_config: MacroConfig,
+    use_tqdm: bool = False,
+) -> list[Term]:
     terms = []
-    n_alt_ids, n_parents, n_synonyms, n_relations, n_properties, n_xrefs = 0, 0, 0, 0, 0, 0
-    for prefix, identifier, data in _iter_obo_graph(graph=graph, strict=strict):
-        if prefix != ontology or not data:
+    for reference, data in _iter_obo_graph(
+        graph=graph,
+        strict=strict,
+        ontology_prefix=ontology_prefix,
+        use_tqdm=use_tqdm,
+        upgrade=upgrade,
+    ):
+        if reference.prefix != ontology_prefix:
+            continue
+        if not data:
+            # this allows us to skip anything that isn't really defined
+            # caveat: this misses terms that are just defined with an ID
             continue

-        identifier = bioregistry.standardize_identifier(prefix, identifier)
-        reference = references[ontology, identifier]
-
-        try:
-            node_xrefs = list(iterate_node_xrefs(prefix=prefix, data=data, strict=strict))
-        except MissingPrefixError as e:
-            e.reference = reference
-            raise e
-        xrefs, provenance = [], []
-        for node_xref in node_xrefs:
-            if node_xref.prefix in PROVENANCE_PREFIXES:
-                provenance.append(node_xref)
-            else:
-                xrefs.append(node_xref)
-        n_xrefs += len(xrefs)
+        term = Term(
+            reference=reference,
+            builtin=_get_boolean(data, "builtin"),
+            is_anonymous=_get_boolean(data, "is_anonymous"),
+            is_obsolete=_get_boolean(data, "is_obsolete"),
+            namespace=data.get("namespace"),
+        )

-        definition, definition_references = get_definition(
-            data, prefix=prefix, identifier=identifier
+        _process_alts(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_parents(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_synonyms(
+            term,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+            synonym_typedefs=synonym_typedefs,
+        )
+        _process_xrefs(
+            term,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            macro_config=macro_config,
+            upgrade=upgrade,
+        )
+        _process_properties(
+            term,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+            typedefs=typedefs,
         )
-        if definition_references:
-            provenance.extend(definition_references)
+        _process_relations(
+            term,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+            typedefs=typedefs,
+            missing_typedefs=missing_typedefs,
+        )
+        _process_replaced_by(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_subsets(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_intersection_of(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_union_of(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_equivalent_to(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_disjoint_from(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_consider(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_comment(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_description(term, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_creation_date(term, data)

-        try:
-            alt_ids = list(iterate_node_alt_ids(data, strict=strict))
-        except MissingPrefixError as e:
-            e.reference = reference
-            raise e
-        n_alt_ids += len(alt_ids)
+        terms.append(term)
+    return terms

-        try:
-            parents = list(
-                iterate_node_parents(
-                    data,
-                    prefix=prefix,
-                    identifier=identifier,
-                    strict=strict,
-                )
-            )
-        except MissingPrefixError as e:
-            e.reference = reference
-            raise e
-        n_parents += len(parents)

-        synonyms = list(
-            iterate_node_synonyms(
-                data,
-                synonym_typedefs,
-                prefix=prefix,
-                identifier=identifier,
-                strict=strict,
+def _process_description(term: Stanza, data, *, ontology_prefix: str, strict: bool):
+    definition, definition_references = get_definition(
+        data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
+    )
+    term.definition = definition
+    if term.definition:
+        for definition_reference in definition_references:
+            term._append_annotation(
+                v.has_description,
+                OBOLiteral.string(term.definition),
+                Annotation(v.has_dbxref, definition_reference),
             )
+
+
+def _process_comment(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    if comment := data.get("comment"):
+        term.append_comment(comment)
+
+
+def _process_creation_date(term: Stanza, data) -> None:
+    date_str = data.get("creation_date")
+    if not date_str:
+        return
+    if isinstance(date_str, list):
+        date_str = date_str[0]
+    try:
+        term.append_creation_date(date_str)
+    except ValueError:
+        logger.warning("[%s] failed to parse creation_date: %s", term.reference.curie, date_str)
+
+
+def _process_union_of(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    for reference in iterate_node_reference_tag(
+        "union_of", data=data, ontology_prefix=ontology_prefix, strict=strict, node=term.reference
+    ):
+        term.append_union_of(reference)
+
+
+def _process_equivalent_to(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    for reference in iterate_node_reference_tag(
+        "equivalent_to",
+        data=data,
+        ontology_prefix=ontology_prefix,
+        strict=strict,
+        node=term.reference,
+    ):
+        term.append_equivalent_to(reference)
+
+
+def _process_disjoint_from(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    for reference in iterate_node_reference_tag(
+        "disjoint_from",
+        data,
+        ontology_prefix=ontology_prefix,
+        strict=strict,
+        node=term.reference,
+    ):
+        term.append_disjoint_from(reference)
+
+
+def _process_alts(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    for alt_reference in iterate_node_reference_tag(
+        "alt_id", data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
+    ):
+        term.append_alt(alt_reference)
+
+
+def _process_parents(term: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    for tag in ["is_a", "instance_of"]:
+        for parent in iterate_node_reference_tag(
+            tag, data, node=term.reference, strict=strict, ontology_prefix=ontology_prefix
+        ):
+            term.append_parent(parent)
+
+
+def _process_synonyms(
+    term: Stanza,
+    data,
+    *,
+    ontology_prefix: str,
+    strict: bool,
+    upgrade: bool,
+    synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
+) -> None:
+    synonyms = list(
+        iterate_node_synonyms(
+            data,
+            synonym_typedefs,
+            node=term.reference,
+            strict=strict,
+            ontology_prefix=ontology_prefix,
+            upgrade=upgrade,
         )
-        n_synonyms += len(synonyms)
+    )
+    for synonym in synonyms:
+        term.append_synonym(synonym)

-        term = Term(
-            reference=reference,
-            definition=definition,
-            parents=parents,
-            synonyms=synonyms,
-            xrefs=xrefs,
-            provenance=provenance,
-            alt_ids=alt_ids,
+
+def _process_xrefs(
+    term: Stanza,
+    data,
+    *,
+    ontology_prefix: str,
+    strict: bool,
+    macro_config: MacroConfig,
+    upgrade: bool,
+) -> None:
+    for reference, provenance in iterate_node_xrefs(
+        data=data,
+        strict=strict,
+        ontology_prefix=ontology_prefix,
+        node=term.reference,
+        upgrade=upgrade,
+    ):
+        _handle_xref(term, reference, provenance=provenance, macro_config=macro_config)
+
+
+def _process_properties(
+    term: Stanza, data, *, ontology_prefix: str, strict: bool, upgrade: bool, typedefs
+) -> None:
+    for ann in iterate_node_properties(
+        data,
+        node=term.reference,
+        strict=strict,
+        ontology_prefix=ontology_prefix,
+        upgrade=upgrade,
+        context="stanza property",
+    ):
+        # TODO parse axioms
+        term.append_property(ann)
+
+
+def _process_relations(
+    term: Stanza,
+    data,
+    *,
+    ontology_prefix: str,
+    strict: bool,
+    upgrade: bool,
+    typedefs: Mapping[ReferenceTuple, TypeDef],
+    missing_typedefs: set[ReferenceTuple],
+) -> None:
+    relations_references = list(
+        iterate_node_relationships(
+            data,
+            node=term.reference,
+            strict=strict,
+            ontology_prefix=ontology_prefix,
+            upgrade=upgrade,
         )
+    )
+    for relation, reference in relations_references:
+        if (
+            relation.pair not in typedefs
+            and relation.pair not in default_typedefs
+            and relation.pair not in missing_typedefs
+        ):
+            missing_typedefs.add(relation.pair)
+            logger.warning("[%s] has no typedef for %s", ontology_prefix, relation.curie)
+            logger.debug("[%s] available typedefs: %s", ontology_prefix, set(typedefs))
+        # TODO parse axioms
+        term.append_relationship(relation, reference)
+
+
+def _process_replaced_by(stanza: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    for reference in iterate_node_reference_tag(
+        "replaced_by", data, node=stanza.reference, strict=strict, ontology_prefix=ontology_prefix
+    ):
+        stanza.append_replaced_by(reference)
+
+
+def _process_subsets(stanza: Stanza, data, *, ontology_prefix: str, strict: bool) -> None:
+    for reference in iterate_node_reference_tag(
+        "subset",
+        data,
+        node=stanza.reference,
+        strict=strict,
+        ontology_prefix=ontology_prefix,
+        counter=SUBSET_ERROR_COUNTER,
+    ):
+        stanza.append_subset(reference)
+
+
+def _get_boolean(data: Mapping[str, Any], tag: str) -> bool | None:
+    value = data.get(tag)
+    if value is None:
+        return None
+    if isinstance(value, list):
+        value = value[0]
+    if value == "false":
+        return False
+    if value == "true":
+        return True
+    raise ValueError(value)
+
+
+def _get_reference(
+    data: Mapping[str, Any], tag: str, *, ontology_prefix: str, strict: bool, **kwargs
+) -> Reference | None:
+    value = data.get(tag)
+    if value is None:
+        return None
+    if isinstance(value, list):
+        value = value[0]
+    return _obo_parse_identifier(
+        value, ontology_prefix=ontology_prefix, strict=strict, context=tag, **kwargs
+    )

-        try:
-            relations_references = list(
-                iterate_node_relationships(
-                    data,
-                    prefix=ontology,
-                    identifier=identifier,
-                    strict=strict,
+
+class MacroConfig:
+    """A configuration data class for reader macros."""
+
+    def __init__(
+        self, data: Mapping[str, list[str]] | None = None, *, strict: bool, ontology_prefix: str
+    ):
+        """Instantiate the configuration from obonet graph metadata."""
+        if data is None:
+            data = {}
+
+        self.treat_xrefs_as_equivalent: set[str] = set()
+        for prefix in data.get("treat-xrefs-as-equivalent", []):
+            prefix_norm = bioregistry.normalize_prefix(prefix)
+            if prefix_norm is None:
+                continue
+            self.treat_xrefs_as_equivalent.add(prefix_norm)
+
+        self.treat_xrefs_as_genus_differentia: dict[str, tuple[Reference, Reference]] = {}
+        for line in data.get("treat-xrefs-as-genus-differentia", []):
+            try:
+                gd_prefix, gd_predicate, gd_target = line.split()
+            except ValueError:
+                # this happens in `plana`, where there's an incorrectly written
+                # line `CARO part_of NCBITaxon:79327; CL part_of NCBITaxon:79327`
+                tqdm.write(
+                    f"[{ontology_prefix}] failed to parse treat-xrefs-as-genus-differentia: {line}"
                 )
+                continue
+
+            gd_prefix_norm = bioregistry.normalize_prefix(gd_prefix)
+            if gd_prefix_norm is None:
+                continue
+            gd_predicate_re = _obo_parse_identifier(
+                gd_predicate, ontology_prefix=ontology_prefix, strict=strict
             )
-        except MissingPrefixError as e:
-            e.reference = reference
-            raise e
-        for relation, reference in relations_references:
-            if (relation.prefix, relation.identifier) in typedefs:
-                typedef = typedefs[relation.prefix, relation.identifier]
-            elif (relation.prefix, relation.identifier) in default_typedefs:
-                typedef = default_typedefs[relation.prefix, relation.identifier]
-            else:
-                if (relation.prefix, relation.identifier) not in missing_typedefs:
-                    missing_typedefs.add((relation.prefix, relation.identifier))
-                    logger.warning("[%s] has no typedef for %s", ontology, relation)
-                    logger.debug("[%s] available typedefs: %s", ontology, set(typedefs))
+            if gd_predicate_re is None:
+                continue
+            gd_target_re = _obo_parse_identifier(
+                gd_target, ontology_prefix=ontology_prefix, strict=strict
+            )
+            if gd_target_re is None:
+                continue
+            self.treat_xrefs_as_genus_differentia[gd_prefix_norm] = (gd_predicate_re, gd_target_re)
+
+        self.treat_xrefs_as_relationship: dict[str, Reference] = {}
+        for line in data.get("treat-xrefs-as-relationship", []):
+            try:
+                gd_prefix, gd_predicate = line.split()
+            except ValueError:
+                tqdm.write(
+                    f"[{ontology_prefix}] failed to parse treat-xrefs-as-relationship: {line}"
+                )
                 continue
-            n_relations += 1
-            term.append_relationship(typedef, reference)
-        for prop, value in iterate_node_properties(data, term=term):
-            n_properties += 1
-            term.append_property(prop, value)
-        terms.append(term)

-    logger.info(
-        f"[{ontology}] got {len(references):,} references, {len(typedefs):,} typedefs, {len(terms):,} terms,"
-        f" {n_alt_ids:,} alt ids, {n_parents:,} parents, {n_synonyms:,} synonyms, {n_xrefs:,} xrefs,"
-        f" {n_relations:,} relations, and {n_properties:,} properties",
-    )
+            gd_prefix_norm = bioregistry.normalize_prefix(gd_prefix)
+            if gd_prefix_norm is None:
+                continue
+            gd_predicate_re = _obo_parse_identifier(
+                gd_predicate, ontology_prefix=ontology_prefix, strict=strict
+            )
+            if gd_predicate_re is None:
+                continue
+            self.treat_xrefs_as_relationship[gd_prefix_norm] = gd_predicate_re

-    return make_ad_hoc_ontology(
-        _ontology=ontology,
-        _name=name,
-        _auto_generated_by=graph.graph.get("auto-generated-by"),
-        _format_version=graph.graph.get("format-version"),
-        _typedefs=list(typedefs.values()),
-        _synonym_typedefs=list(synonym_typedefs.values()),
-        _date=date,
-        _data_version=data_version,
-        terms=terms,
-    )
+        self.treat_xrefs_as_is_a: set[str] = set()
+        for prefix in data.get("treat-xrefs-as-is_a", []):
+            gd_prefix_norm = bioregistry.normalize_prefix(prefix)
+            if gd_prefix_norm is None:
+                continue
+            self.treat_xrefs_as_is_a.add(gd_prefix_norm)
+
+
+def _handle_xref(
+    term: Stanza,
+    xref: Reference,
+    *,
+    provenance: list[Reference | OBOLiteral],
+    macro_config: MacroConfig | None = None,
+) -> Stanza:
+    annotations = [Annotation(v.has_dbxref, p) for p in provenance]
+
+    if macro_config is not None:
+        if xref.prefix in macro_config.treat_xrefs_as_equivalent:
+            return term.append_equivalent(xref, annotations=annotations)
+        elif object_property := macro_config.treat_xrefs_as_genus_differentia.get(xref.prefix):
+            # TODO how to add annotations here?
+            if annotations:
+                logger.warning(
+                    "[%s] unable to add provenance to xref upgraded to intersection_of: %s",
+                    term.reference.curie,
+                    xref,
+                )
+            return term.append_intersection_of(xref).append_intersection_of(object_property)
+        elif predicate := macro_config.treat_xrefs_as_relationship.get(xref.prefix):
+            return term.append_relationship(predicate, xref, annotations=annotations)
+        elif xref.prefix in macro_config.treat_xrefs_as_is_a:
+            return term.append_parent(xref, annotations=annotations)
+
+    # TODO this is not what spec calls for, maybe
+    # need a flag in macro config for this
+    if xref.prefix in PROVENANCE_PREFIXES:
+        return term.append_provenance(xref, annotations=annotations)
+
+    return term.append_xref(xref, annotations=annotations)
+
+
+SUBSET_ERROR_COUNTER: Counter[tuple[str, str]] = Counter()
+
+
+def _get_subsetdefs(graph: nx.MultiDiGraph, ontology_prefix: str) -> list[tuple[Reference, str]]:
+    rv = []
+    for subsetdef in graph.get("subsetdef", []):
+        left, _, right = subsetdef.partition(" ")
+        if not right:
+            logger.warning("[%s] subsetdef did not have two parts", ontology_prefix, subsetdef)
+            continue
+        left_ref = _obo_parse_identifier(
+            left,
+            ontology_prefix=ontology_prefix,
+            name=right,
+            line=subsetdef,
+            counter=SUBSET_ERROR_COUNTER,
+        )
+        if left_ref is None:
+            continue
+        right = right.strip('"')
+        rv.append((left_ref, right))
+    return rv


 def _clean_graph_ontology(graph, prefix: str) -> None:
     """Update the ontology entry in the graph's metadata, if necessary."""
     if "ontology" not in graph.graph:
-        logger.warning('[%s] missing "ontology" key', prefix)
+        logger.debug('[%s] missing "ontology" key', prefix)
         graph.graph["ontology"] = prefix
     elif not graph.graph["ontology"].isalpha():
-        logger.warning(
-            "[%s] ontology=%s has a strange format. replacing with prefix",
+        logger.debug(
+            "[%s] ontology prefix `%s` has a strange format. replacing with prefix",
             prefix,
             graph.graph["ontology"],
         )
         graph.graph["ontology"] = prefix


+def _clean_graph_version(
+    graph, ontology_prefix: str, version: str | None, date: datetime | None
+) -> str | None:
+    if ontology_prefix in STATIC_VERSION_REWRITES:
+        return STATIC_VERSION_REWRITES[ontology_prefix]
+
+    data_version: str | None = graph.graph.get("data-version") or None
+    if version:
+        clean_injected_version = cleanup_version(version, prefix=ontology_prefix)
+        if not data_version:
+            logger.debug(
+                "[%s] did not have a version, overriding with %s",
+                ontology_prefix,
+                clean_injected_version,
+            )
+            return clean_injected_version
+
+        clean_data_version = cleanup_version(data_version, prefix=ontology_prefix)
+        if clean_data_version != clean_injected_version:
+            # in this case, we're going to trust the one that's passed
+            # through explicitly more than the graph's content
+            logger.debug(
+                "[%s] had version %s, overriding with %s", ontology_prefix, data_version, version
+            )
+        return clean_injected_version
+
+    if data_version:
+        clean_data_version = cleanup_version(data_version, prefix=ontology_prefix)
+        logger.debug("[%s] using version %s", ontology_prefix, clean_data_version)
+        return clean_data_version
+
+    if date is not None:
+        derived_date_version = date.strftime("%Y-%m-%d")
+        logger.debug(
+            "[%s] does not report a version. falling back to date: %s",
+            ontology_prefix,
+            derived_date_version,
+        )
+        return derived_date_version
+
+    logger.debug("[%s] does not report a version nor a date", ontology_prefix)
+    return None
+
+
 def _iter_obo_graph(
     graph: nx.MultiDiGraph,
     *,
-    strict: bool = True,
-) -> Iterable[tuple[str, str, Mapping[str, Any]]]:
+    strict: bool = False,
+    ontology_prefix: str,
+    use_tqdm: bool = False,
+    upgrade: bool,
+) -> Iterable[tuple[Reference, Mapping[str, Any]]]:
     """Iterate over the nodes in the graph with the prefix stripped (if it's there)."""
-    for node, data in graph.nodes(data=True):
-        prefix, identifier = normalize_curie(node, strict=strict)
-        if prefix is None or identifier is None:
-            continue
-        yield prefix, identifier, data
-
-
-def _get_date(graph, ontology: str) -> Optional[datetime]:
+    for node, data in tqdm(
+        graph.nodes(data=True), disable=not use_tqdm, unit_scale=True, desc=f"[{ontology_prefix}]"
+    ):
+        name = data.get("name")
+        match _parse_str_or_curie_or_uri_helper(
+            node,
+            ontology_prefix=ontology_prefix,
+            name=name,
+            upgrade=upgrade,
+            context="stanza ID",
+        ):
+            case Reference() as reference:
+                yield reference, data
+            case NotCURIEError() as exc:
+                if _is_valid_identifier(node):
+                    yield default_reference(ontology_prefix, node, name=name), data
+                elif strict:
+                    raise exc
+                else:
+                    logger.warning(str(exc))
+            case ParseError() as exc:
+                if strict:
+                    raise exc
+                else:
+                    logger.warning(str(exc))
+            # if blacklisted, just skip it with no warning
+
+
+def _get_date(graph, ontology_prefix: str) -> datetime | None:
     try:
         rv = datetime.strptime(graph.graph["date"], DATE_FORMAT)
     except KeyError:
-        logger.info("[%s] does not report a date", ontology)
+        logger.info("[%s] does not report a date", ontology_prefix)
         return None
     except ValueError:
-        logger.info("[%s] reports a date that can't be parsed: %s", ontology, graph.graph["date"])
+        logger.info(
+            "[%s] reports a date that can't be parsed: %s", ontology_prefix, graph.graph["date"]
+        )
         return None
     else:
         return rv


-def _get_name(graph, ontology: str) -> str:
+def _get_name(graph, ontology_prefix: str) -> str:
     try:
         rv = graph.graph["name"]
     except KeyError:
-        logger.info("[%s] does not report a name", ontology)
-        rv = ontology
+        logger.info("[%s] does not report a name", ontology_prefix)
+        rv = ontology_prefix
     return rv


 def iterate_graph_synonym_typedefs(
-    graph: nx.MultiDiGraph, *, ontology: str, strict: bool = False
+    graph: nx.MultiDiGraph, *, ontology_prefix: str, strict: bool = False, upgrade: bool
 ) -> Iterable[SynonymTypeDef]:
     """Get synonym type definitions from an :mod:`obonet` graph."""
-    for s in graph.graph.get("synonymtypedef", []):
-        sid, name = s.split(" ", 1)
-        name = name.strip().strip('"')
-        if sid.startswith("http://") or sid.startswith("https://"):
-            reference = Reference.from_iri(sid, name=name)
-        elif ":" not in sid:  # assume it's ad-hoc
-            reference = Reference(prefix=ontology, identifier=sid, name=name)
-        else:  # assume it's a curie
-            reference = Reference.from_curie(sid, name=name, strict=strict)
-
-        if reference is None:
+    for line in graph.graph.get("synonymtypedef", []):
+        # TODO handle trailing comments
+        line, _, specificity = (x.strip() for x in line.rpartition('"'))
+        specificity = specificity.upper()
+        if not specificity:
+            specificity = None
+        elif specificity not in t.get_args(SynonymScope):
             if strict:
-                raise ValueError(f"Could not parse {sid}")
-            else:
-                continue
+                raise ValueError(f"invalid synonym specificty: {specificity}")
+            logger.warning("[%s] invalid synonym specificty: %s", ontology_prefix, specificity)
+            specificity = None

-        yield SynonymTypeDef(reference=reference)
+        curie, name = line.split(" ", 1)
+        # the name should be in quotes, so strip them out
+        name = name.strip().strip('"')
+        # TODO unquote the string?
+        reference = _obo_parse_identifier(
+            curie,
+            ontology_prefix=ontology_prefix,
+            name=name,
+            upgrade=upgrade,
+            strict=strict,
+        )
+        if reference is None:
+            logger.warning("[%s] unable to parse synonym typedef ID %s", ontology_prefix, curie)
+            continue
+        yield SynonymTypeDef(reference=reference, specificity=specificity)


-def iterate_graph_typedefs(
-    graph: nx.MultiDiGraph, default_prefix: str, *, strict: bool = True
+def iterate_typedefs(
+    graph: nx.MultiDiGraph,
+    *,
+    ontology_prefix: str,
+    strict: bool = False,
+    upgrade: bool,
+    macro_config: MacroConfig | None = None,
 ) -> Iterable[TypeDef]:
     """Get type definitions from an :mod:`obonet` graph."""
-    for typedef in graph.graph.get("typedefs", []):
-        if "id" in typedef:
-            curie = typedef["id"]
-        elif "identifier" in typedef:
-            curie = typedef["identifier"]
+    if macro_config is None:
+        macro_config = MacroConfig(strict=strict, ontology_prefix=ontology_prefix)
+    # can't really have a pre-defined set of synonym typedefs here!
+    synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef] = {}
+    typedefs: Mapping[ReferenceTuple, TypeDef] = {}
+    missing_typedefs: set[ReferenceTuple] = set()
+    for data in graph.graph.get("typedefs", []):
+        if "id" in data:
+            typedef_id = data["id"]
+        elif "identifier" in data:
+            typedef_id = data["identifier"]
         else:
-            raise KeyError
+            raise KeyError("typedef is missing an `id`")

-        name = typedef.get("name")
+        name = data.get("name")
         if name is None:
-            logger.debug("[%s] typedef %s is missing a name", graph.graph["ontology"], curie)
+            logger.debug("[%s] typedef %s is missing a name", ontology_prefix, typedef_id)

-        if ":" in curie:
-            reference = Reference.from_curie(curie, name=name, strict=strict)
-        else:
-            reference = Reference(prefix=graph.graph["ontology"], identifier=curie, name=name)
+        reference = _obo_parse_identifier(
+            typedef_id, strict=strict, ontology_prefix=ontology_prefix, name=name, upgrade=upgrade
+        )
         if reference is None:
-            logger.warning("[%s] unable to parse typedef CURIE %s", graph.graph["ontology"], curie)
+            logger.warning("[%s] unable to parse typedef ID %s", ontology_prefix, typedef_id)
             continue

-        xrefs = []
-        for curie in typedef.get("xref", []):
-            _xref = Reference.from_curie(curie, strict=strict)
-            if _xref:
-                xrefs.append(_xref)
-        yield TypeDef(reference=reference, xrefs=xrefs)
+        typedef = TypeDef(
+            reference=reference,
+            namespace=data.get("namespace"),
+            is_metadata_tag=_get_boolean(data, "is_metadata_tag"),
+            is_class_level=_get_boolean(data, "is_class_level"),
+            builtin=_get_boolean(data, "builtin"),
+            is_obsolete=_get_boolean(data, "is_obsolete"),
+            is_anonymous=_get_boolean(data, "is_anonymous"),
+            is_anti_symmetric=_get_boolean(data, "is_anti_symmetric"),
+            is_symmetric=_get_boolean(data, "is_symmetric"),
+            is_reflexive=_get_boolean(data, "is_reflexive"),
+            is_cyclic=_get_boolean(data, "is_cyclic"),
+            is_transitive=_get_boolean(data, "is_transitive"),
+            is_functional=_get_boolean(data, "is_functional"),
+            is_inverse_functional=_get_boolean(data, "is_inverse_functional"),
+            domain=_get_reference(data, "domain", ontology_prefix=ontology_prefix, strict=strict),
+            range=_get_reference(data, "range", ontology_prefix=ontology_prefix, strict=strict),
+            inverse=_get_reference(
+                data, "inverse_of", ontology_prefix=ontology_prefix, strict=strict
+            ),
+        )
+        _process_alts(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_parents(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_synonyms(
+            typedef,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+            synonym_typedefs=synonym_typedefs,
+        )
+        _process_xrefs(
+            typedef,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            macro_config=macro_config,
+            upgrade=upgrade,
+        )
+        _process_properties(
+            typedef,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+            typedefs=typedefs,
+        )
+        _process_relations(
+            typedef,
+            data,
+            ontology_prefix=ontology_prefix,
+            strict=strict,
+            upgrade=upgrade,
+            typedefs=typedefs,
+            missing_typedefs=missing_typedefs,
+        )
+        _process_replaced_by(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_subsets(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_intersection_of(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_union_of(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_equivalent_to(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_disjoint_from(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_consider(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_comment(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_description(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_creation_date(typedef, data)
+
+        # the next 4 are typedef-specific
+        _process_equivalent_to_chain(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        _process_holds_over_chain(typedef, data, ontology_prefix=ontology_prefix, strict=strict)
+        typedef.disjoint_over.extend(
+            iterate_node_reference_tag(
+                "disjoint_over",
+                data,
+                node=typedef.reference,
+                ontology_prefix=ontology_prefix,
+                strict=strict,
+            )
+        )
+        typedef.transitive_over.extend(
+            iterate_node_reference_tag(
+                "transitive_over",
+                data,
+                node=typedef.reference,
+                ontology_prefix=ontology_prefix,
+                strict=strict,
+            )
+        )
+
+        yield typedef
+
+
+def _process_consider(stanza: Stanza, data, *, ontology_prefix: str, strict: bool = False):
+    for reference in iterate_node_reference_tag(
+        "consider",
+        data,
+        node=stanza.reference,
+        ontology_prefix=ontology_prefix,
+        strict=strict,
+    ):
+        stanza.append_see_also(reference)
+
+
+def _process_equivalent_to_chain(
+    typedef: TypeDef, data, *, ontology_prefix: str, strict: bool = False
+) -> None:
+    for chain in _iterate_chain(
+        "equivalent_to_chain", typedef, data, ontology_prefix=ontology_prefix, strict=strict
+    ):
+        typedef.equivalent_to_chain.append(chain)
+
+
+def _process_holds_over_chain(
+    typedef: TypeDef, data, *, ontology_prefix: str, strict: bool = False
+) -> None:
+    for chain in _iterate_chain(
+        "holds_over_chain", typedef, data, ontology_prefix=ontology_prefix, strict=strict
+    ):
+        typedef.holds_over_chain.append(chain)
+
+
+def _iterate_chain(
+    tag: str, typedef: TypeDef, data, *, ontology_prefix: str, strict: bool = False
+) -> Iterable[list[Reference]]:
+    for chain in data.get(tag, []):
+        # chain is a list of CURIEs
+        predicate_chain = _process_chain_helper(typedef, chain, ontology_prefix=ontology_prefix)
+        if predicate_chain is None:
+            logger.warning(
+                "[%s - %s] could not parse line: %s: %s",
+                ontology_prefix,
+                typedef.curie,
+                tag,
+                chain,
+            )
+        else:
+            yield predicate_chain
+
+
+def _process_chain_helper(
+    term: Stanza, chain: str, ontology_prefix: str, strict: bool = False
+) -> list[Reference] | None:
+    rv = []
+    for curie in chain.split():
+        curie = curie.strip()
+        r = _obo_parse_identifier(
+            curie, ontology_prefix=ontology_prefix, strict=strict, node=term.reference
+        )
+        if r is None:
+            return None
+        rv.append(r)
+    return rv


 def get_definition(
-    data, *, prefix: str, identifier: str
-) -> Union[tuple[None, None], tuple[str, list[Reference]]]:
+    data, *, node: Reference, ontology_prefix: str, strict: bool = False
+) -> tuple[None | str, list[Reference | OBOLiteral]]:
     """Extract the definition from the data."""
     definition = data.get("def")  # it's allowed not to have a definition
     if not definition:
-        return None, None
-    return _extract_definition(definition, prefix=prefix, identifier=identifier)
+        return None, []
+    return _extract_definition(
+        definition, node=node, strict=strict, ontology_prefix=ontology_prefix
+    )


 def _extract_definition(
     s: str,
     *,
-    prefix: str,
-    identifier: str,
+    node: Reference,
     strict: bool = False,
-) -> Union[tuple[None, None], tuple[str, list[Reference]]]:
+    ontology_prefix: str,
+) -> tuple[None | str, list[Reference | OBOLiteral]]:
     """Extract the definitions."""
     if not s.startswith('"'):
-        raise ValueError("definition does not start with a quote")
+        logger.warning(f"[{node.curie}] definition does not start with a quote")
+        return None, []

     try:
         definition, rest = _quote_split(s)
-    except ValueError:
-        logger.warning("[%s:%s] could not parse definition: %s", prefix, identifier, s)
-        return None, None
+    except ValueError as e:
+        logger.warning("[%s] failed to parse definition quotes: %s", node.curie, str(e))
+        return None, []

-    if not rest.startswith("[") or not rest.endswith("]"):
-        logger.warning("[%s:%s] problem with definition: %s", prefix, identifier, s)
+    if not rest.startswith("["):
+        logger.debug("[%s] no square brackets for provenance on line: %s", node.curie, s)
         provenance = []
     else:
-        provenance = _parse_trailing_ref_list(rest, strict=strict)
-    return definition, provenance
+        rest = rest.lstrip("[").rstrip("]")  # FIXME this doesn't account for trailing annotations
+        provenance = _parse_provenance_list(
+            rest,
+            node=node,
+            ontology_prefix=ontology_prefix,
+            counter=DEFINITION_PROVENANCE_COUNTER,
+            scope_text="definition provenance",
+            line=s,
+            strict=strict,
+        )
+    return definition or None, provenance


-def _get_first_nonquoted(s: str) -> Optional[int]:
+def get_first_nonescaped_quote(s: str) -> int | None:
+    """Get the first non-escaped quote."""
+    if not s:
+        return None
+    if s[0] == '"':
+        # special case first position
+        return 0
     for i, (a, b) in enumerate(pairwise(s), start=1):
         if b == '"' and a != "\\":
             return i
@@ -406,10 +1095,12 @@ def _get_first_nonquoted(s: str) -> Optional[int]:
406
1095
 
407
1096
 
408
1097
  def _quote_split(s: str) -> tuple[str, str]:
409
- s = s.lstrip('"')
410
- i = _get_first_nonquoted(s)
1098
+ if not s.startswith('"'):
1099
+ raise ValueError(f"'{s}' does not start with a quote")
1100
+ s = s.removeprefix('"')
1101
+ i = get_first_nonescaped_quote(s)
411
1102
  if i is None:
412
- raise ValueError
1103
+ raise ValueError(f"no closing quote found in `{s}`")
413
1104
  return _clean_definition(s[:i].strip()), s[i + 1 :].strip()
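Note that the rewritten `_quote_split` strips only a single leading quote (`removeprefix`) instead of `lstrip('"')`, which would eat every leading quote. A runnable sketch of the unescaped-quote scan it relies on (logic mirrored from the diff; not a public API):

    from itertools import pairwise

    def first_nonescaped_quote(s: str) -> int | None:
        # index of the first '"' not preceded by a backslash
        if not s:
            return None
        if s[0] == '"':
            return 0
        for i, (a, b) in enumerate(pairwise(s), start=1):
            if b == '"' and a != "\\":
                return i
        return None

    s = 'some \\"escaped\\" text" [PMID:1234]'
    i = first_nonescaped_quote(s)
    assert i is not None and s[:i] == 'some \\"escaped\\" text'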


@@ -421,78 +1112,64 @@ def _clean_definition(s: str) -> str:

  def _extract_synonym(
      s: str,
-     synonym_typedefs: Mapping[str, SynonymTypeDef],
+     synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
      *,
-     prefix: str,
-     identifier: str,
-     strict: bool = True,
- ) -> Optional[Synonym]:
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
+ ) -> Synonym | None:
      # TODO check if the synonym is written like a CURIE... it shouldn't but I've seen it happen
      try:
          name, rest = _quote_split(s)
      except ValueError:
-         logger.warning("[%s:%s] invalid synonym: %s", prefix, identifier, s)
+         logger.warning("[%s] invalid synonym: %s", node.curie, s)
          return None

-     specificity: Optional[SynonymSpecificity] = None
-     for _specificity in SynonymSpecificities:
-         if rest.startswith(_specificity):
-             specificity = _specificity
-             rest = rest[len(_specificity) :].strip()
-             break
-
-     stype: Optional[SynonymTypeDef] = None
-     for _stype in synonym_typedefs.values():
-         # Since there aren't a lot of carefully defined synonym definitions, it
-         # can appear as a string or curie. Therefore, we might see temporary prefixes
-         # get added, so we should check against full curies as well as local unique
-         # identifiers
-         if rest.startswith(_stype.curie):
-             rest = rest[len(_stype.curie) :].strip()
-             stype = _stype
-             break
-         elif rest.startswith(_stype.preferred_curie):
-             rest = rest[len(_stype.preferred_curie) :].strip()
-             stype = _stype
-             break
-         elif rest.startswith(_stype.identifier):
-             rest = rest[len(_stype.identifier) :].strip()
-             stype = _stype
-             break
-
-     if not rest.startswith("[") or not rest.endswith("]"):
-         logger.warning("[%s:%s] problem with synonym: %s", prefix, identifier, s)
-         return None
+     specificity, rest = _chomp_specificity(rest)
+     synonym_typedef, rest = _chomp_typedef(
+         rest,
+         synonym_typedefs=synonym_typedefs,
+         strict=strict,
+         node=node,
+         ontology_prefix=ontology_prefix,
+         upgrade=upgrade,
+     )
+     provenance, rest = _chomp_references(
+         rest,
+         strict=strict,
+         node=node,
+         ontology_prefix=ontology_prefix,
+         line=s,
+     )
+     annotations = _chomp_axioms(rest, node=node, strict=strict)

-     provenance = _parse_trailing_ref_list(rest, strict=strict)
      return Synonym(
          name=name,
-         specificity=specificity or "EXACT",
-         type=stype or DEFAULT_SYNONYM_TYPE,
-         provenance=provenance,
+         specificity=specificity,
+         type=synonym_typedef.reference if synonym_typedef else None,
+         provenance=list(provenance or []),
+         annotations=annotations,
      )
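Synonym lines are now chomped left to right instead of being matched against rigid patterns: quoted name, optional specificity, optional synonym type, bracketed provenance, then trailing axioms. A simplified, runnable trace of the first two stages (the `_chomp_*` helpers in the diff do the real work, including typedef lookup):

    s = '"LTEC I" EXACT MISSPELLING [Orphanet:93938]'
    name, _, rest = s.partition('" ')           # stands in for _quote_split
    name = name.removeprefix('"')
    specificity, _, rest = rest.partition(" ")  # stands in for _chomp_specificity
    assert (name, specificity) == ("LTEC I", "EXACT")
    assert rest == "MISSPELLING [Orphanet:93938]"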


- def _parse_trailing_ref_list(rest, *, strict: bool = True):
-     rest = rest.lstrip("[").rstrip("]")
-     return [
-         Reference.from_curie(curie.strip(), strict=strict)
-         for curie in rest.split(",")
-         if curie.strip()
-     ]
+ #: A counter for errors in parsing provenance
+ DEFINITION_PROVENANCE_COUNTER: Counter[tuple[str, str]] = Counter()


  def iterate_node_synonyms(
      data: Mapping[str, Any],
-     synonym_typedefs: Mapping[str, SynonymTypeDef],
+     synonym_typedefs: Mapping[ReferenceTuple, SynonymTypeDef],
      *,
-     prefix: str,
-     identifier: str,
+     node: Reference,
      strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
  ) -> Iterable[Synonym]:
      """Extract synonyms from a :mod:`obonet` node's data.

-     Example strings:
+     Example strings
+
      - "LTEC I" EXACT [Orphanet:93938,DOI:xxxx]
      - "LTEC I" EXACT [Orphanet:93938]
      - "LTEC I" [Orphanet:93938]
@@ -500,121 +1177,407 @@ def iterate_node_synonyms(
      """
      for s in data.get("synonym", []):
          s = _extract_synonym(
-             s, synonym_typedefs, prefix=prefix, identifier=identifier, strict=strict
+             s,
+             synonym_typedefs,
+             node=node,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
          )
          if s is not None:
              yield s
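The `data` argument here is a raw :mod:`obonet` stanza mapping, so synonyms arrive as unparsed strings. A hedged sketch of the input shape (values invented for illustration):

    data = {
        "synonym": [
            '"LTEC I" EXACT [Orphanet:93938,DOI:xxxx]',
            '"LTEC I" [Orphanet:93938]',
        ],
    }
    # iterate_node_synonyms() walks this list; entries that
    # _extract_synonym() cannot parse are logged and skipped
    assert all(raw.startswith('"') for raw in data["synonym"])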


- HANDLED_PROPERTY_TYPES = {
-     "xsd:string": str,
-     "xsd:dateTime": datetime,
- }
-
-
  def iterate_node_properties(
-     data: Mapping[str, Any], *, property_prefix: Optional[str] = None, term=None
- ) -> Iterable[tuple[str, str]]:
+     data: Mapping[str, Any],
+     *,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
+     context: str,
+ ) -> Iterable[Annotation]:
      """Extract properties from a :mod:`obonet` node's data."""
      for prop_value_type in data.get("property_value", []):
-         try:
-             prop, value_type = prop_value_type.split(" ", 1)
-         except ValueError:
-             logger.info("malformed property: %s on %s", prop_value_type, term and term.curie)
-             continue
-         if property_prefix is not None and prop.startswith(property_prefix):
-             prop = prop[len(property_prefix) :]
+         if yv := _handle_prop(
+             prop_value_type,
+             node=node,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+             context=context,
+         ):
+             yield yv
+
+
+ #: Keep track of property-value pairs for which the value couldn't be parsed,
+ #: such as `dc:conformsTo autoimmune:inflammation.yaml` in MONDO
+ UNHANDLED_PROP_OBJECTS: Counter[tuple[str, str]] = Counter()
+
+ UNHANDLED_PROPS: Counter[tuple[str, str]] = Counter()
+
+
+ def _handle_prop(
+     prop_value_type: str,
+     *,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
+     context: str | None,
+ ) -> Annotation | None:
+     try:
+         prop, value_type = prop_value_type.split(" ", 1)
+     except ValueError:
+         logger.warning("[%s] property_value is missing a space: %s", node.curie, prop_value_type)
+         return None
+
+     prop_reference = _get_prop(
+         prop,
+         node=node,
+         strict=strict,
+         ontology_prefix=ontology_prefix,
+         upgrade=upgrade,
+         line=prop_value_type,
+         counter=UNHANDLED_PROPS,
+         context=context,
+     )
+     if prop_reference is None:
+         return None
+
+     value_type = value_type.strip()
+     datatype: Reference | None
+     if " " not in value_type:
+         value, datatype = value_type, None
+     else:
+         value, datatype_raw = (s.strip() for s in value_type.rsplit(" ", 1))
+         match _parse_str_or_curie_or_uri_helper(
+             datatype_raw,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             predicate=prop_reference,
+             line=prop_value_type,
+             upgrade=upgrade,
+             context="property datatype",
+         ):
+             case Reference() as datatype_:
+                 datatype = datatype_
+             case BlacklistedError():
+                 return None
+             case ParseError() as exc:
+                 if strict:
+                     raise exc
+                 else:
+                     logger.warning(str(exc))
+                     return None
+
+     # if it's an empty string, like the ones removed in
+     # https://github.com/oborel/obo-relations/pull/830, just quit
+     if value == '""':
+         return None
+
+     quoted = value.startswith('"') and value.endswith('"')
+     value = value.strip('"').strip()

+     # first, special case datetimes. Whether it's quoted or not,
+     # we always deal with this first
+     if datatype and datatype.curie == "xsd:dateTime":
          try:
-             value, _ = value_type.rsplit(" ", 1)  # second entry is the value type
+             obo_literal = OBOLiteral.datetime(value)
          except ValueError:
-             # logger.debug(f'property missing datatype. defaulting to string - {prop_value_type}')
-             value = value_type  # could assign type to be 'xsd:string' by default
-         value = value.strip('"')
-         yield prop, value
+             logger.warning(
+                 "[%s - %s] could not parse date: %s", node.curie, prop_reference.curie, value
+             )
+             return None
+         else:
+             return Annotation(prop_reference, obo_literal)
+
+     if datatype and datatype.curie == "xsd:anyURI":
+         match _parse_str_or_curie_or_uri_helper(
+             value,
+             node=node,
+             predicate=prop_reference,
+             ontology_prefix=ontology_prefix,
+             line=prop_value_type,
+             upgrade=upgrade,
+             context="property object",
+         ):
+             case Reference() as obj_reference:
+                 return Annotation(prop_reference, obj_reference)
+             case BlacklistedError():
+                 return None
+             case UnparsableIRIError():
+                 return Annotation(prop_reference, OBOLiteral.uri(value))
+             case ParseError() as exc:
+                 if strict:
+                     raise exc
+                 else:
+                     logger.warning(str(exc))
+                     return None
+
+     # if it's quoted and there's a datatype, try parsing as a CURIE/URI anyway
+     # (this is a bit aggressive, but more useful than the spec)
+     if quoted:
+         # give parsing a try anyway, just in case ;)
+         match _parse_str_or_curie_or_uri_helper(
+             value,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             line=prop_value_type,
+             upgrade=upgrade,
+             predicate=prop_reference,
+             context="property object",
+         ):
+             case Reference() as obj_reference:
+                 return Annotation(prop_reference, obj_reference)
+             case BlacklistedError():
+                 return None
+             case ParseError():
+                 if datatype:
+                     return Annotation(prop_reference, OBOLiteral(value, datatype, None))
+                 else:
+                     return Annotation(prop_reference, OBOLiteral.string(value))
+     else:
+         if datatype:
+             logger.debug(
+                 "[%s] throwing away datatype since no quotes were used: %s", node.curie, value_type
+             )
+
+         # if it wasn't quoted and there was no datatype, go for parsing as an object
+         match _obo_parse_identifier(
+             value,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             predicate=prop_reference,
+             line=prop_value_type,
+             context="property object",
+             counter=UNHANDLED_PROP_OBJECTS,
+         ):
+             case Reference() as obj_reference:
+                 return Annotation(prop_reference, obj_reference)
+             case None:
+                 return None
+
+
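To make the branching in `_handle_prop` concrete, these are the main `property_value` shapes it distinguishes (example lines adapted from common OBO usage; the stated outcomes follow a reading of the diff above, not documented behavior):

    cases = [
        # quoted literal with explicit xsd:dateTime -> OBOLiteral.datetime(...)
        'dcterms:date "2024-01-01T00:00:00Z" xsd:dateTime',
        # quoted literal, no datatype -> OBOLiteral.string(...) fallback
        'IAO:0000116 "an editor note"',
        # unquoted value, no datatype -> parsed as an object Reference
        'IAO:0100001 GO:0008150',
    ]
    for line in cases:
        prop, _, value_type = line.partition(" ")
        assert prop and value_type  # the first split _handle_prop performs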
+ def _get_prop(
+     property_id: str,
+     *,
+     node: Reference,
+     strict: bool,
+     ontology_prefix: str,
+     upgrade: bool,
+     line: str,
+     counter: Counter[tuple[str, str]] | None = None,
+     context: str | None = None,
+ ) -> Reference | None:
+     if rv := _parse_default_prop(property_id, ontology_prefix):
+         return rv
+     return _obo_parse_identifier(
+         property_id,
+         strict=strict,
+         node=node,
+         ontology_prefix=ontology_prefix,
+         upgrade=upgrade,
+         counter=counter,
+         context=context,
+         line=line,
+     )
+
+
+ def _parse_default_prop(property_id, ontology_prefix) -> Reference | None:
+     for delim in "#/":
+         sw = f"http://purl.obolibrary.org/obo/{ontology_prefix}{delim}"
+         if property_id.startswith(sw):
+             identifier = property_id.removeprefix(sw)
+             return default_reference(ontology_prefix, identifier)
+     return None
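`_parse_default_prop` catches ontology-local properties minted directly under the ontology's OBO PURL namespace. A standalone sketch of the match (same string logic as the diff; `go` is just an example prefix):

    def strip_default_purl(property_id: str, ontology_prefix: str) -> str | None:
        # returns the local part for IRIs like http://purl.obolibrary.org/obo/go#...
        for delim in "#/":
            start = f"http://purl.obolibrary.org/obo/{ontology_prefix}{delim}"
            if property_id.startswith(start):
                return property_id.removeprefix(start)
        return None

    assert strip_default_purl("http://purl.obolibrary.org/obo/go#systematic_synonym", "go") == "systematic_synonym"
    assert strip_default_purl("https://example.org/other", "go") is None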


- def iterate_node_parents(
+ def iterate_node_reference_tag(
+     tag: str,
      data: Mapping[str, Any],
      *,
-     prefix: str,
-     identifier: str,
-     strict: bool = True,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool = True,
+     counter: Counter[tuple[str, str]] | None = None,
  ) -> Iterable[Reference]:
-     """Extract parents from a :mod:`obonet` node's data."""
-     for parent_curie in data.get("is_a", []):
-         reference = Reference.from_curie(parent_curie, strict=strict)
+     """Extract a list of CURIEs from the data."""
+     for identifier in data.get(tag, []):
+         reference = _obo_parse_identifier(
+             identifier,
+             strict=strict,
+             node=node,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+             counter=counter,
+         )
          if reference is None:
              logger.warning(
-                 "[%s:%s] could not parse parent curie: %s", prefix, identifier, parent_curie
+                 "[%s] %s - could not parse identifier: %s", ontology_prefix, tag, identifier
              )
-             continue
-         yield reference
+         else:
+             yield reference
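`iterate_node_reference_tag` folds what used to be separate per-tag loops (the removed `iterate_node_parents` above and `iterate_node_alt_ids` below) into one parameterized helper. An illustrative call shape on a raw stanza mapping:

    data = {"is_a": ["GO:0008150", "GO:0003674"]}
    # roughly replaces the old iterate_node_parents(data, ...):
    #   iterate_node_reference_tag("is_a", data, node=..., ontology_prefix="go")
    # unparsable entries are logged and skipped rather than aborting the stanza
    assert list(data.get("is_a", [])) == ["GO:0008150", "GO:0003674"]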


- def iterate_node_alt_ids(data: Mapping[str, Any], *, strict: bool = True) -> Iterable[Reference]:
-     """Extract alternate identifiers from a :mod:`obonet` node's data."""
-     for curie in data.get("alt_id", []):
-         reference = Reference.from_curie(curie, strict=strict)
-         if reference is not None:
-             yield reference
+ def _process_intersection_of(
+     term: Stanza,
+     data: Mapping[str, Any],
+     *,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool = True,
+ ) -> None:
+     """Process the ``intersection_of`` tags in the data."""
+     for line in data.get("intersection_of", []):
+         predicate_id, _, target_id = line.partition(" ")
+         predicate = _obo_parse_identifier(
+             predicate_id,
+             strict=strict,
+             node=term.reference,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+         )
+         if predicate is None:
+             logger.warning("[%s] - could not parse intersection_of: %s", ontology_prefix, line)
+             continue
+
+         if target_id:
+             # there's a second part, so try parsing it, too
+             target = _obo_parse_identifier(
+                 target_id,
+                 strict=strict,
+                 node=term.reference,
+                 predicate=predicate,
+                 ontology_prefix=ontology_prefix,
+                 upgrade=upgrade,
+             )
+             if target is None:
+                 logger.warning(
+                     "[%s] could not parse intersection_of target: %s", ontology_prefix, line
+                 )
+                 continue
+             term.append_intersection_of(predicate, target)
+         else:
+             term.append_intersection_of(predicate)
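`_process_intersection_of` handles OBO's genus-differentia definitions: a one-token line names a genus class, and a two-token line pairs a relation with a filler. Illustrative stanza lines (GO-style; values invented):

    lines = [
        "GO:0008150",          # genus only -> append_intersection_of(predicate)
        "part_of GO:0005634",  # relation + target -> append_intersection_of(predicate, target)
    ]
    for line in lines:
        predicate_id, _, target_id = line.partition(" ")
        assert predicate_id
        # target_id is "" for the genus-only case, which selects the one-argument branch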


  def iterate_node_relationships(
      data: Mapping[str, Any],
      *,
-     prefix: str,
-     identifier: str,
-     strict: bool = True,
+     node: Reference,
+     strict: bool = False,
+     ontology_prefix: str,
+     upgrade: bool,
  ) -> Iterable[tuple[Reference, Reference]]:
      """Extract relationships from a :mod:`obonet` node's data."""
-     for s in data.get("relationship", []):
-         relation_curie, target_curie = s.split(" ")
-         relation_prefix: Optional[str]
-         relation_identifier: Optional[str]
-         if relation_curie in RELATION_REMAPPINGS:
-             relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie]
-         else:
-             relation_prefix, relation_identifier = normalize_curie(relation_curie, strict=strict)
-         if relation_prefix is not None and relation_identifier is not None:
-             relation = Reference(prefix=relation_prefix, identifier=relation_identifier)
-         elif prefix is not None:
-             relation = Reference(prefix=prefix, identifier=relation_curie)
-         else:
-             logger.debug("unhandled relation: %s", relation_curie)
-             relation = Reference(prefix="obo", identifier=relation_curie)
-
-         # TODO replace with omni-parser from :mod:`curies`
-         target = Reference.from_curie(target_curie, strict=strict)
-         if target is None:
-             logger.warning(
-                 "[%s:%s] %s could not parse target %s", prefix, identifier, relation, target_curie
-             )
-             continue
+     for line in data.get("relationship", []):
+         relation_curie, target_curie = line.split(" ")
+
+         predicate = _obo_parse_identifier(
+             relation_curie,
+             strict=strict,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             upgrade=upgrade,
+             line=line,
+             context="relationship predicate",
+         )
+         match predicate:
+             # TODO extend with other exception handling
+             case None:
+                 logger.warning("[%s] could not parse relation %s", node.curie, relation_curie)
+                 continue

-         yield relation, target
+         match _parse_str_or_curie_or_uri_helper(
+             target_curie,
+             ontology_prefix=ontology_prefix,
+             node=node,
+             predicate=predicate,
+             line=line,
+             context="relationship target",
+             upgrade=upgrade,
+         ):
+             case Reference() as target:
+                 yield predicate, target
+             case ParseError() as exc:
+                 if strict:
+                     raise exc
+                 else:
+                     logger.warning(str(exc))
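A `relationship` line splits into exactly one predicate and one target; the rewrite resolves the predicate first, so a bad predicate short-circuits before the target is touched. Illustrative input:

    line = "part_of GO:0005634"
    relation_curie, target_curie = line.split(" ")
    assert (relation_curie, target_curie) == ("part_of", "GO:0005634")
    # caveat: split(" ") raises ValueError on extra tokens (e.g. trailing
    # qualifiers), which this hunk does not appear to guard against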


  def iterate_node_xrefs(
-     *, prefix: str, data: Mapping[str, Any], strict: bool = True
- ) -> Iterable[Reference]:
+     *,
+     data: Mapping[str, Any],
+     strict: bool = False,
+     ontology_prefix: str,
+     node: Reference,
+     upgrade: bool,
+ ) -> Iterable[tuple[Reference, list[Reference | OBOLiteral]]]:
      """Extract xrefs from a :mod:`obonet` node's data."""
-     for xref in data.get("xref", []):
-         xref = xref.strip()
-
-         if curie_has_blacklisted_prefix(xref) or curie_is_blacklisted(xref) or ":" not in xref:
-             continue  # sometimes xref to self... weird
+     for line in data.get("xref", []):
+         line = line.strip()
+         if pair := _parse_xref_line(
+             line,
+             strict=strict,
+             node=node,
+             ontology_prefix=ontology_prefix,
+             upgrade=upgrade,
+         ):
+             yield pair
+
+
+ def _parse_xref_line(
+     line: str, *, strict: bool = False, ontology_prefix: str, node: Reference, upgrade: bool
+ ) -> tuple[Reference, list[Reference | OBOLiteral]] | None:
+     xref, _, rest = line.partition(" [")
+
+     if str_is_blacklisted(xref, ontology_prefix=ontology_prefix) or ":" not in xref:
+         return None  # sometimes xref to self... weird
+
+     xref = remap_prefix(xref, ontology_prefix=ontology_prefix)
+
+     split_space = " " in xref
+     if split_space:
+         _xref_split = xref.split(" ", 1)
+         if _xref_split[1][0] not in {'"', "("}:
+             logger.debug("[%s] Problem with space in xref %s", node.curie, xref)
+             return None
+         xref = _xref_split[0]
+
+     xref_ref = _parse_str_or_curie_or_uri_helper(
+         xref, ontology_prefix=ontology_prefix, node=node, line=line, context="xref", upgrade=upgrade
+     )
+     match xref_ref:
+         case BlacklistedError():
+             return None
+         case ParseError() as exc:
+             if strict:
+                 raise exc
+             else:
+                 if not XREF_PROVENANCE_COUNTER[ontology_prefix, xref]:
+                     logger.warning(str(exc))
+                 XREF_PROVENANCE_COUNTER[ontology_prefix, xref] += 1
+                 return None
+
+     if rest:
+         rest_front, _, _rest_rest = rest.partition("]")
+         provenance = _parse_provenance_list(
+             rest_front,
+             node=node,
+             ontology_prefix=ontology_prefix,
+             counter=XREF_PROVENANCE_COUNTER,
+             scope_text="xref provenance",
+             line=line,
+             strict=strict,
+         )
+     else:
+         provenance = []

-         xref = remap_prefix(xref)
+     return xref_ref, provenance

-         split_space = " " in xref
-         if split_space:
-             _xref_split = xref.split(" ", 1)
-             if _xref_split[1][0] not in {'"', "("}:
-                 logger.debug("[%s] Problem with space in xref %s", prefix, xref)
-                 continue
-             xref = _xref_split[0]

-         yv = Reference.from_curie(xref, strict=strict)
-         if yv is not None:
-             yield yv
+ XREF_PROVENANCE_COUNTER: Counter[tuple[str, str]] = Counter()
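The reworked xref path now keeps the bracketed provenance that the old implementation threw away. A sketch of the first split `_parse_xref_line` performs (example values invented):

    line = "ICD10:C34 [PMID:12345, PMID:67890]"
    xref, _, rest = line.partition(" [")
    assert xref == "ICD10:C34"
    assert rest == "PMID:12345, PMID:67890]"
    # rest is trimmed at "]" and handed to _parse_provenance_list(), producing
    # the (reference, provenance) pairs that iterate_node_xrefs() now yields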