pyobo 0.12.10__py3-none-any.whl → 0.12.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. pyobo/__init__.py +6 -0
  2. pyobo/api/__init__.py +11 -1
  3. pyobo/api/alts.py +18 -4
  4. pyobo/api/embedding.py +108 -9
  5. pyobo/api/names.py +28 -6
  6. pyobo/api/xrefs.py +21 -1
  7. pyobo/cli/cli.py +9 -3
  8. pyobo/cli/database.py +63 -22
  9. pyobo/cli/lookup.py +39 -24
  10. pyobo/cli/utils.py +6 -2
  11. pyobo/constants.py +66 -7
  12. pyobo/getters.py +8 -3
  13. pyobo/ner/api.py +17 -10
  14. pyobo/ner/scispacy_utils.py +2 -0
  15. pyobo/plugins.py +3 -1
  16. pyobo/sources/__init__.py +2 -0
  17. pyobo/sources/antibodyregistry.py +3 -3
  18. pyobo/sources/bigg/bigg_compartment.py +1 -1
  19. pyobo/sources/complexportal.py +3 -3
  20. pyobo/sources/conso.py +3 -3
  21. pyobo/sources/famplex.py +3 -3
  22. pyobo/sources/goldbook.py +86 -0
  23. pyobo/sources/hgnc/hgnc.py +157 -96
  24. pyobo/sources/hgnc/hgncgenefamily.py +14 -13
  25. pyobo/sources/msigdb.py +3 -3
  26. pyobo/sources/omim_ps.py +8 -2
  27. pyobo/sources/reactome.py +3 -3
  28. pyobo/sources/rgd.py +7 -11
  29. pyobo/sources/slm.py +3 -3
  30. pyobo/sources/uniprot/uniprot.py +3 -3
  31. pyobo/sources/wikipathways.py +7 -2
  32. pyobo/struct/__init__.py +2 -2
  33. pyobo/struct/functional/macros.py +1 -1
  34. pyobo/struct/functional/obo_to_functional.py +7 -3
  35. pyobo/struct/obo/reader.py +4 -4
  36. pyobo/struct/struct.py +48 -18
  37. pyobo/struct/struct_utils.py +19 -5
  38. pyobo/struct/typedef.py +19 -3
  39. pyobo/struct/vocabulary.py +6 -3
  40. pyobo/utils/path.py +5 -4
  41. pyobo/version.py +1 -1
  42. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/METADATA +45 -23
  43. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/RECORD +46 -45
  44. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/WHEEL +1 -1
  45. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/entry_points.txt +0 -0
  46. {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/licenses/LICENSE +0 -0
pyobo/ner/api.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import logging
5
6
  from collections.abc import Iterable
6
7
  from subprocess import CalledProcessError
7
8
  from typing import TYPE_CHECKING
@@ -22,6 +23,8 @@ __all__ = [
22
23
  "get_grounder",
23
24
  ]
24
25
 
26
+ logger = logging.getLogger(__name__)
27
+
25
28
 
26
29
  def get_grounder(
27
30
  prefixes: str | Iterable[str],
@@ -29,25 +32,29 @@ def get_grounder(
29
32
  grounder_cls: type[gilda.Grounder] | None = None,
30
33
  versions: None | str | Iterable[str | None] | dict[str, str] = None,
31
34
  skip_obsolete: bool = False,
35
+ raise_on_missing: bool = False,
32
36
  **kwargs: Unpack[GetOntologyKwargs],
33
37
  ) -> ssslm.Grounder:
34
38
  """Get a grounder for the given prefix(es)."""
35
- literal_mappings: list[LiteralMapping] = []
39
+ all_literal_mappings: list[LiteralMapping] = []
36
40
  it = _clean_prefix_versions(prefixes, versions=versions)
37
41
  disable = len(it) == 1 or not check_should_use_tqdm(kwargs)
38
42
  for prefix, kwargs["version"] in tqdm(it, leave=False, disable=disable):
39
43
  try:
40
- literal_mappings.extend(
41
- get_literal_mappings(
42
- prefix,
43
- skip_obsolete=skip_obsolete,
44
- **kwargs,
45
- )
46
- )
47
- except (NoBuildError, CalledProcessError):
44
+ literal_mappings = get_literal_mappings(prefix, skip_obsolete=skip_obsolete, **kwargs)
45
+ except (NoBuildError, CalledProcessError) as e:
46
+ logger.warning("[%s] unable to get literal mappings: %s", prefix, e)
48
47
  continue
48
+ else:
49
+ if not literal_mappings:
50
+ if raise_on_missing:
51
+ raise ValueError(f"no literal mappings were loaded for {prefix}")
52
+ logger.warning("[%s] no literal mappings loaded", prefix)
53
+ all_literal_mappings.extend(literal_mappings)
49
54
 
50
- return ssslm.make_grounder(literal_mappings, implementation="gilda", grounder_cls=grounder_cls)
55
+ return ssslm.make_grounder(
56
+ all_literal_mappings, implementation="gilda", grounder_cls=grounder_cls
57
+ )
51
58
 
52
59
 
53
60
  def _clean_prefix_versions(
@@ -227,6 +227,8 @@ def get_scispacy_entities(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> I
227
227
  # TODO reuse labels, synonyms, and definitions cache
228
228
  ontology = get_ontology(prefix, **kwargs)
229
229
  for term in ontology:
230
+ if not term.name or term.prefix != ontology.ontology:
231
+ continue
230
232
  yield Entity(
231
233
  concept_id=term.curie,
232
234
  canonical_name=term.name,
pyobo/plugins.py CHANGED
@@ -4,8 +4,10 @@ from __future__ import annotations
4
4
 
5
5
  from collections.abc import Callable, Iterable, Mapping
6
6
  from functools import lru_cache
7
+ from typing import TYPE_CHECKING
7
8
 
8
- from .struct import Obo
9
+ if TYPE_CHECKING:
10
+ from .struct import Obo
9
11
 
10
12
  __all__ = [
11
13
  "has_nomenclature_plugin",
pyobo/sources/__init__.py CHANGED
@@ -29,6 +29,7 @@ from .famplex import FamPlexGetter
29
29
  from .flybase import FlyBaseGetter
30
30
  from .gard import GARDGetter
31
31
  from .geonames import GeonamesFeatureGetter, GeonamesGetter
32
+ from .goldbook import GoldBookGetter
32
33
  from .gtdb import GTDBGetter
33
34
  from .gwascentral import GWASCentralPhenotypeGetter, GWASCentralStudyGetter
34
35
  from .hgnc import HGNCGetter, HGNCGroupGetter
@@ -110,6 +111,7 @@ __all__ = [
110
111
  "GWASCentralStudyGetter",
111
112
  "GeonamesFeatureGetter",
112
113
  "GeonamesGetter",
114
+ "GoldBookGetter",
113
115
  "HGNCGetter",
114
116
  "HGNCGroupGetter",
115
117
  "IANAGetter",
@@ -12,7 +12,7 @@ from tqdm.auto import tqdm
12
12
 
13
13
  from pyobo import Obo, Reference, Term
14
14
  from pyobo.api.utils import get_version
15
- from pyobo.struct.typedef import has_citation
15
+ from pyobo.struct.typedef import is_mentioned_by
16
16
  from pyobo.utils.path import ensure_df
17
17
 
18
18
  __all__ = [
@@ -47,7 +47,7 @@ class AntibodyRegistryGetter(Obo):
47
47
  """An ontology representation of the Antibody Registry."""
48
48
 
49
49
  ontology = bioversions_key = PREFIX
50
- typedefs = [has_citation]
50
+ typedefs = [is_mentioned_by]
51
51
 
52
52
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
53
53
  """Iterate over terms in the ontology."""
@@ -97,7 +97,7 @@ def iter_terms(*, force: bool = False, version: str | None = None) -> Iterable[T
97
97
  pubmed_id = pubmed_id.strip()
98
98
  if not pubmed_id:
99
99
  continue
100
- term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))
100
+ term.append_mentioned_by(Reference(prefix="pubmed", identifier=pubmed_id))
101
101
  yield term
102
102
 
103
103
 
@@ -55,7 +55,7 @@ def get_compartments(*, force: bool = False, version: str | None = None) -> dict
55
55
  """Get a dictionary of BiGG compartments."""
56
56
  rv = {}
57
57
  soup = get_soup(DATA_URL)
58
- table = soup.find(**{"class": "myTable"}) # type:ignore[arg-type]
58
+ table = soup.find(class_="myTable")
59
59
  if table is None:
60
60
  raise ValueError
61
61
  for row in table.find_all("tr"): # type:ignore[attr-defined]
@@ -14,8 +14,8 @@ from pyobo.struct import (
14
14
  Term,
15
15
  _parse_str_or_curie_or_uri,
16
16
  from_species,
17
- has_citation,
18
17
  has_part,
18
+ is_mentioned_by,
19
19
  )
20
20
  from pyobo.utils.path import ensure_df
21
21
 
@@ -157,7 +157,7 @@ class ComplexPortalGetter(Obo):
157
157
  """An ontology representation of the Complex Portal."""
158
158
 
159
159
  bioversions_key = ontology = PREFIX
160
- typedefs = [from_species, has_part, has_citation]
160
+ typedefs = [from_species, has_part, is_mentioned_by]
161
161
  root_terms = [ROOT]
162
162
 
163
163
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
@@ -240,7 +240,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
240
240
  if note == "identity":
241
241
  term.append_xref(reference)
242
242
  elif note == "see-also" and reference.prefix == "pubmed":
243
- term.append_provenance(reference)
243
+ term.append_mentioned_by(reference)
244
244
  elif (note, reference.prefix) not in unhandled_xref_type:
245
245
  logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
246
246
  unhandled_xref_type.add((note, reference.prefix))
pyobo/sources/conso.py CHANGED
@@ -4,7 +4,7 @@ from collections.abc import Iterable
4
4
 
5
5
  import pandas as pd
6
6
 
7
- from ..struct import Obo, Reference, Synonym, Term, _parse_str_or_curie_or_uri, has_citation
7
+ from ..struct import Obo, Reference, Synonym, Term, _parse_str_or_curie_or_uri, is_mentioned_by
8
8
  from ..utils.io import multidict
9
9
  from ..utils.path import ensure_df
10
10
 
@@ -25,7 +25,7 @@ class CONSOGetter(Obo):
25
25
 
26
26
  ontology = PREFIX
27
27
  dynamic_version = True
28
- typedefs = [has_citation]
28
+ typedefs = [is_mentioned_by]
29
29
 
30
30
  def iter_terms(self, force: bool = False) -> Iterable[Term]:
31
31
  """Iterate over terms in the ontology."""
@@ -71,7 +71,7 @@ def iter_terms() -> Iterable[Term]:
71
71
  continue
72
72
  reference = _parse_str_or_curie_or_uri(curie)
73
73
  if reference is not None:
74
- term.append_provenance(reference)
74
+ term.append_mentioned_by(reference)
75
75
  yield term
76
76
 
77
77
 
pyobo/sources/famplex.py CHANGED
@@ -9,7 +9,7 @@ from pystow.utils import get_commit
9
9
 
10
10
  from pyobo import get_name_id_mapping
11
11
  from pyobo.struct import Obo, Reference, Term, _parse_str_or_curie_or_uri
12
- from pyobo.struct.typedef import has_citation, has_member, has_part, is_a, part_of
12
+ from pyobo.struct.typedef import has_member, has_part, is_a, is_mentioned_by, part_of
13
13
  from pyobo.utils.io import multidict
14
14
  from pyobo.utils.path import ensure_df
15
15
 
@@ -23,7 +23,7 @@ class FamPlexGetter(Obo):
23
23
 
24
24
  ontology = PREFIX
25
25
  dynamic_version = True
26
- typedefs = [has_member, has_part, is_a, part_of, has_citation]
26
+ typedefs = [has_member, has_part, is_a, part_of, is_mentioned_by]
27
27
 
28
28
  def _get_version(self) -> str:
29
29
  return get_commit("sorgerlab", "famplex")
@@ -110,7 +110,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
110
110
  _parse_str_or_curie_or_uri(provenance) if isinstance(provenance, str) else None
111
111
  )
112
112
  if provenance_reference:
113
- term.append_provenance(provenance_reference)
113
+ term.append_mentioned_by(provenance_reference)
114
114
 
115
115
  for xref_reference in id_xrefs.get(entity, []):
116
116
  term.append_xref(xref_reference)
@@ -0,0 +1,86 @@
1
+ """An ontology representation of IUPAC Gold Book."""
2
+
3
+ import json.decoder
4
+ from collections.abc import Iterable
5
+
6
+ import pystow.utils
7
+ import requests
8
+ from tqdm import tqdm
9
+
10
+ from pyobo.struct import Obo, Reference, Term
11
+ from pyobo.utils.path import ensure_path
12
+
13
+ PREFIX = "goldbook"
14
+ URL = "https://goldbook.iupac.org/terms/index/all/json/download"
15
+ TERM_URL_FORMAT = "https://goldbook.iupac.org/terms/view/{}/json"
16
+
17
+
18
+ class GoldBookGetter(Obo):
19
+ """An ontology representation of IUPAC Gold Book."""
20
+
21
+ ontology = PREFIX
22
+ dynamic_version = True
23
+
24
+ def iter_terms(self, force: bool = False) -> Iterable[Term]:
25
+ """Iterate over terms in the ontology."""
26
+ return _iter_terms()
27
+
28
+
29
+ def _iter_terms() -> Iterable[Term]:
30
+ res = requests.get(URL, timeout=15).json()
31
+ for identifier in tqdm(res["terms"]["list"], unit_scale=True):
32
+ if term := _get_term(identifier):
33
+ yield term
34
+
35
+
36
+ def _get_term(identifier: str) -> Term | None:
37
+ url = TERM_URL_FORMAT.format(identifier)
38
+ try:
39
+ path = ensure_path(PREFIX, "terms", url=url, name=f"{identifier}.json")
40
+ except pystow.utils.DownloadError:
41
+ tqdm.write(f"[{PREFIX}:{identifier}] failed to download {url}")
42
+ return None
43
+
44
+ try:
45
+ with path.open() as file:
46
+ res = json.load(file)
47
+ except json.decoder.JSONDecodeError:
48
+ tqdm.write(f"[{PREFIX}:{identifier}] failed to parse data in {path}")
49
+ return None
50
+
51
+ record = res["term"]
52
+ definitions = record["definitions"]
53
+ if definitions:
54
+ definition = _clean(definitions[0]["text"])
55
+ else:
56
+ definition = None
57
+
58
+ term = Term(
59
+ reference=Reference(
60
+ prefix=PREFIX,
61
+ identifier=identifier,
62
+ name=record["title"].strip(),
63
+ ),
64
+ definition=definition,
65
+ )
66
+
67
+ if synonym := record.get("synonym"):
68
+ if synonym.startswith("<"):
69
+ if synonym.startswith("<em>synonym</em>:"):
70
+ synonym = synonym.removeprefix("<em>synonym</em>:")
71
+ term.append_synonym(_clean(synonym))
72
+ elif synonym.startswith("<em>synonyms</em>:"):
73
+ for s in synonym.removeprefix("<em>synonyms</em>:").strip().split(","):
74
+ term.append_synonym(_clean(s))
75
+ else:
76
+ tqdm.write(f"[{term.curie}] issue with synonym: {synonym}")
77
+
78
+ return term
79
+
80
+
81
+ def _clean(s: str) -> str:
82
+ return s.strip().replace("\\n", "\n")
83
+
84
+
85
+ if __name__ == "__main__":
86
+ GoldBookGetter.cli()
@@ -7,6 +7,7 @@ import typing
7
7
  from collections import Counter, defaultdict
8
8
  from collections.abc import Iterable
9
9
 
10
+ import obographs
10
11
  import pydantic
11
12
  from tabulate import tabulate
12
13
  from tqdm.auto import tqdm
@@ -14,22 +15,22 @@ from tqdm.auto import tqdm
14
15
  from pyobo.api.utils import get_version
15
16
  from pyobo.resources.so import get_so_name
16
17
  from pyobo.struct import (
18
+ Annotation,
17
19
  Obo,
20
+ OBOLiteral,
18
21
  Reference,
19
- SynonymTypeDef,
20
22
  Term,
21
- TypeDef,
22
- default_reference,
23
23
  from_species,
24
24
  gene_product_member_of,
25
- has_citation,
26
25
  has_gene_product,
26
+ is_mentioned_by,
27
27
  member_of,
28
28
  orthologous,
29
29
  transcribes_to,
30
30
  )
31
- from pyobo.struct.typedef import exact_match
32
- from pyobo.utils.path import ensure_path, prefix_directory_join
31
+ from pyobo.struct.struct import gene_symbol_synonym, previous_gene_symbol, previous_name
32
+ from pyobo.struct.typedef import comment, ends, exact_match, located_in, starts
33
+ from pyobo.utils.path import ensure_path
33
34
 
34
35
  __all__ = [
35
36
  "HGNCGetter",
@@ -43,26 +44,8 @@ DEFINITIONS_URL_FMT = (
43
44
  "hgnc_complete_set_{version}.json"
44
45
  )
45
46
 
46
- previous_symbol_type = SynonymTypeDef(
47
- reference=default_reference(PREFIX, "previous_symbol", name="previous symbol")
48
- )
49
- alias_symbol_type = SynonymTypeDef(
50
- reference=default_reference(PREFIX, "alias_symbol", name="alias symbol")
51
- )
52
- previous_name_type = SynonymTypeDef(
53
- reference=default_reference(PREFIX, "previous_name", name="previous name")
54
- )
55
- alias_name_type = SynonymTypeDef(
56
- reference=default_reference(PREFIX, "alias_name", name="alias name")
57
- )
58
- HAS_LOCUS_TYPE = TypeDef(
59
- reference=default_reference(PREFIX, "locus_type", name="has locus type"), is_metadata_tag=True
60
- )
61
- HAS_LOCUS_GROUP = TypeDef(
62
- reference=default_reference(PREFIX, "locus_group", name="has locus group"), is_metadata_tag=True
63
- )
64
- HAS_LOCATION = TypeDef(
65
- reference=default_reference(PREFIX, "location", name="has location"), is_metadata_tag=True
47
+ CHR_URL = (
48
+ "https://raw.githubusercontent.com/monarch-initiative/monochrom/refs/heads/master/chr.json"
66
49
  )
67
50
 
68
51
  #: First column is MIRIAM prefix, second column is HGNC key
@@ -157,7 +140,7 @@ LOCUS_TYPE_TO_SO = {
157
140
  "complex locus constituent": "0000997", # https://github.com/pyobo/pyobo/issues/118#issuecomment-1564520052
158
141
  # non-coding RNA
159
142
  "RNA, Y": "0002359",
160
- "RNA, cluster": "", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/564
143
+ "RNA, cluster": "0003001", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/564
161
144
  "RNA, long non-coding": "0002127", # HGNC links to wrong one
162
145
  "RNA, micro": "0001265",
163
146
  "RNA, misc": "0001266",
@@ -180,7 +163,7 @@ LOCUS_TYPE_TO_SO = {
180
163
  "fragile site": "0002349",
181
164
  "readthrough": "0000697", # maybe not right
182
165
  "transposable element": "0000111", # HGNC links to wrong one
183
- "virus integration site": "", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/551
166
+ "virus integration site": "0003002", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/551
184
167
  "region": "0001411", # a small bucket for things that need a better annotation, even higher than "gene"
185
168
  "unknown": "0000704", # gene
186
169
  None: "0000704", # gene
@@ -190,6 +173,14 @@ PUBLICATION_TERM = Term(
190
173
  reference=Reference(prefix="IAO", identifier="0000013", name="journal article")
191
174
  )
192
175
 
176
+ #: Indicates the cytogenetic location of the gene or region on the chromosome.
177
+ #: In the absence of that information one of the following may be listed.
178
+ QUALIFIERS = {
179
+ " not on reference assembly": "not on reference assembly -named gene is not annotated on the current version of the Genome Reference Consortium human reference assembly; may have been annotated on previous assembly versions or on a non-reference human assembly",
180
+ " unplaced": "unplaced - named gene is annotated on an unplaced/unlocalized scaffold of the human reference assembly",
181
+ " alternate reference locus": "reserved - named gene has never been annotated on any human assembly",
182
+ }
183
+
193
184
 
194
185
  class HGNCGetter(Obo):
195
186
  """An ontology representation of HGNC's gene nomenclature."""
@@ -203,16 +194,16 @@ class HGNCGetter(Obo):
203
194
  orthologous,
204
195
  member_of,
205
196
  exact_match,
206
- has_citation,
207
- HAS_LOCUS_GROUP,
208
- HAS_LOCUS_TYPE,
209
- HAS_LOCATION,
197
+ is_mentioned_by,
198
+ located_in,
199
+ starts,
200
+ ends,
201
+ comment,
210
202
  ]
211
203
  synonym_typedefs = [
212
- previous_name_type,
213
- previous_symbol_type,
214
- alias_name_type,
215
- alias_symbol_type,
204
+ previous_name,
205
+ previous_gene_symbol,
206
+ gene_symbol_synonym,
216
207
  ]
217
208
  root_terms = [
218
209
  Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
@@ -225,12 +216,28 @@ class HGNCGetter(Obo):
225
216
  return get_terms(force=force, version=self.data_version)
226
217
 
227
218
 
219
+ def _get_location_to_chr() -> dict[str, Reference]:
220
+ uri_prefix = "http://purl.obolibrary.org/obo/CHR_9606-chr"
221
+ graph: obographs.Graph = obographs.read(CHR_URL, squeeze=True)
222
+ rv = {}
223
+ for node in graph.nodes:
224
+ if node.id.startswith(uri_prefix):
225
+ identifier = node.id.removeprefix(uri_prefix)
226
+ rv[identifier] = Reference(
227
+ prefix="CHR", identifier=f"9606-chr{identifier}", name=node.lbl
228
+ )
229
+ return rv
230
+
231
+
228
232
  def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]:
229
233
  """Get HGNC terms."""
230
234
  if version is None:
231
235
  version = get_version("hgnc")
236
+
237
+ unhandled_locations: defaultdict[str, set[str]] = defaultdict(set)
238
+ location_to_chr = _get_location_to_chr()
239
+
232
240
  unhandled_entry_keys: typing.Counter[str] = Counter()
233
- unhandle_locus_types: defaultdict[str, dict[str, Term]] = defaultdict(dict)
234
241
  path = ensure_path(
235
242
  PREFIX,
236
243
  url=DEFINITIONS_URL_FMT.format(version=version),
@@ -352,7 +359,6 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
352
359
  xref_identifiers = [str(xref_identifiers)]
353
360
 
354
361
  if xref_prefix == "merops.entry":
355
- continue
356
362
  # e.g., XM02-001 should be rewritten as XM02.001
357
363
  xref_identifiers = [i.replace("-", ".") for i in xref_identifiers]
358
364
 
@@ -375,7 +381,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
375
381
  term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
376
382
 
377
383
  for pubmed_id in entry.pop("pubmed_id", []):
378
- term.append_provenance(Reference(prefix="pubmed", identifier=str(pubmed_id)))
384
+ term.append_mentioned_by(Reference(prefix="pubmed", identifier=str(pubmed_id)))
379
385
 
380
386
  gene_group_ids = entry.pop("gene_group_id", [])
381
387
  gene_groups = entry.pop("gene_group", [])
@@ -390,34 +396,118 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
390
396
  )
391
397
 
392
398
  for alias_symbol in entry.pop("alias_symbol", []):
393
- term.append_synonym(alias_symbol, type=alias_symbol_type)
399
+ term.append_synonym(alias_symbol, type=gene_symbol_synonym)
394
400
  for alias_name in entry.pop("alias_name", []):
395
- term.append_synonym(alias_name, type=alias_name_type)
401
+ # regular synonym, no type needed.
402
+ term.append_synonym(alias_name)
396
403
  for previous_symbol in itt.chain(
397
404
  entry.pop("previous_symbol", []), entry.pop("prev_symbol", [])
398
405
  ):
399
- term.append_synonym(previous_symbol, type=previous_symbol_type)
400
- for previous_name in entry.pop("prev_name", []):
401
- term.append_synonym(previous_name, type=previous_name_type)
406
+ term.append_synonym(previous_symbol, type=previous_gene_symbol)
407
+ for previous_name_ in entry.pop("prev_name", []):
408
+ term.append_synonym(previous_name_, type=previous_name)
409
+
410
+ location: str | None = entry.pop("location", None)
411
+ if location is not None and location not in {
412
+ "not on reference assembly",
413
+ "unplaced",
414
+ "reserved",
415
+ }:
416
+ annotations = []
417
+ for qualifier_suffix, qualifier_text in QUALIFIERS.items():
418
+ if location.endswith(qualifier_suffix):
419
+ location = location.removesuffix(qualifier_suffix)
420
+ annotations.append(
421
+ Annotation(
422
+ predicate=comment.reference, value=OBOLiteral.string(qualifier_text)
423
+ )
424
+ )
425
+ break
402
426
 
403
- for prop, td in [("location", HAS_LOCATION)]:
404
- value = entry.pop(prop, None)
405
- if value:
406
- term.annotate_string(td, value)
427
+ if location in location_to_chr:
428
+ term.append_relationship(
429
+ located_in, location_to_chr[location], annotations=annotations
430
+ )
431
+ elif location == "mitochondria":
432
+ term.append_relationship(
433
+ located_in,
434
+ Reference(prefix="go", identifier="0000262", name="mitochondrial chromosome"),
435
+ annotations=annotations,
436
+ )
437
+ elif " and " in location:
438
+ left, _, right = location.partition(" and ")
439
+ if left not in location_to_chr:
440
+ unhandled_locations[left].add(identifier)
441
+ elif right not in location_to_chr:
442
+ unhandled_locations[right].add(identifier)
443
+ elif left in location_to_chr and right in location_to_chr:
444
+ term.append_relationship(
445
+ located_in, location_to_chr[left], annotations=annotations
446
+ )
447
+ term.append_relationship(
448
+ located_in, location_to_chr[right], annotations=annotations
449
+ )
450
+ else:
451
+ unhandled_locations[location].add(identifier)
452
+ elif " or " in location:
453
+ left, _, right = location.partition(" or ")
454
+ if left not in location_to_chr:
455
+ unhandled_locations[left].add(identifier)
456
+ elif right not in location_to_chr:
457
+ unhandled_locations[right].add(identifier)
458
+ elif left in location_to_chr and right in location_to_chr:
459
+ # FIXME implement
460
+ unhandled_locations[location].add(identifier)
461
+ else:
462
+ unhandled_locations[location].add(identifier)
463
+ elif "-" in location:
464
+ start, _, end = location.partition("-")
465
+
466
+ # the range that starts with a q needs
467
+ # the chromosome moved over, like in
468
+ # 17q24.2-q24.3
469
+ if end.startswith("q"):
470
+ chr, _, _ = start.partition("q")
471
+ end = f"{chr}{end}"
472
+ # the range that starts with a p needs
473
+ # the chromosome moved over, like in
474
+ # 1p34.2-p34.1
475
+ elif end.startswith("p"):
476
+ chr, _, _ = start.partition("p")
477
+ end = f"{chr}{end}"
478
+
479
+ if start not in location_to_chr:
480
+ unhandled_locations[start].add(identifier)
481
+ elif end not in location_to_chr:
482
+ unhandled_locations[end].add(identifier)
483
+ elif start in location_to_chr and end in location_to_chr:
484
+ term.append_relationship(
485
+ starts, location_to_chr[start], annotations=annotations
486
+ )
487
+ term.append_relationship(ends, location_to_chr[end], annotations=annotations)
488
+ else:
489
+ unhandled_locations[location].add(identifier)
490
+ else:
491
+ unhandled_locations[location].add(identifier)
407
492
 
408
493
  locus_type = entry.pop("locus_type")
409
- locus_group = entry.pop("locus_group")
494
+ # note that locus group is a more broad category than locus type,
495
+ # and since we already have an exhaustive mapping from locus type
496
+ # to SO, then we can throw this annotation away
497
+ _locus_group = entry.pop("locus_group")
410
498
  so_id = LOCUS_TYPE_TO_SO.get(locus_type)
411
- if so_id:
412
- term.append_parent(Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
413
- else:
414
- term.append_parent(
415
- Reference(prefix="SO", identifier="0000704", name=get_so_name("0000704"))
416
- ) # gene
417
- unhandle_locus_types[locus_type][identifier] = term
418
- term.annotate_string(HAS_LOCUS_TYPE, locus_type)
419
- term.annotate_string(HAS_LOCUS_GROUP, locus_group)
420
-
499
+ if not so_id:
500
+ raise ValueError("""\
501
+ HGNC has updated their list of locus types, so the HGNC script is currently
502
+ incomplete. This can be fixed by updating the ``LOCUS_TYPE_TO_SO`` dictionary
503
+ to point to a new SO term. If there is none existing, then make a pull request
504
+ to https://github.com/The-Sequence-Ontology/SO-Ontologies like in
505
+ https://github.com/The-Sequence-Ontology/SO-Ontologies/pull/668. If the
506
+ maintainers aren't responsive, you can still use the proposed term before it's
507
+ accepted upstream like was done for SO:0003001 and SO:0003002
508
+ """)
509
+
510
+ term.append_parent(Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
421
511
  term.set_species(identifier="9606", name="Homo sapiens")
422
512
 
423
513
  for key in entry:
@@ -425,45 +515,16 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
425
515
  unhandled_entry_keys[key] += 1
426
516
  yield term
427
517
 
428
- with open(prefix_directory_join(PREFIX, name="unhandled.json"), "w") as file:
429
- json.dump(
430
- {
431
- k: {hgnc_id: term.name for hgnc_id, term in v.items()}
432
- for k, v in unhandle_locus_types.items()
433
- },
434
- file,
435
- indent=2,
436
- )
437
-
438
- with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
439
- for k, v in sorted(unhandle_locus_types.items()):
440
- t = tabulate(
441
- [
442
- (
443
- hgnc_id,
444
- term.name,
445
- term.is_obsolete,
446
- f"https://bioregistry.io/{term.curie}",
447
- ", ".join(
448
- f"https://bioregistry.io/{p.curie}"
449
- for p in term.provenance
450
- if isinstance(p, Reference)
451
- ),
452
- )
453
- for hgnc_id, term in sorted(v.items())
454
- ],
455
- headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
518
+ if unhandled_locations:
519
+ logger.warning(
520
+ "Unhandled chromosomal locations:\n\n%s\n",
521
+ tabulate(
522
+ [(k, len(vs), f"HGNC:{min(vs)}") for k, vs in unhandled_locations.items()],
523
+ headers=["location", "count", "example"],
456
524
  tablefmt="github",
457
- )
458
- print(f"## {k} ({len(v)})", file=file)
459
- print(t, "\n", file=file)
525
+ ),
526
+ )
460
527
 
461
- unhandle_locus_type_counter = Counter(
462
- {locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
463
- )
464
- logger.warning(
465
- "Unhandled locus types:\n%s", tabulate(unhandle_locus_type_counter.most_common())
466
- )
467
528
  if unhandled_entry_keys:
468
529
  logger.warning("Unhandled keys:\n%s", tabulate(unhandled_entry_keys.most_common()))
469
530