pyobo 0.12.10__py3-none-any.whl → 0.12.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/__init__.py +6 -0
- pyobo/api/__init__.py +11 -1
- pyobo/api/alts.py +18 -4
- pyobo/api/embedding.py +108 -9
- pyobo/api/names.py +28 -6
- pyobo/api/xrefs.py +21 -1
- pyobo/cli/cli.py +9 -3
- pyobo/cli/database.py +63 -22
- pyobo/cli/lookup.py +39 -24
- pyobo/cli/utils.py +6 -2
- pyobo/constants.py +66 -7
- pyobo/getters.py +8 -3
- pyobo/ner/api.py +17 -10
- pyobo/ner/scispacy_utils.py +2 -0
- pyobo/plugins.py +3 -1
- pyobo/sources/__init__.py +2 -0
- pyobo/sources/antibodyregistry.py +3 -3
- pyobo/sources/bigg/bigg_compartment.py +1 -1
- pyobo/sources/complexportal.py +3 -3
- pyobo/sources/conso.py +3 -3
- pyobo/sources/famplex.py +3 -3
- pyobo/sources/goldbook.py +86 -0
- pyobo/sources/hgnc/hgnc.py +157 -96
- pyobo/sources/hgnc/hgncgenefamily.py +14 -13
- pyobo/sources/msigdb.py +3 -3
- pyobo/sources/omim_ps.py +8 -2
- pyobo/sources/reactome.py +3 -3
- pyobo/sources/rgd.py +7 -11
- pyobo/sources/slm.py +3 -3
- pyobo/sources/uniprot/uniprot.py +3 -3
- pyobo/sources/wikipathways.py +7 -2
- pyobo/struct/__init__.py +2 -2
- pyobo/struct/functional/macros.py +1 -1
- pyobo/struct/functional/obo_to_functional.py +7 -3
- pyobo/struct/obo/reader.py +4 -4
- pyobo/struct/struct.py +48 -18
- pyobo/struct/struct_utils.py +19 -5
- pyobo/struct/typedef.py +19 -3
- pyobo/struct/vocabulary.py +6 -3
- pyobo/utils/path.py +5 -4
- pyobo/version.py +1 -1
- {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/METADATA +45 -23
- {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/RECORD +46 -45
- {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/WHEEL +1 -1
- {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.10.dist-info → pyobo-0.12.12.dist-info}/licenses/LICENSE +0 -0
pyobo/ner/api.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import logging
|
|
5
6
|
from collections.abc import Iterable
|
|
6
7
|
from subprocess import CalledProcessError
|
|
7
8
|
from typing import TYPE_CHECKING
|
|
@@ -22,6 +23,8 @@ __all__ = [
|
|
|
22
23
|
"get_grounder",
|
|
23
24
|
]
|
|
24
25
|
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
25
28
|
|
|
26
29
|
def get_grounder(
|
|
27
30
|
prefixes: str | Iterable[str],
|
|
@@ -29,25 +32,29 @@ def get_grounder(
|
|
|
29
32
|
grounder_cls: type[gilda.Grounder] | None = None,
|
|
30
33
|
versions: None | str | Iterable[str | None] | dict[str, str] = None,
|
|
31
34
|
skip_obsolete: bool = False,
|
|
35
|
+
raise_on_missing: bool = False,
|
|
32
36
|
**kwargs: Unpack[GetOntologyKwargs],
|
|
33
37
|
) -> ssslm.Grounder:
|
|
34
38
|
"""Get a grounder for the given prefix(es)."""
|
|
35
|
-
|
|
39
|
+
all_literal_mappings: list[LiteralMapping] = []
|
|
36
40
|
it = _clean_prefix_versions(prefixes, versions=versions)
|
|
37
41
|
disable = len(it) == 1 or not check_should_use_tqdm(kwargs)
|
|
38
42
|
for prefix, kwargs["version"] in tqdm(it, leave=False, disable=disable):
|
|
39
43
|
try:
|
|
40
|
-
literal_mappings
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
skip_obsolete=skip_obsolete,
|
|
44
|
-
**kwargs,
|
|
45
|
-
)
|
|
46
|
-
)
|
|
47
|
-
except (NoBuildError, CalledProcessError):
|
|
44
|
+
literal_mappings = get_literal_mappings(prefix, skip_obsolete=skip_obsolete, **kwargs)
|
|
45
|
+
except (NoBuildError, CalledProcessError) as e:
|
|
46
|
+
logger.warning("[%s] unable to get literal mappings: %s", prefix, e)
|
|
48
47
|
continue
|
|
48
|
+
else:
|
|
49
|
+
if not literal_mappings:
|
|
50
|
+
if raise_on_missing:
|
|
51
|
+
raise ValueError(f"no literal mappings were loaded for {prefix}")
|
|
52
|
+
logger.warning("[%s] no literal mappings loaded", prefix)
|
|
53
|
+
all_literal_mappings.extend(literal_mappings)
|
|
49
54
|
|
|
50
|
-
return ssslm.make_grounder(
|
|
55
|
+
return ssslm.make_grounder(
|
|
56
|
+
all_literal_mappings, implementation="gilda", grounder_cls=grounder_cls
|
|
57
|
+
)
|
|
51
58
|
|
|
52
59
|
|
|
53
60
|
def _clean_prefix_versions(
|
pyobo/ner/scispacy_utils.py
CHANGED
|
@@ -227,6 +227,8 @@ def get_scispacy_entities(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> I
|
|
|
227
227
|
# TODO reuse labels, synonyms, and definitions cache
|
|
228
228
|
ontology = get_ontology(prefix, **kwargs)
|
|
229
229
|
for term in ontology:
|
|
230
|
+
if not term.name or term.prefix != ontology.ontology:
|
|
231
|
+
continue
|
|
230
232
|
yield Entity(
|
|
231
233
|
concept_id=term.curie,
|
|
232
234
|
canonical_name=term.name,
|
pyobo/plugins.py
CHANGED
|
@@ -4,8 +4,10 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from collections.abc import Callable, Iterable, Mapping
|
|
6
6
|
from functools import lru_cache
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
7
8
|
|
|
8
|
-
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from .struct import Obo
|
|
9
11
|
|
|
10
12
|
__all__ = [
|
|
11
13
|
"has_nomenclature_plugin",
|
pyobo/sources/__init__.py
CHANGED
|
@@ -29,6 +29,7 @@ from .famplex import FamPlexGetter
|
|
|
29
29
|
from .flybase import FlyBaseGetter
|
|
30
30
|
from .gard import GARDGetter
|
|
31
31
|
from .geonames import GeonamesFeatureGetter, GeonamesGetter
|
|
32
|
+
from .goldbook import GoldBookGetter
|
|
32
33
|
from .gtdb import GTDBGetter
|
|
33
34
|
from .gwascentral import GWASCentralPhenotypeGetter, GWASCentralStudyGetter
|
|
34
35
|
from .hgnc import HGNCGetter, HGNCGroupGetter
|
|
@@ -110,6 +111,7 @@ __all__ = [
|
|
|
110
111
|
"GWASCentralStudyGetter",
|
|
111
112
|
"GeonamesFeatureGetter",
|
|
112
113
|
"GeonamesGetter",
|
|
114
|
+
"GoldBookGetter",
|
|
113
115
|
"HGNCGetter",
|
|
114
116
|
"HGNCGroupGetter",
|
|
115
117
|
"IANAGetter",
|
|
@@ -12,7 +12,7 @@ from tqdm.auto import tqdm
|
|
|
12
12
|
|
|
13
13
|
from pyobo import Obo, Reference, Term
|
|
14
14
|
from pyobo.api.utils import get_version
|
|
15
|
-
from pyobo.struct.typedef import
|
|
15
|
+
from pyobo.struct.typedef import is_mentioned_by
|
|
16
16
|
from pyobo.utils.path import ensure_df
|
|
17
17
|
|
|
18
18
|
__all__ = [
|
|
@@ -47,7 +47,7 @@ class AntibodyRegistryGetter(Obo):
|
|
|
47
47
|
"""An ontology representation of the Antibody Registry."""
|
|
48
48
|
|
|
49
49
|
ontology = bioversions_key = PREFIX
|
|
50
|
-
typedefs = [
|
|
50
|
+
typedefs = [is_mentioned_by]
|
|
51
51
|
|
|
52
52
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
53
53
|
"""Iterate over terms in the ontology."""
|
|
@@ -97,7 +97,7 @@ def iter_terms(*, force: bool = False, version: str | None = None) -> Iterable[T
|
|
|
97
97
|
pubmed_id = pubmed_id.strip()
|
|
98
98
|
if not pubmed_id:
|
|
99
99
|
continue
|
|
100
|
-
term.
|
|
100
|
+
term.append_mentioned_by(Reference(prefix="pubmed", identifier=pubmed_id))
|
|
101
101
|
yield term
|
|
102
102
|
|
|
103
103
|
|
|
@@ -55,7 +55,7 @@ def get_compartments(*, force: bool = False, version: str | None = None) -> dict
|
|
|
55
55
|
"""Get a dictionary of BiGG compartments."""
|
|
56
56
|
rv = {}
|
|
57
57
|
soup = get_soup(DATA_URL)
|
|
58
|
-
table = soup.find(
|
|
58
|
+
table = soup.find(class_="myTable")
|
|
59
59
|
if table is None:
|
|
60
60
|
raise ValueError
|
|
61
61
|
for row in table.find_all("tr"): # type:ignore[attr-defined]
|
pyobo/sources/complexportal.py
CHANGED
|
@@ -14,8 +14,8 @@ from pyobo.struct import (
|
|
|
14
14
|
Term,
|
|
15
15
|
_parse_str_or_curie_or_uri,
|
|
16
16
|
from_species,
|
|
17
|
-
has_citation,
|
|
18
17
|
has_part,
|
|
18
|
+
is_mentioned_by,
|
|
19
19
|
)
|
|
20
20
|
from pyobo.utils.path import ensure_df
|
|
21
21
|
|
|
@@ -157,7 +157,7 @@ class ComplexPortalGetter(Obo):
|
|
|
157
157
|
"""An ontology representation of the Complex Portal."""
|
|
158
158
|
|
|
159
159
|
bioversions_key = ontology = PREFIX
|
|
160
|
-
typedefs = [from_species, has_part,
|
|
160
|
+
typedefs = [from_species, has_part, is_mentioned_by]
|
|
161
161
|
root_terms = [ROOT]
|
|
162
162
|
|
|
163
163
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
@@ -240,7 +240,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
240
240
|
if note == "identity":
|
|
241
241
|
term.append_xref(reference)
|
|
242
242
|
elif note == "see-also" and reference.prefix == "pubmed":
|
|
243
|
-
term.
|
|
243
|
+
term.append_mentioned_by(reference)
|
|
244
244
|
elif (note, reference.prefix) not in unhandled_xref_type:
|
|
245
245
|
logger.debug(f"unhandled xref type: {note} / {reference.prefix}")
|
|
246
246
|
unhandled_xref_type.add((note, reference.prefix))
|
pyobo/sources/conso.py
CHANGED
|
@@ -4,7 +4,7 @@ from collections.abc import Iterable
|
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
7
|
-
from ..struct import Obo, Reference, Synonym, Term, _parse_str_or_curie_or_uri,
|
|
7
|
+
from ..struct import Obo, Reference, Synonym, Term, _parse_str_or_curie_or_uri, is_mentioned_by
|
|
8
8
|
from ..utils.io import multidict
|
|
9
9
|
from ..utils.path import ensure_df
|
|
10
10
|
|
|
@@ -25,7 +25,7 @@ class CONSOGetter(Obo):
|
|
|
25
25
|
|
|
26
26
|
ontology = PREFIX
|
|
27
27
|
dynamic_version = True
|
|
28
|
-
typedefs = [
|
|
28
|
+
typedefs = [is_mentioned_by]
|
|
29
29
|
|
|
30
30
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
31
31
|
"""Iterate over terms in the ontology."""
|
|
@@ -71,7 +71,7 @@ def iter_terms() -> Iterable[Term]:
|
|
|
71
71
|
continue
|
|
72
72
|
reference = _parse_str_or_curie_or_uri(curie)
|
|
73
73
|
if reference is not None:
|
|
74
|
-
term.
|
|
74
|
+
term.append_mentioned_by(reference)
|
|
75
75
|
yield term
|
|
76
76
|
|
|
77
77
|
|
pyobo/sources/famplex.py
CHANGED
|
@@ -9,7 +9,7 @@ from pystow.utils import get_commit
|
|
|
9
9
|
|
|
10
10
|
from pyobo import get_name_id_mapping
|
|
11
11
|
from pyobo.struct import Obo, Reference, Term, _parse_str_or_curie_or_uri
|
|
12
|
-
from pyobo.struct.typedef import
|
|
12
|
+
from pyobo.struct.typedef import has_member, has_part, is_a, is_mentioned_by, part_of
|
|
13
13
|
from pyobo.utils.io import multidict
|
|
14
14
|
from pyobo.utils.path import ensure_df
|
|
15
15
|
|
|
@@ -23,7 +23,7 @@ class FamPlexGetter(Obo):
|
|
|
23
23
|
|
|
24
24
|
ontology = PREFIX
|
|
25
25
|
dynamic_version = True
|
|
26
|
-
typedefs = [has_member, has_part, is_a, part_of,
|
|
26
|
+
typedefs = [has_member, has_part, is_a, part_of, is_mentioned_by]
|
|
27
27
|
|
|
28
28
|
def _get_version(self) -> str:
|
|
29
29
|
return get_commit("sorgerlab", "famplex")
|
|
@@ -110,7 +110,7 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
110
110
|
_parse_str_or_curie_or_uri(provenance) if isinstance(provenance, str) else None
|
|
111
111
|
)
|
|
112
112
|
if provenance_reference:
|
|
113
|
-
term.
|
|
113
|
+
term.append_mentioned_by(provenance_reference)
|
|
114
114
|
|
|
115
115
|
for xref_reference in id_xrefs.get(entity, []):
|
|
116
116
|
term.append_xref(xref_reference)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""An ontology representation of IUPAC Gold Book."""
|
|
2
|
+
|
|
3
|
+
import json.decoder
|
|
4
|
+
from collections.abc import Iterable
|
|
5
|
+
|
|
6
|
+
import pystow.utils
|
|
7
|
+
import requests
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
from pyobo.struct import Obo, Reference, Term
|
|
11
|
+
from pyobo.utils.path import ensure_path
|
|
12
|
+
|
|
13
|
+
PREFIX = "goldbook"
|
|
14
|
+
URL = "https://goldbook.iupac.org/terms/index/all/json/download"
|
|
15
|
+
TERM_URL_FORMAT = "https://goldbook.iupac.org/terms/view/{}/json"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GoldBookGetter(Obo):
|
|
19
|
+
"""An ontology representation of IUPAC Gold Book."""
|
|
20
|
+
|
|
21
|
+
ontology = PREFIX
|
|
22
|
+
dynamic_version = True
|
|
23
|
+
|
|
24
|
+
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
25
|
+
"""Iterate over terms in the ontology."""
|
|
26
|
+
return _iter_terms()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _iter_terms() -> Iterable[Term]:
|
|
30
|
+
res = requests.get(URL, timeout=15).json()
|
|
31
|
+
for identifier in tqdm(res["terms"]["list"], unit_scale=True):
|
|
32
|
+
if term := _get_term(identifier):
|
|
33
|
+
yield term
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _get_term(identifier: str) -> Term | None:
|
|
37
|
+
url = TERM_URL_FORMAT.format(identifier)
|
|
38
|
+
try:
|
|
39
|
+
path = ensure_path(PREFIX, "terms", url=url, name=f"{identifier}.json")
|
|
40
|
+
except pystow.utils.DownloadError:
|
|
41
|
+
tqdm.write(f"[{PREFIX}:{identifier}] failed to download {url}")
|
|
42
|
+
return None
|
|
43
|
+
|
|
44
|
+
try:
|
|
45
|
+
with path.open() as file:
|
|
46
|
+
res = json.load(file)
|
|
47
|
+
except json.decoder.JSONDecodeError:
|
|
48
|
+
tqdm.write(f"[{PREFIX}:{identifier}] failed to parse data in {path}")
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
record = res["term"]
|
|
52
|
+
definitions = record["definitions"]
|
|
53
|
+
if definitions:
|
|
54
|
+
definition = _clean(definitions[0]["text"])
|
|
55
|
+
else:
|
|
56
|
+
definition = None
|
|
57
|
+
|
|
58
|
+
term = Term(
|
|
59
|
+
reference=Reference(
|
|
60
|
+
prefix=PREFIX,
|
|
61
|
+
identifier=identifier,
|
|
62
|
+
name=record["title"].strip(),
|
|
63
|
+
),
|
|
64
|
+
definition=definition,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
if synonym := record.get("synonym"):
|
|
68
|
+
if synonym.startswith("<"):
|
|
69
|
+
if synonym.startswith("<em>synonym</em>:"):
|
|
70
|
+
synonym = synonym.removeprefix("<em>synonym</em>:")
|
|
71
|
+
term.append_synonym(_clean(synonym))
|
|
72
|
+
elif synonym.startswith("<em>synonyms</em>:"):
|
|
73
|
+
for s in synonym.removeprefix("<em>synonyms</em>:").strip().split(","):
|
|
74
|
+
term.append_synonym(_clean(s))
|
|
75
|
+
else:
|
|
76
|
+
tqdm.write(f"[{term.curie}] issue with synonym: {synonym}")
|
|
77
|
+
|
|
78
|
+
return term
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _clean(s: str) -> str:
|
|
82
|
+
return s.strip().replace("\\n", "\n")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
if __name__ == "__main__":
|
|
86
|
+
GoldBookGetter.cli()
|
pyobo/sources/hgnc/hgnc.py
CHANGED
|
@@ -7,6 +7,7 @@ import typing
|
|
|
7
7
|
from collections import Counter, defaultdict
|
|
8
8
|
from collections.abc import Iterable
|
|
9
9
|
|
|
10
|
+
import obographs
|
|
10
11
|
import pydantic
|
|
11
12
|
from tabulate import tabulate
|
|
12
13
|
from tqdm.auto import tqdm
|
|
@@ -14,22 +15,22 @@ from tqdm.auto import tqdm
|
|
|
14
15
|
from pyobo.api.utils import get_version
|
|
15
16
|
from pyobo.resources.so import get_so_name
|
|
16
17
|
from pyobo.struct import (
|
|
18
|
+
Annotation,
|
|
17
19
|
Obo,
|
|
20
|
+
OBOLiteral,
|
|
18
21
|
Reference,
|
|
19
|
-
SynonymTypeDef,
|
|
20
22
|
Term,
|
|
21
|
-
TypeDef,
|
|
22
|
-
default_reference,
|
|
23
23
|
from_species,
|
|
24
24
|
gene_product_member_of,
|
|
25
|
-
has_citation,
|
|
26
25
|
has_gene_product,
|
|
26
|
+
is_mentioned_by,
|
|
27
27
|
member_of,
|
|
28
28
|
orthologous,
|
|
29
29
|
transcribes_to,
|
|
30
30
|
)
|
|
31
|
-
from pyobo.struct.
|
|
32
|
-
from pyobo.
|
|
31
|
+
from pyobo.struct.struct import gene_symbol_synonym, previous_gene_symbol, previous_name
|
|
32
|
+
from pyobo.struct.typedef import comment, ends, exact_match, located_in, starts
|
|
33
|
+
from pyobo.utils.path import ensure_path
|
|
33
34
|
|
|
34
35
|
__all__ = [
|
|
35
36
|
"HGNCGetter",
|
|
@@ -43,26 +44,8 @@ DEFINITIONS_URL_FMT = (
|
|
|
43
44
|
"hgnc_complete_set_{version}.json"
|
|
44
45
|
)
|
|
45
46
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
)
|
|
49
|
-
alias_symbol_type = SynonymTypeDef(
|
|
50
|
-
reference=default_reference(PREFIX, "alias_symbol", name="alias symbol")
|
|
51
|
-
)
|
|
52
|
-
previous_name_type = SynonymTypeDef(
|
|
53
|
-
reference=default_reference(PREFIX, "previous_name", name="previous name")
|
|
54
|
-
)
|
|
55
|
-
alias_name_type = SynonymTypeDef(
|
|
56
|
-
reference=default_reference(PREFIX, "alias_name", name="alias name")
|
|
57
|
-
)
|
|
58
|
-
HAS_LOCUS_TYPE = TypeDef(
|
|
59
|
-
reference=default_reference(PREFIX, "locus_type", name="has locus type"), is_metadata_tag=True
|
|
60
|
-
)
|
|
61
|
-
HAS_LOCUS_GROUP = TypeDef(
|
|
62
|
-
reference=default_reference(PREFIX, "locus_group", name="has locus group"), is_metadata_tag=True
|
|
63
|
-
)
|
|
64
|
-
HAS_LOCATION = TypeDef(
|
|
65
|
-
reference=default_reference(PREFIX, "location", name="has location"), is_metadata_tag=True
|
|
47
|
+
CHR_URL = (
|
|
48
|
+
"https://raw.githubusercontent.com/monarch-initiative/monochrom/refs/heads/master/chr.json"
|
|
66
49
|
)
|
|
67
50
|
|
|
68
51
|
#: First column is MIRIAM prefix, second column is HGNC key
|
|
@@ -157,7 +140,7 @@ LOCUS_TYPE_TO_SO = {
|
|
|
157
140
|
"complex locus constituent": "0000997", # https://github.com/pyobo/pyobo/issues/118#issuecomment-1564520052
|
|
158
141
|
# non-coding RNA
|
|
159
142
|
"RNA, Y": "0002359",
|
|
160
|
-
"RNA, cluster": "", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/564
|
|
143
|
+
"RNA, cluster": "0003001", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/564
|
|
161
144
|
"RNA, long non-coding": "0002127", # HGNC links to wrong one
|
|
162
145
|
"RNA, micro": "0001265",
|
|
163
146
|
"RNA, misc": "0001266",
|
|
@@ -180,7 +163,7 @@ LOCUS_TYPE_TO_SO = {
|
|
|
180
163
|
"fragile site": "0002349",
|
|
181
164
|
"readthrough": "0000697", # maybe not right
|
|
182
165
|
"transposable element": "0000111", # HGNC links to wrong one
|
|
183
|
-
"virus integration site": "", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/551
|
|
166
|
+
"virus integration site": "0003002", # TODO see https://github.com/The-Sequence-Ontology/SO-Ontologies/issues/551
|
|
184
167
|
"region": "0001411", # a small bucket for things that need a better annotation, even higher than "gene"
|
|
185
168
|
"unknown": "0000704", # gene
|
|
186
169
|
None: "0000704", # gene
|
|
@@ -190,6 +173,14 @@ PUBLICATION_TERM = Term(
|
|
|
190
173
|
reference=Reference(prefix="IAO", identifier="0000013", name="journal article")
|
|
191
174
|
)
|
|
192
175
|
|
|
176
|
+
#: Indicates the cytogenetic location of the gene or region on the chromosome.
|
|
177
|
+
#: In the absence of that information one of the following may be listed.
|
|
178
|
+
QUALIFIERS = {
|
|
179
|
+
" not on reference assembly": "not on reference assembly -named gene is not annotated on the current version of the Genome Reference Consortium human reference assembly; may have been annotated on previous assembly versions or on a non-reference human assembly",
|
|
180
|
+
" unplaced": "unplaced - named gene is annotated on an unplaced/unlocalized scaffold of the human reference assembly",
|
|
181
|
+
" alternate reference locus": "reserved - named gene has never been annotated on any human assembly",
|
|
182
|
+
}
|
|
183
|
+
|
|
193
184
|
|
|
194
185
|
class HGNCGetter(Obo):
|
|
195
186
|
"""An ontology representation of HGNC's gene nomenclature."""
|
|
@@ -203,16 +194,16 @@ class HGNCGetter(Obo):
|
|
|
203
194
|
orthologous,
|
|
204
195
|
member_of,
|
|
205
196
|
exact_match,
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
197
|
+
is_mentioned_by,
|
|
198
|
+
located_in,
|
|
199
|
+
starts,
|
|
200
|
+
ends,
|
|
201
|
+
comment,
|
|
210
202
|
]
|
|
211
203
|
synonym_typedefs = [
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
alias_symbol_type,
|
|
204
|
+
previous_name,
|
|
205
|
+
previous_gene_symbol,
|
|
206
|
+
gene_symbol_synonym,
|
|
216
207
|
]
|
|
217
208
|
root_terms = [
|
|
218
209
|
Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id))
|
|
@@ -225,12 +216,28 @@ class HGNCGetter(Obo):
|
|
|
225
216
|
return get_terms(force=force, version=self.data_version)
|
|
226
217
|
|
|
227
218
|
|
|
219
|
+
def _get_location_to_chr() -> dict[str, Reference]:
|
|
220
|
+
uri_prefix = "http://purl.obolibrary.org/obo/CHR_9606-chr"
|
|
221
|
+
graph: obographs.Graph = obographs.read(CHR_URL, squeeze=True)
|
|
222
|
+
rv = {}
|
|
223
|
+
for node in graph.nodes:
|
|
224
|
+
if node.id.startswith(uri_prefix):
|
|
225
|
+
identifier = node.id.removeprefix(uri_prefix)
|
|
226
|
+
rv[identifier] = Reference(
|
|
227
|
+
prefix="CHR", identifier=f"9606-chr{identifier}", name=node.lbl
|
|
228
|
+
)
|
|
229
|
+
return rv
|
|
230
|
+
|
|
231
|
+
|
|
228
232
|
def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]:
|
|
229
233
|
"""Get HGNC terms."""
|
|
230
234
|
if version is None:
|
|
231
235
|
version = get_version("hgnc")
|
|
236
|
+
|
|
237
|
+
unhandled_locations: defaultdict[str, set[str]] = defaultdict(set)
|
|
238
|
+
location_to_chr = _get_location_to_chr()
|
|
239
|
+
|
|
232
240
|
unhandled_entry_keys: typing.Counter[str] = Counter()
|
|
233
|
-
unhandle_locus_types: defaultdict[str, dict[str, Term]] = defaultdict(dict)
|
|
234
241
|
path = ensure_path(
|
|
235
242
|
PREFIX,
|
|
236
243
|
url=DEFINITIONS_URL_FMT.format(version=version),
|
|
@@ -352,7 +359,6 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
|
|
|
352
359
|
xref_identifiers = [str(xref_identifiers)]
|
|
353
360
|
|
|
354
361
|
if xref_prefix == "merops.entry":
|
|
355
|
-
continue
|
|
356
362
|
# e.g., XM02-001 should be rewritten as XM02.001
|
|
357
363
|
xref_identifiers = [i.replace("-", ".") for i in xref_identifiers]
|
|
358
364
|
|
|
@@ -375,7 +381,7 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
|
|
|
375
381
|
term.append_xref(Reference(prefix=xref_prefix, identifier=str(xref_identifier)))
|
|
376
382
|
|
|
377
383
|
for pubmed_id in entry.pop("pubmed_id", []):
|
|
378
|
-
term.
|
|
384
|
+
term.append_mentioned_by(Reference(prefix="pubmed", identifier=str(pubmed_id)))
|
|
379
385
|
|
|
380
386
|
gene_group_ids = entry.pop("gene_group_id", [])
|
|
381
387
|
gene_groups = entry.pop("gene_group", [])
|
|
@@ -390,34 +396,118 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
|
|
|
390
396
|
)
|
|
391
397
|
|
|
392
398
|
for alias_symbol in entry.pop("alias_symbol", []):
|
|
393
|
-
term.append_synonym(alias_symbol, type=
|
|
399
|
+
term.append_synonym(alias_symbol, type=gene_symbol_synonym)
|
|
394
400
|
for alias_name in entry.pop("alias_name", []):
|
|
395
|
-
|
|
401
|
+
# regular synonym, no type needed.
|
|
402
|
+
term.append_synonym(alias_name)
|
|
396
403
|
for previous_symbol in itt.chain(
|
|
397
404
|
entry.pop("previous_symbol", []), entry.pop("prev_symbol", [])
|
|
398
405
|
):
|
|
399
|
-
term.append_synonym(previous_symbol, type=
|
|
400
|
-
for
|
|
401
|
-
term.append_synonym(
|
|
406
|
+
term.append_synonym(previous_symbol, type=previous_gene_symbol)
|
|
407
|
+
for previous_name_ in entry.pop("prev_name", []):
|
|
408
|
+
term.append_synonym(previous_name_, type=previous_name)
|
|
409
|
+
|
|
410
|
+
location: str | None = entry.pop("location", None)
|
|
411
|
+
if location is not None and location not in {
|
|
412
|
+
"not on reference assembly",
|
|
413
|
+
"unplaced",
|
|
414
|
+
"reserved",
|
|
415
|
+
}:
|
|
416
|
+
annotations = []
|
|
417
|
+
for qualifier_suffix, qualifier_text in QUALIFIERS.items():
|
|
418
|
+
if location.endswith(qualifier_suffix):
|
|
419
|
+
location = location.removesuffix(qualifier_suffix)
|
|
420
|
+
annotations.append(
|
|
421
|
+
Annotation(
|
|
422
|
+
predicate=comment.reference, value=OBOLiteral.string(qualifier_text)
|
|
423
|
+
)
|
|
424
|
+
)
|
|
425
|
+
break
|
|
402
426
|
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
427
|
+
if location in location_to_chr:
|
|
428
|
+
term.append_relationship(
|
|
429
|
+
located_in, location_to_chr[location], annotations=annotations
|
|
430
|
+
)
|
|
431
|
+
elif location == "mitochondria":
|
|
432
|
+
term.append_relationship(
|
|
433
|
+
located_in,
|
|
434
|
+
Reference(prefix="go", identifier="0000262", name="mitochondrial chromosome"),
|
|
435
|
+
annotations=annotations,
|
|
436
|
+
)
|
|
437
|
+
elif " and " in location:
|
|
438
|
+
left, _, right = location.partition(" and ")
|
|
439
|
+
if left not in location_to_chr:
|
|
440
|
+
unhandled_locations[left].add(identifier)
|
|
441
|
+
elif right not in location_to_chr:
|
|
442
|
+
unhandled_locations[right].add(identifier)
|
|
443
|
+
elif left in location_to_chr and right in location_to_chr:
|
|
444
|
+
term.append_relationship(
|
|
445
|
+
located_in, location_to_chr[left], annotations=annotations
|
|
446
|
+
)
|
|
447
|
+
term.append_relationship(
|
|
448
|
+
located_in, location_to_chr[right], annotations=annotations
|
|
449
|
+
)
|
|
450
|
+
else:
|
|
451
|
+
unhandled_locations[location].add(identifier)
|
|
452
|
+
elif " or " in location:
|
|
453
|
+
left, _, right = location.partition(" or ")
|
|
454
|
+
if left not in location_to_chr:
|
|
455
|
+
unhandled_locations[left].add(identifier)
|
|
456
|
+
elif right not in location_to_chr:
|
|
457
|
+
unhandled_locations[right].add(identifier)
|
|
458
|
+
elif left in location_to_chr and right in location_to_chr:
|
|
459
|
+
# FIXME implement
|
|
460
|
+
unhandled_locations[location].add(identifier)
|
|
461
|
+
else:
|
|
462
|
+
unhandled_locations[location].add(identifier)
|
|
463
|
+
elif "-" in location:
|
|
464
|
+
start, _, end = location.partition("-")
|
|
465
|
+
|
|
466
|
+
# the range that starts with a q needs
|
|
467
|
+
# the chromosome moved over, like in
|
|
468
|
+
# 17q24.2-q24.3
|
|
469
|
+
if end.startswith("q"):
|
|
470
|
+
chr, _, _ = start.partition("q")
|
|
471
|
+
end = f"{chr}{end}"
|
|
472
|
+
# the range that starts with a p needs
|
|
473
|
+
# the chromosome moved over, like in
|
|
474
|
+
# 1p34.2-p34.1
|
|
475
|
+
elif end.startswith("p"):
|
|
476
|
+
chr, _, _ = start.partition("p")
|
|
477
|
+
end = f"{chr}{end}"
|
|
478
|
+
|
|
479
|
+
if start not in location_to_chr:
|
|
480
|
+
unhandled_locations[start].add(identifier)
|
|
481
|
+
elif end not in location_to_chr:
|
|
482
|
+
unhandled_locations[end].add(identifier)
|
|
483
|
+
elif start in location_to_chr and end in location_to_chr:
|
|
484
|
+
term.append_relationship(
|
|
485
|
+
starts, location_to_chr[start], annotations=annotations
|
|
486
|
+
)
|
|
487
|
+
term.append_relationship(ends, location_to_chr[end], annotations=annotations)
|
|
488
|
+
else:
|
|
489
|
+
unhandled_locations[location].add(identifier)
|
|
490
|
+
else:
|
|
491
|
+
unhandled_locations[location].add(identifier)
|
|
407
492
|
|
|
408
493
|
locus_type = entry.pop("locus_type")
|
|
409
|
-
|
|
494
|
+
# note that locus group is a more broad category than locus type,
|
|
495
|
+
# and since we already have an exhaustive mapping from locus type
|
|
496
|
+
# to SO, then we can throw this annotation away
|
|
497
|
+
_locus_group = entry.pop("locus_group")
|
|
410
498
|
so_id = LOCUS_TYPE_TO_SO.get(locus_type)
|
|
411
|
-
if so_id:
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
499
|
+
if not so_id:
|
|
500
|
+
raise ValueError("""\
|
|
501
|
+
HGNC has updated their list of locus types, so the HGNC script is currently
|
|
502
|
+
incomplete. This can be fixed by updating the ``LOCUS_TYPE_TO_SO`` dictionary
|
|
503
|
+
to point to a new SO term. If there is none existing, then make a pull request
|
|
504
|
+
to https://github.com/The-Sequence-Ontology/SO-Ontologies like in
|
|
505
|
+
https://github.com/The-Sequence-Ontology/SO-Ontologies/pull/668. If the
|
|
506
|
+
maintainers aren't responsive, you can still use the proposed term before it's
|
|
507
|
+
accepted upstream like was done for SO:0003001 and SO:0003002
|
|
508
|
+
""")
|
|
509
|
+
|
|
510
|
+
term.append_parent(Reference(prefix="SO", identifier=so_id, name=get_so_name(so_id)))
|
|
421
511
|
term.set_species(identifier="9606", name="Homo sapiens")
|
|
422
512
|
|
|
423
513
|
for key in entry:
|
|
@@ -425,45 +515,16 @@ def get_terms(version: str | None = None, force: bool = False) -> Iterable[Term]
|
|
|
425
515
|
unhandled_entry_keys[key] += 1
|
|
426
516
|
yield term
|
|
427
517
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
for k,
|
|
433
|
-
|
|
434
|
-
file,
|
|
435
|
-
indent=2,
|
|
436
|
-
)
|
|
437
|
-
|
|
438
|
-
with open(prefix_directory_join(PREFIX, name="unhandled.md"), "w") as file:
|
|
439
|
-
for k, v in sorted(unhandle_locus_types.items()):
|
|
440
|
-
t = tabulate(
|
|
441
|
-
[
|
|
442
|
-
(
|
|
443
|
-
hgnc_id,
|
|
444
|
-
term.name,
|
|
445
|
-
term.is_obsolete,
|
|
446
|
-
f"https://bioregistry.io/{term.curie}",
|
|
447
|
-
", ".join(
|
|
448
|
-
f"https://bioregistry.io/{p.curie}"
|
|
449
|
-
for p in term.provenance
|
|
450
|
-
if isinstance(p, Reference)
|
|
451
|
-
),
|
|
452
|
-
)
|
|
453
|
-
for hgnc_id, term in sorted(v.items())
|
|
454
|
-
],
|
|
455
|
-
headers=["hgnc_id", "name", "obsolete", "link", "provenance"],
|
|
518
|
+
if unhandled_locations:
|
|
519
|
+
logger.warning(
|
|
520
|
+
"Unhandled chromosomal locations:\n\n%s\n",
|
|
521
|
+
tabulate(
|
|
522
|
+
[(k, len(vs), f"HGNC:{min(vs)}") for k, vs in unhandled_locations.items()],
|
|
523
|
+
headers=["location", "count", "example"],
|
|
456
524
|
tablefmt="github",
|
|
457
|
-
)
|
|
458
|
-
|
|
459
|
-
print(t, "\n", file=file)
|
|
525
|
+
),
|
|
526
|
+
)
|
|
460
527
|
|
|
461
|
-
unhandle_locus_type_counter = Counter(
|
|
462
|
-
{locus_type: len(d) for locus_type, d in unhandle_locus_types.items()}
|
|
463
|
-
)
|
|
464
|
-
logger.warning(
|
|
465
|
-
"Unhandled locus types:\n%s", tabulate(unhandle_locus_type_counter.most_common())
|
|
466
|
-
)
|
|
467
528
|
if unhandled_entry_keys:
|
|
468
529
|
logger.warning("Unhandled keys:\n%s", tabulate(unhandled_entry_keys.most_common()))
|
|
469
530
|
|