pyobo 0.10.11__py3-none-any.whl → 0.10.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/api/names.py +27 -6
- pyobo/api/utils.py +5 -0
- pyobo/cli/lookup.py +2 -2
- pyobo/constants.py +31 -1
- pyobo/gilda_utils.py +21 -0
- pyobo/identifier_utils.py +22 -5
- pyobo/reader.py +1 -1
- pyobo/sources/__init__.py +2 -0
- pyobo/sources/antibodyregistry.py +2 -2
- pyobo/sources/biogrid.py +3 -3
- pyobo/sources/credit.py +68 -0
- pyobo/sources/geonames.py +27 -9
- pyobo/sources/hgnc.py +2 -2
- pyobo/sources/mesh.py +3 -3
- pyobo/sources/msigdb.py +1 -1
- pyobo/sources/npass.py +1 -1
- pyobo/sources/pubchem.py +3 -3
- pyobo/sources/rgd.py +1 -1
- pyobo/sources/rhea.py +2 -2
- pyobo/sources/ror.py +67 -21
- pyobo/sources/uniprot/uniprot.py +2 -2
- pyobo/struct/struct.py +4 -3
- pyobo/struct/typedef.py +10 -0
- pyobo/utils/path.py +2 -1
- pyobo/version.py +1 -1
- pyobo/xrefdb/sources/__init__.py +6 -3
- pyobo/xrefdb/sources/chembl.py +5 -5
- pyobo/xrefdb/sources/pubchem.py +3 -2
- pyobo/xrefdb/sources/wikidata.py +8 -1
- {pyobo-0.10.11.dist-info → pyobo-0.10.12.dist-info}/METADATA +23 -23
- {pyobo-0.10.11.dist-info → pyobo-0.10.12.dist-info}/RECORD +35 -35
- {pyobo-0.10.11.dist-info → pyobo-0.10.12.dist-info}/WHEEL +1 -1
- pyobo/xrefdb/bengo.py +0 -44
- {pyobo-0.10.11.dist-info → pyobo-0.10.12.dist-info}/LICENSE +0 -0
- {pyobo-0.10.11.dist-info → pyobo-0.10.12.dist-info}/entry_points.txt +0 -0
- {pyobo-0.10.11.dist-info → pyobo-0.10.12.dist-info}/top_level.txt +0 -0
pyobo/api/names.py
CHANGED
|
@@ -2,11 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
"""High-level API for nomenclature."""
|
|
4
4
|
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
5
7
|
import logging
|
|
6
8
|
import subprocess
|
|
7
9
|
from functools import lru_cache
|
|
8
10
|
from typing import Callable, List, Mapping, Optional, Set, TypeVar
|
|
9
11
|
|
|
12
|
+
from curies import Reference, ReferenceTuple
|
|
13
|
+
|
|
10
14
|
from .alts import get_primary_identifier
|
|
11
15
|
from .utils import get_version
|
|
12
16
|
from ..getters import NoBuild, get_ontology
|
|
@@ -32,6 +36,8 @@ logger = logging.getLogger(__name__)
|
|
|
32
36
|
|
|
33
37
|
def get_name_by_curie(curie: str, *, version: Optional[str] = None) -> Optional[str]:
|
|
34
38
|
"""Get the name for a CURIE, if possible."""
|
|
39
|
+
if version is None:
|
|
40
|
+
version = get_version(curie.split(":")[0])
|
|
35
41
|
prefix, identifier = normalize_curie(curie)
|
|
36
42
|
if prefix and identifier:
|
|
37
43
|
return get_name(prefix, identifier, version=version)
|
|
@@ -40,7 +46,8 @@ def get_name_by_curie(curie: str, *, version: Optional[str] = None) -> Optional[
|
|
|
40
46
|
|
|
41
47
|
X = TypeVar("X")
|
|
42
48
|
|
|
43
|
-
NO_BUILD_PREFIXES = set()
|
|
49
|
+
NO_BUILD_PREFIXES: Set[str] = set()
|
|
50
|
+
NO_BUILD_LOGGED: Set = set()
|
|
44
51
|
|
|
45
52
|
|
|
46
53
|
def _help_get(
|
|
@@ -59,8 +66,10 @@ def _help_get(
|
|
|
59
66
|
logger.warning("[%s] unable to look up results with %s", prefix, f)
|
|
60
67
|
NO_BUILD_PREFIXES.add(prefix)
|
|
61
68
|
return None
|
|
62
|
-
except ValueError:
|
|
63
|
-
|
|
69
|
+
except ValueError as e:
|
|
70
|
+
if prefix not in NO_BUILD_PREFIXES:
|
|
71
|
+
logger.warning("[%s] value error while looking up results with %s: %s", prefix, f, e)
|
|
72
|
+
NO_BUILD_PREFIXES.add(prefix)
|
|
64
73
|
return None
|
|
65
74
|
|
|
66
75
|
if not mapping:
|
|
@@ -74,9 +83,17 @@ def _help_get(
|
|
|
74
83
|
|
|
75
84
|
|
|
76
85
|
@wrap_norm_prefix
|
|
77
|
-
def get_name(
|
|
86
|
+
def get_name(
|
|
87
|
+
prefix: str | Reference | ReferenceTuple,
|
|
88
|
+
identifier: Optional[str] = None,
|
|
89
|
+
/,
|
|
90
|
+
*,
|
|
91
|
+
version: Optional[str] = None,
|
|
92
|
+
) -> Optional[str]:
|
|
78
93
|
"""Get the name for an entity."""
|
|
79
|
-
|
|
94
|
+
if isinstance(prefix, (ReferenceTuple, Reference)):
|
|
95
|
+
prefix, identifier = prefix.prefix, prefix.identifier
|
|
96
|
+
return _help_get(get_id_name_mapping, prefix, identifier, version=version) # type:ignore
|
|
80
97
|
|
|
81
98
|
|
|
82
99
|
@lru_cache()
|
|
@@ -159,8 +176,12 @@ def get_name_id_mapping(
|
|
|
159
176
|
|
|
160
177
|
|
|
161
178
|
@wrap_norm_prefix
|
|
162
|
-
def get_definition(
|
|
179
|
+
def get_definition(
|
|
180
|
+
prefix: str, identifier: str | None = None, *, version: Optional[str] = None
|
|
181
|
+
) -> Optional[str]:
|
|
163
182
|
"""Get the definition for an entity."""
|
|
183
|
+
if identifier is None:
|
|
184
|
+
prefix, _, identifier = prefix.rpartition(":")
|
|
164
185
|
return _help_get(get_id_definition_mapping, prefix, identifier, version=version)
|
|
165
186
|
|
|
166
187
|
|
pyobo/api/utils.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import Optional
|
|
|
7
7
|
|
|
8
8
|
import bioversions
|
|
9
9
|
|
|
10
|
+
from ..constants import VERSION_PINS
|
|
10
11
|
from ..utils.path import prefix_directory_join
|
|
11
12
|
|
|
12
13
|
__all__ = [
|
|
@@ -25,6 +26,10 @@ def get_version(prefix: str) -> Optional[str]:
|
|
|
25
26
|
:param prefix: the resource name
|
|
26
27
|
:return: The version if available else None
|
|
27
28
|
"""
|
|
29
|
+
# Prioritize loaded environmental variable VERSION_PINS dictionary
|
|
30
|
+
version = VERSION_PINS.get(prefix)
|
|
31
|
+
if version:
|
|
32
|
+
return version
|
|
28
33
|
try:
|
|
29
34
|
version = bioversions.get_version(prefix)
|
|
30
35
|
except KeyError:
|
pyobo/cli/lookup.py
CHANGED
|
@@ -282,7 +282,7 @@ def ancestors(prefix: str, identifier: str, force: bool, version: Optional[str])
|
|
|
282
282
|
"""Look up ancestors."""
|
|
283
283
|
curies = get_ancestors(prefix=prefix, identifier=identifier, force=force, version=version)
|
|
284
284
|
for curie in sorted(curies or []):
|
|
285
|
-
click.echo(f"{curie}\t{get_name_by_curie(curie)}")
|
|
285
|
+
click.echo(f"{curie}\t{get_name_by_curie(curie, version=version)}")
|
|
286
286
|
|
|
287
287
|
|
|
288
288
|
@lookup.command()
|
|
@@ -295,7 +295,7 @@ def descendants(prefix: str, identifier: str, force: bool, version: Optional[str
|
|
|
295
295
|
"""Look up descendants."""
|
|
296
296
|
curies = get_descendants(prefix=prefix, identifier=identifier, force=force, version=version)
|
|
297
297
|
for curie in sorted(curies or []):
|
|
298
|
-
click.echo(f"{curie}\t{get_name_by_curie(curie)}")
|
|
298
|
+
click.echo(f"{curie}\t{get_name_by_curie(curie, version=version)}")
|
|
299
299
|
|
|
300
300
|
|
|
301
301
|
@lookup.command()
|
pyobo/constants.py
CHANGED
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
"""Constants for PyOBO."""
|
|
4
4
|
|
|
5
|
+
import json
|
|
5
6
|
import logging
|
|
7
|
+
import os
|
|
6
8
|
import re
|
|
7
9
|
|
|
8
10
|
import pystow
|
|
@@ -11,6 +13,7 @@ __all__ = [
|
|
|
11
13
|
"RAW_DIRECTORY",
|
|
12
14
|
"DATABASE_DIRECTORY",
|
|
13
15
|
"SPECIES_REMAPPING",
|
|
16
|
+
"VERSION_PINS",
|
|
14
17
|
]
|
|
15
18
|
|
|
16
19
|
logger = logging.getLogger(__name__)
|
|
@@ -80,7 +83,6 @@ TYPEDEFS_FILE = "typedefs.tsv.gz"
|
|
|
80
83
|
SPECIES_RECORD = "5334738"
|
|
81
84
|
SPECIES_FILE = "species.tsv.gz"
|
|
82
85
|
|
|
83
|
-
|
|
84
86
|
NCBITAXON_PREFIX = "NCBITaxon"
|
|
85
87
|
DATE_FORMAT = "%d:%m:%Y %H:%M"
|
|
86
88
|
PROVENANCE_PREFIXES = {
|
|
@@ -99,3 +101,31 @@ PROVENANCE_PREFIXES = {
|
|
|
99
101
|
"isbn",
|
|
100
102
|
"issn",
|
|
101
103
|
}
|
|
104
|
+
|
|
105
|
+
# Load version pin dictionary from the environmental variable VERSION_PINS
|
|
106
|
+
try:
|
|
107
|
+
VERSION_PINS_STR = os.getenv("VERSION_PINS")
|
|
108
|
+
if not VERSION_PINS_STR:
|
|
109
|
+
VERSION_PINS = {}
|
|
110
|
+
else:
|
|
111
|
+
VERSION_PINS = json.loads(VERSION_PINS_STR)
|
|
112
|
+
for k, v in VERSION_PINS.items():
|
|
113
|
+
if not isinstance(k, str) or not isinstance(v, str):
|
|
114
|
+
logger.error("The prefix and version name must both be " "strings")
|
|
115
|
+
VERSION_PINS = {}
|
|
116
|
+
break
|
|
117
|
+
except ValueError as e:
|
|
118
|
+
logger.error(
|
|
119
|
+
"The value for the environment variable VERSION_PINS must be a valid JSON string: %s" % e
|
|
120
|
+
)
|
|
121
|
+
VERSION_PINS = {}
|
|
122
|
+
|
|
123
|
+
if VERSION_PINS:
|
|
124
|
+
logger.debug(
|
|
125
|
+
f"These are the resource versions that are pinned.\n{VERSION_PINS}. "
|
|
126
|
+
f"\nPyobo will download the latest version of a resource if it's "
|
|
127
|
+
f"not pinned.\nIf you want to use a specific version of a "
|
|
128
|
+
f"resource, edit your VERSION_PINS environmental "
|
|
129
|
+
f"variable which is a JSON string to include a prefix and version "
|
|
130
|
+
f"name."
|
|
131
|
+
)
|
pyobo/gilda_utils.py
CHANGED
|
@@ -15,6 +15,7 @@ from gilda.term import filter_out_duplicates
|
|
|
15
15
|
from tqdm.auto import tqdm
|
|
16
16
|
|
|
17
17
|
from pyobo import (
|
|
18
|
+
get_descendants,
|
|
18
19
|
get_id_name_mapping,
|
|
19
20
|
get_id_species_mapping,
|
|
20
21
|
get_id_synonyms_mapping,
|
|
@@ -247,3 +248,23 @@ def get_gilda_terms(
|
|
|
247
248
|
)
|
|
248
249
|
if term is not None:
|
|
249
250
|
yield term
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_gilda_term_subset(
|
|
254
|
+
source: str, ancestors: Union[str, List[str]], **kwargs
|
|
255
|
+
) -> Iterable[gilda.term.Term]:
|
|
256
|
+
"""Get a subset of terms."""
|
|
257
|
+
subset = {
|
|
258
|
+
descendant
|
|
259
|
+
for parent_curie in _ensure_list(ancestors)
|
|
260
|
+
for descendant in get_descendants(*parent_curie.split(":")) or []
|
|
261
|
+
}
|
|
262
|
+
for term in get_gilda_terms(source, **kwargs):
|
|
263
|
+
if bioregistry.curie_to_str(term.db, term.id) in subset:
|
|
264
|
+
yield term
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _ensure_list(s: Union[str, List[str]]) -> List[str]:
|
|
268
|
+
if isinstance(s, str):
|
|
269
|
+
return [s]
|
|
270
|
+
return s
|
pyobo/identifier_utils.py
CHANGED
|
@@ -2,11 +2,14 @@
|
|
|
2
2
|
|
|
3
3
|
"""Utilities for handling prefixes."""
|
|
4
4
|
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
5
7
|
import logging
|
|
6
8
|
from functools import wraps
|
|
7
9
|
from typing import Optional, Tuple, Union
|
|
8
10
|
|
|
9
11
|
import bioregistry
|
|
12
|
+
from curies import Reference, ReferenceTuple
|
|
10
13
|
|
|
11
14
|
from .registries import (
|
|
12
15
|
curie_has_blacklisted_prefix,
|
|
@@ -108,11 +111,25 @@ def wrap_norm_prefix(f):
|
|
|
108
111
|
"""Decorate a function that take in a prefix to auto-normalize, or return None if it can't be normalized."""
|
|
109
112
|
|
|
110
113
|
@wraps(f)
|
|
111
|
-
def _wrapped(prefix, *args, **kwargs):
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
114
|
+
def _wrapped(prefix: str | Reference | ReferenceTuple, *args, **kwargs):
|
|
115
|
+
if isinstance(prefix, str):
|
|
116
|
+
norm_prefix = bioregistry.normalize_prefix(prefix)
|
|
117
|
+
if norm_prefix is None:
|
|
118
|
+
raise ValueError(f"Invalid prefix: {prefix}")
|
|
119
|
+
prefix = norm_prefix
|
|
120
|
+
elif isinstance(prefix, Reference):
|
|
121
|
+
norm_prefix = bioregistry.normalize_prefix(prefix.prefix)
|
|
122
|
+
if norm_prefix is None:
|
|
123
|
+
raise ValueError(f"Invalid prefix: {prefix.prefix}")
|
|
124
|
+
prefix = Reference(prefix=norm_prefix, identifier=prefix.identifier)
|
|
125
|
+
elif isinstance(prefix, ReferenceTuple):
|
|
126
|
+
norm_prefix = bioregistry.normalize_prefix(prefix.prefix)
|
|
127
|
+
if norm_prefix is None:
|
|
128
|
+
raise ValueError(f"Invalid prefix: {prefix.prefix}")
|
|
129
|
+
prefix = ReferenceTuple(norm_prefix, prefix.identifier)
|
|
130
|
+
else:
|
|
131
|
+
raise TypeError
|
|
132
|
+
return f(prefix, *args, **kwargs)
|
|
116
133
|
|
|
117
134
|
return _wrapped
|
|
118
135
|
|
pyobo/reader.py
CHANGED
|
@@ -417,7 +417,7 @@ def _clean_definition(s: str) -> str:
|
|
|
417
417
|
# if '\t' in s:
|
|
418
418
|
# logger.warning('has tab')
|
|
419
419
|
return (
|
|
420
|
-
s.replace('\\"', '"').replace("\n", " ").replace("\t", " ").replace("\d", "") # noqa:W605
|
|
420
|
+
s.replace('\\"', '"').replace("\n", " ").replace("\t", " ").replace(r"\d", "") # noqa:W605
|
|
421
421
|
)
|
|
422
422
|
|
|
423
423
|
|
pyobo/sources/__init__.py
CHANGED
|
@@ -12,6 +12,7 @@ from .civic_gene import CIVICGeneGetter
|
|
|
12
12
|
from .complexportal import ComplexPortalGetter
|
|
13
13
|
from .conso import CONSOGetter
|
|
14
14
|
from .cpt import CPTGetter
|
|
15
|
+
from .credit import CreditGetter
|
|
15
16
|
from .cvx import CVXGetter
|
|
16
17
|
from .depmap import DepMapGetter
|
|
17
18
|
from .dictybase_gene import DictybaseGetter
|
|
@@ -69,6 +70,7 @@ __all__ = [
|
|
|
69
70
|
"CVXGetter",
|
|
70
71
|
"ChEMBLCompoundGetter",
|
|
71
72
|
"ComplexPortalGetter",
|
|
73
|
+
"CreditGetter",
|
|
72
74
|
"DepMapGetter",
|
|
73
75
|
"DictybaseGetter",
|
|
74
76
|
"DrugBankGetter",
|
pyobo/sources/antibodyregistry.py
CHANGED
|
@@ -5,12 +5,12 @@
|
|
|
5
5
|
import logging
|
|
6
6
|
from typing import Iterable, Mapping, Optional
|
|
7
7
|
|
|
8
|
-
import bioversions
|
|
9
8
|
import pandas as pd
|
|
10
9
|
from bioregistry.utils import removeprefix
|
|
11
10
|
from tqdm.auto import tqdm
|
|
12
11
|
|
|
13
12
|
from pyobo import Obo, Term
|
|
13
|
+
from pyobo.api.utils import get_version
|
|
14
14
|
from pyobo.utils.path import ensure_df
|
|
15
15
|
|
|
16
16
|
__all__ = [
|
|
@@ -27,7 +27,7 @@ CHUNKSIZE = 20_000
|
|
|
27
27
|
def get_chunks(*, force: bool = False, version: Optional[str] = None) -> pd.DataFrame:
|
|
28
28
|
"""Get the BioGRID identifiers mapping dataframe."""
|
|
29
29
|
if version is None:
|
|
30
|
-
version =
|
|
30
|
+
version = get_version(PREFIX)
|
|
31
31
|
df = ensure_df(
|
|
32
32
|
PREFIX,
|
|
33
33
|
url=URL,
|
pyobo/sources/biogrid.py
CHANGED
|
@@ -5,9 +5,9 @@
|
|
|
5
5
|
from functools import partial
|
|
6
6
|
from typing import Mapping, Optional
|
|
7
7
|
|
|
8
|
-
import bioversions
|
|
9
8
|
import pandas as pd
|
|
10
9
|
|
|
10
|
+
from pyobo.api.utils import get_version
|
|
11
11
|
from pyobo.resources.ncbitaxon import get_ncbitaxon_id
|
|
12
12
|
from pyobo.utils.cache import cached_mapping
|
|
13
13
|
from pyobo.utils.path import ensure_df, prefix_directory_join
|
|
@@ -52,7 +52,7 @@ def _lookup(name: str) -> Optional[str]:
|
|
|
52
52
|
|
|
53
53
|
def get_df() -> pd.DataFrame:
|
|
54
54
|
"""Get the BioGRID identifiers mapping dataframe."""
|
|
55
|
-
version =
|
|
55
|
+
version = get_version("biogrid")
|
|
56
56
|
url = f"{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip"
|
|
57
57
|
df = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version)
|
|
58
58
|
df["taxonomy_id"] = df["ORGANISM_OFFICIAL_NAME"].map(_lookup)
|
|
@@ -65,7 +65,7 @@ def get_df() -> pd.DataFrame:
|
|
|
65
65
|
"cache",
|
|
66
66
|
"xrefs",
|
|
67
67
|
name="ncbigene.tsv",
|
|
68
|
-
version=partial(
|
|
68
|
+
version=partial(get_version, PREFIX),
|
|
69
69
|
),
|
|
70
70
|
header=["biogrid_id", "ncbigene_id"],
|
|
71
71
|
)
|
pyobo/sources/credit.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Converter for the Contributor Roles Taxonomy."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
from more_itertools import chunked
|
|
9
|
+
|
|
10
|
+
from pyobo.struct import Obo, Term
|
|
11
|
+
from pyobo.utils.path import ensure_path
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"CreditGetter",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
url = "https://api.github.com/repos/CASRAI-CRedIT/Dictionary/contents/Picklists/Contributor%20Roles"
|
|
18
|
+
PREFIX = "credit"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CreditGetter(Obo):
|
|
22
|
+
"""An ontology representation of the Contributor Roles Taxonomy."""
|
|
23
|
+
|
|
24
|
+
ontology = PREFIX
|
|
25
|
+
static_version = "2022"
|
|
26
|
+
idspaces = {
|
|
27
|
+
PREFIX: "https://credit.niso.org/contributor-roles/",
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
31
|
+
"""Iterate over terms in the ontology."""
|
|
32
|
+
return get_terms(force=force)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_obo(force: bool = False) -> Obo:
|
|
36
|
+
"""Get RGD as OBO."""
|
|
37
|
+
return CreditGetter(force=force)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_terms(force: bool = False) -> list[Term]:
|
|
41
|
+
"""Get terms from the Contributor Roles Taxonomy via GitHub."""
|
|
42
|
+
path = ensure_path(PREFIX, url=url, name="picklist-api.json", force=force)
|
|
43
|
+
with open(path) as f:
|
|
44
|
+
data = json.load(f)
|
|
45
|
+
terms = []
|
|
46
|
+
for x in data:
|
|
47
|
+
name = x["name"].removesuffix(".md").lower()
|
|
48
|
+
|
|
49
|
+
pp = ensure_path(PREFIX, "picklist", url=x["download_url"], backend="requests")
|
|
50
|
+
with open(pp) as f:
|
|
51
|
+
header, *rest = f.read().splitlines()
|
|
52
|
+
name = header = header.removeprefix("# Contributor Roles/")
|
|
53
|
+
dd = {k.removeprefix("## "): v for k, v in chunked(rest, 2)}
|
|
54
|
+
identifier = (
|
|
55
|
+
dd["Canonical URL"]
|
|
56
|
+
.removeprefix("https://credit.niso.org/contributor-roles/")
|
|
57
|
+
.rstrip("/")
|
|
58
|
+
)
|
|
59
|
+
desc = dd["Short definition"]
|
|
60
|
+
terms.append(
|
|
61
|
+
Term.from_triple(prefix=PREFIX, identifier=identifier, name=name, definition=desc)
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
return terms
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
if __name__ == "__main__":
|
|
68
|
+
get_obo(force=True).write_default(write_obo=True)
|
pyobo/sources/geonames.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
"""Get terms from geonames."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
4
6
|
from typing import Collection, Iterable, Mapping
|
|
5
7
|
|
|
@@ -146,15 +148,7 @@ def get_code_to_admin2(
|
|
|
146
148
|
return code_to_admin2
|
|
147
149
|
|
|
148
150
|
|
|
149
|
-
def
|
|
150
|
-
code_to_country,
|
|
151
|
-
code_to_admin1,
|
|
152
|
-
code_to_admin2,
|
|
153
|
-
*,
|
|
154
|
-
minimum_population: int = 100_000,
|
|
155
|
-
force: bool = False,
|
|
156
|
-
) -> Mapping[str, Term]:
|
|
157
|
-
"""Get a mapping from city code to term."""
|
|
151
|
+
def _get_cities_df(force: bool = False) -> pd.DataFrame:
|
|
158
152
|
columns = [
|
|
159
153
|
"geonames_id",
|
|
160
154
|
"name",
|
|
@@ -184,7 +178,19 @@ def get_cities(
|
|
|
184
178
|
names=columns,
|
|
185
179
|
dtype=str,
|
|
186
180
|
)
|
|
181
|
+
return cities_df
|
|
182
|
+
|
|
187
183
|
|
|
184
|
+
def get_cities(
|
|
185
|
+
code_to_country,
|
|
186
|
+
code_to_admin1,
|
|
187
|
+
code_to_admin2,
|
|
188
|
+
*,
|
|
189
|
+
minimum_population: int = 100_000,
|
|
190
|
+
force: bool = False,
|
|
191
|
+
) -> Mapping[str, Term]:
|
|
192
|
+
"""Get a mapping from city code to term."""
|
|
193
|
+
cities_df = _get_cities_df(force=force)
|
|
188
194
|
cities_df = cities_df[cities_df.population.astype(int) > minimum_population]
|
|
189
195
|
cities_df.synonyms = cities_df.synonyms.str.split(",")
|
|
190
196
|
|
|
@@ -235,5 +241,17 @@ def get_cities(
|
|
|
235
241
|
return terms
|
|
236
242
|
|
|
237
243
|
|
|
244
|
+
def get_city_to_country() -> dict[str, str]:
|
|
245
|
+
"""Get a mapping from city GeoNames to country GeoNames id."""
|
|
246
|
+
rv = {}
|
|
247
|
+
code_to_country = get_code_to_country()
|
|
248
|
+
cities_df = _get_cities_df()
|
|
249
|
+
for city_geonames_id, country_code in cities_df[["geonames_id", "country_code"]].values:
|
|
250
|
+
if pd.isna(city_geonames_id) or pd.isna(country_code):
|
|
251
|
+
continue
|
|
252
|
+
rv[city_geonames_id] = code_to_country[country_code].identifier
|
|
253
|
+
return rv
|
|
254
|
+
|
|
255
|
+
|
|
238
256
|
if __name__ == "__main__":
|
|
239
257
|
GeonamesGetter().write_default(write_obo=True, force=True)
|
pyobo/sources/hgnc.py
CHANGED
|
@@ -10,10 +10,10 @@ from collections import Counter, defaultdict
|
|
|
10
10
|
from operator import attrgetter
|
|
11
11
|
from typing import DefaultDict, Dict, Iterable, Optional
|
|
12
12
|
|
|
13
|
-
import bioversions
|
|
14
13
|
from tabulate import tabulate
|
|
15
14
|
from tqdm.auto import tqdm
|
|
16
15
|
|
|
16
|
+
from pyobo.api.utils import get_version
|
|
17
17
|
from pyobo.struct import (
|
|
18
18
|
Obo,
|
|
19
19
|
Reference,
|
|
@@ -241,7 +241,7 @@ def get_obo(*, force: bool = False) -> Obo:
|
|
|
241
241
|
def get_terms(version: Optional[str] = None, force: bool = False) -> Iterable[Term]: # noqa:C901
|
|
242
242
|
"""Get HGNC terms."""
|
|
243
243
|
if version is None:
|
|
244
|
-
version =
|
|
244
|
+
version = get_version("hgnc")
|
|
245
245
|
unhandled_entry_keys: typing.Counter[str] = Counter()
|
|
246
246
|
unhandle_locus_types: DefaultDict[str, Dict[str, Term]] = defaultdict(dict)
|
|
247
247
|
path = ensure_path(
|
pyobo/sources/mesh.py
CHANGED
|
@@ -11,6 +11,7 @@ from xml.etree.ElementTree import Element
|
|
|
11
11
|
|
|
12
12
|
from tqdm.auto import tqdm
|
|
13
13
|
|
|
14
|
+
from pyobo.api.utils import get_version
|
|
14
15
|
from pyobo.identifier_utils import standardize_ec
|
|
15
16
|
from pyobo.struct import Obo, Reference, Synonym, Term
|
|
16
17
|
from pyobo.utils.cache import cached_json, cached_mapping
|
|
@@ -331,9 +332,8 @@ def get_mesh_category_curies(
|
|
|
331
332
|
.. seealso:: https://meshb.nlm.nih.gov/treeView
|
|
332
333
|
"""
|
|
333
334
|
if version is None:
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
version = bioversions.get_version("mesh")
|
|
335
|
+
version = get_version("mesh")
|
|
336
|
+
assert version is not None
|
|
337
337
|
tree_to_mesh = get_tree_to_mesh_id(version=version)
|
|
338
338
|
rv = []
|
|
339
339
|
for i in range(1, 100):
|
pyobo/sources/msigdb.py
CHANGED
|
@@ -137,7 +137,7 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
|
|
|
137
137
|
def _get_definition(attrib) -> Optional[str]:
|
|
138
138
|
rv = attrib["DESCRIPTION_FULL"].strip() or attrib["DESCRIPTION_BRIEF"].strip() or None
|
|
139
139
|
if rv is not None:
|
|
140
|
-
return rv.replace("\d", "").replace("\s", "") # noqa: W605
|
|
140
|
+
return rv.replace(r"\d", "").replace(r"\s", "") # noqa: W605
|
|
141
141
|
return None
|
|
142
142
|
|
|
143
143
|
|
pyobo/sources/npass.py
CHANGED
pyobo/sources/pubchem.py
CHANGED
|
@@ -5,12 +5,12 @@
|
|
|
5
5
|
import logging
|
|
6
6
|
from typing import Iterable, Mapping, Optional
|
|
7
7
|
|
|
8
|
-
import bioversions
|
|
9
8
|
import pandas as pd
|
|
10
9
|
from bioregistry.utils import removeprefix
|
|
11
10
|
from tqdm.auto import tqdm
|
|
12
11
|
|
|
13
12
|
from ..api import get_name_id_mapping
|
|
13
|
+
from ..api.utils import get_version
|
|
14
14
|
from ..struct import Obo, Reference, Synonym, Term
|
|
15
15
|
from ..utils.iter import iterate_gzips_together
|
|
16
16
|
from ..utils.path import ensure_df, ensure_path
|
|
@@ -26,7 +26,7 @@ PREFIX = "pubchem.compound"
|
|
|
26
26
|
|
|
27
27
|
def _get_pubchem_extras_url(version: Optional[str], end: str) -> str:
|
|
28
28
|
if version is None:
|
|
29
|
-
version =
|
|
29
|
+
version = get_version("pubchem")
|
|
30
30
|
return f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/{end}"
|
|
31
31
|
|
|
32
32
|
|
|
@@ -100,7 +100,7 @@ def get_pubchem_id_to_mesh_id(version: str) -> Mapping[str, str]:
|
|
|
100
100
|
|
|
101
101
|
def _ensure_cid_name_path(*, version: Optional[str] = None, force: bool = False) -> str:
|
|
102
102
|
if version is None:
|
|
103
|
-
version =
|
|
103
|
+
version = get_version("pubchem")
|
|
104
104
|
# 2 tab-separated columns: compound_id, name
|
|
105
105
|
cid_name_url = _get_pubchem_extras_url(version, "CID-Title.gz")
|
|
106
106
|
cid_name_path = ensure_path(PREFIX, url=cid_name_url, version=version, force=force)
|
pyobo/sources/rgd.py
CHANGED
|
@@ -28,7 +28,7 @@ old_name_type = SynonymTypeDef.from_text("old_name")
|
|
|
28
28
|
|
|
29
29
|
# NOTE unigene id was discontinue in January 18th, 2021 dump
|
|
30
30
|
|
|
31
|
-
GENES_URL = "https://download.rgd.mcw.edu/data_release/
|
|
31
|
+
GENES_URL = "https://download.rgd.mcw.edu/data_release/GENES_RAT.txt"
|
|
32
32
|
GENES_HEADER = [
|
|
33
33
|
"GENE_RGD_ID",
|
|
34
34
|
"SYMBOL",
|
pyobo/sources/rhea.py
CHANGED
|
@@ -5,9 +5,9 @@
|
|
|
5
5
|
import logging
|
|
6
6
|
from typing import TYPE_CHECKING, Dict, Iterable, Optional
|
|
7
7
|
|
|
8
|
-
import bioversions
|
|
9
8
|
import pystow
|
|
10
9
|
|
|
10
|
+
from pyobo.api.utils import get_version
|
|
11
11
|
from pyobo.struct import Obo, Reference, Term
|
|
12
12
|
from pyobo.struct.typedef import (
|
|
13
13
|
TypeDef,
|
|
@@ -63,7 +63,7 @@ def ensure_rhea_rdf(version: Optional[str] = None, force: bool = False) -> "rdfl
|
|
|
63
63
|
"""Get the Rhea RDF graph."""
|
|
64
64
|
# see docs: https://ftp.expasy.org/databases/rhea/rdf/rhea_rdf_documentation.pdf
|
|
65
65
|
if version is None:
|
|
66
|
-
version =
|
|
66
|
+
version = get_version(PREFIX)
|
|
67
67
|
return pystow.ensure_rdf(
|
|
68
68
|
"pyobo",
|
|
69
69
|
"raw",
|