pyobo 0.12.0__py3-none-any.whl → 0.12.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- pyobo/.DS_Store +0 -0
- pyobo/api/properties.py +8 -12
- pyobo/api/xrefs.py +1 -2
- pyobo/cli/database.py +30 -2
- pyobo/cli/database_utils.py +5 -11
- pyobo/getters.py +20 -79
- pyobo/gilda_utils.py +3 -80
- pyobo/identifier_utils/__init__.py +3 -10
- pyobo/identifier_utils/api.py +21 -12
- pyobo/obographs.py +11 -2
- pyobo/reader.py +13 -17
- pyobo/sources/cgnc.py +9 -1
- pyobo/sources/credit.py +17 -6
- pyobo/sources/flybase.py +5 -5
- pyobo/sources/omim_ps.py +4 -4
- pyobo/sources/pharmgkb/pharmgkb_gene.py +1 -1
- pyobo/struct/functional/ontology.py +3 -1
- pyobo/struct/reference.py +4 -4
- pyobo/struct/struct.py +112 -55
- pyobo/utils/cache.py +3 -4
- pyobo/utils/io.py +38 -14
- pyobo/utils/path.py +16 -19
- pyobo/version.py +1 -1
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/METADATA +67 -118
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/RECORD +164 -169
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/WHEEL +1 -1
- pyobo/identifier_utils/model.py +0 -130
- pyobo/identifier_utils/preprocessing.json +0 -812
- pyobo/identifier_utils/preprocessing.py +0 -61
- pyobo/resources/goc.py +0 -75
- pyobo/resources/goc.tsv +0 -188
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/entry_points.txt +0 -0
- {pyobo-0.12.0.dist-info → pyobo-0.12.2.dist-info}/licenses/LICENSE +0 -0
pyobo/reader.py
CHANGED
@@ -15,20 +15,19 @@ from typing import Any
 import bioregistry
 import networkx as nx
 from curies import ReferenceTuple
+from curies.preprocessing import BlocklistError
 from curies.vocabulary import SynonymScope
 from more_itertools import pairwise
 from tqdm.auto import tqdm
 
 from .constants import DATE_FORMAT, PROVENANCE_PREFIXES
 from .identifier_utils import (
-    BlacklistedError,
     NotCURIEError,
     ParseError,
     UnparsableIRIError,
     _is_valid_identifier,
     _parse_str_or_curie_or_uri_helper,
-
-    str_is_blacklisted,
+    get_rules,
 )
 from .reader_utils import (
     _chomp_axioms,

@@ -53,6 +52,7 @@ from .struct.struct_utils import Annotation, Stanza
 from .struct.typedef import comment as has_comment
 from .struct.typedef import default_typedefs, has_ontology_root_term
 from .utils.cache import write_gzipped_graph
+from .utils.io import safe_open
 from .utils.misc import STATIC_VERSION_REWRITES, cleanup_version
 
 __all__ = [

@@ -76,13 +76,7 @@ def from_obo_path(
 ) -> Obo:
     """Get the OBO graph from a path."""
     path = Path(path).expanduser().resolve()
-    if path.suffix.endswith(".
-        import gzip
-
-        logger.info("[%s] parsing gzipped OBO with obonet from %s", prefix or "<unknown>", path)
-        with gzip.open(path, "rt") as file:
-            graph = _read_obo(file, prefix, ignore_obsolete=ignore_obsolete, use_tqdm=use_tqdm)
-    elif path.suffix.endswith(".zip"):
+    if path.suffix.endswith(".zip"):
         import io
         import zipfile
 
@@ -95,7 +89,7 @@ def from_obo_path(
         )
     else:
         logger.info("[%s] parsing OBO with obonet from %s", prefix or "<unknown>", path)
-        with
+        with safe_open(path, read=True) as file:
             graph = _read_obo(file, prefix, ignore_obsolete=ignore_obsolete, use_tqdm=use_tqdm)
 
     if prefix:

@@ -1262,7 +1256,7 @@ def _handle_prop(
     ):
         case Reference() as datatype_:
             datatype = datatype_
-        case
+        case BlocklistError():
             return None
         case ParseError() as exc:
             if strict:

@@ -1304,7 +1298,7 @@ def _handle_prop(
     ):
         case Reference() as obj_reference:
             return Annotation(prop_reference, obj_reference)
-        case
+        case BlocklistError():
             return None
         case UnparsableIRIError():
             return Annotation(prop_reference, OBOLiteral.uri(value))

@@ -1330,7 +1324,7 @@ def _handle_prop(
     ):
         case Reference() as obj_reference:
             return Annotation(prop_reference, obj_reference)
-        case
+        case BlocklistError():
             return None
         case ParseError():
             if datatype:

@@ -1535,10 +1529,12 @@ def _parse_xref_line(
 ) -> tuple[Reference, list[Reference | OBOLiteral]] | None:
     xref, _, rest = line.partition(" [")
 
-
+    rules = get_rules()
+
+    if rules.str_is_blocked(xref, context=ontology_prefix) or ":" not in xref:
         return None  # sometimes xref to self... weird
 
-    xref = remap_prefix(xref,
+    xref = rules.remap_prefix(xref, context=ontology_prefix)
 
     split_space = " " in xref
     if split_space:

@@ -1552,7 +1548,7 @@ def _parse_xref_line(
         xref, ontology_prefix=ontology_prefix, node=node, line=line, context="xref", upgrade=upgrade
     )
     match xref_ref:
-        case
+        case BlocklistError():
            return None
        case ParseError() as exc:
            if strict:
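Note on safe_open: the explicit gzip branch in from_obo_path is gone, and reading now goes through safe_open from pyobo.utils.io, whose own diff is not shown above (the file list only summarizes it as +38 -14). A minimal sketch of what such a helper could look like, assuming it simply dispatches on a ".gz" suffix and returns a text-mode file object usable as a context manager; this is illustrative, not the actual implementation:

    import gzip
    from pathlib import Path
    from typing import IO

    def safe_open(path: str | Path, read: bool = True) -> IO[str]:
        """Open a possibly-gzipped file in text mode (hypothetical sketch, not pyobo's code)."""
        path = Path(path)
        mode = "rt" if read else "wt"
        if path.suffix == ".gz":
            return gzip.open(path, mode)
        return open(path, mode)

A helper along these lines lets from_obo_path cover both plain and gzipped OBO input with the single `with safe_open(path, read=True) as file:` branch shown above, which also matches the gzipped build artifact names introduced in pyobo/struct/struct.py below.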
pyobo/sources/cgnc.py
CHANGED
@@ -45,7 +45,15 @@ HEADER = [
 
 def get_terms(force: bool = False) -> Iterable[Term]:
     """Get CGNC terms."""
-    df = ensure_df(
+    df = ensure_df(
+        PREFIX,
+        url=URL,
+        name=f"{PREFIX}.tsv",
+        force=force,
+        header=0,
+        names=HEADER,
+        on_bad_lines="skip",
+    )
     for i, (cgnc_id, entrez_id, ensembl_id, name, synonym_1, synoynm_2, _, _) in enumerate(
         df.values
     ):
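The expanded ensure_df call passes reader options that mirror pandas.read_csv keywords: header=0 treats the file's first row as a header to discard, names=HEADER substitutes pyobo's own column names, and on_bad_lines="skip" drops malformed rows instead of raising. A standalone illustration of those three options with made-up data, assuming ensure_df forwards them to pandas:

    from io import StringIO

    import pandas as pd

    raw = "col_a\tcol_b\nCGNC:1\tGENE1\nCGNC:2\tGENE2\textra-field\n"  # illustrative data
    df = pd.read_csv(
        StringIO(raw),
        sep="\t",
        header=0,                      # the file's own header row is consumed
        names=["cgnc_id", "symbol"],   # ...and replaced with these names
        on_bad_lines="skip",           # the CGNC:2 row has too many fields and is dropped
    )
    print(df)  # a single row: cgnc_id=CGNC:1, symbol=GENE1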
pyobo/sources/credit.py
CHANGED
@@ -7,7 +7,7 @@ from collections.abc import Iterable
 
 from more_itertools import chunked
 
-from pyobo.struct import Obo, Term
+from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, Obo, Reference, Term, default_reference
 from pyobo.utils.path import ensure_path
 
 __all__ = [

@@ -16,6 +16,12 @@ __all__ = [
 
 url = "https://api.github.com/repos/CASRAI-CRedIT/Dictionary/contents/Picklists/Contributor%20Roles"
 PREFIX = "credit"
+ROOT = default_reference(prefix=PREFIX, identifier="contributor-role", name="contributor role")
+ROOT_TERM = (
+    Term(reference=ROOT)
+    .append_contributor(CHARLIE_TERM)
+    .append_xref(Reference(prefix="cro", identifier="0000000"))
+)
 
 
 class CreditGetter(Obo):

@@ -23,6 +29,7 @@ class CreditGetter(Obo):
 
     ontology = PREFIX
     static_version = "2022"
+    root_terms = [ROOT]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""

@@ -34,14 +41,16 @@ def get_terms(force: bool = False) -> list[Term]:
     path = ensure_path(PREFIX, url=url, name="picklist-api.json", force=force)
     with open(path) as f:
         data = json.load(f)
-    terms = [
+    terms = [
+        CHARLIE_TERM,
+        HUMAN_TERM,
+        ROOT_TERM,
+    ]
     for x in data:
-        name = x["name"].removesuffix(".md").lower()
-
         pp = ensure_path(PREFIX, "picklist", url=x["download_url"], backend="requests")
         with open(pp) as f:
             header, *rest = f.read().splitlines()
-        name = header
+        name = header.removeprefix("# Contributor Roles/")
         dd = {k.removeprefix("## "): v for k, v in chunked(rest, 2)}
         identifier = (
             dd["Canonical URL"]

@@ -50,7 +59,9 @@ def get_terms(force: bool = False) -> list[Term]:
         )
         desc = dd["Short definition"]
         terms.append(
-            Term.from_triple(
+            Term.from_triple(
+                prefix=PREFIX, identifier=identifier, name=name, definition=desc
+            ).append_parent(ROOT)
         )
 
     return terms
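The new ROOT reference and ROOT_TERM give every CRediT contributor role a single shared parent, and root_terms = [ROOT] registers it on the getter. Using only the calls visible in this diff (default_reference, Term.from_triple, append_parent), a sketch of how one role attaches to that root; the identifier, name, and definition here are illustrative placeholders rather than values from the CRediT picklists:

    from pyobo.struct import Term, default_reference

    PREFIX = "credit"
    ROOT = default_reference(prefix=PREFIX, identifier="contributor-role", name="contributor role")

    # Illustrative child term; real identifiers are parsed from each picklist's "Canonical URL".
    role = Term.from_triple(
        prefix=PREFIX,
        identifier="example-role",
        name="example role",
        definition="An illustrative contributor role.",
    ).append_parent(ROOT)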
pyobo/sources/flybase.py
CHANGED
@@ -18,7 +18,7 @@ __all__ = [
 
 logger = logging.getLogger(__name__)
 
-BASE_URL = "
+BASE_URL = "https://s3ftp.flybase.org/releases"
 PREFIX = "flybase"
 NAME = "FlyBase"
 

@@ -51,7 +51,7 @@ def _get_names(version: str, force: bool = False) -> pd.DataFrame:
 
 def _get_organisms(version: str, force: bool = False) -> Mapping[str, str]:
     """Get mapping from abbreviation column to NCBI taxonomy ID column."""
-    url = f"
+    url = f"{BASE_URL}/FB{version}/precomputed_files/species/organism_list_fb_{version}.tsv.gz"
     df = ensure_df(
         PREFIX, url=url, force=force, version=version, skiprows=4, header=None, usecols=[2, 4]
     )

@@ -60,7 +60,7 @@ def _get_organisms(version: str, force: bool = False) -> Mapping[str, str]:
 
 
 def _get_definitions(version: str, force: bool = False) -> Mapping[str, str]:
-    url = f"
+    url = f"{BASE_URL}/FB{version}/precomputed_files/genes/automated_gene_summaries.tsv.gz"
     df = ensure_df(
         PREFIX, url=url, force=force, version=version, skiprows=2, header=None, usecols=[0, 1]
     )

@@ -69,7 +69,7 @@ def _get_definitions(version: str, force: bool = False) -> Mapping[str, str]:
 
 def _get_human_orthologs(version: str, force: bool = False) -> Mapping[str, set[str]]:
     url = (
-        f"
+        f"{BASE_URL}/FB{version}/precomputed_files/"
         f"orthologs/dmel_human_orthologs_disease_fb_{version}.tsv.gz"
     )
     df = ensure_df(

@@ -86,7 +86,7 @@ def _get_human_orthologs(version: str, force: bool = False) -> Mapping[str, set[
 
 
 def _get_synonyms(version, force):
-    url = f"
+    url = f"{BASE_URL}/FB{version}/precomputed_files/synonyms/fb_synonym_fb_{version}.tsv.gz"
     df = ensure_df(PREFIX, url=url, force=force, version=version, skiprows=4, usecols=[0, 2])
     return df  # TODO use this
 
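All of the FlyBase download URLs are now assembled from the single BASE_URL constant plus the release version. For a hypothetical release tag such as "2024_06", the organism-list URL expands as follows:

    BASE_URL = "https://s3ftp.flybase.org/releases"
    version = "2024_06"  # hypothetical release tag
    url = f"{BASE_URL}/FB{version}/precomputed_files/species/organism_list_fb_{version}.tsv.gz"
    # https://s3ftp.flybase.org/releases/FB2024_06/precomputed_files/species/organism_list_fb_2024_06.tsv.gz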
pyobo/sources/omim_ps.py
CHANGED
@@ -13,7 +13,7 @@ __all__ = [
 
 logger = logging.getLogger(__name__)
 PREFIX = "omim.ps"
-URL = "https://omim.org/phenotypicSeriesTitles/
+URL = "https://omim.org/phenotypicSeriesTitles/"
 
 
 class OMIMPSGetter(Obo):

@@ -26,13 +26,13 @@ class OMIMPSGetter(Obo):
         soup = get_soup(URL, user_agent="Mozilla/5.0")
         content = soup.find(id="mimContent")
         if content is None:
-            raise ValueError
+            raise ValueError("omim.ps failed - scraper could not find id='mimContent' in HTML")
         table = content.find("table")  # type:ignore[attr-defined]
         if table is None:
-            raise ValueError
+            raise ValueError("omim.ps failed - scraper could not find table in HTML")
         tbody = table.find("tbody")
         if tbody is None:
-            raise ValueError
+            raise ValueError("omim.ps failed - scraper could not find table body in HTML")
         for row in tbody.find_all("tr"):
             anchor = row.find("td").find("a")
             name = anchor.text.strip()
pyobo/struct/functional/ontology.py
CHANGED

@@ -16,6 +16,7 @@ from pyobo.struct.functional.utils import (
     FunctionalOWLSerializable,
     list_to_funowl,
 )
+from pyobo.utils.io import safe_open
 
 __all__ = [
     "Document",

@@ -109,7 +110,8 @@ class Document:
     def write_funowl(self, path: str | Path) -> None:
         """Write functional OWL to a file.."""
         path = Path(path).expanduser().resolve()
-        path
+        with safe_open(path, read=False) as file:
+            file.write(self.to_funowl())
 
     def to_funowl(self) -> str:
         """Get the document as a functional OWL string."""
pyobo/struct/reference.py
CHANGED
@@ -14,9 +14,9 @@ import dateutil.parser
 import pytz
 from bioregistry import NormalizedNamableReference as Reference
 from curies import ReferenceTuple
+from curies.preprocessing import BlocklistError
 
 from ..identifier_utils import (
-    BlacklistedError,
     NotCURIEError,
     ParseError,
     UnparsableIRIError,

@@ -62,7 +62,7 @@ def _parse_str_or_curie_or_uri(
     match reference:
         case Reference():
             return reference
-        case
+        case BlocklistError():
             return None
         case ParseError():
             if strict:

@@ -224,7 +224,7 @@ def _obo_parse_identifier(
     ):
         case Reference() as reference:
             return reference
-        case
+        case BlocklistError():
             return None
         case NotCURIEError() as exc:
             # this means there's no colon `:`

@@ -272,7 +272,7 @@ def _parse_reference_or_uri_literal(
     ):
         case Reference() as reference:
             return reference
-        case
+        case BlocklistError():
             return None
         case UnparsableIRIError():
             # this means that it's defininitely a URI,
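As in pyobo/reader.py above, every match arm that previously handled pyobo's internal BlacklistedError now handles curies.preprocessing.BlocklistError, and blocked inputs still resolve to None rather than raising. The parse helpers evidently return the error object instead of raising it, so callers dispatch on the result with structural pattern matching. A schematic sketch of that dispatch style around a hypothetical parse result (not pyobo's actual helper signatures):

    from curies.preprocessing import BlocklistError

    def dispatch(parse_result, strict: bool = True):
        """Illustrative handling of a parse outcome (hypothetical wrapper, not pyobo API)."""
        match parse_result:
            case BlocklistError():
                return None  # blocked CURIEs/URIs are silently dropped
            case Exception() as exc:  # any other parse error
                if strict:
                    raise exc
                return None
            case reference:  # a successfully parsed reference
                return reference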
pyobo/struct/struct.py
CHANGED
@@ -70,7 +70,7 @@ from ..constants import (
     TARGET_PREFIX,
 )
 from ..utils.cache import write_gzipped_graph
-from ..utils.io import multidict, write_iterable_tsv
+from ..utils.io import multidict, safe_open, write_iterable_tsv
 from ..utils.path import (
     CacheArtifact,
     get_cache_path,

@@ -712,6 +712,13 @@ class Obo:
             raise ValueError(f"There is no version available for {self.ontology}")
         return self.data_version
 
+    @property
+    def _prefix_version(self) -> str:
+        """Get the prefix and version (for logging)."""
+        if self.data_version:
+            return f"{self.ontology} {self.data_version}"
+        return self.ontology
+
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in this ontology."""
         raise NotImplementedError

@@ -722,10 +729,11 @@ class Obo:
 
         return graph_from_obo(self)
 
-    def write_obograph(self, path: Path) -> None:
+    def write_obograph(self, path: str | Path) -> None:
         """Write OBO Graph json."""
         graph = self.get_graph()
-        path
+        with safe_open(path, read=False) as file:
+            file.write(graph.model_dump_json(indent=2, exclude_none=True, exclude_unset=True))
 
     @classmethod
     def cli(cls, *args, default_rewrite: bool = False) -> Any:

@@ -761,13 +769,12 @@ class Obo:
             click.secho(f"[{cls.ontology}] Got an exception during instantiation - {type(e)}")
             sys.exit(1)
         inst.write_default(
-            write_obograph=
-            write_obo=
+            write_obograph=False,
+            write_obo=False,
             write_owl=owl,
             write_ofn=ofn,
             write_ttl=ttl,
             write_nodes=True,
-            write_edges=True,
             force=force or rewrite,
             use_tqdm=True,
         )

@@ -969,9 +976,14 @@ class Obo:
                 emit_annotation_properties=emit_annotation_properties,
             )
         if use_tqdm:
-            it = tqdm(
+            it = tqdm(
+                it,
+                desc=f"[{self._prefix_version}] writing OBO",
+                unit_scale=True,
+                unit="line",
+            )
         if isinstance(file, str | Path | os.PathLike):
-            with
+            with safe_open(file, read=False) as fh:
                 self._write_lines(it, fh)
         else:
             self._write_lines(it, file)

@@ -1002,11 +1014,72 @@ class Obo:
 
     def write_nodes(self, path: str | Path) -> None:
         """Write a nodes TSV file."""
-
-
+        write_iterable_tsv(
+            path=path,
+            header=self.nodes_header,
+            it=self.iterate_edge_rows(),
+        )
+
+    @property
+    def nodes_header(self) -> Sequence[str]:
+        """Get the header for nodes."""
+        return [
+            "curie:ID",
+            "name:string",
+            "synonyms:string[]",
+            "synonym_predicates:string[]",
+            "synonym_types:string[]",
+            "definition:string",
+            "deprecated:boolean",
+            "type:string",
+            "provenance:string[]",
+            "alts:string[]",
+            "replaced_by:string[]",
+            "mapping_objects:string[]",
+            "mapping_predicates:string[]",
+            "version:string",
+        ]
+
+    def _get_node_row(self, node: Term, sep: str, version: str) -> Sequence[str]:
+        synonym_predicate_curies, synonym_type_curies, synonyms = [], [], []
+        for synonym in node.synonyms:
+            synonym_predicate_curies.append(synonym.predicate.curie)
+            synonym_type_curies.append(synonym.type.curie if synonym.type else "")
+            synonyms.append(synonym.name)
+        mapping_predicate_curies, mapping_target_curies = [], []
+        for predicate, obj in node.get_mappings(include_xrefs=True, add_context=False):
+            mapping_predicate_curies.append(predicate.curie)
+            mapping_target_curies.append(obj.curie)
+        return (
+            node.curie,
+            node.name or "",
+            sep.join(synonyms),
+            sep.join(synonym_predicate_curies),
+            sep.join(synonym_type_curies),
+            node.definition or "",
+            "true" if node.is_obsolete else "false",
+            node.type,
+            sep.join(
+                reference.curie for reference in node.provenance if isinstance(reference, Reference)
+            ),
+            sep.join(alt_reference.curie for alt_reference in node.alt_ids),
+            sep.join(ref.curie for ref in node.get_replaced_by()),
+            sep.join(mapping_target_curies),
+            sep.join(mapping_predicate_curies),
+            version,
+        )
+
+    def iterate_node_rows(self, sep: str = ";") -> Iterable[Sequence[str]]:
+        """Get a nodes iterator appropriate for serialization."""
+        version = self.data_version or ""
+        for node in self.iter_terms():
+            if node.prefix != self.ontology:
+                continue
+            yield self._get_node_row(node, sep=sep, version=version)
 
     def write_edges(self, path: str | Path) -> None:
         """Write a edges TSV file."""
+        # node, this is actually taken care of as part of the cache configuration
         write_iterable_tsv(
             path=path,
             header=self.edges_header,

@@ -1025,15 +1098,15 @@ class Obo:
 
     @property
     def _obo_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.obo")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.obo.gz")
 
     @property
     def _obograph_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.json")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.json.gz")
 
     @property
     def _owl_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.owl")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.owl.gz")
 
     @property
     def _obonet_gz_path(self) -> Path:

@@ -1041,7 +1114,7 @@ class Obo:
 
     @property
     def _ofn_path(self) -> Path:
-        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.ofn")
+        return self._path(BUILD_SUBDIRECTORY_NAME, name=f"{self.ontology}.ofn.gz")
 
     @property
     def _ttl_path(self) -> Path:

@@ -1060,22 +1133,10 @@ class Obo:
                 [f"{self.ontology}_id", "taxonomy_id"],
                 self.iterate_id_species,
             ),
-            (
-                # TODO deprecate this in favor of literal mappings output
-                CacheArtifact.synonyms,
-                [f"{self.ontology}_id", "synonym"],
-                self.iterate_synonym_rows,
-            ),
             (CacheArtifact.alts, [f"{self.ontology}_id", "alt_id"], self.iterate_alt_rows),
             (CacheArtifact.mappings, SSSOM_DF_COLUMNS, self.iterate_mapping_rows),
             (CacheArtifact.relations, self.relations_header, self.iter_relation_rows),
             (CacheArtifact.edges, self.edges_header, self.iterate_edge_rows),
-            (
-                # TODO deprecate this in favor of pair of literal and object properties
-                CacheArtifact.properties,
-                self.properties_header,
-                self._iter_property_rows,
-            ),
             (
                 CacheArtifact.object_properties,
                 self.object_properties_header,

@@ -1097,8 +1158,8 @@ class Obo:
         """Write the metadata JSON file."""
         metadata = self.get_metadata()
         for path in (self._root_metadata_path, self._get_cache_path(CacheArtifact.metadata)):
-            logger.debug("[%s
-            with path
+            logger.debug("[%s] caching metadata to %s", self._prefix_version, path)
+            with safe_open(path, read=False) as file:
                 json.dump(metadata, file, indent=2)
 
     def write_prefix_map(self) -> None:

@@ -1110,9 +1171,8 @@ class Obo:
         """Write cache parts."""
         typedefs_path = self._get_cache_path(CacheArtifact.typedefs)
         logger.debug(
-            "[%s
-            self.
-            self.data_version,
+            "[%s] caching typedefs to %s",
+            self._prefix_version,
             typedefs_path,
         )
         typedef_df: pd.DataFrame = self.get_typedef_df()

@@ -1121,10 +1181,10 @@ class Obo:
 
         for cache_artifact, header, fn in self._get_cache_config():
             path = self._get_cache_path(cache_artifact)
-            if path.
+            if path.is_file() and not force:
                 continue
             tqdm.write(
-                f"[{self.
+                f"[{self._prefix_version}] writing {cache_artifact.name} to {path}",
             )
             write_iterable_tsv(
                 path=path,

@@ -1139,12 +1199,11 @@ class Obo:
             relations_path = get_relation_cache_path(
                 self.ontology, reference=relation, version=self.data_version
             )
-            if relations_path.
+            if relations_path.is_file() and not force:
                 continue
             logger.debug(
-                "[%s
-                self.
-                self.data_version,
+                "[%s] caching relation %s ! %s",
+                self._prefix_version,
                 relation.curie,
                 relation.name,
             )

@@ -1164,8 +1223,7 @@ class Obo:
         write_owl: bool = False,
         write_ofn: bool = False,
         write_ttl: bool = False,
-        write_nodes: bool =
-        write_edges: bool = True,
+        write_nodes: bool = False,
         obograph_use_internal: bool = False,
         write_cache: bool = True,
     ) -> None:

@@ -1174,15 +1232,15 @@ class Obo:
         self.write_prefix_map()
         if write_cache:
             self.write_cache(force=force)
-        if write_obo and (not self._obo_path.
-            tqdm.write(f"[{self.
+        if write_obo and (not self._obo_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing OBO to {self._obo_path}")
             self.write_obo(self._obo_path, use_tqdm=use_tqdm)
-        if (write_ofn or write_owl or write_obograph) and (not self._ofn_path.
-            tqdm.write(f"[{self.
+        if (write_ofn or write_owl or write_obograph) and (not self._ofn_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing OFN to {self._ofn_path}")
             self.write_ofn(self._ofn_path)
-        if write_obograph and (not self._obograph_path.
+        if write_obograph and (not self._obograph_path.is_file() or force):
             if obograph_use_internal:
-                tqdm.write(f"[{self.
+                tqdm.write(f"[{self._prefix_version}] writing OBO Graph to {self._obograph_path}")
                 self.write_obograph(self._obograph_path)
             else:
                 import bioontologies.robot

@@ -1193,22 +1251,22 @@ class Obo:
             bioontologies.robot.convert(
                 self._ofn_path, self._obograph_path, debug=True, merge=False, reason=False
             )
-        if write_owl and (not self._owl_path.
-            tqdm.write(f"[{self.
+        if write_owl and (not self._owl_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing OWL to {self._owl_path}")
             import bioontologies.robot
 
             bioontologies.robot.convert(
                 self._ofn_path, self._owl_path, debug=True, merge=False, reason=False
             )
-        if write_ttl and (not self._ttl_path.
-            tqdm.write(f"[{self.
+        if write_ttl and (not self._ttl_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing Turtle to {self._ttl_path}")
             self.write_rdf(self._ttl_path)
-        if write_obonet and (not self._obonet_gz_path.
-            tqdm.write(f"[{self.
+        if write_obonet and (not self._obonet_gz_path.is_file() or force):
+            tqdm.write(f"[{self._prefix_version}] writing obonet to {self._obonet_gz_path}")
             self.write_obonet_gz(self._obonet_gz_path)
         if write_nodes:
             nodes_path = self._get_cache_path(CacheArtifact.nodes)
-            tqdm.write(f"[{self.
+            tqdm.write(f"[{self._prefix_version}] writing nodes TSV to {nodes_path}")
             self.write_nodes(nodes_path)
 
     @property

@@ -1335,9 +1393,8 @@ class Obo:
             rv.add_edge(_source, _target, key=_key)
 
         logger.info(
-            "[%s
-            self.
-            self.data_version,
+            "[%s] exported graph with %d nodes",
+            self._prefix_version,
             rv.number_of_nodes(),
         )
         return rv
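Two themes run through these changes: the build artifact paths (.obo, .json, .owl, .ofn) all gain a .gz suffix and are written through safe_open, and log/progress messages are standardized on the new _prefix_version property. The new nodes_header uses typed, multi-valued column names (curie:ID, synonyms:string[], deprecated:boolean) in the style of property-graph bulk-import headers, with list-valued columns joined by the sep argument (default ";"). An illustrative row for a made-up term, only to show how positions line up with nodes_header:

    header = [
        "curie:ID", "name:string", "synonyms:string[]", "synonym_predicates:string[]",
        "synonym_types:string[]", "definition:string", "deprecated:boolean", "type:string",
        "provenance:string[]", "alts:string[]", "replaced_by:string[]",
        "mapping_objects:string[]", "mapping_predicates:string[]", "version:string",
    ]
    row = [
        "EX:0000001",                    # curie
        "example term",                  # name
        "alt name 1;alt name 2",         # two synonyms, ";"-joined
        "oboInOwl:hasExactSynonym;oboInOwl:hasExactSynonym",
        ";",                             # both synonym types empty
        "an illustrative definition",
        "false",                         # not deprecated
        "Class",
        "", "", "",                      # provenance, alts, replaced_by (empty)
        "mesh:D000001",                  # mapping objects
        "skos:exactMatch",               # mapping predicates
        "2024-01-01",                    # data version
    ]
    assert len(row) == len(header) == 14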
pyobo/utils/cache.py
CHANGED
@@ -1,6 +1,5 @@
 """Utilities for caching files."""
 
-import gzip
 import json
 import logging
 from collections.abc import Iterable, Mapping

@@ -14,7 +13,7 @@ from pystow.cache import CachedDataFrame as cached_df  # noqa:N813
 from pystow.cache import CachedJSON as cached_json  # noqa:N813
 from pystow.cache import CachedPickle as cached_pickle  # noqa:N813
 
-from .io import open_map_tsv, open_multimap_tsv, write_map_tsv, write_multimap_tsv
+from .io import open_map_tsv, open_multimap_tsv, safe_open, write_map_tsv, write_multimap_tsv
 
 __all__ = [
     "cached_collection",

@@ -70,13 +69,13 @@ NODE_LINK_STYLE = "links"  # TODO update to "edges"
 
 def get_gzipped_graph(path: str | Path) -> nx.MultiDiGraph:
     """Read a graph that's gzipped nodelink."""
-    with
+    with safe_open(path, read=True) as file:
         return nx.node_link_graph(json.load(file), edges=NODE_LINK_STYLE)
 
 
 def write_gzipped_graph(graph: nx.MultiDiGraph, path: str | Path) -> None:
     """Write a graph as gzipped nodelink."""
-    with
+    with safe_open(path, read=False) as file:
        json.dump(nx.node_link_data(graph, edges=NODE_LINK_STYLE), file)
 
 
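With compression handled by safe_open, the gzipped graph helpers reduce to JSON round-trips of networkx's node-link representation. A small usage sketch, assuming get_gzipped_graph and write_gzipped_graph are importable from pyobo.utils.cache as defined above; the file name is arbitrary:

    import networkx as nx

    from pyobo.utils.cache import get_gzipped_graph, write_gzipped_graph

    graph = nx.MultiDiGraph()
    graph.add_edge("EX:1", "EX:2", key="rdfs:subClassOf")

    write_gzipped_graph(graph, "example.json.gz")   # node-link JSON, gzip-compressed
    roundtrip = get_gzipped_graph("example.json.gz")
    assert set(roundtrip.edges(keys=True)) == {("EX:1", "EX:2", "rdfs:subClassOf")}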