pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyobo/.DS_Store +0 -0
- pyobo/__init__.py +95 -20
- pyobo/__main__.py +0 -0
- pyobo/api/__init__.py +81 -10
- pyobo/api/alts.py +52 -42
- pyobo/api/combine.py +39 -0
- pyobo/api/edges.py +68 -0
- pyobo/api/hierarchy.py +231 -203
- pyobo/api/metadata.py +14 -19
- pyobo/api/names.py +207 -127
- pyobo/api/properties.py +117 -113
- pyobo/api/relations.py +68 -94
- pyobo/api/species.py +24 -21
- pyobo/api/typedefs.py +11 -11
- pyobo/api/utils.py +66 -13
- pyobo/api/xrefs.py +108 -114
- pyobo/cli/__init__.py +0 -0
- pyobo/cli/cli.py +35 -50
- pyobo/cli/database.py +183 -161
- pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
- pyobo/cli/lookup.py +163 -195
- pyobo/cli/utils.py +19 -6
- pyobo/constants.py +102 -3
- pyobo/getters.py +196 -118
- pyobo/gilda_utils.py +79 -200
- pyobo/identifier_utils/__init__.py +41 -0
- pyobo/identifier_utils/api.py +296 -0
- pyobo/identifier_utils/model.py +130 -0
- pyobo/identifier_utils/preprocessing.json +812 -0
- pyobo/identifier_utils/preprocessing.py +61 -0
- pyobo/identifier_utils/relations/__init__.py +8 -0
- pyobo/identifier_utils/relations/api.py +162 -0
- pyobo/identifier_utils/relations/data.json +5824 -0
- pyobo/identifier_utils/relations/data_owl.json +57 -0
- pyobo/identifier_utils/relations/data_rdf.json +1 -0
- pyobo/identifier_utils/relations/data_rdfs.json +7 -0
- pyobo/mocks.py +9 -6
- pyobo/ner/__init__.py +9 -0
- pyobo/ner/api.py +72 -0
- pyobo/ner/normalizer.py +33 -0
- pyobo/obographs.py +43 -39
- pyobo/plugins.py +5 -4
- pyobo/py.typed +0 -0
- pyobo/reader.py +1358 -395
- pyobo/reader_utils.py +155 -0
- pyobo/resource_utils.py +42 -22
- pyobo/resources/__init__.py +0 -0
- pyobo/resources/goc.py +75 -0
- pyobo/resources/goc.tsv +188 -0
- pyobo/resources/ncbitaxon.py +4 -5
- pyobo/resources/ncbitaxon.tsv.gz +0 -0
- pyobo/resources/ro.py +3 -2
- pyobo/resources/ro.tsv +0 -0
- pyobo/resources/so.py +0 -0
- pyobo/resources/so.tsv +0 -0
- pyobo/sources/README.md +12 -8
- pyobo/sources/__init__.py +52 -29
- pyobo/sources/agrovoc.py +0 -0
- pyobo/sources/antibodyregistry.py +11 -12
- pyobo/sources/bigg/__init__.py +13 -0
- pyobo/sources/bigg/bigg_compartment.py +81 -0
- pyobo/sources/bigg/bigg_metabolite.py +229 -0
- pyobo/sources/bigg/bigg_model.py +46 -0
- pyobo/sources/bigg/bigg_reaction.py +77 -0
- pyobo/sources/biogrid.py +1 -2
- pyobo/sources/ccle.py +7 -12
- pyobo/sources/cgnc.py +0 -5
- pyobo/sources/chebi.py +1 -1
- pyobo/sources/chembl/__init__.py +9 -0
- pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
- pyobo/sources/chembl/chembl_target.py +160 -0
- pyobo/sources/civic_gene.py +55 -15
- pyobo/sources/clinicaltrials.py +160 -0
- pyobo/sources/complexportal.py +24 -24
- pyobo/sources/conso.py +14 -22
- pyobo/sources/cpt.py +0 -0
- pyobo/sources/credit.py +1 -9
- pyobo/sources/cvx.py +27 -5
- pyobo/sources/depmap.py +9 -12
- pyobo/sources/dictybase_gene.py +2 -7
- pyobo/sources/drugbank/__init__.py +9 -0
- pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
- pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
- pyobo/sources/drugcentral.py +17 -13
- pyobo/sources/expasy.py +31 -34
- pyobo/sources/famplex.py +13 -18
- pyobo/sources/flybase.py +3 -8
- pyobo/sources/gard.py +62 -0
- pyobo/sources/geonames/__init__.py +9 -0
- pyobo/sources/geonames/features.py +28 -0
- pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
- pyobo/sources/geonames/utils.py +115 -0
- pyobo/sources/gmt_utils.py +6 -7
- pyobo/sources/go.py +20 -13
- pyobo/sources/gtdb.py +154 -0
- pyobo/sources/gwascentral/__init__.py +9 -0
- pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
- pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
- pyobo/sources/hgnc/__init__.py +9 -0
- pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
- pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
- pyobo/sources/icd/__init__.py +9 -0
- pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
- pyobo/sources/icd/icd11.py +148 -0
- pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
- pyobo/sources/interpro.py +4 -9
- pyobo/sources/itis.py +0 -5
- pyobo/sources/kegg/__init__.py +0 -0
- pyobo/sources/kegg/api.py +16 -38
- pyobo/sources/kegg/genes.py +9 -20
- pyobo/sources/kegg/genome.py +1 -7
- pyobo/sources/kegg/pathway.py +9 -21
- pyobo/sources/mesh.py +58 -24
- pyobo/sources/mgi.py +3 -10
- pyobo/sources/mirbase/__init__.py +11 -0
- pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
- pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
- pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
- pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
- pyobo/sources/msigdb.py +74 -39
- pyobo/sources/ncbi/__init__.py +9 -0
- pyobo/sources/ncbi/ncbi_gc.py +162 -0
- pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
- pyobo/sources/nih_reporter.py +60 -0
- pyobo/sources/nlm/__init__.py +9 -0
- pyobo/sources/nlm/nlm_catalog.py +48 -0
- pyobo/sources/nlm/nlm_publisher.py +36 -0
- pyobo/sources/nlm/utils.py +116 -0
- pyobo/sources/npass.py +6 -8
- pyobo/sources/omim_ps.py +10 -3
- pyobo/sources/pathbank.py +4 -8
- pyobo/sources/pfam/__init__.py +9 -0
- pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
- pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
- pyobo/sources/pharmgkb/__init__.py +15 -0
- pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
- pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
- pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
- pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
- pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
- pyobo/sources/pharmgkb/utils.py +86 -0
- pyobo/sources/pid.py +1 -6
- pyobo/sources/pombase.py +6 -10
- pyobo/sources/pubchem.py +4 -9
- pyobo/sources/reactome.py +5 -11
- pyobo/sources/rgd.py +11 -16
- pyobo/sources/rhea.py +37 -36
- pyobo/sources/ror.py +69 -42
- pyobo/sources/selventa/__init__.py +0 -0
- pyobo/sources/selventa/schem.py +4 -7
- pyobo/sources/selventa/scomp.py +1 -6
- pyobo/sources/selventa/sdis.py +4 -7
- pyobo/sources/selventa/sfam.py +1 -6
- pyobo/sources/sgd.py +6 -11
- pyobo/sources/signor/__init__.py +7 -0
- pyobo/sources/signor/download.py +41 -0
- pyobo/sources/signor/signor_complexes.py +105 -0
- pyobo/sources/slm.py +12 -15
- pyobo/sources/umls/__init__.py +7 -1
- pyobo/sources/umls/__main__.py +0 -0
- pyobo/sources/umls/get_synonym_types.py +20 -4
- pyobo/sources/umls/sty.py +57 -0
- pyobo/sources/umls/synonym_types.tsv +1 -1
- pyobo/sources/umls/umls.py +18 -22
- pyobo/sources/unimod.py +46 -0
- pyobo/sources/uniprot/__init__.py +1 -1
- pyobo/sources/uniprot/uniprot.py +40 -32
- pyobo/sources/uniprot/uniprot_ptm.py +4 -34
- pyobo/sources/utils.py +3 -2
- pyobo/sources/wikipathways.py +7 -10
- pyobo/sources/zfin.py +5 -10
- pyobo/ssg/__init__.py +12 -16
- pyobo/ssg/base.html +0 -0
- pyobo/ssg/index.html +26 -13
- pyobo/ssg/term.html +12 -2
- pyobo/ssg/typedef.html +0 -0
- pyobo/struct/__init__.py +54 -8
- pyobo/struct/functional/__init__.py +1 -0
- pyobo/struct/functional/dsl.py +2572 -0
- pyobo/struct/functional/macros.py +423 -0
- pyobo/struct/functional/obo_to_functional.py +385 -0
- pyobo/struct/functional/ontology.py +270 -0
- pyobo/struct/functional/utils.py +112 -0
- pyobo/struct/reference.py +331 -136
- pyobo/struct/struct.py +1413 -643
- pyobo/struct/struct_utils.py +1078 -0
- pyobo/struct/typedef.py +162 -210
- pyobo/struct/utils.py +12 -5
- pyobo/struct/vocabulary.py +138 -0
- pyobo/utils/__init__.py +0 -0
- pyobo/utils/cache.py +13 -11
- pyobo/utils/io.py +17 -31
- pyobo/utils/iter.py +5 -5
- pyobo/utils/misc.py +41 -53
- pyobo/utils/ndex_utils.py +0 -0
- pyobo/utils/path.py +76 -70
- pyobo/version.py +3 -3
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
- pyobo-0.12.0.dist-info/RECORD +202 -0
- pyobo-0.12.0.dist-info/WHEEL +4 -0
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
- {pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
- pyobo/apps/__init__.py +0 -3
- pyobo/apps/cli.py +0 -24
- pyobo/apps/gilda/__init__.py +0 -3
- pyobo/apps/gilda/__main__.py +0 -8
- pyobo/apps/gilda/app.py +0 -48
- pyobo/apps/gilda/cli.py +0 -36
- pyobo/apps/gilda/templates/base.html +0 -33
- pyobo/apps/gilda/templates/home.html +0 -11
- pyobo/apps/gilda/templates/matches.html +0 -32
- pyobo/apps/mapper/__init__.py +0 -3
- pyobo/apps/mapper/__main__.py +0 -11
- pyobo/apps/mapper/cli.py +0 -37
- pyobo/apps/mapper/mapper.py +0 -187
- pyobo/apps/mapper/templates/base.html +0 -35
- pyobo/apps/mapper/templates/mapper_home.html +0 -64
- pyobo/aws.py +0 -162
- pyobo/cli/aws.py +0 -47
- pyobo/identifier_utils.py +0 -142
- pyobo/normalizer.py +0 -232
- pyobo/registries/__init__.py +0 -16
- pyobo/registries/metaregistry.json +0 -507
- pyobo/registries/metaregistry.py +0 -135
- pyobo/sources/icd11.py +0 -105
- pyobo/xrefdb/__init__.py +0 -1
- pyobo/xrefdb/canonicalizer.py +0 -214
- pyobo/xrefdb/priority.py +0 -59
- pyobo/xrefdb/sources/__init__.py +0 -60
- pyobo/xrefdb/sources/biomappings.py +0 -36
- pyobo/xrefdb/sources/cbms2019.py +0 -91
- pyobo/xrefdb/sources/chembl.py +0 -83
- pyobo/xrefdb/sources/compath.py +0 -82
- pyobo/xrefdb/sources/famplex.py +0 -64
- pyobo/xrefdb/sources/gilda.py +0 -50
- pyobo/xrefdb/sources/intact.py +0 -113
- pyobo/xrefdb/sources/ncit.py +0 -133
- pyobo/xrefdb/sources/pubchem.py +0 -27
- pyobo/xrefdb/sources/wikidata.py +0 -116
- pyobo-0.11.1.dist-info/RECORD +0 -173
- pyobo-0.11.1.dist-info/WHEEL +0 -5
- pyobo-0.11.1.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Download utilities for SIGNOR."""
|
|
2
|
+
|
|
3
|
+
import enum
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
from pyobo.utils.path import prefix_directory_join
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"DownloadKey",
|
|
12
|
+
"download_signor",
|
|
13
|
+
"get_signor_df",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DownloadKey(enum.Enum):
    """Download key.

    Each member's value is the exact form-field string that the SIGNOR
    download endpoint dispatches on (see :func:`download_signor`).
    """

    complex = "Download complex data"
    family = "Download protein family data"
    phenotype = "Download phenotype data"
    stimulus = "Download stimulus data"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def download_signor(key: DownloadKey) -> requests.Response:
    """Download a data dump from SIGNOR.

    :param key: Which dataset to request; the enum's value is sent as the
        ``submit`` form field that SIGNOR's endpoint dispatches on.
    :return: The raw HTTP response for the requested dataset.
    """
    return requests.post(
        "https://signor.uniroma2.it/download_complexes.php",
        files={"submit": (None, key.value)},
        # Without a timeout, requests will block indefinitely if the
        # SIGNOR server stops responding.
        timeout=300,
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_signor_df(prefix: str, *, version: str, key: DownloadKey, force: bool) -> pd.DataFrame:
    """Get the appropriate SIGNOR dataframe.

    :param prefix: The resource prefix under which the file is cached.
    :param version: The data version, used to build the cache path.
    :param key: Which SIGNOR dataset to retrieve.
    :param force: If true, re-download even when a cached copy exists.
    :return: The dataset parsed as a dataframe (SIGNOR uses ``;`` separators).
    """
    path = prefix_directory_join(prefix, version=version, name=f"{key.name}.csv")
    if not path.is_file() or force:
        res = download_signor(key)
        # Fail loudly instead of writing an HTTP error page to the cache -
        # a poisoned cache file would silently break all later runs.
        res.raise_for_status()
        path.write_text(res.text)
    df = pd.read_csv(path, sep=";")
    return df
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""A source for SIGNOR complexes."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from pyobo import Obo, Reference, Term, default_reference
|
|
8
|
+
from pyobo.sources.signor.download import DownloadKey, get_signor_df
|
|
9
|
+
from pyobo.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED
|
|
10
|
+
from pyobo.struct.typedef import exact_match, has_component, has_member
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"SignorGetter",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
PREFIX = "signor"
|
|
17
|
+
|
|
18
|
+
def _make_root_term(identifier: str) -> Term:
    """Construct one injected grouping term for the given local identifier."""
    term = Term(reference=default_reference(PREFIX, identifier))
    term.append_contributor(CHARLIE_TERM)
    term.append_comment(PYOBO_INJECTED)
    return term


PROTEIN_FAMILY = _make_root_term("protein-family")
PROTEIN_COMPLEX = _make_root_term("protein-complex")
PHENOTYPE = _make_root_term("phenotype")
STIMULUS = _make_root_term("stimulus")

# The injected terms that serve as the roots of the SIGNOR ontology
ROOT_TERMS = (PROTEIN_FAMILY, PROTEIN_COMPLEX, PHENOTYPE, STIMULUS)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class SignorGetter(Obo):
    """An ontology representation of SIGNOR complexes."""

    # The resource prefix doubles as the bioversions lookup key
    ontology = bioversions_key = PREFIX
    typedefs = [exact_match, has_component, has_member]
    # The four injected grouping terms act as the ontology's roots
    root_terms = [r.reference for r in ROOT_TERMS]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: If true, re-download the underlying SIGNOR data dumps.
        """
        return iter_terms(version=self._version_or_raise, force=force)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
    """Iterate over terms.

    Yields the injected metadata and root terms first, then the complex,
    family, stimulus, and phenotype terms parsed from the SIGNOR dumps.

    :param version: The SIGNOR data version.
    :param force: If true, re-download the underlying data dumps.
    """
    yield CHARLIE_TERM
    yield HUMAN_TERM
    yield from ROOT_TERMS

    complexes_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.complex)
    for complex_id, complex_name, component_ids in complexes_df.values:
        complex_term = Term.from_triple(PREFIX, complex_id, complex_name)
        complex_term.append_parent(PROTEIN_COMPLEX)
        for raw_component in component_ids.split(","):
            component_id = raw_component.strip()
            if component_id.startswith("SIGNOR-"):
                component = Reference(prefix="signor", identifier=component_id)
            elif component_id.startswith("CHEBI:"):
                component = Reference(prefix="chebi", identifier=component_id.removeprefix("CHEBI:"))
            else:
                # anything that is neither a SIGNOR nor a CHEBI CURIE is
                # assumed to be a UniProt accession
                component = Reference(prefix="uniprot", identifier=component_id)
            complex_term.annotate_object(has_component, component)
        yield complex_term

    family_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.family)
    for family_id, family_name, member_ids in family_df.values:
        family_term = Term.from_triple(PREFIX, family_id, family_name)
        family_term.append_parent(PROTEIN_FAMILY)
        for raw_member in member_ids.split(","):
            family_term.annotate_object(
                has_member, Reference(prefix="uniprot", identifier=raw_member.strip())
            )
        yield family_term

    stimulus_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.stimulus)
    # for some reason, there are many duplicates in this file
    stimulus_df = stimulus_df.drop_duplicates()
    for stimulus_id, stimulus_name, stimulus_description in stimulus_df.values:
        stimulus_term = Term.from_triple(
            PREFIX, stimulus_id, stimulus_name, definition=_clean_descr(stimulus_description)
        )
        stimulus_term.append_parent(STIMULUS)
        yield stimulus_term

    phenotypes_df = get_signor_df(PREFIX, version=version, force=force, key=DownloadKey.phenotype)
    for phenotype_id, phenotype_name, phenotype_description in phenotypes_df.values:
        phenotype_term = Term.from_triple(
            PREFIX, phenotype_id, phenotype_name, definition=_clean_descr(phenotype_description)
        )
        phenotype_term.append_parent(PHENOTYPE)
        yield phenotype_term
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _clean_descr(d) -> str | None:
|
|
99
|
+
if pd.isna(d):
|
|
100
|
+
return None
|
|
101
|
+
return d.replace("\n", " ")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
    # Build the ontology via the shared Obo command line interface
    SignorGetter.cli()
|
pyobo/sources/slm.py
CHANGED
|
@@ -5,9 +5,9 @@ from collections.abc import Iterable
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from tqdm.auto import tqdm
|
|
7
7
|
|
|
8
|
-
from pyobo import Obo, Reference, Term
|
|
8
|
+
from pyobo import Obo, Reference, Term, TypeDef
|
|
9
9
|
from pyobo.struct.struct import abbreviation as abbreviation_typedef
|
|
10
|
-
from pyobo.struct.typedef import exact_match, has_inchi, has_smiles
|
|
10
|
+
from pyobo.struct.typedef import exact_match, has_citation, has_inchi, has_smiles
|
|
11
11
|
from pyobo.utils.path import ensure_df
|
|
12
12
|
|
|
13
13
|
__all__ = [
|
|
@@ -36,13 +36,14 @@ COLUMNS = [
|
|
|
36
36
|
"HMDB",
|
|
37
37
|
"PMID",
|
|
38
38
|
]
|
|
39
|
+
LEVEL = TypeDef.default(PREFIX, "level", is_metadata_tag=True)
|
|
39
40
|
|
|
40
41
|
|
|
41
42
|
class SLMGetter(Obo):
|
|
42
43
|
"""An ontology representation of SwissLipid's lipid nomenclature."""
|
|
43
44
|
|
|
44
45
|
ontology = bioversions_key = PREFIX
|
|
45
|
-
typedefs = [exact_match]
|
|
46
|
+
typedefs = [exact_match, LEVEL, has_inchi, has_smiles, has_citation]
|
|
46
47
|
synonym_typedefs = [abbreviation_typedef]
|
|
47
48
|
|
|
48
49
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
@@ -50,9 +51,7 @@ class SLMGetter(Obo):
|
|
|
50
51
|
return iter_terms(force=force, version=self._version_or_raise)
|
|
51
52
|
|
|
52
53
|
|
|
53
|
-
|
|
54
|
-
"""Get SwissLipids as OBO."""
|
|
55
|
-
return SLMGetter(force=force)
|
|
54
|
+
INVALID_INCHI = {"-", "none"}
|
|
56
55
|
|
|
57
56
|
|
|
58
57
|
def iter_terms(version: str, force: bool = False):
|
|
@@ -90,27 +89,25 @@ def iter_terms(version: str, force: bool = False):
|
|
|
90
89
|
raise ValueError(identifier)
|
|
91
90
|
term = Term.from_triple(PREFIX, identifier, name)
|
|
92
91
|
if pd.notna(level):
|
|
93
|
-
term.
|
|
92
|
+
term.annotate_string(LEVEL, level)
|
|
94
93
|
if pd.notna(abbreviation):
|
|
95
94
|
term.append_synonym(abbreviation, type=abbreviation_typedef)
|
|
96
95
|
if pd.notna(synonyms):
|
|
97
96
|
for synonym in synonyms.split("|"):
|
|
98
97
|
term.append_synonym(synonym.strip())
|
|
99
98
|
if pd.notna(smiles):
|
|
100
|
-
term.
|
|
99
|
+
term.annotate_string(has_smiles, smiles)
|
|
101
100
|
if pd.notna(inchi) and inchi != "InChI=none":
|
|
102
101
|
if inchi.startswith("InChI="):
|
|
103
102
|
inchi = inchi[len("InChI=") :]
|
|
104
|
-
term.
|
|
103
|
+
term.annotate_string(has_inchi, inchi)
|
|
105
104
|
if pd.notna(inchikey):
|
|
106
105
|
inchikey = inchikey.removeprefix("InChIKey=").strip()
|
|
107
|
-
if inchikey and inchikey
|
|
106
|
+
if inchikey and inchikey not in INVALID_INCHI:
|
|
108
107
|
try:
|
|
109
108
|
inchi_ref = Reference(prefix="inchikey", identifier=inchikey)
|
|
110
109
|
except ValueError:
|
|
111
|
-
tqdm.write(
|
|
112
|
-
f"[slm:{identifier}] had invalid inchikey reference: ({type(inchikey)}) {inchikey}"
|
|
113
|
-
)
|
|
110
|
+
tqdm.write(f"[slm:{identifier}] had invalid inchikey reference: `{inchikey}`")
|
|
114
111
|
else:
|
|
115
112
|
term.append_exact_match(inchi_ref)
|
|
116
113
|
for chebi_id in _split(chebi_ids):
|
|
@@ -120,7 +117,7 @@ def iter_terms(version: str, force: bool = False):
|
|
|
120
117
|
for hmdb_id in _split(hmdb_ids):
|
|
121
118
|
term.append_exact_match(("hmdb", hmdb_id))
|
|
122
119
|
for pubmed_id in _split(pubmed_ids):
|
|
123
|
-
term.append_provenance(("pubmed", pubmed_id))
|
|
120
|
+
term.append_provenance(Reference(prefix="pubmed", identifier=pubmed_id))
|
|
124
121
|
# TODO how to handle class, parents, and components?
|
|
125
122
|
yield term
|
|
126
123
|
|
|
@@ -134,4 +131,4 @@ def _split(s: str) -> Iterable[str]:
|
|
|
134
131
|
|
|
135
132
|
|
|
136
133
|
if __name__ == "__main__":
|
|
137
|
-
|
|
134
|
+
SLMGetter.cli()
|
pyobo/sources/umls/__init__.py
CHANGED
pyobo/sources/umls/__main__.py
CHANGED
|
File without changes
|
|
@@ -1,19 +1,22 @@
|
|
|
1
1
|
"""Utilities for UMLS synonyms."""
|
|
2
2
|
|
|
3
|
+
import re
|
|
3
4
|
from collections.abc import Mapping
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
import requests
|
|
7
8
|
from bs4 import BeautifulSoup
|
|
8
9
|
|
|
10
|
+
from pyobo.struct import SynonymTypeDef, default_reference
|
|
9
11
|
from pyobo.utils.io import open_map_tsv, write_map_tsv
|
|
10
12
|
|
|
11
|
-
__all__ = ["get_umls_synonyms"]
|
|
13
|
+
__all__ = ["get_umls_synonyms", "get_umls_typedefs"]
|
|
12
14
|
|
|
13
15
|
HERE = Path(__file__).parent.resolve()
|
|
14
16
|
SYNONYM_TYPE_PATH = HERE.joinpath("synonym_types.tsv")
|
|
15
17
|
|
|
16
18
|
ABBREVIATIONS_URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html"
|
|
19
|
+
SPACES = re.compile(r"\s+")
|
|
17
20
|
|
|
18
21
|
|
|
19
22
|
def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
|
|
@@ -23,14 +26,27 @@ def get_umls_synonyms(*, refresh: bool = False) -> Mapping[str, str]:
|
|
|
23
26
|
res = requests.get(ABBREVIATIONS_URL, timeout=5)
|
|
24
27
|
soup = BeautifulSoup(res.text, features="html.parser")
|
|
25
28
|
table = soup.find(id="mrdoc_TTY")
|
|
26
|
-
|
|
29
|
+
if table is None:
|
|
30
|
+
raise ValueError
|
|
31
|
+
body = table.find("tbody") # type:ignore[attr-defined]
|
|
32
|
+
if body is None:
|
|
33
|
+
raise ValueError
|
|
27
34
|
rv = {}
|
|
28
35
|
for row in body.find_all("tr"):
|
|
29
36
|
left, right = row.find_all("td")
|
|
30
|
-
rv[left.text.strip()] = right.text.strip()
|
|
37
|
+
rv[left.text.strip()] = SPACES.sub(" ", right.text.strip())
|
|
31
38
|
write_map_tsv(path=SYNONYM_TYPE_PATH, rv=rv, header=["key", "name"])
|
|
32
39
|
return rv
|
|
33
40
|
|
|
34
41
|
|
|
42
|
+
def get_umls_typedefs(*, refresh: bool = False) -> dict[str, SynonymTypeDef]:
    """Get all synonym type definitions."""
    rv = {}
    for identifier, name in get_umls_synonyms(refresh=refresh).items():
        rv[identifier] = SynonymTypeDef(
            reference=default_reference("umls", identifier, name=name)
        )
    return rv
|
|
49
|
+
|
|
50
|
+
|
|
35
51
|
if __name__ == "__main__":
|
|
36
|
-
|
|
52
|
+
get_umls_typedefs(refresh=True)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Converter for UMLS Semantic Types."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
from pyobo import Obo, Reference, Term, default_reference
|
|
6
|
+
from pyobo.struct.typedef import has_category
|
|
7
|
+
from pyobo.utils.path import ensure_df
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"UMLSSTyGetter",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
PREFIX = "sty"
|
|
14
|
+
|
|
15
|
+
URL = "https://www.nlm.nih.gov/research/umls/knowledge_sources/semantic_network/SemGroups.txt"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class UMLSSTyGetter(Obo):
    """An ontology representation of UMLS Semantic Types."""

    ontology = PREFIX
    # versions are tracked against the parent UMLS release
    bioversions_key = "umls"
    typedefs = [has_category]

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Accepted for interface compatibility but currently
            not forwarded to the module-level :func:`iter_terms`.
        """
        return iter_terms(version=self._version_or_raise)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Column names for the pipe-separated, header-less SemGroups.txt file
COLUMNS = [
    "group",
    "group_label",
    "sty_id",
    "sty_name",
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over UMLS terms."""
    df = ensure_df(PREFIX, url=URL, version=version, sep="|", header=None, names=COLUMNS)

    # one grouping term per semantic group, keyed by its abbreviation
    group_terms: dict[str, Term] = {}
    for group_key, group_label in df[["group", "group_label"]].drop_duplicates().values:
        group_terms[group_key] = Term(
            reference=default_reference(PREFIX, group_key, name=group_label),
        )
    yield from group_terms.values()

    # each semantic type becomes a term whose parent is its group
    for group_key, _, sty_id, sty_name in df.values:
        sty_term = Term(reference=Reference(prefix="sty", identifier=sty_id, name=sty_name))
        sty_term.append_parent(group_terms[group_key])
        yield sty_term
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
if __name__ == "__main__":
    # Build the ontology via the shared Obo command line interface
    UMLSSTyGetter.cli()
|
|
@@ -146,6 +146,7 @@ OAM Obsolete Modifier Abbreviation
|
|
|
146
146
|
OAP Obsolete active preferred term
|
|
147
147
|
OAS Obsolete active synonym
|
|
148
148
|
OC Nursing outcomes
|
|
149
|
+
ODN Obsolete Display Name
|
|
149
150
|
OET Obsolete entry term
|
|
150
151
|
OF Obsolete fully specified name
|
|
151
152
|
OL Non-current Lower Level Term
|
|
@@ -188,7 +189,6 @@ PX Expanded preferred terms (pair with PS)
|
|
|
188
189
|
PXQ Preferred qualifier term
|
|
189
190
|
QAB Qualifier abbreviation
|
|
190
191
|
QEV Qualifier entry version
|
|
191
|
-
QSV Qualifier sort version
|
|
192
192
|
RAB Root abbreviation
|
|
193
193
|
RHT Root hierarchical term
|
|
194
194
|
RPT Root preferred term
|
pyobo/sources/umls/umls.py
CHANGED
|
@@ -15,7 +15,7 @@ from umls_downloader import open_umls, open_umls_semantic_types
|
|
|
15
15
|
|
|
16
16
|
from pyobo import Obo, Reference, Synonym, SynonymTypeDef, Term
|
|
17
17
|
|
|
18
|
-
from .get_synonym_types import
|
|
18
|
+
from .get_synonym_types import get_umls_typedefs
|
|
19
19
|
|
|
20
20
|
__all__ = [
|
|
21
21
|
"UMLSGetter",
|
|
@@ -46,30 +46,26 @@ RRF_COLUMNS = [
|
|
|
46
46
|
|
|
47
47
|
PREFIX = "umls"
|
|
48
48
|
SOURCE_VOCAB_URL = "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html"
|
|
49
|
-
|
|
49
|
+
UMLS_TYPEDEFS: dict[str, SynonymTypeDef] = get_umls_typedefs()
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
class UMLSGetter(Obo):
|
|
53
53
|
"""An ontology representation of UMLS."""
|
|
54
54
|
|
|
55
55
|
ontology = bioversions_key = PREFIX
|
|
56
|
-
synonym_typedefs =
|
|
56
|
+
synonym_typedefs = list(UMLS_TYPEDEFS.values())
|
|
57
57
|
|
|
58
58
|
def iter_terms(self, force: bool = False) -> Iterable[Term]:
|
|
59
59
|
"""Iterate over terms in the ontology."""
|
|
60
60
|
return iter_terms(version=self._version_or_raise)
|
|
61
61
|
|
|
62
62
|
|
|
63
|
-
def get_obo() -> Obo:
|
|
64
|
-
"""Get UMLS as OBO."""
|
|
65
|
-
return UMLSGetter()
|
|
66
|
-
|
|
67
|
-
|
|
68
63
|
def get_semantic_types() -> Mapping[str, set[str]]:
|
|
69
64
|
"""Get UMLS semantic types for each term."""
|
|
70
65
|
dd = defaultdict(set)
|
|
71
66
|
with open_umls_semantic_types() as file:
|
|
72
|
-
|
|
67
|
+
# this is very fast and doesn't need a progress bar
|
|
68
|
+
for line in file:
|
|
73
69
|
cui, sty, _ = line.decode("utf8").split("|", 2)
|
|
74
70
|
dd[cui].add(sty)
|
|
75
71
|
return dict(dd)
|
|
@@ -80,7 +76,7 @@ def iter_terms(version: str) -> Iterable[Term]:
|
|
|
80
76
|
semantic_types = get_semantic_types()
|
|
81
77
|
|
|
82
78
|
with open_umls(version=version) as file:
|
|
83
|
-
it = tqdm(file, unit_scale=True, desc="[umls] parsing")
|
|
79
|
+
it = tqdm(file, unit_scale=True, desc="[umls] parsing", total=16_700_000)
|
|
84
80
|
lines = (line.decode("utf-8").strip().split("|") for line in it)
|
|
85
81
|
for cui, cui_lines in itt.groupby(lines, key=operator.itemgetter(0)):
|
|
86
82
|
df = pd.DataFrame(list(cui_lines), columns=RRF_COLUMNS)
|
|
@@ -96,38 +92,38 @@ def iter_terms(version: str) -> Iterable[Term]:
|
|
|
96
92
|
continue
|
|
97
93
|
|
|
98
94
|
df["TTY - Term Type in Source"] = df["TTY - Term Type in Source"].map(
|
|
99
|
-
|
|
95
|
+
UMLS_TYPEDEFS.__getitem__
|
|
100
96
|
)
|
|
101
97
|
|
|
102
98
|
_r = pref_rows_df.iloc[0]
|
|
103
99
|
sdf = df[["SAB - source name", "CODE", "TTY - Term Type in Source", "STR"]]
|
|
104
100
|
|
|
105
101
|
synonyms = []
|
|
106
|
-
xrefs =
|
|
102
|
+
xrefs = set()
|
|
107
103
|
for source, identifier, synonym_type, synonym in sdf.values:
|
|
108
104
|
norm_source = bioregistry.normalize_prefix(source)
|
|
109
|
-
if norm_source
|
|
105
|
+
if not norm_source or not identifier or "," in identifier:
|
|
110
106
|
provenance = []
|
|
111
107
|
else:
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
108
|
+
try:
|
|
109
|
+
ref = Reference(prefix=norm_source, identifier=identifier)
|
|
110
|
+
except ValueError:
|
|
111
|
+
continue
|
|
112
|
+
else:
|
|
113
|
+
provenance = [ref]
|
|
114
|
+
xrefs.add(ref)
|
|
115
115
|
synonyms.append(
|
|
116
116
|
Synonym(
|
|
117
117
|
name=synonym,
|
|
118
118
|
provenance=provenance,
|
|
119
|
-
type=
|
|
119
|
+
type=synonym_type.reference,
|
|
120
120
|
)
|
|
121
121
|
)
|
|
122
122
|
|
|
123
|
-
xrefs = sorted(
|
|
124
|
-
set(xrefs), key=lambda reference: (reference.prefix, reference.identifier)
|
|
125
|
-
)
|
|
126
|
-
|
|
127
123
|
term = Term(
|
|
128
124
|
reference=Reference(prefix=PREFIX, identifier=cui, name=_r["STR"]),
|
|
129
125
|
synonyms=synonyms,
|
|
130
|
-
xrefs=xrefs,
|
|
126
|
+
xrefs=sorted(xrefs),
|
|
131
127
|
)
|
|
132
128
|
for sty_id in semantic_types.get(cui, set()):
|
|
133
129
|
term.append_parent(Reference(prefix="sty", identifier=sty_id))
|
pyobo/sources/unimod.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Unimod provides an OBO file, but it's got lots of errors in its encoding."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterable
|
|
4
|
+
|
|
5
|
+
from lxml import etree
|
|
6
|
+
|
|
7
|
+
from pyobo.struct import Obo, Reference, Term
|
|
8
|
+
from pyobo.utils.path import ensure_path
|
|
9
|
+
|
|
10
|
+
URL = "https://www.unimod.org/xml/unimod.xml"
|
|
11
|
+
PREFIX_MAP = {"umod": "http://www.unimod.org/xmlns/schema/unimod_2"}
|
|
12
|
+
PREFIX = "unimod"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UnimodGetter(Obo):
    """An ontology representation of the unimod modifications."""

    ontology = bioversions_key = PREFIX
    # the source XML is unversioned, so the version is resolved at run time
    dynamic_version = True

    def iter_terms(self, force: bool = False) -> Iterable[Term]:
        """Iterate over terms in the ontology.

        :param force: Accepted for interface compatibility but currently
            unused by :func:`get_terms`.
        """
        return get_terms()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_terms() -> Iterable[Term]:
    """Get terms."""
    xml_path = ensure_path("unimod", url=URL)
    root = etree.parse(xml_path).getroot()
    # each <umod:mod> element corresponds to a single modification term
    return (
        _mod_to_term(mod)
        for mod in root.findall("umod:modifications/umod:mod", namespaces=PREFIX_MAP)
    )
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _mod_to_term(mod: etree.Element) -> Term:
    """Convert a single ``<umod:mod>`` XML element into a term."""
    attributes = mod.attrib
    title = attributes["title"]
    full_name = attributes["full_name"]
    # only keep the long name as a definition when it adds information
    definition = None if full_name == title else full_name
    return Term(
        reference=Reference(prefix=PREFIX, identifier=attributes["record_id"], name=title),
        definition=definition,
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
if __name__ == "__main__":
    # Build the ontology via the shared Obo command line interface
    UnimodGetter.cli()
|