PyPI - pyobo - Versions diffs - 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl - Mend

pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (242) hide show

pyobo/.DS_Store +0 -0
pyobo/__init__.py +95 -20
pyobo/__main__.py +0 -0
pyobo/api/__init__.py +81 -10
pyobo/api/alts.py +52 -42
pyobo/api/combine.py +39 -0
pyobo/api/edges.py +68 -0
pyobo/api/hierarchy.py +231 -203
pyobo/api/metadata.py +14 -19
pyobo/api/names.py +207 -127
pyobo/api/properties.py +117 -113
pyobo/api/relations.py +68 -94
pyobo/api/species.py +24 -21
pyobo/api/typedefs.py +11 -11
pyobo/api/utils.py +66 -13
pyobo/api/xrefs.py +108 -114
pyobo/cli/__init__.py +0 -0
pyobo/cli/cli.py +35 -50
pyobo/cli/database.py +183 -161
pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
pyobo/cli/lookup.py +163 -195
pyobo/cli/utils.py +19 -6
pyobo/constants.py +102 -3
pyobo/getters.py +196 -118
pyobo/gilda_utils.py +79 -200
pyobo/identifier_utils/__init__.py +41 -0
pyobo/identifier_utils/api.py +296 -0
pyobo/identifier_utils/model.py +130 -0
pyobo/identifier_utils/preprocessing.json +812 -0
pyobo/identifier_utils/preprocessing.py +61 -0
pyobo/identifier_utils/relations/__init__.py +8 -0
pyobo/identifier_utils/relations/api.py +162 -0
pyobo/identifier_utils/relations/data.json +5824 -0
pyobo/identifier_utils/relations/data_owl.json +57 -0
pyobo/identifier_utils/relations/data_rdf.json +1 -0
pyobo/identifier_utils/relations/data_rdfs.json +7 -0
pyobo/mocks.py +9 -6
pyobo/ner/__init__.py +9 -0
pyobo/ner/api.py +72 -0
pyobo/ner/normalizer.py +33 -0
pyobo/obographs.py +43 -39
pyobo/plugins.py +5 -4
pyobo/py.typed +0 -0
pyobo/reader.py +1358 -395
pyobo/reader_utils.py +155 -0
pyobo/resource_utils.py +42 -22
pyobo/resources/__init__.py +0 -0
pyobo/resources/goc.py +75 -0
pyobo/resources/goc.tsv +188 -0
pyobo/resources/ncbitaxon.py +4 -5
pyobo/resources/ncbitaxon.tsv.gz +0 -0
pyobo/resources/ro.py +3 -2
pyobo/resources/ro.tsv +0 -0
pyobo/resources/so.py +0 -0
pyobo/resources/so.tsv +0 -0
pyobo/sources/README.md +12 -8
pyobo/sources/__init__.py +52 -29
pyobo/sources/agrovoc.py +0 -0
pyobo/sources/antibodyregistry.py +11 -12
pyobo/sources/bigg/__init__.py +13 -0
pyobo/sources/bigg/bigg_compartment.py +81 -0
pyobo/sources/bigg/bigg_metabolite.py +229 -0
pyobo/sources/bigg/bigg_model.py +46 -0
pyobo/sources/bigg/bigg_reaction.py +77 -0
pyobo/sources/biogrid.py +1 -2
pyobo/sources/ccle.py +7 -12
pyobo/sources/cgnc.py +0 -5
pyobo/sources/chebi.py +1 -1
pyobo/sources/chembl/__init__.py +9 -0
pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
pyobo/sources/chembl/chembl_target.py +160 -0
pyobo/sources/civic_gene.py +55 -15
pyobo/sources/clinicaltrials.py +160 -0
pyobo/sources/complexportal.py +24 -24
pyobo/sources/conso.py +14 -22
pyobo/sources/cpt.py +0 -0
pyobo/sources/credit.py +1 -9
pyobo/sources/cvx.py +27 -5
pyobo/sources/depmap.py +9 -12
pyobo/sources/dictybase_gene.py +2 -7
pyobo/sources/drugbank/__init__.py +9 -0
pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
pyobo/sources/drugcentral.py +17 -13
pyobo/sources/expasy.py +31 -34
pyobo/sources/famplex.py +13 -18
pyobo/sources/flybase.py +3 -8
pyobo/sources/gard.py +62 -0
pyobo/sources/geonames/__init__.py +9 -0
pyobo/sources/geonames/features.py +28 -0
pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
pyobo/sources/geonames/utils.py +115 -0
pyobo/sources/gmt_utils.py +6 -7
pyobo/sources/go.py +20 -13
pyobo/sources/gtdb.py +154 -0
pyobo/sources/gwascentral/__init__.py +9 -0
pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
pyobo/sources/hgnc/__init__.py +9 -0
pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
pyobo/sources/icd/__init__.py +9 -0
pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
pyobo/sources/icd/icd11.py +148 -0
pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
pyobo/sources/interpro.py +4 -9
pyobo/sources/itis.py +0 -5
pyobo/sources/kegg/__init__.py +0 -0
pyobo/sources/kegg/api.py +16 -38
pyobo/sources/kegg/genes.py +9 -20
pyobo/sources/kegg/genome.py +1 -7
pyobo/sources/kegg/pathway.py +9 -21
pyobo/sources/mesh.py +58 -24
pyobo/sources/mgi.py +3 -10
pyobo/sources/mirbase/__init__.py +11 -0
pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
pyobo/sources/msigdb.py +74 -39
pyobo/sources/ncbi/__init__.py +9 -0
pyobo/sources/ncbi/ncbi_gc.py +162 -0
pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
pyobo/sources/nih_reporter.py +60 -0
pyobo/sources/nlm/__init__.py +9 -0
pyobo/sources/nlm/nlm_catalog.py +48 -0
pyobo/sources/nlm/nlm_publisher.py +36 -0
pyobo/sources/nlm/utils.py +116 -0
pyobo/sources/npass.py +6 -8
pyobo/sources/omim_ps.py +10 -3
pyobo/sources/pathbank.py +4 -8
pyobo/sources/pfam/__init__.py +9 -0
pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
pyobo/sources/pharmgkb/__init__.py +15 -0
pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
pyobo/sources/pharmgkb/utils.py +86 -0
pyobo/sources/pid.py +1 -6
pyobo/sources/pombase.py +6 -10
pyobo/sources/pubchem.py +4 -9
pyobo/sources/reactome.py +5 -11
pyobo/sources/rgd.py +11 -16
pyobo/sources/rhea.py +37 -36
pyobo/sources/ror.py +69 -42
pyobo/sources/selventa/__init__.py +0 -0
pyobo/sources/selventa/schem.py +4 -7
pyobo/sources/selventa/scomp.py +1 -6
pyobo/sources/selventa/sdis.py +4 -7
pyobo/sources/selventa/sfam.py +1 -6
pyobo/sources/sgd.py +6 -11
pyobo/sources/signor/__init__.py +7 -0
pyobo/sources/signor/download.py +41 -0
pyobo/sources/signor/signor_complexes.py +105 -0
pyobo/sources/slm.py +12 -15
pyobo/sources/umls/__init__.py +7 -1
pyobo/sources/umls/__main__.py +0 -0
pyobo/sources/umls/get_synonym_types.py +20 -4
pyobo/sources/umls/sty.py +57 -0
pyobo/sources/umls/synonym_types.tsv +1 -1
pyobo/sources/umls/umls.py +18 -22
pyobo/sources/unimod.py +46 -0
pyobo/sources/uniprot/__init__.py +1 -1
pyobo/sources/uniprot/uniprot.py +40 -32
pyobo/sources/uniprot/uniprot_ptm.py +4 -34
pyobo/sources/utils.py +3 -2
pyobo/sources/wikipathways.py +7 -10
pyobo/sources/zfin.py +5 -10
pyobo/ssg/__init__.py +12 -16
pyobo/ssg/base.html +0 -0
pyobo/ssg/index.html +26 -13
pyobo/ssg/term.html +12 -2
pyobo/ssg/typedef.html +0 -0
pyobo/struct/__init__.py +54 -8
pyobo/struct/functional/__init__.py +1 -0
pyobo/struct/functional/dsl.py +2572 -0
pyobo/struct/functional/macros.py +423 -0
pyobo/struct/functional/obo_to_functional.py +385 -0
pyobo/struct/functional/ontology.py +270 -0
pyobo/struct/functional/utils.py +112 -0
pyobo/struct/reference.py +331 -136
pyobo/struct/struct.py +1413 -643
pyobo/struct/struct_utils.py +1078 -0
pyobo/struct/typedef.py +162 -210
pyobo/struct/utils.py +12 -5
pyobo/struct/vocabulary.py +138 -0
pyobo/utils/__init__.py +0 -0
pyobo/utils/cache.py +13 -11
pyobo/utils/io.py +17 -31
pyobo/utils/iter.py +5 -5
pyobo/utils/misc.py +41 -53
pyobo/utils/ndex_utils.py +0 -0
pyobo/utils/path.py +76 -70
pyobo/version.py +3 -3
{pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/METADATA +224 -225
pyobo-0.12.0.dist-info/RECORD +202 -0
pyobo-0.12.0.dist-info/WHEEL +4 -0
{pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
{pyobo-0.11.1.dist-info → pyobo-0.12.0.dist-info/licenses}/LICENSE +0 -0
pyobo/apps/__init__.py +0 -3
pyobo/apps/cli.py +0 -24
pyobo/apps/gilda/__init__.py +0 -3
pyobo/apps/gilda/__main__.py +0 -8
pyobo/apps/gilda/app.py +0 -48
pyobo/apps/gilda/cli.py +0 -36
pyobo/apps/gilda/templates/base.html +0 -33
pyobo/apps/gilda/templates/home.html +0 -11
pyobo/apps/gilda/templates/matches.html +0 -32
pyobo/apps/mapper/__init__.py +0 -3
pyobo/apps/mapper/__main__.py +0 -11
pyobo/apps/mapper/cli.py +0 -37
pyobo/apps/mapper/mapper.py +0 -187
pyobo/apps/mapper/templates/base.html +0 -35
pyobo/apps/mapper/templates/mapper_home.html +0 -64
pyobo/aws.py +0 -162
pyobo/cli/aws.py +0 -47
pyobo/identifier_utils.py +0 -142
pyobo/normalizer.py +0 -232
pyobo/registries/__init__.py +0 -16
pyobo/registries/metaregistry.json +0 -507
pyobo/registries/metaregistry.py +0 -135
pyobo/sources/icd11.py +0 -105
pyobo/xrefdb/__init__.py +0 -1
pyobo/xrefdb/canonicalizer.py +0 -214
pyobo/xrefdb/priority.py +0 -59
pyobo/xrefdb/sources/__init__.py +0 -60
pyobo/xrefdb/sources/biomappings.py +0 -36
pyobo/xrefdb/sources/cbms2019.py +0 -91
pyobo/xrefdb/sources/chembl.py +0 -83
pyobo/xrefdb/sources/compath.py +0 -82
pyobo/xrefdb/sources/famplex.py +0 -64
pyobo/xrefdb/sources/gilda.py +0 -50
pyobo/xrefdb/sources/intact.py +0 -113
pyobo/xrefdb/sources/ncit.py +0 -133
pyobo/xrefdb/sources/pubchem.py +0 -27
pyobo/xrefdb/sources/wikidata.py +0 -116
pyobo-0.11.1.dist-info/RECORD +0 -173
pyobo-0.11.1.dist-info/WHEEL +0 -5
pyobo-0.11.1.dist-info/top_level.txt +0 -1

pyobo/sources/geonames/utils.py ADDED Viewed

@@ -0,0 +1,115 @@
+"""Shared code for geonames sources."""
+from __future__ import annotations
+from collections.abc import Iterable
+import pandas as pd
+from tqdm import tqdm
+from pyobo import Reference, Term, TypeDef, default_reference
+from pyobo.struct.struct import CHARLIE_TERM, HUMAN_TERM, PYOBO_INJECTED
+from pyobo.utils.path import ensure_df
+PREFIX = "geonames"
+PREFIX_FEATURE = "geonames.feature"
+FEATURES_URL = "https://download.geonames.org/export/dump/featureCodes_en.txt"
+COUNTRIES_URL = "https://download.geonames.org/export/dump/countryInfo.txt"
+ADMIN1_URL = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"
+ADMIN2_URL = "https://download.geonames.org/export/dump/admin2Codes.txt"
+CITIES_URL = "https://download.geonames.org/export/dump/cities15000.zip"
+SYNONYMS_URL = "https://download.geonames.org/export/dump/alternateNamesV2.zip"
+# External parent classes (ENVO references for cities, nations, and administrative regions)
+CITY = Reference(prefix="ENVO", identifier="00000856", name="city")
+NATION = Reference(prefix="ENVO", identifier="00000009", name="national geopolitical entity")
+ADMIN_1 = Reference(prefix="ENVO", identifier="00000005", name="first-order administrative region")
+ADMIN_2 = Reference(prefix="ENVO", identifier="00000006", name="second-order administrative region")
+# Builtin classes: the root "GeoNames feature" reference and the term wrapping it
+FEATURE = default_reference(PREFIX_FEATURE, "feature", "GeoNames feature")
+FEATURE_TERM = Term(reference=FEATURE)
+# Type definitions: a metadata-tag property named "GeoNames code"
+CODE_TYPEDEF = TypeDef(
+    reference=default_reference(PREFIX, "code", name="GeoNames code"), is_metadata_tag=True
+)
+SYNONYMS_DF_COLUMNS = [
+    "id",
+    "geonames_id",
+    "iso_lang",
+    "synonym",
+    "is_preferred",
+    "is_short",
+    "is_colloquial",
+    "is_historic",
+    "start_time",
+    "end_time",
+]
+P_CATEGORY = default_reference(PREFIX_FEATURE, "P", "city feature")
+FEATURE_CATEGORIES = {
+    "A": default_reference(PREFIX_FEATURE, "A", "geopolitical feature"),
+    "H": default_reference(PREFIX_FEATURE, "H", "aquatic feature"),
+    "V": default_reference(PREFIX_FEATURE, "V", "floral feature feature"),
+    "S": default_reference(PREFIX_FEATURE, "S", "building feature"),
+    "U": default_reference(PREFIX_FEATURE, "U", "undersea feature"),
+    "T": default_reference(PREFIX_FEATURE, "T", "geographic feature"),
+    "L": default_reference(PREFIX_FEATURE, "L", "parks feature"),
+    "P": P_CATEGORY,
+    "R": default_reference(PREFIX_FEATURE, "R", "road or rail feature"),
+}
+def get_features(*, force: bool = False) -> dict[str, Term]:
+    """Get all features."""
+    df = ensure_df(
+        PREFIX,
+        url=FEATURES_URL,
+        force=force,
+        keep_default_na=False,  # NA is a country code
+        dtype=str,
+    )
+    rv = {}
+    for identifier, name, description in df.values:
+        if pd.isna(identifier) or identifier == "null":
+            continue
+        term = Term(
+            reference=Reference(
+                prefix=PREFIX_FEATURE, identifier=identifier, name=name if pd.notna(name) else None
+            ),
+            definition=description if pd.notna(description) else None,
+        )
+        parent_letter, _, rest = identifier.partition(".")
+        if not rest:
+            tqdm.write(f"[{PREFIX_FEATURE}] unhandled identifier: {identifier}")
+        elif parent_letter not in FEATURE_CATEGORIES:
+            tqdm.write(f"[{PREFIX_FEATURE}] unhandled category: {parent_letter}")
+        else:
+            term.append_parent(FEATURE_CATEGORIES[parent_letter])
+        rv[identifier] = term
+    return rv
+def get_feature_terms(
+    force: bool = False, features: dict[str, Term] | None = None
+) -> Iterable[Term]:
+    """Get terms for GeoNames features."""
+    yield FEATURE_TERM
+    yield HUMAN_TERM
+    yield CHARLIE_TERM
+    for cat in FEATURE_CATEGORIES.values():
+        yield (
+            Term(reference=cat)
+            .append_parent(FEATURE_TERM)
+            .append_contributor(CHARLIE_TERM)
+            .append_comment(PYOBO_INJECTED)
+        )
+    if features is None:
+        features = get_features(force=force)
+    yield from features.values()

pyobo/sources/gmt_utils.py CHANGED Viewed

@@ -2,17 +2,17 @@
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Union
 GMTSummary = tuple[str, str, set[str]]
 WikiPathwaysGMTSummary = tuple[str, str, str, str, str, set[str]]
-def parse_gmt_file(path: Union[str, Path]) -> Iterable[GMTSummary]:
+def parse_gmt_file(path: str | Path) -> Iterable[GMTSummary]:
     """Return file as list of pathway - gene sets (ENTREZ-identifiers).
     :param path: path to GMT file
-    :return: line-based processed file
+    :yields: processed lines
     """
     with open(path) as file:
         for line in file:
@@ -23,15 +23,14 @@ def _process_line(line: str) -> tuple[str, str, set[str]]:
     """Return the pathway name, url, and gene sets associated.
     :param line: gmt file line
-    :return: pathway name
-    :return: pathway info url
-    :return: genes set associated
+    :returns: pathway name, pathway info url, and genes set associated
     """
     name, info, *entries = (p.strip() for p in line.split("\t"))
     return name, info, set(entries)
-def parse_wikipathways_gmt(path: Union[str, Path]) -> Iterable[WikiPathwaysGMTSummary]:
+def parse_wikipathways_gmt(path: str | Path) -> Iterable[WikiPathwaysGMTSummary]:
     """Parse WikiPathways GMT."""
     for info, _uri, entries in parse_gmt_file(path):
         info, version, identifier, species = info.split("%")

pyobo/sources/go.py CHANGED Viewed

@@ -4,33 +4,46 @@ from pyobo import get_descendants
 __all__ = [
     "is_biological_process",
-    "is_molecular_function",
     "is_cellular_component",
+    "is_molecular_function",
 ]
 def is_biological_process(identifier: str) -> bool:
     """Return if the given GO identifier is a biological process.
+    :param identifier: A local unique identifier from GO
+    :return: If the identifier is a biological process
     >>> is_biological_process("0006915")
     True
     >>> is_biological_process("GO:0006915")
     True
-    >>> is_molecular_function("0006915")
-    False
-    >>> is_cellular_component("0006915")
-    False
     """
     return _is_descendant(identifier, "0008150")
 def is_molecular_function(identifier: str) -> bool:
-    """Return if the given GO identifier is a molecular function."""
+    """Return if the given GO identifier is a molecular function.
+    :param identifier: A local unique identifier from GO
+    :return: If the identifier is a molecular function
+    >>> is_molecular_function("0006915")
+    False
+    """
     return _is_descendant(identifier, "0003674")
 def is_cellular_component(identifier: str) -> bool:
-    """Return if the given GO identifier is a cellular component."""
+    """Return if the given GO identifier is a cellular component.
+    :param identifier: A local unique identifier from GO
+    :return: If the identifier is a cellular component
+    >>> is_cellular_component("0006915")
+    False
+    """
     return _is_descendant(identifier, "0005575")
@@ -40,9 +53,3 @@ def _is_descendant(identifier: str, ancestor: str) -> bool:
         identifier = f"go:{identifier}"
     descendants = get_descendants("go", ancestor)
     return descendants is not None and identifier in descendants
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()

pyobo/sources/gtdb.py ADDED Viewed

@@ -0,0 +1,154 @@
+"""Convert GTDB taxonomy to OBO format."""
+import logging
+from collections.abc import Iterable
+import pandas as pd
+from tqdm.auto import tqdm
+from pyobo.struct import Obo, Reference, Term
+from pyobo.struct.typedef import has_taxonomy_rank
+from pyobo.utils.path import ensure_path
+__all__ = [
+    "GTDBGetter",
+]
+PREFIX = "gtdb"
+#: A mapping from the single-letter GTDB level prefixes (the part before ``__``) to TAXRANK ranks
+LEVEL_TO_TAXRANK = {
+    "d": Reference(prefix="TAXRANK", identifier="0000037", name="domain"),
+    "p": Reference(prefix="TAXRANK", identifier="0000001", name="phylum"),
+    "c": Reference(prefix="TAXRANK", identifier="0000002", name="class"),
+    "o": Reference(prefix="TAXRANK", identifier="0000003", name="order"),
+    "f": Reference(prefix="TAXRANK", identifier="0000004", name="family"),
+    "g": Reference(prefix="TAXRANK", identifier="0000005", name="genus"),
+    "s": Reference(prefix="TAXRANK", identifier="0000006", name="species"),
+}
+#: AR stands for archaea
+GTDB_AR_URL = "https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz"
+#: BAC stands for bacteria
+GTDB_BAC_URL = "https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz"
+logger = logging.getLogger(__name__)
+class GTDBGetter(Obo):
+    """An ontology representation of the GTDB taxonomy."""
+    ontology = bioversions_key = PREFIX
+    typedefs = [has_taxonomy_rank]
+    root_terms = [
+        Reference(prefix=PREFIX, identifier="d__Archea", name="Archea"),
+        Reference(prefix=PREFIX, identifier="d__Bacteria", name="Bacteria"),
+    ]
+    def iter_terms(self, force: bool = False) -> Iterable[Term]:
+        """Iterate over terms in the ontology."""
+        return iter_terms(version=self._version_or_raise, force=force)
+def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
+    """Iterate over GTDB terms."""
+    # Add the taxrank terms so we get nice display in Protege
+    for reference in LEVEL_TO_TAXRANK.values():
+        yield Term(reference=reference)
+    ar_path = ensure_path(PREFIX, url=GTDB_AR_URL, version=version, force=force)
+    bac_path = ensure_path(PREFIX, url=GTDB_BAC_URL, version=version, force=force)
+    columns = ["gtdb_taxonomy", "ncbi_species_taxid"]
+    for path_name, path in [
+        ("ar", ar_path),
+        ("bac", bac_path),
+    ]:
+        df = pd.read_csv(path, sep="\t", dtype=str)
+        for tax_string, ncbitaxon_id in tqdm(
+            df[columns].values, desc=f"[{PREFIX}] processing {path_name}", unit_scale=True
+        ):
+            yield from _process_row(tax_string, ncbitaxon_id)
+def _process_row(tax_string, ncbitaxon_id) -> Iterable[Term]:
+    if not isinstance(tax_string, str):
+        logger.warning(f"Invalid taxonomy string: {tax_string}")
+        return None
+    taxa = _parse_tax_string(tax_string)
+    if not taxa:
+        logger.warning(f"No valid taxa found in: {tax_string}")
+        return None
+    parent_reference = None
+    for level, name in taxa:
+        identifier = f"{level}__{name.replace(' ', '_')}"
+        term = Term(
+            reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
+        )
+        term.annotate_object(has_taxonomy_rank, LEVEL_TO_TAXRANK[level])
+        if parent_reference:
+            term.append_parent(parent_reference)
+        if ncbitaxon_id and level == "s":
+            # if the level is "s", it's a species. There might be multiple
+            # mappings to NCBITaxon, so we only use "see also" as the predicate
+            term.append_xref(
+                Reference(prefix="ncbitaxon", identifier=ncbitaxon_id),
+                # TODO @jose use confidence=... keyword here
+            )
+        yield term
+        parent_reference = term.reference
+def _parse_tax_string(tax_string: str) -> list[tuple[str, str]]:
+    """Parse GTDB taxonomy string into (level, name) tuples."""
+    return [
+        level_name for part in _split_tax_string(tax_string) if (level_name := _parse_name(part))
+    ]
+def _split_tax_string(tax_string: str) -> list[str]:
+    return [p.strip() for p in tax_string.split(";") if p.strip()]
+def _parse_name(part: str) -> tuple[str, str] | None:
+    """Parse a GTDB taxonomy identifier.
+    :param part: The string
+    :returns: A tuple with the level and name, if parsable
+    >>> _parse_name("f__Sulfolobaceae")
+    ('f', 'Sulfoobaceae')
+    The following is malformed because it is missing a double underscore
+    >>> _parse_name("f_Sulfolobaceae")
+    The following is malformed because it has an invalid taxonomic level
+    >>> _parse_name("x__Sulfolobaceae")
+    The following is malformed because it's missing a name
+    >>> _parse_name("f__")
+    """
+    if len(part) < 4 or "__" not in part:
+        logger.warning(f"Malformed taxon string: {part}")
+        return None
+    level, delimiter, name = part.partition("__")
+    if not delimiter:
+        logger.warning(f"Missing double underscore delimiter: {part}")
+        return None
+    if level not in LEVEL_TO_TAXRANK or not name:
+        logger.warning(f"Invalid taxonomic level `{level}` in {part}")
+        return None
+    if not name:
+        logger.warning(f"Missing name: {part}")
+        return None
+    return level, name
+if __name__ == "__main__":
+    GTDBGetter().cli()

pyobo/sources/gwascentral/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Resources from GWAS Central."""
+from .gwascentral_phenotype import GWASCentralPhenotypeGetter
+from .gwascentral_study import GWASCentralStudyGetter
+__all__ = [
+    "GWASCentralPhenotypeGetter",
+    "GWASCentralStudyGetter",
+]

pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} RENAMED Viewed

@@ -5,10 +5,11 @@ from collections.abc import Iterable
 from tqdm.auto import tqdm, trange
-from pyobo.sources.gwascentral_study import VERSION
 from pyobo.struct import Obo, Reference, Term
 from pyobo.utils.path import ensure_path
+from .gwascentral_study import VERSION
 __all__ = [
     "GWASCentralPhenotypeGetter",
 ]
@@ -27,11 +28,6 @@ class GWASCentralPhenotypeGetter(Obo):
         return iter_terms(force=force, version=self._version_or_raise)
-def get_obo(force: bool = False) -> Obo:
-    """Get GWAS Central Studies as OBO."""
-    return GWASCentralPhenotypeGetter(force=force)
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Iterate over terms from GWAS Central Phenotype."""
     for n in trange(1, 11000, desc=f"{PREFIX} download"):
@@ -43,11 +39,13 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
                 url=f"https://www.gwascentral.org/phenotype/HGVPM{n}?format=json",
                 name=f"HGVPM{n}.json",
                 force=force,
+                backend="requests",
+                timeout=1,
             )
         except OSError as e:
             tqdm.write(f"{n}: {e}")
             continue
-        with open(path) as file:
+        with path.open() as file:
             j = json.load(file)
         description = j.get("description")

pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} RENAMED Viewed

@@ -3,7 +3,6 @@
 import logging
 import tarfile
 from collections.abc import Iterable
-from typing import Optional
 from xml.etree import ElementTree
 from pyobo.struct import Obo, Reference, Term, has_part
@@ -31,12 +30,7 @@ class GWASCentralStudyGetter(Obo):
         return iterate_terms(force=force, version=self._version_or_raise)
-def get_obo(force: bool = False):
-    """Get GWAS Central Studies as OBO."""
-    return GWASCentralStudyGetter(force=force)
-def _find_text(element, name: str) -> Optional[str]:
+def _find_text(element, name: str) -> str | None:
     x = element.find(name)
     if x is not None:
         return x.text

pyobo/sources/hgnc/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Resources from HGNC."""
+from .hgnc import HGNCGetter
+from .hgncgenefamily import HGNCGroupGetter
+__all__ = [
+    "HGNCGetter",
+    "HGNCGroupGetter",
+]

pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

pyobo 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl