PyPI - pyobo - Versions diffs - 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl - Mend

pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (227) hide show

pyobo/.DS_Store +0 -0
pyobo/__init__.py +95 -20
pyobo/__main__.py +0 -0
pyobo/api/__init__.py +81 -10
pyobo/api/alts.py +52 -42
pyobo/api/combine.py +39 -0
pyobo/api/edges.py +68 -0
pyobo/api/hierarchy.py +231 -203
pyobo/api/metadata.py +14 -19
pyobo/api/names.py +207 -127
pyobo/api/properties.py +117 -113
pyobo/api/relations.py +68 -94
pyobo/api/species.py +24 -21
pyobo/api/typedefs.py +11 -11
pyobo/api/utils.py +66 -13
pyobo/api/xrefs.py +108 -114
pyobo/cli/__init__.py +0 -0
pyobo/cli/cli.py +35 -50
pyobo/cli/database.py +183 -161
pyobo/{xrefdb/xrefs_pipeline.py → cli/database_utils.py} +54 -73
pyobo/cli/lookup.py +163 -195
pyobo/cli/utils.py +19 -6
pyobo/constants.py +102 -3
pyobo/getters.py +196 -118
pyobo/gilda_utils.py +79 -200
pyobo/identifier_utils/__init__.py +41 -0
pyobo/identifier_utils/api.py +296 -0
pyobo/identifier_utils/model.py +130 -0
pyobo/identifier_utils/preprocessing.json +812 -0
pyobo/identifier_utils/preprocessing.py +61 -0
pyobo/identifier_utils/relations/__init__.py +8 -0
pyobo/identifier_utils/relations/api.py +162 -0
pyobo/identifier_utils/relations/data.json +5824 -0
pyobo/identifier_utils/relations/data_owl.json +57 -0
pyobo/identifier_utils/relations/data_rdf.json +1 -0
pyobo/identifier_utils/relations/data_rdfs.json +7 -0
pyobo/mocks.py +9 -6
pyobo/ner/__init__.py +9 -0
pyobo/ner/api.py +72 -0
pyobo/ner/normalizer.py +33 -0
pyobo/obographs.py +43 -39
pyobo/plugins.py +5 -4
pyobo/py.typed +0 -0
pyobo/reader.py +1358 -395
pyobo/reader_utils.py +155 -0
pyobo/resource_utils.py +42 -22
pyobo/resources/__init__.py +0 -0
pyobo/resources/goc.py +75 -0
pyobo/resources/goc.tsv +188 -0
pyobo/resources/ncbitaxon.py +4 -5
pyobo/resources/ncbitaxon.tsv.gz +0 -0
pyobo/resources/ro.py +3 -2
pyobo/resources/ro.tsv +0 -0
pyobo/resources/so.py +0 -0
pyobo/resources/so.tsv +0 -0
pyobo/sources/README.md +12 -8
pyobo/sources/__init__.py +52 -29
pyobo/sources/agrovoc.py +0 -0
pyobo/sources/antibodyregistry.py +11 -12
pyobo/sources/bigg/__init__.py +13 -0
pyobo/sources/bigg/bigg_compartment.py +81 -0
pyobo/sources/bigg/bigg_metabolite.py +229 -0
pyobo/sources/bigg/bigg_model.py +46 -0
pyobo/sources/bigg/bigg_reaction.py +77 -0
pyobo/sources/biogrid.py +1 -2
pyobo/sources/ccle.py +7 -12
pyobo/sources/cgnc.py +0 -5
pyobo/sources/chebi.py +1 -1
pyobo/sources/chembl/__init__.py +9 -0
pyobo/sources/{chembl.py → chembl/chembl_compound.py} +13 -25
pyobo/sources/chembl/chembl_target.py +160 -0
pyobo/sources/civic_gene.py +55 -15
pyobo/sources/clinicaltrials.py +160 -0
pyobo/sources/complexportal.py +24 -24
pyobo/sources/conso.py +14 -22
pyobo/sources/cpt.py +0 -0
pyobo/sources/credit.py +1 -9
pyobo/sources/cvx.py +27 -5
pyobo/sources/depmap.py +9 -12
pyobo/sources/dictybase_gene.py +2 -7
pyobo/sources/drugbank/__init__.py +9 -0
pyobo/sources/{drugbank.py → drugbank/drugbank.py} +11 -16
pyobo/sources/{drugbank_salt.py → drugbank/drugbank_salt.py} +3 -8
pyobo/sources/drugcentral.py +17 -13
pyobo/sources/expasy.py +31 -34
pyobo/sources/famplex.py +13 -18
pyobo/sources/flybase.py +3 -8
pyobo/sources/gard.py +62 -0
pyobo/sources/geonames/__init__.py +9 -0
pyobo/sources/geonames/features.py +28 -0
pyobo/sources/{geonames.py → geonames/geonames.py} +87 -26
pyobo/sources/geonames/utils.py +115 -0
pyobo/sources/gmt_utils.py +6 -7
pyobo/sources/go.py +20 -13
pyobo/sources/gtdb.py +154 -0
pyobo/sources/gwascentral/__init__.py +9 -0
pyobo/sources/{gwascentral_phenotype.py → gwascentral/gwascentral_phenotype.py} +5 -7
pyobo/sources/{gwascentral_study.py → gwascentral/gwascentral_study.py} +1 -7
pyobo/sources/hgnc/__init__.py +9 -0
pyobo/sources/{hgnc.py → hgnc/hgnc.py} +56 -70
pyobo/sources/{hgncgenefamily.py → hgnc/hgncgenefamily.py} +8 -18
pyobo/sources/icd/__init__.py +9 -0
pyobo/sources/{icd10.py → icd/icd10.py} +35 -37
pyobo/sources/icd/icd11.py +148 -0
pyobo/sources/{icd_utils.py → icd/icd_utils.py} +66 -20
pyobo/sources/interpro.py +4 -9
pyobo/sources/itis.py +0 -5
pyobo/sources/kegg/__init__.py +0 -0
pyobo/sources/kegg/api.py +16 -38
pyobo/sources/kegg/genes.py +9 -20
pyobo/sources/kegg/genome.py +1 -7
pyobo/sources/kegg/pathway.py +9 -21
pyobo/sources/mesh.py +58 -24
pyobo/sources/mgi.py +3 -10
pyobo/sources/mirbase/__init__.py +11 -0
pyobo/sources/{mirbase.py → mirbase/mirbase.py} +8 -11
pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} +0 -0
pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} +4 -8
pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} +3 -7
pyobo/sources/msigdb.py +74 -39
pyobo/sources/ncbi/__init__.py +9 -0
pyobo/sources/ncbi/ncbi_gc.py +162 -0
pyobo/sources/{ncbigene.py → ncbi/ncbigene.py} +18 -19
pyobo/sources/nih_reporter.py +60 -0
pyobo/sources/nlm/__init__.py +9 -0
pyobo/sources/nlm/nlm_catalog.py +48 -0
pyobo/sources/nlm/nlm_publisher.py +36 -0
pyobo/sources/nlm/utils.py +116 -0
pyobo/sources/npass.py +6 -8
pyobo/sources/omim_ps.py +10 -3
pyobo/sources/pathbank.py +4 -8
pyobo/sources/pfam/__init__.py +9 -0
pyobo/sources/{pfam.py → pfam/pfam.py} +3 -8
pyobo/sources/{pfam_clan.py → pfam/pfam_clan.py} +2 -7
pyobo/sources/pharmgkb/__init__.py +15 -0
pyobo/sources/pharmgkb/pharmgkb_chemical.py +89 -0
pyobo/sources/pharmgkb/pharmgkb_disease.py +77 -0
pyobo/sources/pharmgkb/pharmgkb_gene.py +108 -0
pyobo/sources/pharmgkb/pharmgkb_pathway.py +63 -0
pyobo/sources/pharmgkb/pharmgkb_variant.py +84 -0
pyobo/sources/pharmgkb/utils.py +86 -0
pyobo/sources/pid.py +1 -6
pyobo/sources/pombase.py +6 -10
pyobo/sources/pubchem.py +4 -9
pyobo/sources/reactome.py +5 -11
pyobo/sources/rgd.py +11 -16
pyobo/sources/rhea.py +37 -36
pyobo/sources/ror.py +69 -42
pyobo/sources/selventa/__init__.py +0 -0
pyobo/sources/selventa/schem.py +4 -7
pyobo/sources/selventa/scomp.py +1 -6
pyobo/sources/selventa/sdis.py +4 -7
pyobo/sources/selventa/sfam.py +1 -6
pyobo/sources/sgd.py +6 -11
pyobo/sources/signor/__init__.py +7 -0
pyobo/sources/signor/download.py +41 -0
pyobo/sources/signor/signor_complexes.py +105 -0
pyobo/sources/slm.py +12 -15
pyobo/sources/umls/__init__.py +7 -1
pyobo/sources/umls/__main__.py +0 -0
pyobo/sources/umls/get_synonym_types.py +20 -4
pyobo/sources/umls/sty.py +57 -0
pyobo/sources/umls/synonym_types.tsv +1 -1
pyobo/sources/umls/umls.py +18 -22
pyobo/sources/unimod.py +46 -0
pyobo/sources/uniprot/__init__.py +1 -1
pyobo/sources/uniprot/uniprot.py +40 -32
pyobo/sources/uniprot/uniprot_ptm.py +4 -34
pyobo/sources/utils.py +3 -2
pyobo/sources/wikipathways.py +7 -10
pyobo/sources/zfin.py +5 -10
pyobo/ssg/__init__.py +12 -16
pyobo/ssg/base.html +0 -0
pyobo/ssg/index.html +26 -13
pyobo/ssg/term.html +12 -2
pyobo/ssg/typedef.html +0 -0
pyobo/struct/__init__.py +54 -8
pyobo/struct/functional/__init__.py +1 -0
pyobo/struct/functional/dsl.py +2572 -0
pyobo/struct/functional/macros.py +423 -0
pyobo/struct/functional/obo_to_functional.py +385 -0
pyobo/struct/functional/ontology.py +270 -0
pyobo/struct/functional/utils.py +112 -0
pyobo/struct/reference.py +331 -136
pyobo/struct/struct.py +1413 -643
pyobo/struct/struct_utils.py +1078 -0
pyobo/struct/typedef.py +162 -210
pyobo/struct/utils.py +12 -5
pyobo/struct/vocabulary.py +138 -0
pyobo/utils/__init__.py +0 -0
pyobo/utils/cache.py +13 -11
pyobo/utils/io.py +17 -31
pyobo/utils/iter.py +5 -5
pyobo/utils/misc.py +41 -53
pyobo/utils/ndex_utils.py +0 -0
pyobo/utils/path.py +76 -70
pyobo/version.py +3 -3
{pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/METADATA +228 -229
pyobo-0.12.0.dist-info/RECORD +202 -0
pyobo-0.12.0.dist-info/WHEEL +4 -0
{pyobo-0.11.2.dist-info → pyobo-0.12.0.dist-info}/entry_points.txt +1 -0
pyobo-0.12.0.dist-info/licenses/LICENSE +21 -0
pyobo/aws.py +0 -162
pyobo/cli/aws.py +0 -47
pyobo/identifier_utils.py +0 -142
pyobo/normalizer.py +0 -232
pyobo/registries/__init__.py +0 -16
pyobo/registries/metaregistry.json +0 -507
pyobo/registries/metaregistry.py +0 -135
pyobo/sources/icd11.py +0 -105
pyobo/xrefdb/__init__.py +0 -1
pyobo/xrefdb/canonicalizer.py +0 -214
pyobo/xrefdb/priority.py +0 -59
pyobo/xrefdb/sources/__init__.py +0 -60
pyobo/xrefdb/sources/biomappings.py +0 -36
pyobo/xrefdb/sources/cbms2019.py +0 -91
pyobo/xrefdb/sources/chembl.py +0 -83
pyobo/xrefdb/sources/compath.py +0 -82
pyobo/xrefdb/sources/famplex.py +0 -64
pyobo/xrefdb/sources/gilda.py +0 -50
pyobo/xrefdb/sources/intact.py +0 -113
pyobo/xrefdb/sources/ncit.py +0 -133
pyobo/xrefdb/sources/pubchem.py +0 -27
pyobo/xrefdb/sources/wikidata.py +0 -116
pyobo-0.11.2.dist-info/RECORD +0 -157
pyobo-0.11.2.dist-info/WHEEL +0 -5
pyobo-0.11.2.dist-info/top_level.txt +0 -1

pyobo/sources/mesh.py CHANGED Viewed

@@ -4,22 +4,25 @@ import datetime
 import itertools as itt
 import logging
 import re
+import time
 from collections.abc import Collection, Iterable, Mapping
-from typing import Any, Optional
+from pathlib import Path
+from typing import Any
 from xml.etree.ElementTree import Element
+from lxml import etree
 from tqdm.auto import tqdm
 from pyobo.api.utils import safe_get_version
 from pyobo.identifier_utils import standardize_ec
 from pyobo.struct import Obo, Reference, Synonym, Term
 from pyobo.utils.cache import cached_json, cached_mapping
-from pyobo.utils.io import parse_xml_gz
 from pyobo.utils.path import ensure_path, prefix_directory_join
 __all__ = [
     "MeSHGetter",
     "get_mesh_category_curies",
+    "get_mesh_category_references",
 ]
 logger = logging.getLogger(__name__)
@@ -30,12 +33,21 @@ CAS_RE = re.compile(r"^\d{1,7}\-\d{2}\-\d$")
 UNII_RE = re.compile(r"[0-9A-Za-z]{10}$")
+def _get_xml_root(path: Path) -> Element:
+    """Parse an XML file from a path to a GZIP file."""
+    t = time.time()
+    logger.info("parsing xml from %s", path)
+    tree = etree.parse(path.as_posix())  # type:ignore
+    logger.info("parsed xml in %.2f seconds", time.time() - t)
+    return tree.getroot()
 class MeSHGetter(Obo):
     """An ontology representation of the Medical Subject Headings."""
     ontology = bioversions_key = PREFIX
-    def _get_version(self) -> Optional[str]:
+    def _get_version(self) -> str | None:
         return NOW_YEAR
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
@@ -43,11 +55,6 @@ class MeSHGetter(Obo):
         return get_terms(version=self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get MeSH as OBO."""
-    return MeSHGetter(force=force)
 def get_tree_to_mesh_id(version: str) -> Mapping[str, str]:
     """Get a mapping from MeSH tree numbers to their MeSH identifiers."""
@@ -110,12 +117,12 @@ def ensure_mesh_descriptors(
     """Get the parsed MeSH dictionary, and cache it if it wasn't already."""
     @cached_json(path=prefix_directory_join(PREFIX, name="desc.json", version=version), force=force)
-    def _inner():
+    def _inner() -> list[dict[str, Any]]:
         path = ensure_path(PREFIX, url=get_descriptors_url(version), version=version)
-        root = parse_xml_gz(path)
+        root = _get_xml_root(path)
         return get_descriptor_records(root, id_key="DescriptorUI", name_key="DescriptorName/String")
-    return _inner()
+    return _inner()  # type:ignore
 def get_descriptors_url(version: str) -> str:
@@ -136,14 +143,14 @@ def ensure_mesh_supplemental_records(version: str, force: bool = False) -> list[
     """Get the parsed MeSH dictionary, and cache it if it wasn't already."""
     @cached_json(path=prefix_directory_join(PREFIX, name="supp.json", version=version), force=force)
-    def _inner():
+    def _inner() -> list[dict[str, Any]]:
         path = ensure_path(PREFIX, url=get_supplemental_url(version), version=version)
-        root = parse_xml_gz(path)
+        root = _get_xml_root(path)
         return get_descriptor_records(
             root, id_key="SupplementalRecordUI", name_key="SupplementalRecordName/String"
         )
-    return _inner()
+    return _inner()  # type:ignore
 def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict[str, Any]]:
@@ -169,7 +176,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
         parents_descriptor_uis = set()
         for tree_number in descriptor["tree_numbers"]:
             try:
-                parent_tn, self_tn = tree_number.rsplit(".", 1)
+                parent_tn, _self_tn = tree_number.rsplit(".", 1)
             except ValueError:
                 logger.debug("No dot for %s", tree_number)
                 continue
@@ -185,7 +192,7 @@ def get_descriptor_records(element: Element, id_key: str, name_key) -> list[dict
     return rv
-def get_scope_note(descriptor_record) -> Optional[str]:
+def get_scope_note(descriptor_record) -> str | None:
     """Get the scope note from the preferred concept in a term's record."""
     if isinstance(descriptor_record, dict):
         # necessary for pre-2023 data
@@ -207,9 +214,10 @@ def get_descriptor_record(
     """Get descriptor records from the main element.
     :param element: An XML element
-    :param id_key: For descriptors, set to 'DescriptorUI'. For supplement, set to 'SupplementalRecordUI'
-    :param name_key: For descriptors, set to 'DescriptorName/String'.
-     For supplement, set to 'SupplementalRecordName/String'
+    :param id_key: For descriptors, set to 'DescriptorUI'. For supplement, set to
+        'SupplementalRecordUI'
+    :param name_key: For descriptors, set to 'DescriptorName/String'. For supplement,
+        set to 'SupplementalRecordName/String'
     """
     concepts = get_concept_records(element)
     scope_note = get_scope_note(concepts)
@@ -248,7 +256,7 @@ def _get_xrefs(element: Element) -> list[tuple[str, str]]:
         elif registry_number.startswith("txid"):
             rv.append(("NCBITaxon", registry_number[4:]))
         elif registry_number.startswith("EC "):
-            rv.append(("eccode", standardize_ec(registry_number[3:])))
+            rv.append(("ec", standardize_ec(registry_number[3:])))
         elif CAS_RE.fullmatch(registry_number):
             rv.append(("cas", registry_number))
         elif UNII_RE.fullmatch(registry_number):
@@ -319,16 +327,40 @@ def _get_descriptor_qualifiers(descriptor: Element) -> list[Mapping[str, str]]:
 def get_mesh_category_curies(
-    letter: str, *, skip: Optional[Collection[str]] = None, version: Optional[str] = None
+    letter: str, *, skip: Collection[str] | None = None, version: str | None = None
 ) -> list[str]:
     """Get the MeSH LUIDs for a category, by letter (e.g., "A").
     :param letter: The MeSH tree, A for anatomy, C for disease, etc.
     :param skip: An optional collection of MeSH tree codes to skip, such as "A03"
     :param version: The MeSH version to use. Defaults to latest
     :returns: A list of MeSH CURIE strings for the top level of each MeSH tree.
-    .. seealso:: https://meshb.nlm.nih.gov/treeView
+    .. seealso::
+        https://meshb.nlm.nih.gov/treeView
+    """
+    return [
+        reference.curie
+        for reference in get_mesh_category_references(letter=letter, skip=skip, version=version)
+    ]
+def get_mesh_category_references(
+    letter: str, *, skip: Collection[str] | None = None, version: str | None = None
+) -> list[Reference]:
+    """Get the MeSH references for a category, by letter (e.g., "A").
+    :param letter: The MeSH tree, A for anatomy, C for disease, etc.
+    :param skip: An optional collection of MeSH tree codes to skip, such as "A03"
+    :param version: The MeSH version to use. Defaults to latest
+    :returns: A list of MeSH references for the top level of each MeSH tree.
+    .. seealso::
+        https://meshb.nlm.nih.gov/treeView
     """
     if version is None:
         version = safe_get_version("mesh")
@@ -340,10 +372,12 @@ def get_mesh_category_curies(
             continue
         mesh_id = tree_to_mesh.get(key)
         if mesh_id is None:
+            # as soon as we get to a missing ID, we don't
+            # have to go any further
             break
-        rv.append(f"mesh:{mesh_id}")
+        rv.append(Reference(prefix="mesh", identifier=mesh_id))
     return rv
 if __name__ == "__main__":
-    get_obo(force=True).write_default(force=True, write_obo=True)
+    MeSHGetter.cli()

pyobo/sources/mgi.py CHANGED Viewed

@@ -12,7 +12,6 @@ from pyobo.struct.typedef import exact_match
 from ..struct import (
     Obo,
     Reference,
-    Synonym,
     Term,
     from_species,
     has_gene_product,
@@ -35,8 +34,7 @@ ENSEMBL_XREFS_URL = "http://www.informatics.jax.org/downloads/reports/MRK_ENSEMB
 class MGIGetter(Obo):
     """An ontology representation of MGI's mouse gene nomenclature."""
-    ontology = PREFIX
-    dynamic_version = True
+    ontology = bioversions_key = PREFIX
     typedefs = [from_species, has_gene_product, transcribes_to, exact_match]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
@@ -44,11 +42,6 @@ class MGIGetter(Obo):
         return get_terms(force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get MGI as OBO."""
-    return MGIGetter(force=force)
 COLUMNS = ["MGI Accession ID", "Marker Symbol", "Marker Name"]
@@ -159,7 +152,7 @@ def get_terms(force: bool = False) -> Iterable[Term]:
         )
         if identifier in mgi_to_synonyms:
             for synonym in mgi_to_synonyms[identifier]:
-                term.append_synonym(Synonym(name=synonym))
+                term.append_synonym(synonym)
         if identifier in mgi_to_entrez_id:
             term.append_exact_match(
                 Reference(prefix="ncbigene", identifier=mgi_to_entrez_id[identifier])
@@ -179,4 +172,4 @@ def get_terms(force: bool = False) -> Iterable[Term]:
 if __name__ == "__main__":
-    get_obo(force=True).write_default(write_obo=True, write_obograph=True, use_tqdm=True)
+    MGIGetter.cli()

pyobo/sources/mirbase/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""Resources from miRBase."""
+from .mirbase import MiRBaseGetter
+from .mirbase_family import MiRBaseFamilyGetter
+from .mirbase_mature import MiRBaseMatureGetter
+__all__ = [
+    "MiRBaseFamilyGetter",
+    "MiRBaseGetter",
+    "MiRBaseMatureGetter",
+]

pyobo/sources/{mirbase.py → mirbase/mirbase.py} RENAMED Viewed

@@ -6,12 +6,13 @@ from collections.abc import Iterable, Mapping
 from tqdm.auto import tqdm
-from pyobo.sources.mirbase_constants import BASE_URL, _assert_frozen_version
 from pyobo.struct import Obo, Reference, Synonym, Term, from_species
 from pyobo.struct.typedef import has_mature
 from pyobo.utils.cache import cached_mapping
 from pyobo.utils.path import ensure_df, ensure_path, prefix_directory_join
+from .mirbase_constants import BASE_URL, _assert_frozen_version
 __all__ = [
     "MiRBaseGetter",
 ]
@@ -41,11 +42,6 @@ class MiRBaseGetter(Obo):
         return get_terms(version=self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get miRBase as OBO."""
-    return MiRBaseGetter(force=force)
 def get_terms(version: str, force: bool = False) -> list[Term]:
     """Parse miRNA data from filepath and convert it to dictionary."""
     _assert_frozen_version(version)
@@ -54,7 +50,7 @@ def get_terms(version: str, force: bool = False) -> list[Term]:
     file_handle = (
         gzip.open(definitions_path, "rt")
-        if definitions_path.endswith(".gz")
+        if definitions_path.suffix.endswith(".gz")
         else open(definitions_path)
     )
     with file_handle as file:
@@ -101,7 +97,7 @@ def _process_definitions_lines(
     for group in tqdm(groups, desc=f"mapping {PREFIX}"):
         name = group[0][5:23].strip()
-        qualifier, dtype, species_code, length = map(
+        _qualifier, _dtype, species_code, _length = map(
             str.strip, group[0][23:].strip().rstrip(".").split(";")
         )
         identifier = group[2][3:-2].strip()
@@ -134,7 +130,7 @@ def _process_definitions_lines(
             xref_prefix, xref_identifier, xref_label = map(str.strip, line.split(";"))
             xref_prefix = xref_prefix.lower()
             xref_prefix = xref_mapping.get(xref_prefix, xref_prefix)
-            if xref_prefix == "pictar":
+            if xref_prefix in {"pictar", "mir", "mirte"}:
                 continue
             try:
@@ -157,7 +153,8 @@ def _process_definitions_lines(
         species_identifier, species_name = organisms[species_code]
         term.set_species(species_identifier, species_name)
-        term.extend_relationship(has_mature, matures)
+        for mature in matures:
+            term.append_relationship(has_mature, mature)
         yield term
@@ -199,4 +196,4 @@ def get_mature_id_to_name(version: str) -> Mapping[str, str]:
 if __name__ == "__main__":
-    get_obo(force=True).write_default(force=True, write_obograph=True, write_obo=True)
+    MiRBaseGetter.cli()

pyobo/sources/{mirbase_constants.py → mirbase/mirbase_constants.py} RENAMED Viewed

File without changes

pyobo/sources/{mirbase_family.py → mirbase/mirbase_family.py} RENAMED Viewed

@@ -5,12 +5,13 @@ from collections.abc import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
-from pyobo.sources.mirbase_constants import (
+from pyobo.struct import Obo, Reference, Term, has_member
+from .mirbase_constants import (
     get_premature_df,
     get_premature_family_df,
     get_premature_to_prefamily_df,
 )
-from pyobo.struct import Obo, Reference, Term, has_member
 __all__ = [
     "MiRBaseFamilyGetter",
@@ -31,11 +32,6 @@ class MiRBaseFamilyGetter(Obo):
         return iter_terms(version=self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get miRBase family as OBO."""
-    return MiRBaseFamilyGetter(force=force)
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Get miRBase family terms."""
     df = get_df(version, force=force)
@@ -66,4 +62,4 @@ def get_df(version: str, force: bool = False) -> pd.DataFrame:
 if __name__ == "__main__":
-    get_obo().write_default(use_tqdm=True, write_obo=True, force=True)
+    MiRBaseFamilyGetter.cli()

pyobo/sources/{mirbase_mature.py → mirbase/mirbase_mature.py} RENAMED Viewed

@@ -5,9 +5,10 @@ from collections.abc import Iterable
 import pandas as pd
 from tqdm.auto import tqdm
-from pyobo.sources.mirbase_constants import get_mature_df
 from pyobo.struct import Obo, Reference, Synonym, Term
+from .mirbase_constants import get_mature_df
 __all__ = [
     "MiRBaseMatureGetter",
 ]
@@ -26,11 +27,6 @@ class MiRBaseMatureGetter(Obo):
         return iter_terms(version=self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get miRBase mature as OBO."""
-    return MiRBaseMatureGetter(force=force)
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Get miRBase mature terms."""
     df = get_mature_df(version, force=force)
@@ -49,4 +45,4 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
 if __name__ == "__main__":
-    get_obo().write_default(write_obo=True, write_obograph=True, use_tqdm=True)
+    MiRBaseMatureGetter.cli()

pyobo/sources/msigdb.py CHANGED Viewed

@@ -1,41 +1,55 @@
 """Parsers for MSig."""
 import logging
+import zipfile
 from collections.abc import Iterable
-from typing import Optional
-from lxml.etree import ElementTree
+from lxml import etree
+from pydantic import ValidationError
 from tqdm.auto import tqdm
-from ..struct import Obo, Reference, Term, has_participant
-from ..utils.path import ensure_path
-logger = logging.getLogger(__name__)
+from pyobo.struct import Obo, Reference, Term, TypeDef, has_citation, has_participant
+from pyobo.utils.path import ensure_path
 __all__ = [
     "MSigDBGetter",
 ]
+logger = logging.getLogger(__name__)
 PREFIX = "msigdb"
 BASE_URL = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release"
+CATEGORY_CODE = TypeDef.default(PREFIX, "category_code", name="category code", is_metadata_tag=True)
+SUB_CATEGORY_CODE = TypeDef.default(
+    PREFIX, "sub_category_code", name="sub-category code", is_metadata_tag=True
+)
+CONTRIBUTOR = TypeDef.default(PREFIX, "contributor", name="contributor", is_metadata_tag=True)
+EXACT_SOURCE = TypeDef.default(PREFIX, "exact_source", name="exact source", is_metadata_tag=True)
+EXTERNAL_DETAILS_URL = TypeDef.default(
+    PREFIX, "external_details_url", name="external details URL", is_metadata_tag=True
+)
+PROPERTIES = [
+    ("CATEGORY_CODE", CATEGORY_CODE),
+    ("SUB_CATEGORY_CODE", SUB_CATEGORY_CODE),
+    ("CONTRIBUTOR", CONTRIBUTOR),
+    ("EXACT_SOURCE", EXACT_SOURCE),
+    ("EXTERNAL_DETAILS_URL", EXTERNAL_DETAILS_URL),
+]
 class MSigDBGetter(Obo):
     """An ontology representation of MMSigDB's gene set nomenclature."""
     ontology = bioversions_key = PREFIX
-    typedefs = [has_participant]
+    typedefs = [has_participant, has_citation, *(p for _, p in PROPERTIES)]
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
         """Iterate over terms in the ontology."""
         return iter_terms(version=self._version_or_raise, force=force)
-def get_obo(force: bool = False) -> Obo:
-    """Get MSIG as Obo."""
-    return MSigDBGetter(force=force)
 _SPECIES = {
     "Homo sapiens": "9606",
     "Mus musculus": "10090",
@@ -49,24 +63,36 @@ GO_URL_PREFIX = "http://amigo.geneontology.org/amigo/term/GO:"
 KEGG_URL_PREFIX = "http://www.genome.jp/kegg/pathway/hsa/"
-def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
-    """Get MSigDb terms."""
-    xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml"
+def _iter_entries(version: str, force: bool = False):
+    xml_url = f"{BASE_URL}/{version}.Hs/msigdb_v{version}.Hs.xml.zip"
     path = ensure_path(prefix=PREFIX, url=xml_url, version=version, force=force)
-    tree = ElementTree.parse(path)
+    with zipfile.ZipFile(path, "r") as zf:
+        with zf.open(f"msigdb_v{version}.Hs.xml") as file:
+            for _ in range(3):
+                next(file)
+            # from here on out, every row except the last is a GENESET
+            for i, line_bytes in enumerate(file, start=4):
+                line = line_bytes.decode("utf8").strip()
+                if not line.startswith("<GENESET"):
+                    continue
+                try:
+                    tree = etree.fromstring(line)
+                except etree.XMLSyntaxError as e:
+                    # this is the result of faulty encoding in XML - maybe they
+                    # wrote XML with their own string formatting instead of using a
+                    # library.
+                    logger.debug("[%s] failed on line %s: %s", PREFIX, i, e)
+                else:
+                    yield tree
-    for entry in tqdm(tree.getroot(), desc=f"{PREFIX} v{version}", unit_scale=True):
+def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
+    """Get MSigDb terms."""
+    entries = _iter_entries(version=version, force=force)
+    for entry in tqdm(entries, desc=f"{PREFIX} v{version}", unit_scale=True):
         attrib = dict(entry.attrib)
         tax_id = _SPECIES[attrib["ORGANISM"]]
-        reference_id = attrib["PMID"].strip()
-        if not reference_id:
-            reference = None
-        elif reference_id.startswith("GSE"):
-            reference = Reference(prefix="gse", identifier=reference_id)
-        else:
-            reference = Reference(prefix="pubmed", identifier=reference_id)
         # NONE have the entry "HISTORICAL_NAME"
         # historical_name = thing.attrib['HISTORICAL_NAME']
@@ -77,19 +103,20 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         term = Term(
             reference=Reference(prefix=PREFIX, identifier=identifier, name=name),
             definition=_get_definition(attrib),
-            provenance=[] if reference is None else [reference],
             is_obsolete=is_obsolete,
         )
-        for key in [
-            "CATEGORY_CODE",
-            "SUB_CATEGORY_CODE",
-            "CONTRIBUTOR",
-            "EXACT_SOURCE",
-            "EXTERNAL_DETAILS_URL",
-        ]:
-            value = attrib[key].strip()
-            if value:
-                term.append_property(key.lower(), value)
+        reference_id = attrib["PMID"].strip()
+        if not reference_id:
+            pass
+        elif reference_id.startswith("GSE"):
+            term.append_see_also(Reference(prefix="gse", identifier=reference_id))
+        else:
+            term.append_provenance(Reference(prefix="pubmed", identifier=reference_id))
+        for key, typedef in PROPERTIES:
+            if value := attrib[key].strip():
+                term.annotate_string(typedef, value)
         term.set_species(tax_id)
@@ -123,17 +150,25 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
                 logger.warning(
                     "missing %s source: msigdb:%s (%s)", contributor, identifier, external_details
                 )
-            term.append_xref(Reference(prefix="kegg.pathway", identifier=external_id))
+            try:
+                kegg_reference = Reference(prefix="kegg.pathway", identifier=external_id)
+            except ValidationError:
+                # TODO handle kegg.network which starts with N, like N01146
+                if not external_id.startswith("N"):
+                    tqdm.write(f"could not validate kegg.pathway:{external_id}")
+            else:
+                term.append_xref(kegg_reference)
         for ncbigene_id in attrib["MEMBERS_EZID"].strip().split(","):
             if ncbigene_id:
-                term.append_relationship(
+                term.annotate_object(
                     has_participant, Reference(prefix="ncbigene", identifier=ncbigene_id)
                 )
         yield term
-def _get_definition(attrib) -> Optional[str]:
+def _get_definition(attrib) -> str | None:
     rv = attrib["DESCRIPTION_FULL"].strip() or attrib["DESCRIPTION_BRIEF"].strip() or None
     if rv is not None:
         return rv.replace(r"\d", "").replace(r"\s", "")

pyobo/sources/ncbi/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Resources from NCBI."""
+from .ncbi_gc import NCBIGCGetter
+from .ncbigene import NCBIGeneGetter
+__all__ = [
+    "NCBIGCGetter",
+    "NCBIGeneGetter",
+]

pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl

pyobo 0.11.2__py3-none-any.whl → 0.12.0__py3-none-any.whl