PyPI - pymetadata - Versions diffs - 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

pymetadata 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pymetadata might be problematic. Click here for more details.

Files changed (10)

pymetadata/__init__.py +1 -1
pymetadata/chebi.py +21 -29
pymetadata/core/annotation.py +114 -53
pymetadata/identifiers/registry.py +0 -202
pymetadata/metadata/eco.py +882 -301
{pymetadata-0.5.3.dist-info → pymetadata-0.5.5.dist-info}/METADATA +7 -8
{pymetadata-0.5.3.dist-info → pymetadata-0.5.5.dist-info}/RECORD +9 -10
pymetadata/resources/chebi_webservice_wsdl.xml +0 -509
{pymetadata-0.5.3.dist-info → pymetadata-0.5.5.dist-info}/WHEEL +0 -0
{pymetadata-0.5.3.dist-info → pymetadata-0.5.5.dist-info}/licenses/LICENSE +0 -0

pymetadata/__init__.py CHANGED Viewed

@@ -3,7 +3,7 @@
 from pathlib import Path
 __author__ = "Matthias Koenig"
-__version__ = "0.5.3"
+__version__ = "0.5.5"
 program_name: str = "pymetadata"

pymetadata/chebi.py CHANGED Viewed

@@ -1,28 +1,19 @@
 """Module for working with chebi."""
 from pathlib import Path
-from pprint import pprint
 from typing import Any, Dict, Optional
-from zeep import Client
+import requests
 import pymetadata
 from pymetadata import log
 from pymetadata.cache import DataclassJSONEncoder, read_json_cache, write_json_cache
+from pymetadata.console import console
 logger = log.get_logger(__name__)
-# FIXME: copy the file to the cache dir
-client = Client(str(pymetadata.RESOURCES_DIR / "chebi_webservice_wsdl.xml"))
 class ChebiQuery:
-    """Class to query information from ChEBI.
-    An overview over available methods:
-        python -mzeep https://www.ebi.ac.uk/webservices/chebi/2.0/webservice?wsdl
-    """
+    """Class to query information from ChEBI."""
     @staticmethod
     def query(
@@ -52,27 +43,28 @@ class ChebiQuery:
         # fetch and cache data
         if not data:
-            try:
-                result = client.service.getCompleteEntity(chebi)
-                # print(result)
-            except Exception:
+            response = requests.get(
+                url=f"https://www.ebi.ac.uk/chebi/backend/api/public/compounds/?chebi_ids={chebi}"
+            )
+            if response.status_code == 200:
+                result = response.json()
+            else:
                 logger.error(f"CHEBI information could not be retrieved for: {chebi}")
                 return dict()
-            # parse formula
-            formula = None
-            formulae = result["Formulae"]
-            if formulae:
-                formula = formulae[0]["data"]
+            result = result[chebi]["data"]
+            chemical_data = result["chemical_data"]
+            default_structure = result["default_structure"]
             data = {
                 "chebi": chebi,
-                "name": result["chebiAsciiName"],
+                "name": result["ascii_name"],
                 "definition": result["definition"],
-                "formula": formula,
-                "charge": result["charge"],
-                "mass": result["mass"],
-                "inchikey": result["inchiKey"],
+                "formula": chemical_data["formula"] if chemical_data else None,
+                "charge": chemical_data["charge"] if chemical_data else None,
+                "mass": chemical_data["mass"] if chemical_data else None,
+                "inchikey": default_structure["standard_inchi_key"]
+                if default_structure
+                else None,
             }
             logger.info(f"Write chebi: {chebi_path}")
@@ -86,7 +78,7 @@ class ChebiQuery:
 if __name__ == "__main__":
     chebis = ["CHEBI:2668", "CHEBI:138366", "CHEBI:9637", "CHEBI:155897"]
     for chebi in chebis:
-        print(chebi)
+        console.rule(chebi, align="left", style="bold white")
         d = ChebiQuery.query(chebi=chebi, cache=False)
-        pprint(d)
+        console.print(d)
         d = ChebiQuery.query(chebi=chebi, cache=True)

pymetadata/core/annotation.py CHANGED Viewed

@@ -5,12 +5,14 @@ Core data structure to store annotations.
 import re
 import urllib
+from enum import Enum
 from pprint import pprint
 from typing import Any, Dict, Final, List, Optional, Tuple, Union
 import requests
 from pymetadata import log
+from pymetadata.console import console
 from pymetadata.core.xref import CrossReference, is_url
 from pymetadata.identifiers.miriam import BQB, BQM
 from pymetadata.identifiers.registry import REGISTRY
@@ -19,19 +21,31 @@ from pymetadata.ontologies.ols import ONTOLOGIES, OLSQuery
 OLS_QUERY = OLSQuery(ontologies=ONTOLOGIES)
-IDENTIFIERS_ORG_PREFIX: Final = "http://identifiers.org"
+IDENTIFIERS_ORG_PREFIX: Final = "https://identifiers.org"
 IDENTIFIERS_ORG_PATTERN1: Final = re.compile(r"^https?://identifiers.org/(.+?)/(.+)")
 IDENTIFIERS_ORG_PATTERN2: Final = re.compile(r"^https?://identifiers.org/(.+)")
+BIOREGISTRY_PREFIX: Final = "https://bioregistry.io"
+BIOREGISTRY_PATTERN: Final = re.compile(r"^https?://bioregistry.io/(.+)")
 MIRIAM_URN_PATTERN: Final = re.compile(r"^urn:miriam:(.+)")
 logger = log.get_logger(__name__)
+class ProviderType(str, Enum):
+    """Provider type."""
+    IDENTIFIERS_ORG = "identifiers.org"
+    BIOREGISTRY_IO = "bioregistry.io"
+    NONE = "none"
 class RDFAnnotation:
     """RDFAnnotation class.
     Basic storage of annotation information. This consists of the relation
-    and the the resource.
+    and the resource.
     The annotations can be attached to other objects thereby forming
     triples which can be converted to RDF.
@@ -40,6 +54,7 @@ class RDFAnnotation:
         - `collection/term`, i.e., the combination of collection and term
         - `http(s)://arbitrary.url`, an arbitrary URL
         - urn:miriam:uniprot:P03023
+        - https://bioregistry.io/chebi:15996 urls via the bioregistry provider
     """
     replaced_collections: Dict[str, str] = {
@@ -53,6 +68,7 @@ class RDFAnnotation:
         self.collection: Optional[str] = None
         self.term: Optional[str] = None
         self.resource: str = resource
+        self.provider: ProviderType = ProviderType.IDENTIFIERS_ORG
         if not qualifier:
             raise ValueError(
@@ -75,15 +91,19 @@ class RDFAnnotation:
             if match1:
                 # handle identifiers.org pattern
                 self.collection, self.term = match1.group(1), match1.group(2)
+                self.provider = ProviderType.IDENTIFIERS_ORG
             if not self.collection:
-                # tests new short pattern
+                # tests new compact patterns
                 match2 = IDENTIFIERS_ORG_PATTERN2.match(resource)
                 if match2:
                     tokens = match2.group(1).split(":")
                     if len(tokens) == 2:
                         self.collection = tokens[0].lower()
+                        # check if the namespace is embedded
                         self.term = match2.group(1)
+                        self.provider = ProviderType.IDENTIFIERS_ORG
                     else:
                         logger.warning(
                             f"Identifiers.org URL does not conform to new"
@@ -94,16 +114,25 @@ class RDFAnnotation:
                 # other urls are directly stored as resources without collection
                 self.collection = None
                 self.term = resource
-                logger.debug(
-                    f"{resource} does not conform to "
-                    f"http(s)://identifiers.org/collection/id or http(s)://identifiers.org/id",
-                )
+                if BIOREGISTRY_PATTERN.match(resource):
+                    self.provider = ProviderType.BIOREGISTRY_IO
+                    console.print(self.provider)
+                else:
+                    self.provider = ProviderType.NONE
+                    logger.warning(
+                        f"{resource} does not conform to "
+                        f"http(s)://identifiers.org/collection/id or http(s)://identifiers.org/id or "
+                        f"https://bioregistry.io/id .",
+                    )
+        # handle urns
         elif resource.startswith("urn:miriam:"):
             match3 = MIRIAM_URN_PATTERN.match(resource)
             if match3:
                 tokens = match3.group(1).split(":")
                 self.collection = tokens[0]
                 self.term = ":".join(tokens[1:]).replace("%3A", ":")
+                self.provider = ProviderType.IDENTIFIERS_ORG
                 logger.warning(
                     f"Deprecated urn pattern `{resource}` updated: "
@@ -113,13 +142,16 @@ class RDFAnnotation:
         else:
             # handle short notation
             tokens = resource.split("/")
-            if len(tokens) == 2:
+            if len(tokens) > 1:
                 self.collection = tokens[0]
                 self.term = "/".join(tokens[1:])
+                self.provider = ProviderType.IDENTIFIERS_ORG
             elif len(tokens) == 1 and ":" in tokens[0]:
                 self.collection = tokens[0].split(":")[0].lower()
                 self.term = tokens[0]
+                self.provider = ProviderType.IDENTIFIERS_ORG
+            # validation
             if len(tokens) < 2 and not self.collection:
                 logger.error(
                     f"Resource `{resource}` could not be split in collection and term. "
@@ -129,6 +161,13 @@ class RDFAnnotation:
                 )
                 self.collection = None
                 self.term = resource
+                self.provider = ProviderType.NONE
+        # shorten compact terms
+        if self.term and self.collection:
+            self.term = self.shorten_compact_term(
+                term=self.term, collection=self.collection
+            )
         # clean legacy collections
         if self.collection in self.replaced_collections:
@@ -136,6 +175,21 @@ class RDFAnnotation:
         self.validate()
+    @staticmethod
+    def shorten_compact_term(term: str, collection: str) -> str:
+        """Shorten the compact terms and return term.
+        If the namespace is not embeddd in the term return the shortened term.
+        """
+        namespace = REGISTRY.ns_dict.get(collection, None)
+        if namespace and not namespace.namespaceEmbeddedInLui:
+            # shorter term
+            if term.lower().startswith(collection):
+                tokens = term.split(":")
+                term = ":".join(tokens[1:])
+        return term
     @staticmethod
     def from_tuple(t: Tuple[Union[BQB, BQM], str]) -> "RDFAnnotation":
         """Construct from tuple."""
@@ -161,12 +215,12 @@ class RDFAnnotation:
     def __repr__(self) -> str:
         """Get representation string."""
-        return f"RDFAnnotation({self.qualifier}|{self.collection}|{self.term})"
+        return f"RDFAnnotation({self.qualifier}|{self.collection}|{self.term}|{self.provider.value})"
     def to_dict(self) -> Dict:
         """Convert to dict."""
         return {
-            "qualifier": self.qualifier.value,  # FIXME use enums!
+            "qualifier": self.qualifier.value,
             "collection": self.collection,
             "term": self.term,
         }
@@ -343,52 +397,59 @@ class RDFAnnotationData(RDFAnnotation):
 if __name__ == "__main__":
     for annotation in [
-        # FIXME: support this
         RDFAnnotation(
             qualifier=BQB.IS_VERSION_OF,
-            resource="NCIT:C75913",
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF,
-            resource="taxonomy/562",
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF,
-            resource="http://identifiers.org/taxonomy/9606",
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF,
-            resource="http://identifiers.org/biomodels.sbo/SBO:0000247",
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:obo.go:GO%3A0005623"
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:chebi:CHEBI%3A33699"
-        ),
-        RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:456215"),
-        RDFAnnotation(
-            qualifier=BQB.IS, resource="https://en.wikipedia.org/wiki/Cytosol"
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:uniprot:P03023"
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF,
-            resource="http://identifiers.org/go/GO:0005829",
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/go/GO:0005829"
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
-        ),
-        RDFAnnotation(
-            qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
+            resource="https://bioregistry.io/chebi:15996",
         ),
-        RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="bto/BTO:0000089"),
-        RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="BTO:0000089"),
-        RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:000012"),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF,
+        #     resource="NCIT:C75913",
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF,
+        #     resource="ncit:C75913",
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF,
+        #     resource="taxonomy/562",
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF,
+        #     resource="http://identifiers.org/taxonomy/9606",
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF,
+        #     resource="http://identifiers.org/biomodels.sbo/SBO:0000247",
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:obo.go:GO%3A0005623"
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:chebi:CHEBI%3A33699"
+        # ),
+        # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:456215"),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS, resource="https://en.wikipedia.org/wiki/Cytosol"
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF, resource="urn:miriam:uniprot:P03023"
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF,
+        #     resource="http://identifiers.org/go/GO:0005829",
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/go/GO:0005829"
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
+        # ),
+        # RDFAnnotation(
+        #     qualifier=BQB.IS_VERSION_OF, resource="http://identifiers.org/GO:0005829"
+        # ),
+        # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="bto/BTO:0000089"),
+        # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="BTO:0000089"),
+        # RDFAnnotation(qualifier=BQB.IS_VERSION_OF, resource="chebi/CHEBI:000012"),
     ]:
         print("-" * 80)
         data = RDFAnnotationData(annotation)

pymetadata/identifiers/registry.py CHANGED Viewed

@@ -88,195 +88,6 @@ class Namespace:
             self.resources = list()
-def ols_namespaces() -> Dict[str, Namespace]:
-    """Define Ontologies available from OLS but not in identifiers.org."""
-    ols_info: Dict = {
-        "deprecated": False,
-        "deprecationDate": None,
-        "institution": {
-            "description": "At EMBL-EBI, we make the "
-            "world’s public biological data "
-            "freely available to the "
-            "scientific community via a "
-            "range of services and tools, "
-            "perform basic research and "
-            "provide professional training "
-            "in bioinformatics. \n"
-            "We are part of the European "
-            "Molecular Biology Laboratory "
-            "(EMBL), an international, "
-            "innovative and "
-            "interdisciplinary research "
-            "organisation funded by 26 "
-            "member states and two "
-            "associate member states.",
-            "homeUrl": "https://www.ebi.ac.uk",
-            "id": 2,
-            "location": {"countryCode": "GB", "countryName": "United Kingdom"},
-            "name": "European Bioinformatics Institute",
-            "rorId": "https://ror.org/02catss52",
-        },
-        "location": {"countryCode": "GB", "countryName": "United Kingdom"},
-        "official": False,
-        "providerCode": "ols",
-    }
-    # Custom namespaces for OLS ontology, for simple support
-    namespaces = [
-        Namespace(
-            id=None,
-            prefix="omim",
-            pattern=r"^MI:\d+$",
-            name="OMIM",
-            description="Molecular Interactions Controlled Vocabulary",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="dron",
-            pattern=r"^DRON:\d+$",
-            name="DRON",
-            description="The drug ontology",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="cmo",
-            pattern=r"^CMO:\d+$",
-            name="Chemical methods ontology",
-            description="Morphological and physiological measurement records "
-            "generated from clinical and model organism research and health programs.",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="chmo",
-            pattern=r"^CHMO:\d+$",
-            name="Chemical methods ontology",
-            description="CHMO, the chemical methods ontology",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="vto",
-            pattern=r"^VTO:\d+$",
-            name="Vertebrate Taxonomy Ontology",
-            description="VTO Vertebrate Taxonomy Ontology",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="opmi",
-            pattern=r"^OPMI:\d+$",
-            name="Ontology of Precision Medicine and Investigation",
-            description="OPMI: Ontology of Precision Medicine and Investigation",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="atol",
-            pattern=r"^ATOL:\d+$",
-            name="ATOL",
-            description="Animal Trait Ontology for Livestock",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="nbo",
-            pattern=r"^NBO:\d+$",
-            name="NBO",
-            description="Neuro Behavior Ontology",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="scdo",
-            pattern=r"^SCDO:\d+$",
-            name="Sickle Cell Disease Ontology",
-            description="Sickle Cell Disease Ontology",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="fix",
-            pattern=r"^FIX:\d+$",
-            name="Physico-chemical methods and properties Ontology",
-            description="Physico-chemical methods and properties Ontology",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="oba",
-            pattern=r"^OBA:\d+$",
-            name="Ontology of Biological Attributes",
-            description="PubChem is an open chemistry database at the National "
-            "Institutes of Health (NIH).",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="mmo",
-            pattern=r"^MMO:\d+$",
-            name="Measurement method ontology",
-            description="Measurement method ontology",
-            namespaceEmbeddedInLui=True,
-        ),
-        Namespace(
-            id=None,
-            prefix="symp",
-            pattern=r"^SYMP:\d+$",
-            name="Symptom ontology",
-            description="The Symptom Ontology has been developed as a standardized ontology for symptoms of human diseases.",
-            namespaceEmbeddedInLui=True,
-        ),
-    ]
-    for ns in namespaces:
-        if not ns.resources:
-            ns.resources = []
-        if not ns.prefix:
-            continue
-        ns.resources.append(
-            Resource(
-                id=None,
-                name=f"{ns.prefix} through OLS",
-                description=f"{ns.prefix} through OLS",
-                mirId=None,
-                sampleId=None,
-                resourceHomeUrl=None,
-                urlPattern=f"https://www.ebi.ac.uk/ols4/ontologies/{ns.prefix}/terms?obo_id={ns.prefix.upper()}"
-                + ":{$id}",
-                **ols_info,
-            )
-        )
-    return {ns.prefix: ns for ns in namespaces}  # type: ignore
-def misc_namespaces() -> Dict[str, Namespace]:
-    """Define misc namespaces."""
-    namespaces = [
-        Namespace(
-            id="brenda.ligand",
-            pattern=r"^\d+$",
-            name="BRENDA Ligand",
-            prefix=None,
-            description="BRENDA Ligand Information",
-            namespaceEmbeddedInLui=False,
-        ),
-        Namespace(
-            id="metabolights.compound",
-            pattern=r"^MTBLC\d+$",
-            name="Metabolights compound",
-            prefix=None,
-            description="metabolights compound",
-            namespaceEmbeddedInLui=False,
-        ),
-    ]
-    return {ns.id: ns for ns in namespaces}  # type: ignore
 class Registry:
     """Managing the available annotation information.
@@ -284,10 +95,6 @@ class Registry:
     """
     URL = "https://registry.api.identifiers.org/resolutionApi/getResolverDataset"
-    CUSTOM_NAMESPACES = {
-        **ols_namespaces(),
-        **misc_namespaces(),
-    }
     def __init__(
         self,
@@ -325,7 +132,6 @@ class Registry:
     @staticmethod
     def update_registry(
-        custom_namespaces: Dict[str, Namespace] = CUSTOM_NAMESPACES,
         registry_path: Optional[Path] = None,
     ) -> Dict[str, Namespace]:
         """Update registry from identifiers.org webservice."""
@@ -338,14 +144,6 @@ class Registry:
             ns = Namespace.from_dict(data)
             ns_dict[ns.prefix] = ns
-        if custom_namespaces is not None:
-            for key, ns in custom_namespaces.items():
-                if key in ns_dict:
-                    logger.error(
-                        f"Namespace with key '{key}' exists in MIRIAM. Overwrite namespace!"
-                    )
-                ns_dict[key] = ns
         if registry_path is not None:
             write_json_cache(
                 data=ns_dict,

pymetadata 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

Potentially problematic release.

pymetadata 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl