PyPI - rcsb.exdb - Versions diffs - 1.31__py3-none-any.whl - Mend

rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98) hide show

rcsb/__init__.py +1 -0
rcsb/exdb/__init__.py +1 -0
rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
rcsb/exdb/branch/GlycanProvider.py +116 -0
rcsb/exdb/branch/GlycanUtils.py +114 -0
rcsb/exdb/branch/__init__.py +0 -0
rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
rcsb/exdb/chemref/__init__.py +0 -0
rcsb/exdb/citation/CitationAdapter.py +91 -0
rcsb/exdb/citation/CitationExtractor.py +190 -0
rcsb/exdb/citation/CitationUtils.py +51 -0
rcsb/exdb/citation/__init__.py +0 -0
rcsb/exdb/cli/__init__.py +0 -0
rcsb/exdb/entry/EntryInfoProvider.py +148 -0
rcsb/exdb/entry/__init__.py +0 -0
rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
rcsb/exdb/seq/AnnotationExtractor.py +76 -0
rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
rcsb/exdb/seq/UniProtExtractor.py +80 -0
rcsb/exdb/seq/__init__.py +0 -0
rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
rcsb/exdb/tests/__init__.py +0 -0
rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
rcsb/exdb/tests/testChemRefLoader.py +106 -0
rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
rcsb/exdb/tests/testCitationAdapter.py +97 -0
rcsb/exdb/tests/testCitationExtractor.py +93 -0
rcsb/exdb/tests/testCitationUtils.py +92 -0
rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
rcsb/exdb/tests/testGlycanProvider.py +98 -0
rcsb/exdb/tests/testGlycanUtils.py +64 -0
rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
rcsb/exdb/tests/testObjectExtractor.py +342 -0
rcsb/exdb/tests/testObjectTransformer.py +83 -0
rcsb/exdb/tests/testObjectUpdater.py +120 -0
rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
rcsb/exdb/tests/testUniProtExtractor.py +77 -0
rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
rcsb/exdb/tree/__init__.py +0 -0
rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
rcsb/exdb/utils/ObjectExtractor.py +286 -0
rcsb/exdb/utils/ObjectTransformer.py +124 -0
rcsb/exdb/utils/ObjectUpdater.py +121 -0
rcsb/exdb/utils/ObjectValidator.py +160 -0
rcsb/exdb/utils/__init__.py +0 -0
rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
rcsb/exdb/wf/__init__.py +0 -0
rcsb_exdb-1.31.dist-info/METADATA +103 -0
rcsb_exdb-1.31.dist-info/RECORD +98 -0
rcsb_exdb-1.31.dist-info/WHEEL +4 -0
rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0

rcsb/exdb/seq/PolymerEntityExtractor.py ADDED Viewed

@@ -0,0 +1,328 @@
+##
+# File: PolymerEntityExtractor.py
+# Date: 5-Dec-2020  jdw
+#
+# Utilities to extract selected details from the core polymer entity collections.
+#
+#
+# Updates:
+#  9-Jan-2024 dwp Turn off use of uniprot_exdb DB for enriching protein entity details file (data not used)
+# 10-Dec-2024 dwp Sort extracted polymer entity sequence data by entity ID (alphabetically), to ensure consistent
+#                 ordering between coasts (order of sequence data influences results of mmseqs2 sequence searching)
+#
+##
+__docformat__ = "google en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+import logging
+import os
+from collections import OrderedDict
+from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
+from rcsb.utils.io.MarshalUtil import MarshalUtil
+logger = logging.getLogger(__name__)
+def getRangeOverlap(entityBeg, entityEnd, refBeg, refEnd):
+    r1 = range(entityBeg, entityEnd)
+    r2 = range(refBeg, refEnd)
+    if r1.start == r1.stop or r2.start == r2.stop:
+        return set()
+    if not ((r1.start < r2.stop and r1.stop > r2.start) or (r1.stop > r2.start and r2.stop > r1.start)):
+        return set()
+    return set(range(max(r1.start, r2.start), min(r1.stop, r2.stop) + 1))
+class PolymerEntityExtractor(object):
+    """Utilities to extract selected details from the core polymer entity collections."""
+    def __init__(self, cfgOb):
+        self.__cfgOb = cfgOb
+    def exportProteinSequenceDetails(self, filePath, fmt="json", minSeqLen=0):
+        """Export protein sequence and taxonomy data (required to build protein sequence fasta file)"""
+        rD, missingSrcD = self.getProteinSequenceDetails(minSeqLen=minSeqLen)
+        # ----
+        mU = MarshalUtil()
+        ok1 = mU.doExport(filePath, rD, fmt=fmt, indent=3)
+        #
+        pth, _ = os.path.split(filePath)
+        mU = MarshalUtil()
+        ok2 = mU.doExport(os.path.join(pth, "missingSrcNames.json"), missingSrcD, fmt="json")
+        logger.info("Exporting (%d) protein sequence records with missing source count (%d) status %r", len(rD), len(missingSrcD), ok1 and ok2)
+    def getProteinSequenceDetails(self, minSeqLen=0):
+        """Get protein sequence and taxonomy data (required to build protein sequence fasta file)"""
+        missingSrcD = {}
+        rD = {}
+        try:
+            obEx = ObjectExtractor(
+                self.__cfgOb,
+                databaseName="pdbx_core",
+                collectionName="pdbx_core_polymer_entity",
+                useCache=False,
+                keyAttribute="entity",
+                uniqueAttributes=["rcsb_id"],
+                selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
+                selectionList=[
+                    "rcsb_id",
+                    "rcsb_entity_source_organism",
+                    "rcsb_polymer_entity.rcsb_source_part_count",
+                    "rcsb_polymer_entity.rcsb_source_taxonomy_count",
+                    "rcsb_polymer_entity.src_method",
+                    "entity_poly",
+                    "rcsb_polymer_entity_align",
+                ],
+            )
+            #
+            eCount = obEx.getCount()
+            logger.info("Polymer entity count is %d", eCount)
+            objD = obEx.getObjects()
+            rD = {}
+            for rId, eD in objD.items():
+                try:
+                    pD = eD["entity_poly"]
+                    seqS = pD["pdbx_seq_one_letter_code_can"]
+                    seqLen = len(seqS)
+                except Exception:
+                    logger.warning("%s no one-letter-code sequence", rId)
+                #
+                if seqLen < minSeqLen:
+                    continue
+                #
+                srcMethod = None
+                try:
+                    pD = eD["rcsb_polymer_entity"]
+                    srcMethod = pD["src_method"]
+                except Exception:
+                    pass
+                #
+                if "rcsb_entity_source_organism" not in eD:
+                    logger.debug("%s No source information (%r) skipping (seqLen %d)", rId, srcMethod, seqLen)
+                    continue
+                try:
+                    sL = []
+                    for tD in eD["rcsb_entity_source_organism"]:
+                        srcName = tD["scientific_name"] if "scientific_name" in tD else None
+                        if "beg_seq_num" in tD and "end_seq_num" in tD:
+                            begSeqNum = tD["beg_seq_num"]
+                            endSeqNum = tD["end_seq_num"] if tD["end_seq_num"] <= seqLen else seqLen
+                        else:
+                            begSeqNum = 1
+                            endSeqNum = seqLen
+                        srcId = tD["pdbx_src_id"]
+                        srcType = tD["source_type"]
+                        taxId = tD["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in tD else -1
+                        if srcName and taxId == -1:
+                            missingSrcD.setdefault(srcName, []).append(rId)
+                        orgName = tD["ncbi_scientific_name"] if "ncbi_scientific_name" in tD else ""
+                        sL.append({"srcId": srcId, "taxId": taxId, "orgName": orgName, "entitySeqBeg": begSeqNum, "entitySeqEnd": endSeqNum})
+                    if len(sL) == 1:
+                        sL[0]["entitySeqBeg"] = 1
+                        sL[0]["entitySeqEnd"] = seqLen
+                except Exception as e:
+                    logger.exception("Failing for (%r) tD %r with %s", rId, tD, str(e))
+                #
+                try:
+                    pD = eD["rcsb_polymer_entity"]
+                    partCount = pD["rcsb_source_part_count"]
+                except Exception:
+                    logger.warning("%s no source part count", rId)
+                    partCount = 1
+                try:
+                    pD = eD["rcsb_polymer_entity"]
+                    taxCount = pD["rcsb_source_taxonomy_count"]
+                except Exception:
+                    if srcType == "synthetic":
+                        taxCount = 0
+                    else:
+                        logger.warning("%s (srcName %r) no source taxonomy count type %r", rId, srcName, srcType)
+                        if srcName:
+                            taxCount = 1
+                        else:
+                            taxCount = 0
+                #
+                uDL = []
+                try:
+                    for tD in eD["rcsb_polymer_entity_align"]:
+                        uD = {}
+                        if tD["reference_database_name"] in ["UniProt", "GenBank", "PIR", "EMBL", "NORINE", "PRF"]:
+                            uD["refDbId"] = tD["reference_database_accession"]
+                            uD["refDbName"] = tD["reference_database_name"]
+                            uD["provSource"] = tD["provenance_source"]
+                            #
+                            # Skip the below step now that uniprot_exdb DB is no longer being updated in weekly workflow.
+                            # The data added here isn't used by subsequent tasks. It simply provides
+                            # additional information in the pdbprent-details.json file (under "alignmentL")
+                            # if tD["reference_database_accession"] in unpD:
+                            #     # This adds {"accession": rId, "taxId": taxId, "scientific_name": sn, "gene": gn, "name": pn, "sequence": sequence}
+                            #     uD.update(unpD[tD["reference_database_accession"]])
+                            aL = []
+                            for qD in tD["aligned_regions"]:
+                                if qD["entity_beg_seq_id"] + qD["length"] - 1 > seqLen:
+                                    qD["length"] = seqLen - qD["entity_beg_seq_id"] + 1
+                                srcId = self.__getSourcePart(rId, sL, qD["entity_beg_seq_id"], qD["length"])
+                                aL.append({"srcId": srcId, "entitySeqBeg": qD["entity_beg_seq_id"], "refSeqBeg": qD["ref_beg_seq_id"], "length": qD["length"]})
+                            uD["alignList"] = aL
+                            uDL.append(uD)
+                        else:
+                            logger.info("%s reference database %s", rId, tD["reference_database_name"])
+                except Exception:
+                    pass
+                rD[rId] = {"alignmentL": uDL, "sourceOrgL": sL, "partCount": partCount, "taxCount": taxCount, "sequence": seqS, "seqLen": seqLen}
+            # Sort the dict in alphabetical order (by entity ID key) to ensure consistent/reproducible treatment by mmseqs2
+            sortedD = OrderedDict((k, rD.pop(k)) for k in sorted(rD))
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+        return sortedD, missingSrcD
+    def __getSourcePart(self, entityId, sourceOrgL, entityBeg, seqLen):
+        """Return the source part containing the input entity range -
+        Args:
+            sourceOrgL (list): list of source dictionaries
+            entityBeg (int):  begining entity sequence position (matched region)
+            seqLen (int):  length sequence range (matched region)
+        Returns:
+            (int): corresponding source part id or None
+        """
+        entityEnd = entityBeg + seqLen - 1
+        for sD in sourceOrgL:
+            srcId = sD["srcId"]
+            if sD["entitySeqBeg"] <= entityBeg and sD["entitySeqEnd"] >= entityEnd:
+                return srcId
+        #
+        if len(sourceOrgL) == 1:
+            logger.error("%r (%d) Inconsistent range for beg %r end %r sourceOrgL %r", entityId, len(sourceOrgL), entityBeg, entityEnd, sourceOrgL)
+            return 1
+        else:
+            ovTupL = []
+            for sD in sourceOrgL:
+                srcId = sD["srcId"]
+                logger.debug("%r %r beg %r end %r beg %r end %r", entityId, srcId, sD["entitySeqBeg"], sD["entitySeqEnd"], entityBeg, entityEnd)
+                oVS = getRangeOverlap(sD["entitySeqBeg"], sD["entitySeqEnd"], entityBeg, entityEnd)
+                ovTupL.append((srcId, len(oVS)))
+            rL = sorted(ovTupL, key=lambda x: x[1], reverse=True)
+            logger.debug("ovTupL %r", rL)
+            #
+            return rL[0][0]
+    def exportProteinEntityFasta(self, fastaPath, taxonPath, detailsPath, minSeqLen=10):
+        """Export protein entity Fasta file and associated taxon mapping file (for mmseqs2)
+        Args:
+            fastaPath (str): protein sequence FASTA output file path
+            taxonPath (str): taxon mapping file path (seqid TaxId) (tdd format)
+            detailPath (str): protein entity details file path (json)
+        Returns:
+            bool: True for success or False otherwise
+        Example:
+            "5H7D_1": {
+                    # "alignmentL": [
+                    #     {
+                    #         "refDbId": "P42588",
+                    #         "refDbName": "UniProt",
+                    #         "provSource": "PDB",
+                    #         "accession": "P42588",
+                    #         "taxId": 83333,
+                    #         "scientific_name": "Escherichia coli (strain K12)",
+                    #         "gene": "patA",
+                    #         "name": "PATase",
+                    #         "alignList": [
+                    #         {
+                    #             "srcId": "1",
+                    #             "entitySeqBeg": 5,
+                    #             "refSeqBeg": 7,
+                    #             "length": 447
+                    #         }
+                    #         ]
+                    #     },
+                    #     {
+                    #         "refDbId": "P38507",
+                    #         "refDbName": "UniProt",
+                    #         "provSource": "PDB",
+                    #         "accession": "P38507",
+                    #         "taxId": 1280,
+                    #         "scientific_name": "Staphylococcus aureus",
+                    #         "gene": "spa",
+                    #         "name": "IgG-binding protein A",
+                    #         "alignList": [
+                    #         {
+                    #             "srcId": "2",
+                    #             "entitySeqBeg": 452,
+                    #             entitySeqBeg"220,
+                    #             "length": 48
+                    #         }
+                    #         ]
+                    #     }
+                    # ],
+                    "sourceOrgL": [
+                        {
+                            "srcId": "1",
+                            "taxId": 83333,
+                            "orgName": "Escherichia coli K-12",
+                            "entitySeqBeg": 1,
+                            "entitySeqEnd": 451
+                        },
+                        {
+                            "srcId": "2",
+                            "taxId": 1280,
+                            "orgName": "Staphylococcus aureus",
+                            "entitySeqBeg": 452,
+                            "entitySeqEnd": 499
+                        }
+                    ],
+                    "partCount": 2,
+                    "taxCount": 2,
+                    "sequence": "GSHMSASALACSAHALNLIEKRTLDHEEMKALNREVIEYFKEHVNPGF...",
+                    "seqLen": 499
+                },
+        >1ABC_#|prt|<taxid>|beg|end|refdb|refId|refTaxId|refbeg|refend|ref_gn|ref_nm
+        """
+        proteinSeqD, _ = self.getProteinSequenceDetails(minSeqLen=minSeqLen)
+        ok = False
+        try:
+            taxonL = []
+            seqDict = {}
+            for eId, eD in proteinSeqD.items():
+                #
+                seq = eD["sequence"]
+                for sD in eD["sourceOrgL"]:
+                    srcId = sD["srcId"]
+                    taxId = sD["taxId"]
+                    seqBeg = int(sD["entitySeqBeg"])
+                    seqEnd = int(sD["entitySeqEnd"])
+                    seqLen = 1 + (seqEnd - seqBeg)
+                    # orgName = sD["orgName"]
+                    cD = {"sequence": seq[seqBeg - 1: seqEnd], "entityId": eId, "srcId": srcId, "seqBeg": seqBeg, "seqEnd": seqEnd, "seqLen": seqLen, "taxId": taxId}
+                    seqId = ""
+                    cL = []
+                    for k, v in cD.items():
+                        if k in ["sequence"]:
+                            continue
+                        cL.append(str(v))
+                        cL.append(str(k))
+                    seqId = "|".join(cL)
+                    seqDict[seqId] = cD
+                    taxonL.append("%s\t%s" % (seqId, taxId))
+                # ----
+            mU = MarshalUtil()
+            ok = mU.doExport(detailsPath, proteinSeqD, fmt="json", indent=3)
+            ok = mU.doExport(fastaPath, seqDict, fmt="fasta")
+            ok = mU.doExport(taxonPath, taxonL, fmt="list")
+        except Exception as e:
+            logger.exception("Failing %r with %s", fastaPath, str(e))
+        return ok