rcsb.exdb 1.31 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
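The module paths above map one-to-one onto the installed namespace packages. As a minimal orientation sketch (not part of the diff; the install command and environment are assumptions), after installing the release with "pip install rcsb.exdb==1.31" the modules listed above are importable directly:

    # Assumes rcsb.exdb 1.31 and its rcsb.* dependencies are installed
    from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
    from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor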
rcsb/exdb/seq/ReferenceSequenceCacheProvider.py (new file, +397 lines)

##
# File: ReferenceSequenceCacheProvider.py
# Date: 10-Feb-2020 jdw
#
# Utilities to cache referencence sequence data and mappings.
#
# Updates:
# 8-Apr-2020 jdw change testCache() conditions to specifically track missing matched reference Id codes.
#
##
__docformat__ = "google en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Apache 2.0"

import logging
from collections import defaultdict


from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
from rcsb.utils.io.IoUtil import getObjSize
from rcsb.utils.io.TimeUtil import TimeUtil
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil
from rcsb.utils.seq.UniProtUtils import UniProtUtils

logger = logging.getLogger(__name__)


class ReferenceUpdateWorker(object):
    """A skeleton class that implements the interface expected by the multiprocessing
    for fetching reference sequences --
    """

    def __init__(self, cfgOb, **kwargs):
        self.__cfgOb = cfgOb
        _ = kwargs
        self.__refDatabaseName = "uniprot_exdb"
        self.__refDataCollectionName = "reference_entry"
        self.__refMatchDataCollectionName = "reference_match"
        #
        self.__createCollections(self.__refDatabaseName, self.__refDataCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        self.__createCollections(self.__refDatabaseName, self.__refMatchDataCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])

    def updateList(self, dataList, procName, optionsD, workingDir):
        """Update the input list of reference sequence identifiers and return
        matching diagnostics and reference feature data.
        """
        _ = optionsD
        _ = workingDir
        saveText = optionsD.get("saveText", False)
        fetchLimit = optionsD.get("fetchLimit", None)
        refDbName = optionsD.get("refDbName", "UniProt")
        maxChunkSize = optionsD.get("maxChunkSize", 50)
        successList = []
        retList1 = []
        retList2 = []
        diagList = []
        emptyList = []
        #
        try:
            tU = TimeUtil()
            idList = dataList[:fetchLimit] if fetchLimit else dataList
            logger.info("%s starting fetch for %d %s entries", procName, len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                logger.debug("Maximum reference chunk size %d", maxChunkSize)
                refD, matchD = fobj.fetchList(idList, maxChunkSize=maxChunkSize)
                if len(matchD) == len(idList):
                    for uId, tD in matchD.items():
                        tD["rcsb_id"] = uId.strip()
                        tD["rcsb_last_update"] = tU.getDateTimeObj(tU.getTimestamp())
                        retList1.append(tD)
                    for uId, tD in refD.items():
                        tD["rcsb_id"] = uId.strip()
                        tD["rcsb_last_update"] = tU.getDateTimeObj(tU.getTimestamp())
                        retList2.append(tD)
                    successList.extend(idList)
                    self.__updateReferenceData(self.__refDatabaseName, self.__refDataCollectionName, retList2)
                    self.__updateReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName, retList1)
                else:
                    logger.info("Failing with fetch for %d entries with matchD %r", len(idList), matchD)
            else:
                logger.error("Unsupported reference database %r", refDbName)
        except Exception as e:
            logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
        logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
        #
        return successList, emptyList, emptyList, diagList

    def __updateReferenceData(self, databaseName, collectionName, objDL):
        updateDL = []
        for objD in objDL:
            try:
                selectD = {"rcsb_id": objD["rcsb_id"]}
                updateDL.append({"selectD": selectD, "updateD": objD})
            except Exception as e:
                logger.exception("Failing with %s", str(e))
        obUpd = ObjectUpdater(self.__cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.debug("Updated reference count is %d", numUpd)

    def __createCollections(self, databaseName, collectionName, indexAttributeNames=None):
        obUpd = ObjectUpdater(self.__cfgOb)
        ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
        return ok


class ReferenceSequenceCacheProvider(object):
    """Utilities to cache referencence sequence data and correspondence mappings."""

    def __init__(self, cfgOb, databaseName, collectionName, polymerType, siftsProvider=None, maxChunkSize=50, fetchLimit=None, expireDays=14, numProc=1, **kwargs):
        self.__cfgOb = cfgOb
        #
        self.__maxChunkSize = maxChunkSize
        self.__numProc = numProc
        #
        self.__refDatabaseName = "uniprot_exdb"
        self.__refDataCollectionName = "reference_entry"
        self.__refMatchDataCollectionName = "reference_match"

        self.__ssP = siftsProvider
        self.__matchD, self.__refD, self.__missingMatchIds = self.__reload(databaseName, collectionName, polymerType, fetchLimit, expireDays, **kwargs)

    def getMatchInfo(self):
        return self.__matchD

    def getRefData(self):
        return self.__refD

    def getMissingMatchedIdCodes(self):
        return self.__missingMatchIds

    def getDocuments(self, formatType="exchange"):
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefDataCount(self):
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False, minMissing=0):
        """Test the state of reference sequence data relative to proportion of matched primary sequence
        in the primary data set.

        Args:
            minMatchPrimaryPercent (float, optional): minimal acceptable of matching primary accessions. Defaults to None.
            logSizes (bool, optional): flag to log resource sizes. Defaults to False.
            minMissing (int, optional): minimum acceptable missing matched reference Ids. Defaults to 0.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            ok = bool(self.__matchD and self.__refD and self.__missingMatchIds <= minMissing)
            logger.info("Reference cache lengths: matchD %d refD %d missing matches %d", len(self.__matchD), len(self.__refD), self.__missingMatchIds)
            if ok:
                return ok
        except Exception as e:
            logger.error("Failing with unexpected cache state %s", str(e))
            return False
        #
        # -- The remaining check on the portion is not currently --
        #
        numRef = len(self.__matchD)
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for _, mD in self.__matchD.items():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                okC = False
            logger.info("Primary reference match count test status %r", okC)
        #
        if logSizes:
            logger.info(
                "RefMatchD %.2f RefD %.2f",
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC

    def __reload(self, databaseName, collectionName, polymerType, fetchLimit, expireDays, **kwargs):
        _ = kwargs

        # -- This
        logger.info("Reloading sequence reference data fetchLimit %r expireDays %r", fetchLimit, expireDays)
        numMissing = self.__refreshReferenceData(expireDays=expireDays, failureFraction=0.75)
        logger.info("Reference identifiers expired/missing %d", numMissing)
        # --
        refIdMapD = {}
        matchD = {}
        refD = {}
        failList = []
        #
        # assignRefD: Dict of all entities of polymerType "Protein" (or other), with associated container_identifiers and other info as corresponding values
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
        logger.info("Polymer reference sequence assignments %d (assignRefD)", len(assignRefD))
        #
        # refIdMapD: Dict of all *unique* UniProt Ids of entities that have:
        #   "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.provenance_source":"PDB",
        #   "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name":"UniProt",
        #   "entity_poly.rcsb_entity_polymer_type":"Protein"
        # Values are the list of entities that have those UniProt IDs
        # i.e. refIdMapD[<database_accession>] = [entity_key1, entity_key2,...]
        # This will usually only contain several hundred to a few thousand IDs
        refIdMapD, _ = self.__getAssignmentMap(assignRefD)
        logger.info("Reference ID assignemnt map length %d (refIdMapD)", len(refIdMapD))
        #
        # List of all entry IDs for entities in assignRefD (will contain duplicates for entries with >1 entity)
        entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
        #
        # List of *unique* UniProt IDs from SIFTS for all protein (or, "polymerType") entries currently in ExDB
        siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID") if self.__ssP else []
        logger.info("Incorporating all %d SIFTS accessions for %d entities", len(siftsUniProtL), len(entryIdL))
        #
        # unpIdList: List of all *unique* UniProt IDs combined from 'refIdMapD' and 'siftsUniProtL'
        #   Since not everything will be covered by SIFTS, this will be slightly more than siftsUniProtL
        unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
        logger.info("UniProt ID list length %d (unpIdList)", len(unpIdList))
        #
        # cacheUnpIdList: List of UniProt IDs from uniprot_exdb.reference_match, from today backwards
        cacheUnpIdList = self.__getReferenceDataIds(expireDays=0)
        logger.info("Using %d cached reference sequences", len(cacheUnpIdList))
        #
        # updateUnpIdList: List of the *delta* UniProt IDs between what's possible based on entity collections (unpIdList)
        #   and what's already in uniprot_exdb.reference_match (cacheUnpIdList)
        updateUnpIdList = sorted(set(unpIdList) - set(cacheUnpIdList))
        logger.info("UniProt list lengths (unique): set(unpIdList) %d - set(cacheUnpIdList) %d", len(set(unpIdList)), len(set(cacheUnpIdList)))
        #
        if updateUnpIdList:
            logger.info("Updating cache for %d UniProt accessions (consolidated PDB + SIFTS)", len(updateUnpIdList))
            ok, failList = self.__updateReferenceData(updateUnpIdList)
            logger.info("Fetch references update status is %r missing count %d", ok, len(failList))
        else:
            logger.info("No reference sequence updates required")
        #
        matchD = self.__getReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName)
        refD = self.__getReferenceData(self.__refDatabaseName, self.__refDataCollectionName)
        logger.info("Completed - returning match length %d and reference data length %d num missing %d", len(matchD), len(refD), len(failList))
        return matchD, refD, len(failList)

    def __refreshReferenceData(self, expireDays=14, failureFraction=0.75):
        """Update expired reference data and purge any obsolete data not to exceeding the
        the input failureFraction.

        Args:
            expireDays (int, optional): expiration interval in days. Defaults to 14.
            failureFraction (float, optional): fractional limit of obsolete entries purged. Defaults to 0.75.

        Returns:
            (int): number of obsolete entries purged

        """
        idList = self.__getReferenceDataIds(expireDays=expireDays)
        logger.info("Expired (days=%d) reference identifiers %d", expireDays, len(idList))
        if not idList:
            return 0
        #
        ok, failList = self.__updateReferenceData(idList)
        logger.info("After reference update (status=%r) missing expired match identifiers %d", ok, len(failList))
        tFrac = float(len(failList)) / float(len(idList))
        if tFrac < failureFraction:
            obUpd = ObjectUpdater(self.__cfgOb)
            selectD = {"rcsb_id": failList}
            numPurge = obUpd.delete(self.__refDatabaseName, self.__refMatchDataCollectionName, selectD)
            if len(failList) != numPurge:
                logger.info("Update match failures %d purge count %d", len(failList), numPurge)
            numPurge = obUpd.delete(self.__refDatabaseName, self.__refDataCollectionName, selectD)
            if len(failList) != numPurge:
                logger.info("Update reference data failures %d purge count %d", len(failList), numPurge)
        return len(failList)

    def __getReferenceDataIds(self, expireDays=14):
        """Get reference data identifiers subject to an expiration interval
        (i.e. not updated in/older than deltaDays)

        Args:
            expireDays (int, optional): expiration interval in days. Defaults to 14.

        Returns:
            (list): reference identifier list
        """
        selectD = None
        if expireDays > 0:
            tU = TimeUtil()
            tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
            selectD = {"rcsb_latest_update": {"$lt": tU.getDateTimeObj(tS)}}
        matchD = self.__getReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName, selectD=selectD)
        return sorted(matchD.keys())

    def __updateReferenceData(self, idList):
        numProc = self.__numProc
        chunkSize = self.__maxChunkSize
        logger.info("Length starting list is %d", len(idList))
        optD = {"maxChunkSize": chunkSize}
        rWorker = ReferenceUpdateWorker(self.__cfgOb)
        mpu = MultiProcUtil(verbose=True)
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="updateList")
        ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
        logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
        return ok, failList

    def __getReferenceData(self, databaseName, collectionName, selectD=None):
        logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            keyAttribute="rcsb_id",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=selectD,
        )
        docCount = obEx.getCount()
        logger.debug("Reference data match count %d", docCount)
        objD = obEx.getObjects()
        return objD

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
         (dict): {"1abc_1": "rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                          "rcsb_entity_source_organism"" {"ncbi_taxonomy_id": []}
        """
        try:

            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count type %s is %d", polymerType, eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity count %d reference accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getAssignmentMap(self, polymerEntityObjD):
        referenceDatabaseName = "UniProt"
        provSource = "PDB"
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        numMissingTaxons = 0
        for entityKey, eD in polymerEntityObjD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                            numMissingTaxons += 1

                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        numMultipleTaxons = 0
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.debug("Multitple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
                numMultipleTaxons += 1

        logger.info("Entities with missing taxonomy %d", numMissingTaxons)
        logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
        logger.info("Unique %s accession assignments by %s %d (entities missing archive accession assignments %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD
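The provider above is driven entirely by its constructor: __reload() refreshes expired cache entries, diffs the UniProt accessions referenced by the polymer-entity collection (plus optional SIFTS accessions) against the uniprot_exdb cache, and fetches only the delta. The following is a minimal usage sketch, not part of the package; the ConfigUtil import and configuration path are assumptions about the deployment environment, and the database/collection/polymerType values mirror those used elsewhere in this release.

    from rcsb.utils.config.ConfigUtil import ConfigUtil
    from rcsb.exdb.seq.ReferenceSequenceCacheProvider import ReferenceSequenceCacheProvider

    # Hypothetical configuration file describing the MongoDB resource
    cfgOb = ConfigUtil(configPath="./exdb-config-example.yml", defaultSectionName="site_info_configuration")

    # Construction triggers __reload(): expired entries are refreshed and any
    # UniProt accessions not yet cached are fetched and stored in uniprot_exdb.
    rscP = ReferenceSequenceCacheProvider(
        cfgOb,
        databaseName="pdbx_core",
        collectionName="pdbx_core_polymer_entity",
        polymerType="Protein",
        siftsProvider=None,  # optionally a SIFTS summary provider to widen the accession list
        fetchLimit=50,       # small limit for a quick smoke test
        expireDays=14,
        numProc=2,
    )
    if rscP.testCache(minMissing=10):
        print("cached reference entries:", rscP.getRefDataCount())
        docL = rscP.getDocuments(formatType="exchange")  # exchange-style UniProt documents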
rcsb/exdb/seq/TaxonomyExtractor.py (new file, +69 lines)

##
# File: TaxonomyExtractor.py
# Date: 15-Oct-2019 jdw
#
# Utilities to extract taxonomy details from the core entity collection.
#
# Updates:
#
##
__docformat__ = "google en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Apache 2.0"

import logging

from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor

logger = logging.getLogger(__name__)


class TaxonomyExtractor(object):
    """Utilities to extract taxonomy details from the core entity collection."""

    def __init__(self, cfgOb):
        self.__cfgOb = cfgOb
        self.__databaseName = "pdbx_core"
        self.__collectionName = "pdbx_core_polymer_entity"

    def getUniqueTaxons(self):
        taxIdL = self.__extractEntityTaxons()
        return taxIdL

    def __extractEntityTaxons(self):
        """Test case - extract unique entity source and host taxonomies"""
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=self.__databaseName,
                collectionName=self.__collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=None,
                # selectionQuery={"entity.type": "polymer"},
                selectionQuery=None,
                selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count is %d", eCount)
            taxIdS = set()
            objD = obEx.getObjects()
            for _, eD in objD.items():
                try:
                    for tD in eD["rcsb_entity_source_organism"]:
                        taxIdS.add(tD["ncbi_taxonomy_id"])
                except Exception:
                    pass
                try:
                    for tD in eD["rcsb_entity_host_organism"]:
                        taxIdS.add(tD["ncbi_taxonomy_id"])
                except Exception:
                    pass
            logger.info("Unique taxons %d", len(taxIdS))
            return list(taxIdS)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
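A minimal usage sketch for the extractor above, again assuming a ConfigUtil-style configuration object pointing at a populated pdbx_core database (the config path is a placeholder):

    from rcsb.utils.config.ConfigUtil import ConfigUtil
    from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor

    cfgOb = ConfigUtil(configPath="./exdb-config-example.yml", defaultSectionName="site_info_configuration")  # hypothetical path
    txEx = TaxonomyExtractor(cfgOb)
    # Unique NCBI taxonomy ids pooled from the source and host organisms of every polymer entity
    taxIdL = txEx.getUniqueTaxons()
    print("unique taxa:", len(taxIdL))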
rcsb/exdb/seq/UniProtCoreEtlWorker.py (new file, +177 lines)

##
# File: UniProtCoreEtlWorker.py
# Date: 9-Dec-2019 jdw
#
# ETL utilities for processing and loading UniProt core collection reference data.
#
# Updates:
#
#
##
__docformat__ = "google en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Apache 2.0"

import logging

from jsonschema import Draft4Validator
from jsonschema import FormatChecker

from rcsb.db.helpers.DocumentDefinitionHelper import DocumentDefinitionHelper
from rcsb.db.mongo.DocumentLoader import DocumentLoader
from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
from rcsb.db.utils.SchemaProvider import SchemaProvider
from rcsb.exdb.seq.ReferenceSequenceAssignmentProvider import ReferenceSequenceAssignmentProvider

#

logger = logging.getLogger(__name__)


class UniProtCoreEtlWorker(object):
    """Prepare and load UniProt 'core' sequence reference data collections."""

    def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__maxStepLength = maxStepLength
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__valInst = None
        self.__doValidate = doValidate
        #

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __getReferenceSequenceProvider(self):
        """ """
        try:
            rsaP = ReferenceSequenceAssignmentProvider(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                polymerType="Protein",
                referenceDatabaseName="UniProt",
                provSource="PDB",
                useCache=self.__useCache,
                cachePath=self.__cachePath,
                fetchLimit=self.__documentLimit,
                siftsAbbreviated="TEST",
            )
            ok = rsaP.testCache()
            return ok, rsaP
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def load(self, updateId, extResource, loadType="full"):
        """Load sequence reference data"""
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            dList = indexL = []
            databaseName = collectionName = collectionVersion = None
            addValues = {}
            #
            if extResource == "UniProt":
                databaseName = "uniprot_core"
                # configName = self.__cfgOb.getDefaultSectionName()
                # dirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", self.__cfgOb.getDefaultSectionName()))
                #
                ok, rsP = self.__getReferenceSequenceProvider()
                if not ok:
                    return False
                #
                dList = rsP.getDocuments()
                logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
                logger.debug("Objects %r", dList[:2])
                #
                cDL = self.__docHelper.getCollectionInfo(databaseName)
                collectionName = cDL[0]["NAME"]
                collectionVersion = cDL[0]["VERSION"]
                indexL = self.__docHelper.getDocumentIndexAttributes(collectionName, "primary")
                logger.info("Database %r collection %r version %r index attributes %r", databaseName, collectionName, collectionVersion, indexL)
            else:
                logger.error("Unsupported external resource %r", extResource)
            #
            if self.__doValidate:
                self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
                for dObj in dList:
                    self.__validateObj(databaseName, collectionName, dObj, label="Original")
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                maxStepLength=self.__maxStepLength,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)

            return ok and okS
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getLoadStatus(self):
        return self.__statusList

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        # _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        # cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        logger.info("Fetch schema for %r %r validation level %r", databaseName, collectionName, schemaLevel)
        cD = self.__schP.getJsonSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        try:
            eCount = 0
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
                logger.debug(">>> Failing object is %r", rObj)
                if "rcsb_uniprot_feature" in rObj:
                    for dd in rObj["rcsb_uniprot_feature"]:
                        if "feature_id" in dd:
                            logger.info("feature_id %r", dd["feature_id"])
                        else:
                            logger.info("no feature_id keys %r", sorted(dd.keys()))
                            logger.info("description %r", dd["description"])
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount
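A minimal usage sketch for the ETL worker above; the config path, cache path, and updateId are placeholders, not values from the package. load() builds the mapped UniProt documents through ReferenceSequenceAssignmentProvider and loads them into the uniprot_core collection resolved by DocumentDefinitionHelper:

    from rcsb.utils.config.ConfigUtil import ConfigUtil
    from rcsb.exdb.seq.UniProtCoreEtlWorker import UniProtCoreEtlWorker

    cfgOb = ConfigUtil(configPath="./exdb-config-example.yml", defaultSectionName="site_info_configuration")  # hypothetical path
    worker = UniProtCoreEtlWorker(cfgOb, cachePath="./CACHE", numProc=2, chunkSize=10, doValidate=False)
    ok = worker.load(updateId="2020_10", extResource="UniProt", loadType="full")  # placeholder updateId
    print("load status:", ok, worker.getLoadStatus())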