rcsb.exdb 1.31 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
rcsb/exdb/chemref/PubChemIndexCacheProvider.py
@@ -0,0 +1,638 @@
+##
+# File: PubChemIndexCacheProvider.py
+# Date: 2-Apr-2020 jdw
+#
+# Utilities to manage chemical component/BIRD to PubChem compound identifier mapping data.
+#
+# Updates:
+# 9-May-2020 jdw separate cache behavior with separate option rebuildChemIndices=True/False
+# 16-Jul-2020 jdw separate index and reference data management.
+# 23-Jul-2021 jdw Make PubChemIndexCacheProvider a subclass of StashableBase()
+# 2-Mar-2023 aae Return correct status from Single proc
+# 8-Apr-2025 dwp Let MultiProc handle chunking; add more logging to debug slowness on west coast
+#
+##
+__docformat__ = "google en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+import logging
+import os
+import time
+
+from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
+from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
+from rcsb.utils.chem.ChemCompIndexProvider import ChemCompIndexProvider
+from rcsb.utils.chem.ChemCompSearchIndexProvider import ChemCompSearchIndexProvider
+from rcsb.utils.chemref.PubChemUtils import PubChemUtils, ChemicalIdentifier
+from rcsb.utils.io.IoUtil import getObjSize
+from rcsb.utils.io.MarshalUtil import MarshalUtil
+from rcsb.utils.io.StashableBase import StashableBase
+from rcsb.utils.io.TimeUtil import TimeUtil
+from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil
+
+
+logger = logging.getLogger(__name__)
+
+
+class PubChemUpdateWorker(object):
+    """A skeleton worker class that implements the interface expected by the multiprocessing module
+    for fetching CCD/BIRD to PubChem chemical compound identifier correspondences --
+    """
+
+    def __init__(self, cfgOb, searchIdxD, **kwargs):
+        self.__cfgOb = cfgOb
+        self.__searchIdxD = searchIdxD
+        #
+        _ = kwargs
+        self.__lookupD = {}
+        for sId, sD in self.__searchIdxD.items():
+            ccId = sId.split("|")[0]
+            self.__lookupD.setdefault(ccId, []).append(sD)
+        self.__databaseName = "pubchem_exdb"
+        self.__matchIndexCollectionName = "reference_match_index"
+        self.__createCollections(self.__databaseName, self.__matchIndexCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
+        self.__pcU = PubChemUtils()
+
+    def __genChemIdList(self, ccId):
+        """Return a list of ChemicalIdentifier() objects for the input chemical component identifier.
+
+        Args:
+            ccId (str): chemical component identifiers
+
+        Returns:
+            (list): list of ChemicalIdentifier() objects corresponding to the input chemical component.
+        """
+        chemIdList = []
+        idType = None
+        descr = None
+        if ccId in self.__lookupD:
+            for sD in self.__lookupD[ccId]:
+                if "inchi-key" in sD:
+                    idType = "inchikey"
+                    descr = sD["inchi-key"]
+                elif "smiles" in sD:
+                    idType = "smiles"
+                    descr = sD["smiles"]
+                chemIdList.append(ChemicalIdentifier(idCode=ccId, identifierSource=sD["build-type"], identifierType=idType, identifier=descr, indexName=sD["name"]))
+        return chemIdList
+
+    def updateList(self, dataList, procName, optionsD, workingDir):
+        """Update the input list of reference data identifiers (ChemicalIdentifier()) and return
+        matching diagnostics and reference feature data.
+        {
+            "_id" : ObjectId("5e8dfb49eab967a0483a0472"),
+            "rcsb_id" : "local reference ID (ccid|bird)", << LOCAL CANONICAL ID (e.g. ATP, PRD_000100)
+            "rcsb_last_update" : ISODate("2020-04-08T16:26:47.993+0000"),
+            "matched_ids" : [
+                {"matched_id": "<external reference ID code>", "search_id_type" : "oe-smiles", "search_id_source": "model-xyz",
+                 'source_index_name': <>, 'source_inchikey': <>, 'source_smiles': <>},
+                {"matched_id": "<external reference ID code>", "search_id_type": ... , "search_id_source": ... , ...}
+            ] ]
+        },
+        }
+        // Failed matches are recorded with NO matchedIds:
+        {
+            "_id" : ObjectId("5e8dfb49eab967a0483a04a3"),
+            "rcsb_id" : "local reference ID (ccid|bird)", << LOCAL ID
+            "rcsb_last_update" : ISODate("2020-04-08T16:26:48.025+0000"),
+        }
+        #
+        """
+        _ = workingDir
+        matchIdOnly = optionsD.get("matchIdOnly", True)
+        # Path to store raw request data -
+        exportPath = optionsD.get("exportPath", None)
+        #
+        successList = []
+        diagList = []
+        failList = []
+        retList = []
+        #
+        try:
+            startTime = time.time()
+            tU = TimeUtil()
+            ccIdList = dataList  # len(dataList) should be of size chunkSize
+            logger.info("%s search starting for %d reference definitions (matchIdOnly %r exportPath %r)", procName, len(ccIdList), matchIdOnly, exportPath)
+            tIdxDL = []
+            timeS = tU.getDateTimeObj(tU.getTimestamp())
+            for ccId in ccIdList:
+                # Get various forms from the search index -
+                chemIdList = self.__genChemIdList(ccId)
+                tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
+                #
+                mL = []
+                for chemId in chemIdList:
+                    stA = time.time()
+                    ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
+                    #
+                    if not ok:
+                        etA = time.time()
+                        logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
+                    #
+                    if ok and refDL:
+                        for tD in refDL:
+                            pcId = tD["cid"]
+                            inchiKey = (
+                                self.__searchIdxD[chemId.indexName]["inchi-key"]
+                                if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
+                                else None
+                            )
+                            smiles = (
+                                self.__searchIdxD[chemId.indexName]["smiles"] if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName] else None
+                            )
+                            mL.append(
+                                {
+                                    "matched_id": pcId,
+                                    "search_id_type": chemId.identifierType,
+                                    "search_id_source": chemId.identifierSource,
+                                    "source_index_name": chemId.indexName,
+                                    "source_smiles": smiles,
+                                    "source_inchikey": inchiKey,
+                                }
+                            )
+                #
+                if mL:
+                    tIdxD["matched_ids"] = mL
+                    successList.append(ccId)
+                else:
+                    logger.info("No match result for any form of %s", ccId)
+                #
+                tIdxDL.append(tIdxD)
+            # --
+            failList = sorted(set(dataList) - set(successList))
+            if failList:
+                logger.info("%s returns %d definitions with failures: %r", procName, len(failList), failList)
+            # --
+            endTime = time.time()
+            logger.info("%s completed updateList len %r duration %.3f secs", procName, len(ccIdList), endTime - startTime)
+            startTimeL = time.time()
+            logger.info("Saving dataList (len=%d)", len(ccIdList))
+            self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
+            endTimeL = time.time()
+            logger.info("Saved chunk (len=%d) in %.3f secs", len(ccIdList), endTimeL - startTimeL)
+        except Exception as e:
+            logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
+        logger.info("%s dataList length %d success length %d retList %d", procName, len(dataList), len(successList), len(retList))
+        #
+        return successList, retList, diagList
+
+    def __updateObjectStore(self, databaseName, collectionName, objDL):
+        updateDL = []
+        for objD in objDL:
+            try:
+                selectD = {"rcsb_id": objD["rcsb_id"]}
+                updateDL.append({"selectD": selectD, "updateD": objD})
+            except Exception as e:
+                logger.exception("Failing with %s", str(e))
+        obUpd = ObjectUpdater(self.__cfgOb)
+        numUpd = obUpd.update(databaseName, collectionName, updateDL)
+        logger.info("Updated reference count is %d", numUpd)
+
+    def __createCollections(self, databaseName, collectionName, indexAttributeNames=None):
+        obUpd = ObjectUpdater(self.__cfgOb)
+        ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
+        return ok
+
+
+class PubChemIndexCacheProvider(StashableBase):
+    """Utilities to manage chemical component/BIRD to PubChem compound identifier mapping data."""
+
+    def __init__(self, cfgOb, cachePath):
+        dirName = "PubChem-index"
+        super(PubChemIndexCacheProvider, self).__init__(cachePath, [dirName])
+        self.__cfgOb = cfgOb
+        self.__cachePath = cachePath
+        self.__dirPath = os.path.join(self.__cachePath, dirName)
+        #
+        self.__databaseName = "pubchem_exdb"
+        self.__matchIndexCollectionName = "reference_match_index"
+        #
+
+        self.__matchD = None
+
+    def getMatchData(self, expireDays=0):
+        if not self.__matchD:
+            selectD = {}
+            if expireDays > 0:
+                tU = TimeUtil()
+                tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
+                selectD.update({"rcsb_latest_update": {"$lt": tU.getDateTimeObj(tS)}})
+            self.__matchD = self.__getReferenceData(self.__databaseName, self.__matchIndexCollectionName, selectD=selectD)
+        #
+        return self.__matchD
+
+    def testCache(self, minMatch=None, logSizes=False):
+        self.getMatchData()
+        okC = bool(self.__matchD)
+        if not okC:
+            return okC
+        logger.info("Reference data cache lengths: matchD %d", len(self.__matchD))
+        if minMatch and len(self.__matchD) < minMatch:
+            return False
+        #
+        if logSizes:
+            logger.info("PubChem MatchD %.2f", getObjSize(self.__matchD) / 1000000.0)
+        return True
+
+    def __getdumpFilePath(self, fmt="json"):
+        stashBaseFileName = "pubchem_match_index_object_list"
+        fExt = ".json" if fmt == "json" else ".pic"
+        fp = os.path.join(self.__dirPath, stashBaseFileName + fExt)
+        return fp
+
+    def dump(self, fmt="json"):
+        """Dump PubChem index reference data from the object store.
+
+        Args:
+            fmt (str, optional): [description]. Defaults to "json".
+
+        Returns:
+            bool: True for success or False otherwise
+        """
+        ok = False
+        try:
+            self.getMatchData()
+            if fmt in ["json", "pickle"]:
+                kwargs = {}
+                fp = self.__getdumpFilePath(fmt=fmt)
+                logger.info("Saving object store to %s", fp)
+                mU = MarshalUtil(workPath=self.__dirPath)
+                if fmt in ["json"]:
+                    kwargs = {"indent": 3}
+                ok = mU.doExport(fp, self.__matchD, fmt=fmt, **kwargs)
+        except Exception as e:
+            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
+        return ok
+
+    def reloadDump(self, fmt="json"):
+        """Reload PubChem reference data store from saved dump.
+
+        Args:
+            fmt (str, optional): format of the backup file (pickle or json). Defaults to "json".
+
+        Returns:
+            (int): number of objects restored.
+        """
+        numUpd = 0
+        try:
+            # Read from disk backup and update object store -
+            if fmt in ["json", "pickle"]:
+                fp = self.__getdumpFilePath(fmt="json")
+                logger.info("Restoring object store from %s", fp)
+                mU = MarshalUtil(workPath=self.__dirPath)
+                matchD = mU.doImport(fp, fmt=fmt)
+                numUpd = self.__reloadDump(matchD, self.__databaseName, self.__matchIndexCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
+        except Exception as e:
+            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
+        # --
+        return numUpd
+
+    def updateMissing(self, expireDays=0, fetchLimit=None, updateUnmatched=True, numProcChemComp=8, numProc=2, **kwargs):
+        """Update match index from object store
+
+        Args:
+            expireDays (int): expiration days on match data (default 0 meaning none)
+            fetchLimit (int): limit to the number of entry updates performed (None)
+            updateUnmatched (bool): Previously unmatched search definitions will be retried on update (default=True)
+            numProcChemComp (int): for rebuilding local ChemComp indices the number processors to apply (default=8)
+            numProc (int): for rebuilding local PubChem indices the number processors to apply (default=2)
+
+        Returns:
+            bool: True for success or False otherwise
+
+        ChemicalIdentifierFields = ("idCode", "identifierSource", "identifierType", "identifier")
+        ChemicalIdentifier = collections.namedtuple("ChemicalIdentifier", ChemicalIdentifierFields, defaults=(None,) * len(ChemicalIdentifierFields))
+
+
+        // Failed matches are recorded with NO matchedIds:
+        {
+            "_id" : ObjectId("5e8dfb49eab967a0483a04a3"),
+            "rcsb_id" : "local reference ID (ccid|bird)", << LOCAL ID
+            "rcsb_last_update" : ISODate("2020-04-08T16:26:48.025+0000"),
+        }
+        """
+        #
+        matchD = {}
+        matchedIdList = []
+        ok = False
+        try:
+            # ---
+            # Get current the indices of source chemical reference data -
+            ok, ccidxP, ccsidxP = self.__rebuildChemCompSourceIndices(numProcChemComp, **kwargs)
+            if not ok:
+                return matchD
+            #
+            ccIdxD = ccidxP.getIndex()
+            searchIdxD = ccsidxP.getIndex()
+            # Index of target of local chemical component and BIRD identifiers
+            sourceIdList = sorted(ccIdxD.keys())
+            logger.info("Reloading chemical reference data (expireDays %r, updateUnmatched %r)", expireDays, updateUnmatched)
+            matchedIdList = self.__getMatchIndexIds(searchIdxD, expireDays=expireDays, updateUnmatched=updateUnmatched)
+            # --
+            logger.info("Starting matched reference identifier count (%d) ", len(matchedIdList))
+            updateIdList = sorted(set(sourceIdList) - set(matchedIdList))
+            logger.info("Missing chemical definition correspondences %d fetchLimit %r", len(updateIdList), fetchLimit)
+            #
+            updateIdList = updateIdList[:fetchLimit] if fetchLimit else updateIdList
+            #
+            if updateIdList:
+                logger.info("Update reference data cache for %d chemical identifiers", len(updateIdList))
+                ok, failList = self.__updateReferenceData(updateIdList, searchIdxD, numProc, **kwargs)
+                logger.info("Update reference data return status is %r missing count %d", ok, len(failList))
+            else:
+                logger.info("No reference data updates required")
+            # --
+            if not ok:
+                logger.warning("updateMissing completed with status %r failures %r", ok, len(failList))
+            #
+            return True
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+        return ok
+
+    def getMatches(self):
+        """Get all PubChem correspondences from the current match index..
+
+        Returns:
+
+            (list): PubChem compound identifier codes.
+
+        """
+        self.getMatchData()
+        #
+        pcidList = []
+        try:
+            pcidS = set()
+            for _, mD in self.__matchD.items():
+                if "matched_ids" in mD:
+                    for sD in mD["matched_ids"]:
+                        pcidS.add(sD["matched_id"])
+            pcidList = list(pcidS)
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+        return pcidList
+
+    def getSelectedMatches(self, **kwargs):
+        """Select preferred PubChem correspondences from the current match index for the input source component build type.
+        and separatel return alternative matches for other source types.
+
+        Args:
+            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])
+            exportPath: (str, optional): export path for correspondences
+
+        Returns:
+            dict, dict : mapD { ccId1: [{'pcId': ... , 'inchiKey': ... }], ccId2: ...},
+                         altD { ccId1: [{'pcId': ... , 'inchiKey': ... 'sourceType': ... }], ccId2: ...}
+
+        Example match index entry:
+        {
+            "_id" : ObjectId("5e8dfb49eab967a0483a0472"),
+            "rcsb_id" : "local reference ID (ccid|bird)", << LOCAL CANONICAL ID (e.g. ATP, PRD_000100)
+            "rcsb_last_update" : ISODate("2020-04-08T16:26:47.993+0000"),
+            "matched_ids" : [
+                {"matched_id": "<external reference ID code>", "search_id_type" : "oe-smiles", "search_id_source": "model-xyz",
+                 'source_index_name': <>, 'source_inchikey': <>, 'source_smiles': <>},
+                {"matched_id": "<external reference ID code>", "search_id_type": ... , "search_id_source": ... , ...}
+            ] ]
+        },
+        }
+        """
+        #
+        self.getMatchData()
+
+        sourceTypes = kwargs.get("sourceTypes", ["model-xyz"])
+        exportPath = kwargs.get("exportPath", None)
+        #
+        mapD = {}
+        altMapD = {}
+        extraMapD = {}
+        try:
+            for ccId, mD in self.__matchD.items():
+                if "matched_ids" in mD:
+                    for sD in mD["matched_ids"]:
+                        #
+                        if sD and "search_id_source" in sD:
+                            pcId = sD["matched_id"]
+                            inchiKey = sD["source_inchikey"]
+                            #
+                            if sD["search_id_source"] in sourceTypes:
+                                mapD.setdefault(ccId, []).append({"pcId": pcId, "inchiKey": inchiKey})
+                            else:
+                                altMapD.setdefault(ccId, []).append({"pcId": pcId, "inchiKey": inchiKey, "sourceType": sD["search_id_source"]})
+            #
+            difS = set(altMapD.keys()) - set(mapD.keys())
+            logger.info("PubChem preferred correspondence length (%d) alternative extras (%d)", len(mapD), len(difS))
+            for ccId in difS:
+                extraMapD[ccId] = altMapD[ccId]
+            if exportPath:
+                fp = os.path.join(exportPath, "pubchem_matches.json")
+                mU = MarshalUtil(workPath=exportPath)
+                mU.doExport(fp, mapD, fmt="json", indent=3)
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+
+        return mapD, extraMapD
+
+    #
+    # -- Extract current data from object store --
+    def __getMatchIndexIds(self, searchIdxD, expireDays=0, updateUnmatched=True):
+        """Get CCD/BIRD reference data identifiers in the current match index subject to an
+        expiration interval (i.e. not matched or older than deltaDays).
+
+        Args:
+            searchIdxD (dict): CCD/BIRD search index dictionary
+            expireDays (int, optional): expiration interval in days. Defaults to 0 (no expiration).
+            updateUnmatched (bool, optional): include only matched identifiers (i.e. exclude any tried but unmatched cases)
+
+        Returns:
+            (list): chemical component/BIRD reference identifier list
+        """
+        selectD = {}
+        if expireDays > 0:
+            tU = TimeUtil()
+            tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
+            selectD.update({"rcsb_latest_update": {"$lt": tU.getDateTimeObj(tS)}})
+        #
+        if updateUnmatched:
+            # Return only cases with an existing correspondence
+            selectD.update({"matched_ids": {"$exists": True}})
+        matchD = self.__getReferenceData(self.__databaseName, self.__matchIndexCollectionName, selectD=selectD if selectD else None)
+        #
+        # For the selected cases in the index-
+        retIdList = []
+        if searchIdxD:
+            # Exclude definitions if source InChIKey in the match index differs with the Key in the current search index.
+            for ccId, inD in matchD.items():
+                if updateUnmatched and "matched_ids" not in inD:
+                    retIdList.append(ccId)
+                    continue
+                hasChanged = False
+                for mD in inD["matched_ids"]:
+                    if mD["source_index_name"] not in searchIdxD:
+                        hasChanged = True
+                        logger.info("Identifier %s no longer in search index", mD["source_index_name"])
+                        break
+                    if mD["source_inchikey"] != searchIdxD[mD["source_index_name"]]["inchi-key"]:
+                        logger.info("Identifier %s InChIKey changed search index", mD["source_index_name"])
+                        hasChanged = True
+                        break
+                if not hasChanged:
+                    retIdList.append(ccId)
+        #
+        return sorted(retIdList)
+
+    #
+    def __getReferenceData(self, databaseName, collectionName, selectD=None, selectionList=None):
+        logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
+        obEx = ObjectExtractor(
+            self.__cfgOb,
+            databaseName=databaseName,
+            collectionName=collectionName,
+            keyAttribute="rcsb_id",
+            uniqueAttributes=["rcsb_id"],
+            selectionQuery=selectD,
+            selectionList=selectionList,
+            stripObjectId=True,
+        )
+        docCount = obEx.getCount()
+        logger.info("Reference data object count %d", docCount)
+        objD = obEx.getObjects()
+        return objD
+
+    def __updateReferenceData(self, idList, searchIdxD, numProc=2, **kwargs):
+        """Launch worker methods to update chemical reference data correspondences.
+
+        Args:
+            idList (list): list of local chemical identifiers (ChemIdentifier())
+
+        Returns:
+            (bool, list): status flag, list of unmatched identifiers
+        """
+        chunkSize = 10
+        exportPath = kwargs.get("exportPath", None)
+        logger.info("Length starting list is %d", len(idList))
+        optD = {"chunkSize": chunkSize, "exportPath": exportPath, "matchIdOnly": True}
+        rWorker = PubChemUpdateWorker(self.__cfgOb, searchIdxD)
+        if numProc > 1:
+            mpu = MultiProcUtil(verbose=True)
+            mpu.setOptions(optD)
+            mpu.set(workerObj=rWorker, workerMethod="updateList")
+            ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
+            logger.info("Multi-proc %r failures %r result lengths %r", ok, len(failList), len(resultList[0]))
+        else:
+            successList, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
+            failList = list(set(idList) - set(successList))
+            ok = len(failList) == 0
+            logger.info("Single-proc status %r failures %r", ok, len(failList))
+        #
+        if len(failList) > 0:
+            if len(failList) <= 100:
+                logger.info("failList: %r", failList)
+            else:
+                logger.info("failList[:100]: %r", failList[:100])
+        #
+        return ok, failList
+
+    def __reloadDump(self, objD, databaseName, collectionName, indexAttributeNames=None):
+        """Internal method to restore the input database/collection using the input data object.
+
+        Args:
+            objD (obj): Target reference or index data object
+            databaseName (str): target database name
+            collectionName (str): target collection name
+            indexAttributeNames (list, optional): Primary index attributes. Defaults to None.
+
+        Returns:
+            int: inserted or updated object count
+        """
+        try:
+            numUpd = 0
+            numTotal = 0
+            updateDL = []
+            for entityKey, obj in objD.items():
+                if "_id" in obj:
+                    obj.pop("_id")
+                selectD = {"rcsb_id": entityKey}
+                updateDL.append({"selectD": selectD, "updateD": obj})
+            #
+            obUpd = ObjectUpdater(self.__cfgOb)
+            ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
+            if ok:
+                numUpd = obUpd.update(databaseName, collectionName, updateDL)
+                logger.debug("Updated object count is %d", numUpd)
+            else:
+                logger.error("Create %s %s failed", databaseName, collectionName)
+            numTotal = obUpd.count(databaseName, collectionName)
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+        #
+        return numTotal
+
+    # --- --- ---
+    # -- Load or rebuild source chemical reference data indices --
+    def __rebuildChemCompSourceIndices(self, numProc, **kwargs):
+        """Rebuild source indices of chemical component definitions."""
+        logger.info("Rebuilding chemical definition index.")
+        ok1, ccidxP = self.__buildChemCompIndex(**kwargs)
+        logger.info("__buildChemCompIndex completed with status %r", ok1)
+        logger.info("Rebuilding chemical search indices.")
+        ok2, ccsidxP = self.__buildChemCompSearchIndex(numProc, **kwargs)
+        logger.info("__buildChemCompSearchIndex completed with status %r", ok2)
+        return ok1 & ok2, ccidxP, ccsidxP
+
+    def __buildChemCompIndex(self, **kwargs):
+        """Build chemical component cache files from the input component dictionaries"""
+        try:
+            molLimit = kwargs.get("molLimit", None)
+            useCache = not kwargs.get("rebuildChemIndices", False)
+            logSizes = kwargs.get("logSizes", False)
+            ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
+            ccUrlTarget = kwargs.get("ccUrlTarget", None)
+            birdUrlTarget = kwargs.get("birdUrlTarget", None)
+            cachePath = kwargs.get("cachePath", self.__cachePath)
+            #
+            ccidxP = ChemCompIndexProvider(
+                ccUrlTarget=ccUrlTarget, birdUrlTarget=birdUrlTarget, cachePath=cachePath, useCache=useCache, molLimit=molLimit, ccFileNamePrefix=ccFileNamePrefix
+            )
+            ok = ccidxP.testCache(minCount=molLimit, logSizes=logSizes)
+            return ok, ccidxP if ok else None
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+        #
+        return False, None
+
+    def __buildChemCompSearchIndex(self, numProc, **kwargs):
+        """Test build search index chemical component cache files from the input component dictionaries"""
+        try:
+            cachePath = kwargs.get("cachePath", self.__cachePath)
+            molLimit = kwargs.get("molLimit", None)
+            useCache = not kwargs.get("rebuildChemIndices", False)
+            logSizes = kwargs.get("logSizes", False)
+            limitPerceptions = kwargs.get("limitPerceptions", False)
+            #
+            chunkSize = kwargs.get("chunkSize", 5)
+            molLimit = kwargs.get("molLimit", None)
+            ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
+            quietFlag = kwargs.get("quietFlag", True)
+            ccUrlTarget = kwargs.get("ccUrlTarget", None)
+            birdUrlTarget = kwargs.get("birdUrlTarget", None)
+            #
+            ccsiP = ChemCompSearchIndexProvider(
+                ccUrlTarget=ccUrlTarget,
+                birdUrlTarget=birdUrlTarget,
+                cachePath=cachePath,
+                useCache=useCache,
+                molLimit=molLimit,
+                ccFileNamePrefix=ccFileNamePrefix,
+                limitPerceptions=limitPerceptions,
+                numProc=numProc,
+                maxChunkSize=chunkSize,
+                quietFlag=quietFlag,
+            )
+            ok = ccsiP.testCache(minCount=molLimit, logSizes=logSizes)
+            return ok, ccsiP if ok else None
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+            return False, None
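The class added above is driven through a small public surface (updateMissing, getMatches/getSelectedMatches, dump/reloadDump). The sketch below is not part of the package; it is a minimal usage outline assuming a configured object-store access object (cfgOb) for the pubchem_exdb database and hypothetical local paths, and it only exercises methods and keyword options that appear in the diff above.

# Minimal usage sketch (illustrative only; cfgOb and the paths are assumptions, not shipped defaults)
from rcsb.exdb.chemref.PubChemIndexCacheProvider import PubChemIndexCacheProvider

cfgOb = ...  # object-store configuration object supplied by the caller (assumed, not shown here)
cachePath = "./CACHE"  # hypothetical local cache directory

pcicP = PubChemIndexCacheProvider(cfgOb, cachePath)
# Rebuild the local CCD/BIRD indices and fetch correspondences missing from the match index.
ok = pcicP.updateMissing(fetchLimit=50, rebuildChemIndices=True, numProc=2)
# Preferred matches for "model-xyz" build types; alternates from other source types are returned separately.
mapD, extraMapD = pcicP.getSelectedMatches(sourceTypes=["model-xyz"])
# Persist the match index to the cache directory so it can later be restored with reloadDump().
ok = pcicP.dump(fmt="json")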