PyPI - rcsb.exdb - Versions diffs - 1.28__tar.gz → 1.30__tar.gz - Mend

rcsb.exdb 1.28 (tar.gz) → 1.30 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (92)

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/HISTORY.txt RENAMED Viewed

@@ -110,3 +110,5 @@
                   Update Azure pipelines to run on latest macOS and ubuntu version
 23-Jan-2025 V1.27 Update TreeNodeListWorker to index 'id' field
 11-Feb-2025 V1.28 Move ExDB CLI code (workflow, exec, and tests) and Dockerfile to rcsb.workflow to avoid circular imports
+ 8-Apr-2025 V1.29 Add more logging to PubChemIndexCacheProvider and increase default numProc
+ 2-Oct-2025 V1.30 Make use of ExDB configuration file for loading drugbank and tree node list DBs/collections and setting indexed fields

{rcsb_exdb-1.28/rcsb.exdb.egg-info → rcsb_exdb-1.30}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: rcsb.exdb
-Version: 1.28
+Version: 1.30
 Summary: RCSB Python ExDB data extraction and loading workflows
 Home-page: https://github.com/rcsb/py-rcsb_exdb
 Author: John Westbrook
@@ -19,8 +19,8 @@ License-File: LICENSE
 Requires-Dist: numpy
 Requires-Dist: jsonschema>=2.6.0
 Requires-Dist: rcsb.utils.io>=1.48
-Requires-Dist: rcsb.db>=1.800
-Requires-Dist: rcsb.utils.chem>=0.81
+Requires-Dist: rcsb.db>=1.808
+Requires-Dist: rcsb.utils.chem>=0.84
 Requires-Dist: rcsb.utils.chemref>=0.91
 Requires-Dist: rcsb.utils.config>=0.40
 Requires-Dist: rcsb.utils.ec>=0.25
@@ -41,6 +41,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: summary

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefEtlWorker.py RENAMED Viewed

@@ -7,6 +7,8 @@
 # Updates:
 #  9-Dec-2018  jdw add validation methods
 #  3-Sep-2019  jdw move to rcsb.exdb.chemref
+#  7-Aug-2025  dwp change target DB and collection from "drugbank_core" to "dw" and "core_drugbank" (as part of transition to DW);
+#                  make use of configuration file for loading drugbank collection and setting indexed fields
 #
 ##
 __docformat__ = "google en"
@@ -66,9 +68,10 @@ class ChemRefEtlWorker(object):
             desp = DataExchangeStatus()
             statusStartTimestamp = desp.setStartTime()
             addValues = {}
+            collectionGroupName = "core_drugbank"
             #
             if extResource == "DrugBank":
-                databaseName = "drugbank_core"
+                databaseNameMongo = self.__schP.getDatabaseMongoName(collectionGroupName=collectionGroupName)
                 configName = self.__cfgOb.getDefaultSectionName()
                 user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
                 pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
@@ -81,10 +84,10 @@ class ChemRefEtlWorker(object):
                 #
                 logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
                 logger.debug("Objects %r", dList[:2])
-                sD, _, collectionList, _ = self.__schP.getSchemaInfo(databaseName)
+                _, _, collectionList, docIndexD = self.__schP.getSchemaInfo(collectionGroupName=collectionGroupName)
                 collectionName = collectionList[0] if collectionList else "unassigned"
-                indexL = sD.getDocumentIndex(collectionName, "primary")
-                logger.info("Database %r collection %r index attributes %r", databaseName, collectionName, indexL)
+                indexDL = docIndexD[collectionName] if collectionName in docIndexD else []
+                logger.info("Database %r collection %r index attributes %r", databaseNameMongo, collectionName, indexDL)
                 #
                 # For some reason, 'addValues' was being overwritten with an empty dict (https://github.com/rcsb/py-rcsb_exdb/commit/26bd79e9a2fffc97c034b4116dece9248d1c1f39)
                 # Will need to review this -- do we want to add the schema version values or not? (Also, see similar logic in UniProtCoreEtlWorker.py)
@@ -103,8 +106,8 @@ class ChemRefEtlWorker(object):
                 readBackCheck=self.__readBackCheck,
             )
             #
-            ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
-            self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
+            ok = dl.load(databaseNameMongo, collectionName, loadType=loadType, documentList=dList, keyNames=None, addValues=addValues, indexDL=indexDL)
+            self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
             return True
         except Exception as e:

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefExtractor.py RENAMED Viewed

@@ -7,6 +7,7 @@
 # Updates:
 #  7-Jan-2019  jdw moved from ChemRefEtlWorker.
 #  3-Sep-2019  jdw moved again to module rcsb.exdb.chemref
+# 14-Aug-2025  dwp rename bird_chem_comp_core to core_chem_comp
 #
 ##
 __docformat__ = "google en"
@@ -42,8 +43,8 @@ class ChemRefExtractor(object):
         """
         idD = {}
         try:
-            databaseName = "bird_chem_comp_core"
-            collectionName = "bird_chem_comp_core"
+            databaseName = "dw"
+            collectionName = "core_chem_comp"
             selectD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
             selectionList = ["rcsb_id", "rcsb_chem_comp_related"]
             logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py RENAMED Viewed

@@ -9,6 +9,7 @@
 # 16-Jul-2020 jdw separate index and reference data management.
 # 23-Jul-2021 jdw Make PubChemIndexCacheProvider a subclass of StashableBase()
 #  2-Mar-2023 aae Return correct status from Single proc
+#  8-Apr-2025 dwp Let MultiProc handle chunking; add more logging to debug slowness on west coast
 #
 ##
 __docformat__ = "google en"
@@ -100,84 +101,82 @@ class PubChemUpdateWorker(object):
            #
         """
         _ = workingDir
-        chunkSize = optionsD.get("chunkSize", 50)
         matchIdOnly = optionsD.get("matchIdOnly", True)
         # Path to store raw request data -
         exportPath = optionsD.get("exportPath", None)
         #
         successList = []
-        retList1 = []
-        retList2 = []
         diagList = []
-        emptyList = []
+        failList = []
+        retList = []
         #
         try:
+            startTime = time.time()
             tU = TimeUtil()
-            ccIdList = dataList
-            numChunks = len(list(self.__chunker(ccIdList, chunkSize)))
-            logger.info("%s search starting for %d reference definitions (in chunks of length %d)", procName, len(ccIdList), chunkSize)
-            for ii, ccIdChunk in enumerate(self.__chunker(ccIdList, chunkSize), 1):
-                logger.info("%s starting chunk for %d of %d", procName, ii, numChunks)
-                # tDL = []
-                tIdxDL = []
-                timeS = tU.getDateTimeObj(tU.getTimestamp())
-                for ccId in ccIdChunk:
-                    # Get various forms from the search index -
-                    chemIdList = self.__genChemIdList(ccId)
-                    tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
+            ccIdList = dataList  # len(dataList) should be of size chunkSize
+            logger.info("%s search starting for %d reference definitions (matchIdOnly %r exportPath %r)", procName, len(ccIdList), matchIdOnly, exportPath)
+            tIdxDL = []
+            timeS = tU.getDateTimeObj(tU.getTimestamp())
+            for ccId in ccIdList:
+                # Get various forms from the search index -
+                chemIdList = self.__genChemIdList(ccId)
+                tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
+                #
+                mL = []
+                for chemId in chemIdList:
+                    stA = time.time()
+                    ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
                     #
-                    mL = []
-                    for chemId in chemIdList:
-                        stA = time.time()
-                        ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
-                        #
-                        if not ok:
-                            etA = time.time()
-                            logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
-                        #
-                        if ok and refDL:
-                            for tD in refDL:
-                                pcId = tD["cid"]
-                                inchiKey = (
-                                    self.__searchIdxD[chemId.indexName]["inchi-key"]
-                                    if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
-                                    else None
-                                )
-                                smiles = (
-                                    self.__searchIdxD[chemId.indexName]["smiles"] if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName] else None
-                                )
-                                mL.append(
-                                    {
-                                        "matched_id": pcId,
-                                        "search_id_type": chemId.identifierType,
-                                        "search_id_source": chemId.identifierSource,
-                                        "source_index_name": chemId.indexName,
-                                        "source_smiles": smiles,
-                                        "source_inchikey": inchiKey,
-                                    }
-                                )
-                                # tD.update({"rcsb_id": pcId, "rcsb_last_update": timeS})
-                                # tDL.append(tD)
+                    if not ok:
+                        etA = time.time()
+                        logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
                     #
-                    if mL:
-                        tIdxD["matched_ids"] = mL
-                        successList.append(ccId)
-                    else:
-                        logger.info("No match result for any form of %s", ccId)
-                    #
-                    tIdxDL.append(tIdxD)
-                # --
-                startTimeL = time.time()
-                logger.info("Saving chunk %d (len=%d)", ii, len(ccIdChunk))
-                self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
-                endTimeL = time.time()
-                logger.info("Saved chunk %d (len=%d) in %.3f secs", ii, len(ccIdChunk), endTimeL - startTimeL)
+                    if ok and refDL:
+                        for tD in refDL:
+                            pcId = tD["cid"]
+                            inchiKey = (
+                                self.__searchIdxD[chemId.indexName]["inchi-key"]
+                                if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
+                                else None
+                            )
+                            smiles = (
+                                self.__searchIdxD[chemId.indexName]["smiles"] if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName] else None
+                            )
+                            mL.append(
+                                {
+                                    "matched_id": pcId,
+                                    "search_id_type": chemId.identifierType,
+                                    "search_id_source": chemId.identifierSource,
+                                    "source_index_name": chemId.indexName,
+                                    "source_smiles": smiles,
+                                    "source_inchikey": inchiKey,
+                                }
+                            )
+                #
+                if mL:
+                    tIdxD["matched_ids"] = mL
+                    successList.append(ccId)
+                else:
+                    logger.info("No match result for any form of %s", ccId)
+                #
+                tIdxDL.append(tIdxD)
+            # --
+            failList = sorted(set(dataList) - set(successList))
+            if failList:
+                logger.info("%s returns %d definitions with failures: %r", procName, len(failList), failList)
+            # --
+            endTime = time.time()
+            logger.info("%s completed updateList len %r duration %.3f secs", procName, len(ccIdList), endTime - startTime)
+            startTimeL = time.time()
+            logger.info("Saving dataList (len=%d)", len(ccIdList))
+            self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
+            endTimeL = time.time()
+            logger.info("Saved chunk (len=%d) in %.3f secs", len(ccIdList), endTimeL - startTimeL)
         except Exception as e:
             logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
-        logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
+        logger.info("%s dataList length %d success length %d retList %d", procName, len(dataList), len(successList), len(retList))
         #
-        return successList, emptyList, emptyList, diagList
+        return successList, retList, diagList
     def __updateObjectStore(self, databaseName, collectionName, objDL):
         updateDL = []
@@ -196,10 +195,6 @@ class PubChemUpdateWorker(object):
         ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
         return ok
-    def __chunker(self, iList, chunkSize):
-        chunkSize = max(1, chunkSize)
-        return (iList[i: i + chunkSize] for i in range(0, len(iList), chunkSize))
 class PubChemIndexCacheProvider(StashableBase):
     """Utilities to manage chemical component/BIRD to PubChem compound identifier mapping data."""
@@ -515,7 +510,7 @@ class PubChemIndexCacheProvider(StashableBase):
         Returns:
             (bool, list): status flag, list of unmatched identifiers
         """
-        chunkSize = 50
+        chunkSize = 10
         exportPath = kwargs.get("exportPath", None)
         logger.info("Length starting list is %d", len(idList))
         optD = {"chunkSize": chunkSize, "exportPath": exportPath, "matchIdOnly": True}
@@ -524,14 +519,20 @@ class PubChemIndexCacheProvider(StashableBase):
             mpu = MultiProcUtil(verbose=True)
             mpu.setOptions(optD)
             mpu.set(workerObj=rWorker, workerMethod="updateList")
-            ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
-            logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
+            ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
+            logger.info("Multi-proc %r failures %r result lengths %r", ok, len(failList), len(resultList[0]))
         else:
-            successList, _, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
+            successList, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
             failList = list(set(idList) - set(successList))
             ok = len(failList) == 0
             logger.info("Single-proc status %r failures %r", ok, len(failList))
         #
+        if len(failList) > 0:
+            if len(failList) <= 100:
+                logger.info("failList: %r", failList)
+            else:
+                logger.info("failList[:100]: %r", failList[:100])
+        #
         return ok, failList
     def __reloadDump(self, objD, databaseName, collectionName, indexAttributeNames=None):

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/cli/__init__.py RENAMED Viewed

@@ -2,4 +2,4 @@ __docformat__ = "google en"
 __author__ = "John Westbrook"
 __email__ = "john.westbrook@rcsb.org"
 __license__ = "Apache 2.0"
-__version__ = "1.28"
+__version__ = "1.30"

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/fixturePdbxLoader.py RENAMED Viewed

@@ -162,7 +162,9 @@ class PdbxLoaderFixture(unittest.TestCase):
         ]
         self.__ldList = [
             {
-                "databaseName": "bird_chem_comp_core",
+                # "databaseName": "dw",
+                "collectionGroupName": "core_chem_comp",
+                "contentType": "bird_chem_comp_core",
                 "collectionNameList": None,
                 "loadType": "full",
                 "mergeContentTypes": None,
@@ -170,7 +172,9 @@ class PdbxLoaderFixture(unittest.TestCase):
                 "inputIdCodeList": self.__birdChemCompCoreIdList
             },
             {
-                "databaseName": "pdbx_core",
+                # "databaseName": "pdbx_core",
+                "collectionGroupName": "pdbx_core",
+                "contentType": "pdbx_core",
                 "collectionNameList": None,
                 "loadType": "replace",
                 "mergeContentTypes": ["vrpt"],
@@ -179,6 +183,8 @@ class PdbxLoaderFixture(unittest.TestCase):
             },
             # {
             #     "databaseName": "pdbx_comp_model_core",
+            #     "collectionGroupName": "pdbx_comp_model_core",
+            #     "contentType": "pdbx_comp_model_core",
             #     "collectionNameList": None,
             #     "loadType": "full",
             #     "mergeContentTypes": None,
@@ -220,7 +226,7 @@ class PdbxLoaderFixture(unittest.TestCase):
         """Wrapper for the PDBx loader module"""
         ok = False
         try:
-            logger.info("Loading %s", kwargs["databaseName"])
+            logger.info("Loading %s", kwargs["collectionGroupName"])
             mw = PdbxLoader(
                 self.__cfgOb,
                 cachePath=self.__cachePath,
@@ -235,8 +241,9 @@ class PdbxLoaderFixture(unittest.TestCase):
                 rebuildSchemaFlag=False,
             )
             ok = mw.load(
-                kwargs["databaseName"],
+                collectionGroupName=kwargs["collectionGroupName"],
                 collectionLoadList=kwargs["collectionNameList"],
+                contentType=kwargs["contentType"],
                 loadType=kwargs["loadType"],
                 inputPathList=None,
                 inputIdCodeList=kwargs["inputIdCodeList"],

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectExtractor.py RENAMED Viewed

@@ -81,8 +81,8 @@ class ObjectExtractorTests(unittest.TestCase):
         try:
             obEx = ObjectExtractor(
                 self.__cfgOb,
-                databaseName="bird_chem_comp_core",
-                collectionName="bird_chem_comp_core",
+                databaseName="dw",
+                collectionName="core_chem_comp",
                 cacheFilePath=os.path.join(self.__workPath, "drugbank-mapping-cache.json"),
                 useCache=False,
                 cacheKwargs=self.__testEntryCacheKwargs,

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py RENAMED Viewed

@@ -60,6 +60,7 @@ class ReferenceSequenceAnnotationAdapterTests(unittest.TestCase):
         endTime = time.time()
         logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
+    # NOTE: IF YOU DISABLE THE TEST BELOW, THEN 'testReferenceCacheProvider' FAILS. CHECK WHETHER ALL 'Reference' PROVIDERS CAN BE DISABLED.
     # @unittest.skip("Disable test - no longer using in production, and fails too frequently with 'Bad xml text' when fetching from UniProt")
     def testAnnotationAdapter(self):
         """Test case - create and read cache reference sequences assignments and related data."""

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testTreeNodeListWorker.py RENAMED Viewed

@@ -1,5 +1,5 @@
 ##
-# File:    TreeNodeListWorkerTests.py
+# File:    testTreeNodeListWorker.py
 # Author:  J. Westbrook
 # Date:    23-Apr-2019
 #

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tree/TreeNodeListWorker.py RENAMED Viewed

@@ -10,6 +10,8 @@
 #  8-Aug-2023 dwp Load full (unfiltered) taxonomy tree node list, and stop loading GO tree (will be loaded in DW instead)
 # 27-Aug-2024 dwp Update CARD ontology tree loading
 # 23-Jan-2025 dwp Change indexed field from 'update_id' to 'id'
+#  7-Aug-2025 dwp Change target DB and collection names to "dw" and "tree_*" (via configuration file);
+#                 Make use of configuration file for loading tree node lists and setting indexed fields
 #
 ##
 __docformat__ = "google en"
@@ -32,8 +34,6 @@ from rcsb.utils.struct.ScopClassificationProvider import ScopClassificationProvi
 from rcsb.utils.struct.Scop2ClassificationProvider import Scop2ClassificationProvider
 from rcsb.utils.taxonomy.TaxonomyProvider import TaxonomyProvider
 from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor
-# from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
-# from rcsb.exdb.seq.AnnotationExtractor import AnnotationExtractor
 logger = logging.getLogger(__name__)
@@ -76,37 +76,28 @@ class TreeNodeListWorker(object):
         Relevant configuration options:
         tree_node_lists_configuration:
-            DATABASE_NAME: tree_node_lists
-            DATABASE_VERSION_STRING: v5
-            COLLECTION_VERSION_STRING: 1.0.0
-            COLLECTION_TAXONOMY: tree_taxonomy_node_list
-            COLLECTION_ENZYME: tree_ec_node_list
-            COLLECTION_SCOP: tree_scop_node_list
-            COLLECTION_CATH: tree_cath_node_list
+            DATABASE_NAME: dw
+            COLLECTION_VERSION_STRING: 2.1.0
+            COLLECTION_NAME_LIST:
+                - tree_taxonomy
+                - tree_ec
+                - tree_scop
+                - tree_scop2
+                - tree_cath
+                - tree_atc
+                - tree_card
+                - tree_ecod
+            COLLECTION_INDICES:
+                - INDEX_NAME: primary
+                ATTRIBUTE_NAMES:
+                    - id
+                - INDEX_NAME: index_2
+                ATTRIBUTE_NAMES:
+                    - parents
         """
         try:
             useCache = self.__useCache
             #
-            # if not useCache:
-            #    cDL = ["domains_struct", "NCBI", "ec", "go", "atc"]
-            #    for cD in cDL:
-            #        try:
-            #            cfp = os.path.join(self.__cachePath, cD)
-            #            os.makedirs(cfp, 0o755)
-            #        except Exception:
-            #            pass
-            #        #
-            #        try:
-            #            cfp = os.path.join(self.__cachePath, cD)
-            #            fpL = glob.glob(os.path.join(cfp, "*"))
-            #            if fpL:
-            #                for fp in fpL:
-            #                    os.remove(fp)
-            #        except Exception:
-            #            pass
-            #
-            #
             logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
             #
             self.__statusList = []
@@ -124,65 +115,77 @@ class TreeNodeListWorker(object):
                 readBackCheck=self.__readBackCheck,
             )
             #
-            databaseName = "tree_node_lists"
+            sectionName = "tree_node_lists_configuration"
+            databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
+            collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
+            collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
+            # databaseNameMongo = 'dw'
+            # collectionNameList = ['tree_taxonomy', 'tree_ec', 'tree_scop', 'tree_scop2', 'tree_cath', 'tree_atc', 'tree_card', 'tree_ecod', 'tree_go']
+            # collectionIndexList = [{'INDEX_NAME': 'primary', 'ATTRIBUTE_NAMES': ['id']}, {'INDEX_NAME': 'index_2', 'ATTRIBUTE_NAMES': ['parents']}]
             # collectionVersion = self.__cfgOb.get("COLLECTION_VERSION_STRING", sectionName=sectionName)
             # addValues = {"_schema_version": collectionVersion}
             addValues = None
-            #
-            # --- GO - TURNED OFF 08 Aug 2023 dwp (tree is now loaded in DW)
-            # goP = GeneOntologyProvider(goDirPath=os.path.join(self.__cachePath, "go"), useCache=useCache)
-            # ok = goP.testCache()
-            # anEx = AnnotationExtractor(self.__cfgOb)
-            # goIdL = anEx.getUniqueIdentifiers("GO")
-            # logger.info("Unique GO assignments %d", len(goIdL))
-            # nL = goP.exportTreeNodeList(goIdL)
-            # logger.info("GO tree node list length %d", len(nL))
-            # if doLoad:
-            #     collectionName = "tree_go_node_list"
-            #     ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-            #     self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            #
-            # ---- CATH
+            ok = True
+            for collectionName in collectionNameList:
+                nL = self.__getTreeDocList(collectionName, useCache)
+                if nL and doLoad:
+                    ok = dl.load(
+                        databaseNameMongo,
+                        collectionName,
+                        loadType=loadType,
+                        documentList=nL,
+                        keyNames=None,
+                        addValues=addValues,
+                        schemaLevel=None,
+                        indexDL=collectionIndexList
+                    ) and ok
+                    self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
+                logger.info(
+                    "Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
+                    databaseNameMongo, collectionName, len(nL), ok
+                )
+            # ---
+            logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
+            return True
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+        return False
+    def __checkTaxonNodeList(self, nL):
+        eCount = 0
+        tD = {dD["id"]: True for dD in nL}
+        for dD in nL:
+            if "parents" in dD:
+                pId = dD["parents"][0]
+                if pId not in tD:
+                    logger.info("Missing parent for taxon %d", pId)
+                    eCount += 1
+            else:
+                logger.info("No parents for node %r", dD["id"])
+    def getLoadStatus(self):
+        return self.__statusList
+    def __getTreeDocList(self, collectionName, useCache):
+        nL = []
+        if collectionName.lower() == "tree_cath":
             ccu = CathClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
             nL = ccu.getTreeNodeList()
-            logger.info("Starting load SCOP node tree length %d", len(nL))
-            if doLoad:
-                collectionName = "tree_cath_node_list"
-                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            # ---- SCOP
+        elif collectionName.lower() == "tree_scop2":
+            scu2 = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
+            nL = scu2.getTreeNodeList()
+        elif collectionName.lower() == "tree_scop":
             scu = ScopClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
             nL = scu.getTreeNodeList()
-            logger.info("Starting load SCOP node tree length %d", len(nL))
-            if doLoad:
-                collectionName = "tree_scop_node_list"
-                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            # --- SCOP2
-            scu = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
-            nL = scu.getTreeNodeList()
-            logger.info("Starting load SCOP2 node tree length %d", len(nL))
-            if doLoad:
-                collectionName = "tree_scop2_node_list"
-                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            # ---- Ecod
+        elif collectionName.lower() == "tree_ecod":
             ecu = EcodClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
             nL = ecu.getTreeNodeList()
-            logger.info("Starting load ECOD node tree length %d", len(nL))
-            if doLoad:
-                collectionName = "tree_ecod_node_list"
-                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            # ---- EC
+        elif collectionName.lower() == "tree_ec":
             edbu = EnzymeDatabaseProvider(cachePath=self.__cachePath, useCache=useCache)
             nL = edbu.getTreeNodeList()
-            logger.info("Starting load of EC node tree length %d", len(nL))
-            if doLoad:
-                collectionName = "tree_ec_node_list"
-                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            # ---- CARD
+        elif collectionName.lower() == "tree_card":
             okCou = True
             cou = CARDTargetOntologyProvider(cachePath=self.__cachePath, useCache=useCache)
             if not cou.testCache():
@@ -193,21 +196,7 @@ class TreeNodeListWorker(object):
                     okCou = False
             if okCou:
                 nL = cou.getTreeNodeList()
-                logger.info("Starting load of CARD ontology node tree length %d", len(nL))
-                if doLoad:
-                    collectionName = "tree_card_node_list"
-                    ok = dl.load(
-                        databaseName,
-                        collectionName,
-                        loadType=loadType,
-                        documentList=nL,
-                        indexAttributeList=["id"],
-                        keyNames=None,
-                        addValues=addValues,
-                        schemaLevel=None
-                    )
-                    self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            # ---- Taxonomy
+        elif collectionName.lower() == "tree_taxonomy":
             tU = TaxonomyProvider(cachePath=self.__cachePath, useCache=useCache)
             if self.__useFilteredLists:
                 # Get the taxon coverage in the current data set -
@@ -226,43 +215,14 @@ class TreeNodeListWorker(object):
                 # Get the full taxon node list without filtering
                 nL = tU.exportNodeList()
             self.__checkTaxonNodeList(nL)
-            logger.info("Starting load of taxonomy node tree length %d", len(nL))
-            if doLoad:
-                collectionName = "tree_taxonomy_node_list"
-                logger.debug("Taxonomy nodes (%d) %r", len(nL), nL[:5])
-                ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-                self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            logger.info("Tree loading operations completed.")
-            #
-            # ---  ATC
+        elif collectionName.lower() == "tree_atc":
             crEx = ChemRefExtractor(self.__cfgOb)
             atcFilterD = crEx.getChemCompAccessionMapping("ATC")
             logger.info("Length of ATC filter %d", len(atcFilterD))
             atcP = AtcProvider(cachePath=self.__cachePath, useCache=useCache)
             nL = atcP.getTreeNodeList(filterD=atcFilterD)
-            collectionName = "tree_atc_node_list"
-            logger.debug("ATC node list length %d %r", len(nL), nL[:5])
-            ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
-            self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
-            #
-            # ---
-            logger.info("Completed tree node list loading operations.\n")
-            return True
-        except Exception as e:
-            logger.exception("Failing with %s", str(e))
-        return False
-    def __checkTaxonNodeList(self, nL):
-        eCount = 0
-        tD = {dD["id"]: True for dD in nL}
-        for dD in nL:
-            if "parents" in dD:
-                pId = dD["parents"][0]
-                if pId not in tD:
-                    logger.info("Missing parent for taxon %d", pId)
-                    eCount += 1
-            else:
-                logger.info("No parents for node %r", dD["id"])
-    def getLoadStatus(self):
-        return self.__statusList
+        else:
+            logger.error("Unsupported tree node collection %r", collectionName)
+        #
+        logger.info("Gathered tree nodes for loading collection %s (length %d)", collectionName, len(nL))
+        return nL

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/wf/PubChemEtlWorkflow.py RENAMED Viewed

@@ -165,7 +165,7 @@ class PubChemEtlWorkflow(object):
             birdUrlTarget = kwargs.get("birdUrlTarget", None)
             ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
             numProcChemComp = kwargs.get("numProcChemComp", 8)
-            numProc = kwargs.get("numProc", 2)
+            numProc = kwargs.get("numProc", 4)
             rebuildChemIndices = kwargs.get("rebuildChemIndices", True)
             exportPath = kwargs.get("exportPath", None)
             useStash = kwargs.get("useStash", True)
@@ -209,7 +209,7 @@ class PubChemEtlWorkflow(object):
         try:
             ok1 = ok2 = ok3 = ok4 = ok5 = ok6 = False
             #  --
-            numProc = kwargs.get("numProc", 2)
+            numProc = kwargs.get("numProc", 4)
             useStash = kwargs.get("useStash", True)
             useGit = kwargs.get("useGit", False)
             #

{rcsb_exdb-1.28 → rcsb_exdb-1.30/rcsb.exdb.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: rcsb.exdb
-Version: 1.28
+Version: 1.30
 Summary: RCSB Python ExDB data extraction and loading workflows
 Home-page: https://github.com/rcsb/py-rcsb_exdb
 Author: John Westbrook
@@ -19,8 +19,8 @@ License-File: LICENSE
 Requires-Dist: numpy
 Requires-Dist: jsonschema>=2.6.0
 Requires-Dist: rcsb.utils.io>=1.48
-Requires-Dist: rcsb.db>=1.800
-Requires-Dist: rcsb.utils.chem>=0.81
+Requires-Dist: rcsb.db>=1.808
+Requires-Dist: rcsb.utils.chem>=0.84
 Requires-Dist: rcsb.utils.chemref>=0.91
 Requires-Dist: rcsb.utils.config>=0.40
 Requires-Dist: rcsb.utils.ec>=0.25
@@ -41,6 +41,7 @@ Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: summary

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/requires.txt RENAMED Viewed

@@ -1,8 +1,8 @@
 numpy
 jsonschema>=2.6.0
 rcsb.utils.io>=1.48
-rcsb.db>=1.800
-rcsb.utils.chem>=0.81
+rcsb.db>=1.808
+rcsb.utils.chem>=0.84
 rcsb.utils.chemref>=0.91
 rcsb.utils.config>=0.40
 rcsb.utils.ec>=0.25

{rcsb_exdb-1.28 → rcsb_exdb-1.30}/requirements.txt RENAMED Viewed

@@ -4,8 +4,8 @@
 numpy
 jsonschema >= 2.6.0
 rcsb.utils.io >= 1.48
-rcsb.db >= 1.800
-rcsb.utils.chem >= 0.81
+rcsb.db >= 1.808
+rcsb.utils.chem >= 0.84
 rcsb.utils.chemref >= 0.91
 rcsb.utils.config >= 0.40
 rcsb.utils.ec >= 0.25