rcsb.exdb 1.28__tar.gz → 1.30__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/HISTORY.txt +2 -0
- {rcsb_exdb-1.28/rcsb.exdb.egg-info → rcsb_exdb-1.30}/PKG-INFO +5 -4
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefEtlWorker.py +9 -6
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefExtractor.py +3 -2
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +73 -72
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/cli/__init__.py +1 -1
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/fixturePdbxLoader.py +11 -4
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectExtractor.py +2 -2
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +1 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testTreeNodeListWorker.py +1 -1
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tree/TreeNodeListWorker.py +87 -127
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/wf/PubChemEtlWorkflow.py +2 -2
- {rcsb_exdb-1.28 → rcsb_exdb-1.30/rcsb.exdb.egg-info}/PKG-INFO +5 -4
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/requires.txt +2 -2
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/requirements.txt +2 -2
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/LICENSE +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/MANIFEST.in +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/README.md +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/branch/GlycanProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/branch/GlycanUtils.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/branch/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemEtlWrapper.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/chemref/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationAdapter.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationUtils.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/citation/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/entry/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/PolymerEntityExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/seq/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationUtils.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanUtils.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPolymerEntityExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tree/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectValidator.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/utils/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/wf/__init__.py +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/SOURCES.txt +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/not-zip-safe +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/top_level.txt +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/setup.cfg +0 -0
- {rcsb_exdb-1.28 → rcsb_exdb-1.30}/setup.py +0 -0
|
@@ -110,3 +110,5 @@
|
|
|
110
110
|
Update Azure pipelines to run on latest macOS and ubuntu version
|
|
111
111
|
23-Jan-2025 V1.27 Update TreeNodeListWorker to index 'id' field
|
|
112
112
|
11-Feb-2025 V1.28 Move ExDB CLI code (workflow, exec, and tests) and Dockerfile to rcsb.workflow to avoid circular imports
|
|
113
|
+
8-Apr-2025 V1.29 Add more logging to PubChemIndexCacheProvider and increase default numProc
|
|
114
|
+
2-Oct-2025 V1.30 Make use of ExDB configuration file for loading drugbank and tree node list DBs/collections and setting indexed fields
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb.exdb
|
|
3
|
-
Version: 1.28
|
|
3
|
+
Version: 1.30
|
|
4
4
|
Summary: RCSB Python ExDB data extraction and loading workflows
|
|
5
5
|
Home-page: https://github.com/rcsb/py-rcsb_exdb
|
|
6
6
|
Author: John Westbrook
|
|
@@ -19,8 +19,8 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: numpy
|
|
20
20
|
Requires-Dist: jsonschema>=2.6.0
|
|
21
21
|
Requires-Dist: rcsb.utils.io>=1.48
|
|
22
|
-
Requires-Dist: rcsb.db>=1.
|
|
23
|
-
Requires-Dist: rcsb.utils.chem>=0.
|
|
22
|
+
Requires-Dist: rcsb.db>=1.808
|
|
23
|
+
Requires-Dist: rcsb.utils.chem>=0.84
|
|
24
24
|
Requires-Dist: rcsb.utils.chemref>=0.91
|
|
25
25
|
Requires-Dist: rcsb.utils.config>=0.40
|
|
26
26
|
Requires-Dist: rcsb.utils.ec>=0.25
|
|
@@ -41,6 +41,7 @@ Dynamic: description
|
|
|
41
41
|
Dynamic: description-content-type
|
|
42
42
|
Dynamic: home-page
|
|
43
43
|
Dynamic: license
|
|
44
|
+
Dynamic: license-file
|
|
44
45
|
Dynamic: provides-extra
|
|
45
46
|
Dynamic: requires-dist
|
|
46
47
|
Dynamic: summary
|
|
@@ -7,6 +7,8 @@
|
|
|
7
7
|
# Updates:
|
|
8
8
|
# 9-Dec-2018 jdw add validation methods
|
|
9
9
|
# 3-Sep-2019 jdw move to rcsb.exdb.chemref
|
|
10
|
+
# 7-Aug-2025 dwp change target DB and collection from "drugbank_core" to "dw" and "core_drugbank" (as part of transition to DW);
|
|
11
|
+
# make use of configuration file for loading drugbank collection and setting indexed fields
|
|
10
12
|
#
|
|
11
13
|
##
|
|
12
14
|
__docformat__ = "google en"
|
|
@@ -66,9 +68,10 @@ class ChemRefEtlWorker(object):
|
|
|
66
68
|
desp = DataExchangeStatus()
|
|
67
69
|
statusStartTimestamp = desp.setStartTime()
|
|
68
70
|
addValues = {}
|
|
71
|
+
collectionGroupName = "core_drugbank"
|
|
69
72
|
#
|
|
70
73
|
if extResource == "DrugBank":
|
|
71
|
-
|
|
74
|
+
databaseNameMongo = self.__schP.getDatabaseMongoName(collectionGroupName=collectionGroupName)
|
|
72
75
|
configName = self.__cfgOb.getDefaultSectionName()
|
|
73
76
|
user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
|
|
74
77
|
pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
|
|
@@ -81,10 +84,10 @@ class ChemRefEtlWorker(object):
|
|
|
81
84
|
#
|
|
82
85
|
logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
|
|
83
86
|
logger.debug("Objects %r", dList[:2])
|
|
84
|
-
|
|
87
|
+
_, _, collectionList, docIndexD = self.__schP.getSchemaInfo(collectionGroupName=collectionGroupName)
|
|
85
88
|
collectionName = collectionList[0] if collectionList else "unassigned"
|
|
86
|
-
|
|
87
|
-
logger.info("Database %r collection %r index attributes %r",
|
|
89
|
+
indexDL = docIndexD[collectionName] if collectionName in docIndexD else []
|
|
90
|
+
logger.info("Database %r collection %r index attributes %r", databaseNameMongo, collectionName, indexDL)
|
|
88
91
|
#
|
|
89
92
|
# For some reason, 'addValues' was being overwritten with an empty dict (https://github.com/rcsb/py-rcsb_exdb/commit/26bd79e9a2fffc97c034b4116dece9248d1c1f39)
|
|
90
93
|
# Will need to review this -- do we want to add the schema version values or not? (Also, see similar logic in UniProtCoreEtlWorker.py)
|
|
@@ -103,8 +106,8 @@ class ChemRefEtlWorker(object):
|
|
|
103
106
|
readBackCheck=self.__readBackCheck,
|
|
104
107
|
)
|
|
105
108
|
#
|
|
106
|
-
ok = dl.load(
|
|
107
|
-
self.__updateStatus(updateId,
|
|
109
|
+
ok = dl.load(databaseNameMongo, collectionName, loadType=loadType, documentList=dList, keyNames=None, addValues=addValues, indexDL=indexDL)
|
|
110
|
+
self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
|
|
108
111
|
|
|
109
112
|
return True
|
|
110
113
|
except Exception as e:
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
# Updates:
|
|
8
8
|
# 7-Jan-2019 jdw moved from ChemRefEtlWorker.
|
|
9
9
|
# 3-Sep-2019 jdw moved again to module rcsb.exdb.chemref
|
|
10
|
+
# 14-Aug-2025 dwp rename bird_chem_comp_core to core_chem_comp
|
|
10
11
|
#
|
|
11
12
|
##
|
|
12
13
|
__docformat__ = "google en"
|
|
@@ -42,8 +43,8 @@ class ChemRefExtractor(object):
|
|
|
42
43
|
"""
|
|
43
44
|
idD = {}
|
|
44
45
|
try:
|
|
45
|
-
databaseName = "
|
|
46
|
-
collectionName = "bird_chem_comp_core"
|
|
46
|
+
databaseName = "dw"
|
|
47
|
+
collectionName = "core_chem_comp"
|
|
47
48
|
selectD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
|
|
48
49
|
selectionList = ["rcsb_id", "rcsb_chem_comp_related"]
|
|
49
50
|
logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
# 16-Jul-2020 jdw separate index and reference data management.
|
|
10
10
|
# 23-Jul-2021 jdw Make PubChemIndexCacheProvider a subclass of StashableBase()
|
|
11
11
|
# 2-Mar-2023 aae Return correct status from Single proc
|
|
12
|
+
# 8-Apr-2025 dwp Let MultiProc handle chunking; add more logging to debug slowness on west coast
|
|
12
13
|
#
|
|
13
14
|
##
|
|
14
15
|
__docformat__ = "google en"
|
|
@@ -100,84 +101,82 @@ class PubChemUpdateWorker(object):
|
|
|
100
101
|
#
|
|
101
102
|
"""
|
|
102
103
|
_ = workingDir
|
|
103
|
-
chunkSize = optionsD.get("chunkSize", 50)
|
|
104
104
|
matchIdOnly = optionsD.get("matchIdOnly", True)
|
|
105
105
|
# Path to store raw request data -
|
|
106
106
|
exportPath = optionsD.get("exportPath", None)
|
|
107
107
|
#
|
|
108
108
|
successList = []
|
|
109
|
-
retList1 = []
|
|
110
|
-
retList2 = []
|
|
111
109
|
diagList = []
|
|
112
|
-
|
|
110
|
+
failList = []
|
|
111
|
+
retList = []
|
|
113
112
|
#
|
|
114
113
|
try:
|
|
114
|
+
startTime = time.time()
|
|
115
115
|
tU = TimeUtil()
|
|
116
|
-
ccIdList = dataList
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
#
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
116
|
+
ccIdList = dataList # len(dataList) should be of size chunkSize
|
|
117
|
+
logger.info("%s search starting for %d reference definitions (matchIdOnly %r exportPath %r)", procName, len(ccIdList), matchIdOnly, exportPath)
|
|
118
|
+
tIdxDL = []
|
|
119
|
+
timeS = tU.getDateTimeObj(tU.getTimestamp())
|
|
120
|
+
for ccId in ccIdList:
|
|
121
|
+
# Get various forms from the search index -
|
|
122
|
+
chemIdList = self.__genChemIdList(ccId)
|
|
123
|
+
tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
|
|
124
|
+
#
|
|
125
|
+
mL = []
|
|
126
|
+
for chemId in chemIdList:
|
|
127
|
+
stA = time.time()
|
|
128
|
+
ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
|
|
128
129
|
#
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
|
|
133
|
-
#
|
|
134
|
-
if not ok:
|
|
135
|
-
etA = time.time()
|
|
136
|
-
logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
|
|
137
|
-
|
|
138
|
-
#
|
|
139
|
-
if ok and refDL:
|
|
140
|
-
for tD in refDL:
|
|
141
|
-
pcId = tD["cid"]
|
|
142
|
-
inchiKey = (
|
|
143
|
-
self.__searchIdxD[chemId.indexName]["inchi-key"]
|
|
144
|
-
if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
|
|
145
|
-
else None
|
|
146
|
-
)
|
|
147
|
-
smiles = (
|
|
148
|
-
self.__searchIdxD[chemId.indexName]["smiles"] if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName] else None
|
|
149
|
-
)
|
|
150
|
-
mL.append(
|
|
151
|
-
{
|
|
152
|
-
"matched_id": pcId,
|
|
153
|
-
"search_id_type": chemId.identifierType,
|
|
154
|
-
"search_id_source": chemId.identifierSource,
|
|
155
|
-
"source_index_name": chemId.indexName,
|
|
156
|
-
"source_smiles": smiles,
|
|
157
|
-
"source_inchikey": inchiKey,
|
|
158
|
-
}
|
|
159
|
-
)
|
|
160
|
-
# tD.update({"rcsb_id": pcId, "rcsb_last_update": timeS})
|
|
161
|
-
# tDL.append(tD)
|
|
130
|
+
if not ok:
|
|
131
|
+
etA = time.time()
|
|
132
|
+
logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
|
|
162
133
|
#
|
|
163
|
-
if
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
134
|
+
if ok and refDL:
|
|
135
|
+
for tD in refDL:
|
|
136
|
+
pcId = tD["cid"]
|
|
137
|
+
inchiKey = (
|
|
138
|
+
self.__searchIdxD[chemId.indexName]["inchi-key"]
|
|
139
|
+
if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
|
|
140
|
+
else None
|
|
141
|
+
)
|
|
142
|
+
smiles = (
|
|
143
|
+
self.__searchIdxD[chemId.indexName]["smiles"] if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName] else None
|
|
144
|
+
)
|
|
145
|
+
mL.append(
|
|
146
|
+
{
|
|
147
|
+
"matched_id": pcId,
|
|
148
|
+
"search_id_type": chemId.identifierType,
|
|
149
|
+
"search_id_source": chemId.identifierSource,
|
|
150
|
+
"source_index_name": chemId.indexName,
|
|
151
|
+
"source_smiles": smiles,
|
|
152
|
+
"source_inchikey": inchiKey,
|
|
153
|
+
}
|
|
154
|
+
)
|
|
155
|
+
#
|
|
156
|
+
if mL:
|
|
157
|
+
tIdxD["matched_ids"] = mL
|
|
158
|
+
successList.append(ccId)
|
|
159
|
+
else:
|
|
160
|
+
logger.info("No match result for any form of %s", ccId)
|
|
161
|
+
#
|
|
162
|
+
tIdxDL.append(tIdxD)
|
|
163
|
+
# --
|
|
164
|
+
failList = sorted(set(dataList) - set(successList))
|
|
165
|
+
if failList:
|
|
166
|
+
logger.info("%s returns %d definitions with failures: %r", procName, len(failList), failList)
|
|
167
|
+
# --
|
|
168
|
+
endTime = time.time()
|
|
169
|
+
logger.info("%s completed updateList len %r duration %.3f secs", procName, len(ccIdList), endTime - startTime)
|
|
170
|
+
startTimeL = time.time()
|
|
171
|
+
logger.info("Saving dataList (len=%d)", len(ccIdList))
|
|
172
|
+
self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
|
|
173
|
+
endTimeL = time.time()
|
|
174
|
+
logger.info("Saved chunk (len=%d) in %.3f secs", len(ccIdList), endTimeL - startTimeL)
|
|
176
175
|
except Exception as e:
|
|
177
176
|
logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
|
|
178
|
-
logger.info("%s dataList length %d success length %d
|
|
177
|
+
logger.info("%s dataList length %d success length %d retList %d", procName, len(dataList), len(successList), len(retList))
|
|
179
178
|
#
|
|
180
|
-
return successList,
|
|
179
|
+
return successList, retList, diagList
|
|
181
180
|
|
|
182
181
|
def __updateObjectStore(self, databaseName, collectionName, objDL):
|
|
183
182
|
updateDL = []
|
|
@@ -196,10 +195,6 @@ class PubChemUpdateWorker(object):
|
|
|
196
195
|
ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
|
|
197
196
|
return ok
|
|
198
197
|
|
|
199
|
-
def __chunker(self, iList, chunkSize):
|
|
200
|
-
chunkSize = max(1, chunkSize)
|
|
201
|
-
return (iList[i: i + chunkSize] for i in range(0, len(iList), chunkSize))
|
|
202
|
-
|
|
203
198
|
|
|
204
199
|
class PubChemIndexCacheProvider(StashableBase):
|
|
205
200
|
"""Utilities to manage chemical component/BIRD to PubChem compound identifier mapping data."""
|
|
@@ -515,7 +510,7 @@ class PubChemIndexCacheProvider(StashableBase):
|
|
|
515
510
|
Returns:
|
|
516
511
|
(bool, list): status flag, list of unmatched identifiers
|
|
517
512
|
"""
|
|
518
|
-
chunkSize =
|
|
513
|
+
chunkSize = 10
|
|
519
514
|
exportPath = kwargs.get("exportPath", None)
|
|
520
515
|
logger.info("Length starting list is %d", len(idList))
|
|
521
516
|
optD = {"chunkSize": chunkSize, "exportPath": exportPath, "matchIdOnly": True}
|
|
@@ -524,14 +519,20 @@ class PubChemIndexCacheProvider(StashableBase):
|
|
|
524
519
|
mpu = MultiProcUtil(verbose=True)
|
|
525
520
|
mpu.setOptions(optD)
|
|
526
521
|
mpu.set(workerObj=rWorker, workerMethod="updateList")
|
|
527
|
-
ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=
|
|
528
|
-
logger.info("Multi-proc %r failures %r result lengths %r
|
|
522
|
+
ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
|
|
523
|
+
logger.info("Multi-proc %r failures %r result lengths %r", ok, len(failList), len(resultList[0]))
|
|
529
524
|
else:
|
|
530
|
-
successList, _, _
|
|
525
|
+
successList, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
|
|
531
526
|
failList = list(set(idList) - set(successList))
|
|
532
527
|
ok = len(failList) == 0
|
|
533
528
|
logger.info("Single-proc status %r failures %r", ok, len(failList))
|
|
534
529
|
#
|
|
530
|
+
if len(failList) > 0:
|
|
531
|
+
if len(failList) <= 100:
|
|
532
|
+
logger.info("failList: %r", failList)
|
|
533
|
+
else:
|
|
534
|
+
logger.info("failList[:100]: %r", failList[:100])
|
|
535
|
+
#
|
|
535
536
|
return ok, failList
|
|
536
537
|
|
|
537
538
|
def __reloadDump(self, objD, databaseName, collectionName, indexAttributeNames=None):
|
|
@@ -162,7 +162,9 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
162
162
|
]
|
|
163
163
|
self.__ldList = [
|
|
164
164
|
{
|
|
165
|
-
"databaseName": "
|
|
165
|
+
# "databaseName": "dw",
|
|
166
|
+
"collectionGroupName": "core_chem_comp",
|
|
167
|
+
"contentType": "bird_chem_comp_core",
|
|
166
168
|
"collectionNameList": None,
|
|
167
169
|
"loadType": "full",
|
|
168
170
|
"mergeContentTypes": None,
|
|
@@ -170,7 +172,9 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
170
172
|
"inputIdCodeList": self.__birdChemCompCoreIdList
|
|
171
173
|
},
|
|
172
174
|
{
|
|
173
|
-
"databaseName": "pdbx_core",
|
|
175
|
+
# "databaseName": "pdbx_core",
|
|
176
|
+
"collectionGroupName": "pdbx_core",
|
|
177
|
+
"contentType": "pdbx_core",
|
|
174
178
|
"collectionNameList": None,
|
|
175
179
|
"loadType": "replace",
|
|
176
180
|
"mergeContentTypes": ["vrpt"],
|
|
@@ -179,6 +183,8 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
179
183
|
},
|
|
180
184
|
# {
|
|
181
185
|
# "databaseName": "pdbx_comp_model_core",
|
|
186
|
+
# "collectionGroupName": "pdbx_comp_model_core",
|
|
187
|
+
# "contentType": "pdbx_comp_model_core",
|
|
182
188
|
# "collectionNameList": None,
|
|
183
189
|
# "loadType": "full",
|
|
184
190
|
# "mergeContentTypes": None,
|
|
@@ -220,7 +226,7 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
220
226
|
"""Wrapper for the PDBx loader module"""
|
|
221
227
|
ok = False
|
|
222
228
|
try:
|
|
223
|
-
logger.info("Loading %s", kwargs["databaseName"])
|
|
229
|
+
logger.info("Loading %s", kwargs["collectionGroupName"])
|
|
224
230
|
mw = PdbxLoader(
|
|
225
231
|
self.__cfgOb,
|
|
226
232
|
cachePath=self.__cachePath,
|
|
@@ -235,8 +241,9 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
235
241
|
rebuildSchemaFlag=False,
|
|
236
242
|
)
|
|
237
243
|
ok = mw.load(
|
|
238
|
-
kwargs["databaseName"],
|
|
244
|
+
collectionGroupName=kwargs["collectionGroupName"],
|
|
239
245
|
collectionLoadList=kwargs["collectionNameList"],
|
|
246
|
+
contentType=kwargs["contentType"],
|
|
240
247
|
loadType=kwargs["loadType"],
|
|
241
248
|
inputPathList=None,
|
|
242
249
|
inputIdCodeList=kwargs["inputIdCodeList"],
|
|
@@ -81,8 +81,8 @@ class ObjectExtractorTests(unittest.TestCase):
|
|
|
81
81
|
try:
|
|
82
82
|
obEx = ObjectExtractor(
|
|
83
83
|
self.__cfgOb,
|
|
84
|
-
databaseName="
|
|
85
|
-
collectionName="
|
|
84
|
+
databaseName="dw",
|
|
85
|
+
collectionName="core_chem_comp",
|
|
86
86
|
cacheFilePath=os.path.join(self.__workPath, "drugbank-mapping-cache.json"),
|
|
87
87
|
useCache=False,
|
|
88
88
|
cacheKwargs=self.__testEntryCacheKwargs,
|
|
@@ -60,6 +60,7 @@ class ReferenceSequenceAnnotationAdapterTests(unittest.TestCase):
|
|
|
60
60
|
endTime = time.time()
|
|
61
61
|
logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
|
|
62
62
|
|
|
63
|
+
# NOTE: IF YOU DISABLE THE TEST BELOW, THEN 'testReferenceCacheProvider' FAILS. CHECK WHETHER ALL 'Reference' PROVIDERS CAN BE DISABLED.
|
|
63
64
|
# @unittest.skip("Disable test - no longer using in production, and fails too frequently with 'Bad xml text' when fetching from UniProt")
|
|
64
65
|
def testAnnotationAdapter(self):
|
|
65
66
|
"""Test case - create and read cache reference sequences assignments and related data."""
|
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
# 8-Aug-2023 dwp Load full (unfiltered) taxonomy tree node list, and stop loading GO tree (will be loaded in DW instead)
|
|
11
11
|
# 27-Aug-2024 dwp Update CARD ontology tree loading
|
|
12
12
|
# 23-Jan-2025 dwp Change indexed field from 'update_id' to 'id'
|
|
13
|
+
# 7-Aug-2025 dwp Change target DB and collection names to "dw" and "tree_*" (via configuration file);
|
|
14
|
+
# Make use of configuration file for loading tree node lists and setting indexed fields
|
|
13
15
|
#
|
|
14
16
|
##
|
|
15
17
|
__docformat__ = "google en"
|
|
@@ -32,8 +34,6 @@ from rcsb.utils.struct.ScopClassificationProvider import ScopClassificationProvi
|
|
|
32
34
|
from rcsb.utils.struct.Scop2ClassificationProvider import Scop2ClassificationProvider
|
|
33
35
|
from rcsb.utils.taxonomy.TaxonomyProvider import TaxonomyProvider
|
|
34
36
|
from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor
|
|
35
|
-
# from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
|
|
36
|
-
# from rcsb.exdb.seq.AnnotationExtractor import AnnotationExtractor
|
|
37
37
|
|
|
38
38
|
logger = logging.getLogger(__name__)
|
|
39
39
|
|
|
@@ -76,37 +76,28 @@ class TreeNodeListWorker(object):
|
|
|
76
76
|
Relevant configuration options:
|
|
77
77
|
|
|
78
78
|
tree_node_lists_configuration:
|
|
79
|
-
DATABASE_NAME:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
79
|
+
DATABASE_NAME: dw
|
|
80
|
+
COLLECTION_VERSION_STRING: 2.1.0
|
|
81
|
+
COLLECTION_NAME_LIST:
|
|
82
|
+
- tree_taxonomy
|
|
83
|
+
- tree_ec
|
|
84
|
+
- tree_scop
|
|
85
|
+
- tree_scop2
|
|
86
|
+
- tree_cath
|
|
87
|
+
- tree_atc
|
|
88
|
+
- tree_card
|
|
89
|
+
- tree_ecod
|
|
90
|
+
COLLECTION_INDICES:
|
|
91
|
+
- INDEX_NAME: primary
|
|
92
|
+
ATTRIBUTE_NAMES:
|
|
93
|
+
- id
|
|
94
|
+
- INDEX_NAME: index_2
|
|
95
|
+
ATTRIBUTE_NAMES:
|
|
96
|
+
- parents
|
|
87
97
|
"""
|
|
88
98
|
try:
|
|
89
99
|
useCache = self.__useCache
|
|
90
100
|
#
|
|
91
|
-
# if not useCache:
|
|
92
|
-
# cDL = ["domains_struct", "NCBI", "ec", "go", "atc"]
|
|
93
|
-
# for cD in cDL:
|
|
94
|
-
# try:
|
|
95
|
-
# cfp = os.path.join(self.__cachePath, cD)
|
|
96
|
-
# os.makedirs(cfp, 0o755)
|
|
97
|
-
# except Exception:
|
|
98
|
-
# pass
|
|
99
|
-
# #
|
|
100
|
-
# try:
|
|
101
|
-
# cfp = os.path.join(self.__cachePath, cD)
|
|
102
|
-
# fpL = glob.glob(os.path.join(cfp, "*"))
|
|
103
|
-
# if fpL:
|
|
104
|
-
# for fp in fpL:
|
|
105
|
-
# os.remove(fp)
|
|
106
|
-
# except Exception:
|
|
107
|
-
# pass
|
|
108
|
-
#
|
|
109
|
-
#
|
|
110
101
|
logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
|
|
111
102
|
#
|
|
112
103
|
self.__statusList = []
|
|
@@ -124,65 +115,77 @@ class TreeNodeListWorker(object):
|
|
|
124
115
|
readBackCheck=self.__readBackCheck,
|
|
125
116
|
)
|
|
126
117
|
#
|
|
127
|
-
|
|
118
|
+
sectionName = "tree_node_lists_configuration"
|
|
119
|
+
databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
|
|
120
|
+
collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
|
|
121
|
+
collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
|
|
122
|
+
# databaseNameMongo = 'dw'
|
|
123
|
+
# collectionNameList = ['tree_taxonomy', 'tree_ec', 'tree_scop', 'tree_scop2', 'tree_cath', 'tree_atc', 'tree_card', 'tree_ecod', 'tree_go']
|
|
124
|
+
# collectionIndexList = [{'INDEX_NAME': 'primary', 'ATTRIBUTE_NAMES': ['id']}, {'INDEX_NAME': 'index_2', 'ATTRIBUTE_NAMES': ['parents']}]
|
|
125
|
+
|
|
128
126
|
# collectionVersion = self.__cfgOb.get("COLLECTION_VERSION_STRING", sectionName=sectionName)
|
|
129
127
|
# addValues = {"_schema_version": collectionVersion}
|
|
130
128
|
addValues = None
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
129
|
+
|
|
130
|
+
ok = True
|
|
131
|
+
for collectionName in collectionNameList:
|
|
132
|
+
nL = self.__getTreeDocList(collectionName, useCache)
|
|
133
|
+
if nL and doLoad:
|
|
134
|
+
ok = dl.load(
|
|
135
|
+
databaseNameMongo,
|
|
136
|
+
collectionName,
|
|
137
|
+
loadType=loadType,
|
|
138
|
+
documentList=nL,
|
|
139
|
+
keyNames=None,
|
|
140
|
+
addValues=addValues,
|
|
141
|
+
schemaLevel=None,
|
|
142
|
+
indexDL=collectionIndexList
|
|
143
|
+
) and ok
|
|
144
|
+
self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
|
|
145
|
+
logger.info(
|
|
146
|
+
"Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
|
|
147
|
+
databaseNameMongo, collectionName, len(nL), ok
|
|
148
|
+
)
|
|
149
|
+
# ---
|
|
150
|
+
logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
|
|
151
|
+
return True
|
|
152
|
+
except Exception as e:
|
|
153
|
+
logger.exception("Failing with %s", str(e))
|
|
154
|
+
return False
|
|
155
|
+
|
|
156
|
+
def __checkTaxonNodeList(self, nL):
|
|
157
|
+
eCount = 0
|
|
158
|
+
tD = {dD["id"]: True for dD in nL}
|
|
159
|
+
for dD in nL:
|
|
160
|
+
if "parents" in dD:
|
|
161
|
+
pId = dD["parents"][0]
|
|
162
|
+
if pId not in tD:
|
|
163
|
+
logger.info("Missing parent for taxon %d", pId)
|
|
164
|
+
eCount += 1
|
|
165
|
+
else:
|
|
166
|
+
logger.info("No parents for node %r", dD["id"])
|
|
167
|
+
|
|
168
|
+
def getLoadStatus(self):
|
|
169
|
+
return self.__statusList
|
|
170
|
+
|
|
171
|
+
def __getTreeDocList(self, collectionName, useCache):
|
|
172
|
+
nL = []
|
|
173
|
+
if collectionName.lower() == "tree_cath":
|
|
146
174
|
ccu = CathClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
147
175
|
nL = ccu.getTreeNodeList()
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
153
|
-
# ---- SCOP
|
|
176
|
+
elif collectionName.lower() == "tree_scop2":
|
|
177
|
+
scu2 = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
178
|
+
nL = scu2.getTreeNodeList()
|
|
179
|
+
elif collectionName.lower() == "tree_scop":
|
|
154
180
|
scu = ScopClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
155
181
|
nL = scu.getTreeNodeList()
|
|
156
|
-
|
|
157
|
-
if doLoad:
|
|
158
|
-
collectionName = "tree_scop_node_list"
|
|
159
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
160
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
161
|
-
# --- SCOP2
|
|
162
|
-
scu = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
163
|
-
nL = scu.getTreeNodeList()
|
|
164
|
-
logger.info("Starting load SCOP2 node tree length %d", len(nL))
|
|
165
|
-
if doLoad:
|
|
166
|
-
collectionName = "tree_scop2_node_list"
|
|
167
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
168
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
169
|
-
# ---- Ecod
|
|
182
|
+
elif collectionName.lower() == "tree_ecod":
|
|
170
183
|
ecu = EcodClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
171
184
|
nL = ecu.getTreeNodeList()
|
|
172
|
-
|
|
173
|
-
if doLoad:
|
|
174
|
-
collectionName = "tree_ecod_node_list"
|
|
175
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
176
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
177
|
-
# ---- EC
|
|
185
|
+
elif collectionName.lower() == "tree_ec":
|
|
178
186
|
edbu = EnzymeDatabaseProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
179
187
|
nL = edbu.getTreeNodeList()
|
|
180
|
-
|
|
181
|
-
if doLoad:
|
|
182
|
-
collectionName = "tree_ec_node_list"
|
|
183
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
184
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
185
|
-
# ---- CARD
|
|
188
|
+
elif collectionName.lower() == "tree_card":
|
|
186
189
|
okCou = True
|
|
187
190
|
cou = CARDTargetOntologyProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
188
191
|
if not cou.testCache():
|
|
@@ -193,21 +196,7 @@ class TreeNodeListWorker(object):
|
|
|
193
196
|
okCou = False
|
|
194
197
|
if okCou:
|
|
195
198
|
nL = cou.getTreeNodeList()
|
|
196
|
-
|
|
197
|
-
if doLoad:
|
|
198
|
-
collectionName = "tree_card_node_list"
|
|
199
|
-
ok = dl.load(
|
|
200
|
-
databaseName,
|
|
201
|
-
collectionName,
|
|
202
|
-
loadType=loadType,
|
|
203
|
-
documentList=nL,
|
|
204
|
-
indexAttributeList=["id"],
|
|
205
|
-
keyNames=None,
|
|
206
|
-
addValues=addValues,
|
|
207
|
-
schemaLevel=None
|
|
208
|
-
)
|
|
209
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
210
|
-
# ---- Taxonomy
|
|
199
|
+
elif collectionName.lower() == "tree_taxonomy":
|
|
211
200
|
tU = TaxonomyProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
212
201
|
if self.__useFilteredLists:
|
|
213
202
|
# Get the taxon coverage in the current data set -
|
|
@@ -226,43 +215,14 @@ class TreeNodeListWorker(object):
|
|
|
226
215
|
# Get the full taxon node list without filtering
|
|
227
216
|
nL = tU.exportNodeList()
|
|
228
217
|
self.__checkTaxonNodeList(nL)
|
|
229
|
-
|
|
230
|
-
if doLoad:
|
|
231
|
-
collectionName = "tree_taxonomy_node_list"
|
|
232
|
-
logger.debug("Taxonomy nodes (%d) %r", len(nL), nL[:5])
|
|
233
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
234
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
235
|
-
logger.info("Tree loading operations completed.")
|
|
236
|
-
#
|
|
237
|
-
# --- ATC
|
|
218
|
+
elif collectionName.lower() == "tree_atc":
|
|
238
219
|
crEx = ChemRefExtractor(self.__cfgOb)
|
|
239
220
|
atcFilterD = crEx.getChemCompAccessionMapping("ATC")
|
|
240
221
|
logger.info("Length of ATC filter %d", len(atcFilterD))
|
|
241
222
|
atcP = AtcProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
242
223
|
nL = atcP.getTreeNodeList(filterD=atcFilterD)
|
|
243
|
-
|
|
244
|
-
logger.
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
# ---
|
|
249
|
-
logger.info("Completed tree node list loading operations.\n")
|
|
250
|
-
return True
|
|
251
|
-
except Exception as e:
|
|
252
|
-
logger.exception("Failing with %s", str(e))
|
|
253
|
-
return False
|
|
254
|
-
|
|
255
|
-
def __checkTaxonNodeList(self, nL):
|
|
256
|
-
eCount = 0
|
|
257
|
-
tD = {dD["id"]: True for dD in nL}
|
|
258
|
-
for dD in nL:
|
|
259
|
-
if "parents" in dD:
|
|
260
|
-
pId = dD["parents"][0]
|
|
261
|
-
if pId not in tD:
|
|
262
|
-
logger.info("Missing parent for taxon %d", pId)
|
|
263
|
-
eCount += 1
|
|
264
|
-
else:
|
|
265
|
-
logger.info("No parents for node %r", dD["id"])
|
|
266
|
-
|
|
267
|
-
def getLoadStatus(self):
|
|
268
|
-
return self.__statusList
|
|
224
|
+
else:
|
|
225
|
+
logger.error("Unsupported tree node collection %r", collectionName)
|
|
226
|
+
#
|
|
227
|
+
logger.info("Gathered tree nodes for loading collection %s (length %d)", collectionName, len(nL))
|
|
228
|
+
return nL
|
|
@@ -165,7 +165,7 @@ class PubChemEtlWorkflow(object):
|
|
|
165
165
|
birdUrlTarget = kwargs.get("birdUrlTarget", None)
|
|
166
166
|
ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
|
|
167
167
|
numProcChemComp = kwargs.get("numProcChemComp", 8)
|
|
168
|
-
numProc = kwargs.get("numProc",
|
|
168
|
+
numProc = kwargs.get("numProc", 4)
|
|
169
169
|
rebuildChemIndices = kwargs.get("rebuildChemIndices", True)
|
|
170
170
|
exportPath = kwargs.get("exportPath", None)
|
|
171
171
|
useStash = kwargs.get("useStash", True)
|
|
@@ -209,7 +209,7 @@ class PubChemEtlWorkflow(object):
|
|
|
209
209
|
try:
|
|
210
210
|
ok1 = ok2 = ok3 = ok4 = ok5 = ok6 = False
|
|
211
211
|
# --
|
|
212
|
-
numProc = kwargs.get("numProc",
|
|
212
|
+
numProc = kwargs.get("numProc", 4)
|
|
213
213
|
useStash = kwargs.get("useStash", True)
|
|
214
214
|
useGit = kwargs.get("useGit", False)
|
|
215
215
|
#
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb.exdb
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.30
|
|
4
4
|
Summary: RCSB Python ExDB data extraction and loading workflows
|
|
5
5
|
Home-page: https://github.com/rcsb/py-rcsb_exdb
|
|
6
6
|
Author: John Westbrook
|
|
@@ -19,8 +19,8 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: numpy
|
|
20
20
|
Requires-Dist: jsonschema>=2.6.0
|
|
21
21
|
Requires-Dist: rcsb.utils.io>=1.48
|
|
22
|
-
Requires-Dist: rcsb.db>=1.
|
|
23
|
-
Requires-Dist: rcsb.utils.chem>=0.
|
|
22
|
+
Requires-Dist: rcsb.db>=1.808
|
|
23
|
+
Requires-Dist: rcsb.utils.chem>=0.84
|
|
24
24
|
Requires-Dist: rcsb.utils.chemref>=0.91
|
|
25
25
|
Requires-Dist: rcsb.utils.config>=0.40
|
|
26
26
|
Requires-Dist: rcsb.utils.ec>=0.25
|
|
@@ -41,6 +41,7 @@ Dynamic: description
|
|
|
41
41
|
Dynamic: description-content-type
|
|
42
42
|
Dynamic: home-page
|
|
43
43
|
Dynamic: license
|
|
44
|
+
Dynamic: license-file
|
|
44
45
|
Dynamic: provides-extra
|
|
45
46
|
Dynamic: requires-dist
|
|
46
47
|
Dynamic: summary
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py
RENAMED
|
File without changes
|
{rcsb_exdb-1.28 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|