rcsb.exdb 1.29__tar.gz → 1.30__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/HISTORY.txt +1 -0
- {rcsb_exdb-1.29/rcsb.exdb.egg-info → rcsb_exdb-1.30}/PKG-INFO +3 -3
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefEtlWorker.py +9 -6
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefExtractor.py +3 -2
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/cli/__init__.py +1 -1
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/fixturePdbxLoader.py +11 -4
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectExtractor.py +2 -2
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +1 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testTreeNodeListWorker.py +1 -1
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tree/TreeNodeListWorker.py +87 -127
- {rcsb_exdb-1.29 → rcsb_exdb-1.30/rcsb.exdb.egg-info}/PKG-INFO +3 -3
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/requires.txt +2 -2
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/requirements.txt +2 -2
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/LICENSE +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/MANIFEST.in +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/README.md +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/GlycanProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/GlycanUtils.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemEtlWrapper.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationAdapter.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationUtils.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/entry/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/PolymerEntityExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationUtils.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanUtils.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPolymerEntityExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tree/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectValidator.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/PubChemEtlWorkflow.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/__init__.py +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/SOURCES.txt +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/not-zip-safe +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/top_level.txt +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/setup.cfg +0 -0
- {rcsb_exdb-1.29 → rcsb_exdb-1.30}/setup.py +0 -0
|
@@ -111,3 +111,4 @@
|
|
|
111
111
|
23-Jan-2025 V1.27 Update TreeNodeListWorker to index 'id' field
|
|
112
112
|
11-Feb-2025 V1.28 Move ExDB CLI code (workflow, exec, and tests) and Dockerfile to rcsb.workflow to avoid circular imports
|
|
113
113
|
8-Apr-2025 V1.29 Add more logging to PubChemIndexCacheProvider and increase default numProc
|
|
114
|
+
2-Oct-2025 V1.30 Make use of ExDB configuration file for loading drugbank and tree node list DBs/collections and setting indexed fields
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb.exdb
|
|
3
|
-
Version: 1.29
|
|
3
|
+
Version: 1.30
|
|
4
4
|
Summary: RCSB Python ExDB data extraction and loading workflows
|
|
5
5
|
Home-page: https://github.com/rcsb/py-rcsb_exdb
|
|
6
6
|
Author: John Westbrook
|
|
@@ -19,8 +19,8 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: numpy
|
|
20
20
|
Requires-Dist: jsonschema>=2.6.0
|
|
21
21
|
Requires-Dist: rcsb.utils.io>=1.48
|
|
22
|
-
Requires-Dist: rcsb.db>=1.
|
|
23
|
-
Requires-Dist: rcsb.utils.chem>=0.
|
|
22
|
+
Requires-Dist: rcsb.db>=1.808
|
|
23
|
+
Requires-Dist: rcsb.utils.chem>=0.84
|
|
24
24
|
Requires-Dist: rcsb.utils.chemref>=0.91
|
|
25
25
|
Requires-Dist: rcsb.utils.config>=0.40
|
|
26
26
|
Requires-Dist: rcsb.utils.ec>=0.25
|
|
@@ -7,6 +7,8 @@
|
|
|
7
7
|
# Updates:
|
|
8
8
|
# 9-Dec-2018 jdw add validation methods
|
|
9
9
|
# 3-Sep-2019 jdw move to rcsb.exdb.chemref
|
|
10
|
+
# 7-Aug-2025 dwp change target DB and collection from "drugbank_core" to "dw" and "core_drugbank" (as part of transition to DW);
|
|
11
|
+
# make use of configuration file for loading drugbank collection and setting indexed fields
|
|
10
12
|
#
|
|
11
13
|
##
|
|
12
14
|
__docformat__ = "google en"
|
|
@@ -66,9 +68,10 @@ class ChemRefEtlWorker(object):
|
|
|
66
68
|
desp = DataExchangeStatus()
|
|
67
69
|
statusStartTimestamp = desp.setStartTime()
|
|
68
70
|
addValues = {}
|
|
71
|
+
collectionGroupName = "core_drugbank"
|
|
69
72
|
#
|
|
70
73
|
if extResource == "DrugBank":
|
|
71
|
-
|
|
74
|
+
databaseNameMongo = self.__schP.getDatabaseMongoName(collectionGroupName=collectionGroupName)
|
|
72
75
|
configName = self.__cfgOb.getDefaultSectionName()
|
|
73
76
|
user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
|
|
74
77
|
pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
|
|
@@ -81,10 +84,10 @@ class ChemRefEtlWorker(object):
|
|
|
81
84
|
#
|
|
82
85
|
logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
|
|
83
86
|
logger.debug("Objects %r", dList[:2])
|
|
84
|
-
|
|
87
|
+
_, _, collectionList, docIndexD = self.__schP.getSchemaInfo(collectionGroupName=collectionGroupName)
|
|
85
88
|
collectionName = collectionList[0] if collectionList else "unassigned"
|
|
86
|
-
|
|
87
|
-
logger.info("Database %r collection %r index attributes %r",
|
|
89
|
+
indexDL = docIndexD[collectionName] if collectionName in docIndexD else []
|
|
90
|
+
logger.info("Database %r collection %r index attributes %r", databaseNameMongo, collectionName, indexDL)
|
|
88
91
|
#
|
|
89
92
|
# For some reason, 'addValues' was being overwritten with an empty dict (https://github.com/rcsb/py-rcsb_exdb/commit/26bd79e9a2fffc97c034b4116dece9248d1c1f39)
|
|
90
93
|
# Will need to review this -- do we want to add the schema version values or not? (Also, see similar logic in UniProtCoreEtlWorker.py)
|
|
@@ -103,8 +106,8 @@ class ChemRefEtlWorker(object):
|
|
|
103
106
|
readBackCheck=self.__readBackCheck,
|
|
104
107
|
)
|
|
105
108
|
#
|
|
106
|
-
ok = dl.load(
|
|
107
|
-
self.__updateStatus(updateId,
|
|
109
|
+
ok = dl.load(databaseNameMongo, collectionName, loadType=loadType, documentList=dList, keyNames=None, addValues=addValues, indexDL=indexDL)
|
|
110
|
+
self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
|
|
108
111
|
|
|
109
112
|
return True
|
|
110
113
|
except Exception as e:
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
# Updates:
|
|
8
8
|
# 7-Jan-2019 jdw moved from ChemRefEtlWorker.
|
|
9
9
|
# 3-Sep-2019 jdw moved again to module rcsb.exdb.chemref
|
|
10
|
+
# 14-Aug-2025 dwp rename bird_chem_comp_core to core_chem_comp
|
|
10
11
|
#
|
|
11
12
|
##
|
|
12
13
|
__docformat__ = "google en"
|
|
@@ -42,8 +43,8 @@ class ChemRefExtractor(object):
|
|
|
42
43
|
"""
|
|
43
44
|
idD = {}
|
|
44
45
|
try:
|
|
45
|
-
databaseName = "
|
|
46
|
-
collectionName = "
|
|
46
|
+
databaseName = "dw"
|
|
47
|
+
collectionName = "core_chem_comp"
|
|
47
48
|
selectD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
|
|
48
49
|
selectionList = ["rcsb_id", "rcsb_chem_comp_related"]
|
|
49
50
|
logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
|
|
@@ -162,7 +162,9 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
162
162
|
]
|
|
163
163
|
self.__ldList = [
|
|
164
164
|
{
|
|
165
|
-
"databaseName": "
|
|
165
|
+
# "databaseName": "dw",
|
|
166
|
+
"collectionGroupName": "core_chem_comp",
|
|
167
|
+
"contentType": "bird_chem_comp_core",
|
|
166
168
|
"collectionNameList": None,
|
|
167
169
|
"loadType": "full",
|
|
168
170
|
"mergeContentTypes": None,
|
|
@@ -170,7 +172,9 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
170
172
|
"inputIdCodeList": self.__birdChemCompCoreIdList
|
|
171
173
|
},
|
|
172
174
|
{
|
|
173
|
-
"databaseName": "pdbx_core",
|
|
175
|
+
# "databaseName": "pdbx_core",
|
|
176
|
+
"collectionGroupName": "pdbx_core",
|
|
177
|
+
"contentType": "pdbx_core",
|
|
174
178
|
"collectionNameList": None,
|
|
175
179
|
"loadType": "replace",
|
|
176
180
|
"mergeContentTypes": ["vrpt"],
|
|
@@ -179,6 +183,8 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
179
183
|
},
|
|
180
184
|
# {
|
|
181
185
|
# "databaseName": "pdbx_comp_model_core",
|
|
186
|
+
# "collectionGroupName": "pdbx_comp_model_core",
|
|
187
|
+
# "contentType": "pdbx_comp_model_core",
|
|
182
188
|
# "collectionNameList": None,
|
|
183
189
|
# "loadType": "full",
|
|
184
190
|
# "mergeContentTypes": None,
|
|
@@ -220,7 +226,7 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
220
226
|
"""Wrapper for the PDBx loader module"""
|
|
221
227
|
ok = False
|
|
222
228
|
try:
|
|
223
|
-
logger.info("Loading %s", kwargs["
|
|
229
|
+
logger.info("Loading %s", kwargs["collectionGroupName"])
|
|
224
230
|
mw = PdbxLoader(
|
|
225
231
|
self.__cfgOb,
|
|
226
232
|
cachePath=self.__cachePath,
|
|
@@ -235,8 +241,9 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
235
241
|
rebuildSchemaFlag=False,
|
|
236
242
|
)
|
|
237
243
|
ok = mw.load(
|
|
238
|
-
kwargs["
|
|
244
|
+
collectionGroupName=kwargs["collectionGroupName"],
|
|
239
245
|
collectionLoadList=kwargs["collectionNameList"],
|
|
246
|
+
contentType=kwargs["contentType"],
|
|
240
247
|
loadType=kwargs["loadType"],
|
|
241
248
|
inputPathList=None,
|
|
242
249
|
inputIdCodeList=kwargs["inputIdCodeList"],
|
|
@@ -81,8 +81,8 @@ class ObjectExtractorTests(unittest.TestCase):
|
|
|
81
81
|
try:
|
|
82
82
|
obEx = ObjectExtractor(
|
|
83
83
|
self.__cfgOb,
|
|
84
|
-
databaseName="
|
|
85
|
-
collectionName="
|
|
84
|
+
databaseName="dw",
|
|
85
|
+
collectionName="core_chem_comp",
|
|
86
86
|
cacheFilePath=os.path.join(self.__workPath, "drugbank-mapping-cache.json"),
|
|
87
87
|
useCache=False,
|
|
88
88
|
cacheKwargs=self.__testEntryCacheKwargs,
|
|
@@ -60,6 +60,7 @@ class ReferenceSequenceAnnotationAdapterTests(unittest.TestCase):
|
|
|
60
60
|
endTime = time.time()
|
|
61
61
|
logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
|
|
62
62
|
|
|
63
|
+
# NOTE: IF YOU DISABLE THE TEST BELOW, THEN 'testReferenceCacheProvider' FAILS. CHECK WHETHER ALL 'Reference' PROVIDERS CAN BE DISABLED.
|
|
63
64
|
# @unittest.skip("Disable test - no longer using in production, and fails too frequently with 'Bad xml text' when fetching from UniProt")
|
|
64
65
|
def testAnnotationAdapter(self):
|
|
65
66
|
"""Test case - create and read cache reference sequences assignments and related data."""
|
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
# 8-Aug-2023 dwp Load full (unfiltered) taxonomy tree node list, and stop loading GO tree (will be loaded in DW instead)
|
|
11
11
|
# 27-Aug-2024 dwp Update CARD ontology tree loading
|
|
12
12
|
# 23-Jan-2025 dwp Change indexed field from 'update_id' to 'id'
|
|
13
|
+
# 7-Aug-2025 dwp Change target DB and collection names to "dw" and "tree_*" (via configuration file);
|
|
14
|
+
# Make use of configuration file for loading tree node lists and setting indexed fields
|
|
13
15
|
#
|
|
14
16
|
##
|
|
15
17
|
__docformat__ = "google en"
|
|
@@ -32,8 +34,6 @@ from rcsb.utils.struct.ScopClassificationProvider import ScopClassificationProvi
|
|
|
32
34
|
from rcsb.utils.struct.Scop2ClassificationProvider import Scop2ClassificationProvider
|
|
33
35
|
from rcsb.utils.taxonomy.TaxonomyProvider import TaxonomyProvider
|
|
34
36
|
from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor
|
|
35
|
-
# from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
|
|
36
|
-
# from rcsb.exdb.seq.AnnotationExtractor import AnnotationExtractor
|
|
37
37
|
|
|
38
38
|
logger = logging.getLogger(__name__)
|
|
39
39
|
|
|
@@ -76,37 +76,28 @@ class TreeNodeListWorker(object):
|
|
|
76
76
|
Relevant configuration options:
|
|
77
77
|
|
|
78
78
|
tree_node_lists_configuration:
|
|
79
|
-
DATABASE_NAME:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
79
|
+
DATABASE_NAME: dw
|
|
80
|
+
COLLECTION_VERSION_STRING: 2.1.0
|
|
81
|
+
COLLECTION_NAME_LIST:
|
|
82
|
+
- tree_taxonomy
|
|
83
|
+
- tree_ec
|
|
84
|
+
- tree_scop
|
|
85
|
+
- tree_scop2
|
|
86
|
+
- tree_cath
|
|
87
|
+
- tree_atc
|
|
88
|
+
- tree_card
|
|
89
|
+
- tree_ecod
|
|
90
|
+
COLLECTION_INDICES:
|
|
91
|
+
- INDEX_NAME: primary
|
|
92
|
+
ATTRIBUTE_NAMES:
|
|
93
|
+
- id
|
|
94
|
+
- INDEX_NAME: index_2
|
|
95
|
+
ATTRIBUTE_NAMES:
|
|
96
|
+
- parents
|
|
87
97
|
"""
|
|
88
98
|
try:
|
|
89
99
|
useCache = self.__useCache
|
|
90
100
|
#
|
|
91
|
-
# if not useCache:
|
|
92
|
-
# cDL = ["domains_struct", "NCBI", "ec", "go", "atc"]
|
|
93
|
-
# for cD in cDL:
|
|
94
|
-
# try:
|
|
95
|
-
# cfp = os.path.join(self.__cachePath, cD)
|
|
96
|
-
# os.makedirs(cfp, 0o755)
|
|
97
|
-
# except Exception:
|
|
98
|
-
# pass
|
|
99
|
-
# #
|
|
100
|
-
# try:
|
|
101
|
-
# cfp = os.path.join(self.__cachePath, cD)
|
|
102
|
-
# fpL = glob.glob(os.path.join(cfp, "*"))
|
|
103
|
-
# if fpL:
|
|
104
|
-
# for fp in fpL:
|
|
105
|
-
# os.remove(fp)
|
|
106
|
-
# except Exception:
|
|
107
|
-
# pass
|
|
108
|
-
#
|
|
109
|
-
#
|
|
110
101
|
logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
|
|
111
102
|
#
|
|
112
103
|
self.__statusList = []
|
|
@@ -124,65 +115,77 @@ class TreeNodeListWorker(object):
|
|
|
124
115
|
readBackCheck=self.__readBackCheck,
|
|
125
116
|
)
|
|
126
117
|
#
|
|
127
|
-
|
|
118
|
+
sectionName = "tree_node_lists_configuration"
|
|
119
|
+
databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
|
|
120
|
+
collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
|
|
121
|
+
collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
|
|
122
|
+
# databaseNameMongo = 'dw'
|
|
123
|
+
# collectionNameList = ['tree_taxonomy', 'tree_ec', 'tree_scop', 'tree_scop2', 'tree_cath', 'tree_atc', 'tree_card', 'tree_ecod', 'tree_go']
|
|
124
|
+
# collectionIndexList = [{'INDEX_NAME': 'primary', 'ATTRIBUTE_NAMES': ['id']}, {'INDEX_NAME': 'index_2', 'ATTRIBUTE_NAMES': ['parents']}]
|
|
125
|
+
|
|
128
126
|
# collectionVersion = self.__cfgOb.get("COLLECTION_VERSION_STRING", sectionName=sectionName)
|
|
129
127
|
# addValues = {"_schema_version": collectionVersion}
|
|
130
128
|
addValues = None
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
129
|
+
|
|
130
|
+
ok = True
|
|
131
|
+
for collectionName in collectionNameList:
|
|
132
|
+
nL = self.__getTreeDocList(collectionName, useCache)
|
|
133
|
+
if nL and doLoad:
|
|
134
|
+
ok = dl.load(
|
|
135
|
+
databaseNameMongo,
|
|
136
|
+
collectionName,
|
|
137
|
+
loadType=loadType,
|
|
138
|
+
documentList=nL,
|
|
139
|
+
keyNames=None,
|
|
140
|
+
addValues=addValues,
|
|
141
|
+
schemaLevel=None,
|
|
142
|
+
indexDL=collectionIndexList
|
|
143
|
+
) and ok
|
|
144
|
+
self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
|
|
145
|
+
logger.info(
|
|
146
|
+
"Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
|
|
147
|
+
databaseNameMongo, collectionName, len(nL), ok
|
|
148
|
+
)
|
|
149
|
+
# ---
|
|
150
|
+
logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
|
|
151
|
+
return True
|
|
152
|
+
except Exception as e:
|
|
153
|
+
logger.exception("Failing with %s", str(e))
|
|
154
|
+
return False
|
|
155
|
+
|
|
156
|
+
def __checkTaxonNodeList(self, nL):
|
|
157
|
+
eCount = 0
|
|
158
|
+
tD = {dD["id"]: True for dD in nL}
|
|
159
|
+
for dD in nL:
|
|
160
|
+
if "parents" in dD:
|
|
161
|
+
pId = dD["parents"][0]
|
|
162
|
+
if pId not in tD:
|
|
163
|
+
logger.info("Missing parent for taxon %d", pId)
|
|
164
|
+
eCount += 1
|
|
165
|
+
else:
|
|
166
|
+
logger.info("No parents for node %r", dD["id"])
|
|
167
|
+
|
|
168
|
+
def getLoadStatus(self):
|
|
169
|
+
return self.__statusList
|
|
170
|
+
|
|
171
|
+
def __getTreeDocList(self, collectionName, useCache):
|
|
172
|
+
nL = []
|
|
173
|
+
if collectionName.lower() == "tree_cath":
|
|
146
174
|
ccu = CathClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
147
175
|
nL = ccu.getTreeNodeList()
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
153
|
-
# ---- SCOP
|
|
176
|
+
elif collectionName.lower() == "tree_scop2":
|
|
177
|
+
scu2 = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
178
|
+
nL = scu2.getTreeNodeList()
|
|
179
|
+
elif collectionName.lower() == "tree_scop":
|
|
154
180
|
scu = ScopClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
155
181
|
nL = scu.getTreeNodeList()
|
|
156
|
-
|
|
157
|
-
if doLoad:
|
|
158
|
-
collectionName = "tree_scop_node_list"
|
|
159
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
160
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
161
|
-
# --- SCOP2
|
|
162
|
-
scu = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
163
|
-
nL = scu.getTreeNodeList()
|
|
164
|
-
logger.info("Starting load SCOP2 node tree length %d", len(nL))
|
|
165
|
-
if doLoad:
|
|
166
|
-
collectionName = "tree_scop2_node_list"
|
|
167
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
168
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
169
|
-
# ---- Ecod
|
|
182
|
+
elif collectionName.lower() == "tree_ecod":
|
|
170
183
|
ecu = EcodClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
171
184
|
nL = ecu.getTreeNodeList()
|
|
172
|
-
|
|
173
|
-
if doLoad:
|
|
174
|
-
collectionName = "tree_ecod_node_list"
|
|
175
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
176
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
177
|
-
# ---- EC
|
|
185
|
+
elif collectionName.lower() == "tree_ec":
|
|
178
186
|
edbu = EnzymeDatabaseProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
179
187
|
nL = edbu.getTreeNodeList()
|
|
180
|
-
|
|
181
|
-
if doLoad:
|
|
182
|
-
collectionName = "tree_ec_node_list"
|
|
183
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
184
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
185
|
-
# ---- CARD
|
|
188
|
+
elif collectionName.lower() == "tree_card":
|
|
186
189
|
okCou = True
|
|
187
190
|
cou = CARDTargetOntologyProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
188
191
|
if not cou.testCache():
|
|
@@ -193,21 +196,7 @@ class TreeNodeListWorker(object):
|
|
|
193
196
|
okCou = False
|
|
194
197
|
if okCou:
|
|
195
198
|
nL = cou.getTreeNodeList()
|
|
196
|
-
|
|
197
|
-
if doLoad:
|
|
198
|
-
collectionName = "tree_card_node_list"
|
|
199
|
-
ok = dl.load(
|
|
200
|
-
databaseName,
|
|
201
|
-
collectionName,
|
|
202
|
-
loadType=loadType,
|
|
203
|
-
documentList=nL,
|
|
204
|
-
indexAttributeList=["id"],
|
|
205
|
-
keyNames=None,
|
|
206
|
-
addValues=addValues,
|
|
207
|
-
schemaLevel=None
|
|
208
|
-
)
|
|
209
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
210
|
-
# ---- Taxonomy
|
|
199
|
+
elif collectionName.lower() == "tree_taxonomy":
|
|
211
200
|
tU = TaxonomyProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
212
201
|
if self.__useFilteredLists:
|
|
213
202
|
# Get the taxon coverage in the current data set -
|
|
@@ -226,43 +215,14 @@ class TreeNodeListWorker(object):
|
|
|
226
215
|
# Get the full taxon node list without filtering
|
|
227
216
|
nL = tU.exportNodeList()
|
|
228
217
|
self.__checkTaxonNodeList(nL)
|
|
229
|
-
|
|
230
|
-
if doLoad:
|
|
231
|
-
collectionName = "tree_taxonomy_node_list"
|
|
232
|
-
logger.debug("Taxonomy nodes (%d) %r", len(nL), nL[:5])
|
|
233
|
-
ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
|
|
234
|
-
self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
|
|
235
|
-
logger.info("Tree loading operations completed.")
|
|
236
|
-
#
|
|
237
|
-
# --- ATC
|
|
218
|
+
elif collectionName.lower() == "tree_atc":
|
|
238
219
|
crEx = ChemRefExtractor(self.__cfgOb)
|
|
239
220
|
atcFilterD = crEx.getChemCompAccessionMapping("ATC")
|
|
240
221
|
logger.info("Length of ATC filter %d", len(atcFilterD))
|
|
241
222
|
atcP = AtcProvider(cachePath=self.__cachePath, useCache=useCache)
|
|
242
223
|
nL = atcP.getTreeNodeList(filterD=atcFilterD)
|
|
243
|
-
|
|
244
|
-
logger.
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
# ---
|
|
249
|
-
logger.info("Completed tree node list loading operations.\n")
|
|
250
|
-
return True
|
|
251
|
-
except Exception as e:
|
|
252
|
-
logger.exception("Failing with %s", str(e))
|
|
253
|
-
return False
|
|
254
|
-
|
|
255
|
-
def __checkTaxonNodeList(self, nL):
|
|
256
|
-
eCount = 0
|
|
257
|
-
tD = {dD["id"]: True for dD in nL}
|
|
258
|
-
for dD in nL:
|
|
259
|
-
if "parents" in dD:
|
|
260
|
-
pId = dD["parents"][0]
|
|
261
|
-
if pId not in tD:
|
|
262
|
-
logger.info("Missing parent for taxon %d", pId)
|
|
263
|
-
eCount += 1
|
|
264
|
-
else:
|
|
265
|
-
logger.info("No parents for node %r", dD["id"])
|
|
266
|
-
|
|
267
|
-
def getLoadStatus(self):
|
|
268
|
-
return self.__statusList
|
|
224
|
+
else:
|
|
225
|
+
logger.error("Unsupported tree node collection %r", collectionName)
|
|
226
|
+
#
|
|
227
|
+
logger.info("Gathered tree nodes for loading collection %s (length %d)", collectionName, len(nL))
|
|
228
|
+
return nL
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rcsb.exdb
|
|
3
|
-
Version: 1.29
|
|
3
|
+
Version: 1.30
|
|
4
4
|
Summary: RCSB Python ExDB data extraction and loading workflows
|
|
5
5
|
Home-page: https://github.com/rcsb/py-rcsb_exdb
|
|
6
6
|
Author: John Westbrook
|
|
@@ -19,8 +19,8 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: numpy
|
|
20
20
|
Requires-Dist: jsonschema>=2.6.0
|
|
21
21
|
Requires-Dist: rcsb.utils.io>=1.48
|
|
22
|
-
Requires-Dist: rcsb.db>=1.
|
|
23
|
-
Requires-Dist: rcsb.utils.chem>=0.
|
|
22
|
+
Requires-Dist: rcsb.db>=1.808
|
|
23
|
+
Requires-Dist: rcsb.utils.chem>=0.84
|
|
24
24
|
Requires-Dist: rcsb.utils.chemref>=0.91
|
|
25
25
|
Requires-Dist: rcsb.utils.config>=0.40
|
|
26
26
|
Requires-Dist: rcsb.utils.ec>=0.25
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py
RENAMED
|
File without changes
|
{rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|