rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: PubChemDataCacheProvider.py
|
|
3
|
+
# Date: 2-Apr-2020 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to cache chemical reference data and mappings for PubChem
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 9-May-2020 jdw separate cache behavior with separate option rebuildChemIndices=True/False
|
|
9
|
+
# 16-Jul-2020 jdw separate index and reference data management.
|
|
10
|
+
# 23-Jul-2021 jdw Make PubChemDataCacheProvider a subclass of StashableBase()
|
|
11
|
+
# 15-Mar-2023 aae Update default numProc to 2
|
|
12
|
+
#
|
|
13
|
+
##
|
|
14
|
+
__docformat__ = "google en"
|
|
15
|
+
__author__ = "John Westbrook"
|
|
16
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
17
|
+
__license__ = "Apache 2.0"
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
|
|
23
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
24
|
+
from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
|
|
25
|
+
from rcsb.utils.chemref.PubChemUtils import PubChemUtils, ChemicalIdentifier
|
|
26
|
+
from rcsb.utils.io.IoUtil import getObjSize
|
|
27
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
28
|
+
from rcsb.utils.io.StashableBase import StashableBase
|
|
29
|
+
from rcsb.utils.io.TimeUtil import TimeUtil
|
|
30
|
+
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class PubChemDataUpdateWorker(object):
    """Multiprocessing worker that fetches PubChem chemical reference data.

    Implements the ``updateList`` interface expected by MultiProcUtil: each
    invocation resolves a list of PubChem compound identifiers and persists
    the assembled reference documents to the object store in chunks.
    """

    def __init__(self, cfgOb, **kwargs):
        self.__cfgOb = cfgOb
        #
        _ = kwargs
        self.__databaseName = "pubchem_exdb"
        self.__refDataCollectionName = "reference_entry"
        # Make sure the target collection and its primary indices exist up front.
        self.__createCollections(self.__databaseName, self.__refDataCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        self.__pcU = PubChemUtils()

    def updateList(self, dataList, procName, optionsD, workingDir):
        """Resolve the input PubChem compound identifiers and store the reference data.

        Args:
            dataList (list): PubChem compound identifiers to fetch
            procName (str): worker process name (for logging)
            optionsD (dict): options ("chunkSize" int, "exportPath" str or None)
            workingDir (str): unused

        Returns:
            (list, list, list, list): successful ids, empty, empty, diagnostics
        """
        _ = workingDir
        chunkSize = optionsD.get("chunkSize", 50)
        # Optional path under which raw request payloads are exported.
        exportPath = optionsD.get("exportPath", None)
        #
        successList = []
        retList1 = []
        retList2 = []
        diagList = []
        emptyList = []
        try:
            tU = TimeUtil()
            pcidList = dataList
            numChunks = len(list(self.__chunker(pcidList, chunkSize)))
            logger.info("%s search starting for %d reference definitions (in chunks of length %d)", procName, len(pcidList), chunkSize)
            for chunkIndex, idChunk in enumerate(self.__chunker(pcidList, chunkSize), 1):
                logger.info("%s starting chunk for %d of %d", procName, chunkIndex, numChunks)
                docList = []
                # Single timestamp shared by all documents written in this chunk.
                updateTime = tU.getDateTimeObj(tU.getTimestamp())
                for pcid in idChunk:
                    chemId = ChemicalIdentifier(idCode=pcid, identifierType="cid", identifier=pcid, identifierSource="ccd-match")
                    searchStart = time.time()
                    ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath)
                    if not ok:
                        searchEnd = time.time()
                        logger.info("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, searchEnd - searchStart)
                    if ok and refDL:
                        successList.append(pcid)
                        for refD in refDL:
                            refD.update({"rcsb_id": refD["cid"], "rcsb_last_update": updateTime})
                            docList.append(refD)
                    else:
                        logger.info("No match result for any form of %s", pcid)
                # Persist the chunk before moving on.
                saveStart = time.time()
                logger.info("Saving chunk %d (len=%d)", chunkIndex, len(idChunk))
                self.__updateObjectStore(self.__databaseName, self.__refDataCollectionName, docList)
                saveEnd = time.time()
                logger.info("Saved chunk %d (len=%d) in %.3f secs", chunkIndex, len(idChunk), saveEnd - saveStart)
        except Exception as e:
            logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
        logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
        #
        return successList, emptyList, emptyList, diagList

    def __updateObjectStore(self, databaseName, collectionName, objDL):
        # Translate each document into a select/update pair keyed on rcsb_id and apply them.
        updateDL = []
        for objD in objDL:
            try:
                updateDL.append({"selectD": {"rcsb_id": objD["rcsb_id"]}, "updateD": objD})
            except Exception as e:
                logger.exception("Failing with %s", str(e))
        obUpd = ObjectUpdater(self.__cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.info("Updated reference count is %d", numUpd)

    def __createCollections(self, databaseName, collectionName, indexAttributeNames=None):
        # Create the collection (no-op when it already exists) with the requested indices.
        obUpd = ObjectUpdater(self.__cfgOb)
        return obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)

    def __chunker(self, iList, chunkSize):
        # Yield successive slices of the input list; the chunk size is clamped to at least 1.
        step = max(1, chunkSize)
        for start in range(0, len(iList), step):
            yield iList[start : start + step]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class PubChemDataCacheProvider(StashableBase):
    """Utilities to cache chemical reference data extracted from PubChem compound data.

    Reference documents live in the "pubchem_exdb"/"reference_entry" object store
    collection, keyed on "rcsb_id" and carrying a "rcsb_last_update" timestamp
    attribute (both written and indexed by PubChemDataUpdateWorker).
    """

    def __init__(self, cfgOb, cachePath):
        """
        Args:
            cfgOb (obj): configuration object providing object store connection details
            cachePath (str): top-level path for locally dumped/cached data
        """
        dirName = "PubChem-data"
        super(PubChemDataCacheProvider, self).__init__(cachePath, [dirName])
        self.__cfgOb = cfgOb
        self.__dirPath = os.path.join(cachePath, dirName)
        #
        self.__databaseName = "pubchem_exdb"
        self.__refDataCollectionName = "reference_entry"
        #
        # Lazily populated in-memory cache {rcsb_id: referenceDocument, ...}
        self.__refD = None

    def __getExpirySelection(self, expireDays):
        """Return a selection query matching documents last updated more than expireDays days ago.

        Returns an empty query (match all) when expireDays <= 0.

        Bug fix: the query attribute is "rcsb_last_update" — the attribute actually
        written and indexed by PubChemDataUpdateWorker.  Earlier revisions queried
        "rcsb_latest_update", which exists in no stored document, so the expiry
        filter silently matched nothing.
        """
        selectD = {}
        if expireDays > 0:
            tU = TimeUtil()
            tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
            selectD.update({"rcsb_last_update": {"$lt": tU.getDateTimeObj(tS)}})
        return selectD

    def getRefData(self, expireDays=0):
        """Return (fetching and caching on first use) the PubChem reference data.

        Args:
            expireDays (int, optional): select only documents older than this many days. Defaults to 0 (all).

        Returns:
            (dict): {rcsb_id: referenceDocument, ...}
        """
        if not self.__refD:
            selectD = self.__getExpirySelection(expireDays)
            self.__refD = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=selectD)
        #
        return self.__refD

    def getRefIdCodes(self, expireDays=0):
        """Return the reference ID codes currently stored in the reference collection.

        Args:
            expireDays (int, optional): select only documents older than this many days. Defaults to 0 (all).

        Returns:
            (list): reference ID (rcsb_id) codes
        """
        selectD = self.__getExpirySelection(expireDays)
        selectionList = ["rcsb_id"]
        refIds = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=selectD, selectionList=selectionList)
        #
        return list(refIds.keys()) if refIds else []

    def getRefDataCount(self):
        """Return the number of documents in the in-memory reference cache (0 when unloaded)."""
        return len(self.__refD) if self.__refD else 0

    def testCache(self, minCount=None, logSizes=False):
        """Verify that the in-memory reference cache is populated and optionally large enough.

        Args:
            minCount (int, optional): minimum acceptable number of cached documents. Defaults to None.
            logSizes (bool, optional): log the approximate cache size (MB). Defaults to False.

        Returns:
            (bool): True for success or False otherwise
        """
        okC = bool(self.__refD)
        if not okC:
            return okC
        logger.info("Reference data cache lengths: refD %d", len(self.__refD))
        if minCount and len(self.__refD) < minCount:
            return False
        #
        if logSizes:
            logger.info("refD %.2f", getObjSize(self.__refD) / 1000000.0)
        return True

    def __getdumpFilePath(self, fmt="json"):
        # Local file path used to dump/restore the serialized reference data object list.
        stashBaseFileName = "pubchem_match_data_object_list"
        fExt = ".json" if fmt == "json" else ".pic"
        fp = os.path.join(self.__dirPath, stashBaseFileName + fExt)
        return fp

    def dump(self, fmt="json"):
        """Dump PubChem reference data from the object store.

        Args:
            fmt (str, optional): backup file format (json|pickle). Defaults to "json".

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            self.getRefData()
            if fmt in ["json", "pickle"]:
                kwargs = {}
                fp = self.__getdumpFilePath(fmt=fmt)
                logger.info("Saving object store to %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                if fmt in ["json"]:
                    kwargs = {"indent": 3}
                ok = mU.doExport(fp, self.__refD, fmt=fmt, **kwargs)
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        return ok

    def reloadDump(self, fmt="json"):
        """Load PubChem reference data store from saved dump.

        Args:
            fmt (str, optional): format of the backup file (pickle or json). Defaults to "json".

        Returns:
            (int): number of objects restored.
        """
        numUpd = 0
        try:
            # Read from disk backup and update object store -
            if fmt in ["json", "pickle"]:
                fp = self.__getdumpFilePath(fmt=fmt)
                logger.info("Restoring object store from %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                refD = mU.doImport(fp, fmt=fmt)
                numUpd = self.__reloadDump(refD, self.__databaseName, self.__refDataCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        # --
        return numUpd

    def updateMissing(self, idList, exportPath=None, numProc=2, chunkSize=5):
        """Fetch and load reference data for any missing PubChem ID codes in the input list.

        Args:
            idList (list): PubChem ID codes
            exportPath (str, optional): store raw fetched data in this path. Defaults to None.
            numProc (int, optional): number of processors to use. Defaults to 2.
            chunkSize (int, optional): chunk size between data store updates. Defaults to 5.

        Returns:
            (bool, list): status flag, list of failed identifiers
        """
        curIdList = self.getRefIdCodes()
        missS = set(idList) - set(curIdList)
        if missS:
            logger.info("Loading (%d) missing identifiers", len(missS))
            ok, failList = self.load(list(missS), numProc=numProc, chunkSize=chunkSize, exportPath=exportPath)
        else:
            logger.info("No missing identifier - nothing to load")
            ok = True
            failList = []

        return ok, failList

    def load(self, idList, exportPath=None, numProc=2, chunkSize=5):
        """Fetch and load reference data for the input list of PubChem compound codes.

        Args:
            idList (list): PubChem ID codes
            exportPath (str, optional): store raw fetched data in this path. Defaults to None.
            numProc (int, optional): number of processors to use. Defaults to 2.
            chunkSize (int, optional): chunk size between data store updates. Defaults to 5.

        Returns:
            (bool, list): status flag, list of failed identifiers
        """
        logger.info("Length starting list is %d", len(idList))
        optD = {"chunkSize": chunkSize, "exportPath": exportPath}
        rWorker = PubChemDataUpdateWorker(self.__cfgOb)
        if numProc > 1:
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optD)
            mpu.set(workerObj=rWorker, workerMethod="updateList")
            ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
            logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
        else:
            successList, _, _, _ = rWorker.updateList(idList, "SingleProc", optD, None)
            failList = list(set(idList) - set(successList))
            ok = len(failList) == 0
            logger.info("Single-proc status %r failures %r", ok, len(failList))
        #
        return ok, failList

    def getRelatedMapping(self, pcidList):
        """Assemble related identifiers (xrefs) for the input PubChem compound Id list.

        Args:
            pcidList (list): PubChem compound ID list

        Returns:
            dict :{<pcid>: {'relatedId1': ... 'relatedId2': ... }, ...}
        """
        retD = {}
        logger.info("Get XREFs for PubChem compound ID list (%d)", len(pcidList))
        #
        try:
            xrefD = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=None, selectionList=["rcsb_id", "cid", "data.xrefs"])
            for pcid in pcidList:
                # Robustness fix: a pcid absent from the store previously raised inside
                # the loop (via xrefD[pcid]) and the outer handler aborted all remaining
                # mappings; now a missing entry yields an empty mapping instead.
                refEntryD = xrefD.get(pcid)
                if refEntryD is None:
                    logger.debug("No reference entry for %s", pcid)
                    retD[pcid] = {}
                    continue
                try:
                    xD = refEntryD["data"]["xrefs"]
                    if not isinstance(xD, dict):
                        xD = {}
                except Exception:
                    xD = {}
                #
                logger.debug("%s (%s) xrefs %r", pcid, refEntryD.get("cid"), xD)
                retD[pcid] = {rNm: rIdL for rNm, rIdL in xD.items()}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return retD

    #
    def __getReferenceData(self, databaseName, collectionName, selectD=None, selectionList=None):
        """Fetch reference documents keyed on rcsb_id, optionally filtered/projected.

        Args:
            databaseName (str): source database name
            collectionName (str): source collection name
            selectD (dict, optional): selection query. Defaults to None.
            selectionList (list, optional): attribute projection list. Defaults to None.

        Returns:
            (dict): {rcsb_id: document, ...}
        """
        logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            keyAttribute="rcsb_id",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=selectD,
            selectionList=selectionList,
            stripObjectId=True,
        )
        docCount = obEx.getCount()
        logger.info("Reference data object count %d", docCount)
        objD = obEx.getObjects()
        return objD

    def __reloadDump(self, objD, databaseName, collectionName, indexAttributeNames=None):
        """Internal method to restore the input database/collection using the input data object.

        Args:
            objD (obj): Target reference or index data object
            databaseName (str): target database name
            collectionName (str): target collection name
            indexAttributeNames (list, optional): Primary index attributes. Defaults to None.

        Returns:
            int: inserted or updated object count
        """
        try:
            numUpd = 0
            numTotal = 0
            updateDL = []
            for entityKey, obj in objD.items():
                selectD = {"rcsb_id": entityKey}
                updateDL.append({"selectD": selectD, "updateD": obj})
            #
            obUpd = ObjectUpdater(self.__cfgOb)
            ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
            if ok:
                numUpd = obUpd.update(databaseName, collectionName, updateDL)
                logger.debug("Updated object count is %d", numUpd)
            else:
                logger.error("Create %s %s failed", databaseName, collectionName)
            numTotal = obUpd.count(databaseName, collectionName)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return numTotal
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: PubChemEtlWrapper.py
|
|
3
|
+
# Date: 19-Jul-2020 jdw
|
|
4
|
+
#
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
# 14-Mar-2023 aae Updates to use multiprocess count
|
|
8
|
+
#
|
|
9
|
+
##
|
|
10
|
+
__docformat__ = "google en"
|
|
11
|
+
__author__ = "John Westbrook"
|
|
12
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
13
|
+
__license__ = "Apache 2.0"
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
|
|
18
|
+
from rcsb.exdb.chemref.PubChemDataCacheProvider import PubChemDataCacheProvider
|
|
19
|
+
from rcsb.exdb.chemref.PubChemIndexCacheProvider import PubChemIndexCacheProvider
|
|
20
|
+
from rcsb.utils.chemref.PubChemProvider import PubChemProvider
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PubChemEtlWrapper(object):
    """Workflow wrapper for updating chemical component/BIRD to PubChem mapping and related PubChem reference data."""

    def __init__(self, cfgOb, cachePath, **kwargs):
        """
        Args:
            cfgOb (obj): configuration object
            cachePath (str): top-level cache directory path
            stashRemotePrefix (str, optional): channel prefix for remote stash storage. Defaults to None.
        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__dirPath = os.path.join(self.__cachePath, "PubChem")
        #
        self.__stashRemotePrefix = kwargs.get("stashRemotePrefix", None)
        #
        self.__pcicP = PubChemIndexCacheProvider(self.__cfgOb, self.__cachePath)
        self.__pcdcP = PubChemDataCacheProvider(self.__cfgOb, self.__cachePath)
        self.__pcP = PubChemProvider(cachePath=self.__cachePath)
        #
        # Lazily-built mapping of chemical component ids to PubChem related identifiers
        self.__identifierD = None
        #

    def reloadDump(self, contentType="index"):
        """Reload the input content type in the data store from saved object store dump.

        Args:
            contentType (str): target content to restore (data|index)

        Returns:
            (int): number of records in restored collection.
        """
        numRecords = 0
        if contentType.lower() == "index":
            numRecords = self.__pcicP.reloadDump()
        elif contentType.lower() == "data":
            numRecords = self.__pcdcP.reloadDump()
        return numRecords

    def dump(self, contentType):
        """Dump PubChem content from the object store.

        Args:
            contentType (str): target content to dump (data|index|identifiers)

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        if contentType.lower() == "index":
            ok = self.__pcicP.dump()
        elif contentType.lower() == "data":
            ok = self.__pcdcP.dump()
        elif contentType.lower() == "identifiers":
            ok = self.__dumpIdentifiers()

        return ok

    def toStash(self, contentType, useGit=True, useStash=True):
        """Store PubChem extracted content () on the remote stash storage resource.

        Args:
            contentType (str): target content to stash (data|index|identifiers)
            useStash (bool): should stash (Buildlocker) be updated? (default: True)
            useGit (bool): should stash (GitHub) be updated? (default: True)
        Returns:
            (bool): True for success or False otherwise
        """
        if contentType.lower() == "index":
            return self.__pcicP.backup(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)
        elif contentType.lower() == "data":
            return self.__pcdcP.backup(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)
        elif contentType.lower() == "identifiers":
            return self.__pcP.backup(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)
        return False

    def fromStash(self, contentType, useStash=True, useGit=True):
        """Fetch PubChem extracted content from the remote stash storage resource.

        Args:
            contentType (str): target content to fetch (data|index|identifiers)
            useStash (bool): should stash (Buildlocker) be used? (default: True)
            useGit (bool): should stash (GitHub) be used? (default: True)
        Returns:
            (bool): True for success or False otherwise
        """
        if contentType.lower() == "index":
            return self.__pcicP.restore(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)
        elif contentType.lower() == "data":
            return self.__pcdcP.restore(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)
        elif contentType.lower() == "identifiers":
            # Bug fix: restore identifiers via the PubChemProvider (self.__pcP), mirroring
            # toStash(); previously this branch restored the reference-data provider
            # (self.__pcdcP) a second time and never recovered the identifier content.
            return self.__pcP.restore(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)
        return False

    def updateIndex(self, **kwargs):
        """Search and store PubChem correspondences for CCD and BIRD reference chemical definitions.

        Args:
            ccUrlTarget (str, optional): target url for chemical component dictionary resource file (default: None=all public)
            birdUrlTarget (str, optional): target url for bird dictionary resource file (cc format) (default: None=all public)
            ccFileNamePrefix (str, optional): index file prefix (default: full)
            rebuildChemIndices (bool, optional): rebuild indices from source (default: False)
            fetchLimit (int, optional): maximum number of definitions to process (default: None)
            exportPath(str, optional): path to export raw PubChem search results (default: None)
            numProcChemComp (int, optional): number processors to include in multiprocessing mode for ChemComp indices (default: 8)
            numProc (int, optional): number processors to include in multiprocessing mode for PubChem (default: 2)

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            rebuildChemIndices = kwargs.get("rebuildChemIndices", False)
            ccUrlTarget = kwargs.get("ccUrlTarget", None)
            birdUrlTarget = kwargs.get("birdUrlTarget", None)
            ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "full")
            fetchLimit = kwargs.get("fetchLimit", None)
            exportPath = kwargs.get("exportPath", None)
            expireDays = kwargs.get("expireDays", 0)
            numProcChemComp = kwargs.get("numProcChemComp", 8)
            numProc = kwargs.get("numProc", 2)

            # -- Update/create mapping index cache ---
            ok = self.__pcicP.updateMissing(
                expireDays=expireDays,
                cachePath=self.__cachePath,
                ccUrlTarget=ccUrlTarget,
                birdUrlTarget=birdUrlTarget,
                ccFileNamePrefix=ccFileNamePrefix,
                exportPath=exportPath,
                rebuildChemIndices=rebuildChemIndices,
                fetchLimit=fetchLimit,
                numProcChemComp=numProcChemComp,
                numProc=numProc,
            )
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getMatches(self):
        """Return a list of matched PubChem compound identifiers.

        Returns:
            (list, str): list of PubChem compound identifiers
        """
        return self.__pcicP.getMatches()

    def getSelectedMatches(self, **kwargs):
        """
        Return preferred PubChem correspondences from the current match index for the input source
        component build type. Separately return alternative matches for other source types.

        Args:
            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])

        Returns:
            (dict, dict): mapD { ccId1: [{'pcId': ... , 'inchiKey': ... }], ccId2: ...},
                          altD { ccId1: [{'pcId': ... , 'inchiKey': ... 'sourceType': ... }], ccId2: ...}
        """
        sourceTypes = kwargs.get("sourceTypes", ["model-xyz"])
        mapD, extraMapD = self.__pcicP.getSelectedMatches(exportPath=self.__dirPath, sourceTypes=sourceTypes)
        logger.debug("mapD (%d) extraMapD (%d) %r", len(mapD), len(extraMapD), extraMapD)
        return mapD, extraMapD

    def updateData(self, pcidList, doExport=False, numProc=2):
        """Update PubChem reference data for the input list of compound identifiers.

        Args:
            pcidList (list,str): PubChem compound identifiers
            doExport (bool, optional): export raw fetched data under the local cache path (default: False)
            numProc (int, optional): number of processors to use (default: 2)

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            exportPath = self.__dirPath if doExport else None
            ok, failList = self.__pcdcP.updateMissing(pcidList, exportPath=exportPath, numProc=numProc)
            if failList:
                logger.info("No data updated for %r", failList)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def updateMatchedData(self, exportRaw=False, numProc=2):
        """Update PubChem reference data using matched compound identifiers in the current index.

        Args:
            exportRaw (bool, optional): export raw fetched data under the local cache path (default: False)
            numProc (int, optional): number of processors to use (default: 2)

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            pcidList = self.getMatches()
            exportPath = self.__dirPath if exportRaw else None
            ok, failList = self.__pcdcP.updateMissing(pcidList, exportPath=exportPath, numProc=numProc)
            if failList:
                logger.info("No data updated for %r", failList)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __getPubChemIdentifiers(self, pcidList):
        """Return related identifiers (xrefs) for the input PubChem compound identifier list.

        Args:
            pcidList (list): PubChem compound identifier list

        Returns:
            (dict) :{<pcid>: {'relatedId1': ... 'relatedId2': ... }, ...}
        """
        rD = self.__pcdcP.getRelatedMapping(pcidList)
        logger.info("Related identifier map length (%d)", len(rD))
        return rD

    def updateIdentifiers(self, **kwargs):
        """Update PubChem assigned related identifiers for matching compounds for the input chemical component sourceTypes.

        Args:
            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            sourceTypes = kwargs.get("sourceTypes", ["model-xyz"])
            mapD, _ = self.getSelectedMatches(sourceTypes=sourceTypes)
            pcIdList = []
            # mapD { ccId1: [{'pcId': ... , 'inchiKey': ... }],
            for mDL in mapD.values():
                pcIdList.extend([mD["pcId"] for mD in mDL])
            logger.info("pcIdList (%d)", len(pcIdList))
            rD = self.__getPubChemIdentifiers(pcIdList)
            #
            # Merge the fetched related identifiers into the match records in place.
            for mDL in mapD.values():
                for mD in mDL:
                    pcId = mD["pcId"]
                    if pcId in rD:
                        for rIdName, rIdValue in rD[pcId].items():
                            mD[rIdName] = rIdValue
            #
            self.__identifierD = mapD
            ok = self.__identifierD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getIdentifiers(self, **kwargs):
        """Get PubChem assigned related identifiers for matching compounds for the input chemical component sourceTypes.

        Args:
            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])

        Returns:
            dict: riD { ccId1: [{'pcId': ... , 'inchiKey': ... 'ChEBI': ... 'ChEMBL': ... 'CAS': ... }], ccId2: ...},
        """
        if not self.__identifierD:
            self.updateIdentifiers(**kwargs)
        return self.__identifierD

    def __dumpIdentifiers(self):
        # Persist the assembled identifier mapping through the PubChemProvider.
        rD = self.getIdentifiers()
        ok = self.__pcP.load(rD, "identifiers", fmt="json")
        return ok
|