rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,372 @@
1
+ ##
2
+ # File: PubChemDataCacheProvider.py
3
+ # Date: 2-Apr-2020 jdw
4
+ #
5
+ # Utilities to cache chemical reference data and mappings for PubChem
6
+ #
7
+ # Updates:
8
+ # 9-May-2020 jdw separate cache behavior with separate option rebuildChemIndices=True/False
9
+ # 16-Jul-2020 jdw separate index and reference data management.
10
+ # 23-Jul-2021 jdw Make PubChemDataCacheProvider a subclass of StashableBase()
11
+ # 15-Mar-2023 aae Update default numProc to 2
12
+ #
13
+ ##
14
+ __docformat__ = "google en"
15
+ __author__ = "John Westbrook"
16
+ __email__ = "jwest@rcsb.rutgers.edu"
17
+ __license__ = "Apache 2.0"
18
+
19
+ import logging
20
+ import os
21
+ import time
22
+
23
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
24
+ from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
25
+ from rcsb.utils.chemref.PubChemUtils import PubChemUtils, ChemicalIdentifier
26
+ from rcsb.utils.io.IoUtil import getObjSize
27
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
28
+ from rcsb.utils.io.StashableBase import StashableBase
29
+ from rcsb.utils.io.TimeUtil import TimeUtil
30
+ from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil
31
+
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
class PubChemDataUpdateWorker(object):
    """Worker class implementing the interface expected by the multiprocessing module
    for fetching PubChem chemical reference data and saving it in the object store.
    """

    def __init__(self, cfgOb, **kwargs):
        """Prepare the target collection and the PubChem access utility.

        Args:
            cfgOb (obj): configuration object handed to ObjectUpdater()
            **kwargs: accepted for interface compatibility and ignored
        """
        self.__cfgOb = cfgOb
        #
        _ = kwargs
        self.__databaseName = "pubchem_exdb"
        self.__refDataCollectionName = "reference_entry"
        self.__createCollections(self.__databaseName, self.__refDataCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        self.__pcU = PubChemUtils()

    def updateList(self, dataList, procName, optionsD, workingDir):
        """Fetch reference data for the input list of PubChem compound identifiers and
        store the results chunk by chunk in the reference data collection.

        Args:
            dataList (list): PubChem compound identifiers
            procName (str): worker process name (used in log messages)
            optionsD (dict): options; the keys read here are "chunkSize" (default 50) and "exportPath" (default None)
            workingDir (str): unused

        Returns:
            (list, list, list, list): identifiers for which reference data was obtained, two result lists
                                      and a diagnostics list (the last three are always empty here)
        """
        _ = workingDir
        chunkSize = optionsD.get("chunkSize", 50)
        # Path to store raw request data -
        exportPath = optionsD.get("exportPath", None)
        #
        successList = []
        retList1 = []
        retList2 = []
        diagList = []
        # -
        try:
            tU = TimeUtil()
            pcidList = dataList
            # Ceiling division using the same lower bound on the chunk size that __chunker() applies.
            stepSize = max(1, chunkSize)
            numChunks = (len(pcidList) + stepSize - 1) // stepSize
            logger.info("%s search starting for %d reference definitions (in chunks of length %d)", procName, len(pcidList), chunkSize)
            for ii, pcidChunk in enumerate(self.__chunker(pcidList, chunkSize), 1):
                logger.info("%s starting chunk for %d of %d", procName, ii, numChunks)
                tDL = []
                # A single timestamp is shared by all documents saved in this chunk
                timeS = tU.getDateTimeObj(tU.getTimestamp())
                for pcid in pcidChunk:
                    chemId = ChemicalIdentifier(idCode=pcid, identifierType="cid", identifier=pcid, identifierSource="ccd-match")
                    #
                    stA = time.time()
                    ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath)
                    #
                    if ok and refDL:
                        successList.append(pcid)
                        for tD in refDL:
                            tD.update({"rcsb_id": tD["cid"], "rcsb_last_update": timeS})
                            tDL.append(tD)
                    elif ok:
                        logger.info("No match result for any form of %s", pcid)
                    else:
                        # Report a failed request once (not additionally as a missing match)
                        etA = time.time()
                        logger.info("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
                # --
                if not tDL:
                    # Nothing was fetched for this chunk so there is nothing to write
                    logger.info("No reference data to save for chunk %d (len=%d)", ii, len(pcidChunk))
                    continue
                startTimeL = time.time()
                logger.info("Saving chunk %d (len=%d)", ii, len(pcidChunk))
                self.__updateObjectStore(self.__databaseName, self.__refDataCollectionName, tDL)
                endTimeL = time.time()
                logger.info("Saved chunk %d (len=%d) in %.3f secs", ii, len(pcidChunk), endTimeL - startTimeL)
        except Exception as e:
            logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
        logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
        #
        # Return distinct list objects for each result slot
        return successList, retList1, retList2, diagList

    def __updateObjectStore(self, databaseName, collectionName, objDL):
        """Update the input collection with the input documents selecting each on its "rcsb_id" value.

        Documents lacking an "rcsb_id" key are logged and skipped.
        """
        updateDL = []
        for objD in objDL:
            try:
                selectD = {"rcsb_id": objD["rcsb_id"]}
                updateDL.append({"selectD": selectD, "updateD": objD})
            except Exception as e:
                logger.exception("Failing with %s", str(e))
        obUpd = ObjectUpdater(self.__cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.info("Updated reference count is %d", numUpd)

    def __createCollections(self, databaseName, collectionName, indexAttributeNames=None):
        """Create the input collection (with checkExists=True) and return the status from ObjectUpdater.createCollection()."""
        obUpd = ObjectUpdater(self.__cfgOb)
        ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
        return ok

    def __chunker(self, iList, chunkSize):
        """Return a generator of successive slices of iList of length chunkSize (at least 1)."""
        chunkSize = max(1, chunkSize)
        return (iList[i : i + chunkSize] for i in range(0, len(iList), chunkSize))
126
+
127
+
128
class PubChemDataCacheProvider(StashableBase):
    """Utilities to cache chemical reference data extracted from PubChem compound data"""

    # Timestamp attribute written with each reference document (see PubChemDataUpdateWorker.updateList())
    # and included in the collection index attributes.
    _lastUpdateAttributeName = "rcsb_last_update"

    def __init__(self, cfgOb, cachePath):
        """
        Args:
            cfgOb (obj): configuration object handed to the object store utilities
            cachePath (str): top-level cache directory; dump files are kept in the "PubChem-data" subdirectory
        """
        dirName = "PubChem-data"
        super(PubChemDataCacheProvider, self).__init__(cachePath, [dirName])
        self.__cfgOb = cfgOb
        self.__dirPath = os.path.join(cachePath, dirName)
        #
        self.__databaseName = "pubchem_exdb"
        self.__refDataCollectionName = "reference_entry"
        #
        self.__refD = None

    def __getExpirySelection(self, expireDays):
        """Return the selection query for documents last updated more than expireDays ago.

        An empty query (select all) is returned when expireDays is not positive.
        """
        selectD = {}
        if expireDays > 0:
            tU = TimeUtil()
            tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
            # Select on the attribute name that is actually stored with each document
            selectD.update({self._lastUpdateAttributeName: {"$lt": tU.getDateTimeObj(tS)}})
        return selectD

    def getRefData(self, expireDays=0):
        """Return the reference data from the object store (fetched on first use and then held in memory).

        Args:
            expireDays (int, optional): if positive, select only documents last updated more than this many days ago.
                                        Only applied when the data is fetched. Defaults to 0 (all).

        Returns:
            (dict): reference data objects keyed by "rcsb_id"
        """
        if not self.__refD:
            selectD = self.__getExpirySelection(expireDays)
            self.__refD = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=selectD)
        #
        return self.__refD

    def getRefIdCodes(self, expireDays=0):
        """Return the identifier codes ("rcsb_id") of the reference data in the object store.

        Args:
            expireDays (int, optional): if positive, select only documents last updated more than this many days ago. Defaults to 0 (all).

        Returns:
            (list): identifier codes
        """
        selectD = self.__getExpirySelection(expireDays)
        refIds = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=selectD, selectionList=["rcsb_id"])
        #
        return list(refIds.keys()) if refIds else []

    def getRefDataCount(self):
        """Return the number of reference data objects currently held in memory (0 if none loaded)."""
        return len(self.__refD) if self.__refD else 0

    def testCache(self, minCount=None, logSizes=False):
        """Test that reference data is loaded in memory and (optionally) has at least minCount objects.

        Args:
            minCount (int, optional): minimum acceptable object count. Defaults to None (no minimum).
            logSizes (bool, optional): log the size of the in-memory data. Defaults to False.

        Returns:
            (bool): True if the tests pass or False otherwise
        """
        okC = bool(self.__refD)
        if not okC:
            return okC
        logger.info("Reference data cache lengths: refD %d", len(self.__refD))
        if minCount and len(self.__refD) < minCount:
            return False
        #
        if logSizes:
            logger.info("refD %.2f", getObjSize(self.__refD) / 1000000.0)
        return True

    def __getdumpFilePath(self, fmt="json"):
        """Return the dump file path for the input format (".json" for json, otherwise ".pic")."""
        stashBaseFileName = "pubchem_match_data_object_list"
        fExt = ".json" if fmt == "json" else ".pic"
        fp = os.path.join(self.__dirPath, stashBaseFileName + fExt)
        return fp

    def dump(self, fmt="json"):
        """Dump PubChem reference data from the object store.

        Args:
            fmt (str, optional): backup file format (pickle or json). Defaults to "json".

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            self.getRefData()
            if fmt in ["json", "pickle"]:
                fp = self.__getdumpFilePath(fmt=fmt)
                logger.info("Saving object store to %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                kwargs = {"indent": 3} if fmt == "json" else {}
                ok = mU.doExport(fp, self.__refD, fmt=fmt, **kwargs)
            else:
                logger.error("Unsupported dump format %r", fmt)
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        return ok

    def reloadDump(self, fmt="json"):
        """Load PubChem reference data store from saved dump.

        Args:
            fmt (str, optional): format of the backup file (pickle or json). Defaults to "json".

        Returns:
            (int): number of objects in the collection after the reload (0 on failure or unsupported format).
        """
        numUpd = 0
        try:
            # Read from disk backup and update object store -
            if fmt in ["json", "pickle"]:
                fp = self.__getdumpFilePath(fmt=fmt)
                logger.info("Restoring object store from %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                refD = mU.doImport(fp, fmt=fmt)
                numUpd = self.__reloadDump(refD, self.__databaseName, self.__refDataCollectionName, indexAttributeNames=["rcsb_id", self._lastUpdateAttributeName])
            else:
                logger.error("Unsupported dump format %r", fmt)
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        # --
        return numUpd

    def updateMissing(self, idList, exportPath=None, numProc=2, chunkSize=5):
        """Fetch and load reference data for any missing PubChem ID codes in the input list.

        Args:
            idList (list): PubChem ID codes
            exportPath (str, optional): store raw fetched data in this path. Defaults to None.
            numProc (int, optional): number of processor to use. Defaults to 2.
            chunkSize (int, optional): chunk size between data store updates. Defaults to 5.

        Returns:
            (bool, list): status flag, list of failed identifiers
        """
        curIdList = self.getRefIdCodes()
        missS = set(idList) - set(curIdList)
        if not missS:
            logger.info("No missing identifier - nothing to load")
            return True, []
        #
        logger.info("Loading (%d) missing identifiers", len(missS))
        ok, failList = self.load(list(missS), numProc=numProc, chunkSize=chunkSize, exportPath=exportPath)
        return ok, failList

    def load(self, idList, exportPath=None, numProc=2, chunkSize=5):
        """Fetch and load reference data for the input list of PubChem compound codes.

        Args:
            idList (list): PubChem ID codes
            exportPath (str, optional): store raw fetched data in this path. Defaults to None.
            numProc (int, optional): number of processor to use (values above 1 select multiprocessing mode). Defaults to 2.
            chunkSize (int, optional): chunk size between data store updates. Defaults to 5.

        Returns:
            (bool, list): status flag, list of failed identifiers
        """
        logger.info("Length starting list is %d", len(idList))
        optD = {"chunkSize": chunkSize, "exportPath": exportPath}
        rWorker = PubChemDataUpdateWorker(self.__cfgOb)
        if numProc > 1:
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optD)
            mpu.set(workerObj=rWorker, workerMethod="updateList")
            ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
            logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
        else:
            successList, _, _, _ = rWorker.updateList(idList, "SingleProc", optD, None)
            failList = list(set(idList) - set(successList))
            ok = len(failList) == 0
            logger.info("Single-proc status %r failures %r", ok, len(failList))
        #
        return ok, failList

    def getRelatedMapping(self, pcidList):
        """Assemble related identifiers (xrefs) for the input PubChem compound Id list.

        Identifiers with no stored reference data (or without a usable xrefs dictionary)
        are mapped to an empty dictionary rather than interrupting the remaining lookups.

        Args:
            pcidList (list): PubChem compound ID list

        Returns:
            dict :{<pcid>: {'relatedId1': ... 'relatedId2': ... }, ...}

        """
        #
        retD = {}
        logger.info("Get XREFs for PubChem compound ID list (%d)", len(pcidList))
        #
        try:
            xrefD = self.__getReferenceData(self.__databaseName, self.__refDataCollectionName, selectD=None, selectionList=["rcsb_id", "cid", "data.xrefs"])
            xrefD = xrefD if xrefD else {}
            for pcid in pcidList:
                docD = xrefD.get(pcid)
                xD = {}
                cId = None
                if isinstance(docD, dict):
                    cId = docD.get("cid")
                    dataD = docD.get("data")
                    tD = dataD.get("xrefs") if isinstance(dataD, dict) else None
                    # Only a dictionary of xrefs is usable (e.g. a list value is treated as no xrefs)
                    if isinstance(tD, dict):
                        xD = tD
                #
                logger.debug("%s (%s) xrefs %r", pcid, cId, xD)
                retD[pcid] = dict(xD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return retD

    #
    def __getReferenceData(self, databaseName, collectionName, selectD=None, selectionList=None):
        """Return the objects extracted from the input collection keyed by "rcsb_id".

        Args:
            databaseName (str): source database name
            collectionName (str): source collection name
            selectD (dict, optional): selection query. Defaults to None.
            selectionList (list, optional): attributes to return. Defaults to None.

        Returns:
            (dict): result of ObjectExtractor.getObjects()
        """
        logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            keyAttribute="rcsb_id",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=selectD,
            selectionList=selectionList,
            stripObjectId=True,
        )
        docCount = obEx.getCount()
        logger.info("Reference data object count %d", docCount)
        objD = obEx.getObjects()
        return objD

    def __reloadDump(self, objD, databaseName, collectionName, indexAttributeNames=None):
        """Internal method to restore the input database/collection using the input data object.

        Args:
            objD (obj): Target reference or index data object
            databaseName (str): target database name
            collectionName (str): target collection name
            indexAttributeNames (list, optional): Primary index attributes. Defaults to None.

        Returns:
            int: object count in the target collection after the update (0 on failure)
        """
        # Initialized ahead of the try block so the return value is always defined
        numTotal = 0
        try:
            updateDL = [{"selectD": {"rcsb_id": entityKey}, "updateD": obj} for entityKey, obj in objD.items()]
            #
            obUpd = ObjectUpdater(self.__cfgOb)
            ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
            if ok:
                numUpd = obUpd.update(databaseName, collectionName, updateDL)
                logger.debug("Updated object count is %d", numUpd)
            else:
                logger.error("Create %s %s failed", databaseName, collectionName)
            numTotal = obUpd.count(databaseName, collectionName)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return numTotal
@@ -0,0 +1,280 @@
1
+ ##
2
+ # File: PubChemEtlWrapper.py
3
+ # Date: 19-Jul-2020 jdw
4
+ #
5
+ #
6
+ # Updates:
7
+ # 14-Mar-2023 aae Updates to use multiprocess count
8
+ #
9
+ ##
10
+ __docformat__ = "google en"
11
+ __author__ = "John Westbrook"
12
+ __email__ = "jwest@rcsb.rutgers.edu"
13
+ __license__ = "Apache 2.0"
14
+
15
+ import logging
16
+ import os
17
+
18
+ from rcsb.exdb.chemref.PubChemDataCacheProvider import PubChemDataCacheProvider
19
+ from rcsb.exdb.chemref.PubChemIndexCacheProvider import PubChemIndexCacheProvider
20
+ from rcsb.utils.chemref.PubChemProvider import PubChemProvider
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class PubChemEtlWrapper(object):
    """Workflow wrapper for updating chemical component/BIRD to PubChem mapping and related PubChem reference data."""

    def __init__(self, cfgOb, cachePath, **kwargs):
        """
        Args:
            cfgOb (obj): configuration object
            cachePath (str): top-level cache directory
            stashRemotePrefix (str, optional): prefix passed as remotePrefix to the stash backup/restore operations (default: None)
        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__dirPath = os.path.join(self.__cachePath, "PubChem")
        #
        self.__stashRemotePrefix = kwargs.get("stashRemotePrefix", None)
        #
        self.__pcicP = PubChemIndexCacheProvider(self.__cfgOb, self.__cachePath)
        self.__pcdcP = PubChemDataCacheProvider(self.__cfgOb, self.__cachePath)
        self.__pcP = PubChemProvider(cachePath=self.__cachePath)
        #
        self.__identifierD = None
        #

    def reloadDump(self, contentType="index"):
        """Reload the input content type in the data store from saved object store dump.

        Args:
            contentType (str): target content to restore (data|index)

        Returns:
            (int): number of records in restored collection (0 for an unrecognized content type).
        """
        cType = contentType.lower()
        numRecords = 0
        if cType == "index":
            numRecords = self.__pcicP.reloadDump()
        elif cType == "data":
            numRecords = self.__pcdcP.reloadDump()
        return numRecords

    def dump(self, contentType):
        """Dump PubChem content from the object store.

        Args:
            contentType (str): target content to dump (data|index|identifiers)

        Returns:
            (bool): True for success or False otherwise
        """
        cType = contentType.lower()
        ok = False
        if cType == "index":
            ok = self.__pcicP.dump()
        elif cType == "data":
            ok = self.__pcdcP.dump()
        elif cType == "identifiers":
            ok = self.__dumpIdentifiers()

        return ok

    def toStash(self, contentType, useGit=True, useStash=True):
        """Store PubChem extracted content on the remote stash storage resource.

        Args:
            contentType (str): target content to stash (data|index|identifiers)
            useGit (bool): should stash (GitHub) be updated? (default: True)
            useStash (bool): should stash (Buildlocker) be updated? (default: True)
        Returns:
            (bool): True for success or False otherwise
        """
        provider = self.__getStashProvider(contentType)
        if provider is None:
            return False
        return provider.backup(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)

    def fromStash(self, contentType, useStash=True, useGit=True):
        """Fetch PubChem extracted content from the remote stash storage resource.

        Args:
            contentType (str): target content to fetch (data|index|identifiers)
            useStash (bool): should stash (Buildlocker) be used? (default: True)
            useGit (bool): should stash (GitHub) be used? (default: True)
        Returns:
            (bool): True for success or False otherwise
        """
        provider = self.__getStashProvider(contentType)
        if provider is None:
            return False
        return provider.restore(self.__cfgOb, self.__configName, remotePrefix=self.__stashRemotePrefix, useGit=useGit, useStash=useStash)

    def __getStashProvider(self, contentType):
        """Return the provider handling stash operations for the input content type or None if unrecognized.

        The same provider is used for backup and restore so that "identifiers" content is
        restored from the provider that stashed it.
        """
        cType = contentType.lower()
        if cType == "index":
            return self.__pcicP
        if cType == "data":
            return self.__pcdcP
        if cType == "identifiers":
            return self.__pcP
        return None

    def updateIndex(self, **kwargs):
        """Search and store PubChem correspondences for CCD and BIRD reference chemical definitions.

        Args:
            ccUrlTarget (str, optional): target url for chemical component dictionary resource file (default: None=all public)
            birdUrlTarget (str, optional): target url for bird dictionary resource file (cc format) (default: None=all public)
            ccFileNamePrefix (str, optional): index file prefix (default: full)
            rebuildChemIndices (bool, optional): rebuild indices from source (default: False)
            fetchLimit (int, optional): maximum number of definitions to process (default: None)
            exportPath(str, optional): path to export raw PubChem search results (default: None)
            expireDays (int, optional): expiration interval in days passed to the index cache provider (default: 0)
            numProcChemComp (int, optional): number processors to include in multiprocessing mode for ChemComp indices (default: 8)
            numProc (int, optional): number processors to include in multiprocessing mode for PubChem (default: 2)

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            # -- Update/create mapping index cache ---
            ok = self.__pcicP.updateMissing(
                expireDays=kwargs.get("expireDays", 0),
                cachePath=self.__cachePath,
                ccUrlTarget=kwargs.get("ccUrlTarget", None),
                birdUrlTarget=kwargs.get("birdUrlTarget", None),
                ccFileNamePrefix=kwargs.get("ccFileNamePrefix", "full"),
                exportPath=kwargs.get("exportPath", None),
                rebuildChemIndices=kwargs.get("rebuildChemIndices", False),
                fetchLimit=kwargs.get("fetchLimit", None),
                numProcChemComp=kwargs.get("numProcChemComp", 8),
                numProc=kwargs.get("numProc", 2),
            )
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getMatches(self):
        """Return a list of matched PubChem compound identifiers.

        Returns:
            (list, str): list of PubChem compound identifiers
        """
        return self.__pcicP.getMatches()

    def getSelectedMatches(self, **kwargs):
        """
        Return preferred PubChem correspondences from the current match index for the input source
        component build type.  Separately return alternative matches for other source types.

        Args:
            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])

        Returns:
            (dict, dict): mapD { ccId1: [{'pcId': ... , 'inchiKey': ... }], ccId2: ...},
                          altD { ccId1: [{'pcId': ... , 'inchiKey': ... 'sourceType': ... }], ccId2: ...}
        """
        sourceTypes = kwargs.get("sourceTypes", ["model-xyz"])
        mapD, extraMapD = self.__pcicP.getSelectedMatches(exportPath=self.__dirPath, sourceTypes=sourceTypes)
        logger.debug("mapD (%d) extraMapD (%d) %r", len(mapD), len(extraMapD), extraMapD)
        return mapD, extraMapD

    def updateData(self, pcidList, doExport=False, numProc=2):
        """Update PubChem reference data for the input list of compound identifiers.

        Args:
            pcidList (list,str): PubChem compound identifiers
            doExport (bool, optional): store raw fetched data in the wrapper's PubChem cache directory (default: False)
            numProc (int, optional): number of processors to use (default: 2)

        Returns:
            (bool): True for success or False otherwise
        """
        return self.__updateMissingData(pcidList, doExport, numProc)

    def updateMatchedData(self, exportRaw=False, numProc=2):
        """Update PubChem reference data using matched compound identifiers in the current index.

        Args:
            exportRaw (bool, optional): store raw fetched data in the wrapper's PubChem cache directory (default: False)
            numProc (int, optional): number of processors to use (default: 2)

        Returns:
            (bool): True for success or False otherwise
        """
        try:
            pcidList = self.getMatches()
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            return False
        return self.__updateMissingData(pcidList, exportRaw, numProc)

    def __updateMissingData(self, pcidList, doExport, numProc):
        """Load reference data for identifiers missing from the data cache and log any failures."""
        ok = False
        try:
            exportPath = self.__dirPath if doExport else None
            ok, failList = self.__pcdcP.updateMissing(pcidList, exportPath=exportPath, numProc=numProc)
            if failList:
                logger.info("No data updated for %r", failList)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __getPubChemIdentifiers(self, pcidList):
        """Return related identifiers (xrefs) for the input PubChem compound identifier list.

        Args:
            pcidList (list): PubChem compound identifier list

        Returns:
            (dict) :{<pcid>: {'relatedId1': ... 'relatedId2': ... }, ...}

        """
        rD = self.__pcdcP.getRelatedMapping(pcidList)
        logger.info("Related identifier map length (%d)", len(rD))
        return rD

    def updateIdentifiers(self, **kwargs):
        """Update PubChem assigned related identifiers for matching compounds for the input chemical component sourceTypes.

        Args:
            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            sourceTypes = kwargs.get("sourceTypes", ["model-xyz"])
            mapD, _ = self.getSelectedMatches(sourceTypes=sourceTypes)
            # mapD { ccId1: [{'pcId': ... , 'inchiKey': ... }],
            pcIdList = [mD["pcId"] for mDL in mapD.values() for mD in mDL]
            logger.info("pcIdList (%d)", len(pcIdList))
            rD = self.__getPubChemIdentifiers(pcIdList)
            #
            # Update the identifier mappings (in place) with the related identifiers
            for mDL in mapD.values():
                for mD in mDL:
                    pcId = mD["pcId"]
                    if pcId in rD:
                        mD.update(rD[pcId])
            #
            self.__identifierD = mapD
            ok = self.__identifierD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getIdentifiers(self, **kwargs):
        """Get PubChem assigned related identifiers for matching compounds for the input chemical component sourceTypes.

        The identifier mapping is built with updateIdentifiers() when it is not yet available.

        Args:
            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])

        Returns:
            dict: riD { ccId1: [{'pcId': ... , 'inchiKey': ... 'ChEBI': ... 'ChEMBL': ... 'CAS': ... }], ccId2: ...},

        """
        if not self.__identifierD:
            self.updateIdentifiers(**kwargs)
        return self.__identifierD

    def __dumpIdentifiers(self):
        """Hand the current identifier mapping to the PubChem provider as "identifiers" content in json format."""
        rD = self.getIdentifiers()
        ok = self.__pcP.load(rD, "identifiers", fmt="json")
        return ok