rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,638 @@
1
+ ##
2
+ # File: PubChemIndexCacheProvider.py
3
+ # Date: 2-Apr-2020 jdw
4
+ #
5
+ # Utilities to manage chemical component/BIRD to PubChem compound identifier mapping data.
6
+ #
7
+ # Updates:
8
+ # 9-May-2020 jdw separate cache behavior with separate option rebuildChemIndices=True/False
9
+ # 16-Jul-2020 jdw separate index and reference data management.
10
+ # 23-Jul-2021 jdw Make PubChemIndexCacheProvider a subclass of StashableBase()
11
+ # 2-Mar-2023 aae Return correct status from Single proc
12
+ # 8-Apr-2025 dwp Let MultiProc handle chunking; add more logging to debug slowness on west coast
13
+ #
14
+ ##
15
+ __docformat__ = "google en"
16
+ __author__ = "John Westbrook"
17
+ __email__ = "jwest@rcsb.rutgers.edu"
18
+ __license__ = "Apache 2.0"
19
+
20
+ import logging
21
+ import os
22
+ import time
23
+
24
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
25
+ from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
26
+ from rcsb.utils.chem.ChemCompIndexProvider import ChemCompIndexProvider
27
+ from rcsb.utils.chem.ChemCompSearchIndexProvider import ChemCompSearchIndexProvider
28
+ from rcsb.utils.chemref.PubChemUtils import PubChemUtils, ChemicalIdentifier
29
+ from rcsb.utils.io.IoUtil import getObjSize
30
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
31
+ from rcsb.utils.io.StashableBase import StashableBase
32
+ from rcsb.utils.io.TimeUtil import TimeUtil
33
+ from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil
34
+
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
class PubChemUpdateWorker(object):
    """Worker class implementing the interface expected by the multiprocessing utility
    for fetching CCD/BIRD to PubChem chemical compound identifier correspondences.

    Match results are stored in collection "reference_match_index" of database "pubchem_exdb".
    """

    def __init__(self, cfgOb, searchIdxD, **kwargs):
        """Prepare the per-component lookup table and the target collection.

        Args:
            cfgOb (obj): configuration object handed to ObjectUpdater()
            searchIdxD (dict): search index keyed by "<ccId>|..." names; the entries read here
                               may carry "inchi-key", "smiles", "build-type" and "name" items
            **kwargs: accepted for interface compatibility and ignored
        """
        self.__cfgOb = cfgOb
        self.__searchIdxD = searchIdxD
        #
        _ = kwargs
        # Group the search index entries by component identifier (leading "|"-delimited field of the index key)
        self.__lookupD = {}
        for sId, sD in self.__searchIdxD.items():
            ccId = sId.split("|")[0]
            self.__lookupD.setdefault(ccId, []).append(sD)
        self.__databaseName = "pubchem_exdb"
        self.__matchIndexCollectionName = "reference_match_index"
        self.__createCollections(self.__databaseName, self.__matchIndexCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        self.__pcU = PubChemUtils()

    def __genChemIdList(self, ccId):
        """Return a list of ChemicalIdentifier() objects for the input chemical component identifier.

        The InChIKey is preferred over SMILES when a search index entry carries both.  Entries
        carrying neither descriptor are skipped as there is nothing to search with.

        Args:
            ccId (str): chemical component identifier

        Returns:
            (list): list of ChemicalIdentifier() objects corresponding to the input chemical component.
        """
        chemIdList = []
        for sD in self.__lookupD.get(ccId, []):
            # Select the descriptor per entry so that nothing is carried over from a prior entry
            if "inchi-key" in sD:
                idType, descr = "inchikey", sD["inchi-key"]
            elif "smiles" in sD:
                idType, descr = "smiles", sD["smiles"]
            else:
                continue
            chemIdList.append(ChemicalIdentifier(idCode=ccId, identifierSource=sD["build-type"], identifierType=idType, identifier=descr, indexName=sD["name"]))
        return chemIdList

    def updateList(self, dataList, procName, optionsD, workingDir):
        """Search PubChem for each input chemical component/BIRD identifier and store one
        match index record per identifier in the object store.

        Args:
            dataList (list): chemical component/BIRD identifiers to be processed
            procName (str): worker process name (used in log messages)
            optionsD (dict): options - "matchIdOnly" (default True) and "exportPath" (default None)
            workingDir (str): not used

        Returns:
            (list, list, list): identifiers with at least one match, an empty result list and
                                an empty diagnostics list.  The success list is empty if the
                                chunk failed with an exception.

        Example of a stored match index record:

            {
                "rcsb_id" : "local reference ID (ccid|bird)",     << LOCAL CANONICAL ID (e.g. ATP, PRD_000100)
                "rcsb_last_update" : ISODate("2020-04-08T16:26:47.993+0000"),
                "matched_ids" : [
                    {"matched_id": "<external reference ID code>", "search_id_type" : "oe-smiles", "search_id_source": "model-xyz",
                     "source_index_name": <>, "source_inchikey": <>, "source_smiles": <>},
                    {"matched_id": "<external reference ID code>", "search_id_type": ... , "search_id_source": ... , ...}
                ]
            }

        Failed matches are recorded with NO "matched_ids":

            {
                "rcsb_id" : "local reference ID (ccid|bird)",     << LOCAL ID
                "rcsb_last_update" : ISODate("2020-04-08T16:26:48.025+0000"),
            }
        """
        _ = workingDir
        matchIdOnly = optionsD.get("matchIdOnly", True)
        # Path to store raw request data -
        exportPath = optionsD.get("exportPath", None)
        #
        successList = []
        diagList = []
        retList = []
        #
        try:
            startTime = time.time()
            tU = TimeUtil()
            ccIdList = dataList  # len(dataList) should be of size chunkSize
            logger.info("%s search starting for %d reference definitions (matchIdOnly %r exportPath %r)", procName, len(ccIdList), matchIdOnly, exportPath)
            tIdxDL = []
            timeS = tU.getDateTimeObj(tU.getTimestamp())
            for ccId in ccIdList:
                tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
                #
                mL = []
                # Try each of the various forms from the search index -
                for chemId in self.__genChemIdList(ccId):
                    stA = time.time()
                    ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
                    #
                    if not ok:
                        etA = time.time()
                        logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
                        continue
                    if not refDL:
                        continue
                    # The source descriptors depend only on the search form so look them up once per form
                    srcD = self.__searchIdxD.get(chemId.indexName, {})
                    inchiKey = srcD.get("inchi-key")
                    smiles = srcD.get("smiles")
                    for tD in refDL:
                        mL.append(
                            {
                                "matched_id": tD["cid"],
                                "search_id_type": chemId.identifierType,
                                "search_id_source": chemId.identifierSource,
                                "source_index_name": chemId.indexName,
                                "source_smiles": smiles,
                                "source_inchikey": inchiKey,
                            }
                        )
                #
                if mL:
                    tIdxD["matched_ids"] = mL
                    successList.append(ccId)
                else:
                    logger.info("No match result for any form of %s", ccId)
                # Unmatched cases are also recorded (without "matched_ids")
                tIdxDL.append(tIdxD)
            # --
            failList = sorted(set(dataList) - set(successList))
            if failList:
                logger.info("%s returns %d definitions with failures: %r", procName, len(failList), failList)
            # --
            endTime = time.time()
            logger.info("%s completed updateList len %r duration %.3f secs", procName, len(ccIdList), endTime - startTime)
            startTimeL = time.time()
            logger.info("Saving dataList (len=%d)", len(ccIdList))
            self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
            endTimeL = time.time()
            logger.info("Saved chunk (len=%d) in %.3f secs", len(ccIdList), endTimeL - startTimeL)
        except Exception as e:
            logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
            # Records are only saved after the full chunk is processed, so nothing collected
            # so far can be reported as a success.
            successList = []
        logger.info("%s dataList length %d success length %d retList %d", procName, len(dataList), len(successList), len(retList))
        #
        return successList, retList, diagList

    def __updateObjectStore(self, databaseName, collectionName, objDL):
        """Update the input collection with the input list of records, selecting on "rcsb_id".

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            objDL (list): list of record dictionaries (records lacking "rcsb_id" are logged and skipped)

        Returns:
            the value returned by ObjectUpdater.update() (logged as the updated reference count)
        """
        updateDL = []
        for objD in objDL:
            try:
                updateDL.append({"selectD": {"rcsb_id": objD["rcsb_id"]}, "updateD": objD})
            except Exception as e:
                logger.exception("Failing with %s", str(e))
        obUpd = ObjectUpdater(self.__cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.info("Updated reference count is %d", numUpd)
        return numUpd

    def __createCollections(self, databaseName, collectionName, indexAttributeNames=None):
        """Create the input collection (checkExists=True) with the input index attributes.

        Returns:
            the status returned by ObjectUpdater.createCollection()
        """
        obUpd = ObjectUpdater(self.__cfgOb)
        return obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
197
+
198
+
199
class PubChemIndexCacheProvider(StashableBase):
    """Utilities to manage chemical component/BIRD to PubChem compound identifier mapping data.

    The match index is read from and written to collection "reference_match_index" of
    database "pubchem_exdb"; dump files are kept in the "PubChem-index" cache subdirectory.
    """

    def __init__(self, cfgOb, cachePath):
        """
        Args:
            cfgOb (obj): configuration object handed to ObjectExtractor()/ObjectUpdater()
            cachePath (str): top-level cache directory path
        """
        dirName = "PubChem-index"
        super(PubChemIndexCacheProvider, self).__init__(cachePath, [dirName])
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__dirPath = os.path.join(self.__cachePath, dirName)
        #
        self.__databaseName = "pubchem_exdb"
        self.__matchIndexCollectionName = "reference_match_index"
        #
        # Match index content, loaded on first use by getMatchData()
        self.__matchD = None

    def getMatchData(self, expireDays=0):
        """Return the match index content, loading it from the object store on first use.

        The loaded content is retained, so expireDays only has an effect on the call that
        performs the load.

        Args:
            expireDays (int, optional): if positive, select only records whose "rcsb_last_update"
                                        is earlier than this number of days ago. Defaults to 0 (no selection).

        Returns:
            (dict): match index records keyed by "rcsb_id"
        """
        if not self.__matchD:
            selectD = {}
            if expireDays > 0:
                tU = TimeUtil()
                tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
                # Records are written with "rcsb_last_update" (see the collection index attributes).
                # NOTE(review): "$lt" selects the records OLDER than the cutoff - confirm this is what callers expect.
                selectD.update({"rcsb_last_update": {"$lt": tU.getDateTimeObj(tS)}})
            self.__matchD = self.__getReferenceData(self.__databaseName, self.__matchIndexCollectionName, selectD=selectD)
        #
        return self.__matchD

    def testCache(self, minMatch=None, logSizes=False):
        """Check that the match index is not empty and optionally holds a minimum number of records.

        Args:
            minMatch (int, optional): minimum number of match index records required. Defaults to None.
            logSizes (bool, optional): log the size of the match index object. Defaults to False.

        Returns:
            bool: True if the test passes or False otherwise
        """
        self.getMatchData()
        if not self.__matchD:
            return False
        logger.info("Reference data cache lengths: matchD %d", len(self.__matchD))
        if minMatch and len(self.__matchD) < minMatch:
            return False
        #
        if logSizes:
            logger.info("PubChem MatchD %.2f", getObjSize(self.__matchD) / 1000000.0)
        return True

    def __getdumpFilePath(self, fmt="json"):
        """Return the dump file path for the input format (".json" for json, otherwise ".pic")."""
        stashBaseFileName = "pubchem_match_index_object_list"
        fExt = ".json" if fmt == "json" else ".pic"
        return os.path.join(self.__dirPath, stashBaseFileName + fExt)

    def dump(self, fmt="json"):
        """Dump PubChem index reference data from the object store.

        Args:
            fmt (str, optional): format of the dump file (pickle or json). Defaults to "json".

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            self.getMatchData()
            if fmt in ["json", "pickle"]:
                fp = self.__getdumpFilePath(fmt=fmt)
                logger.info("Saving object store to %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                kwargs = {"indent": 3} if fmt == "json" else {}
                ok = mU.doExport(fp, self.__matchD, fmt=fmt, **kwargs)
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        return ok

    def reloadDump(self, fmt="json"):
        """Reload PubChem reference data store from saved dump.

        Args:
            fmt (str, optional): format of the backup file (pickle or json). Defaults to "json".

        Returns:
            (int): object count of the match index collection after the reload (0 on failure).
        """
        numUpd = 0
        try:
            # Read from disk backup and update object store -
            if fmt in ["json", "pickle"]:
                # Use the path matching the requested format (the same path dump() writes)
                fp = self.__getdumpFilePath(fmt=fmt)
                logger.info("Restoring object store from %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                matchD = mU.doImport(fp, fmt=fmt)
                numUpd = self.__reloadDump(matchD, self.__databaseName, self.__matchIndexCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        # --
        return numUpd

    def updateMissing(self, expireDays=0, fetchLimit=None, updateUnmatched=True, numProcChemComp=8, numProc=2, **kwargs):
        """Update match index from object store

        Args:
            expireDays (int): expiration days on match data (default 0 meaning none)
            fetchLimit (int): limit to the number of entry updates performed (None)
            updateUnmatched (bool): Previously unmatched search definitions will be retried on update (default=True)
            numProcChemComp (int): for rebuilding local ChemComp indices the number processors to apply (default=8)
            numProc (int): for rebuilding local PubChem indices the number processors to apply (default=2)
            **kwargs: options passed to the source index builders (molLimit, rebuildChemIndices, logSizes,
                      ccFileNamePrefix, ccUrlTarget, birdUrlTarget, cachePath, limitPerceptions, chunkSize,
                      quietFlag) and to the update workers (exportPath)

        Returns:
            bool: False if the source indices cannot be built or an exception is raised, True otherwise.
                  Definitions left unmatched by the update are reported with a warning and do not
                  change the returned status.

        Failed matches are recorded in the match index with NO "matched_ids":

            {
                "rcsb_id" : "local reference ID (ccid|bird)",     << LOCAL ID
                "rcsb_last_update" : ISODate("2020-04-08T16:26:48.025+0000"),
            }
        """
        #
        failList = []
        try:
            # ---
            # Get the current indices of source chemical reference data -
            ok, ccidxP, ccsidxP = self.__rebuildChemCompSourceIndices(numProcChemComp, **kwargs)
            if not ok:
                return False
            #
            ccIdxD = ccidxP.getIndex()
            searchIdxD = ccsidxP.getIndex()
            # Index of target of local chemical component and BIRD identifiers
            sourceIdList = sorted(ccIdxD.keys())
            logger.info("Reloading chemical reference data (expireDays %r, updateUnmatched %r)", expireDays, updateUnmatched)
            matchedIdList = self.__getMatchIndexIds(searchIdxD, expireDays=expireDays, updateUnmatched=updateUnmatched)
            # --
            logger.info("Starting matched reference identifier count (%d) ", len(matchedIdList))
            updateIdList = sorted(set(sourceIdList) - set(matchedIdList))
            logger.info("Missing chemical definition correspondences %d fetchLimit %r", len(updateIdList), fetchLimit)
            #
            updateIdList = updateIdList[:fetchLimit] if fetchLimit else updateIdList
            #
            if updateIdList:
                logger.info("Update reference data cache for %d chemical identifiers", len(updateIdList))
                ok, failList = self.__updateReferenceData(updateIdList, searchIdxD, numProc, **kwargs)
                logger.info("Update reference data return status is %r missing count %d", ok, len(failList))
            else:
                logger.info("No reference data updates required")
            # --
            if not ok:
                logger.warning("updateMissing completed with status %r failures %r", ok, len(failList))
            #
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        # An exception is always a failure regardless of the status of any completed step
        return False

    def getMatches(self):
        """Get all PubChem correspondences from the current match index.

        Returns:
            (list): PubChem compound identifier codes (unique values in no particular order).
        """
        self.getMatchData()
        #
        pcidList = []
        try:
            pcidS = set()
            for mD in self.__matchD.values():
                for sD in mD.get("matched_ids", []):
                    pcidS.add(sD["matched_id"])
            pcidList = list(pcidS)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return pcidList

    def getSelectedMatches(self, **kwargs):
        """Select preferred PubChem correspondences from the current match index for the input source
        component build types and separately return alternative matches for those components
        having no preferred correspondence.

        Args:
            sourceTypes (list, optional): list of source chemical component build types (default: ["model-xyz"])
            exportPath: (str, optional): directory path in which the preferred correspondences are
                                         exported as "pubchem_matches.json"

        Returns:
            dict, dict : mapD { ccId1: [{'pcId': ... , 'inchiKey': ... }], ccId2: ...},
                         extraMapD { ccId1: [{'pcId': ... , 'inchiKey': ... 'sourceType': ... }], ccId2: ...}
                         where extraMapD only includes components that are missing in mapD.

        Example match index entry:

            {
                "rcsb_id" : "local reference ID (ccid|bird)",     << LOCAL CANONICAL ID (e.g. ATP, PRD_000100)
                "rcsb_last_update" : ISODate("2020-04-08T16:26:47.993+0000"),
                "matched_ids" : [
                    {"matched_id": "<external reference ID code>", "search_id_type" : "oe-smiles", "search_id_source": "model-xyz",
                     "source_index_name": <>, "source_inchikey": <>, "source_smiles": <>},
                    {"matched_id": "<external reference ID code>", "search_id_type": ... , "search_id_source": ... , ...}
                ]
            }
        """
        #
        self.getMatchData()

        sourceTypes = kwargs.get("sourceTypes", ["model-xyz"])
        exportPath = kwargs.get("exportPath", None)
        #
        mapD = {}
        altMapD = {}
        extraMapD = {}
        try:
            for ccId, mD in self.__matchD.items():
                for sD in mD.get("matched_ids", []):
                    if not sD or "search_id_source" not in sD:
                        continue
                    pcId = sD["matched_id"]
                    # A missing source InChIKey should not abort the selection for all components
                    inchiKey = sD.get("source_inchikey")
                    #
                    if sD["search_id_source"] in sourceTypes:
                        mapD.setdefault(ccId, []).append({"pcId": pcId, "inchiKey": inchiKey})
                    else:
                        altMapD.setdefault(ccId, []).append({"pcId": pcId, "inchiKey": inchiKey, "sourceType": sD["search_id_source"]})
            #
            difS = set(altMapD.keys()) - set(mapD.keys())
            logger.info("PubChem preferred correspondence length (%d) alternative extras (%d)", len(mapD), len(difS))
            for ccId in difS:
                extraMapD[ccId] = altMapD[ccId]
            if exportPath:
                fp = os.path.join(exportPath, "pubchem_matches.json")
                mU = MarshalUtil(workPath=exportPath)
                mU.doExport(fp, mapD, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return mapD, extraMapD

    #
    # -- Extract current data from object store --
    def __getMatchIndexIds(self, searchIdxD, expireDays=0, updateUnmatched=True):
        """Get the CCD/BIRD reference data identifiers in the current match index that do NOT
        need to be updated (the caller updates every source identifier missing in this list).

        An identifier is left out of the returned list if its record is older than the
        expiration interval, if it is unmatched and updateUnmatched is set, or if any of its
        matches refers to a search index entry that no longer exists or has a different InChIKey.

        Args:
            searchIdxD (dict): CCD/BIRD search index dictionary
            expireDays (int, optional): expiration interval in days. Defaults to 0 (no expiration).
            updateUnmatched (bool, optional): leave out previously tried but unmatched identifiers
                                              so that these are retried. Defaults to True.

        Returns:
            (list): chemical component/BIRD reference identifier list
        """
        selectD = {}
        if expireDays > 0:
            tU = TimeUtil()
            tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
            # Keep only the records updated within the expiration interval. Records are written
            # with "rcsb_last_update" (see the collection index attributes).
            selectD.update({"rcsb_last_update": {"$gte": tU.getDateTimeObj(tS)}})
        #
        if updateUnmatched:
            # Return only cases with an existing correspondence
            selectD.update({"matched_ids": {"$exists": True}})
        matchD = self.__getReferenceData(self.__databaseName, self.__matchIndexCollectionName, selectD=selectD if selectD else None)
        #
        # For the selected cases in the index-
        retIdList = []
        if searchIdxD:
            # Exclude definitions if source InChIKey in the match index differs with the Key in the current search index.
            for ccId, inD in matchD.items():
                if "matched_ids" not in inD:
                    # Tried but unmatched: keep as current unless these are to be retried
                    if not updateUnmatched:
                        retIdList.append(ccId)
                    continue
                hasChanged = False
                for mD in inD["matched_ids"]:
                    indexName = mD.get("source_index_name")
                    if indexName not in searchIdxD:
                        hasChanged = True
                        logger.info("Identifier %s no longer in search index", indexName)
                        break
                    if mD.get("source_inchikey") != searchIdxD[indexName].get("inchi-key"):
                        logger.info("Identifier %s InChIKey changed search index", indexName)
                        hasChanged = True
                        break
                if not hasChanged:
                    retIdList.append(ccId)
        #
        return sorted(retIdList)

    #
    def __getReferenceData(self, databaseName, collectionName, selectD=None, selectionList=None):
        """Fetch the objects of the input collection satisfying the input selection query.

        Args:
            databaseName (str): source database name
            collectionName (str): source collection name
            selectD (dict, optional): selection query. Defaults to None.
            selectionList (list, optional): passed to ObjectExtractor() as selectionList. Defaults to None.

        Returns:
            the value returned by ObjectExtractor.getObjects() (objects keyed on "rcsb_id")
        """
        logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            keyAttribute="rcsb_id",
            uniqueAttributes=["rcsb_id"],
            selectionQuery=selectD,
            selectionList=selectionList,
            stripObjectId=True,
        )
        docCount = obEx.getCount()
        logger.info("Reference data object count %d", docCount)
        return obEx.getObjects()

    def __updateReferenceData(self, idList, searchIdxD, numProc=2, **kwargs):
        """Launch worker methods to update chemical reference data correspondences.

        Args:
            idList (list): list of local chemical component/BIRD identifiers
            searchIdxD (dict): CCD/BIRD search index dictionary
            numProc (int, optional): number of worker processes (a single in-process worker is used if not > 1)
            exportPath (str, optional): path passed to the workers to store raw request data

        Returns:
            (bool, list): status flag, list of unmatched identifiers
        """
        chunkSize = 10
        exportPath = kwargs.get("exportPath", None)
        logger.info("Length starting list is %d", len(idList))
        optD = {"chunkSize": chunkSize, "exportPath": exportPath, "matchIdOnly": True}
        rWorker = PubChemUpdateWorker(self.__cfgOb, searchIdxD)
        if numProc > 1:
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optD)
            mpu.set(workerObj=rWorker, workerMethod="updateList")
            ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
            logger.info("Multi-proc %r failures %r result lengths %r", ok, len(failList), len(resultList[0]) if resultList else 0)
        else:
            successList, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
            failList = list(set(idList) - set(successList))
            ok = len(failList) == 0
            logger.info("Single-proc status %r failures %r", ok, len(failList))
        #
        if failList:
            # Limit the logged identifiers to the first 100
            if len(failList) <= 100:
                logger.info("failList: %r", failList)
            else:
                logger.info("failList[:100]: %r", failList[:100])
        #
        return ok, failList

    def __reloadDump(self, objD, databaseName, collectionName, indexAttributeNames=None):
        """Internal method to restore the input database/collection using the input data object.

        Args:
            objD (obj): Target reference or index data object
            databaseName (str): target database name
            collectionName (str): target collection name
            indexAttributeNames (list, optional): Primary index attributes. Defaults to None.

        Returns:
            int: object count of the target collection after the update (0 on failure)
        """
        numTotal = 0
        try:
            numUpd = 0
            updateDL = []
            for entityKey, obj in objD.items():
                # Leave out any stored object identifier without changing the caller's object
                updateD = {ky: val for ky, val in obj.items() if ky != "_id"}
                updateDL.append({"selectD": {"rcsb_id": entityKey}, "updateD": updateD})
            #
            obUpd = ObjectUpdater(self.__cfgOb)
            ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
            if ok:
                numUpd = obUpd.update(databaseName, collectionName, updateDL)
                logger.debug("Updated object count is %d", numUpd)
            else:
                logger.error("Create %s %s failed", databaseName, collectionName)
            numTotal = obUpd.count(databaseName, collectionName)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return numTotal

    # --- --- ---
    # -- Load or rebuild source chemical reference data indices --
    def __rebuildChemCompSourceIndices(self, numProc, **kwargs):
        """Rebuild source indices of chemical component definitions.

        Args:
            numProc (int): number of processors to apply in building the search index
            **kwargs: options passed to the index builders

        Returns:
            (bool, obj, obj): combined status, ChemCompIndexProvider() and ChemCompSearchIndexProvider()
                              instances (None for a provider that failed to build)
        """
        logger.info("Rebuilding chemical definition index.")
        ok1, ccidxP = self.__buildChemCompIndex(**kwargs)
        logger.info("__buildChemCompIndex completed with status %r", ok1)
        logger.info("Rebuilding chemical search indices.")
        ok2, ccsidxP = self.__buildChemCompSearchIndex(numProc, **kwargs)
        logger.info("__buildChemCompSearchIndex completed with status %r", ok2)
        return bool(ok1 and ok2), ccidxP, ccsidxP

    def __buildChemCompIndex(self, **kwargs):
        """Build chemical component cache files from the input component dictionaries

        Options read from kwargs: molLimit, rebuildChemIndices, logSizes, ccFileNamePrefix,
        ccUrlTarget, birdUrlTarget and cachePath.

        Returns:
            (bool, obj): status, ChemCompIndexProvider() instance (None on failure)
        """
        try:
            molLimit = kwargs.get("molLimit", None)
            # Cached indices are reused unless a rebuild is requested
            useCache = not kwargs.get("rebuildChemIndices", False)
            logSizes = kwargs.get("logSizes", False)
            ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
            ccUrlTarget = kwargs.get("ccUrlTarget", None)
            birdUrlTarget = kwargs.get("birdUrlTarget", None)
            cachePath = kwargs.get("cachePath", self.__cachePath)
            #
            ccidxP = ChemCompIndexProvider(
                ccUrlTarget=ccUrlTarget, birdUrlTarget=birdUrlTarget, cachePath=cachePath, useCache=useCache, molLimit=molLimit, ccFileNamePrefix=ccFileNamePrefix
            )
            ok = ccidxP.testCache(minCount=molLimit, logSizes=logSizes)
            return ok, ccidxP if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return False, None

    def __buildChemCompSearchIndex(self, numProc, **kwargs):
        """Build search index chemical component cache files from the input component dictionaries

        Options read from kwargs: cachePath, molLimit, rebuildChemIndices, logSizes, limitPerceptions,
        chunkSize, ccFileNamePrefix, quietFlag, ccUrlTarget and birdUrlTarget.

        Args:
            numProc (int): number of processors passed to ChemCompSearchIndexProvider()

        Returns:
            (bool, obj): status, ChemCompSearchIndexProvider() instance (None on failure)
        """
        try:
            cachePath = kwargs.get("cachePath", self.__cachePath)
            molLimit = kwargs.get("molLimit", None)
            # Cached indices are reused unless a rebuild is requested
            useCache = not kwargs.get("rebuildChemIndices", False)
            logSizes = kwargs.get("logSizes", False)
            limitPerceptions = kwargs.get("limitPerceptions", False)
            #
            chunkSize = kwargs.get("chunkSize", 5)
            ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
            quietFlag = kwargs.get("quietFlag", True)
            ccUrlTarget = kwargs.get("ccUrlTarget", None)
            birdUrlTarget = kwargs.get("birdUrlTarget", None)
            #
            ccsiP = ChemCompSearchIndexProvider(
                ccUrlTarget=ccUrlTarget,
                birdUrlTarget=birdUrlTarget,
                cachePath=cachePath,
                useCache=useCache,
                molLimit=molLimit,
                ccFileNamePrefix=ccFileNamePrefix,
                limitPerceptions=limitPerceptions,
                numProc=numProc,
                maxChunkSize=chunkSize,
                quietFlag=quietFlag,
            )
            ok = ccsiP.testCache(minCount=molLimit, logSizes=logSizes)
            return ok, ccsiP if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False, None
File without changes