rcsb_exdb-1.31-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
rcsb/exdb/seq/ReferenceSequenceCacheProvider.py
@@ -0,0 +1,397 @@
+ ##
+ # File: ReferenceSequenceCacheProvider.py
+ # Date: 10-Feb-2020 jdw
+ #
+ # Utilities to cache reference sequence data and mappings.
+ #
+ # Updates:
+ # 8-Apr-2020 jdw change testCache() conditions to specifically track missing matched reference Id codes.
+ #
+ ##
+ __docformat__ = "google en"
+ __author__ = "John Westbrook"
+ __email__ = "jwest@rcsb.rutgers.edu"
+ __license__ = "Apache 2.0"
+
+ import logging
+ from collections import defaultdict
+
+
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
+ from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
+ from rcsb.utils.io.IoUtil import getObjSize
+ from rcsb.utils.io.TimeUtil import TimeUtil
+ from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil
+ from rcsb.utils.seq.UniProtUtils import UniProtUtils
+
+ logger = logging.getLogger(__name__)
+
+
+ class ReferenceUpdateWorker(object):
+     """A skeleton class that implements the interface expected by the multiprocessing
+     utilities (MultiProcUtil) for fetching reference sequences.
+     """
+
+     def __init__(self, cfgOb, **kwargs):
+         self.__cfgOb = cfgOb
+         _ = kwargs
+         self.__refDatabaseName = "uniprot_exdb"
+         self.__refDataCollectionName = "reference_entry"
+         self.__refMatchDataCollectionName = "reference_match"
+         #
+         self.__createCollections(self.__refDatabaseName, self.__refDataCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
+         self.__createCollections(self.__refDatabaseName, self.__refMatchDataCollectionName, indexAttributeNames=["rcsb_id", "rcsb_last_update"])
+
+     def updateList(self, dataList, procName, optionsD, workingDir):
+         """Update the input list of reference sequence identifiers and return
+         matching diagnostics and reference feature data.
+         """
+         _ = workingDir
+         saveText = optionsD.get("saveText", False)
+         fetchLimit = optionsD.get("fetchLimit", None)
+         refDbName = optionsD.get("refDbName", "UniProt")
+         maxChunkSize = optionsD.get("maxChunkSize", 50)
+         successList = []
+         retList1 = []
+         retList2 = []
+         diagList = []
+         emptyList = []
+         #
+         try:
+             tU = TimeUtil()
+             idList = dataList[:fetchLimit] if fetchLimit else dataList
+             logger.info("%s starting fetch for %d %s entries", procName, len(idList), refDbName)
+             if refDbName == "UniProt":
+                 fobj = UniProtUtils(saveText=saveText)
+                 logger.debug("Maximum reference chunk size %d", maxChunkSize)
+                 refD, matchD = fobj.fetchList(idList, maxChunkSize=maxChunkSize)
+                 if len(matchD) == len(idList):
+                     for uId, tD in matchD.items():
+                         tD["rcsb_id"] = uId.strip()
+                         tD["rcsb_last_update"] = tU.getDateTimeObj(tU.getTimestamp())
+                         retList1.append(tD)
+                     for uId, tD in refD.items():
+                         tD["rcsb_id"] = uId.strip()
+                         tD["rcsb_last_update"] = tU.getDateTimeObj(tU.getTimestamp())
+                         retList2.append(tD)
+                     successList.extend(idList)
+                     self.__updateReferenceData(self.__refDatabaseName, self.__refDataCollectionName, retList2)
+                     self.__updateReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName, retList1)
+                 else:
+                     logger.info("Failing fetch for %d entries with matchD %r", len(idList), matchD)
+             else:
+                 logger.error("Unsupported reference database %r", refDbName)
+         except Exception as e:
+             logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
+         logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
+         #
+         return successList, emptyList, emptyList, diagList
+
+     def __updateReferenceData(self, databaseName, collectionName, objDL):
+         updateDL = []
+         for objD in objDL:
+             try:
+                 selectD = {"rcsb_id": objD["rcsb_id"]}
+                 updateDL.append({"selectD": selectD, "updateD": objD})
+             except Exception as e:
+                 logger.exception("Failing with %s", str(e))
+         obUpd = ObjectUpdater(self.__cfgOb)
+         numUpd = obUpd.update(databaseName, collectionName, updateDL)
+         logger.debug("Updated reference count is %d", numUpd)
+
+     def __createCollections(self, databaseName, collectionName, indexAttributeNames=None):
+         obUpd = ObjectUpdater(self.__cfgOb)
+         ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
+         return ok
+
+
+ class ReferenceSequenceCacheProvider(object):
+     """Utilities to cache reference sequence data and correspondence mappings."""
+
+     def __init__(self, cfgOb, databaseName, collectionName, polymerType, siftsProvider=None, maxChunkSize=50, fetchLimit=None, expireDays=14, numProc=1, **kwargs):
+         self.__cfgOb = cfgOb
+         #
+         self.__maxChunkSize = maxChunkSize
+         self.__numProc = numProc
+         #
+         self.__refDatabaseName = "uniprot_exdb"
+         self.__refDataCollectionName = "reference_entry"
+         self.__refMatchDataCollectionName = "reference_match"
+
+         self.__ssP = siftsProvider
+         self.__matchD, self.__refD, self.__missingMatchIds = self.__reload(databaseName, collectionName, polymerType, fetchLimit, expireDays, **kwargs)
+
+     def getMatchInfo(self):
+         return self.__matchD
+
+     def getRefData(self):
+         return self.__refD
+
+     def getMissingMatchedIdCodes(self):
+         return self.__missingMatchIds
+
+     def getDocuments(self, formatType="exchange"):
+         fobj = UniProtUtils(saveText=False)
+         exObjD = fobj.reformat(self.__refD, formatType=formatType)
+         return list(exObjD.values())
+
+     def getRefDataCount(self):
+         return len(self.__refD)
+
+     def testCache(self, minMatchPrimaryPercent=None, logSizes=False, minMissing=0):
+         """Test the state of the reference sequence data relative to the proportion of matched
+         primary sequences in the primary data set.
+
+         Args:
+             minMatchPrimaryPercent (float, optional): minimum acceptable percentage of matching primary accessions. Defaults to None.
+             logSizes (bool, optional): flag to log resource sizes. Defaults to False.
+             minMissing (int, optional): maximum acceptable count of missing matched reference Ids. Defaults to 0.
+
+         Returns:
+             bool: True for success or False otherwise
+         """
+         okC = True  # guard against an undefined value when minMatchPrimaryPercent is not provided
+         try:
+             ok = bool(self.__matchD and self.__refD and self.__missingMatchIds <= minMissing)
+             logger.info("Reference cache lengths: matchD %d refD %d missing matches %d", len(self.__matchD), len(self.__refD), self.__missingMatchIds)
+             if ok:
+                 return ok
+         except Exception as e:
+             logger.error("Failing with unexpected cache state %s", str(e))
+             return False
+         #
+         # -- The remaining check on the matched primary portion is diagnostic and is not currently enforced --
+         #
+         numRef = len(self.__matchD)
+         countD = defaultdict(int)
+         logger.info("Match dictionary length %d", len(self.__matchD))
+         for _, mD in self.__matchD.items():
+             if "matched" in mD:
+                 countD[mD["matched"]] += 1
+         logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
+         if minMatchPrimaryPercent:
+             try:
+                 okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
+             except Exception:
+                 okC = False
+             logger.info("Primary reference match count test status %r", okC)
+         #
+         if logSizes:
+             logger.info(
+                 "RefMatchD %.2f (MB) RefD %.2f (MB)",
+                 getObjSize(self.__matchD) / 1000000.0,
+                 getObjSize(self.__refD) / 1000000.0,
+             )
+         return ok and okC
+
+     def __reload(self, databaseName, collectionName, polymerType, fetchLimit, expireDays, **kwargs):
+         _ = kwargs
+
+         # -- Refresh any expired reference data first --
+         logger.info("Reloading sequence reference data fetchLimit %r expireDays %r", fetchLimit, expireDays)
+         numMissing = self.__refreshReferenceData(expireDays=expireDays, failureFraction=0.75)
+         logger.info("Reference identifiers expired/missing %d", numMissing)
+         # --
+         refIdMapD = {}
+         matchD = {}
+         refD = {}
+         failList = []
+         #
+         # assignRefD: Dict of all entities of polymerType "Protein" (or other), with the associated container_identifiers and other info as corresponding values
+         assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
+         logger.info("Polymer reference sequence assignments %d (assignRefD)", len(assignRefD))
+         #
+         # refIdMapD: Dict of all *unique* UniProt Ids of entities that have:
+         #     "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.provenance_source": "PDB",
+         #     "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_name": "UniProt",
+         #     "entity_poly.rcsb_entity_polymer_type": "Protein"
+         # Values are the lists of entities that have those UniProt IDs,
+         # i.e. refIdMapD[<database_accession>] = [entity_key1, entity_key2, ...]
+         # This will usually contain only several hundred to a few thousand IDs
+         refIdMapD, _ = self.__getAssignmentMap(assignRefD)
+         logger.info("Reference ID assignment map length %d (refIdMapD)", len(refIdMapD))
+         #
+         # List of all entry IDs for entities in assignRefD (will contain duplicates for entries with >1 entity)
+         entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
+         #
+         # List of *unique* UniProt IDs from SIFTS for all protein (or, "polymerType") entries currently in ExDB
+         siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID") if self.__ssP else []
+         logger.info("Incorporating all %d SIFTS accessions for %d entities", len(siftsUniProtL), len(entryIdL))
+         #
+         # unpIdList: List of all *unique* UniProt IDs combined from 'refIdMapD' and 'siftsUniProtL'.
+         # Since not everything will be covered by SIFTS, this will be slightly larger than siftsUniProtL
+         unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
+         logger.info("UniProt ID list length %d (unpIdList)", len(unpIdList))
+         #
+         # cacheUnpIdList: List of UniProt IDs from uniprot_exdb.reference_match, from today backwards
+         cacheUnpIdList = self.__getReferenceDataIds(expireDays=0)
+         logger.info("Using %d cached reference sequences", len(cacheUnpIdList))
+         #
+         # updateUnpIdList: List of the *delta* UniProt IDs between what is possible based on the entity collections (unpIdList)
+         # and what is already in uniprot_exdb.reference_match (cacheUnpIdList)
+         updateUnpIdList = sorted(set(unpIdList) - set(cacheUnpIdList))
+         logger.info("UniProt list lengths (unique): set(unpIdList) %d - set(cacheUnpIdList) %d", len(set(unpIdList)), len(set(cacheUnpIdList)))
+         #
+         if updateUnpIdList:
+             logger.info("Updating cache for %d UniProt accessions (consolidated PDB + SIFTS)", len(updateUnpIdList))
+             ok, failList = self.__updateReferenceData(updateUnpIdList)
+             logger.info("Fetch references update status is %r missing count %d", ok, len(failList))
+         else:
+             logger.info("No reference sequence updates required")
+         #
+         matchD = self.__getReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName)
+         refD = self.__getReferenceData(self.__refDatabaseName, self.__refDataCollectionName)
+         logger.info("Completed - returning match length %d and reference data length %d num missing %d", len(matchD), len(refD), len(failList))
+         return matchD, refD, len(failList)
+
+     def __refreshReferenceData(self, expireDays=14, failureFraction=0.75):
+         """Update expired reference data and purge any obsolete data, not exceeding the
+         input failureFraction.
+
+         Args:
+             expireDays (int, optional): expiration interval in days. Defaults to 14.
+             failureFraction (float, optional): fractional limit of obsolete entries purged. Defaults to 0.75.
+
+         Returns:
+             (int): number of expired identifiers that could not be updated (and were purged)
+
+         """
+         idList = self.__getReferenceDataIds(expireDays=expireDays)
+         logger.info("Expired (days=%d) reference identifiers %d", expireDays, len(idList))
+         if not idList:
+             return 0
+         #
+         ok, failList = self.__updateReferenceData(idList)
+         logger.info("After reference update (status=%r) missing expired match identifiers %d", ok, len(failList))
+         tFrac = float(len(failList)) / float(len(idList))
+         if tFrac < failureFraction:
+             obUpd = ObjectUpdater(self.__cfgOb)
+             selectD = {"rcsb_id": failList}
+             numPurge = obUpd.delete(self.__refDatabaseName, self.__refMatchDataCollectionName, selectD)
+             if len(failList) != numPurge:
+                 logger.info("Update match failures %d purge count %d", len(failList), numPurge)
+             numPurge = obUpd.delete(self.__refDatabaseName, self.__refDataCollectionName, selectD)
+             if len(failList) != numPurge:
+                 logger.info("Update reference data failures %d purge count %d", len(failList), numPurge)
+         return len(failList)
+
+     def __getReferenceDataIds(self, expireDays=14):
+         """Get reference data identifiers subject to an expiration interval
+         (i.e. not updated within the last expireDays days).
+
+         Args:
+             expireDays (int, optional): expiration interval in days. Defaults to 14.
+
+         Returns:
+             (list): reference identifier list
+         """
+         selectD = None
+         if expireDays > 0:
+             tU = TimeUtil()
+             tS = tU.getTimestamp(useUtc=True, before={"days": expireDays})
+             selectD = {"rcsb_last_update": {"$lt": tU.getDateTimeObj(tS)}}  # attribute name matches the indexed "rcsb_last_update" set in updateList()
+         matchD = self.__getReferenceData(self.__refDatabaseName, self.__refMatchDataCollectionName, selectD=selectD)
+         return sorted(matchD.keys())
+
+     def __updateReferenceData(self, idList):
+         numProc = self.__numProc
+         chunkSize = self.__maxChunkSize
+         logger.info("Length of starting list is %d", len(idList))
+         optD = {"maxChunkSize": chunkSize}
+         rWorker = ReferenceUpdateWorker(self.__cfgOb)
+         mpu = MultiProcUtil(verbose=True)
+         mpu.setOptions(optD)
+         mpu.set(workerObj=rWorker, workerMethod="updateList")
+         ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
+         logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
+         return ok, failList
+
+     def __getReferenceData(self, databaseName, collectionName, selectD=None):
+         logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
+         obEx = ObjectExtractor(
+             self.__cfgOb,
+             databaseName=databaseName,
+             collectionName=collectionName,
+             keyAttribute="rcsb_id",
+             uniqueAttributes=["rcsb_id"],
+             selectionQuery=selectD,
+         )
+         docCount = obEx.getCount()
+         logger.debug("Reference data match count %d", docCount)
+         objD = obEx.getObjects()
+         return objD
+
+     def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
+         """Get all accessions assigned to the input reference sequence database for the input polymerType.
+
+         Returns:
+             (dict): {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
+                                 "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
+         """
+         objD = {}
+         try:
+             obEx = ObjectExtractor(
+                 self.__cfgOb,
+                 databaseName=databaseName,
+                 collectionName=collectionName,
+                 cacheFilePath=None,
+                 useCache=False,
+                 keyAttribute="entity",
+                 uniqueAttributes=["rcsb_id"],
+                 cacheKwargs=None,
+                 objectLimit=fetchLimit,
+                 selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
+                 selectionList=[
+                     "rcsb_id",
+                     "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
+                     "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
+                     "rcsb_entity_source_organism.ncbi_taxonomy_id",
+                 ],
+             )
+             eCount = obEx.getCount()
+             logger.info("Polymer entity count type %s is %d", polymerType, eCount)
+             objD = obEx.getObjects()
+             logger.info("Reading polymer entity count %d reference accession length %d", eCount, len(objD))
+             #
+         except Exception as e:
+             logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
+         return objD
+
+     def __getAssignmentMap(self, polymerEntityObjD):
+         referenceDatabaseName = "UniProt"
+         provSource = "PDB"
+         refIdD = defaultdict(list)
+         taxIdD = defaultdict(list)
+         numMissing = 0
+         numMissingTaxons = 0
+         for entityKey, eD in polymerEntityObjD.items():
+             try:
+                 accS = set()
+                 for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
+                     if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
+                         accS.add(tD["database_accession"])
+                         refIdD[tD["database_accession"]].append(entityKey)
+                         #
+                         # pick up the corresponding taxonomy -
+                         try:
+                             taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
+                         except Exception:
+                             logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
+                             numMissingTaxons += 1
+
+                 logger.debug("PDB assigned sequences length %d", len(accS))
+             except Exception as e:
+                 numMissing += 1
+                 logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
+         #
+         numMultipleTaxons = 0
+         for refId, taxIdL in taxIdD.items():
+             taxIdL = list(set(taxIdL))
+             if len(taxIdL) > 1:
+                 logger.debug("Multiple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
+                 numMultipleTaxons += 1
+
+         logger.info("Entities with missing taxonomy %d", numMissingTaxons)
+         logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
+         logger.info("Unique %s accession assignments by %s %d (entities missing archive accession assignments %d)", referenceDatabaseName, provSource, len(refIdD), numMissing)
+         return refIdD, taxIdD
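
For orientation, a minimal usage sketch for ReferenceSequenceCacheProvider follows. The configuration file path and section name are illustrative assumptions, and a ConfigUtil object bound to a running MongoDB instance populated with the pdbx_core collections is required. Note that ReferenceUpdateWorker is not normally called directly; it is invoked through MultiProcUtil, as in __updateReferenceData() above.

# Hypothetical usage sketch (assumed config path/section; requires MongoDB with pdbx_core loaded).
from rcsb.utils.config.ConfigUtil import ConfigUtil
from rcsb.exdb.seq.ReferenceSequenceCacheProvider import ReferenceSequenceCacheProvider

cfgOb = ConfigUtil(configPath="./exdb-config-example.yml", defaultSectionName="site_info_configuration")
rscP = ReferenceSequenceCacheProvider(
    cfgOb,
    databaseName="pdbx_core",
    collectionName="pdbx_core_polymer_entity",
    polymerType="Protein",
    maxChunkSize=50,
    numProc=2,
)
if rscP.testCache(minMissing=0):
    matchD = rscP.getMatchInfo()   # accession -> match diagnostics
    refD = rscP.getRefData()       # accession -> reference feature data
    docL = rscP.getDocuments()     # exchange-format documents ready for loading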
rcsb/exdb/seq/TaxonomyExtractor.py
@@ -0,0 +1,69 @@
+ ##
+ # File: TaxonomyExtractor.py
+ # Date: 15-Oct-2019 jdw
+ #
+ # Utilities to extract taxonomy details from the core entity collection.
+ #
+ # Updates:
+ #
+ ##
+ __docformat__ = "google en"
+ __author__ = "John Westbrook"
+ __email__ = "jwest@rcsb.rutgers.edu"
+ __license__ = "Apache 2.0"
+
+ import logging
+
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
+
+ logger = logging.getLogger(__name__)
+
+
+ class TaxonomyExtractor(object):
+     """Utilities to extract taxonomy details from the core entity collection."""
+
+     def __init__(self, cfgOb):
+         self.__cfgOb = cfgOb
+         self.__databaseName = "pdbx_core"
+         self.__collectionName = "pdbx_core_polymer_entity"
+
+     def getUniqueTaxons(self):
+         taxIdL = self.__extractEntityTaxons()
+         return taxIdL
+
+     def __extractEntityTaxons(self):
+         """Extract unique entity source and host organism taxonomies."""
+         try:
+             obEx = ObjectExtractor(
+                 self.__cfgOb,
+                 databaseName=self.__databaseName,
+                 collectionName=self.__collectionName,
+                 cacheFilePath=None,
+                 useCache=False,
+                 keyAttribute="entity",
+                 uniqueAttributes=["rcsb_id"],
+                 cacheKwargs=None,
+                 objectLimit=None,
+                 # selectionQuery={"entity.type": "polymer"},
+                 selectionQuery=None,
+                 selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
+             )
+             eCount = obEx.getCount()
+             logger.info("Polymer entity count is %d", eCount)
+             taxIdS = set()
+             objD = obEx.getObjects()
+             for _, eD in objD.items():
+                 try:
+                     for tD in eD["rcsb_entity_source_organism"]:
+                         taxIdS.add(tD["ncbi_taxonomy_id"])
+                 except Exception:
+                     pass
+                 try:
+                     for tD in eD["rcsb_entity_host_organism"]:
+                         taxIdS.add(tD["ncbi_taxonomy_id"])
+                 except Exception:
+                     pass
+             logger.info("Unique taxons %d", len(taxIdS))
+             return list(taxIdS)
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+             return []  # fall back to an empty list so callers always receive a list
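
A correspondingly small sketch for TaxonomyExtractor; cfgOb is again an assumed ConfigUtil object bound to a populated pdbx_core database, as in the previous example.

# Hypothetical usage sketch; cfgOb as in the sketch above.
from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor

txE = TaxonomyExtractor(cfgOb)
taxIdL = txE.getUniqueTaxons()  # unique NCBI taxonomy ids across source and host organisms
print("Unique taxon count: %d" % len(taxIdL))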
rcsb/exdb/seq/UniProtCoreEtlWorker.py
@@ -0,0 +1,177 @@
+ ##
+ # File: UniProtCoreEtlWorker.py
+ # Date: 9-Dec-2019 jdw
+ #
+ # ETL utilities for processing and loading UniProt core collection reference data.
+ #
+ # Updates:
+ #
+ #
+ ##
+ __docformat__ = "google en"
+ __author__ = "John Westbrook"
+ __email__ = "jwest@rcsb.rutgers.edu"
+ __license__ = "Apache 2.0"
+
+ import logging
+
+ from jsonschema import Draft4Validator
+ from jsonschema import FormatChecker
+
+ from rcsb.db.helpers.DocumentDefinitionHelper import DocumentDefinitionHelper
+ from rcsb.db.mongo.DocumentLoader import DocumentLoader
+ from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
+ from rcsb.db.utils.SchemaProvider import SchemaProvider
+ from rcsb.exdb.seq.ReferenceSequenceAssignmentProvider import ReferenceSequenceAssignmentProvider
+
+ #
+
+ logger = logging.getLogger(__name__)
+
+
+ class UniProtCoreEtlWorker(object):
+     """Prepare and load UniProt 'core' sequence reference data collections."""
+
+     def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
+         self.__cfgOb = cfgOb
+         self.__cachePath = cachePath
+         self.__useCache = useCache
+         self.__readBackCheck = readBackCheck
+         self.__numProc = numProc
+         self.__chunkSize = chunkSize
+         self.__maxStepLength = maxStepLength
+         self.__documentLimit = documentLimit
+         #
+         self.__resourceName = "MONGO_DB"
+         self.__verbose = verbose
+         self.__statusList = []
+         self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=self.__useCache)
+         self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
+         self.__valInst = None
+         self.__doValidate = doValidate
+         #
+
+     def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
+         try:
+             sFlag = "Y" if status else "N"
+             desp = DataExchangeStatus()
+             desp.setStartTime(tS=startTimestamp)
+             desp.setObject(databaseName, collectionName)
+             desp.setStatus(updateId=updateId, successFlag=sFlag)
+             desp.setEndTime()
+             self.__statusList.append(desp.getStatus())
+             return True
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+             return False
+
+     def __getReferenceSequenceProvider(self):
+         """Instantiate the reference sequence assignment provider and verify its cache state."""
+         try:
+             rsaP = ReferenceSequenceAssignmentProvider(
+                 self.__cfgOb,
+                 databaseName="pdbx_core",
+                 collectionName="pdbx_core_polymer_entity",
+                 polymerType="Protein",
+                 referenceDatabaseName="UniProt",
+                 provSource="PDB",
+                 useCache=self.__useCache,
+                 cachePath=self.__cachePath,
+                 fetchLimit=self.__documentLimit,
+                 siftsAbbreviated="TEST",
+             )
+             ok = rsaP.testCache()
+             return ok, rsaP
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+             return False, None  # return a (status, provider) pair so the caller's unpacking succeeds
+
+     def load(self, updateId, extResource, loadType="full"):
+         """Load sequence reference data"""
+         try:
+             self.__statusList = []
+             desp = DataExchangeStatus()
+             statusStartTimestamp = desp.setStartTime()
+             #
+             dList, indexL = [], []
+             databaseName = collectionName = collectionVersion = None
+             addValues = {}
+             #
+             if extResource == "UniProt":
+                 databaseName = "uniprot_core"
+                 # configName = self.__cfgOb.getDefaultSectionName()
+                 # dirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", self.__cfgOb.getDefaultSectionName()))
+                 #
+                 ok, rsP = self.__getReferenceSequenceProvider()
+                 if not ok:
+                     return False
+                 #
+                 dList = rsP.getDocuments()
+                 logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
+                 logger.debug("Objects %r", dList[:2])
+                 #
+                 cDL = self.__docHelper.getCollectionInfo(databaseName)
+                 collectionName = cDL[0]["NAME"]
+                 collectionVersion = cDL[0]["VERSION"]
+                 indexL = self.__docHelper.getDocumentIndexAttributes(collectionName, "primary")
+                 logger.info("Database %r collection %r version %r index attributes %r", databaseName, collectionName, collectionVersion, indexL)
+             else:
+                 logger.error("Unsupported external resource %r", extResource)
+             #
+             if self.__doValidate:
+                 self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
+                 for dObj in dList:
+                     self.__validateObj(databaseName, collectionName, dObj, label="Original")
+             #
+             dl = DocumentLoader(
+                 self.__cfgOb,
+                 self.__cachePath,
+                 self.__resourceName,
+                 numProc=self.__numProc,
+                 chunkSize=self.__chunkSize,
+                 maxStepLength=self.__maxStepLength,
+                 documentLimit=self.__documentLimit,
+                 verbose=self.__verbose,
+                 readBackCheck=self.__readBackCheck,
+             )
+             #
+             ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
+             okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
+
+             return ok and okS
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+             return False
+
+     def getLoadStatus(self):
+         return self.__statusList
+
+     def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
+         # _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
+         # cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
+         logger.info("Fetch schema for %r %r validation level %r", databaseName, collectionName, schemaLevel)
+         cD = self.__schP.getJsonSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel)
+         # Raises exceptions for schema compliance.
+         Draft4Validator.check_schema(cD)
+         valInst = Draft4Validator(cD, format_checker=FormatChecker())
+         return valInst
+
+     def __validateObj(self, databaseName, collectionName, rObj, label=""):
+         try:
+             eCount = 0
+             tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
+             for error in sorted(self.__valInst.iter_errors(rObj), key=str):
+                 logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
+                 logger.debug(">>> Failing object is %r", rObj)
+                 if "rcsb_uniprot_feature" in rObj:
+                     for dd in rObj["rcsb_uniprot_feature"]:
+                         if "feature_id" in dd:
+                             logger.info("feature_id %r", dd["feature_id"])
+                         else:
+                             logger.info("no feature_id keys %r", sorted(dd.keys()))
+                             logger.info("description %r", dd["description"])
+                 eCount += 1
+         except Exception as e:
+             logger.exception("Validation failing %s", str(e))
+
+         return eCount
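
Finally, a sketch of the UniProtCoreEtlWorker load cycle; the updateId, cache path, and worker settings are illustrative values only, and a configured rcsb.db schema/document pipeline plus MongoDB access are assumed.

# Hypothetical usage sketch; updateId and cachePath are illustrative values only.
from rcsb.exdb.seq.UniProtCoreEtlWorker import UniProtCoreEtlWorker

worker = UniProtCoreEtlWorker(cfgOb, cachePath="./CACHE", numProc=2, chunkSize=10, doValidate=False)
ok = worker.load(updateId="2020_10", extResource="UniProt", loadType="full")
for statusD in worker.getLoadStatus():
    print(statusD)  # per-collection data exchange status records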