rcsb.exdb 1.31 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
rcsb/exdb/examples-seq/EntityPolymerExtractor.py
@@ -0,0 +1,544 @@
+ ##
+ # File: EntityPolymerExtractor.py
+ # Date: 19-Feb-2019 jdw
+ #
+ # Selected utilities to extract entity polymer mapping and feature data
+ # from the exchange database schema.
+ #
+ # Updates:
+ #
+ #
+ ##
+ __docformat__ = "google en"
+ __author__ = "John Westbrook"
+ __email__ = "jwest@rcsb.rutgers.edu"
+ __license__ = "Apache 2.0"
+
+ import copy
+ import logging
+ import os
+
+ from rcsb.db.mongo.Connection import Connection
+ from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class EntityPolymerExtractor(object):
+     """Utilities to extract polymer related data from entry and entity collections."""
+
+     def __init__(self, cfgOb, **kwargs):
+         self.__cfgOb = cfgOb
+         self.__resourceName = "MONGO_DB"
+         self.__mU = MarshalUtil()
+         self.__entryD, self.__authAsymIdIndex = self.__rebuildCache(**kwargs)
+         #
+
+     def __rebuildCache(self, **kwargs):
+         useCache = kwargs.get("useCache", True)
+         dirPath = kwargs.get("exdbDirPath", ".")
+         cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
+         #
+         ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
+         fn = "entity-polymer-extracted-data-cache" + "." + ext
+         cacheFilePath = os.path.join(dirPath, fn)
+         #
+         cD = {"entryD": {}, "authIdxD": {}}
+         try:
+             self.__mU.mkdir(dirPath)
+             if not useCache:
+                 for fp in [cacheFilePath]:
+                     try:
+                         os.remove(fp)
+                     except Exception:
+                         pass
+
+             if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
+                 cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
+             else:
+                 entryD = self.__selectEntries(**kwargs)
+                 entryD = self.__selectPolymerEntities(entryD, **kwargs)
+                 authIdxD = self.__buildIndices(entryD)
+                 cD["entryD"] = entryD
+                 cD["authIdxD"] = authIdxD
+                 if cacheFilePath:
+                     ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
+                     logger.info("Saved entity-polymer extracted results (%d) status %r in %s", len(entryD), ok, cacheFilePath)
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+         return cD["entryD"], cD["authIdxD"]
+
+     def __buildIndices(self, entryD):
+         indD = {}
+         for entryId, eD in entryD.items():
+             entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+             for entityId, pD in entityD.items():
+                 for authAsymId in pD["auth_asym_ids"]:
+                     # avoid tuples for json serialization
+                     # indD[(entryId, authAsymId)] = entityId
+                     indD[entryId + "_" + authAsymId] = entityId
+         return indD
+
+     def getEntryCount(self):
+         return len(self.__entryD)
+
+     def getRefSeqAccessions(self, dbName):
+         acL = []
+         try:
+             for _, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for _, pD in entityD.items():
+                     for dD in pD["struct_ref"]:
+                         if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
+                             acL.append(dD["pdbx_db_accession"])
+             return list(set(acL))
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+
+         return acL
+
+     def countRefSeqAccessions(self, dbName):
+         cD = {}
+         try:
+             for _, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for _, pD in entityD.items():
+                     iCount = 0
+                     for dD in pD["struct_ref"]:
+                         if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
+                             iCount += 1
+                     cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+
+         return cD
+
+     def countRefSeqAccessionDbType(self):
+         cD = {}
+         try:
+             for _, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for _, pD in entityD.items():
+                     for dD in pD["struct_ref"]:
+                         if "pdbx_db_accession" in dD and "db_name" in dD:
+                             cD[dD["db_name"]] = cD[dD["db_name"]] + 1 if dD["db_name"] in cD else 1
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+
+         return cD
+
+     def countRefSeqAccessionAny(self):
+         cD = {}
+         try:
+             for _, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for _, pD in entityD.items():
+                     iCount = len(pD["struct_ref"])
+                     # if iCount == 0:
+                     #     logger.info("entryId %r " % (entryId, entityId))
+                     cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+
+         return cD
+
+     def getUniqueTaxons(self):
+         #
+         tD = {}
+         try:
+             for _, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for _, pD in entityD.items():
+                     # logger.info("Entity dictionary %r", pD.keys())
+                     if "rcsb_entity_source_organism" in pD:
+                         for dd in pD["rcsb_entity_source_organism"]:
+                             if "ncbi_taxonomy_id" in dd:
+                                 tD[dd["ncbi_taxonomy_id"]] = tD[dd["ncbi_taxonomy_id"]] + 1 if dd["ncbi_taxonomy_id"] in tD else 1
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+         logger.info("Taxon coverage %d", len(tD))
+         return tD
+
+     def getOrigTaxons(self):
+         #
+         tD = {}
+         try:
+             for entryId, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for entityId, pD in entityD.items():
+                     # logger.info("Entity dictionary %r", pD.keys())
+                     if "original_taxonomy_ids" in pD:
+                         for tV in pD["original_taxonomy_ids"]:
+                             tD.setdefault(entryId, []).append((entityId, tV))
+                 if entryId not in tD:
+                     logger.debug("No taxonomy for %s", entryId)
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+         logger.info("Taxon coverage %d", len(tD))
+         return tD
+
+     def countRefSeqAccessionByTaxon(self, dbNameList=None):
+         #
+         tD = {}
+         iCount = 0
+         #
+         try:
+             for _, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for _, pD in entityD.items():
+                     # logger.info("Entity dictionary %r", pD.keys())
+                     if "rcsb_entity_source_organism" in pD:
+                         for dd in pD["rcsb_entity_source_organism"]:
+                             if "ncbi_taxonomy_id" in dd:
+                                 tId = dd["ncbi_taxonomy_id"]
+                                 for dD in pD["struct_ref"]:
+                                     if "pdbx_db_accession" in dD and "db_name" in dD:
+                                         if dD["db_name"] in dbNameList:
+                                             tD.setdefault(tId, []).append(dD["pdbx_db_accession"])
+                                             iCount += 1
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+
+         logger.info("Total observed accessions %d", iCount)
+         return tD
+
+     def checkRefSeqAlignRange(self, dbName):
+         ok = True
+         try:
+             eCount = 0
+             aCount = 0
+             tCount = 0
+             for entryId, eD in self.__entryD.items():
+                 entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
+                 for entityId, pD in entityD.items():
+                     for dD in pD["struct_ref"]:
+                         if "db_name" in dD and dD["db_name"] == dbName:
+                             if "pdbx_db_accession" in dD and "alignD" in dD and "pdbx_seq_one_letter_code" in dD and "pdbx_align_begin" in dD:
+                                 seqLen = len(dD["pdbx_seq_one_letter_code"])
+                                 dbBegin = 100000000
+                                 dbEnd = -1
+                                 refSeqDbBegin = dD["pdbx_align_begin"]
+                                 for authAsymId, alDL in dD["alignD"].items():
+                                     tCount += 1
+                                     difL = []
+                                     for alD in alDL:
+                                         tBeg = alD["db_align_beg"]
+                                         tEnd = alD["db_align_end"]
+                                         tDif = tEnd - tBeg + 1
+                                         difL.append(tDif)
+                                         dbBegin = min(tBeg, dbBegin)
+                                         dbEnd = max(tEnd, dbEnd)
+
+                                     # range calculation is off by one -
+                                     # if seqLen < dbEnd - dbBegin + 1:
+                                     if seqLen < dbEnd - dbBegin and not refSeqDbBegin == dbBegin:
+                                         fDif = sum(difL)
+                                         logger.debug(
+                                             "Bad alignment for %r %r %r %r (%d) seqLen %r (%d) dbBegin %r dbEnd %r difL %r tDif %r",
+                                             entryId,
+                                             entityId,
+                                             authAsymId,
+                                             alD["pdbx_strand_id"],
+                                             len(alDL),
+                                             seqLen,
+                                             dbEnd - dbBegin + 1,
+                                             dbBegin,
+                                             dbEnd,
+                                             difL,
+                                             fDif,
+                                         )
+                                         aCount += 1
+
+                             else:
+                                 eCount += 1
+             logger.info("Incomplete %s struct_ref record count %d", dbName, eCount)
+             logger.info("Inconsistent %s db reference alignments %d/%d", dbName, aCount, tCount)
+
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+             ok = False
+
+         return ok
+
+     def getEntityRefSeqAccessions(self, dbName, entryId, entityId):
+         acL = []
+         try:
+             dL = self.__entryD[entryId]["selected_polymer_entities"][entityId]["struct_ref"]
+             acL = list(set([d["pdbx_db_accession"] for d in dL if d["db_name"] == dbName]))
+         except Exception as e:
+             logger.exception("Failing with %s %r %r %s", dbName, entryId, entityId, str(e))
+         return acL
+
+     def __selectEntries(self, **kwargs):
+         """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit)"""
+
+         dbName = kwargs.get("dbName", "pdbx_core")
+         collectionName = kwargs.get("collectionName", "pdbx_core_entry")
+         selectionQueryD = kwargs.get("entrySelectionQuery", {})
+         #
+         entryD = {}
+         try:
+             with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
+                 mg = MongoDbUtil(client)
+                 if mg.collectionExists(dbName, collectionName):
+                     logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
+                     qD = {}
+                     if selectionQueryD:
+                         qD.update(selectionQueryD)
+                     selectL = ["rcsb_entry_container_identifiers"]
+                     dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
+                     logger.info("Selection %r fetch result count %d", selectL, len(dL))
+                     #
+                     for dD in dL:
+                         #
+                         if (
+                             ("rcsb_entry_container_identifiers" in dD)
+                             and ("entry_id" in dD["rcsb_entry_container_identifiers"])
+                             and ("polymer_entity_ids" in dD["rcsb_entry_container_identifiers"])
+                             and dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]
+                         ):
+                             entryD[dD["rcsb_entry_container_identifiers"]["entry_id"]] = {"polymer_entity_ids": dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]}
+
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+         return entryD
+         #
+
+     def __selectPolymerEntities(self, entryD, **kwargs):
+         """Skeleton entity selector recovering essential biological sequence mapping features
+         for macromolecules (default type = protein).
+
+         "1CP9": {
+             "polymer_entity_ids": [
+                 "1",
+                 "2"
+             ],
+             "selected_polymer_entities": {
+                 "1": {
+                     "rcsb_multiple_source_flag": "N",
+                     "asym_ids": [
+                         "A"
+                     ],
+                     "auth_asym_ids": [
+                         "A"
+                     ],
+                     "entity_id": "1",
+                     "type": "polypeptide(L)",
+                     "rcsb_entity_polymer_type": "Protein",
+                     "rcsb_entity_source_organism": [
+                         {
+                             "ncbi_taxonomy_id": 587,
+                             "beg_seq_num": 1,
+                             "end_seq_num": 205,
+                             "ncbi_scientific_name": "Providencia rettgeri"
+                         }
+                     ],
+                     "struct_ref": [
+                         {
+                             "id": "1",
+                             "db_name": "UNP",
+                             "pdbx_db_accession": "Q7WZI9",
+                             "entity_id": "1",
+                             "pdbx_seq_one_letter_code": "QSTQIKIERDNYGVPHIYANDTYSLFYGYGYA...",
+                             "alignD": {
+                                 "A": [
+                                     {
+                                         "align_id": "1",
+                                         "ref_id": "1",
+                                         "pdbx_PDB_id_code": "1CP9",
+                                         "pdbx_strand_id": "A",
+                                         "seq_align_beg": 1,
+                                         "seq_align_end": 205,
+                                         "pdbx_db_accession": "Q7WZI9",
+                                         "db_align_beg": 24,
+                                         "db_align_end": 228,
+                                         "pdbx_auth_seq_align_beg": "1",
+                                         "pdbx_auth_seq_align_end": "205",
+                                         "rcsb_entity_id": "1"
+                                     }
+                                 ]
+                             }
+                         }
+                     ]
+                 },
+                 "2": {
+                     "rcsb_multiple_source_flag": "N",
+                     "asym_ids": [
+                         "B"
+                     ],
+                     "auth_asym_ids": [
+                         "B"
+                     ],
+                     "entity_id": "2",
+                     "type": "polypeptide(L)",
+                     "rcsb_entity_polymer_type": "Protein",
+                     "rcsb_entity_source_organism": [
+                         {
+                             "ncbi_taxonomy_id": 587,
+                             "beg_seq_num": 1,
+                             "end_seq_num": 553,
+                             "ncbi_scientific_name": "Providencia rettgeri"
+                         }
+                     ],
+                     "struct_ref": [
+                         {
+                             "id": "2",
+                             "db_name": "UNP",
+                             "pdbx_db_accession": "Q7WZI9",
+                             "entity_id": "2",
+                             "pdbx_seq_one_letter_code": "SNVWLVGKTKASGAKAILLNGPQFGWFNPAYTYGIGLHG",
+                             "alignD": {
+                                 "B": [
+                                     {
+                                         "align_id": "2",
+                                         "ref_id": "2",
+                                         "pdbx_PDB_id_code": "1CP9",
+                                         "pdbx_strand_id": "B",
+                                         "seq_align_beg": 1,
+                                         "seq_align_end": 553,
+                                         "pdbx_db_accession": "Q7WZI9",
+                                         "db_align_beg": 285,
+                                         "db_align_end": 837,
+                                         "pdbx_auth_seq_align_beg": "1",
+                                         "pdbx_auth_seq_align_end": "553",
+                                         "rcsb_entity_id": "2"
+                                     }
+                                 ]
+                             }
+                         }
+                     ]
+                 }
+             }
+         },
+
+         """
+         dbName = kwargs.get("dbName", "pdbx_core")
+         collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
+         resultKey = kwargs.get("resultKey", "selected_polymer_entities")
+
+         entryLimit = kwargs.get("entryLimit", None)
+         selectionQueryD = kwargs.get("entitySelectionQuery", {"entity_poly.rcsb_entity_polymer_type": "Protein"})
+         #
+         try:
+             with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
+                 mg = MongoDbUtil(client)
+                 if mg.collectionExists(dbName, collectionName):
+                     logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
+                     selectL = [
+                         "rcsb_polymer_entity_container_identifiers",
+                         "entity.rcsb_multiple_source_flag",
+                         "entity_poly.type",
+                         "entity_poly.rcsb_entity_polymer_type",
+                         "entity_poly.pdbx_seq_one_letter_code_can",
+                         "rcsb_entity_source_organism.ncbi_taxonomy_id",
+                         "rcsb_entity_source_organism.ncbi_scientific_name",
+                         "rcsb_entity_source_organism.beg_seq_num",
+                         "rcsb_entity_source_organism.end_seq_num",
+                         "struct_ref.id",
+                         "struct_ref.pdbx_db_accession",
+                         "struct_ref.db_name",
+                         "struct_ref.entity_id",
+                         "struct_ref.pdbx_seq_one_letter_code",
+                         "struct_ref.pdbx_align_begin",
+                         "struct_ref_seq",
+                         #
+                         "entity_src_nat.pdbx_ncbi_taxonomy_id",
+                         "entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id",
+                         "entity_src_gen.pdbx_host_org_ncbi_taxonomy_id",
+                         "pdbx_entity_src_syn.ncbi_taxonomy_id",
+                     ]
+                     iCount = 0
+                     for entryId in entryD:
+                         #
+                         if resultKey in entryD[entryId]:
+                             continue
+                         #
+                         qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
+                         qD.update(selectionQueryD)
+                         #
+                         dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
+                         logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
+                         eD = {}
+                         for ii, dD in enumerate(dL, 1):
+                             rD = {}
+                             logger.debug("%s (%4d) d is %r", entryId, ii, dD)
+                             if "entity" in dD:
+                                 rD["rcsb_multiple_source_flag"] = dD["entity"]["rcsb_multiple_source_flag"] if "rcsb_multiple_source_flag" in dD["entity"] else "N"
+                             #
+                             if "rcsb_polymer_entity_container_identifiers" in dD:
+                                 rD["asym_ids"] = dD["rcsb_polymer_entity_container_identifiers"]["asym_ids"] if "asym_ids" in dD["rcsb_polymer_entity_container_identifiers"] else []
+                                 rD["auth_asym_ids"] = dD["rcsb_polymer_entity_container_identifiers"]["auth_asym_ids"] if "auth_asym_ids" in dD["rcsb_polymer_entity_container_identifiers"] else []
+                                 rD["entity_id"] = dD["rcsb_polymer_entity_container_identifiers"]["entity_id"]
+                             #
+                             if "entity_poly" in dD:
+                                 rD["type"] = dD["entity_poly"]["type"] if "type" in dD["entity_poly"] else None
+                                 rD["rcsb_entity_polymer_type"] = dD["entity_poly"]["rcsb_entity_polymer_type"] if "rcsb_entity_polymer_type" in dD["entity_poly"] else None
+                                 rD["entity_polymer_length"] = len(dD["entity_poly"]["pdbx_seq_one_letter_code_can"]) if "pdbx_seq_one_letter_code_can" in dD["entity_poly"] else 0
+                             #
+                             tL = []
+                             if "rcsb_entity_source_organism" in dD:
+                                 for tD in dD["rcsb_entity_source_organism"]:
+                                     tL.append(tD)
+                                 rD["rcsb_entity_source_organism"] = copy.copy(tL)
+                             #
+                             qDL = []
+                             if "struct_ref" in dD:
+                                 for tD in dD["struct_ref"]:
+                                     if "db_name" in tD:
+                                         tD["db_name"] = str(tD["db_name"]).upper().strip()
+                                         tD["db_name"] = "UNP" if tD["db_name"] in ["TREMBL"] else tD["db_name"]
+                                     qDL.append(tD)
+                                 if "struct_ref_seq" in dD:
+                                     for qD in qDL:
+                                         refId = qD["id"]
+                                         alignL = []
+                                         for tD in dD["struct_ref_seq"]:
+                                             if refId == tD["ref_id"]:
+                                                 alignL.append(tD)
+                                         # qD['align_list'] = copy.copy(aL)
+                                         for align in alignL:
+                                             authAsymId = align["pdbx_strand_id"]
+                                             qD.setdefault("alignD", {}).setdefault(authAsymId, []).append(align)
+
+                             rD["struct_ref"] = qDL
+                             #
+                             taxIdL = []
+                             if "entity_src_nat" in dD:
+                                 for tD in dD["entity_src_nat"]:
+                                     if "pdbx_ncbi_taxonomy_id" in tD:
+                                         taxIdL.append(tD["pdbx_ncbi_taxonomy_id"])
+                             if "entity_src_gen" in dD:
+                                 for tD in dD["entity_src_gen"]:
+                                     if "pdbx_gene_src_ncbi_taxonomy_id" in tD:
+                                         taxIdL.append(tD["pdbx_gene_src_ncbi_taxonomy_id"])
+                                     if "pdbx_host_org_ncbi_taxonomy_id" in tD:
+                                         taxIdL.append(tD["pdbx_host_org_ncbi_taxonomy_id"])
+                             if "pdbx_entity_src_syn" in dD:
+                                 for tD in dD["pdbx_entity_src_syn"]:
+                                     if "ncbi_taxonomy_id" in tD:
+                                         taxIdL.append(tD["ncbi_taxonomy_id"])
+                             qL = []
+                             for taxId in taxIdL:
+                                 ttL = [int(t.strip()) for t in taxId.split(",") if t.strip().isdigit()]
+                                 qL.extend(ttL)
+                             logger.debug("TaxId list %r", qL)
+                             rD["original_taxonomy_ids"] = copy.copy(list(set(qL)))
+                             #
+                             if "entity_id" in rD:
+                                 eD[rD["entity_id"]] = copy.copy(rD)
+
+                         entryD[entryId][resultKey] = copy.copy(eD)
+
+                         iCount += 1
+                         if iCount % 1000 == 0:
+                             logger.info("Completed fetch %d/%d entries", iCount, len(entryD))
+                         if entryLimit and iCount >= entryLimit:
+                             logger.info("Quitting after %d", iCount)
+                             break
+
+         except Exception as e:
+             logger.exception("Failing with %s", str(e))
+
+         return entryD
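
The extractor above is driven entirely by keyword arguments (cache directory, cache format, selection queries, entry limit). The following minimal usage sketch is not part of the package: it assumes a ConfigUtil-style configuration object from rcsb.utils.config that defines the MONGO_DB resource, and the config file name, section name, and cache directory are placeholders. Because the module ships in the non-package examples-seq directory, EntityPolymerExtractor would need to be imported directly from that file rather than through a normal package path.

    from rcsb.utils.config.ConfigUtil import ConfigUtil

    # Placeholder configuration object; file and section names are illustrative only.
    cfgOb = ConfigUtil(configPath="./exdb-config-example.yml", defaultSectionName="site_info_configuration")

    # Build (or reload) the local cache of entry/entity polymer mapping data.
    epe = EntityPolymerExtractor(cfgOb, exdbDirPath="./CACHE", useCache=True, cacheKwargs={"fmt": "json"})
    print("Entries cached: %d" % epe.getEntryCount())

    # Unique UniProt accessions referenced by the selected polymer entities.
    unpAcL = epe.getRefSeqAccessions("UNP")
    print("UniProt accessions: %d" % len(unpAcL))

    # Distribution of reference sequence database types across struct_ref records.
    print(epe.countRefSeqAccessionDbType())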