rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,557 @@
1
+ ##
2
+ # File: EntityInstanceExtractor.py
3
+ # Date: 19-Feb-2019 jdw
4
+ #
5
+ # Selected utilities to extract data from entity instance collections.
6
+ #
7
+ # PRELIMINARY VERSION -
8
+ #
9
+ # Updates:
10
+ #
11
+ #
12
+ ##
13
+ __docformat__ = "google en"
14
+ __author__ = "John Westbrook"
15
+ __email__ = "jwest@rcsb.rutgers.edu"
16
+ __license__ = "Apache 2.0"
17
+
18
+
19
+ import copy
20
+ import logging
21
+ from itertools import chain, groupby, islice
22
+ from operator import itemgetter
23
+ from statistics import mean, stdev
24
+
25
+ import numpy as np
26
+ import requests
27
+ from rcsb.db.mongo.Connection import Connection
28
+ from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
29
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class EntityInstanceExtractor(object):
    """Selected utilities to extract data from entity instance collections.

    Notes:
        Contiguous runs of integers can be grouped with ``itertools.groupby``::

            seq2 = [1, 2, 4, 5, 6, 8, 9, 10]
            [list(map(itemgetter(1), g)) for _, g in groupby(enumerate(seq2), lambda t: t[0] - t[1])]
            # -> [[1, 2], [4, 5, 6], [8, 9, 10]]

        Contiguous regions of a boolean numpy condition array can be located with
        the private helper ``__contiguousRegions()``; ``__getSegments()`` sketches
        its use, e.g.::

            x = np.cumsum(np.random.random(1000) - 0.5)
            condition = np.abs(x) < 1
            for start, stop in contiguous_regions(condition):
                segment = x[start:stop]   # one region where |x| < 1

    PRELIMINARY VERSION.
    """
79
+
80
    def __init__(self, cfgOb):
        """
        Args:
            cfgOb: configuration object supplying connection details for the
                   "MONGO_DB" resource (passed through to Connection()).
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        #
        # Cache of reference database sequences keyed by accession (see __fetchUniprot()).
        self.__seqCache = {}
        self.__mU = MarshalUtil()
        #
87
+
88
+ def getEntryInfo(self, **kwargs):
89
+ """Return a dictionary of PDB entries satifying the input conditions (e.g. method, resolution limit)"""
90
+
91
+ resLimit = kwargs.get("resLimit", 3.5)
92
+ expMethod = kwargs.get("expMethod", "X-ray")
93
+ #
94
+ dbName = kwargs.get("dbName", "pdbx_core")
95
+ collectionName = kwargs.get("collectionName", "pdbx_core_entry")
96
+ #
97
+ entryD = {}
98
+ try:
99
+ with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
100
+ mg = MongoDbUtil(client)
101
+ if mg.collectionExists(dbName, collectionName):
102
+ logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
103
+ qD = {"rcsb_entry_info.experimental_method": expMethod, "refine.0.ls_d_res_high": {"$lte": resLimit}}
104
+ selectL = ["rcsb_entry_container_identifiers", "rcsb_entry_info", "refine"]
105
+ dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
106
+ logger.info("Selection %r fetch result count %d", selectL, len(dL))
107
+ #
108
+ for dV in dL:
109
+ if "rcsb_entry_container_identifiers" not in dV:
110
+ continue
111
+ entryId = dV["rcsb_entry_container_identifiers"]["entry_id"]
112
+ entryD[entryId] = {}
113
+ if "rcsb_entry_info" in dV and "polymer_composition" in dV["rcsb_entry_info"]:
114
+ entryD[entryId] = {
115
+ "polymer_composition": dV["rcsb_entry_info"]["polymer_composition"],
116
+ "experimental_method": dV["rcsb_entry_info"]["experimental_method"],
117
+ }
118
+ if "refine" in dV and dV["refine"] and "ls_d_res_high" in dV["refine"][0]:
119
+ entryD[entryId]["ls_d_res_high"] = dV["refine"][0]["ls_d_res_high"]
120
+ logger.debug("Got res %r", dV["refine"][0]["ls_d_res_high"])
121
+
122
+ except Exception as e:
123
+ logger.exception("Failing with %s", str(e))
124
+ return entryD
125
+ #
126
+
127
+ def getEntityIds(self, entryIdList):
128
+ """ """
129
+ dbName = "pdbx_core"
130
+ collectionName = "pdbx_core_polymer_entity"
131
+ docD = {}
132
+ try:
133
+ with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
134
+ mg = MongoDbUtil(client)
135
+ if mg.collectionExists(dbName, collectionName):
136
+ logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
137
+ for entryId in entryIdList:
138
+ qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
139
+ selectL = ["rcsb_polymer_entity_container_identifiers"]
140
+ tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
141
+ #
142
+ logger.debug("Selection %r fetch result count %d", selectL, len(tL))
143
+ docD[entryId] = [vv["rcsb_polymer_entity_container_identifiers"] for vv in tL]
144
+ logger.debug("docD is %r", docD)
145
+ except Exception as e:
146
+ logger.exception("Failing with %s", str(e))
147
+ return docD
148
+
149
    def getPolymerEntities(self, entryD, **kwargs):
        """Add 'selected_polymer_entities' satisfying the input conditions to the input entry dictionary.

        Args:
            entryD (dict): {entryId: {...}, ...} updated in place and returned.

        Keyword Args:
            dbName (str): database name (default "pdbx_core")
            collectionName (str): collection name (default "pdbx_core_polymer_entity")
            resultKey (str): key under which per-entity results are stored (default "selected_polymer_entities")
            savePath (str): checkpoint/export path (default "entry-data.pic")
            entryLimit (int|None): optional maximum number of entries to process
            saveKwargs (dict): options forwarded to MarshalUtil.doExport() (default {"fmt": "pickle"})

        Returns:
            dict: the input entryD with entryD[entryId][resultKey] = {entityId: {...}, ...}
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        savePath = kwargs.get("savePath", "entry-data.pic")
        entryLimit = kwargs.get("entryLimit", None)
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity_poly.type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        #
                        # Skip entries already populated (supports incremental restart).
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        # Restrict to single-source protein entities.
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id": entryId,
                            "entity_poly.rcsb_entity_polymer_type": "Protein",
                            "entity.rcsb_multiple_source_flag": "N",
                        }
                        #
                        dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                        eD = {}
                        for ii, dV in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                            if "rcsb_polymer_entity_container_identifiers" in dV and "asym_ids" in dV["rcsb_polymer_entity_container_identifiers"]:
                                rD["asym_ids"] = dV["rcsb_polymer_entity_container_identifiers"]["asym_ids"]
                                rD["entity_id"] = dV["rcsb_polymer_entity_container_identifiers"]["entity_id"]
                            if "entity_poly" in dV and "type" in dV["entity_poly"]:
                                rD["type"] = dV["entity_poly"]["type"]
                                rD["seq_one_letter_code_can"] = dV["entity_poly"]["pdbx_seq_one_letter_code_can"]

                            if "rcsb_entity_source_organism" in dV:
                                # Only the first source organism record is used.
                                rD["ncbi_taxonomy_id"] = dV["rcsb_entity_source_organism"][0]["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in dV["rcsb_entity_source_organism"][0] else None
                                rD["ncbi_scientific_name"] = (
                                    dV["rcsb_entity_source_organism"][0]["ncbi_scientific_name"] if "ncbi_scientific_name" in dV["rcsb_entity_source_organism"][0] else None
                                )

                            # Reference assignment is used only when unambiguous (exactly one struct_ref record).
                            if "struct_ref" in dV and len(dV["struct_ref"]) == 1:
                                rD["seq_one_letter_code_ref"] = dV["struct_ref"][0]["pdbx_seq_one_letter_code"] if "pdbx_seq_one_letter_code" in dV["struct_ref"][0] else None
                                rD["db_accession"] = dV["struct_ref"][0]["pdbx_db_accession"] if "pdbx_db_accession" in dV["struct_ref"][0] else None
                                rD["db_name"] = dV["struct_ref"][0]["db_name"] if "db_name" in dV["struct_ref"][0] else None
                                #
                                refDbName = rD["db_name"]
                                dbAccession = rD["db_accession"]
                                dbRefSeq = self.__seqCache[dbAccession] if dbAccession in self.__seqCache else None

                                # Fetch (and cache) the UniProt sequence on a cache miss.
                                if refDbName in ["UNP"] and not dbRefSeq:
                                    dbRefSeq = self.__fetchUniprot(dbAccession)
                                    self.__seqCache[dbAccession] = dbRefSeq
                                    logger.debug("Fetch uniprot %r", dbRefSeq)
                                rD["ref_db_seq"] = dbRefSeq
                            else:
                                rD["seq_one_letter_code_ref"] = rD["db_accession"] = rD["db_name"] = None
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 10 == 0:
                            logger.info("Completed polymer entities fetch %d/%d entries", iCount, len(entryD))
                        if iCount % 2000 == 0:
                            # Periodic checkpoint of accumulated results.
                            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                            logger.info("Saved polymer entity results (%d) status %r in %s", iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
            #
            # for entryId in entryD:
            #     logger.debug(">> %s docD %r" % (entryId, entryD[entryId]))
            #
            # NOTE(review): iCount is bound only when the collection exists; if it does not,
            # the final log below raises NameError, which is swallowed by the handler — confirm intended.
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info("Saved polymer entity results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
244
+
245
    def getEntityInstances(self, entryD, **kwargs):
        """Get the selected validation data for the instances in the input entry dictionary.

        For each entity in entryD[entryId]['selected_polymer_entities'], per-asym-id
        validation summaries are gathered and the result of analEntity() is stored under
        the 'anal_instances' key of that entity.

        Args:
            entryD (dict): {entryId: {'selected_polymer_entities': {entityId: {...}}, ...}, ...}
                updated in place and returned.

        Keyword Args:
            dbName (str): database name (default "pdbx_core")
            collectionName (str): collection name (default "pdbx_core_polymer_entity_instance")
            savePath (str): checkpoint/export path (default "entry-data.pic")
            saveKwargs (dict): options forwarded to MarshalUtil.doExport() (default {"fmt": "pickle"})
            entryLimit (int|None): optional maximum number of entities to process

        Returns:
            dict: the updated input entryD
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity_instance")
        savePath = kwargs.get("savePath", "entry-data.pic")
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        entryLimit = kwargs.get("entryLimit", None)
        #
        try:
            # optF gates optional (currently disabled) capture of raw validation payloads.
            optF = False
            iCount = 0
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s total document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    #
                    for entryId, dV in entryD.items():
                        for entityId, peD in dV["selected_polymer_entities"].items():
                            # if 'anal_instances' in peD:
                            #     continue
                            vD = {}
                            for asymId in peD["asym_ids"]:
                                qD = {
                                    "rcsb_polymer_entity_instance_container_identifiers.entry_id": entryId,
                                    "rcsb_polymer_entity_instance_container_identifiers.asym_id": asymId,
                                }
                                # qD = {'rcsb_entity_instance_container_validation_identifiers.entity_type': 'polymer'}
                                # selectL = ['pdbx_vrpt_instance_results', 'pdbx_unobs_or_zero_occ_residues']
                                selectL = ["pdbx_vrpt_instance_results"]
                                tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                                # NOTE(review): rebinding dV here shadows the outer entry document;
                                # harmless today because the outer dV is not read again this iteration.
                                dV = {}
                                if not tL:
                                    logger.info("No validation data for %s %s %s(%s)", dbName, collectionName, entryId, asymId)
                                    continue
                                #
                                logger.debug(">>> %s %s (%s) dict key length %d ", collectionName, entryId, asymId, len(tL[0]))

                                #
                                if optF:
                                    dV["pdbx_vrpt_instance_results"] = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                    dV["pdbx_unobs_or_zero_occ_residues"] = tL[0]["pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[0] else []
                                #
                                if optF:
                                    urdL = tL[0]["pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[0] else []
                                    oL = [{"label_seq_id": urd["label_seq_id"], "label_comp_id": urd["label_comp_id"]} for urd in urdL]
                                    dV["pdbx_unobs_or_zero_occ_residues"] = oL
                                #
                                # Sequence-level view (always attempted): modeled residues by label_seq_id.
                                try:
                                    irdL = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                    oL = [{"label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_seq"] = oL
                                except Exception as e:
                                    logger.error("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))

                                #
                                # Occupancy-weighted B-value view; OWAB may be absent, hence the softer debug log.
                                try:
                                    irdL = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                    oL = [{"OWAB": ird["OWAB"], "label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_occ"] = oL
                                except Exception as e:
                                    logger.debug("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))

                                vD[asymId] = copy.copy(dV)
                            #
                            analD = self.analEntity(entryId, peD, vD)
                            entryD[entryId]["selected_polymer_entities"][entityId]["anal_instances"] = copy.copy(analD)
                            iCount += 1
                            if iCount % 500 == 0:
                                logger.info("Completed %d/%d entries", iCount, len(entryD))
                            if iCount % 2000 == 0:
                                # Periodic checkpoint of accumulated results.
                                ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                                logger.info("Saved polymer entity instance results (%d) status %r in %s", iCount, ok, savePath)
                            if entryLimit and iCount >= entryLimit:
                                break
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info("Saved polymer instance results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
336
+
337
    def analEntity(self, entryId, entityD, vD, **kwargs):
        """Compute per-instance (asym_id) coverage, gap, and B-value (OWAB) region summaries for one polymer entity.

        Args:
            entryId (str): entry identifier.
            entityD (dict): selected polymer entity record with keys 'entity_id', 'asym_ids',
                'seq_one_letter_code_can' and optionally 'seq_one_letter_code_ref',
                'db_name', 'db_accession', 'ref_db_seq' (see getPolymerEntities()).
            vD (dict): {asymId: {'pdbx_vrpt_instance_results_seq': [...],
                'pdbx_vrpt_instance_results_occ': [...]}} as assembled by getEntityInstances().
            **kwargs: unused

        Returns:
            dict: {asymId: {'coverage_inst_refdb': float|None,
                            'coverage_inst_entity': float,
                            'gapD': {segmentIndex: gapLength},
                            'owabRegiond': {regionIndex: {'length': ..., 'occ_min': ..., 'occ_max': ...}}}}
        """
        _ = kwargs
        analD = {}
        try:
            entityId = entityD["entity_id"]
            asymIdL = entityD["asym_ids"]

            refSeq = entityD["seq_one_letter_code_ref"] if "seq_one_letter_code_ref" in entityD else None
            entitySeq = entityD["seq_one_letter_code_can"] if "seq_one_letter_code_can" in entityD else None
            # -------
            # Get UniProt
            #
            dbName = entityD["db_name"] if "db_name" in entityD else None
            dbAccession = entityD["db_accession"] if "db_accession" in entityD else None
            dbRefSeq = entityD["ref_db_seq"] if "ref_db_seq" in entityD else None
            # --
            if dbRefSeq:
                logger.debug("%s (%s) ref db %4d: %r", dbAccession, dbName, len(dbRefSeq), dbRefSeq)
            if refSeq:
                logger.debug("%s (%s) seq ref pdb %4d: %r", dbAccession, dbName, len(refSeq), refSeq)
            if entitySeq:
                logger.debug("%s (%s) entity sample %4d: %r", dbAccession, dbName, len(entitySeq), entitySeq)
            #
            lenRefDbSeq = len(dbRefSeq) if dbRefSeq else None
            lenEntitySeq = len(entitySeq)
            # sampleSeqCov = 1.0 - float(lenRefDbSeq - lenEntitySeq) / float(lenRefDbSeq) if lenRefDbSeq else None
            #

            # -
            for asymId in asymIdL:
                if asymId not in vD:
                    logger.error("Missing validation data for %s %s %s", entryId, entityId, asymId)
                    continue
                #
                # Distinct modeled residue positions for this instance.
                irDL = vD[asymId]["pdbx_vrpt_instance_results_seq"] if "pdbx_vrpt_instance_results_seq" in vD[asymId] else []
                # NOTE(review): list(set(...)) does not guarantee ascending order; the groupby
                # segmentation below assumes sorted positions — confirm or sort explicitly.
                lsL = list(set([dV["label_seq_id"] for dV in irDL]))
                lenInstanceSeq = len(lsL)

                # Coverage of the reference db sequence / of the entity sample sequence.
                instRefDbSeqCov = 1.0 - float(lenRefDbSeq - lenInstanceSeq) / float(lenRefDbSeq) if lenRefDbSeq else None
                instSampleSeqCov = 1.0 - float(lenEntitySeq - lenInstanceSeq) / float(lenEntitySeq)
                #
                occDL = vD[asymId]["pdbx_vrpt_instance_results_occ"] if "pdbx_vrpt_instance_results_occ" in vD[asymId] else []
                # Average OWAB per residue position, then locate contiguous high-B regions.
                owabRegD = {}
                if occDL:
                    owabD = {}
                    for dV in occDL:
                        owabD.setdefault(dV["label_seq_id"], []).append(dV["OWAB"])
                    #
                    # logger.info("owabD %r" % owabD)
                    meanOwabD = {k: mean(v) for k, v in owabD.items()}
                    meanOwab = mean(meanOwabD.values())
                    stdevOwab = stdev(meanOwabD.values())
                    #
                    logger.debug(">> Length of B values list %d mean %.3f stdev %.3f", len(meanOwabD), meanOwab, stdevOwab)
                    #
                    meanOwabA = np.array(list(meanOwabD.values()))
                    #
                    # NOTE(review): threshold is 2*mean (meanOwab + meanOwab); stdevOwab is
                    # computed but unused — possibly meanOwab + stdevOwab was intended. Confirm.
                    condition = meanOwabA > (meanOwab + meanOwab)
                    regL = self.__contiguousRegions(condition)
                    for ii, (start, stop) in enumerate(regL, 1):
                        segment = meanOwabA[start:stop]
                        logger.debug("B value range = start %d stop %d min %.3f max %.3f", start, stop, segment.min(), segment.max())
                        # NOTE(review): stop is exclusive, so segment length is stop - start;
                        # the recorded 'length' of stop - start + 1 looks off by one — confirm.
                        owabRegD[ii] = {"length": stop - start + 1, "occ_min": segment.min(), "occ_max": segment.max()}

                #
                #
                # if False:
                #     uDL = vD[asymId]['pdbx_unobs_or_zero_occ_residues'] if 'pdbx_unobs_or_zero_occ_residues' in vD[asymId] else []
                #     unobsL = [d['label_seq_id'] for d in uDL]
                #
                #     segL = []
                #     for k, g in groupby(enumerate(lsL), lambda x: x[0] - x[1]):
                #         logger.info(" Segment entryId %s entityId %s asymId %s: %r" % (entryId, entityId, asymId, list(map(itemgetter(1), g))))
                #
                #     for k, g in groupby(enumerate(lsL), lambda(i, x): i - x):
                #         logger.info(" entryId %s entityId %s asymId %s: %r" % (entryId, entityId, asymId, list(map(itemgetter(1), g)))

                # Partition modeled positions into runs of consecutive residue numbers.
                segL = [list(map(itemgetter(1), g)) for _, g in groupby(enumerate(lsL), lambda x: x[0] - x[1])]
                logger.debug("Modeled sequence length %d segments %d", len(lsL), len(segL))
                #
                # Gap lengths between consecutive modeled segments.
                gapD = {}
                for ii in range(1, len(segL)):
                    bG = segL[ii - 1][-1]
                    eG = segL[ii][0]
                    gapD[ii] = eG - bG - 1
                    logger.debug("Gap %d length %d", ii, gapD[ii])
                #
                #
                # NOTE(review): truthiness test — a legitimate 0.0 coverage also takes the else branch.
                if instRefDbSeqCov:
                    logger.debug(
                        "Summary %s %s %s refcov %.2f sampleCov %.2f - gaps (%d) %r owabs seqments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        instRefDbSeqCov,
                        instSampleSeqCov,
                        len(gapD),
                        list(gapD.values()),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                else:
                    logger.debug(
                        "Summary %s %s %s sampleCov %.2f - gaps (%d) %r owabs seqments (%d) %r",
                        entryId,
                        entityId,
                        asymId,
                        instSampleSeqCov,
                        len(gapD),
                        list(gapD),
                        len(owabRegD),
                        list(owabRegD.values()),
                    )
                #
                analD[asymId] = {"coverage_inst_refdb": instRefDbSeqCov, "coverage_inst_entity": instSampleSeqCov, "gapD": copy.copy(gapD), "owabRegiond": copy.copy(owabRegD)}
            logger.debug("entry %s entity %s analD %r", entryId, entityId, analD)
        except Exception as e:
            logger.exception("%s failing with %s", entryId, str(e))
        #
        return analD
484
+
485
+ def __getSegments(self, values):
486
+ xV = np.asarray(values)
487
+ # Generate some random data
488
+ # x = np.cumsum(np.random.random(1000) - 0.5)
489
+ #
490
+ condition = np.abs(xV) < 1
491
+
492
+ # Print the start and stop indicies of each region where the absolute
493
+ # values of x are below 1, and the min and max of each of these regions
494
+ for start, stop in self.__contiguousRegions(condition):
495
+ segment = xV[start:stop]
496
+ print(start, stop)
497
+ print(segment.min(), segment.max())
498
+
499
+ def __contiguousRegions(self, condition):
500
+ """Finds contiguous True regions of the boolean array "condition.
501
+
502
+ Returns a 2D array where the first column is the start index of the region and the
503
+ second column is the end index.
504
+
505
+ """
506
+
507
+ # Find the indicies of changes in "condition"
508
+ dV = np.diff(condition)
509
+ (idx,) = dV.nonzero()
510
+
511
+ # We need to start things after the change in "condition". Therefore,
512
+ # we'll shift the index by 1 to the right.
513
+ idx += 1
514
+
515
+ if condition[0]:
516
+ # If the start of condition is True prepend a 0
517
+ idx = np.r_[0, idx]
518
+
519
+ if condition[-1]:
520
+ # If the end of condition is True, append the length of the array
521
+ idx = np.r_[idx, condition.size] # Edit
522
+
523
+ # Reshape the result into two columns
524
+ idx.shape = (-1, 2)
525
+ return idx
526
+
527
+ def __window(self, seq, num=2):
528
+ """Returns a sliding window (of width n) over data from the iterable
529
+ s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...
530
+ """
531
+ it = iter(seq)
532
+ result = tuple(islice(it, num))
533
+ if len(result) == num:
534
+ yield result
535
+ for elem in it:
536
+ result = result[1:] + (elem,)
537
+ yield result
538
+
539
+ def missingElements(self, lV):
540
+ missing = chain.from_iterable(range(x + 1, y) for x, y in self.__window(lV) if (y - x) > 1)
541
+ return list(missing)
542
+
543
+ def __fetchUniprot(self, uniProtId):
544
+ baseUrl = "http://www.uniprot.org"
545
+ wsEndPoint = "/uniprot/"
546
+ fS = ""
547
+ try:
548
+ fullUrl = baseUrl + wsEndPoint + uniProtId + ".fasta"
549
+ result = requests.get(fullUrl)
550
+ if result.ok:
551
+ fL = result.text.split("\n")
552
+ fS = "".join(fL[1:])
553
+ else:
554
+ logger.error("UniProt Fasta request for %s returns status %r", uniProtId, result.status_code)
555
+ except Exception as e:
556
+ logger.error("Failing request for %s with %s", uniProtId, str(e))
557
+ return fS