rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,557 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: EntityInstanceExtractor.py
|
|
3
|
+
# Date: 19-Feb-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Selected utilities to extract data from entity instance collections.
|
|
6
|
+
#
|
|
7
|
+
# PRELIMINARY VERSION -
|
|
8
|
+
#
|
|
9
|
+
# Updates:
|
|
10
|
+
#
|
|
11
|
+
#
|
|
12
|
+
##
|
|
13
|
+
__docformat__ = "google en"
|
|
14
|
+
__author__ = "John Westbrook"
|
|
15
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
+
__license__ = "Apache 2.0"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
import copy
|
|
20
|
+
import logging
|
|
21
|
+
from itertools import chain, groupby, islice
|
|
22
|
+
from operator import itemgetter
|
|
23
|
+
from statistics import mean, stdev
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
import requests
|
|
27
|
+
from rcsb.db.mongo.Connection import Connection
|
|
28
|
+
from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
|
|
29
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class EntityInstanceExtractor(object):
    """Selected utilities to extract data from entity instance collections.

    Implementation notes.

    Grouping a sorted integer sequence into contiguous runs (used for the
    modeled-segment analysis in analEntity())::

        >>> from itertools import groupby
        >>> from operator import itemgetter
        >>> seq2 = [1, 2, 4, 5, 6, 8, 9, 10]
        >>> [list(map(itemgetter(1), g)) for _, g in groupby(enumerate(seq2), lambda t: t[0] - t[1])]
        [[1, 2], [4, 5, 6], [8, 9, 10]]

    Locating contiguous regions of a boolean numpy condition array
    (see __contiguousRegions())::

        x = np.cumsum(np.random.random(1000) - 0.5)
        condition = np.abs(x) < 1
        # Start/stop indices of each region where |x| < 1, with the
        # min and max of each region (stop is exclusive):
        for start, stop in contiguous_regions(condition):
            segment = x[start:stop]
            print(start, stop)
            print(segment.min(), segment.max())
    """

    def __init__(self, cfgOb):
        """
        Args:
            cfgOb: configuration object supplying MongoDb connection details
                   (consumed by rcsb.db.mongo.Connection).
        """
        self.__cfgOb = cfgOb
        # Named connection resource used when opening Connection() contexts
        self.__resourceName = "MONGO_DB"
        #
        # Cache of reference database sequences keyed by accession code,
        # filled lazily by __fetchUniprot()
        self.__seqCache = {}
        self.__mU = MarshalUtil()
        #
|
|
87
|
+
|
|
88
|
+
    def getEntryInfo(self, **kwargs):
        """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit).

        Keyword Args:
            resLimit (float): upper bound applied to 'refine.0.ls_d_res_high' (default 3.5)
            expMethod (str): value matched against 'rcsb_entry_info.experimental_method' (default "X-ray")
            dbName (str): source database name (default "pdbx_core")
            collectionName (str): source collection name (default "pdbx_core_entry")

        Returns:
            dict: {entryId: {'polymer_composition': ..., 'experimental_method': ..., 'ls_d_res_high': ...}, ...}
                  (empty dict on failure; values may be partial if keys are absent in a document)
        """

        resLimit = kwargs.get("resLimit", 3.5)
        expMethod = kwargs.get("expMethod", "X-ray")
        #
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    # Filter by experimental method and high-resolution limit in a single query
                    qD = {"rcsb_entry_info.experimental_method": expMethod, "refine.0.ls_d_res_high": {"$lte": resLimit}}
                    selectL = ["rcsb_entry_container_identifiers", "rcsb_entry_info", "refine"]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for dV in dL:
                        # Documents lacking the identifier block cannot be keyed and are skipped
                        if "rcsb_entry_container_identifiers" not in dV:
                            continue
                        entryId = dV["rcsb_entry_container_identifiers"]["entry_id"]
                        entryD[entryId] = {}
                        if "rcsb_entry_info" in dV and "polymer_composition" in dV["rcsb_entry_info"]:
                            # NOTE(review): 'experimental_method' is read without a key check here —
                            # presumably guaranteed by the query; confirm.
                            entryD[entryId] = {
                                "polymer_composition": dV["rcsb_entry_info"]["polymer_composition"],
                                "experimental_method": dV["rcsb_entry_info"]["experimental_method"],
                            }
                        if "refine" in dV and dV["refine"] and "ls_d_res_high" in dV["refine"][0]:
                            entryD[entryId]["ls_d_res_high"] = dV["refine"][0]["ls_d_res_high"]
                            logger.debug("Got res %r", dV["refine"][0]["ls_d_res_high"])

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
|
|
125
|
+
#
|
|
126
|
+
|
|
127
|
+
    def getEntityIds(self, entryIdList):
        """Return the polymer entity container identifiers for each entry in the input list.

        Args:
            entryIdList (list): entry identifiers to query

        Returns:
            dict: {entryId: [<rcsb_polymer_entity_container_identifiers dict>, ...], ...}
                  (empty dict on failure)
        """
        dbName = "pdbx_core"
        collectionName = "pdbx_core_polymer_entity"
        docD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    # One fetch per entry id; each fetch may return multiple entity documents
                    for entryId in entryIdList:
                        qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                        selectL = ["rcsb_polymer_entity_container_identifiers"]
                        tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        #
                        logger.debug("Selection %r fetch result count %d", selectL, len(tL))
                        docD[entryId] = [vv["rcsb_polymer_entity_container_identifiers"] for vv in tL]
            logger.debug("docD is %r", docD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return docD
|
|
148
|
+
|
|
149
|
+
    def getPolymerEntities(self, entryD, **kwargs):
        """Add 'selected_polymer_entities' satisfying the input conditions and add this to the input entry dictionary.

        Selects single-source protein entities for each entry, capturing entity
        identifiers, canonical/reference sequences, source organism details and —
        for UniProt references — the fetched reference database sequence.
        Results are periodically checkpointed to *savePath*.

        Args:
            entryD (dict): {entryId: {...}, ...} as produced by getEntryInfo(); updated in place

        Keyword Args:
            dbName (str): source database name (default "pdbx_core")
            collectionName (str): source collection (default "pdbx_core_polymer_entity")
            resultKey (str): key under which results are stored per entry (default "selected_polymer_entities")
            savePath (str): checkpoint/export file path (default "entry-data.pic")
            entryLimit (int|None): stop after this many entries (default None = no limit)
            saveKwargs (dict): format options for MarshalUtil export (default {"fmt": "pickle"})

        Returns:
            dict: the input entryD with entryD[entryId][resultKey] = {entityId: {...}, ...}
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        savePath = kwargs.get("savePath", "entry-data.pic")
        entryLimit = kwargs.get("entryLimit", None)
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity_poly.type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        #
                        # Skip entries already processed (supports restart from a checkpoint)
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        # Restrict to protein entities from a single source organism
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id": entryId,
                            "entity_poly.rcsb_entity_polymer_type": "Protein",
                            "entity.rcsb_multiple_source_flag": "N",
                        }
                        #
                        dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                        eD = {}
                        for ii, dV in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                            if "rcsb_polymer_entity_container_identifiers" in dV and "asym_ids" in dV["rcsb_polymer_entity_container_identifiers"]:
                                rD["asym_ids"] = dV["rcsb_polymer_entity_container_identifiers"]["asym_ids"]
                                rD["entity_id"] = dV["rcsb_polymer_entity_container_identifiers"]["entity_id"]
                            if "entity_poly" in dV and "type" in dV["entity_poly"]:
                                rD["type"] = dV["entity_poly"]["type"]
                                # NOTE(review): 'pdbx_seq_one_letter_code_can' is read without a key
                                # check — presumably always present when 'type' is; confirm.
                                rD["seq_one_letter_code_can"] = dV["entity_poly"]["pdbx_seq_one_letter_code_can"]

                            if "rcsb_entity_source_organism" in dV:
                                # Only the first source organism record is used (single-source entities selected above)
                                rD["ncbi_taxonomy_id"] = dV["rcsb_entity_source_organism"][0]["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in dV["rcsb_entity_source_organism"][0] else None
                                rD["ncbi_scientific_name"] = (
                                    dV["rcsb_entity_source_organism"][0]["ncbi_scientific_name"] if "ncbi_scientific_name" in dV["rcsb_entity_source_organism"][0] else None
                                )

                            # Only entities with exactly one reference sequence record are used
                            if "struct_ref" in dV and len(dV["struct_ref"]) == 1:
                                rD["seq_one_letter_code_ref"] = dV["struct_ref"][0]["pdbx_seq_one_letter_code"] if "pdbx_seq_one_letter_code" in dV["struct_ref"][0] else None
                                rD["db_accession"] = dV["struct_ref"][0]["pdbx_db_accession"] if "pdbx_db_accession" in dV["struct_ref"][0] else None
                                rD["db_name"] = dV["struct_ref"][0]["db_name"] if "db_name" in dV["struct_ref"][0] else None
                                #
                                refDbName = rD["db_name"]
                                dbAccession = rD["db_accession"]
                                # Consult the in-memory sequence cache before any remote fetch
                                dbRefSeq = self.__seqCache[dbAccession] if dbAccession in self.__seqCache else None

                                if refDbName in ["UNP"] and not dbRefSeq:
                                    dbRefSeq = self.__fetchUniprot(dbAccession)
                                    self.__seqCache[dbAccession] = dbRefSeq
                                    logger.debug("Fetch uniprot %r", dbRefSeq)
                                rD["ref_db_seq"] = dbRefSeq
                            else:
                                rD["seq_one_letter_code_ref"] = rD["db_accession"] = rD["db_name"] = None
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 10 == 0:
                            logger.info("Completed polymer entities fetch %d/%d entries", iCount, len(entryD))
                        if iCount % 2000 == 0:
                            # Periodic checkpoint so long runs can be resumed
                            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                            logger.info("Saved polymer entity results (%d) status %r in %s", iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
                    #
                    # Final export of the accumulated results
                    ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                    logger.info("Saved polymer entity results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
|
|
244
|
+
|
|
245
|
+
    def getEntityInstances(self, entryD, **kwargs):
        """Get the selected validation data for the instances in the input entry dictionary.

        entryD[entryId]['selected_polymer_entities'][entityId]['validation'] = {}

        Add keys: 'pdbx_vrpt_instance_results' and 'pdbx_unobs_or_zero_occ_residues' to the validation dictionary above.

        Per-instance validation data are fetched, condensed, passed to analEntity(),
        and the analysis stored under each entity's 'anal_instances' key.
        Results are periodically checkpointed to *savePath*.

        Args:
            entryD (dict): {entryId: {'selected_polymer_entities': {...}, ...}, ...}
                           as produced by getPolymerEntities(); updated in place

        Keyword Args:
            dbName (str): source database name (default "pdbx_core")
            collectionName (str): source collection (default "pdbx_core_polymer_entity_instance")
            savePath (str): checkpoint/export file path (default "entry-data.pic")
            saveKwargs (dict): format options for MarshalUtil export (default {"fmt": "pickle"})
            entryLimit (int|None): stop after this many entries (default None = no limit)

        Returns:
            dict: the updated entryD
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity_instance")
        savePath = kwargs.get("savePath", "entry-data.pic")
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        entryLimit = kwargs.get("entryLimit", None)
        #
        try:
            # optF gates two disabled condensation paths retained for reference
            optF = False
            iCount = 0
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s total document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    #
                    for entryId, dV in entryD.items():
                        for entityId, peD in dV["selected_polymer_entities"].items():
                            # if 'anal_instances' in peD:
                            #    continue
                            vD = {}
                            for asymId in peD["asym_ids"]:
                                qD = {
                                    "rcsb_polymer_entity_instance_container_identifiers.entry_id": entryId,
                                    "rcsb_polymer_entity_instance_container_identifiers.asym_id": asymId,
                                }
                                # qD = {'rcsb_entity_instance_container_validation_identifiers.entity_type': 'polymer'}
                                # selectL = ['pdbx_vrpt_instance_results', 'pdbx_unobs_or_zero_occ_residues']
                                selectL = ["pdbx_vrpt_instance_results"]
                                tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                                # NOTE(review): this rebinding shadows the outer loop variable dV
                                # (the entry document); harmless here since entryD is addressed by
                                # key afterwards, but worth renaming.
                                dV = {}
                                if not tL:
                                    logger.info("No validation data for %s %s %s(%s)", dbName, collectionName, entryId, asymId)
                                    continue
                                #
                                logger.debug(">>> %s %s (%s) dict key length %d ", collectionName, entryId, asymId, len(tL[0]))

                                #
                                if optF:
                                    dV["pdbx_vrpt_instance_results"] = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                    dV["pdbx_unobs_or_zero_occ_residues"] = tL[0]["pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[0] else []
                                #
                                if optF:
                                    urdL = tL[0]["pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[0] else []
                                    oL = [{"label_seq_id": urd["label_seq_id"], "label_comp_id": urd["label_comp_id"]} for urd in urdL]
                                    dV["pdbx_unobs_or_zero_occ_residues"] = oL
                                #
                                # Condense instance results to the modeled sequence positions
                                try:
                                    irdL = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                    oL = [{"label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_seq"] = oL
                                except Exception as e:
                                    logger.error("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))

                                #
                                # Condense instance results to per-residue OWAB values
                                # (OWAB may be absent, hence the separate — debug-level — handler)
                                try:
                                    irdL = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                    oL = [{"OWAB": ird["OWAB"], "label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL]
                                    dV["pdbx_vrpt_instance_results_occ"] = oL
                                except Exception as e:
                                    logger.debug("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))

                                vD[asymId] = copy.copy(dV)
                            #
                            analD = self.analEntity(entryId, peD, vD)
                            entryD[entryId]["selected_polymer_entities"][entityId]["anal_instances"] = copy.copy(analD)
                        iCount += 1
                        if iCount % 500 == 0:
                            logger.info("Completed %d/%d entries", iCount, len(entryD))
                        if iCount % 2000 == 0:
                            # Periodic checkpoint so long runs can be resumed
                            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                            logger.info("Saved polymer entity instance results (%d) status %r in %s", iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            break
                    ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                    logger.info("Saved polymer instance results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
|
|
336
|
+
|
|
337
|
+
def analEntity(self, entryId, entityD, vD, **kwargs):
|
|
338
|
+
"""
|
|
339
|
+
|
|
340
|
+
{'polymer_composition': 'protein/NA', 'experimental_method': 'X-ray',
|
|
341
|
+
'selected_polymer_entities': {'1': {'asym_ids': ['D', 'C', 'E', 'A', 'B', 'F'],
|
|
342
|
+
'entity_id': '1', 'type': 'polypeptide(L)',
|
|
343
|
+
'seq_one_letter_code_can': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
|
|
344
|
+
'ncbi_taxonomy_id': 511693,
|
|
345
|
+
'ncbi_scientific_name': 'Escherichia coli BL21',
|
|
346
|
+
'seq_one_letter_code_ref': 'MAKGQSLQDPFLNALRRERVPVSIYLVNGIKLQGQIESFDQFVILLKNTVSQMVYKHAISTVVPS',
|
|
347
|
+
'db_accession': 'C5W5L7',
|
|
348
|
+
'db_name': 'UNP',
|
|
349
|
+
'validation': {'D': {'pdbx_vrpt_instance_results': [{'OWAB': 29.45, 'label_seq_id': 5, 'label_comp_id': 'GLN'},
|
|
350
|
+
{'OWAB': 26.12, 'label_seq_id': 6, 'label_comp_id': 'SER'},
|
|
351
|
+
{'OWAB': 22.72, 'label_seq_id': 7, 'label_comp_id': 'LEU'},
|
|
352
|
+
{'OWAB': 14.56, 'label_seq_id': 8, 'label_comp_id': 'GLN'},
|
|
353
|
+
{'OWAB': 19.18, 'label_seq_id': 9, 'label_comp_id': 'ASP'},
|
|
354
|
+
{'OWAB': 16.56, 'label_seq_id': 10, 'label_comp_id': 'PRO'},
|
|
355
|
+
{'OWAB': 14.78, 'label_seq_id': 11, 'label_comp_id': 'PHE'},
|
|
356
|
+
{'OWAB': 11.2, 'label_seq_id': 12, 'label_comp_id': 'LEU'}, }}...]
|
|
357
|
+
|
|
358
|
+
'pdbx_unobs_or_zero_occ_residues': [{'label_seq_id': 1, 'label_comp_id': 'MET'},
|
|
359
|
+
{'label_seq_id': 2, 'label_comp_id': 'ALA'},
|
|
360
|
+
{'label_seq_id': 3, 'label_comp_id': 'LYS'},
|
|
361
|
+
{'label_seq_id': 4, 'label_comp_id': 'GLY'}]}
|
|
362
|
+
|
|
363
|
+
"""
|
|
364
|
+
_ = kwargs
|
|
365
|
+
analD = {}
|
|
366
|
+
try:
|
|
367
|
+
entityId = entityD["entity_id"]
|
|
368
|
+
asymIdL = entityD["asym_ids"]
|
|
369
|
+
|
|
370
|
+
refSeq = entityD["seq_one_letter_code_ref"] if "seq_one_letter_code_ref" in entityD else None
|
|
371
|
+
entitySeq = entityD["seq_one_letter_code_can"] if "seq_one_letter_code_can" in entityD else None
|
|
372
|
+
# -------
|
|
373
|
+
# Get UniProt
|
|
374
|
+
#
|
|
375
|
+
dbName = entityD["db_name"] if "db_name" in entityD else None
|
|
376
|
+
dbAccession = entityD["db_accession"] if "db_accession" in entityD else None
|
|
377
|
+
dbRefSeq = entityD["ref_db_seq"] if "ref_db_seq" in entityD else None
|
|
378
|
+
# --
|
|
379
|
+
if dbRefSeq:
|
|
380
|
+
logger.debug("%s (%s) ref db %4d: %r", dbAccession, dbName, len(dbRefSeq), dbRefSeq)
|
|
381
|
+
if refSeq:
|
|
382
|
+
logger.debug("%s (%s) seq ref pdb %4d: %r", dbAccession, dbName, len(refSeq), refSeq)
|
|
383
|
+
if entitySeq:
|
|
384
|
+
logger.debug("%s (%s) entity sample %4d: %r", dbAccession, dbName, len(entitySeq), entitySeq)
|
|
385
|
+
#
|
|
386
|
+
lenRefDbSeq = len(dbRefSeq) if dbRefSeq else None
|
|
387
|
+
lenEntitySeq = len(entitySeq)
|
|
388
|
+
# sampleSeqCov = 1.0 - float(lenRefDbSeq - lenEntitySeq) / float(lenRefDbSeq) if lenRefDbSeq else None
|
|
389
|
+
#
|
|
390
|
+
|
|
391
|
+
# -
|
|
392
|
+
for asymId in asymIdL:
|
|
393
|
+
if asymId not in vD:
|
|
394
|
+
logger.error("Missing validation data for %s %s %s", entryId, entityId, asymId)
|
|
395
|
+
continue
|
|
396
|
+
#
|
|
397
|
+
irDL = vD[asymId]["pdbx_vrpt_instance_results_seq"] if "pdbx_vrpt_instance_results_seq" in vD[asymId] else []
|
|
398
|
+
lsL = list(set([dV["label_seq_id"] for dV in irDL]))
|
|
399
|
+
lenInstanceSeq = len(lsL)
|
|
400
|
+
|
|
401
|
+
instRefDbSeqCov = 1.0 - float(lenRefDbSeq - lenInstanceSeq) / float(lenRefDbSeq) if lenRefDbSeq else None
|
|
402
|
+
instSampleSeqCov = 1.0 - float(lenEntitySeq - lenInstanceSeq) / float(lenEntitySeq)
|
|
403
|
+
#
|
|
404
|
+
occDL = vD[asymId]["pdbx_vrpt_instance_results_occ"] if "pdbx_vrpt_instance_results_occ" in vD[asymId] else []
|
|
405
|
+
# average the
|
|
406
|
+
owabRegD = {}
|
|
407
|
+
if occDL:
|
|
408
|
+
owabD = {}
|
|
409
|
+
for dV in occDL:
|
|
410
|
+
owabD.setdefault(dV["label_seq_id"], []).append(dV["OWAB"])
|
|
411
|
+
#
|
|
412
|
+
# logger.info("owabD %r" % owabD)
|
|
413
|
+
meanOwabD = {k: mean(v) for k, v in owabD.items()}
|
|
414
|
+
meanOwab = mean(meanOwabD.values())
|
|
415
|
+
stdevOwab = stdev(meanOwabD.values())
|
|
416
|
+
#
|
|
417
|
+
logger.debug(">> Length of B values list %d mean %.3f stdev %.3f", len(meanOwabD), meanOwab, stdevOwab)
|
|
418
|
+
#
|
|
419
|
+
meanOwabA = np.array(list(meanOwabD.values()))
|
|
420
|
+
#
|
|
421
|
+
condition = meanOwabA > (meanOwab + meanOwab)
|
|
422
|
+
regL = self.__contiguousRegions(condition)
|
|
423
|
+
for ii, (start, stop) in enumerate(regL, 1):
|
|
424
|
+
segment = meanOwabA[start:stop]
|
|
425
|
+
logger.debug("B value range = start %d stop %d min %.3f max %.3f", start, stop, segment.min(), segment.max())
|
|
426
|
+
owabRegD[ii] = {"length": stop - start + 1, "occ_min": segment.min(), "occ_max": segment.max()}
|
|
427
|
+
|
|
428
|
+
#
|
|
429
|
+
#
|
|
430
|
+
# if False:
|
|
431
|
+
# uDL = vD[asymId]['pdbx_unobs_or_zero_occ_residues'] if 'pdbx_unobs_or_zero_occ_residues' in vD[asymId] else []
|
|
432
|
+
# unobsL = [d['label_seq_id'] for d in uDL]
|
|
433
|
+
#
|
|
434
|
+
# segL = []
|
|
435
|
+
# for k, g in groupby(enumerate(lsL), lambda x: x[0] - x[1]):
|
|
436
|
+
# logger.info(" Segment entryId %s entityId %s asymId %s: %r" % (entryId, entityId, asymId, list(map(itemgetter(1), g))))
|
|
437
|
+
#
|
|
438
|
+
# for k, g in groupby(enumerate(lsL), lambda(i, x): i - x):
|
|
439
|
+
# logger.info(" entryId %s entityId %s asymId %s: %r" % (entryId, entityId, asymId, list(map(itemgetter(1), g)))
|
|
440
|
+
|
|
441
|
+
segL = [list(map(itemgetter(1), g)) for _, g in groupby(enumerate(lsL), lambda x: x[0] - x[1])]
|
|
442
|
+
logger.debug("Modeled sequence length %d segments %d", len(lsL), len(segL))
|
|
443
|
+
#
|
|
444
|
+
gapD = {}
|
|
445
|
+
for ii in range(1, len(segL)):
|
|
446
|
+
bG = segL[ii - 1][-1]
|
|
447
|
+
eG = segL[ii][0]
|
|
448
|
+
gapD[ii] = eG - bG - 1
|
|
449
|
+
logger.debug("Gap %d length %d", ii, gapD[ii])
|
|
450
|
+
#
|
|
451
|
+
#
|
|
452
|
+
if instRefDbSeqCov:
|
|
453
|
+
logger.debug(
|
|
454
|
+
"Summary %s %s %s refcov %.2f sampleCov %.2f - gaps (%d) %r owabs seqments (%d) %r",
|
|
455
|
+
entryId,
|
|
456
|
+
entityId,
|
|
457
|
+
asymId,
|
|
458
|
+
instRefDbSeqCov,
|
|
459
|
+
instSampleSeqCov,
|
|
460
|
+
len(gapD),
|
|
461
|
+
list(gapD.values()),
|
|
462
|
+
len(owabRegD),
|
|
463
|
+
list(owabRegD.values()),
|
|
464
|
+
)
|
|
465
|
+
else:
|
|
466
|
+
logger.debug(
|
|
467
|
+
"Summary %s %s %s sampleCov %.2f - gaps (%d) %r owabs seqments (%d) %r",
|
|
468
|
+
entryId,
|
|
469
|
+
entityId,
|
|
470
|
+
asymId,
|
|
471
|
+
instSampleSeqCov,
|
|
472
|
+
len(gapD),
|
|
473
|
+
list(gapD),
|
|
474
|
+
len(owabRegD),
|
|
475
|
+
list(owabRegD.values()),
|
|
476
|
+
)
|
|
477
|
+
#
|
|
478
|
+
analD[asymId] = {"coverage_inst_refdb": instRefDbSeqCov, "coverage_inst_entity": instSampleSeqCov, "gapD": copy.copy(gapD), "owabRegiond": copy.copy(owabRegD)}
|
|
479
|
+
logger.debug("entry %s entity %s analD %r", entryId, entityId, analD)
|
|
480
|
+
except Exception as e:
|
|
481
|
+
logger.exception("%s failing with %s", entryId, str(e))
|
|
482
|
+
#
|
|
483
|
+
return analD
|
|
484
|
+
|
|
485
|
+
def __getSegments(self, values):
|
|
486
|
+
xV = np.asarray(values)
|
|
487
|
+
# Generate some random data
|
|
488
|
+
# x = np.cumsum(np.random.random(1000) - 0.5)
|
|
489
|
+
#
|
|
490
|
+
condition = np.abs(xV) < 1
|
|
491
|
+
|
|
492
|
+
# Print the start and stop indicies of each region where the absolute
|
|
493
|
+
# values of x are below 1, and the min and max of each of these regions
|
|
494
|
+
for start, stop in self.__contiguousRegions(condition):
|
|
495
|
+
segment = xV[start:stop]
|
|
496
|
+
print(start, stop)
|
|
497
|
+
print(segment.min(), segment.max())
|
|
498
|
+
|
|
499
|
+
def __contiguousRegions(self, condition):
|
|
500
|
+
"""Finds contiguous True regions of the boolean array "condition.
|
|
501
|
+
|
|
502
|
+
Returns a 2D array where the first column is the start index of the region and the
|
|
503
|
+
second column is the end index.
|
|
504
|
+
|
|
505
|
+
"""
|
|
506
|
+
|
|
507
|
+
# Find the indicies of changes in "condition"
|
|
508
|
+
dV = np.diff(condition)
|
|
509
|
+
(idx,) = dV.nonzero()
|
|
510
|
+
|
|
511
|
+
# We need to start things after the change in "condition". Therefore,
|
|
512
|
+
# we'll shift the index by 1 to the right.
|
|
513
|
+
idx += 1
|
|
514
|
+
|
|
515
|
+
if condition[0]:
|
|
516
|
+
# If the start of condition is True prepend a 0
|
|
517
|
+
idx = np.r_[0, idx]
|
|
518
|
+
|
|
519
|
+
if condition[-1]:
|
|
520
|
+
# If the end of condition is True, append the length of the array
|
|
521
|
+
idx = np.r_[idx, condition.size] # Edit
|
|
522
|
+
|
|
523
|
+
# Reshape the result into two columns
|
|
524
|
+
idx.shape = (-1, 2)
|
|
525
|
+
return idx
|
|
526
|
+
|
|
527
|
+
def __window(self, seq, num=2):
|
|
528
|
+
"""Returns a sliding window (of width n) over data from the iterable
|
|
529
|
+
s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...
|
|
530
|
+
"""
|
|
531
|
+
it = iter(seq)
|
|
532
|
+
result = tuple(islice(it, num))
|
|
533
|
+
if len(result) == num:
|
|
534
|
+
yield result
|
|
535
|
+
for elem in it:
|
|
536
|
+
result = result[1:] + (elem,)
|
|
537
|
+
yield result
|
|
538
|
+
|
|
539
|
+
def missingElements(self, lV):
|
|
540
|
+
missing = chain.from_iterable(range(x + 1, y) for x, y in self.__window(lV) if (y - x) > 1)
|
|
541
|
+
return list(missing)
|
|
542
|
+
|
|
543
|
+
def __fetchUniprot(self, uniProtId):
|
|
544
|
+
baseUrl = "http://www.uniprot.org"
|
|
545
|
+
wsEndPoint = "/uniprot/"
|
|
546
|
+
fS = ""
|
|
547
|
+
try:
|
|
548
|
+
fullUrl = baseUrl + wsEndPoint + uniProtId + ".fasta"
|
|
549
|
+
result = requests.get(fullUrl)
|
|
550
|
+
if result.ok:
|
|
551
|
+
fL = result.text.split("\n")
|
|
552
|
+
fS = "".join(fL[1:])
|
|
553
|
+
else:
|
|
554
|
+
logger.error("UniProt Fasta request for %s returns status %r", uniProtId, result.status_code)
|
|
555
|
+
except Exception as e:
|
|
556
|
+
logger.error("Failing request for %s with %s", uniProtId, str(e))
|
|
557
|
+
return fS
|