rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,544 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: EntityPolymerExtractor.py
|
|
3
|
+
# Date: 19-Feb-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Selected utilities to extract entity polymer mapping and feature data
|
|
6
|
+
# from the exchange database schema.
|
|
7
|
+
#
|
|
8
|
+
# Updates:
|
|
9
|
+
#
|
|
10
|
+
#
|
|
11
|
+
##
|
|
12
|
+
__docformat__ = "google en"
|
|
13
|
+
__author__ = "John Westbrook"
|
|
14
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
15
|
+
__license__ = "Apache 2.0"
|
|
16
|
+
|
|
17
|
+
import copy
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
|
|
21
|
+
from rcsb.db.mongo.Connection import Connection
|
|
22
|
+
from rcsb.db.mongo.MongoDbUtil import MongoDbUtil
|
|
23
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class EntityPolymerExtractor(object):
|
|
30
|
+
"""Utilities to extract polymer related data from entry and entity collections."""
|
|
31
|
+
|
|
32
|
+
    def __init__(self, cfgOb, **kwargs):
        """Initialize the extractor and build (or load) the entity-polymer cache.

        Args:
            cfgOb: configuration object used to open the MONGO_DB resource
                   (passed through to Connection by the selection methods).
            **kwargs: forwarded to __rebuildCache (useCache, exdbDirPath,
                      cacheKwargs, selection queries, entryLimit, ...).
        """
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__mU = MarshalUtil()
        # Cache rebuild happens eagerly at construction time; both the
        # per-entry dictionary and the (entryId_authAsymId -> entityId)
        # index are retained for the lifetime of this object.
        self.__entryD, self.__authAsymIdIndex = self.__rebuildCache(**kwargs)
        #
    def __rebuildCache(self, **kwargs):
        """Load the extracted entity-polymer data from a local cache file, or
        rebuild it from the exchange database and persist the result.

        Keyword Args:
            useCache (bool): reuse an existing cache file if readable (default True);
                             when False any existing cache file is removed first.
            exdbDirPath (str): directory holding the cache file (default ".").
            cacheKwargs (dict): serialization options for MarshalUtil
                                (default {"fmt": "pickle"}).

        Returns:
            (dict, dict): (entryD, authIdxD) — per-entry extracted data and the
            entryId_authAsymId -> entityId index. Both empty on failure.
        """
        useCache = kwargs.get("useCache", True)
        dirPath = kwargs.get("exdbDirPath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        #
        # Cache file extension tracks the serialization format.
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "entity-polymer-extracted-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        #
        cD = {"entryD": {}, "authIdxD": {}}
        try:
            self.__mU.mkdir(dirPath)
            if not useCache:
                # Best-effort removal of a stale cache file; ignore if absent.
                for fp in [cacheFilePath]:
                    try:
                        os.remove(fp)
                    except Exception:
                        pass

            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                # Full rebuild: select entries, attach polymer entity details,
                # then derive the auth_asym_id index.
                entryD = self.__selectEntries(**kwargs)
                entryD = self.__selectPolymerEntities(entryD, **kwargs)
                authIdxD = self.__buildIndices(entryD)
                cD["entryD"] = entryD
                cD["authIdxD"] = authIdxD
                if cacheFilePath:
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved entity-polymer extracted results (%d) status %r in %s", len(entryD), ok, cacheFilePath)
        except Exception as e:
            # On any failure fall through and return whatever cD holds
            # (possibly the empty skeleton).
            logger.exception("Failing with %s", str(e))
        return cD["entryD"], cD["authIdxD"]
def __buildIndices(self, entryD):
|
|
74
|
+
indD = {}
|
|
75
|
+
for entryId, eD in entryD.items():
|
|
76
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
77
|
+
for entityId, pD in entityD.items():
|
|
78
|
+
for authAsymId in pD["auth_asym_ids"]:
|
|
79
|
+
# avoid tuples for json serialization
|
|
80
|
+
# indD[(entryId, authAsymId)] = entityId
|
|
81
|
+
indD[entryId + "_" + authAsymId] = entityId
|
|
82
|
+
return indD
|
|
83
|
+
|
|
84
|
+
    def getEntryCount(self):
        """Return the number of entries held in the extracted cache."""
        return len(self.__entryD)
def getRefSeqAccessions(self, dbName):
|
|
88
|
+
acL = []
|
|
89
|
+
try:
|
|
90
|
+
for _, eD in self.__entryD.items():
|
|
91
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
92
|
+
for _, pD in entityD.items():
|
|
93
|
+
for dD in pD["struct_ref"]:
|
|
94
|
+
if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
|
|
95
|
+
acL.append(dD["pdbx_db_accession"])
|
|
96
|
+
return list(set(acL))
|
|
97
|
+
except Exception as e:
|
|
98
|
+
logger.exception("Failing with %s", str(e))
|
|
99
|
+
|
|
100
|
+
return acL
|
|
101
|
+
|
|
102
|
+
def countRefSeqAccessions(self, dbName):
|
|
103
|
+
cD = {}
|
|
104
|
+
try:
|
|
105
|
+
for _, eD in self.__entryD.items():
|
|
106
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
107
|
+
for _, pD in entityD.items():
|
|
108
|
+
iCount = 0
|
|
109
|
+
for dD in pD["struct_ref"]:
|
|
110
|
+
if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
|
|
111
|
+
iCount += 1
|
|
112
|
+
cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
|
|
113
|
+
except Exception as e:
|
|
114
|
+
logger.exception("Failing with %s", str(e))
|
|
115
|
+
|
|
116
|
+
return cD
|
|
117
|
+
|
|
118
|
+
def countRefSeqAccessionDbType(self):
|
|
119
|
+
cD = {}
|
|
120
|
+
try:
|
|
121
|
+
for _, eD in self.__entryD.items():
|
|
122
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
123
|
+
for _, pD in entityD.items():
|
|
124
|
+
for dD in pD["struct_ref"]:
|
|
125
|
+
if "pdbx_db_accession" in dD and "db_name" in dD:
|
|
126
|
+
cD[dD["db_name"]] = cD[dD["db_name"]] + 1 if dD["db_name"] in cD else 1
|
|
127
|
+
except Exception as e:
|
|
128
|
+
logger.exception("Failing with %s", str(e))
|
|
129
|
+
|
|
130
|
+
return cD
|
|
131
|
+
|
|
132
|
+
def countRefSeqAccessionAny(self):
|
|
133
|
+
cD = {}
|
|
134
|
+
try:
|
|
135
|
+
for _, eD in self.__entryD.items():
|
|
136
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
137
|
+
for _, pD in entityD.items():
|
|
138
|
+
iCount = len(pD["struct_ref"])
|
|
139
|
+
# if iCount == 0:
|
|
140
|
+
# logger.info("entryId %r " % (entryId, entityId))
|
|
141
|
+
cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
|
|
142
|
+
except Exception as e:
|
|
143
|
+
logger.exception("Failing with %s", str(e))
|
|
144
|
+
|
|
145
|
+
return cD
|
|
146
|
+
|
|
147
|
+
def getUniqueTaxons(self):
|
|
148
|
+
#
|
|
149
|
+
tD = {}
|
|
150
|
+
try:
|
|
151
|
+
for _, eD in self.__entryD.items():
|
|
152
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
153
|
+
for _, pD in entityD.items():
|
|
154
|
+
# logger.info("Entity dictionary %r", pD.keys())
|
|
155
|
+
if "rcsb_entity_source_organism" in pD:
|
|
156
|
+
for dd in pD["rcsb_entity_source_organism"]:
|
|
157
|
+
if "ncbi_taxonomy_id" in dd:
|
|
158
|
+
tD[dd["ncbi_taxonomy_id"]] = tD[dd["ncbi_taxonomy_id"]] + 1 if dd["ncbi_taxonomy_id"] in tD else 1
|
|
159
|
+
except Exception as e:
|
|
160
|
+
logger.exception("Failing with %s", str(e))
|
|
161
|
+
logger.info("Taxon coverage %d", len(tD))
|
|
162
|
+
return tD
|
|
163
|
+
|
|
164
|
+
def getOrigTaxons(self):
|
|
165
|
+
#
|
|
166
|
+
tD = {}
|
|
167
|
+
try:
|
|
168
|
+
for entryId, eD in self.__entryD.items():
|
|
169
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
170
|
+
for entityId, pD in entityD.items():
|
|
171
|
+
# logger.info("Entity dictionary %r", pD.keys())
|
|
172
|
+
if "original_taxonomy_ids" in pD:
|
|
173
|
+
for tV in pD["original_taxonomy_ids"]:
|
|
174
|
+
tD.setdefault(entryId, []).append((entityId, tV))
|
|
175
|
+
if entryId not in tD:
|
|
176
|
+
logger.debug("No taxonomy for %s", entryId)
|
|
177
|
+
except Exception as e:
|
|
178
|
+
logger.exception("Failing with %s", str(e))
|
|
179
|
+
logger.info("Taxon coverage %d", len(tD))
|
|
180
|
+
return tD
|
|
181
|
+
|
|
182
|
+
def countRefSeqAccessionByTaxon(self, dbNameList=None):
|
|
183
|
+
#
|
|
184
|
+
tD = {}
|
|
185
|
+
iCount = 0
|
|
186
|
+
#
|
|
187
|
+
try:
|
|
188
|
+
for _, eD in self.__entryD.items():
|
|
189
|
+
entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
|
|
190
|
+
for _, pD in entityD.items():
|
|
191
|
+
# logger.info("Entity dictionary %r", pD.keys())
|
|
192
|
+
if "rcsb_entity_source_organism" in pD:
|
|
193
|
+
for dd in pD["rcsb_entity_source_organism"]:
|
|
194
|
+
if "ncbi_taxonomy_id" in dd:
|
|
195
|
+
tId = dd["ncbi_taxonomy_id"]
|
|
196
|
+
for dD in pD["struct_ref"]:
|
|
197
|
+
if "pdbx_db_accession" in dD and "db_name" in dD:
|
|
198
|
+
if dD["db_name"] in dbNameList:
|
|
199
|
+
tD.setdefault(tId, []).append(dD["pdbx_db_accession"])
|
|
200
|
+
iCount += 1
|
|
201
|
+
except Exception as e:
|
|
202
|
+
logger.exception("Failing with %s", str(e))
|
|
203
|
+
|
|
204
|
+
logger.info("Total observed accessions %d", iCount)
|
|
205
|
+
return tD
|
|
206
|
+
|
|
207
|
+
    def checkRefSeqAlignRange(self, dbName):
        """Sanity-check struct_ref_seq alignment ranges against the reference
        sequence length for reference database dbName.

        For each complete struct_ref record (accession, alignments, one-letter
        code and alignment begin present), the covered database range
        [dbBegin, dbEnd] across all alignment segments is compared with the
        reference sequence length; suspicious cases are counted and logged at
        debug level. Incomplete records are tallied separately.

        Returns:
            bool: True unless an unexpected exception occurred during the scan.
        """
        ok = True
        try:
            eCount = 0  # incomplete struct_ref records
            aCount = 0  # inconsistent alignments
            tCount = 0  # total alignments examined
            for entryId, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for entityId, pD in entityD.items():
                    for dD in pD["struct_ref"]:
                        if "db_name" in dD and dD["db_name"] == dbName:
                            if "pdbx_db_accession" in dD and "alignD" in dD and "pdbx_seq_one_letter_code" in dD and "pdbx_align_begin" in dD:
                                seqLen = len(dD["pdbx_seq_one_letter_code"])
                                # Sentinels so min/max pick up the first segment.
                                dbBegin = 100000000
                                dbEnd = -1
                                refSeqDbBegin = dD["pdbx_align_begin"]
                                for authAsymId, alDL in dD["alignD"].items():
                                    tCount += 1
                                    difL = []
                                    for alD in alDL:
                                        tBeg = alD["db_align_beg"]
                                        tEnd = alD["db_align_end"]
                                        tDif = tEnd - tBeg + 1
                                        difL.append(tDif)
                                        dbBegin = min(tBeg, dbBegin)
                                        dbEnd = max(tEnd, dbEnd)

                                    # NOTE(review): the span test omits the usual
                                    # +1 for an inclusive range (dbEnd - dbBegin + 1)
                                    # and also requires that the declared alignment
                                    # begin disagrees with the observed minimum —
                                    # kept as-is; confirm this off-by-one is intended.
                                    # if seqLen < dbEnd - dbBegin + 1:
                                    if seqLen < dbEnd - dbBegin and not refSeqDbBegin == dbBegin:
                                        fDif = sum(difL)
                                        logger.debug(
                                            "Bad alignment for %r %r %r %r (%d) seqLen %r (%d) dbBegin %r dbEnd %r difL %r tDif %r",
                                            entryId,
                                            entityId,
                                            authAsymId,
                                            alD["pdbx_strand_id"],
                                            len(alDL),
                                            seqLen,
                                            dbEnd - dbBegin + 1,
                                            dbBegin,
                                            dbEnd,
                                            difL,
                                            fDif,
                                        )
                                        aCount += 1

                            else:
                                eCount += 1
            logger.info("Incomplete %s struct_ref record count %d", dbName, eCount)
            logger.info("Inconsistent %s db reference alignments %d/%d", dbName, aCount, tCount)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False

        return ok
def getEntityRefSeqAccessions(self, dbName, entryId, entityId):
|
|
266
|
+
acL = []
|
|
267
|
+
try:
|
|
268
|
+
dL = self.__entryD[entryId]["selected_polymer_entities"][entityId]["struct_ref"]
|
|
269
|
+
acL = list(set([d["pdbx_db_accession"] for d in dL if d["db_name"] == dbName]))
|
|
270
|
+
except Exception as e:
|
|
271
|
+
logger.exception("Failing with %s %r %r %s", dbName, entryId, entityId, str(e))
|
|
272
|
+
return acL
|
|
273
|
+
|
|
274
|
+
def __selectEntries(self, **kwargs):
|
|
275
|
+
"""Return a dictionary of PDB entries satifying the input conditions (e.g. method, resolution limit)"""
|
|
276
|
+
|
|
277
|
+
dbName = kwargs.get("dbName", "pdbx_core")
|
|
278
|
+
collectionName = kwargs.get("collectionName", "pdbx_core_entry")
|
|
279
|
+
selectionQueryD = kwargs.get("entrySelectionQuery", {})
|
|
280
|
+
#
|
|
281
|
+
entryD = {}
|
|
282
|
+
try:
|
|
283
|
+
with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
|
|
284
|
+
mg = MongoDbUtil(client)
|
|
285
|
+
if mg.collectionExists(dbName, collectionName):
|
|
286
|
+
logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
|
|
287
|
+
qD = {}
|
|
288
|
+
if selectionQueryD:
|
|
289
|
+
qD.update(qD)
|
|
290
|
+
selectL = ["rcsb_entry_container_identifiers"]
|
|
291
|
+
dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
|
|
292
|
+
logger.info("Selection %r fetch result count %d", selectL, len(dL))
|
|
293
|
+
#
|
|
294
|
+
for dD in dL:
|
|
295
|
+
#
|
|
296
|
+
if (
|
|
297
|
+
("rcsb_entry_container_identifiers" in dD)
|
|
298
|
+
and ("entry_id" in dD["rcsb_entry_container_identifiers"])
|
|
299
|
+
and ("polymer_entity_ids" in dD["rcsb_entry_container_identifiers"])
|
|
300
|
+
and dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]
|
|
301
|
+
):
|
|
302
|
+
entryD[dD["rcsb_entry_container_identifiers"]["entry_id"]] = {"polymer_entity_ids": dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]}
|
|
303
|
+
|
|
304
|
+
except Exception as e:
|
|
305
|
+
logger.exception("Failing with %s", str(e))
|
|
306
|
+
return entryD
|
|
307
|
+
#
|
|
308
|
+
|
|
309
|
+
def __selectPolymerEntities(self, entryD, **kwargs):
|
|
310
|
+
"""Skeleton entity selector recovering essential biological sequence mapping features
|
|
311
|
+
for macromolecules (default type = protein).
|
|
312
|
+
|
|
313
|
+
"1CP9": {
|
|
314
|
+
"polymer_entity_ids": [
|
|
315
|
+
"1",
|
|
316
|
+
"2"
|
|
317
|
+
],
|
|
318
|
+
"selected_polymer_entities": {
|
|
319
|
+
"1": {
|
|
320
|
+
"rcsb_multiple_source_flag": "N",
|
|
321
|
+
"asym_ids": [
|
|
322
|
+
"A"
|
|
323
|
+
],
|
|
324
|
+
"auth_asym_ids": [
|
|
325
|
+
"A"
|
|
326
|
+
],
|
|
327
|
+
"entity_id": "1",
|
|
328
|
+
"type": "polypeptide(L)",
|
|
329
|
+
"rcsb_entity_polymer_type": "Protein",
|
|
330
|
+
"rcsb_entity_source_organism": [
|
|
331
|
+
{
|
|
332
|
+
"ncbi_taxonomy_id": 587,
|
|
333
|
+
"beg_seq_num": 1,
|
|
334
|
+
"end_seq_num": 205,
|
|
335
|
+
"ncbi_scientific_name": "Providencia rettgeri"
|
|
336
|
+
}
|
|
337
|
+
],
|
|
338
|
+
"struct_ref": [
|
|
339
|
+
{
|
|
340
|
+
"id": "1",
|
|
341
|
+
"db_name": "UNP",
|
|
342
|
+
"pdbx_db_accession": "Q7WZI9",
|
|
343
|
+
"entity_id": "1",
|
|
344
|
+
"pdbx_seq_one_letter_code": "QSTQIKIERDNYGVPHIYANDTYSLFYGYGYA...",
|
|
345
|
+
"alignD": {
|
|
346
|
+
"A": [
|
|
347
|
+
{
|
|
348
|
+
"align_id": "1",
|
|
349
|
+
"ref_id": "1",
|
|
350
|
+
"pdbx_PDB_id_code": "1CP9",
|
|
351
|
+
"pdbx_strand_id": "A",
|
|
352
|
+
"seq_align_beg": 1,
|
|
353
|
+
"seq_align_end": 205,
|
|
354
|
+
"pdbx_db_accession": "Q7WZI9",
|
|
355
|
+
"db_align_beg": 24,
|
|
356
|
+
"db_align_end": 228,
|
|
357
|
+
"pdbx_auth_seq_align_beg": "1",
|
|
358
|
+
"pdbx_auth_seq_align_end": "205",
|
|
359
|
+
"rcsb_entity_id": "1"
|
|
360
|
+
}
|
|
361
|
+
]
|
|
362
|
+
}
|
|
363
|
+
}
|
|
364
|
+
]
|
|
365
|
+
},
|
|
366
|
+
"2": {
|
|
367
|
+
"rcsb_multiple_source_flag": "N",
|
|
368
|
+
"asym_ids": [
|
|
369
|
+
"B"
|
|
370
|
+
],
|
|
371
|
+
"auth_asym_ids": [
|
|
372
|
+
"B"
|
|
373
|
+
],
|
|
374
|
+
"entity_id": "2",
|
|
375
|
+
"type": "polypeptide(L)",
|
|
376
|
+
"rcsb_entity_polymer_type": "Protein",
|
|
377
|
+
"rcsb_entity_source_organism": [
|
|
378
|
+
{
|
|
379
|
+
"ncbi_taxonomy_id": 587,
|
|
380
|
+
"beg_seq_num": 1,
|
|
381
|
+
"end_seq_num": 553,
|
|
382
|
+
"ncbi_scientific_name": "Providencia rettgeri"
|
|
383
|
+
}
|
|
384
|
+
],
|
|
385
|
+
"struct_ref": [
|
|
386
|
+
{
|
|
387
|
+
"id": "2",
|
|
388
|
+
"db_name": "UNP",
|
|
389
|
+
"pdbx_db_accession": "Q7WZI9",
|
|
390
|
+
"entity_id": "2",
|
|
391
|
+
"pdbx_seq_one_letter_code": "SNVWLVGKTKASGAKAILLNGPQFGWFNPAYTYGIGLHG",
|
|
392
|
+
"alignD": {
|
|
393
|
+
"B": [
|
|
394
|
+
{
|
|
395
|
+
"align_id": "2",
|
|
396
|
+
"ref_id": "2",
|
|
397
|
+
"pdbx_PDB_id_code": "1CP9",
|
|
398
|
+
"pdbx_strand_id": "B",
|
|
399
|
+
"seq_align_beg": 1,
|
|
400
|
+
"seq_align_end": 553,
|
|
401
|
+
"pdbx_db_accession": "Q7WZI9",
|
|
402
|
+
"db_align_beg": 285,
|
|
403
|
+
"db_align_end": 837,
|
|
404
|
+
"pdbx_auth_seq_align_beg": "1",
|
|
405
|
+
"pdbx_auth_seq_align_end": "553",
|
|
406
|
+
"rcsb_entity_id": "2"
|
|
407
|
+
}
|
|
408
|
+
]
|
|
409
|
+
}
|
|
410
|
+
}
|
|
411
|
+
]
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
},
|
|
415
|
+
|
|
416
|
+
"""
|
|
417
|
+
dbName = kwargs.get("dbName", "pdbx_core")
|
|
418
|
+
collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
|
|
419
|
+
resultKey = kwargs.get("resultKey", "selected_polymer_entities")
|
|
420
|
+
|
|
421
|
+
entryLimit = kwargs.get("entryLimit", None)
|
|
422
|
+
selectionQueryD = kwargs.get("entitySelectionQuery", {"entity_poly.rcsb_entity_polymer_type": "Protein"})
|
|
423
|
+
#
|
|
424
|
+
try:
|
|
425
|
+
with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
|
|
426
|
+
mg = MongoDbUtil(client)
|
|
427
|
+
if mg.collectionExists(dbName, collectionName):
|
|
428
|
+
logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
|
|
429
|
+
selectL = [
|
|
430
|
+
"rcsb_polymer_entity_container_identifiers",
|
|
431
|
+
"entity.rcsb_multiple_source_flag",
|
|
432
|
+
"entity_poly.type",
|
|
433
|
+
"entity_poly.rcsb_entity_polymer_type",
|
|
434
|
+
"entity_poly.pdbx_seq_one_letter_code_can",
|
|
435
|
+
"rcsb_entity_source_organism.ncbi_taxonomy_id",
|
|
436
|
+
"rcsb_entity_source_organism.ncbi_scientific_name",
|
|
437
|
+
"rcsb_entity_source_organism.beg_seq_num",
|
|
438
|
+
"rcsb_entity_source_organism.end_seq_num",
|
|
439
|
+
"struct_ref.id",
|
|
440
|
+
"struct_ref.pdbx_db_accession",
|
|
441
|
+
"struct_ref.db_name",
|
|
442
|
+
"struct_ref.entity_id",
|
|
443
|
+
"struct_ref.pdbx_seq_one_letter_code",
|
|
444
|
+
"struct_ref.pdbx_align_begin",
|
|
445
|
+
"struct_ref_seq",
|
|
446
|
+
#
|
|
447
|
+
"entity_src_nat.pdbx_ncbi_taxonomy_id",
|
|
448
|
+
"entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id",
|
|
449
|
+
"entity_src_gen.pdbx_host_org_ncbi_taxonomy_id",
|
|
450
|
+
"pdbx_entity_src_syn.ncbi_taxonomy_id",
|
|
451
|
+
]
|
|
452
|
+
iCount = 0
|
|
453
|
+
for entryId in entryD:
|
|
454
|
+
#
|
|
455
|
+
if resultKey in entryD[entryId]:
|
|
456
|
+
continue
|
|
457
|
+
#
|
|
458
|
+
qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
|
|
459
|
+
qD.update(selectionQueryD)
|
|
460
|
+
#
|
|
461
|
+
dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
|
|
462
|
+
logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
|
|
463
|
+
eD = {}
|
|
464
|
+
for ii, dD in enumerate(dL, 1):
|
|
465
|
+
rD = {}
|
|
466
|
+
logger.debug("%s (%4d) d is %r", entryId, ii, dD)
|
|
467
|
+
if "entity" in dD:
|
|
468
|
+
rD["rcsb_multiple_source_flag"] = dD["entity"]["rcsb_multiple_source_flag"] if "rcsb_multiple_source_flag" in dD["entity"] else "N"
|
|
469
|
+
#
|
|
470
|
+
if "rcsb_polymer_entity_container_identifiers" in dD:
|
|
471
|
+
rD["asym_ids"] = dD["rcsb_entity_container_identifiers"]["asym_ids"] if "asym_ids" in dD["rcsb_entity_container_identifiers"] else []
|
|
472
|
+
rD["auth_asym_ids"] = dD["rcsb_entity_container_identifiers"]["auth_asym_ids"] if "auth_asym_ids" in dD["rcsb_entity_container_identifiers"] else []
|
|
473
|
+
rD["entity_id"] = dD["rcsb_entity_container_identifiers"]["entity_id"]
|
|
474
|
+
#
|
|
475
|
+
if "entity_poly" in dD:
|
|
476
|
+
rD["type"] = dD["entity_poly"]["type"] if "type" in dD["entity_poly"] else None
|
|
477
|
+
rD["rcsb_entity_polymer_type"] = dD["entity_poly"]["rcsb_entity_polymer_type"] if "rcsb_entity_polymer_type" in dD["entity_poly"] else None
|
|
478
|
+
rD["entity_polymer_length"] = len(dD["entity_poly"]["pdbx_seq_one_letter_code_can"]) if "pdbx_seq_one_letter_code_can" in dD["entity_poly"] else 0
|
|
479
|
+
#
|
|
480
|
+
tL = []
|
|
481
|
+
if "rcsb_entity_source_organism" in dD:
|
|
482
|
+
for tD in dD["rcsb_entity_source_organism"]:
|
|
483
|
+
tL.append(tD)
|
|
484
|
+
rD["rcsb_entity_source_organism"] = copy.copy(tL)
|
|
485
|
+
#
|
|
486
|
+
qDL = []
|
|
487
|
+
if "struct_ref" in dD:
|
|
488
|
+
for tD in dD["struct_ref"]:
|
|
489
|
+
if "db_name" in tD:
|
|
490
|
+
tD["db_name"] = str(tD["db_name"]).upper().strip()
|
|
491
|
+
tD["db_name"] = "UNP" if tD["db_name"] in ["TREMBL"] else tD["db_name"]
|
|
492
|
+
qDL.append(tD)
|
|
493
|
+
if "struct_ref_seq" in dD:
|
|
494
|
+
for qD in qDL:
|
|
495
|
+
refId = qD["id"]
|
|
496
|
+
alignL = []
|
|
497
|
+
for tD in dD["struct_ref_seq"]:
|
|
498
|
+
if refId == tD["ref_id"]:
|
|
499
|
+
alignL.append(tD)
|
|
500
|
+
# qD['align_list'] = copy.copy(aL)
|
|
501
|
+
for align in alignL:
|
|
502
|
+
authAsymId = align["pdbx_strand_id"]
|
|
503
|
+
qD.setdefault("alignD", {}).setdefault(authAsymId, []).append(align)
|
|
504
|
+
|
|
505
|
+
rD["struct_ref"] = qDL
|
|
506
|
+
#
|
|
507
|
+
taxIdL = []
|
|
508
|
+
if "entity_src_nat" in dD:
|
|
509
|
+
for tD in dD["entity_src_nat"]:
|
|
510
|
+
if "pdbx_ncbi_taxonomy_id" in tD:
|
|
511
|
+
taxIdL.append(tD["pdbx_ncbi_taxonomy_id"])
|
|
512
|
+
if "entity_src_gen" in dD:
|
|
513
|
+
for tD in dD["entity_src_gen"]:
|
|
514
|
+
if "pdbx_gene_src_ncbi_taxonomy_id" in tD:
|
|
515
|
+
taxIdL.append(tD["pdbx_gene_src_ncbi_taxonomy_id"])
|
|
516
|
+
if "pdbx_host_org_ncbi_taxonomy_id" in tD:
|
|
517
|
+
taxIdL.append(tD["pdbx_host_org_ncbi_taxonomy_id"])
|
|
518
|
+
if "pdbx_entity_src_syn" in dD:
|
|
519
|
+
for tD in dD["pdbx_entity_src_syn"]:
|
|
520
|
+
if "ncbi_taxonomy_id" in tD:
|
|
521
|
+
taxIdL.append(tD["ncbi_taxonomy_id"])
|
|
522
|
+
qL = []
|
|
523
|
+
for taxId in taxIdL:
|
|
524
|
+
ttL = [int(t.strip()) for t in taxId.split(",") if t.strip().isdigit()]
|
|
525
|
+
qL.extend(ttL)
|
|
526
|
+
logger.debug("TaxId list %r", qL)
|
|
527
|
+
rD["original_taxonomy_ids"] = copy.copy(list(set(qL)))
|
|
528
|
+
#
|
|
529
|
+
if "entity_id" in rD:
|
|
530
|
+
eD[rD["entity_id"]] = copy.copy(rD)
|
|
531
|
+
|
|
532
|
+
entryD[entryId][resultKey] = copy.copy(eD)
|
|
533
|
+
|
|
534
|
+
iCount += 1
|
|
535
|
+
if iCount % 1000 == 0:
|
|
536
|
+
logger.info("Completed fetch %d/%d entries", iCount, len(entryD))
|
|
537
|
+
if entryLimit and iCount >= entryLimit:
|
|
538
|
+
logger.info("Quitting after %d", iCount)
|
|
539
|
+
break
|
|
540
|
+
|
|
541
|
+
except Exception as e:
|
|
542
|
+
logger.exception("Failing with %s", str(e))
|
|
543
|
+
|
|
544
|
+
return entryD
|