rcsb.exdb 1.31 (rcsb_exdb-1.31-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py
@@ -0,0 +1,598 @@
+##
+# File: ReferenceSequenceAnnotationAdapter.py
+# Date: 8-Oct-2019 jdw
+#
+# Selected utilities to update reference sequence annotations information
+# in the core_entity collection.
+#
+# Updates:
+#
+##
+__docformat__ = "google en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+import copy
+import logging
+
+from collections import defaultdict
+
+from rcsb.exdb.utils.ObjectAdapterBase import ObjectAdapterBase
+
+logger = logging.getLogger(__name__)
+
+
+class ReferenceSequenceAnnotationAdapter(ObjectAdapterBase):
+    """Selected utilities to update reference sequence annotations information
+    in the core_entity collection.
+    """
+
+    def __init__(self, referenceSequenceAnnotationProvider):
+        super(ReferenceSequenceAnnotationAdapter, self).__init__()
+        #
+        self.__rsaP = referenceSequenceAnnotationProvider
+        self.__ssP = self.__rsaP.getSiftsSummaryProvider()
+        self.__ecP = self.__rsaP.getEcProvider()
+        self.__refD = self.__rsaP.getRefData()
+        self.__matchD = self.__rsaP.getMatchInfo()
+        self.__ggP = self.__rsaP.getGlyGenProvider()
+        #
+
+    def filter(self, obj, **kwargs):
+        isTestMode = False
+        if isTestMode:
+            ok1, tObj = self.__filterAccessions(copy.deepcopy(obj))
+            ok2, tObj = self.__filterFeatures(tObj)
+            return ok1 and ok2, obj
+        else:
+            ok1, obj = self.__filterAccessions(obj)
+            ok2, obj = self.__filterFeatures(obj)
+            return ok1 and ok2, obj
+
+    def __filterFeatures(self, obj):
+        """Uses data from uniprot_exdb.reference_entry to populate the following data at the pdbx_core_polymer_entity collection:
+        rcsb_polymer_entity_annotation:
+            - GO (data from UniProt + lineage info from GO source)
+            - InterPro (data from UniProt)
+            - Pfam (is possible but not currently enabled)
+            - GlyGen
+        rcsb_enzyme_class_combined
+            - EC (data from UniProt)
+        rcsb_ec_lineage
+            - EC (data from EC source)
+        rcsb_gene_name
+            - Gene/taxonomy data from UniProt
+        """
+        ok = True
+        try:
+            if not ("rcsb_polymer_entity_container_identifiers" in obj and "rcsb_id" in obj):
+                return False, obj
+            entityKey = obj["rcsb_id"]
+            eciD = obj["rcsb_polymer_entity_container_identifiers"]
+
+            #
+            logger.debug(" ------------- Running feature filter on %r --------------", entityKey)
+            #
+            rsDL = []
+            soDL = []
+            peaDL = []
+            peObj = {}
+            #
+            try:
+                rsDL = eciD["reference_sequence_identifiers"]
+            except Exception:
+                pass
+
+            try:
+                soDL = obj["rcsb_entity_source_organism"]
+            except Exception:
+                pass
+            #
+            try:
+                peObj = obj["rcsb_polymer_entity"]
+            except Exception:
+                pass
+            #
+            try:
+                peaDL = obj["rcsb_polymer_entity_annotation"]
+            except Exception:
+                pass
+            #
+            # rsD {'database_name': 'UniProt', 'database_accession': 'P06881', 'provenance_source': 'PDB'}
+            unpIdS = set()
+            for rsD in rsDL:
+                if "database_name" in rsD and rsD["database_name"] == "UniProt" and "database_accession" in rsD:
+                    unpIdS.add(rsD["database_accession"])
+            #
+            unpGeneDL = []
+            unpAnnDL = []
+            glygenDL = []
+            geneLookupD = {}
+            geneFilterD = defaultdict(int)
+            resourceFilterD = defaultdict(int)
+            # Loop over all UniProt IDs from `rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession`
+            for unpId in unpIdS:
+                uD = self.__refD[unpId] if unpId in self.__refD else None
+                # uD represents a document from uniprot_exdb.reference_entry
+                if not uD:
+                    # This occurs when the UniProt IDs from:
+                    #   rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession
+                    # are not in:
+                    #   uniprot_exdb.reference_entry
+                    logger.info("%s no reference data for unexpected UniProt accession %r", entityKey, unpId)
+                    continue
+                if "gene" in uD and "taxonomy_id" in uD:
+                    taxId = int(uD["taxonomy_id"])
+                    logger.debug("%s : %r gene names %r", entityKey, unpId, uD["gene"])
+                    for tD in uD["gene"]:
+                        geneFilterD[tD["name"]] += 1
+                        if geneFilterD[tD["name"]] > 1:
+                            continue
+                        geneLookupD[tD["name"].upper()] = tD["name"]
+                        unpGeneDL.append({"provenance_source": "UniProt", "value": tD["name"], "taxonomy_id": taxId})
+                if "dbReferences" in uD:
+                    logger.debug("%s : %r references %d", entityKey, unpId, len(uD["dbReferences"]))
+                    for tD in uD["dbReferences"]:
+                        # Skipping Pfam now
+                        if "resource" in tD and "id_code" in tD and tD["resource"] in ["GO", "InterPro"]:
+                            resourceFilterD[(tD["resource"], tD["id_code"])] += 1
+                            if resourceFilterD[(tD["resource"], tD["id_code"])] > 1:
+                                logger.debug("Skipping duplicate annotation %r %r", tD["resource"], tD["id_code"])
+                                continue
+                            if tD["resource"] in ["GO"]:
+                                if self.__rsaP.goIdExists(tD["id_code"]):
+                                    goLin = self.__rsaP.getGeneOntologyLineage([tD["id_code"]])
+                                    goName = self.__rsaP.getGeneOntologyName(tD["id_code"])
+                                    if goLin and goName:
+                                        unpAnnDL.append(
+                                            {
+                                                "provenance_source": "UniProt",
+                                                "annotation_id": tD["id_code"],
+                                                "type": tD["resource"],
+                                                "name": goName,
+                                                "assignment_version": uD["version"],
+                                                "annotation_lineage": goLin,
+                                            }
+                                        )
+                            elif tD["resource"] in ["Pfam"]:
+                                pfamName = self.__rsaP.getPfamName(tD["id_code"])
+                                if pfamName:
+                                    unpAnnDL.append(
+                                        {
+                                            "provenance_source": "UniProt",
+                                            "annotation_id": tD["id_code"],
+                                            "name": pfamName,
+                                            "type": tD["resource"],
+                                            "assignment_version": uD["version"],
+                                        }
+                                    )
+                                else:
+                                    unpAnnDL.append({"provenance_source": "UniProt", "annotation_id": tD["id_code"], "type": tD["resource"], "assignment_version": uD["version"]})
+
+                            elif tD["resource"] in ["InterPro"]:
+                                interProName = self.__rsaP.getInterProName(tD["id_code"])
+                                interProLinL = self.__rsaP.getInterProLineage(tD["id_code"])
+                                if interProName and interProLinL:
+                                    unpAnnDL.append(
+                                        {
+                                            "provenance_source": "UniProt",
+                                            "annotation_id": tD["id_code"],
+                                            "name": interProName,
+                                            "type": tD["resource"],
+                                            "assignment_version": uD["version"],
+                                            "annotation_lineage": interProLinL,
+                                        }
+                                    )
+                                else:
+                                    unpAnnDL.append({"provenance_source": "UniProt", "annotation_id": tD["id_code"], "type": tD["resource"], "assignment_version": uD["version"]})
+
+                            else:
+                                unpAnnDL.append({"provenance_source": "UniProt", "annotation_id": tD["id_code"], "type": tD["resource"], "assignment_version": uD["version"]})
+                #
+                if self.__ggP and self.__ggP.hasGlycoprotein(unpId):
+                    logger.debug("Mapping glycoprotein for %r", unpId)
+                    glygenDL.append(
+                        {
+                            "provenance_source": "PDB",  # This should be GlyGen
+                            "annotation_id": unpId,
+                            "name": "Glycoprotein",
+                            "type": "GlyGen",
+                            "assignment_version": "1.0",
+                        }
+                    )
+
+            #
+            # raD {'resource_identifier': 'PF00503', 'provenance_source': 'SIFTS', 'resource_name': 'Pfam'}
+            # "provenance_source": <"PDB"|"RCSB"|"SIFTS"|"UniProt"> "GO", "InterPro", "Pfam"
+            #
+            # ------------
+            # Filter existing annotations identifiers
+            # (remove any pre-existing UniProt and GlyGen annotations, so that most recent ones gathered above can be added)
+            if peaDL:
+                qL = []
+                for peaD in peaDL:
+                    if (peaD["provenance_source"] == "UniProt") or (peaD["type"] == "GlyGen"):
+                        continue
+                    qL.append(peaD)
+                # Put back the base object list -
+                peaDL = qL
+
+            for unpAnnD in unpAnnDL:
+                peaDL.append(unpAnnD)
+            #
+            if glygenDL:
+                # logger.debug("%r glygenDL (%d) %r", entityKey, len(glygenDL), glygenDL)
+                peaDL.extend(glygenDL)
+
+            if peaDL:
+                obj["rcsb_polymer_entity_annotation"] = peaDL
+                # logger.info("annotation object is %r", obj["rcsb_polymer_entity_annotation"])
+            #
+            # -------------- Add gene names -----------------
+            #
+            numSource = len(soDL)  # number of items originally present in rcsb_entity_source_organism
+            logger.debug("%s unpGeneDL %r", entityKey, unpGeneDL)
+            for ii, soD in enumerate(soDL):
+                if "ncbi_taxonomy_id" not in soD:
+                    continue
+                logger.debug("soD (%d) taxonomy %r", ii, soD["ncbi_taxonomy_id"])
+                # Filter any existing annotations
+                if "rcsb_gene_name" in soD:
+                    qL = []
+                    for qD in soD["rcsb_gene_name"]:
+                        if "value" not in qD:
+                            continue
+                        if qD["provenance_source"] != "UniProt":
+                            # standardize case consistent with UniProt
+                            if qD["value"].upper() in geneLookupD:
+                                qD["value"] = geneLookupD[qD["value"].upper()]
+                            else:
+                                geneLookupD[qD["value"].upper()] = qD["value"]
+                        qL.append(qD)
+                    soD["rcsb_gene_name"] = qL
+                taxId = soD["ncbi_taxonomy_id"]
+                for unpGeneD in unpGeneDL:
+                    # Only for matching taxonomies
+                    if taxId == unpGeneD["taxonomy_id"]:
+                        # skip cases with primary annotations and multiple sources
+                        if "rcsb_gene_name" in soD and numSource > 1:
+                            logger.debug("%s skipping special chimeric case", entityKey)
+                            continue
+                        soD.setdefault("rcsb_gene_name", []).append({"provenance_source": unpGeneD["provenance_source"], "value": unpGeneD["value"]})
+            #
+            # -------------- Remapping/extending EC assignments. --------------
+            if peObj:
+                linL = []
+                enzD = {}
+                if "rcsb_enzyme_class_combined" in peObj:
+                    logger.debug("%s PDB EC assignment %r", entityKey, peObj["rcsb_enzyme_class_combined"])
+                    enzD = {tD["ec"]: tD["provenance_source"] for tD in peObj["rcsb_enzyme_class_combined"]}
+                    logger.debug("%s PDB EC assignment mapped %r", entityKey, enzD)
+                #
+                unpEcD = {}
+                for unpId in unpIdS:
+                    uD = self.__refD[unpId] if unpId in self.__refD else None
+                    if not uD:
+                        logger.info("%s no data for unexpected UniProt accession %r", entityKey, unpId)
+                        continue
+                    if "dbReferences" in uD:
+                        logger.debug("%s : %r references %d", entityKey, unpId, len(uD["dbReferences"]))
+                        for tD in uD["dbReferences"]:
+                            if "resource" in tD and "id_code" in tD and tD["resource"] in ["EC"]:
+                                logger.debug("%s UniProt accession %r EC %r", entityKey, unpId, tD)
+                                tEc = self.__ecP.normalize(tD["id_code"])
+                                if self.__ecP.exists(tEc):
+                                    unpEcD[tEc] = "UniProt"
+                # integrate the UniProt data and update the object -
+                if unpEcD:
+                    logger.debug("%s UniProt EC assignment %r", entityKey, unpEcD)
+                    for ecId in unpEcD:
+                        if ecId in enzD:
+                            continue
+                        enzD[ecId] = unpEcD[ecId]
+                    for ecId in enzD:
+                        tL = self.__ecP.getLineage(ecId)
+                        if tL:
+                            linL.extend(tL)
+                    peObj["rcsb_enzyme_class_combined"] = [{"ec": k, "provenance_source": v, "depth": k.count(".") + 1} for k, v in enzD.items()]
+                    peObj["rcsb_ec_lineage"] = [{"depth": tup[0], "id": tup[1], "name": tup[2]} for tup in linL]
+            #
+        except Exception as e:
+            ok = False
+            logger.exception("Feature filter adapter failing with error with %s", str(e))
+        #
+        return ok, obj
+
+    def __filterAccessions(self, obj):
+        ok = True
+        try:
+            entityKey = obj["rcsb_id"]
+            logger.debug(" ------------- Running accession filter on %r --------------", entityKey)
+            #
+            referenceDatabaseName = "UniProt"
+            provSourceL = ["PDB"]
+            alignDL = None
+            ersDL = None
+            authAsymIdL = None
+            taxIdL = None
+            try:
+                ersDL = obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]
+                authAsymIdL = obj["rcsb_polymer_entity_container_identifiers"]["auth_asym_ids"]
+            except Exception:
+                logger.debug("%s no reference assignment protein sequence.", entityKey)
+
+            #
+            try:
+                taxIdL = [oD["ncbi_taxonomy_id"] for oD in obj["rcsb_entity_source_organism"]]
+                taxIdL = list(set(taxIdL))
+                logger.debug("%s taxonomy (%d) %r", entityKey, len(taxIdL), taxIdL)
+            except Exception as e:
+                logger.debug("Failing with %s", str(e))
+            #
+            if ersDL:
+                retDL = []
+                dupD = {}
+                # Loop over all identifier docs in `rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers`
+                for ersD in ersDL:
+                    # Check currency of reference assignments made by entities in provSourceL (e.g. in this case only PDB)
+                    isMatchedRefDb, isMatchedAltDb, updErsD = self.__reMapAccessions(entityKey, ersD, referenceDatabaseName, taxIdL, provSourceL)
+                    # Possible results:
+                    #   '4XVA_1' isMatchedRefDb False isMatchedAltDb False updErsD {'database_name': 'UniProt', 'database_accession': 'P00431', 'provenance_source': 'SIFTS'}
+                    #     - Most are this
+                    #     - '4XBI_1' another example updErsD {'database_name': 'UniProt', 'database_accession': 'Q8IB03', 'provenance_source': 'SIFTS'}
+                    #   '8IVK_1' isMatchedRefDb True isMatchedAltDb False updErsD {'database_name': 'UniProt', 'database_accession': 'C3SKF0', 'provenance_source': 'PDB'}
+                    #     - Not super common
+                    #   '1W3M_1' isMatchedRefDb False isMatchedAltDb True updErsD {'database_name': 'NORINE', 'database_accession': 'NOR00763', 'provenance_source': 'PDB'}
+                    #     - Not super common
+                    logger.debug("%r isMatchedRefDb %r isMatchedAltDb %r updErsD %r", entityKey, isMatchedRefDb, isMatchedAltDb, updErsD)
+
+                    if (isMatchedRefDb or isMatchedAltDb) and updErsD["database_accession"] not in dupD:
+                        dupD[updErsD["database_accession"]] = True
+                        retDL.append(updErsD)
+                #
+                # Re-apply the latest SIFTS mapping if available and we did not match the target reference database ...
+                if not isMatchedRefDb and entityKey not in dupD:
+                    dupD[entityKey] = True
+                    siftsAccDL = self.__getSiftsAccessions(entityKey, authAsymIdL)
+                    for siftsAccD in siftsAccDL:
+                        logger.debug("Using/adding SIFTS accession mapping for %s", entityKey)
+                        retDL.append(siftsAccD)
+                    if not siftsAccDL:
+                        logger.debug("No alternative SIFTS accession mapping for %s", entityKey)
+                        # No alternative SIFTS accession mapping for 1W3M_1
+
+                if retDL:
+                    logger.debug("%s retDL %r", entityKey, retDL)
+                    obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"] = retDL
+                else:
+                    del obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]
+                    logger.debug("Incomplete reference sequence mapping for %s", entityKey)
+            #
+            # ------------- update alignment details -------------
+            try:
+                alignDL = obj["rcsb_polymer_entity_align"]
+            except Exception:
+                pass
+            if alignDL and authAsymIdL:
+                retDL = []
+                dupD = {}
+                for alignD in alignDL:
+                    isMatchedRefDb, isMatchedAltDb, updAlignD, alignHash = self.__reMapAlignments(entityKey, alignD, referenceDatabaseName, taxIdL, provSourceL)
+                    #
+                    if (isMatchedRefDb or isMatchedAltDb) and alignHash not in dupD:
+                        if alignHash:
+                            dupD[alignHash] = True
+                        retDL.append(updAlignD)
+                #
+                # logger.debug("%s retDL %r", entityKey, retDL)
+                #
+                if not isMatchedRefDb and entityKey not in dupD:
+                    dupD[entityKey] = True
+                    siftsAlignDL = self.__getSiftsAlignments(entityKey, authAsymIdL)
+                    for siftsAlignD in siftsAlignDL:
+                        logger.debug("Using/adding SIFTS mapping for the alignment of %s", entityKey)
+                        retDL.append(siftsAlignD)
+                    if not siftsAlignDL:
+                        logger.debug("No alternative SIFTS alignment for %s", entityKey)
+                #
+                if retDL:
+                    obj["rcsb_polymer_entity_align"] = retDL
+                else:
+                    del obj["rcsb_polymer_entity_align"]
+                    logger.debug("Reference sequence alignment NOT updated for %s", entityKey)
+        except Exception as e:
+            ok = False
+            logger.exception("Filter adapter failing with error with %s", str(e))
+        #
+        return ok, obj
+
+    def __reMapAccessions(self, entityKey, rsiD, referenceDatabaseName, taxIdL, provSourceL, excludeReferenceDatabases=None):
+        """Internal method to re-map accession for the input database and assignment source
+
+        Args:
+            rsiDL (list): current list of accession
+            databaseName (str, optional): resource database name. Defaults to 'UniProt'.
+            provSource (str, optional): assignment provenance. Defaults to 'PDB'.
+
+        Returns:
+            bool, bool, dict: flag for mapping success, flag for a supported reference database,
+                              and remapped (and unmapped) accessions in the input object list
+
+        Example:
+            "P14118": {
+                "searchId": "P14118",
+                "matchedIds": {
+                    "P84099": {
+                        "taxId": 10090
+                    },
+                    "P84100": {
+                        "taxId": 10116
+                    },
+                    "P84098": {
+                        "taxId": 9606
+                    }
+                },
+                "matched": "secondary"
+            },
+        """
+        isMatchedRefDb = False
+        isMatchedAltDb = False
+        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
+        refDbList = ["UniProt", "GenBank", "EMBL", "NDB", "NORINE", "PIR", "PRF", "RefSeq"]
+        #
+        rId = rsiD["database_accession"]
+        logger.debug("%s rId %r db %r prov %r", entityKey, rId, rsiD["database_name"], rsiD["provenance_source"])
+        #
+        if rsiD["database_name"] in excludeReferenceDatabases:
+            isMatchedAltDb = False
+        elif rsiD["database_name"] == referenceDatabaseName and rsiD["provenance_source"] in provSourceL:
+            try:
+                # self.__matchD represents uniprot_exdb.reference_match
+                if rId in self.__matchD and self.__matchD[rId]["matched"] in ["primary"]:
+                    # no change
+                    isMatchedRefDb = True
+                elif rId in self.__matchD and self.__matchD[rId]["matched"] in ["secondary"]:
+                    logger.debug("secondary %r matched len %d", self.__matchD[rId]["matched"], len(self.__matchD[rId]["matchedIds"]))
+                    if len(self.__matchD[rId]["matchedIds"]) == 1:
+                        for mId, mD in self.__matchD[rId]["matchedIds"].items():
+                            rsiD["database_accession"] = mId
+                            logger.debug("%s matched secondary %s -> %s", entityKey, rId, mId)
+                        isMatchedRefDb = True
+                    elif taxIdL and len(taxIdL) == 1:
+                        # -- simplest match case --
+                        numM = 0
+                        for mId, mD in self.__matchD[rId]["matchedIds"].items():
+                            if taxIdL[0] == mD["taxId"]:
+                                rsiD["database_accession"] = mId
+                                numM += 1
+                        if numM == 1:
+                            isMatchedRefDb = True
+                            logger.debug("%s matched secondary with taxId %r %s -> %s", entityKey, taxIdL[0], rId, rsiD["database_accession"])
+                    elif not taxIdL:
+                        logger.debug("%s no taxids with UniProt (%s) secondary mapping", entityKey, rId)
+                    else:
+                        logger.info("%s ambiguous mapping for a UniProt (%s) secondary mapping - taxIds %r", entityKey, rId, taxIdL)
+                #
+            except Exception:
+                pass
+
+        elif rsiD["provenance_source"] in provSourceL and rsiD["database_name"] in refDbList:
+            logger.debug("%s leaving reference accession for %s %s assigned by %r", entityKey, rId, rsiD["database_name"], provSourceL)
+            isMatchedRefDb = False
+            isMatchedAltDb = True
+        else:
+            logger.debug("%s leaving an unverified reference accession for %s %s assigned by %r", entityKey, rId, rsiD["database_name"], rsiD["provenance_source"])
+        #
+        logger.debug("%s isMatched %r isExcluded %r for accession %r", entityKey, isMatchedRefDb, isMatchedAltDb, rId)
+        #
+        return isMatchedRefDb, isMatchedAltDb, rsiD
+
+    def __reMapAlignments(self, entityKey, alignD, referenceDatabaseName, taxIdL, provSourceL, excludeReferenceDatabases=None):
+        """Internal method to re-map alignments for the input database and assignment source
+
+        Args:
+            alignD (dict): alignment object including accession and aligned regions
+            databaseName (str, optional): resource database name. Defaults to 'UniProt'.
+            provSourceL (list, optional): assignment provenance. Defaults to 'PDB'.
+
+        Returns:
+            bool, bool, list: flag for mapping success (refdb), flag for mapping success (altdb),
+                              and remapped (and unmapped) accessions in the input align list
+        """
+        isMatchedAltDb = False
+        isMatchedRefDb = False
+        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
+        refDbList = ["UniProt", "GenBank", "EMBL", "NDB", "NORINE", "PIR", "PRF", "RefSeq"]
+        provSourceL = provSourceL if provSourceL else []
+        rId = alignD["reference_database_accession"]
+        #
+        if alignD["reference_database_name"] in excludeReferenceDatabases:
+            isMatchedAltDb = False
+        elif alignD["reference_database_name"] == referenceDatabaseName and alignD["provenance_source"] in provSourceL:
+            try:
+                if rId in self.__matchD and self.__matchD[rId]["matched"] in ["primary"]:
+                    # no change
+                    isMatchedRefDb = True
+                elif rId in self.__matchD and self.__matchD[rId]["matched"] in ["secondary"]:
+                    if len(self.__matchD[rId]["matchedIds"]) == 1:
+                        for mId, mD in self.__matchD[rId]["matchedIds"].items():
+                            alignD["reference_database_accession"] = mId
+                        isMatchedRefDb = True
+                    elif taxIdL and len(taxIdL) == 1:
+                        # -- simplest match case --
+                        numM = 0
+                        for mId, mD in self.__matchD[rId]["matchedIds"].items():
+                            if taxIdL[0] == mD["taxId"]:
+                                alignD["reference_database_accession"] = mId
+                                numM += 1
+                        if numM == 1:
+                            isMatchedRefDb = True
+                    elif not taxIdL:
+                        logger.debug("%s no taxids with UniProt (%s) secondary mapping", entityKey, rId)
+                    else:
+                        logger.info("%s ambiguous mapping for a UniProt (%s) secondary mapping - taxIds %r", entityKey, rId, taxIdL)
+                #
+            except Exception:
+                pass
+        elif alignD["provenance_source"] in provSourceL and alignD["reference_database_name"] in refDbList:
+            logger.debug("%s leaving reference alignment for %s %s assigned by %r", entityKey, rId, alignD["reference_database_name"], provSourceL)
+            isMatchedRefDb = False
+            isMatchedAltDb = True
+        else:
+            logger.debug("%s leaving a reference alignment for %s %s assigned by %r", entityKey, rId, alignD["reference_database_name"], alignD["provenance_source"])
+        #
+        logger.debug("%s isMatched %r isExcluded %r for alignment %r", entityKey, isMatchedRefDb, isMatchedAltDb, rId)
+        return isMatchedRefDb, isMatchedAltDb, alignD, self.__hashAlignment(alignD)
+
+    def __hashAlignment(self, aD):
+        """
+        Example:
+
+            {'reference_database_name': 'UniProt', 'reference_database_accession': 'P62942', 'provenance_source': 'PDB',
+             'aligned_regions': [{'entity_beg_seq_id': 1, 'ref_beg_seq_id': 1, 'length': 107}]}]
+        """
+        hsh = None
+        hL = []
+        try:
+            hL.append(aD["reference_database_accession"])
+            for aR in aD["aligned_regions"]:
+                hL.append(aR["entity_beg_seq_id"])
+                hL.append(aR["ref_beg_seq_id"])
+                hL.append(aR["length"])
+            hsh = tuple(hL)
+        except Exception:
+            pass
+        return hsh
+
+    def __getSiftsAccessions(self, entityKey, authAsymIdL):
+        retL = []
+        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
+        for (_, dbAccession), _ in saoLD.items():
+            retL.append({"database_name": "UniProt", "database_accession": dbAccession, "provenance_source": "SIFTS"})
+        return retL
+
+    def __getSiftsAlignments(self, entityKey, authAsymIdL):
+        retL = []
+        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
+        for (_, dbAccession), saoL in saoLD.items():
+            dD = {"reference_database_name": "UniProt", "reference_database_accession": dbAccession, "provenance_source": "SIFTS", "aligned_regions": []}
+            for sao in saoL:
+                dD["aligned_regions"].append({"ref_beg_seq_id": sao.getDbSeqIdBeg(), "entity_beg_seq_id": sao.getEntitySeqIdBeg(), "length": sao.getEntityAlignLength()})
+            retL.append(dD)
+        return retL
+
+    def getReferenceAccessionAlignSummary(self):
+        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
+        numPrimary = 0
+        numSecondary = 0
+        numNone = 0
+        for _, mD in self.__matchD.items():
+            if mD["matched"] == "primary":
+                numPrimary += 1
+            elif mD["matched"] == "secondary":
+                numSecondary += 1
+            else:
+                numNone += 1
+        logger.debug("Matched primary: %d secondary: %d none %d", numPrimary, numSecondary, numNone)
+        return numPrimary, numSecondary, numNone