rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,598 @@
1
+ ##
2
+ # File: ReferenceSequenceAnnotationAdapter.py
3
+ # Date: 8-Oct-2019 jdw
4
+ #
5
+ # Selected utilities to update reference sequence annotations information
6
+ # in the core_entity collection.
7
+ #
8
+ # Updates:
9
+ #
10
+ ##
11
+ __docformat__ = "google en"
12
+ __author__ = "John Westbrook"
13
+ __email__ = "jwest@rcsb.rutgers.edu"
14
+ __license__ = "Apache 2.0"
15
+
16
+ import copy
17
+ import logging
18
+
19
+ from collections import defaultdict
20
+
21
+ from rcsb.exdb.utils.ObjectAdapterBase import ObjectAdapterBase
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class ReferenceSequenceAnnotationAdapter(ObjectAdapterBase):
27
+ """Selected utilities to update reference sequence annotations information
28
+ in the core_entity collection.
29
+ """
30
+
31
def __init__(self, referenceSequenceAnnotationProvider):
    """Cache the reference data handles exposed by the annotation provider.

    Args:
        referenceSequenceAnnotationProvider (obj): provider object; it is kept on the
            instance and queried once here for its SIFTS summary provider, EC provider,
            reference data, match info and GlyGen provider.
    """
    super(ReferenceSequenceAnnotationAdapter, self).__init__()
    provider = referenceSequenceAnnotationProvider
    # Keep the provider itself - the feature filter calls it for GO/InterPro/Pfam lookups.
    self.__rsaP = provider
    # Handles fetched once at construction time and reused by the filters.
    self.__ssP = provider.getSiftsSummaryProvider()
    self.__ecP = provider.getEcProvider()
    self.__refD = provider.getRefData()
    self.__matchD = provider.getMatchInfo()
    self.__ggP = provider.getGlyGenProvider()
41
+
42
def filter(self, obj, **kwargs):
    """Run the accession filter followed by the feature filter on one entity document.

    Args:
        obj (dict): entity document to update
        **kwargs: accepted for interface compatibility; not used here

    Returns:
        (bool, dict): True only if both filters reported success, and the document
        returned by the second filter.
    """
    # Hard-wired switch: when enabled the filters run on a deep copy and the
    # caller receives the original, untouched document.
    dryRun = False
    workObj = copy.deepcopy(obj) if dryRun else obj
    okAccessions, workObj = self.__filterAccessions(workObj)
    okFeatures, workObj = self.__filterFeatures(workObj)
    retObj = obj if dryRun else workObj
    return okAccessions and okFeatures, retObj
52
+
53
def __filterFeatures(self, obj):
    """Refresh UniProt-derived annotations, gene names and EC assignments on an entity document.

    Uses data from uniprot_exdb.reference_entry to populate the following data at the pdbx_core_polymer_entity collection:
        rcsb_polymer_entity_annotation:
            - GO (data from UniProt + lineage info from GO source)
            - InterPro (data from UniProt)
            - Pfam (is possible but not currently enabled)
            - GlyGen
        rcsb_enzyme_class_combined
            - EC (data from UniProt)
        rcsb_ec_lineage
            - EC (data from EC source)
        rcsb_gene_name
            - Gene/taxonomy data from UniProt

    Args:
        obj (dict): polymer entity document; it is updated in place.

    Returns:
        (bool, dict): status flag and the (same) document. The flag is False when
        `rcsb_id` or `rcsb_polymer_entity_container_identifiers` is missing, or when any
        exception is raised while processing (the exception is logged and the document
        may then be only partially updated).
    """
    ok = True
    try:
        if not ("rcsb_polymer_entity_container_identifiers" in obj and "rcsb_id" in obj):
            return False, obj
        entityKey = obj["rcsb_id"]
        eciD = obj["rcsb_polymer_entity_container_identifiers"]

        #
        logger.debug(" ------------- Running feature filter on %r --------------", entityKey)
        #
        # Optional sections of the document - each keeps its empty default when absent.
        rsDL = []
        soDL = []
        peaDL = []
        peObj = {}
        #
        try:
            rsDL = eciD["reference_sequence_identifiers"]
        except Exception:
            pass

        try:
            soDL = obj["rcsb_entity_source_organism"]
        except Exception:
            pass
        #
        try:
            peObj = obj["rcsb_polymer_entity"]
        except Exception:
            pass
        #
        try:
            peaDL = obj["rcsb_polymer_entity_annotation"]
        except Exception:
            pass
        #
        # rsD {'database_name': 'UniProt', 'database_accession': 'P06881', 'provenance_source': 'PDB'}
        unpIdS = set()
        for rsD in rsDL:
            if "database_name" in rsD and rsD["database_name"] == "UniProt" and "database_accession" in rsD:
                unpIdS.add(rsD["database_accession"])
        #
        unpGeneDL = []
        unpAnnDL = []
        glygenDL = []
        # upper-cased gene name -> gene name as first seen (UniProt names are registered first)
        geneLookupD = {}
        # occurrence counters used to keep only the first instance of a gene name / annotation
        geneFilterD = defaultdict(int)
        resourceFilterD = defaultdict(int)
        # Loop over all UniProt IDs from `rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession`
        for unpId in unpIdS:
            uD = self.__refD[unpId] if unpId in self.__refD else None
            # uD represents a document from uniprot_exdb.reference_entry
            if not uD:
                # This occurs when the UniProt IDs from:
                #   rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers.database_accession
                # are not in:
                #   uniprot_exdb.reference_entry
                logger.info("%s no reference data for unexpected UniProt accession %r", entityKey, unpId)
                continue
            if "gene" in uD and "taxonomy_id" in uD:
                taxId = int(uD["taxonomy_id"])
                logger.debug("%s : %r gene names %r", entityKey, unpId, uD["gene"])
                for tD in uD["gene"]:
                    geneFilterD[tD["name"]] += 1
                    if geneFilterD[tD["name"]] > 1:
                        continue
                    geneLookupD[tD["name"].upper()] = tD["name"]
                    unpGeneDL.append({"provenance_source": "UniProt", "value": tD["name"], "taxonomy_id": taxId})
            if "dbReferences" in uD:
                logger.debug("%s : %r references %d", entityKey, unpId, len(uD["dbReferences"]))
                for tD in uD["dbReferences"]:
                    # Skipping Pfam now
                    if "resource" in tD and "id_code" in tD and tD["resource"] in ["GO", "InterPro"]:
                        resourceFilterD[(tD["resource"], tD["id_code"])] += 1
                        if resourceFilterD[(tD["resource"], tD["id_code"])] > 1:
                            logger.debug("Skipping duplicate annotation %r %r", tD["resource"], tD["id_code"])
                            continue
                        if tD["resource"] in ["GO"]:
                            # GO terms are only kept when the id, its lineage and its name all resolve
                            if self.__rsaP.goIdExists(tD["id_code"]):
                                goLin = self.__rsaP.getGeneOntologyLineage([tD["id_code"]])
                                goName = self.__rsaP.getGeneOntologyName(tD["id_code"])
                                if goLin and goName:
                                    unpAnnDL.append(
                                        {
                                            "provenance_source": "UniProt",
                                            "annotation_id": tD["id_code"],
                                            "type": tD["resource"],
                                            "name": goName,
                                            "assignment_version": uD["version"],
                                            "annotation_lineage": goLin,
                                        }
                                    )
                        elif tD["resource"] in ["Pfam"]:
                            # Not reachable while Pfam is excluded by the resource test above
                            pfamName = self.__rsaP.getPfamName(tD["id_code"])
                            if pfamName:
                                unpAnnDL.append(
                                    {
                                        "provenance_source": "UniProt",
                                        "annotation_id": tD["id_code"],
                                        "name": pfamName,
                                        "type": tD["resource"],
                                        "assignment_version": uD["version"],
                                    }
                                )
                            else:
                                unpAnnDL.append({"provenance_source": "UniProt", "annotation_id": tD["id_code"], "type": tD["resource"], "assignment_version": uD["version"]})

                        elif tD["resource"] in ["InterPro"]:
                            interProName = self.__rsaP.getInterProName(tD["id_code"])
                            interProLinL = self.__rsaP.getInterProLineage(tD["id_code"])
                            if interProName and interProLinL:
                                unpAnnDL.append(
                                    {
                                        "provenance_source": "UniProt",
                                        "annotation_id": tD["id_code"],
                                        "name": interProName,
                                        "type": tD["resource"],
                                        "assignment_version": uD["version"],
                                        "annotation_lineage": interProLinL,
                                    }
                                )
                            else:
                                # name or lineage unavailable - keep the bare identifier
                                unpAnnDL.append({"provenance_source": "UniProt", "annotation_id": tD["id_code"], "type": tD["resource"], "assignment_version": uD["version"]})

                        else:
                            unpAnnDL.append({"provenance_source": "UniProt", "annotation_id": tD["id_code"], "type": tD["resource"], "assignment_version": uD["version"]})
            #
            if self.__ggP and self.__ggP.hasGlycoprotein(unpId):
                logger.debug("Mapping glycoprotein for %r", unpId)
                glygenDL.append(
                    {
                        "provenance_source": "PDB",  # This should be GlyGen
                        "annotation_id": unpId,
                        "name": "Glycoprotein",
                        "type": "GlyGen",
                        "assignment_version": "1.0",
                    }
                )

        #
        # raD {'resource_identifier': 'PF00503', 'provenance_source': 'SIFTS', 'resource_name': 'Pfam'}
        # "provenance_source": <"PDB"|"RCSB"|"SIFTS"|"UniProt"> "GO", "InterPro", "Pfam"
        #
        # ------------
        # Filter existing annotations identifiers
        # (remove any pre-existing UniProt and GlyGen annotations, so that most recent ones gathered above can be added)
        if peaDL:
            qL = []
            for peaD in peaDL:
                if (peaD["provenance_source"] == "UniProt") or (peaD["type"] == "GlyGen"):
                    continue
                qL.append(peaD)
            # Put back the base object list -
            peaDL = qL

        for unpAnnD in unpAnnDL:
            peaDL.append(unpAnnD)
        #
        if glygenDL:
            # logger.debug("%r glygenDL (%d) %r", entityKey, len(glygenDL), glygenDL)
            peaDL.extend(glygenDL)

        if peaDL:
            obj["rcsb_polymer_entity_annotation"] = peaDL
            # logger.info("annotation object is %r", obj["rcsb_polymer_entity_annotation"])
        elif obj.get("rcsb_polymer_entity_annotation"):
            # Every pre-existing annotation was a UniProt/GlyGen one and nothing fresh
            # replaced it: drop the section instead of leaving the stale records behind.
            del obj["rcsb_polymer_entity_annotation"]
        #
        # --------------  Add gene names -----------------
        #
        numSource = len(soDL)  # number of items originally present in rcsb_entity_source_organism
        logger.debug("%s unpGeneDL %r", entityKey, unpGeneDL)
        for ii, soD in enumerate(soDL):
            if "ncbi_taxonomy_id" not in soD:
                continue
            logger.debug("soD (%d) taxonomy %r", ii, soD["ncbi_taxonomy_id"])
            # Filter any existing annotations
            if "rcsb_gene_name" in soD:
                qL = []
                for qD in soD["rcsb_gene_name"]:
                    if "value" not in qD:
                        continue
                    if qD["provenance_source"] != "UniProt":
                        # standardize case consistent with UniProt
                        if qD["value"].upper() in geneLookupD:
                            qD["value"] = geneLookupD[qD["value"].upper()]
                        else:
                            geneLookupD[qD["value"].upper()] = qD["value"]
                        qL.append(qD)
                soD["rcsb_gene_name"] = qL
            taxId = soD["ncbi_taxonomy_id"]
            for unpGeneD in unpGeneDL:
                # Only for matching taxonomies
                if taxId == unpGeneD["taxonomy_id"]:
                    # skip cases with primary annotations and multiple sources
                    # NOTE(review): the key test is re-evaluated after each append, so with
                    # several sources at most one UniProt gene name is added per organism - confirm intent.
                    if "rcsb_gene_name" in soD and numSource > 1:
                        logger.debug("%s skipping special chimeric case", entityKey)
                        continue
                    soD.setdefault("rcsb_gene_name", []).append({"provenance_source": unpGeneD["provenance_source"], "value": unpGeneD["value"]})
        #
        # -------------- Remapping/extending EC assignments. --------------
        if peObj:
            linL = []
            enzD = {}
            if "rcsb_enzyme_class_combined" in peObj:
                logger.debug("%s PDB EC assignment %r", entityKey, peObj["rcsb_enzyme_class_combined"])
                enzD = {tD["ec"]: tD["provenance_source"] for tD in peObj["rcsb_enzyme_class_combined"]}
                logger.debug("%s PDB EC assignment mapped %r", entityKey, enzD)
            #
            unpEcD = {}
            for unpId in unpIdS:
                uD = self.__refD[unpId] if unpId in self.__refD else None
                if not uD:
                    logger.info("%s no data for unexpected UniProt accession %r", entityKey, unpId)
                    continue
                if "dbReferences" in uD:
                    logger.debug("%s : %r references %d", entityKey, unpId, len(uD["dbReferences"]))
                    for tD in uD["dbReferences"]:
                        if "resource" in tD and "id_code" in tD and tD["resource"] in ["EC"]:
                            logger.debug("%s UniProt accession %r EC %r", entityKey, unpId, tD)
                            tEc = self.__ecP.normalize(tD["id_code"])
                            if self.__ecP.exists(tEc):
                                unpEcD[tEc] = "UniProt"
            # integrate the UniProt data and update the object -
            if unpEcD:
                logger.debug("%s UniProt EC assignment %r", entityKey, unpEcD)
                for ecId in unpEcD:
                    # an existing assignment keeps its original provenance
                    if ecId in enzD:
                        continue
                    enzD[ecId] = unpEcD[ecId]
                for ecId in enzD:
                    tL = self.__ecP.getLineage(ecId)
                    if tL:
                        linL.extend(tL)
                peObj["rcsb_enzyme_class_combined"] = [{"ec": k, "provenance_source": v, "depth": k.count(".") + 1} for k, v in enzD.items()]
                peObj["rcsb_ec_lineage"] = [{"depth": tup[0], "id": tup[1], "name": tup[2]} for tup in linL]
        #
    except Exception as e:
        ok = False
        logger.exception("Feature filter adapter failing with error with %s", str(e))
    #
    return ok, obj
306
+
307
def __filterAccessions(self, obj):
    """Re-check the reference sequence accessions and alignments of an entity document.

    Each record in `rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers`
    and in `rcsb_polymer_entity_align` is passed through the re-mapping helpers. Records
    that match (target reference database or a supported alternative database) are kept,
    and the SIFTS mapping is added once per entity when a record does not match the target
    reference database. The resulting lists replace the originals in the document; a
    section that ends up empty is deleted.

    Args:
        obj (dict): polymer entity document; it is updated in place.

    Returns:
        (bool, dict): status flag and the (same) document. The flag is False when an
        exception is raised while processing (the exception is logged).
    """
    ok = True
    try:
        entityKey = obj["rcsb_id"]
        logger.debug(" ------------- Running accession filter on %r --------------", entityKey)
        #
        # Only UniProt assignments made by PDB are checked for currency.
        referenceDatabaseName = "UniProt"
        provSourceL = ["PDB"]
        alignDL = None
        ersDL = None
        authAsymIdL = None
        taxIdL = None
        try:
            ersDL = obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]
            # NOTE(review): not reached when the lookup above fails, so authAsymIdL stays None
            # and the alignment update further down is skipped as well - confirm intent.
            authAsymIdL = obj["rcsb_polymer_entity_container_identifiers"]["auth_asym_ids"]
        except Exception:
            logger.debug("%s no reference assignment protein sequence.", entityKey)

        #
        try:
            # taxIdL stays None if any source organism record lacks a taxonomy id
            taxIdL = [oD["ncbi_taxonomy_id"] for oD in obj["rcsb_entity_source_organism"]]
            taxIdL = list(set(taxIdL))
            logger.debug("%s taxonomy  (%d) %r", entityKey, len(taxIdL), taxIdL)
        except Exception as e:
            logger.debug("Failing with %s", str(e))
        #
        if ersDL:
            retDL = []
            # Holds the accessions already kept, plus entityKey once the SIFTS mapping was applied.
            dupD = {}
            # Loop over all identifier docs in `rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers`
            for ersD in ersDL:
                # Check currency of reference assignments made by entities in provSourceL (e.g. in this case only PDB)
                isMatchedRefDb, isMatchedAltDb, updErsD = self.__reMapAccessions(entityKey, ersD, referenceDatabaseName, taxIdL, provSourceL)
                # Possible results:
                #  '4XVA_1' isMatchedRefDb False isMatchedAltDb False updErsD {'database_name': 'UniProt', 'database_accession': 'P00431', 'provenance_source': 'SIFTS'}
                #    - Most are this
                #    - '4XBI_1' another example updErsD {'database_name': 'UniProt', 'database_accession': 'Q8IB03', 'provenance_source': 'SIFTS'}
                #  '8IVK_1' isMatchedRefDb True isMatchedAltDb False updErsD {'database_name': 'UniProt', 'database_accession': 'C3SKF0', 'provenance_source': 'PDB'}
                #    - Not super common
                #  '1W3M_1' isMatchedRefDb False isMatchedAltDb True updErsD {'database_name': 'NORINE', 'database_accession': 'NOR00763', 'provenance_source': 'PDB'}
                #    - Not super common
                logger.debug("%r isMatchedRefDb %r isMatchedAltDb %r updErsD %r", entityKey, isMatchedRefDb, isMatchedAltDb, updErsD)

                # Keep a matched record once per accession.
                if (isMatchedRefDb or isMatchedAltDb) and updErsD["database_accession"] not in dupD:
                    dupD[updErsD["database_accession"]] = True
                    retDL.append(updErsD)
                #
                # Re-apply the latest SIFTS mapping if available and we did not match the target reference database ...
                # (entityKey in dupD guards against adding the SIFTS accessions more than once)
                if not isMatchedRefDb and entityKey not in dupD:
                    dupD[entityKey] = True
                    siftsAccDL = self.__getSiftsAccessions(entityKey, authAsymIdL)
                    for siftsAccD in siftsAccDL:
                        logger.debug("Using/adding SIFTS accession mapping for %s", entityKey)
                        retDL.append(siftsAccD)
                    if not siftsAccDL:
                        logger.debug("No alternative SIFTS accession mapping for %s", entityKey)
                        # No alternative SIFTS accession mapping for 1W3M_1

            if retDL:
                logger.debug("%s retDL %r", entityKey, retDL)
                obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"] = retDL
            else:
                # nothing matched and SIFTS had nothing to offer - remove the section
                del obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]
                logger.debug("Incomplete reference sequence mapping for %s", entityKey)
        #
        # ------------- update alignment details -------------
        try:
            alignDL = obj["rcsb_polymer_entity_align"]
        except Exception:
            pass
        if alignDL and authAsymIdL:
            retDL = []
            # Holds the alignment hashes already kept, plus entityKey once the SIFTS alignments were applied.
            dupD = {}
            for alignD in alignDL:
                isMatchedRefDb, isMatchedAltDb, updAlignD, alignHash = self.__reMapAlignments(entityKey, alignD, referenceDatabaseName, taxIdL, provSourceL)
                #
                if (isMatchedRefDb or isMatchedAltDb) and alignHash not in dupD:
                    # an alignment that could not be hashed (None) is kept but not tracked
                    if alignHash:
                        dupD[alignHash] = True
                    retDL.append(updAlignD)
                #
                # logger.debug("%s retDL %r", entityKey, retDL)
                #
                if not isMatchedRefDb and entityKey not in dupD:
                    dupD[entityKey] = True
                    siftsAlignDL = self.__getSiftsAlignments(entityKey, authAsymIdL)
                    for siftsAlignD in siftsAlignDL:
                        logger.debug("Using/adding SIFTS mapping for the alignment of %s", entityKey)
                        retDL.append(siftsAlignD)
                    if not siftsAlignDL:
                        logger.debug("No alternative SIFTS alignment for %s", entityKey)
            #
            if retDL:
                obj["rcsb_polymer_entity_align"] = retDL
            else:
                del obj["rcsb_polymer_entity_align"]
                logger.debug("Reference sequence alignment NOT updated for %s", entityKey)
    except Exception as e:
        ok = False
        logger.exception("Filter adapter failing with error with %s", str(e))
    #
    return ok, obj
409
+
410
def __reMapAccessions(self, entityKey, rsiD, referenceDatabaseName, taxIdL, provSourceL, excludeReferenceDatabases=None):
    """Internal method to check and, if needed, re-map one reference accession record.

    Args:
        entityKey (str): entity identifier (used in log messages only)
        rsiD (dict): accession record with keys `database_name`, `database_accession`
            and `provenance_source`; `database_accession` is replaced in place only when
            a secondary accession resolves to exactly one current accession
        referenceDatabaseName (str): target reference database name (e.g. 'UniProt')
        taxIdL (list): taxonomy identifiers of the entity, used to resolve a secondary
            accession with several candidates (may be None or empty)
        provSourceL (list): assignment provenance sources to check (e.g. ['PDB']);
            None is treated as an empty list
        excludeReferenceDatabases (list, optional): database names that are never matched.
            Defaults to ['PDB'].

    Returns:
        bool, bool, dict: flag for a match in the target reference database, flag for a
            supported alternative reference database, and the (possibly re-mapped) input record

    Example of match data for a secondary accession:
        "P14118": {
            "searchId": "P14118",
            "matchedIds": {
                "P84099": {
                    "taxId": 10090
                },
                "P84100": {
                    "taxId": 10116
                },
                "P84098": {
                    "taxId": 9606
                }
            },
            "matched": "secondary"
        },
    """
    isMatchedRefDb = False
    isMatchedAltDb = False
    excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
    # consistent with the alignment re-mapping: a missing provenance list matches nothing
    provSourceL = provSourceL if provSourceL else []
    refDbList = ["UniProt", "GenBank", "EMBL", "NDB", "NORINE", "PIR", "PRF", "RefSeq"]
    #
    rId = rsiD["database_accession"]
    dbName = rsiD["database_name"]
    provSource = rsiD["provenance_source"]
    logger.debug("%s rId %r db %r prov %r", entityKey, rId, dbName, provSource)
    #
    if dbName in excludeReferenceDatabases:
        # excluded databases are never reported as matched
        isMatchedAltDb = False
    elif dbName == referenceDatabaseName and provSource in provSourceL:
        try:
            # self.__matchD represents uniprot_exdb.reference_match
            matchInfoD = self.__matchD[rId] if rId in self.__matchD else None
            matchType = matchInfoD["matched"] if matchInfoD else None
            if matchType == "primary":
                # no change
                isMatchedRefDb = True
            elif matchType == "secondary":
                matchedIdD = matchInfoD["matchedIds"]
                logger.debug("secondary %r matched len %d", matchType, len(matchedIdD))
                if len(matchedIdD) == 1:
                    mId = next(iter(matchedIdD))
                    rsiD["database_accession"] = mId
                    logger.debug("%s matched secondary %s -> %s", entityKey, rId, mId)
                    isMatchedRefDb = True
                elif taxIdL and len(taxIdL) == 1:
                    # -- simplest match case --
                    # Collect the candidates first so the record is only touched
                    # when the taxonomy selects exactly one of them.
                    candidateL = [mId for mId, mD in matchedIdD.items() if taxIdL[0] == mD["taxId"]]
                    if len(candidateL) == 1:
                        rsiD["database_accession"] = candidateL[0]
                        isMatchedRefDb = True
                        logger.debug("%s matched secondary with taxId %r %s -> %s", entityKey, taxIdL[0], rId, rsiD["database_accession"])
                    else:
                        logger.debug("%s taxId %r selects %d candidates for secondary accession %s", entityKey, taxIdL[0], len(candidateL), rId)
                elif not taxIdL:
                    logger.debug("%s no taxids with UniProt (%s) secondary mapping", entityKey, rId)
                else:
                    logger.info("%s ambiguous mapping for a UniProt (%s) secondary mapping - taxIds %r", entityKey, rId, taxIdL)
            #
        except Exception as e:
            # best effort: malformed match data leaves the record unmatched
            logger.warning("%s unable to evaluate match data for accession %r: %s", entityKey, rId, str(e))

    elif provSource in provSourceL and dbName in refDbList:
        logger.debug("%s leaving reference accession for %s %s assigned by %r", entityKey, rId, dbName, provSourceL)
        isMatchedRefDb = False
        isMatchedAltDb = True
    else:
        logger.debug("%s leaving an unverified reference accession for %s %s assigned by %r", entityKey, rId, dbName, provSource)
    #
    logger.debug("%s isMatchedRefDb %r isMatchedAltDb %r for accession %r", entityKey, isMatchedRefDb, isMatchedAltDb, rId)
    #
    return isMatchedRefDb, isMatchedAltDb, rsiD
490
+
491
def __reMapAlignments(self, entityKey, alignD, referenceDatabaseName, taxIdL, provSourceL, excludeReferenceDatabases=None):
    """Internal method to check and, if needed, re-map the accession of one alignment record.

    Args:
        entityKey (str): entity identifier (used in log messages only)
        alignD (dict): alignment object including accession and aligned regions;
            `reference_database_accession` is replaced in place only when a secondary
            accession resolves to exactly one current accession
        referenceDatabaseName (str): target reference database name (e.g. 'UniProt')
        taxIdL (list): taxonomy identifiers of the entity, used to resolve a secondary
            accession with several candidates (may be None or empty)
        provSourceL (list): assignment provenance sources to check (e.g. ['PDB']);
            None is treated as an empty list
        excludeReferenceDatabases (list, optional): database names that are never matched.
            Defaults to ['PDB'].

    Returns:
        bool, bool, dict, tuple: flag for mapping success (refdb), flag for mapping success (altdb),
            the (possibly re-mapped) input alignment, and the hash of that alignment
            (None when it cannot be hashed)
    """
    isMatchedAltDb = False
    isMatchedRefDb = False
    excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
    refDbList = ["UniProt", "GenBank", "EMBL", "NDB", "NORINE", "PIR", "PRF", "RefSeq"]
    provSourceL = provSourceL if provSourceL else []
    rId = alignD["reference_database_accession"]
    dbName = alignD["reference_database_name"]
    #
    # `provenance_source` is read lazily: it is not required for excluded databases
    if dbName in excludeReferenceDatabases:
        # excluded databases are never reported as matched
        isMatchedAltDb = False
    elif dbName == referenceDatabaseName and alignD["provenance_source"] in provSourceL:
        try:
            # self.__matchD holds the match status for each searched accession
            matchInfoD = self.__matchD[rId] if rId in self.__matchD else None
            matchType = matchInfoD["matched"] if matchInfoD else None
            if matchType == "primary":
                # no change
                isMatchedRefDb = True
            elif matchType == "secondary":
                matchedIdD = matchInfoD["matchedIds"]
                if len(matchedIdD) == 1:
                    alignD["reference_database_accession"] = next(iter(matchedIdD))
                    isMatchedRefDb = True
                elif taxIdL and len(taxIdL) == 1:
                    # -- simplest match case --
                    # Collect the candidates first so the record is only touched
                    # when the taxonomy selects exactly one of them.
                    candidateL = [mId for mId, mD in matchedIdD.items() if taxIdL[0] == mD["taxId"]]
                    if len(candidateL) == 1:
                        alignD["reference_database_accession"] = candidateL[0]
                        isMatchedRefDb = True
                    else:
                        logger.debug("%s taxId %r selects %d candidates for secondary accession %s", entityKey, taxIdL[0], len(candidateL), rId)
                elif not taxIdL:
                    logger.debug("%s no taxids with UniProt (%s) secondary mapping", entityKey, rId)
                else:
                    logger.info("%s ambiguous mapping for a UniProt (%s) secondary mapping - taxIds %r", entityKey, rId, taxIdL)
            #
        except Exception as e:
            # best effort: malformed match data leaves the alignment unmatched
            logger.warning("%s unable to evaluate match data for alignment accession %r: %s", entityKey, rId, str(e))
    elif alignD["provenance_source"] in provSourceL and dbName in refDbList:
        logger.debug("%s leaving reference alignment for %s %s assigned by %r", entityKey, rId, dbName, provSourceL)
        isMatchedRefDb = False
        isMatchedAltDb = True
    else:
        logger.debug("%s leaving a reference alignment for %s %s assigned by %r", entityKey, rId, dbName, alignD["provenance_source"])
    #
    logger.debug("%s isMatchedRefDb %r isMatchedAltDb %r for alignment %r", entityKey, isMatchedRefDb, isMatchedAltDb, rId)
    return isMatchedRefDb, isMatchedAltDb, alignD, self.__hashAlignment(alignD)
547
+
548
def __hashAlignment(self, aD):
    """Build a hashable key for an alignment record.

    The key is a tuple holding the reference accession followed by the
    (entity_beg_seq_id, ref_beg_seq_id, length) values of every aligned region, in order.

    Example input:

        {'reference_database_name': 'UniProt', 'reference_database_accession': 'P62942', 'provenance_source': 'PDB',
        'aligned_regions': [{'entity_beg_seq_id': 1, 'ref_beg_seq_id': 1, 'length': 107}]}

    Args:
        aD (dict): alignment record

    Returns:
        tuple: the key, or None if the record cannot be read (e.g. a missing key)
    """
    try:
        keyParts = [aD["reference_database_accession"]]
        for region in aD["aligned_regions"]:
            keyParts.extend((region["entity_beg_seq_id"], region["ref_beg_seq_id"], region["length"]))
        return tuple(keyParts)
    except Exception:
        # incomplete records are reported as unhashable rather than failing the caller
        return None
567
+
568
def __getSiftsAccessions(self, entityKey, authAsymIdL):
    """Return UniProt accession records for the longest SIFTS alignments of an entity.

    Args:
        entityKey (str): entity identifier; its first four characters are used as the entry id
        authAsymIdL (list): author chain identifiers passed to the SIFTS summary provider

    Returns:
        list: one record per alignment key with `database_name` 'UniProt',
        the accession, and `provenance_source` 'SIFTS'
    """
    longestD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
    # alignment keys are 2-tuples whose second member is the accession
    return [{"database_name": "UniProt", "database_accession": accession, "provenance_source": "SIFTS"} for (_, accession), _ in longestD.items()]
574
+
575
def __getSiftsAlignments(self, entityKey, authAsymIdL):
    """Return UniProt alignment records for the longest SIFTS alignments of an entity.

    Args:
        entityKey (str): entity identifier; its first four characters are used as the entry id
        authAsymIdL (list): author chain identifiers passed to the SIFTS summary provider

    Returns:
        list: one record per alignment key with `reference_database_name` 'UniProt',
        the accession, `provenance_source` 'SIFTS' and the list of aligned regions
    """
    longestD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
    siftsAlignL = []
    # alignment keys are 2-tuples whose second member is the accession
    for (_, accession), segmentL in longestD.items():
        regionL = [
            {"ref_beg_seq_id": segment.getDbSeqIdBeg(), "entity_beg_seq_id": segment.getEntitySeqIdBeg(), "length": segment.getEntityAlignLength()}
            for segment in segmentL
        ]
        siftsAlignL.append({"reference_database_name": "UniProt", "reference_database_accession": accession, "provenance_source": "SIFTS", "aligned_regions": regionL})
    return siftsAlignL
584
+
585
def getReferenceAccessionAlignSummary(self):
    """Summarize the alignment of PDB accession assignments with the current reference sequence database.

    Returns:
        (int, int, int): number of match records whose `matched` value is 'primary',
        number that are 'secondary', and number with any other value
    """
    statusL = [matchInfoD["matched"] for matchInfoD in self.__matchD.values()]
    numPrimary = sum(1 for status in statusL if status == "primary")
    numSecondary = sum(1 for status in statusL if status == "secondary")
    # everything that is neither primary nor secondary
    numNone = len(statusL) - numPrimary - numSecondary
    logger.debug("Matched primary:  %d secondary: %d none %d", numPrimary, numSecondary, numNone)
    return numPrimary, numSecondary, numNone