rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,534 @@
1
+ ##
2
+ # File: ReferenceSequenceAssignmentAdapter.py
3
+ # Date: 8-Oct-2019 jdw
4
+ #
5
+ # Selected utilities to update reference sequence assignments information
6
+ # in the core_entity collection.
7
+ #
8
+ # Updates:
9
+ #
10
+ ##
11
+ __docformat__ = "google en"
12
+ __author__ = "John Westbrook"
13
+ __email__ = "jwest@rcsb.rutgers.edu"
14
+ __license__ = "Apache 2.0"
15
+
16
+ import copy
17
+ import logging
18
+
19
+ from collections import defaultdict
20
+
21
+ from rcsb.exdb.utils.ObjectAdapterBase import ObjectAdapterBase
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class ReferenceSequenceAssignmentAdapter(ObjectAdapterBase):
27
+ """Selected utilities to update reference sequence assignments information
28
+ in the core_entity collection.
29
+
30
+ "pdbx_ec" : "5.2.1.8",
31
+ "rcsb_ec_lineage" : [
32
+ {
33
+ "id" : "5",
34
+ "depth" : NumberInt(1),
35
+ "name" : "Isomerases"
36
+ },
37
+ {
38
+ "id" : "5.2",
39
+ "depth" : NumberInt(2),
40
+ "name" : "cis-trans-Isomerases"
41
+ },
42
+ {
43
+ "id" : "5.2.1",
44
+ "depth" : NumberInt(3),
45
+ "name" : "cis-trans Isomerases (only sub-subclass identified to date)"
46
+ },
47
+ {
48
+ "id" : "5.2.1.8",
49
+ "depth" : NumberInt(4),
50
+ "name" : "peptidylprolyl isomerase"
51
+ }
52
+ ],
53
+ """
54
+
55
+ def __init__(self, refSeqAssignProvider):
56
+ super(ReferenceSequenceAssignmentAdapter, self).__init__()
57
+ #
58
+ self.__rsaP = refSeqAssignProvider
59
+ self.__ssP = self.__rsaP.getSiftsSummaryProvider()
60
+ self.__ecP = self.__rsaP.getEcProvider()
61
+ self.__refD = self.__rsaP.getRefData()
62
+ self.__matchD = self.__rsaP.getMatchInfo()
63
+ #
64
+
65
+ def filter(self, obj, **kwargs):
66
+ isTestMode = False
67
+ if isTestMode:
68
+ ok1, tObj = self.__filterAccessions(copy.deepcopy(obj))
69
+ ok2, tObj = self.__filterFeatures(tObj)
70
+ return ok1 and ok2, obj
71
+ else:
72
+ ok1, obj = self.__filterAccessions(obj)
73
+ ok2, obj = self.__filterFeatures(obj)
74
+ return ok1 and ok2, obj
75
+
76
+ def __filterFeatures(self, obj):
77
+ ok = True
78
+ try:
79
+ if not ("rcsb_polymer_entity_container_identifiers" in obj and "rcsb_id" in obj):
80
+ return False, obj
81
+ entityKey = obj["rcsb_id"]
82
+ eciD = obj["rcsb_polymer_entity_container_identifiers"]
83
+
84
+ #
85
+ logger.debug(" ------------- Running feature filter on %r --------------", entityKey)
86
+ #
87
+ rsDL = []
88
+ soDL = []
89
+ peaDL = []
90
+ peObj = {}
91
+ #
92
+ try:
93
+ rsDL = eciD["reference_sequence_identifiers"]
94
+ except Exception:
95
+ pass
96
+
97
+ try:
98
+ soDL = obj["rcsb_entity_source_organism"]
99
+ except Exception:
100
+ pass
101
+ #
102
+ try:
103
+ peObj = obj["rcsb_polymer_entity"]
104
+ except Exception:
105
+ pass
106
+ #
107
+ try:
108
+ peaDL = obj["rcsb_polymer_entity_annotation"]
109
+ except Exception:
110
+ pass
111
+ #
112
+ # rsD {'database_name': 'UniProt', 'database_accession': 'P06881', 'provenance_source': 'PDB'}
113
+ unpIdS = set()
114
+ for rsD in rsDL:
115
+ if "database_name" in rsD and rsD["database_name"] == "UniProt" and "database_accession" in rsD:
116
+ unpIdS.add(rsD["database_accession"])
117
+ #
118
+ unpGeneDL = []
119
+ unpAnnDL = []
120
+ geneLookupD = {}
121
+ geneFilterD = defaultdict(int)
122
+ resourceFilterD = defaultdict(int)
123
+ for unpId in unpIdS:
124
+ uD = self.__refD[unpId] if unpId in self.__refD else None
125
+ if not uD:
126
+ logger.info("%s no reference data for unexpected UniProt accession %r", entityKey, unpId)
127
+ continue
128
+ if "gene" in uD and "taxonomy_id" in uD:
129
+ taxId = int(uD["taxonomy_id"])
130
+ logger.debug("%s : %r gene names %r", entityKey, unpId, uD["gene"])
131
+ for tD in uD["gene"]:
132
+ geneFilterD[tD["name"]] += 1
133
+ if geneFilterD[tD["name"]] > 1:
134
+ continue
135
+ geneLookupD[tD["name"].upper()] = tD["name"]
136
+ unpGeneDL.append({"provenance_source": "UniProt", "value": tD["name"], "taxonomy_id": taxId})
137
+ if "dbReferences" in uD:
138
+ logger.debug("%s : %r references %d", entityKey, unpId, len(uD["dbReferences"]))
139
+ for tD in uD["dbReferences"]:
140
+ if "resource" in tD and "id_code" in tD and tD["resource"] in ["GO", "Pfam", "InterPro"]:
141
+ resourceFilterD[(tD["resource"], tD["id_code"])] += 1
142
+ if resourceFilterD[(tD["resource"], tD["id_code"])] > 1:
143
+ logger.debug("Skipping duplicate annotation %r %r", tD["resource"], tD["id_code"])
144
+ continue
145
+ if tD["resource"] in ["GO"]:
146
+ if self.__rsaP.goIdExists(tD["id_code"]):
147
+ goLin = self.__rsaP.getGeneOntologyLineage([tD["id_code"]])
148
+ if goLin:
149
+ unpAnnDL.append(
150
+ {
151
+ "provenance_source": "UniProt",
152
+ "annotation_id": tD["id_code"],
153
+ "type": tD["resource"],
154
+ "assignment_version": uD["version"],
155
+ "annotation_lineage": goLin,
156
+ }
157
+ )
158
+ else:
159
+ unpAnnDL.append({"provenance_source": "UniProt", "annotation_id": tD["id_code"], "type": tD["resource"], "assignment_version": uD["version"]})
160
+
161
+ #
162
+ # raD {'resource_identifier': 'PF00503', 'provenance_source': 'SIFTS', 'resource_name': 'Pfam'}
163
+ # "provenance_source": <"PDB"|"RCSB"|"SIFTS"|"UniProt"> "GO", "InterPro", "Pfam"
164
+ #
165
+ # ------------
166
+ # Filter existing annotations identifiers
167
+ if peaDL:
168
+ qL = []
169
+ for peaD in peaDL:
170
+ if peaD["provenance_source"] != "UniProt":
171
+ qL.append(peaD)
172
+ # Put back the base object list -
173
+ peaDL = qL
174
+
175
+ for unpAnnD in unpAnnDL:
176
+ peaDL.append(unpAnnD)
177
+ #
178
+ if peaDL:
179
+ obj["rcsb_polymer_entity_annotation"] = peaDL
180
+ # logger.debug("annotation object is %r", obj["rcsb_polymer_entity_annotation"])
181
+ #
182
+ # -------------- Add gene names -----------------
183
+ #
184
+ numSource = len(soDL)
185
+ logger.debug("%s unpGeneDL %r", entityKey, unpGeneDL)
186
+ for ii, soD in enumerate(soDL):
187
+ if "ncbi_taxonomy_id" not in soD:
188
+ continue
189
+ logger.debug("soD (%d) taxonomy %r", ii, soD["ncbi_taxonomy_id"])
190
+ # Filter any existing annotations
191
+ if "rcsb_gene_name" in soD:
192
+ qL = []
193
+ for qD in soD["rcsb_gene_name"]:
194
+ if qD["provenance_source"] != "UniProt":
195
+ # standardize case consistent with UniProt
196
+ if qD["value"].upper() in geneLookupD:
197
+ qD["value"] = geneLookupD[qD["value"].upper()]
198
+ else:
199
+ geneLookupD[qD["value"].upper()] = qD["value"]
200
+ qL.append(qD)
201
+ soD["rcsb_gene_name"] = qL
202
+ taxId = soD["ncbi_taxonomy_id"]
203
+ for unpGeneD in unpGeneDL:
204
+ # Only for matching taxonomies
205
+ if taxId == unpGeneD["taxonomy_id"]:
206
+ # skip cases with primary annotations and multiple sources
207
+ if "rcsb_gene_name" in soD and numSource > 1:
208
+ logger.debug("%s skipping special chimeric case", entityKey)
209
+ continue
210
+ soD.setdefault("rcsb_gene_name", []).append({"provenance_source": unpGeneD["provenance_source"], "value": unpGeneD["value"]})
211
+ #
212
+ # -------------- Remapping/extending EC assignments. --------------
213
+ if peObj:
214
+ linL = []
215
+ enzD = {}
216
+ if "rcsb_enzyme_class_combined" in peObj:
217
+ logger.debug("%s PDB EC assignment %r", entityKey, peObj["rcsb_enzyme_class_combined"])
218
+ enzD = {tD["ec"]: tD["provenance_source"] for tD in peObj["rcsb_enzyme_class_combined"]}
219
+ logger.debug("%s PDB EC assignment mapped %r", entityKey, enzD)
220
+ #
221
+ unpEcD = {}
222
+ for unpId in unpIdS:
223
+ uD = self.__refD[unpId] if unpId in self.__refD else None
224
+ if not uD:
225
+ logger.info("%s no data for unexpected UniProt accession %r", entityKey, unpId)
226
+ continue
227
+ if "dbReferences" in uD:
228
+ logger.debug("%s : %r references %d", entityKey, unpId, len(uD["dbReferences"]))
229
+ for tD in uD["dbReferences"]:
230
+ if "resource" in tD and "id_code" in tD and tD["resource"] in ["EC"]:
231
+ logger.debug("%s UniProt accession %r EC %r", entityKey, unpId, tD)
232
+ tEc = self.__ecP.normalize(tD["id_code"])
233
+ if self.__ecP.exists(tEc):
234
+ unpEcD[tEc] = "UniProt"
235
+ # integrate the UniProt data and update the object -
236
+ if unpEcD:
237
+ logger.debug("%s UniProt EC assignment %r", entityKey, unpEcD)
238
+ for ecId in unpEcD:
239
+ if ecId in enzD:
240
+ continue
241
+ enzD[ecId] = unpEcD[ecId]
242
+ for ecId in enzD:
243
+ tL = self.__ecP.getLineage(ecId)
244
+ if tL:
245
+ linL.extend(tL)
246
+ peObj["rcsb_enzyme_class_combined"] = [{"ec": k, "provenance_source": v, "depth": k.count(".") + 1} for k, v in enzD.items()]
247
+ peObj["rcsb_ec_lineage"] = [{"depth": tup[0], "id": tup[1], "name": tup[2]} for tup in linL]
248
+ #
249
+ except Exception as e:
250
+ ok = False
251
+ logger.exception("Feature filter adapter failing with error with %s", str(e))
252
+ #
253
+ return ok, obj
254
+
255
+ def __filterAccessions(self, obj):
256
+ ok = True
257
+ try:
258
+ entityKey = obj["rcsb_id"]
259
+ logger.debug(" ------------- Running accession filter on %r --------------", entityKey)
260
+ #
261
+ referenceDatabaseName = "UniProt"
262
+ provSourceL = ["PDB"]
263
+ alignDL = None
264
+ ersDL = None
265
+ authAsymIdL = None
266
+ taxIdL = None
267
+ try:
268
+ ersDL = obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]
269
+ authAsymIdL = obj["rcsb_polymer_entity_container_identifiers"]["auth_asym_ids"]
270
+ except Exception:
271
+ logger.debug("%s no reference assignment protein sequence.", entityKey)
272
+
273
+ #
274
+ try:
275
+ taxIdL = [oD["ncbi_taxonomy_id"] for oD in obj["rcsb_entity_source_organism"]]
276
+ taxIdL = list(set(taxIdL))
277
+ logger.debug("%s taxonomy (%d) %r", entityKey, len(taxIdL), taxIdL)
278
+ except Exception as e:
279
+ logger.debug("Failing with %s", str(e))
280
+ #
281
+ if ersDL:
282
+ retDL = []
283
+ dupD = {}
284
+ for ersD in ersDL:
285
+ # Check currency of reference assignments made by entities in provSourceL (e.g. in this case only PDB)
286
+ isMatchedRefDb, isMatchedAltDb, updErsD = self.__reMapAccessions(entityKey, ersD, referenceDatabaseName, taxIdL, provSourceL)
287
+ #
288
+ logger.debug("isMatchedRefDb %r isMatchedAltDb %r updErsD %r", isMatchedRefDb, isMatchedAltDb, updErsD)
289
+
290
+ if (isMatchedRefDb or isMatchedAltDb) and updErsD["database_accession"] not in dupD:
291
+ dupD[updErsD["database_accession"]] = True
292
+ retDL.append(updErsD)
293
+ #
294
+ # Re-apply the latest SIFTS mapping if available ...
295
+ if not isMatchedRefDb and entityKey not in dupD:
296
+ dupD[entityKey] = True
297
+ siftsAccDL = self.__getSiftsAccessions(entityKey, authAsymIdL)
298
+ for siftsAccD in siftsAccDL:
299
+ logger.debug("Using/adding SIFTS accession mapping for %s", entityKey)
300
+ retDL.append(siftsAccD)
301
+ if not siftsAccDL:
302
+ logger.debug("No alternative SIFTS accession mapping for %s", entityKey)
303
+
304
+ if retDL:
305
+ logger.debug("%s retDL %r", entityKey, retDL)
306
+ obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"] = retDL
307
+ else:
308
+ del obj["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]
309
+ logger.info("Incomplete reference sequence mapping update for %s", entityKey)
310
+ #
311
+ # ------------- update alignment details -------------
312
+ try:
313
+ alignDL = obj["rcsb_polymer_entity_align"]
314
+ except Exception:
315
+ pass
316
+ if alignDL and authAsymIdL:
317
+ retDL = []
318
+ dupD = {}
319
+ for alignD in alignDL:
320
+ isMatchedRefDb, isMatchedAltDb, updAlignD, alignHash = self.__reMapAlignments(entityKey, alignD, referenceDatabaseName, taxIdL, provSourceL)
321
+ #
322
+ if (isMatchedRefDb or isMatchedAltDb) and alignHash not in dupD:
323
+ if alignHash:
324
+ dupD[alignHash] = True
325
+ retDL.append(updAlignD)
326
+ #
327
+
328
+ if not isMatchedRefDb and entityKey not in dupD:
329
+ dupD[entityKey] = True
330
+ siftsAlignDL = self.__getSiftsAlignments(entityKey, authAsymIdL)
331
+ for siftsAlignD in siftsAlignDL:
332
+ logger.debug("Using/adding SIFTS mapping for the alignment of %s", entityKey)
333
+ retDL.append(siftsAlignD)
334
+ if not siftsAlignDL:
335
+ logger.debug("No alternative SIFTS alignment for %s", entityKey)
336
+ #
337
+ if retDL:
338
+ obj["rcsb_polymer_entity_align"] = retDL
339
+ else:
340
+ del obj["rcsb_polymer_entity_align"]
341
+ logger.debug("Reference sequence alignment NOT updated for %s", entityKey)
342
+ except Exception as e:
343
+ ok = False
344
+ logger.exception("Filter adapter failing with error with %s", str(e))
345
+ #
346
+ return ok, obj
347
+
348
+ def __reMapAccessions(self, entityKey, rsiD, referenceDatabaseName, taxIdL, provSourceL, excludeReferenceDatabases=None):
349
+ """Internal method to re-map accession for the input database and assignment source
350
+
351
+ Args:
352
+ rsiDL (list): current list of accession
353
+ databaseName (str, optional): resource database name. Defaults to 'UniProt'.
354
+ provSource (str, optional): assignment provenance. Defaults to 'PDB'.
355
+
356
+ Returns:
357
+ bool, bool, dict: flag for mapping success, flag for a supported reference database,
358
+ and remapped (and unmapped) accessions in the input object list
359
+
360
+ Example:
361
+ "P14118": {
362
+ "searchId": "P14118",
363
+ "matchedIds": {
364
+ "P84099": {
365
+ "taxId": 10090
366
+ },
367
+ "P84100": {
368
+ "taxId": 10116
369
+ },
370
+ "P84098": {
371
+ "taxId": 9606
372
+ }
373
+ },
374
+ "matched": "secondary"
375
+ },
376
+ """
377
+ isMatchedRefDb = False
378
+ isMatchedAltDb = False
379
+ excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
380
+ refDbList = ["UniProt", "GenBank", "EMBL", "NDB", "NORINE", "PIR", "PRF", "RefSeq"]
381
+ #
382
+ rId = rsiD["database_accession"]
383
+ logger.debug("%s rId %r db %r prov %r", entityKey, rId, rsiD["database_name"], rsiD["provenance_source"])
384
+ #
385
+ if rsiD["database_name"] in excludeReferenceDatabases:
386
+ isMatchedAltDb = False
387
+ elif rsiD["database_name"] == referenceDatabaseName and rsiD["provenance_source"] in provSourceL:
388
+ try:
389
+ if rId in self.__matchD and self.__matchD[rId]["matched"] in ["primary"]:
390
+ # no change
391
+ isMatchedRefDb = True
392
+ elif rId in self.__matchD and self.__matchD[rId]["matched"] in ["secondary"]:
393
+ logger.debug("secondary %r matched len %d", self.__matchD[rId]["matched"], len(self.__matchD[rId]["matchedIds"]))
394
+ if len(self.__matchD[rId]["matchedIds"]) == 1:
395
+ for mId, mD in self.__matchD[rId]["matchedIds"].items():
396
+ rsiD["database_accession"] = mId
397
+ logger.debug("%s matched secondary %s -> %s", entityKey, rId, mId)
398
+ isMatchedRefDb = True
399
+ elif taxIdL and len(taxIdL) == 1:
400
+ # -- simplest match case --
401
+ numM = 0
402
+ for mId, mD in self.__matchD[rId]["matchedIds"].items():
403
+ if taxIdL[0] == mD["taxId"]:
404
+ rsiD["database_accession"] = mId
405
+ numM += 1
406
+ if numM == 1:
407
+ isMatchedRefDb = True
408
+ logger.debug("%s matched secondary with taxId %r %s -> %s", entityKey, taxIdL[0], rId, rsiD["database_accession"])
409
+ elif not taxIdL:
410
+ logger.debug("%s no taxids with UniProt (%s) secondary mapping", entityKey, rId)
411
+ else:
412
+ logger.debug("%s ambiguous mapping for a UniProt (%s) secondary mapping - taxIds %r", entityKey, rId, taxIdL)
413
+ #
414
+ except Exception:
415
+ pass
416
+
417
+ elif rsiD["provenance_source"] in provSourceL and rsiD["database_name"] in refDbList:
418
+ logger.debug("%s leaving reference accession for %s %s assigned by %r", entityKey, rId, rsiD["database_name"], provSourceL)
419
+ isMatchedRefDb = True
420
+ else:
421
+ logger.debug("%s leaving an unverified reference accession for %s %s assigned by %r", entityKey, rId, rsiD["database_name"], rsiD["provenance_source"])
422
+ #
423
+ logger.debug("%s isMatched %r isExcluded %r for accession %r", entityKey, isMatchedRefDb, isMatchedAltDb, rId)
424
+ #
425
+ return isMatchedRefDb, isMatchedAltDb, rsiD
426
+
427
+ def __reMapAlignments(self, entityKey, alignD, referenceDatabaseName, taxIdL, provSourceL, excludeReferenceDatabases=None):
428
+ """Internal method to re-map alignments for the input databae and assignment source
429
+
430
+ Args:
431
+ alignD (dict): alignment object including accession and aligned regions
432
+ databaseName (str, optional): resource database name. Defaults to 'UniProt'.
433
+ provSourceL (list, optional): assignment provenance. Defaults to 'PDB'.
434
+
435
+ Returns:
436
+ bool, bool, list: flag for mapping success (refdb), flag for mapping success (altdb),
437
+ and remapped (and unmapped) accessions in the input align list
438
+ """
439
+ isMatchedAltDb = False
440
+ isMatchedRefDb = False
441
+ excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
442
+ refDbList = ["UniProt", "GenBank", "EMBL", "NDB", "NORINE", "PIR", "PRF", "RefSeq"]
443
+ provSourceL = provSourceL if provSourceL else []
444
+ rId = alignD["reference_database_accession"]
445
+ #
446
+ if alignD["reference_database_name"] in excludeReferenceDatabases:
447
+ isMatchedAltDb = False
448
+ elif alignD["reference_database_name"] == referenceDatabaseName and alignD["provenance_source"] in provSourceL:
449
+ try:
450
+ if rId in self.__matchD and self.__matchD[rId]["matched"] in ["primary"]:
451
+ # no change
452
+ isMatchedRefDb = True
453
+ elif rId in self.__matchD and self.__matchD[rId]["matched"] in ["secondary"]:
454
+ if len(self.__matchD[rId]["matchedIds"]) == 1:
455
+ for mId, mD in self.__matchD[rId]["matchedIds"].items():
456
+ alignD["reference_database_accession"] = mId
457
+ isMatchedRefDb = True
458
+ elif taxIdL and len(taxIdL) == 1:
459
+ # -- simplest match case --
460
+ numM = 0
461
+ for mId, mD in self.__matchD[rId]["matchedIds"].items():
462
+ if taxIdL[0] == mD["taxId"]:
463
+ alignD["reference_database_accession"] = mId
464
+ numM += 1
465
+ if numM == 1:
466
+ isMatchedRefDb = True
467
+ elif not taxIdL:
468
+ logger.debug("%s no taxids with UniProt (%s) secondary mapping", entityKey, rId)
469
+ else:
470
+ logger.info("%s ambiguous mapping for a UniProt (%s) secondary mapping - taxIds %r", entityKey, rId, taxIdL)
471
+ #
472
+ except Exception:
473
+ pass
474
+ elif alignD["provenance_source"] in provSourceL and alignD["reference_database_name"] in refDbList:
475
+ logger.debug("%s leaving reference alignment for %s %s assigned by %r", entityKey, rId, alignD["reference_database_name"], provSourceL)
476
+ isMatchedRefDb = False
477
+ isMatchedAltDb = False
478
+ else:
479
+ logger.debug("%s leaving a reference alignment for %s %s assigned by %r", entityKey, rId, alignD["reference_database_name"], alignD["provenance_source"])
480
+ #
481
+ logger.debug("%s isMatched %r isExcluded %r for alignment %r", entityKey, isMatchedRefDb, isMatchedAltDb, rId)
482
+ return isMatchedRefDb, isMatchedAltDb, alignD, self.__hashAlignment(alignD)
483
+
484
+ def __hashAlignment(self, aD):
485
+ """
486
+ Example:
487
+
488
+ {'reference_database_name': 'UniProt', 'reference_database_accession': 'P62942', 'provenance_source': 'PDB',
489
+ 'aligned_regions': [{'entity_beg_seq_id': 1, 'ref_beg_seq_id': 1, 'length': 107}]}]
490
+ """
491
+ hsh = None
492
+ hL = []
493
+ try:
494
+ hL.append(aD["reference_database_accession"])
495
+ for aR in aD["aligned_regions"]:
496
+ hL.append(aR["entity_beg_seq_id"])
497
+ hL.append(aR["ref_beg_seq_id"])
498
+ hL.append(aR["length"])
499
+ hsh = tuple(hL)
500
+ except Exception:
501
+ pass
502
+ return hsh
503
+
504
+ def __getSiftsAccessions(self, entityKey, authAsymIdL):
505
+ retL = []
506
+ saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
507
+ for (_, dbAccession), _ in saoLD.items():
508
+ retL.append({"database_name": "UniProt", "database_accession": dbAccession, "provenance_source": "SIFTS"})
509
+ return retL
510
+
511
+ def __getSiftsAlignments(self, entityKey, authAsymIdL):
512
+ retL = []
513
+ saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
514
+ for (_, dbAccession), saoL in saoLD.items():
515
+ dD = {"reference_database_name": "UniProt", "reference_database_accession": dbAccession, "provenance_source": "SIFTS", "aligned_regions": []}
516
+ for sao in saoL:
517
+ dD["aligned_regions"].append({"ref_beg_seq_id": sao.getDbSeqIdBeg(), "entity_beg_seq_id": sao.getEntitySeqIdBeg(), "length": sao.getEntityAlignLength()})
518
+ retL.append(dD)
519
+ return retL
520
+
521
+ def getReferenceAccessionAlignSummary(self):
522
+ """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
523
+ numPrimary = 0
524
+ numSecondary = 0
525
+ numNone = 0
526
+ for _, mD in self.__matchD.items():
527
+ if mD["matched"] == "primary":
528
+ numPrimary += 1
529
+ elif mD["matched"] == "secondary":
530
+ numSecondary += 1
531
+ else:
532
+ numNone += 1
533
+ logger.debug("Matched primary: %d secondary: %d none %d", numPrimary, numSecondary, numNone)
534
+ return numPrimary, numSecondary, numNone