rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,328 @@
1
+ ##
2
+ # File: PolymerEntityExtractor.py
3
+ # Date: 5-Dec-2020 jdw
4
+ #
5
+ # Utilities to extract selected details from the core polymer entity collections.
6
+ #
7
+ #
8
+ # Updates:
9
+ # 9-Jan-2024 dwp Turn off use of uniprot_exdb DB for enriching protein entity details file (data not used)
10
+ # 10-Dec-2024 dwp Sort extracted polymer entity sequence data by entity ID (alphabetically), to ensure consistent
11
+ # ordering between coasts (order of sequence data influences results of mmseqs2 sequence searching)
12
+ #
13
+ ##
14
+ __docformat__ = "google en"
15
+ __author__ = "John Westbrook"
16
+ __email__ = "jwest@rcsb.rutgers.edu"
17
+ __license__ = "Apache 2.0"
18
+
19
+ import logging
20
+ import os
21
+ from collections import OrderedDict
22
+
23
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
24
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
def getRangeOverlap(entityBeg, entityEnd, refBeg, refEnd):
    """Return the sequence positions shared by two inclusive integer ranges.

    Both ranges are treated as closed intervals, i.e. the begin and the end
    positions both belong to the range. Consequently a single-position range
    (beg == end) and two ranges that only touch at one boundary position
    yield a one-element overlap rather than an empty one.

    Args:
        entityBeg (int): first position of the first range (inclusive)
        entityEnd (int): last position of the first range (inclusive)
        refBeg (int): first position of the second range (inclusive)
        refEnd (int): last position of the second range (inclusive)

    Returns:
        set: positions common to both ranges; empty when the ranges are
             disjoint or when either range is inverted (beg > end)
    """
    ovBeg = max(entityBeg, refBeg)
    ovEnd = min(entityEnd, refEnd)
    # An inverted input range always produces ovBeg > ovEnd, so this single
    # test covers both the "disjoint" and the "malformed range" cases.
    if ovBeg > ovEnd:
        return set()
    return set(range(ovBeg, ovEnd + 1))
37
+
38
+
39
class PolymerEntityExtractor(object):
    """Utilities to extract selected details from the core polymer entity collections."""

    def __init__(self, cfgOb):
        """Keep the configuration object that is later handed to ObjectExtractor.

        Args:
            cfgOb (object): configuration object passed through unchanged to ObjectExtractor
        """
        self.__cfgOb = cfgOb

    def exportProteinSequenceDetails(self, filePath, fmt="json", minSeqLen=0):
        """Export protein sequence and taxonomy data (required to build protein sequence fasta file)

        The sequence details are written to `filePath` and the dictionary of source names
        lacking a taxonomy identifier is written to "missingSrcNames.json" in the same directory.

        Args:
            filePath (str): output path for the protein sequence details
            fmt (str, optional): export format passed to MarshalUtil.doExport(). Defaults to "json".
            minSeqLen (int, optional): sequences shorter than this length are skipped. Defaults to 0.

        Returns:
            bool: combined status of the two export operations (truthy only if both succeed)
        """
        rD, missingSrcD = self.getProteinSequenceDetails(minSeqLen=minSeqLen)
        # ----
        mU = MarshalUtil()
        ok1 = mU.doExport(filePath, rD, fmt=fmt, indent=3)
        #
        pth, _ = os.path.split(filePath)
        ok2 = mU.doExport(os.path.join(pth, "missingSrcNames.json"), missingSrcD, fmt="json")
        ok = ok1 and ok2
        logger.info("Exporting (%d) protein sequence records with missing source count (%d) status %r", len(rD), len(missingSrcD), ok)
        return ok

    def getProteinSequenceDetails(self, minSeqLen=0):
        """Get protein sequence and taxonomy data (required to build protein sequence fasta file)

        Args:
            minSeqLen (int, optional): entities with a shorter canonical sequence are skipped. Defaults to 0.

        Returns:
            (OrderedDict, dict): sequence details keyed by entity identifier (sorted alphabetically), with items
                                 {"alignmentL": [...], "sourceOrgL": [...], "partCount": ..., "taxCount": ...,
                                  "sequence": ..., "seqLen": ...},
                                 and a dictionary mapping each source scientific name that lacks a taxonomy
                                 identifier to the list of entity identifiers using it.
                                 Whatever was collected before a failure is still returned.
        """
        missingSrcD = {}
        rD = {}
        refDbNames = ("UniProt", "GenBank", "PIR", "EMBL", "NORINE", "PRF")
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_source_organism",
                    "rcsb_polymer_entity.rcsb_source_part_count",
                    "rcsb_polymer_entity.rcsb_source_taxonomy_count",
                    "rcsb_polymer_entity.src_method",
                    "entity_poly",
                    "rcsb_polymer_entity_align",
                ],
            )
            #
            eCount = obEx.getCount()
            logger.info("Polymer entity count is %d", eCount)
            objD = obEx.getObjects()
            for rId, eD in objD.items():
                try:
                    seqS = eD["entity_poly"]["pdbx_seq_one_letter_code_can"]
                    seqLen = len(seqS)
                except Exception:
                    # Without a sequence nothing useful can be recorded; skip the entity rather
                    # than fall through with the previous entity's sequence still bound.
                    logger.warning("%s no one-letter-code sequence", rId)
                    continue
                #
                if seqLen < minSeqLen:
                    continue
                #
                srcMethod = None
                try:
                    srcMethod = eD["rcsb_polymer_entity"]["src_method"]
                except Exception:
                    # Optional detail that is only used in the debug message below.
                    pass
                #
                if "rcsb_entity_source_organism" not in eD:
                    logger.debug("%s No source information (%r) skipping (seqLen %d)", rId, srcMethod, seqLen)
                    continue
                #
                # Reset the per-entity state that is read after the source loop so that
                # values from a previous entity can never leak into this one.
                sL = []
                tD = None
                srcName = None
                srcType = None
                try:
                    for tD in eD["rcsb_entity_source_organism"]:
                        srcName = tD["scientific_name"] if "scientific_name" in tD else None
                        if "beg_seq_num" in tD and "end_seq_num" in tD:
                            begSeqNum = tD["beg_seq_num"]
                            # Clamp the source part to the sequence length
                            endSeqNum = min(tD["end_seq_num"], seqLen)
                        else:
                            begSeqNum = 1
                            endSeqNum = seqLen
                        srcId = tD["pdbx_src_id"]
                        srcType = tD["source_type"]
                        taxId = tD["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in tD else -1
                        if srcName and taxId == -1:
                            missingSrcD.setdefault(srcName, []).append(rId)
                        orgName = tD["ncbi_scientific_name"] if "ncbi_scientific_name" in tD else ""
                        sL.append({"srcId": srcId, "taxId": taxId, "orgName": orgName, "entitySeqBeg": begSeqNum, "entitySeqEnd": endSeqNum})
                    # A single source part always spans the full sequence
                    if len(sL) == 1:
                        sL[0]["entitySeqBeg"] = 1
                        sL[0]["entitySeqEnd"] = seqLen
                except Exception as e:
                    logger.exception("Failing for (%r) tD %r with %s", rId, tD, str(e))
                #
                try:
                    partCount = eD["rcsb_polymer_entity"]["rcsb_source_part_count"]
                except Exception:
                    logger.warning("%s no source part count", rId)
                    partCount = 1
                try:
                    taxCount = eD["rcsb_polymer_entity"]["rcsb_source_taxonomy_count"]
                except Exception:
                    # Fall back on the details of the last source part that was processed
                    if srcType == "synthetic":
                        taxCount = 0
                    else:
                        logger.warning("%s (srcName %r) no source taxonomy count type %r", rId, srcName, srcType)
                        taxCount = 1 if srcName else 0
                #
                uDL = []
                try:
                    # Entities without any reference alignment simply yield an empty list
                    alignDL = eD["rcsb_polymer_entity_align"] if "rcsb_polymer_entity_align" in eD else []
                    for tD in alignDL:
                        if tD["reference_database_name"] not in refDbNames:
                            logger.info("%s reference database %s", rId, tD["reference_database_name"])
                            continue
                        uD = {}
                        uD["refDbId"] = tD["reference_database_accession"]
                        uD["refDbName"] = tD["reference_database_name"]
                        uD["provSource"] = tD["provenance_source"]
                        #
                        # The uniprot_exdb DB is no longer being updated in weekly workflow, so the
                        # alignment records are not enriched with reference sequence details
                        # (accession, taxId, scientific_name, gene, name, sequence) here.
                        aL = []
                        for qD in tD["aligned_regions"]:
                            entBeg = qD["entity_beg_seq_id"]
                            alnLen = qD["length"]
                            # Trim aligned regions that run past the end of the sequence
                            if entBeg + alnLen - 1 > seqLen:
                                alnLen = seqLen - entBeg + 1
                            srcId = self.__getSourcePart(rId, sL, entBeg, alnLen)
                            aL.append({"srcId": srcId, "entitySeqBeg": entBeg, "refSeqBeg": qD["ref_beg_seq_id"], "length": alnLen})
                        uD["alignList"] = aL
                        uDL.append(uD)
                except Exception as e:
                    # Best effort: keep the alignments gathered so far but leave a trace of the failure
                    logger.debug("%s incomplete alignment details skipped with %s", rId, str(e))
                rD[rId] = {"alignmentL": uDL, "sourceOrgL": sL, "partCount": partCount, "taxCount": taxCount, "sequence": seqS, "seqLen": seqLen}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        # Sort the dict in alphabetical order (by entity ID key) to ensure consistent/reproducible treatment by mmseqs2.
        # This is done outside of the try block so that a result is always bound on return.
        sortedD = OrderedDict((k, rD.pop(k)) for k in sorted(rD))
        return sortedD, missingSrcD

    def __getSourcePart(self, entityId, sourceOrgL, entityBeg, seqLen):
        """Return the source part containing the input entity range -

        The first source part that fully contains the range is returned. Failing that,
        the sole source part is used when there is only one, otherwise the source part
        with the largest overlap (first one in list order on ties).

        Args:
            entityId (str): entity identifier (used in log messages only)
            sourceOrgL (list): list of source dictionaries (keys srcId, entitySeqBeg, entitySeqEnd)
            entityBeg (int): beginning entity sequence position (matched region)
            seqLen (int): length sequence range (matched region)

        Returns:
            (int): corresponding source part id or None (when no source parts are provided)
        """
        if not sourceOrgL:
            return None
        entityEnd = entityBeg + seqLen - 1
        for sD in sourceOrgL:
            if sD["entitySeqBeg"] <= entityBeg and sD["entitySeqEnd"] >= entityEnd:
                return sD["srcId"]
        #
        if len(sourceOrgL) == 1:
            logger.error("%r (%d) Inconsistent range for beg %r end %r sourceOrgL %r", entityId, len(sourceOrgL), entityBeg, entityEnd, sourceOrgL)
            # Report the identifier of the only source part rather than a hard-coded 1
            return sourceOrgL[0]["srcId"]
        #
        ovTupL = []
        for sD in sourceOrgL:
            srcId = sD["srcId"]
            logger.debug("%r %r beg %r end %r beg %r end %r", entityId, srcId, sD["entitySeqBeg"], sD["entitySeqEnd"], entityBeg, entityEnd)
            oVS = getRangeOverlap(sD["entitySeqBeg"], sD["entitySeqEnd"], entityBeg, entityEnd)
            ovTupL.append((srcId, len(oVS)))
        # Stable sort: among equal overlaps the earliest source part stays first
        rL = sorted(ovTupL, key=lambda x: x[1], reverse=True)
        logger.debug("ovTupL %r", rL)
        #
        return rL[0][0]

    def exportProteinEntityFasta(self, fastaPath, taxonPath, detailsPath, minSeqLen=10):
        """Export protein entity Fasta file and associated taxon mapping file (for mmseqs2)

        Args:
            fastaPath (str): protein sequence FASTA output file path
            taxonPath (str): taxon mapping file path (seqid TaxId) (tdd format)
            detailsPath (str): protein entity details file path (json)
            minSeqLen (int, optional): sequences shorter than this length are skipped. Defaults to 10.

        Returns:
            bool: True for success or False otherwise (all three exports must succeed)

        Example:
            Details record -

            "5H7D_1": {
                "alignmentL": [
                    {
                        "refDbId": "P42588",
                        "refDbName": "UniProt",
                        "provSource": "PDB",
                        "alignList": [
                            {
                                "srcId": "1",
                                "entitySeqBeg": 5,
                                "refSeqBeg": 7,
                                "length": 447
                            }
                        ]
                    },
                    {
                        "refDbId": "P38507",
                        "refDbName": "UniProt",
                        "provSource": "PDB",
                        "alignList": [
                            {
                                "srcId": "2",
                                "entitySeqBeg": 452,
                                "refSeqBeg": 220,
                                "length": 48
                            }
                        ]
                    }
                ],
                "sourceOrgL": [
                    {
                        "srcId": "1",
                        "taxId": 83333,
                        "orgName": "Escherichia coli K-12",
                        "entitySeqBeg": 1,
                        "entitySeqEnd": 451
                    },
                    {
                        "srcId": "2",
                        "taxId": 1280,
                        "orgName": "Staphylococcus aureus",
                        "entitySeqBeg": 452,
                        "entitySeqEnd": 499
                    }
                ],
                "partCount": 2,
                "taxCount": 2,
                "sequence": "GSHMSASALACSAHALNLIEKRTLDHEEMKALNREVIEYFKEHVNPGF...",
                "seqLen": 499
            },

            Sequence identifier built for the first source part above (value|key pairs) -

            5H7D_1|entityId|1|srcId|1|seqBeg|451|seqEnd|451|seqLen|83333|taxId
        """
        proteinSeqD, _ = self.getProteinSequenceDetails(minSeqLen=minSeqLen)
        ok = False

        try:
            taxonL = []
            seqDict = {}
            for eId, eD in proteinSeqD.items():
                #
                seq = eD["sequence"]
                # One FASTA record is produced for each source part of the entity
                for sD in eD["sourceOrgL"]:
                    srcId = sD["srcId"]
                    taxId = sD["taxId"]
                    seqBeg = int(sD["entitySeqBeg"])
                    seqEnd = int(sD["entitySeqEnd"])
                    seqLen = 1 + (seqEnd - seqBeg)
                    cD = {"sequence": seq[seqBeg - 1: seqEnd], "entityId": eId, "srcId": srcId, "seqBeg": seqBeg, "seqEnd": seqEnd, "seqLen": seqLen, "taxId": taxId}
                    # The sequence identifier concatenates value|key pairs for every item except the sequence
                    cL = []
                    for k, v in cD.items():
                        if k == "sequence":
                            continue
                        cL.extend((str(v), str(k)))
                    seqId = "|".join(cL)
                    seqDict[seqId] = cD
                    taxonL.append("%s\t%s" % (seqId, taxId))
            # ----
            mU = MarshalUtil()
            # Attempt every export, then report success only if none of them failed
            ok1 = mU.doExport(detailsPath, proteinSeqD, fmt="json", indent=3)
            ok2 = mU.doExport(fastaPath, seqDict, fmt="fasta")
            ok3 = mU.doExport(taxonPath, taxonL, fmt="list")
            ok = ok1 and ok2 and ok3
        except Exception as e:
            logger.exception("Failing %r with %s", fastaPath, str(e))
        return ok