rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,228 @@
1
+ ##
2
+ # File: ReferenceSequenceAnnotationProvider.py
3
+ # Date: 14-Feb-2020 jdw
4
+ #
5
+ # Utilities to cache content required to update reference sequence annotations.
6
+ #
7
+ # Updates:
8
+ # 25-May-2022 dwp Add error checking for SIFTS data loading
9
+ ##
10
+ __docformat__ = "google en"
11
+ __author__ = "John Westbrook"
12
+ __email__ = "jwest@rcsb.rutgers.edu"
13
+ __license__ = "Apache 2.0"
14
+
15
+ import logging
16
+ import os
17
+ from collections import defaultdict
18
+
19
+ from rcsb.exdb.seq.ReferenceSequenceCacheProvider import ReferenceSequenceCacheProvider
20
+ from rcsb.utils.ec.EnzymeDatabaseProvider import EnzymeDatabaseProvider
21
+ from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
22
+ from rcsb.utils.io.IoUtil import getObjSize
23
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
24
+ from rcsb.utils.seq.GlyGenProvider import GlyGenProvider
25
+ from rcsb.utils.seq.InterProProvider import InterProProvider
26
+ from rcsb.utils.seq.PfamProvider import PfamProvider
27
+ from rcsb.utils.seq.SiftsSummaryProvider import SiftsSummaryProvider
28
+ from rcsb.utils.seq.UniProtUtils import UniProtUtils
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class ReferenceSequenceAnnotationProvider(object):
    """Utilities to cache content required to update reference sequence annotations.

    On construction this class instantiates the GlyGen, Pfam, InterPro, SIFTS
    summary, Gene Ontology and Enzyme Classification providers together with a
    ReferenceSequenceCacheProvider, and then exposes those objects (plus a few
    convenience lookups built on them) through accessor methods.

    Note: the GlyGen, Pfam, InterPro and SIFTS providers are stored as None when
    their own testCache() check fails, so callers of the corresponding
    ``get*Provider()`` accessors must be prepared for a None result.
    """

    def __init__(self, cfgOb, databaseName, collectionName, polymerType, maxChunkSize=10, fetchLimit=None, numProc=2, expireDays=14, **kwargs):
        """Build all annotation providers and load the reference sequence cache.

        Args:
            cfgOb (object): configuration object; must provide getDefaultSectionName(),
                getPath() and get() as used by the fetch helpers below.
            databaseName (str): forwarded to ReferenceSequenceCacheProvider.
            collectionName (str): forwarded to ReferenceSequenceCacheProvider.
            polymerType (str): forwarded to ReferenceSequenceCacheProvider.
            maxChunkSize (int, optional): forwarded to ReferenceSequenceCacheProvider. Defaults to 10.
            fetchLimit (int, optional): forwarded to ReferenceSequenceCacheProvider. Defaults to None.
            numProc (int, optional): forwarded to ReferenceSequenceCacheProvider. Defaults to 2.
            expireDays (int, optional): forwarded to ReferenceSequenceCacheProvider. Defaults to 14.
            **kwargs: options read by the fetch helpers: ``cachePath`` (default "."),
                ``useCache`` (default True), ``siftsAbbreviated`` (default "TEST") and
                ``cacheKwargs`` (default {"fmt": "pickle"}).
        """
        self.__cfgOb = cfgOb
        self.__mU = MarshalUtil()
        #
        self.__maxChunkSize = maxChunkSize
        self.__statusList = []
        #
        # All providers are configured from the default configuration section.
        configName = self.__cfgOb.getDefaultSectionName()
        self.__ggP = self.__fetchGlyGenProvider(self.__cfgOb, configName, **kwargs)
        self.__pfP = self.__fetchPfamProvider(self.__cfgOb, configName, **kwargs)
        self.__ipP = self.__fetchInterProProvider(self.__cfgOb, configName, **kwargs)
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, configName, **kwargs)
        self.__goP = self.__fetchGoProvider(self.__cfgOb, configName, **kwargs)
        self.__ecP = self.__fetchEcProvider(self.__cfgOb, configName, **kwargs)
        #
        # The SIFTS provider (possibly None) is handed to the reference sequence cache provider.
        self.__rsaP = ReferenceSequenceCacheProvider(
            self.__cfgOb,
            databaseName,
            collectionName,
            polymerType,
            siftsProvider=self.__ssP,
            maxChunkSize=maxChunkSize,
            numProc=numProc,
            fetchLimit=fetchLimit,
            expireDays=expireDays,
        )
        self.__matchD = self.__rsaP.getMatchInfo()
        self.__refD = self.__rsaP.getRefData()
        self.__missingMatchedIdCodes = self.__rsaP.getMissingMatchedIdCodes()

    def goIdExists(self, goId):
        """Return the GO provider's exists() result for goId, or False if the lookup raises."""
        try:
            return self.__goP.exists(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return False

    def getGeneOntologyName(self, goId):
        """Return the GO provider's getName() result for goId, or None if the lookup raises."""
        try:
            return self.__goP.getName(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return None

    def getGeneOntologyLineage(self, goIdL):
        """Return GO nodes related to the input identifiers as a list of dictionaries.

        Args:
            goIdL (list): GO identifiers passed to the GO provider's getUniqueDescendants().

        Returns:
            list: [{"id": <tuple[0]>, "name": <tuple[1]>}, ...] built from the tuples the
                provider returns; an empty list if the lookup raises.
        """
        # "id" "name"
        gL = []
        try:
            gL = [{"id": gTup[0], "name": gTup[1]} for gTup in self.__goP.getUniqueDescendants(goIdL)]
        except Exception as e:
            logger.exception("Failing for %r with %s", goIdL, str(e))
        return gL

    def getGlyGenProvider(self):
        """Return the GlyGen provider (None if its cache check failed at construction)."""
        return self.__ggP

    def getPfamProvider(self):
        """Return the Pfam provider (None if its cache check failed at construction)."""
        return self.__pfP

    def getPfamName(self, idCode):
        """Return the Pfam provider's description for idCode.

        Returns None when the Pfam provider is unavailable (it is stored as None when
        its cache check fails), rather than raising AttributeError.
        """
        if self.__pfP is None:
            return None
        return self.__pfP.getDescription(idCode)

    def getInterProProvider(self):
        """Return the InterPro provider (None if its cache check failed at construction)."""
        return self.__ipP

    def getInterProName(self, idCode):
        """Return the InterPro provider's description for idCode.

        Returns None when the InterPro provider is unavailable (it is stored as None
        when its cache check fails), rather than raising AttributeError.
        """
        if self.__ipP is None:
            return None
        return self.__ipP.getDescription(idCode)

    def getInterProLineage(self, idCode):
        """Return the InterPro lineage for idCode as a list of dictionaries.

        Returns:
            list: [{"id": <tuple[0]>, "name": <tuple[1]>, "depth": <tuple[2]>}, ...] built
                from the provider's getLineageWithNames() tuples; an empty list if the
                lookup raises (including when the provider is unavailable).
        """
        linL = []
        try:
            linL = [{"id": tup[0], "name": tup[1], "depth": tup[2]} for tup in self.__ipP.getLineageWithNames(idCode)]
        except Exception as e:
            logger.exception("Failing for %r with %s", idCode, str(e))
        return linL

    def getEcProvider(self):
        """Return the enzyme classification provider."""
        return self.__ecP

    def getSiftsSummaryProvider(self):
        """Return the SIFTS summary provider (None if its cache check failed at construction)."""
        return self.__ssP

    def getMatchInfo(self):
        """Return the match information object obtained from the reference sequence cache provider."""
        return self.__matchD

    def getRefData(self):
        """Return the reference data object obtained from the reference sequence cache provider."""
        return self.__refD

    def getDocuments(self, formatType="exchange"):
        """Return the cached reference data reformatted by UniProtUtils.reformat() as a list.

        Args:
            formatType (str, optional): passed through to UniProtUtils.reformat(). Defaults to "exchange".

        Returns:
            list: the values of the dictionary returned by reformat().
        """
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefDataCount(self):
        """Return the number of entries in the cached reference data."""
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False, minMissing=0):
        """Check that the cached reference sequence content looks usable.

        Args:
            minMatchPrimaryPercent (float, optional): when truthy, additionally require that
                the percentage of match records whose "matched" value is "primary" strictly
                exceeds this threshold. Defaults to None (test skipped).
            logSizes (bool, optional): when True, log the object sizes reported by getObjSize()
                divided by 1,000,000. Defaults to False.
            minMissing (int, optional): upper bound allowed for the "missing matched id codes"
                value reported by the cache provider. Defaults to 0.

        Returns:
            bool: True only if the match data, reference data and SIFTS provider are all
                truthy, the missing value is within bounds, and the optional primary-match
                percentage test passes.
        """
        okC = True
        logger.info("Reference sequence cache lengths: matchD %d refD %d", len(self.__matchD), len(self.__refD))
        logger.info("missingMatchedIdCodes %r minMissing %r", self.__missingMatchedIdCodes, minMissing)
        ok = bool(self.__matchD and self.__refD and self.__ssP and self.__missingMatchedIdCodes <= minMissing)
        logger.info("Initial testCache check status %r", ok)
        #
        numRef = len(self.__matchD)
        # Tally match records by the value of their "matched" field.
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for mD in self.__matchD.values():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                # e.g. an empty match dictionary gives a zero denominator
                okC = False
            logger.info("Primary reference match percent test status %r", okC)
        #
        if logSizes:
            logger.info(
                "SIFTS %.2f GO %.2f EC %.2f RefMatchD %.2f RefD %.2f",
                getObjSize(self.__ssP) / 1000000.0,
                getObjSize(self.__goP) / 1000000.0,
                getObjSize(self.__ecP) / 1000000.0,
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        """Create the SIFTS summary provider; return None if its cache check fails.

        The source location comes from the SIFTS_SUMMARY_DATA_PATH configuration option:
        a value starting with "http" (case-insensitive) is used as is, anything else is
        joined onto ``cachePath``. The cache directory is ``cachePath`` joined with the
        SIFTS_SUMMARY_CACHE_DIR option.
        """
        abbreviated = kwargs.get("siftsAbbreviated", "TEST")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        ok = ssP.testCache()
        if not ok:
            logger.error("Failed to refetch SIFTS summary data using srcDirPath %s, cacheDirPath %s", srcDirPath, cacheDirPath)
            return None
        logger.debug("SIFTS cache status %r", ok)
        logger.debug("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __fetchGoProvider(self, cfgOb, configName, **kwargs):
        """Create the Gene Ontology provider under ``cachePath``/EXDB_CACHE_DIR.

        The provider is returned regardless of its testCache() status, which is only logged.
        """
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        logger.debug("goP %r %r", cacheDirPath, useCache)
        goP = GeneOntologyProvider(goDirPath=cacheDirPath, useCache=useCache)
        ok = goP.testCache()
        logger.debug("Gene Ontology (%r) root node count %r", ok, goP.getRootNodes())
        return goP

    def __fetchEcProvider(self, cfgOb, configName, **kwargs):
        """Create the enzyme classification provider under ``cachePath``/ENZYME_CLASSIFICATION_CACHE_DIR.

        The provider is returned regardless of its testCache() status, which is only logged.
        """
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
        logger.debug("ecP %r %r", cacheDirPath, useCache)
        ecP = EnzymeDatabaseProvider(enzymeDirPath=cacheDirPath, useCache=useCache)
        ok = ecP.testCache()
        logger.debug("Enzyme cache status %r", ok)
        return ecP

    def __fetchGlyGenProvider(self, cfgOb, configName, **kwargs):
        """Create the GlyGen provider; return None if its cache check fails.

        ``cfgOb`` and ``configName`` are accepted for signature parity with the other
        fetch helpers but are not used.
        """
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ggP = GlyGenProvider(cachePath=cachePath, useCache=useCache)
        ok = ggP.testCache()
        return ggP if ok else None

    def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
        """Create the Pfam provider; return None if its cache check fails.

        ``cfgOb`` and ``configName`` are accepted for signature parity with the other
        fetch helpers but are not used.
        """
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        pfP = PfamProvider(cachePath=cachePath, useCache=useCache)
        ok = pfP.testCache()
        return pfP if ok else None

    def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
        """Create the InterPro provider; return None if its cache check fails.

        ``cfgOb`` and ``configName`` are accepted for signature parity with the other
        fetch helpers but are not used.
        """
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ipP = InterProProvider(cachePath=cachePath, useCache=useCache)
        ok = ipP.testCache()
        return ipP if ok else None