rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ReferenceSequenceAnnotationProvider.py
|
|
3
|
+
# Date: 14-Feb-2020 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to cache content required to update reference sequence annotations.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 25-May-2022 dwp Add error checking for SIFTS data loading
|
|
9
|
+
##
|
|
10
|
+
__docformat__ = "google en"
|
|
11
|
+
__author__ = "John Westbrook"
|
|
12
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
13
|
+
__license__ = "Apache 2.0"
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
from collections import defaultdict
|
|
18
|
+
|
|
19
|
+
from rcsb.exdb.seq.ReferenceSequenceCacheProvider import ReferenceSequenceCacheProvider
|
|
20
|
+
from rcsb.utils.ec.EnzymeDatabaseProvider import EnzymeDatabaseProvider
|
|
21
|
+
from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
|
|
22
|
+
from rcsb.utils.io.IoUtil import getObjSize
|
|
23
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
24
|
+
from rcsb.utils.seq.GlyGenProvider import GlyGenProvider
|
|
25
|
+
from rcsb.utils.seq.InterProProvider import InterProProvider
|
|
26
|
+
from rcsb.utils.seq.PfamProvider import PfamProvider
|
|
27
|
+
from rcsb.utils.seq.SiftsSummaryProvider import SiftsSummaryProvider
|
|
28
|
+
from rcsb.utils.seq.UniProtUtils import UniProtUtils
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ReferenceSequenceAnnotationProvider(object):
    """Utilities to cache content required to update reference sequence annotations.

    On construction the instance builds GlyGen, Pfam, InterPro, SIFTS summary,
    Gene Ontology and enzyme classification providers, then creates a
    ReferenceSequenceCacheProvider from which the match and reference data
    dictionaries are read.

    Note:
        The GlyGen, Pfam, InterPro and SIFTS providers are stored as None when
        their testCache() check fails, so the matching get*Provider() accessors
        may return None.
    """

    def __init__(self, cfgOb, databaseName, collectionName, polymerType, maxChunkSize=10, fetchLimit=None, numProc=2, expireDays=14, **kwargs):
        """Build the supporting providers and load the reference sequence cache.

        Args:
            cfgOb: configuration object; getDefaultSectionName(), get() and getPath() are called on it
            databaseName: forwarded to ReferenceSequenceCacheProvider
            collectionName: forwarded to ReferenceSequenceCacheProvider
            polymerType: forwarded to ReferenceSequenceCacheProvider
            maxChunkSize: forwarded to ReferenceSequenceCacheProvider (default 10)
            fetchLimit: forwarded to ReferenceSequenceCacheProvider (default None)
            numProc: forwarded to ReferenceSequenceCacheProvider (default 2)
            expireDays: forwarded to ReferenceSequenceCacheProvider (default 14)
            **kwargs: optional settings read by the provider factories --
                cachePath (default "."), useCache (default True),
                siftsAbbreviated (default "TEST"), cacheKwargs (default {"fmt": "pickle"})
        """
        self.__cfgOb = cfgOb
        self.__mU = MarshalUtil()
        #
        self.__maxChunkSize = maxChunkSize
        self.__statusList = []
        #
        # Every provider factory is given the same (default) configuration section.
        sectionName = self.__cfgOb.getDefaultSectionName()
        self.__ggP = self.__fetchGlyGenProvider(self.__cfgOb, sectionName, **kwargs)
        self.__pfP = self.__fetchPfamProvider(self.__cfgOb, sectionName, **kwargs)
        self.__ipP = self.__fetchInterProProvider(self.__cfgOb, sectionName, **kwargs)
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, sectionName, **kwargs)
        self.__goP = self.__fetchGoProvider(self.__cfgOb, sectionName, **kwargs)
        self.__ecP = self.__fetchEcProvider(self.__cfgOb, sectionName, **kwargs)
        #
        # The SIFTS provider (possibly None) is handed to the cache provider.
        self.__rsaP = ReferenceSequenceCacheProvider(
            self.__cfgOb,
            databaseName,
            collectionName,
            polymerType,
            siftsProvider=self.__ssP,
            maxChunkSize=maxChunkSize,
            numProc=numProc,
            fetchLimit=fetchLimit,
            expireDays=expireDays,
        )
        self.__matchD = self.__rsaP.getMatchInfo()
        self.__refD = self.__rsaP.getRefData()
        self.__missingMatchedIdCodes = self.__rsaP.getMissingMatchedIdCodes()

    def goIdExists(self, goId):
        """Return the Gene Ontology provider's exists(goId) result, or False if the lookup raises."""
        try:
            return self.__goP.exists(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return False

    def getGeneOntologyName(self, goId):
        """Return the Gene Ontology provider's getName(goId) result, or None if the lookup raises."""
        try:
            return self.__goP.getName(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return None

    def getGeneOntologyLineage(self, goIdL):
        """Return [{"id": ..., "name": ...}, ...] built from the GO provider's getUniqueDescendants(goIdL).

        Args:
            goIdL: value passed unchanged to getUniqueDescendants()

        Returns:
            list: one dictionary per returned tuple (element 0 -> "id", element 1 -> "name").
                  Entries collected before a failure are retained; failures are logged.
        """
        # "id" "name"
        gL = []
        try:
            gTupL = self.__goP.getUniqueDescendants(goIdL)
            for gTup in gTupL:
                gL.append({"id": gTup[0], "name": gTup[1]})
        except Exception as e:
            logger.exception("Failing for %r with %s", goIdL, str(e))
        return gL

    def getGlyGenProvider(self):
        """Return the GlyGen provider (None if its cache test failed at construction)."""
        return self.__ggP

    def getPfamProvider(self):
        """Return the Pfam provider (None if its cache test failed at construction)."""
        return self.__pfP

    def getPfamName(self, idCode):
        """Return the Pfam provider's getDescription(idCode) result.

        Returns None when the Pfam provider is unavailable (it is stored as None
        when its cache test fails) rather than raising AttributeError.
        """
        if self.__pfP is None:
            return None
        return self.__pfP.getDescription(idCode)

    def getInterProProvider(self):
        """Return the InterPro provider (None if its cache test failed at construction)."""
        return self.__ipP

    def getInterProName(self, idCode):
        """Return the InterPro provider's getDescription(idCode) result.

        Returns None when the InterPro provider is unavailable (it is stored as
        None when its cache test fails) rather than raising AttributeError.
        """
        if self.__ipP is None:
            return None
        return self.__ipP.getDescription(idCode)

    def getInterProLineage(self, idCode):
        """Return [{"id": ..., "name": ..., "depth": ...}, ...] from the InterPro provider's getLineageWithNames(idCode).

        Returns:
            list: one dictionary per returned tuple (elements 0, 1, 2 -> "id", "name", "depth").
                  Entries collected before a failure are retained; failures are logged.
        """
        linL = []
        try:
            tupL = self.__ipP.getLineageWithNames(idCode)
            for tup in tupL:
                linL.append({"id": tup[0], "name": tup[1], "depth": tup[2]})
        except Exception as e:
            logger.exception("Failing for %r with %s", idCode, str(e))
        return linL

    def getEcProvider(self):
        """Return the enzyme classification provider."""
        return self.__ecP

    def getSiftsSummaryProvider(self):
        """Return the SIFTS summary provider (None if its cache test failed at construction)."""
        return self.__ssP

    def getMatchInfo(self):
        """Return the match dictionary obtained from the reference sequence cache provider."""
        return self.__matchD

    def getRefData(self):
        """Return the reference data dictionary obtained from the reference sequence cache provider."""
        return self.__refD

    def getDocuments(self, formatType="exchange"):
        """Return the reference data reformatted by UniProtUtils.reformat() as a list of values.

        Args:
            formatType: passed to UniProtUtils.reformat() (default "exchange")
        """
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefDataCount(self):
        """Return the number of entries in the reference data dictionary."""
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False, minMissing=0):
        """Check that the cached match/reference data and the SIFTS provider are usable.

        Args:
            minMatchPrimaryPercent: if truthy, the percentage of match records whose
                "matched" value is "primary" must exceed this threshold
            logSizes: if True, log the object sizes (in units of 1,000,000) of the cached resources
            minMissing: upper bound allowed for the missing matched id code value

        Returns:
            bool: True when the match data, reference data and SIFTS provider are all
                  truthy, the missing matched id code value is <= minMissing, and the
                  optional primary match percentage test passes.
        """
        okC = True
        logger.info("Reference sequence cache lengths: matchD %d refD %d", len(self.__matchD), len(self.__refD))
        logger.info("missingMatchedIdCodes %r minMissing %r", self.__missingMatchedIdCodes, minMissing)
        ok = bool(self.__matchD and self.__refD and self.__ssP and self.__missingMatchedIdCodes <= minMissing)
        logger.info("Initial testCache check status %r", ok)
        #
        numRef = len(self.__matchD)
        # Tally match records by the value of their "matched" key.
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for mD in self.__matchD.values():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                # e.g. an empty match dictionary (division by zero) fails the test
                okC = False
            logger.info("Primary reference match percent test status %r", okC)
        #
        if logSizes:
            logger.info(
                "SIFTS %.2f GO %.2f EC %.2f RefMatchD %.2f RefD %.2f",
                getObjSize(self.__ssP) / 1000000.0,
                getObjSize(self.__goP) / 1000000.0,
                getObjSize(self.__ecP) / 1000000.0,
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        """Create the SIFTS summary provider; return None (after logging an error) if its cache test fails.

        Reads kwargs: siftsAbbreviated (default "TEST"), cachePath (default "."),
        cacheKwargs (default {"fmt": "pickle"}) and useCache (default True).
        """
        abbreviated = kwargs.get("siftsAbbreviated", "TEST")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        # A configured path starting with "http" is used as-is; any other value is joined onto cachePath.
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        ok = ssP.testCache()
        if not ok:
            logger.error("Failed to refetch SIFTS summary data using srcDirPath %s, cacheDirPath %s", srcDirPath, cacheDirPath)
            return None
        logger.debug("SIFTS cache status %r", ok)
        logger.debug("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __fetchGoProvider(self, cfgOb, configName, **kwargs):
        """Create the Gene Ontology provider from the EXDB_CACHE_DIR directory under cachePath.

        NOTE: the provider is returned even when testCache() fails; the status is only logged.
        """
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        logger.debug("goP %r %r", cacheDirPath, useCache)
        goP = GeneOntologyProvider(goDirPath=cacheDirPath, useCache=useCache)
        ok = goP.testCache()
        logger.debug("Gene Ontology (%r) root node count %r", ok, goP.getRootNodes())
        return goP

    def __fetchEcProvider(self, cfgOb, configName, **kwargs):
        """Create the enzyme classification provider from the ENZYME_CLASSIFICATION_CACHE_DIR directory under cachePath.

        NOTE: the provider is returned even when testCache() fails; the status is only logged.
        """
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
        logger.debug("ecP %r %r", cacheDirPath, useCache)
        ecP = EnzymeDatabaseProvider(enzymeDirPath=cacheDirPath, useCache=useCache)
        ok = ecP.testCache()
        logger.debug("Enzyme cache status %r", ok)
        return ecP

    def __fetchGlyGenProvider(self, cfgOb, configName, **kwargs):
        """Create the GlyGen provider; return None if its cache test fails.

        cfgOb and configName are accepted for signature parity with the other
        factories but are not used.
        """
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ggP = GlyGenProvider(cachePath=cachePath, useCache=useCache)
        ok = ggP.testCache()
        return ggP if ok else None

    def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
        """Create the Pfam provider; return None if its cache test fails.

        cfgOb and configName are accepted for signature parity with the other
        factories but are not used.
        """
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        pfP = PfamProvider(cachePath=cachePath, useCache=useCache)
        ok = pfP.testCache()
        return pfP if ok else None

    def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
        """Create the InterPro provider; return None if its cache test fails.

        cfgOb and configName are accepted for signature parity with the other
        factories but are not used.
        """
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ipP = InterProProvider(cachePath=cachePath, useCache=useCache)
        ok = ipP.testCache()
        return ipP if ok else None
|