rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ReferenceSequenceAssignmentProvider.py
|
|
3
|
+
# Date: 8-Oct-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to cache content required to update reference sequence assignments.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
#
|
|
9
|
+
##
|
|
10
|
+
__docformat__ = "google en"
|
|
11
|
+
__author__ = "John Westbrook"
|
|
12
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
13
|
+
__license__ = "Apache 2.0"
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
from collections import defaultdict
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
21
|
+
from rcsb.utils.ec.EnzymeDatabaseProvider import EnzymeDatabaseProvider
|
|
22
|
+
from rcsb.utils.io.IoUtil import getObjSize
|
|
23
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
24
|
+
from rcsb.utils.seq.InterProProvider import InterProProvider
|
|
25
|
+
from rcsb.utils.seq.PfamProvider import PfamProvider
|
|
26
|
+
from rcsb.utils.seq.SiftsSummaryProvider import SiftsSummaryProvider
|
|
27
|
+
from rcsb.utils.seq.UniProtUtils import UniProtUtils
|
|
28
|
+
from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ReferenceSequenceAssignmentProvider(object):
|
|
34
|
+
"""Utilities to cache content required to update referencence sequence assignments."""
|
|
35
|
+
|
|
36
|
+
def __init__(
    self,
    cfgOb,
    databaseName="pdbx_core",
    collectionName="pdbx_core_polymer_entity",
    polymerType="Protein",
    referenceDatabaseName="UniProt",
    provSource="PDB",
    maxChunkSize=10,
    fetchLimit=None,
    **kwargs
):
    """Build the reference sequence assignment caches and supporting annotation providers.

    Args:
        cfgOb: configuration object (supplies section names and resource paths for providers)
        databaseName (str): source document database name (default "pdbx_core")
        collectionName (str): source document collection name (default "pdbx_core_polymer_entity")
        polymerType (str): polymer type selection filter (default "Protein")
        referenceDatabaseName (str): reference sequence resource name (default "UniProt")
        provSource (str): provenance source of assignments (default "PDB")
        maxChunkSize (int): maximum accession chunk size per reference fetch request (default 10)
        fetchLimit (int, optional): limit on extracted entity count (testing support)
        **kwargs: forwarded to provider construction and cache rebuild (e.g. cachePath, useCache,
                  cacheKwargs, siftsAbbreviated, saveText)
    """
    self.__cfgOb = cfgOb
    self.__polymerType = polymerType
    self.__mU = MarshalUtil()
    #
    self.__maxChunkSize = maxChunkSize
    self.__statusList = []
    #
    # Supporting annotation providers: Pfam, InterPro, SIFTS summary, Gene Ontology, Enzyme Classification.
    self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
    self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
    self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
    self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
    self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
    # Assemble the assignment map and rebuild the reference data/match caches.
    self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)
|
|
61
|
+
|
|
62
|
+
def goIdExists(self, goId):
    """Return True if the input Gene Ontology identifier is known to the GO provider."""
    found = False
    try:
        found = self.__goP.exists(goId)
    except Exception as e:
        logger.exception("Failing for %r with %s", goId, str(e))
    return found
|
|
68
|
+
|
|
69
|
+
def getGeneOntologyLineage(self, goIdL):
    """Return the unique GO descendant lineage for the input identifier list.

    Returns:
        list: [{"id": <GO id>, "name": <GO name>}, ...] (empty on failure)
    """
    lineageL = []
    try:
        # Append incrementally so a partial result survives a mid-iteration failure.
        for goTup in self.__goP.getUniqueDescendants(goIdL):
            lineageL.append({"id": goTup[0], "name": goTup[1]})
    except Exception as e:
        logger.exception("Failing for %r with %s", goIdL, str(e))
    return lineageL
|
|
79
|
+
|
|
80
|
+
def getPfamProvider(self):
    """Return the cached Pfam annotation provider (None if its cache failed)."""
    return self.__pfP

def getInterProProvider(self):
    """Return the cached InterPro annotation provider (None if its cache failed)."""
    return self.__ipP

def getEcProvider(self):
    """Return the cached Enzyme Classification provider."""
    return self.__ecP

def getSiftsSummaryProvider(self):
    """Return the cached SIFTS summary provider (None if its cache failed)."""
    return self.__ssP
|
|
91
|
+
|
|
92
|
+
def getMatchInfo(self):
    """Return the accession match-status dictionary from the reference cache."""
    return self.__matchD

def getRefData(self):
    """Return the cached reference sequence data dictionary."""
    return self.__refD
|
|
97
|
+
|
|
98
|
+
def getDocuments(self, formatType="exchange"):
    """Reformat the cached reference entries and return them as a document list.

    Args:
        formatType (str): target document format (default "exchange")
    """
    uUtils = UniProtUtils(saveText=False)
    reformattedD = uUtils.reformat(self.__refD, formatType=formatType)
    return list(reformattedD.values())
|
|
102
|
+
|
|
103
|
+
def getRefIdMap(self):
    """Return the mapping of reference accessions to assigned entity identifiers."""
    return self.__refIdMapD

def getRefDataCount(self):
    """Return the number of cached reference sequence entries."""
    return len(self.__refD)
|
|
108
|
+
|
|
109
|
+
def testCache(self, minMatchPrimaryPercent=None, logSizes=False):
    """Verify that the reference caches are populated.

    Args:
        minMatchPrimaryPercent (float, optional): if set, require that the percentage of
            accessions with a "primary" match exceeds this threshold.
        logSizes (bool): if True, log approximate provider/cache memory footprints (MB).

    Returns:
        bool: True when all caches are populated (and the coverage threshold, if given, is met).
    """
    percentOk = True
    logger.info("Reference cache lengths: refIdMap %d matchD %d refD %d", len(self.__refIdMapD), len(self.__matchD), len(self.__refD))
    populatedOk = bool(self.__refIdMapD and self.__matchD and self.__refD and self.__ssP)
    #
    numRef = len(self.__refIdMapD)
    matchCountD = defaultdict(int)
    logger.info("Match dictionary length %d", len(self.__matchD))
    for mD in self.__matchD.values():
        if "matched" in mD:
            matchCountD[mD["matched"]] += 1
    logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), matchCountD.items())
    if minMatchPrimaryPercent:
        # Coverage test fails closed (e.g. numRef == 0 -> ZeroDivisionError -> False).
        try:
            percentOk = 100.0 * float(matchCountD["primary"]) / float(numRef) > minMatchPrimaryPercent
        except Exception:
            percentOk = False
        logger.info("Primary reference match percent test status %r", percentOk)
    #
    if logSizes:
        logger.info(
            "Pfam %.2f InterPro %.2f SIFTS %.2f GO %.2f EC %.2f RefIdMap %.2f RefMatchD %.2f RefD %.2f",
            getObjSize(self.__pfP) / 1000000.0,
            getObjSize(self.__ipP) / 1000000.0,
            getObjSize(self.__ssP) / 1000000.0,
            getObjSize(self.__goP) / 1000000.0,
            getObjSize(self.__ecP) / 1000000.0,
            getObjSize(self.__refIdMapD) / 1000000.0,
            getObjSize(self.__matchD) / 1000000.0,
            getObjSize(self.__refD) / 1000000.0,
        )
    return populatedOk and percentOk
|
|
141
|
+
|
|
142
|
+
def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs):
    """Build the assignment map and rebuild the reference sequence data/match caches.

    Returns:
        (dict, dict, dict): refIdMapD {accession: [entityKey, ...]}, matchD (accession match info),
                            refD (reference entry data)
    """
    # Extract entity documents carrying reference sequence assignments for the polymer type.
    assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
    refIdMapD, _ = self.__getAssignmentMap(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
    #
    # Entry identifier is the leading four characters of the entity rcsb_id (e.g. "1abc_1" -> "1abc").
    entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
    siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID")
    logger.info("Incorporating %d SIFTS accessions for %d entries", len(siftsUniProtL), len(entryIdL))
    # Consolidate PDB-assigned accessions with SIFTS-derived accessions.
    unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
    #
    logger.info("Rebuild cache for %d UniProt accessions (consolidated)", len(unpIdList))
    #
    matchD, refD = self.__rebuildReferenceCache(unpIdList, referenceDatabaseName, **kwargs)
    return refIdMapD, matchD, refD
|
|
155
|
+
|
|
156
|
+
def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
    """Get all accessions assigned to the input reference sequence database for the input polymerType.

    Args:
        databaseName (str): source document database name
        collectionName (str): source document collection name
        polymerType (str): polymer type selection filter (entity_poly.rcsb_entity_polymer_type)
        fetchLimit (int or None): limit on the number of extracted entity documents

    Returns:
        (dict): {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                            "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
                Empty dictionary on failure.
    """
    # Bug fix: initialize before the try block so the return is safe when
    # extraction fails before objD is assigned (previously UnboundLocalError).
    objD = {}
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName=databaseName,
            collectionName=collectionName,
            cacheFilePath=None,
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=None,
            objectLimit=fetchLimit,
            selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
            selectionList=[
                "rcsb_id",
                "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                # "rcsb_polymer_entity_align",
                # "rcsb_entity_source_organism.ncbi_taxonomy_id",
                # "rcsb_polymer_entity_container_identifiers.related_annotation_identifiers",
                # "rcsb_polymer_entity_annotation",
                "rcsb_entity_source_organism.ncbi_taxonomy_id",
            ],
        )
        eCount = obEx.getCount()
        logger.info("Polymer entity count type %s is %d", polymerType, eCount)
        objD = obEx.getObjects()
        logger.info("Reading polymer entity count %d ref accession length %d ", eCount, len(objD))
        #
    except Exception as e:
        logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
    return objD
|
|
195
|
+
|
|
196
|
+
def __getAssignmentMap(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
    """Invert entity documents into accession-keyed maps of entities and taxonomies.

    Args:
        objD (dict): entity documents keyed by entity rcsb_id
        referenceDatabaseName (str): reference sequence resource name (default "UniProt")
        provSource (str): provenance source filter for assignments (default "PDB")

    Returns:
        (dict, dict): refIdD {accession: [entityKey, ...]}, taxIdD {accession: [taxonomyId, ...]}
    """
    refIdD = defaultdict(list)
    taxIdD = defaultdict(list)
    numMissing = 0
    numMissingTaxons = 0
    for entityKey, eD in objD.items():
        try:
            accS = set()
            for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
                if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                    accS.add(tD["database_accession"])
                    refIdD[tD["database_accession"]].append(entityKey)
                    #
                    # pick up the corresponding taxonomy -
                    # NOTE(review): assumes rcsb_entity_source_organism is index-aligned with
                    # reference_sequence_identifiers; missing/mismatched entries are counted.
                    try:
                        taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                    except Exception:
                        logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                        numMissingTaxons += 1

            logger.debug("PDB assigned sequences length %d", len(accS))
        except Exception as e:
            # Entity document lacks reference sequence assignments altogether.
            numMissing += 1
            logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
    #
    # Count accessions carrying more than one distinct taxonomy assignment.
    numMultipleTaxons = 0
    for refId, taxIdL in taxIdD.items():
        taxIdL = list(set(taxIdL))
        if len(taxIdL) > 1:
            # Bug fix: corrected "Multitple" typo in the log message.
            logger.debug("Multiple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
            numMultipleTaxons += 1

    logger.info("Entities with missing taxonomy %d", numMissingTaxons)
    logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
    logger.info("Unique %s accession assignments by %s %d (entities missing archive accession assignments %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
    return refIdD, taxIdD
|
|
232
|
+
|
|
233
|
+
#
|
|
234
|
+
def __rebuildReferenceCache(self, idList, refDbName, **kwargs):
    """Load or rebuild the persistent reference sequence data and match-index caches.

    When useCache is set and both cache files exist, the caches are imported, the match
    index is recomputed, and any accessions missing from the cache are fetched and merged.
    Otherwise the full accession list is fetched and the caches are written fresh.

    Args:
        idList (list): consolidated reference accession list
        refDbName (str): reference resource name (e.g. "UniProt"), used in cache file names
        **kwargs: cachePath, cacheKwargs ({"fmt": ...}), useCache, saveText

    Returns:
        (dict, dict): matchInfo {accession: match status}, refDbCache {accession: entry data}
    """
    fetchLimit = None
    doMissing = True
    dD = {}
    cachePath = kwargs.get("cachePath", ".")
    dirPath = os.path.join(cachePath, "exdb")
    # cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
    cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
    useCache = kwargs.get("useCache", True)
    saveText = kwargs.get("saveText", False)
    #
    # Data cache file extension follows the serialization format; the id cache is always JSON.
    ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
    fn = refDbName + "-ref-sequence-data-cache" + "." + ext
    dataCacheFilePath = os.path.join(dirPath, fn)
    #
    fn = refDbName + "-ref-sequence-id-cache" + ".json"
    accCacheFilePath = os.path.join(dirPath, fn)
    #
    self.__mU.mkdir(dirPath)
    # Without useCache, remove any stale cache files so the rebuild starts clean.
    if not useCache:
        for fp in [dataCacheFilePath, accCacheFilePath]:
            try:
                os.remove(fp)
            except Exception:
                pass
    #
    if useCache and accCacheFilePath and self.__mU.exists(accCacheFilePath) and dataCacheFilePath and self.__mU.exists(dataCacheFilePath):
        dD = self.__mU.doImport(dataCacheFilePath, **cacheKwargs)
        idD = self.__mU.doImport(accCacheFilePath, fmt="json")
        logger.info("Reading cached reference sequence ID and data cache files - cached match reference length %d", len(idD["matchInfo"]))
        # Recompute the match index against the current accession list.
        idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
        # Check for completeness -
        if doMissing:
            missingS = set(idList) - set(idD["matchInfo"].keys())
            if missingS:
                logger.info("Reference sequence cache missing %d accessions", len(missingS))
                # Fetch only the missing accessions and merge them into the caches.
                extraD, extraIdD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                dD["refDbCache"].update(extraD["refDbCache"])
                idD["matchInfo"].update(extraIdD["matchInfo"])
                #
                idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
                #
                if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                    self.__mU.mkdir(dirPath)
                    ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                    ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                    logger.info("Cache updated with missing references with status %r", ok1 and ok2)
        #
    else:
        # No usable cache - fetch everything and persist fresh cache files.
        logger.info("Rebuilding reference cache for %s for %d accessions with limit %r", refDbName, len(idList), fetchLimit)
        dD, idD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
        if accCacheFilePath and dataCacheFilePath and cacheKwargs:
            self.__mU.mkdir(dirPath)
            ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
            ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
            logger.info("Cache save status %r", ok1 and ok2)

    return idD["matchInfo"], dD["refDbCache"]
|
|
293
|
+
|
|
294
|
+
def __rebuildReferenceMatchIndex(self, idList, referenceD):
    """Recompute the accession match index from the cached reference entry data."""
    uUtils = UniProtUtils()
    logger.info("Rebuilding match index on idList (%d) using reference data (%d) %r", len(idList), len(referenceD), type(referenceD))
    return uUtils.rebuildMatchResultIndex(idList, referenceD)
|
|
299
|
+
|
|
300
|
+
def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
    """Fetch database entries from the input reference sequence database name.

    Args:
        refDbName (str): reference resource name (only "UniProt" is implemented here)
        idList (list): accession list to fetch
        saveText (bool): forward raw text saving option to the fetch utility
        fetchLimit (int, optional): truncate idList to this many accessions

    Returns:
        (dict, dict): dD {"refDbName": ..., "refDbCache": {accession: entry}},
                      idD {"matchInfo": {accession: match status}}
                      Defaults (empty caches) are returned for unsupported resources or on failure.
    """
    dD = {"refDbName": refDbName, "refDbCache": {}}
    idD = {"matchInfo": {}, "refIdMap": {}}

    try:
        idList = idList[:fetchLimit] if fetchLimit else idList
        logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
        if refDbName == "UniProt":
            fobj = UniProtUtils(saveText=saveText)
            logger.info("Maximum reference chunk size %d", self.__maxChunkSize)
            refD, matchD = fobj.fetchList(idList, maxChunkSize=self.__maxChunkSize)
            # NOTE(review): the rebuilt idD drops the "refIdMap" key present in the default.
            dD = {"refDbName": refDbName, "refDbCache": refD}
            idD = {"matchInfo": matchD}
            #
            # Check the coverage -
            #
            countD = defaultdict(int)
            logger.info("Match dictionary length %d", len(matchD))
            for _, mD in matchD.items():
                if "matched" in mD:
                    countD[mD["matched"]] += 1
            logger.info("Reference length %d match length %d coverage %r", len(refD), len(matchD), countD.items())
    except Exception as e:
        logger.exception("Failing with %s", str(e))

    return dD, idD
|
|
327
|
+
|
|
328
|
+
def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
    """Construct the SIFTS summary provider; returns None when its cache test fails."""
    abbrevMode = kwargs.get("siftsAbbreviated", "TEST")
    basePath = kwargs.get("cachePath", ".")
    persistKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
    useCacheFlag = kwargs.get("useCache", True)
    #
    siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
    # Remote URLs are used verbatim; local paths are anchored at the cache path.
    srcDirPath = siftsSummaryDataPath if siftsSummaryDataPath.lower().startswith("http") else os.path.join(basePath, siftsSummaryDataPath)
    cacheDirPath = os.path.join(basePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
    logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
    ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCacheFlag, abbreviated=abbrevMode, cacheKwargs=persistKwargs)
    cacheOk = ssP.testCache()
    if not cacheOk:
        logger.error("Failed to refetch SIFTS summary data using srcDirPath %s, cacheDirPath %s", srcDirPath, cacheDirPath)
        return None
    logger.debug("SIFTS cache status %r", cacheOk)
    logger.debug("ssP entry count %d", ssP.getEntryCount())
    return ssP
|
|
349
|
+
|
|
350
|
+
def __fetchGoProvider(self, cfgOb, configName, **kwargs):
    """Construct and cache-test the Gene Ontology provider."""
    basePath = kwargs.get("cachePath", ".")
    useCacheFlag = kwargs.get("useCache", True)
    #
    goCacheDirPath = os.path.join(basePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
    logger.debug("goP %r %r", goCacheDirPath, useCacheFlag)
    goP = GeneOntologyProvider(goDirPath=goCacheDirPath, useCache=useCacheFlag)
    cacheOk = goP.testCache()
    logger.debug("Gene Ontology (%r) root node count %r", cacheOk, goP.getRootNodes())
    return goP
|
|
360
|
+
|
|
361
|
+
def __fetchEcProvider(self, cfgOb, configName, **kwargs):
    """Construct and cache-test the Enzyme Classification provider."""
    basePath = kwargs.get("cachePath", ".")
    useCacheFlag = kwargs.get("useCache", True)
    #
    ecCacheDirPath = os.path.join(basePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
    logger.debug("ecP %r %r", ecCacheDirPath, useCacheFlag)
    ecP = EnzymeDatabaseProvider(enzymeDirPath=ecCacheDirPath, useCache=useCacheFlag)
    cacheOk = ecP.testCache()
    logger.debug("Enzyme cache status %r", cacheOk)
    return ecP
|
|
371
|
+
|
|
372
|
+
def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
    """Construct the Pfam provider; returns None when its cache test fails."""
    _ = cfgOb
    _ = configName
    basePath = kwargs.get("cachePath", ".")
    useCacheFlag = kwargs.get("useCache", True)
    pfP = PfamProvider(cachePath=basePath, useCache=useCacheFlag)
    return pfP if pfP.testCache() else None
|
|
380
|
+
|
|
381
|
+
def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
    """Construct the InterPro provider; returns None when its cache test fails."""
    _ = cfgOb
    _ = configName
    basePath = kwargs.get("cachePath", ".")
    useCacheFlag = kwargs.get("useCache", True)
    ipP = InterProProvider(cachePath=basePath, useCache=useCacheFlag)
    return ipP if ipP.testCache() else None
|