rcsb.exdb 1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. rcsb/__init__.py +1 -0
  2. rcsb/exdb/__init__.py +1 -0
  3. rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
  4. rcsb/exdb/branch/GlycanProvider.py +116 -0
  5. rcsb/exdb/branch/GlycanUtils.py +114 -0
  6. rcsb/exdb/branch/__init__.py +0 -0
  7. rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
  8. rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
  9. rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
  10. rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
  11. rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
  12. rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
  13. rcsb/exdb/chemref/__init__.py +0 -0
  14. rcsb/exdb/citation/CitationAdapter.py +91 -0
  15. rcsb/exdb/citation/CitationExtractor.py +190 -0
  16. rcsb/exdb/citation/CitationUtils.py +51 -0
  17. rcsb/exdb/citation/__init__.py +0 -0
  18. rcsb/exdb/cli/__init__.py +0 -0
  19. rcsb/exdb/entry/EntryInfoProvider.py +148 -0
  20. rcsb/exdb/entry/__init__.py +0 -0
  21. rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
  22. rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
  23. rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
  24. rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
  25. rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
  26. rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
  27. rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
  28. rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
  29. rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
  30. rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
  31. rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
  32. rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
  33. rcsb/exdb/seq/AnnotationExtractor.py +76 -0
  34. rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
  35. rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
  36. rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
  37. rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
  38. rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
  39. rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
  40. rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
  41. rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
  42. rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
  43. rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
  44. rcsb/exdb/seq/UniProtExtractor.py +80 -0
  45. rcsb/exdb/seq/__init__.py +0 -0
  46. rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
  47. rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
  48. rcsb/exdb/tests/__init__.py +0 -0
  49. rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
  50. rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
  51. rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
  52. rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
  53. rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
  54. rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
  55. rcsb/exdb/tests/testChemRefLoader.py +106 -0
  56. rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
  57. rcsb/exdb/tests/testCitationAdapter.py +97 -0
  58. rcsb/exdb/tests/testCitationExtractor.py +93 -0
  59. rcsb/exdb/tests/testCitationUtils.py +92 -0
  60. rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
  61. rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
  62. rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
  63. rcsb/exdb/tests/testGlycanProvider.py +98 -0
  64. rcsb/exdb/tests/testGlycanUtils.py +64 -0
  65. rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
  66. rcsb/exdb/tests/testObjectExtractor.py +342 -0
  67. rcsb/exdb/tests/testObjectTransformer.py +83 -0
  68. rcsb/exdb/tests/testObjectUpdater.py +120 -0
  69. rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
  70. rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
  71. rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
  72. rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
  73. rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
  74. rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
  75. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
  76. rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
  77. rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
  78. rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
  79. rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
  80. rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
  81. rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
  82. rcsb/exdb/tests/testUniProtExtractor.py +77 -0
  83. rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
  84. rcsb/exdb/tree/__init__.py +0 -0
  85. rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
  86. rcsb/exdb/utils/ObjectExtractor.py +286 -0
  87. rcsb/exdb/utils/ObjectTransformer.py +124 -0
  88. rcsb/exdb/utils/ObjectUpdater.py +121 -0
  89. rcsb/exdb/utils/ObjectValidator.py +160 -0
  90. rcsb/exdb/utils/__init__.py +0 -0
  91. rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
  92. rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
  93. rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
  94. rcsb/exdb/wf/__init__.py +0 -0
  95. rcsb_exdb-1.31.dist-info/METADATA +103 -0
  96. rcsb_exdb-1.31.dist-info/RECORD +98 -0
  97. rcsb_exdb-1.31.dist-info/WHEEL +4 -0
  98. rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,388 @@
1
+ ##
2
+ # File: ReferenceSequenceAssignmentProvider.py
3
+ # Date: 8-Oct-2019 jdw
4
+ #
5
+ # Utilities to cache content required to update reference sequence assignments.
6
+ #
7
+ # Updates:
8
+ #
9
+ ##
10
# Module metadata (docstrings in this package follow the Google style)
__docformat__ = "google en"
__author__ = "John Westbrook"
__email__ = "jwest@rcsb.rutgers.edu"
__license__ = "Apache 2.0"
14
+
15
+ import logging
16
+ import os
17
+ from collections import defaultdict
18
+
19
+
20
+ from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
21
+ from rcsb.utils.ec.EnzymeDatabaseProvider import EnzymeDatabaseProvider
22
+ from rcsb.utils.io.IoUtil import getObjSize
23
+ from rcsb.utils.io.MarshalUtil import MarshalUtil
24
+ from rcsb.utils.seq.InterProProvider import InterProProvider
25
+ from rcsb.utils.seq.PfamProvider import PfamProvider
26
+ from rcsb.utils.seq.SiftsSummaryProvider import SiftsSummaryProvider
27
+ from rcsb.utils.seq.UniProtUtils import UniProtUtils
28
+ from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
29
+
30
# Module-level logger; handlers/levels are configured by the calling application.
logger = logging.getLogger(__name__)
31
+
32
+
33
class ReferenceSequenceAssignmentProvider(object):
    """Utilities to cache content required to update reference sequence assignments.

    On construction this loads the supporting annotation providers (Pfam, InterPro,
    SIFTS, GO and Enzyme Classification) and then builds (or reloads from cache)
    the UniProt reference sequence data and accession match index for all polymer
    entities of the requested polymer type.
    """

    def __init__(
        self,
        cfgOb,
        databaseName="pdbx_core",
        collectionName="pdbx_core_polymer_entity",
        polymerType="Protein",
        referenceDatabaseName="UniProt",
        provSource="PDB",
        maxChunkSize=10,
        fetchLimit=None,
        **kwargs
    ):
        """Create the provider and populate the reference sequence caches.

        Args:
            cfgOb: configuration object (ConfigUtil-style accessor)
            databaseName (str, optional): ExDB database name. Defaults to "pdbx_core".
            collectionName (str, optional): ExDB collection name. Defaults to "pdbx_core_polymer_entity".
            polymerType (str, optional): entity polymer type selector. Defaults to "Protein".
            referenceDatabaseName (str, optional): reference sequence resource name. Defaults to "UniProt".
            provSource (str, optional): provenance source of the assignments. Defaults to "PDB".
            maxChunkSize (int, optional): maximum accession batch size for reference fetches. Defaults to 10.
            fetchLimit (int, optional): limit on the number of extracted entities (testing). Defaults to None.
            **kwargs: forwarded to provider constructors (cachePath, useCache, cacheKwargs, siftsAbbreviated, saveText, ...)
        """
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__maxChunkSize = maxChunkSize
        self.__statusList = []
        #
        self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        # NOTE(review): __reload() dereferences self.__ssP; if the SIFTS provider failed
        # to load (__fetchSiftsSummaryProvider() returns None) construction fails here.
        self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)

    def goIdExists(self, goId):
        """Return True if the input GO identifier exists in the loaded Gene Ontology."""
        try:
            return self.__goP.exists(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return False

    def getGeneOntologyLineage(self, goIdL):
        """Return the unique descendant lineage for the input GO identifier list.

        Returns:
            list: [{"id": <GO id>, "name": <GO name>}, ...] (empty on failure)
        """
        gL = []
        try:
            gTupL = self.__goP.getUniqueDescendants(goIdL)
            for gTup in gTupL:
                gL.append({"id": gTup[0], "name": gTup[1]})
        except Exception as e:
            logger.exception("Failing for %r with %s", goIdL, str(e))
        return gL

    def getPfamProvider(self):
        """Return the Pfam annotation provider (None if its cache failed to load)."""
        return self.__pfP

    def getInterProProvider(self):
        """Return the InterPro annotation provider (None if its cache failed to load)."""
        return self.__ipP

    def getEcProvider(self):
        """Return the Enzyme Classification (EC) database provider."""
        return self.__ecP

    def getSiftsSummaryProvider(self):
        """Return the SIFTS summary provider (None if its cache failed to load)."""
        return self.__ssP

    def getMatchInfo(self):
        """Return the accession match index dictionary."""
        return self.__matchD

    def getRefData(self):
        """Return the raw reference sequence data dictionary."""
        return self.__refD

    def getDocuments(self, formatType="exchange"):
        """Return the cached reference data reformatted as a list of documents.

        Args:
            formatType (str, optional): output document format. Defaults to "exchange".
        """
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefIdMap(self):
        """Return the mapping of reference accessions to entity identifiers."""
        return self.__refIdMapD

    def getRefDataCount(self):
        """Return the number of cached reference sequence entries."""
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False):
        """Check the integrity and completeness of the loaded caches.

        Args:
            minMatchPrimaryPercent (float, optional): minimum percentage of primary
                accession matches required for success. Defaults to None (no check).
            logSizes (bool, optional): log approximate object sizes (MB). Defaults to False.

        Returns:
            bool: True for success or False otherwise
        """
        okC = True
        logger.info("Reference cache lengths: refIdMap %d matchD %d refD %d", len(self.__refIdMapD), len(self.__matchD), len(self.__refD))
        ok = bool(self.__refIdMapD and self.__matchD and self.__refD and self.__ssP)
        #
        numRef = len(self.__refIdMapD)
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for _, mD in self.__matchD.items():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                # zero references or no primary matches -> fail the coverage check
                okC = False
            logger.info("Primary reference match percent test status %r", okC)
        #
        if logSizes:
            logger.info(
                "Pfam %.2f InterPro %.2f SIFTS %.2f GO %.2f EC %.2f RefIdMap %.2f RefMatchD %.2f RefD %.2f",
                getObjSize(self.__pfP) / 1000000.0,
                getObjSize(self.__ipP) / 1000000.0,
                getObjSize(self.__ssP) / 1000000.0,
                getObjSize(self.__goP) / 1000000.0,
                getObjSize(self.__ecP) / 1000000.0,
                getObjSize(self.__refIdMapD) / 1000000.0,
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC

    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs):
        """Extract current assignments, merge in SIFTS accessions, and rebuild the reference cache.

        Returns:
            tuple: (refIdMapD, matchD, refD)
        """
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
        refIdMapD, _ = self.__getAssignmentMap(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
        #
        # Entity keys look like "1abc_1"; the first four characters are the entry id.
        entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
        siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID")
        logger.info("Incorporating %d SIFTS accessions for %d entries", len(siftsUniProtL), len(entryIdL))
        unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
        #
        logger.info("Rebuild cache for %d UniProt accessions (consolidated)", len(unpIdList))
        #
        matchD, refD = self.__rebuildReferenceCache(unpIdList, referenceDatabaseName, **kwargs)
        return refIdMapD, matchD, refD

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
            dict: {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": [...]},
                              "rcsb_entity_source_organism": [{"ncbi_taxonomy_id": ...}, ...]}, ...}
                  (empty on extraction failure)
        """
        # Fix: previously objD was unbound (NameError on return) if ObjectExtractor raised.
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                    # "rcsb_polymer_entity_align",
                    # "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    # "rcsb_polymer_entity_container_identifiers.related_annotation_identifiers",
                    # "rcsb_polymer_entity_annotation",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count type %s is %d", polymerType, eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity count %d ref accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getAssignmentMap(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
        """Build accession -> entity-key and accession -> taxonomy-id maps from the extracted assignments.

        Args:
            objD (dict): extracted polymer entity objects keyed by entity id
            referenceDatabaseName (str, optional): reference resource to select. Defaults to "UniProt".
            provSource (str, optional): provenance source to select. Defaults to "PDB".

        Returns:
            tuple: (refIdD {accession: [entityKey, ...]}, taxIdD {accession: [taxId, ...]})
        """
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        numMissingTaxons = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        # NOTE(review): indexes the source organism list with the position of the
                        # reference identifier -- assumes the two lists are parallel; confirm.
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                            numMissingTaxons += 1

                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                # Entity carries no reference sequence identifiers - count and continue.
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        numMultipleTaxons = 0
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                # Fix: corrected typo "Multitple" in the log message.
                logger.debug("Multiple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
                numMultipleTaxons += 1

        logger.info("Entities with missing taxonomy %d", numMissingTaxons)
        logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
        logger.info("Unique %s accession assignments by %s %d (entities missing archive accession assignments %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD

    #
    def __rebuildReferenceCache(self, idList, refDbName, **kwargs):
        """Load or rebuild the reference data and match-index caches for the input accession list.

        Returns:
            tuple: (matchInfo dict, reference data dict)
        """
        fetchLimit = None
        doMissing = True
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        # cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = refDbName + "-ref-sequence-data-cache" + "." + ext
        dataCacheFilePath = os.path.join(dirPath, fn)
        #
        fn = refDbName + "-ref-sequence-id-cache" + ".json"
        accCacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            # Force a rebuild by removing any existing cache files (best effort).
            for fp in [dataCacheFilePath, accCacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and accCacheFilePath and self.__mU.exists(accCacheFilePath) and dataCacheFilePath and self.__mU.exists(dataCacheFilePath):
            dD = self.__mU.doImport(dataCacheFilePath, **cacheKwargs)
            idD = self.__mU.doImport(accCacheFilePath, fmt="json")
            logger.info("Reading cached reference sequence ID and data cache files - cached match reference length %d", len(idD["matchInfo"]))
            idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
            # Check for completeness -
            if doMissing:
                missingS = set(idList) - set(idD["matchInfo"].keys())
                if missingS:
                    logger.info("Reference sequence cache missing %d accessions", len(missingS))
                    extraD, extraIdD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                    dD["refDbCache"].update(extraD["refDbCache"])
                    idD["matchInfo"].update(extraIdD["matchInfo"])
                    #
                    idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
                    #
                    if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                        self.__mU.mkdir(dirPath)
                        ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                        ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                        logger.info("Cache updated with missing references with status %r", ok1 and ok2)
            #
        else:
            logger.info("Rebuilding reference cache for %s for %d accessions with limit %r", refDbName, len(idList), fetchLimit)
            dD, idD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                logger.info("Cache save status %r", ok1 and ok2)

        return idD["matchInfo"], dD["refDbCache"]

    def __rebuildReferenceMatchIndex(self, idList, referenceD):
        """Rebuild the accession match index over the input id list from the reference data."""
        fobj = UniProtUtils()
        logger.info("Rebuilding match index on idList (%d) using reference data (%d) %r", len(idList), len(referenceD), type(referenceD))
        matchD = fobj.rebuildMatchResultIndex(idList, referenceD)
        return matchD

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name.

        Returns:
            tuple: (dD {"refDbName": ..., "refDbCache": {...}}, idD {"matchInfo": {...}, ...})
                   Only refDbName == "UniProt" is supported; otherwise the defaults are returned.
        """
        dD = {"refDbName": refDbName, "refDbCache": {}}
        idD = {"matchInfo": {}, "refIdMap": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                logger.info("Maximum reference chunk size %d", self.__maxChunkSize)
                refD, matchD = fobj.fetchList(idList, maxChunkSize=self.__maxChunkSize)
                dD = {"refDbName": refDbName, "refDbCache": refD}
                idD = {"matchInfo": matchD}
                #
                # Check the coverage -
                #
                countD = defaultdict(int)
                logger.info("Match dictionary length %d", len(matchD))
                for _, mD in matchD.items():
                    if "matched" in mD:
                        countD[mD["matched"]] += 1
                logger.info("Reference length %d match length %d coverage %r", len(refD), len(matchD), countD.items())
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD, idD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        """Load the SIFTS summary provider; returns None if its cache cannot be loaded."""
        abbreviated = kwargs.get("siftsAbbreviated", "TEST")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        if siftsSummaryDataPath.lower().startswith("http"):
            # Remote source - use the URL directly.
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        ok = ssP.testCache()
        if not ok:
            logger.error("Failed to refetch SIFTS summary data using srcDirPath %s, cacheDirPath %s", srcDirPath, cacheDirPath)
            return None
        logger.debug("SIFTS cache status %r", ok)
        logger.debug("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __fetchGoProvider(self, cfgOb, configName, **kwargs):
        """Load the Gene Ontology provider from the ExDB cache directory."""
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        logger.debug("goP %r %r", cacheDirPath, useCache)
        goP = GeneOntologyProvider(goDirPath=cacheDirPath, useCache=useCache)
        ok = goP.testCache()
        logger.debug("Gene Ontology (%r) root node count %r", ok, goP.getRootNodes())
        return goP

    def __fetchEcProvider(self, cfgOb, configName, **kwargs):
        """Load the Enzyme Classification database provider."""
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
        logger.debug("ecP %r %r", cacheDirPath, useCache)
        ecP = EnzymeDatabaseProvider(enzymeDirPath=cacheDirPath, useCache=useCache)
        ok = ecP.testCache()
        logger.debug("Enzyme cache status %r", ok)
        return ecP

    def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
        """Load the Pfam provider; returns None if its cache fails the integrity check."""
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        pfP = PfamProvider(cachePath=cachePath, useCache=useCache)
        ok = pfP.testCache()
        return pfP if ok else None

    def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
        """Load the InterPro provider; returns None if its cache fails the integrity check."""
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ipP = InterProProvider(cachePath=cachePath, useCache=useCache)
        ok = ipP.testCache()
        return ipP if ok else None