rcsb.exdb 1.31 (py3-none-any.whl)
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
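To reproduce the file inventory above from a downloaded copy of the wheel, the following is a minimal sketch using only the Python standard library; the wheel filename is an assumption inferred from the rcsb_exdb-1.31.dist-info entries in this diff.

    # Minimal sketch, not part of the package: list the files packaged in the wheel.
    # The filename below is an assumption based on the dist-info entries above.
    from zipfile import ZipFile

    with ZipFile("rcsb_exdb-1.31-py3-none-any.whl") as wh:
        for info in wh.infolist():
            print(f"{info.filename}  {info.file_size} bytes")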
--- /dev/null
+++ b/rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py
@@ -0,0 +1,176 @@
+##
+# File: EntityPolymerExtractorFullTests.py
+# Author: J. Westbrook
+# Date: 25-Mar-2019
+#
+# Updates:
+# 21-Apr-2019 jdw Tests for full cache construction and processing
+#
+##
+"""
+Tests for extractor of selected values from entity polymer collections (full cache)
+
+"""
+
+__docformat__ = "google en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+
+import logging
+import os
+import time
+import unittest
+
+
+from rcsb.exdb.seq.EntityPolymerExtractor import EntityPolymerExtractor
+from rcsb.utils.config.ConfigUtil import ConfigUtil
+from rcsb.utils.io.MarshalUtil import MarshalUtil
+from rcsb.utils.taxonomy.TaxonomyUtils import TaxonomyUtils
+
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
+logger = logging.getLogger()
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
+
+
+class EntityPolymerExtractorFullTests(unittest.TestCase):
+    def __init__(self, methodName="runTest"):
+        super(EntityPolymerExtractorFullTests, self).__init__(methodName)
+        self.__verbose = True
+
+    def setUp(self):
+        #
+        #
+        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
+        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
+        #
+        # Caution: this is a very site-specific setting
+        #
+        configName = "site_info_remote"
+        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
+        if configName != "site_info_configuration":
+            self.__cfgOb.replaceSectionName("site_info_configuration", configName)
+        #
+        #
+        self.__workPath = os.path.join(HERE, "test-cache-preserve")
+        #
+        self.__fullCacheKwargs = {"fmt": "pickle"}
+        self.__fullEntitySaveCachePath = os.path.join(self.__workPath, "entity-polymer-data-cache.pic")
+        #
+        self.__mU = MarshalUtil()
+        self.__entryLimitFull = 50
+        #
+        self.__startTime = time.time()
+        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
+
+    def tearDown(self):
+        endTime = time.time()
+        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
+
+    @unittest.skip("rebuild cache")
+    def testRebuildCache(self):
+        """Test case - extract entity polymer info - rebuild full cache of extracted entity polymer data -"""
+        try:
+            epe = EntityPolymerExtractor(
+                self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=False, saveCacheKwargs=self.__fullCacheKwargs, entryLimit=self.__entryLimitFull
+            )
+            eCount = epe.getEntryCount()
+            if self.__entryLimitFull is not None:
+                self.assertGreaterEqual(eCount, self.__entryLimitFull)
+            else:
+                self.assertGreaterEqual(eCount, 10)
+            #
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+            self.fail()
+
+    def testAccessEntityPolymerFeatures(self):
+        """Test case - access cached entity polymer info from full cache"""
+        try:
+            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
+            eCount = epe.getEntryCount()
+            logger.info("Entry count %d", eCount)
+            self.assertGreaterEqual(eCount, self.__entryLimitFull)
+            #
+            unpL = epe.getRefSeqAccessions("UNP")
+            logger.info("Ref seq count %d", len(unpL))
+            self.assertGreaterEqual(len(unpL), 1)
+            #
+            testOp = False
+            if testOp:
+                for entryId in ["1CP9"]:
+                    for entityId in ["1", "2"]:
+                        uL = epe.getEntityRefSeqAccessions("UNP", entryId, entityId)
+                        logger.debug("UNP for %s %s %r", entryId, entityId, uL)
+            #
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+            self.fail()
+
+    def testAccessEntityPolymerReadCache(self):
+        """Test case - access cached entity polymer info from full cache"""
+        try:
+            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
+            logger.info("Cache entry count %d", epe.getEntryCount())
+            cD = epe.countRefSeqAccessions("UNP")
+            self.assertGreaterEqual(len(cD), 2)
+            #
+            logger.info("UNP reference sequences per entity %r", dict(sorted(cD.items())))
+            logger.info("Reference sequences per entity %r", dict(sorted(epe.countRefSeqAccessionAny().items())))
+            logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items())))
+            #
+            ok = epe.checkRefSeqAlignRange("UNP")
+            self.assertTrue(ok)
+            unpL = epe.getRefSeqAccessions("UNP")
+            logger.info("Unique UNP reference sequences %d", len(unpL))
+            self.assertTrue(ok)
+            #
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+            self.fail()
+
+    def testTaxonomyEntityPolymerReadCache(self):
+        """Test case - evaluate taxonomy - from full cache"""
+        try:
+            taxIdList = [562, 9606, 3701]
+            for taxId in taxIdList:
+                tU = TaxonomyUtils(taxDirPath=self.__workPath)
+                tL = tU.getLineage(taxId)
+                logger.info("Taxonomy lineage for %d %r", taxId, tL)
+            #
+            #
+            epe = EntityPolymerExtractor(self.__cfgOb, saveCachePath=self.__fullEntitySaveCachePath, useCache=True, saveCacheKwargs=self.__fullCacheKwargs)
+            logger.info("Cache entry count %d", epe.getEntryCount())
+            logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items())))
+            rD = epe.countRefSeqAccessionByTaxon(dbNameList=["UNP"])
+            logger.info("Unique taxons %d", len(list(rD.keys())))
+            #
+            numT = 0
+            for tId, aL in rD.items():  # note: tU and taxId carry over from the last iteration of the loop above
+                tL = tU.getLineage(tId)
+                if taxId in tL:
+                    tc = len(set(aL))
+                    logger.info("Matched %5d %s (%r)", tc, tU.getScientificName(tId), tId)
+                    numT += tc
+            logger.info("Total matched accessions %d ", numT)
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+            self.fail()
+
+
+def entityPolymerExtractFullSuite():
+    suiteSelect = unittest.TestSuite()
+    # suiteSelect.addTest(EntityPolymerExtractorFullTests("testRebuildCache"))
+    suiteSelect.addTest(EntityPolymerExtractorFullTests("testAccessEntityPolymerFeatures"))
+    suiteSelect.addTest(EntityPolymerExtractorFullTests("testAccessEntityPolymerReadCache"))
+    suiteSelect.addTest(EntityPolymerExtractorFullTests("testTaxonomyEntityPolymerReadCache"))
+    return suiteSelect
+
+
+if __name__ == "__main__":
+    mySuite = entityPolymerExtractFullSuite()
+    unittest.TextTestRunner(verbosity=2).run(mySuite)
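The suite above exercises EntityPolymerExtractor against a previously built full cache rather than rebuilding it (testRebuildCache is skipped). As a rough standalone driver, the following is a minimal sketch, assuming a reachable ExchangeDB MongoDB instance, a dbload-setup-example.yml configuration with a site_info_configuration section, an existing pickle cache, and a local copy of EntityPolymerExtractor.py on sys.path (in this wheel the module ships under the non-importable examples-seq directory, although the test imports it from rcsb.exdb.seq).

    # Minimal sketch, not part of the package; all paths and section names are assumptions.
    from EntityPolymerExtractor import EntityPolymerExtractor  # local copy of the examples-seq module
    from rcsb.utils.config.ConfigUtil import ConfigUtil

    cfgOb = ConfigUtil(configPath="dbload-setup-example.yml", defaultSectionName="site_info_configuration")
    # Reuse a previously built pickle cache, as the tests above do.
    epe = EntityPolymerExtractor(cfgOb, saveCachePath="entity-polymer-data-cache.pic", useCache=True, saveCacheKwargs={"fmt": "pickle"})
    print("Cached entries:", epe.getEntryCount())
    print("Unique UNP accessions:", len(epe.getRefSeqAccessions("UNP")))
    print("Alignment ranges consistent:", epe.checkRefSeqAlignRange("UNP"))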
--- /dev/null
+++ b/rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py
@@ -0,0 +1,449 @@
+##
+# File: ReferenceSequenceAssignmentUpdater.py
+# Date: 8-Oct-2019 jdw
+#
+# Selected utilities to update reference sequence assignment information
+# in the core_entity collection.
+#
+# Updates:
+#
+##
+__docformat__ = "google en"
+__author__ = "John Westbrook"
+__email__ = "jwest@rcsb.rutgers.edu"
+__license__ = "Apache 2.0"
+
+import logging
+import os
+from collections import defaultdict
+
+from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
+from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
+from rcsb.exdb.utils.ObjectUpdater import ObjectUpdater
+from rcsb.utils.io.MarshalUtil import MarshalUtil
+from rcsb.utils.seq.SiftsSummaryProvider import SiftsSummaryProvider
+from rcsb.utils.seq.UniProtUtils import UniProtUtils
+
+logger = logging.getLogger(__name__)
+
+
+class ReferenceSequenceAssignmentUpdater(object):
+    """Selected utilities to update reference sequence assignment information
+    in the core_entity collection.
+
+    """
+
+    def __init__(self, cfgOb, databaseName="pdbx_core", collectionName="pdbx_core_polymer_entity", polymerType="Protein", referenceDatabaseName="UniProt", provSource="PDB", **kwargs):
+        self.__cfgOb = cfgOb
+        self.__polymerType = polymerType
+        self.__mU = MarshalUtil()
+        #
+        self.__databaseName = databaseName
+        self.__collectionName = collectionName
+        self.__statusList = []
+        #
+        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
+        self.__assignRefD, self.__refD, self.__matchD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs)
+
+    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, **kwargs):
+        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, **kwargs)
+        # get refIdD = {refId: [entity_id, ....], }
+        refIdD, _ = self.__getUniqueAssignments(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
+        #
+        refD, matchD = self.__rebuildReferenceCache(referenceDatabaseName, list(refIdD.keys()), **kwargs)
+        return assignRefD, refD, matchD
+
+    def doUpdate(self, updateId, updateLimit=None):
+        desp = DataExchangeStatus()
+        statusStartTimestamp = desp.setStartTime()
+        #
+        numUpd = 0
+        updateDL = self.__buildUpdate(self.__assignRefD)
+        if updateDL:
+            if updateLimit:
+                numUpd = self.__doUpdate(self.__cfgOb, updateDL[:updateLimit], self.__databaseName, self.__collectionName)
+            else:
+                numUpd = self.__doUpdate(self.__cfgOb, updateDL, self.__databaseName, self.__collectionName)
+        self.__updateStatus(updateId, self.__databaseName, self.__collectionName, True, statusStartTimestamp)
+        return len(updateDL), numUpd
+
+    def __doUpdate(self, cfgOb, updateDL, databaseName, collectionName):
+        obUpd = ObjectUpdater(cfgOb)
+        numUpd = obUpd.update(databaseName, collectionName, updateDL)
+        logger.info("Update count is %d", numUpd)
+
+        return numUpd
+
+    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, **kwargs):
+        """Get all accessions assigned to the input reference sequence database for the input polymerType.
+
+        Returns:
+            (dict): {"1abc_1": {"rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
+                                "rcsb_polymer_entity_align": [],
+                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
+        """
+        cachePath = kwargs.get("cachePath", ".")
+        exDbDir = "exdb"
+        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
+        useCache = kwargs.get("useCache", True)
+        fetchLimit = kwargs.get("fetchLimit", None)
+        cacheFilePath = os.path.join(cachePath, exDbDir, "entity-poly-ref-seq-assign-cache.json")
+        objD = {}
+        try:
+            obEx = ObjectExtractor(
+                self.__cfgOb,
+                databaseName=databaseName,
+                collectionName=collectionName,
+                cacheFilePath=cacheFilePath,
+                useCache=useCache,
+                keyAttribute="entity",
+                uniqueAttributes=["rcsb_id"],
+                cacheKwargs=cacheKwargs,
+                objectLimit=fetchLimit,
+                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
+                selectionList=[
+                    "rcsb_id",
+                    "rcsb_entity_container_identifiers.reference_sequence_identifiers",
+                    "rcsb_entity_container_identifiers.auth_asym_ids",
+                    "rcsb_polymer_entity_align",
+                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
+                ],
+            )
+            eCount = obEx.getCount()
+            logger.info("Entity count is %d", eCount)
+            objD = obEx.getObjects()
+            logger.info("Reading polymer entity count %d ref accession length %d ", eCount, len(objD))
+            #
+        except Exception as e:
+            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
+        return objD
+
+    def __getUniqueAssignments(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
+        refIdD = defaultdict(list)
+        taxIdD = defaultdict(list)
+        numMissing = 0
+        for entityKey, eD in objD.items():
+            try:
+                accS = set()
+                for ii, tD in enumerate(eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]):
+                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
+                        accS.add(tD["database_accession"])
+                        refIdD[tD["database_accession"]].append(entityKey)
+                        #
+                        # pick up the corresponding taxonomy -
+                        try:
+                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
+                        except Exception:
+                            logger.warning("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
+
+                logger.debug("PDB assigned sequences length %d", len(accS))
+            except Exception as e:
+                numMissing += 1
+                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
+        #
+        for refId, taxIdL in taxIdD.items():
+            taxIdL = list(set(taxIdL))
+            if len(taxIdL) > 1:
+                logger.info("Multiple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
+
+        logger.info("Unique %s accession assignments by %s %d (missing %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
+        return refIdD, taxIdD
+
+    def __reMapAccessions(self, rsiDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
+        """Internal method to re-map accessions for the input database and assignment source
+
+        Args:
+            rsiDL (list): list of reference sequence identifier objects
+            referenceDatabaseName (str, optional): reference database name. Defaults to 'UniProt'.
+            provSourceL (list, optional): assignment provenance values. Defaults to None.
+
+        Returns:
+            bool, bool, list: mapping and matching success flags, and remapped (and unmapped) accessions in the input object list
+        """
+        isMatched = False
+        unMapped = 0
+        matched = 0
+        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
+        provSourceL = provSourceL if provSourceL else []
+        retDL = []
+        for rsiD in rsiDL:
+            if rsiD["database_name"] in excludeReferenceDatabases:
+                unMapped += 1
+                continue
+            if rsiD["database_name"] == referenceDatabaseName and rsiD["provenance_source"] in provSourceL:
+                try:
+                    if len(self.__matchD[rsiD["database_accession"]]["matchedIds"]) == 1:
+                        rsiD["database_accession"] = self.__matchD[rsiD["database_accession"]]["matchedIds"][0]
+                        matched += 1
+                    else:
+                        logger.info("Skipping mapping to multiple superseding accessions %s", rsiD["database_accession"])
+                    #
+                except Exception:
+                    unMapped += 1
+            retDL.append(rsiD)
+        if matched == len(retDL):
+            isMatched = True
+        return not unMapped, isMatched, retDL
+
+    def __reMapAlignments(self, alignDL, referenceDatabaseName="UniProt", provSourceL=None, excludeReferenceDatabases=None):
+        """Internal method to re-map alignments for the input database and assignment source
+
+        Args:
+            alignDL (list): list of aligned regions
+            referenceDatabaseName (str, optional): reference database name. Defaults to 'UniProt'.
+            provSourceL (list, optional): assignment provenance values. Defaults to None.
+
+        Returns:
+            bool, bool, list: mapping and matching success flags, and remapped (and unmapped) accessions in the input align list
+        """
+        isMatched = False
+        unMapped = 0
+        matched = 0
+        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else ["PDB"]
+        retDL = []
+        provSourceL = provSourceL if provSourceL else []
+        for alignD in alignDL:
+            if alignD["reference_database_name"] in excludeReferenceDatabases:
+                unMapped += 1
+                continue
+            if alignD["reference_database_name"] == referenceDatabaseName and alignD["provenance_code"] in provSourceL:
+                try:
+                    if len(self.__matchD[alignD["reference_database_accession"]]["matchedIds"]) == 1:
+                        alignD["reference_database_accession"] = self.__matchD[alignD["reference_database_accession"]]["matchedIds"][0]
+                        matched += 1
+                    else:
+                        logger.info("Skipping alignment mapping to multiple superseding accessions %s", alignD["reference_database_accession"])
+                except Exception:
+                    unMapped += 1
+            retDL.append(alignD)
+        if matched == len(retDL):
+            isMatched = True
+        #
+        return not unMapped, isMatched, retDL
+
+    def __getSiftsAccessions(self, entityKey, authAsymIdL):
+        retL = []
+        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
+        for (_, dbAccession), _ in saoLD.items():
+            retL.append({"database_name": "UniProt", "database_accession": dbAccession, "provenance_source": "SIFTS"})
+        return retL
+
+    def __getSiftsAlignments(self, entityKey, authAsymIdL):
+        retL = []
+        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
+        for (_, dbAccession), saoL in saoLD.items():
+            dD = {"reference_database_name": "UniProt", "reference_database_accession": dbAccession, "provenance_code": "SIFTS", "aligned_regions": []}
+            for sao in saoL:
+                dD["aligned_regions"].append({"ref_beg_seq_id": sao.getDbSeqIdBeg(), "entity_beg_seq_id": sao.getEntitySeqIdBeg(), "length": sao.getEntityAlignLength()})
+            retL.append(dD)
+        return retL
+
+    def __buildUpdate(self, assignRefD):
+        #
+        updateDL = []
+        for entityKey, eD in assignRefD.items():
+            selectD = {"rcsb_id": entityKey}
+            try:
+                updateD = {}
+                authAsymIdL = []
+                ersDL = (
+                    eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"] if "reference_sequence_identifiers" in eD["rcsb_entity_container_identifiers"] else None
+                )
+                #
+                #
+                if ersDL:
+                    authAsymIdL = eD["rcsb_entity_container_identifiers"]["auth_asym_ids"]
+                    isMapped, isMatched, updErsDL = self.__reMapAccessions(ersDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
+                    #
+                    if not isMapped or not isMatched:
+                        tL = self.__getSiftsAccessions(entityKey, authAsymIdL)
+                        if tL:
+                            logger.debug("Using SIFTS accession mapping for %s", entityKey)
+                        else:
+                            logger.info("No alternative SIFTS accession mapping for %s", entityKey)
+                        updErsDL = tL if tL else []
+                    #
+                    if len(updErsDL) < len(ersDL):
+                        logger.info("Incomplete reference sequence mapping update for %s", entityKey)
+                    updateD["rcsb_entity_container_identifiers.reference_sequence_identifiers"] = updErsDL
+                #
+                alignDL = eD["rcsb_polymer_entity_align"] if "rcsb_polymer_entity_align" in eD else None
+                if alignDL and authAsymIdL:
+                    isMapped, isMatched, updAlignDL = self.__reMapAlignments(alignDL, referenceDatabaseName="UniProt", provSourceL=["PDB"])
+                    #
+                    if not isMapped or not isMatched:
+                        tL = self.__getSiftsAlignments(entityKey, authAsymIdL)
+                        if tL:
+                            logger.debug("Using SIFTS alignment mapping for %s", entityKey)
+                        else:
+                            logger.info("No alternative SIFTS alignment mapping for %s", entityKey)
+                        updAlignDL = tL if tL else updAlignDL
+                    #
+                    if len(updAlignDL) < len(alignDL):
+                        logger.info("Incomplete alignment mapping update for %s", entityKey)
+                    updateD["rcsb_polymer_entity_align"] = updAlignDL
+                #
+                if updateD:
+                    updateDL.append({"selectD": selectD, "updateD": updateD})
+            except Exception as e:
+                logger.exception("Mapping error for %s with %s", entityKey, str(e))
+        #
+        return updateDL
+
+    def __rebuildReferenceCache(self, refDbName, idList, **kwargs):
+        """ """
+        dD = {}
+        cachePath = kwargs.get("cachePath", ".")
+        dirPath = os.path.join(cachePath, "exdb")
+        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
+        useCache = kwargs.get("useCache", True)
+        fetchLimit = kwargs.get("fetchLimit", None)
+        saveText = kwargs.get("saveText", False)
+        #
+        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
+        fn = "ref-sequence-data-cache" + "." + ext
+        cacheFilePath = os.path.join(dirPath, fn)
+        #
+        self.__mU.mkdir(dirPath)
+        if not useCache:
+            for fp in [cacheFilePath]:
+                try:
+                    os.remove(fp)
+                except Exception:
+                    pass
+        #
+        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
+            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
+            # Check for completeness - accessions requested but not yet cached
+            missingS = set(idList) - set(dD["refDbCache"].keys())
+            if missingS:
+                logger.info("Reference sequence cache missing %d accessions", len(missingS))
+                extraD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
+                dD["refDbCache"].update(extraD["refDbCache"])
+                dD["matchInfo"].update(extraD["matchInfo"])
+                if cacheFilePath and cacheKwargs:
+                    self.__mU.mkdir(dirPath)
+                    ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
+                    logger.info("Cache updated with status %r", ok)
+            #
+        else:
+            dD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
+            if cacheFilePath and cacheKwargs:
+                self.__mU.mkdir(dirPath)
+                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
+                logger.info("Cache save status %r", ok)
+
+        return dD["refDbCache"], dD["matchInfo"]
+
+    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
+        """Fetch database entries from the input reference sequence database name."""
+        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}
+
+        try:
+            idList = idList[:fetchLimit] if fetchLimit else idList
+            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
+            if refDbName == "UniProt":
+                fobj = UniProtUtils(saveText=saveText)
+                refD, matchD = fobj.fetchList(idList)
+                dD = {"refDbName": refDbName, "refDbCache": refD, "matchInfo": matchD}
+
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+
+        return dD
+
+    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
+        abbreviated = kwargs.get("siftsAbbreviated", "PROD")
+        cachePath = kwargs.get("cachePath", ".")
+        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
+        useCache = kwargs.get("useCache", True)
+        #
+        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
+        # logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
+        if siftsSummaryDataPath.lower().startswith("http"):
+            srcDirPath = siftsSummaryDataPath
+        else:
+            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
+        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
+        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
+        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
+        logger.info("ssP entry count %d", ssP.getEntryCount())
+        return ssP
+
+    def __dumpEntries(self, refD):
+        for (eId, eDict) in refD.items():
+            logger.info("------ Reference id %s", eId)
+            for k, v in eDict.items():
+                logger.info("%-15s = %r", k, v)
+
+    def __getUpdateAssignmentCandidates(self, objD):
+        totCount = 0
+        difCount = 0
+        pdbUnpIdD = defaultdict(list)
+        siftsUnpIdD = defaultdict(list)
+        assignIdDifD = defaultdict(list)
+        #
+        for entityKey, eD in objD.items():
+            try:
+                siftsS = set()
+                pdbS = set()
+                for tD in eD["rcsb_entity_container_identifiers"]["reference_sequence_identifiers"]:
+                    if tD["database_name"] == "UniProt":
+                        if tD["provenance_source"] == "SIFTS":
+                            siftsS.add(tD["database_accession"])
+                            siftsUnpIdD[tD["database_accession"]].append(entityKey)
+                        elif tD["provenance_source"] == "PDB":
+                            pdbS.add(tD["database_accession"])
+                            pdbUnpIdD[tD["database_accession"]].append(entityKey)
+                    else:
+                        logger.debug("No UniProt for %r", eD["rcsb_entity_container_identifiers"])
+                logger.debug("PDB assigned sequence length %d", len(pdbS))
+                logger.debug("SIFTS assigned sequence length %d", len(siftsS))
+
+                if pdbS and siftsS:
+                    totCount += 1
+                    if pdbS != siftsS:
+                        difCount += 1
+                        for idV in pdbS:
+                            assignIdDifD[idV].append(entityKey)
+
+            except Exception as e:
+                logger.warning("No identifiers for %s with %s", entityKey, str(e))
+        #
+        logger.info("Total %d differences %d", totCount, difCount)
+        logger.info("Unique UniProt accession assignments PDB %d SIFTS %d", len(pdbUnpIdD), len(siftsUnpIdD))
+        logger.info("Current unique overlapping assignment differences %d ", len(assignIdDifD))
+        logger.info("Current unique overlapping assignment differences %r ", assignIdDifD)
+        return assignIdDifD, pdbUnpIdD, siftsUnpIdD
+
+    def getReferenceAccessionAlignSummary(self):
+        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
+        numPrimary = 0
+        numSecondary = 0
+        numNone = 0
+        for _, mD in self.__matchD.items():
+            if mD["matched"] == "primary":
+                numPrimary += 1
+            elif mD["matched"] == "secondary":
+                numSecondary += 1
+            else:
+                numNone += 1
+        logger.debug("Matched primary: %d secondary: %d none %d", numPrimary, numSecondary, numNone)
+        return numPrimary, numSecondary, numNone
+
+    def getLoadStatus(self):
+        return self.__statusList
+
+    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
+        try:
+            sFlag = "Y" if status else "N"
+            desp = DataExchangeStatus()
+            desp.setStartTime(tS=startTimestamp)
+            desp.setObject(databaseName, collectionName)
+            desp.setStatus(updateId=updateId, successFlag=sFlag)
+            desp.setEndTime()
+            self.__statusList.append(desp.getStatus())
+            return True
+        except Exception as e:
+            logger.exception("Failing with %s", str(e))
+            return False