rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: EntityPolymerExtractorTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 25-Mar-2019
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
# 21-Apr-2019 jdw Separate tests against the mock-data repo in this module
|
|
8
|
+
#
|
|
9
|
+
##
|
|
10
|
+
"""
|
|
11
|
+
Tests for extractor entity polymer collections (limited tests from mock-data repos)
|
|
12
|
+
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
__docformat__ = "google en"
|
|
16
|
+
__author__ = "John Westbrook"
|
|
17
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
18
|
+
__license__ = "Apache 2.0"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
import time
|
|
24
|
+
import unittest
|
|
25
|
+
|
|
26
|
+
from rcsb.exdb.seq.EntityPolymerExtractor import EntityPolymerExtractor
|
|
27
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
28
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
29
|
+
from rcsb.utils.taxonomy.TaxonomyProvider import TaxonomyProvider
|
|
30
|
+
|
|
31
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
32
|
+
logger = logging.getLogger()
|
|
33
|
+
|
|
34
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
35
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class EntityPolymerExtractorTests(unittest.TestCase):
    """Tests of EntityPolymerExtractor driven by the mock-data repository configuration.

    Every test constructs its own extractor with ``useCache=False``, so none of
    the tests depends on state left behind by another test.
    """

    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Resolve the configuration, cache and output paths used by the tests."""
        # Configuration and mock data are located relative to the top of the source tree.
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        self.__taxonomyDataPath = os.path.join(self.__cachePath, self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR", sectionName=configName))
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbCacheDirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__mU = MarshalUtil()
        # Entry limit for the extraction test and the minimum entry count asserted below.
        self.__entryLimitTest = 18
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log the completion time and elapsed wall-clock time of the test."""
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testExtractEntityPolymers(self):
        """Test case - extract entity polymer info"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs, entryLimit=self.__entryLimitTest)
            eCount = epe.getEntryCount()
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerFeatures(self):
        """Test case - access entry counts and UNP reference sequence accessions"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs)
            eCount = epe.getEntryCount()
            logger.info("Entry count %d", eCount)
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Ref seq count %d", len(unpL))
            self.assertGreaterEqual(len(unpL), 1)
            #
            # The per-entity lookup is only logged; no assertion is made on its result.
            for entryId in ["3RER"]:
                for entityId in ["1"]:
                    uL = epe.getEntityRefSeqAccessions("UNP", entryId, entityId)
                    logger.info("UNP for %s %s %r", entryId, entityId, uL)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testTaxonomyReadCache(self):
        """Test case - list entity taxonomy identifiers that differ from their merged identifier"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            #
            obsL = []
            tD = epe.getOrigTaxons()
            logger.info("Taxons %d", len(tD))

            tU = TaxonomyProvider(taxDirPath=self.__taxonomyDataPath, useCache=True)
            #
            # Record each (entry, entity) whose taxonomy identifier is replaced by a different merged identifier.
            for entryId, taxIdL in tD.items():
                for entityId, iTaxId in taxIdL:
                    mTaxId = tU.getMergedTaxId(iTaxId)
                    if iTaxId != mTaxId:
                        obsL.append({"entryId": entryId, "entityId": entityId, "taxId": iTaxId, "replaceTaxId": mTaxId})
            logger.info("Obsolete list length %d", len(obsL))
            self.__mU.doExport(os.path.join(self.__workPath, "obsolete-taxons.json"), obsL, fmt="json", indent=3)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testAccessEntityPolymerReadCache(self):
        """Test case - access reference sequence counts, alignment ranges and taxons"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs)
            logger.info("Cache entry count %d", epe.getEntryCount())
            cD = epe.countRefSeqAccessions("UNP")
            self.assertGreaterEqual(len(cD), 2)
            logger.info("UNP reference sequences per entity %r", dict(sorted(cD.items())))
            logger.info("Reference sequences per entity %r", dict(sorted(epe.countRefSeqAccessionAny().items())))
            logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items())))
            #
            ok = epe.checkRefSeqAlignRange("UNP")
            self.assertTrue(ok)
            unpL = epe.getRefSeqAccessions("UNP")
            logger.info("Unique UNP reference sequences %d", len(unpL))
            # Check the accession list itself (previously the alignment status was asserted a second time).
            self.assertGreaterEqual(len(unpL), 1)
            tD = epe.getUniqueTaxons()
            logger.info("Unique taxons %d", len(tD))
            tD = epe.countRefSeqAccessionByTaxon("UNP")
            logger.info("Unique taxons %d", len(tD))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def entityPolymerExtractSuite():
    """Assemble the suite of entity polymer extraction and access tests.

    Returns:
        unittest.TestSuite: suite holding the three selected EntityPolymerExtractorTests cases
    """
    selectedTests = (
        "testExtractEntityPolymers",
        "testAccessEntityPolymerFeatures",
        "testAccessEntityPolymerReadCache",
    )
    suiteSelect = unittest.TestSuite()
    # Tests are added in the order listed above.
    suiteSelect.addTests(EntityPolymerExtractorTests(testName) for testName in selectedTests)
    return suiteSelect
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def entityTaxonomyExtractSuite():
    """Assemble the suite holding only the taxonomy test.

    Returns:
        unittest.TestSuite: suite with the single case testTaxonomyReadCache
    """
    return unittest.TestSuite([EntityPolymerExtractorTests("testTaxonomyReadCache")])
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
if __name__ == "__main__":
    # Run the extraction/access suite first and the taxonomy suite second,
    # each with its own verbose text runner.
    for buildSuite in (entityPolymerExtractSuite, entityTaxonomyExtractSuite):
        mySuite = buildSuite()
        unittest.TextTestRunner(verbosity=2).run(mySuite)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: UpdateReferenceSequencesTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 12-Oct-2019
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
#
|
|
8
|
+
##
|
|
9
|
+
"""
|
|
10
|
+
Tests for reference sequence assignment update operations
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__docformat__ = "google en"
|
|
14
|
+
__author__ = "John Westbrook"
|
|
15
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
+
__license__ = "Apache 2.0"
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import time
|
|
21
|
+
import unittest
|
|
22
|
+
|
|
23
|
+
from rcsb.exdb.seq.ReferenceSequenceAssignmentUpdater import ReferenceSequenceAssignmentUpdater
|
|
24
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
25
|
+
|
|
26
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
27
|
+
logger = logging.getLogger()
|
|
28
|
+
|
|
29
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
30
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ReferenceSequenceAssignmentUpdaterTests(unittest.TestCase):
    """Tests of ReferenceSequenceAssignmentUpdater driven by the mock-data configuration."""

    def __init__(self, methodName="runTest"):
        super().__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Load the mock-data configuration and record the test start time."""
        mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        sectionName = "site_info_configuration"
        self.__mockTopPath = mockDataPath
        self.__cfgOb = ConfigUtil(
            configPath=os.path.join(mockDataPath, "config", "dbload-setup-example.yml"),
            defaultSectionName=sectionName,
            mockTopPath=mockDataPath,
        )
        #
        self.__resourceName = "MONGO_DB"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__testEntityCacheKwargs = {"fmt": "json", "indent": 3}
        # No fetch limit is passed to the updater.
        self.__fetchLimitTest = None
        #
        self.__startTime = time.time()
        startStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), startStamp)

    def tearDown(self):
        """Log the completion time and elapsed wall-clock time of the test."""
        elapsed = time.time() - self.__startTime
        endStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), endStamp, elapsed)

    def testUpdateAssignments(self):
        """Test case - get reference sequences and update candidates"""
        try:
            updater = ReferenceSequenceAssignmentUpdater(
                self.__cfgOb,
                useCache=False,
                cachePath=self.__cachePath,
                fetchLimit=self.__fetchLimitTest,
                siftsAbbreviated="TEST",
            )
            lenUpd, numUpd = updater.doUpdate("2019_01", updateLimit=None)
            # The two counts are only reported; their equality is not asserted.
            logger.info("Update length %d numUpd %d", lenUpd, numUpd)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def referenceUpdaterSuite():
    """Assemble the suite holding the reference sequence assignment update test.

    Returns:
        unittest.TestSuite: suite with the single case testUpdateAssignments
    """
    return unittest.TestSuite([ReferenceSequenceAssignmentUpdaterTests("testUpdateAssignments")])


if __name__ == "__main__":
    # Run the selected suite with verbose per-test reporting.
    mySuite = referenceUpdaterSuite()
    unittest.TextTestRunner(verbosity=2).run(mySuite)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ReferenceSequenceUtilsTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 25-Mar-2019
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
#
|
|
8
|
+
##
|
|
9
|
+
"""
|
|
10
|
+
Tests for accessing reference sequence data corresponding to polymer entity sequence assignments.
|
|
11
|
+
|
|
12
|
+
(Limited tests against the mock-data repos.)
|
|
13
|
+
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
__docformat__ = "google en"
|
|
17
|
+
__author__ = "John Westbrook"
|
|
18
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
19
|
+
__license__ = "Apache 2.0"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
import time
|
|
25
|
+
import unittest
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
from rcsb.exdb.seq.ReferenceSequenceUtils import ReferenceSequenceUtils
|
|
29
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
30
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
34
|
+
logger = logging.getLogger()
|
|
35
|
+
|
|
36
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
37
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ReferenceSequenceUtilsTests(unittest.TestCase):
    """Tests of ReferenceSequenceUtils driven by the mock-data configuration."""

    def __init__(self, methodName="runTest"):
        super().__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Load the mock-data configuration and set the cache and limit options."""
        mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        sectionName = "site_info_configuration"
        self.__mockTopPath = mockDataPath
        self.__cfgOb = ConfigUtil(
            configPath=os.path.join(mockDataPath, "config", "dbload-setup-example.yml"),
            defaultSectionName=sectionName,
            mockTopPath=mockDataPath,
        )
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        exdbDirName = self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=sectionName)
        self.__exdbCacheDirPath = os.path.join(self.__cachePath, exdbDirName)
        #
        # Reference sequence test data cache -
        #
        self.__refDbCachePath = os.path.join(HERE, "test-output", "unp-data-test-cache.json")
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__useCache = False
        self.__fetchLimit = None
        #
        # Entity polymer extracted data ...
        #
        self.__entryLimit = 500
        #
        self.__mU = MarshalUtil()
        #
        self.__startTime = time.time()
        startStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), startStamp)

    def tearDown(self):
        """Log the completion time and elapsed wall-clock time of the test."""
        elapsed = time.time() - self.__startTime
        endStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), endStamp, elapsed)

    def testFetchUnp(self):
        """Test case - summarize UNP reference accession alignment matches"""
        try:
            refDbName = "UNP"
            utilOptions = {
                "exdbDirPath": self.__exdbCacheDirPath,
                "cacheKwargs": self.__cacheKwargs,
                "useCache": self.__useCache,
                "entryLimit": self.__entryLimit,
                "fetchLimit": self.__fetchLimit,
            }
            rsu = ReferenceSequenceUtils(self.__cfgOb, refDbName, **utilOptions)
            summary = rsu.getReferenceAccessionAlignSummary()
            numPrimary, numSecondary, numNone = summary
            self.assertGreaterEqual(numPrimary, 70)
            logger.info("For %r matched primary: %d secondary: %d none %d", refDbName, numPrimary, numSecondary, numNone)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def unpFetchSuite():
    """Assemble the suite holding the UNP reference sequence fetch test.

    Returns:
        unittest.TestSuite: suite with the single case testFetchUnp
    """
    return unittest.TestSuite([ReferenceSequenceUtilsTests("testFetchUnp")])


if __name__ == "__main__":
    # Run the selected suite with verbose per-test reporting.
    mySuite = unpFetchSuite()
    unittest.TextTestRunner(verbosity=2).run(mySuite)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: AnnotationExtractor.py
|
|
3
|
+
# Date: 15-Oct-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to extract selected annotation details from the exchange collections.
|
|
6
|
+
#
|
|
7
|
+
# Currently used to establish the covered annotations for scoping tree browser displays
|
|
8
|
+
# for expansive annotation hierarchies.
|
|
9
|
+
#
|
|
10
|
+
# Updates:
|
|
11
|
+
#
|
|
12
|
+
##
|
|
13
|
+
__docformat__ = "google en"
|
|
14
|
+
__author__ = "John Westbrook"
|
|
15
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
+
__license__ = "Apache 2.0"
|
|
17
|
+
|
|
18
|
+
import logging
|
|
19
|
+
|
|
20
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AnnotationExtractor(object):
    """Utilities to extract selected annotation details from the exchange collections."""

    def __init__(self, cfgOb):
        """
        Args:
            cfgOb (obj): configuration object handed unchanged to ObjectExtractor
        """
        self.__cfgOb = cfgOb

    def getUniqueIdentifiers(self, annotationType):
        """Extract unique rcsb_polymer_entity_annotation ids for the input annotation type.

        Args:
            annotationType (str): a value of rcsb_polymer_entity_annotation.type

        Returns:
            list: unique list of identifiers of annotationType (empty if the extraction fails)
        """
        return self.__extractEntityAnnotationIdentifiers(annotationType)

    @staticmethod
    def _collectAnnotationIds(objD, annotationType):
        """Collect the annotation_id values of the given type from the extracted objects.

        Objects without an rcsb_polymer_entity_annotation list, and annotation records
        lacking either "type" or "annotation_id", are skipped individually so that one
        incomplete record does not hide the remaining annotations of the same entity.

        Args:
            objD (dict): extracted objects keyed by the extractor key attribute
            annotationType (str): annotation type to select

        Returns:
            set: unique annotation identifiers of annotationType
        """
        idS = set()
        for eD in objD.values():
            annL = eD.get("rcsb_polymer_entity_annotation") if isinstance(eD, dict) else None
            if not isinstance(annL, (list, tuple)):
                continue
            for tD in annL:
                if not isinstance(tD, dict) or "type" not in tD or "annotation_id" not in tD:
                    continue
                if tD["type"] == annotationType:
                    idS.add(tD["annotation_id"])
        return idS

    def __extractEntityAnnotationIdentifiers(self, annotationType):
        """Extract unique rcsb_polymer_entity_annotation ids for the input annotation type.

        Returns:
            list: unique identifiers, or an empty list when the extraction raises
        """
        try:
            databaseName = "pdbx_core"
            collectionName = "pdbx_core_polymer_entity"
            # No selection query is applied, so every object is fetched and the
            # annotation type is filtered locally in _collectAnnotationIds().
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=None,
                selectionQuery=None,
                selectionList=["rcsb_id", "rcsb_polymer_entity_annotation.annotation_id", "rcsb_polymer_entity_annotation.type"],
            )
            eCount = obEx.getCount()
            logger.info("For type %r polymer entity annotation object count is %d", annotationType, eCount)
            idS = self._collectAnnotationIds(obEx.getObjects(), annotationType)
            logger.info("Unique identifiers %d", len(idS))
            return list(idS)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        # Keep the documented list return type on failure instead of an implicit None.
        return []
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: LigandNeighborMappingExtractor.py
|
|
3
|
+
# Date: 28-Jun-2021 jdw
|
|
4
|
+
#
|
|
5
|
+
# Utilities to extract ligand neighbor mapping details from the exchange collections.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 17-Jul-2024 dwp Stop fetching and including rcsb_ligand_neighbors.ligand_is_bound, since no longer populating that field
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
__docformat__ = "google en"
|
|
12
|
+
__author__ = "John Westbrook"
|
|
13
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
14
|
+
__license__ = "Apache 2.0"
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LigandNeighborMappingExtractor(object):
    """Utilities to extract ligand neighbor mapping details from the exchange collections."""

    def __init__(self, cfgOb):
        """
        Args:
            cfgOb (obj): configuration object handed unchanged to ObjectExtractor
        """
        self.__cfgOb = cfgOb

    def getLigandNeighbors(self):
        """Extract unique chemical component ids involved in neighbor interactions with each
        polymer and branched entity instance.

        Returns:
            dict: {'entryId_entityId': [chem_comp_id, ...], } (empty if the extraction fails)
        """
        return self.__extractLigandNeighbors()

    @staticmethod
    def _collectLigandNeighbors(objD):
        """Group the ligand component ids of the extracted instance objects by entity.

        Instances of the same entity share one 'entryId_entityId' key, so their ligand
        ids are merged. Instances without any ligand neighbor produce no key. A failure
        while processing one instance is logged and the remaining instances are still
        processed.

        Args:
            objD (dict): extracted polymer entity instance objects

        Returns:
            dict: {'entryId_entityId': [chem_comp_id, ...], } with unique ids per key
        """
        neighborD = {}
        for peiD in objD.values():
            try:
                idD = peiD["rcsb_polymer_entity_instance_container_identifiers"]
                entryId = idD["entry_id"]
                entityId = idD["entity_id"]
                ky = entryId + "_" + entityId
                for lnD in peiD.get("rcsb_ligand_neighbors", []):
                    if "ligand_comp_id" in lnD:
                        neighborD.setdefault(ky, set()).add(lnD["ligand_comp_id"])
                    else:
                        logger.warning("%s %s missing details lnD %r", entryId, entityId, lnD)
            except Exception as e:
                logger.exception("Failing with %s", str(e))
        # Sets are converted to lists; their element order is not defined.
        return {k: list(v) for k, v in neighborD.items()}

    def __extractLigandNeighbors(self):
        """Extract unique chemical component ids involved in neighbor interactions with each
        polymer and branched entity instance.

        Returns:
            dict: {'entryId_entityId': [chem_comp_id, ...], }, or an empty dict when the extraction raises
        """
        try:
            databaseName = "pdbx_core"
            collectionName = "pdbx_core_polymer_entity_instance"
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="rcsb_id",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=None,
                selectionQuery=None,
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_instance_container_identifiers.entry_id",
                    "rcsb_polymer_entity_instance_container_identifiers.entity_id",
                    "rcsb_polymer_entity_instance_container_identifiers.asym_id",
                    "rcsb_ligand_neighbors.ligand_comp_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Total neighbor count (%d)", eCount)
            rD = self._collectLigandNeighbors(obEx.getObjects())
            logger.info("Unique instance %d", len(rD))
            return rD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        # Keep the documented dict return type on failure instead of an implicit None.
        return {}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: LigandNeighborMappingProvider.py
|
|
3
|
+
# Date: 28-Jun-2021 jdw
|
|
4
|
+
#
|
|
5
|
+
# Updated:
|
|
6
|
+
#
|
|
7
|
+
##
|
|
8
|
+
"""
|
|
9
|
+
Accessors for essential ligand neighbor mapping details associated with polymer and branched
|
|
10
|
+
entity instances.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import datetime
|
|
14
|
+
import logging
|
|
15
|
+
import os.path
|
|
16
|
+
import time
|
|
17
|
+
|
|
18
|
+
from rcsb.utils.io.FileUtil import FileUtil
|
|
19
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
20
|
+
from rcsb.utils.io.StashableBase import StashableBase
|
|
21
|
+
from rcsb.exdb.seq.LigandNeighborMappingExtractor import LigandNeighborMappingExtractor
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LigandNeighborMappingProvider(StashableBase):
    """Accessors for essential ligand neighbor mapping details associated with polymer and branched
    entity instances."""

    def __init__(self, cachePath, useCache=True):
        """
        Args:
            cachePath (str): top-level cache directory; data is kept in its 'ligand-neighbor-mapping' subdirectory
            useCache (bool, optional): load an existing mapping file if present. Defaults to True.
        """
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__dirName = "ligand-neighbor-mapping"
        super(LigandNeighborMappingProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__mapD = self.__reload(self.__dirPath, useCache)

    def testCache(self, minCount=0):
        """Check that the loaded mapping holds at least minCount entries.

        Args:
            minCount (int, optional): minimum number of mapping entries required;
                                      0 accepts any state, including no loaded data.

        Returns:
            bool: True if the check passes or False otherwise
        """
        # A missing or null "mapping" section is counted as zero entries.
        mappingD = self.__mapD.get("mapping") if self.__mapD else None
        numMapped = len(mappingD) if mappingD is not None else 0
        logger.info("Cached ligand neighbor mapping count %d", numMapped)
        return minCount == 0 or (mappingD is not None and numMapped >= minCount)

    def getLigandNeighbors(self, rcsbEntityId):
        """Get the unique list of ligand neighbors for the input polymer or branched entity instance.

        Args:
            rcsbEntityId (str): entryId '_' entityId (converted to upper case for the lookup)

        Returns:
            list: [chem_comp_id, ... ] (empty when the identifier is not in the mapping)
        """
        try:
            return list(set(self.__mapD["mapping"][rcsbEntityId.upper()]))
        except Exception:
            # Best-effort lookup: unknown identifiers and missing mapping data yield an empty list.
            return []

    def reload(self):
        """Reload the mapping from the cached mapping file, if that file exists."""
        self.__mapD = self.__reload(self.__dirPath, useCache=True)

    def __reload(self, dirPath, useCache):
        """Load the cached mapping file, or make sure the data directory exists.

        Args:
            dirPath (str): directory holding the mapping data file
            useCache (bool): read the mapping file when it exists

        Returns:
            dict: content of the mapping file, or an empty dict when nothing was loaded
        """
        startTime = time.time()
        retD = {}
        ok = False
        mappingPath = self.__getMappingDataPath()
        #
        logger.info("useCache %r mappingPath %r", useCache, mappingPath)
        if useCache and self.__mU.exists(mappingPath):
            importD = self.__mU.doImport(mappingPath, fmt="json")
            # NOTE(review): what doImport returns on a failed read is not visible here;
            # anything other than a dict is treated as "no data" so the accessors keep working.
            if isinstance(importD, dict):
                retD = importD
                ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        num = len(retD["mapping"]) if retD.get("mapping") is not None else 0
        logger.info("Completed ligand mapping reload (%d) with status (%r) at %s (%.4f seconds)", num, ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return retD

    def __getMappingDataPath(self):
        """Return the path of the JSON mapping data file."""
        return os.path.join(self.__dirPath, "ligand-neighbor-mapping-data.json")

    def fetchLigandNeighborMapping(self, cfgOb):
        """Fetch ligand neighbor mapping details and store them in the mapping data file.

        The in-memory mapping is not refreshed here; call reload() to read the
        newly written file.

        Args:
            cfgOb (obj): instance configuration class ConfigUtil()

        Returns:
            bool: True for success or False otherwise
        """
        try:
            lnmEx = LigandNeighborMappingExtractor(cfgOb)
            lnD = lnmEx.getLigandNeighbors()
            if lnD is None:
                # Do not overwrite the mapping file with a null mapping after a failed extraction.
                logger.error("Ligand neighbor extraction returned no data - mapping file not updated")
                return False
            fp = self.__getMappingDataPath()
            # Use one timestamp so that the version and created fields always agree.
            now = datetime.datetime.now()
            tS = now.isoformat()
            vS = now.strftime("%Y-%m-%d")
            ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "mapping": lnD}, fmt="json", indent=3)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
|