rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ReferenceSequenceUtils.py
|
|
3
|
+
# Date: 28-Mar-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Selected utilities to integrate reference sequence information with PDB polymer entity data.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 21-Apr-2019 jdw refactor
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
__docformat__ = "google en"
|
|
12
|
+
__author__ = "John Westbrook"
|
|
13
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
14
|
+
__license__ = "Apache 2.0"
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
from rcsb.exdb.seq.EntityPolymerExtractor import EntityPolymerExtractor
|
|
20
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
21
|
+
from rcsb.utils.seq.UniProtUtils import UniProtUtils
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ReferenceSequenceUtils(object):
    """Selected utilities to integrate reference sequence information with PDB polymer entity data.

    On construction the accessions assigned to the input reference database are
    collected from the polymer entity extractor, and the corresponding reference
    entries are either read from a local cache file or fetched.

    Args:
        cfgOb: configuration object passed through to EntityPolymerExtractor
        refDbName (str): reference sequence database name (only "UNP" is fetched)
        **kwargs: optional settings read by this class -
            exdbDirPath (str): directory holding the cache files
            cacheKwargs (dict): marshalling options for the cache file (e.g. {"fmt": "pickle"})
            useCache (bool): reuse an existing cache file (default True)
            entryLimit (int): passed through to EntityPolymerExtractor
            fetchLimit (int): maximum number of accessions to fetch
            saveText (bool): passed through to UniProtUtils
    """

    def __init__(self, cfgOb, refDbName, **kwargs):
        self.__cfgOb = cfgOb
        self.__refDbName = refDbName
        self.__mU = MarshalUtil()
        #
        self.__refIdList = self.__getReferenceAssignments(refDbName, **kwargs)
        self.__refD, self.__matchD = self.__rebuildCache(refDbName, self.__refIdList, **kwargs)

    def __getReferenceAssignments(self, refDbName, **kwargs):
        """Get all accessions assigned to input reference sequence database

        Args:
            refDbName (str): reference sequence database name
            **kwargs: exdbDirPath, cacheKwargs, useCache and entryLimit are
                forwarded to EntityPolymerExtractor

        Returns:
            list: accessions returned by EntityPolymerExtractor.getRefSeqAccessions(),
                or an empty list if the extraction raises (the error is logged).
        """
        rL = []
        exdbDirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", None)
        useCache = kwargs.get("useCache", True)
        entryLimit = kwargs.get("entryLimit", None)

        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=exdbDirPath, useCache=useCache, cacheKwargs=cacheKwargs, entryLimit=entryLimit)
            eCount = epe.getEntryCount()
            rL = epe.getRefSeqAccessions(refDbName)
            logger.info("Reading polymer entity cache with repository entry count %d ref accession length %d ", eCount, len(rL))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rL

    def __rebuildCache(self, refDbName, idList, **kwargs):
        """Load the reference sequence data from the cache file or fetch it and refresh the cache.

        The cache file is named "ref-sequence-data-cache.pic" when cacheKwargs["fmt"]
        is "pickle" and "ref-sequence-data-cache.json" otherwise, and lives in exdbDirPath.
        Caching is skipped entirely when exdbDirPath or cacheKwargs is not provided.

        Args:
            refDbName (str): reference sequence database name
            idList (list): accessions to fetch when the cache is not used
            **kwargs: exdbDirPath, cacheKwargs, useCache, fetchLimit and saveText

        Returns:
            tuple: (reference entry dictionary, match information dictionary)
        """
        dirPath = kwargs.get("exdbDirPath", None)
        # Missing cache options are treated as "no caching" instead of raising on None["fmt"].
        cacheKwargs = kwargs.get("cacheKwargs", None) or {}
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)

        ext = "pic" if cacheKwargs.get("fmt") == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        # os.path.join() cannot take None, so there is no cache path without a directory.
        cacheFilePath = os.path.join(dirPath, fn) if dirPath else None
        canCache = bool(cacheFilePath and cacheKwargs)

        if dirPath:
            self.__mU.mkdir(dirPath)

        if cacheFilePath and not useCache:
            # Best-effort removal of a stale cache file; a missing file is not an error.
            try:
                os.remove(cacheFilePath)
            except OSError:
                logger.debug("No cache file removed at %s", cacheFilePath)
        #
        dD = None
        if useCache and canCache and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            if not (isinstance(dD, dict) and "refDbCache" in dD and "matchInfo" in dD):
                # An unreadable or incomplete cache falls back to a fresh fetch.
                logger.warning("Ignoring incomplete reference sequence cache %s", cacheFilePath)
                dD = None

        if dD is None:
            dD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if canCache:
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)

        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name.

        Args:
            refDbName (str): reference sequence database name (only "UNP" is fetched)
            idList (list): accessions to fetch
            saveText (bool): passed through to UniProtUtils
            fetchLimit (int): when set, only the first fetchLimit accessions are fetched

        Returns:
            dict: {"refDbName": ..., "refDbCache": ..., "matchInfo": ...}; the two
                data dictionaries are empty for an unsupported database name or
                when the fetch raises (the error is logged).
        """
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UNP":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {"refDbName": refDbName, "refDbCache": refD, "matchInfo": matchD}
            else:
                logger.warning("Unsupported reference sequence database %r - nothing fetched", refDbName)

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD

    def __dumpEntries(self, refD):
        """Log every key and value of each entry in the input reference dictionary."""
        for (eId, eDict) in refD.items():
            logger.info("------ Entry id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database.

        Returns:
            tuple: (numPrimary, numSecondary, numNone) counts of match records whose
                "matched" value is "primary", "secondary", or anything else
                (including records without a "matched" key).
        """
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for mD in self.__matchD.values():
            matched = mD.get("matched")
            if matched == "primary":
                numPrimary += 1
            elif matched == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary: %d secondary: %d none %d", numPrimary, numSecondary, numNone)
        return numPrimary, numSecondary, numNone
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ReferenceSequenceUtilsTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 22-Apr-2019
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
#
|
|
8
|
+
##
|
|
9
|
+
"""
|
|
10
|
+
Tests updating reference sequence cache
|
|
11
|
+
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
__docformat__ = "google en"
|
|
15
|
+
__author__ = "John Westbrook"
|
|
16
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
17
|
+
__license__ = "Apache 2.0"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
import os
|
|
22
|
+
import time
|
|
23
|
+
import unittest
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
from rcsb.exdb.seq.ReferenceSequenceUtils import ReferenceSequenceUtils
|
|
27
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
28
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
32
|
+
logger = logging.getLogger()
|
|
33
|
+
|
|
34
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
35
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ReferenceSequenceUtilsTests(unittest.TestCase):
    """Tests for building the reference sequence cache with ReferenceSequenceUtils."""

    def __init__(self, methodName="runTest"):
        super(ReferenceSequenceUtilsTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Load the mock-data configuration and define the cache locations used by the test."""
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        # Caution: this is very site specific setting !
        configName = "site_info_remote"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        if configName != "site_info_configuration":
            self.__cfgOb.replaceSectionName("site_info_configuration", configName)
        #
        self.__workPath = os.path.join(HERE, "test-cache-preserve")
        #
        self.__entityPolymerCachePath = os.path.join(self.__workPath, "entity-polymer-data-cache.pic")
        self.__entityPolymerCacheKwargs = {"fmt": "pickle"}
        self.__useEntityPolymerCache = True
        #
        self.__refDbCachePath = os.path.join(self.__workPath, "unp-data-test-cache.json")
        self.__refDbCacheKwargs = {"fmt": "json", "indent": 3}
        #
        self.__refDbUseCache = True
        self.__fetchLimit = 500
        #
        self.__mU = MarshalUtil()
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log the completion time and elapsed time of the test."""
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testUpdateUniProtCache(self):
        """Test case - extract entity polymer info and update reference sequence cache"""
        try:
            refDbName = "UNP"
            # NOTE(review): the keyword names below (referenceCachePath, entityPolymerCachePath, ...)
            # may not match what the imported ReferenceSequenceUtils actually reads - confirm
            # against that class before relying on these cache settings.
            rsu = ReferenceSequenceUtils(
                self.__cfgOb,
                refDbName,
                referenceCachePath=self.__refDbCachePath,
                referenceCacheKwargs=self.__refDbCacheKwargs,
                useReferenceCache=self.__refDbUseCache,
                entityPolymerCachePath=self.__entityPolymerCachePath,
                entityPolymerCacheKwargs=self.__entityPolymerCacheKwargs,
                useEntityPolymerCache=self.__useEntityPolymerCache,
                fetchLimit=self.__fetchLimit,
            )
            numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary()
            self.assertGreaterEqual(numPrimary, 70)
            logger.info("For %r matched primary: %d secondary: %d none %d", refDbName, numPrimary, numSecondary, numNone)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # Carry the reason into the test report instead of failing with an empty message.
            self.fail("Failing with %s" % str(e))
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def unpFetchSuite():
    """Assemble the suite holding the UniProt reference cache update test."""
    selectedTests = unittest.TestSuite()
    for testName in ("testUpdateUniProtCache",):
        selectedTests.addTest(ReferenceSequenceUtilsTests(testName))
    return selectedTests
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
if __name__ == "__main__":
    # Run the selected suite with verbose reporting when executed as a script.
    mySuite = unpFetchSuite()
    testRunner = unittest.TextTestRunner(verbosity=2)
    testRunner.run(mySuite)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ObjectExtractorTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 25-Apr-2019
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
#
|
|
8
|
+
##
|
|
9
|
+
"""
|
|
10
|
+
Tests for the extractor of selected values from collections (limited tests against the mock-data repos)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__docformat__ = "google en"
|
|
14
|
+
__author__ = "John Westbrook"
|
|
15
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
16
|
+
__license__ = "Apache 2.0"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import os
|
|
21
|
+
|
|
22
|
+
import time
|
|
23
|
+
import unittest
|
|
24
|
+
|
|
25
|
+
from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
|
|
26
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
27
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
28
|
+
|
|
29
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
30
|
+
logger = logging.getLogger()
|
|
31
|
+
|
|
32
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
33
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ObjectExtractorTests(unittest.TestCase):
    """Tests extracting selected attributes from the polymer entity collection."""

    def __init__(self, methodName="runTest"):
        super(ObjectExtractorTests, self).__init__(methodName)
        self.__verbose = False

    def setUp(self):
        """Load the mock-data configuration and define the output path for the taxonomy list."""
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        configName = "site_info_remote_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        #
        self.__workPath = "."
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__entityTaxonPath = os.path.join(self.__workPath, "entity_taxon.tdd")
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log the completion time and elapsed time of the test."""
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testExtractEntityTaxonomyContent(self):
        """Test case - extract unique entity source and host taxonomies"""
        tL = []
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                selectionQuery=None,
                selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count is %d", eCount)
            objD = obEx.getObjects()
            sD = {}
            hD = {}
            for rId, eD in objD.items():
                # Entities without organism details are skipped rather than treated as errors.
                try:
                    for tD in eD["rcsb_entity_source_organism"]:
                        sD.setdefault(rId, []).append(str(tD["ncbi_taxonomy_id"]))
                except (KeyError, TypeError):
                    pass
                try:
                    for tD in eD["rcsb_entity_host_organism"]:
                        hD.setdefault(rId, []).append(str(tD["ncbi_taxonomy_id"]))
                except (KeyError, TypeError):
                    pass
            logger.info("Entities with source taxonomy %d and host taxonomy %d", len(sD), len(hD))
            for rId, taxIdL in sD.items():
                # One tab-delimited line per entity with its unique, sorted source taxonomy ids.
                tS = "|".join(sorted(set(taxIdL)))
                if tS:
                    tL.append("%s\t%s" % (rId, tS))
            ok = self.__mU.doExport(self.__entityTaxonPath, tL, fmt="list")
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # Carry the reason into the test report instead of failing with an empty message.
            self.fail("Failing with %s" % str(e))
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def objectExtractorSuite():
    """Assemble the suite holding the entity taxonomy extraction test."""
    selectedTests = unittest.TestSuite()
    for testName in ("testExtractEntityTaxonomyContent",):
        selectedTests.addTest(ObjectExtractorTests(testName))
    return selectedTests
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
if __name__ == "__main__":
    # Run the selected suite with verbose reporting when executed as a script.
    mySuite = objectExtractorSuite()
    testRunner = unittest.TextTestRunner(verbosity=2)
    testRunner.run(mySuite)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: EntityPolymerExtractorFixture.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 25-Mar-2019
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
# 21-Apr-2019 jdw Separate tests against the mock-data repo in this module
|
|
8
|
+
# 4-Sep-201 jdw make this a fixture
|
|
9
|
+
#
|
|
10
|
+
##
|
|
11
|
+
"""
|
|
12
|
+
Fixture extractor to preserve entity polymer data.
|
|
13
|
+
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
__docformat__ = "google en"
|
|
17
|
+
__author__ = "John Westbrook"
|
|
18
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
19
|
+
__license__ = "Apache 2.0"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
import os
|
|
24
|
+
import time
|
|
25
|
+
import unittest
|
|
26
|
+
|
|
27
|
+
from rcsb.exdb.seq.EntityPolymerExtractor import EntityPolymerExtractor
|
|
28
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
29
|
+
|
|
30
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
31
|
+
logger = logging.getLogger()
|
|
32
|
+
|
|
33
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
34
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class EntityPolymerExtractorFixture(unittest.TestCase):
    """Fixture that rebuilds and preserves the entity polymer data cache."""

    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorFixture, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Read the mock-data configuration and resolve the cache directory for the fixture."""
        sectionName = "site_info_configuration"
        mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        cfgFilePath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        self.__mockTopPath = mockDataPath
        self.__cfgOb = ConfigUtil(configPath=cfgFilePath, defaultSectionName=sectionName, mockTopPath=self.__mockTopPath)
        # Cache files are written below TOPDIR/CACHE in the configured EXDB cache directory.
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cacheKwargs = {"fmt": "pickle"}
        exdbDirName = self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=sectionName)
        self.__exdbCacheDirPath = os.path.join(self.__cachePath, exdbDirName)
        # No limit on the number of entries processed.
        self.__entryLimitTest = None
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Report when the fixture finished and how long it ran."""
        elapsed = time.time() - self.__startTime
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), elapsed)

    def testExtractEntityPolymers(self):
        """Fixture - extract and save entity polymer info"""
        try:
            extractor = EntityPolymerExtractor(
                self.__cfgOb,
                exdbDirPath=self.__exdbCacheDirPath,
                useCache=False,
                cacheKwargs=self.__cacheKwargs,
                entryLimit=self.__entryLimitTest,
            )
            self.assertGreaterEqual(extractor.getEntryCount(), 10)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def entityPolymerExtractSuite():
    """Assemble the suite holding the entity polymer extraction fixture."""
    selectedTests = unittest.TestSuite()
    for testName in ("testExtractEntityPolymers",):
        selectedTests.addTest(EntityPolymerExtractorFixture(testName))
    return selectedTests
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":
    # Run the selected suite with verbose reporting when executed as a script.
    mySuite = entityPolymerExtractSuite()
    testRunner = unittest.TextTestRunner(verbosity=2)
    testRunner.run(mySuite)
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: EntityInstanceExtractorTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 19-Dec-2019
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
#
|
|
8
|
+
##
|
|
9
|
+
"""
|
|
10
|
+
Tests for a preliminary version of the extractor of selected values from entity instance collections.
|
|
11
|
+
|
|
12
|
+
PRELIMINARY VERSION
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
__docformat__ = "google en"
|
|
16
|
+
__author__ = "John Westbrook"
|
|
17
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
18
|
+
__license__ = "Apache 2.0"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
import time
|
|
24
|
+
import unittest
|
|
25
|
+
|
|
26
|
+
from rcsb.exdb.seq.EntityInstanceExtractor import EntityInstanceExtractor
|
|
27
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
28
|
+
from rcsb.utils.io.MarshalUtil import MarshalUtil
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
32
|
+
logger = logging.getLogger()
|
|
33
|
+
|
|
34
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
35
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class EntityInstanceExtractorTests(unittest.TestCase):
    """Tests for the preliminary EntityInstanceExtractor against the configured repository."""

    def __init__(self, methodName="runTest"):
        super(EntityInstanceExtractorTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Load the mock-data configuration and define the output paths used by the tests."""
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        # self.__cfgOb.dump()
        self.__resourceName = "MONGO_DB"
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = None
        self.__filterType = "assign-dates"
        #
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__entitySavePath = os.path.join(HERE, "test-output", "entity-data-dictionary.json")
        self.__entrySavePath = os.path.join(HERE, "test-output", "entry-data-dictionary.json")
        self.__instanceSavePath = os.path.join(HERE, "test-output", "instance-data-dictionary.json")
        self.__saveKwargs = {"fmt": "json", "indent": 3}
        self.__mU = MarshalUtil()
        self.__entryLimit = 3
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log the completion time and elapsed time of the test."""
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testExtractEntryInfo(self):
        """Test case - extract entry instance data -"""
        try:
            eiExt = EntityInstanceExtractor(self.__cfgOb)
            entryD = eiExt.getEntryInfo()
            # assertGreater reports both values on failure, unlike assertTrue on a comparison.
            self.assertGreater(len(entryD), 15)
            ok = self.__mU.doExport(self.__entrySavePath, entryD, fmt="json")
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testExtractEntityPolymers(self):
        """Test case - extract entity polymer instance data -"""
        try:
            eiExt = EntityInstanceExtractor(self.__cfgOb)
            entryD = eiExt.getEntryInfo()
            self.assertGreater(len(entryD), 15)
            ok = self.__mU.doExport(self.__entrySavePath, entryD, fmt="json")
            self.assertTrue(ok)
            logger.info("EntryD length %d", len(entryD))
            # Continue from the saved copy so the exported file is exercised as well.
            entryD = self.__mU.doImport(self.__entrySavePath, fmt="json")
            #
            entryD = eiExt.getPolymerEntities(entryD, savePath=self.__entitySavePath, entryLimit=None, saveKwargs=self.__saveKwargs)
            self.assertGreater(len(entryD), 15)
            logger.info("EntryD + polymer entities length %d", len(entryD))
            #
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testExtractEntityInstances(self):
        """Test case - extract entity instance data -"""
        try:
            eiExt = EntityInstanceExtractor(self.__cfgOb)
            entryD = eiExt.getEntryInfo()
            self.assertGreater(len(entryD), 15)
            #
            entryD = eiExt.getPolymerEntities(entryD, savePath=self.__entitySavePath, entryLimit=None, saveKwargs=self.__saveKwargs)
            self.assertGreater(len(entryD), 15)
            #
            entryD = eiExt.getEntityInstances(entryD, savePath=self.__instanceSavePath, entryLimit=self.__entryLimit, saveKwargs=self.__saveKwargs)
            self.assertGreater(len(entryD), 15)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def testAnalEntityInstances(self):
        """Test case - analysis of entity instance data -"""
        try:
            eiExt = EntityInstanceExtractor(self.__cfgOb)
            entryD = eiExt.getEntryInfo()
            self.assertGreater(len(entryD), 15)
            #
            entryD = eiExt.getPolymerEntities(entryD, savePath=self.__entitySavePath, entryLimit=None, saveKwargs=self.__saveKwargs)
            self.assertGreater(len(entryD), 15)
            #
            entryD = eiExt.getEntityInstances(entryD, savePath=self.__instanceSavePath, entryLimit=self.__entryLimit, saveKwargs=self.__saveKwargs)
            self.assertGreater(len(entryD), 15)

            logger.info("EntryD + polymer entities instances length %d", len(entryD))
            #
            # entryD = self.__mU.doImport(self.__instanceSavePath, fmt="json")
            # logger.info("entryD %r", entryD)
            for entryId in entryD:
                for entityId, eD in entryD[entryId]["selected_polymer_entities"].items():
                    # Entities without analysis results contribute nothing to the log.
                    analD = eD.get("anal_instances", {})
                    for asymId, aD in analD.items():
                        logger.debug("entryId %s entityId %s asymId %s analD: %r", entryId, entityId, asymId, aD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def entityInstanceExtractSuite():
    """Assemble the suite of polymer entity, instance and analysis extraction tests."""
    selectedTests = unittest.TestSuite()
    for testName in ("testExtractEntityPolymers", "testExtractEntityInstances", "testAnalEntityInstances"):
        selectedTests.addTest(EntityInstanceExtractorTests(testName))
    return selectedTests
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def entryExtractSuite():
    """Assemble the suite holding the entry information extraction test."""
    selectedTests = unittest.TestSuite()
    for testName in ("testExtractEntryInfo",):
        selectedTests.addTest(EntityInstanceExtractorTests(testName))
    return selectedTests
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
if __name__ == "__main__":
    # Run the entity instance suite first, then the entry information suite,
    # each with its own verbose runner.
    mySuite = entityInstanceExtractSuite()
    instanceRunner = unittest.TextTestRunner(verbosity=2)
    instanceRunner.run(mySuite)

    mySuite = entryExtractSuite()
    entryRunner = unittest.TextTestRunner(verbosity=2)
    entryRunner.run(mySuite)
|