rcsb.exdb 1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rcsb/__init__.py +1 -0
- rcsb/exdb/__init__.py +1 -0
- rcsb/exdb/branch/BranchedEntityExtractor.py +82 -0
- rcsb/exdb/branch/GlycanProvider.py +116 -0
- rcsb/exdb/branch/GlycanUtils.py +114 -0
- rcsb/exdb/branch/__init__.py +0 -0
- rcsb/exdb/chemref/ChemRefEtlWorker.py +118 -0
- rcsb/exdb/chemref/ChemRefExtractor.py +70 -0
- rcsb/exdb/chemref/ChemRefMappingProvider.py +139 -0
- rcsb/exdb/chemref/PubChemDataCacheProvider.py +372 -0
- rcsb/exdb/chemref/PubChemEtlWrapper.py +280 -0
- rcsb/exdb/chemref/PubChemIndexCacheProvider.py +638 -0
- rcsb/exdb/chemref/__init__.py +0 -0
- rcsb/exdb/citation/CitationAdapter.py +91 -0
- rcsb/exdb/citation/CitationExtractor.py +190 -0
- rcsb/exdb/citation/CitationUtils.py +51 -0
- rcsb/exdb/citation/__init__.py +0 -0
- rcsb/exdb/cli/__init__.py +0 -0
- rcsb/exdb/entry/EntryInfoProvider.py +148 -0
- rcsb/exdb/entry/__init__.py +0 -0
- rcsb/exdb/examples-seq/EntityInstanceExtractor.py +557 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractor.py +544 -0
- rcsb/exdb/examples-seq/EntityPolymerExtractorFullTests.py +176 -0
- rcsb/exdb/examples-seq/ReferenceSequenceAssignmentUpdater.py +449 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtils.py +123 -0
- rcsb/exdb/examples-seq/ReferenceSequenceUtilsTests.py +109 -0
- rcsb/exdb/examples-seq/exampleObjectExtractor.py +109 -0
- rcsb/exdb/examples-seq/fixtureEntityPolymerExtractor.py +85 -0
- rcsb/exdb/examples-seq/testEntityInstanceExtractor.py +170 -0
- rcsb/exdb/examples-seq/testEntityPolymerExtractor.py +171 -0
- rcsb/exdb/examples-seq/testReferenceSequenceAssignmentUpdater.py +79 -0
- rcsb/exdb/examples-seq/testReferenceSequenceUtils.py +108 -0
- rcsb/exdb/seq/AnnotationExtractor.py +76 -0
- rcsb/exdb/seq/LigandNeighborMappingExtractor.py +84 -0
- rcsb/exdb/seq/LigandNeighborMappingProvider.py +106 -0
- rcsb/exdb/seq/PolymerEntityExtractor.py +328 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +598 -0
- rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +228 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +534 -0
- rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +388 -0
- rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +397 -0
- rcsb/exdb/seq/TaxonomyExtractor.py +69 -0
- rcsb/exdb/seq/UniProtCoreEtlWorker.py +177 -0
- rcsb/exdb/seq/UniProtExtractor.py +80 -0
- rcsb/exdb/seq/__init__.py +0 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-EXEC.sh +19 -0
- rcsb/exdb/tests/TEST-EXDB-CLI-REFSEQ-EXEC.sh +12 -0
- rcsb/exdb/tests/__init__.py +0 -0
- rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +104 -0
- rcsb/exdb/tests/fixturePdbxLoader.py +298 -0
- rcsb/exdb/tests/test-data/components-abbrev.cif +2739 -0
- rcsb/exdb/tests/test-data/prdcc-abbrev.cif +9171 -0
- rcsb/exdb/tests/testAnnotationExtractor.py +79 -0
- rcsb/exdb/tests/testBranchedEntityExtractor.py +81 -0
- rcsb/exdb/tests/testChemRefLoader.py +106 -0
- rcsb/exdb/tests/testChemRefMappingProvider.py +95 -0
- rcsb/exdb/tests/testCitationAdapter.py +97 -0
- rcsb/exdb/tests/testCitationExtractor.py +93 -0
- rcsb/exdb/tests/testCitationUtils.py +92 -0
- rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testEntryInfoProvider.py +97 -0
- rcsb/exdb/tests/testGlycanEtlWorkflow.py +70 -0
- rcsb/exdb/tests/testGlycanProvider.py +98 -0
- rcsb/exdb/tests/testGlycanUtils.py +64 -0
- rcsb/exdb/tests/testLigandNeighborMappingProvider.py +90 -0
- rcsb/exdb/tests/testObjectExtractor.py +342 -0
- rcsb/exdb/tests/testObjectTransformer.py +83 -0
- rcsb/exdb/tests/testObjectUpdater.py +120 -0
- rcsb/exdb/tests/testPolymerEntityExtractor.py +93 -0
- rcsb/exdb/tests/testPubChemDataCacheProvider.py +124 -0
- rcsb/exdb/tests/testPubChemEtlWorkflow.py +134 -0
- rcsb/exdb/tests/testPubChemEtlWrapper.py +155 -0
- rcsb/exdb/tests/testPubChemIndexCacheProvider.py +123 -0
- rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +106 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +121 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +122 -0
- rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +117 -0
- rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +94 -0
- rcsb/exdb/tests/testTaxonomyExtractor.py +75 -0
- rcsb/exdb/tests/testTreeNodeListWorker.py +111 -0
- rcsb/exdb/tests/testUniProtCoreEtlWorker.py +99 -0
- rcsb/exdb/tests/testUniProtExtractor.py +77 -0
- rcsb/exdb/tree/TreeNodeListWorker.py +228 -0
- rcsb/exdb/tree/__init__.py +0 -0
- rcsb/exdb/utils/ObjectAdapterBase.py +22 -0
- rcsb/exdb/utils/ObjectExtractor.py +286 -0
- rcsb/exdb/utils/ObjectTransformer.py +124 -0
- rcsb/exdb/utils/ObjectUpdater.py +121 -0
- rcsb/exdb/utils/ObjectValidator.py +160 -0
- rcsb/exdb/utils/__init__.py +0 -0
- rcsb/exdb/wf/EntryInfoEtlWorkflow.py +71 -0
- rcsb/exdb/wf/GlycanEtlWorkflow.py +76 -0
- rcsb/exdb/wf/PubChemEtlWorkflow.py +240 -0
- rcsb/exdb/wf/__init__.py +0 -0
- rcsb_exdb-1.31.dist-info/METADATA +103 -0
- rcsb_exdb-1.31.dist-info/RECORD +98 -0
- rcsb_exdb-1.31.dist-info/WHEEL +4 -0
- rcsb_exdb-1.31.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: UniProtCoreEtlWorkerTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 9-Dec-2018
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
#
|
|
8
|
+
##
|
|
9
|
+
"""
|
|
10
|
+
Tests for loading UniProt core collection
|
|
11
|
+
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
__docformat__ = "google en"
|
|
15
|
+
__author__ = "John Westbrook"
|
|
16
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
17
|
+
__license__ = "Apache 2.0"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
import os
|
|
22
|
+
import platform
|
|
23
|
+
import resource
|
|
24
|
+
import time
|
|
25
|
+
import unittest
|
|
26
|
+
|
|
27
|
+
from rcsb.exdb.seq.UniProtCoreEtlWorker import UniProtCoreEtlWorker
|
|
28
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
29
|
+
|
|
30
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
31
|
+
logger = logging.getLogger()
|
|
32
|
+
|
|
33
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
34
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class UniProtCoreEtlWorkerTests(unittest.TestCase):
    """Exercise UniProtCoreEtlWorker.load() using the mock-data configuration.

    Both test cases carry a skip decorator (marked deprecated), so neither the
    fixtures nor the test bodies run under a normal test session.
    """

    def __init__(self, methodName="runTest"):
        super(UniProtCoreEtlWorkerTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Create the configuration object, cache path and update id, then start the timer."""
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        cfgFilePath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        sectionName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(
            configPath=cfgFilePath,
            defaultSectionName=sectionName,
            mockTopPath=self.__mockTopPath,
        )
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        # sample data set
        self.__updateId = "2018_23"
        self.__startTime = time.time()
        startStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), startStamp)

    def tearDown(self):
        """Log the peak resident memory size and the elapsed wall-clock time of the test."""
        # The raw ru_maxrss value is scaled by 10**6 and labelled per platform --
        # presumably because it is reported in bytes on macOS and kilobytes elsewhere (confirm).
        if platform.system() == "Darwin":
            unitS = "MB"
        else:
            unitS = "GB"
        peakRss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", peakRss / 10 ** 6, unitS)
        finishTime = time.time()
        finishStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), finishStamp, finishTime - self.__startTime)

    @unittest.skip("Disable test - deprecated")
    def testLoadUniProtCore(self):
        """Test case - load UniProt core collection reference data -"""
        try:
            worker = UniProtCoreEtlWorker(self.__cfgOb, self.__cachePath)
            status = worker.load(self.__updateId, extResource="UniProt", loadType="full")
            self.assertTrue(status)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()

    @unittest.skip("Disable test - deprecated")
    def testValidateUniProtCore(self):
        """Test case - validate UniProt core collection reference data -"""
        try:
            worker = UniProtCoreEtlWorker(self.__cfgOb, self.__cachePath, doValidate=True)
            status = worker.load(self.__updateId, extResource="UniProt", loadType="full")
            self.assertTrue(status)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def uniProtCoreEtlWorkerSuite():
    """Assemble the UniProt core ETL worker test cases into a single suite.

    Returns:
        unittest.TestSuite: suite holding the load and validate test cases, in that order
    """
    suiteSelect = unittest.TestSuite()
    for testName in ("testLoadUniProtCore", "testValidateUniProtCore"):
        suiteSelect.addTest(UniProtCoreEtlWorkerTests(testName))
    return suiteSelect
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
if __name__ == "__main__":
    # Script entry point: run the selected test cases with per-test (verbose) reporting.
    mySuite = uniProtCoreEtlWorkerSuite()
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(mySuite)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: UniProtExtractorTests.py
|
|
3
|
+
# Author: J. Westbrook
|
|
4
|
+
# Date: 5-Dec-2020
|
|
5
|
+
#
|
|
6
|
+
# Updates:
|
|
7
|
+
#
|
|
8
|
+
##
|
|
9
|
+
"""
|
|
10
|
+
Tests for extraction of UniProt reference sequence details from the ExDB UniProt collection.
|
|
11
|
+
"""
|
|
12
|
+
__docformat__ = "google en"
|
|
13
|
+
__author__ = "John Westbrook"
|
|
14
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
15
|
+
__license__ = "Apache 2.0"
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import platform
|
|
20
|
+
import resource
|
|
21
|
+
import time
|
|
22
|
+
import unittest
|
|
23
|
+
|
|
24
|
+
from rcsb.exdb.seq.UniProtExtractor import UniProtExtractor
|
|
25
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
29
|
+
logger = logging.getLogger()
|
|
30
|
+
|
|
31
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
32
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class UniProtExtractorTests(unittest.TestCase):
    """Check extraction of UniProt reference sequence details using the mock-data configuration."""

    def __init__(self, methodName="runTest"):
        super(UniProtExtractorTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        """Create the configuration object from the example setup file and start the timer."""
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        cfgFilePath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        sectionName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(
            configPath=cfgFilePath,
            defaultSectionName=sectionName,
            mockTopPath=self.__mockTopPath,
        )
        self.__startTime = time.time()
        startStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), startStamp)

    def tearDown(self):
        """Log the peak resident memory size and the elapsed wall-clock time of the test."""
        # The raw ru_maxrss value is scaled by 10**6 and labelled per platform --
        # presumably because it is reported in bytes on macOS and kilobytes elsewhere (confirm).
        if platform.system() == "Darwin":
            unitS = "MB"
        else:
            unitS = "GB"
        peakRss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", peakRss / 10 ** 6, unitS)
        finishTime = time.time()
        finishStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), finishStamp, finishTime - self.__startTime)

    def testGetUniProtDetails(self):
        """Test case - get UniProt reference sequences and essential details"""
        try:
            extractor = UniProtExtractor(self.__cfgOb)
            detailsD = extractor.getReferenceSequenceDetails()
            logger.info("UniProt count %d", len(detailsD))
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def extractorSuite():
    """Assemble the UniProtExtractor test cases run by the command-line entry point.

    Returns:
        unittest.TestSuite: suite holding the UniProt reference sequence details test case
    """
    suiteSelect = unittest.TestSuite()
    # The name passed here must be an existing method of the test class: unittest raises
    # ValueError when a TestCase is constructed with an unknown method name.
    suiteSelect.addTest(UniProtExtractorTests("testGetUniProtDetails"))
    return suiteSelect
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
if __name__ == "__main__":
    # Script entry point: run the selected test cases with per-test (verbose) reporting.
    mySuite = extractorSuite()
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(mySuite)
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: TreeNodeListWorker.py
|
|
3
|
+
# Date: 9-Apr-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Loading worker for tree node list data.
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 9-Sep-2019 jdw add AtcProvider() and ChemrefExtractor() for ATC tree.
|
|
9
|
+
# 12-Apr-2023 dwp add CARD ontology tree
|
|
10
|
+
# 8-Aug-2023 dwp Load full (unfiltered) taxonomy tree node list, and stop loading GO tree (will be loaded in DW instead)
|
|
11
|
+
# 27-Aug-2024 dwp Update CARD ontology tree loading
|
|
12
|
+
# 23-Jan-2025 dwp Change indexed field from 'update_id' to 'id'
|
|
13
|
+
# 7-Aug-2025 dwp Change target DB and collection names to "dw" and "tree_*" (via configuration file);
|
|
14
|
+
# Make use of configuration file for loading tree node lists and setting indexed fields
|
|
15
|
+
#
|
|
16
|
+
##
|
|
17
|
+
__docformat__ = "google en"
|
|
18
|
+
__author__ = "John Westbrook"
|
|
19
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
20
|
+
__license__ = "Apache 2.0"
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
import os.path
|
|
24
|
+
|
|
25
|
+
from rcsb.db.mongo.DocumentLoader import DocumentLoader
|
|
26
|
+
from rcsb.db.processors.DataExchangeStatus import DataExchangeStatus
|
|
27
|
+
from rcsb.exdb.chemref.ChemRefExtractor import ChemRefExtractor
|
|
28
|
+
from rcsb.utils.chemref.AtcProvider import AtcProvider
|
|
29
|
+
from rcsb.utils.ec.EnzymeDatabaseProvider import EnzymeDatabaseProvider
|
|
30
|
+
from rcsb.utils.targets.CARDTargetOntologyProvider import CARDTargetOntologyProvider
|
|
31
|
+
from rcsb.utils.struct.CathClassificationProvider import CathClassificationProvider
|
|
32
|
+
from rcsb.utils.struct.EcodClassificationProvider import EcodClassificationProvider
|
|
33
|
+
from rcsb.utils.struct.ScopClassificationProvider import ScopClassificationProvider
|
|
34
|
+
from rcsb.utils.struct.Scop2ClassificationProvider import Scop2ClassificationProvider
|
|
35
|
+
from rcsb.utils.taxonomy.TaxonomyProvider import TaxonomyProvider
|
|
36
|
+
from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class TreeNodeListWorker(object):
    """Gather tree node lists (taxonomy, EC, SCOP, SCOP2, CATH, ATC, CARD, ECOD) and load them as document collections."""

    def __init__(self, cfgOb, cachePath, numProc=1, chunkSize=10, maxStepLength=4000, readBackCheck=False, documentLimit=None, verbose=False, useCache=False, useFilteredLists=False):
        """Store the loader settings; no data is read or loaded until load() is called.

        Args:
            cfgOb (object): configuration object; read through cfgOb.get(name, sectionName=...)
            cachePath (str): cache directory path (stored as an absolute path)
            numProc (int, optional): passed through to DocumentLoader. Defaults to 1.
            chunkSize (int, optional): passed through to DocumentLoader. Defaults to 10.
            maxStepLength (int, optional): passed through to DocumentLoader. Defaults to 4000.
            readBackCheck (bool, optional): passed through to DocumentLoader. Defaults to False.
            documentLimit (int, optional): passed through to DocumentLoader. Defaults to None.
            verbose (bool, optional): passed through to DocumentLoader. Defaults to False.
            useCache (bool, optional): passed through to the tree data providers. Defaults to False.
            useFilteredLists (bool, optional): restrict the taxonomy tree to the lineages of the taxa
                                               reported by TaxonomyExtractor. Defaults to False.
        """
        self.__cfgOb = cfgOb
        self.__cachePath = os.path.abspath(cachePath)
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__maxStepLength = maxStepLength
        self.__documentLimit = documentLimit
        self.__resourceName = "MONGO_DB"
        self.__filterType = "assign-dates"
        self.__verbose = verbose
        self.__statusList = []
        self.__useCache = useCache
        self.__useFilteredLists = useFilteredLists

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a DataExchangeStatus record for one collection to the internal status list.

        Args:
            updateId (str): update identifier recorded with the status
            databaseName (str): target database name
            collectionName (str): target collection name
            status (bool): outcome of the load; recorded as "Y" when truthy, otherwise "N"
            startTimestamp (object): start time value handed to DataExchangeStatus.setStartTime()

        Returns:
            bool: True if the record was appended, False if an exception was raised
        """
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def load(self, updateId, loadType="full", doLoad=True):
        """Load tree node lists and status data -

        Relevant configuration options:

        tree_node_lists_configuration:
            DATABASE_NAME: dw
            COLLECTION_VERSION_STRING: 2.1.0
            COLLECTION_NAME_LIST:
                - tree_taxonomy
                - tree_ec
                - tree_scop
                - tree_scop2
                - tree_cath
                - tree_atc
                - tree_card
                - tree_ecod
            COLLECTION_INDICES:
                - INDEX_NAME: primary
                  ATTRIBUTE_NAMES:
                    - id
                - INDEX_NAME: index_2
                  ATTRIBUTE_NAMES:
                    - parents

        Args:
            updateId (str): update identifier recorded in the per-collection status records
            loadType (str, optional): load type passed to DocumentLoader.load(). Defaults to "full".
            doLoad (bool, optional): when False the node lists are gathered but not loaded. Defaults to True.

        Returns:
            bool: True if every attempted collection load succeeded (or nothing was attempted),
                  False if any load failed or an exception was raised
        """
        try:
            useCache = self.__useCache
            logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
            #
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                maxStepLength=self.__maxStepLength,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            sectionName = "tree_node_lists_configuration"
            databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
            collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
            # No extra values (e.g. a "_schema_version" taken from COLLECTION_VERSION_STRING) are added to the documents.
            addValues = None

            ok = True
            for collectionName in collectionNameList:
                nL = self.__getTreeDocList(collectionName, useCache)
                if nL and doLoad:
                    # Keep the outcome of this collection separate from the overall outcome so that
                    # one failed collection does not mark every later collection as failed.
                    okC = dl.load(
                        databaseNameMongo,
                        collectionName,
                        loadType=loadType,
                        documentList=nL,
                        keyNames=None,
                        addValues=addValues,
                        schemaLevel=None,
                        indexDL=collectionIndexList,
                    )
                    ok = bool(okC) and ok
                    self.__updateStatus(updateId, databaseNameMongo, collectionName, okC, statusStartTimestamp)
                    logger.info(
                        "Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
                        databaseNameMongo, collectionName, len(nL), okC
                    )
            # ---
            logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
            # Report the accumulated outcome rather than an unconditional success.
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __checkTaxonNodeList(self, nL):
        """Log taxonomy nodes that have no parent or whose first parent is not in the node list.

        Args:
            nL (list): node dictionaries, each with an "id" key and optionally a "parents" list

        Returns:
            int: number of nodes whose first parent identifier is missing from the node list
        """
        eCount = 0
        idS = {dD["id"] for dD in nL}
        for dD in nL:
            parentL = dD.get("parents")
            if not parentL:
                # Covers both a missing "parents" key and an empty parent list.
                logger.info("No parents for node %r", dD["id"])
                continue
            pId = parentL[0]
            if pId not in idS:
                logger.info("Missing parent %r for taxon %r", pId, dD["id"])
                eCount += 1
        return eCount

    def getLoadStatus(self):
        """Return the list of status records accumulated by the most recent load() call."""
        return self.__statusList

    def __getCardNodeList(self, useCache):
        """Return the CARD target ontology node list, building the ontology data first if it is not cached.

        Returns an empty list (after logging an error) when the data cannot be built.
        """
        cou = CARDTargetOntologyProvider(cachePath=self.__cachePath, useCache=useCache)
        if not cou.testCache():
            ok = cou.buildOntologyData()
            cou.reload()
            if not (ok and cou.testCache()):
                logger.error("Skipping load of CARD Target Ontology tree data because it is missing.")
                return []
        return cou.getTreeNodeList()

    def __getTaxonomyNodeList(self, useCache):
        """Return the taxonomy node list, optionally filtered to the lineages of the taxa in the current data set."""
        tU = TaxonomyProvider(cachePath=self.__cachePath, useCache=useCache)
        if self.__useFilteredLists:
            # Get the taxon coverage in the current data set -
            epe = TaxonomyExtractor(self.__cfgOb)
            tL = epe.getUniqueTaxons()
            logger.info("Taxon coverage length %d", len(tL))
            # Seed the filter with taxon 1 and keep it a dictionary throughout, matching the
            # dictionary updates below and the filterD argument it is passed as.
            fD = {1: True}
            for taxId in tL:
                fD.update({k: True for k in tU.getLineage(taxId)})
            logger.info("Taxon filter dictionary length %d", len(fD))
            logger.debug("fD %r", sorted(fD))
            nL = tU.exportNodeList(filterD=fD)
        else:
            # Get the full taxon node list without filtering
            nL = tU.exportNodeList()
        self.__checkTaxonNodeList(nL)
        return nL

    def __getAtcNodeList(self, useCache):
        """Return the ATC node list filtered by the chemical component to ATC accession mapping."""
        crEx = ChemRefExtractor(self.__cfgOb)
        atcFilterD = crEx.getChemCompAccessionMapping("ATC")
        logger.info("Length of ATC filter %d", len(atcFilterD))
        atcP = AtcProvider(cachePath=self.__cachePath, useCache=useCache)
        return atcP.getTreeNodeList(filterD=atcFilterD)

    def __getTreeDocList(self, collectionName, useCache):
        """Gather the tree node documents for the named collection.

        Args:
            collectionName (str): collection name (matched case-insensitively), e.g. "tree_cath"
            useCache (bool): passed through to the underlying data provider

        Returns:
            list: tree node documents; empty for an unsupported collection name
                  or when the CARD ontology data is unavailable
        """
        # Providers that share the same constructor arguments and the getTreeNodeList() accessor.
        simpleProviderD = {
            "tree_cath": CathClassificationProvider,
            "tree_scop2": Scop2ClassificationProvider,
            "tree_scop": ScopClassificationProvider,
            "tree_ecod": EcodClassificationProvider,
            "tree_ec": EnzymeDatabaseProvider,
        }
        nL = []
        cName = collectionName.lower()
        if cName in simpleProviderD:
            provider = simpleProviderD[cName](cachePath=self.__cachePath, useCache=useCache)
            nL = provider.getTreeNodeList()
        elif cName == "tree_card":
            nL = self.__getCardNodeList(useCache)
        elif cName == "tree_taxonomy":
            nL = self.__getTaxonomyNodeList(useCache)
        elif cName == "tree_atc":
            nL = self.__getAtcNodeList(useCache)
        else:
            logger.error("Unsupported tree node collection %r", collectionName)
        #
        logger.info("Gathered tree nodes for loading collection %s (length %d)", collectionName, len(nL))
        return nL
|
|
File without changes
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ObjectAdapterBase.py
|
|
3
|
+
# Date: 17-Oct-2019
|
|
4
|
+
#
|
|
5
|
+
##
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ObjectAdapterBase(object):
    """Base class for object adapters; subclasses implement filter()."""

    def __init__(self, *args, **kwargs):
        """Accept and ignore any constructor arguments.

        Defined as a real constructor so that a subclass can forward its own
        arguments through super().__init__() without an error.
        """
        pass

    def filter(self, obj, **kwargs):
        """Operates on the input object and returns the transformed result.

        Args:
            obj (object): input object/document

        Returns:

            bool, object: filter status and transformed input object/document

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError
|