rcsb.exdb 1.0__tar.gz → 1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/HISTORY.txt +2 -0
- {rcsb.exdb-1.0/rcsb.exdb.egg-info → rcsb_exdb-1.1}/PKG-INFO +2 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/chemref/ChemRefEtlWorker.py +3 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +8 -2
- rcsb_exdb-1.1/rcsb/exdb/cli/ExDbExec.py +211 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/cli/__init__.py +1 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +3 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/fixturePdbxLoader.py +43 -40
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testExDbWorkflow.py +2 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testObjectExtractor.py +1 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPolymerEntityExtractor.py +1 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +2 -2
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +2 -2
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tree/TreeNodeListWorker.py +3 -1
- rcsb_exdb-1.1/rcsb/exdb/wf/ExDbWorkflow.py +415 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/wf/PubChemEtlWorkflow.py +9 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1/rcsb.exdb.egg-info}/PKG-INFO +2 -1
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/requires.txt +1 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/requirements.txt +1 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/setup.cfg +1 -1
- rcsb.exdb-1.0/rcsb/exdb/cli/ExDbExec.py +0 -194
- rcsb.exdb-1.0/rcsb/exdb/wf/ExDbWorkflow.py +0 -244
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/LICENSE +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/MANIFEST.in +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/README.md +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/branch/GlycanProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/branch/GlycanUtils.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/branch/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/chemref/ChemRefExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/chemref/PubChemEtlWrapper.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/chemref/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/citation/CitationAdapter.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/citation/CitationExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/citation/CitationUtils.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/citation/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/entry/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/PolymerEntityExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/seq/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testCitationUtils.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testGlycanUtils.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testTreeNodeListWorker.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/tree/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectValidator.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/utils/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb/exdb/wf/__init__.py +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/SOURCES.txt +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/entry_points.txt +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/not-zip-safe +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/top_level.txt +0 -0
- {rcsb.exdb-1.0 → rcsb_exdb-1.1}/setup.py +0 -0
|
@@ -97,3 +97,5 @@
|
|
|
97
97
|
Add documentation to reference sequence providers
|
|
98
98
|
9-Jan-2024 V1.00 Update PolymerEntityExtractor to turn off usage of uniprot_exdb as source data;
|
|
99
99
|
This package update also coincides with the turning off of uniprot_exdb data loading during the weekly workflow
|
|
100
|
+
6-May-2024 V1.1 Update ExDbExec CLI and ExDbWorkflow to support CLI usage from weekly-update workflow;
|
|
101
|
+
Update unit tests and setuptools config
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: rcsb.exdb
|
|
3
|
-
Version: 1.0
|
|
3
|
+
Version: 1.1
|
|
4
4
|
Summary: RCSB Python ExDB data extraction and loading workflows
|
|
5
5
|
Home-page: https://github.com/rcsb/py-rcsb_exdb
|
|
6
6
|
Author: John Westbrook
|
|
@@ -30,6 +30,7 @@ Requires-Dist: rcsb.utils.seq>=0.63
|
|
|
30
30
|
Requires-Dist: rcsb.utils.struct>=0.37
|
|
31
31
|
Requires-Dist: rcsb.utils.taxonomy>=0.39
|
|
32
32
|
Requires-Dist: rcsb.utils.dictionary>=0.71
|
|
33
|
+
Requires-Dist: rcsb.workflow>=0.42
|
|
33
34
|
Requires-Dist: statistics; python_version < "3.0"
|
|
34
35
|
Provides-Extra: dev
|
|
35
36
|
Requires-Dist: check-manifest; extra == "dev"
|
|
@@ -29,13 +29,14 @@ logger = logging.getLogger(__name__)
|
|
|
29
29
|
class ChemRefEtlWorker(object):
|
|
30
30
|
"""Prepare and load chemical reference data collections."""
|
|
31
31
|
|
|
32
|
-
def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, verbose=False):
|
|
32
|
+
def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, verbose=False):
|
|
33
33
|
self.__cfgOb = cfgOb
|
|
34
34
|
self.__cachePath = cachePath
|
|
35
35
|
self.__useCache = useCache
|
|
36
36
|
self.__readBackCheck = readBackCheck
|
|
37
37
|
self.__numProc = numProc
|
|
38
38
|
self.__chunkSize = chunkSize
|
|
39
|
+
self.__maxStepLength = maxStepLength
|
|
39
40
|
self.__documentLimit = documentLimit
|
|
40
41
|
#
|
|
41
42
|
self.__resourceName = "MONGO_DB"
|
|
@@ -95,6 +96,7 @@ class ChemRefEtlWorker(object):
|
|
|
95
96
|
self.__resourceName,
|
|
96
97
|
numProc=self.__numProc,
|
|
97
98
|
chunkSize=self.__chunkSize,
|
|
99
|
+
maxStepLength=self.__maxStepLength,
|
|
98
100
|
documentLimit=self.__documentLimit,
|
|
99
101
|
verbose=self.__verbose,
|
|
100
102
|
readBackCheck=self.__readBackCheck,
|
|
@@ -196,7 +196,7 @@ class PubChemUpdateWorker(object):
|
|
|
196
196
|
|
|
197
197
|
def __chunker(self, iList, chunkSize):
|
|
198
198
|
chunkSize = max(1, chunkSize)
|
|
199
|
-
return (iList[i : i + chunkSize] for i in range(0, len(iList), chunkSize))
|
|
199
|
+
return (iList[i: i + chunkSize] for i in range(0, len(iList), chunkSize))
|
|
200
200
|
|
|
201
201
|
|
|
202
202
|
class PubChemIndexCacheProvider(StashableBase):
|
|
@@ -319,6 +319,7 @@ class PubChemIndexCacheProvider(StashableBase):
|
|
|
319
319
|
#
|
|
320
320
|
matchD = {}
|
|
321
321
|
matchedIdList = []
|
|
322
|
+
ok = False
|
|
322
323
|
try:
|
|
323
324
|
# ---
|
|
324
325
|
# Get current the indices of source chemical reference data -
|
|
@@ -346,7 +347,10 @@ class PubChemIndexCacheProvider(StashableBase):
|
|
|
346
347
|
else:
|
|
347
348
|
logger.info("No reference data updates required")
|
|
348
349
|
# --
|
|
349
|
-
|
|
350
|
+
if not ok:
|
|
351
|
+
logger.warning("updateMissing completed with status %r failures %r", ok, len(failList))
|
|
352
|
+
#
|
|
353
|
+
return True
|
|
350
354
|
except Exception as e:
|
|
351
355
|
logger.exception("Failing with %s", str(e))
|
|
352
356
|
return ok
|
|
@@ -569,8 +573,10 @@ class PubChemIndexCacheProvider(StashableBase):
|
|
|
569
573
|
"""Rebuild source indices of chemical component definitions."""
|
|
570
574
|
logger.info("Rebuilding chemical definition index.")
|
|
571
575
|
ok1, ccidxP = self.__buildChemCompIndex(**kwargs)
|
|
576
|
+
logger.info("__buildChemCompIndex completed with status %r", ok1)
|
|
572
577
|
logger.info("Rebuilding chemical search indices.")
|
|
573
578
|
ok2, ccsidxP = self.__buildChemCompSearchIndex(numProc, **kwargs)
|
|
579
|
+
logger.info("__buildChemCompSearchIndex completed with status %r", ok2)
|
|
574
580
|
return ok1 & ok2, ccidxP, ccsidxP
|
|
575
581
|
|
|
576
582
|
def __buildChemCompIndex(self, **kwargs):
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
##
|
|
2
|
+
# File: ExDbExec.py
|
|
3
|
+
# Date: 22-Apr-2019 jdw
|
|
4
|
+
#
|
|
5
|
+
# Execution wrapper -- for extract and load operations -
|
|
6
|
+
#
|
|
7
|
+
# Updates:
|
|
8
|
+
# 4-Sep-2019 jdw add Tree and Drugbank loaders
|
|
9
|
+
# 14-Feb-2020 jdw change over to ReferenceSequenceAnnotationProvider/Adapter
|
|
10
|
+
# 9-Mar-2023 dwp Lower refChunkSize to 10 (UniProt API having trouble streaming XML responses)
|
|
11
|
+
# 25-Apr-2024 dwp Add arguments and logic to support CLI usage from weekly-update workflow;
|
|
12
|
+
# Add support for logging output to a specific file
|
|
13
|
+
##
|
|
14
|
+
__docformat__ = "google en"
|
|
15
|
+
__author__ = "John Westbrook"
|
|
16
|
+
__email__ = "jwest@rcsb.rutgers.edu"
|
|
17
|
+
__license__ = "Apache 2.0"
|
|
18
|
+
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
import argparse
|
|
22
|
+
import logging
|
|
23
|
+
|
|
24
|
+
from rcsb.utils.config.ConfigUtil import ConfigUtil
|
|
25
|
+
from rcsb.exdb.wf.ExDbWorkflow import ExDbWorkflow
|
|
26
|
+
|
|
27
|
+
HERE = os.path.abspath(os.path.dirname(__file__))
|
|
28
|
+
TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
|
|
29
|
+
|
|
30
|
+
# logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s", stream=sys.stdout)
|
|
31
|
+
logger = logging.getLogger()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def main():
|
|
35
|
+
parser = argparse.ArgumentParser()
|
|
36
|
+
#
|
|
37
|
+
parser.add_argument(
|
|
38
|
+
"--op",
|
|
39
|
+
default=None,
|
|
40
|
+
required=True,
|
|
41
|
+
help="Loading operation to perform",
|
|
42
|
+
choices=[
|
|
43
|
+
"etl_chemref", # ETL integrated chemical reference data
|
|
44
|
+
"etl_uniprot_core", # ETL UniProt core reference data
|
|
45
|
+
"etl_tree_node_lists", # ETL tree node lists
|
|
46
|
+
"upd_ref_seq", # Update reference sequence assignments
|
|
47
|
+
"upd_neighbor_interactions",
|
|
48
|
+
"upd_uniprot_taxonomy",
|
|
49
|
+
"upd_targets_cofactors",
|
|
50
|
+
"upd_pubchem",
|
|
51
|
+
"upd_entry_info",
|
|
52
|
+
"upd_glycan_idx",
|
|
53
|
+
"upd_resource_stash",
|
|
54
|
+
]
|
|
55
|
+
)
|
|
56
|
+
parser.add_argument(
|
|
57
|
+
"--load_type",
|
|
58
|
+
default="full",
|
|
59
|
+
help="Type of load ('full' for complete and fresh single-worker load, 'replace' for incremental and multi-worker load)",
|
|
60
|
+
choices=["full", "replace"],
|
|
61
|
+
)
|
|
62
|
+
#
|
|
63
|
+
parser.add_argument("--config_path", default=None, help="Path to configuration options file")
|
|
64
|
+
parser.add_argument("--config_name", default="site_info_remote_configuration", help="Configuration section name")
|
|
65
|
+
parser.add_argument("--cache_path", default=None, help="Cache path for resource files")
|
|
66
|
+
parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
|
|
67
|
+
parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
|
|
68
|
+
parser.add_argument("--max_step_length", default=500, help="Maximum subList size (default=500)")
|
|
69
|
+
parser.add_argument("--db_type", default="mongo", help="Database server type (default=mongo)")
|
|
70
|
+
parser.add_argument("--document_limit", default=None, help="Load document limit for testing")
|
|
71
|
+
#
|
|
72
|
+
parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
|
|
73
|
+
parser.add_argument("--rebuild_sequence_cache", default=False, action="store_true", help="Rebuild cached resource files for reference sequence updates")
|
|
74
|
+
parser.add_argument("--provider_type_exclude", default=None, help="Resource provider types to exclude")
|
|
75
|
+
parser.add_argument("--use_filtered_tax_list", default=False, action="store_true", help="Use filtered list for taxonomy tree loading")
|
|
76
|
+
parser.add_argument("--disable_read_back_check", default=False, action="store_true", help="Disable read back check on all documents")
|
|
77
|
+
parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
|
|
78
|
+
parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
|
|
79
|
+
parser.add_argument("--log_file_path", default=None, help="Path to runtime log file output.")
|
|
80
|
+
#
|
|
81
|
+
# Arguments specific for op == 'upd_ref_seq'
|
|
82
|
+
parser.add_argument("--ref_chunk_size", default=10, help="Max chunk size for reference sequence updates (for op 'upd_ref_seq')")
|
|
83
|
+
parser.add_argument("--min_missing", default=0, help="Minimum number of allowed missing reference sequences (for op 'upd_ref_seq')")
|
|
84
|
+
parser.add_argument("--min_match_primary_percent", default=None, help="Minimum reference sequence match percentage (for op 'upd_ref_seq')")
|
|
85
|
+
parser.add_argument("--test_mode", default=False, action="store_true", help="Test mode for reference sequence updates (for op 'upd_ref_seq')")
|
|
86
|
+
#
|
|
87
|
+
# Arguments buildExdbResources
|
|
88
|
+
parser.add_argument("--rebuild_all_neighbor_interactions", default=False, action="store_true", help="Rebuild all neighbor interactions from scratch (default is incrementally)")
|
|
89
|
+
parser.add_argument("--cc_file_prefix", default="cc-full", help="File name discriminator for index sets")
|
|
90
|
+
parser.add_argument("--cc_url_target", default=None, help="target url for chemical component dictionary resource file (default: None=all public)")
|
|
91
|
+
parser.add_argument("--bird_url_target", default=None, help="target url for bird dictionary resource file (cc format) (default: None=all public)")
|
|
92
|
+
#
|
|
93
|
+
args = parser.parse_args()
|
|
94
|
+
#
|
|
95
|
+
try:
|
|
96
|
+
op, commonD, loadD = processArguments(args)
|
|
97
|
+
except Exception as err:
|
|
98
|
+
logger.exception("Argument processing problem %s", str(err))
|
|
99
|
+
raise ValueError("Argument processing problem") from err
|
|
100
|
+
#
|
|
101
|
+
#
|
|
102
|
+
# Log input arguments
|
|
103
|
+
loadLogD = {k: v for d in [commonD, loadD] for k, v in d.items() if k != "inputIdCodeList"}
|
|
104
|
+
logger.info("running load op %r on loadLogD %r:", op, loadLogD)
|
|
105
|
+
#
|
|
106
|
+
# Run the operation
|
|
107
|
+
okR = False
|
|
108
|
+
exWf = ExDbWorkflow(**commonD)
|
|
109
|
+
if op in ["etl_chemref", "etl_uniprot_core", "etl_tree_node_lists", "upd_ref_seq"]:
|
|
110
|
+
okR = exWf.load(op, **loadD)
|
|
111
|
+
elif op in ["upd_neighbor_interactions", "upd_uniprot_taxonomy", "upd_targets_cofactors", "upd_pubchem", "upd_entry_info", "upd_glycan_idx", "upd_resource_stash"]:
|
|
112
|
+
okR = exWf.buildExdbResource(op, **loadD)
|
|
113
|
+
else:
|
|
114
|
+
logger.error("Unsupported op %r", op)
|
|
115
|
+
#
|
|
116
|
+
logger.info("Operation %r completed with status %r", op, okR)
|
|
117
|
+
#
|
|
118
|
+
if not okR:
|
|
119
|
+
logger.error("Operation %r failed with status %r", op, okR)
|
|
120
|
+
raise ValueError("Operation %r failed" % op)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def processArguments(args):
|
|
124
|
+
# Logging details
|
|
125
|
+
logFilePath = args.log_file_path
|
|
126
|
+
debugFlag = args.debug
|
|
127
|
+
if debugFlag:
|
|
128
|
+
logger.setLevel(logging.DEBUG)
|
|
129
|
+
else:
|
|
130
|
+
logger.setLevel(logging.INFO)
|
|
131
|
+
if logFilePath:
|
|
132
|
+
logDir = os.path.dirname(logFilePath)
|
|
133
|
+
if not os.path.isdir(logDir):
|
|
134
|
+
os.makedirs(logDir)
|
|
135
|
+
handler = logging.FileHandler(logFilePath, mode="a")
|
|
136
|
+
if debugFlag:
|
|
137
|
+
handler.setLevel(logging.DEBUG)
|
|
138
|
+
else:
|
|
139
|
+
handler.setLevel(logging.INFO)
|
|
140
|
+
formatter = logging.Formatter("%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
|
|
141
|
+
handler.setFormatter(formatter)
|
|
142
|
+
logger.addHandler(handler)
|
|
143
|
+
#
|
|
144
|
+
# Configuration details
|
|
145
|
+
configPath = args.config_path
|
|
146
|
+
configName = args.config_name
|
|
147
|
+
if not (configPath and configName):
|
|
148
|
+
logger.error("Config path and/or name not provided: %r, %r", configPath, configName)
|
|
149
|
+
raise ValueError("Config path and/or name not provided: %r, %r" % (configPath, configName))
|
|
150
|
+
mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
|
|
151
|
+
logger.info("Using configuration file %r (section %r)", configPath, configName)
|
|
152
|
+
cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
|
|
153
|
+
cfgObTmp = cfgOb.exportConfig()
|
|
154
|
+
logger.info("Length of config object (%r)", len(cfgObTmp))
|
|
155
|
+
if len(cfgObTmp) == 0:
|
|
156
|
+
logger.error("Missing or access issue for config file %r", configPath)
|
|
157
|
+
raise ValueError("Missing or access issue for config file %r" % configPath)
|
|
158
|
+
else:
|
|
159
|
+
del cfgObTmp
|
|
160
|
+
#
|
|
161
|
+
# Do any additional argument checking
|
|
162
|
+
op = args.op
|
|
163
|
+
if not op:
|
|
164
|
+
raise ValueError("Must supply a value to '--op' argument")
|
|
165
|
+
#
|
|
166
|
+
cachePath = args.cache_path if args.cache_path else "."
|
|
167
|
+
cachePath = os.path.abspath(cachePath)
|
|
168
|
+
|
|
169
|
+
if args.db_type != "mongo":
|
|
170
|
+
logger.error("Unsupported database type %r (must be 'mongo')", args.db_type)
|
|
171
|
+
raise ValueError("Unsupported database type %r (must be 'mongo')" % args.db_type)
|
|
172
|
+
|
|
173
|
+
# Now collect arguments into dictionaries
|
|
174
|
+
commonD = {
|
|
175
|
+
"configPath": configPath,
|
|
176
|
+
"configName": configName,
|
|
177
|
+
"cachePath": cachePath,
|
|
178
|
+
"mockTopPath": mockTopPath,
|
|
179
|
+
"debugFlag": debugFlag,
|
|
180
|
+
"rebuildCache": args.rebuild_cache,
|
|
181
|
+
"providerTypeExclude": args.provider_type_exclude,
|
|
182
|
+
}
|
|
183
|
+
loadD = {
|
|
184
|
+
"loadType": args.load_type,
|
|
185
|
+
"numProc": int(args.num_proc),
|
|
186
|
+
"chunkSize": int(args.chunk_size),
|
|
187
|
+
"maxStepLength": int(args.max_step_length),
|
|
188
|
+
"dbType": args.db_type,
|
|
189
|
+
"documentLimit": int(args.document_limit) if args.document_limit else None,
|
|
190
|
+
"readBackCheck": not args.disable_read_back_check,
|
|
191
|
+
"rebuildSequenceCache": args.rebuild_sequence_cache,
|
|
192
|
+
"useFilteredLists": args.use_filtered_tax_list,
|
|
193
|
+
"refChunkSize": int(args.ref_chunk_size),
|
|
194
|
+
"minMissing": int(args.min_missing),
|
|
195
|
+
"minMatchPrimaryPercent": float(args.min_match_primary_percent) if args.min_match_primary_percent else None,
|
|
196
|
+
"testMode": args.test_mode,
|
|
197
|
+
"rebuildAllNeighborInteractions": args.rebuild_all_neighbor_interactions,
|
|
198
|
+
"ccFileNamePrefix": args.cc_file_prefix,
|
|
199
|
+
"ccUrlTarget": args.cc_url_target,
|
|
200
|
+
"birdUrlTarget": args.bird_url_target,
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
return op, commonD, loadD
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
if __name__ == "__main__":
|
|
207
|
+
try:
|
|
208
|
+
main()
|
|
209
|
+
except Exception as e:
|
|
210
|
+
logger.exception("Run failed %s", str(e))
|
|
211
|
+
sys.exit(1)
|
|
@@ -32,13 +32,14 @@ logger = logging.getLogger(__name__)
|
|
|
32
32
|
class UniProtCoreEtlWorker(object):
|
|
33
33
|
"""Prepare and load UniProt 'core' sequence reference data collections."""
|
|
34
34
|
|
|
35
|
-
def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
|
|
35
|
+
def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
|
|
36
36
|
self.__cfgOb = cfgOb
|
|
37
37
|
self.__cachePath = cachePath
|
|
38
38
|
self.__useCache = useCache
|
|
39
39
|
self.__readBackCheck = readBackCheck
|
|
40
40
|
self.__numProc = numProc
|
|
41
41
|
self.__chunkSize = chunkSize
|
|
42
|
+
self.__maxStepLength = maxStepLength
|
|
42
43
|
self.__documentLimit = documentLimit
|
|
43
44
|
#
|
|
44
45
|
self.__resourceName = "MONGO_DB"
|
|
@@ -128,6 +129,7 @@ class UniProtCoreEtlWorker(object):
|
|
|
128
129
|
self.__resourceName,
|
|
129
130
|
numProc=self.__numProc,
|
|
130
131
|
chunkSize=self.__chunkSize,
|
|
132
|
+
maxStepLength=self.__maxStepLength,
|
|
131
133
|
documentLimit=self.__documentLimit,
|
|
132
134
|
verbose=self.__verbose,
|
|
133
135
|
readBackCheck=self.__readBackCheck,
|
|
@@ -65,7 +65,7 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
65
65
|
self.__cachePath = os.path.join(TOPDIR, "CACHE")
|
|
66
66
|
self.__readBackCheck = True
|
|
67
67
|
self.__numProc = 1
|
|
68
|
-
self.__chunkSize =
|
|
68
|
+
self.__chunkSize = 2
|
|
69
69
|
self.__fileLimit = 38
|
|
70
70
|
self.__documentStyle = "rowwise_by_name_with_cardinality"
|
|
71
71
|
#
|
|
@@ -121,44 +121,44 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
121
121
|
]
|
|
122
122
|
#
|
|
123
123
|
self.__pdbIdList = [
|
|
124
|
-
"
|
|
125
|
-
"
|
|
126
|
-
"
|
|
127
|
-
"
|
|
128
|
-
"
|
|
129
|
-
"
|
|
130
|
-
"
|
|
131
|
-
"
|
|
132
|
-
"
|
|
133
|
-
"
|
|
134
|
-
"
|
|
135
|
-
"
|
|
136
|
-
"
|
|
137
|
-
"
|
|
138
|
-
"
|
|
139
|
-
"
|
|
140
|
-
"
|
|
141
|
-
"
|
|
142
|
-
"
|
|
143
|
-
"
|
|
144
|
-
"
|
|
145
|
-
"
|
|
146
|
-
"
|
|
147
|
-
"
|
|
148
|
-
"
|
|
149
|
-
"
|
|
150
|
-
"
|
|
151
|
-
"
|
|
152
|
-
"
|
|
153
|
-
"
|
|
154
|
-
"
|
|
155
|
-
"
|
|
156
|
-
"
|
|
157
|
-
"
|
|
158
|
-
"
|
|
159
|
-
"
|
|
160
|
-
"
|
|
161
|
-
"
|
|
124
|
+
"1AH1",
|
|
125
|
+
"1B5F",
|
|
126
|
+
"1BMV",
|
|
127
|
+
"1C58",
|
|
128
|
+
"1DSR",
|
|
129
|
+
"1DUL",
|
|
130
|
+
"1KQE",
|
|
131
|
+
"1O3Q",
|
|
132
|
+
"1SFO",
|
|
133
|
+
"2HW3",
|
|
134
|
+
"2HYV",
|
|
135
|
+
"2OSL",
|
|
136
|
+
"2VOO",
|
|
137
|
+
"2WMG",
|
|
138
|
+
"3AD7",
|
|
139
|
+
"3HYA",
|
|
140
|
+
"3IYD",
|
|
141
|
+
"3MBG",
|
|
142
|
+
"3RER",
|
|
143
|
+
"3VD8",
|
|
144
|
+
"3VFJ",
|
|
145
|
+
"3X11",
|
|
146
|
+
"3ZTJ",
|
|
147
|
+
"4E2O",
|
|
148
|
+
"4EN8",
|
|
149
|
+
"4MEY",
|
|
150
|
+
"5EU8",
|
|
151
|
+
"5KDS",
|
|
152
|
+
# "5TM0",
|
|
153
|
+
"5VH4",
|
|
154
|
+
# "5VP2",
|
|
155
|
+
# "6FSZ",
|
|
156
|
+
"6LU7",
|
|
157
|
+
"6NN7",
|
|
158
|
+
# "6Q20",
|
|
159
|
+
"6RFK",
|
|
160
|
+
"6RKU",
|
|
161
|
+
"6YRQ",
|
|
162
162
|
]
|
|
163
163
|
self.__ldList = [
|
|
164
164
|
{
|
|
@@ -213,10 +213,12 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
213
213
|
def testPdbxLoader(self):
|
|
214
214
|
#
|
|
215
215
|
for ld in self.__ldList:
|
|
216
|
-
self.__pdbxLoaderWrapper(**ld)
|
|
216
|
+
ok = self.__pdbxLoaderWrapper(**ld)
|
|
217
|
+
self.assertTrue(ok)
|
|
217
218
|
|
|
218
219
|
def __pdbxLoaderWrapper(self, **kwargs):
|
|
219
220
|
"""Wrapper for the PDBx loader module"""
|
|
221
|
+
ok = False
|
|
220
222
|
try:
|
|
221
223
|
logger.info("Loading %s", kwargs["databaseName"])
|
|
222
224
|
mw = PdbxLoader(
|
|
@@ -257,6 +259,7 @@ class PdbxLoaderFixture(unittest.TestCase):
|
|
|
257
259
|
except Exception as e:
|
|
258
260
|
logger.exception("Failing with %s", str(e))
|
|
259
261
|
self.fail()
|
|
262
|
+
return ok
|
|
260
263
|
|
|
261
264
|
def __loadStatus(self, statusList):
|
|
262
265
|
sectionName = "data_exchange_configuration"
|
|
@@ -88,7 +88,8 @@ class ExDbWorkflowTests(unittest.TestCase):
|
|
|
88
88
|
def testExDbLoaderWorkflows(self):
|
|
89
89
|
"""Test run workflow steps ..."""
|
|
90
90
|
try:
|
|
91
|
-
opL = ["etl_chemref", "upd_ref_seq", "etl_tree_node_lists"]
|
|
91
|
+
# opL = ["etl_chemref", "upd_ref_seq", "etl_tree_node_lists"]
|
|
92
|
+
opL = ["etl_chemref", "etl_tree_node_lists"]
|
|
92
93
|
rlWf = ExDbWorkflow(**self.__commonD)
|
|
93
94
|
for op in opL:
|
|
94
95
|
ok = rlWf.load(op, **self.__loadCommonD)
|
|
@@ -118,7 +118,7 @@ class ObjectExtractorTests(unittest.TestCase):
|
|
|
118
118
|
eCount = obEx.getCount()
|
|
119
119
|
logger.info("Entry count is %d", eCount)
|
|
120
120
|
logger.info("Entries are %r", list(eD.keys()))
|
|
121
|
-
self.assertGreaterEqual(eCount,
|
|
121
|
+
self.assertGreaterEqual(eCount, 5)
|
|
122
122
|
except Exception as e:
|
|
123
123
|
logger.exception("Failing with %s", str(e))
|
|
124
124
|
self.fail()
|
|
@@ -64,7 +64,7 @@ class PolymerEntityExtractorTests(unittest.TestCase):
|
|
|
64
64
|
pEx = PolymerEntityExtractor(self.__cfgOb)
|
|
65
65
|
pD, _ = pEx.getProteinSequenceDetails()
|
|
66
66
|
#
|
|
67
|
-
self.assertGreaterEqual(len(pD),
|
|
67
|
+
self.assertGreaterEqual(len(pD), 70)
|
|
68
68
|
logger.info("Polymer entity count %d", len(pD))
|
|
69
69
|
except Exception as e:
|
|
70
70
|
logger.exception("Failing with %s", str(e))
|
|
@@ -91,7 +91,7 @@ class ReferenceSequenceAssignmentProviderTests(unittest.TestCase):
|
|
|
91
91
|
ok = rsaP.testCache()
|
|
92
92
|
self.assertTrue(ok)
|
|
93
93
|
numRef = rsaP.getRefDataCount()
|
|
94
|
-
self.assertGreaterEqual(numRef,
|
|
94
|
+
self.assertGreaterEqual(numRef, 49)
|
|
95
95
|
#
|
|
96
96
|
# --- Reload from cache ---
|
|
97
97
|
rsaP = ReferenceSequenceAssignmentProvider(
|
|
@@ -100,7 +100,7 @@ class ReferenceSequenceAssignmentProviderTests(unittest.TestCase):
|
|
|
100
100
|
ok = rsaP.testCache()
|
|
101
101
|
self.assertTrue(ok)
|
|
102
102
|
numRef = rsaP.getRefDataCount()
|
|
103
|
-
self.assertGreaterEqual(numRef,
|
|
103
|
+
self.assertGreaterEqual(numRef, 49)
|
|
104
104
|
except Exception as e:
|
|
105
105
|
logger.exception("Failing with %s", str(e))
|
|
106
106
|
self.fail()
|
|
@@ -70,14 +70,14 @@ class ReferenceSequenceCacheProviderTests(unittest.TestCase):
|
|
|
70
70
|
ok = rsaP.testCache()
|
|
71
71
|
self.assertTrue(ok)
|
|
72
72
|
numRef = rsaP.getRefDataCount()
|
|
73
|
-
self.assertGreaterEqual(numRef,
|
|
73
|
+
self.assertGreaterEqual(numRef, 49)
|
|
74
74
|
#
|
|
75
75
|
# --- Reload from cache ---
|
|
76
76
|
rsaP = ReferenceSequenceCacheProvider(self.__cfgOb, databaseName, collectionName, polymerType, maxChunkSize=50, numProc=2, expireDays=14)
|
|
77
77
|
ok = rsaP.testCache()
|
|
78
78
|
self.assertTrue(ok)
|
|
79
79
|
numRef = rsaP.getRefDataCount()
|
|
80
|
-
self.assertGreaterEqual(numRef,
|
|
80
|
+
self.assertGreaterEqual(numRef, 49)
|
|
81
81
|
except Exception as e:
|
|
82
82
|
logger.exception("Failing with %s", str(e))
|
|
83
83
|
self.fail()
|
|
@@ -39,12 +39,13 @@ logger = logging.getLogger(__name__)
|
|
|
39
39
|
class TreeNodeListWorker(object):
|
|
40
40
|
"""Prepare and load repository holdings and repository update data."""
|
|
41
41
|
|
|
42
|
-
def __init__(self, cfgOb, cachePath, numProc=1, chunkSize=10, readBackCheck=False, documentLimit=None, verbose=False, useCache=False, useFilteredLists=False):
|
|
42
|
+
def __init__(self, cfgOb, cachePath, numProc=1, chunkSize=10, maxStepLength=4000, readBackCheck=False, documentLimit=None, verbose=False, useCache=False, useFilteredLists=False):
|
|
43
43
|
self.__cfgOb = cfgOb
|
|
44
44
|
self.__cachePath = os.path.abspath(cachePath)
|
|
45
45
|
self.__readBackCheck = readBackCheck
|
|
46
46
|
self.__numProc = numProc
|
|
47
47
|
self.__chunkSize = chunkSize
|
|
48
|
+
self.__maxStepLength = maxStepLength
|
|
48
49
|
self.__documentLimit = documentLimit
|
|
49
50
|
self.__resourceName = "MONGO_DB"
|
|
50
51
|
self.__filterType = "assign-dates"
|
|
@@ -115,6 +116,7 @@ class TreeNodeListWorker(object):
|
|
|
115
116
|
self.__resourceName,
|
|
116
117
|
numProc=self.__numProc,
|
|
117
118
|
chunkSize=self.__chunkSize,
|
|
119
|
+
maxStepLength=self.__maxStepLength,
|
|
118
120
|
documentLimit=self.__documentLimit,
|
|
119
121
|
verbose=self.__verbose,
|
|
120
122
|
readBackCheck=self.__readBackCheck,
|