rcsb.exdb 1.0__tar.gz → 1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/HISTORY.txt +3 -0
  2. {rcsb.exdb-1.0/rcsb.exdb.egg-info → rcsb_exdb-1.2}/PKG-INFO +4 -3
  3. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/chemref/ChemRefEtlWorker.py +3 -1
  4. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +8 -2
  5. rcsb_exdb-1.2/rcsb/exdb/cli/ExDbExec.py +211 -0
  6. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/cli/__init__.py +1 -1
  7. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +3 -1
  8. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/fixturePdbxLoader.py +45 -42
  9. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testExDbWorkflow.py +4 -3
  10. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testObjectExtractor.py +1 -1
  11. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testPolymerEntityExtractor.py +1 -1
  12. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +1 -0
  13. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +2 -2
  14. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +2 -2
  15. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tree/TreeNodeListWorker.py +3 -1
  16. rcsb_exdb-1.2/rcsb/exdb/wf/ExDbWorkflow.py +415 -0
  17. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/wf/PubChemEtlWorkflow.py +9 -0
  18. {rcsb.exdb-1.0 → rcsb_exdb-1.2/rcsb.exdb.egg-info}/PKG-INFO +4 -3
  19. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb.exdb.egg-info/requires.txt +3 -2
  20. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/requirements.txt +3 -2
  21. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/setup.cfg +1 -1
  22. rcsb.exdb-1.0/rcsb/exdb/cli/ExDbExec.py +0 -194
  23. rcsb.exdb-1.0/rcsb/exdb/wf/ExDbWorkflow.py +0 -244
  24. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/LICENSE +0 -0
  25. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/MANIFEST.in +0 -0
  26. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/README.md +0 -0
  27. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/__init__.py +0 -0
  28. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/__init__.py +0 -0
  29. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
  30. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/branch/GlycanProvider.py +0 -0
  31. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/branch/GlycanUtils.py +0 -0
  32. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/branch/__init__.py +0 -0
  33. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/chemref/ChemRefExtractor.py +0 -0
  34. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
  35. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +0 -0
  36. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/chemref/PubChemEtlWrapper.py +0 -0
  37. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/chemref/__init__.py +0 -0
  38. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/citation/CitationAdapter.py +0 -0
  39. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/citation/CitationExtractor.py +0 -0
  40. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/citation/CitationUtils.py +0 -0
  41. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/citation/__init__.py +0 -0
  42. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
  43. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/entry/__init__.py +0 -0
  44. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
  45. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
  46. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
  47. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/PolymerEntityExtractor.py +0 -0
  48. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
  49. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
  50. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
  51. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
  52. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
  53. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
  54. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
  55. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/seq/__init__.py +0 -0
  56. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/__init__.py +0 -0
  57. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
  58. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
  59. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
  60. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
  61. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
  62. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
  63. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
  64. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testCitationUtils.py +0 -0
  65. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
  66. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
  67. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
  68. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
  69. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testGlycanUtils.py +0 -0
  70. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
  71. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
  72. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
  73. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
  74. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -0
  75. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -0
  76. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -0
  77. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
  78. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
  79. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
  80. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testTreeNodeListWorker.py +0 -0
  81. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
  82. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
  83. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/tree/__init__.py +0 -0
  84. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
  85. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
  86. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
  87. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
  88. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/utils/ObjectValidator.py +0 -0
  89. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/utils/__init__.py +0 -0
  90. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
  91. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
  92. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb/exdb/wf/__init__.py +0 -0
  93. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb.exdb.egg-info/SOURCES.txt +0 -0
  94. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
  95. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb.exdb.egg-info/entry_points.txt +0 -0
  96. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb.exdb.egg-info/not-zip-safe +0 -0
  97. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/rcsb.exdb.egg-info/top_level.txt +0 -0
  98. {rcsb.exdb-1.0 → rcsb_exdb-1.2}/setup.py +0 -0
@@ -97,3 +97,6 @@
97
97
  Add documentation to reference sequence providers
98
98
  9-Jan-2024 V1.00 Update PolymerEntityExtractor to turn off usage of uniprot_exdb as source data;
99
99
  This package update also coincides with the turning off of uniprot_exdb data loading during the weekly workflow
100
+ 6-May-2024 V1.1 Update ExDbExec CLI and ExDbWorkflow to support CLI usage from weekly-update workflow;
101
+ Update unit tests and setuptools config
102
+ 9-May-2024 V1.2 Adjust provider type exclusion input to accept a list of types
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: rcsb.exdb
3
- Version: 1.0
3
+ Version: 1.2
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
@@ -19,7 +19,7 @@ Requires-Dist: OpenEye-toolkits>=2020.2.2
19
19
  Requires-Dist: numpy
20
20
  Requires-Dist: jsonschema>=2.6.0
21
21
  Requires-Dist: rcsb.utils.io>=1.28
22
- Requires-Dist: rcsb.db>=1.691
22
+ Requires-Dist: rcsb.db>=1.720
23
23
  Requires-Dist: rcsb.utils.chemref>=0.79
24
24
  Requires-Dist: rcsb.utils.chem>=0.75
25
25
  Requires-Dist: rcsb.utils.citation>=0.16
@@ -29,7 +29,8 @@ Requires-Dist: rcsb.utils.go>=0.17
29
29
  Requires-Dist: rcsb.utils.seq>=0.63
30
30
  Requires-Dist: rcsb.utils.struct>=0.37
31
31
  Requires-Dist: rcsb.utils.taxonomy>=0.39
32
- Requires-Dist: rcsb.utils.dictionary>=0.71
32
+ Requires-Dist: rcsb.utils.dictionary>=1.23
33
+ Requires-Dist: rcsb.workflow>=0.42
33
34
  Requires-Dist: statistics; python_version < "3.0"
34
35
  Provides-Extra: dev
35
36
  Requires-Dist: check-manifest; extra == "dev"
@@ -29,13 +29,14 @@ logger = logging.getLogger(__name__)
29
29
  class ChemRefEtlWorker(object):
30
30
  """Prepare and load chemical reference data collections."""
31
31
 
32
- def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, verbose=False):
32
+ def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, verbose=False):
33
33
  self.__cfgOb = cfgOb
34
34
  self.__cachePath = cachePath
35
35
  self.__useCache = useCache
36
36
  self.__readBackCheck = readBackCheck
37
37
  self.__numProc = numProc
38
38
  self.__chunkSize = chunkSize
39
+ self.__maxStepLength = maxStepLength
39
40
  self.__documentLimit = documentLimit
40
41
  #
41
42
  self.__resourceName = "MONGO_DB"
@@ -95,6 +96,7 @@ class ChemRefEtlWorker(object):
95
96
  self.__resourceName,
96
97
  numProc=self.__numProc,
97
98
  chunkSize=self.__chunkSize,
99
+ maxStepLength=self.__maxStepLength,
98
100
  documentLimit=self.__documentLimit,
99
101
  verbose=self.__verbose,
100
102
  readBackCheck=self.__readBackCheck,
@@ -196,7 +196,7 @@ class PubChemUpdateWorker(object):
196
196
 
197
197
  def __chunker(self, iList, chunkSize):
198
198
  chunkSize = max(1, chunkSize)
199
- return (iList[i : i + chunkSize] for i in range(0, len(iList), chunkSize))
199
+ return (iList[i: i + chunkSize] for i in range(0, len(iList), chunkSize))
200
200
 
201
201
 
202
202
  class PubChemIndexCacheProvider(StashableBase):
@@ -319,6 +319,7 @@ class PubChemIndexCacheProvider(StashableBase):
319
319
  #
320
320
  matchD = {}
321
321
  matchedIdList = []
322
+ ok = False
322
323
  try:
323
324
  # ---
324
325
  # Get the current indices of source chemical reference data -
@@ -346,7 +347,10 @@ class PubChemIndexCacheProvider(StashableBase):
346
347
  else:
347
348
  logger.info("No reference data updates required")
348
349
  # --
349
- return ok
350
+ if not ok:
351
+ logger.warning("updateMissing completed with status %r failures %r", ok, len(failList))
352
+ #
353
+ return True
350
354
  except Exception as e:
351
355
  logger.exception("Failing with %s", str(e))
352
356
  return ok
@@ -569,8 +573,10 @@ class PubChemIndexCacheProvider(StashableBase):
569
573
  """Rebuild source indices of chemical component definitions."""
570
574
  logger.info("Rebuilding chemical definition index.")
571
575
  ok1, ccidxP = self.__buildChemCompIndex(**kwargs)
576
+ logger.info("__buildChemCompIndex completed with status %r", ok1)
572
577
  logger.info("Rebuilding chemical search indices.")
573
578
  ok2, ccsidxP = self.__buildChemCompSearchIndex(numProc, **kwargs)
579
+ logger.info("__buildChemCompSearchIndex completed with status %r", ok2)
574
580
  return ok1 & ok2, ccidxP, ccsidxP
575
581
 
576
582
  def __buildChemCompIndex(self, **kwargs):
@@ -0,0 +1,211 @@
1
+ ##
2
+ # File: ExDbExec.py
3
+ # Date: 22-Apr-2019 jdw
4
+ #
5
+ # Execution wrapper -- for extract and load operations -
6
+ #
7
+ # Updates:
8
+ # 4-Sep-2019 jdw add Tree and Drugbank loaders
9
+ # 14-Feb-2020 jdw change over to ReferenceSequenceAnnotationProvider/Adapter
10
+ # 9-Mar-2023 dwp Lower refChunkSize to 10 (UniProt API having trouble streaming XML responses)
11
+ # 25-Apr-2024 dwp Add arguments and logic to support CLI usage from weekly-update workflow;
12
+ # Add support for logging output to a specific file
13
+ ##
14
+ __docformat__ = "google en"
15
+ __author__ = "John Westbrook"
16
+ __email__ = "jwest@rcsb.rutgers.edu"
17
+ __license__ = "Apache 2.0"
18
+
19
+ import os
20
+ import sys
21
+ import argparse
22
+ import logging
23
+
24
+ from rcsb.utils.config.ConfigUtil import ConfigUtil
25
+ from rcsb.exdb.wf.ExDbWorkflow import ExDbWorkflow
26
+
27
+ HERE = os.path.abspath(os.path.dirname(__file__))
28
+ TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
29
+
30
+ # logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s", stream=sys.stdout)
31
+ logger = logging.getLogger()
32
+
33
+
34
+ def main():
35
+ parser = argparse.ArgumentParser()
36
+ #
37
+ parser.add_argument(
38
+ "--op",
39
+ default=None,
40
+ required=True,
41
+ help="Loading operation to perform",
42
+ choices=[
43
+ "etl_chemref", # ETL integrated chemical reference data
44
+ "etl_uniprot_core", # ETL UniProt core reference data
45
+ "etl_tree_node_lists", # ETL tree node lists
46
+ "upd_ref_seq", # Update reference sequence assignments
47
+ "upd_neighbor_interactions",
48
+ "upd_uniprot_taxonomy",
49
+ "upd_targets_cofactors",
50
+ "upd_pubchem",
51
+ "upd_entry_info",
52
+ "upd_glycan_idx",
53
+ "upd_resource_stash",
54
+ ]
55
+ )
56
+ parser.add_argument(
57
+ "--load_type",
58
+ default="full",
59
+ help="Type of load ('full' for complete and fresh single-worker load, 'replace' for incremental and multi-worker load)",
60
+ choices=["full", "replace"],
61
+ )
62
+ #
63
+ parser.add_argument("--config_path", default=None, help="Path to configuration options file")
64
+ parser.add_argument("--config_name", default="site_info_remote_configuration", help="Configuration section name")
65
+ parser.add_argument("--cache_path", default=None, help="Cache path for resource files")
66
+ parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
67
+ parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
68
+ parser.add_argument("--max_step_length", default=500, help="Maximum subList size (default=500)")
69
+ parser.add_argument("--db_type", default="mongo", help="Database server type (default=mongo)")
70
+ parser.add_argument("--document_limit", default=None, help="Load document limit for testing")
71
+ #
72
+ parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
73
+ parser.add_argument("--rebuild_sequence_cache", default=False, action="store_true", help="Rebuild cached resource files for reference sequence updates")
74
+ parser.add_argument("--provider_types_exclude", default=None, help="Resource provider types to exclude")
75
+ parser.add_argument("--use_filtered_tax_list", default=False, action="store_true", help="Use filtered list for taxonomy tree loading")
76
+ parser.add_argument("--disable_read_back_check", default=False, action="store_true", help="Disable read back check on all documents")
77
+ parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
78
+ parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
79
+ parser.add_argument("--log_file_path", default=None, help="Path to runtime log file output.")
80
+ #
81
+ # Arguments specific for op == 'upd_ref_seq'
82
+ parser.add_argument("--ref_chunk_size", default=10, help="Max chunk size for reference sequence updates (for op 'upd_ref_seq')")
83
+ parser.add_argument("--min_missing", default=0, help="Minimum number of allowed missing reference sequences (for op 'upd_ref_seq')")
84
+ parser.add_argument("--min_match_primary_percent", default=None, help="Minimum reference sequence match percentage (for op 'upd_ref_seq')")
85
+ parser.add_argument("--test_mode", default=False, action="store_true", help="Test mode for reference sequence updates (for op 'upd_ref_seq')")
86
+ #
87
+ # Arguments for operations handled by buildExdbResource()
88
+ parser.add_argument("--rebuild_all_neighbor_interactions", default=False, action="store_true", help="Rebuild all neighbor interactions from scratch (default is incrementally)")
89
+ parser.add_argument("--cc_file_prefix", default="cc-full", help="File name discriminator for index sets")
90
+ parser.add_argument("--cc_url_target", default=None, help="target url for chemical component dictionary resource file (default: None=all public)")
91
+ parser.add_argument("--bird_url_target", default=None, help="target url for bird dictionary resource file (cc format) (default: None=all public)")
92
+ #
93
+ args = parser.parse_args()
94
+ #
95
+ try:
96
+ op, commonD, loadD = processArguments(args)
97
+ except Exception as err:
98
+ logger.exception("Argument processing problem %s", str(err))
99
+ raise ValueError("Argument processing problem") from err
100
+ #
101
+ #
102
+ # Log input arguments
103
+ loadLogD = {k: v for d in [commonD, loadD] for k, v in d.items() if k != "inputIdCodeList"}
104
+ logger.info("running load op %r on loadLogD %r:", op, loadLogD)
105
+ #
106
+ # Run the operation
107
+ okR = False
108
+ exWf = ExDbWorkflow(**commonD)
109
+ if op in ["etl_chemref", "etl_uniprot_core", "etl_tree_node_lists", "upd_ref_seq"]:
110
+ okR = exWf.load(op, **loadD)
111
+ elif op in ["upd_neighbor_interactions", "upd_uniprot_taxonomy", "upd_targets_cofactors", "upd_pubchem", "upd_entry_info", "upd_glycan_idx", "upd_resource_stash"]:
112
+ okR = exWf.buildExdbResource(op, **loadD)
113
+ else:
114
+ logger.error("Unsupported op %r", op)
115
+ #
116
+ logger.info("Operation %r completed with status %r", op, okR)
117
+ #
118
+ if not okR:
119
+ logger.error("Operation %r failed with status %r", op, okR)
120
+ raise ValueError("Operation %r failed" % op)
121
+
122
+
123
+ def processArguments(args):
124
+ # Logging details
125
+ logFilePath = args.log_file_path
126
+ debugFlag = args.debug
127
+ if debugFlag:
128
+ logger.setLevel(logging.DEBUG)
129
+ else:
130
+ logger.setLevel(logging.INFO)
131
+ if logFilePath:
132
+ logDir = os.path.dirname(logFilePath)
133
+ if not os.path.isdir(logDir):
134
+ os.makedirs(logDir)
135
+ handler = logging.FileHandler(logFilePath, mode="a")
136
+ if debugFlag:
137
+ handler.setLevel(logging.DEBUG)
138
+ else:
139
+ handler.setLevel(logging.INFO)
140
+ formatter = logging.Formatter("%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
141
+ handler.setFormatter(formatter)
142
+ logger.addHandler(handler)
143
+ #
144
+ # Configuration details
145
+ configPath = args.config_path
146
+ configName = args.config_name
147
+ if not (configPath and configName):
148
+ logger.error("Config path and/or name not provided: %r, %r", configPath, configName)
149
+ raise ValueError("Config path and/or name not provided: %r, %r" % (configPath, configName))
150
+ mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
151
+ logger.info("Using configuration file %r (section %r)", configPath, configName)
152
+ cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
153
+ cfgObTmp = cfgOb.exportConfig()
154
+ logger.info("Length of config object (%r)", len(cfgObTmp))
155
+ if len(cfgObTmp) == 0:
156
+ logger.error("Missing or access issue for config file %r", configPath)
157
+ raise ValueError("Missing or access issue for config file %r" % configPath)
158
+ else:
159
+ del cfgObTmp
160
+ #
161
+ # Do any additional argument checking
162
+ op = args.op
163
+ if not op:
164
+ raise ValueError("Must supply a value to '--op' argument")
165
+ #
166
+ cachePath = args.cache_path if args.cache_path else "."
167
+ cachePath = os.path.abspath(cachePath)
168
+
169
+ if args.db_type != "mongo":
170
+ logger.error("Unsupported database type %r (must be 'mongo')", args.db_type)
171
+ raise ValueError("Unsupported database type %r (must be 'mongo')" % args.db_type)
172
+
173
+ # Now collect arguments into dictionaries
174
+ commonD = {
175
+ "configPath": configPath,
176
+ "configName": configName,
177
+ "cachePath": cachePath,
178
+ "mockTopPath": mockTopPath,
179
+ "debugFlag": debugFlag,
180
+ "rebuildCache": args.rebuild_cache,
181
+ "providerTypeExcludeL": args.provider_types_exclude,
182
+ }
183
+ loadD = {
184
+ "loadType": args.load_type,
185
+ "numProc": int(args.num_proc),
186
+ "chunkSize": int(args.chunk_size),
187
+ "maxStepLength": int(args.max_step_length),
188
+ "dbType": args.db_type,
189
+ "documentLimit": int(args.document_limit) if args.document_limit else None,
190
+ "readBackCheck": not args.disable_read_back_check,
191
+ "rebuildSequenceCache": args.rebuild_sequence_cache,
192
+ "useFilteredLists": args.use_filtered_tax_list,
193
+ "refChunkSize": int(args.ref_chunk_size),
194
+ "minMissing": int(args.min_missing),
195
+ "minMatchPrimaryPercent": float(args.min_match_primary_percent) if args.min_match_primary_percent else None,
196
+ "testMode": args.test_mode,
197
+ "rebuildAllNeighborInteractions": args.rebuild_all_neighbor_interactions,
198
+ "ccFileNamePrefix": args.cc_file_prefix,
199
+ "ccUrlTarget": args.cc_url_target,
200
+ "birdUrlTarget": args.bird_url_target,
201
+ }
202
+
203
+ return op, commonD, loadD
204
+
205
+
206
+ if __name__ == "__main__":
207
+ try:
208
+ main()
209
+ except Exception as e:
210
+ logger.exception("Run failed %s", str(e))
211
+ sys.exit(1)
@@ -2,4 +2,4 @@ __docformat__ = "google en"
2
2
  __author__ = "John Westbrook"
3
3
  __email__ = "john.westbrook@rcsb.org"
4
4
  __license__ = "Apache 2.0"
5
- __version__ = "1.00"
5
+ __version__ = "1.2"
@@ -32,13 +32,14 @@ logger = logging.getLogger(__name__)
32
32
  class UniProtCoreEtlWorker(object):
33
33
  """Prepare and load UniProt 'core' sequence reference data collections."""
34
34
 
35
- def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
35
+ def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
36
36
  self.__cfgOb = cfgOb
37
37
  self.__cachePath = cachePath
38
38
  self.__useCache = useCache
39
39
  self.__readBackCheck = readBackCheck
40
40
  self.__numProc = numProc
41
41
  self.__chunkSize = chunkSize
42
+ self.__maxStepLength = maxStepLength
42
43
  self.__documentLimit = documentLimit
43
44
  #
44
45
  self.__resourceName = "MONGO_DB"
@@ -128,6 +129,7 @@ class UniProtCoreEtlWorker(object):
128
129
  self.__resourceName,
129
130
  numProc=self.__numProc,
130
131
  chunkSize=self.__chunkSize,
132
+ maxStepLength=self.__maxStepLength,
131
133
  documentLimit=self.__documentLimit,
132
134
  verbose=self.__verbose,
133
135
  readBackCheck=self.__readBackCheck,
@@ -48,7 +48,7 @@ class PdbxLoaderFixture(unittest.TestCase):
48
48
  #
49
49
  #
50
50
  self.__isMac = platform.system() == "Darwin"
51
- self.__excludeType = None if self.__isMac else "optional"
51
+ self.__excludeTypeL = None if self.__isMac else ["optional"]
52
52
  self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
53
53
  configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
54
54
  # configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example-local.yml")
@@ -65,7 +65,7 @@ class PdbxLoaderFixture(unittest.TestCase):
65
65
  self.__cachePath = os.path.join(TOPDIR, "CACHE")
66
66
  self.__readBackCheck = True
67
67
  self.__numProc = 1
68
- self.__chunkSize = 5
68
+ self.__chunkSize = 2
69
69
  self.__fileLimit = 38
70
70
  self.__documentStyle = "rowwise_by_name_with_cardinality"
71
71
  #
@@ -121,44 +121,44 @@ class PdbxLoaderFixture(unittest.TestCase):
121
121
  ]
122
122
  #
123
123
  self.__pdbIdList = [
124
- "1ah1",
125
- "1b5f",
126
- "1bmv",
127
- "1c58",
128
- "1dsr",
129
- "1dul",
130
- "1kqe",
131
- "1o3q",
132
- "1sfo",
133
- "2hw3",
134
- "2hyv",
135
- "2osl",
136
- "2voo",
137
- "2wmg",
138
- "3ad7",
139
- "3hya",
140
- "3iyd",
141
- "3mbg",
142
- "3rer",
143
- "3vd8",
144
- "3vfj",
145
- "3x11",
146
- "3ztj",
147
- "4e2o",
148
- "4en8",
149
- "4mey",
150
- "5eu8",
151
- "5kds",
152
- "5tm0",
153
- "5vh4",
154
- "5vp2",
155
- "6fsz",
156
- "6lu7",
157
- "6nn7",
158
- "6q20",
159
- "6rfk",
160
- "6rku",
161
- "6yrq",
124
+ "1AH1",
125
+ "1B5F",
126
+ "1BMV",
127
+ "1C58",
128
+ "1DSR",
129
+ "1DUL",
130
+ "1KQE",
131
+ "1O3Q",
132
+ "1SFO",
133
+ "2HW3",
134
+ "2HYV",
135
+ "2OSL",
136
+ "2VOO",
137
+ "2WMG",
138
+ "3AD7",
139
+ "3HYA",
140
+ "3IYD",
141
+ "3MBG",
142
+ "3RER",
143
+ "3VD8",
144
+ "3VFJ",
145
+ "3X11",
146
+ "3ZTJ",
147
+ "4E2O",
148
+ "4EN8",
149
+ "4MEY",
150
+ "5EU8",
151
+ "5KDS",
152
+ # "5TM0",
153
+ "5VH4",
154
+ # "5VP2",
155
+ # "6FSZ",
156
+ "6LU7",
157
+ "6NN7",
158
+ # "6Q20",
159
+ "6RFK",
160
+ "6RKU",
161
+ "6YRQ",
162
162
  ]
163
163
  self.__ldList = [
164
164
  {
@@ -213,10 +213,12 @@ class PdbxLoaderFixture(unittest.TestCase):
213
213
  def testPdbxLoader(self):
214
214
  #
215
215
  for ld in self.__ldList:
216
- self.__pdbxLoaderWrapper(**ld)
216
+ ok = self.__pdbxLoaderWrapper(**ld)
217
+ self.assertTrue(ok)
217
218
 
218
219
  def __pdbxLoaderWrapper(self, **kwargs):
219
220
  """Wrapper for the PDBx loader module"""
221
+ ok = False
220
222
  try:
221
223
  logger.info("Loading %s", kwargs["databaseName"])
222
224
  mw = PdbxLoader(
@@ -247,7 +249,7 @@ class PdbxLoaderFixture(unittest.TestCase):
247
249
  validationLevel=kwargs["validationLevel"],
248
250
  mergeContentTypes=kwargs["mergeContentTypes"],
249
251
  useNameFlag=False,
250
- providerTypeExclude=self.__excludeType,
252
+ providerTypeExcludeL=self.__excludeTypeL,
251
253
  restoreUseGit=True,
252
254
  restoreUseStash=False,
253
255
  )
@@ -257,6 +259,7 @@ class PdbxLoaderFixture(unittest.TestCase):
257
259
  except Exception as e:
258
260
  logger.exception("Failing with %s", str(e))
259
261
  self.fail()
262
+ return ok
260
263
 
261
264
  def __loadStatus(self, statusList):
262
265
  sectionName = "data_exchange_configuration"
@@ -42,7 +42,7 @@ class ExDbWorkflowTests(unittest.TestCase):
42
42
 
43
43
  def setUp(self):
44
44
  self.__isMac = platform.system() == "Darwin"
45
- self.__excludeType = None if self.__isMac else "optional"
45
+ self.__excludeTypeL = None if self.__isMac else ["optional"]
46
46
  mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
47
47
  configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml")
48
48
  configName = "site_info_configuration"
@@ -55,7 +55,7 @@ class ExDbWorkflowTests(unittest.TestCase):
55
55
  "configName": configName,
56
56
  "cachePath": cachePath,
57
57
  "rebuildCache": False,
58
- "providerTypeExclude": self.__excludeType,
58
+ "providerTypeExcludeL": self.__excludeTypeL,
59
59
  "restoreUseGit": True,
60
60
  "restoreUseStash": False,
61
61
  }
@@ -88,7 +88,8 @@ class ExDbWorkflowTests(unittest.TestCase):
88
88
  def testExDbLoaderWorkflows(self):
89
89
  """Test run workflow steps ..."""
90
90
  try:
91
- opL = ["etl_chemref", "upd_ref_seq", "etl_tree_node_lists"]
91
+ # opL = ["etl_chemref", "upd_ref_seq", "etl_tree_node_lists"]
92
+ opL = ["etl_chemref", "etl_tree_node_lists"]
92
93
  rlWf = ExDbWorkflow(**self.__commonD)
93
94
  for op in opL:
94
95
  ok = rlWf.load(op, **self.__loadCommonD)
@@ -118,7 +118,7 @@ class ObjectExtractorTests(unittest.TestCase):
118
118
  eCount = obEx.getCount()
119
119
  logger.info("Entry count is %d", eCount)
120
120
  logger.info("Entries are %r", list(eD.keys()))
121
- self.assertGreaterEqual(eCount, 6)
121
+ self.assertGreaterEqual(eCount, 5)
122
122
  except Exception as e:
123
123
  logger.exception("Failing with %s", str(e))
124
124
  self.fail()
@@ -64,7 +64,7 @@ class PolymerEntityExtractorTests(unittest.TestCase):
64
64
  pEx = PolymerEntityExtractor(self.__cfgOb)
65
65
  pD, _ = pEx.getProteinSequenceDetails()
66
66
  #
67
- self.assertGreaterEqual(len(pD), 100)
67
+ self.assertGreaterEqual(len(pD), 70)
68
68
  logger.info("Polymer entity count %d", len(pD))
69
69
  except Exception as e:
70
70
  logger.exception("Failing with %s", str(e))
@@ -60,6 +60,7 @@ class ReferenceSequenceAnnotationAdapterTests(unittest.TestCase):
60
60
  endTime = time.time()
61
61
  logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
62
62
 
63
+ @unittest.skip("Disable test - no longer using in production, and fails too frequently with 'Bad xml text' when fetching from UniProt")
63
64
  def testAnnotationAdapter(self):
64
65
  """Test case - create and read cache reference sequences assignments and related data."""
65
66
  try:
@@ -91,7 +91,7 @@ class ReferenceSequenceAssignmentProviderTests(unittest.TestCase):
91
91
  ok = rsaP.testCache()
92
92
  self.assertTrue(ok)
93
93
  numRef = rsaP.getRefDataCount()
94
- self.assertGreaterEqual(numRef, 90)
94
+ self.assertGreaterEqual(numRef, 49)
95
95
  #
96
96
  # --- Reload from cache ---
97
97
  rsaP = ReferenceSequenceAssignmentProvider(
@@ -100,7 +100,7 @@ class ReferenceSequenceAssignmentProviderTests(unittest.TestCase):
100
100
  ok = rsaP.testCache()
101
101
  self.assertTrue(ok)
102
102
  numRef = rsaP.getRefDataCount()
103
- self.assertGreaterEqual(numRef, 90)
103
+ self.assertGreaterEqual(numRef, 49)
104
104
  except Exception as e:
105
105
  logger.exception("Failing with %s", str(e))
106
106
  self.fail()
@@ -70,14 +70,14 @@ class ReferenceSequenceCacheProviderTests(unittest.TestCase):
70
70
  ok = rsaP.testCache()
71
71
  self.assertTrue(ok)
72
72
  numRef = rsaP.getRefDataCount()
73
- self.assertGreaterEqual(numRef, 90)
73
+ self.assertGreaterEqual(numRef, 44)
74
74
  #
75
75
  # --- Reload from cache ---
76
76
  rsaP = ReferenceSequenceCacheProvider(self.__cfgOb, databaseName, collectionName, polymerType, maxChunkSize=50, numProc=2, expireDays=14)
77
77
  ok = rsaP.testCache()
78
78
  self.assertTrue(ok)
79
79
  numRef = rsaP.getRefDataCount()
80
- self.assertGreaterEqual(numRef, 90)
80
+ self.assertGreaterEqual(numRef, 44)
81
81
  except Exception as e:
82
82
  logger.exception("Failing with %s", str(e))
83
83
  self.fail()
@@ -39,12 +39,13 @@ logger = logging.getLogger(__name__)
39
39
  class TreeNodeListWorker(object):
40
40
  """Prepare and load repository holdings and repository update data."""
41
41
 
42
- def __init__(self, cfgOb, cachePath, numProc=1, chunkSize=10, readBackCheck=False, documentLimit=None, verbose=False, useCache=False, useFilteredLists=False):
42
+ def __init__(self, cfgOb, cachePath, numProc=1, chunkSize=10, maxStepLength=4000, readBackCheck=False, documentLimit=None, verbose=False, useCache=False, useFilteredLists=False):
43
43
  self.__cfgOb = cfgOb
44
44
  self.__cachePath = os.path.abspath(cachePath)
45
45
  self.__readBackCheck = readBackCheck
46
46
  self.__numProc = numProc
47
47
  self.__chunkSize = chunkSize
48
+ self.__maxStepLength = maxStepLength
48
49
  self.__documentLimit = documentLimit
49
50
  self.__resourceName = "MONGO_DB"
50
51
  self.__filterType = "assign-dates"
@@ -115,6 +116,7 @@ class TreeNodeListWorker(object):
115
116
  self.__resourceName,
116
117
  numProc=self.__numProc,
117
118
  chunkSize=self.__chunkSize,
119
+ maxStepLength=self.__maxStepLength,
118
120
  documentLimit=self.__documentLimit,
119
121
  verbose=self.__verbose,
120
122
  readBackCheck=self.__readBackCheck,