rcsb.exdb 0.99__tar.gz → 1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98) hide show
  1. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/HISTORY.txt +4 -0
  2. {rcsb.exdb-0.99/rcsb.exdb.egg-info → rcsb_exdb-1.1}/PKG-INFO +2 -1
  3. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/chemref/ChemRefEtlWorker.py +3 -1
  4. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +8 -2
  5. rcsb_exdb-1.1/rcsb/exdb/cli/ExDbExec.py +211 -0
  6. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/cli/__init__.py +1 -1
  7. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/PolymerEntityExtractor.py +47 -46
  8. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +3 -1
  9. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/fixturePdbxLoader.py +43 -40
  10. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testExDbWorkflow.py +2 -1
  11. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testObjectExtractor.py +1 -1
  12. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPolymerEntityExtractor.py +1 -1
  13. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +2 -2
  14. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +2 -2
  15. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tree/TreeNodeListWorker.py +3 -1
  16. rcsb_exdb-1.1/rcsb/exdb/wf/ExDbWorkflow.py +415 -0
  17. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/wf/PubChemEtlWorkflow.py +9 -0
  18. {rcsb.exdb-0.99 → rcsb_exdb-1.1/rcsb.exdb.egg-info}/PKG-INFO +2 -1
  19. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/requires.txt +1 -0
  20. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/requirements.txt +1 -0
  21. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/setup.cfg +1 -1
  22. rcsb.exdb-0.99/rcsb/exdb/cli/ExDbExec.py +0 -194
  23. rcsb.exdb-0.99/rcsb/exdb/wf/ExDbWorkflow.py +0 -244
  24. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/LICENSE +0 -0
  25. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/MANIFEST.in +0 -0
  26. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/README.md +0 -0
  27. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/__init__.py +0 -0
  28. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/__init__.py +0 -0
  29. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
  30. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/branch/GlycanProvider.py +0 -0
  31. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/branch/GlycanUtils.py +0 -0
  32. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/branch/__init__.py +0 -0
  33. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/chemref/ChemRefExtractor.py +0 -0
  34. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
  35. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +0 -0
  36. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/chemref/PubChemEtlWrapper.py +0 -0
  37. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/chemref/__init__.py +0 -0
  38. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/citation/CitationAdapter.py +0 -0
  39. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/citation/CitationExtractor.py +0 -0
  40. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/citation/CitationUtils.py +0 -0
  41. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/citation/__init__.py +0 -0
  42. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
  43. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/entry/__init__.py +0 -0
  44. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
  45. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
  46. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
  47. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
  48. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
  49. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
  50. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
  51. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
  52. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
  53. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
  54. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/seq/__init__.py +0 -0
  55. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/__init__.py +0 -0
  56. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
  57. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
  58. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
  59. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
  60. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
  61. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
  62. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
  63. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testCitationUtils.py +0 -0
  64. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
  65. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
  66. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
  67. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
  68. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testGlycanUtils.py +0 -0
  69. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
  70. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
  71. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
  72. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
  73. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -0
  74. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -0
  75. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -0
  76. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +0 -0
  77. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
  78. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
  79. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
  80. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testTreeNodeListWorker.py +0 -0
  81. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
  82. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
  83. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/tree/__init__.py +0 -0
  84. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
  85. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
  86. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
  87. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
  88. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/utils/ObjectValidator.py +0 -0
  89. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/utils/__init__.py +0 -0
  90. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
  91. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
  92. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb/exdb/wf/__init__.py +0 -0
  93. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/SOURCES.txt +0 -0
  94. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
  95. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/entry_points.txt +0 -0
  96. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/not-zip-safe +0 -0
  97. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/rcsb.exdb.egg-info/top_level.txt +0 -0
  98. {rcsb.exdb-0.99 → rcsb_exdb-1.1}/setup.py +0 -0
@@ -95,3 +95,7 @@
95
95
  Load full (unfiltered) taxonomy tree node list, and stop loading GO tree
96
96
  19-Sep-2023 V0.99 Add reload method to ChemRefMappingProvider and LigandNeighborMappingProvider;
97
97
  Add documentation to reference sequence providers
98
+ 9-Jan-2024 V1.00 Update PolymerEntityExtractor to turn off usage of uniprot_exdb as source data;
99
+ This package update also coincides with the turning off of uniprot_exdb data loading during the weekly workflow
100
+ 6-May-2024 V1.1 Update ExDbExec CLI and ExDbWorkflow to support CLI usage from weekly-update workflow;
101
+ Update unit tests and setuptools config
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: rcsb.exdb
3
- Version: 0.99
3
+ Version: 1.1
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
@@ -30,6 +30,7 @@ Requires-Dist: rcsb.utils.seq>=0.63
30
30
  Requires-Dist: rcsb.utils.struct>=0.37
31
31
  Requires-Dist: rcsb.utils.taxonomy>=0.39
32
32
  Requires-Dist: rcsb.utils.dictionary>=0.71
33
+ Requires-Dist: rcsb.workflow>=0.42
33
34
  Requires-Dist: statistics; python_version < "3.0"
34
35
  Provides-Extra: dev
35
36
  Requires-Dist: check-manifest; extra == "dev"
@@ -29,13 +29,14 @@ logger = logging.getLogger(__name__)
29
29
  class ChemRefEtlWorker(object):
30
30
  """Prepare and load chemical reference data collections."""
31
31
 
32
- def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, verbose=False):
32
+ def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, verbose=False):
33
33
  self.__cfgOb = cfgOb
34
34
  self.__cachePath = cachePath
35
35
  self.__useCache = useCache
36
36
  self.__readBackCheck = readBackCheck
37
37
  self.__numProc = numProc
38
38
  self.__chunkSize = chunkSize
39
+ self.__maxStepLength = maxStepLength
39
40
  self.__documentLimit = documentLimit
40
41
  #
41
42
  self.__resourceName = "MONGO_DB"
@@ -95,6 +96,7 @@ class ChemRefEtlWorker(object):
95
96
  self.__resourceName,
96
97
  numProc=self.__numProc,
97
98
  chunkSize=self.__chunkSize,
99
+ maxStepLength=self.__maxStepLength,
98
100
  documentLimit=self.__documentLimit,
99
101
  verbose=self.__verbose,
100
102
  readBackCheck=self.__readBackCheck,
@@ -196,7 +196,7 @@ class PubChemUpdateWorker(object):
196
196
 
197
197
  def __chunker(self, iList, chunkSize):
198
198
  chunkSize = max(1, chunkSize)
199
- return (iList[i : i + chunkSize] for i in range(0, len(iList), chunkSize))
199
+ return (iList[i: i + chunkSize] for i in range(0, len(iList), chunkSize))
200
200
 
201
201
 
202
202
  class PubChemIndexCacheProvider(StashableBase):
@@ -319,6 +319,7 @@ class PubChemIndexCacheProvider(StashableBase):
319
319
  #
320
320
  matchD = {}
321
321
  matchedIdList = []
322
+ ok = False
322
323
  try:
323
324
  # ---
324
325
  # Get current the indices of source chemical reference data -
@@ -346,7 +347,10 @@ class PubChemIndexCacheProvider(StashableBase):
346
347
  else:
347
348
  logger.info("No reference data updates required")
348
349
  # --
349
- return ok
350
+ if not ok:
351
+ logger.warning("updateMissing completed with status %r failures %r", ok, len(failList))
352
+ #
353
+ return True
350
354
  except Exception as e:
351
355
  logger.exception("Failing with %s", str(e))
352
356
  return ok
@@ -569,8 +573,10 @@ class PubChemIndexCacheProvider(StashableBase):
569
573
  """Rebuild source indices of chemical component definitions."""
570
574
  logger.info("Rebuilding chemical definition index.")
571
575
  ok1, ccidxP = self.__buildChemCompIndex(**kwargs)
576
+ logger.info("__buildChemCompIndex completed with status %r", ok1)
572
577
  logger.info("Rebuilding chemical search indices.")
573
578
  ok2, ccsidxP = self.__buildChemCompSearchIndex(numProc, **kwargs)
579
+ logger.info("__buildChemCompSearchIndex completed with status %r", ok2)
574
580
  return ok1 & ok2, ccidxP, ccsidxP
575
581
 
576
582
  def __buildChemCompIndex(self, **kwargs):
@@ -0,0 +1,211 @@
1
+ ##
2
+ # File: ExDbExec.py
3
+ # Date: 22-Apr-2019 jdw
4
+ #
5
+ # Execution wrapper -- for extract and load operations -
6
+ #
7
+ # Updates:
8
+ # 4-Sep-2019 jdw add Tree and Drugbank loaders
9
+ # 14-Feb-2020 jdw change over to ReferenceSequenceAnnotationProvider/Adapter
10
+ # 9-Mar-2023 dwp Lower refChunkSize to 10 (UniProt API having trouble streaming XML responses)
11
+ # 25-Apr-2024 dwp Add arguments and logic to support CLI usage from weekly-update workflow;
12
+ # Add support for logging output to a specific file
13
+ ##
14
+ __docformat__ = "google en"
15
+ __author__ = "John Westbrook"
16
+ __email__ = "jwest@rcsb.rutgers.edu"
17
+ __license__ = "Apache 2.0"
18
+
19
+ import os
20
+ import sys
21
+ import argparse
22
+ import logging
23
+
24
+ from rcsb.utils.config.ConfigUtil import ConfigUtil
25
+ from rcsb.exdb.wf.ExDbWorkflow import ExDbWorkflow
26
+
27
+ HERE = os.path.abspath(os.path.dirname(__file__))
28
+ TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
29
+
30
+ # logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s", stream=sys.stdout)
31
+ logger = logging.getLogger()
32
+
33
+
34
+ def main():
35
+ parser = argparse.ArgumentParser()
36
+ #
37
+ parser.add_argument(
38
+ "--op",
39
+ default=None,
40
+ required=True,
41
+ help="Loading operation to perform",
42
+ choices=[
43
+ "etl_chemref", # ETL integrated chemical reference data
44
+ "etl_uniprot_core", # ETL UniProt core reference data
45
+ "etl_tree_node_lists", # ETL tree node lists
46
+ "upd_ref_seq", # Update reference sequence assignments
47
+ "upd_neighbor_interactions",
48
+ "upd_uniprot_taxonomy",
49
+ "upd_targets_cofactors",
50
+ "upd_pubchem",
51
+ "upd_entry_info",
52
+ "upd_glycan_idx",
53
+ "upd_resource_stash",
54
+ ]
55
+ )
56
+ parser.add_argument(
57
+ "--load_type",
58
+ default="full",
59
+ help="Type of load ('full' for complete and fresh single-worker load, 'replace' for incremental and multi-worker load)",
60
+ choices=["full", "replace"],
61
+ )
62
+ #
63
+ parser.add_argument("--config_path", default=None, help="Path to configuration options file")
64
+ parser.add_argument("--config_name", default="site_info_remote_configuration", help="Configuration section name")
65
+ parser.add_argument("--cache_path", default=None, help="Cache path for resource files")
66
+ parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
67
+ parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
68
+ parser.add_argument("--max_step_length", default=500, help="Maximum subList size (default=500)")
69
+ parser.add_argument("--db_type", default="mongo", help="Database server type (default=mongo)")
70
+ parser.add_argument("--document_limit", default=None, help="Load document limit for testing")
71
+ #
72
+ parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
73
+ parser.add_argument("--rebuild_sequence_cache", default=False, action="store_true", help="Rebuild cached resource files for reference sequence updates")
74
+ parser.add_argument("--provider_type_exclude", default=None, help="Resource provider types to exclude")
75
+ parser.add_argument("--use_filtered_tax_list", default=False, action="store_true", help="Use filtered list for taxonomy tree loading")
76
+ parser.add_argument("--disable_read_back_check", default=False, action="store_true", help="Disable read back check on all documents")
77
+ parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
78
+ parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
79
+ parser.add_argument("--log_file_path", default=None, help="Path to runtime log file output.")
80
+ #
81
+ # Arguments specific for op == 'upd_ref_seq'
82
+ parser.add_argument("--ref_chunk_size", default=10, help="Max chunk size for reference sequence updates (for op 'upd_ref_seq')")
83
+ parser.add_argument("--min_missing", default=0, help="Minimum number of allowed missing reference sequences (for op 'upd_ref_seq')")
84
+ parser.add_argument("--min_match_primary_percent", default=None, help="Minimum reference sequence match percentage (for op 'upd_ref_seq')")
85
+ parser.add_argument("--test_mode", default=False, action="store_true", help="Test mode for reference sequence updates (for op 'upd_ref_seq')")
86
+ #
87
+ # Arguments buildExdbResources
88
+ parser.add_argument("--rebuild_all_neighbor_interactions", default=False, action="store_true", help="Rebuild all neighbor interactions from scratch (default is incrementally)")
89
+ parser.add_argument("--cc_file_prefix", default="cc-full", help="File name discriminator for index sets")
90
+ parser.add_argument("--cc_url_target", default=None, help="target url for chemical component dictionary resource file (default: None=all public)")
91
+ parser.add_argument("--bird_url_target", default=None, help="target url for bird dictionary resource file (cc format) (default: None=all public)")
92
+ #
93
+ args = parser.parse_args()
94
+ #
95
+ try:
96
+ op, commonD, loadD = processArguments(args)
97
+ except Exception as err:
98
+ logger.exception("Argument processing problem %s", str(err))
99
+ raise ValueError("Argument processing problem") from err
100
+ #
101
+ #
102
+ # Log input arguments
103
+ loadLogD = {k: v for d in [commonD, loadD] for k, v in d.items() if k != "inputIdCodeList"}
104
+ logger.info("running load op %r on loadLogD %r:", op, loadLogD)
105
+ #
106
+ # Run the operation
107
+ okR = False
108
+ exWf = ExDbWorkflow(**commonD)
109
+ if op in ["etl_chemref", "etl_uniprot_core", "etl_tree_node_lists", "upd_ref_seq"]:
110
+ okR = exWf.load(op, **loadD)
111
+ elif op in ["upd_neighbor_interactions", "upd_uniprot_taxonomy", "upd_targets_cofactors", "upd_pubchem", "upd_entry_info", "upd_glycan_idx", "upd_resource_stash"]:
112
+ okR = exWf.buildExdbResource(op, **loadD)
113
+ else:
114
+ logger.error("Unsupported op %r", op)
115
+ #
116
+ logger.info("Operation %r completed with status %r", op, okR)
117
+ #
118
+ if not okR:
119
+ logger.error("Operation %r failed with status %r", op, okR)
120
+ raise ValueError("Operation %r failed" % op)
121
+
122
+
123
+ def processArguments(args):
124
+ # Logging details
125
+ logFilePath = args.log_file_path
126
+ debugFlag = args.debug
127
+ if debugFlag:
128
+ logger.setLevel(logging.DEBUG)
129
+ else:
130
+ logger.setLevel(logging.INFO)
131
+ if logFilePath:
132
+ logDir = os.path.dirname(logFilePath)
133
+ if not os.path.isdir(logDir):
134
+ os.makedirs(logDir)
135
+ handler = logging.FileHandler(logFilePath, mode="a")
136
+ if debugFlag:
137
+ handler.setLevel(logging.DEBUG)
138
+ else:
139
+ handler.setLevel(logging.INFO)
140
+ formatter = logging.Formatter("%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
141
+ handler.setFormatter(formatter)
142
+ logger.addHandler(handler)
143
+ #
144
+ # Configuration details
145
+ configPath = args.config_path
146
+ configName = args.config_name
147
+ if not (configPath and configName):
148
+ logger.error("Config path and/or name not provided: %r, %r", configPath, configName)
149
+ raise ValueError("Config path and/or name not provided: %r, %r" % (configPath, configName))
150
+ mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
151
+ logger.info("Using configuration file %r (section %r)", configPath, configName)
152
+ cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
153
+ cfgObTmp = cfgOb.exportConfig()
154
+ logger.info("Length of config object (%r)", len(cfgObTmp))
155
+ if len(cfgObTmp) == 0:
156
+ logger.error("Missing or access issue for config file %r", configPath)
157
+ raise ValueError("Missing or access issue for config file %r" % configPath)
158
+ else:
159
+ del cfgObTmp
160
+ #
161
+ # Do any additional argument checking
162
+ op = args.op
163
+ if not op:
164
+ raise ValueError("Must supply a value to '--op' argument")
165
+ #
166
+ cachePath = args.cache_path if args.cache_path else "."
167
+ cachePath = os.path.abspath(cachePath)
168
+
169
+ if args.db_type != "mongo":
170
+ logger.error("Unsupported database type %r (must be 'mongo')", args.db_type)
171
+ raise ValueError("Unsupported database type %r (must be 'mongo')" % args.db_type)
172
+
173
+ # Now collect arguments into dictionaries
174
+ commonD = {
175
+ "configPath": configPath,
176
+ "configName": configName,
177
+ "cachePath": cachePath,
178
+ "mockTopPath": mockTopPath,
179
+ "debugFlag": debugFlag,
180
+ "rebuildCache": args.rebuild_cache,
181
+ "providerTypeExclude": args.provider_type_exclude,
182
+ }
183
+ loadD = {
184
+ "loadType": args.load_type,
185
+ "numProc": int(args.num_proc),
186
+ "chunkSize": int(args.chunk_size),
187
+ "maxStepLength": int(args.max_step_length),
188
+ "dbType": args.db_type,
189
+ "documentLimit": int(args.document_limit) if args.document_limit else None,
190
+ "readBackCheck": not args.disable_read_back_check,
191
+ "rebuildSequenceCache": args.rebuild_sequence_cache,
192
+ "useFilteredLists": args.use_filtered_tax_list,
193
+ "refChunkSize": int(args.ref_chunk_size),
194
+ "minMissing": int(args.min_missing),
195
+ "minMatchPrimaryPercent": float(args.min_match_primary_percent) if args.min_match_primary_percent else None,
196
+ "testMode": args.test_mode,
197
+ "rebuildAllNeighborInteractions": args.rebuild_all_neighbor_interactions,
198
+ "ccFileNamePrefix": args.cc_file_prefix,
199
+ "ccUrlTarget": args.cc_url_target,
200
+ "birdUrlTarget": args.bird_url_target,
201
+ }
202
+
203
+ return op, commonD, loadD
204
+
205
+
206
+ if __name__ == "__main__":
207
+ try:
208
+ main()
209
+ except Exception as e:
210
+ logger.exception("Run failed %s", str(e))
211
+ sys.exit(1)
@@ -2,4 +2,4 @@ __docformat__ = "google en"
2
2
  __author__ = "John Westbrook"
3
3
  __email__ = "john.westbrook@rcsb.org"
4
4
  __license__ = "Apache 2.0"
5
- __version__ = "0.99"
5
+ __version__ = "1.1"
@@ -6,6 +6,7 @@
6
6
  #
7
7
  #
8
8
  # Updates:
9
+ # 9-Jan-2024 dwp Turn off use of uniprot_exdb DB for enriching protein entity details file (data not used)
9
10
  #
10
11
  ##
11
12
  __docformat__ = "google en"
@@ -16,7 +17,6 @@ __license__ = "Apache 2.0"
16
17
  import logging
17
18
  import os
18
19
 
19
- from rcsb.exdb.seq.UniProtExtractor import UniProtExtractor
20
20
  from rcsb.exdb.utils.ObjectExtractor import ObjectExtractor
21
21
  from rcsb.utils.io.MarshalUtil import MarshalUtil
22
22
 
@@ -56,9 +56,6 @@ class PolymerEntityExtractor(object):
56
56
  missingSrcD = {}
57
57
  rD = {}
58
58
  try:
59
- unpEx = UniProtExtractor(self.__cfgOb)
60
- unpD = unpEx.getReferenceSequenceDetails()
61
- #
62
59
  obEx = ObjectExtractor(
63
60
  self.__cfgOb,
64
61
  databaseName="pdbx_core",
@@ -155,9 +152,13 @@ class PolymerEntityExtractor(object):
155
152
  uD["refDbId"] = tD["reference_database_accession"]
156
153
  uD["refDbName"] = tD["reference_database_name"]
157
154
  uD["provSource"] = tD["provenance_source"]
158
- if tD["reference_database_accession"] in unpD:
159
- # This adds {"accession": rId, "taxId": taxId, "scientific_name": sn, "gene": gn, "name": pn, "sequence": sequence}
160
- uD.update(unpD[tD["reference_database_accession"]])
155
+ #
156
+ # Skip the below step now that uniprot_exdb DB is no longer being updated in weekly workflow.
157
+ # The data added here isn't used by subsequent tasks. It simply provides
158
+ # additional information in the pdbprent-details.json file (under "alignmentL")
159
+ # if tD["reference_database_accession"] in unpD:
160
+ # # This adds {"accession": rId, "taxId": taxId, "scientific_name": sn, "gene": gn, "name": pn, "sequence": sequence}
161
+ # uD.update(unpD[tD["reference_database_accession"]])
161
162
  aL = []
162
163
  for qD in tD["aligned_regions"]:
163
164
  if qD["entity_beg_seq_id"] + qD["length"] - 1 > seqLen:
@@ -223,44 +224,44 @@ class PolymerEntityExtractor(object):
223
224
 
224
225
  Example:
225
226
  "5H7D_1": {
226
- "alignmentL": [
227
- {
228
- "refDbId": "P42588",
229
- "refDbName": "UniProt",
230
- "provSource": "PDB",
231
- "accession": "P42588",
232
- "taxId": 83333,
233
- "scientific_name": "Escherichia coli (strain K12)",
234
- "gene": "patA",
235
- "name": "PATase",
236
- "alignList": [
237
- {
238
- "srcId": "1",
239
- "entitySeqBeg": 5,
240
- "refSeqBeg": 7,
241
- "length": 447
242
- }
243
- ]
244
- },
245
- {
246
- "refDbId": "P38507",
247
- "refDbName": "UniProt",
248
- "provSource": "PDB",
249
- "accession": "P38507",
250
- "taxId": 1280,
251
- "scientific_name": "Staphylococcus aureus",
252
- "gene": "spa",
253
- "name": "IgG-binding protein A",
254
- "alignList": [
255
- {
256
- "srcId": "2",
257
- "entitySeqBeg": 452,
258
- entitySeqBeg"220,
259
- "length": 48
260
- }
261
- ]
262
- }
263
- ],
227
+ # "alignmentL": [
228
+ # {
229
+ # "refDbId": "P42588",
230
+ # "refDbName": "UniProt",
231
+ # "provSource": "PDB",
232
+ # "accession": "P42588",
233
+ # "taxId": 83333,
234
+ # "scientific_name": "Escherichia coli (strain K12)",
235
+ # "gene": "patA",
236
+ # "name": "PATase",
237
+ # "alignList": [
238
+ # {
239
+ # "srcId": "1",
240
+ # "entitySeqBeg": 5,
241
+ # "refSeqBeg": 7,
242
+ # "length": 447
243
+ # }
244
+ # ]
245
+ # },
246
+ # {
247
+ # "refDbId": "P38507",
248
+ # "refDbName": "UniProt",
249
+ # "provSource": "PDB",
250
+ # "accession": "P38507",
251
+ # "taxId": 1280,
252
+ # "scientific_name": "Staphylococcus aureus",
253
+ # "gene": "spa",
254
+ # "name": "IgG-binding protein A",
255
+ # "alignList": [
256
+ # {
257
+ # "srcId": "2",
258
+ # "entitySeqBeg": 452,
259
+ # entitySeqBeg"220,
260
+ # "length": 48
261
+ # }
262
+ # ]
263
+ # }
264
+ # ],
264
265
  "sourceOrgL": [
265
266
  {
266
267
  "srcId": "1",
@@ -300,7 +301,7 @@ class PolymerEntityExtractor(object):
300
301
  seqEnd = int(sD["entitySeqEnd"])
301
302
  seqLen = 1 + (seqEnd - seqBeg)
302
303
  # orgName = sD["orgName"]
303
- cD = {"sequence": seq[seqBeg - 1 : seqEnd], "entityId": eId, "srcId": srcId, "seqBeg": seqBeg, "seqEnd": seqEnd, "seqLen": seqLen, "taxId": taxId}
304
+ cD = {"sequence": seq[seqBeg - 1: seqEnd], "entityId": eId, "srcId": srcId, "seqBeg": seqBeg, "seqEnd": seqEnd, "seqLen": seqLen, "taxId": taxId}
304
305
  seqId = ""
305
306
  cL = []
306
307
  for k, v in cD.items():
@@ -32,13 +32,14 @@ logger = logging.getLogger(__name__)
32
32
  class UniProtCoreEtlWorker(object):
33
33
  """Prepare and load UniProt 'core' sequence reference data collections."""
34
34
 
35
- def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
35
+ def __init__(self, cfgOb, cachePath, useCache=True, numProc=2, chunkSize=10, maxStepLength=2000, readBackCheck=False, documentLimit=None, doValidate=False, verbose=False):
36
36
  self.__cfgOb = cfgOb
37
37
  self.__cachePath = cachePath
38
38
  self.__useCache = useCache
39
39
  self.__readBackCheck = readBackCheck
40
40
  self.__numProc = numProc
41
41
  self.__chunkSize = chunkSize
42
+ self.__maxStepLength = maxStepLength
42
43
  self.__documentLimit = documentLimit
43
44
  #
44
45
  self.__resourceName = "MONGO_DB"
@@ -128,6 +129,7 @@ class UniProtCoreEtlWorker(object):
128
129
  self.__resourceName,
129
130
  numProc=self.__numProc,
130
131
  chunkSize=self.__chunkSize,
132
+ maxStepLength=self.__maxStepLength,
131
133
  documentLimit=self.__documentLimit,
132
134
  verbose=self.__verbose,
133
135
  readBackCheck=self.__readBackCheck,
@@ -65,7 +65,7 @@ class PdbxLoaderFixture(unittest.TestCase):
65
65
  self.__cachePath = os.path.join(TOPDIR, "CACHE")
66
66
  self.__readBackCheck = True
67
67
  self.__numProc = 1
68
- self.__chunkSize = 5
68
+ self.__chunkSize = 2
69
69
  self.__fileLimit = 38
70
70
  self.__documentStyle = "rowwise_by_name_with_cardinality"
71
71
  #
@@ -121,44 +121,44 @@ class PdbxLoaderFixture(unittest.TestCase):
121
121
  ]
122
122
  #
123
123
  self.__pdbIdList = [
124
- "1ah1",
125
- "1b5f",
126
- "1bmv",
127
- "1c58",
128
- "1dsr",
129
- "1dul",
130
- "1kqe",
131
- "1o3q",
132
- "1sfo",
133
- "2hw3",
134
- "2hyv",
135
- "2osl",
136
- "2voo",
137
- "2wmg",
138
- "3ad7",
139
- "3hya",
140
- "3iyd",
141
- "3mbg",
142
- "3rer",
143
- "3vd8",
144
- "3vfj",
145
- "3x11",
146
- "3ztj",
147
- "4e2o",
148
- "4en8",
149
- "4mey",
150
- "5eu8",
151
- "5kds",
152
- "5tm0",
153
- "5vh4",
154
- "5vp2",
155
- "6fsz",
156
- "6lu7",
157
- "6nn7",
158
- "6q20",
159
- "6rfk",
160
- "6rku",
161
- "6yrq",
124
+ "1AH1",
125
+ "1B5F",
126
+ "1BMV",
127
+ "1C58",
128
+ "1DSR",
129
+ "1DUL",
130
+ "1KQE",
131
+ "1O3Q",
132
+ "1SFO",
133
+ "2HW3",
134
+ "2HYV",
135
+ "2OSL",
136
+ "2VOO",
137
+ "2WMG",
138
+ "3AD7",
139
+ "3HYA",
140
+ "3IYD",
141
+ "3MBG",
142
+ "3RER",
143
+ "3VD8",
144
+ "3VFJ",
145
+ "3X11",
146
+ "3ZTJ",
147
+ "4E2O",
148
+ "4EN8",
149
+ "4MEY",
150
+ "5EU8",
151
+ "5KDS",
152
+ # "5TM0",
153
+ "5VH4",
154
+ # "5VP2",
155
+ # "6FSZ",
156
+ "6LU7",
157
+ "6NN7",
158
+ # "6Q20",
159
+ "6RFK",
160
+ "6RKU",
161
+ "6YRQ",
162
162
  ]
163
163
  self.__ldList = [
164
164
  {
@@ -213,10 +213,12 @@ class PdbxLoaderFixture(unittest.TestCase):
213
213
  def testPdbxLoader(self):
214
214
  #
215
215
  for ld in self.__ldList:
216
- self.__pdbxLoaderWrapper(**ld)
216
+ ok = self.__pdbxLoaderWrapper(**ld)
217
+ self.assertTrue(ok)
217
218
 
218
219
  def __pdbxLoaderWrapper(self, **kwargs):
219
220
  """Wrapper for the PDBx loader module"""
221
+ ok = False
220
222
  try:
221
223
  logger.info("Loading %s", kwargs["databaseName"])
222
224
  mw = PdbxLoader(
@@ -257,6 +259,7 @@ class PdbxLoaderFixture(unittest.TestCase):
257
259
  except Exception as e:
258
260
  logger.exception("Failing with %s", str(e))
259
261
  self.fail()
262
+ return ok
260
263
 
261
264
  def __loadStatus(self, statusList):
262
265
  sectionName = "data_exchange_configuration"
@@ -88,7 +88,8 @@ class ExDbWorkflowTests(unittest.TestCase):
88
88
  def testExDbLoaderWorkflows(self):
89
89
  """Test run workflow steps ..."""
90
90
  try:
91
- opL = ["etl_chemref", "upd_ref_seq", "etl_tree_node_lists"]
91
+ # opL = ["etl_chemref", "upd_ref_seq", "etl_tree_node_lists"]
92
+ opL = ["etl_chemref", "etl_tree_node_lists"]
92
93
  rlWf = ExDbWorkflow(**self.__commonD)
93
94
  for op in opL:
94
95
  ok = rlWf.load(op, **self.__loadCommonD)
@@ -118,7 +118,7 @@ class ObjectExtractorTests(unittest.TestCase):
118
118
  eCount = obEx.getCount()
119
119
  logger.info("Entry count is %d", eCount)
120
120
  logger.info("Entries are %r", list(eD.keys()))
121
- self.assertGreaterEqual(eCount, 6)
121
+ self.assertGreaterEqual(eCount, 5)
122
122
  except Exception as e:
123
123
  logger.exception("Failing with %s", str(e))
124
124
  self.fail()
@@ -64,7 +64,7 @@ class PolymerEntityExtractorTests(unittest.TestCase):
64
64
  pEx = PolymerEntityExtractor(self.__cfgOb)
65
65
  pD, _ = pEx.getProteinSequenceDetails()
66
66
  #
67
- self.assertGreaterEqual(len(pD), 100)
67
+ self.assertGreaterEqual(len(pD), 70)
68
68
  logger.info("Polymer entity count %d", len(pD))
69
69
  except Exception as e:
70
70
  logger.exception("Failing with %s", str(e))
@@ -91,7 +91,7 @@ class ReferenceSequenceAssignmentProviderTests(unittest.TestCase):
91
91
  ok = rsaP.testCache()
92
92
  self.assertTrue(ok)
93
93
  numRef = rsaP.getRefDataCount()
94
- self.assertGreaterEqual(numRef, 90)
94
+ self.assertGreaterEqual(numRef, 49)
95
95
  #
96
96
  # --- Reload from cache ---
97
97
  rsaP = ReferenceSequenceAssignmentProvider(
@@ -100,7 +100,7 @@ class ReferenceSequenceAssignmentProviderTests(unittest.TestCase):
100
100
  ok = rsaP.testCache()
101
101
  self.assertTrue(ok)
102
102
  numRef = rsaP.getRefDataCount()
103
- self.assertGreaterEqual(numRef, 90)
103
+ self.assertGreaterEqual(numRef, 49)
104
104
  except Exception as e:
105
105
  logger.exception("Failing with %s", str(e))
106
106
  self.fail()
@@ -70,14 +70,14 @@ class ReferenceSequenceCacheProviderTests(unittest.TestCase):
70
70
  ok = rsaP.testCache()
71
71
  self.assertTrue(ok)
72
72
  numRef = rsaP.getRefDataCount()
73
- self.assertGreaterEqual(numRef, 90)
73
+ self.assertGreaterEqual(numRef, 49)
74
74
  #
75
75
  # --- Reload from cache ---
76
76
  rsaP = ReferenceSequenceCacheProvider(self.__cfgOb, databaseName, collectionName, polymerType, maxChunkSize=50, numProc=2, expireDays=14)
77
77
  ok = rsaP.testCache()
78
78
  self.assertTrue(ok)
79
79
  numRef = rsaP.getRefDataCount()
80
- self.assertGreaterEqual(numRef, 90)
80
+ self.assertGreaterEqual(numRef, 49)
81
81
  except Exception as e:
82
82
  logger.exception("Failing with %s", str(e))
83
83
  self.fail()