rcsb.exdb 1.27__tar.gz → 1.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/HISTORY.txt +3 -1
  2. {rcsb_exdb-1.27/rcsb.exdb.egg-info → rcsb_exdb-1.29}/PKG-INFO +5 -63
  3. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/README.md +0 -54
  4. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +73 -72
  5. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/cli/__init__.py +1 -1
  6. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/wf/PubChemEtlWorkflow.py +2 -2
  7. {rcsb_exdb-1.27 → rcsb_exdb-1.29/rcsb.exdb.egg-info}/PKG-INFO +5 -63
  8. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb.exdb.egg-info/SOURCES.txt +0 -4
  9. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb.exdb.egg-info/requires.txt +2 -9
  10. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/requirements.txt +4 -7
  11. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/setup.py +1 -2
  12. rcsb_exdb-1.27/rcsb/exdb/cli/ExDbExec.py +0 -239
  13. rcsb_exdb-1.27/rcsb/exdb/tests/testExDbWorkflow.py +0 -145
  14. rcsb_exdb-1.27/rcsb/exdb/wf/ExDbWorkflow.py +0 -521
  15. rcsb_exdb-1.27/rcsb.exdb.egg-info/entry_points.txt +0 -2
  16. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/LICENSE +0 -0
  17. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/MANIFEST.in +0 -0
  18. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/__init__.py +0 -0
  19. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/__init__.py +0 -0
  20. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
  21. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/branch/GlycanProvider.py +0 -0
  22. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/branch/GlycanUtils.py +0 -0
  23. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/branch/__init__.py +0 -0
  24. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/chemref/ChemRefEtlWorker.py +0 -0
  25. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/chemref/ChemRefExtractor.py +0 -0
  26. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
  27. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +0 -0
  28. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/chemref/PubChemEtlWrapper.py +0 -0
  29. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/chemref/__init__.py +0 -0
  30. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/citation/CitationAdapter.py +0 -0
  31. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/citation/CitationExtractor.py +0 -0
  32. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/citation/CitationUtils.py +0 -0
  33. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/citation/__init__.py +0 -0
  34. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
  35. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/entry/__init__.py +0 -0
  36. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
  37. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
  38. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
  39. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/PolymerEntityExtractor.py +0 -0
  40. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
  41. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
  42. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
  43. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
  44. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
  45. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
  46. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +0 -0
  47. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
  48. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/seq/__init__.py +0 -0
  49. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/__init__.py +0 -0
  50. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
  51. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/fixturePdbxLoader.py +0 -0
  52. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
  53. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
  54. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
  55. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
  56. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
  57. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
  58. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testCitationUtils.py +0 -0
  59. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
  60. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
  61. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
  62. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
  63. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testGlycanUtils.py +0 -0
  64. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
  65. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testObjectExtractor.py +0 -0
  66. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
  67. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
  68. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testPolymerEntityExtractor.py +0 -0
  69. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
  70. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -0
  71. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -0
  72. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -0
  73. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +0 -0
  74. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
  75. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
  76. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +0 -0
  77. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +0 -0
  78. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
  79. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testTreeNodeListWorker.py +0 -0
  80. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
  81. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
  82. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tree/TreeNodeListWorker.py +0 -0
  83. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/tree/__init__.py +0 -0
  84. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
  85. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
  86. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
  87. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
  88. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/utils/ObjectValidator.py +0 -0
  89. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/utils/__init__.py +0 -0
  90. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
  91. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
  92. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb/exdb/wf/__init__.py +0 -0
  93. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
  94. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb.exdb.egg-info/not-zip-safe +0 -0
  95. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/rcsb.exdb.egg-info/top_level.txt +0 -0
  96. {rcsb_exdb-1.27 → rcsb_exdb-1.29}/setup.cfg +0 -0
@@ -108,4 +108,6 @@
108
108
  Update CI/CD to python 3.10
109
109
  10-Dec-2024 V1.26 Update PolymerEntityExtractor to sort extracted sequence data;
110
110
  Update Azure pipelines to run on latest macOS and ubuntu version
111
- 23-Jan-2025 V1.27 Update TreeNodeListWorker to index 'id' field
111
+ 23-Jan-2025 V1.27 Update TreeNodeListWorker to index 'id' field
112
+ 11-Feb-2025 V1.28 Move ExDB CLI code (workflow, exec, and tests) and Dockerfile to rcsb.workflow to avoid circular imports
113
+ 8-Apr-2025 V1.29 Add more logging to PubChemIndexCacheProvider and increase default numProc
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: rcsb.exdb
3
- Version: 1.27
3
+ Version: 1.29
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
@@ -16,25 +16,20 @@ Classifier: Programming Language :: Python :: 3.9
16
16
  Classifier: Programming Language :: Python :: 3.10
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
- Requires-Dist: OpenEye-toolkits>=2024.1.1
20
19
  Requires-Dist: numpy
21
20
  Requires-Dist: jsonschema>=2.6.0
22
21
  Requires-Dist: rcsb.utils.io>=1.48
23
- Requires-Dist: rcsb.db>=1.725
24
- Requires-Dist: rcsb.utils.chem>=0.79
22
+ Requires-Dist: rcsb.db>=1.800
23
+ Requires-Dist: rcsb.utils.chem>=0.81
25
24
  Requires-Dist: rcsb.utils.chemref>=0.91
26
- Requires-Dist: rcsb.utils.citation>=0.22
27
25
  Requires-Dist: rcsb.utils.config>=0.40
28
26
  Requires-Dist: rcsb.utils.ec>=0.25
29
27
  Requires-Dist: rcsb.utils.go>=0.18
30
28
  Requires-Dist: rcsb.utils.seq>=0.82
31
- Requires-Dist: rcsb.utils.seqalign>=0.31
32
29
  Requires-Dist: rcsb.utils.targets>=0.82
33
30
  Requires-Dist: rcsb.utils.struct>=0.47
34
31
  Requires-Dist: rcsb.utils.taxonomy>=0.43
35
32
  Requires-Dist: rcsb.utils.dictionary>=1.27
36
- Requires-Dist: rcsb.workflow>=0.46
37
- Requires-Dist: statistics; python_version < "3.0"
38
33
  Provides-Extra: dev
39
34
  Requires-Dist: check-manifest; extra == "dev"
40
35
  Provides-Extra: test
@@ -46,6 +41,7 @@ Dynamic: description
46
41
  Dynamic: description-content-type
47
42
  Dynamic: home-page
48
43
  Dynamic: license
44
+ Dynamic: license-file
49
45
  Dynamic: provides-extra
50
46
  Dynamic: requires-dist
51
47
  Dynamic: summary
@@ -115,57 +111,3 @@ install this system. Once HomeBrew is installed, you can further install the
115
111
  [MongoDB](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-os-x/) packages which
116
112
  are required to support the ExDB tools. HomeBrew also provides a variety of options for
117
113
  managing [Python virtual environments](https://gist.github.com/Geoyi/f55ed54d24cc9ff1c14bd95fac21c042).
118
-
119
- ### Command Line Interfaces
120
-
121
- A convenience CLI `exdb_exec_cli` is provided for performing update and loading operations.
122
-
123
- ```bash
124
- exdb_exec_cli --help
125
-
126
- usage: exdb_exec_cli [-h] [--data_set_id DATA_SET_ID] [--full] [--etl_chemref]
127
- [--etl_tree_node_lists] [--config_path CONFIG_PATH]
128
- [--config_name CONFIG_NAME] [--db_type DB_TYPE]
129
- [--read_back_check] [--num_proc NUM_PROC]
130
- [--chunk_size CHUNK_SIZE]
131
- [--document_limit DOCUMENT_LIMIT] [--debug] [--mock]
132
- [--cache_path CACHE_PATH] [--rebuild_cache]
133
-
134
- optional arguments:
135
- -h, --help show this help message and exit
136
- --data_set_id DATA_SET_ID
137
- Data set identifier (default= 2019_14 for current
138
- week)
139
- --full Fresh full load in a new tables/collections (Default)
140
- --etl_chemref ETL integrated chemical reference data
141
- --etl_tree_node_lists
142
- ETL tree node lists
143
- --config_path CONFIG_PATH
144
- Path to configuration options file
145
- --config_name CONFIG_NAME
146
- Configuration section name
147
- --db_type DB_TYPE Database server type (default=mongo)
148
- --read_back_check Perform read back check on all documents
149
- --num_proc NUM_PROC Number of processes to execute (default=2)
150
- --chunk_size CHUNK_SIZE
151
- Number of files loaded per process
152
- --document_limit DOCUMENT_LIMIT
153
- Load document limit for testing
154
- --debug Turn on verbose logging
155
- --mock Use MOCK repository configuration for testing
156
- --cache_path CACHE_PATH
157
- Top cache path for external and local resource files
158
- --rebuild_cache Rebuild cached files from remote resources
159
- ________________________________________________________________________________
160
-
161
- ```
162
-
163
- For example, to construct and load tree nodes list data collections, the following
164
- command may be used:
165
-
166
- ```bash
167
- exdb_exec_cli --mock --full --etl_tree_node_lists --rebuild_cache \
168
- --cache_path ./CACHE \
169
- --config_path ./rcsb/mock-data/config/dbload-setup-example.yml \
170
- --config_name site_info_configuration >& LOGTREE \
171
- ```
@@ -63,57 +63,3 @@ install this system. Once HomeBrew is installed, you can further install the
63
63
  [MongoDB](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-os-x/) packages which
64
64
  are required to support the ExDB tools. HomeBrew also provides a variety of options for
65
65
  managing [Python virtual environments](https://gist.github.com/Geoyi/f55ed54d24cc9ff1c14bd95fac21c042).
66
-
67
- ### Command Line Interfaces
68
-
69
- A convenience CLI `exdb_exec_cli` is provided for performing update and loading operations.
70
-
71
- ```bash
72
- exdb_exec_cli --help
73
-
74
- usage: exdb_exec_cli [-h] [--data_set_id DATA_SET_ID] [--full] [--etl_chemref]
75
- [--etl_tree_node_lists] [--config_path CONFIG_PATH]
76
- [--config_name CONFIG_NAME] [--db_type DB_TYPE]
77
- [--read_back_check] [--num_proc NUM_PROC]
78
- [--chunk_size CHUNK_SIZE]
79
- [--document_limit DOCUMENT_LIMIT] [--debug] [--mock]
80
- [--cache_path CACHE_PATH] [--rebuild_cache]
81
-
82
- optional arguments:
83
- -h, --help show this help message and exit
84
- --data_set_id DATA_SET_ID
85
- Data set identifier (default= 2019_14 for current
86
- week)
87
- --full Fresh full load in a new tables/collections (Default)
88
- --etl_chemref ETL integrated chemical reference data
89
- --etl_tree_node_lists
90
- ETL tree node lists
91
- --config_path CONFIG_PATH
92
- Path to configuration options file
93
- --config_name CONFIG_NAME
94
- Configuration section name
95
- --db_type DB_TYPE Database server type (default=mongo)
96
- --read_back_check Perform read back check on all documents
97
- --num_proc NUM_PROC Number of processes to execute (default=2)
98
- --chunk_size CHUNK_SIZE
99
- Number of files loaded per process
100
- --document_limit DOCUMENT_LIMIT
101
- Load document limit for testing
102
- --debug Turn on verbose logging
103
- --mock Use MOCK repository configuration for testing
104
- --cache_path CACHE_PATH
105
- Top cache path for external and local resource files
106
- --rebuild_cache Rebuild cached files from remote resources
107
- ________________________________________________________________________________
108
-
109
- ```
110
-
111
- For example, to construct and load tree nodes list data collections, the following
112
- command may be used:
113
-
114
- ```bash
115
- exdb_exec_cli --mock --full --etl_tree_node_lists --rebuild_cache \
116
- --cache_path ./CACHE \
117
- --config_path ./rcsb/mock-data/config/dbload-setup-example.yml \
118
- --config_name site_info_configuration >& LOGTREE \
119
- ```
@@ -9,6 +9,7 @@
9
9
  # 16-Jul-2020 jdw separate index and reference data management.
10
10
  # 23-Jul-2021 jdw Make PubChemIndexCacheProvider a subclass of StashableBase()
11
11
  # 2-Mar-2023 aae Return correct status from Single proc
12
+ # 8-Apr-2025 dwp Let MultiProc handle chunking; add more logging to debug slowness on west coast
12
13
  #
13
14
  ##
14
15
  __docformat__ = "google en"
@@ -100,84 +101,82 @@ class PubChemUpdateWorker(object):
100
101
  #
101
102
  """
102
103
  _ = workingDir
103
- chunkSize = optionsD.get("chunkSize", 50)
104
104
  matchIdOnly = optionsD.get("matchIdOnly", True)
105
105
  # Path to store raw request data -
106
106
  exportPath = optionsD.get("exportPath", None)
107
107
  #
108
108
  successList = []
109
- retList1 = []
110
- retList2 = []
111
109
  diagList = []
112
- emptyList = []
110
+ failList = []
111
+ retList = []
113
112
  #
114
113
  try:
114
+ startTime = time.time()
115
115
  tU = TimeUtil()
116
- ccIdList = dataList
117
- numChunks = len(list(self.__chunker(ccIdList, chunkSize)))
118
- logger.info("%s search starting for %d reference definitions (in chunks of length %d)", procName, len(ccIdList), chunkSize)
119
- for ii, ccIdChunk in enumerate(self.__chunker(ccIdList, chunkSize), 1):
120
- logger.info("%s starting chunk for %d of %d", procName, ii, numChunks)
121
- # tDL = []
122
- tIdxDL = []
123
- timeS = tU.getDateTimeObj(tU.getTimestamp())
124
- for ccId in ccIdChunk:
125
- # Get various forms from the search index -
126
- chemIdList = self.__genChemIdList(ccId)
127
- tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
116
+ ccIdList = dataList # len(dataList) should be of size chunkSize
117
+ logger.info("%s search starting for %d reference definitions (matchIdOnly %r exportPath %r)", procName, len(ccIdList), matchIdOnly, exportPath)
118
+ tIdxDL = []
119
+ timeS = tU.getDateTimeObj(tU.getTimestamp())
120
+ for ccId in ccIdList:
121
+ # Get various forms from the search index -
122
+ chemIdList = self.__genChemIdList(ccId)
123
+ tIdxD = {"rcsb_id": ccId, "rcsb_last_update": timeS}
124
+ #
125
+ mL = []
126
+ for chemId in chemIdList:
127
+ stA = time.time()
128
+ ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
128
129
  #
129
- mL = []
130
- for chemId in chemIdList:
131
- stA = time.time()
132
- ok, refDL = self.__pcU.assemble(chemId, exportPath=exportPath, matchIdOnly=matchIdOnly)
133
- #
134
- if not ok:
135
- etA = time.time()
136
- logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
137
-
138
- #
139
- if ok and refDL:
140
- for tD in refDL:
141
- pcId = tD["cid"]
142
- inchiKey = (
143
- self.__searchIdxD[chemId.indexName]["inchi-key"]
144
- if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
145
- else None
146
- )
147
- smiles = (
148
- self.__searchIdxD[chemId.indexName]["smiles"] if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName] else None
149
- )
150
- mL.append(
151
- {
152
- "matched_id": pcId,
153
- "search_id_type": chemId.identifierType,
154
- "search_id_source": chemId.identifierSource,
155
- "source_index_name": chemId.indexName,
156
- "source_smiles": smiles,
157
- "source_inchikey": inchiKey,
158
- }
159
- )
160
- # tD.update({"rcsb_id": pcId, "rcsb_last_update": timeS})
161
- # tDL.append(tD)
130
+ if not ok:
131
+ etA = time.time()
132
+ logger.debug("Failing %s search source %s for %s (%.4f secs)", chemId.identifierType, chemId.identifierSource, chemId.idCode, etA - stA)
162
133
  #
163
- if mL:
164
- tIdxD["matched_ids"] = mL
165
- successList.append(ccId)
166
- else:
167
- logger.info("No match result for any form of %s", ccId)
168
- #
169
- tIdxDL.append(tIdxD)
170
- # --
171
- startTimeL = time.time()
172
- logger.info("Saving chunk %d (len=%d)", ii, len(ccIdChunk))
173
- self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
174
- endTimeL = time.time()
175
- logger.info("Saved chunk %d (len=%d) in %.3f secs", ii, len(ccIdChunk), endTimeL - startTimeL)
134
+ if ok and refDL:
135
+ for tD in refDL:
136
+ pcId = tD["cid"]
137
+ inchiKey = (
138
+ self.__searchIdxD[chemId.indexName]["inchi-key"]
139
+ if chemId.indexName in self.__searchIdxD and "inchi-key" in self.__searchIdxD[chemId.indexName]
140
+ else None
141
+ )
142
+ smiles = (
143
+ self.__searchIdxD[chemId.indexName]["smiles"] if chemId.indexName in self.__searchIdxD and "smiles" in self.__searchIdxD[chemId.indexName] else None
144
+ )
145
+ mL.append(
146
+ {
147
+ "matched_id": pcId,
148
+ "search_id_type": chemId.identifierType,
149
+ "search_id_source": chemId.identifierSource,
150
+ "source_index_name": chemId.indexName,
151
+ "source_smiles": smiles,
152
+ "source_inchikey": inchiKey,
153
+ }
154
+ )
155
+ #
156
+ if mL:
157
+ tIdxD["matched_ids"] = mL
158
+ successList.append(ccId)
159
+ else:
160
+ logger.info("No match result for any form of %s", ccId)
161
+ #
162
+ tIdxDL.append(tIdxD)
163
+ # --
164
+ failList = sorted(set(dataList) - set(successList))
165
+ if failList:
166
+ logger.info("%s returns %d definitions with failures: %r", procName, len(failList), failList)
167
+ # --
168
+ endTime = time.time()
169
+ logger.info("%s completed updateList len %r duration %.3f secs", procName, len(ccIdList), endTime - startTime)
170
+ startTimeL = time.time()
171
+ logger.info("Saving dataList (len=%d)", len(ccIdList))
172
+ self.__updateObjectStore(self.__databaseName, self.__matchIndexCollectionName, tIdxDL)
173
+ endTimeL = time.time()
174
+ logger.info("Saved chunk (len=%d) in %.3f secs", len(ccIdList), endTimeL - startTimeL)
176
175
  except Exception as e:
177
176
  logger.exception("Failing %s for %d data items %s", procName, len(dataList), str(e))
178
- logger.info("%s dataList length %d success length %d rst1 %d rst2 %d", procName, len(dataList), len(successList), len(retList1), len(retList2))
177
+ logger.info("%s dataList length %d success length %d retList %d", procName, len(dataList), len(successList), len(retList))
179
178
  #
180
- return successList, emptyList, emptyList, diagList
179
+ return successList, retList, diagList
181
180
 
182
181
  def __updateObjectStore(self, databaseName, collectionName, objDL):
183
182
  updateDL = []
@@ -196,10 +195,6 @@ class PubChemUpdateWorker(object):
196
195
  ok = obUpd.createCollection(databaseName, collectionName, indexAttributeNames=indexAttributeNames, checkExists=True, bsonSchema=None)
197
196
  return ok
198
197
 
199
- def __chunker(self, iList, chunkSize):
200
- chunkSize = max(1, chunkSize)
201
- return (iList[i: i + chunkSize] for i in range(0, len(iList), chunkSize))
202
-
203
198
 
204
199
  class PubChemIndexCacheProvider(StashableBase):
205
200
  """Utilities to manage chemical component/BIRD to PubChem compound identifier mapping data."""
@@ -515,7 +510,7 @@ class PubChemIndexCacheProvider(StashableBase):
515
510
  Returns:
516
511
  (bool, list): status flag, list of unmatched identifiers
517
512
  """
518
- chunkSize = 50
513
+ chunkSize = 10
519
514
  exportPath = kwargs.get("exportPath", None)
520
515
  logger.info("Length starting list is %d", len(idList))
521
516
  optD = {"chunkSize": chunkSize, "exportPath": exportPath, "matchIdOnly": True}
@@ -524,14 +519,20 @@ class PubChemIndexCacheProvider(StashableBase):
524
519
  mpu = MultiProcUtil(verbose=True)
525
520
  mpu.setOptions(optD)
526
521
  mpu.set(workerObj=rWorker, workerMethod="updateList")
527
- ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=2, chunkSize=chunkSize)
528
- logger.info("Multi-proc %r failures %r result lengths %r %r", ok, len(failList), len(resultList[0]), len(resultList[1]))
522
+ ok, failList, resultList, _ = mpu.runMulti(dataList=idList, numProc=numProc, numResults=1, chunkSize=chunkSize)
523
+ logger.info("Multi-proc %r failures %r result lengths %r", ok, len(failList), len(resultList[0]))
529
524
  else:
530
- successList, _, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
525
+ successList, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
531
526
  failList = list(set(idList) - set(successList))
532
527
  ok = len(failList) == 0
533
528
  logger.info("Single-proc status %r failures %r", ok, len(failList))
534
529
  #
530
+ if len(failList) > 0:
531
+ if len(failList) <= 100:
532
+ logger.info("failList: %r", failList)
533
+ else:
534
+ logger.info("failList[:100]: %r", failList[:100])
535
+ #
535
536
  return ok, failList
536
537
 
537
538
  def __reloadDump(self, objD, databaseName, collectionName, indexAttributeNames=None):
@@ -2,4 +2,4 @@ __docformat__ = "google en"
2
2
  __author__ = "John Westbrook"
3
3
  __email__ = "john.westbrook@rcsb.org"
4
4
  __license__ = "Apache 2.0"
5
- __version__ = "1.27"
5
+ __version__ = "1.29"
@@ -165,7 +165,7 @@ class PubChemEtlWorkflow(object):
165
165
  birdUrlTarget = kwargs.get("birdUrlTarget", None)
166
166
  ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
167
167
  numProcChemComp = kwargs.get("numProcChemComp", 8)
168
- numProc = kwargs.get("numProc", 2)
168
+ numProc = kwargs.get("numProc", 4)
169
169
  rebuildChemIndices = kwargs.get("rebuildChemIndices", True)
170
170
  exportPath = kwargs.get("exportPath", None)
171
171
  useStash = kwargs.get("useStash", True)
@@ -209,7 +209,7 @@ class PubChemEtlWorkflow(object):
209
209
  try:
210
210
  ok1 = ok2 = ok3 = ok4 = ok5 = ok6 = False
211
211
  # --
212
- numProc = kwargs.get("numProc", 2)
212
+ numProc = kwargs.get("numProc", 4)
213
213
  useStash = kwargs.get("useStash", True)
214
214
  useGit = kwargs.get("useGit", False)
215
215
  #
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: rcsb.exdb
3
- Version: 1.27
3
+ Version: 1.29
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
@@ -16,25 +16,20 @@ Classifier: Programming Language :: Python :: 3.9
16
16
  Classifier: Programming Language :: Python :: 3.10
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
- Requires-Dist: OpenEye-toolkits>=2024.1.1
20
19
  Requires-Dist: numpy
21
20
  Requires-Dist: jsonschema>=2.6.0
22
21
  Requires-Dist: rcsb.utils.io>=1.48
23
- Requires-Dist: rcsb.db>=1.725
24
- Requires-Dist: rcsb.utils.chem>=0.79
22
+ Requires-Dist: rcsb.db>=1.800
23
+ Requires-Dist: rcsb.utils.chem>=0.81
25
24
  Requires-Dist: rcsb.utils.chemref>=0.91
26
- Requires-Dist: rcsb.utils.citation>=0.22
27
25
  Requires-Dist: rcsb.utils.config>=0.40
28
26
  Requires-Dist: rcsb.utils.ec>=0.25
29
27
  Requires-Dist: rcsb.utils.go>=0.18
30
28
  Requires-Dist: rcsb.utils.seq>=0.82
31
- Requires-Dist: rcsb.utils.seqalign>=0.31
32
29
  Requires-Dist: rcsb.utils.targets>=0.82
33
30
  Requires-Dist: rcsb.utils.struct>=0.47
34
31
  Requires-Dist: rcsb.utils.taxonomy>=0.43
35
32
  Requires-Dist: rcsb.utils.dictionary>=1.27
36
- Requires-Dist: rcsb.workflow>=0.46
37
- Requires-Dist: statistics; python_version < "3.0"
38
33
  Provides-Extra: dev
39
34
  Requires-Dist: check-manifest; extra == "dev"
40
35
  Provides-Extra: test
@@ -46,6 +41,7 @@ Dynamic: description
46
41
  Dynamic: description-content-type
47
42
  Dynamic: home-page
48
43
  Dynamic: license
44
+ Dynamic: license-file
49
45
  Dynamic: provides-extra
50
46
  Dynamic: requires-dist
51
47
  Dynamic: summary
@@ -115,57 +111,3 @@ install this system. Once HomeBrew is installed, you can further install the
115
111
  [MongoDB](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-os-x/) packages which
116
112
  are required to support the ExDB tools. HomeBrew also provides a variety of options for
117
113
  managing [Python virtual environments](https://gist.github.com/Geoyi/f55ed54d24cc9ff1c14bd95fac21c042).
118
-
119
- ### Command Line Interfaces
120
-
121
- A convenience CLI `exdb_exec_cli` is provided for performing update and loading operations.
122
-
123
- ```bash
124
- exdb_exec_cli --help
125
-
126
- usage: exdb_exec_cli [-h] [--data_set_id DATA_SET_ID] [--full] [--etl_chemref]
127
- [--etl_tree_node_lists] [--config_path CONFIG_PATH]
128
- [--config_name CONFIG_NAME] [--db_type DB_TYPE]
129
- [--read_back_check] [--num_proc NUM_PROC]
130
- [--chunk_size CHUNK_SIZE]
131
- [--document_limit DOCUMENT_LIMIT] [--debug] [--mock]
132
- [--cache_path CACHE_PATH] [--rebuild_cache]
133
-
134
- optional arguments:
135
- -h, --help show this help message and exit
136
- --data_set_id DATA_SET_ID
137
- Data set identifier (default= 2019_14 for current
138
- week)
139
- --full Fresh full load in a new tables/collections (Default)
140
- --etl_chemref ETL integrated chemical reference data
141
- --etl_tree_node_lists
142
- ETL tree node lists
143
- --config_path CONFIG_PATH
144
- Path to configuration options file
145
- --config_name CONFIG_NAME
146
- Configuration section name
147
- --db_type DB_TYPE Database server type (default=mongo)
148
- --read_back_check Perform read back check on all documents
149
- --num_proc NUM_PROC Number of processes to execute (default=2)
150
- --chunk_size CHUNK_SIZE
151
- Number of files loaded per process
152
- --document_limit DOCUMENT_LIMIT
153
- Load document limit for testing
154
- --debug Turn on verbose logging
155
- --mock Use MOCK repository configuration for testing
156
- --cache_path CACHE_PATH
157
- Top cache path for external and local resource files
158
- --rebuild_cache Rebuild cached files from remote resources
159
- ________________________________________________________________________________
160
-
161
- ```
162
-
163
- For example, to construct and load tree nodes list data collections, the following
164
- command may be used:
165
-
166
- ```bash
167
- exdb_exec_cli --mock --full --etl_tree_node_lists --rebuild_cache \
168
- --cache_path ./CACHE \
169
- --config_path ./rcsb/mock-data/config/dbload-setup-example.yml \
170
- --config_name site_info_configuration >& LOGTREE \
171
- ```
@@ -9,7 +9,6 @@ rcsb/__init__.py
9
9
  rcsb.exdb.egg-info/PKG-INFO
10
10
  rcsb.exdb.egg-info/SOURCES.txt
11
11
  rcsb.exdb.egg-info/dependency_links.txt
12
- rcsb.exdb.egg-info/entry_points.txt
13
12
  rcsb.exdb.egg-info/not-zip-safe
14
13
  rcsb.exdb.egg-info/requires.txt
15
14
  rcsb.exdb.egg-info/top_level.txt
@@ -29,7 +28,6 @@ rcsb/exdb/citation/CitationAdapter.py
29
28
  rcsb/exdb/citation/CitationExtractor.py
30
29
  rcsb/exdb/citation/CitationUtils.py
31
30
  rcsb/exdb/citation/__init__.py
32
- rcsb/exdb/cli/ExDbExec.py
33
31
  rcsb/exdb/cli/__init__.py
34
32
  rcsb/exdb/entry/EntryInfoProvider.py
35
33
  rcsb/exdb/entry/__init__.py
@@ -58,7 +56,6 @@ rcsb/exdb/tests/testCitationExtractor.py
58
56
  rcsb/exdb/tests/testCitationUtils.py
59
57
  rcsb/exdb/tests/testEntryInfoEtlWorkflow.py
60
58
  rcsb/exdb/tests/testEntryInfoProvider.py
61
- rcsb/exdb/tests/testExDbWorkflow.py
62
59
  rcsb/exdb/tests/testGlycanEtlWorkflow.py
63
60
  rcsb/exdb/tests/testGlycanProvider.py
64
61
  rcsb/exdb/tests/testGlycanUtils.py
@@ -89,7 +86,6 @@ rcsb/exdb/utils/ObjectUpdater.py
89
86
  rcsb/exdb/utils/ObjectValidator.py
90
87
  rcsb/exdb/utils/__init__.py
91
88
  rcsb/exdb/wf/EntryInfoEtlWorkflow.py
92
- rcsb/exdb/wf/ExDbWorkflow.py
93
89
  rcsb/exdb/wf/GlycanEtlWorkflow.py
94
90
  rcsb/exdb/wf/PubChemEtlWorkflow.py
95
91
  rcsb/exdb/wf/__init__.py
@@ -1,24 +1,17 @@
1
- OpenEye-toolkits>=2024.1.1
2
1
  numpy
3
2
  jsonschema>=2.6.0
4
3
  rcsb.utils.io>=1.48
5
- rcsb.db>=1.725
6
- rcsb.utils.chem>=0.79
4
+ rcsb.db>=1.800
5
+ rcsb.utils.chem>=0.81
7
6
  rcsb.utils.chemref>=0.91
8
- rcsb.utils.citation>=0.22
9
7
  rcsb.utils.config>=0.40
10
8
  rcsb.utils.ec>=0.25
11
9
  rcsb.utils.go>=0.18
12
10
  rcsb.utils.seq>=0.82
13
- rcsb.utils.seqalign>=0.31
14
11
  rcsb.utils.targets>=0.82
15
12
  rcsb.utils.struct>=0.47
16
13
  rcsb.utils.taxonomy>=0.43
17
14
  rcsb.utils.dictionary>=1.27
18
- rcsb.workflow>=0.46
19
-
20
- [:python_version < "3.0"]
21
- statistics
22
15
 
23
16
  [dev]
24
17
  check-manifest
@@ -1,20 +1,17 @@
1
1
  --extra-index-url https://pypi.anaconda.org/OpenEye/simple
2
- OpenEye-toolkits >= 2024.1.1
2
+ # Above line may be needed despite the OpenEye package not being a direct requirement of this package (it's used by rcsb.utils.chem)
3
+ # OpenEye-toolkits >= 2024.1.1
3
4
  numpy
4
5
  jsonschema >= 2.6.0
5
6
  rcsb.utils.io >= 1.48
6
- rcsb.db >= 1.725
7
- rcsb.utils.chem >= 0.79
7
+ rcsb.db >= 1.800
8
+ rcsb.utils.chem >= 0.81
8
9
  rcsb.utils.chemref >= 0.91
9
- rcsb.utils.citation >= 0.22
10
10
  rcsb.utils.config >= 0.40
11
11
  rcsb.utils.ec >= 0.25
12
12
  rcsb.utils.go >= 0.18
13
13
  rcsb.utils.seq >= 0.82
14
- rcsb.utils.seqalign >= 0.31
15
14
  rcsb.utils.targets >= 0.82
16
15
  rcsb.utils.struct >= 0.47
17
16
  rcsb.utils.taxonomy >= 0.43
18
17
  rcsb.utils.dictionary >= 1.27
19
- rcsb.workflow >= 0.46
20
- statistics; python_version < "3.0"
@@ -47,7 +47,6 @@ setup(
47
47
  "Programming Language :: Python :: 3.9",
48
48
  "Programming Language :: Python :: 3.10",
49
49
  ],
50
- entry_points={"console_scripts": ["exdb_exec_cli=rcsb.exdb.cli.ExDbExec:main"]},
51
50
  #
52
51
  install_requires=packagesRequired[1:],
53
52
  packages=find_packages(exclude=["rcsb.mock-data", "rcsb.exdb.tests-anal", "rcsb.exdb.tests-*", "tests.*"]),
@@ -57,7 +56,7 @@ setup(
57
56
  },
58
57
  #
59
58
  test_suite="rcsb.exdb.tests",
60
- tests_require=["tox"],
59
+ tests_require=["tox", "rcsb.utils.citation >= 0.22"],
61
60
  #
62
61
  # Not configured ...
63
62
  extras_require={"dev": ["check-manifest"], "test": ["coverage"]},