rcsb.exdb 0.94__tar.gz → 0.96__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/HISTORY.txt +2 -0
  2. {rcsb.exdb-0.94/rcsb.exdb.egg-info → rcsb.exdb-0.96}/PKG-INFO +1 -1
  3. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +5 -4
  4. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/chemref/PubChemEtlWrapper.py +12 -7
  5. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +9 -10
  6. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/cli/__init__.py +1 -1
  7. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testGlycanUtils.py +2 -1
  8. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +10 -4
  9. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testPubChemEtlWrapper.py +11 -6
  10. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +2 -2
  11. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tree/TreeNodeListWorker.py +11 -1
  12. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/wf/PubChemEtlWorkflow.py +67 -14
  13. {rcsb.exdb-0.94 → rcsb.exdb-0.96/rcsb.exdb.egg-info}/PKG-INFO +1 -1
  14. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/LICENSE +0 -0
  15. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/MANIFEST.in +0 -0
  16. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/README.md +0 -0
  17. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/__init__.py +0 -0
  18. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/__init__.py +0 -0
  19. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
  20. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/branch/GlycanProvider.py +0 -0
  21. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/branch/GlycanUtils.py +0 -0
  22. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/branch/__init__.py +0 -0
  23. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/chemref/ChemRefEtlWorker.py +0 -0
  24. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/chemref/ChemRefExtractor.py +0 -0
  25. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
  26. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/chemref/__init__.py +0 -0
  27. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/citation/CitationAdapter.py +0 -0
  28. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/citation/CitationExtractor.py +0 -0
  29. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/citation/CitationUtils.py +0 -0
  30. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/citation/__init__.py +0 -0
  31. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/cli/ExDbExec.py +0 -0
  32. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
  33. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/entry/__init__.py +0 -0
  34. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
  35. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
  36. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
  37. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/PolymerEntityExtractor.py +0 -0
  38. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
  39. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
  40. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
  41. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
  42. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
  43. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
  44. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +0 -0
  45. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
  46. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/seq/__init__.py +0 -0
  47. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/__init__.py +0 -0
  48. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
  49. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/fixturePdbxLoader.py +0 -0
  50. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
  51. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
  52. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
  53. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
  54. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
  55. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
  56. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testCitationUtils.py +0 -0
  57. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
  58. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
  59. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testExDbWorkflow.py +0 -0
  60. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
  61. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
  62. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
  63. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testObjectExtractor.py +0 -0
  64. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
  65. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
  66. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testPolymerEntityExtractor.py +0 -0
  67. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
  68. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +0 -0
  69. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
  70. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
  71. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +0 -0
  72. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +0 -0
  73. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
  74. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testTreeNodeListWorker.py +0 -0
  75. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
  76. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
  77. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/tree/__init__.py +0 -0
  78. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
  79. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
  80. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
  81. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
  82. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/utils/ObjectValidator.py +0 -0
  83. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/utils/__init__.py +0 -0
  84. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
  85. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/wf/ExDbWorkflow.py +0 -0
  86. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
  87. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb/exdb/wf/__init__.py +0 -0
  88. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb.exdb.egg-info/SOURCES.txt +0 -0
  89. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
  90. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb.exdb.egg-info/entry_points.txt +0 -0
  91. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb.exdb.egg-info/not-zip-safe +0 -0
  92. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb.exdb.egg-info/requires.txt +0 -0
  93. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/rcsb.exdb.egg-info/top_level.txt +0 -0
  94. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/requirements.txt +0 -0
  95. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/setup.cfg +0 -0
  96. {rcsb.exdb-0.94 → rcsb.exdb-0.96}/setup.py +0 -0
@@ -88,3 +88,5 @@
88
88
  9-Jan-2023 V0.93 Configuration changes to support tox 4
89
89
  9-Mar-2023 V0.94 Update ExDbWorkflow to make use of multiple processors for 'upd_ref_seq' operation;
90
90
  Lower refChunkSize to 10 for requests to UniProt API
91
+ 13-Mar-2023 V0.95 Updates to PubChem workflow to use multiprocess count, disable git stash testing, remove obsolete entries from test data
92
+ 12-Apr-2023 V0.96 Add CARD ontology data to tree builder
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: rcsb.exdb
3
- Version: 0.94
3
+ Version: 0.96
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
@@ -8,6 +8,7 @@
8
8
  # 9-May-2020 jdw separate cache behavior with separate option rebuildChemIndices=True/False
9
9
  # 16-Jul-2020 jdw separate index and reference data management.
10
10
  # 23-Jul-2021 jdw Make PubChemDataCacheProvider a subclass of StashableBase()
11
+ # 15-Mar-2023 aae Update default numProc to 2
11
12
  #
12
13
  ##
13
14
  __docformat__ = "google en"
@@ -228,12 +229,12 @@ class PubChemDataCacheProvider(StashableBase):
228
229
  # --
229
230
  return numUpd
230
231
 
231
- def updateMissing(self, idList, exportPath=None, numProc=1, chunkSize=5):
232
+ def updateMissing(self, idList, exportPath=None, numProc=2, chunkSize=5):
232
233
  """Fetch and load reference data for any missing PubChem ID codes in the input list.
233
234
 
234
235
  Args:
235
236
  idList (list): PubChem ID codes
236
- numProc (int, optional): number of processor to use. Defaults to 1.
237
+ numProc (int, optional): number of processors to use. Defaults to 2.
237
238
  chunkSize (int, optional): chunk size between data store updates. Defaults to 5.
238
239
  exportPath (str, optional): store raw fetched data in this path. Defaults to None.
239
240
 
@@ -252,13 +253,13 @@ class PubChemDataCacheProvider(StashableBase):
252
253
 
253
254
  return ok, failList
254
255
 
255
- def load(self, idList, exportPath=None, numProc=1, chunkSize=5):
256
+ def load(self, idList, exportPath=None, numProc=2, chunkSize=5):
256
257
  """Fetch and load reference data for the input list of PubChem compound codes.
257
258
 
258
259
  Args:
259
260
  idList (list): PubChem ID codes
260
261
  exportPath (str, optional): store raw fetched data in this path. Defaults to None.
261
- numProc (int, optional): number of processor to use. Defaults to 1.
262
+ numProc (int, optional): number of processors to use. Defaults to 2.
262
263
  chunkSize (int, optional): chunk size between data store updates. Defaults to 5.
263
264
 
264
265
 
@@ -4,7 +4,7 @@
4
4
  #
5
5
  #
6
6
  # Updates:
7
- #
7
+ # 14-Mar-2023 aae Updates to use multiprocess count
8
8
  #
9
9
  ##
10
10
  __docformat__ = "google en"
@@ -80,6 +80,8 @@ class PubChemEtlWrapper(object):
80
80
 
81
81
  Args:
82
82
  contentType (str): target content to stash (data|index|identifiers)
83
+ useStash (bool): should stash (Buildlocker) be updated? (default: True)
84
+ useGit (bool): should stash (GitHub) be updated? (default: True)
83
85
  Returns:
84
86
  (bool): True for success or False otherwise
85
87
  """
@@ -117,7 +119,8 @@ class PubChemEtlWrapper(object):
117
119
  rebuildChemIndices (bool, optional): rebuild indices from source (default: False)
118
120
  fetchLimit (int, optional): maximum number of definitions to process (default: None)
119
121
  exportPath(str, optional): path to export raw PubChem search results (default: None)
120
- numProc(int): number processors to include in multiprocessing mode (default: 12)
122
+ numProcChemComp (int, optional): number of processors to include in multiprocessing mode for ChemComp indices (default: 8)
123
+ numProc (int, optional): number of processors to include in multiprocessing mode for PubChem (default: 2)
121
124
 
122
125
  Returns:
123
126
  (bool): True for success or False otherwise
@@ -131,7 +134,8 @@ class PubChemEtlWrapper(object):
131
134
  fetchLimit = kwargs.get("fetchLimit", None)
132
135
  exportPath = kwargs.get("exportPath", None)
133
136
  expireDays = kwargs.get("expireDays", 0)
134
- numProc = kwargs.get("numProc", 12)
137
+ numProcChemComp = kwargs.get("numProcChemComp", 8)
138
+ numProc = kwargs.get("numProc", 2)
135
139
 
136
140
  # -- Update/create mapping index cache ---
137
141
  ok = self.__pcicP.updateMissing(
@@ -143,6 +147,7 @@ class PubChemEtlWrapper(object):
143
147
  exportPath=exportPath,
144
148
  rebuildChemIndices=rebuildChemIndices,
145
149
  fetchLimit=fetchLimit,
150
+ numProcChemComp=numProcChemComp,
146
151
  numProc=numProc,
147
152
  )
148
153
  except Exception as e:
@@ -174,7 +179,7 @@ class PubChemEtlWrapper(object):
174
179
  logger.debug("mapD (%d) extraMapD (%d) %r", len(mapD), len(extraMapD), extraMapD)
175
180
  return mapD, extraMapD
176
181
 
177
- def updateData(self, pcidList, doExport=False):
182
+ def updateData(self, pcidList, doExport=False, numProc=2):
178
183
  """Update PubChem reference data for the input list of compound identifiers.
179
184
 
180
185
  Args:
@@ -186,14 +191,14 @@ class PubChemEtlWrapper(object):
186
191
  ok = False
187
192
  try:
188
193
  exportPath = self.__dirPath if doExport else None
189
- ok, failList = self.__pcdcP.updateMissing(pcidList, exportPath=exportPath)
194
+ ok, failList = self.__pcdcP.updateMissing(pcidList, exportPath=exportPath, numProc=numProc)
190
195
  if failList:
191
196
  logger.info("No data updated for %r", failList)
192
197
  except Exception as e:
193
198
  logger.exception("Failing with %s", str(e))
194
199
  return ok
195
200
 
196
- def updateMatchedData(self, exportRaw=False):
201
+ def updateMatchedData(self, exportRaw=False, numProc=2):
197
202
  """Update PubChem reference data using matched compound identifiers in the current index.
198
203
 
199
204
  Returns:
@@ -203,7 +208,7 @@ class PubChemEtlWrapper(object):
203
208
  try:
204
209
  pcidList = self.getMatches()
205
210
  exportPath = self.__dirPath if exportRaw else None
206
- ok, failList = self.__pcdcP.updateMissing(pcidList, exportPath=exportPath)
211
+ ok, failList = self.__pcdcP.updateMissing(pcidList, exportPath=exportPath, numProc=numProc)
207
212
  if failList:
208
213
  logger.info("No data updated for %r", failList)
209
214
  except Exception as e:
@@ -8,6 +8,7 @@
8
8
  # 9-May-2020 jdw separate cache behavior with separate option rebuildChemIndices=True/False
9
9
  # 16-Jul-2020 jdw separate index and reference data management.
10
10
  # 23-Jul-2021 jdw Make PubChemIndexCacheProvider a subclass of StashableBase()
11
+ # 2-Mar-2023 aae Return correct status from Single proc
11
12
  #
12
13
  ##
13
14
  __docformat__ = "google en"
@@ -291,14 +292,15 @@ class PubChemIndexCacheProvider(StashableBase):
291
292
  # --
292
293
  return numUpd
293
294
 
294
- def updateMissing(self, expireDays=0, fetchLimit=None, updateUnmatched=True, numProc=12, **kwargs):
295
+ def updateMissing(self, expireDays=0, fetchLimit=None, updateUnmatched=True, numProcChemComp=8, numProc=2, **kwargs):
295
296
  """Update match index from object store
296
297
 
297
298
  Args:
298
299
  expireDays (int): expiration days on match data (default 0 meaning none)
299
300
  fetchLimit (int): limit to the number of entry updates performed (None)
300
301
  updateUnmatched (bool): Previously unmatched search definitions will be retried on update (default=True)
301
- numProc (int): for rebuilding local chemical indices the number processors to apply (default=12)
302
+ numProcChemComp (int): number of processors to apply when rebuilding local ChemComp indices (default=8)
303
+ numProc (int): number of processors to apply when rebuilding local PubChem indices (default=2)
302
304
 
303
305
  Returns:
304
306
  bool: True for success or False otherwise
@@ -320,7 +322,7 @@ class PubChemIndexCacheProvider(StashableBase):
320
322
  try:
321
323
  # ---
322
324
  # Get the current indices of source chemical reference data -
323
- ok, ccidxP, ccsidxP = self.__rebuildChemCompSourceIndices(numProc, **kwargs)
325
+ ok, ccidxP, ccsidxP = self.__rebuildChemCompSourceIndices(numProcChemComp, **kwargs)
324
326
  if not ok:
325
327
  return matchD
326
328
  #
@@ -338,8 +340,8 @@ class PubChemIndexCacheProvider(StashableBase):
338
340
  updateIdList = updateIdList[:fetchLimit] if fetchLimit else updateIdList
339
341
  #
340
342
  if updateIdList:
341
- logger.info("Update reference data cache for %d chemical identifers", len(updateIdList))
342
- ok, failList = self.__updateReferenceData(updateIdList, searchIdxD, **kwargs)
343
+ logger.info("Update reference data cache for %d chemical identifiers", len(updateIdList))
344
+ ok, failList = self.__updateReferenceData(updateIdList, searchIdxD, numProc, **kwargs)
343
345
  logger.info("Update reference data return status is %r missing count %d", ok, len(failList))
344
346
  else:
345
347
  logger.info("No reference data updates required")
@@ -498,7 +500,7 @@ class PubChemIndexCacheProvider(StashableBase):
498
500
  objD = obEx.getObjects()
499
501
  return objD
500
502
 
501
- def __updateReferenceData(self, idList, searchIdxD, **kwargs):
503
+ def __updateReferenceData(self, idList, searchIdxD, numProc=2, **kwargs):
502
504
  """Launch worker methods to update chemical reference data correspondences.
503
505
 
504
506
  Args:
@@ -507,7 +509,6 @@ class PubChemIndexCacheProvider(StashableBase):
507
509
  Returns:
508
510
  (bool, list): status flag, list of unmatched identifiers
509
511
  """
510
- numProc = 1
511
512
  chunkSize = 50
512
513
  exportPath = kwargs.get("exportPath", None)
513
514
  logger.info("Length starting list is %d", len(idList))
@@ -522,7 +523,7 @@ class PubChemIndexCacheProvider(StashableBase):
522
523
  else:
523
524
  successList, _, _, _ = rWorker.updateList(idList, "SingleProc", optD, self.__dirPath)
524
525
  failList = list(set(idList) - set(successList))
525
- ok = len(failList) > 0
526
+ ok = len(failList) == 0
526
527
  logger.info("Single-proc status %r failures %r", ok, len(failList))
527
528
  #
528
529
  return ok, failList
@@ -602,8 +603,6 @@ class PubChemIndexCacheProvider(StashableBase):
602
603
  logSizes = kwargs.get("logSizes", False)
603
604
  limitPerceptions = kwargs.get("limitPerceptions", False)
604
605
  #
605
- # numProc = kwargs.get("numProc", 1)
606
- # numProc = self.__numProc
607
606
  chunkSize = kwargs.get("chunkSize", 5)
608
607
  molLimit = kwargs.get("molLimit", None)
609
608
  ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
@@ -2,4 +2,4 @@ __docformat__ = "google en"
2
2
  __author__ = "John Westbrook"
3
3
  __email__ = "john.westbrook@rcsb.org"
4
4
  __license__ = "Apache 2.0"
5
- __version__ = "0.94"
5
+ __version__ = "0.96"
@@ -4,6 +4,7 @@
4
4
  # Date: 25-May-2021
5
5
  #
6
6
  # Update:
7
+ # 9-Feb-2023 aae Fix TOPDIR path
7
8
  ##
8
9
  """
9
10
  Tests for creating glycan accession mapping details.
@@ -22,7 +23,7 @@ from rcsb.exdb.branch.GlycanUtils import GlycanUtils
22
23
  from rcsb.utils.config.ConfigUtil import ConfigUtil
23
24
 
24
25
  HERE = os.path.abspath(os.path.dirname(__file__))
25
- TOPDIR = os.path.dirname(os.path.dirname(HERE))
26
+ TOPDIR = os.path.dirname(os.path.dirname(os.path.dirname(HERE)))
26
27
 
27
28
  logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s]-%(module)s.%(funcName)s: %(message)s")
28
29
  logger = logging.getLogger()
@@ -4,7 +4,7 @@
4
4
  # Date: 29-Jul-2020
5
5
  #
6
6
  # Updates:
7
- #
7
+ # 13-Mar-2023 aae Disable git stash testing
8
8
  ##
9
9
  """
10
10
  Tests for PubChem ETL workflow methods
@@ -51,6 +51,10 @@ class PubChemEtlWorkflowTests(unittest.TestCase):
51
51
  self.__birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif")
52
52
  self.__ccFileNamePrefix = "cc-abbrev"
53
53
  #
54
+ # This tests pushing files to the stash
55
+ self.__testStashServer = True
56
+ self.__testStashGit = False
57
+ #
54
58
  self.__startTime = time.time()
55
59
  logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
56
60
 
@@ -71,8 +75,10 @@ class PubChemEtlWorkflowTests(unittest.TestCase):
71
75
  ccUrlTarget=self.__ccUrlTarget,
72
76
  birdUrlTarget=self.__birdUrlTarget,
73
77
  ccFileNamePrefix=self.__ccFileNamePrefix,
74
- numProc=4,
78
+ numProcChemComp=4,
75
79
  rebuildChemIndices=True,
80
+ useStash=self.__testStashServer,
81
+ useGit=self.__testStashGit
76
82
  )
77
83
  self.assertTrue(ok)
78
84
  except Exception as e:
@@ -84,7 +90,7 @@ class PubChemEtlWorkflowTests(unittest.TestCase):
84
90
  try:
85
91
  # --
86
92
  pcewP = PubChemEtlWorkflow(configPath=self.__configPath, configName=self.__configName, cachePath=self.__cachePath)
87
- ok = pcewP.dump()
93
+ ok = pcewP.dump(useStash=self.__testStashServer, useGit=self.__testStashGit)
88
94
  self.assertTrue(ok)
89
95
  except Exception as e:
90
96
  logger.exception("Failing with %s", str(e))
@@ -106,7 +112,7 @@ class PubChemEtlWorkflowTests(unittest.TestCase):
106
112
  try:
107
113
  # --
108
114
  pcewP = PubChemEtlWorkflow(configPath=self.__configPath, configName=self.__configName, cachePath=self.__cachePath)
109
- ok = pcewP.updateMatchedData()
115
+ ok = pcewP.updateMatchedData(useStash=self.__testStashServer, useGit=self.__testStashGit)
110
116
  self.assertTrue(ok)
111
117
  except Exception as e:
112
118
  logger.exception("Failing with %s", str(e))
@@ -4,7 +4,8 @@
4
4
  # Date: 20-Jul-2020
5
5
  #
6
6
  # Updates:
7
- #
7
+ # 13-Mar-2023 aae Updates to use multiprocess count, disable git stash testing,
8
+ # Fix tests after removing obsolete entries from test data
8
9
  ##
9
10
  """
10
11
  Tests for PubChem ETL wrapper methods
@@ -56,11 +57,15 @@ class PubChemEtlWrapperTests(unittest.TestCase):
56
57
  # These are test source files for chemical component/BIRD indices
57
58
  self.__ccUrlTarget = os.path.join(self.__dataPath, "components-abbrev.cif")
58
59
  self.__birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif")
59
- self.__numComponents = 30
60
+ self.__numComponents = 25
60
61
  self.__numSelectMatches = 23
61
62
  self.__numAltMatches = 2
62
63
  self.__numTotalMatches = 50
63
64
  #
65
+ # This tests pushing files to the stash
66
+ self.__testStashServer = True
67
+ self.__testStashGit = False
68
+ #
64
69
  self.__startTime = time.time()
65
70
  logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
66
71
 
@@ -83,7 +88,7 @@ class PubChemEtlWrapperTests(unittest.TestCase):
83
88
  ccFileNamePrefix="cc-abbrev",
84
89
  exportPath=self.__dirPath,
85
90
  rebuildChemIndices=True,
86
- numProc=4,
91
+ numProcChemComp=4,
87
92
  )
88
93
  self.assertTrue(ok)
89
94
  #
@@ -97,7 +102,7 @@ class PubChemEtlWrapperTests(unittest.TestCase):
97
102
  #
98
103
  ok = pcewP.dump(contentType="index")
99
104
  self.assertTrue(ok)
100
- ok = pcewP.toStash(contentType="index")
105
+ ok = pcewP.toStash(contentType="index", useStash=self.__testStashServer, useGit=self.__testStashGit)
101
106
  self.assertTrue(ok)
102
107
  except Exception as e:
103
108
  logger.exception("Failing with %s", str(e))
@@ -125,13 +130,13 @@ class PubChemEtlWrapperTests(unittest.TestCase):
125
130
  self.assertTrue(ok)
126
131
  ok = pcewP.dump(contentType="data")
127
132
  self.assertTrue(ok)
128
- ok = pcewP.toStash(contentType="data")
133
+ ok = pcewP.toStash(contentType="data", useStash=self.__testStashServer, useGit=self.__testStashGit)
129
134
  self.assertTrue(ok)
130
135
  ok = pcewP.updateIdentifiers()
131
136
  self.assertTrue(ok)
132
137
  ok = pcewP.dump(contentType="identifiers")
133
138
  self.assertTrue(ok)
134
- ok = pcewP.toStash(contentType="identifiers")
139
+ ok = pcewP.toStash(contentType="identifiers", useStash=self.__testStashServer, useGit=self.__testStashGit)
135
140
  self.assertTrue(ok)
136
141
  except Exception as e:
137
142
  logger.exception("Failing with %s", str(e))
@@ -4,7 +4,7 @@
4
4
  # Date: 16-Jul-2020
5
5
  #
6
6
  # Updates:
7
- #
7
+ # 13-Mar-2023 aae Fix tests after removing obsolete entries from test data
8
8
  ##
9
9
  """
10
10
  Tests for PubChem index cache maintenance operations
@@ -66,7 +66,7 @@ class PubChemIndexCacheProviderTests(unittest.TestCase):
66
66
  """Test case - search, backup, restore and select PubChem correspondences for reference chemical definitions."""
67
67
  try:
68
68
  # -- Update/create mapping index cache ---
69
- numObj = 30
69
+ numObj = 25
70
70
  pcicP = PubChemIndexCacheProvider(self.__cfgOb, self.__cachePath)
71
71
  pcicP.updateMissing(
72
72
  expireDays=0,
@@ -6,7 +6,8 @@
6
6
  #
7
7
  # Updates:
8
8
  # 9-Sep-2019 jdw add AtcProvider() and ChemrefExtractor() for ATC tree.
9
- # JDW TODO TEST
9
+ # 12-Apr-2023 dwp add CARD ontology tree
10
+ #
10
11
  ##
11
12
  __docformat__ = "google en"
12
13
  __author__ = "John Westbrook"
@@ -23,6 +24,7 @@ from rcsb.exdb.seq.AnnotationExtractor import AnnotationExtractor
23
24
  from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor
24
25
  from rcsb.utils.chemref.AtcProvider import AtcProvider
25
26
  from rcsb.utils.ec.EnzymeDatabaseProvider import EnzymeDatabaseProvider
27
+ from rcsb.utils.targets.CARDTargetOntologyProvider import CARDTargetOntologyProvider
26
28
  from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
27
29
  from rcsb.utils.struct.CathClassificationProvider import CathClassificationProvider
28
30
  from rcsb.utils.struct.EcodClassificationProvider import EcodClassificationProvider
@@ -172,6 +174,14 @@ class TreeNodeListWorker(object):
172
174
  collectionName = "tree_ec_node_list"
173
175
  ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["update_id"], keyNames=None, addValues=addValues, schemaLevel=None)
174
176
  self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
177
+ # ---- CARD
178
+ cou = CARDTargetOntologyProvider(cachePath=self.__cachePath, useCache=False)
179
+ nL = cou.getTreeNodeList()
180
+ logger.info("Starting load of EC node tree length %d", len(nL))
181
+ if doLoad:
182
+ collectionName = "tree_card_node_list"
183
+ ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["update_id"], keyNames=None, addValues=addValues, schemaLevel=None)
184
+ self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
175
185
  # ---- Taxonomy
176
186
  # Get the taxon coverage in the current data set -
177
187
  epe = TaxonomyExtractor(self.__cfgOb)
@@ -5,7 +5,7 @@
5
5
  # Workflow wrapper -- PubChem ETL utilities
6
6
  #
7
7
  # Updates:
8
- #
8
+ # 13-Mar-2023 aae Updates to use multiprocess count, disable git stash testing
9
9
  ##
10
10
  __docformat__ = "google en"
11
11
  __author__ = "John Westbrook"
@@ -47,23 +47,48 @@ class PubChemEtlWorkflow(object):
47
47
  logger.setLevel(logging.DEBUG)
48
48
  #
49
49
 
50
- def dump(self):
51
- """Dump the current object store of PubChem correspondences and data."""
50
+ def dump(self, **kwargs):
51
+ """Dump the current object store of PubChem correspondences and data.
52
+
53
+ Args:
54
+ useStash (bool): should stash (Buildlocker) be updated? (default: True)
55
+ useGit (bool): should stash (GitHub) be updated? (default: True)
56
+
57
+ Returns:
58
+ (bool): True for success or False otherwise
59
+
60
+ """
52
61
  ok1 = ok2 = ok3 = ok4 = False
53
62
  try:
63
+ useStash = kwargs.get("useStash", True)
64
+ useGit = kwargs.get("useGit", True) # Revisit stashing in GitHub as file timestamp will always cause a commit
54
65
  # -- Update local chemical indices and create PubChem mapping index ---
55
66
  pcewP = PubChemEtlWrapper(self.__cfgOb, self.__cachePath, stashRemotePrefix=self.__stashRemotePrefix)
56
67
  sTime = time.time()
57
68
  logger.info("Dumping index data")
58
69
  ok1 = pcewP.dump(contentType="index")
59
- ok2 = pcewP.toStash(contentType="index")
60
70
  eTime = time.time()
61
71
  logger.info("Dumping index data done in (%.4f seconds)", eTime - sTime)
72
+ if useGit or useStash:
73
+ sTime = time.time()
74
+ logger.info("Stashing index data")
75
+ ok2 = pcewP.toStash(contentType="index", useStash=useStash, useGit=useGit)
76
+ eTime = time.time()
77
+ logger.info("Stashing index data done in (%.4f seconds)", eTime - sTime)
78
+ else:
79
+ ok2 = True
62
80
 
63
81
  sTime = time.time()
64
82
  logger.info("Dumping reference data")
65
83
  ok3 = pcewP.dump(contentType="data")
66
- ok4 = pcewP.toStash(contentType="data")
84
+ if useGit or useStash:
85
+ sTime = time.time()
86
+ logger.info("Stashing reference data")
87
+ ok4 = pcewP.toStash(contentType="data", useStash=useStash, useGit=useGit)
88
+ eTime = time.time()
89
+ logger.info("Stashing reference data done in (%.4f seconds)", eTime - sTime)
90
+ else:
91
+ ok4 = True
67
92
  eTime = time.time()
68
93
  logger.info("Dumping data done in (%.4f seconds)", eTime - sTime)
69
94
  except Exception as e:
@@ -77,7 +102,7 @@ class PubChemEtlWorkflow(object):
77
102
  # -- Update local chemical indices and create PubChem mapping index ---
78
103
  pcewP = PubChemEtlWrapper(self.__cfgOb, self.__cachePath, stashRemotePrefix=self.__stashRemotePrefix)
79
104
  sTime = time.time()
80
- ok1 = pcewP.toStash(contentType="index")
105
+ ok1 = pcewP. toStash(contentType="index")
81
106
  eTime = time.time()
82
107
  logger.info("Stashing index data done in (%.4f seconds)", eTime - sTime)
83
108
 
@@ -122,8 +147,11 @@ class PubChemEtlWorkflow(object):
122
147
  birdUrlTarget (str, optional): target url for bird dictionary resource file (cc format) (default: None=all public)
123
148
  ccFileNamePrefix (str, optional): index file prefix (default: full)
124
149
  rebuildChemIndices (bool, optional): rebuild indices from source (default: False)
125
- exportPath(str, optional): path to export raw PubChem search results (default: None)
126
- numProc(int): number processors to include in multiprocessing mode (default: 12)
150
+ exportPath (str, optional): path to export raw PubChem search results (default: None)
151
+ numProcChemComp (int, optional): number of processors to include in multiprocessing mode for ChemComp indices (default: 8)
152
+ numProc (int, optional): number of processors to include in multiprocessing mode for PubChem (default: 2)
153
+ useStash (bool, optional): should stash (Buildlocker) be updated? (default: True)
154
+ useGit (bool, optional): should stash (GitHub) be updated? (default: True)
127
155
 
128
156
  Returns:
129
157
  (bool): True for success or False otherwise
@@ -135,9 +163,12 @@ class PubChemEtlWorkflow(object):
135
163
  ccUrlTarget = kwargs.get("ccUrlTarget", None)
136
164
  birdUrlTarget = kwargs.get("birdUrlTarget", None)
137
165
  ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc-full")
138
- numProc = kwargs.get("numProc", 12)
166
+ numProcChemComp = kwargs.get("numProcChemComp", 8)
167
+ numProc = kwargs.get("numProc", 2)
139
168
  rebuildChemIndices = kwargs.get("rebuildChemIndices", True)
140
169
  exportPath = kwargs.get("exportPath", None)
170
+ useStash = kwargs.get("useStash", True)
171
+ useGit = kwargs.get("useGit", True)
141
172
  #
142
173
  pcewP = PubChemEtlWrapper(self.__cfgOb, self.__cachePath, stashRemotePrefix=self.__stashRemotePrefix)
143
174
  ok1 = pcewP.updateIndex(
@@ -146,30 +177,52 @@ class PubChemEtlWorkflow(object):
146
177
  ccFileNamePrefix=ccFileNamePrefix,
147
178
  exportPath=exportPath,
148
179
  rebuildChemIndices=rebuildChemIndices,
180
+ numProcChemComp=numProcChemComp,
149
181
  numProc=numProc,
150
182
  )
151
183
  ok2 = pcewP.dump(contentType="index")
152
- ok3 = pcewP.toStash(contentType="index")
184
+ if useGit or useStash:
185
+ ok3 = pcewP.toStash(contentType="index", useStash=useStash, useGit=useGit)
186
+ else:
187
+ ok3 = True
153
188
  except Exception as e:
154
189
  logger.exception("Failing with %s", str(e))
155
190
  #
156
191
  return ok1 and ok2 and ok3
157
192
 
158
- def updateMatchedData(self):
193
+ def updateMatchedData(self, **kwargs):
159
194
  """Update PubChem annotation data for matched correspondences. Generate and stash
160
195
  related identifiers for corresponding components and BIRD chemical definitions.
196
+
197
+ Args:
198
+ numProc (int): number of processors to include in multiprocessing mode (default: 2)
199
+ useStash (bool): should stash (Buildlocker) be updated? (default: True)
200
+ useGit (bool): should stash (GitHub) be updated? (default: True)
201
+
202
+ Returns:
203
+ (bool): True for success or False otherwise
161
204
  """
162
205
  try:
163
206
  ok1 = ok2 = ok3 = ok4 = ok5 = ok6 = False
164
207
  # --
208
+ numProc = kwargs.get("numProc", 2)
209
+ useStash = kwargs.get("useStash", True)
210
+ useGit = kwargs.get("useGit", True)
211
+ #
165
212
  pcewP = PubChemEtlWrapper(self.__cfgOb, self.__cachePath, stashRemotePrefix=self.__stashRemotePrefix)
166
- ok1 = pcewP.updateMatchedData()
213
+ ok1 = pcewP.updateMatchedData(numProc=numProc)
167
214
  ok2 = pcewP.dump(contentType="data")
168
- ok3 = pcewP.toStash(contentType="data")
215
+ if useGit or useStash:
216
+ ok3 = pcewP.toStash(contentType="data", useStash=useStash, useGit=useGit)
217
+ else:
218
+ ok3 = True
169
219
  #
170
220
  ok4 = pcewP.updateIdentifiers()
171
221
  ok5 = pcewP.dump(contentType="identifiers")
172
- ok6 = pcewP.toStash(contentType="identifiers")
222
+ if useGit or useStash:
223
+ ok6 = pcewP.toStash(contentType="identifiers", useStash=useStash, useGit=useGit)
224
+ else:
225
+ ok6 = True
173
226
  #
174
227
  except Exception as e:
175
228
  logger.exception("Failing with %s", str(e))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: rcsb.exdb
3
- Version: 0.94
3
+ Version: 0.96
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes