rcsb.exdb 1.29__tar.gz → 1.30__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/HISTORY.txt +1 -0
  2. {rcsb_exdb-1.29/rcsb.exdb.egg-info → rcsb_exdb-1.30}/PKG-INFO +3 -3
  3. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefEtlWorker.py +9 -6
  4. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefExtractor.py +3 -2
  5. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/cli/__init__.py +1 -1
  6. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/fixturePdbxLoader.py +11 -4
  7. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectExtractor.py +2 -2
  8. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAnnotationAdapter.py +1 -0
  9. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testTreeNodeListWorker.py +1 -1
  10. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tree/TreeNodeListWorker.py +87 -127
  11. {rcsb_exdb-1.29 → rcsb_exdb-1.30/rcsb.exdb.egg-info}/PKG-INFO +3 -3
  12. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/requires.txt +2 -2
  13. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/requirements.txt +2 -2
  14. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/LICENSE +0 -0
  15. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/MANIFEST.in +0 -0
  16. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/README.md +0 -0
  17. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/__init__.py +0 -0
  18. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/__init__.py +0 -0
  19. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/BranchedEntityExtractor.py +0 -0
  20. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/GlycanProvider.py +0 -0
  21. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/GlycanUtils.py +0 -0
  22. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/branch/__init__.py +0 -0
  23. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/ChemRefMappingProvider.py +0 -0
  24. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemDataCacheProvider.py +0 -0
  25. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemEtlWrapper.py +0 -0
  26. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/PubChemIndexCacheProvider.py +0 -0
  27. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/chemref/__init__.py +0 -0
  28. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationAdapter.py +0 -0
  29. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationExtractor.py +0 -0
  30. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/CitationUtils.py +0 -0
  31. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/citation/__init__.py +0 -0
  32. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/entry/EntryInfoProvider.py +0 -0
  33. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/entry/__init__.py +0 -0
  34. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/AnnotationExtractor.py +0 -0
  35. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/LigandNeighborMappingExtractor.py +0 -0
  36. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/LigandNeighborMappingProvider.py +0 -0
  37. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/PolymerEntityExtractor.py +0 -0
  38. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAnnotationAdapter.py +0 -0
  39. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAnnotationProvider.py +0 -0
  40. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAssignmentAdapter.py +0 -0
  41. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceAssignmentProvider.py +0 -0
  42. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/ReferenceSequenceCacheProvider.py +0 -0
  43. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/TaxonomyExtractor.py +0 -0
  44. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/UniProtCoreEtlWorker.py +0 -0
  45. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/UniProtExtractor.py +0 -0
  46. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/seq/__init__.py +0 -0
  47. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/__init__.py +0 -0
  48. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/fixtureDictMethodResourceProvider.py +0 -0
  49. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testAnnotationExtractor.py +0 -0
  50. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testBranchedEntityExtractor.py +0 -0
  51. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testChemRefLoader.py +0 -0
  52. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testChemRefMappingProvider.py +0 -0
  53. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationAdapter.py +0 -0
  54. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationExtractor.py +0 -0
  55. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testCitationUtils.py +0 -0
  56. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testEntryInfoEtlWorkflow.py +0 -0
  57. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testEntryInfoProvider.py +0 -0
  58. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanEtlWorkflow.py +0 -0
  59. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanProvider.py +0 -0
  60. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testGlycanUtils.py +0 -0
  61. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testLigandNeighborMappingProvider.py +0 -0
  62. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectTransformer.py +0 -0
  63. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testObjectUpdater.py +0 -0
  64. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPolymerEntityExtractor.py +0 -0
  65. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemDataCacheProvider.py +0 -0
  66. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemEtlWorkflow.py +0 -0
  67. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemEtlWrapper.py +0 -0
  68. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testPubChemIndexCacheProvider.py +0 -0
  69. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapter.py +0 -0
  70. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentAdapterValidate.py +0 -0
  71. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceAssignmentProvider.py +0 -0
  72. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testReferenceSequenceCacheProvider.py +0 -0
  73. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testTaxonomyExtractor.py +0 -0
  74. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testUniProtCoreEtlWorker.py +0 -0
  75. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tests/testUniProtExtractor.py +0 -0
  76. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/tree/__init__.py +0 -0
  77. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectAdapterBase.py +0 -0
  78. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectExtractor.py +0 -0
  79. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectTransformer.py +0 -0
  80. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectUpdater.py +0 -0
  81. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/ObjectValidator.py +0 -0
  82. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/utils/__init__.py +0 -0
  83. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/EntryInfoEtlWorkflow.py +0 -0
  84. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/GlycanEtlWorkflow.py +0 -0
  85. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/PubChemEtlWorkflow.py +0 -0
  86. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb/exdb/wf/__init__.py +0 -0
  87. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/SOURCES.txt +0 -0
  88. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/dependency_links.txt +0 -0
  89. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/not-zip-safe +0 -0
  90. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/rcsb.exdb.egg-info/top_level.txt +0 -0
  91. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/setup.cfg +0 -0
  92. {rcsb_exdb-1.29 → rcsb_exdb-1.30}/setup.py +0 -0
@@ -111,3 +111,4 @@
111
111
  23-Jan-2025 V1.27 Update TreeNodeListWorker to index 'id' field
112
112
  11-Feb-2025 V1.28 Move ExDB CLI code (workflow, exec, and tests) and Dockerfile to rcsb.workflow to avoid circular imports
113
113
  8-Apr-2025 V1.29 Add more logging to PubChemIndexCacheProvider and increase default numProc
114
+ 2-Oct-2025 V1.30 Make use of ExDB configuration file for loading drugbank and tree node list DBs/collections and setting indexed fields
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rcsb.exdb
3
- Version: 1.29
3
+ Version: 1.30
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
@@ -19,8 +19,8 @@ License-File: LICENSE
19
19
  Requires-Dist: numpy
20
20
  Requires-Dist: jsonschema>=2.6.0
21
21
  Requires-Dist: rcsb.utils.io>=1.48
22
- Requires-Dist: rcsb.db>=1.800
23
- Requires-Dist: rcsb.utils.chem>=0.81
22
+ Requires-Dist: rcsb.db>=1.808
23
+ Requires-Dist: rcsb.utils.chem>=0.84
24
24
  Requires-Dist: rcsb.utils.chemref>=0.91
25
25
  Requires-Dist: rcsb.utils.config>=0.40
26
26
  Requires-Dist: rcsb.utils.ec>=0.25
@@ -7,6 +7,8 @@
7
7
  # Updates:
8
8
  # 9-Dec-2018 jdw add validation methods
9
9
  # 3-Sep-2019 jdw move to rcsb.exdb.chemref
10
+ # 7-Aug-2025 dwp change target DB and collection from "drugbank_core" to "dw" and "core_drugbank" (as part of transition to DW);
11
+ # make use of configuration file for loading drugbank collection and setting indexed fields
10
12
  #
11
13
  ##
12
14
  __docformat__ = "google en"
@@ -66,9 +68,10 @@ class ChemRefEtlWorker(object):
66
68
  desp = DataExchangeStatus()
67
69
  statusStartTimestamp = desp.setStartTime()
68
70
  addValues = {}
71
+ collectionGroupName = "core_drugbank"
69
72
  #
70
73
  if extResource == "DrugBank":
71
- databaseName = "drugbank_core"
74
+ databaseNameMongo = self.__schP.getDatabaseMongoName(collectionGroupName=collectionGroupName)
72
75
  configName = self.__cfgOb.getDefaultSectionName()
73
76
  user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
74
77
  pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
@@ -81,10 +84,10 @@ class ChemRefEtlWorker(object):
81
84
  #
82
85
  logger.info("Resource %r extracted mapped document length %d", extResource, len(dList))
83
86
  logger.debug("Objects %r", dList[:2])
84
- sD, _, collectionList, _ = self.__schP.getSchemaInfo(databaseName)
87
+ _, _, collectionList, docIndexD = self.__schP.getSchemaInfo(collectionGroupName=collectionGroupName)
85
88
  collectionName = collectionList[0] if collectionList else "unassigned"
86
- indexL = sD.getDocumentIndex(collectionName, "primary")
87
- logger.info("Database %r collection %r index attributes %r", databaseName, collectionName, indexL)
89
+ indexDL = docIndexD[collectionName] if collectionName in docIndexD else []
90
+ logger.info("Database %r collection %r index attributes %r", databaseNameMongo, collectionName, indexDL)
88
91
  #
89
92
  # For some reason, 'addValues' was being overwritten with an empty dict (https://github.com/rcsb/py-rcsb_exdb/commit/26bd79e9a2fffc97c034b4116dece9248d1c1f39)
90
93
  # Will need to review this -- do we want to add the schema version values or not? (Also, see similar logic in UniProtCoreEtlWorker.py)
@@ -103,8 +106,8 @@ class ChemRefEtlWorker(object):
103
106
  readBackCheck=self.__readBackCheck,
104
107
  )
105
108
  #
106
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=indexL, keyNames=None, addValues=addValues)
107
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
109
+ ok = dl.load(databaseNameMongo, collectionName, loadType=loadType, documentList=dList, keyNames=None, addValues=addValues, indexDL=indexDL)
110
+ self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
108
111
 
109
112
  return True
110
113
  except Exception as e:
@@ -7,6 +7,7 @@
7
7
  # Updates:
8
8
  # 7-Jan-2019 jdw moved from ChemRefEtlWorker.
9
9
  # 3-Sep-2019 jdw moved again to module rcsb.exdb.chemref
10
+ # 14-Aug-2025 dwp rename bird_chem_comp_core to core_chem_comp
10
11
  #
11
12
  ##
12
13
  __docformat__ = "google en"
@@ -42,8 +43,8 @@ class ChemRefExtractor(object):
42
43
  """
43
44
  idD = {}
44
45
  try:
45
- databaseName = "bird_chem_comp_core"
46
- collectionName = "bird_chem_comp_core"
46
+ databaseName = "dw"
47
+ collectionName = "core_chem_comp"
47
48
  selectD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
48
49
  selectionList = ["rcsb_id", "rcsb_chem_comp_related"]
49
50
  logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
@@ -2,4 +2,4 @@ __docformat__ = "google en"
2
2
  __author__ = "John Westbrook"
3
3
  __email__ = "john.westbrook@rcsb.org"
4
4
  __license__ = "Apache 2.0"
5
- __version__ = "1.29"
5
+ __version__ = "1.30"
@@ -162,7 +162,9 @@ class PdbxLoaderFixture(unittest.TestCase):
162
162
  ]
163
163
  self.__ldList = [
164
164
  {
165
- "databaseName": "bird_chem_comp_core",
165
+ # "databaseName": "dw",
166
+ "collectionGroupName": "core_chem_comp",
167
+ "contentType": "bird_chem_comp_core",
166
168
  "collectionNameList": None,
167
169
  "loadType": "full",
168
170
  "mergeContentTypes": None,
@@ -170,7 +172,9 @@ class PdbxLoaderFixture(unittest.TestCase):
170
172
  "inputIdCodeList": self.__birdChemCompCoreIdList
171
173
  },
172
174
  {
173
- "databaseName": "pdbx_core",
175
+ # "databaseName": "pdbx_core",
176
+ "collectionGroupName": "pdbx_core",
177
+ "contentType": "pdbx_core",
174
178
  "collectionNameList": None,
175
179
  "loadType": "replace",
176
180
  "mergeContentTypes": ["vrpt"],
@@ -179,6 +183,8 @@ class PdbxLoaderFixture(unittest.TestCase):
179
183
  },
180
184
  # {
181
185
  # "databaseName": "pdbx_comp_model_core",
186
+ # "collectionGroupName": "pdbx_comp_model_core",
187
+ # "contentType": "pdbx_comp_model_core",
182
188
  # "collectionNameList": None,
183
189
  # "loadType": "full",
184
190
  # "mergeContentTypes": None,
@@ -220,7 +226,7 @@ class PdbxLoaderFixture(unittest.TestCase):
220
226
  """Wrapper for the PDBx loader module"""
221
227
  ok = False
222
228
  try:
223
- logger.info("Loading %s", kwargs["databaseName"])
229
+ logger.info("Loading %s", kwargs["collectionGroupName"])
224
230
  mw = PdbxLoader(
225
231
  self.__cfgOb,
226
232
  cachePath=self.__cachePath,
@@ -235,8 +241,9 @@ class PdbxLoaderFixture(unittest.TestCase):
235
241
  rebuildSchemaFlag=False,
236
242
  )
237
243
  ok = mw.load(
238
- kwargs["databaseName"],
244
+ collectionGroupName=kwargs["collectionGroupName"],
239
245
  collectionLoadList=kwargs["collectionNameList"],
246
+ contentType=kwargs["contentType"],
240
247
  loadType=kwargs["loadType"],
241
248
  inputPathList=None,
242
249
  inputIdCodeList=kwargs["inputIdCodeList"],
@@ -81,8 +81,8 @@ class ObjectExtractorTests(unittest.TestCase):
81
81
  try:
82
82
  obEx = ObjectExtractor(
83
83
  self.__cfgOb,
84
- databaseName="bird_chem_comp_core",
85
- collectionName="bird_chem_comp_core",
84
+ databaseName="dw",
85
+ collectionName="core_chem_comp",
86
86
  cacheFilePath=os.path.join(self.__workPath, "drugbank-mapping-cache.json"),
87
87
  useCache=False,
88
88
  cacheKwargs=self.__testEntryCacheKwargs,
@@ -60,6 +60,7 @@ class ReferenceSequenceAnnotationAdapterTests(unittest.TestCase):
60
60
  endTime = time.time()
61
61
  logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
62
62
 
63
+ # NOTE: IF YOU DISABLE THE TEST BELOW, THEN 'testReferenceCacheProvider' FAILS. CHECK WHETHER ALL 'Reference' PROVIDERS CAN BE DISABLED.
63
64
  # @unittest.skip("Disable test - no longer using in production, and fails too frequently with 'Bad xml text' when fetching from UniProt")
64
65
  def testAnnotationAdapter(self):
65
66
  """Test case - create and read cache reference sequences assignments and related data."""
@@ -1,5 +1,5 @@
1
1
  ##
2
- # File: TreeNodeListWorkerTests.py
2
+ # File: testTreeNodeListWorker.py
3
3
  # Author: J. Westbrook
4
4
  # Date: 23-Apr-2019
5
5
  #
@@ -10,6 +10,8 @@
10
10
  # 8-Aug-2023 dwp Load full (unfiltered) taxonomy tree node list, and stop loading GO tree (will be loaded in DW instead)
11
11
  # 27-Aug-2024 dwp Update CARD ontology tree loading
12
12
  # 23-Jan-2025 dwp Change indexed field from 'update_id' to 'id'
13
+ # 7-Aug-2025 dwp Change target DB and collection names to "dw" and "tree_*" (via configuration file);
14
+ # Make use of configuration file for loading tree node lists and setting indexed fields
13
15
  #
14
16
  ##
15
17
  __docformat__ = "google en"
@@ -32,8 +34,6 @@ from rcsb.utils.struct.ScopClassificationProvider import ScopClassificationProvi
32
34
  from rcsb.utils.struct.Scop2ClassificationProvider import Scop2ClassificationProvider
33
35
  from rcsb.utils.taxonomy.TaxonomyProvider import TaxonomyProvider
34
36
  from rcsb.exdb.seq.TaxonomyExtractor import TaxonomyExtractor
35
- # from rcsb.utils.go.GeneOntologyProvider import GeneOntologyProvider
36
- # from rcsb.exdb.seq.AnnotationExtractor import AnnotationExtractor
37
37
 
38
38
  logger = logging.getLogger(__name__)
39
39
 
@@ -76,37 +76,28 @@ class TreeNodeListWorker(object):
76
76
  Relevant configuration options:
77
77
 
78
78
  tree_node_lists_configuration:
79
- DATABASE_NAME: tree_node_lists
80
- DATABASE_VERSION_STRING: v5
81
- COLLECTION_VERSION_STRING: 1.0.0
82
- COLLECTION_TAXONOMY: tree_taxonomy_node_list
83
- COLLECTION_ENZYME: tree_ec_node_list
84
- COLLECTION_SCOP: tree_scop_node_list
85
- COLLECTION_CATH: tree_cath_node_list
86
-
79
+ DATABASE_NAME: dw
80
+ COLLECTION_VERSION_STRING: 2.1.0
81
+ COLLECTION_NAME_LIST:
82
+ - tree_taxonomy
83
+ - tree_ec
84
+ - tree_scop
85
+ - tree_scop2
86
+ - tree_cath
87
+ - tree_atc
88
+ - tree_card
89
+ - tree_ecod
90
+ COLLECTION_INDICES:
91
+ - INDEX_NAME: primary
92
+ ATTRIBUTE_NAMES:
93
+ - id
94
+ - INDEX_NAME: index_2
95
+ ATTRIBUTE_NAMES:
96
+ - parents
87
97
  """
88
98
  try:
89
99
  useCache = self.__useCache
90
100
  #
91
- # if not useCache:
92
- # cDL = ["domains_struct", "NCBI", "ec", "go", "atc"]
93
- # for cD in cDL:
94
- # try:
95
- # cfp = os.path.join(self.__cachePath, cD)
96
- # os.makedirs(cfp, 0o755)
97
- # except Exception:
98
- # pass
99
- # #
100
- # try:
101
- # cfp = os.path.join(self.__cachePath, cD)
102
- # fpL = glob.glob(os.path.join(cfp, "*"))
103
- # if fpL:
104
- # for fp in fpL:
105
- # os.remove(fp)
106
- # except Exception:
107
- # pass
108
- #
109
- #
110
101
  logger.info("Starting with cache path %r (useCache=%r)", self.__cachePath, useCache)
111
102
  #
112
103
  self.__statusList = []
@@ -124,65 +115,77 @@ class TreeNodeListWorker(object):
124
115
  readBackCheck=self.__readBackCheck,
125
116
  )
126
117
  #
127
- databaseName = "tree_node_lists"
118
+ sectionName = "tree_node_lists_configuration"
119
+ databaseNameMongo = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
120
+ collectionNameList = self.__cfgOb.get("COLLECTION_NAME_LIST", sectionName=sectionName)
121
+ collectionIndexList = self.__cfgOb.get("COLLECTION_INDICES", sectionName=sectionName)
122
+ # databaseNameMongo = 'dw'
123
+ # collectionNameList = ['tree_taxonomy', 'tree_ec', 'tree_scop', 'tree_scop2', 'tree_cath', 'tree_atc', 'tree_card', 'tree_ecod', 'tree_go']
124
+ # collectionIndexList = [{'INDEX_NAME': 'primary', 'ATTRIBUTE_NAMES': ['id']}, {'INDEX_NAME': 'index_2', 'ATTRIBUTE_NAMES': ['parents']}]
125
+
128
126
  # collectionVersion = self.__cfgOb.get("COLLECTION_VERSION_STRING", sectionName=sectionName)
129
127
  # addValues = {"_schema_version": collectionVersion}
130
128
  addValues = None
131
- #
132
- # --- GO - TURNED OFF 08 Aug 2023 dwp (tree is now loaded in DW)
133
- # goP = GeneOntologyProvider(goDirPath=os.path.join(self.__cachePath, "go"), useCache=useCache)
134
- # ok = goP.testCache()
135
- # anEx = AnnotationExtractor(self.__cfgOb)
136
- # goIdL = anEx.getUniqueIdentifiers("GO")
137
- # logger.info("Unique GO assignments %d", len(goIdL))
138
- # nL = goP.exportTreeNodeList(goIdL)
139
- # logger.info("GO tree node list length %d", len(nL))
140
- # if doLoad:
141
- # collectionName = "tree_go_node_list"
142
- # ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
143
- # self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
144
- #
145
- # ---- CATH
129
+
130
+ ok = True
131
+ for collectionName in collectionNameList:
132
+ nL = self.__getTreeDocList(collectionName, useCache)
133
+ if nL and doLoad:
134
+ ok = dl.load(
135
+ databaseNameMongo,
136
+ collectionName,
137
+ loadType=loadType,
138
+ documentList=nL,
139
+ keyNames=None,
140
+ addValues=addValues,
141
+ schemaLevel=None,
142
+ indexDL=collectionIndexList
143
+ ) and ok
144
+ self.__updateStatus(updateId, databaseNameMongo, collectionName, ok, statusStartTimestamp)
145
+ logger.info(
146
+ "Completed load of tree node list for database %r, collection %r, len(nL) %r (status %r)",
147
+ databaseNameMongo, collectionName, len(nL), ok
148
+ )
149
+ # ---
150
+ logger.info("Completed tree node list loading operations with loadType %r (status %r)", loadType, ok)
151
+ return True
152
+ except Exception as e:
153
+ logger.exception("Failing with %s", str(e))
154
+ return False
155
+
156
+ def __checkTaxonNodeList(self, nL):
157
+ eCount = 0
158
+ tD = {dD["id"]: True for dD in nL}
159
+ for dD in nL:
160
+ if "parents" in dD:
161
+ pId = dD["parents"][0]
162
+ if pId not in tD:
163
+ logger.info("Missing parent for taxon %d", pId)
164
+ eCount += 1
165
+ else:
166
+ logger.info("No parents for node %r", dD["id"])
167
+
168
+ def getLoadStatus(self):
169
+ return self.__statusList
170
+
171
+ def __getTreeDocList(self, collectionName, useCache):
172
+ nL = []
173
+ if collectionName.lower() == "tree_cath":
146
174
  ccu = CathClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
147
175
  nL = ccu.getTreeNodeList()
148
- logger.info("Starting load SCOP node tree length %d", len(nL))
149
- if doLoad:
150
- collectionName = "tree_cath_node_list"
151
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
152
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
153
- # ---- SCOP
176
+ elif collectionName.lower() == "tree_scop2":
177
+ scu2 = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
178
+ nL = scu2.getTreeNodeList()
179
+ elif collectionName.lower() == "tree_scop":
154
180
  scu = ScopClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
155
181
  nL = scu.getTreeNodeList()
156
- logger.info("Starting load SCOP node tree length %d", len(nL))
157
- if doLoad:
158
- collectionName = "tree_scop_node_list"
159
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
160
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
161
- # --- SCOP2
162
- scu = Scop2ClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
163
- nL = scu.getTreeNodeList()
164
- logger.info("Starting load SCOP2 node tree length %d", len(nL))
165
- if doLoad:
166
- collectionName = "tree_scop2_node_list"
167
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
168
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
169
- # ---- Ecod
182
+ elif collectionName.lower() == "tree_ecod":
170
183
  ecu = EcodClassificationProvider(cachePath=self.__cachePath, useCache=useCache)
171
184
  nL = ecu.getTreeNodeList()
172
- logger.info("Starting load ECOD node tree length %d", len(nL))
173
- if doLoad:
174
- collectionName = "tree_ecod_node_list"
175
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
176
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
177
- # ---- EC
185
+ elif collectionName.lower() == "tree_ec":
178
186
  edbu = EnzymeDatabaseProvider(cachePath=self.__cachePath, useCache=useCache)
179
187
  nL = edbu.getTreeNodeList()
180
- logger.info("Starting load of EC node tree length %d", len(nL))
181
- if doLoad:
182
- collectionName = "tree_ec_node_list"
183
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
184
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
185
- # ---- CARD
188
+ elif collectionName.lower() == "tree_card":
186
189
  okCou = True
187
190
  cou = CARDTargetOntologyProvider(cachePath=self.__cachePath, useCache=useCache)
188
191
  if not cou.testCache():
@@ -193,21 +196,7 @@ class TreeNodeListWorker(object):
193
196
  okCou = False
194
197
  if okCou:
195
198
  nL = cou.getTreeNodeList()
196
- logger.info("Starting load of CARD ontology node tree length %d", len(nL))
197
- if doLoad:
198
- collectionName = "tree_card_node_list"
199
- ok = dl.load(
200
- databaseName,
201
- collectionName,
202
- loadType=loadType,
203
- documentList=nL,
204
- indexAttributeList=["id"],
205
- keyNames=None,
206
- addValues=addValues,
207
- schemaLevel=None
208
- )
209
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
210
- # ---- Taxonomy
199
+ elif collectionName.lower() == "tree_taxonomy":
211
200
  tU = TaxonomyProvider(cachePath=self.__cachePath, useCache=useCache)
212
201
  if self.__useFilteredLists:
213
202
  # Get the taxon coverage in the current data set -
@@ -226,43 +215,14 @@ class TreeNodeListWorker(object):
226
215
  # Get the full taxon node list without filtering
227
216
  nL = tU.exportNodeList()
228
217
  self.__checkTaxonNodeList(nL)
229
- logger.info("Starting load of taxonomy node tree length %d", len(nL))
230
- if doLoad:
231
- collectionName = "tree_taxonomy_node_list"
232
- logger.debug("Taxonomy nodes (%d) %r", len(nL), nL[:5])
233
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
234
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
235
- logger.info("Tree loading operations completed.")
236
- #
237
- # --- ATC
218
+ elif collectionName.lower() == "tree_atc":
238
219
  crEx = ChemRefExtractor(self.__cfgOb)
239
220
  atcFilterD = crEx.getChemCompAccessionMapping("ATC")
240
221
  logger.info("Length of ATC filter %d", len(atcFilterD))
241
222
  atcP = AtcProvider(cachePath=self.__cachePath, useCache=useCache)
242
223
  nL = atcP.getTreeNodeList(filterD=atcFilterD)
243
- collectionName = "tree_atc_node_list"
244
- logger.debug("ATC node list length %d %r", len(nL), nL[:5])
245
- ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=nL, indexAttributeList=["id"], keyNames=None, addValues=addValues, schemaLevel=None)
246
- self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
247
- #
248
- # ---
249
- logger.info("Completed tree node list loading operations.\n")
250
- return True
251
- except Exception as e:
252
- logger.exception("Failing with %s", str(e))
253
- return False
254
-
255
- def __checkTaxonNodeList(self, nL):
256
- eCount = 0
257
- tD = {dD["id"]: True for dD in nL}
258
- for dD in nL:
259
- if "parents" in dD:
260
- pId = dD["parents"][0]
261
- if pId not in tD:
262
- logger.info("Missing parent for taxon %d", pId)
263
- eCount += 1
264
- else:
265
- logger.info("No parents for node %r", dD["id"])
266
-
267
- def getLoadStatus(self):
268
- return self.__statusList
224
+ else:
225
+ logger.error("Unsupported tree node collection %r", collectionName)
226
+ #
227
+ logger.info("Gathered tree nodes for loading collection %s (length %d)", collectionName, len(nL))
228
+ return nL
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rcsb.exdb
3
- Version: 1.29
3
+ Version: 1.30
4
4
  Summary: RCSB Python ExDB data extraction and loading workflows
5
5
  Home-page: https://github.com/rcsb/py-rcsb_exdb
6
6
  Author: John Westbrook
@@ -19,8 +19,8 @@ License-File: LICENSE
19
19
  Requires-Dist: numpy
20
20
  Requires-Dist: jsonschema>=2.6.0
21
21
  Requires-Dist: rcsb.utils.io>=1.48
22
- Requires-Dist: rcsb.db>=1.800
23
- Requires-Dist: rcsb.utils.chem>=0.81
22
+ Requires-Dist: rcsb.db>=1.808
23
+ Requires-Dist: rcsb.utils.chem>=0.84
24
24
  Requires-Dist: rcsb.utils.chemref>=0.91
25
25
  Requires-Dist: rcsb.utils.config>=0.40
26
26
  Requires-Dist: rcsb.utils.ec>=0.25
@@ -1,8 +1,8 @@
1
1
  numpy
2
2
  jsonschema>=2.6.0
3
3
  rcsb.utils.io>=1.48
4
- rcsb.db>=1.800
5
- rcsb.utils.chem>=0.81
4
+ rcsb.db>=1.808
5
+ rcsb.utils.chem>=0.84
6
6
  rcsb.utils.chemref>=0.91
7
7
  rcsb.utils.config>=0.40
8
8
  rcsb.utils.ec>=0.25
@@ -4,8 +4,8 @@
4
4
  numpy
5
5
  jsonschema >= 2.6.0
6
6
  rcsb.utils.io >= 1.48
7
- rcsb.db >= 1.800
8
- rcsb.utils.chem >= 0.81
7
+ rcsb.db >= 1.808
8
+ rcsb.utils.chem >= 0.84
9
9
  rcsb.utils.chemref >= 0.91
10
10
  rcsb.utils.config >= 0.40
11
11
  rcsb.utils.ec >= 0.25
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes