dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,558 @@
1
+ """Manages registration of all projects and datasets."""
2
+
3
+ import getpass
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import uuid
8
+ from pathlib import Path
9
+
10
+ from sqlalchemy import Connection
11
+
12
+ from dsgrid.common import (
13
+ LOCAL_REGISTRY,
14
+ REMOTE_REGISTRY,
15
+ SYNC_EXCLUDE_LIST,
16
+ )
17
+ from dsgrid.cloud.factory import make_cloud_storage_interface
18
+ from dsgrid.dsgrid_rc import DsgridRuntimeConfig
19
+ from dsgrid.exceptions import DSGInvalidOperation, DSGValueNotRegistered, DSGInvalidParameter
20
+ from dsgrid.utils.run_command import check_run_command
21
+ from dsgrid.filesystem.factory import make_filesystem_interface
22
+ from dsgrid.utils.spark import init_spark, get_active_session
23
+ from .common import (
24
+ DataStoreType,
25
+ RegistryManagerParams,
26
+ )
27
+ from dsgrid.registry.registry_database import RegistryDatabase
28
+ from dsgrid.registry.registry_interface import (
29
+ DatasetRegistryInterface,
30
+ DimensionMappingRegistryInterface,
31
+ DimensionRegistryInterface,
32
+ ProjectRegistryInterface,
33
+ )
34
+ from .dimension_mapping_registry_manager import DimensionMappingRegistryManager
35
+ from .dataset_registry_manager import DatasetRegistryManager
36
+ from .dimension_registry_manager import DimensionRegistryManager
37
+ from .project_registry_manager import ProjectRegistryManager
38
+ from .registry_database import DatabaseConnection
39
+
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
+ class RegistryManager:
45
+ """Manages registration of all projects and datasets."""
46
+
47
def __init__(self, params: RegistryManagerParams, db: RegistryDatabase):
    """Wire up the dimension, dimension-mapping, dataset, and project managers.

    Parameters
    ----------
    params : RegistryManagerParams
        Shared registry settings. ``params.base_path`` is handed to every manager.
    db : RegistryDatabase
        Registry database. Its ``data_store`` is passed to the dataset manager.

    """
    self._db = db
    self._data_store = db.data_store
    self._check_environment_variables(params)
    # Only start a Spark session when the caller has not already created one.
    if get_active_session() is None:
        init_spark("dsgrid")
    self._params = params

    # Managers are loaded in dependency order: each one receives the managers built before it.
    base_path = params.base_path

    dimension_db = DimensionRegistryInterface(db)
    self._dimension_mgr = DimensionRegistryManager.load(base_path, params, dimension_db)

    mapping_db = DimensionMappingRegistryInterface(db)
    self._dimension_mapping_mgr = DimensionMappingRegistryManager.load(
        base_path, params, self._dimension_mgr, mapping_db
    )

    dataset_db = DatasetRegistryInterface(db)
    self._dataset_mgr = DatasetRegistryManager.load(
        base_path,
        params,
        self._dimension_mgr,
        self._dimension_mapping_mgr,
        dataset_db,
        self._data_store,
    )

    project_db = ProjectRegistryInterface(db)
    self._project_mgr = ProjectRegistryManager.load(
        base_path,
        params,
        self._dataset_mgr,
        self._dimension_mgr,
        self._dimension_mapping_mgr,
        project_db,
    )
81
+
82
@classmethod
def create(
    cls,
    conn: DatabaseConnection,
    data_path: Path,
    data_store_type: DataStoreType = DataStoreType.FILESYSTEM,
    remote_path=REMOTE_REGISTRY,
    user=None,
    scratch_dir=None,
    overwrite=False,
):
    """Create a new registry database and return a RegistryManager for it.

    The registry is always created in offline mode.

    Parameters
    ----------
    conn : DatabaseConnection
        Connection describing the registry database to create.
    data_path : Path
        Base path for the registry data. Paths starting with "s3" are rejected.
    data_store_type : DataStoreType
        Type of data store, forwarded to ``RegistryDatabase.create``.
    remote_path : str
        Path to remote registry.
    user : None | str
        Username. Defaults to ``getpass.getuser()``.
    scratch_dir : None | Path
        Base directory for dsgrid temporary directories. Must be accessible on all compute
        nodes. Defaults to ``DsgridRuntimeConfig.load().get_scratch_dir()``.
    overwrite : bool
        Overwrite the database if it exists.

    Returns
    -------
    RegistryManager

    Raises
    ------
    DSGInvalidOperation
        Raised if the database already exists and overwrite is False.
        Raised if the database file is located inside data_path.
    Exception
        Raised if data_path starts with "s3".

    """
    if RegistryDatabase.has_database(conn) and not overwrite:
        msg = f"database={conn.url} already exists. Choose a different name or set overwrite=True."
        raise DSGInvalidOperation(msg)

    db_filename = conn.try_get_filename()
    if db_filename is not None and db_filename.is_relative_to(data_path):
        msg = (
            f"The database path {db_filename} cannot be relative to the data_path {data_path}."
        )
        raise DSGInvalidOperation(msg)

    if not user:
        user = getpass.getuser()
    uid = str(uuid.uuid4())

    if str(data_path).startswith("s3"):
        msg = f"s3 is not currently supported: {data_path}"
        raise Exception(msg)

    fs_interface = make_filesystem_interface(data_path)
    cloud_interface = make_cloud_storage_interface(
        data_path, "", offline=True, uuid=uid, user=user
    )
    scratch_dir = scratch_dir or DsgridRuntimeConfig.load().get_scratch_dir()
    params = RegistryManagerParams(
        base_path=Path(data_path),
        remote_path=remote_path,
        use_remote_data=False,
        fs_interface=fs_interface,
        cloud_interface=cloud_interface,
        offline=True,
        scratch_dir=scratch_dir,
    )
    # Remove any existing database first; reaching this point means either none exists
    # or overwrite=True was requested.
    RegistryDatabase.delete(conn)
    db = RegistryDatabase.create(
        conn, data_path, data_store_type=data_store_type, overwrite=overwrite
    )
    # Log success only after the database really exists, so a failed creation is never
    # reported as "Created".
    logger.info("Created registry with database=%s data_path=%s", conn.url, data_path)
    return cls(params, db)
154
+
155
def dispose(self) -> None:
    """Release the registry database by delegating to its own ``dispose`` method."""
    registry_db = self._db
    registry_db.dispose()
158
+
159
def __enter__(self) -> "RegistryManager":
    """Return this manager so that it can be bound by a ``with`` statement."""
    return self
162
+
163
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    """Call ``dispose`` when the ``with`` block ends; exception details are not inspected."""
    self.dispose()
166
+
167
@property
def dataset_manager(self) -> DatasetRegistryManager:
    """DatasetRegistryManager: the manager for dataset configs, built in ``__init__``."""
    return self._dataset_mgr
171
+
172
@property
def dimension_mapping_manager(self) -> DimensionMappingRegistryManager:
    """DimensionMappingRegistryManager: the manager for dimension mapping configs."""
    return self._dimension_mapping_mgr
176
+
177
@property
def dimension_manager(self) -> DimensionRegistryManager:
    """DimensionRegistryManager: the manager for dimension configs."""
    return self._dimension_mgr
181
+
182
@property
def project_manager(self) -> ProjectRegistryManager:
    """ProjectRegistryManager: the manager for project configs."""
    return self._project_mgr
186
+
187
@classmethod
def load(
    cls,
    conn: DatabaseConnection,
    remote_path=REMOTE_REGISTRY,
    use_remote_data=None,
    offline_mode=True,
    user=None,
    no_prompts=False,
    scratch_dir=None,
):
    """Load a registry from an existing registry database.

    Parameters
    ----------
    conn : DatabaseConnection
    remote_path: str, optional
        path of the remote registry; default is REMOTE_REGISTRY
    use_remote_data: bool, None
        If set, use load data tables from remote_path. If None, the value is determined by
        ``_should_use_remote_data``.
    offline_mode : bool
        Load registry in offline mode; default is True
    user : str
        username; defaults to ``getpass.getuser()``
    no_prompts : bool
        If no_prompts is False, the user will be prompted to continue sync pulling the
        registry if lock files exist. If no_prompts is True and lock files exist, the sync
        is skipped.
    scratch_dir : None | Path
        Base directory for dsgrid temporary directories. Must be accessible on all compute
        nodes. Defaults to ``DsgridRuntimeConfig.load().get_scratch_dir()``.

    Returns
    -------
    RegistryManager

    Examples
    --------
    >>> from dsgrid.registry.registry_manager import RegistryManager
    >>> from dsgrid.registry.registry_database import DatabaseConnection
    >>> manager = RegistryManager.load(
    ...     DatabaseConnection(
    ...         hostname="dsgrid-registry.hpc.nrel.gov",
    ...         database="standard-scenarios",
    ...     )
    ... )
    """
    db = RegistryDatabase.connect(conn)
    data_path = db.get_data_path()
    if not user:
        user = getpass.getuser()
    uid = str(uuid.uuid4())
    fs_interface = make_filesystem_interface(data_path)

    if use_remote_data is None:
        use_remote_data = _should_use_remote_data(remote_path)

    cloud_interface = make_cloud_storage_interface(
        data_path, remote_path, offline=offline_mode, uuid=uid, user=user
    )

    if not offline_mode:
        sync = False
        lock_files = list(cloud_interface.get_lock_files())
        if lock_files:
            msg = f"There are {len(lock_files)} lock files in the registry:"
            for lock_file in lock_files:
                msg = msg + "\n\t" + f"- {lock_file}"
            logger.info(msg)
            if not no_prompts:
                msg = (
                    msg
                    + "\n... Do you want to continue syncing the registry contents? [Y] >>> "
                )
                val = input(msg)
                if val == "" or val.lower() == "y":
                    sync = True
                else:
                    logger.info("Skipping remote registry sync.")
            else:
                # Without prompts there is nobody to confirm syncing past the lock files,
                # so the sync is skipped. Say so instead of skipping silently.
                logger.info(
                    "Skipping remote registry sync because lock files exist and "
                    "no_prompts is set."
                )
        else:
            sync = True

        if sync:
            logger.info("Sync configs from remote registry.")
            # NOTE: When creating a registry, only the /configs are pulled. To sync_pull /data, use the dsgrid registry data-sync CLI command.
            cloud_interface.sync_pull(
                remote_path + "/configs",
                str(data_path) + "/configs",
                exclude=SYNC_EXCLUDE_LIST,
                delete_local=True,
            )

    scratch_dir = scratch_dir or DsgridRuntimeConfig.load().get_scratch_dir()
    params = RegistryManagerParams(
        base_path=data_path,
        remote_path=remote_path,
        use_remote_data=use_remote_data,
        fs_interface=fs_interface,
        cloud_interface=cloud_interface,
        offline=offline_mode,
        scratch_dir=scratch_dir,
    )

    logger.info(
        "Loaded local registry at %s offline_mode=%s",
        conn.url,
        offline_mode,
    )
    return cls(params, db)
296
+
297
def data_sync(self, project_id, dataset_id, no_prompts=True):
    """Pull dataset data from the remote dsgrid registry.

    The datasets to sync are selected by project, by dataset, or by both.

    Parameters
    ----------
    project_id : str
        If set, sync datasets registered to this project.
    dataset_id : str
        If set, sync only this dataset. Combined with project_id, the dataset must be
        registered to that project.
    no_prompts : bool
        If no_prompts is False, the user will be prompted to continue sync pulling the
        registry if lock files exist. By default, True.

    Raises
    ------
    ValueError
        Raised if neither project_id nor dataset_id is provided.
    DSGValueNotRegistered
        Raised if the requested dataset is not registered.

    """
    if not (project_id or dataset_id):
        msg = "Must provide a dataset_id or project_id for dsgrid data-sync."
        raise ValueError(msg)

    targets = []
    if project_id:
        # Versions come from the project config, i.e. the version the project registered.
        project_config = self.project_manager.get_by_id(project_id)
        registered_ids = project_config.list_registered_dataset_ids()
        if dataset_id:
            if dataset_id not in registered_ids:
                msg = f"No registered dataset ID = '{dataset_id}' registered to project ID = '{project_id}'"
                raise DSGValueNotRegistered(msg)
            selected_ids = [dataset_id]
        else:
            selected_ids = registered_ids
        targets.extend(
            (selected, str(project_config.get_dataset(selected).version))
            for selected in selected_ids
        )
    elif dataset_id:
        # No project given: fall back to the latest version known to the dataset manager.
        if not self.dataset_manager.has_id(dataset_id):
            msg = f"No registered dataset ID = '{dataset_id}'"
            raise DSGValueNotRegistered(msg)
        latest = self.dataset_manager.get_latest_version(dataset_id)
        targets.append((dataset_id, latest))

    for target_id, target_version in targets:
        self._data_sync(target_id, target_version, no_prompts)
334
+
335
def _data_sync(self, dataset_id, version, no_prompts=True):
    """Pull one dataset version and its registry.json5 from the remote registry.

    Parameters
    ----------
    dataset_id : str
    version : str
    no_prompts : bool
        If False and a lock file exists for the dataset, ask the user whether to continue.

    Raises
    ------
    ValueError
        Raised if the registry was loaded in offline mode.

    """
    params = self._params
    cloud_interface = params.cloud_interface

    if params.offline:
        msg = "dsgrid data-sync only works in online mode."
        raise ValueError(msg)

    lock_dir = f"{cloud_interface._s3_filesystem._bucket}/configs/datasets/{dataset_id}"
    lock_files = list(cloud_interface.get_lock_files(relative_path=lock_dir))

    sync = True
    if lock_files:
        assert len(lock_files) == 1
        listing = "".join(f"\n\t- {lock_file}" for lock_file in lock_files)
        msg = f"There are {len(lock_files)} lock files in the registry:" + listing
        logger.info(msg)
        if not no_prompts:
            answer = input(
                msg + "\n... Do you want to continue syncing the registry contents? [Y] >>> "
            )
            # An empty answer counts as yes.
            sync = answer == "" or answer.lower() == "y"
            if not sync:
                logger.info("Skipping remote registry sync.")

    if not sync:
        logger.info(
            "Skipping remote registry data sync for %s, version=%s.", dataset_id, version
        )
        return

    logger.info("Sync data from remote registry for %s, version=%s.", dataset_id, version)
    version_subpath = f"/data/{dataset_id}/{version}"
    cloud_interface.sync_pull(
        remote_path=params.remote_path + version_subpath,
        local_path=str(params.base_path) + version_subpath,
        delete_local=True,
    )
    registry_subpath = f"/data/{dataset_id}/registry.json5"
    cloud_interface.sync_pull(
        remote_path=params.remote_path + registry_subpath,
        local_path=str(params.base_path) + registry_subpath,
        delete_local=True,
        is_file=True,
    )
381
+
382
@property
def path(self):
    """Return the registry base path stored in the manager parameters."""
    params = self._params
    return params.base_path
385
+
386
def show(self, conn: Connection | None = None, filters=None, max_width=None, drop_fields=None):
    """Show the config tables of every manager.

    The same arguments are forwarded to the ``show`` method of the project, dataset,
    dimension, and dimension mapping managers, in that order.
    """
    show_kwargs = {
        "conn": conn,
        "filters": filters,
        "max_width": max_width,
        "drop_fields": drop_fields,
    }
    manager_attrs = (
        "project_manager",
        "dataset_manager",
        "dimension_manager",
        "dimension_mapping_manager",
    )
    for manager_attr in manager_attrs:
        getattr(self, manager_attr).show(**show_kwargs)
400
+
401
@staticmethod
def copy(
    src: DatabaseConnection, dst: DatabaseConnection, dst_data_path, mode="copy", force=False
):
    """Copy a registry to a new path.

    Parameters
    ----------
    src : DatabaseConnection
    dst : DatabaseConnection
    dst_data_path : Path
    mode : str
        Controls whether to copy all data, make symlinks to data files, or sync data with the
        rsync utility (not available on Windows). Options: 'copy', 'data-symlinks', 'rsync'
    force : bool
        Overwrite dst_data_path if it already exists. Does not apply if using rsync.

    Raises
    ------
    DSGInvalidParameter
        Raised if mode is not one of the supported options.
        Raised if src is not a valid registry.
        Raised if dst_data_path exists, mode is not 'rsync', and force is False.

    """
    # Validate the mode before touching anything so that an unsupported value cannot
    # leave a half-finished copy (database copied, data not copied) behind.
    if mode not in ("copy", "data-symlinks", "rsync"):
        msg = f"mode={mode} is not supported"
        raise DSGInvalidParameter(msg)

    src_db = RegistryDatabase.connect(src)
    try:
        src_data_path = src_db.get_data_path()
    finally:
        # Release the source engine even if reading the data path fails.
        src_db.engine.dispose()
    # TODO: This does not support the duckdb data store. Need to implement this copy operation
    # in the DataStoreInterface.
    if "data" not in {x.name for x in src_data_path.iterdir()}:
        msg = f"{src_data_path} is not a valid registry"
        raise DSGInvalidParameter(msg)

    if mode != "rsync" and dst_data_path.exists():
        if not force:
            msg = f"{dst_data_path} already exists."
            raise DSGInvalidParameter(msg)
        shutil.rmtree(dst_data_path)

    RegistryDatabase.copy(src, dst, dst_data_path)
    if mode == "rsync":
        cmd = f"rsync -a {src_data_path}/ {dst_data_path}"
        logger.info("rsync data with [%s]", cmd)
        check_run_command(cmd)
        return

    logger.info("Copy data from source registry %s", src_data_path)
    if mode == "data-symlinks":
        _make_data_symlinks(src_data_path, dst_data_path)
    else:
        for path in (src_data_path / "data").iterdir():
            dst_path = dst_data_path / "data" / path.name
            shutil.copytree(path, dst_path, symlinks=True)
459
+
460
@staticmethod
def _check_environment_variables(params):
    """Reject internal skip-check environment variables when running in online mode.

    Nothing is checked when ``params.offline`` is true. Otherwise an Exception is raised
    if any environment variable name starts with ``__DSGRID_SKIP_CHECK``.
    """
    if params.offline:
        return

    skip_check_vars = [name for name in os.environ if name.startswith("__DSGRID_SKIP_CHECK")]
    if not skip_check_vars:
        return

    msg = (
        "Internal environment variables to skip checks are not allowed to be set "
        f"in online mode: {skip_check_vars}"
    )
    raise Exception(msg)
470
+
471
+
472
def _make_data_symlinks(src, dst):
    """Mirror ``src/data`` under ``dst/data``, symlinking the contents of nested directories.

    Layout handled:
        registry/data/dataset_id/registry.json5
        registry/data/dataset_id/version/*.parquet

    Files directly inside a dataset directory are copied. Entries inside each
    subdirectory of a dataset directory are symlinked to their absolute source paths.
    Non-directory entries directly under ``src/data`` are ignored.
    """
    dst_data = dst / "data"
    for dataset_dir in (src / "data").iterdir():
        if not dataset_dir.is_dir():
            continue

        dst_dataset_dir = dst_data / dataset_dir.name
        dst_dataset_dir.mkdir(parents=True)
        for entry in dataset_dir.iterdir():
            dst_entry = dst_dataset_dir / entry.name
            if entry.is_dir():
                dst_entry.mkdir()
                for data_file in entry.iterdir():
                    os.symlink(
                        data_file.absolute(),
                        dst_entry / data_file.name,
                        target_is_directory=data_file.is_dir(),
                    )
            elif entry.is_file():
                shutil.copyfile(entry, dst_entry)
489
+
490
+
491
def get_registry_path(registry_path=None):
    """
    Resolve the registry path and verify that it exists.

    When registry_path is None, the DSGRID_REGISTRY_PATH environment variable is used;
    if that is not set either, dsgrid.common.LOCAL_REGISTRY is used. A ValueError is
    raised if the resolved path does not exist.
    """
    if registry_path is None:
        registry_path = os.environ.get("DSGRID_REGISTRY_PATH")
    if registry_path is None:
        # TEMPORARY: Replace with S3_REGISTRY when that is supported
        registry_path = LOCAL_REGISTRY

    if os.path.exists(registry_path):
        return registry_path

    msg = (
        f"Registry path {registry_path} does not exist. To create the registry, "
        "run the following command:\n"
        "  dsgrid registry create $DSGRID_REGISTRY_PATH\n"
        "Then register dimensions, dimension mappings, projects, and datasets."
    )
    raise ValueError(msg)
512
+
513
+
514
def _should_use_remote_data(remote_path):
    """Report whether load data should be read from the remote registry.

    Always returns False; ``remote_path`` is currently ignored.
    """
    # We can reconsider this code if we ever have remote registries again.
    use_remote_data = False
    return use_remote_data
517
+ # if not str(remote_path).lower().startswith("s3"):
518
+ # # We are on a local filesystem. Use the remote path.
519
+ # return True
520
+
521
+ # use_remote_data = False
522
+ # if "DSGRID_USE_LOCAL_DATA" in os.environ:
523
+ # pass
524
+ # elif sys.platform in ("darwin", "win32"):
525
+ # # Local systems need to sync all load data files.
526
+ # pass
527
+ # elif on_hpc():
528
+ # pass
529
+ # elif "GITHUB_ACTION" in os.environ:
530
+ # logger.info("Do not use remote data on GitHub CI")
531
+ # else:
532
+ # # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/identify_ec2_instances.html
533
+ # try:
534
+ # response = requests.get(
535
+ # "http://169.254.169.254/latest/dynamic/instance-identity/document", timeout=2
536
+ # )
537
+ # ret = 0
538
+ # except requests.ConnectTimeout:
539
+ # logger.warning(
540
+ # "Connection timed out while trying to read AWS identity. "
541
+ # "If you are not running on AWS and would prefer to not experience this delay, set "
542
+ # "the environment variable DSGRID_USE_LOCAL_DATA."
543
+ # )
544
+ # ret = 1
545
+ # except Exception:
546
+ # logger.exception("Failed to read identity document")
547
+ # ret = 1
548
+
549
+ # if ret == 0 and response.status_code == 200:
550
+ # identity_data = response.json()
551
+ # logger.info("Identity data: %s", identity_data)
552
+ # if "instanceId" in identity_data:
553
+ # logger.info("Use remote data on AWS")
554
+ # use_remote_data = True
555
+ # else:
556
+ # logger.warning("Unknown payload from identity request.")
557
+
558
+ # return use_remote_data