dsgrid-toolkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dsgrid-toolkit might be problematic. Click here for more details.

Files changed (152)
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,544 @@
1
+ """Manages registration of all projects and datasets."""
2
+
3
+ import getpass
4
+ import logging
5
+ import os
6
+ import requests
7
+ import shutil
8
+ import sys
9
+ import uuid
10
+ from pathlib import Path
11
+
12
+ from sqlalchemy import Connection
13
+
14
+ from dsgrid.common import (
15
+ LOCAL_REGISTRY,
16
+ REMOTE_REGISTRY,
17
+ SYNC_EXCLUDE_LIST,
18
+ on_hpc,
19
+ )
20
+ from dsgrid.cloud.factory import make_cloud_storage_interface
21
+ from dsgrid.dsgrid_rc import DsgridRuntimeConfig
22
+ from dsgrid.exceptions import DSGInvalidOperation, DSGValueNotRegistered, DSGInvalidParameter
23
+ from dsgrid.utils.run_command import check_run_command
24
+ from dsgrid.filesystem.factory import make_filesystem_interface
25
+ from dsgrid.utils.spark import init_spark, get_active_session
26
+ from .common import (
27
+ DataStoreType,
28
+ RegistryManagerParams,
29
+ )
30
+ from dsgrid.registry.registry_database import RegistryDatabase
31
+ from dsgrid.registry.registry_interface import (
32
+ DatasetRegistryInterface,
33
+ DimensionMappingRegistryInterface,
34
+ DimensionRegistryInterface,
35
+ ProjectRegistryInterface,
36
+ )
37
+ from .dimension_mapping_registry_manager import DimensionMappingRegistryManager
38
+ from .dataset_registry_manager import DatasetRegistryManager
39
+ from .dimension_registry_manager import DimensionRegistryManager
40
+ from .project_registry_manager import ProjectRegistryManager
41
+ from .registry_database import DatabaseConnection
42
+
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
class RegistryManager:
    """Manages registration of all projects and datasets.

    Holds one manager each for dimensions, dimension mappings, datasets, and projects.
    Construct instances with :meth:`create` (new registry) or :meth:`load` (existing
    registry) rather than calling the constructor directly.
    """

    def __init__(self, params: RegistryManagerParams, db: RegistryDatabase):
        """Build the sub-managers on top of an open registry database.

        Parameters
        ----------
        params : RegistryManagerParams
        db : RegistryDatabase

        """
        self._data_store = db.data_store
        self._check_environment_variables(params)
        # Only start a Spark session if the caller has not already created one.
        if get_active_session() is None:
            init_spark("dsgrid")
        self._params = params
        # The managers are layered: each later manager receives the earlier ones.
        self._dimension_mgr = DimensionRegistryManager.load(
            params.base_path,
            params,
            DimensionRegistryInterface(db),
        )
        self._dimension_mapping_mgr = DimensionMappingRegistryManager.load(
            params.base_path,
            params,
            self._dimension_mgr,
            DimensionMappingRegistryInterface(db),
        )
        self._dataset_mgr = DatasetRegistryManager.load(
            params.base_path,
            params,
            self._dimension_mgr,
            self._dimension_mapping_mgr,
            DatasetRegistryInterface(db),
            self._data_store,
        )
        self._project_mgr = ProjectRegistryManager.load(
            params.base_path,
            params,
            self._dataset_mgr,
            self._dimension_mgr,
            self._dimension_mapping_mgr,
            ProjectRegistryInterface(db),
        )

    @classmethod
    def create(
        cls,
        conn: DatabaseConnection,
        data_path: Path,
        data_store_type: DataStoreType = DataStoreType.FILESYSTEM,
        remote_path=REMOTE_REGISTRY,
        user=None,
        scratch_dir=None,
        overwrite=False,
    ):
        """Creates a new RegistryManager at the given path.

        The new registry is always created in offline mode.

        Parameters
        ----------
        conn : DatabaseConnection
            Connection describing the registry database to create.
        data_path : Path
            Directory for the registry data. The database file, if any, must not be
            located inside this directory.
        data_store_type : DataStoreType
        remote_path : str
            Path to remote registry.
        user : None | str
            Username. Defaults to the current login name.
        scratch_dir : None | Path
            Base directory for dsgrid temporary directories. Must be accessible on all compute
            nodes. Defaults to the scratch directory of the dsgrid runtime config.
        overwrite: bool
            Overwrite the database if it exists.

        Returns
        -------
        RegistryManager

        Raises
        ------
        DSGInvalidOperation
            Raised if the database already exists and overwrite is False.
            Raised if the database file is located under data_path.
        Exception
            Raised if data_path is an s3 path.

        """
        if RegistryDatabase.has_database(conn) and not overwrite:
            msg = f"database={conn.url} already exists. Choose a different name or set overwrite=True."
            raise DSGInvalidOperation(msg)

        db_filename = conn.try_get_filename()
        if db_filename is not None and db_filename.is_relative_to(data_path):
            msg = (
                f"The database path {db_filename} cannot be relative to the data_path {data_path}."
            )
            raise DSGInvalidOperation(msg)

        if not user:
            user = getpass.getuser()
        uid = str(uuid.uuid4())

        if str(data_path).startswith("s3"):
            msg = f"s3 is not currently supported: {data_path}"
            raise Exception(msg)

        fs_interface = make_filesystem_interface(data_path)
        cloud_interface = make_cloud_storage_interface(
            data_path, "", offline=True, uuid=uid, user=user
        )
        scratch_dir = scratch_dir or DsgridRuntimeConfig.load().get_scratch_dir()
        params = RegistryManagerParams(
            base_path=Path(data_path),
            remote_path=remote_path,
            use_remote_data=False,
            fs_interface=fs_interface,
            cloud_interface=cloud_interface,
            offline=True,
            scratch_dir=scratch_dir,
        )
        RegistryDatabase.delete(conn)
        db = RegistryDatabase.create(
            conn, data_path, data_store_type=data_store_type, overwrite=overwrite
        )
        # Log only after the database exists so the message is never emitted for a
        # creation that failed.
        logger.info("Created registry with database=%s data_path=%s", conn.url, data_path)
        return cls(params, db)

    @property
    def dataset_manager(self) -> DatasetRegistryManager:
        """Return the dataset manager."""
        return self._dataset_mgr

    @property
    def dimension_mapping_manager(self) -> DimensionMappingRegistryManager:
        """Return the dimension mapping manager."""
        return self._dimension_mapping_mgr

    @property
    def dimension_manager(self) -> DimensionRegistryManager:
        """Return the dimension manager."""
        return self._dimension_mgr

    @property
    def project_manager(self) -> ProjectRegistryManager:
        """Return the project manager."""
        return self._project_mgr

    @classmethod
    def load(
        cls,
        conn: DatabaseConnection,
        remote_path=REMOTE_REGISTRY,
        use_remote_data=None,
        offline_mode=True,
        user=None,
        no_prompts=False,
        scratch_dir=None,
    ):
        """Loads a registry from the given database connection.

        Parameters
        ----------
        conn : DatabaseConnection
        remote_path: str, optional
            path of the remote registry; default is REMOTE_REGISTRY
        use_remote_data: bool, None
            If set, use load data tables from remote_path. If not set, auto-determine what to do
            based on the runtime environment.
        offline_mode : bool
            Load registry in offline mode; default is True
        user : str
            Username. Defaults to the current login name.
        no_prompts : bool
            If no_prompts is False, the user will be prompted to continue sync pulling the
            registry if lock files exist. If True, the sync proceeds without prompting.
        scratch_dir : None | Path
            Base directory for dsgrid temporary directories. Must be accessible on all compute
            nodes. Defaults to the scratch directory of the dsgrid runtime config.

        Returns
        -------
        RegistryManager

        Examples
        --------
        >>> from dsgrid.registry.registry_manager import RegistryManager
        >>> from dsgrid.registry.registry_database import DatabaseConnection
        >>> manager = RegistryManager.load(
            DatabaseConnection(
                hostname="dsgrid-registry.hpc.nrel.gov",
                database="standard-scenarios",
            )
        )
        """
        db = RegistryDatabase.connect(conn)
        data_path = db.get_data_path()
        if not user:
            user = getpass.getuser()
        uid = str(uuid.uuid4())
        fs_interface = make_filesystem_interface(data_path)

        if use_remote_data is None:
            use_remote_data = _should_use_remote_data(remote_path)

        cloud_interface = make_cloud_storage_interface(
            data_path, remote_path, offline=offline_mode, uuid=uid, user=user
        )

        if not offline_mode:
            # Sync by default; the only way to skip is for the user to decline at the
            # prompt. Initializing here keeps `sync` bound on every path, matching
            # the behavior of _data_sync.
            sync = True
            lock_files = list(cloud_interface.get_lock_files())
            if lock_files:
                msg = f"There are {len(lock_files)} lock files in the registry:"
                for lock_file in lock_files:
                    msg = msg + "\n\t" + f"- {lock_file}"
                logger.info(msg)
                if not no_prompts:
                    msg = (
                        msg
                        + "\n... Do you want to continue syncing the registry contents? [Y] >>> "
                    )
                    val = input(msg)
                    # An empty answer accepts the default of "Y".
                    if val != "" and val.lower() != "y":
                        logger.info("Skipping remote registry sync.")
                        sync = False

            if sync:
                logger.info("Sync configs from remote registry.")
                # NOTE: When creating a registry, only the /configs are pulled. To sync_pull /data, use the dsgrid registry data-sync CLI command.
                cloud_interface.sync_pull(
                    remote_path + "/configs",
                    str(data_path) + "/configs",
                    exclude=SYNC_EXCLUDE_LIST,
                    delete_local=True,
                )

        scratch_dir = scratch_dir or DsgridRuntimeConfig.load().get_scratch_dir()
        params = RegistryManagerParams(
            base_path=data_path,
            remote_path=remote_path,
            use_remote_data=use_remote_data,
            fs_interface=fs_interface,
            cloud_interface=cloud_interface,
            offline=offline_mode,
            scratch_dir=scratch_dir,
        )

        logger.info(
            "Loaded local registry at %s offline_mode=%s",
            conn.url,
            offline_mode,
        )
        return cls(params, db)

    def data_sync(self, project_id, dataset_id, no_prompts=True):
        """Sync data from the remote dsgrid registry.

        Parameters
        ----------
        project_id : str
            Sync by project_id filter
        dataset_id : str
            Sync by dataset_id filter
        no_prompts : bool
            If no_prompts is False, the user will be prompted to continue sync pulling the registry if lock files exist. By default, True.

        Raises
        ------
        ValueError
            Raised if neither project_id nor dataset_id is provided.
        DSGValueNotRegistered
            Raised if dataset_id is not registered (to the project, when project_id is given).

        """
        if not project_id and not dataset_id:
            msg = "Must provide a dataset_id or project_id for dsgrid data-sync."
            raise ValueError(msg)

        if project_id:
            # Use the dataset versions that the project config refers to.
            config = self.project_manager.get_by_id(project_id)
            if dataset_id:
                if dataset_id not in config.list_registered_dataset_ids():
                    msg = f"No registered dataset ID = '{dataset_id}' registered to project ID = '{project_id}'"
                    raise DSGValueNotRegistered(msg)
                datasets = [(dataset_id, str(config.get_dataset(dataset_id).version))]
            else:
                datasets = [
                    (dataset, str(config.get_dataset(dataset).version))
                    for dataset in config.list_registered_dataset_ids()
                ]
        else:
            # Only a dataset_id was given: sync its latest registered version.
            if not self.dataset_manager.has_id(dataset_id):
                msg = f"No registered dataset ID = '{dataset_id}'"
                raise DSGValueNotRegistered(msg)
            version = self.dataset_manager.get_latest_version(dataset_id)
            datasets = [(dataset_id, version)]

        for dataset, version in datasets:
            self._data_sync(dataset, version, no_prompts)

    def _data_sync(self, dataset_id, version, no_prompts=True):
        """Pull the data directory and registry.json5 of one dataset version.

        Raises ValueError if the registry was loaded in offline mode.
        """
        cloud_interface = self._params.cloud_interface

        if self._params.offline:
            msg = "dsgrid data-sync only works in online mode."
            raise ValueError(msg)
        sync = True

        lock_files = list(
            cloud_interface.get_lock_files(
                relative_path=f"{cloud_interface._s3_filesystem._bucket}/configs/datasets/{dataset_id}"
            )
        )
        if lock_files:
            # NOTE(review): at most one lock file per dataset appears to be the
            # expected invariant; confirm against the cloud storage interface.
            assert len(lock_files) == 1
            msg = f"There are {len(lock_files)} lock files in the registry:"
            for lock_file in lock_files:
                msg = msg + "\n\t" + f"- {lock_file}"
            logger.info(msg)
            if not no_prompts:
                msg = msg + "\n... Do you want to continue syncing the registry contents? [Y] >>> "
                val = input(msg)
                if val == "" or val.lower() == "y":
                    sync = True
                else:
                    logger.info("Skipping remote registry sync.")
                    sync = False

        if sync:
            logger.info("Sync data from remote registry for %s, version=%s.", dataset_id, version)
            cloud_interface.sync_pull(
                remote_path=self._params.remote_path + f"/data/{dataset_id}/{version}",
                local_path=str(self._params.base_path) + f"/data/{dataset_id}/{version}",
                delete_local=True,
            )
            cloud_interface.sync_pull(
                remote_path=self._params.remote_path + f"/data/{dataset_id}/registry.json5",
                local_path=str(self._params.base_path) + f"/data/{dataset_id}/registry.json5",
                delete_local=True,
                is_file=True,
            )
        else:
            logger.info(
                "Skipping remote registry data sync for %s, version=%s.", dataset_id, version
            )

    @property
    def path(self):
        """Return the base path of the registry data."""
        return self._params.base_path

    def show(self, conn: Connection | None = None, filters=None, max_width=None, drop_fields=None):
        """Show tables of all registry configs."""
        self.project_manager.show(
            conn=conn, filters=filters, max_width=max_width, drop_fields=drop_fields
        )
        self.dataset_manager.show(
            conn=conn, filters=filters, max_width=max_width, drop_fields=drop_fields
        )
        self.dimension_manager.show(
            conn=conn, filters=filters, max_width=max_width, drop_fields=drop_fields
        )
        self.dimension_mapping_manager.show(
            conn=conn, filters=filters, max_width=max_width, drop_fields=drop_fields
        )

    @staticmethod
    def copy(
        src: DatabaseConnection, dst: DatabaseConnection, dst_data_path, mode="copy", force=False
    ):
        """Copy a registry to a new path.

        Parameters
        ----------
        src : DatabaseConnection
        dst : DatabaseConnection
        dst_data_path : Path
        mode : str
            Controls whether to copy all data, make symlinks to data files, or sync data with the
            rsync utility (not available on Windows). Options: 'copy', 'data-symlinks', 'rsync'
        force : bool
            Overwrite dst_data_path if it already exists. Does not apply if using rsync.

        Raises
        ------
        DSGInvalidParameter
            Raised if src is not a valid registry.
            Raised if mode is not one of the supported options.
            Raised if dst_data_path exists, mode is not 'rsync', and force is False.

        """
        src_db = RegistryDatabase.connect(src)
        src_data_path = src_db.get_data_path()
        # TODO: This does not support the duckdb data store. Need to implement this copy operation
        # in the DataStoreInterface.
        if not {x.name for x in src_data_path.iterdir()}.issuperset({"data"}):
            msg = f"{src_data_path} is not a valid registry"
            raise DSGInvalidParameter(msg)

        # Reject an unsupported mode before anything is written, so that a bad
        # argument cannot leave a copied database without its data.
        if mode not in ("copy", "data-symlinks", "rsync"):
            msg = f"mode={mode} is not supported"
            raise DSGInvalidParameter(msg)

        if mode in ("copy", "data-symlinks"):
            if dst_data_path.exists():
                if force:
                    shutil.rmtree(dst_data_path)
                else:
                    msg = f"{dst_data_path} already exists."
                    raise DSGInvalidParameter(msg)
        RegistryDatabase.copy(src, dst, dst_data_path)
        if mode == "rsync":
            # NOTE(review): the paths are interpolated unquoted into the command
            # string; paths containing whitespace may not survive. Confirm how
            # check_run_command tokenizes its argument before adding quoting.
            cmd = f"rsync -a {src_data_path}/ {dst_data_path}"
            logger.info("rsync data with [%s]", cmd)
            check_run_command(cmd)
        else:
            logger.info("Copy data from source registry %s", src_data_path)
            if mode == "data-symlinks":
                _make_data_symlinks(src_data_path, dst_data_path)
            else:
                for path in (src_data_path / "data").iterdir():
                    dst_path = dst_data_path / "data" / path.name
                    shutil.copytree(path, dst_path, symlinks=True)

    @staticmethod
    def _check_environment_variables(params):
        """Reject internal skip-check environment variables when running in online mode."""
        if not params.offline:
            illegal_vars = [x for x in os.environ if x.startswith("__DSGRID_SKIP_CHECK")]
            if illegal_vars:
                msg = (
                    f"Internal environment variables to skip checks are not allowed to be set "
                    f"in online mode: {illegal_vars}"
                )
                raise Exception(msg)
458
+
459
+
460
+ def _make_data_symlinks(src, dst):
461
+ # registry/data/dataset_id/registry.json5
462
+ # registry/data/dataset_id/version/*.parquet
463
+ for dataset_id in (src / "data").iterdir():
464
+ if dataset_id.is_dir():
465
+ (dst / "data" / dataset_id.name).mkdir(parents=True)
466
+ for path in (src / "data" / dataset_id.name).iterdir():
467
+ if path.is_dir():
468
+ (dst / "data" / dataset_id.name / path.name).mkdir()
469
+ for data_file in path.iterdir():
470
+ os.symlink(
471
+ data_file.absolute(),
472
+ dst / "data" / dataset_id.name / path.name / data_file.name,
473
+ target_is_directory=data_file.is_dir(),
474
+ )
475
+ elif path.is_file():
476
+ shutil.copyfile(path, dst / "data" / dataset_id.name / path.name)
477
+
478
+
479
def get_registry_path(registry_path=None):
    """Resolve the registry path and verify that it exists.

    The first value that is not None wins, in this order: the registry_path
    argument, the DSGRID_REGISTRY_PATH environment variable, and
    dsgrid.common.LOCAL_REGISTRY.

    Raises
    ------
    ValueError
        Raised if the resolved path does not exist.

    """
    resolved = registry_path
    if resolved is None:
        resolved = os.environ.get("DSGRID_REGISTRY_PATH")
    if resolved is None:
        # TEMPORARY: Replace with S3_REGISTRY when that is supported
        resolved = LOCAL_REGISTRY

    if os.path.exists(resolved):
        return resolved

    msg = (
        f"Registry path {resolved} does not exist. To create the registry, "
        "run the following command:\n"
        "  dsgrid registry create $DSGRID_REGISTRY_PATH\n"
        "Then register dimensions, dimension mappings, projects, and datasets."
    )
    raise ValueError(msg)
500
+
501
+
502
+ def _should_use_remote_data(remote_path):
503
+ if not str(remote_path).lower().startswith("s3"):
504
+ # We are on a local filesystem. Use the remote path.
505
+ return True
506
+
507
+ use_remote_data = False
508
+ if "DSGRID_USE_LOCAL_DATA" in os.environ:
509
+ pass
510
+ elif sys.platform in ("darwin", "win32"):
511
+ # Local systems need to sync all load data files.
512
+ pass
513
+ elif on_hpc():
514
+ pass
515
+ elif "GITHUB_ACTION" in os.environ:
516
+ logger.info("Do not use remote data on GitHub CI")
517
+ else:
518
+ # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/identify_ec2_instances.html
519
+ try:
520
+ response = requests.get(
521
+ "http://169.254.169.254/latest/dynamic/instance-identity/document", timeout=2
522
+ )
523
+ ret = 0
524
+ except requests.ConnectTimeout:
525
+ logger.warning(
526
+ "Connection timed out while trying to read AWS identity. "
527
+ "If you are not running on AWS and would prefer to not experience this delay, set "
528
+ "the environment varible DSGRID_USE_LOCAL_DATA."
529
+ )
530
+ ret = 1
531
+ except Exception:
532
+ logger.exception("Failed to read identity document")
533
+ ret = 1
534
+
535
+ if ret == 0 and response.status_code == 200:
536
+ identity_data = response.json()
537
+ logger.info("Identity data: %s", identity_data)
538
+ if "instanceId" in identity_data:
539
+ logger.info("Use remote data on AWS")
540
+ use_remote_data = True
541
+ else:
542
+ logger.warning("Unknown payload from identity request.")
543
+
544
+ return use_remote_data