dsgrid-toolkit 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dsgrid-toolkit might be problematic. Click here for more details.
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +420 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +22 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +177 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +142 -0
- dsgrid/cli/dsgrid_admin.py +349 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +711 -0
- dsgrid/cli/registry.py +1773 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +35 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +187 -0
- dsgrid/config/common.py +131 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +684 -0
- dsgrid/config/dataset_schema_handler_factory.py +41 -0
- dsgrid/config/date_time_dimension_config.py +108 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +349 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +775 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/index_time_dimension_config.py +76 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1457 -0
- dsgrid/config/registration_models.py +199 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +200 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +899 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
- dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +44 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +218 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +213 -0
- dsgrid/dimension/time.py +531 -0
- dsgrid/dimension/time_utils.py +88 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +384 -0
- dsgrid/query/models.py +726 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +847 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +161 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +69 -0
- dsgrid/registry/dataset_config_generator.py +156 -0
- dsgrid/registry/dataset_registry_manager.py +734 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +185 -0
- dsgrid/registry/filesystem_data_store.py +141 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1616 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +662 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +544 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +545 -0
- dsgrid/spark/types.py +50 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +139 -0
- dsgrid/tests/make_us_data_registry.py +204 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +612 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +64 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +184 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
- dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
- dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,544 @@
|
|
|
1
|
+
"""Manages registration of all projects and datasets."""
|
|
2
|
+
|
|
3
|
+
import getpass
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import requests
|
|
7
|
+
import shutil
|
|
8
|
+
import sys
|
|
9
|
+
import uuid
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from sqlalchemy import Connection
|
|
13
|
+
|
|
14
|
+
from dsgrid.common import (
|
|
15
|
+
LOCAL_REGISTRY,
|
|
16
|
+
REMOTE_REGISTRY,
|
|
17
|
+
SYNC_EXCLUDE_LIST,
|
|
18
|
+
on_hpc,
|
|
19
|
+
)
|
|
20
|
+
from dsgrid.cloud.factory import make_cloud_storage_interface
|
|
21
|
+
from dsgrid.dsgrid_rc import DsgridRuntimeConfig
|
|
22
|
+
from dsgrid.exceptions import DSGInvalidOperation, DSGValueNotRegistered, DSGInvalidParameter
|
|
23
|
+
from dsgrid.utils.run_command import check_run_command
|
|
24
|
+
from dsgrid.filesystem.factory import make_filesystem_interface
|
|
25
|
+
from dsgrid.utils.spark import init_spark, get_active_session
|
|
26
|
+
from .common import (
|
|
27
|
+
DataStoreType,
|
|
28
|
+
RegistryManagerParams,
|
|
29
|
+
)
|
|
30
|
+
from dsgrid.registry.registry_database import RegistryDatabase
|
|
31
|
+
from dsgrid.registry.registry_interface import (
|
|
32
|
+
DatasetRegistryInterface,
|
|
33
|
+
DimensionMappingRegistryInterface,
|
|
34
|
+
DimensionRegistryInterface,
|
|
35
|
+
ProjectRegistryInterface,
|
|
36
|
+
)
|
|
37
|
+
from .dimension_mapping_registry_manager import DimensionMappingRegistryManager
|
|
38
|
+
from .dataset_registry_manager import DatasetRegistryManager
|
|
39
|
+
from .dimension_registry_manager import DimensionRegistryManager
|
|
40
|
+
from .project_registry_manager import ProjectRegistryManager
|
|
41
|
+
from .registry_database import DatabaseConnection
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
logger = logging.getLogger(__name__)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class RegistryManager:
|
|
48
|
+
"""Manages registration of all projects and datasets."""
|
|
49
|
+
|
|
50
|
+
def __init__(self, params: RegistryManagerParams, db: RegistryDatabase):
    """Construct the dimension, dimension mapping, dataset, and project managers.

    Parameters
    ----------
    params : RegistryManagerParams
        Shared registry settings; ``params.base_path`` is handed to every sub-manager.
    db : RegistryDatabase
        Registry database; also supplies the data store used by the dataset manager.

    """
    self._data_store = db.data_store
    self._check_environment_variables(params)
    # Only start a Spark session if one is not already active.
    if get_active_session() is None:
        init_spark("dsgrid")
    self._params = params

    # Each manager depends on the ones loaded before it, so the order matters.
    dimension_interface = DimensionRegistryInterface(db)
    self._dimension_mgr = DimensionRegistryManager.load(
        params.base_path, params, dimension_interface
    )

    mapping_interface = DimensionMappingRegistryInterface(db)
    self._dimension_mapping_mgr = DimensionMappingRegistryManager.load(
        params.base_path, params, self._dimension_mgr, mapping_interface
    )

    dataset_interface = DatasetRegistryInterface(db)
    self._dataset_mgr = DatasetRegistryManager.load(
        params.base_path,
        params,
        self._dimension_mgr,
        self._dimension_mapping_mgr,
        dataset_interface,
        self._data_store,
    )

    project_interface = ProjectRegistryInterface(db)
    self._project_mgr = ProjectRegistryManager.load(
        params.base_path,
        params,
        self._dataset_mgr,
        self._dimension_mgr,
        self._dimension_mapping_mgr,
        project_interface,
    )
|
|
83
|
+
|
|
84
|
+
@classmethod
def create(
    cls,
    conn: DatabaseConnection,
    data_path: Path,
    data_store_type: DataStoreType = DataStoreType.FILESYSTEM,
    remote_path=REMOTE_REGISTRY,
    user=None,
    scratch_dir=None,
    overwrite=False,
):
    """Creates a new RegistryManager at the given path.

    Parameters
    ----------
    conn : DatabaseConnection
        Connection to the registry database to create.
    data_path : Path
        Path where the registry data is stored. s3 paths are rejected, and the database
        file must not be located under this path.
    data_store_type : DataStoreType
    remote_path : str
        Path to remote registry.
    user : None | str
        Username. Defaults to the current user as reported by getpass.
    scratch_dir : None | Path
        Base directory for dsgrid temporary directories. Must be accessible on all compute
        nodes. Defaults to the scratch directory of the dsgrid runtime config.
    overwrite: bool
        Overwrite the database if it exists.

    Returns
    -------
    RegistryManager

    Raises
    ------
    DSGInvalidOperation
        Raised if the database already exists and overwrite is False.
        Raised if the database file is located under data_path.
    Exception
        Raised if data_path is an s3 path.

    """
    if RegistryDatabase.has_database(conn) and not overwrite:
        msg = f"database={conn.url} already exists. Choose a different name or set overwrite=True."
        raise DSGInvalidOperation(msg)

    db_filename = conn.try_get_filename()
    if db_filename is not None and db_filename.is_relative_to(data_path):
        msg = (
            f"The database path {db_filename} cannot be relative to the data_path {data_path}."
        )
        raise DSGInvalidOperation(msg)

    if not user:
        user = getpass.getuser()
    uid = str(uuid.uuid4())

    if str(data_path).startswith("s3"):
        msg = f"s3 is not currently supported: {data_path}"
        raise Exception(msg)

    fs_interface = make_filesystem_interface(data_path)
    # A new registry is always created in offline mode with no remote path.
    cloud_interface = make_cloud_storage_interface(
        data_path, "", offline=True, uuid=uid, user=user
    )
    scratch_dir = scratch_dir or DsgridRuntimeConfig.load().get_scratch_dir()
    params = RegistryManagerParams(
        base_path=Path(data_path),
        remote_path=remote_path,
        use_remote_data=False,
        fs_interface=fs_interface,
        cloud_interface=cloud_interface,
        offline=True,
        scratch_dir=scratch_dir,
    )
    # Any existing database was either rejected above or is being overwritten.
    RegistryDatabase.delete(conn)
    db = RegistryDatabase.create(
        conn, data_path, data_store_type=data_store_type, overwrite=overwrite
    )
    # Log success only after the database really exists.
    logger.info("Created registry with database=%s data_path=%s", conn.url, data_path)
    return cls(params, db)
|
|
156
|
+
|
|
157
|
+
@property
def dataset_manager(self) -> DatasetRegistryManager:
    """DatasetRegistryManager: the manager for registered datasets."""
    manager = self._dataset_mgr
    return manager
|
|
161
|
+
|
|
162
|
+
@property
def dimension_mapping_manager(self) -> DimensionMappingRegistryManager:
    """DimensionMappingRegistryManager: the manager for registered dimension mappings."""
    manager = self._dimension_mapping_mgr
    return manager
|
|
166
|
+
|
|
167
|
+
@property
def dimension_manager(self) -> DimensionRegistryManager:
    """DimensionRegistryManager: the manager for registered dimensions."""
    manager = self._dimension_mgr
    return manager
|
|
171
|
+
|
|
172
|
+
@property
def project_manager(self) -> ProjectRegistryManager:
    """ProjectRegistryManager: the manager for registered projects."""
    manager = self._project_mgr
    return manager
|
|
176
|
+
|
|
177
|
+
@classmethod
def load(
    cls,
    conn: DatabaseConnection,
    remote_path=REMOTE_REGISTRY,
    use_remote_data=None,
    offline_mode=True,
    user=None,
    no_prompts=False,
    scratch_dir=None,
):
    """Loads a registry from the given database connection.

    Parameters
    ----------
    conn : DatabaseConnection
    remote_path: str, optional
        path of the remote registry; default is REMOTE_REGISTRY
    use_remote_data: bool, None
        If set, use load data tables from remote_path. If not set, auto-determine what to do
        based on HPC or AWS EMR environment variables.
    offline_mode : bool
        Load registry in offline mode; default is True
    user : str
        username; defaults to the current user as reported by getpass
    no_prompts : bool
        If no_prompts is False, the user will be prompted to continue sync pulling the
        registry if lock files exist. If True, the sync proceeds without prompting.
    scratch_dir : None | Path
        Base directory for dsgrid temporary directories. Must be accessible on all compute
        nodes. Defaults to the scratch directory of the dsgrid runtime config.

    Returns
    -------
    RegistryManager

    Examples
    --------
    >>> from dsgrid.registry.registry_manager import RegistryManager
    >>> from dsgrid.registry.registry_database import DatabaseConnection
    >>> manager = RegistryManager.load(
    ...     DatabaseConnection(
    ...         hostname="dsgrid-registry.hpc.nrel.gov",
    ...         database="standard-scenarios",
    ...     )
    ... )
    """
    db = RegistryDatabase.connect(conn)
    data_path = db.get_data_path()
    if not user:
        user = getpass.getuser()
    uid = str(uuid.uuid4())
    fs_interface = make_filesystem_interface(data_path)

    if use_remote_data is None:
        use_remote_data = _should_use_remote_data(remote_path)

    cloud_interface = make_cloud_storage_interface(
        data_path, remote_path, offline=offline_mode, uuid=uid, user=user
    )

    if not offline_mode:
        # Sync unless the user explicitly declines. Binding the flag up front keeps every
        # path below defined (declined prompt, or lock files with no_prompts=True) and
        # matches the behavior of _data_sync.
        sync = True
        lock_files = list(cloud_interface.get_lock_files())
        if lock_files:
            msg = f"There are {len(lock_files)} lock files in the registry:"
            for lock_file in lock_files:
                msg = msg + "\n\t" + f"- {lock_file}"
            logger.info(msg)
            if not no_prompts:
                msg = (
                    msg
                    + "\n... Do you want to continue syncing the registry contents? [Y] >>> "
                )
                val = input(msg)
                # An empty answer accepts the default of "Y".
                if val != "" and val.lower() != "y":
                    logger.info("Skipping remote registry sync.")
                    sync = False

        if sync:
            logger.info("Sync configs from remote registry.")
            # NOTE: When creating a registry, only the /configs are pulled. To sync_pull /data, use the dsgrid registry data-sync CLI command.
            cloud_interface.sync_pull(
                remote_path + "/configs",
                str(data_path) + "/configs",
                exclude=SYNC_EXCLUDE_LIST,
                delete_local=True,
            )

    scratch_dir = scratch_dir or DsgridRuntimeConfig.load().get_scratch_dir()
    params = RegistryManagerParams(
        base_path=data_path,
        remote_path=remote_path,
        use_remote_data=use_remote_data,
        fs_interface=fs_interface,
        cloud_interface=cloud_interface,
        offline=offline_mode,
        scratch_dir=scratch_dir,
    )

    logger.info(
        "Loaded local registry at %s offline_mode=%s",
        conn.url,
        offline_mode,
    )
    return cls(params, db)
|
|
285
|
+
|
|
286
|
+
def data_sync(self, project_id, dataset_id, no_prompts=True):
    """Pull dataset data from the remote dsgrid registry.

    Parameters
    ----------
    project_id : str
        When given, sync datasets registered to this project: only dataset_id if it is
        also given, otherwise every dataset registered to the project.
    dataset_id : str
        When given without project_id, sync the latest version of this dataset.
    no_prompts : bool
        If no_prompts is False, the user will be prompted to continue sync pulling the registry if lock files exist. By default, True.

    Raises
    ------
    ValueError
        Raised if neither project_id nor dataset_id is provided.
    DSGValueNotRegistered
        Raised if the dataset is not registered, or not registered to the project.

    """
    if not (project_id or dataset_id):
        msg = "Must provide a dataset_id or project_id for dsgrid data-sync."
        raise ValueError(msg)

    if project_id:
        # Versions come from the project config so they match what the project registered.
        project_config = self.project_manager.get_by_id(project_id)
        if dataset_id:
            if dataset_id not in project_config.list_registered_dataset_ids():
                msg = f"No registered dataset ID = '{dataset_id}' registered to project ID = '{project_id}'"
                raise DSGValueNotRegistered(msg)
            targets = [(dataset_id, str(project_config.get_dataset(dataset_id).version))]
        else:
            targets = [
                (name, str(project_config.get_dataset(name).version))
                for name in project_config.list_registered_dataset_ids()
            ]
    else:
        # Only a dataset_id was given: use the latest version known to the dataset manager.
        if not self.dataset_manager.has_id(dataset_id):
            msg = f"No registered dataset ID = '{dataset_id}'"
            raise DSGValueNotRegistered(msg)
        latest = self.dataset_manager.get_latest_version(dataset_id)
        targets = [(dataset_id, latest)]

    for name, dataset_version in targets:
        self._data_sync(name, dataset_version, no_prompts)
|
|
323
|
+
|
|
324
|
+
def _data_sync(self, dataset_id, version, no_prompts=True):
    """Pull one dataset version and its registry.json5 from the remote registry.

    Parameters
    ----------
    dataset_id : str
    version : str
    no_prompts : bool
        If False and a lock file exists for the dataset, ask the user whether to continue.

    Raises
    ------
    ValueError
        Raised if the registry was loaded in offline mode.

    """
    cloud_interface = self._params.cloud_interface
    if self._params.offline:
        msg = "dsgrid data-sync only works in online mode."
        raise ValueError(msg)

    proceed = True
    lock_files = list(
        cloud_interface.get_lock_files(
            relative_path=f"{cloud_interface._s3_filesystem._bucket}/configs/datasets/{dataset_id}"
        )
    )
    if lock_files:
        assert len(lock_files) == 1
        notice = f"There are {len(lock_files)} lock files in the registry:" + "".join(
            "\n\t" + f"- {lock_file}" for lock_file in lock_files
        )
        logger.info(notice)
        if not no_prompts:
            answer = input(
                notice + "\n... Do you want to continue syncing the registry contents? [Y] >>> "
            )
            # An empty answer accepts the default of "Y".
            proceed = answer == "" or answer.lower() == "y"
            if not proceed:
                logger.info("Skipping remote registry sync.")

    if not proceed:
        logger.info(
            "Skipping remote registry data sync for %s, version=%s.", dataset_id, version
        )
        return

    logger.info("Sync data from remote registry for %s, version=%s.", dataset_id, version)
    # First the versioned data directory ...
    cloud_interface.sync_pull(
        remote_path=self._params.remote_path + f"/data/{dataset_id}/{version}",
        local_path=str(self._params.base_path) + f"/data/{dataset_id}/{version}",
        delete_local=True,
    )
    # ... then the dataset's registry file, which is a single file rather than a directory.
    cloud_interface.sync_pull(
        remote_path=self._params.remote_path + f"/data/{dataset_id}/registry.json5",
        local_path=str(self._params.base_path) + f"/data/{dataset_id}/registry.json5",
        delete_local=True,
        is_file=True,
    )
|
|
370
|
+
|
|
371
|
+
@property
def path(self):
    """Return the base path stored in the registry parameters."""
    base_path = self._params.base_path
    return base_path
|
|
374
|
+
|
|
375
|
+
def show(self, conn: Connection | None = None, filters=None, max_width=None, drop_fields=None):
    """Show the config tables of the project, dataset, dimension, and dimension mapping
    managers, in that order, forwarding the same display options to each."""
    display_options = {
        "conn": conn,
        "filters": filters,
        "max_width": max_width,
        "drop_fields": drop_fields,
    }
    self.project_manager.show(**display_options)
    self.dataset_manager.show(**display_options)
    self.dimension_manager.show(**display_options)
    self.dimension_mapping_manager.show(**display_options)
|
|
389
|
+
|
|
390
|
+
@staticmethod
def copy(
    src: DatabaseConnection, dst: DatabaseConnection, dst_data_path, mode="copy", force=False
):
    """Copy a registry to a new path.

    Parameters
    ----------
    src : DatabaseConnection
    dst : DatabaseConnection
    dst_data_path : Path
    mode : str
        Controls whether to copy all data, make symlinks to data files, or sync data with the
        rsync utility (not available on Windows). Options: 'copy', 'data-symlinks', 'rsync'
    force : bool
        Overwrite dst_data_path if it already exists. Does not apply if using rsync.

    Raises
    ------
    DSGInvalidParameter
        Raised if mode is not one of the supported options.
        Raised if src is not a valid registry.
        Raised if dst_data_path exists, mode is not 'rsync', and force is False.

    """
    # Reject an unsupported mode before touching anything, so that a bad call cannot
    # leave a copied database behind with no data.
    if mode not in ("copy", "data-symlinks", "rsync"):
        msg = f"mode={mode} is not supported"
        raise DSGInvalidParameter(msg)

    src_db = RegistryDatabase.connect(src)
    src_data_path = src_db.get_data_path()
    # TODO: This does not support the duckdb data store. Need to implement this copy operation
    # in the DataStoreInterface.
    if not any(x.name == "data" for x in src_data_path.iterdir()):
        msg = f"{src_data_path} is not a valid registry"
        raise DSGInvalidParameter(msg)

    if mode in ("copy", "data-symlinks"):
        if dst_data_path.exists():
            if force:
                shutil.rmtree(dst_data_path)
            else:
                msg = f"{dst_data_path} already exists."
                raise DSGInvalidParameter(msg)

    RegistryDatabase.copy(src, dst, dst_data_path)

    if mode == "rsync":
        # The trailing slash makes rsync copy the contents of the source directory.
        cmd = f"rsync -a {src_data_path}/ {dst_data_path}"
        logger.info("rsync data with [%s]", cmd)
        check_run_command(cmd)
        return

    logger.info("Copy data from source registry %s", src_data_path)
    if mode == "data-symlinks":
        _make_data_symlinks(src_data_path, dst_data_path)
    else:
        for path in (src_data_path / "data").iterdir():
            dst_path = dst_data_path / "data" / path.name
            shutil.copytree(path, dst_path, symlinks=True)
|
|
447
|
+
|
|
448
|
+
@staticmethod
def _check_environment_variables(params):
    """Raise if any __DSGRID_SKIP_CHECK* environment variable is set in online mode.

    Nothing is checked when ``params.offline`` is true.
    """
    if params.offline:
        return

    illegal_vars = [name for name in os.environ if name.startswith("__DSGRID_SKIP_CHECK")]
    if not illegal_vars:
        return

    msg = (
        "Internal environment variables to skip checks are not allowed to be set "
        f"in online mode: {illegal_vars}"
    )
    raise Exception(msg)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def _make_data_symlinks(src, dst):
    """Mirror ``src / "data"`` under ``dst / "data"``, symlinking the data files.

    Expected layout:
        registry/data/dataset_id/registry.json5
        registry/data/dataset_id/version/*.parquet

    Dataset and version directories are created in dst, every entry inside a version
    directory becomes a symlink to the absolute source path, and regular files directly
    inside a dataset directory are copied. Non-directory entries directly under
    ``src / "data"`` are ignored.
    """
    src_data = src / "data"
    dst_data = dst / "data"
    for dataset_dir in src_data.iterdir():
        if not dataset_dir.is_dir():
            continue

        dst_dataset_dir = dst_data / dataset_dir.name
        dst_dataset_dir.mkdir(parents=True)
        for entry in dataset_dir.iterdir():
            target = dst_dataset_dir / entry.name
            if entry.is_dir():
                target.mkdir()
                for data_file in entry.iterdir():
                    os.symlink(
                        data_file.absolute(),
                        target / data_file.name,
                        target_is_directory=data_file.is_dir(),
                    )
            elif entry.is_file():
                shutil.copyfile(entry, target)
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def get_registry_path(registry_path=None):
    """
    Resolve the registry path and verify that it exists.

    When registry_path is None, the DSGRID_REGISTRY_PATH environment variable is used;
    when that is also unset, dsgrid.common.LOCAL_REGISTRY is used.

    Raises
    ------
    ValueError
        Raised if the resolved path does not exist.
    """
    if registry_path is None:
        registry_path = os.environ.get("DSGRID_REGISTRY_PATH")
        if registry_path is None:
            # TEMPORARY: Replace with S3_REGISTRY when that is supported
            registry_path = LOCAL_REGISTRY

    if os.path.exists(registry_path):
        return registry_path

    msg = (
        f"Registry path {registry_path} does not exist. To create the registry, "
        "run the following command:\n"
        " dsgrid registry create $DSGRID_REGISTRY_PATH\n"
        "Then register dimensions, dimension mappings, projects, and datasets."
    )
    raise ValueError(msg)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
def _should_use_remote_data(remote_path):
    """Decide whether load data should be read from remote_path.

    Returns True when remote_path does not start with "s3" (case-insensitive). For an s3
    path, returns True only when the EC2 instance identity document can be fetched and
    contains an "instanceId"; every other situation returns False.
    """
    if not str(remote_path).lower().startswith("s3"):
        # We are on a local filesystem. Use the remote path.
        return True

    if "DSGRID_USE_LOCAL_DATA" in os.environ:
        return False
    if sys.platform in ("darwin", "win32"):
        # Local systems need to sync all load data files.
        return False
    if on_hpc():
        return False
    if "GITHUB_ACTION" in os.environ:
        logger.info("Do not use remote data on GitHub CI")
        return False

    # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/identify_ec2_instances.html
    try:
        response = requests.get(
            "http://169.254.169.254/latest/dynamic/instance-identity/document", timeout=2
        )
    except requests.ConnectTimeout:
        logger.warning(
            "Connection timed out while trying to read AWS identity. "
            "If you are not running on AWS and would prefer to not experience this delay, set "
            "the environment varible DSGRID_USE_LOCAL_DATA."
        )
        return False
    except Exception:
        logger.exception("Failed to read identity document")
        return False

    if response.status_code != 200:
        return False

    identity_data = response.json()
    logger.info("Identity data: %s", identity_data)
    if "instanceId" in identity_data:
        logger.info("Use remote data on AWS")
        return True

    logger.warning("Unknown payload from identity request.")
    return False
|