dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/tests/common.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import getpass
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from semver import VersionInfo
|
|
7
|
+
|
|
8
|
+
from dsgrid.exceptions import DSGInvalidParameter, DSGInvalidOperation
|
|
9
|
+
from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
|
|
10
|
+
from dsgrid.registry.project_registry_manager import ProjectRegistryManager
|
|
11
|
+
from dsgrid.registry.registry_manager import RegistryManager
|
|
12
|
+
from dsgrid.registry.common import DataStoreType, DatabaseConnection, VersionUpdateType
|
|
13
|
+
from dsgrid.utils.files import dump_data, load_data
|
|
14
|
+
|
|
15
|
+
# Locations of companion repositories, resolved relative to this file:
# parents[2] is the directory that contains the dsgrid repo itself.
TEST_PROJECT_PATH = Path(__file__).absolute().parents[2] / "dsgrid-test-data"
TEST_PROJECT_REPO = TEST_PROJECT_PATH / "test_efs"
TEST_STANDARD_SCENARIOS_PROJECT_REPO = TEST_PROJECT_PATH / "standard_scenarios_2021"
TEST_DATASET_DIRECTORY = TEST_PROJECT_PATH / "datasets"
# Name of the cached registry database used by the test suite.
TEST_REGISTRY_DATABASE = "cached-test-dsgrid"
# Relative paths below assume tests run from the repository root.
TEST_REGISTRY_BASE_PATH = Path("tests/data/registry")
TEST_REGISTRY_DATA_PATH = Path("tests/data/registry/registry_data")
TEST_EFS_REGISTRATION_FILE = Path("tests/data/test_efs_registration.json5")
# AWS_PROFILE_NAME = "nrel-aws-dsgrid"
# S3 bucket used when exercising the remote-registry code paths.
TEST_REMOTE_REGISTRY = "s3://nrel-dsgrid-registry-test"
CACHED_TEST_REGISTRY_DB = f"sqlite:///{TEST_REGISTRY_BASE_PATH}/cached_registry.db"
STANDARD_SCENARIOS_PROJECT_REPO = Path(__file__).parents[2] / "dsgrid-project-StandardScenarios"
IEF_PROJECT_REPO = Path(__file__).parents[2] / "dsgrid-project-IEF"
SIMPLE_STANDARD_SCENARIOS = TEST_PROJECT_PATH / "filtered_registries" / "simple_standard_scenarios"
SIMPLE_STANDARD_SCENARIOS_REGISTRY_DB = (
    f"sqlite:///{TEST_PROJECT_PATH}/filtered_registries/simple_standard_scenarios/registry.db"
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def create_local_test_registry(
    tmpdir: Path, conn=None, data_store_type: DataStoreType = DataStoreType.FILESYSTEM
):
    """Create an empty local test registry under ``tmpdir``.

    Parameters
    ----------
    tmpdir : Path
        Directory in which the registry database and data are created.
    conn : DatabaseConnection | None
        Optional connection; defaults to a sqlite file inside ``tmpdir``.
    data_store_type : DataStoreType
        Type of store to use for the registry data.

    Returns
    -------
    Path
        Path to the registry data directory.
    """
    connection = (
        DatabaseConnection(url=f"sqlite:///{tmpdir}/dsgrid-test.db") if conn is None else conn
    )
    registry_data = tmpdir / "registry_data"
    manager = RegistryManager.create(
        connection, registry_data, data_store_type=data_store_type, overwrite=True
    )
    manager.dispose()
    return registry_data
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def check_configs_update(base_dir, manager):
    """Run an update on one config of each type and collect the results.

    Parameters
    ----------
    base_dir : Path
    manager : RegistryManager

    Returns
    -------
    list
        One entry per updated config, ordered dimension, dimension mapping,
        dataset, project. Entries are ``(config_id, new_version)`` tuples,
        except the dimension entry, which is
        ``(config_id, dimension_type, new_version)``.
    """
    update_dir = base_dir / "updates"
    username = getpass.getuser()

    sub_managers = (
        manager.dimension_manager,
        manager.dimension_mapping_manager,
        manager.dataset_manager,
        manager.project_manager,
    )
    results = []
    for sub_manager in sub_managers:
        config_id = sub_manager.list_ids()[0]
        current_version = sub_manager.get_latest_version(config_id)
        check_config_update(update_dir, sub_manager, config_id, username, current_version)
        bumped_version = sub_manager.get_latest_version(config_id)
        if isinstance(sub_manager, DimensionRegistryManager):
            # Dimension entries additionally carry the dimension type.
            dimension_type = sub_manager.get_by_id(config_id).model.dimension_type
            results.append((config_id, dimension_type, bumped_version))
        else:
            results.append((config_id, bumped_version))

    return results
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def check_config_update(base_dir, mgr: ProjectRegistryManager, config_id, user, version):
    """Runs basic positive and negative update tests for the config.

    Dumps the config to disk, edits its description, verifies that invalid
    update requests are rejected, then performs a valid PATCH update and
    confirms the version was bumped.

    Parameters
    ----------
    base_dir : str
    mgr : RegistryManagerBase
    config_id : str
    user : str
    version : str
        Current latest version of the config.

    """
    config_file = Path(base_dir) / mgr.config_class().config_filename()
    assert not config_file.exists()
    try:
        mgr.dump(config_id, base_dir, force=True)
        # Dumping over an existing file without force must be rejected.
        with pytest.raises(DSGInvalidOperation):
            mgr.dump(config_id, base_dir)
        mgr.dump(config_id, base_dir, force=True)
        assert config_file.exists()
        # Make a harmless edit so the update has something to record.
        config_data = load_data(config_file)
        config_data["description"] += "; updated description"
        dump_data(config_data, config_file)
        # Negative case 1: config ID does not match the file contents.
        with pytest.raises(DSGInvalidParameter):
            mgr.update_from_file(
                config_file,
                "invalid_config_id",
                user,
                VersionUpdateType.PATCH,
                "update to description",
                version,
            )
        # Negative case 2: stale/incorrect expected version.
        with pytest.raises(DSGInvalidParameter):
            mgr.update_from_file(
                config_file,
                config_id,
                user,
                VersionUpdateType.PATCH,
                "update to description",
                str(VersionInfo.parse(version).bump_patch()),
            )

        # Positive case: correct ID and version; should bump the patch number.
        mgr.update_from_file(
            config_file,
            config_id,
            user,
            VersionUpdateType.PATCH,
            "update to description",
            version,
        )
        assert (
            VersionInfo.parse(mgr.get_latest_version(config_id))
            == VersionInfo.parse(version).bump_patch()
        )
    finally:
        # Always remove the dumped file so repeated runs start clean.
        if config_file.exists():
            os.remove(config_file)
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import getpass
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import rich_click as click
|
|
10
|
+
|
|
11
|
+
from dsgrid.cli.common import path_callback
|
|
12
|
+
from dsgrid.loggers import setup_logging, check_log_file_size
|
|
13
|
+
from dsgrid.registry.common import DataStoreType, DatabaseConnection
|
|
14
|
+
from dsgrid.registry.registry_manager import RegistryManager
|
|
15
|
+
from dsgrid.tests.common import (
|
|
16
|
+
create_local_test_registry,
|
|
17
|
+
TEST_REMOTE_REGISTRY,
|
|
18
|
+
TEST_PROJECT_REPO,
|
|
19
|
+
TEST_DATASET_DIRECTORY,
|
|
20
|
+
)
|
|
21
|
+
from dsgrid.utils.timing import timer_stats_collector
|
|
22
|
+
from dsgrid.utils.files import dump_data, load_data
|
|
23
|
+
from dsgrid.utils.id_remappings import (
|
|
24
|
+
map_dimension_names_to_ids,
|
|
25
|
+
replace_dimension_names_with_current_ids,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _find_file_with_stem(directory: Path, stem: str) -> Path | None:
|
|
33
|
+
"""Find a file in directory with the given stem, regardless of extension."""
|
|
34
|
+
for path in directory.iterdir():
|
|
35
|
+
if path.stem == stem:
|
|
36
|
+
return path
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _relative_path_for_stem(
    dataset_data_dir: Path, config_dir: Path, original: str, kind: str
) -> str:
    """Locate the file in dataset_data_dir sharing original's stem; return it relative to config_dir."""
    stem = Path(original).stem
    found = _find_file_with_stem(dataset_data_dir, stem)
    if found is None:
        msg = f"Could not find {kind} with stem '{stem}' in {dataset_data_dir}"
        raise FileNotFoundError(msg)
    return os.path.relpath(found, config_dir)


def update_dataset_config_paths(config_file: Path, dataset_id: str) -> None:
    """Update the data file paths in a dataset config to be relative to the config file.

    Rewrites ``data_file``, ``lookup_data_file``, and ``missing_associations``
    entries in the config's ``data_layout`` section, then saves the file.

    Parameters
    ----------
    config_file : Path
        Path to the dataset configuration file.
    dataset_id : str
        The dataset ID, used to locate the data files in TEST_DATASET_DIRECTORY.

    Raises
    ------
    FileNotFoundError
        If a referenced file (matched by stem) cannot be found in the
        dataset's data directory.
    """
    data = load_data(config_file)
    if "data_layout" not in data:
        # Nothing to rewrite; leave the config untouched.
        return

    data_layout = data["data_layout"]
    config_dir = config_file.parent.resolve()
    dataset_data_dir = (TEST_DATASET_DIRECTORY / dataset_id).resolve()

    if "data_file" in data_layout:
        data_layout["data_file"]["path"] = _relative_path_for_stem(
            dataset_data_dir, config_dir, data_layout["data_file"]["path"], "data file"
        )

    if "lookup_data_file" in data_layout and data_layout["lookup_data_file"] is not None:
        data_layout["lookup_data_file"]["path"] = _relative_path_for_stem(
            dataset_data_dir, config_dir, data_layout["lookup_data_file"]["path"], "lookup file"
        )

    if "missing_associations" in data_layout and data_layout["missing_associations"] is not None:
        data_layout["missing_associations"] = [
            _relative_path_for_stem(dataset_data_dir, config_dir, item, "missing associations")
            for item in data_layout["missing_associations"]
        ]

    dump_data(data, config_file)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@contextlib.contextmanager
def make_test_data_registry(
    registry_path,
    src_dir,
    include_projects=True,
    include_datasets=True,
    offline_mode=True,
    database_url: str | None = None,
    data_store_type: DataStoreType = DataStoreType.FILESYSTEM,
):
    """Creates a local registry from a dsgrid project source directory for testing.

    This is a context manager that yields the RegistryManager and disposes it on exit.

    Parameters
    ----------
    registry_path : Path
        Path in which the registry will be created.
    src_dir : Path
        Path containing source config files
    include_projects : bool
        If False, do not register any projects.
    include_datasets : bool
        If False, do not register any datasets.
    offline_mode : bool
        If False, use the test remote registry.
    database_url : str | None
        Registry database URL; defaults to a sqlite file inside registry_path.
    data_store_type: DataStoreType
        Type of store to use for the registry data.

    Yields
    ------
    RegistryManager
    """
    # Datasets require a project to submit to, so reject the inconsistent combination.
    if not include_projects and include_datasets:
        msg = "If include_datasets is True then include_projects must also be True."
        raise Exception(msg)
    url = f"sqlite:///{registry_path}/registry.db" if database_url is None else database_url
    conn = DatabaseConnection(url=url)
    create_local_test_registry(registry_path, conn=conn, data_store_type=data_store_type)
    # Hard-coded test datasets shipped with the test_efs project repo.
    dataset_dirs = [
        Path("datasets/modeled/comstock"),
        Path("datasets/modeled/comstock_unpivoted"),
    ]

    user = getpass.getuser()
    log_message = "Initial registration"
    if offline_mode:
        manager = RegistryManager.load(conn, offline_mode=offline_mode)
    else:
        manager = RegistryManager.load(
            conn, remote_path=TEST_REMOTE_REGISTRY, offline_mode=offline_mode
        )

    try:
        project_config_file = src_dir / "project.json5"
        project_id = load_data(project_config_file)["project_id"]
        dataset_config_files = [src_dir / path / "dataset.json5" for path in dataset_dirs]
        dataset_mapping_files = [
            src_dir / path / "dimension_mappings.json5" for path in dataset_dirs
        ]
        # A dataset may have no dimension-mapping file; mark those entries None.
        for i, filename in enumerate(dataset_mapping_files):
            if not filename.exists():
                dataset_mapping_files[i] = None
        dataset_ids = [
            load_data(config_file)["dataset_id"] for config_file in dataset_config_files
        ]

        if include_projects:
            print("\n 1. register project: \n")
            manager.project_manager.register(
                project_config_file,
                user,
                log_message,
            )
        if include_datasets:
            for i, dataset_config_file in enumerate(dataset_config_files):
                dataset_id = dataset_ids[i]
                print(f"\n 2. register dataset {dataset_id}: \n")
                dataset_mapping_file = dataset_mapping_files[i]
                # Re-read the dimension-name map each iteration: earlier
                # registrations may have added dimensions.
                mappings = map_dimension_names_to_ids(manager.dimension_manager)
                # NOTE: these two calls mutate the config file in place.
                replace_dimension_names_with_current_ids(dataset_config_file, mappings)
                update_dataset_config_paths(dataset_config_file, dataset_id)
                manager.dataset_manager.register(
                    dataset_config_file,
                    user,
                    log_message,
                )
                print(f"\n 3. submit dataset {dataset_id} to project\n")
                manager.project_manager.submit_dataset(
                    project_id,
                    dataset_id,
                    user,
                    log_message,
                    dimension_mapping_file=dataset_mapping_file,
                )
        yield manager
    finally:
        # Always release the manager's resources, even if registration failed.
        manager.dispose()
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@click.command()
@click.argument(
    "registry-path",
    type=Path,
    default=f"{Path.home()}/.dsgrid-test-registry",
    callback=path_callback,
)
@click.option(
    "-f",
    "--force",
    default=False,
    is_flag=True,
    show_default=True,
    help="Delete registry-path if it exists.",
)
@click.option(
    "-p",
    "--project-dir",
    default=TEST_PROJECT_REPO,
    help="path to a project repository",
    callback=path_callback,
)
@click.option(
    "-t",
    "--data-store-type",
    type=click.Choice([x.value for x in DataStoreType]),
    default=DataStoreType.FILESYSTEM.value,
    show_default=True,
    help="Type of store to use for the registry data.",
    callback=lambda *x: DataStoreType(x[2]),
)
@click.option(
    "--verbose",
    is_flag=True,
    default=False,
    show_default=True,
    help="Enable verbose log output.",
)
def run(
    registry_path: Path,
    force: bool,
    project_dir: Path,
    data_store_type: DataStoreType,
    verbose: bool,
):
    """Creates a local registry from a dsgrid project source directory for testing."""
    level = logging.DEBUG if verbose else logging.INFO
    log_file = Path("test_dsgrid_project.log")
    check_log_file_size(log_file, no_prompts=True)
    setup_logging("dsgrid", log_file, console_level=level, file_level=level, mode="a")
    if registry_path.exists():
        if force:
            shutil.rmtree(registry_path)
        else:
            # BUG FIX: return here. Previously execution fell through and
            # os.makedirs() raised FileExistsError instead of exiting cleanly.
            print(f"{registry_path} already exists. Use --force to overwrite.")
            return
    os.makedirs(registry_path)
    # Copy the project to a scratch location because registration mutates the
    # config files in place (dimension-ID and path rewrites).
    tmp_project_dir = Path(tempfile.gettempdir()) / "tmp_test_project_dir"
    if tmp_project_dir.exists():
        shutil.rmtree(tmp_project_dir)
    shutil.copytree(project_dir, tmp_project_dir)
    try:
        with make_test_data_registry(
            registry_path,
            tmp_project_dir / "dsgrid_project",
            data_store_type=data_store_type,
        ):
            pass  # Manager is created and disposed in context manager
    finally:
        timer_stats_collector.log_stats()
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# Allow direct invocation as a script in addition to the console entry point.
if __name__ == "__main__":
    run()
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import shutil
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import rich_click as click
|
|
6
|
+
|
|
7
|
+
from dsgrid.loggers import setup_logging, check_log_file_size
|
|
8
|
+
from dsgrid.query.models import ProjectQueryModel
|
|
9
|
+
from dsgrid.registry.dataset_registry import DatasetRegistry
|
|
10
|
+
from dsgrid.utils.run_command import check_run_command
|
|
11
|
+
from dsgrid.utils.timing import timer_stats_collector
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@click.command()
@click.argument(
    "query_files",
    nargs=-1,
    type=click.Path(exists=True),
    callback=lambda *x: [Path(y) for y in x[2]],
)
@click.option(
    "-r", "--registry-path", required=True, callback=lambda *x: Path(x[2]), help="Path to registry"
)
@click.option(
    "-o",
    "--output",
    default="query_output",
    show_default=True,
    type=click.Path(),
    help="Output directory for query results",
    callback=lambda *x: Path(x[2]),
)
@click.option(
    "-p",
    "--project-id",
    default="dsgrid_conus_2022",
    show_default=True,
    type=str,
    help="Project ID",
)
@click.option(
    "--verbose", is_flag=True, default=False, show_default=True, help="Enable verbose log output."
)
def run(query_files, project_id, registry_path, output, verbose):
    """Registers derived datasets in a local registry for testing."""
    level = logging.DEBUG if verbose else logging.INFO
    log_file = Path("dsgrid_registration.log")
    check_log_file_size(log_file, no_prompts=True)
    # Rebind the module-level logger name to the configured logger so the
    # handler adjustment below sees the real handlers.
    logger = setup_logging(__name__, log_file, console_level=level, file_level=level, mode="a")
    try:
        _run_registration(query_files, project_id, registry_path, output)
    finally:
        # Raise the console level so that timer stats only go to the log file.
        # (Fixed: previous code used enumerate() and discarded the index.)
        for handler in logger.handlers:
            if handler.name == "console":
                handler.setLevel(logging.WARNING)
                break

        timer_stats_collector.log_stats()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _run_registration(
    query_files: list[Path], project_id: str, registry_path: Path, query_output_dir: Path
):
    """Run each query, build its derived-dataset config, and register-and-submit it.

    For every query file this shells out to the dsgrid CLI three times:
    run the project query, create the derived-dataset config from the query
    output, then register and submit the dataset to the project.
    """
    log_message = "Submit derived dataset"
    query_output_dir.mkdir(exist_ok=True)
    derived_dataset_config_dir = query_output_dir / "derived_dataset_configs"
    # Start from a clean config directory on every invocation.
    if derived_dataset_config_dir.exists():
        shutil.rmtree(derived_dataset_config_dir)
    derived_dataset_config_dir.mkdir()
    for query_file in query_files:
        logger.info("Register derived dataset from %s", query_file)
        query = ProjectQueryModel.from_file(query_file)
        dataset_id = query.project.dataset.dataset_id
        dataset_config_dir = derived_dataset_config_dir / dataset_id
        dataset_config_file = dataset_config_dir / DatasetRegistry.config_filename()

        # NOTE(review): commands are built by interpolation; paths containing
        # spaces would break the splitting done by check_run_command.
        commands = (
            f"dsgrid query project run --registry-path={registry_path} "
            f"-o {query_output_dir} {query_file}",
            "dsgrid query project create-derived-dataset-config "
            f"--registry-path={registry_path} {query_output_dir / dataset_id} {dataset_config_dir}",
            f"dsgrid registry --path {registry_path} projects "
            f"register-and-submit-dataset -c {dataset_config_file} -p {project_id} "
            f"-l '{log_message}' -d {query_output_dir / dataset_id}",
        )
        for command in commands:
            logger.info(command)
            check_run_command(command)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Allow direct invocation as a script in addition to the console entry point.
if __name__ == "__main__":
    run()
|
dsgrid/tests/utils.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from dsgrid.spark.types import use_duckdb
|
|
4
|
+
from dsgrid.utils.spark import get_spark_session
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def read_parquet(filename: Path):
    """Read a Parquet file and load it into cache. This helps debugging with pytest --pdb.

    If you don't use this, the parquet file will get deleted on a failure and you won't be able
    to inspect the dataframe.
    """
    session = get_spark_session()
    df = session.read.parquet(str(filename))
    if use_duckdb():
        # DuckDB backend has no Spark-style cache; return as-is.
        return df
    # cache() is lazy; count() forces materialization into memory.
    df.cache()
    df.count()
    return df
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def read_parquet_two_table_format(path: Path):
    """Join load_data with load_data_lookup on "id" and drop the join key."""
    session = get_spark_session()
    data = session.read.parquet(str(path / "load_data.parquet"))
    lookup_table = session.read.parquet(str(path / "load_data_lookup.parquet"))
    return data.join(lookup_table, on="id").drop("id")
|
dsgrid/time/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Functions to perform time conversions"""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from dsgrid.time.types import DayType, Season
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def convert_datetime_to_day_type(timestamp):
    """Returns the day type for the datetime.

    Parameters
    ----------
    timestamp : datetime.datetime

    Returns
    -------
    str
        DayType id

    """
    # datetime.weekday(): Monday == 0 ... Sunday == 6, so 5 and 6 are the weekend.
    is_weekend = timestamp.weekday() >= 5
    return DayType.WEEKEND.value if is_weekend else DayType.WEEKDAY.value
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def convert_datetime_to_season(timestamp):
    """Returns the season for the datetime.

    Parameters
    ----------
    timestamp : datetime.datetime

    Returns
    -------
    str
        Season id

    """
    # TODO: dates do change slightly every year. Is this close enough?
    # dates also change by region, it's weather driven.
    yr = timestamp.year
    # Outside [Mar 20, Dec 21] is winter; otherwise walk the boundaries forward.
    if not (datetime(yr, 3, 20) <= timestamp <= datetime(yr, 12, 21)):
        return Season.WINTER.value
    if timestamp < datetime(yr, 6, 20):
        return Season.SPRING.value
    if timestamp < datetime(yr, 9, 22):
        return Season.SUMMER.value
    return Season.AUTUMN.value
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def interpret_datetime(timestamp):
    """Return a datetime object from a timestamp string.

    Tries a fixed list of ISO-like formats in order and returns the first
    successful parse.

    Parameters
    ----------
    timestamp : str

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If the string matches none of the supported formats.

    """
    formats = (
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S.%f",
        "%Y-%m-%d %H:%M:%S.%f",
    )

    # Plain loop instead of the former enumerate/last-index re-raise: clearer,
    # and it cannot silently return None if the format list were ever empty.
    for fmt in formats:
        try:
            return datetime.strptime(timestamp, fmt)
        except ValueError:
            continue
    msg = f"Could not interpret {timestamp!r} with formats {formats}"
    raise ValueError(msg)
|
dsgrid/time/types.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Types related to time"""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import NamedTuple
|
|
5
|
+
|
|
6
|
+
from dsgrid.data_models import DSGEnum
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DayType(DSGEnum):
    """Day types"""

    # Used by convert_datetime_to_day_type: Sat/Sun vs Mon-Fri.
    WEEKEND = "weekend"
    WEEKDAY = "weekday"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Season(DSGEnum):
    """Seasons"""

    WINTER = "winter"
    SPRING = "spring"
    SUMMER = "summer"
    AUTUMN = "autumn"
    # FALL reuses AUTUMN's value; under standard enum.Enum semantics this makes
    # it an alias of AUTUMN, not a distinct member (assumes DSGEnum follows
    # Enum aliasing -- confirm).
    FALL = "autumn"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# The types below represent the timestamps that exist as columns in all datasets.
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DatetimeTimestampType(NamedTuple):
    """Single column with datetime."""

    # The sole time column for datetime-formatted datasets.
    timestamp: datetime
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class AnnualTimestampType(NamedTuple):
    """Single column with only year."""

    # Calendar year, e.g. 2030.
    time_year: int
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class OneWeekPerMonthByHourType(NamedTuple):
    """Columns of representative time with one week per month."""

    month: int
    # 0 = Monday, 6 = Sunday. Follows pyspark.sql.functions.weekday and Python datetime.weekday.
    day_of_week: int
    hour: int
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class OneWeekdayDayAndOneWeekendDayPerMonthByHourType(NamedTuple):
    """Columns of representative time with month, hour, and weekday vs weekend."""

    month: int
    # True for the representative weekday, False for the representative weekend day.
    is_weekday: bool
    hour: int
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class IndexTimestampType(NamedTuple):
    """Single column with numerical indices."""

    # Integer index into the dimension's time range.
    time_index: int
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class StringTimestampType(NamedTuple):
    """Single column with time (must include offset) as str."""

    # Timestamp text including a UTC offset.
    timestamp: str
|
dsgrid/units/__init__.py
ADDED
|
File without changes
|