dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/config/registration_models.py
@@ -0,0 +1,188 @@
+"""Contains data models to control bulk registration of projects and datasets."""
+
+from pathlib import Path
+from typing import Any, Iterable
+
+from pydantic import Field, ValidationInfo, field_validator, model_validator
+
+from dsgrid.data_models import DSGBaseModel
+from dsgrid.dimension.base_models import DimensionType
+from dsgrid.utils.files import load_data
+
+
+class ProjectRegistrationModel(DSGBaseModel):
+    """Defines a project to be registered."""
+
+    project_id: str = Field(description="Project ID")
+    config_file: Path = Field(description="Path to project.json5")
+    log_message: str | None = Field(
+        default=None,
+        description="Log message to use when registering the project. Defaults to an auto-generated message.",
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def fix_paths(cls, data: dict[str, Any]) -> dict[str, Any]:
+        _fix_paths(data, ("config_file",))
+        return data
+
+    @field_validator("log_message")
+    def fix_log_message(cls, log_message: str | None, info: ValidationInfo) -> str | None:
+        if log_message is None and "project_id" in info.data:
+            log_message = f"Register project {info.data['project_id']}"
+        return log_message
+
+
+class DatasetRegistrationModel(DSGBaseModel):
+    """Defines a dataset to be registered."""
+
+    dataset_id: str = Field(description="Dataset ID")
+    config_file: Path = Field(description="Path to dataset.json5")
+    replace_dimension_names_with_ids: bool = Field(
+        description="Replace the dimension entries with IDs of dimensions in the database "
+        "with matching names. Typically only useful for tests.",
+        default=False,
+    )
+    log_message: str | None = Field(
+        default=None,
+        description="Log message to use when registering the dataset. Defaults to an auto-generated message.",
+    )
+
+    @field_validator("log_message")
+    def fix_log_message(cls, log_message: str | None, info: ValidationInfo) -> str | None:
+        if log_message is None and "dataset_id" in info.data:
+            log_message = f"Register dataset {info.data['dataset_id']}"
+        return log_message
+
+    @model_validator(mode="before")
+    @classmethod
+    def fix_paths(cls, data: dict[str, Any]) -> dict[str, Any]:
+        _fix_paths(data, ("config_file",))
+        return data
+
+
+class DatasetSubmissionModel(DSGBaseModel):
+    """Defines how a dataset should be submitted to a project."""
+
+    dataset_id: str
+    project_id: str
+    dimension_mapping_file: Path | None = Field(
+        description="Path to file containing mappings of dataset-to-project dimensions",
+        default=None,
+    )
+    dimension_mapping_references_file: Path | None = Field(
+        description="Path to file containing references to mappings of dataset-to-project dimensions",
+        default=None,
+    )
+    replace_dimension_mapping_names_with_ids: bool = Field(
+        description="Replace the dimension mapping entries with IDs of dimension mappings "
+        "in the database with matching names. Typically only useful for tests.",
+        default=False,
+    )
+    autogen_reverse_supplemental_mappings: set[DimensionType] = Field(
+        description="Dimensions on which to attempt to create reverse mappings from supplemental dimensions.",
+        default=set(),
+    )
+    log_message: str | None = Field(
+        default=None,
+        description="Log message to use when submitting the dataset. Defaults to an auto-generated message.",
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def fix_autogen_reverse_supplemental_mappings(cls, data: dict[str, Any]) -> dict[str, Any]:
+        if "autogen_reverse_supplemental_mappings" in data:
+            data["autogen_reverse_supplemental_mappings"] = {
+                DimensionType(x) for x in data["autogen_reverse_supplemental_mappings"]
+            }
+        return data
+
+    @field_validator("log_message")
+    def fix_log_message(cls, log_message: str | None, info: ValidationInfo) -> str | None:
+        if log_message is None and "dataset_id" in info.data:
+            log_message = (
+                f"Submit dataset {info.data['dataset_id']} to project {info.data['project_id']}"
+            )
+        return log_message
+
+
+class SubmittedDatasetsJournal(DSGBaseModel):
+    """Defines a dataset that was successfully submitted to a project."""
+
+    dataset_id: str
+    project_id: str
+
+
+class RegistrationJournal(DSGBaseModel):
+    """Defines projects and datasets that were successfully registered."""
+
+    registered_projects: list[str] = []
+    registered_datasets: list[str] = []
+    submitted_datasets: list[SubmittedDatasetsJournal] = []
+
+    def add_dataset(self, dataset_id: str) -> None:
+        assert dataset_id not in self.registered_datasets, dataset_id
+        self.registered_datasets.append(dataset_id)
+
+    def add_project(self, project_id: str) -> None:
+        assert project_id not in self.registered_projects, project_id
+        self.registered_projects.append(project_id)
+
+    def add_submitted_dataset(self, dataset_id: str, project_id: str) -> None:
+        entry = SubmittedDatasetsJournal(dataset_id=dataset_id, project_id=project_id)
+        assert entry not in self.submitted_datasets, entry
+        self.submitted_datasets.append(entry)
+
+    def has_entries(self) -> bool:
+        return (
+            bool(self.registered_projects)
+            or bool(self.registered_datasets)
+            or bool(self.submitted_datasets)
+        )
+
+
+class RegistrationModel(DSGBaseModel):
+    """Defines a list of projects and datasets to be registered."""
+
+    projects: list[ProjectRegistrationModel] = Field(description="List of projects to register.")
+    datasets: list[DatasetRegistrationModel] = Field(description="List of datasets to register.")
+    dataset_submissions: list[DatasetSubmissionModel] = Field(
+        description="List of datasets to be submitted to projects."
+    )
+
+    def filter_by_journal(self, journal: RegistrationJournal) -> "RegistrationModel":
+        """Return a new instance of RegistrationModel by filtering an existing instance with
+        a journal.
+        """
+        projects = list(
+            filter(lambda x: x.project_id not in journal.registered_projects, self.projects)
+        )
+        datasets = list(
+            filter(lambda x: x.dataset_id not in journal.registered_datasets, self.datasets)
+        )
+        dataset_submissions = list(
+            filter(
+                lambda x: SubmittedDatasetsJournal(
+                    dataset_id=x.dataset_id, project_id=x.project_id
+                )
+                not in journal.submitted_datasets,
+                self.dataset_submissions,
+            )
+        )
+        return RegistrationModel(
+            projects=projects,
+            datasets=datasets,
+            dataset_submissions=dataset_submissions,
+        )
+
+
+def _fix_paths(data: dict[str, Any], fields: Iterable[str]) -> None:
+    for field in fields:
+        val = data.get(field)
+        if isinstance(val, str):
+            data[field] = Path(val)
+
+
+def create_registration(input_file: Path):
+    """Create registration inputs."""
+    return RegistrationModel(**load_data(input_file))
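
For orientation, here is a minimal sketch of how these models fit together, assuming a hypothetical registration.json5 file and hypothetical project/dataset IDs (nothing below ships with the package):

from pathlib import Path

from dsgrid.config.registration_models import RegistrationJournal, create_registration

# Parse the bulk-registration input file (path is hypothetical).
registration = create_registration(Path("registration.json5"))

# A journal records what already succeeded so a re-run can skip it.
journal = RegistrationJournal()
journal.add_project("my_project")
journal.add_submitted_dataset("my_dataset", "my_project")

# filter_by_journal returns a new RegistrationModel with the completed
# entries removed, which is what makes bulk registration resumable.
remaining = registration.filter_by_journal(journal)
print(journal.has_entries(), len(remaining.dataset_submissions))
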
dsgrid/config/representative_period_time_dimension_config.py
@@ -0,0 +1,194 @@
+import abc
+import logging
+from datetime import timedelta
+from typing import Type, Any, Union
+
+import chronify
+
+from dsgrid.dimension.time import RepresentativePeriodFormat, TimeIntervalType
+from dsgrid.time.types import (
+    OneWeekPerMonthByHourType,
+    OneWeekdayDayAndOneWeekendDayPerMonthByHourType,
+)
+from .dimensions import RepresentativePeriodTimeDimensionModel
+from .time_dimension_base_config import TimeDimensionBaseConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class RepresentativePeriodTimeDimensionConfig(TimeDimensionBaseConfig):
+    """Provides an interface to a RepresentativePeriodTimeDimensionModel."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # We expect the list of required formats to grow.
+        # It's possible that one function (or set of functions) can handle all permutations
+        # of parameters. We can make that determination once we have requirements for more
+        # formats.
+        match self.model.format:
+            case RepresentativePeriodFormat.ONE_WEEK_PER_MONTH_BY_HOUR:
+                self._format_handler = OneWeekPerMonthByHourHandler()
+            case RepresentativePeriodFormat.ONE_WEEKDAY_DAY_AND_ONE_WEEKEND_DAY_PER_MONTH_BY_HOUR:
+                self._format_handler = OneWeekdayDayAndWeekendDayPerMonthByHourHandler()
+            case _:
+                msg = self.model.format.value
+                raise NotImplementedError(msg)
+
+    def supports_chronify(self) -> bool:
+        return True
+
+    def to_chronify(
+        self,
+    ) -> Union[chronify.RepresentativePeriodTimeTZ, chronify.RepresentativePeriodTimeNTZ]:
+        if len(self._model.ranges) != 1:
+            msg = (
+                "Mapping RepresentativePeriodTime with chronify is only supported with one range: "
+                f"{self._model.ranges}"
+            )
+            raise NotImplementedError(msg)
+        range_ = self._model.ranges[0]
+        if range_.start != 1 or range_.end != 12:
+            msg = (
+                "Mapping RepresentativePeriodTime with chronify is only supported with a full year: "
+                f"{range_}"
+            )
+            raise NotImplementedError(msg)
+        # RepresentativePeriodTimeDimensionModel does not map to NTZ at the moment
+        if isinstance(self._format_handler, OneWeekPerMonthByHourHandler) or isinstance(
+            self._format_handler, OneWeekdayDayAndWeekendDayPerMonthByHourHandler
+        ):
+            return chronify.RepresentativePeriodTimeTZ(
+                measurement_type=self._model.measurement_type,
+                interval_type=self._model.time_interval_type,
+                time_format=chronify.RepresentativePeriodFormat(self._model.format.value),
+                time_zone_column="time_zone",
+            )
+
+        msg = f"Cannot chronify time_config for {self._format_handler}"
+        raise NotImplementedError(msg)
+
+    @staticmethod
+    def model_class() -> RepresentativePeriodTimeDimensionModel:
+        return RepresentativePeriodTimeDimensionModel
+
+    def get_frequency(self) -> timedelta:
+        return self._format_handler.get_frequency()
+
+    def get_start_times(self) -> list[Any]:
+        return self._format_handler.get_start_times(self.model.ranges)
+
+    def get_lengths(self) -> list[int]:
+        return self._format_handler.get_lengths(self.model.ranges)
+
+    def get_load_data_time_columns(self) -> list[str]:
+        return self._format_handler.get_load_data_time_columns()
+
+    def get_time_zone(self) -> None:
+        return None
+
+    def get_tzinfo(self) -> None:
+        return None
+
+    def get_time_interval_type(self) -> TimeIntervalType:
+        return self.model.time_interval_type
+
+
+class RepresentativeTimeFormatHandlerBase(abc.ABC):
+    """Provides implementations for different representative time formats."""
+
+    @staticmethod
+    @abc.abstractmethod
+    def get_representative_time_type() -> Type:
+        """Return the time type representing the data."""
+
+    @abc.abstractmethod
+    def get_frequency(self):
+        """Return the frequency.
+
+        Returns
+        -------
+        timedelta
+
+        """
+
+    @staticmethod
+    @abc.abstractmethod
+    def get_load_data_time_columns():
+        """Return the required timestamp columns in the load data table.
+
+        Returns
+        -------
+        list
+
+        """
+
+
+class OneWeekPerMonthByHourHandler(RepresentativeTimeFormatHandlerBase):
+    """Handler for format with hourly data that includes one week per month."""
+
+    @staticmethod
+    def get_representative_time_type() -> OneWeekPerMonthByHourType:
+        return OneWeekPerMonthByHourType
+
+    def get_frequency(self) -> timedelta:
+        return timedelta(hours=1)
+
+    @staticmethod
+    def get_start_times(ranges) -> list[OneWeekPerMonthByHourType]:
+        """Get the starting combination of (month, day_of_week, hour) based on sorted order"""
+        start_times = []
+        for model in ranges:
+            start_times.append(OneWeekPerMonthByHourType(month=model.start, day_of_week=0, hour=0))
+        return start_times
+
+    @staticmethod
+    def get_lengths(ranges) -> list[int]:
+        """Get the number of unique combinations of (month, day_of_week, hour)"""
+        lengths = []
+        for model in ranges:
+            n_months = model.end - model.start + 1
+            lengths.append(n_months * 7 * 24)
+        return lengths
+
+    @staticmethod
+    def get_load_data_time_columns() -> list[str]:
+        return list(OneWeekPerMonthByHourType._fields)
+
+
+class OneWeekdayDayAndWeekendDayPerMonthByHourHandler(RepresentativeTimeFormatHandlerBase):
+    """Handler for format with hourly data that includes one weekday day and one weekend day
+    per month.
+    """
+
+    @staticmethod
+    def get_representative_time_type() -> OneWeekdayDayAndOneWeekendDayPerMonthByHourType:
+        return OneWeekdayDayAndOneWeekendDayPerMonthByHourType
+
+    def get_frequency(self) -> timedelta:
+        return timedelta(hours=1)
+
+    @staticmethod
+    def get_start_times(ranges) -> list[OneWeekdayDayAndOneWeekendDayPerMonthByHourType]:
+        """Get the starting combination of (month, hour, is_weekday) based on sorted order"""
+        start_times = []
+        for model in ranges:
+            start_times.append(
+                OneWeekdayDayAndOneWeekendDayPerMonthByHourType(
+                    month=model.start, hour=0, is_weekday=False
+                )
+            )
+        return start_times
+
+    @staticmethod
+    def get_lengths(ranges) -> list[int]:
+        """Get the number of unique combinations of (month, hour, is_weekday)"""
+        lengths = []
+        for model in ranges:
+            n_months = model.end - model.start + 1
+            lengths.append(n_months * 24 * 2)
+        return lengths
+
+    @staticmethod
+    def get_load_data_time_columns() -> list[str]:
+        return list(OneWeekdayDayAndOneWeekendDayPerMonthByHourType._fields)
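
The lengths arithmetic is easy to sanity-check: one week per month at hourly resolution over a full year gives 12 × 7 × 24 = 2016 unique (month, day_of_week, hour) combinations, while one weekday day plus one weekend day per month gives 12 × 24 × 2 = 576 (month, hour, is_weekday) combinations. A small sketch, using SimpleNamespace as a stand-in for the real range models:

from types import SimpleNamespace

from dsgrid.config.representative_period_time_dimension_config import (
    OneWeekPerMonthByHourHandler,
    OneWeekdayDayAndWeekendDayPerMonthByHourHandler,
)

# Stand-in for a ranges list covering months 1-12.
full_year = [SimpleNamespace(start=1, end=12)]

assert OneWeekPerMonthByHourHandler.get_lengths(full_year) == [2016]  # 12 * 7 * 24
assert OneWeekdayDayAndWeekendDayPerMonthByHourHandler.get_lengths(full_year) == [576]  # 12 * 24 * 2
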
dsgrid/config/simple_models.py
@@ -0,0 +1,49 @@
+"""Defines simplified data models for testing and filtering."""
+
+from pydantic import field_validator, model_validator, Field
+
+from dsgrid.data_models import DSGBaseModel
+from dsgrid.dimension.base_models import DimensionType
+
+
+class DimensionSimpleModel(DSGBaseModel):
+    dimension_type: DimensionType
+    dimension_name: str | None = None
+    record_ids: list[str]
+
+
+class DimensionsSimpleModel(DSGBaseModel):
+    base_dimensions: list[DimensionSimpleModel]
+    supplemental_dimensions: list[DimensionSimpleModel] = Field(default=[])
+
+    @field_validator("base_dimensions")
+    @classmethod
+    def check_base_dimensions(cls, base_dimensions):
+        dimension_types = {x.dimension_type for x in base_dimensions}
+        if len(dimension_types) != len(base_dimensions):
+            msg = "base_dimensions cannot contain duplicate dimension types"
+            raise ValueError(msg)
+        return base_dimensions
+
+    @model_validator(mode="after")
+    def check_supplemental_dimensions(self) -> "DimensionsSimpleModel":
+        for dim in self.supplemental_dimensions:
+            if dim.dimension_name is None:
+                msg = f"supplemental dimensions must define dimension_name: {dim}"
+                raise ValueError(msg)
+        return self
+
+
+class DatasetSimpleModel(DSGBaseModel):
+    dataset_id: str
+    dimensions: list[DimensionSimpleModel]
+
+
+class ProjectSimpleModel(DSGBaseModel):
+    project_id: str
+    dimensions: DimensionsSimpleModel
+
+
+class RegistrySimpleModel(DSGBaseModel):
+    projects: list[ProjectSimpleModel]
+    datasets: list[DatasetSimpleModel]
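
A short sketch of the validation behavior above; the record IDs are illustrative, and DimensionType.GEOGRAPHY is assumed to be a member of the enum:

from dsgrid.config.simple_models import DimensionSimpleModel, DimensionsSimpleModel
from dsgrid.dimension.base_models import DimensionType

# Hypothetical records; real IDs come from a registered dimension.
geo = DimensionSimpleModel(
    dimension_type=DimensionType.GEOGRAPHY,
    record_ids=["06037", "06073"],
)

DimensionsSimpleModel(base_dimensions=[geo])  # valid

# Duplicate base dimension types are rejected by check_base_dimensions.
try:
    DimensionsSimpleModel(base_dimensions=[geo, geo])
except ValueError as exc:  # pydantic's ValidationError subclasses ValueError
    print(exc)  # ...base_dimensions cannot contain duplicate dimension types
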
dsgrid/config/supplemental_dimension.py
@@ -0,0 +1,29 @@
+"""Defines a supplemental dimension."""
+
+from typing import Annotated
+from pydantic import Field
+
+
+from dsgrid.data_models import DSGBaseModel
+from .dimensions import DimensionModel
+from .mapping_tables import MappingTableByNameModel
+
+
+class SupplementalDimensionModel(DimensionModel):
+    """Defines a supplemental dimension."""
+
+    mapping: MappingTableByNameModel = Field(
+        description="Defines how the supplemental dimension will be mapped to the project's base "
+        "dimension.",
+        title="mapping",
+    )
+
+
+class SupplementalDimensionsListModel(DSGBaseModel):
+    """Defines a list of supplemental dimensions."""
+
+    supplemental_dimensions: Annotated[
+        list[SupplementalDimensionModel], Field(min_length=1)
+    ] = Field(
+        description="List of supplemental dimensions and mappings to be registered",
+    )
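
One consequence of the Annotated[..., Field(min_length=1)] declaration worth noting: an empty supplemental-dimensions list fails validation outright. A minimal sketch:

from dsgrid.config.supplemental_dimension import SupplementalDimensionsListModel

# Rejected: Field(min_length=1) requires at least one entry.
try:
    SupplementalDimensionsListModel(supplemental_dimensions=[])
except ValueError as exc:  # pydantic's ValidationError subclasses ValueError
    print(exc)
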
dsgrid/config/time_dimension_base_config.py
@@ -0,0 +1,192 @@
+import abc
+import logging
+from datetime import tzinfo
+from typing import Any
+import pandas as pd
+
+import chronify
+
+from .dimension_config import DimensionBaseConfigWithoutFiles
+from dsgrid.dimension.time import (
+    TimeIntervalType,
+    TimeBasedDataAdjustmentModel,
+)
+from dsgrid.dimension.time_utils import (
+    build_time_ranges,
+)
+from dsgrid.config.dimensions import TimeRangeModel
+
+from dsgrid.spark.types import (
+    DataFrame,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class TimeDimensionBaseConfig(DimensionBaseConfigWithoutFiles, abc.ABC):
+    """Base class for all time dimension configs"""
+
+    def supports_chronify(self) -> bool:
+        """Return True if the config can be converted to chronify."""
+        return False
+
+    # @abc.abstractmethod
+    def to_chronify(self) -> chronify.TimeBaseModel:
+        """Return the chronify version of the time model."""
+        # This is likely temporary until we can use chronify models directly.
+        msg = f"{type(self)}.to_chronify"
+        raise NotImplementedError(msg)
+
+    def check_dataset_time_consistency(self, load_data_df, time_columns: list[str]) -> None:
+        """Check consistency of the load data with the time dimension.
+
+        Parameters
+        ----------
+        load_data_df : pyspark.sql.DataFrame
+        time_columns : list[str]
+
+        Raises
+        ------
+        DSGInvalidDataset
+            Raised if the dataset is inconsistent with the time dimension.
+        """
+        msg = f"{type(self)}.check_dataset_time_consistency is not implemented"
+        raise NotImplementedError(msg)
+
+    def build_time_dataframe(self) -> DataFrame:
+        """Build time dimension as specified in config in a spark dataframe.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+        """
+        msg = f"{self.__class__.__name__}.build_time_dataframe is not implemented"
+        raise NotImplementedError(msg)
+
+    @abc.abstractmethod
+    def get_load_data_time_columns(self) -> list[str]:
+        """Return the required timestamp columns in the load data table.
+
+        Returns
+        -------
+        list
+        """
+
+    def list_load_data_columns_for_query_name(self) -> list[str]:
+        """Return the time columns expected in the load data table for this dimension's query name.
+
+        Returns
+        -------
+        list[str]
+        """
+        # This may need to be re-implemented by child classes.
+        return [self.model.name]
+
+    def map_timestamp_load_data_columns_for_query_name(self, df) -> DataFrame:
+        """Map the timestamp columns in the load data table to those specified by the query name.
+
+        Parameters
+        ----------
+        df : pyspark.sql.DataFrame
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+        """
+        time_cols = self.get_load_data_time_columns()
+        if len(time_cols) > 1:
+            msg = (
+                "Handling of multiple time columns needs to be implemented in the child class: "
+                f"{type(self)}: {time_cols=}"
+            )
+            raise NotImplementedError(msg)
+
+        time_col = time_cols[0]
+        if time_col not in df.columns:
+            return df
+        return df.withColumnRenamed(time_col, self.model.name)
+
+    def get_time_ranges(self) -> list[Any]:
+        """Return time ranges with time_zone applied.
+
+        Returns
+        -------
+        list
+            list of DatetimeRange
+        """
+        msg = f"{type(self)}.get_time_ranges is not implemented"
+        raise NotImplementedError(msg)
+
+    @abc.abstractmethod
+    def get_start_times(self) -> list[Any]:
+        """Return the list of starting timestamps (with tzinfo) for this dimension.
+        One per time range.
+
+        Returns
+        -------
+        list[Any]
+        """
+
+    @abc.abstractmethod
+    def get_lengths(self) -> list[int]:
+        """Return the list of time range lengths (number of time steps) for this dimension.
+        One per time range.
+
+        Returns
+        -------
+        list[int]
+        """
+
+    @abc.abstractmethod
+    def get_time_zone(self) -> str | None:
+        """Return the time zone for this dimension."""
+
+    def get_time_zones(self) -> list[str]:
+        """Return a list of time zones for this dimension."""
+        if self.get_time_zone():
+            return [self.get_time_zone()]
+        return []
+
+    @abc.abstractmethod
+    def get_tzinfo(self) -> tzinfo | None:
+        """Return a tzinfo instance for this dimension.
+
+        Returns
+        -------
+        tzinfo | None
+        """
+
+    @abc.abstractmethod
+    def get_time_interval_type(self) -> TimeIntervalType:
+        """Return the time interval type for this dimension.
+
+        Returns
+        -------
+        TimeIntervalType
+        """
+
+    def list_expected_dataset_timestamps(
+        self,
+        time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
+    ) -> list[tuple]:
+        """Return a list of the timestamps expected in the load_data table.
+        Parameters
+        ----------
+        time_based_data_adjustment : TimeBasedDataAdjustmentModel | None
+
+        Returns
+        -------
+        list
+            List of tuples of columns representing time in the load_data table.
+
+        """
+        msg = f"{type(self)}.list_expected_dataset_timestamps is not implemented"
+        raise NotImplementedError(msg)
+
+    def _build_time_ranges(
+        self,
+        time_ranges: list[TimeRangeModel],
+        tz: str | None = None,
+    ) -> list[tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]]:
+        return build_time_ranges(time_ranges, tz=tz)
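
To see which methods a concrete time dimension must supply, here is a skeleton subclass. It is illustrative only: the base config classes may impose further abstract members (e.g. model_class, as implemented by RepresentativePeriodTimeDimensionConfig above), and TimeIntervalType.PERIOD_BEGINNING is an assumed enum member.

from datetime import tzinfo
from typing import Any

from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
from dsgrid.dimension.time import TimeIntervalType


class SketchTimeDimensionConfig(TimeDimensionBaseConfig):
    """Illustrative skeleton; the package's real implementations live in modules
    such as date_time_dimension_config.py and noop_time_dimension_config.py."""

    def get_load_data_time_columns(self) -> list[str]:
        return ["timestamp"]  # hypothetical column name

    def get_start_times(self) -> list[Any]:
        return []  # one entry per time range

    def get_lengths(self) -> list[int]:
        return []  # one entry per time range

    def get_time_zone(self) -> str | None:
        return None

    def get_tzinfo(self) -> tzinfo | None:
        return None

    def get_time_interval_type(self) -> TimeIntervalType:
        return TimeIntervalType.PERIOD_BEGINNING  # assumed member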