dsgrid-toolkit 0.3.3 cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/dataset/dataset_schema_handler_base.py
@@ -0,0 +1,945 @@
import abc
import logging
import os
from pathlib import Path
from typing import Iterable, Self

import chronify
from sqlalchemy import Connection

import dsgrid
from dsgrid.chronify import create_store, create_in_memory_store
from dsgrid.config.annual_time_dimension_config import (
    AnnualTimeDimensionConfig,
    map_annual_time_to_date_time,
)
from dsgrid.config.dimension_config import (
    DimensionBaseConfig,
    DimensionBaseConfigWithFiles,
)
from dsgrid.config.noop_time_dimension_config import NoOpTimeDimensionConfig
from dsgrid.config.date_time_dimension_config import DateTimeDimensionConfig
from dsgrid.config.index_time_dimension_config import IndexTimeDimensionConfig
from dsgrid.config.project_config import ProjectConfig
from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
from dsgrid.dimension.time import TimeBasedDataAdjustmentModel
from dsgrid.dsgrid_rc import DsgridRuntimeConfig
from dsgrid.common import VALUE_COLUMN, BackendEngine
from dsgrid.config.dataset_config import (
    DatasetConfig,
    InputDatasetType,
    UserDataLayout,
)
from dsgrid.config.dimension_mapping_base import (
    DimensionMappingReferenceModel,
)
from dsgrid.config.simple_models import DimensionSimpleModel
from dsgrid.dataset.models import ValueFormat
from dsgrid.dataset.table_format_handler_factory import make_table_format_handler
from dsgrid.config.file_schema import read_data_file
from dsgrid.dimension.base_models import DatasetDimensionRequirements, DimensionType
from dsgrid.exceptions import DSGInvalidDataset, DSGInvalidDimensionMapping
from dsgrid.dimension.time import (
    DaylightSavingAdjustmentModel,
)
from dsgrid.dataset.dataset_mapping_manager import DatasetMappingManager
from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan, MapOperation
from dsgrid.query.query_context import QueryContext
from dsgrid.query.models import ColumnType
from dsgrid.spark.functions import (
    cache,
    except_all,
    is_dataframe_empty,
    join,
    make_temp_view_name,
    unpersist,
)
from dsgrid.registry.data_store_interface import DataStoreInterface
from dsgrid.spark.types import DataFrame, F, use_duckdb
from dsgrid.units.convert import convert_units_unpivoted
from dsgrid.utils.dataset import (
    check_historical_annual_time_model_year_consistency,
    filter_out_expected_missing_associations,
    handle_dimension_association_errors,
    is_noop_mapping,
    map_stacked_dimension,
    add_time_zone,
    map_time_dimension_with_chronify_duckdb,
    map_time_dimension_with_chronify_spark_hive,
    map_time_dimension_with_chronify_spark_path,
    ordered_subset_columns,
    repartition_if_needed_by_mapping,
)

from dsgrid.utils.scratch_dir_context import ScratchDirContext
from dsgrid.utils.spark import (
    check_for_nulls,
    create_dataframe_from_product,
    get_unique_values,
    persist_table,
    read_dataframe,
    save_to_warehouse,
    write_dataframe,
)
from dsgrid.utils.timing import timer_stats_collector, track_timing
from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
from dsgrid.registry.dimension_mapping_registry_manager import (
    DimensionMappingRegistryManager,
)

logger = logging.getLogger(__name__)

class DatasetSchemaHandlerBase(abc.ABC):
    """define interface/required behaviors per dataset schema"""

    def __init__(
        self,
        config: DatasetConfig,
        conn: Connection | None,
        dimension_mgr: DimensionRegistryManager,
        dimension_mapping_mgr: DimensionMappingRegistryManager,
        mapping_references: list[DimensionMappingReferenceModel] | None = None,
    ):
        self._conn = conn
        self._config = config
        self._dimension_mgr = dimension_mgr
        self._dimension_mapping_mgr = dimension_mapping_mgr
        self._mapping_references: list[DimensionMappingReferenceModel] = mapping_references or []

    @classmethod
    @abc.abstractmethod
    def load(cls, config: DatasetConfig, *args, store: DataStoreInterface | None = None) -> Self:
        """Create a dataset schema handler by loading the data tables from files.

        Parameters
        ----------
        config: DatasetConfig
        store: DataStoreInterface | None
            If provided, the dataset must already be registered.
            If not provided, the dataset must not be registered and the file path must be
            available via the DatasetConfig.

        Returns
        -------
        DatasetSchemaHandlerBase
        """

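    # Illustrative call patterns for load() (editor's sketch, not part of the dsgrid
    # source; "ConcreteHandler" and "data_store" are hypothetical placeholders for a
    # concrete subclass and a DataStoreInterface instance):
    #
    #   handler = ConcreteHandler.load(config, store=data_store)  # already-registered dataset
    #   handler = ConcreteHandler.load(config)  # unregistered; data file path comes from DatasetConfig
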
    @abc.abstractmethod
    def check_consistency(
        self,
        missing_dimension_associations: dict[str, DataFrame],
        scratch_dir_context: ScratchDirContext,
        requirements: DatasetDimensionRequirements,
    ) -> None:
        """
        Check all data consistencies, including data columns, dataset to dimension records, and time
        """

    @abc.abstractmethod
    def check_time_consistency(self):
        """Check the time consistency of the dataset."""

    @abc.abstractmethod
    def get_base_load_data_table(self) -> DataFrame:
        """Return the base load data table, which must include time."""

    @abc.abstractmethod
    def _get_load_data_table(self) -> DataFrame:
        """Return the full load data table."""

    def _make_actual_dimension_association_table_from_data(self) -> DataFrame:
        return self._remove_non_dimension_columns(self._get_load_data_table()).distinct()

    def _make_expected_dimension_association_table_from_records(
        self, dimension_types: Iterable[DimensionType], context: ScratchDirContext
    ) -> DataFrame:
        """Return a dataframe containing one row for each unique dimension combination except time.
        Use dimensions in the dataset's dimension records.
        """
        data: dict[str, list[str]] = {}
        for dim_type in dimension_types:
            dim = self._config.get_dimension_with_records(dim_type)
            if dim is not None:
                data[dim_type.value] = list(dim.get_unique_ids())

        if not data:
            msg = "Bug: did not find any dimension records"
            raise Exception(msg)
        return create_dataframe_from_product(data, context)

    @track_timing(timer_stats_collector)
    def _check_dimension_associations(
        self,
        missing_dimension_associations: dict[str, DataFrame],
        context: ScratchDirContext,
        requirements: DatasetDimensionRequirements,
    ) -> None:
        """Check that a cross-join of dimension records is present, unless explicitly excepted."""

        if not requirements.check_dimension_associations:
            logger.info(
                "Skip checks of dataset dimension associations for %s",
                self._config.model.dataset_id,
            )
            return

        logger.info("Check dimension associations")
        assoc_by_records = self._make_expected_dimension_association_table_from_records(
            [x for x in DimensionType if x != DimensionType.TIME], context
        )
        assoc_by_data = self._make_actual_dimension_association_table_from_data()
        # This first check is redundant with the checks below. But, it is significantly
        # easier for users to debug.
        for column in assoc_by_records.columns:
            expected = get_unique_values(assoc_by_records, column)
            actual = get_unique_values(assoc_by_data, column)
            if actual != expected:
                missing = sorted(expected.difference(actual))
                extra = sorted(actual.difference(expected))
                num_matching = len(actual.intersection(expected))
                msg = (
                    f"Dataset records for dimension type {column} do not match expected "
                    f"values. {missing=} {extra=} {num_matching=}"
                )
                raise DSGInvalidDataset(msg)

        required_assoc = assoc_by_records
        if missing_dimension_associations:
            for missing_df in missing_dimension_associations.values():
                required_assoc = filter_out_expected_missing_associations(
                    required_assoc, missing_df
                )

        cols = sorted(required_assoc.columns)
        diff = except_all(required_assoc.select(*cols), assoc_by_data.select(*cols))
        cache(diff)
        try:
            if not is_dataframe_empty(diff):
                handle_dimension_association_errors(diff, assoc_by_data, self.dataset_id)
            logger.info("Successfully checked dataset dimension associations")
        finally:
            unpersist(diff)

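    # Illustrative example of the association check above (editor's sketch, not part of
    # the dsgrid source). With geography records {"CO", "WA"} and sector records
    # {"com", "res"}, the expected table is their cross join of four rows. If the load
    # data only contains ("CO", "com"), ("CO", "res"), and ("WA", "com"), the
    # except_all diff yields ("WA", "res") and DSGInvalidDataset is raised, unless that
    # combination appears in missing_dimension_associations.
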
    def make_mapped_dimension_association_table(self, context: ScratchDirContext) -> DataFrame:
        """Return a dataframe containing one row for each unique dimension combination except time.
        Use mapped dimensions.
        """
        assoc_df = self._make_actual_dimension_association_table_from_data()
        mapping_plan = self.build_default_dataset_mapping_plan()
        with DatasetMappingManager(self.dataset_id, mapping_plan, context) as mapping_manager:
            df = (
                self._remap_dimension_columns(assoc_df, mapping_manager)
                .drop("fraction")
                .distinct()
            )
        check_for_nulls(df)
        return df

    def remove_expected_missing_mapped_associations(
        self, store: DataStoreInterface, df: DataFrame, context: ScratchDirContext
    ) -> DataFrame:
        """Remove expected missing associations from the full join of expected associations."""
        missing_associations = store.read_missing_associations_tables(
            self._config.model.dataset_id, self._config.model.version
        )
        if not missing_associations:
            return df

        final_df = df
        mapping_plan = self.build_default_dataset_mapping_plan()
        with DatasetMappingManager(self.dataset_id, mapping_plan, context) as mapping_manager:
            for missing_df in missing_associations.values():
                mapped_df = (
                    self._remap_dimension_columns(missing_df, mapping_manager)
                    .drop("fraction")
                    .distinct()
                )
                final_df = filter_out_expected_missing_associations(final_df, mapped_df)
        return final_df

    @abc.abstractmethod
    def filter_data(self, dimensions: list[DimensionSimpleModel], store: DataStoreInterface):
        """Filter the load data by dimensions and rewrite the files.

        dimensions : list[DimensionSimpleModel]
        store : DataStoreInterface
            The data store to use for reading and writing the data.
        """

    @property
    def connection(self) -> Connection | None:
        """Return the active sqlalchemy connection to the registry database."""
        return self._conn

    @property
    def dataset_id(self):
        return self._config.config_id

    @property
    def config(self):
        """Returns the DatasetConfig.

        Returns
        -------
        DatasetConfig

        """
        return self._config

    @abc.abstractmethod
    def make_project_dataframe(self, context, project_config) -> DataFrame:
        """Return a load_data dataframe with dimensions mapped to the project's dimensions,
        with filters applied as specified by the QueryContext.

        Parameters
        ----------
        context : QueryContext
        project_config : ProjectConfig

        Returns
        -------
        pyspark.sql.DataFrame

        """

    @abc.abstractmethod
    def make_mapped_dataframe(
        self,
        context: QueryContext,
        time_dimension: TimeDimensionBaseConfig | None = None,
    ) -> DataFrame:
        """Return a load_data dataframe with dimensions mapped as stored in the handler.

        Parameters
        ----------
        context
        time_dimension
            Required if the time dimension is being mapped.
            This should be the destination time dimension.

        """

    @track_timing(timer_stats_collector)
    def _check_dataset_time_consistency(self, load_data_df: DataFrame):
        """Check dataset time consistency such that:
        1. time range(s) match time config record;
        2. all dimension combinations return the same set of time range(s).

        Callers must ensure that the dataset has a time dimension.
        """
        if os.environ.get("__DSGRID_SKIP_CHECK_DATASET_TIME_CONSISTENCY__"):
            logger.warning("Skip dataset time consistency checks.")
            return

        logger.info("Check dataset time consistency.")
        time_dim = self._config.get_time_dimension()
        assert time_dim is not None, "time cannot be checked if the dataset has no time dimension"
        time_cols = self._get_time_dimension_columns()
        time_dim.check_dataset_time_consistency(load_data_df, time_cols)
        if not isinstance(time_dim, NoOpTimeDimensionConfig):
            self._check_dataset_time_consistency_by_time_array(time_cols, load_data_df)
        self._check_model_year_time_consistency(load_data_df)

    @track_timing(timer_stats_collector)
    def _check_dataset_time_consistency_with_chronify(self):
        """Check dataset time consistency such that:
        1. time range(s) match time config record;
        2. all dimension combinations return the same set of time range(s).

        Callers must ensure that the dataset has a time dimension.
        """
        if os.environ.get("__DSGRID_SKIP_CHECK_DATASET_TIME_CONSISTENCY__"):
            logger.warning("Skip dataset time consistency checks.")
            return

        logger.info("Check dataset time consistency.")
        assert isinstance(self._config.model.data_layout, UserDataLayout)
        file_schema = self._config.model.data_layout.data_file
        scratch_dir = DsgridRuntimeConfig.load().get_scratch_dir()
        with ScratchDirContext(scratch_dir) as context:
            load_data_df = read_data_file(file_schema, scratch_dir_context=context)
            chronify_schema = self._get_chronify_schema(load_data_df)
            assert file_schema.path is not None
            data_file_path = Path(file_schema.path)
            if data_file_path.suffix == ".parquet" or not use_duckdb():
                if data_file_path.suffix == ".csv":
                    # This is a workaround for time zone issues between Spark, Pandas,
                    # and Chronify when reading CSV files.
                    # Chronify can ingest them correctly when we go to Parquet first.
                    # This is really only a test issue because normal dsgrid users will not
                    # use Spark with CSV data files.
                    src_path = context.get_temp_filename(suffix=".parquet")
                    write_dataframe(load_data_df, src_path)
                else:
                    src_path = data_file_path
                store_file = context.get_temp_filename(suffix=".db")
                with create_store(store_file) as store:
                    # This performs all of the checks.
                    store.create_view_from_parquet(src_path, chronify_schema)
                    store.drop_view(chronify_schema.name)
            else:
                # For CSV and JSON files, use in-memory store with ingest_table.
                # This avoids the complexity of converting to parquet.
                with create_in_memory_store() as store:
                    # ingest_table performs all of the time checks.
                    store.ingest_table(load_data_df.toPandas(), chronify_schema)
                    store.drop_table(chronify_schema.name)

        self._check_model_year_time_consistency(load_data_df)

    def _get_chronify_schema(self, df: DataFrame):
        time_dim = self._config.get_dimension(DimensionType.TIME)
        time_cols = time_dim.get_load_data_time_columns()
        time_array_id_columns = [
            x
            for x in df.columns
            # If there are multiple weather years:
            # - that are continuous, weather year needs to be excluded (one overall range).
            # - that are not continuous, weather year needs to be included and chronify
            # needs additional support. TODO: issue #340
            if x != DimensionType.WEATHER_YEAR.value
            and x
            in set(df.columns).difference(time_cols).difference(self._config.get_value_columns())
        ]
        if self._config.get_value_format() == ValueFormat.PIVOTED:
            # We can ignore all pivoted columns but one for time checking.
            # Looking at the rest would be redundant.
            value_column = next(iter(self._config.get_pivoted_dimension_columns()))
        else:
            value_column = VALUE_COLUMN
        return chronify.TableSchema(
            name=make_temp_view_name(),
            time_config=time_dim.to_chronify(),
            time_array_id_columns=time_array_id_columns,
            value_column=value_column,
        )

    def _check_model_year_time_consistency(self, df: DataFrame):
        time_dim = self._config.get_dimension(DimensionType.TIME)
        if self._config.model.dataset_type == InputDatasetType.HISTORICAL and isinstance(
            time_dim, AnnualTimeDimensionConfig
        ):
            annual_cols = time_dim.get_load_data_time_columns()
            assert len(annual_cols) == 1
            annual_col = annual_cols[0]
            check_historical_annual_time_model_year_consistency(
                df, annual_col, DimensionType.MODEL_YEAR.value
            )

    @track_timing(timer_stats_collector)
    def _check_dataset_time_consistency_by_time_array(self, time_cols, load_data_df):
        """Check that each unique time array has the same timestamps."""
        logger.info("Check dataset time consistency by time array.")
        unique_array_cols = set(DimensionType.get_allowed_dimension_column_names()).intersection(
            load_data_df.columns
        )
        counts = load_data_df.groupBy(*time_cols).count().select("count")
        distinct_counts = counts.select("count").distinct().collect()
        if len(distinct_counts) != 1:
            msg = (
                "All time arrays must be repeated the same number of times: "
                f"unique timestamp repeats = {len(distinct_counts)}"
            )
            raise DSGInvalidDataset(msg)
        ta_counts = load_data_df.groupBy(*unique_array_cols).count().select("count")
        distinct_ta_counts = ta_counts.select("count").distinct().collect()
        if len(distinct_ta_counts) != 1:
            msg = (
                "All combinations of non-time dimensions must have the same time array length: "
                f"unique time array lengths = {len(distinct_ta_counts)}"
            )
            raise DSGInvalidDataset(msg)

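    # Illustrative example of the per-time-array check above (editor's sketch, not part
    # of the dsgrid source). For an hourly dataset with geographies "CO" and "WA", both
    # time arrays must contain the identical 8760 timestamps; if "WA" had only 8759
    # rows, the group counts would differ and DSGInvalidDataset would be raised.
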
    def _check_load_data_unpivoted_value_column(self, df):
        logger.info("Check load data unpivoted columns.")
        if VALUE_COLUMN not in df.columns:
            msg = f"value_column={VALUE_COLUMN} is not in columns={df.columns}"
            raise DSGInvalidDataset(msg)

    def _convert_units(
        self,
        df: DataFrame,
        project_metric_records: DataFrame,
        mapping_manager: DatasetMappingManager,
    ):
        if not self._config.model.enable_unit_conversion:
            return df

        op = mapping_manager.plan.convert_units_op
        if mapping_manager.has_completed_operation(op):
            return df

        # Note that a dataset could have the same dimension record IDs as the project,
        # no mappings, but then still have different units.
        mapping_records = None
        for ref in self._mapping_references:
            dim_type = ref.from_dimension_type
            if dim_type == DimensionType.METRIC:
                mapping_records = self._dimension_mapping_mgr.get_by_id(
                    ref.mapping_id, version=ref.version, conn=self.connection
                ).get_records_dataframe()
                break

        dataset_dim = self._config.get_dimension_with_records(DimensionType.METRIC)
        dataset_records = dataset_dim.get_records_dataframe()
        df = convert_units_unpivoted(
            df,
            DimensionType.METRIC.value,
            dataset_records,
            mapping_records,
            project_metric_records,
        )
        if op.persist:
            df = mapping_manager.persist_table(df, op)
        return df

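    # Illustrative intent of _convert_units (editor's sketch, not part of the dsgrid
    # source): if a dataset metric record is stored in kWh while the project's metric
    # record expects MWh, convert_units_unpivoted rescales the value column
    # (kWh -> MWh divides by 1000), even when the record IDs themselves need no mapping.
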
    def _finalize_table(self, context: QueryContext, df: DataFrame, project_config: ProjectConfig):
        # TODO: remove ProjectConfig so that dataset queries can use this.
        # Issue #370
        table_handler = make_table_format_handler(
            self._config.get_value_format(),
            project_config,
            dataset_id=self.dataset_id,
        )

        time_dim = project_config.get_base_dimension(DimensionType.TIME)
        context.set_dataset_metadata(
            self.dataset_id,
            context.model.result.column_type,
            project_config.get_load_data_time_columns(time_dim.model.name),
        )

        if context.model.result.column_type == ColumnType.DIMENSION_NAMES:
            df = table_handler.convert_columns_to_query_names(
                df, self._config.model.dataset_id, context
            )

        return df

    @staticmethod
    def _get_pivoted_column_name(
        context: QueryContext, pivoted_dimension_type: DimensionType, project_config
    ):
        match context.model.result.column_type:
            case ColumnType.DIMENSION_NAMES:
                pivoted_column_name = project_config.get_base_dimension(
                    pivoted_dimension_type
                ).model.name
            case ColumnType.DIMENSION_TYPES:
                pivoted_column_name = pivoted_dimension_type.value
            case _:
                msg = str(context.model.result.column_type)
                raise NotImplementedError(msg)

        return pivoted_column_name

    def _get_dataset_to_project_mapping_records(self, dimension_type: DimensionType):
        config = self._get_dataset_to_project_mapping_config(dimension_type)
        if config is None:
            return config
        return config.get_records_dataframe()

    def _get_dataset_to_project_mapping_config(self, dimension_type: DimensionType):
        ref = self._get_dataset_to_project_mapping_reference(dimension_type)
        if ref is None:
            return ref
        return self._dimension_mapping_mgr.get_by_id(
            ref.mapping_id, version=ref.version, conn=self.connection
        )

    def _get_dataset_to_project_mapping_reference(self, dimension_type: DimensionType):
        for ref in self._mapping_references:
            if ref.from_dimension_type == dimension_type:
                return ref
        return

    def _get_mapping_to_dimension(
        self, dimension_type: DimensionType
    ) -> DimensionBaseConfig | None:
        ref = self._get_dataset_to_project_mapping_reference(dimension_type)
        if ref is None:
            return None
        config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=self._conn)
        return self._dimension_mgr.get_by_id(
            config.model.to_dimension.dimension_id, conn=self._conn
        )

    def _get_project_metric_records(self, project_config: ProjectConfig) -> DataFrame:
        metric_dim_query_name = getattr(
            project_config.get_dataset_base_dimension_names(self._config.model.dataset_id),
            DimensionType.METRIC.value,
        )
        if metric_dim_query_name is None:
            # This is a workaround for dsgrid projects created before the field
            # base_dimension_names was added to InputDatasetModel.
            metric_dims = project_config.list_base_dimensions(dimension_type=DimensionType.METRIC)
            if len(metric_dims) > 1:
                msg = (
                    "The dataset's base_dimension_names value is not set and "
                    "there are multiple metric dimensions in the project. Please re-register the "
                    f"dataset with dataset_id={self._config.model.dataset_id}."
                )
                raise DSGInvalidDataset(msg)
            metric_dim_query_name = metric_dims[0].model.name
        return project_config.get_dimension_records(metric_dim_query_name)

    def _get_time_dimension_columns(self):
        time_dim = self._config.get_dimension(DimensionType.TIME)
        time_cols = time_dim.get_load_data_time_columns()
        return time_cols

    def _iter_dataset_record_ids(self, context: QueryContext):
        for dim_type, project_record_ids in context.get_record_ids().items():
            dataset_mapping = self._get_dataset_to_project_mapping_records(dim_type)
            if dataset_mapping is None:
                dataset_record_ids = project_record_ids
            else:
                dataset_record_ids = (
                    join(
                        dataset_mapping.withColumnRenamed("from_id", "dataset_record_id"),
                        project_record_ids,
                        "to_id",
                        "id",
                    )
                    .select("dataset_record_id")
                    .withColumnRenamed("dataset_record_id", "id")
                    .distinct()
                )
            yield dim_type, dataset_record_ids

    @staticmethod
    def _list_dimension_columns(df: DataFrame) -> list[str]:
        columns = DimensionType.get_allowed_dimension_column_names()
        return [x for x in df.columns if x in columns]

    def _list_dimension_types_in_load_data(self, df: DataFrame) -> list[DimensionType]:
        dims = [DimensionType(x) for x in DatasetSchemaHandlerBase._list_dimension_columns(df)]
        if self._config.get_value_format() == ValueFormat.PIVOTED:
            pivoted_type = self._config.get_pivoted_dimension_type()
            assert pivoted_type is not None
            dims.append(pivoted_type)
        return dims

    def _prefilter_pivoted_dimensions(self, context: QueryContext, df):
        for dim_type, dataset_record_ids in self._iter_dataset_record_ids(context):
            if dim_type == self._config.get_pivoted_dimension_type():
                # Drop columns that don't match requested project record IDs.
                cols_to_keep = {x.id for x in dataset_record_ids.collect()}
                cols_to_drop = set(self._config.get_pivoted_dimension_columns()).difference(
                    cols_to_keep
                )
                if cols_to_drop:
                    df = df.drop(*cols_to_drop)

        return df

    def _prefilter_stacked_dimensions(self, context: QueryContext, df):
        for dim_type, dataset_record_ids in self._iter_dataset_record_ids(context):
            # Drop rows that don't match requested project record IDs.
            tmp = dataset_record_ids.withColumnRenamed("id", "dataset_record_id")
            if dim_type.value not in df.columns:
                # This dimension is stored in another table (e.g., lookup or load_data)
                continue
            df = join(df, tmp, dim_type.value, "dataset_record_id").drop("dataset_record_id")

        return df

    def _prefilter_time_dimension(self, context: QueryContext, df):
        # TODO #196:
        return df

    def build_default_dataset_mapping_plan(self) -> DatasetMappingPlan:
        """Build a default mapping order of dimensions to a project."""
        mappings: list[MapOperation] = []
        for ref in self._mapping_references:
            config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=self.connection)
            dim = self._dimension_mgr.get_by_id(
                config.model.to_dimension.dimension_id, conn=self.connection
            )
            mappings.append(
                MapOperation(
                    name=dim.model.name,
                    mapping_reference=ref,
                )
            )

        return DatasetMappingPlan(dataset_id=self._config.model.dataset_id, mappings=mappings)

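    # Illustrative shape of the default plan (editor's sketch, not part of the dsgrid
    # source; "county" and "sector" are hypothetical project dimension names):
    #
    #   DatasetMappingPlan(
    #       dataset_id="my_dataset",
    #       mappings=[
    #           MapOperation(name="county", mapping_reference=<geography mapping ref>),
    #           MapOperation(name="sector", mapping_reference=<sector mapping ref>),
    #       ],
    #   )
    #
    # i.e., one MapOperation per dataset-to-project mapping reference, named after the
    # project dimension that the mapping targets.
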
    def check_dataset_mapping_plan(
        self, mapping_plan: DatasetMappingPlan, project_config: ProjectConfig
    ) -> None:
        """Check that a user-defined mapping plan is valid."""
        req_dimensions: dict[DimensionType, DimensionMappingReferenceModel] = {}
        actual_mapping_dims: dict[DimensionType, str] = {}

        for ref in self._mapping_references:
            assert ref.to_dimension_type not in req_dimensions
            req_dimensions[ref.to_dimension_type] = ref

        dataset_id = mapping_plan.dataset_id
        indexes_to_remove: list[int] = []
        for i, mapping in enumerate(mapping_plan.mappings):
            to_dim = project_config.get_dimension(mapping.name)
            if to_dim.model.dimension_type == DimensionType.TIME:
                msg = (
                    f"DatasetMappingPlan for {dataset_id=} is invalid because specification "
                    f"of the time dimension is not supported: {mapping.name}"
                )
                raise DSGInvalidDimensionMapping(msg)
            if to_dim.model.dimension_type in actual_mapping_dims:
                msg = (
                    f"DatasetMappingPlan for {dataset_id=} is invalid because it can only "
                    f"support mapping one dimension for a given dimension type. "
                    f"type={to_dim.model.dimension_type} "
                    f"first={actual_mapping_dims[to_dim.model.dimension_type]} "
                    f"second={mapping.name}"
                )
                raise DSGInvalidDimensionMapping(msg)

            from_dim = self._config.get_dimension(to_dim.model.dimension_type)
            supp_dim_names = {
                x.model.name
                for x in project_config.list_supplemental_dimensions(to_dim.model.dimension_type)
            }
            if mapping.name in supp_dim_names:
                # This could be useful if we wanted to use DatasetMappingPlan for mapping
                # a single dataset to a project's dimensions without being concerned about
                # aggregations. As it stands, we are only using this within our
                # project query process. We need much more handling to make that work.
                msg = (
                    f"DatasetMappingPlan for {dataset_id=} is invalid because it specifies "
                    f"a supplemental dimension: {mapping.name}"
                )
            elif to_dim.model.dimension_type not in req_dimensions:
                msg = (
                    f"DatasetMappingPlan for {dataset_id=} is invalid because there is no "
                    f"dataset-to-project-base mapping defined for {to_dim.model.label}"
                )
                raise DSGInvalidDimensionMapping(msg)

            ref = req_dimensions[to_dim.model.dimension_type]
            mapping_config = self._dimension_mapping_mgr.get_by_id(
                ref.mapping_id, version=ref.version, conn=self.connection
            )
            if (
                from_dim.model.dimension_id == mapping_config.model.from_dimension.dimension_id
                and to_dim.model.dimension_id == mapping_config.model.to_dimension.dimension_id
            ):
                mapping.mapping_reference = ref
                actual_mapping_dims[to_dim.model.dimension_type] = mapping.name

        for index in indexes_to_remove:
            mapping_plan.mappings.pop(index)

        if diff_dims := set(req_dimensions.keys()).difference(actual_mapping_dims.keys()):
            req = sorted((x.value for x in req_dimensions))
            act = sorted((x.value for x in actual_mapping_dims))
            diff = sorted((x.value for x in diff_dims))
            msg = (
                "If a mapping order is specified for a dataset, it must include all "
                "dimension types that require mappings to the project base dimension.\n"
                f"Required dimension types: {req}\nActual dimension types: {act}\n"
                f"Difference: {diff}"
            )
            raise DSGInvalidDimensionMapping(msg)

    def _remap_dimension_columns(
        self,
        df: DataFrame,
        mapping_manager: DatasetMappingManager,
        filtered_records: dict[DimensionType, DataFrame] | None = None,
    ) -> DataFrame:
        """Map the table's dimensions according to the plan.

        Parameters
        ----------
        df
            The dataframe to map.
        mapping_manager
            Manages checkpointing and order of the mapping operations.
        filtered_records
            If not None, use these records to filter the table.
        """
        completed_operations = mapping_manager.get_completed_mapping_operations()
        for dim_mapping in mapping_manager.plan.mappings:
            if dim_mapping.name in completed_operations:
                logger.info(
                    "Skip mapping operation %s because the result exists in a checkpointed file.",
                    dim_mapping.name,
                )
                continue
            assert dim_mapping.mapping_reference is not None
            ref = dim_mapping.mapping_reference
            dim_type = ref.from_dimension_type
            column = dim_type.value
            mapping_config = self._dimension_mapping_mgr.get_by_id(
                ref.mapping_id, version=ref.version, conn=self.connection
            )
            logger.info(
                "Mapping dimension type %s mapping_type=%s",
                dim_type,
                mapping_config.model.mapping_type,
            )
            records = mapping_config.get_records_dataframe()
            if filtered_records is not None and dim_type in filtered_records:
                records = join(records, filtered_records[dim_type], "to_id", "id").drop("id")

            if is_noop_mapping(records):
                logger.info("Skip no-op mapping %s.", ref.mapping_id)
                continue
            if column in df.columns:
                persisted_file: Path | None = None
                df = map_stacked_dimension(df, records, column)
                df, persisted_file = repartition_if_needed_by_mapping(
                    df,
                    mapping_config.model.mapping_type,
                    mapping_manager.scratch_dir_context,
                    repartition=dim_mapping.handle_data_skew,
                )
                if dim_mapping.persist and persisted_file is None:
                    mapping_manager.persist_table(df, dim_mapping)
                if persisted_file is not None:
                    mapping_manager.save_checkpoint(persisted_file, dim_mapping)

        return df

    def _apply_fraction(
        self,
        df,
        value_columns,
        mapping_manager: DatasetMappingManager,
        agg_func=None,
    ):
        op = mapping_manager.plan.apply_fraction_op
        if "fraction" not in df.columns:
            return df
        if mapping_manager.has_completed_operation(op):
            return df
        agg_func = agg_func or F.sum
        # Maintain column order.
        agg_ops = [
            agg_func(F.col(x) * F.col("fraction")).alias(x)
            for x in [y for y in df.columns if y in value_columns]
        ]
        gcols = set(df.columns) - value_columns - {"fraction"}
        df = df.groupBy(*ordered_subset_columns(df, gcols)).agg(*agg_ops)
        df = df.drop("fraction")
        if op.persist:
            df = mapping_manager.persist_table(df, op)
        return df

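    # Illustrative example of _apply_fraction (editor's sketch, not part of the dsgrid
    # source). After mapping, two rows that share the same group keys with
    # (value, fraction) pairs (10.0, 1.0) and (15.0, 1.0) collapse, via F.sum, into one
    # row with value = 1.0 * 10.0 + 1.0 * 15.0 = 25.0, and the "fraction" column is
    # dropped.
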
    @track_timing(timer_stats_collector)
    def _convert_time_dimension(
        self,
        load_data_df: DataFrame,
        to_time_dim: TimeDimensionBaseConfig,
        value_column: str,
        mapping_manager: DatasetMappingManager,
        wrap_time_allowed: bool,
        time_based_data_adjustment: TimeBasedDataAdjustmentModel,
        to_geo_dim: DimensionBaseConfigWithFiles | None = None,
    ):
        op = mapping_manager.plan.map_time_op
        if mapping_manager.has_completed_operation(op):
            return load_data_df
        self._validate_daylight_saving_adjustment(time_based_data_adjustment)
        time_dim = self._config.get_time_dimension()
        assert time_dim is not None
        if time_dim.model.is_time_zone_required_in_geography():
            if self._config.model.use_project_geography_time_zone:
                if to_geo_dim is None:
                    msg = "Bug: to_geo_dim must be provided if time zone is required in geography."
                    raise Exception(msg)
                logger.info("Add time zone from project geography dimension.")
                geography_dim = to_geo_dim
            else:
                logger.info("Add time zone from dataset geography dimension.")
                geography_dim = self._config.get_dimension(DimensionType.GEOGRAPHY)
            load_data_df = add_time_zone(load_data_df, geography_dim)

        if isinstance(time_dim, AnnualTimeDimensionConfig):
            if not isinstance(to_time_dim, DateTimeDimensionConfig):
                msg = f"Annual time can only be mapped to DateTime: {to_time_dim.model.time_type}"
                raise NotImplementedError(msg)

            return map_annual_time_to_date_time(
                load_data_df,
                time_dim,
                to_time_dim,
                {value_column},
            )

        config = dsgrid.runtime_config
        if not time_dim.supports_chronify():
            # annual time is returned above
            # no mapping for no-op
            assert isinstance(
                time_dim, NoOpTimeDimensionConfig
            ), "Only NoOp and AnnualTimeDimensionConfig do not currently support Chronify"
            return load_data_df
        match (config.backend_engine, config.use_hive_metastore):
            case (BackendEngine.SPARK, True):
                table_name = make_temp_view_name()
                load_data_df = map_time_dimension_with_chronify_spark_hive(
                    df=save_to_warehouse(load_data_df, table_name),
                    table_name=table_name,
                    value_column=value_column,
                    from_time_dim=time_dim,
                    to_time_dim=to_time_dim,
                    scratch_dir_context=mapping_manager.scratch_dir_context,
                    time_based_data_adjustment=time_based_data_adjustment,
                    wrap_time_allowed=wrap_time_allowed,
                )

            case (BackendEngine.SPARK, False):
                filename = persist_table(
                    load_data_df,
                    mapping_manager.scratch_dir_context,
                    tag="query before time mapping",
                )
                load_data_df = map_time_dimension_with_chronify_spark_path(
                    df=read_dataframe(filename),
                    filename=filename,
                    value_column=value_column,
                    from_time_dim=time_dim,
                    to_time_dim=to_time_dim,
                    scratch_dir_context=mapping_manager.scratch_dir_context,
                    time_based_data_adjustment=time_based_data_adjustment,
                    wrap_time_allowed=wrap_time_allowed,
                )
            case (BackendEngine.DUCKDB, _):
                load_data_df = map_time_dimension_with_chronify_duckdb(
                    df=load_data_df,
                    value_column=value_column,
                    from_time_dim=time_dim,
                    to_time_dim=to_time_dim,
                    scratch_dir_context=mapping_manager.scratch_dir_context,
                    time_based_data_adjustment=time_based_data_adjustment,
                    wrap_time_allowed=wrap_time_allowed,
                )

        if time_dim.model.is_time_zone_required_in_geography():
            load_data_df = load_data_df.drop("time_zone")

        if op.persist:
            load_data_df = mapping_manager.persist_table(load_data_df, op)
        return load_data_df

    def _validate_daylight_saving_adjustment(self, time_based_data_adjustment):
        if (
            time_based_data_adjustment.daylight_saving_adjustment
            == DaylightSavingAdjustmentModel()
        ):
            return
        time_dim = self._config.get_time_dimension()
        if not isinstance(time_dim, IndexTimeDimensionConfig):
            assert time_dim is not None
            msg = f"time_based_data_adjustment.daylight_saving_adjustment does not apply to {time_dim.model.time_type=} time type, it applies to INDEX time type only."
            logger.warning(msg)

    def _remove_non_dimension_columns(self, df: DataFrame) -> DataFrame:
        allowed_columns = self._list_dimension_columns(df)
        return df.select(*allowed_columns)