dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
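dsgrid/dataset/dataset_schema_handler_base.py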
@@ -0,0 +1,945 @@
1
+ import abc
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Iterable, Self
6
+
7
+ import chronify
8
+ from sqlalchemy import Connection
9
+
10
+ import dsgrid
11
+ from dsgrid.chronify import create_store, create_in_memory_store
12
+ from dsgrid.config.annual_time_dimension_config import (
13
+ AnnualTimeDimensionConfig,
14
+ map_annual_time_to_date_time,
15
+ )
16
+ from dsgrid.config.dimension_config import (
17
+ DimensionBaseConfig,
18
+ DimensionBaseConfigWithFiles,
19
+ )
20
+ from dsgrid.config.noop_time_dimension_config import NoOpTimeDimensionConfig
21
+ from dsgrid.config.date_time_dimension_config import DateTimeDimensionConfig
22
+ from dsgrid.config.index_time_dimension_config import IndexTimeDimensionConfig
23
+ from dsgrid.config.project_config import ProjectConfig
24
+ from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
25
+ from dsgrid.dimension.time import TimeBasedDataAdjustmentModel
26
+ from dsgrid.dsgrid_rc import DsgridRuntimeConfig
27
+ from dsgrid.common import VALUE_COLUMN, BackendEngine
28
+ from dsgrid.config.dataset_config import (
29
+ DatasetConfig,
30
+ InputDatasetType,
31
+ UserDataLayout,
32
+ )
33
+ from dsgrid.config.dimension_mapping_base import (
34
+ DimensionMappingReferenceModel,
35
+ )
36
+ from dsgrid.config.simple_models import DimensionSimpleModel
37
+ from dsgrid.dataset.models import ValueFormat
38
+ from dsgrid.dataset.table_format_handler_factory import make_table_format_handler
39
+ from dsgrid.config.file_schema import read_data_file
40
+ from dsgrid.dimension.base_models import DatasetDimensionRequirements, DimensionType
41
+ from dsgrid.exceptions import DSGInvalidDataset, DSGInvalidDimensionMapping
42
+ from dsgrid.dimension.time import (
43
+ DaylightSavingAdjustmentModel,
44
+ )
45
+ from dsgrid.dataset.dataset_mapping_manager import DatasetMappingManager
46
+ from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan, MapOperation
47
+ from dsgrid.query.query_context import QueryContext
48
+ from dsgrid.query.models import ColumnType
49
+ from dsgrid.spark.functions import (
50
+ cache,
51
+ except_all,
52
+ is_dataframe_empty,
53
+ join,
54
+ make_temp_view_name,
55
+ unpersist,
56
+ )
57
+ from dsgrid.registry.data_store_interface import DataStoreInterface
58
+ from dsgrid.spark.types import DataFrame, F, use_duckdb
59
+ from dsgrid.units.convert import convert_units_unpivoted
60
+ from dsgrid.utils.dataset import (
61
+ check_historical_annual_time_model_year_consistency,
62
+ filter_out_expected_missing_associations,
63
+ handle_dimension_association_errors,
64
+ is_noop_mapping,
65
+ map_stacked_dimension,
66
+ add_time_zone,
67
+ map_time_dimension_with_chronify_duckdb,
68
+ map_time_dimension_with_chronify_spark_hive,
69
+ map_time_dimension_with_chronify_spark_path,
70
+ ordered_subset_columns,
71
+ repartition_if_needed_by_mapping,
72
+ )
73
+
74
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
75
+ from dsgrid.utils.spark import (
76
+ check_for_nulls,
77
+ create_dataframe_from_product,
78
+ get_unique_values,
79
+ persist_table,
80
+ read_dataframe,
81
+ save_to_warehouse,
82
+ write_dataframe,
83
+ )
84
+ from dsgrid.utils.timing import timer_stats_collector, track_timing
85
+ from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
86
+ from dsgrid.registry.dimension_mapping_registry_manager import (
87
+ DimensionMappingRegistryManager,
88
+ )
89
+
90
+ logger = logging.getLogger(__name__)
91
+
92
+
93
+ class DatasetSchemaHandlerBase(abc.ABC):
94
+ """define interface/required behaviors per dataset schema"""
95
+
96
+ def __init__(
97
+ self,
98
+ config: DatasetConfig,
99
+ conn: Connection | None,
100
+ dimension_mgr: DimensionRegistryManager,
101
+ dimension_mapping_mgr: DimensionMappingRegistryManager,
102
+ mapping_references: list[DimensionMappingReferenceModel] | None = None,
103
+ ):
104
+ self._conn = conn
105
+ self._config = config
106
+ self._dimension_mgr = dimension_mgr
107
+ self._dimension_mapping_mgr = dimension_mapping_mgr
108
+ self._mapping_references: list[DimensionMappingReferenceModel] = mapping_references or []
109
+
110
+ @classmethod
111
+ @abc.abstractmethod
112
+ def load(cls, config: DatasetConfig, *args, store: DataStoreInterface | None = None) -> Self:
113
+ """Create a dataset schema handler by loading the data tables from files.
114
+
115
+ Parameters
116
+ ----------
117
+ config: DatasetConfig
118
+ store: DataStoreInterface | None
119
+ If provided, the dataset must already be registered.
120
+ If not provided, the dataset must not be registered and the file path must be
121
+ available via the DatasetConfig.
122
+
123
+ Returns
124
+ -------
125
+ DatasetSchemaHandlerBase
126
+ """
127
+
128
+ @abc.abstractmethod
129
+ def check_consistency(
130
+ self,
131
+ missing_dimension_associations: dict[str, DataFrame],
132
+ scratch_dir_context: ScratchDirContext,
133
+ requirements: DatasetDimensionRequirements,
134
+ ) -> None:
135
+ """
136
+ Check all data consistency requirements, including data columns, dataset-to-dimension records, and time.
137
+ """
138
+
139
+ @abc.abstractmethod
140
+ def check_time_consistency(self):
141
+ """Check the time consistency of the dataset."""
142
+
143
+ @abc.abstractmethod
144
+ def get_base_load_data_table(self) -> DataFrame:
145
+ """Return the base load data table, which must include time."""
146
+
147
+ @abc.abstractmethod
148
+ def _get_load_data_table(self) -> DataFrame:
149
+ """Return the full load data table."""
150
+
151
+ def _make_actual_dimension_association_table_from_data(self) -> DataFrame:
152
+ return self._remove_non_dimension_columns(self._get_load_data_table()).distinct()
153
+
154
+ def _make_expected_dimension_association_table_from_records(
155
+ self, dimension_types: Iterable[DimensionType], context: ScratchDirContext
156
+ ) -> DataFrame:
157
+ """Return a dataframe containing one row for each unique dimension combination except time.
158
+ Use dimensions in the dataset's dimension records.
159
+ """
160
+ data: dict[str, list[str]] = {}
161
+ for dim_type in dimension_types:
162
+ dim = self._config.get_dimension_with_records(dim_type)
163
+ if dim is not None:
164
+ data[dim_type.value] = list(dim.get_unique_ids())
165
+
166
+ if not data:
167
+ msg = "Bug: did not find any dimension records"
168
+ raise Exception(msg)
169
+ return create_dataframe_from_product(data, context)
170
+
171
+ @track_timing(timer_stats_collector)
172
+ def _check_dimension_associations(
173
+ self,
174
+ missing_dimension_associations: dict[str, DataFrame],
175
+ context: ScratchDirContext,
176
+ requirements: DatasetDimensionRequirements,
177
+ ) -> None:
178
+ """Check that a cross-join of dimension records is present, unless explicitly excepted."""
179
+
180
+ if not requirements.check_dimension_associations:
181
+ logger.info(
182
+ "Skip checks of dataset dimension associations for %s",
183
+ self._config.model.dataset_id,
184
+ )
185
+ return
186
+
187
+ logger.info("Check dimension associations")
188
+ assoc_by_records = self._make_expected_dimension_association_table_from_records(
189
+ [x for x in DimensionType if x != DimensionType.TIME], context
190
+ )
191
+ assoc_by_data = self._make_actual_dimension_association_table_from_data()
192
+ # This first check is redundant with the checks below. But, it is significantly
193
+ # easier for users to debug.
194
+ for column in assoc_by_records.columns:
195
+ expected = get_unique_values(assoc_by_records, column)
196
+ actual = get_unique_values(assoc_by_data, column)
197
+ if actual != expected:
198
+ missing = sorted(expected.difference(actual))
199
+ extra = sorted(actual.difference(expected))
200
+ num_matching = len(actual.intersection(expected))
201
+ msg = (
202
+ f"Dataset records for dimension type {column} do not match expected "
203
+ f"values. {missing=} {extra=} {num_matching=}"
204
+ )
205
+ raise DSGInvalidDataset(msg)
206
+
207
+ required_assoc = assoc_by_records
208
+ if missing_dimension_associations:
209
+ for missing_df in missing_dimension_associations.values():
210
+ required_assoc = filter_out_expected_missing_associations(
211
+ required_assoc, missing_df
212
+ )
213
+
214
+ cols = sorted(required_assoc.columns)
215
+ diff = except_all(required_assoc.select(*cols), assoc_by_data.select(*cols))
216
+ cache(diff)
217
+ try:
218
+ if not is_dataframe_empty(diff):
219
+ handle_dimension_association_errors(diff, assoc_by_data, self.dataset_id)
220
+ logger.info("Successfully checked dataset dimension associations")
221
+ finally:
222
+ unpersist(diff)
223
+
224
+ def make_mapped_dimension_association_table(self, context: ScratchDirContext) -> DataFrame:
225
+ """Return a dataframe containing one row for each unique dimension combination except time.
226
+ Use mapped dimensions.
227
+ """
228
+ assoc_df = self._make_actual_dimension_association_table_from_data()
229
+ mapping_plan = self.build_default_dataset_mapping_plan()
230
+ with DatasetMappingManager(self.dataset_id, mapping_plan, context) as mapping_manager:
231
+ df = (
232
+ self._remap_dimension_columns(assoc_df, mapping_manager)
233
+ .drop("fraction")
234
+ .distinct()
235
+ )
236
+ check_for_nulls(df)
237
+ return df
238
+
239
+ def remove_expected_missing_mapped_associations(
240
+ self, store: DataStoreInterface, df: DataFrame, context: ScratchDirContext
241
+ ) -> DataFrame:
242
+ """Remove expected missing associations from the full join of expected associations."""
243
+ missing_associations = store.read_missing_associations_tables(
244
+ self._config.model.dataset_id, self._config.model.version
245
+ )
246
+ if not missing_associations:
247
+ return df
248
+
249
+ final_df = df
250
+ mapping_plan = self.build_default_dataset_mapping_plan()
251
+ with DatasetMappingManager(self.dataset_id, mapping_plan, context) as mapping_manager:
252
+ for missing_df in missing_associations.values():
253
+ mapped_df = (
254
+ self._remap_dimension_columns(missing_df, mapping_manager)
255
+ .drop("fraction")
256
+ .distinct()
257
+ )
258
+ final_df = filter_out_expected_missing_associations(final_df, mapped_df)
259
+ return final_df
260
+
261
+ @abc.abstractmethod
262
+ def filter_data(self, dimensions: list[DimensionSimpleModel], store: DataStoreInterface):
263
+ """Filter the load data by dimensions and rewrite the files.
264
+
265
+ dimensions : list[DimensionSimpleModel]
266
+ store : DataStoreInterface
267
+ The data store to use for reading and writing the data.
268
+ """
269
+
270
+ @property
271
+ def connection(self) -> Connection | None:
272
+ """Return the active sqlalchemy connection to the registry database."""
273
+ return self._conn
274
+
275
+ @property
276
+ def dataset_id(self):
277
+ return self._config.config_id
278
+
279
+ @property
280
+ def config(self):
281
+ """Returns the DatasetConfig.
282
+
283
+ Returns
284
+ -------
285
+ DatasetConfig
286
+
287
+ """
288
+ return self._config
289
+
290
+ @abc.abstractmethod
291
+ def make_project_dataframe(self, context, project_config) -> DataFrame:
292
+ """Return a load_data dataframe with dimensions mapped to the project's with filters
293
+ as specified by the QueryContext.
294
+
295
+ Parameters
296
+ ----------
297
+ context : QueryContext
298
+ project_config : ProjectConfig
299
+
300
+ Returns
301
+ -------
302
+ pyspark.sql.DataFrame
303
+
304
+ """
305
+
306
+ @abc.abstractmethod
307
+ def make_mapped_dataframe(
308
+ self,
309
+ context: QueryContext,
310
+ time_dimension: TimeDimensionBaseConfig | None = None,
311
+ ) -> DataFrame:
312
+ """Return a load_data dataframe with dimensions mapped as stored in the handler.
313
+
314
+ Parameters
315
+ ----------
316
+ context
317
+ time_dimension
318
+ Required if the time dimension is being mapped.
319
+ This should be the destination time dimension.
320
+
321
+ """
322
+
323
+ @track_timing(timer_stats_collector)
324
+ def _check_dataset_time_consistency(self, load_data_df: DataFrame):
325
+ """Check dataset time consistency such that:
326
+ 1. time range(s) match time config record;
327
+ 2. all dimension combinations return the same set of time range(s).
328
+
329
+ Callers must ensure that the dataset has a time dimension.
330
+ """
331
+ if os.environ.get("__DSGRID_SKIP_CHECK_DATASET_TIME_CONSISTENCY__"):
332
+ logger.warning("Skip dataset time consistency checks.")
333
+ return
334
+
335
+ logger.info("Check dataset time consistency.")
336
+ time_dim = self._config.get_time_dimension()
337
+ assert time_dim is not None, "time cannot be checked if the dataset has no time dimension"
338
+ time_cols = self._get_time_dimension_columns()
339
+ time_dim.check_dataset_time_consistency(load_data_df, time_cols)
340
+ if not isinstance(time_dim, NoOpTimeDimensionConfig):
341
+ self._check_dataset_time_consistency_by_time_array(time_cols, load_data_df)
342
+ self._check_model_year_time_consistency(load_data_df)
343
+
344
+ @track_timing(timer_stats_collector)
345
+ def _check_dataset_time_consistency_with_chronify(self):
346
+ """Check dataset time consistency such that:
347
+ 1. time range(s) match time config record;
348
+ 2. all dimension combinations return the same set of time range(s).
349
+
350
+ Callers must ensure that the dataset has a time dimension.
351
+ """
352
+ if os.environ.get("__DSGRID_SKIP_CHECK_DATASET_TIME_CONSISTENCY__"):
353
+ logger.warning("Skip dataset time consistency checks.")
354
+ return
355
+
356
+ logger.info("Check dataset time consistency.")
357
+ assert isinstance(self._config.model.data_layout, UserDataLayout)
358
+ file_schema = self._config.model.data_layout.data_file
359
+ scratch_dir = DsgridRuntimeConfig.load().get_scratch_dir()
360
+ with ScratchDirContext(scratch_dir) as context:
361
+ load_data_df = read_data_file(file_schema, scratch_dir_context=context)
362
+ chronify_schema = self._get_chronify_schema(load_data_df)
363
+ assert file_schema.path is not None
364
+ data_file_path = Path(file_schema.path)
365
+ if data_file_path.suffix == ".parquet" or not use_duckdb():
366
+ if data_file_path.suffix == ".csv":
367
+ # This is a workaround for time zone issues between Spark, Pandas,
368
+ # and Chronify when reading CSV files.
369
+ # Chronify can ingest them correctly when we go to Parquet first.
370
+ # This is really only a test issue because normal dsgrid users will not
371
+ # use Spark with CSV data files.
372
+ src_path = context.get_temp_filename(suffix=".parquet")
373
+ write_dataframe(load_data_df, src_path)
374
+ else:
375
+ src_path = data_file_path
376
+ store_file = context.get_temp_filename(suffix=".db")
377
+ with create_store(store_file) as store:
378
+ # This performs all of the checks.
379
+ store.create_view_from_parquet(src_path, chronify_schema)
380
+ store.drop_view(chronify_schema.name)
381
+ else:
382
+ # For CSV and JSON files, use in-memory store with ingest_table.
383
+ # This avoids the complexity of converting to parquet.
384
+ with create_in_memory_store() as store:
385
+ # ingest_table performs all of the time checks.
386
+ store.ingest_table(load_data_df.toPandas(), chronify_schema)
387
+ store.drop_table(chronify_schema.name)
388
+
389
+ self._check_model_year_time_consistency(load_data_df)
390
+
391
+ def _get_chronify_schema(self, df: DataFrame):
392
+ time_dim = self._config.get_dimension(DimensionType.TIME)
393
+ time_cols = time_dim.get_load_data_time_columns()
394
+ time_array_id_columns = [
395
+ x
396
+ for x in df.columns
397
+ # If there are multiple weather years:
398
+ # - that are continuous, weather year needs to be excluded (one overall range).
399
+ # - that are not continuous, weather year needs to be included and chronify
400
+ # needs additional support. TODO: issue #340
401
+ if x != DimensionType.WEATHER_YEAR.value
402
+ and x
403
+ in set(df.columns).difference(time_cols).difference(self._config.get_value_columns())
404
+ ]
405
+ if self._config.get_value_format() == ValueFormat.PIVOTED:
406
+ # We can ignore all pivoted columns but one for time checking.
407
+ # Looking at the rest would be redundant.
408
+ value_column = next(iter(self._config.get_pivoted_dimension_columns()))
409
+ else:
410
+ value_column = VALUE_COLUMN
411
+ return chronify.TableSchema(
412
+ name=make_temp_view_name(),
413
+ time_config=time_dim.to_chronify(),
414
+ time_array_id_columns=time_array_id_columns,
415
+ value_column=value_column,
416
+ )
417
+
418
+ def _check_model_year_time_consistency(self, df: DataFrame):
419
+ time_dim = self._config.get_dimension(DimensionType.TIME)
420
+ if self._config.model.dataset_type == InputDatasetType.HISTORICAL and isinstance(
421
+ time_dim, AnnualTimeDimensionConfig
422
+ ):
423
+ annual_cols = time_dim.get_load_data_time_columns()
424
+ assert len(annual_cols) == 1
425
+ annual_col = annual_cols[0]
426
+ check_historical_annual_time_model_year_consistency(
427
+ df, annual_col, DimensionType.MODEL_YEAR.value
428
+ )
429
+
430
+ @track_timing(timer_stats_collector)
431
+ def _check_dataset_time_consistency_by_time_array(self, time_cols, load_data_df):
432
+ """Check that each unique time array has the same timestamps."""
433
+ logger.info("Check dataset time consistency by time array.")
434
+ unique_array_cols = set(DimensionType.get_allowed_dimension_column_names()).intersection(
435
+ load_data_df.columns
436
+ )
437
+ counts = load_data_df.groupBy(*time_cols).count().select("count")
438
+ distinct_counts = counts.select("count").distinct().collect()
439
+ if len(distinct_counts) != 1:
440
+ msg = (
441
+ "All time arrays must be repeated the same number of times: "
442
+ f"unique timestamp repeats = {len(distinct_counts)}"
443
+ )
444
+ raise DSGInvalidDataset(msg)
445
+ ta_counts = load_data_df.groupBy(*unique_array_cols).count().select("count")
446
+ distinct_ta_counts = ta_counts.select("count").distinct().collect()
447
+ if len(distinct_ta_counts) != 1:
448
+ msg = (
449
+ "All combinations of non-time dimensions must have the same time array length: "
450
+ f"unique time array lengths = {len(distinct_ta_counts)}"
451
+ )
452
+ raise DSGInvalidDataset(msg)
453
+
454
+ def _check_load_data_unpivoted_value_column(self, df):
455
+ logger.info("Check load data unpivoted columns.")
456
+ if VALUE_COLUMN not in df.columns:
457
+ msg = f"value_column={VALUE_COLUMN} is not in columns={df.columns}"
458
+ raise DSGInvalidDataset(msg)
459
+
460
+ def _convert_units(
461
+ self,
462
+ df: DataFrame,
463
+ project_metric_records: DataFrame,
464
+ mapping_manager: DatasetMappingManager,
465
+ ):
466
+ if not self._config.model.enable_unit_conversion:
467
+ return df
468
+
469
+ op = mapping_manager.plan.convert_units_op
470
+ if mapping_manager.has_completed_operation(op):
471
+ return df
472
+
473
+ # Note that a dataset could have the same dimension record IDs as the project,
474
+ # no mappings, but then still have different units.
475
+ mapping_records = None
476
+ for ref in self._mapping_references:
477
+ dim_type = ref.from_dimension_type
478
+ if dim_type == DimensionType.METRIC:
479
+ mapping_records = self._dimension_mapping_mgr.get_by_id(
480
+ ref.mapping_id, version=ref.version, conn=self.connection
481
+ ).get_records_dataframe()
482
+ break
483
+
484
+ dataset_dim = self._config.get_dimension_with_records(DimensionType.METRIC)
485
+ dataset_records = dataset_dim.get_records_dataframe()
486
+ df = convert_units_unpivoted(
487
+ df,
488
+ DimensionType.METRIC.value,
489
+ dataset_records,
490
+ mapping_records,
491
+ project_metric_records,
492
+ )
493
+ if op.persist:
494
+ df = mapping_manager.persist_table(df, op)
495
+ return df
496
+
497
+ def _finalize_table(self, context: QueryContext, df: DataFrame, project_config: ProjectConfig):
498
+ # TODO: remove ProjectConfig so that dataset queries can use this.
499
+ # Issue #370
500
+ table_handler = make_table_format_handler(
501
+ self._config.get_value_format(),
502
+ project_config,
503
+ dataset_id=self.dataset_id,
504
+ )
505
+
506
+ time_dim = project_config.get_base_dimension(DimensionType.TIME)
507
+ context.set_dataset_metadata(
508
+ self.dataset_id,
509
+ context.model.result.column_type,
510
+ project_config.get_load_data_time_columns(time_dim.model.name),
511
+ )
512
+
513
+ if context.model.result.column_type == ColumnType.DIMENSION_NAMES:
514
+ df = table_handler.convert_columns_to_query_names(
515
+ df, self._config.model.dataset_id, context
516
+ )
517
+
518
+ return df
519
+
520
+ @staticmethod
521
+ def _get_pivoted_column_name(
522
+ context: QueryContext, pivoted_dimension_type: DimensionType, project_config
523
+ ):
524
+ match context.model.result.column_type:
525
+ case ColumnType.DIMENSION_NAMES:
526
+ pivoted_column_name = project_config.get_base_dimension(
527
+ pivoted_dimension_type
528
+ ).model.name
529
+ case ColumnType.DIMENSION_TYPES:
530
+ pivoted_column_name = pivoted_dimension_type.value
531
+ case _:
532
+ msg = str(context.model.result.column_type)
533
+ raise NotImplementedError(msg)
534
+
535
+ return pivoted_column_name
536
+
537
+ def _get_dataset_to_project_mapping_records(self, dimension_type: DimensionType):
538
+ config = self._get_dataset_to_project_mapping_config(dimension_type)
539
+ if config is None:
540
+ return config
541
+ return config.get_records_dataframe()
542
+
543
+ def _get_dataset_to_project_mapping_config(self, dimension_type: DimensionType):
544
+ ref = self._get_dataset_to_project_mapping_reference(dimension_type)
545
+ if ref is None:
546
+ return ref
547
+ return self._dimension_mapping_mgr.get_by_id(
548
+ ref.mapping_id, version=ref.version, conn=self.connection
549
+ )
550
+
551
+ def _get_dataset_to_project_mapping_reference(self, dimension_type: DimensionType):
552
+ for ref in self._mapping_references:
553
+ if ref.from_dimension_type == dimension_type:
554
+ return ref
555
+ return
556
+
557
+ def _get_mapping_to_dimension(
558
+ self, dimension_type: DimensionType
559
+ ) -> DimensionBaseConfig | None:
560
+ ref = self._get_dataset_to_project_mapping_reference(dimension_type)
561
+ if ref is None:
562
+ return None
563
+ config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=self._conn)
564
+ return self._dimension_mgr.get_by_id(
565
+ config.model.to_dimension.dimension_id, conn=self._conn
566
+ )
567
+
568
+ def _get_project_metric_records(self, project_config: ProjectConfig) -> DataFrame:
569
+ metric_dim_query_name = getattr(
570
+ project_config.get_dataset_base_dimension_names(self._config.model.dataset_id),
571
+ DimensionType.METRIC.value,
572
+ )
573
+ if metric_dim_query_name is None:
574
+ # This is a workaround for dsgrid projects created before the field
575
+ # base_dimension_names was added to InputDatasetModel.
576
+ metric_dims = project_config.list_base_dimensions(dimension_type=DimensionType.METRIC)
577
+ if len(metric_dims) > 1:
578
+ msg = (
579
+ "The dataset's base_dimension_names value is not set and "
580
+ "there are multiple metric dimensions in the project. Please re-register the "
581
+ f"dataset with dataset_id={self._config.model.dataset_id}."
582
+ )
583
+ raise DSGInvalidDataset(msg)
584
+ metric_dim_query_name = metric_dims[0].model.name
585
+ return project_config.get_dimension_records(metric_dim_query_name)
586
+
587
+ def _get_time_dimension_columns(self):
588
+ time_dim = self._config.get_dimension(DimensionType.TIME)
589
+ time_cols = time_dim.get_load_data_time_columns()
590
+ return time_cols
591
+
592
+ def _iter_dataset_record_ids(self, context: QueryContext):
593
+ for dim_type, project_record_ids in context.get_record_ids().items():
594
+ dataset_mapping = self._get_dataset_to_project_mapping_records(dim_type)
595
+ if dataset_mapping is None:
596
+ dataset_record_ids = project_record_ids
597
+ else:
598
+ dataset_record_ids = (
599
+ join(
600
+ dataset_mapping.withColumnRenamed("from_id", "dataset_record_id"),
601
+ project_record_ids,
602
+ "to_id",
603
+ "id",
604
+ )
605
+ .select("dataset_record_id")
606
+ .withColumnRenamed("dataset_record_id", "id")
607
+ .distinct()
608
+ )
609
+ yield dim_type, dataset_record_ids
610
+
611
+ @staticmethod
612
+ def _list_dimension_columns(df: DataFrame) -> list[str]:
613
+ columns = DimensionType.get_allowed_dimension_column_names()
614
+ return [x for x in df.columns if x in columns]
615
+
616
+ def _list_dimension_types_in_load_data(self, df: DataFrame) -> list[DimensionType]:
617
+ dims = [DimensionType(x) for x in DatasetSchemaHandlerBase._list_dimension_columns(df)]
618
+ if self._config.get_value_format() == ValueFormat.PIVOTED:
619
+ pivoted_type = self._config.get_pivoted_dimension_type()
620
+ assert pivoted_type is not None
621
+ dims.append(pivoted_type)
622
+ return dims
623
+
624
+ def _prefilter_pivoted_dimensions(self, context: QueryContext, df):
625
+ for dim_type, dataset_record_ids in self._iter_dataset_record_ids(context):
626
+ if dim_type == self._config.get_pivoted_dimension_type():
627
+ # Drop columns that don't match requested project record IDs.
628
+ cols_to_keep = {x.id for x in dataset_record_ids.collect()}
629
+ cols_to_drop = set(self._config.get_pivoted_dimension_columns()).difference(
630
+ cols_to_keep
631
+ )
632
+ if cols_to_drop:
633
+ df = df.drop(*cols_to_drop)
634
+
635
+ return df
636
+
637
+ def _prefilter_stacked_dimensions(self, context: QueryContext, df):
638
+ for dim_type, dataset_record_ids in self._iter_dataset_record_ids(context):
639
+ # Drop rows that don't match requested project record IDs.
640
+ tmp = dataset_record_ids.withColumnRenamed("id", "dataset_record_id")
641
+ if dim_type.value not in df.columns:
642
+ # This dimensions is stored in another table (e.g., lookup or load_data)
643
+ continue
644
+ df = join(df, tmp, dim_type.value, "dataset_record_id").drop("dataset_record_id")
645
+
646
+ return df
647
+
648
+ def _prefilter_time_dimension(self, context: QueryContext, df):
649
+ # TODO #196:
650
+ return df
651
+
652
+ def build_default_dataset_mapping_plan(self) -> DatasetMappingPlan:
653
+ """Build a default mapping order of dimensions to a project."""
654
+ mappings: list[MapOperation] = []
655
+ for ref in self._mapping_references:
656
+ config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=self.connection)
657
+ dim = self._dimension_mgr.get_by_id(
658
+ config.model.to_dimension.dimension_id, conn=self.connection
659
+ )
660
+ mappings.append(
661
+ MapOperation(
662
+ name=dim.model.name,
663
+ mapping_reference=ref,
664
+ )
665
+ )
666
+
667
+ return DatasetMappingPlan(dataset_id=self._config.model.dataset_id, mappings=mappings)
668
+
669
+ def check_dataset_mapping_plan(
670
+ self, mapping_plan: DatasetMappingPlan, project_config: ProjectConfig
671
+ ) -> None:
672
+ """Check that a user-defined mapping plan is valid."""
673
+ req_dimensions: dict[DimensionType, DimensionMappingReferenceModel] = {}
674
+ actual_mapping_dims: dict[DimensionType, str] = {}
675
+
676
+ for ref in self._mapping_references:
677
+ assert ref.to_dimension_type not in req_dimensions
678
+ req_dimensions[ref.to_dimension_type] = ref
679
+
680
+ dataset_id = mapping_plan.dataset_id
681
+ indexes_to_remove: list[int] = []
682
+ for i, mapping in enumerate(mapping_plan.mappings):
683
+ to_dim = project_config.get_dimension(mapping.name)
684
+ if to_dim.model.dimension_type == DimensionType.TIME:
685
+ msg = (
686
+ f"DatasetMappingPlan for {dataset_id=} is invalid because specification "
687
+ f"of the time dimension is not supported: {mapping.name}"
688
+ )
689
+ raise DSGInvalidDimensionMapping(msg)
690
+ if to_dim.model.dimension_type in actual_mapping_dims:
691
+ msg = (
692
+ f"DatasetMappingPlan for {dataset_id=} is invalid because it can only "
693
+ f"support mapping one dimension for a given dimension type. "
694
+ f"type={to_dim.model.dimension_type} "
695
+ f"first={actual_mapping_dims[to_dim.model.dimension_type]} "
696
+ f"second={mapping.name}"
697
+ )
698
+ raise DSGInvalidDimensionMapping(msg)
699
+
700
+ from_dim = self._config.get_dimension(to_dim.model.dimension_type)
701
+ supp_dim_names = {
702
+ x.model.name
703
+ for x in project_config.list_supplemental_dimensions(to_dim.model.dimension_type)
704
+ }
705
+ if mapping.name in supp_dim_names:
706
+ # This could be useful if we wanted to use DatasetMappingPlan for mapping
707
+ # a single dataset to a project's dimensions without being concerned about
708
+ # aggregations. As it stands, we are only using this within our
709
+ # project query process. We need much more handling to make that work.
710
+ msg = (
711
+ "DatasetMappingPlan for {dataset_id=} is invalid because it specifies "
712
+ f"a supplemental dimension: {mapping.name}"
713
+ )
714
+ elif to_dim.model.dimension_type not in req_dimensions:
715
+ msg = (
716
+ f"DatasetMappingPlan for {dataset_id=} is invalid because there is no "
717
+ f"dataset-to-project-base mapping defined for {to_dim.model.label}"
718
+ )
719
+ raise DSGInvalidDimensionMapping(msg)
720
+
721
+ ref = req_dimensions[to_dim.model.dimension_type]
722
+ mapping_config = self._dimension_mapping_mgr.get_by_id(
723
+ ref.mapping_id, version=ref.version, conn=self.connection
724
+ )
725
+ if (
726
+ from_dim.model.dimension_id == mapping_config.model.from_dimension.dimension_id
727
+ and to_dim.model.dimension_id == mapping_config.model.to_dimension.dimension_id
728
+ ):
729
+ mapping.mapping_reference = ref
730
+ actual_mapping_dims[to_dim.model.dimension_type] = mapping.name
731
+
732
+ for index in indexes_to_remove:
733
+ mapping_plan.mappings.pop(index)
734
+
735
+ if diff_dims := set(req_dimensions.keys()).difference(actual_mapping_dims.keys()):
736
+ req = sorted((x.value for x in req_dimensions))
737
+ act = sorted((x.value for x in actual_mapping_dims))
738
+ diff = sorted((x.value for x in diff_dims))
739
+ msg = (
740
+ "If a mapping order is specified for a dataset, it must include all "
741
+ "dimension types that require mappings to the project base dimension.\n"
742
+ f"Required dimension types: {req}\nActual dimension types: {act}\n"
743
+ f"Difference: {diff}"
744
+ )
745
+ raise DSGInvalidDimensionMapping(msg)
746
+
747
+ def _remap_dimension_columns(
748
+ self,
749
+ df: DataFrame,
750
+ mapping_manager: DatasetMappingManager,
751
+ filtered_records: dict[DimensionType, DataFrame] | None = None,
752
+ ) -> DataFrame:
753
+ """Map the table's dimensions according to the plan.
754
+
755
+ Parameters
756
+ ----------
757
+ df
758
+ The dataframe to map.
759
+ mapping_manager
760
+ Manages checkpointing and order of the mapping operations.
761
+ filtered_records
762
+ If not None, use these records to filter the table.
765
+ """
766
+ completed_operations = mapping_manager.get_completed_mapping_operations()
767
+ for dim_mapping in mapping_manager.plan.mappings:
768
+ if dim_mapping.name in completed_operations:
769
+ logger.info(
770
+ "Skip mapping operation %s because the result exists in a checkpointed file.",
771
+ dim_mapping.name,
772
+ )
773
+ continue
774
+ assert dim_mapping.mapping_reference is not None
775
+ ref = dim_mapping.mapping_reference
776
+ dim_type = ref.from_dimension_type
777
+ column = dim_type.value
778
+ mapping_config = self._dimension_mapping_mgr.get_by_id(
779
+ ref.mapping_id, version=ref.version, conn=self.connection
780
+ )
781
+ logger.info(
782
+ "Mapping dimension type %s mapping_type=%s",
783
+ dim_type,
784
+ mapping_config.model.mapping_type,
785
+ )
786
+ records = mapping_config.get_records_dataframe()
787
+ if filtered_records is not None and dim_type in filtered_records:
788
+ records = join(records, filtered_records[dim_type], "to_id", "id").drop("id")
789
+
790
+ if is_noop_mapping(records):
791
+ logger.info("Skip no-op mapping %s.", ref.mapping_id)
792
+ continue
793
+ if column in df.columns:
794
+ persisted_file: Path | None = None
795
+ df = map_stacked_dimension(df, records, column)
796
+ df, persisted_file = repartition_if_needed_by_mapping(
797
+ df,
798
+ mapping_config.model.mapping_type,
799
+ mapping_manager.scratch_dir_context,
800
+ repartition=dim_mapping.handle_data_skew,
801
+ )
802
+ if dim_mapping.persist and persisted_file is None:
803
+ mapping_manager.persist_table(df, dim_mapping)
804
+ if persisted_file is not None:
805
+ mapping_manager.save_checkpoint(persisted_file, dim_mapping)
806
+
807
+ return df
808
+
809
+ def _apply_fraction(
810
+ self,
811
+ df,
812
+ value_columns,
813
+ mapping_manager: DatasetMappingManager,
814
+ agg_func=None,
815
+ ):
816
+ op = mapping_manager.plan.apply_fraction_op
817
+ if "fraction" not in df.columns:
818
+ return df
819
+ if mapping_manager.has_completed_operation(op):
820
+ return df
821
+ agg_func = agg_func or F.sum
822
+ # Maintain column order.
823
+ agg_ops = [
824
+ agg_func(F.col(x) * F.col("fraction")).alias(x)
825
+ for x in [y for y in df.columns if y in value_columns]
826
+ ]
827
+ gcols = set(df.columns) - value_columns - {"fraction"}
828
+ df = df.groupBy(*ordered_subset_columns(df, gcols)).agg(*agg_ops)
829
+ df = df.drop("fraction")
830
+ if op.persist:
831
+ df = mapping_manager.persist_table(df, op)
832
+ return df
833
+
834
+ @track_timing(timer_stats_collector)
835
+ def _convert_time_dimension(
836
+ self,
837
+ load_data_df: DataFrame,
838
+ to_time_dim: TimeDimensionBaseConfig,
839
+ value_column: str,
840
+ mapping_manager: DatasetMappingManager,
841
+ wrap_time_allowed: bool,
842
+ time_based_data_adjustment: TimeBasedDataAdjustmentModel,
843
+ to_geo_dim: DimensionBaseConfigWithFiles | None = None,
844
+ ):
845
+ op = mapping_manager.plan.map_time_op
846
+ if mapping_manager.has_completed_operation(op):
847
+ return load_data_df
848
+ self._validate_daylight_saving_adjustment(time_based_data_adjustment)
849
+ time_dim = self._config.get_time_dimension()
850
+ assert time_dim is not None
851
+ if time_dim.model.is_time_zone_required_in_geography():
852
+ if self._config.model.use_project_geography_time_zone:
853
+ if to_geo_dim is None:
854
+ msg = "Bug: to_geo_dim must be provided if time zone is required in geography."
855
+ raise Exception(msg)
856
+ logger.info("Add time zone from project geography dimension.")
857
+ geography_dim = to_geo_dim
858
+ else:
859
+ logger.info("Add time zone from dataset geography dimension.")
860
+ geography_dim = self._config.get_dimension(DimensionType.GEOGRAPHY)
861
+ load_data_df = add_time_zone(load_data_df, geography_dim)
862
+
863
+ if isinstance(time_dim, AnnualTimeDimensionConfig):
864
+ if not isinstance(to_time_dim, DateTimeDimensionConfig):
865
+ msg = f"Annual time can only be mapped to DateTime: {to_time_dim.model.time_type}"
866
+ raise NotImplementedError(msg)
867
+
868
+ return map_annual_time_to_date_time(
869
+ load_data_df,
870
+ time_dim,
871
+ to_time_dim,
872
+ {value_column},
873
+ )
874
+
875
+ config = dsgrid.runtime_config
876
+ if not time_dim.supports_chronify():
877
+ # annual time is returned above
878
+ # no mapping for no-op
879
+ assert isinstance(
880
+ time_dim, NoOpTimeDimensionConfig
881
+ ), "Only NoOp and AnnualTimeDimensionConfig do not currently support Chronify"
882
+ return load_data_df
883
+ match (config.backend_engine, config.use_hive_metastore):
884
+ case (BackendEngine.SPARK, True):
885
+ table_name = make_temp_view_name()
886
+ load_data_df = map_time_dimension_with_chronify_spark_hive(
887
+ df=save_to_warehouse(load_data_df, table_name),
888
+ table_name=table_name,
889
+ value_column=value_column,
890
+ from_time_dim=time_dim,
891
+ to_time_dim=to_time_dim,
892
+ scratch_dir_context=mapping_manager.scratch_dir_context,
893
+ time_based_data_adjustment=time_based_data_adjustment,
894
+ wrap_time_allowed=wrap_time_allowed,
895
+ )
896
+
897
+ case (BackendEngine.SPARK, False):
898
+ filename = persist_table(
899
+ load_data_df,
900
+ mapping_manager.scratch_dir_context,
901
+ tag="query before time mapping",
902
+ )
903
+ load_data_df = map_time_dimension_with_chronify_spark_path(
904
+ df=read_dataframe(filename),
905
+ filename=filename,
906
+ value_column=value_column,
907
+ from_time_dim=time_dim,
908
+ to_time_dim=to_time_dim,
909
+ scratch_dir_context=mapping_manager.scratch_dir_context,
910
+ time_based_data_adjustment=time_based_data_adjustment,
911
+ wrap_time_allowed=wrap_time_allowed,
912
+ )
913
+ case (BackendEngine.DUCKDB, _):
914
+ load_data_df = map_time_dimension_with_chronify_duckdb(
915
+ df=load_data_df,
916
+ value_column=value_column,
917
+ from_time_dim=time_dim,
918
+ to_time_dim=to_time_dim,
919
+ scratch_dir_context=mapping_manager.scratch_dir_context,
920
+ time_based_data_adjustment=time_based_data_adjustment,
921
+ wrap_time_allowed=wrap_time_allowed,
922
+ )
923
+
924
+ if time_dim.model.is_time_zone_required_in_geography():
925
+ load_data_df = load_data_df.drop("time_zone")
926
+
927
+ if op.persist:
928
+ load_data_df = mapping_manager.persist_table(load_data_df, op)
929
+ return load_data_df
930
+
931
+ def _validate_daylight_saving_adjustment(self, time_based_data_adjustment):
932
+ if (
933
+ time_based_data_adjustment.daylight_saving_adjustment
934
+ == DaylightSavingAdjustmentModel()
935
+ ):
936
+ return
937
+ time_dim = self._config.get_time_dimension()
938
+ if not isinstance(time_dim, IndexTimeDimensionConfig):
939
+ assert time_dim is not None
940
+ msg = f"time_based_data_adjustment.daylight_saving_adjustment does not apply to {time_dim.model.time_type=} time type, it applies to INDEX time type only."
941
+ logger.warning(msg)
942
+
943
+ def _remove_non_dimension_columns(self, df: DataFrame) -> DataFrame:
944
+ allowed_columns = self._list_dimension_columns(df)
945
+ return df.select(*allowed_columns)
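
Editorial note: the fraction-weighting step in _apply_fraction above is easiest to see with a small standalone example. The sketch below is not part of the package; it uses plain pandas instead of dsgrid's Spark/DuckDB DataFrame shim, and the table and column values are illustrative. It mirrors the same logic: multiply each value column by the mapping "fraction" produced by a dimension mapping, group by the remaining non-value columns, sum, and drop "fraction".

import pandas as pd

# Illustrative mapped table: stacked dimension columns, one value column, and the
# "fraction" column that a dimension mapping attaches to each row.
df = pd.DataFrame(
    {
        "geography": ["county_1", "county_1", "county_2"],
        "metric": ["electricity", "electricity", "electricity"],
        "value": [10.0, 2.0, 5.0],
        "fraction": [1.0, 0.5, 1.0],
    }
)

value_columns = {"value"}
group_cols = [c for c in df.columns if c not in value_columns and c != "fraction"]

# Weight each value by its mapping fraction, then collapse the duplicate rows the
# mapping created by summing over the remaining columns; "fraction" is dropped.
for col in value_columns:
    df[col] = df[col] * df["fraction"]
result = df.groupby(group_cols, as_index=False)[list(value_columns)].sum()
print(result)  # county_1 collapses to 11.0 (10*1.0 + 2*0.5); county_2 stays 5.0

In the package itself this operates on Spark or DuckDB DataFrames, and the aggregation function can be overridden through the agg_func argument (it defaults to a sum).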