dsgrid-toolkit 0.3.3 (cp313-cp313-win_amd64.whl)

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as published.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/utils/dataset.py
@@ -0,0 +1,830 @@
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Iterable
+ from zoneinfo import ZoneInfo
+
+ import chronify
+ from chronify.models import TableSchema
+
+ import dsgrid
+ from dsgrid.common import SCALING_FACTOR_COLUMN, VALUE_COLUMN
+ from dsgrid.config.dimension_config import DimensionConfig
+ from dsgrid.config.dimension_mapping_base import DimensionMappingType
+ from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
+ from dsgrid.dataset.dataset_mapping_manager import DatasetMappingManager
+ from dsgrid.dimension.base_models import DimensionType
+ from dsgrid.dimension.time import (
+     DaylightSavingFallBackType,
+     DaylightSavingSpringForwardType,
+     TimeBasedDataAdjustmentModel,
+ )
+ from dsgrid.exceptions import (
+     DSGInvalidField,
+     DSGInvalidDimensionMapping,
+     DSGInvalidDataset,
+ )
+ from dsgrid.spark.functions import (
+     coalesce,
+     count_distinct_on_group_by,
+     create_temp_view,
+     handle_column_spaces,
+     make_temp_view_name,
+     read_parquet,
+     is_dataframe_empty,
+     join,
+     join_multiple_columns,
+     unpivot,
+ )
+ from dsgrid.spark.functions import except_all, get_spark_session
+ from dsgrid.spark.types import (
+     DataFrame,
+     F,
+     IntegerType,
+     LongType,
+     ShortType,
+     StringType,
+     use_duckdb,
+ )
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
+ from dsgrid.utils.spark import (
+     check_for_nulls,
+     write_dataframe,
+ )
+ from dsgrid.utils.timing import timer_stats_collector, track_timing
+
+ logger = logging.getLogger(__name__)
+
+
+ def map_stacked_dimension(
+     df: DataFrame,
+     records: DataFrame,
+     column: str,
+     drop_column: bool = True,
+     to_column: str | None = None,
+ ) -> DataFrame:
+     to_column_ = to_column or column
+     if "fraction" not in df.columns:
+         df = df.withColumn("fraction", F.lit(1.0))
+     # map and consolidate from_fraction only
+     records = records.filter("to_id IS NOT NULL")
+     df = join(df, records, column, "from_id", how="inner").drop("from_id")
+     if drop_column:
+         df = df.drop(column)
+     df = df.withColumnRenamed("to_id", to_column_)
+     nonfraction_cols = [x for x in df.columns if x not in {"fraction", "from_fraction"}]
+     df = df.select(
+         *nonfraction_cols,
+         (F.col("fraction") * F.col("from_fraction")).alias("fraction"),
+     )
+     return df
+
+
+ def add_time_zone(
+     load_data_df: DataFrame,
+     geography_dim: DimensionConfig,
+     df_key: str = "geography",
+     dim_key: str = "id",
+ ):
+     """Add a time_zone column to a load_data dataframe from a geography dimension.
+
+     Parameters
+     ----------
+     load_data_df : DataFrame
+     geography_dim: DimensionConfig
+
+     Returns
+     -------
+     DataFrame
+
+     """
+     geo_records = geography_dim.get_records_dataframe()
+     if df_key not in load_data_df.columns:
+         msg = f"Cannot locate {df_key=} in load_data_df: {load_data_df.columns}"
+         raise ValueError(msg)
+
+     df = add_column_from_records(
+         load_data_df, geo_records, "time_zone", df_key, record_key=dim_key
+     )
+     return df
+
+
+ def add_column_from_records(df, dimension_records, record_column, df_key, record_key: str = "id"):
+     df = join(
+         df1=df,
+         df2=dimension_records.select(F.col(record_key).alias("record_id"), record_column),
+         column1=df_key,
+         column2="record_id",
+         how="inner",
+     ).drop("record_id")
+     return df
+
+
+ def add_null_rows_from_load_data_lookup(df: DataFrame, lookup: DataFrame) -> DataFrame:
+     """Add null rows from the nulled load data lookup table to data table.
+
+     Parameters
+     ----------
+     df
+         load data table
+     lookup
+         load data lookup table that has been filtered for nulls.
+     """
+     if not is_dataframe_empty(lookup):
+         intersect_cols = set(lookup.columns).intersection(df.columns)
+         null_rows_to_add = except_all(lookup.select(*intersect_cols), df.select(*intersect_cols))
+         for col in set(df.columns).difference(null_rows_to_add.columns):
+             null_rows_to_add = null_rows_to_add.withColumn(col, F.lit(None))
+         df = df.union(null_rows_to_add.select(*df.columns))
+
+     return df
+
+
+ def apply_scaling_factor(
+     df: DataFrame,
+     value_column: str,
+     mapping_manager: DatasetMappingManager,
+     scaling_factor_column: str = SCALING_FACTOR_COLUMN,
+ ) -> DataFrame:
+     """Apply the scaling factor to all value columns and then drop the scaling factor column."""
+     op = mapping_manager.plan.apply_scaling_factor_op
+     if mapping_manager.has_completed_operation(op):
+         return df
+
+     func = _apply_scaling_factor_duckdb if use_duckdb() else _apply_scaling_factor_spark
+     df = func(df, value_column, scaling_factor_column)
+     if mapping_manager.plan.apply_scaling_factor_op.persist:
+         df = mapping_manager.persist_table(df, op)
+     return df
+
+
+ def _apply_scaling_factor_duckdb(
+     df: DataFrame,
+     value_column: str,
+     scaling_factor_column: str,
+ ):
+     # Workaround for the fact that duckdb doesn't support
+     # F.col(scaling_factor_column).isNotNull()
+     cols = (x for x in df.columns if x not in (value_column, scaling_factor_column))
+     cols_str = ",".join(cols)
+     view = create_temp_view(df)
+     query = f"""
+         SELECT
+             {cols_str},
+             (
+                 CASE WHEN {scaling_factor_column} IS NULL THEN {value_column}
+                 ELSE {value_column} * {scaling_factor_column} END
+             ) AS {value_column}
+         FROM {view}
+     """
+     spark = get_spark_session()
+     return spark.sql(query)
+
+
+ def _apply_scaling_factor_spark(
+     df: DataFrame,
+     value_column: str,
+     scaling_factor_column: str,
+ ):
+     return df.withColumn(
+         value_column,
+         F.when(
+             F.col(scaling_factor_column).isNotNull(),
+             F.col(value_column) * F.col(scaling_factor_column),
+         ).otherwise(F.col(value_column)),
+     ).drop(scaling_factor_column)
+
+
+ def check_historical_annual_time_model_year_consistency(
+     df: DataFrame, time_column: str, model_year_column: str
+ ) -> None:
+     """Check that the model year values match the time dimension years for a historical
+     dataset with an annual time dimension.
+     """
+     invalid = (
+         df.select(time_column, model_year_column)
+         .filter(f"{time_column} IS NOT NULL")
+         .distinct()
+         .filter(f"{time_column} != {model_year_column}")
+         .collect()
+     )
+     if invalid:
+         msg = (
+             "A historical dataset with annual time must have rows where the time years match the model years. "
+             f"{invalid}"
+         )
+         raise DSGInvalidDataset(msg)
+
+
+ @track_timing(timer_stats_collector)
+ def check_null_value_in_dimension_rows(dim_table, exclude_columns=None):
+     if os.environ.get("__DSGRID_SKIP_CHECK_NULL_DIMENSION__"):
+         # This has intermittently caused GC-related timeouts for TEMPO.
+         # Leave a backdoor to skip these checks, which may eventually be removed.
+         logger.warning("Skip check_null_value_in_dimension_rows")
+         return
+
+     try:
+         exclude = {"id"}
+         if exclude_columns is not None:
+             exclude.update(exclude_columns)
+         check_for_nulls(dim_table, exclude_columns=exclude)
+     except DSGInvalidField as exc:
+         msg = (
+             "Invalid dimension mapping application. "
+             "Combination of remapped dataset dimensions contain NULL value(s) for "
+             f"dimension(s): \n{str(exc)}"
+         )
+         raise DSGInvalidDimensionMapping(msg)
+
+
+ def handle_dimension_association_errors(
+     diff: DataFrame,
+     dataset_table: DataFrame,
+     dataset_id: str,
+ ) -> None:
+     """Record missing dimension record combinations in a Parquet file and log an error."""
+     out_file = Path(f"{dataset_id}__missing_dimension_record_combinations.parquet")
+     df = write_dataframe(coalesce(diff, 1), out_file, overwrite=True)
+     logger.error(
+         "Dataset %s is missing required dimension records. Recorded missing records in %s",
+         dataset_id,
+         out_file,
+     )
+
+     # Analyze patterns in missing data to help identify root causes
+     try:
+         from dsgrid.rust_ext import find_minimal_patterns_from_file
+
+         logger.info("Analyzing missing data patterns for dataset %s...", dataset_id)
+         if out_file.is_dir():
+             files = list(out_file.glob("*.parquet"))
+             assert len(files) == 1, f"Expected 1 file, got {files}"
+             filename = files[0]
+         else:
+             filename = out_file
+         patterns = find_minimal_patterns_from_file(
+             filename,
+             max_depth=0,
+             verbose=False,
+         )
+
+         if patterns:
+             logger.error("Found %d minimal closed patterns in missing data:", len(patterns))
+             for pattern in patterns[:10]:  # Show top 10 patterns
+                 logger.error(
+                     " Pattern %d: %s = %s (%d missing rows)",
+                     pattern.pattern_id,
+                     " | ".join(pattern.columns),
+                     " | ".join(pattern.values),
+                     pattern.num_rows,
+                 )
+             if len(patterns) > 10:
+                 logger.error(" ... and %d more patterns", len(patterns) - 10)
+         else:
+             logger.warning("No closed patterns found in missing data")
+     except ImportError:
+         logger.warning(
+             "Rust pattern analysis not available. Install with: pip install -e . "
+             "or build with: maturin develop"
+         )
+         _look_for_error_contributors(df, dataset_table)
+     except Exception as e:
+         logger.warning("Failed to analyze missing data patterns: %s", e)
+
+     msg = (
+         f"Dataset {dataset_id} is missing required dimension records. "
+         "Please look in the log file for more information."
+     )
+     raise DSGInvalidDataset(msg)
+
+
+ def _look_for_error_contributors(diff: DataFrame, dataset_table: DataFrame) -> None:
+     diff_counts = {x: diff.select(x).distinct().count() for x in diff.columns}
+     for col in diff.columns:
+         dataset_count = dataset_table.select(col).distinct().count()
+         if dataset_count != diff_counts[col]:
+             logger.error(
+                 "Error contributor: column=%s dataset_distinct_count=%s missing_distinct_count=%s",
+                 col,
+                 dataset_count,
+                 diff_counts[col],
+             )
+
+
+ def is_noop_mapping(records: DataFrame) -> bool:
+     """Return True if the mapping is a no-op."""
+     return is_dataframe_empty(
+         records.filter(
+             "(to_id IS NULL and from_id IS NOT NULL) or "
+             "(to_id IS NOT NULL and from_id IS NULL) or "
+             "(from_id != to_id) or (from_fraction != 1.0)"
+         )
+     )
+
+
+ def map_time_dimension_with_chronify_duckdb(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     to_time_dim: TimeDimensionBaseConfig,
+     scratch_dir_context: ScratchDirContext,
+     wrap_time_allowed: bool = False,
+     time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
+ ) -> DataFrame:
+     """Create a time-mapped table with chronify and DuckDB.
+     All operations are performed in memory.
+     """
+     # This will only work if the source and destination tables will fit in memory.
+     # We could potentially use a file-based DuckDB database for larger-than memory datasets.
+     # However, time checks and unpivot operations have failed with out-of-memory errors,
+     # and so we have never reached this point.
+     # If we solve those problems, this code could be modified.
+     src_schema, dst_schema = _get_mapping_schemas(df, value_column, from_time_dim, to_time_dim)
+     store = chronify.Store.create_in_memory_db()
+     store.ingest_table(df.relation, src_schema, skip_time_checks=True)
+     store.map_table_time_config(
+         src_schema.name,
+         dst_schema,
+         wrap_time_allowed=wrap_time_allowed,
+         data_adjustment=_to_chronify_time_based_data_adjustment(time_based_data_adjustment),
+         scratch_dir=scratch_dir_context.scratch_dir,
+     )
+     pandas_df = store.read_table(dst_schema.name)
+     store.drop_table(dst_schema.name)
+     return df.session.createDataFrame(pandas_df)
+
+
+ def convert_time_zone_with_chronify_duckdb(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     time_zone: str,
+     scratch_dir_context: ScratchDirContext,
+ ) -> DataFrame:
+     """Create a single time zone-converted table with chronify and DuckDB.
+     All operations are performed in memory.
+     """
+     src_schema = _get_src_schema(df, value_column, from_time_dim)
+     store = chronify.Store.create_in_memory_db()
+     store.ingest_table(df.relation, src_schema, skip_time_checks=True)
+     zone_info_tz = ZoneInfo(time_zone)
+     dst_schema = store.convert_time_zone(
+         src_schema.name,
+         zone_info_tz,
+         scratch_dir=scratch_dir_context.scratch_dir,
+     )
+     pandas_df = store.read_table(dst_schema.name)
+     store.drop_table(dst_schema.name)
+     return df.session.createDataFrame(pandas_df)
+
+
+ def convert_time_zone_by_column_with_chronify_duckdb(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     time_zone_column: str,
+     scratch_dir_context: ScratchDirContext,
+     wrap_time_allowed: bool = False,
+ ) -> DataFrame:
+     """Create a multiple time zone-converted table (based on a time_zone_column)
+     using chronify and DuckDB.
+     All operations are performed in memory.
+     """
+     src_schema = _get_src_schema(df, value_column, from_time_dim)
+     store = chronify.Store.create_in_memory_db()
+     store.ingest_table(df.relation, src_schema, skip_time_checks=True)
+     dst_schema = store.convert_time_zone_by_column(
+         src_schema.name,
+         time_zone_column,
+         wrap_time_allowed=wrap_time_allowed,
+         scratch_dir=scratch_dir_context.scratch_dir,
+     )
+     pandas_df = store.read_table(dst_schema.name)
+     store.drop_table(dst_schema.name)
+     return df.session.createDataFrame(pandas_df)
+
+
+ def map_time_dimension_with_chronify_spark_hive(
+     df: DataFrame,
+     table_name: str,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     to_time_dim: TimeDimensionBaseConfig,
+     scratch_dir_context: ScratchDirContext,
+     time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
+     wrap_time_allowed: bool = False,
+ ) -> DataFrame:
+     """Create a time-mapped table with chronify and Spark and a Hive Metastore.
+     The source data must already be stored in the metastore.
+     Chronify will store the mapped table in the metastore.
+     """
+     src_schema, dst_schema = _get_mapping_schemas(
+         df, value_column, from_time_dim, to_time_dim, src_name=table_name
+     )
+     store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
+     with store.engine.begin() as conn:
+         # This bypasses checks because the table should already be valid.
+         store.schema_manager.add_schema(conn, src_schema)
+     try:
+         store.map_table_time_config(
+             src_schema.name,
+             dst_schema,
+             check_mapped_timestamps=False,
+             scratch_dir=scratch_dir_context.scratch_dir,
+             wrap_time_allowed=wrap_time_allowed,
+             data_adjustment=_to_chronify_time_based_data_adjustment(time_based_data_adjustment),
+         )
+     finally:
+         with store.engine.begin() as conn:
+             store.schema_manager.remove_schema(conn, src_schema.name)
+
+     return df.sparkSession.sql(f"SELECT * FROM {dst_schema.name}")
+
+
+ def convert_time_zone_with_chronify_spark_hive(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     time_zone: str,
+     scratch_dir_context: ScratchDirContext,
+ ) -> DataFrame:
+     """Create a single time zone-converted table with chronify and Spark and a Hive Metastore."""
+     src_schema = _get_src_schema(df, value_column, from_time_dim)
+     store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
+     with store.engine.begin() as conn:
+         # This bypasses checks because the table should already be valid.
+         store.schema_manager.add_schema(conn, src_schema)
+     zone_info_tz = ZoneInfo(time_zone)
+     try:
+         dst_schema = store.convert_time_zone(
+             src_schema.name,
+             zone_info_tz,
+             scratch_dir=scratch_dir_context.scratch_dir,
+         )
+     finally:
+         with store.engine.begin() as conn:
+             store.schema_manager.remove_schema(conn, src_schema.name)
+
+     return df.sparkSession.sql(f"SELECT * FROM {dst_schema.name}")
+
+
+ def convert_time_zone_by_column_with_chronify_spark_hive(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     time_zone_column: str,
+     scratch_dir_context: ScratchDirContext,
+     wrap_time_allowed: bool = False,
+ ) -> DataFrame:
+     """Create a multiple time zone-converted table (based on a time_zone_column)
+     using chronify and Spark and a Hive Metastore.
+     """
+     src_schema = _get_src_schema(df, value_column, from_time_dim)
+     store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
+     with store.engine.begin() as conn:
+         # This bypasses checks because the table should already be valid.
+         store.schema_manager.add_schema(conn, src_schema)
+     try:
+         dst_schema = store.convert_time_zone_by_column(
+             src_schema.name,
+             time_zone_column,
+             wrap_time_allowed=wrap_time_allowed,
+             scratch_dir=scratch_dir_context.scratch_dir,
+         )
+     finally:
+         with store.engine.begin() as conn:
+             store.schema_manager.remove_schema(conn, src_schema.name)
+
+     return df.sparkSession.sql(f"SELECT * FROM {dst_schema.name}")
+
+
+ def map_time_dimension_with_chronify_spark_path(
+     df: DataFrame,
+     filename: Path,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     to_time_dim: TimeDimensionBaseConfig,
+     scratch_dir_context: ScratchDirContext,
+     wrap_time_allowed: bool = False,
+     time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
+ ) -> DataFrame:
+     """Create a time-mapped table with chronify and Spark using the local filesystem.
+     Chronify will store the mapped table in a Parquet file within scratch_dir_context.
+     """
+     src_schema, dst_schema = _get_mapping_schemas(df, value_column, from_time_dim, to_time_dim)
+     store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
+     store.create_view_from_parquet(filename, src_schema, bypass_checks=True)
+     output_file = scratch_dir_context.get_temp_filename(suffix=".parquet")
+     store.map_table_time_config(
+         src_schema.name,
+         dst_schema,
+         check_mapped_timestamps=False,
+         scratch_dir=scratch_dir_context.scratch_dir,
+         output_file=output_file,
+         wrap_time_allowed=wrap_time_allowed,
+         data_adjustment=_to_chronify_time_based_data_adjustment(time_based_data_adjustment),
+     )
+     return df.sparkSession.read.load(str(output_file))
+
+
+ def convert_time_zone_with_chronify_spark_path(
+     df: DataFrame,
+     filename: Path,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     time_zone: str,
+     scratch_dir_context: ScratchDirContext,
+ ) -> DataFrame:
+     """Create a single time zone-converted table with chronify and Spark using the local filesystem."""
+     src_schema = _get_src_schema(df, value_column, from_time_dim)
+     store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
+     store.create_view_from_parquet(filename, src_schema, bypass_checks=True)
+     output_file = scratch_dir_context.get_temp_filename(suffix=".parquet")
+     zone_info_tz = ZoneInfo(time_zone)
+     store.convert_time_zone(
+         src_schema.name,
+         zone_info_tz,
+         scratch_dir=scratch_dir_context.scratch_dir,
+         output_file=output_file,
+     )
+     return df.sparkSession.read.load(str(output_file))
+
+
+ def convert_time_zone_by_column_with_chronify_spark_path(
+     df: DataFrame,
+     filename: Path,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     time_zone_column: str,
+     scratch_dir_context: ScratchDirContext,
+     wrap_time_allowed: bool = False,
+ ) -> DataFrame:
+     """Create a multiple time zone-converted table (based on a time_zone_column)
+     using chronify and Spark using the local filesystem.
+     """
+     src_schema = _get_src_schema(df, value_column, from_time_dim)
+     store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
+     store.create_view_from_parquet(filename, src_schema, bypass_checks=True)
+     output_file = scratch_dir_context.get_temp_filename(suffix=".parquet")
+     store.convert_time_zone_by_column(
+         src_schema.name,
+         time_zone_column,
+         wrap_time_allowed=wrap_time_allowed,
+         scratch_dir=scratch_dir_context.scratch_dir,
+         output_file=output_file,
+     )
+     return df.sparkSession.read.load(str(output_file))
+
+
+ def _to_chronify_time_based_data_adjustment(
+     adj: TimeBasedDataAdjustmentModel | None,
+ ) -> chronify.TimeBasedDataAdjustment | None:
+     if adj is None:
+         return None
+     if (
+         adj.daylight_saving_adjustment.spring_forward_hour == DaylightSavingSpringForwardType.NONE
+         and adj.daylight_saving_adjustment.fall_back_hour == DaylightSavingFallBackType.NONE
+     ):
+         chronify_dst_adjustment = chronify.time.DaylightSavingAdjustmentType.NONE
+     elif (
+         adj.daylight_saving_adjustment.spring_forward_hour == DaylightSavingSpringForwardType.DROP
+         and adj.daylight_saving_adjustment.fall_back_hour == DaylightSavingFallBackType.DUPLICATE
+     ):
+         chronify_dst_adjustment = (
+             chronify.time.DaylightSavingAdjustmentType.DROP_SPRING_FORWARD_DUPLICATE_FALLBACK
+         )
+     elif (
+         adj.daylight_saving_adjustment.spring_forward_hour == DaylightSavingSpringForwardType.DROP
+         and adj.daylight_saving_adjustment.fall_back_hour == DaylightSavingFallBackType.INTERPOLATE
+     ):
+         chronify_dst_adjustment = (
+             chronify.time.DaylightSavingAdjustmentType.DROP_SPRING_FORWARD_INTERPOLATE_FALLBACK
+         )
+     else:
+         msg = f"dsgrid time_based_data_adjustment = {adj}"
+         raise NotImplementedError(msg)
+
+     return chronify.TimeBasedDataAdjustment(
+         leap_day_adjustment=adj.leap_day_adjustment.value,
+         daylight_saving_adjustment=chronify_dst_adjustment,
+     )
+
+
+ def _get_src_schema(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     src_name: str | None = None,
+ ) -> TableSchema:
+     src = src_name or "src_" + make_temp_view_name()
+     time_col_list = from_time_dim.get_load_data_time_columns()
+     time_config = from_time_dim.to_chronify()
+     time_array_id_columns = [
+         x
+         for x in df.columns
+         if x in set(df.columns).difference(set(time_col_list)) - {value_column}
+     ]
+     src_schema = chronify.TableSchema(
+         name=src,
+         time_config=time_config,
+         time_array_id_columns=time_array_id_columns,
+         value_column=value_column,
+     )
+     return src_schema
+
+
+ def _get_dst_schema(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     to_time_dim: TimeDimensionBaseConfig,
+ ) -> TableSchema:
+     time_config = to_time_dim.to_chronify()
+     time_col_list = from_time_dim.get_load_data_time_columns()
+     time_array_id_columns = [
+         x
+         for x in df.columns
+         if x in set(df.columns).difference(set(time_col_list)) - {value_column}
+     ]
+     dst_schema = chronify.TableSchema(
+         name="dst_" + make_temp_view_name(),
+         time_config=time_config,
+         time_array_id_columns=time_array_id_columns,
+         value_column=value_column,
+     )
+     return dst_schema
+
+
+ def _get_mapping_schemas(
+     df: DataFrame,
+     value_column: str,
+     from_time_dim: TimeDimensionBaseConfig,
+     to_time_dim: TimeDimensionBaseConfig,
+     src_name: str | None = None,
+ ) -> tuple[TableSchema, TableSchema]:
+     src_schema = _get_src_schema(df, value_column, from_time_dim, src_name=src_name)
+     dst_schema = _get_dst_schema(df, value_column, from_time_dim, to_time_dim)
+     return src_schema, dst_schema
+
+
+ def ordered_subset_columns(df, subset: set[str]) -> list[str]:
+     """Return a list of columns in the dataframe that are present in subset."""
+     return [x for x in df.columns if x in subset]
+
+
+ def remove_invalid_null_timestamps(df, time_columns, stacked_columns):
+     """Remove rows from the dataframe where the time column is NULL and other rows with the
+     same dimensions contain valid data.
+     """
+     assert len(time_columns) == 1, time_columns
+     time_column = next(iter(time_columns))
+     orig_columns = df.columns
+     stacked = list(stacked_columns)
+     return (
+         join_multiple_columns(
+             df,
+             count_distinct_on_group_by(df, stacked, time_column, "count_time"),
+             stacked,
+         )
+         .filter(f"{handle_column_spaces(time_column)} IS NOT NULL OR count_time = 0")
+         .select(orig_columns)
+     )
+
+
+ @track_timing(timer_stats_collector)
+ def repartition_if_needed_by_mapping(
+     df: DataFrame,
+     mapping_type: DimensionMappingType,
+     scratch_dir_context: ScratchDirContext,
+     repartition: bool | None = None,
+ ) -> tuple[DataFrame, Path | None]:
+     """Repartition the dataframe if the mapping might cause data skew.
+
+     Parameters
+     ----------
+     df : DataFrame
+         The dataframe to repartition.
+     mapping_type : DimensionMappingType
+     scratch_dir_context : ScratchDirContext
+         The scratch directory context to use for temporary files.
+     repartition : bool
+         If None, repartition based on the mapping type.
+         Otherwise, always repartition if True, or never if False.
+     """
+     if use_duckdb():
+         return df, None
+
+     # We experienced an issue with the IEF buildings dataset where the disaggregation of
+     # region to county caused a major issue where one Spark executor thread got stuck,
+     # seemingly indefinitely. A message like this was repeated continually.
+     # UnsafeExternalSorter: Thread 152 spilling sort data of 4.0 GiB to disk (0 time so far)
+     # It appears to be caused by data skew, though the imbalance didn't seem too severe.
+     # Using a variation of what online sources call a "salting technique" solves the issue.
+     # Apply the technique to mappings that will cause an explosion of rows.
+     # Note that this probably isn't needed in all cases and we may need to adjust in the
+     # future.
+
+     # Note: log messages below are checked in the tests.
+     if repartition or (
+         repartition is None
+         and mapping_type
+         in {
+             DimensionMappingType.ONE_TO_MANY_DISAGGREGATION,
+             # These cases might be problematic in the future.
+             # DimensionMappingType.ONE_TO_MANY_ASSIGNMENT,
+             # DimensionMappingType.ONE_TO_MANY_EXPLICIT_MULTIPLIERS,
+             # DimensionMappingType.MANY_TO_MANY_DISAGGREGATION,
+             # This is usually happening with scenario and hasn't caused a problem.
+             # DimensionMappingType.DUPLICATION,
+         }
+     ):
+         filename = scratch_dir_context.get_temp_filename(suffix=".parquet")
+         # Salting techniques online talk about adding or modifying a column with random values.
+         # We might be able to use one of our value columns. However, there are cases where there
+         # could be many instances of zero or null. So, add a new column with random values.
+         logger.info("Repartition after mapping %s", mapping_type)
+         salted_column = "salted_key"
+         spark = get_spark_session()
+         num_partitions = int(spark.conf.get("spark.sql.shuffle.partitions"))
+         df.withColumn(
+             salted_column, (F.rand() * num_partitions).cast(IntegerType()) + 1
+         ).repartition(salted_column).write.parquet(str(filename))
+         df = read_parquet(filename).drop(salted_column)
+         logger.info("Completed repartition.")
+         return df, filename
+
+     logger.debug("Repartition is not needed for mapping_type %s", mapping_type)
+     return df, None
+
+
+ def unpivot_dataframe(
+     df: DataFrame,
+     value_columns: Iterable[str],
+     variable_column: str,
+     time_columns: list[str],
+ ) -> DataFrame:
+     """Unpivot the dataframe, accounting for time columns."""
+     values = value_columns if isinstance(value_columns, set) else set(value_columns)
+     ids = [x for x in df.columns if x != VALUE_COLUMN and x not in values]
+     df = unpivot(df, value_columns, variable_column, VALUE_COLUMN)
+     cols = set(df.columns).difference(time_columns)
+     new_rows = df.filter(f"{VALUE_COLUMN} IS NULL").select(*cols).distinct()
+     for col in time_columns:
+         new_rows = new_rows.withColumn(col, F.lit(None))
+
+     return (
+         df.filter(f"{VALUE_COLUMN} IS NOT NULL")
+         .union(new_rows.select(*df.columns))
+         .select(*ids, variable_column, VALUE_COLUMN)
+     )
+
+
+ def convert_types_if_necessary(df: DataFrame) -> DataFrame:
+     """Convert the types of the dataframe if necessary."""
+     allowed_int_columns = (
+         DimensionType.MODEL_YEAR.value,
+         DimensionType.WEATHER_YEAR.value,
+     )
+     int_types = {IntegerType(), LongType(), ShortType()}
+     existing_columns = set(df.columns)
+     for column in allowed_int_columns:
+         if column in existing_columns and df.schema[column].dataType in int_types:
+             df = df.withColumn(column, F.col(column).cast(StringType()))
+     return df
+
+
+ def filter_out_expected_missing_associations(
+     main_df: DataFrame, missing_df: DataFrame
+ ) -> DataFrame:
+     """Filter out rows that are expected to be missing from the main dataframe."""
+     missing_columns = [DimensionType.from_column(x).value for x in missing_df.columns]
+     spark = get_spark_session()
+     main_view = make_temp_view_name()
+     assoc_view = make_temp_view_name()
+     main_columns = ",".join((f"{main_view}.{x}" for x in main_df.columns))
+
+     main_df.createOrReplaceTempView(main_view)
+     missing_df.createOrReplaceTempView(assoc_view)
+     join_str = " AND ".join((f"{main_view}.{x} = {assoc_view}.{x}" for x in missing_columns))
+     query = f"""
+         SELECT {main_columns}
+         FROM {main_view}
+         ANTI JOIN {assoc_view}
+         ON {join_str}
+     """
+     res = spark.sql(query)
+     return res
+
+
+ def split_expected_missing_rows(
+     df: DataFrame, time_columns: list[str]
+ ) -> tuple[DataFrame, DataFrame | None]:
+     """Split a DataFrame into two if it contains expected missing data."""
+     null_df = df.filter(f"{VALUE_COLUMN} IS NULL")
+     if is_dataframe_empty(null_df):
+         return df, None
+
+     drop_columns = time_columns + [VALUE_COLUMN]
+     missing_associations = null_df.drop(*drop_columns)
+     return df.filter(f"{VALUE_COLUMN} IS NOT NULL"), missing_associations
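
Editor's note: the is_noop_mapping helper in the diff above decides whether a dimension mapping can be skipped entirely. The following minimal sketch restates its predicate in plain PySpark rather than dsgrid's spark wrappers; the session setup and the toy mapping records are illustrative assumptions, not part of the package.

Example (PySpark):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[*]").appName("noop-mapping-check").getOrCreate()

    # A mapping that renames nothing and scales nothing: every from_id maps to itself
    # with a fraction of exactly 1.0 and there are no dangling NULLs on either side.
    records = spark.createDataFrame(
        [("a", "a", 1.0), ("b", "b", 1.0)],
        ["from_id", "to_id", "from_fraction"],
    )

    # Same predicate as in is_noop_mapping: any row matching it makes the mapping a real operation.
    non_noop_rows = records.filter(
        "(to_id IS NULL and from_id IS NOT NULL) or "
        "(to_id IS NOT NULL and from_id IS NULL) or "
        "(from_id != to_id) or (from_fraction != 1.0)"
    )
    print(non_noop_rows.count() == 0)  # True, so the mapping could be skipped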
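
Editor's note: the inline comment in repartition_if_needed_by_mapping describes a salting technique for breaking up data skew after one-to-many disaggregation. The sketch below is a generic PySpark illustration of that idea under assumed names and toy data; it omits the Parquet round trip that dsgrid performs and is not the package's API.

Example (PySpark):

    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.types import IntegerType

    spark = SparkSession.builder.master("local[*]").appName("salting-demo").getOrCreate()

    # Skewed toy data: one hot key holds nearly all of the rows, so a shuffle keyed on
    # "region" would pile most of the work onto a single partition.
    df = spark.createDataFrame(
        [("region_1", i) for i in range(100_000)] + [("region_2", i) for i in range(10)],
        ["region", "value"],
    )

    num_partitions = int(spark.conf.get("spark.sql.shuffle.partitions"))

    # Add a random salt column and repartition on it so rows spread evenly across
    # partitions regardless of the hot key; drop the salt once the data is redistributed.
    balanced = (
        df.withColumn("salted_key", (F.rand() * num_partitions).cast(IntegerType()) + 1)
        .repartition("salted_key")
        .drop("salted_key")
    )
    print(balanced.rdd.getNumPartitions())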