dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/utils/dataset.py
ADDED
@@ -0,0 +1,830 @@
import logging
import os
from pathlib import Path
from typing import Iterable
from zoneinfo import ZoneInfo

import chronify
from chronify.models import TableSchema

import dsgrid
from dsgrid.common import SCALING_FACTOR_COLUMN, VALUE_COLUMN
from dsgrid.config.dimension_config import DimensionConfig
from dsgrid.config.dimension_mapping_base import DimensionMappingType
from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
from dsgrid.dataset.dataset_mapping_manager import DatasetMappingManager
from dsgrid.dimension.base_models import DimensionType
from dsgrid.dimension.time import (
    DaylightSavingFallBackType,
    DaylightSavingSpringForwardType,
    TimeBasedDataAdjustmentModel,
)
from dsgrid.exceptions import (
    DSGInvalidField,
    DSGInvalidDimensionMapping,
    DSGInvalidDataset,
)
from dsgrid.spark.functions import (
    coalesce,
    count_distinct_on_group_by,
    create_temp_view,
    handle_column_spaces,
    make_temp_view_name,
    read_parquet,
    is_dataframe_empty,
    join,
    join_multiple_columns,
    unpivot,
)
from dsgrid.spark.functions import except_all, get_spark_session
from dsgrid.spark.types import (
    DataFrame,
    F,
    IntegerType,
    LongType,
    ShortType,
    StringType,
    use_duckdb,
)
from dsgrid.utils.scratch_dir_context import ScratchDirContext
from dsgrid.utils.spark import (
    check_for_nulls,
    write_dataframe,
)
from dsgrid.utils.timing import timer_stats_collector, track_timing

logger = logging.getLogger(__name__)


def map_stacked_dimension(
    df: DataFrame,
    records: DataFrame,
    column: str,
    drop_column: bool = True,
    to_column: str | None = None,
) -> DataFrame:
    to_column_ = to_column or column
    if "fraction" not in df.columns:
        df = df.withColumn("fraction", F.lit(1.0))
    # map and consolidate from_fraction only
    records = records.filter("to_id IS NOT NULL")
    df = join(df, records, column, "from_id", how="inner").drop("from_id")
    if drop_column:
        df = df.drop(column)
    df = df.withColumnRenamed("to_id", to_column_)
    nonfraction_cols = [x for x in df.columns if x not in {"fraction", "from_fraction"}]
    df = df.select(
        *nonfraction_cols,
        (F.col("fraction") * F.col("from_fraction")).alias("fraction"),
    )
    return df


def add_time_zone(
    load_data_df: DataFrame,
    geography_dim: DimensionConfig,
    df_key: str = "geography",
    dim_key: str = "id",
):
    """Add a time_zone column to a load_data dataframe from a geography dimension.

    Parameters
    ----------
    load_data_df : DataFrame
    geography_dim: DimensionConfig

    Returns
    -------
    DataFrame

    """
    geo_records = geography_dim.get_records_dataframe()
    if df_key not in load_data_df.columns:
        msg = f"Cannot locate {df_key=} in load_data_df: {load_data_df.columns}"
        raise ValueError(msg)

    df = add_column_from_records(
        load_data_df, geo_records, "time_zone", df_key, record_key=dim_key
    )
    return df


def add_column_from_records(df, dimension_records, record_column, df_key, record_key: str = "id"):
    df = join(
        df1=df,
        df2=dimension_records.select(F.col(record_key).alias("record_id"), record_column),
        column1=df_key,
        column2="record_id",
        how="inner",
    ).drop("record_id")
    return df


def add_null_rows_from_load_data_lookup(df: DataFrame, lookup: DataFrame) -> DataFrame:
    """Add null rows from the nulled load data lookup table to data table.

    Parameters
    ----------
    df
        load data table
    lookup
        load data lookup table that has been filtered for nulls.
    """
    if not is_dataframe_empty(lookup):
        intersect_cols = set(lookup.columns).intersection(df.columns)
        null_rows_to_add = except_all(lookup.select(*intersect_cols), df.select(*intersect_cols))
        for col in set(df.columns).difference(null_rows_to_add.columns):
            null_rows_to_add = null_rows_to_add.withColumn(col, F.lit(None))
        df = df.union(null_rows_to_add.select(*df.columns))

    return df


def apply_scaling_factor(
    df: DataFrame,
    value_column: str,
    mapping_manager: DatasetMappingManager,
    scaling_factor_column: str = SCALING_FACTOR_COLUMN,
) -> DataFrame:
    """Apply the scaling factor to all value columns and then drop the scaling factor column."""
    op = mapping_manager.plan.apply_scaling_factor_op
    if mapping_manager.has_completed_operation(op):
        return df

    func = _apply_scaling_factor_duckdb if use_duckdb() else _apply_scaling_factor_spark
    df = func(df, value_column, scaling_factor_column)
    if mapping_manager.plan.apply_scaling_factor_op.persist:
        df = mapping_manager.persist_table(df, op)
    return df


def _apply_scaling_factor_duckdb(
    df: DataFrame,
    value_column: str,
    scaling_factor_column: str,
):
    # Workaround for the fact that duckdb doesn't support
    # F.col(scaling_factor_column).isNotNull()
    cols = (x for x in df.columns if x not in (value_column, scaling_factor_column))
    cols_str = ",".join(cols)
    view = create_temp_view(df)
    query = f"""
        SELECT
            {cols_str},
            (
                CASE WHEN {scaling_factor_column} IS NULL THEN {value_column}
                ELSE {value_column} * {scaling_factor_column} END
            ) AS {value_column}
        FROM {view}
    """
    spark = get_spark_session()
    return spark.sql(query)


def _apply_scaling_factor_spark(
    df: DataFrame,
    value_column: str,
    scaling_factor_column: str,
):
    return df.withColumn(
        value_column,
        F.when(
            F.col(scaling_factor_column).isNotNull(),
            F.col(value_column) * F.col(scaling_factor_column),
        ).otherwise(F.col(value_column)),
    ).drop(scaling_factor_column)


def check_historical_annual_time_model_year_consistency(
    df: DataFrame, time_column: str, model_year_column: str
) -> None:
    """Check that the model year values match the time dimension years for a historical
    dataset with an annual time dimension.
    """
    invalid = (
        df.select(time_column, model_year_column)
        .filter(f"{time_column} IS NOT NULL")
        .distinct()
        .filter(f"{time_column} != {model_year_column}")
        .collect()
    )
    if invalid:
        msg = (
            "A historical dataset with annual time must have rows where the time years match the model years. "
            f"{invalid}"
        )
        raise DSGInvalidDataset(msg)


@track_timing(timer_stats_collector)
def check_null_value_in_dimension_rows(dim_table, exclude_columns=None):
    if os.environ.get("__DSGRID_SKIP_CHECK_NULL_DIMENSION__"):
        # This has intermittently caused GC-related timeouts for TEMPO.
        # Leave a backdoor to skip these checks, which may eventually be removed.
        logger.warning("Skip check_null_value_in_dimension_rows")
        return

    try:
        exclude = {"id"}
        if exclude_columns is not None:
            exclude.update(exclude_columns)
        check_for_nulls(dim_table, exclude_columns=exclude)
    except DSGInvalidField as exc:
        msg = (
            "Invalid dimension mapping application. "
            "Combination of remapped dataset dimensions contain NULL value(s) for "
            f"dimension(s): \n{str(exc)}"
        )
        raise DSGInvalidDimensionMapping(msg)


def handle_dimension_association_errors(
    diff: DataFrame,
    dataset_table: DataFrame,
    dataset_id: str,
) -> None:
    """Record missing dimension record combinations in a Parquet file and log an error."""
    out_file = Path(f"{dataset_id}__missing_dimension_record_combinations.parquet")
    df = write_dataframe(coalesce(diff, 1), out_file, overwrite=True)
    logger.error(
        "Dataset %s is missing required dimension records. Recorded missing records in %s",
        dataset_id,
        out_file,
    )

    # Analyze patterns in missing data to help identify root causes
    try:
        from dsgrid.rust_ext import find_minimal_patterns_from_file

        logger.info("Analyzing missing data patterns for dataset %s...", dataset_id)
        if out_file.is_dir():
            files = list(out_file.glob("*.parquet"))
            assert len(files) == 1, f"Expected 1 file, got {files}"
            filename = files[0]
        else:
            filename = out_file
        patterns = find_minimal_patterns_from_file(
            filename,
            max_depth=0,
            verbose=False,
        )

        if patterns:
            logger.error("Found %d minimal closed patterns in missing data:", len(patterns))
            for pattern in patterns[:10]:  # Show top 10 patterns
                logger.error(
                    " Pattern %d: %s = %s (%d missing rows)",
                    pattern.pattern_id,
                    " | ".join(pattern.columns),
                    " | ".join(pattern.values),
                    pattern.num_rows,
                )
            if len(patterns) > 10:
                logger.error(" ... and %d more patterns", len(patterns) - 10)
        else:
            logger.warning("No closed patterns found in missing data")
    except ImportError:
        logger.warning(
            "Rust pattern analysis not available. Install with: pip install -e . "
            "or build with: maturin develop"
        )
        _look_for_error_contributors(df, dataset_table)
    except Exception as e:
        logger.warning("Failed to analyze missing data patterns: %s", e)

    msg = (
        f"Dataset {dataset_id} is missing required dimension records. "
        "Please look in the log file for more information."
    )
    raise DSGInvalidDataset(msg)


def _look_for_error_contributors(diff: DataFrame, dataset_table: DataFrame) -> None:
    diff_counts = {x: diff.select(x).distinct().count() for x in diff.columns}
    for col in diff.columns:
        dataset_count = dataset_table.select(col).distinct().count()
        if dataset_count != diff_counts[col]:
            logger.error(
                "Error contributor: column=%s dataset_distinct_count=%s missing_distinct_count=%s",
                col,
                dataset_count,
                diff_counts[col],
            )


def is_noop_mapping(records: DataFrame) -> bool:
    """Return True if the mapping is a no-op."""
    return is_dataframe_empty(
        records.filter(
            "(to_id IS NULL and from_id IS NOT NULL) or "
            "(to_id IS NOT NULL and from_id IS NULL) or "
            "(from_id != to_id) or (from_fraction != 1.0)"
        )
    )


def map_time_dimension_with_chronify_duckdb(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    to_time_dim: TimeDimensionBaseConfig,
    scratch_dir_context: ScratchDirContext,
    wrap_time_allowed: bool = False,
    time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
) -> DataFrame:
    """Create a time-mapped table with chronify and DuckDB.
    All operations are performed in memory.
    """
    # This will only work if the source and destination tables will fit in memory.
    # We could potentially use a file-based DuckDB database for larger-than-memory datasets.
    # However, time checks and unpivot operations have failed with out-of-memory errors,
    # and so we have never reached this point.
    # If we solve those problems, this code could be modified.
    src_schema, dst_schema = _get_mapping_schemas(df, value_column, from_time_dim, to_time_dim)
    store = chronify.Store.create_in_memory_db()
    store.ingest_table(df.relation, src_schema, skip_time_checks=True)
    store.map_table_time_config(
        src_schema.name,
        dst_schema,
        wrap_time_allowed=wrap_time_allowed,
        data_adjustment=_to_chronify_time_based_data_adjustment(time_based_data_adjustment),
        scratch_dir=scratch_dir_context.scratch_dir,
    )
    pandas_df = store.read_table(dst_schema.name)
    store.drop_table(dst_schema.name)
    return df.session.createDataFrame(pandas_df)


def convert_time_zone_with_chronify_duckdb(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    time_zone: str,
    scratch_dir_context: ScratchDirContext,
) -> DataFrame:
    """Create a single time zone-converted table with chronify and DuckDB.
    All operations are performed in memory.
    """
    src_schema = _get_src_schema(df, value_column, from_time_dim)
    store = chronify.Store.create_in_memory_db()
    store.ingest_table(df.relation, src_schema, skip_time_checks=True)
    zone_info_tz = ZoneInfo(time_zone)
    dst_schema = store.convert_time_zone(
        src_schema.name,
        zone_info_tz,
        scratch_dir=scratch_dir_context.scratch_dir,
    )
    pandas_df = store.read_table(dst_schema.name)
    store.drop_table(dst_schema.name)
    return df.session.createDataFrame(pandas_df)


def convert_time_zone_by_column_with_chronify_duckdb(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    time_zone_column: str,
    scratch_dir_context: ScratchDirContext,
    wrap_time_allowed: bool = False,
) -> DataFrame:
    """Create a multiple time zone-converted table (based on a time_zone_column)
    using chronify and DuckDB.
    All operations are performed in memory.
    """
    src_schema = _get_src_schema(df, value_column, from_time_dim)
    store = chronify.Store.create_in_memory_db()
    store.ingest_table(df.relation, src_schema, skip_time_checks=True)
    dst_schema = store.convert_time_zone_by_column(
        src_schema.name,
        time_zone_column,
        wrap_time_allowed=wrap_time_allowed,
        scratch_dir=scratch_dir_context.scratch_dir,
    )
    pandas_df = store.read_table(dst_schema.name)
    store.drop_table(dst_schema.name)
    return df.session.createDataFrame(pandas_df)


def map_time_dimension_with_chronify_spark_hive(
    df: DataFrame,
    table_name: str,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    to_time_dim: TimeDimensionBaseConfig,
    scratch_dir_context: ScratchDirContext,
    time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
    wrap_time_allowed: bool = False,
) -> DataFrame:
    """Create a time-mapped table with chronify and Spark and a Hive Metastore.
    The source data must already be stored in the metastore.
    Chronify will store the mapped table in the metastore.
    """
    src_schema, dst_schema = _get_mapping_schemas(
        df, value_column, from_time_dim, to_time_dim, src_name=table_name
    )
    store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
    with store.engine.begin() as conn:
        # This bypasses checks because the table should already be valid.
        store.schema_manager.add_schema(conn, src_schema)
    try:
        store.map_table_time_config(
            src_schema.name,
            dst_schema,
            check_mapped_timestamps=False,
            scratch_dir=scratch_dir_context.scratch_dir,
            wrap_time_allowed=wrap_time_allowed,
            data_adjustment=_to_chronify_time_based_data_adjustment(time_based_data_adjustment),
        )
    finally:
        with store.engine.begin() as conn:
            store.schema_manager.remove_schema(conn, src_schema.name)

    return df.sparkSession.sql(f"SELECT * FROM {dst_schema.name}")


def convert_time_zone_with_chronify_spark_hive(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    time_zone: str,
    scratch_dir_context: ScratchDirContext,
) -> DataFrame:
    """Create a single time zone-converted table with chronify and Spark and a Hive Metastore."""
    src_schema = _get_src_schema(df, value_column, from_time_dim)
    store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
    with store.engine.begin() as conn:
        # This bypasses checks because the table should already be valid.
        store.schema_manager.add_schema(conn, src_schema)
    zone_info_tz = ZoneInfo(time_zone)
    try:
        dst_schema = store.convert_time_zone(
            src_schema.name,
            zone_info_tz,
            scratch_dir=scratch_dir_context.scratch_dir,
        )
    finally:
        with store.engine.begin() as conn:
            store.schema_manager.remove_schema(conn, src_schema.name)

    return df.sparkSession.sql(f"SELECT * FROM {dst_schema.name}")


def convert_time_zone_by_column_with_chronify_spark_hive(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    time_zone_column: str,
    scratch_dir_context: ScratchDirContext,
    wrap_time_allowed: bool = False,
) -> DataFrame:
    """Create a multiple time zone-converted table (based on a time_zone_column)
    using chronify and Spark and a Hive Metastore.
    """
    src_schema = _get_src_schema(df, value_column, from_time_dim)
    store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
    with store.engine.begin() as conn:
        # This bypasses checks because the table should already be valid.
        store.schema_manager.add_schema(conn, src_schema)
    try:
        dst_schema = store.convert_time_zone_by_column(
            src_schema.name,
            time_zone_column,
            wrap_time_allowed=wrap_time_allowed,
            scratch_dir=scratch_dir_context.scratch_dir,
        )
    finally:
        with store.engine.begin() as conn:
            store.schema_manager.remove_schema(conn, src_schema.name)

    return df.sparkSession.sql(f"SELECT * FROM {dst_schema.name}")


def map_time_dimension_with_chronify_spark_path(
    df: DataFrame,
    filename: Path,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    to_time_dim: TimeDimensionBaseConfig,
    scratch_dir_context: ScratchDirContext,
    wrap_time_allowed: bool = False,
    time_based_data_adjustment: TimeBasedDataAdjustmentModel | None = None,
) -> DataFrame:
    """Create a time-mapped table with chronify and Spark using the local filesystem.
    Chronify will store the mapped table in a Parquet file within scratch_dir_context.
    """
    src_schema, dst_schema = _get_mapping_schemas(df, value_column, from_time_dim, to_time_dim)
    store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
    store.create_view_from_parquet(filename, src_schema, bypass_checks=True)
    output_file = scratch_dir_context.get_temp_filename(suffix=".parquet")
    store.map_table_time_config(
        src_schema.name,
        dst_schema,
        check_mapped_timestamps=False,
        scratch_dir=scratch_dir_context.scratch_dir,
        output_file=output_file,
        wrap_time_allowed=wrap_time_allowed,
        data_adjustment=_to_chronify_time_based_data_adjustment(time_based_data_adjustment),
    )
    return df.sparkSession.read.load(str(output_file))


def convert_time_zone_with_chronify_spark_path(
    df: DataFrame,
    filename: Path,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    time_zone: str,
    scratch_dir_context: ScratchDirContext,
) -> DataFrame:
    """Create a single time zone-converted table with chronify and Spark using the local filesystem."""
    src_schema = _get_src_schema(df, value_column, from_time_dim)
    store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
    store.create_view_from_parquet(filename, src_schema, bypass_checks=True)
    output_file = scratch_dir_context.get_temp_filename(suffix=".parquet")
    zone_info_tz = ZoneInfo(time_zone)
    store.convert_time_zone(
        src_schema.name,
        zone_info_tz,
        scratch_dir=scratch_dir_context.scratch_dir,
        output_file=output_file,
    )
    return df.sparkSession.read.load(str(output_file))


def convert_time_zone_by_column_with_chronify_spark_path(
    df: DataFrame,
    filename: Path,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    time_zone_column: str,
    scratch_dir_context: ScratchDirContext,
    wrap_time_allowed: bool = False,
) -> DataFrame:
    """Create a multiple time zone-converted table (based on a time_zone_column)
    using chronify and Spark using the local filesystem.
    """
    src_schema = _get_src_schema(df, value_column, from_time_dim)
    store = chronify.Store.create_new_hive_store(dsgrid.runtime_config.thrift_server_url)
    store.create_view_from_parquet(filename, src_schema, bypass_checks=True)
    output_file = scratch_dir_context.get_temp_filename(suffix=".parquet")
    store.convert_time_zone_by_column(
        src_schema.name,
        time_zone_column,
        wrap_time_allowed=wrap_time_allowed,
        scratch_dir=scratch_dir_context.scratch_dir,
        output_file=output_file,
    )
    return df.sparkSession.read.load(str(output_file))


def _to_chronify_time_based_data_adjustment(
    adj: TimeBasedDataAdjustmentModel | None,
) -> chronify.TimeBasedDataAdjustment | None:
    if adj is None:
        return None
    if (
        adj.daylight_saving_adjustment.spring_forward_hour == DaylightSavingSpringForwardType.NONE
        and adj.daylight_saving_adjustment.fall_back_hour == DaylightSavingFallBackType.NONE
    ):
        chronify_dst_adjustment = chronify.time.DaylightSavingAdjustmentType.NONE
    elif (
        adj.daylight_saving_adjustment.spring_forward_hour == DaylightSavingSpringForwardType.DROP
        and adj.daylight_saving_adjustment.fall_back_hour == DaylightSavingFallBackType.DUPLICATE
    ):
        chronify_dst_adjustment = (
            chronify.time.DaylightSavingAdjustmentType.DROP_SPRING_FORWARD_DUPLICATE_FALLBACK
        )
    elif (
        adj.daylight_saving_adjustment.spring_forward_hour == DaylightSavingSpringForwardType.DROP
        and adj.daylight_saving_adjustment.fall_back_hour == DaylightSavingFallBackType.INTERPOLATE
    ):
        chronify_dst_adjustment = (
            chronify.time.DaylightSavingAdjustmentType.DROP_SPRING_FORWARD_INTERPOLATE_FALLBACK
        )
    else:
        msg = f"dsgrid time_based_data_adjustment = {adj}"
        raise NotImplementedError(msg)

    return chronify.TimeBasedDataAdjustment(
        leap_day_adjustment=adj.leap_day_adjustment.value,
        daylight_saving_adjustment=chronify_dst_adjustment,
    )


def _get_src_schema(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    src_name: str | None = None,
) -> TableSchema:
    src = src_name or "src_" + make_temp_view_name()
    time_col_list = from_time_dim.get_load_data_time_columns()
    time_config = from_time_dim.to_chronify()
    time_array_id_columns = [
        x
        for x in df.columns
        if x in set(df.columns).difference(set(time_col_list)) - {value_column}
    ]
    src_schema = chronify.TableSchema(
        name=src,
        time_config=time_config,
        time_array_id_columns=time_array_id_columns,
        value_column=value_column,
    )
    return src_schema


def _get_dst_schema(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    to_time_dim: TimeDimensionBaseConfig,
) -> TableSchema:
    time_config = to_time_dim.to_chronify()
    time_col_list = from_time_dim.get_load_data_time_columns()
    time_array_id_columns = [
        x
        for x in df.columns
        if x in set(df.columns).difference(set(time_col_list)) - {value_column}
    ]
    dst_schema = chronify.TableSchema(
        name="dst_" + make_temp_view_name(),
        time_config=time_config,
        time_array_id_columns=time_array_id_columns,
        value_column=value_column,
    )
    return dst_schema


def _get_mapping_schemas(
    df: DataFrame,
    value_column: str,
    from_time_dim: TimeDimensionBaseConfig,
    to_time_dim: TimeDimensionBaseConfig,
    src_name: str | None = None,
) -> tuple[TableSchema, TableSchema]:
    src_schema = _get_src_schema(df, value_column, from_time_dim, src_name=src_name)
    dst_schema = _get_dst_schema(df, value_column, from_time_dim, to_time_dim)
    return src_schema, dst_schema


def ordered_subset_columns(df, subset: set[str]) -> list[str]:
    """Return a list of columns in the dataframe that are present in subset."""
    return [x for x in df.columns if x in subset]


def remove_invalid_null_timestamps(df, time_columns, stacked_columns):
    """Remove rows from the dataframe where the time column is NULL and other rows with the
    same dimensions contain valid data.
    """
    assert len(time_columns) == 1, time_columns
    time_column = next(iter(time_columns))
    orig_columns = df.columns
    stacked = list(stacked_columns)
    return (
        join_multiple_columns(
            df,
            count_distinct_on_group_by(df, stacked, time_column, "count_time"),
            stacked,
        )
        .filter(f"{handle_column_spaces(time_column)} IS NOT NULL OR count_time = 0")
        .select(orig_columns)
    )


@track_timing(timer_stats_collector)
def repartition_if_needed_by_mapping(
    df: DataFrame,
    mapping_type: DimensionMappingType,
    scratch_dir_context: ScratchDirContext,
    repartition: bool | None = None,
) -> tuple[DataFrame, Path | None]:
    """Repartition the dataframe if the mapping might cause data skew.

    Parameters
    ----------
    df : DataFrame
        The dataframe to repartition.
    mapping_type : DimensionMappingType
    scratch_dir_context : ScratchDirContext
        The scratch directory context to use for temporary files.
    repartition : bool
        If None, repartition based on the mapping type.
        Otherwise, always repartition if True, or never if False.
    """
    if use_duckdb():
        return df, None

    # We experienced an issue with the IEF buildings dataset where the disaggregation of
    # region to county caused a major issue where one Spark executor thread got stuck,
    # seemingly indefinitely. A message like this was repeated continually.
    # UnsafeExternalSorter: Thread 152 spilling sort data of 4.0 GiB to disk (0 time so far)
    # It appears to be caused by data skew, though the imbalance didn't seem too severe.
    # Using a variation of what online sources call a "salting technique" solves the issue.
    # Apply the technique to mappings that will cause an explosion of rows.
    # Note that this probably isn't needed in all cases and we may need to adjust in the
    # future.

    # Note: log messages below are checked in the tests.
    if repartition or (
        repartition is None
        and mapping_type
        in {
            DimensionMappingType.ONE_TO_MANY_DISAGGREGATION,
            # These cases might be problematic in the future.
            # DimensionMappingType.ONE_TO_MANY_ASSIGNMENT,
            # DimensionMappingType.ONE_TO_MANY_EXPLICIT_MULTIPLIERS,
            # DimensionMappingType.MANY_TO_MANY_DISAGGREGATION,
            # This is usually happening with scenario and hasn't caused a problem.
            # DimensionMappingType.DUPLICATION,
        }
    ):
        filename = scratch_dir_context.get_temp_filename(suffix=".parquet")
        # Salting techniques online talk about adding or modifying a column with random values.
        # We might be able to use one of our value columns. However, there are cases where there
        # could be many instances of zero or null. So, add a new column with random values.
        logger.info("Repartition after mapping %s", mapping_type)
        salted_column = "salted_key"
        spark = get_spark_session()
        num_partitions = int(spark.conf.get("spark.sql.shuffle.partitions"))
        df.withColumn(
            salted_column, (F.rand() * num_partitions).cast(IntegerType()) + 1
        ).repartition(salted_column).write.parquet(str(filename))
        df = read_parquet(filename).drop(salted_column)
        logger.info("Completed repartition.")
        return df, filename

    logger.debug("Repartition is not needed for mapping_type %s", mapping_type)
    return df, None


def unpivot_dataframe(
    df: DataFrame,
    value_columns: Iterable[str],
    variable_column: str,
    time_columns: list[str],
) -> DataFrame:
    """Unpivot the dataframe, accounting for time columns."""
    values = value_columns if isinstance(value_columns, set) else set(value_columns)
    ids = [x for x in df.columns if x != VALUE_COLUMN and x not in values]
    df = unpivot(df, value_columns, variable_column, VALUE_COLUMN)
    cols = set(df.columns).difference(time_columns)
    new_rows = df.filter(f"{VALUE_COLUMN} IS NULL").select(*cols).distinct()
    for col in time_columns:
        new_rows = new_rows.withColumn(col, F.lit(None))

    return (
        df.filter(f"{VALUE_COLUMN} IS NOT NULL")
        .union(new_rows.select(*df.columns))
        .select(*ids, variable_column, VALUE_COLUMN)
    )


def convert_types_if_necessary(df: DataFrame) -> DataFrame:
    """Convert the types of the dataframe if necessary."""
    allowed_int_columns = (
        DimensionType.MODEL_YEAR.value,
        DimensionType.WEATHER_YEAR.value,
    )
    int_types = {IntegerType(), LongType(), ShortType()}
    existing_columns = set(df.columns)
    for column in allowed_int_columns:
        if column in existing_columns and df.schema[column].dataType in int_types:
            df = df.withColumn(column, F.col(column).cast(StringType()))
    return df


def filter_out_expected_missing_associations(
    main_df: DataFrame, missing_df: DataFrame
) -> DataFrame:
    """Filter out rows that are expected to be missing from the main dataframe."""
    missing_columns = [DimensionType.from_column(x).value for x in missing_df.columns]
    spark = get_spark_session()
    main_view = make_temp_view_name()
    assoc_view = make_temp_view_name()
    main_columns = ",".join((f"{main_view}.{x}" for x in main_df.columns))

    main_df.createOrReplaceTempView(main_view)
    missing_df.createOrReplaceTempView(assoc_view)
    join_str = " AND ".join((f"{main_view}.{x} = {assoc_view}.{x}" for x in missing_columns))
    query = f"""
        SELECT {main_columns}
        FROM {main_view}
        ANTI JOIN {assoc_view}
        ON {join_str}
    """
    res = spark.sql(query)
    return res


def split_expected_missing_rows(
    df: DataFrame, time_columns: list[str]
) -> tuple[DataFrame, DataFrame | None]:
    """Split a DataFrame into two if it contains expected missing data."""
    null_df = df.filter(f"{VALUE_COLUMN} IS NULL")
    if is_dataframe_empty(null_df):
        return df, None

    drop_columns = time_columns + [VALUE_COLUMN]
    missing_associations = null_df.drop(*drop_columns)
    return df.filter(f"{VALUE_COLUMN} IS NOT NULL"), missing_associations