dsgrid_toolkit-0.3.3-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
--- /dev/null
+++ dsgrid/query/query_context.py
@@ -0,0 +1,287 @@
+import logging
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+from dsgrid.dataset.models import (
+    ValueFormat,
+    StackedTableFormatModel,
+    PivotedTableFormatModel,
+)
+from dsgrid.common import VALUE_COLUMN
+from dsgrid.dimension.base_models import DimensionType
+from dsgrid.config.project_config import DatasetBaseDimensionNamesModel
+from dsgrid.dataset.dataset_mapping_manager import DatasetMappingManager
+from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan, MapOperationCheckpoint
+from dsgrid.spark.functions import drop_temp_tables_and_views
+from dsgrid.spark.types import DataFrame
+from dsgrid.utils.spark import get_spark_session
+from dsgrid.utils.scratch_dir_context import ScratchDirContext
+from .models import ColumnType, DatasetMetadataModel, DimensionMetadataModel, QueryBaseModel
+
+
+logger = logging.getLogger(__name__)
+
+
+class QueryContext:
+    """Maintains context of the query as it is processed through the stack."""
+
+    def __init__(
+        self,
+        model: QueryBaseModel,
+        base_dimension_names: DatasetBaseDimensionNamesModel,
+        scratch_dir_context: ScratchDirContext,
+        checkpoint: MapOperationCheckpoint | None = None,
+    ) -> None:
+        self._model = model
+        self._record_ids_by_dimension_type: dict[DimensionType, list[tuple[str]]] = {}
+        self._metadata = DatasetMetadataModel(
+            table_format=self.model.result.table_format,
+            base_dimension_names=base_dimension_names,
+        )
+        self._dataset_metadata: dict[str, DatasetMetadataModel] = {}
+        self._scratch_dir_context = scratch_dir_context
+        self._checkpoint = checkpoint
+
+    @property
+    def metadata(self) -> DatasetMetadataModel:
+        return self._metadata
+
+    @metadata.setter
+    def metadata(self, val: DatasetMetadataModel) -> None:
+        self._metadata = val
+
+    @property
+    def model(self) -> QueryBaseModel:
+        return self._model
+
+    @property
+    def base_dimension_names(self) -> DatasetBaseDimensionNamesModel:
+        return self._metadata.base_dimension_names
+
+    @property
+    def scratch_dir_context(self) -> ScratchDirContext:
+        """Return the context for managing scratch directories."""
+        return self._scratch_dir_context
+
+    def consolidate_dataset_metadata(self) -> None:
+        for dim_type in DimensionType:
+            main_metadata = self._metadata.dimensions.get_metadata(dim_type)
+            main_metadata.clear()
+            keys = set()
+            for dataset_metadata in self._dataset_metadata.values():
+                for metadata in dataset_metadata.dimensions.get_metadata(dim_type):
+                    key = metadata.make_key()
+                    if key not in keys:
+                        main_metadata.append(metadata)
+                        keys.add(key)
+
+    def finalize(self) -> None:
+        """Perform cleanup."""
+        drop_temp_tables_and_views()
+
+    def get_value_columns(self) -> set[str]:
+        """Return the value columns in the final dataset."""
+        match self.get_value_format():
+            case ValueFormat.PIVOTED:
+                return self.get_pivoted_columns()
+            case ValueFormat.STACKED:
+                return {VALUE_COLUMN}
+            case _:
+                msg = str(self.get_value_format())
+                raise NotImplementedError(msg)
+
+    def get_pivoted_columns(self) -> set[str]:
+        if self.get_value_format() != ValueFormat.PIVOTED:
+            msg = "Bug: get_pivoted_columns is only supported on a pivoted table"
+            raise Exception(msg)
+        metadata = self._get_metadata()
+        assert isinstance(metadata.table_format, PivotedTableFormatModel)
+        return self.get_dimension_column_names(metadata.table_format.pivoted_dimension_type)
+
+    def get_pivoted_dimension_type(self) -> DimensionType:
+        if self.get_value_format() != ValueFormat.PIVOTED:
+            msg = "Bug: get_pivoted_dimension_type is only supported on a pivoted table"
+            raise Exception(msg)
+        metadata = self._get_metadata()
+        assert isinstance(metadata.table_format, PivotedTableFormatModel)
+        return metadata.table_format.pivoted_dimension_type
+
+    def get_value_format(self, dataset_id: str | None = None) -> ValueFormat:
+        val = self._get_metadata(dataset_id).table_format.format_type
+        if not isinstance(val, ValueFormat):
+            val = ValueFormat(val)
+        return val
+
+    def set_value_format(self, val: ValueFormat) -> None:
+        if not isinstance(val, ValueFormat):
+            val = ValueFormat(val)
+        self._metadata.table_format.format_type = val
+
+    def get_dimension_column_names(
+        self, dimension_type: DimensionType, dataset_id: str | None = None
+    ) -> set[str]:
+        """Return the load data column names for the dimension."""
+        return self._get_metadata(dataset_id).dimensions.get_column_names(dimension_type)
+
+    def get_all_dimension_column_names(
+        self, dataset_id: str | None = None, exclude: set[DimensionType] | None = None
+    ) -> set[str]:
+        names = set()
+        for dimension_type in DimensionType:
+            if exclude is not None and dimension_type in exclude:
+                continue
+            names.update(self.get_dimension_column_names(dimension_type, dataset_id=dataset_id))
+        return names
+
+    def get_dimension_names(
+        self, dimension_type: DimensionType, dataset_id: str | None = None
+    ) -> set[str]:
+        return self._get_metadata(dataset_id).dimensions.get_dimension_names(dimension_type)
+
+    def get_all_dimension_names(
+        self, dataset_id: str | None = None, exclude: set[DimensionType] | None = None
+    ) -> set[str]:
+        names = set()
+        for dimension_type in DimensionType:
+            if exclude is not None and dimension_type in exclude:
+                continue
+            names.update(self.get_dimension_names(dimension_type, dataset_id=dataset_id))
+        return names
+
+    def set_dataset_metadata(
+        self,
+        dataset_id: str,
+        column_type: ColumnType,
+        mapped_time_columns: list[str],
+    ) -> None:
+        table_format = StackedTableFormatModel()
+        self._dataset_metadata[dataset_id] = DatasetMetadataModel(table_format=table_format)
+        base_dimension_names = self.base_dimension_names
+        for dim_type in DimensionType:
+            name = getattr(base_dimension_names, dim_type.value)
+            assert name is not None
+            match (column_type, dim_type):
+                case (ColumnType.DIMENSION_TYPES, DimensionType.TIME):
+                    column_names = mapped_time_columns
+                case (ColumnType.DIMENSION_NAMES, _):
+                    column_names = [name]
+                case (ColumnType.DIMENSION_TYPES, _):
+                    column_names = [dim_type.value]
+                case _:
+                    msg = f"Bug: need to support {column_type=} {dim_type=}"
+                    raise NotImplementedError(msg)
+            self.add_dimension_metadata(
+                dim_type,
+                DimensionMetadataModel(dimension_name=name, column_names=column_names),
+                dataset_id=dataset_id,
+            )
+
+    def convert_to_pivoted(self) -> str:
+        assert isinstance(self.model.result.table_format, PivotedTableFormatModel)
+        pivoted_dimension_type = self.model.result.table_format.pivoted_dimension_type
+        self.set_value_format(ValueFormat.PIVOTED)
+        columns = self.get_dimension_column_names(pivoted_dimension_type)
+        names = self.get_dimension_names(pivoted_dimension_type)
+        if len(columns) != 1 or len(names) != 1:
+            # This is checked in the query model and so this should never happen.
+            msg = (
+                "Bug: The pivoted dimension can only have 1 column and 1 name: "
+                f"{columns=} {names=}"
+            )
+            raise Exception(msg)
+        return next(iter(columns))
+
+    def serialize_dataset_metadata_to_file(self, dataset_id: str, filename: Path) -> None:
+        filename.write_text(self._dataset_metadata[dataset_id].model_dump_json(indent=2))
+
+    def set_dataset_metadata_from_file(self, dataset_id: str, filename: Path) -> None:
+        assert dataset_id not in self._dataset_metadata, dataset_id
+        self._dataset_metadata[dataset_id] = DatasetMetadataModel.from_file(filename)
+
+    def add_dimension_metadata(
+        self,
+        dimension_type: DimensionType,
+        dimension_metadata: DimensionMetadataModel,
+        dataset_id=None,
+    ) -> None:
+        self._get_metadata(dataset_id).dimensions.add_metadata(dimension_type, dimension_metadata)
+        logger.debug(
+            "Added dimension name for %s: %s dataset_id=%s",
+            dimension_type,
+            dimension_metadata,
+            dataset_id,
+        )
+
+    def get_dimension_column_names_by_name(
+        self,
+        dimension_type: DimensionType,
+        name: str,
+        dataset_id: str | None = None,
+    ) -> list[str]:
+        """Return the load data column names for the dimension."""
+        for metadata in self.get_dimension_metadata(dimension_type, dataset_id=dataset_id):
+            if metadata.dimension_name == name:
+                return metadata.column_names
+        msg = f"No dimension match: {dimension_type=} {name=}"
+        raise Exception(msg)
+
+    def get_dimension_metadata(
+        self,
+        dimension_type: DimensionType,
+        dataset_id: str | None = None,
+    ) -> list[DimensionMetadataModel]:
+        return self._get_metadata(dataset_id).dimensions.get_metadata(dimension_type)
+
+    def replace_dimension_metadata(
+        self,
+        dimension_type: DimensionType,
+        dimension_metadata: list[DimensionMetadataModel],
+        dataset_id: str | None = None,
+    ) -> None:
+        self._get_metadata(dataset_id).dimensions.replace_metadata(
+            dimension_type, dimension_metadata
+        )
+        logger.debug(
+            "Replaced dimension for %s: %s dataset_id=%s",
+            dimension_type,
+            dimension_metadata,
+            dataset_id,
+        )
+
+    def _get_metadata(self, dataset_id: str | None = None) -> DatasetMetadataModel:
+        return self._metadata if dataset_id is None else self._dataset_metadata[dataset_id]
+
+    def get_record_ids(self) -> dict[DimensionType, DataFrame]:
+        spark = get_spark_session()
+        return {
+            k: spark.createDataFrame(v, ["id"])
+            for k, v in self._record_ids_by_dimension_type.items()
+        }
+
+    def try_get_record_ids_by_dimension_type(self, dim_type: DimensionType) -> DataFrame | None:
+        records = self._record_ids_by_dimension_type.get(dim_type)
+        if records is None:
+            return records
+
+        spark = get_spark_session()
+        return spark.createDataFrame(records, [dim_type.value])
+
+    def set_record_ids_by_dimension_type(
+        self, dim_type: DimensionType, record_ids: DataFrame
+    ) -> None:
+        # Can't keep the dataframes in memory because of spark restarts.
+        self._record_ids_by_dimension_type[dim_type] = [(x.id,) for x in record_ids.collect()]
+
+    @contextmanager
+    def dataset_mapping_manager(
+        self, dataset_id: str, plan: DatasetMappingPlan
+    ) -> Generator[DatasetMappingManager, None, None]:
+        """Start a mapping manager for a dataset."""
+        checkpoint = (
+            self._checkpoint
+            if self._checkpoint is not None and self._checkpoint.dataset_id == dataset_id
+            else None
+        )
+        with DatasetMappingManager(dataset_id, plan, self._scratch_dir_context, checkpoint) as mgr:
+            yield mgr