dsgrid-toolkit 0.3.3 (cp313-cp313-win_amd64.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/notebooks/registration.ipynb ADDED
@@ -0,0 +1,48 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "fbb09a8a",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from IPython.core.display import HTML\n",
+     "from dsgrid.apps.registration_gui import RegistrationGui\n",
+     "\n",
+     "display(HTML(\"<style>.container { width:100% !important; }</style>\"))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e7e59cb0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "app = RegistrationGui()"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.8.12"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
dsgrid/notebooks/start_notebook.sh ADDED
@@ -0,0 +1,11 @@
+ #!/bin/bash
+ unset XDG_RUNTIME_DIR
+ export SPARK_CLUSTER=$1
+ export DSGRID_LOG_FILE_PATH=`pwd`/$2
+ echo "Spark cluster is running at ${SPARK_CLUSTER}" >&2
+ echo "JADE output directory is ${DSGRID_LOG_FILE_PATH}" >&2
+ mkdir -p $DSGRID_LOG_FILE_PATH
+ jupyter notebook --no-browser --ip=0.0.0.0 --port 8889 &
+ sleep 10
+ echo "Create an ssh tunnel with this command: ssh -L 8889:${HOSTNAME}:8889 ${USER}@el1.hpc.nrel.gov" >&2
+ wait
dsgrid/project.py ADDED
@@ -0,0 +1,451 @@
+ """Interface to a dsgrid project."""
+
+ import json
+ import logging
+
+ from pathlib import Path
+ from semver import VersionInfo
+ from sqlalchemy import Connection
+
+ from dsgrid.common import VALUE_COLUMN
+ from dsgrid.config.project_config import ProjectConfig
+ from dsgrid.dataset.dataset import Dataset
+ from dsgrid.dataset.growth_rates import apply_exponential_growth_rate, apply_annual_multiplier
+ from dsgrid.dimension.base_models import DimensionType, DimensionCategory
+ from dsgrid.dimension.dimension_filters import (
+     DimensionFilterSingleQueryNameBaseModel,
+     SubsetDimensionFilterModel,
+ )
+ from dsgrid.exceptions import DSGInvalidQuery, DSGValueNotRegistered
+ from dsgrid.query.query_context import QueryContext
+ from dsgrid.query.models import (
+     StandaloneDatasetModel,
+     ProjectionDatasetModel,
+     DatasetConstructionMethod,
+     ColumnType,
+ )
+ from dsgrid.registry.dataset_registry_manager import DatasetRegistryManager
+ from dsgrid.registry.dimension_mapping_registry_manager import DimensionMappingRegistryManager
+ from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
+ from dsgrid.utils.files import compute_hash
+ from dsgrid.spark.functions import (
+     is_dataframe_empty,
+ )
+ from dsgrid.spark.types import DataFrame
+ from dsgrid.utils.spark import (
+     read_dataframe,
+     try_read_dataframe,
+     restart_spark_with_custom_conf,
+     write_dataframe_and_auto_partition,
+     get_active_session,
+ )
+ from dsgrid.utils.timing import timer_stats_collector, track_timing, Timer
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Project:
+     """Interface to a dsgrid project."""
+
+     def __init__(
+         self,
+         config: ProjectConfig,
+         version: str,
+         dataset_configs,
+         dimension_mgr: DimensionRegistryManager,
+         dimension_mapping_mgr: DimensionMappingRegistryManager,
+         dataset_mgr: DatasetRegistryManager,
+     ):
+         self._spark = get_active_session()
+         self._config = config
+         self._version = version
+         self._dataset_configs = dataset_configs
+         self._datasets = {}
+         self._dataset_mgr = dataset_mgr
+         self._dimension_mgr = dimension_mgr
+         self._dimension_mapping_mgr = dimension_mapping_mgr
+
+     @property
+     def config(self) -> ProjectConfig:
+         """Returns the ProjectConfig."""
+         return self._config
+
+     @property
+     def dimension_manager(self):
+         return self._dimension_mgr
+
+     @property
+     def dimension_mapping_manager(self):
+         return self._dimension_mapping_mgr
+
+     @property
+     def version(self):
+         """Return the version of the project.
+
+         Returns
+         -------
+         str
+
+         """
+         return self._version
+
+     def is_registered(self, dataset_id):
+         """Provides the status of dataset_id within this project.
+
+         Parameters
+         ----------
+         dataset_id : str
+
+         Returns
+         -------
+         bool
+             True if dataset_id is in this project's config and the dataset has been
+             registered with (successfully submitted to) this project; False if dataset_id
+             is in this project's config but the dataset is not yet available.
+
+         Raises
+         ------
+         DSGValueNotRegistered
+             If dataset_id is not in this project's config.
+         """
+         if dataset_id not in self.list_datasets():
+             msg = f"{dataset_id} is not expected by {self.config.model.project_id}"
+             raise DSGValueNotRegistered(msg)
+
+         return dataset_id in self._dataset_configs
+
+     def get_dataset(self, dataset_id, conn: Connection | None = None) -> Dataset:
+         """Returns a Dataset. Calls load_dataset if it hasn't already been loaded.
+
+         Parameters
+         ----------
+         dataset_id : str
+
+         Returns
+         -------
+         Dataset
+
+         """
+         if dataset_id in self._datasets:
+             dataset = self._datasets[dataset_id]
+         else:
+             dataset = self.load_dataset(dataset_id, conn=conn)
+         return dataset
+
+     def load_dataset(self, dataset_id, conn: Connection | None = None) -> Dataset:
+         """Loads a dataset.
+
+         Parameters
+         ----------
+         dataset_id : str
+
+         Returns
+         -------
+         Dataset
+
+         """
+         if dataset_id not in self._dataset_configs:
+             msg = f"dataset_id={dataset_id} is not registered in the project"
+             raise DSGValueNotRegistered(msg)
+         config = self._dataset_configs[dataset_id]
+         input_dataset = self._config.get_dataset(dataset_id)
+         dataset = Dataset.load(
+             config,
+             self._dimension_mgr,
+             self._dimension_mapping_mgr,
+             self._dataset_mgr.store,
+             mapping_references=input_dataset.mapping_references,
+             conn=conn,
+         )
+         self._datasets[dataset_id] = dataset
+         return dataset
+
+     def unload_dataset(self, dataset_id):
+         """Unloads a dataset.
+
+         Parameters
+         ----------
+         dataset_id : str
+
+         """
+         self._datasets.pop(dataset_id, None)
+
+     def _iter_datasets(self):
+         for dataset in self.config.model.datasets:
+             yield dataset
+
+     def list_datasets(self):
+         return [x.dataset_id for x in self._iter_datasets()]
+
+     @track_timing(timer_stats_collector)
+     def process_query(self, context: QueryContext, cached_datasets_dir: Path) -> dict[str, Path]:
+         """Return a dictionary of dataset_id to dataframe path for all datasets in the query."""
+         self._build_filtered_record_ids_by_dimension_type(context)
+
+         # Note: Store DataFrame filenames instead of objects because the SparkSession will get
+         # restarted for each dataset. The Spark DataFrame keeps a reference to the session that
+         # created it, and so that reference will be invalid.
+         df_filenames = {}
+         for dataset in context.model.project.dataset.source_datasets:
+             if isinstance(dataset, StandaloneDatasetModel):
+                 path = self._process_dataset(context, cached_datasets_dir, dataset.dataset_id)
+             elif isinstance(dataset, ProjectionDatasetModel):
+                 path = self._process_projection_dataset(context, cached_datasets_dir, dataset)
+             else:
+                 msg = f"Unsupported type: {type(dataset)}"
+                 raise NotImplementedError(msg)
+             df_filenames[dataset.dataset_id] = path
+
+         if not df_filenames:
+             logger.warning("No data matched %s", context.model.name)
+
+         return df_filenames
+
+     def _build_filtered_record_ids_by_dimension_type(self, context: QueryContext):
+         record_ids: dict[DimensionType, DataFrame] = {}
+         for dim_filter in context.model.project.dataset.params.dimension_filters:
+             dim_type = dim_filter.dimension_type
+             if dim_type == DimensionType.TIME:
+                 # TODO #196
+                 # This needs to be handled by the dataset handler function _prefilter_time_dimension
+                 msg = "Pre-filtering time is not supported yet"
+                 raise NotImplementedError(msg)
+             if isinstance(dim_filter, SubsetDimensionFilterModel):
+                 df = dim_filter.get_filtered_records_dataframe(self._config.get_dimension).select(
+                     "id"
+                 )
+             else:
+                 query_name = dim_filter.dimension_name
+                 records = self._config.get_dimension_records(query_name)
+                 df = dim_filter.apply_filter(records).select("id")
+                 supp_query_names = set(
+                     self._config.list_dimension_names(category=DimensionCategory.SUPPLEMENTAL)
+                 )
+                 if query_name in supp_query_names:
+                     assert isinstance(dim_filter, DimensionFilterSingleQueryNameBaseModel)
+                     base_query_name = getattr(
+                         context.base_dimension_names, dim_filter.dimension_type.value
+                     )
+                     base_dim = self._config.get_dimension(base_query_name)
+                     supp_dim = self._config.get_dimension(query_name)
+                     mapping_records = self._config.get_base_to_supplemental_mapping_records(
+                         base_dim, supp_dim
+                     )
+                     df = (
+                         mapping_records.join(df, on=mapping_records.to_id == df.id)
+                         .select("from_id")
+                         .withColumnRenamed("from_id", "id")
+                         .distinct()
+                     )
+
+             if dim_type in record_ids:
+                 df = record_ids[dim_type].join(df, "id")
+             if is_dataframe_empty(df):
+                 msg = f"Query filter produced empty records: {dim_filter}"
+                 raise DSGInvalidQuery(msg)
+             record_ids[dim_type] = df
+
+         for dimension_type, ids in record_ids.items():
+             context.set_record_ids_by_dimension_type(dimension_type, ids)
+
+     def _process_dataset(
+         self,
+         context: QueryContext,
+         cached_datasets_dir: Path,
+         dataset_id: str,
+     ) -> Path:
+         """Return a Path to the created DataFrame. Does not return a DataFrame object because
+         the SparkSession will be restarted.
+
+         """
+         logger.info("Start processing query for dataset_id=%s", dataset_id)
+         hash_dir = self._compute_dataset_hash_and_serialize(
+             context, cached_datasets_dir, dataset_id
+         )
+         cached_dataset_path = hash_dir / (dataset_id + ".parquet")
+         metadata_file = cached_dataset_path.with_suffix(".json5")
+         if try_read_dataframe(cached_dataset_path) is None:
+             # An alternative solution is to call custom_spark_conf instead.
+             # That changes some settings without restarting the SparkSession.
+             # Results were not as good with that solution.
+             # Observations on queries with comstock and resstock showed that Spark
+             # used many fewer executors on the second query. That was with a standalone
+             # cluster on Kestrel with dynamic allocation enabled.
+             # We don't understand why that is the case. It may not be an issue with YARN as
+             # the cluster manager on AWS.
+             # Queries on standalone clusters will be easier to debug if we restart the session
+             # for each big job.
+             with restart_spark_with_custom_conf(
+                 conf=context.model.project.get_spark_conf(dataset_id),
+                 force=True,
+             ):
+                 logger.info("Build project-mapped dataset %s", dataset_id)
+                 # Call load_dataset instead of get_dataset because the latter won't be valid here
+                 # after the SparkSession restart.
+                 with self._dimension_mgr.db.engine.connect() as conn:
+                     dataset = self.load_dataset(dataset_id, conn=conn)
+                     with Timer(timer_stats_collector, "build_project_mapped_dataset"):
+                         df = dataset.make_project_dataframe(context, self._config)
+                         context.serialize_dataset_metadata_to_file(
+                             dataset.dataset_id, metadata_file
+                         )
+                         write_dataframe_and_auto_partition(df, cached_dataset_path)
+         else:
+             assert metadata_file.exists(), metadata_file
+             context.set_dataset_metadata_from_file(dataset_id, metadata_file)
+             logger.info("Use cached project-mapped dataset %s", dataset_id)
+
+         logger.info("Finished processing query for dataset_id=%s", dataset_id)
+         return cached_dataset_path
+
+     def _process_projection_dataset(
+         self,
+         context: QueryContext,
+         cached_datasets_dir: Path,
+         dataset: ProjectionDatasetModel,
+     ) -> Path:
+         logger.info(
+             "Apply %s for dataset_id=%s",
+             dataset.construction_method.value,
+             dataset.initial_value_dataset_id,
+         )
+         hash_dir = self._compute_dataset_hash_and_serialize(
+             context, cached_datasets_dir, dataset.dataset_id
+         )
+         cached_dataset_path = hash_dir / (dataset.dataset_id + ".parquet")
+         metadata_file = cached_dataset_path.with_suffix(".json5")
+         if try_read_dataframe(cached_dataset_path) is None:
+             self._build_projection_dataset(
+                 context,
+                 cached_datasets_dir,
+                 dataset,
+                 cached_dataset_path,
+                 metadata_file,
+             )
+         else:
+             assert metadata_file.exists(), metadata_file
+             context.set_dataset_metadata_from_file(dataset.dataset_id, metadata_file)
+             logger.info("Use cached project-mapped dataset %s", dataset.dataset_id)
+
+         return cached_dataset_path
+
+     @track_timing(timer_stats_collector)
+     def _build_projection_dataset(
+         self,
+         context: QueryContext,
+         cached_datasets_dir: Path,
+         dataset: ProjectionDatasetModel,
+         dataset_path: Path,
+         metadata_file: Path,
+     ):
+         def get_myear_column(dataset_id):
+             match context.model.result.column_type:
+                 case ColumnType.DIMENSION_TYPES:
+                     return DimensionType.MODEL_YEAR.value
+                 case ColumnType.DIMENSION_NAMES:
+                     pass
+                 case _:
+                     msg = f"BUG: unhandled {context.model.result.column_type=}"
+                     raise NotImplementedError(msg)
+             names = list(
+                 context.get_dimension_column_names(DimensionType.MODEL_YEAR, dataset_id=dataset_id)
+             )
+             assert len(names) == 1, f"{dataset_id=} {names=}"
+             return names[0]
+
+         iv_path = self._process_dataset(
+             context,
+             cached_datasets_dir,
+             dataset.initial_value_dataset_id,
+         )
+         gr_path = self._process_dataset(
+             context,
+             cached_datasets_dir,
+             dataset.growth_rate_dataset_id,
+         )
+         model_year_column = get_myear_column(dataset.initial_value_dataset_id)
+         model_year_column_gr = get_myear_column(dataset.growth_rate_dataset_id)
+         if model_year_column != model_year_column_gr:
+             msg = (
+                 "BUG: initial_value and growth rate datasets have different model_year columns: "
+                 f"{model_year_column=} {model_year_column_gr=}"
+             )
+             raise Exception(msg)
+         match context.model.result.column_type:
+             case ColumnType.DIMENSION_NAMES:
+                 time_columns = context.get_dimension_column_names(
+                     DimensionType.TIME, dataset_id=dataset.initial_value_dataset_id
+                 )
+             case ColumnType.DIMENSION_TYPES:
+                 dset = self.get_dataset(dataset.initial_value_dataset_id)
+                 time_dim = dset.config.get_time_dimension()
+                 assert time_dim is not None
+                 time_columns = set(time_dim.get_load_data_time_columns())
+             case _:
+                 msg = f"BUG: unhandled {context.model.result.column_type=}"
+                 raise NotImplementedError(msg)
+         with restart_spark_with_custom_conf(
+             conf=context.model.project.get_spark_conf(dataset.dataset_id),
+             force=True,
+         ):
+             logger.info("Build projection dataset %s", dataset.dataset_id)
+             iv_df = read_dataframe(iv_path)
+             gr_df = read_dataframe(gr_path)
+             value_columns = {VALUE_COLUMN}
+             match dataset.construction_method:
+                 case DatasetConstructionMethod.EXPONENTIAL_GROWTH:
+                     df = apply_exponential_growth_rate(
+                         dataset, iv_df, gr_df, time_columns, model_year_column, value_columns
+                     )
+                 case DatasetConstructionMethod.ANNUAL_MULTIPLIER:
+                     df = apply_annual_multiplier(iv_df, gr_df, time_columns, value_columns)
+                 case _:
+                     msg = f"BUG: Unsupported {dataset.construction_method=}"
+                     raise NotImplementedError(msg)
+             df = write_dataframe_and_auto_partition(df, dataset_path)
+
+         time_dim = self._config.get_base_time_dimension()
+         assert time_dim is not None
+         time_columns = time_dim.get_load_data_time_columns()
+         context.set_dataset_metadata(
+             dataset.dataset_id,
+             context.model.result.column_type,
+             time_columns,
+         )
+         context.serialize_dataset_metadata_to_file(dataset.dataset_id, metadata_file)
+
+     def _compute_dataset_hash_and_serialize(
+         self, context: QueryContext, cached_datasets_dir: Path, dataset_id: str
+     ) -> Path:
+         """Create a hash that can be used to identify whether the mapping of the dataset to
+         project dimensions can be skipped based on a previous query.
+
+         If a directory with the hash does not already exist, create it and serialize the content
+         used to create the hash.
+
+         Examples of changes that will invalidate the query:
+         - Bump to project major version number
+         - Change to a dataset version
+         - Change to a project's dimension requirements for a dataset
+         - Change to a dataset dimension mapping
+
+         Returns
+         -------
+         Path
+             Directory based on the hash
+         """
+         dataset_query_info = {
+             "project_id": self._config.model.project_id,
+             "project_major_version": VersionInfo.parse(self._config.model.version).major,
+             "dataset": self._config.get_dataset(dataset_id).model_dump(mode="json"),
+             "dataset_query_params": context.model.project.dataset.params.model_dump(mode="json"),
+         }
+         text = json.dumps(dataset_query_info, indent=2)
+         hash_dir_name = compute_hash(text.encode())
+         hash_dir = cached_datasets_dir / hash_dir_name
+         if not hash_dir.exists():
+             hash_dir.mkdir()
+             model_file = hash_dir / "model.json"
+             model_file.write_text(text)
+         return hash_dir
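
The Project class above is the read interface to a registered dsgrid project. The following is a minimal, hypothetical sketch of how that interface could be exercised; it assumes a Project instance has already been obtained from dsgrid's registry tooling (not part of this diff), and the summarize helper is illustrative only.

from dsgrid.project import Project


def summarize(project: Project) -> None:
    # Print the project version, then report which expected datasets are available.
    print(f"project version: {project.version}")
    for dataset_id in project.list_datasets():
        if project.is_registered(dataset_id):
            # get_dataset() loads the dataset on first access and caches it.
            dataset = project.get_dataset(dataset_id)
            print(f"{dataset_id}: available ({type(dataset).__name__})")
            # Drop the cached Dataset once it is no longer needed.
            project.unload_dataset(dataset_id)
        else:
            print(f"{dataset_id}: expected by the project but not yet submitted")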
dsgrid/query/__init__.py
File without changes
dsgrid/query/dataset_mapping_plan.py ADDED
@@ -0,0 +1,142 @@
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Self
+
+ from pydantic import Field, model_validator
+
+ from dsgrid.data_models import DSGBaseModel
+ from dsgrid.utils.files import compute_hash
+ from dsgrid.utils.utilities import check_uniqueness
+ from dsgrid.config.dimension_mapping_base import DimensionMappingReferenceModel
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class MapOperation(DSGBaseModel):
+     """Defines one mapping operation for a dataset."""
+
+     name: str = Field(
+         description="Identifier for the mapping operation. This must be a unique name.",
+     )
+     handle_data_skew: bool | None = Field(
+         default=None,
+         description="Use a salting technique to handle data skew in this mapping "
+         "operation. Skew can happen when some partitions have significantly more data than "
+         "others, resulting in unbalanced task execution times. "
+         "If this value is None, dsgrid will make its own determination of whether this "
+         "should be done based on the characteristics of the mapping operation. Setting it "
+         "to True or False will override that behavior and inform dsgrid of what to do. "
+         "This will automatically trigger a persist to the filesystem (implicitly setting "
+         "persist to True).",
+     )
+     persist: bool = Field(
+         default=False,
+         description="Persist the intermediate dataset to the filesystem after mapping "
+         "this dimension. This can be useful to prevent the query from becoming too "
+         "large. It can also be useful for benchmarking and debugging purposes.",
+     )
+     mapping_reference: DimensionMappingReferenceModel | None = Field(
+         default=None,
+         description="Reference to the model used to map the dimension. Set at runtime by dsgrid.",
+     )
+
+
+ class MapOperationCheckpoint(DSGBaseModel):
+     """Defines a completed mapping operation that has been persisted to the filesystem."""
+
+     dataset_id: str
+     completed_operation_names: list[str] = Field(
+         description="Names of the completed mapping operations."
+     )
+     persisted_table_filename: Path = Field(description="Path to a persisted file.")
+     mapping_plan_hash: str = Field(
+         description="Hash of the mapping plan. This is used to ensure that the mapping plan "
+         "hasn't changed."
+     )
+     timestamp: datetime = Field(
+         default_factory=datetime.now,
+         description="Timestamp of when the operation was completed.",
+     )
+
+
+ class DatasetMappingPlan(DSGBaseModel):
+     """Defines how to map a dataset to a list of dimensions."""
+
+     dataset_id: str = Field(
+         description="ID of the dataset to be mapped.",
+     )
+     mappings: list[MapOperation] = Field(
+         default=[],
+         description="Defines how to map each dimension of the dataset.",
+     )
+     apply_fraction_op: MapOperation = Field(
+         default=MapOperation(
+             name="apply_fraction_op",
+             handle_data_skew=False,
+             persist=False,
+         ),
+         description="Defines handling of the query that applies the from_fraction value after "
+         "mapping all dimensions.",
+     )
+     apply_scaling_factor_op: MapOperation = Field(
+         default=MapOperation(
+             name="apply_scaling_factor_op",
+             handle_data_skew=False,
+             persist=False,
+         ),
+         description="Defines handling of the query that applies the scaling factor, if one exists. "
+         "This happens after apply_fraction_op.",
+     )
+     convert_units_op: MapOperation = Field(
+         default=MapOperation(
+             name="convert_units_op",
+             handle_data_skew=False,
+             persist=False,
+         ),
+         description="Defines handling of the query that converts units. This happens after "
+         "apply_fraction_op and before mapping time. It is strongly recommended to not persist "
+         "this table because the code currently always persists before mapping time.",
+     )
+     map_time_op: MapOperation = Field(
+         default=MapOperation(
+             name="map_time",
+             handle_data_skew=False,
+             persist=False,
+         ),
+         description="Defines handling of the query that maps the time dimension. This happens after "
+         "convert_units_op. Unlike the other dimension mappings, this does not use the generic "
+         "mapping code. It relies on specific handling in chronify by time type.",
+     )
+     keep_intermediate_files: bool = Field(
+         default=False,
+         description="If True, keep the intermediate tables created during the mapping process. "
+         "This is useful for debugging and benchmarking, but will consume more disk space.",
+     )
+
+     @model_validator(mode="after")
+     def check_names(self) -> Self:
+         names = [x.name for x in self.mappings] + [
+             self.apply_fraction_op.name,
+             self.apply_scaling_factor_op.name,
+             self.convert_units_op.name,
+             self.map_time_op.name,
+         ]
+         check_uniqueness(names, "name")
+         return self
+
+     def list_mapping_operations(self) -> list[MapOperation]:
+         """List all mapping operations in the plan, in order."""
+         return self.mappings + [
+             self.apply_fraction_op,
+             self.apply_scaling_factor_op,
+             self.convert_units_op,
+             self.map_time_op,
+         ]
+
+     def compute_hash(self) -> str:
+         """Compute a hash of the mapping plan."""
+         return compute_hash(
+             bytes(self.model_dump_json(exclude={"keep_intermediate_files"}).encode("utf-8"))
+         )
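
DatasetMappingPlan and MapOperation above are pydantic models that describe how a dataset is mapped to project dimensions and where intermediate results are persisted. The sketch below is illustrative only: it assumes these classes are importable from dsgrid.query.dataset_mapping_plan, as the file listing suggests, and the dataset ID and mapping names are made up.

from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan, MapOperation

plan = DatasetMappingPlan(
    dataset_id="comstock_conus_2022",  # hypothetical dataset ID
    mappings=[
        # Salt this mapping to counter data skew and persist the intermediate table.
        MapOperation(name="geography", handle_data_skew=True, persist=True),
        MapOperation(name="end_use"),
    ],
    keep_intermediate_files=True,
)

# Per-dimension mappings run first, followed by the built-in apply_fraction_op,
# apply_scaling_factor_op, convert_units_op, and map_time operations.
for op in plan.list_mapping_operations():
    print(op.name, op.persist)

# The hash (which excludes keep_intermediate_files) lets dsgrid reuse persisted
# intermediate tables only when the plan itself has not changed.
print(plan.compute_hash())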