dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/notebooks/registration.ipynb
ADDED

@@ -0,0 +1,48 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbb09a8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.display import HTML\n",
    "from dsgrid.apps.registration_gui import RegistrationGui\n",
    "\n",
    "display(HTML(\"<style>.container { width:100% !important; }</style>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7e59cb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "app = RegistrationGui()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

dsgrid/notebooks/start_notebook.sh
ADDED

@@ -0,0 +1,11 @@
#!/bin/bash
unset XDG_RUNTIME_DIR
export SPARK_CLUSTER=$1
export DSGRID_LOG_FILE_PATH=`pwd`/$2
echo "Spark cluster is running at ${SPARK_CLUSTER}" >&2
echo "JADE output directory is ${DSGRID_LOG_FILE_PATH}" >&2
mkdir -p $DSGRID_LOG_FILE_PATH
jupyter notebook --no-browser --ip=0.0.0.0 --port 8889 &
sleep 10
echo "Create an ssh tunnel with this command: ssh -L 8889:${HOSTNAME}:8889 ${USER}@el1.hpc.nrel.gov" >&2
wait

dsgrid/project.py
ADDED

@@ -0,0 +1,451 @@
"""Interface to a dsgrid project."""

import json
import logging

from pathlib import Path
from semver import VersionInfo
from sqlalchemy import Connection

from dsgrid.common import VALUE_COLUMN
from dsgrid.config.project_config import ProjectConfig
from dsgrid.dataset.dataset import Dataset
from dsgrid.dataset.growth_rates import apply_exponential_growth_rate, apply_annual_multiplier
from dsgrid.dimension.base_models import DimensionType, DimensionCategory
from dsgrid.dimension.dimension_filters import (
    DimensionFilterSingleQueryNameBaseModel,
    SubsetDimensionFilterModel,
)
from dsgrid.exceptions import DSGInvalidQuery, DSGValueNotRegistered
from dsgrid.query.query_context import QueryContext
from dsgrid.query.models import (
    StandaloneDatasetModel,
    ProjectionDatasetModel,
    DatasetConstructionMethod,
    ColumnType,
)
from dsgrid.registry.dataset_registry_manager import DatasetRegistryManager
from dsgrid.registry.dimension_mapping_registry_manager import DimensionMappingRegistryManager
from dsgrid.registry.dimension_registry_manager import DimensionRegistryManager
from dsgrid.utils.files import compute_hash
from dsgrid.spark.functions import (
    is_dataframe_empty,
)
from dsgrid.spark.types import DataFrame
from dsgrid.utils.spark import (
    read_dataframe,
    try_read_dataframe,
    restart_spark_with_custom_conf,
    write_dataframe_and_auto_partition,
    get_active_session,
)
from dsgrid.utils.timing import timer_stats_collector, track_timing, Timer


logger = logging.getLogger(__name__)


class Project:
    """Interface to a dsgrid project."""

    def __init__(
        self,
        config: ProjectConfig,
        version: str,
        dataset_configs,
        dimension_mgr: DimensionRegistryManager,
        dimension_mapping_mgr: DimensionMappingRegistryManager,
        dataset_mgr: DatasetRegistryManager,
    ):
        self._spark = get_active_session()
        self._config = config
        self._version = version
        self._dataset_configs = dataset_configs
        self._datasets = {}
        self._dataset_mgr = dataset_mgr
        self._dimension_mgr = dimension_mgr
        self._dimension_mapping_mgr = dimension_mapping_mgr

    @property
    def config(self) -> ProjectConfig:
        """Returns the ProjectConfig."""
        return self._config

    @property
    def dimension_manager(self):
        return self._dimension_mgr

    @property
    def dimension_mapping_manager(self):
        return self._dimension_mapping_mgr

    @property
    def version(self):
        """Return the version of the project.

        Returns
        -------
        str

        """
        return self._version

    def is_registered(self, dataset_id):
        """Provides the status of dataset_id within this project.

        Parameters
        ----------
        dataset_id : str

        Returns
        -------
        bool
            True if dataset_id is in this project's config and the dataset has been
            registered with (successfully submitted to) this project; False if dataset_id
            is in this project's config but the dataset is not yet available.

        Throws
        ------
        DSGValueNotRegistered
            If dataset_id is not in this project's config.
        """
        if dataset_id not in self.list_datasets():
            msg = f"{dataset_id} is not expected by {self.config.model.project_id}"
            raise DSGValueNotRegistered(msg)

        return dataset_id in self._dataset_configs

    def get_dataset(self, dataset_id, conn: Connection | None = None) -> Dataset:
        """Returns a Dataset. Calls load_dataset if it hasn't already been loaded.

        Parameters
        ----------
        dataset_id : str

        Returns
        -------
        Dataset

        """
        if dataset_id in self._datasets:
            dataset = self._datasets[dataset_id]
        else:
            dataset = self.load_dataset(dataset_id, conn=conn)
        return dataset

    def load_dataset(self, dataset_id, conn: Connection | None = None) -> Dataset:
        """Loads a dataset.

        Parameters
        ----------
        dataset_id : str

        Returns
        -------
        Dataset

        """
        if dataset_id not in self._dataset_configs:
            msg = f"dataset_id={dataset_id} is not registered in the project"
            raise DSGValueNotRegistered(msg)
        config = self._dataset_configs[dataset_id]
        input_dataset = self._config.get_dataset(dataset_id)
        dataset = Dataset.load(
            config,
            self._dimension_mgr,
            self._dimension_mapping_mgr,
            self._dataset_mgr.store,
            mapping_references=input_dataset.mapping_references,
            conn=conn,
        )
        self._datasets[dataset_id] = dataset
        return dataset

    def unload_dataset(self, dataset_id):
        """Unloads a dataset.

        Parameters
        ----------
        dataset_id : str

        """
        self._datasets.pop(dataset_id, None)

    def _iter_datasets(self):
        for dataset in self.config.model.datasets:
            yield dataset

    def list_datasets(self):
        return [x.dataset_id for x in self._iter_datasets()]

    @track_timing(timer_stats_collector)
    def process_query(self, context: QueryContext, cached_datasets_dir: Path) -> dict[str, Path]:
        """Return a dictionary of dataset_id to dataframe path for all datasets in the query."""
        self._build_filtered_record_ids_by_dimension_type(context)

        # Note: Store DataFrame filenames instead of objects because the SparkSession will get
        # restarted for each dataset. The Spark DataFrame keeps a reference to the session that
        # created it, and so that reference will be invalid.
        df_filenames = {}
        for dataset in context.model.project.dataset.source_datasets:
            if isinstance(dataset, StandaloneDatasetModel):
                path = self._process_dataset(context, cached_datasets_dir, dataset.dataset_id)
            elif isinstance(dataset, ProjectionDatasetModel):
                path = self._process_projection_dataset(context, cached_datasets_dir, dataset)
            else:
                msg = f"Unsupported type: {type(dataset)}"
                raise NotImplementedError(msg)
            df_filenames[dataset.dataset_id] = path

        if not df_filenames:
            logger.warning("No data matched %s", context.model.name)

        return df_filenames

    def _build_filtered_record_ids_by_dimension_type(self, context: QueryContext):
        record_ids: dict[DimensionType, DataFrame] = {}
        for dim_filter in context.model.project.dataset.params.dimension_filters:
            dim_type = dim_filter.dimension_type
            if dim_type == DimensionType.TIME:
                # TODO #196
                # This needs to handled by the dataset handler function _prefilter_time_dimension
                msg = "Pre-filtering time is not supported yet"
                raise NotImplementedError(msg)
            if isinstance(dim_filter, SubsetDimensionFilterModel):
                df = dim_filter.get_filtered_records_dataframe(self._config.get_dimension).select(
                    "id"
                )
            else:
                query_name = dim_filter.dimension_name
                records = self._config.get_dimension_records(query_name)
                df = dim_filter.apply_filter(records).select("id")
                supp_query_names = set(
                    self._config.list_dimension_names(category=DimensionCategory.SUPPLEMENTAL)
                )
                if query_name in supp_query_names:
                    assert isinstance(dim_filter, DimensionFilterSingleQueryNameBaseModel)
                    base_query_name = getattr(
                        context.base_dimension_names, dim_filter.dimension_type.value
                    )
                    base_dim = self._config.get_dimension(base_query_name)
                    supp_dim = self._config.get_dimension(query_name)
                    mapping_records = self._config.get_base_to_supplemental_mapping_records(
                        base_dim, supp_dim
                    )
                    df = (
                        mapping_records.join(df, on=mapping_records.to_id == df.id)
                        .select("from_id")
                        .withColumnRenamed("from_id", "id")
                        .distinct()
                    )

            if dim_type in record_ids:
                df = record_ids[dim_type].join(df, "id")
            if is_dataframe_empty(df):
                msg = f"Query filter produced empty records: {dim_filter}"
                raise DSGInvalidQuery(msg)
            record_ids[dim_type] = df

        for dimension_type, ids in record_ids.items():
            context.set_record_ids_by_dimension_type(dimension_type, ids)

    def _process_dataset(
        self,
        context: QueryContext,
        cached_datasets_dir: Path,
        dataset_id: str,
    ) -> Path:
        """Return a Path to the created DataFrame. Does not return a DataFrame object because
        the SparkSession will be restarted.

        """
        logger.info("Start processing query for dataset_id=%s", dataset_id)
        hash_dir = self._compute_dataset_hash_and_serialize(
            context, cached_datasets_dir, dataset_id
        )
        cached_dataset_path = hash_dir / (dataset_id + ".parquet")
        metadata_file = cached_dataset_path.with_suffix(".json5")
        if try_read_dataframe(cached_dataset_path) is None:
            # An alternative solution is to call custom_spark_conf instead.
            # That changes some settings without restarting the SparkSession.
            # Results were not as good with that solution.
            # Observations on queries with comstock and resstock showed that Spark
            # used many fewer executors on the second query. That was with a standalone
            # cluster on Kestrel with dynamic allocation enabled.
            # We don't understand why that is the case. It may not be an issue with YARN as
            # the cluster manager on AWS.
            # Queries on standalone clusters will be easier to debug if we restart the session
            # for each big job.
            with restart_spark_with_custom_conf(
                conf=context.model.project.get_spark_conf(dataset_id),
                force=True,
            ):
                logger.info("Build project-mapped dataset %s", dataset_id)
                # Call load_dataset instead of get_dataset because the latter won't be valid here
                # after the SparkSession restart.
                with self._dimension_mgr.db.engine.connect() as conn:
                    dataset = self.load_dataset(dataset_id, conn=conn)
                    with Timer(timer_stats_collector, "build_project_mapped_dataset"):
                        df = dataset.make_project_dataframe(context, self._config)
                        context.serialize_dataset_metadata_to_file(
                            dataset.dataset_id, metadata_file
                        )
                        write_dataframe_and_auto_partition(df, cached_dataset_path)
        else:
            assert metadata_file.exists(), metadata_file
            context.set_dataset_metadata_from_file(dataset_id, metadata_file)
            logger.info("Use cached project-mapped dataset %s", dataset_id)

        logger.info("Finished processing query for dataset_id=%s", dataset_id)
        return cached_dataset_path

    def _process_projection_dataset(
        self,
        context: QueryContext,
        cached_datasets_dir: Path,
        dataset: ProjectionDatasetModel,
    ) -> Path:
        logger.info(
            "Apply %s for dataset_id=%s",
            dataset.construction_method.value,
            dataset.initial_value_dataset_id,
        )
        hash_dir = self._compute_dataset_hash_and_serialize(
            context, cached_datasets_dir, dataset.dataset_id
        )
        cached_dataset_path = hash_dir / (dataset.dataset_id + ".parquet")
        metadata_file = cached_dataset_path.with_suffix(".json5")
        if try_read_dataframe(cached_dataset_path) is None:
            self._build_projection_dataset(
                context,
                cached_datasets_dir,
                dataset,
                cached_dataset_path,
                metadata_file,
            )
        else:
            assert metadata_file.exists(), metadata_file
            context.set_dataset_metadata_from_file(dataset.dataset_id, metadata_file)
            logger.info("Use cached project-mapped dataset %s", dataset.dataset_id)

        return cached_dataset_path

    @track_timing(timer_stats_collector)
    def _build_projection_dataset(
        self,
        context: QueryContext,
        cached_datasets_dir: Path,
        dataset: ProjectionDatasetModel,
        dataset_path: Path,
        metadata_file: Path,
    ):
        def get_myear_column(dataset_id):
            match context.model.result.column_type:
                case ColumnType.DIMENSION_TYPES:
                    return DimensionType.MODEL_YEAR.value
                case ColumnType.DIMENSION_NAMES:
                    pass
                case _:
                    msg = f"BUG: unhandled {context.model.result.column_type=}"
                    raise NotImplementedError(msg)
            names = list(
                context.get_dimension_column_names(DimensionType.MODEL_YEAR, dataset_id=dataset_id)
            )
            assert len(names) == 1, f"{dataset_id=} {names=}"
            return names[0]

        iv_path = self._process_dataset(
            context,
            cached_datasets_dir,
            dataset.initial_value_dataset_id,
        )
        gr_path = self._process_dataset(
            context,
            cached_datasets_dir,
            dataset.growth_rate_dataset_id,
        )
        model_year_column = get_myear_column(dataset.initial_value_dataset_id)
        model_year_column_gr = get_myear_column(dataset.growth_rate_dataset_id)
        if model_year_column != model_year_column_gr:
            msg = (
                "BUG: initial_value and growth rate datasets have different model_year columns: "
                f"{model_year_column=} {model_year_column_gr=}"
            )
            raise Exception(msg)
        match context.model.result.column_type:
            case ColumnType.DIMENSION_NAMES:
                time_columns = context.get_dimension_column_names(
                    DimensionType.TIME, dataset_id=dataset.initial_value_dataset_id
                )
            case ColumnType.DIMENSION_TYPES:
                dset = self.get_dataset(dataset.initial_value_dataset_id)
                time_dim = dset.config.get_time_dimension()
                assert time_dim is not None
                time_columns = set(time_dim.get_load_data_time_columns())
            case _:
                msg = f"BUG: unhandled {context.model.result.column_type=}"
                raise NotImplementedError(msg)
        with restart_spark_with_custom_conf(
            conf=context.model.project.get_spark_conf(dataset.dataset_id),
            force=True,
        ):
            logger.info("Build projection dataset %s", dataset.dataset_id)
            iv_df = read_dataframe(iv_path)
            gr_df = read_dataframe(gr_path)
            value_columns = {VALUE_COLUMN}
            match dataset.construction_method:
                case DatasetConstructionMethod.EXPONENTIAL_GROWTH:
                    df = apply_exponential_growth_rate(
                        dataset, iv_df, gr_df, time_columns, model_year_column, value_columns
                    )
                case DatasetConstructionMethod.ANNUAL_MULTIPLIER:
                    df = apply_annual_multiplier(iv_df, gr_df, time_columns, value_columns)
                case _:
                    msg = f"BUG: Unsupported {dataset.construction_method=}"
                    raise NotImplementedError(msg)
            df = write_dataframe_and_auto_partition(df, dataset_path)

            time_dim = self._config.get_base_time_dimension()
            assert time_dim is not None
            time_columns = time_dim.get_load_data_time_columns()
            context.set_dataset_metadata(
                dataset.dataset_id,
                context.model.result.column_type,
                time_columns,
            )
            context.serialize_dataset_metadata_to_file(dataset.dataset_id, metadata_file)

    def _compute_dataset_hash_and_serialize(
        self, context: QueryContext, cached_datasets_dir: Path, dataset_id: str
    ) -> Path:
        """Create a hash that can be used to identify whether the mapping of the dataset to
        project dimensions can be skipped based on a previous query.

        If a directory with the hash does not already exist, create it and serialize the content
        used to create the hash.

        Examples of changes that will invalidate the query:
        - Bump to project major version number
        - Change to a dataset version
        - Change to a project's dimension requirements for a dataset
        - Change to a dataset dimension mapping

        Returns
        -------
        str
            Directory based on the hash
        """
        dataset_query_info = {
            "project_id": self._config.model.project_id,
            "project_major_version": VersionInfo.parse(self._config.model.version).major,
            "dataset": self._config.get_dataset(dataset_id).model_dump(mode="json"),
            "dataset_query_params": context.model.project.dataset.params.model_dump(mode="json"),
        }
        text = json.dumps(dataset_query_info, indent=2)
        hash_dir_name = compute_hash(text.encode())
        hash_dir = cached_datasets_dir / hash_dir_name
        if not hash_dir.exists():
            hash_dir.mkdir()
            model_file = hash_dir / "model.json"
            model_file.write_text(text)
        return hash_dir

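The caching logic above is the part of project.py most worth understanding before reading the query modules: _compute_dataset_hash_and_serialize derives a directory name from a hash of everything that defines a project-mapped dataset, so a repeated query can reuse the cached Parquet output instead of re-mapping. The sketch below is a minimal standalone illustration of that idea only; it assumes hashlib.sha256 as a stand-in for dsgrid.utils.files.compute_hash (whose implementation is not shown in this diff), and the project and dataset values are hypothetical.

import hashlib
import json
from pathlib import Path


def cache_dir_for(dataset_query_info: dict, cached_datasets_dir: Path) -> Path:
    # Serialize the inputs that define the project-mapped dataset ...
    text = json.dumps(dataset_query_info, indent=2)
    # ... and hash them. Any change (project major version, dataset version,
    # dimension requirements, dimension mappings) produces a new directory,
    # which means a cache miss for the query.
    hash_dir = cached_datasets_dir / hashlib.sha256(text.encode()).hexdigest()
    if not hash_dir.exists():
        hash_dir.mkdir(parents=True)
        # Keep the serialized inputs next to the cached Parquet for inspection.
        (hash_dir / "model.json").write_text(text)
    return hash_dir


if __name__ == "__main__":
    info = {
        "project_id": "example_project",  # hypothetical values
        "project_major_version": 1,
        "dataset": {"dataset_id": "example_dataset", "version": "1.0.0"},
        "dataset_query_params": {},
    }
    print(cache_dir_for(info, Path("cached_datasets")))

Note that dsgrid also writes dataset metadata (the .json5 file next to the cached Parquet) so that a cache hit can restore the QueryContext state without rebuilding the dataframe.
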
dsgrid/query/__init__.py
ADDED

File without changes

dsgrid/query/dataset_mapping_plan.py
ADDED

@@ -0,0 +1,142 @@
import logging
from datetime import datetime
from pathlib import Path
from typing import Self

from pydantic import Field, model_validator

from dsgrid.data_models import DSGBaseModel
from dsgrid.utils.files import compute_hash
from dsgrid.utils.utilities import check_uniqueness
from dsgrid.config.dimension_mapping_base import DimensionMappingReferenceModel


logger = logging.getLogger(__name__)


class MapOperation(DSGBaseModel):
    """Defines one mapping operation for a dataset."""

    name: str = Field(
        description="Identifier for the mapping operation. This must be a unique name.",
    )
    handle_data_skew: bool | None = Field(
        default=None,
        description="Use a salting technique to handle data skew in this mapping "
        "operation. Skew can happen when some partitions have significantly more data than "
        "others, resulting in unbalanced task execution times. "
        "If this value is None, dsgrid will make its own determination of whether this "
        "should be done based on the characteristics of the mapping operation. Setting it "
        "to True or False will override that behavior and inform dsgrid of what to do. "
        "This will automatically trigger a persist to the filesystem (implicitly setting "
        "persist to True).",
    )
    persist: bool = Field(
        default=False,
        description="Persist the intermediate dataset to the filesystem after mapping "
        "this dimension. This can be useful to prevent the query from becoming too "
        "large. It can also be useful for benchmarking and debugging purposes.",
    )
    mapping_reference: DimensionMappingReferenceModel | None = Field(
        default=None,
        description="Reference to the model used to map the dimension. Set at runtime by dsgrid.",
    )


class MapOperationCheckpoint(DSGBaseModel):
    """Defines a completed mapping operation that has been persisted to the filesystem."""

    dataset_id: str
    completed_operation_names: list[str] = Field(
        description="Names of the completed mapping operations."
    )
    persisted_table_filename: Path = Field(description="Path to a persisted file.")
    mapping_plan_hash: str = Field(
        description="Hash of the mapping plan. This is used to ensure that the mapping plan "
        "hasn't changed."
    )
    timestamp: datetime = Field(
        default_factory=datetime.now,
        description="Timestamp of when the operation was completed.",
    )


class DatasetMappingPlan(DSGBaseModel):
    """Defines how to map a dataset to a list of dimensions."""

    dataset_id: str = Field(
        description="ID of the dataset to be mapped.",
    )
    mappings: list[MapOperation] = Field(
        default=[],
        description="Defines how to map each dimension of the dataset.",
    )
    apply_fraction_op: MapOperation = Field(
        default=MapOperation(
            name="apply_fraction_op",
            handle_data_skew=False,
            persist=False,
        ),
        description="Defines handling of the query that applies the from_fraction value after "
        "mapping all dimensions.",
    )
    apply_scaling_factor_op: MapOperation = Field(
        default=MapOperation(
            name="apply_scaling_factor_op",
            handle_data_skew=False,
            persist=False,
        ),
        description="Defines handling of the query that applies the scaling factor, if one exists. "
        "This happens after apply_fraction_op.",
    )
    convert_units_op: MapOperation = Field(
        default=MapOperation(
            name="convert_units_op",
            handle_data_skew=False,
            persist=False,
        ),
        description="Defines handling of the query that converts units. This happens after "
        "apply_fraction_op and before mapping time. It is strongly recommended to not persist "
        "this table because the code currently always persists before mapping time.",
    )
    map_time_op: MapOperation = Field(
        default=MapOperation(
            name="map_time",
            handle_data_skew=False,
            persist=False,
        ),
        description="Defines handling of the query that maps the time dimension. This happens after "
        "convert_units_op. Unlike the other dimension mappings, this does not use the generic "
        "mapping code. It relies on specific handling in chronify by time type.",
    )
    keep_intermediate_files: bool = Field(
        default=False,
        description="If True, keep the intermediate tables created during the mapping process. "
        "This is useful for debugging and benchmarking, but will consume more disk space.",
    )

    @model_validator(mode="after")
    def check_names(self) -> Self:
        names = [x.name for x in self.mappings] + [
            self.apply_fraction_op.name,
            self.apply_scaling_factor_op.name,
            self.convert_units_op.name,
            self.map_time_op.name,
        ]
        check_uniqueness(names, "name")
        return self

    def list_mapping_operations(self) -> list[MapOperation]:
        """List all mapping operations in the plan, in order."""
        return self.mappings + [
            self.apply_fraction_op,
            self.apply_scaling_factor_op,
            self.convert_units_op,
            self.map_time_op,
        ]

    def compute_hash(self) -> str:
        """Compute a hash of the mapping plan."""
        return compute_hash(
            bytes(self.model_dump_json(exclude={"keep_intermediate_files"}).encode("utf-8"))
        )

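DatasetMappingPlan describes, per dataset, the ordered dimension-mapping operations plus the built-in fraction, scaling-factor, unit-conversion, and time-mapping steps. For orientation, here is a hedged usage sketch; it assumes the import path dsgrid.query.dataset_mapping_plan implied by the file listing above, and the dataset ID and per-dimension operation names are hypothetical.

from dsgrid.query.dataset_mapping_plan import DatasetMappingPlan, MapOperation

# Per-dimension operations for one dataset; every operation name must be unique.
plan = DatasetMappingPlan(
    dataset_id="example_dataset",
    mappings=[
        # Salt a skewed geography mapping and persist the intermediate table.
        MapOperation(name="map_geography", handle_data_skew=True, persist=True),
        # Defaults: let dsgrid decide on skew handling, no persist.
        MapOperation(name="map_sector"),
    ],
)

# The check_names validator rejects duplicates, including collisions with the
# built-in operations (apply_fraction_op, apply_scaling_factor_op,
# convert_units_op, map_time).
print([op.name for op in plan.list_mapping_operations()])

# keep_intermediate_files is excluded from the hash, so toggling it for debugging
# does not change the plan's identity.
print(plan.compute_hash())

The hash printed here is what MapOperationCheckpoint.mapping_plan_hash stores, which is how dsgrid detects that a persisted intermediate table no longer matches the plan that produced it.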