dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/config/dataset_config.py
@@ -0,0 +1,907 @@
import logging
from enum import Enum
from pathlib import Path
from typing import Any, Literal, Union

from pydantic import field_validator, model_validator, Field

from dsgrid.common import SCALING_FACTOR_COLUMN, VALUE_COLUMN
from dsgrid.config.common import make_base_dimension_template
from dsgrid.config.dimension_config import (
    DimensionBaseConfig,
    DimensionBaseConfigWithFiles,
)
from dsgrid.config.file_schema import FileSchema
from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
from dsgrid.dataset.models import (
    TableFormat,
    ValueFormat,
)
from dsgrid.dimension.base_models import DimensionType, check_timezone_in_geography
from dsgrid.dimension.time import TimeDimensionType
from dsgrid.exceptions import DSGInvalidDataset, DSGInvalidParameter
from dsgrid.registry.common import check_config_id_strict
from dsgrid.data_models import DSGBaseDatabaseModel, DSGBaseModel, DSGEnum, EnumValue
from dsgrid.exceptions import DSGInvalidDimension
from dsgrid.spark.types import (
    DataFrame,
    F,
)
from dsgrid.utils.spark import get_unique_values, read_dataframe
from dsgrid.utils.utilities import check_uniqueness
from .config_base import ConfigBase
from .dimensions import (
    DimensionsListModel,
    DimensionReferenceModel,
    DimensionModel,
    TimeDimensionBaseModel,
)


# Note that there is special handling for S3 at use sites.
ALLOWED_LOAD_DATA_FILENAMES = ("load_data.parquet", "load_data.csv", "table.parquet")
ALLOWED_LOAD_DATA_LOOKUP_FILENAMES = (
    "load_data_lookup.parquet",
    "lookup_table.parquet",
    # The next two are only used for test data.
    "load_data_lookup.csv",
    "load_data_lookup.json",
)
ALLOWED_DATA_FILES = ALLOWED_LOAD_DATA_FILENAMES + ALLOWED_LOAD_DATA_LOOKUP_FILENAMES
ALLOWED_MISSING_DIMENSION_ASSOCATIONS_FILENAMES = (
    "missing_associations.csv",
    "missing_associations.parquet",
)
MISSING_ASSOCIATIONS_DIR_NAME = "missing_associations"

logger = logging.getLogger(__name__)

def check_load_data_filename(path: str | Path) -> Path:
    """Return the load_data filename in path. Supports Parquet and CSV.

    Parameters
    ----------
    path : str | Path

    Returns
    -------
    Path

    Raises
    ------
    ValueError
        Raised if no supported load data filename exists.

    """
    path_ = path if isinstance(path, Path) else Path(path)
    if str(path_).startswith("s3://"):
        # Only Parquet is supported on AWS.
        return path_ / "/load_data.parquet"

    for allowed_name in ALLOWED_LOAD_DATA_FILENAMES:
        filename = path_ / allowed_name
        if filename.exists():
            return filename

    # Use ValueError because this gets called in Pydantic model validation.
    msg = f"no load_data file exists in {path_}"
    raise ValueError(msg)


def check_load_data_lookup_filename(path: str | Path) -> Path:
    """Return the load_data_lookup filename in path. Supports Parquet, CSV, and JSON.

    Parameters
    ----------
    path : Path

    Returns
    -------
    Path

    Raises
    ------
    ValueError
        Raised if no supported load data lookup filename exists.

    """
    path_ = path if isinstance(path, Path) else Path(path)
    if str(path_).startswith("s3://"):
        # Only Parquet is supported on AWS.
        return path_ / "/load_data_lookup.parquet"

    for allowed_name in ALLOWED_LOAD_DATA_LOOKUP_FILENAMES:
        filename = path_ / allowed_name
        if filename.exists():
            return filename

    # Use ValueError because this gets called in Pydantic model validation.
    msg = f"no load_data_lookup file exists in {path_}"
    raise ValueError(msg)
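
For illustration, a minimal usage sketch (not part of the package source; the directory name is a placeholder). These helpers are also used by get_unique_dimension_record_ids at the bottom of this module to locate a dataset's files:

    from pathlib import Path

    from dsgrid.config.dataset_config import (
        check_load_data_filename,
        check_load_data_lookup_filename,
    )

    # Hypothetical local dataset directory containing load_data.parquet
    # (and load_data_lookup.parquet for a two-table layout).
    dataset_dir = Path("my_dataset")
    load_data_file = check_load_data_filename(dataset_dir)
    lookup_file = check_load_data_lookup_filename(dataset_dir)  # raises ValueError if absent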

class InputDatasetType(DSGEnum):
    MODELED = "modeled"
    HISTORICAL = "historical"
    BENCHMARK = "benchmark"
    UNSPECIFIED = "unspecified"


class DSGDatasetParquetType(DSGEnum):
    """Dataset parquet types."""

    LOAD_DATA = EnumValue(
        value="load_data",
        description="""
        In TWO_TABLE table_format, load_data is a file with ID, timestamp, and metric value columns.
        In ONE_TABLE table_format, load_data is a file with multiple data dimension and metric value columns.
        """,
    )
    LOAD_DATA_LOOKUP = EnumValue(
        value="load_data_lookup",
        description="""
        load_data_lookup is a file with multiple data dimension columns and an ID column that maps to load_data file.
        """,
    )


class DataClassificationType(DSGEnum):
    """Data risk classification type.

    See FIPS 199, https://csrc.nist.gov/files/pubs/fips/199/final/docs/fips-pub-199-final.pdf
    for more information. In general these classifications describe potential impact on
    organizations and individuals. In more detailed schemes a separate classification could
    be applied to confidentiality, integrity, and availability.
    """

    LOW = EnumValue(
        value="low",
        description=(
            "The loss of confidentiality, integrity, or availability could be "
            "expected to have a limited adverse effect on organizational operations, "
            "organizational assets, or individuals."
        ),
    )
    MODERATE = EnumValue(
        value="moderate",
        description=(
            "The loss of confidentiality, integrity, or availability could be expected "
            "to have a serious adverse effect on organizational operations, organizational "
            "assets, or individuals."
        ),
    )


class DatasetQualifierType(str, Enum):
    QUANTITY = "quantity"
    GROWTH_RATE = "growth_rate"


class GrowthRateType(str, Enum):
    EXPONENTIAL_ANNUAL = "exponential_annual"
    EXPONENTIAL_MONTHLY = "exponential_monthly"


class QuantityModel(DSGBaseModel):
    dataset_qualifier_type: Literal[DatasetQualifierType.QUANTITY] = DatasetQualifierType.QUANTITY


class GrowthRateModel(DSGBaseModel):
    dataset_qualifier_type: Literal[
        DatasetQualifierType.GROWTH_RATE
    ] = DatasetQualifierType.GROWTH_RATE
    growth_rate_type: GrowthRateType = Field(
        title="growth_rate_type",
        description="Type of growth rates, e.g., exponential_annual",
    )
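
For illustration, a short sketch (not part of the package source) of the dataset_qualifier discriminated union; DatasetConfigModel below selects between QuantityModel and GrowthRateModel based on dataset_qualifier_type:

    from dsgrid.config.dataset_config import (
        GrowthRateModel,
        GrowthRateType,
        QuantityModel,
    )

    # QuantityModel carries no extra fields; GrowthRateModel requires a growth_rate_type.
    quantity = QuantityModel()
    growth = GrowthRateModel(growth_rate_type=GrowthRateType.EXPONENTIAL_ANNUAL)
    assert quantity.dataset_qualifier_type == "quantity"
    assert growth.dataset_qualifier_type == "growth_rate"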

class UserDataLayout(DSGBaseModel):
    """User-defined data layout for dataset registration."""

    data_file: FileSchema = Field(
        title="data_file",
        description="Defines the data file",
    )
    lookup_data_file: FileSchema | None = Field(
        default=None,
        title="lookup_data_file",
        description="Defines the lookup data file. Required if the table format is 'two_table'.",
    )
    missing_associations: list[str] = Field(
        default=[],
        title="missing_associations",
        description="List of paths to missing associations files (e.g., "
        "missing_associations.parquet) or directories of files containing missing combinations by "
        "dimension type (e.g., geography__subsector.csv, subsector__metric.csv).",
    )
    table_format: TableFormat = Field(
        title="table_format",
        description="Table structure: one_table (all data in single table) or "
        "two_table (time series data separate from lookup metadata).",
    )
    value_format: ValueFormat = Field(
        title="value_format",
        description="Value column format: stacked (single value column) or "
        "pivoted (one dimension's records as columns).",
    )
    pivoted_dimension_type: DimensionType | None = Field(
        default=None,
        title="pivoted_dimension_type",
        description="The dimension type whose records are columns (pivoted) that contain "
        "data values. Required when value_format is 'pivoted'.",
    )

    @model_validator(mode="after")
    def validate_layout(self):
        """Validate data layout consistency."""
        if self.table_format == TableFormat.TWO_TABLE:
            if self.lookup_data_file is None:
                msg = "lookup_data_file is required when table_format is 'two_table'"
                raise ValueError(msg)
        if self.value_format == ValueFormat.PIVOTED:
            if self.pivoted_dimension_type is None:
                msg = "pivoted_dimension_type is required when value_format is 'pivoted'"
                raise ValueError(msg)
        if self.value_format == ValueFormat.STACKED:
            if self.pivoted_dimension_type is not None:
                msg = "pivoted_dimension_type must be None when value_format is 'stacked'"
                raise ValueError(msg)
        return self


class RegistryDataLayout(DSGBaseModel):
    """Data layout stored in the dsgrid registry (without file paths)."""

    table_format: TableFormat = Field(
        title="table_format",
        description="Table structure: one_table or two_table.",
    )
    value_format: ValueFormat = Field(
        title="value_format",
        description="Value column format: stacked or pivoted.",
    )
    pivoted_dimension_type: DimensionType | None = Field(
        default=None,
        title="pivoted_dimension_type",
        description="The dimension type whose records are columns when pivoted.",
    )

    @model_validator(mode="after")
    def validate_layout(self):
        """Validate data layout consistency."""
        if self.value_format == ValueFormat.PIVOTED:
            if self.pivoted_dimension_type is None:
                msg = "pivoted_dimension_type is required when value_format is 'pivoted'"
                raise ValueError(msg)
        if self.value_format == ValueFormat.STACKED:
            if self.pivoted_dimension_type is not None:
                msg = "pivoted_dimension_type must be None when value_format is 'stacked'"
                raise ValueError(msg)
        return self


def user_layout_to_registry_layout(user_layout: UserDataLayout) -> RegistryDataLayout:
    """Convert a UserDataLayout to a RegistryDataLayout for registry storage.

    Parameters
    ----------
    user_layout : UserDataLayout
        The user layout containing file paths and layout settings.

    Returns
    -------
    RegistryDataLayout
        A registry layout without file paths, suitable for database storage.
    """
    return RegistryDataLayout(
        table_format=user_layout.table_format,
        value_format=user_layout.value_format,
        pivoted_dimension_type=user_layout.pivoted_dimension_type,
    )
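
A minimal sketch of the round trip from a user layout to the registry form (illustrative; the file name is a placeholder, and passing a dict for data_file assumes the usual Pydantic coercion into FileSchema):

    from dsgrid.config.dataset_config import UserDataLayout, user_layout_to_registry_layout
    from dsgrid.dataset.models import TableFormat, ValueFormat

    layout = UserDataLayout(
        data_file={"path": "load_data.parquet"},  # placeholder path
        table_format=TableFormat.ONE_TABLE,
        value_format=ValueFormat.STACKED,
    )
    # The registry form keeps the table and value formats but drops the file paths.
    registry_layout = user_layout_to_registry_layout(layout)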

class DatasetConfigModel(DSGBaseDatabaseModel):
    """Represents dataset configurations."""

    dataset_id: str = Field(
        title="dataset_id",
        description="Unique dataset identifier.",
    )
    data_layout: UserDataLayout | None = Field(
        default=None,
        title="data_layout",
        description="Defines the data layout (table format, value format, and file paths) "
        "for dataset registration.",
    )
    registry_data_layout: RegistryDataLayout | None = Field(
        default=None,
        title="registry_data_layout",
        description="Defines the dataset's data layout once stored in the registry.",
    )
    dataset_type: InputDatasetType = Field(
        default=InputDatasetType.UNSPECIFIED,
        title="dataset_type",
        description="Input dataset type.",
        json_schema_extra={
            "options": InputDatasetType.format_for_docs(),
        },
    )
    dataset_qualifier_metadata: Union[QuantityModel, GrowthRateModel] = Field(
        default=QuantityModel(dataset_qualifier_type=DatasetQualifierType.QUANTITY),
        title="dataset_qualifier_metadata",
        description="Additional metadata to include related to the dataset_qualifier",
        discriminator="dataset_qualifier_type",
    )
    description: str | None = Field(
        default=None,
        title="description",
        description="A detailed description of the dataset",
    )
    sector_description: str | None = Field(
        default=None,
        title="sector_description",
        description="Sectoral description (e.g., residential, commercial, industrial, "
        "transportation, electricity)",
    )
    data_source: str | None = Field(
        default=None,
        title="data_source",
        description="Original data source name, e.g. 'ComStock', 'EIA 861'.",
    )
    # for old data, port from origin_date
    data_source_date: str | None = Field(
        default=None,
        title="data_source_date",
        description="Date or year the original source data were published, e.g., '2021' for 'EIA AEO 2021'.",
    )
    # for old data, port from origin_version or drop
    data_source_version: str | None = Field(
        default=None,
        title="data_source_version",
        description=(
            "Source data version, if applicable. For example, could specify preliminary "
            "versus final data."
        ),
    )
    data_source_authors: list[str] | None = Field(
        default=None,
        title="data_source_authors",
        description="List of authors for the original data source.",
    )
    data_source_doi_url: str | None = Field(
        default=None,
        title="data_source_doi_url",
        description="Original data source doi or other url",
    )
    origin_creator: str | None = Field(
        default=None,
        title="origin_creator",
        description="First and last name of the person who formatted this dataset for dsgrid",
    )
    origin_organization: str | None = Field(
        default=None,
        title="origin_organization",
        description="Organization name of the origin_creator, e.g., 'NREL'",
    )
    origin_contributors: list[str] | None = Field(
        default=None,
        title="origin_contributors",
        description=(
            "List of contributors to the compilation of this dataset for dsgrid, "
            " e.g., ['Harry Potter', 'Ronald Weasley']"
        ),
    )
    origin_project: str | None = Field(
        default=None,
        title="origin_project",
        description=(
            "Name of the project for/from which this dataset was compiled, e.g., "
            "'IEF', 'Building Standard Scenarios'."
        ),
    )
    user_defined_metadata: dict[str, Any] = Field(
        title="user_defined_metadata",
        description="Additional user defined metadata fields",
        default={},
    )
    tags: list[str] | None = Field(
        default=None,
        title="tags",
        description="List of data tags",
    )
    # ETH@20251008 - Although we could define a default DataClassificationType,
    # it seems better to default to 'low' by printing in the template, so that
    # the base assumption of low risk is clear to dataset contributors.
    data_classification: DataClassificationType = Field(
        title="data_classification",
        description="Data security classification (e.g., low, moderate).",
        json_schema_extra={
            "options": DataClassificationType.format_for_docs(),
        },
    )
    enable_unit_conversion: bool = Field(
        default=True,
        description="If the dataset uses its dimension mapping for the metric dimension to also "
        "perform unit conversion, then this value should be false.",
    )
    # This field must be listed before dimensions.
    use_project_geography_time_zone: bool = Field(
        default=False,
        description="If true, time zones will be applied from the project's geography dimension. "
        "If false, the dataset's geography dimension records must provide a time zone column.",
    )
    dimensions: DimensionsListModel = Field(
        title="dimensions",
        description="List of dimensions that make up the dimensions of dataset. They will be "
        "automatically registered during dataset registration and then converted "
        "to dimension_references.",
        default=[],
    )
    dimension_references: list[DimensionReferenceModel] = Field(
        title="dimensions",
        description="List of registered dimension references that make up the dimensions of dataset.",
        default=[],
    )
    trivial_dimensions: list[DimensionType] = Field(
        title="trivial_dimensions",
        default=[],
        description="List of trivial dimensions (i.e., 1-element dimensions) that "
        "do not exist in the load_data_lookup. List the dimensions by dimension type. "
        "Trivial dimensions are 1-element dimensions that are not present in the parquet data "
        "columns. Instead they are added by dsgrid as an alias column.",
    )

    @model_validator(mode="after")
    def check_layout_fields(self):
        """Ensure data_layout and registry_data_layout are mutually exclusive."""
        if self.data_layout is not None and self.registry_data_layout is not None:
            msg = "data_layout and registry_data_layout cannot both be set"
            raise ValueError(msg)
        return self

    @field_validator("dataset_id")
    @classmethod
    def check_dataset_id(cls, dataset_id):
        """Check dataset ID validity"""
        check_config_id_strict(dataset_id, "Dataset")
        return dataset_id

    @field_validator("trivial_dimensions")
    @classmethod
    def check_time_not_trivial(cls, trivial_dimensions):
        for dim in trivial_dimensions:
            if dim == DimensionType.TIME:
                msg = "The time dimension is currently not a dsgrid supported trivial dimension."
                raise ValueError(msg)
        return trivial_dimensions

    @field_validator("dimensions")
    @classmethod
    def check_files(cls, values: list) -> list:
        """Validate dimension files are unique across all dimensions"""
        check_uniqueness(
            (
                x.filename
                for x in values
                if isinstance(x, DimensionModel) and x.filename is not None
            ),
            "dimension record filename",
        )
        return values

    @field_validator("dimensions")
    @classmethod
    def check_names(cls, values: list) -> list:
        """Validate dimension names are unique across all dimensions."""
        check_uniqueness(
            [dim.name for dim in values],
            "dimension record name",
        )
        return values

    @model_validator(mode="after")
    def check_time_zone(self) -> "DatasetConfigModel":
        """Validate whether required time zone information is present."""
        geo_requires_time_zone = False
        time_dim = None
        if not self.use_project_geography_time_zone:
            for dimension in self.dimensions:
                if dimension.dimension_type == DimensionType.TIME:
                    assert isinstance(dimension, TimeDimensionBaseModel)
                    geo_requires_time_zone = dimension.is_time_zone_required_in_geography()
                    time_dim = dimension
                    break

        if geo_requires_time_zone:
            for dimension in self.dimensions:
                if dimension.dimension_type == DimensionType.GEOGRAPHY:
                    check_timezone_in_geography(
                        dimension,
                        err_msg=f"Dataset with time dimension {time_dim} requires that its "
                        "geography dimension records include a time_zone column.",
                    )

        return self

def make_unvalidated_dataset_config(
    dataset_id,
    metric_type: str,
    pivoted_dimension_type: DimensionType | None = None,
    data_classification=DataClassificationType.LOW.value,
    dataset_type=InputDatasetType.UNSPECIFIED,
    included_dimensions: list[DimensionType] | None = None,
    time_type: TimeDimensionType | None = None,
    use_project_geography_time_zone: bool = False,
    dimension_references: list[DimensionReferenceModel] | None = None,
    trivial_dimensions: list[DimensionType] | None = None,
    slim: bool = True,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Create a dataset config as a dictionary, skipping validation."""
    trivial_dimensions_ = trivial_dimensions or []
    exclude_dimension_types = {x.dimension_type for x in dimension_references or []}
    if included_dimensions is not None:
        for dim_type in set(DimensionType).difference(included_dimensions):
            exclude_dimension_types.add(dim_type)

    dimensions = make_base_dimension_template(
        [metric_type],
        exclude_dimension_types=exclude_dimension_types,
        time_type=time_type,
    )

    if pivoted_dimension_type is None:
        value_format = ValueFormat.STACKED.value
        pivoted_dim_type_value = None
    else:
        value_format = ValueFormat.PIVOTED.value
        pivoted_dim_type_value = pivoted_dimension_type.value

    result = None
    if slim:
        result = {
            "dataset_id": dataset_id,
            "version": "1.0.0",
            "dataset_type": dataset_type.value,
            "data_layout": {
                "table_format": TableFormat.ONE_TABLE.value,
                "value_format": value_format,
                "pivoted_dimension_type": pivoted_dim_type_value,
                "data_file": {
                    "path": "load_data.parquet",
                },
            },
            "description": "",
            "data_classification": data_classification,
            "use_project_geography_time_zone": use_project_geography_time_zone,
            "dimensions": dimensions,
            "dimension_references": [
                x.model_dump(mode="json") for x in dimension_references or []
            ],
            "trivial_dimensions": [x.value for x in trivial_dimensions_],
        }
    else:
        result = {
            "dataset_id": dataset_id,
            "version": "1.0.0",
            "dataset_type": dataset_type.value,
            "dataset_qualifier_metadata": {
                "dataset_qualifier_type": DatasetQualifierType.QUANTITY.value
            },
            "data_layout": {
                "table_format": TableFormat.ONE_TABLE.value,
                "value_format": value_format,
                "pivoted_dimension_type": pivoted_dim_type_value,
                "data_file": {
                    "path": "load_data.parquet",
                },
            },
            "description": "",
            "sector_description": "",
            "data_source": "",
            "data_source_date": "",
            "data_source_version": "",
            "data_source_authors": [],
            "data_source_doi_url": "",
            "origin_creator": "",
            "origin_organization": "",
            "origin_contributors": [],
            "origin_project": "",
            "user_defined_metadata": {},
            "tags": [],
            "data_classification": data_classification,
            "enable_unit_conversion": True,
            "use_project_geography_time_zone": use_project_geography_time_zone,
            "dimensions": dimensions,
            "dimension_references": [
                x.model_dump(mode="json") for x in dimension_references or []
            ],
            "trivial_dimensions": [x.value for x in trivial_dimensions_],
        }

    if metadata:
        result.update(metadata)

    return result
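
For illustration, a sketch of calling this template helper (hypothetical identifiers; not part of the package source):

    from dsgrid.config.dataset_config import make_unvalidated_dataset_config

    # "example_dataset" and "electricity_use" are placeholder dataset_id and metric_type values.
    config = make_unvalidated_dataset_config("example_dataset", "electricity_use")
    # Returns the "slim" dict shown above, with a one_table/stacked data_layout and a
    # load_data.parquet data_file path, ready to be written to dataset.json5 and edited.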

class DatasetConfig(ConfigBase):
    """Provides an interface to a DatasetConfigModel."""

    def __init__(self, model):
        super().__init__(model)
        self._dimensions = {}  # ConfigKey to DimensionConfig

    @staticmethod
    def config_filename():
        return "dataset.json5"

    @property
    def config_id(self):
        return self._model.dataset_id

    @staticmethod
    def model_class():
        return DatasetConfigModel

    @classmethod
    def load_from_user_path(
        cls,
        config_file: Path,
        data_base_dir: Path | None = None,
        missing_associations_base_dir: Path | None = None,
    ) -> "DatasetConfig":
        """Load a dataset config from a user-provided config file.

        The config file must contain a UserDataLayout with file paths.
        This method validates that all required files exist.

        Parameters
        ----------
        config_file : Path
            Path to the dataset configuration file.
        data_base_dir : Path | None, optional
            Base directory for data files. If set and data file paths are relative,
            prepend them with this path instead of using the config file's parent directory.
        missing_associations_base_dir : Path | None, optional
            Base directory for missing associations files. If set and paths are relative,
            prepend them with this path instead of using the config file's parent directory.

        Returns
        -------
        DatasetConfig

        Raises
        ------
        DSGInvalidParameter
            If the config doesn't have a UserDataLayout or required files don't exist.
        """
        config = cls.load(config_file)

        if not isinstance(config.model.data_layout, UserDataLayout):
            msg = "load_from_user_path requires a UserDataLayout with file paths"
            raise DSGInvalidParameter(msg)
        if config.model.registry_data_layout is not None:
            msg = "load_from_user_path requires registry_data_layout to be None"
            raise DSGInvalidParameter(msg)

        user_layout = config.model.data_layout
        if user_layout.data_file.path is None:
            msg = "load_from_user_path requires data_file.path to be set"
            raise DSGInvalidParameter(msg)

        # Resolve data file path
        data_path = Path(user_layout.data_file.path)
        if not data_path.is_absolute():
            if data_base_dir is not None:
                data_path = (data_base_dir / data_path).resolve()
            else:
                data_path = (config_file.parent / data_path).resolve()
        if str(data_path).startswith("s3://"):
            msg = "Registering a dataset from an S3 path is not supported."
            raise DSGInvalidParameter(msg)
        if not data_path.exists():
            msg = f"Data file does not exist: {data_path}"
            raise DSGInvalidParameter(msg)
        user_layout.data_file.path = str(data_path)

        # Resolve lookup file path
        table_format = config.get_table_format()
        if table_format == TableFormat.TWO_TABLE:
            if user_layout.lookup_data_file is None:
                msg = "Two-table format requires lookup_data_file in data_layout"
                raise DSGInvalidParameter(msg)
            lookup_path = Path(user_layout.lookup_data_file.path)
            if not lookup_path.is_absolute():
                if data_base_dir is not None:
                    lookup_path = (data_base_dir / lookup_path).resolve()
                else:
                    lookup_path = (config_file.parent / lookup_path).resolve()
            if not lookup_path.exists():
                msg = f"Lookup data file does not exist: {lookup_path}"
                raise DSGInvalidParameter(msg)
            user_layout.lookup_data_file.path = str(lookup_path)

        # Resolve missing associations paths
        resolved_missing_paths: list[str] = []
        for missing_assoc in user_layout.missing_associations:
            missing_path = Path(missing_assoc)
            if not missing_path.is_absolute():
                if missing_associations_base_dir is not None:
                    missing_path = (missing_associations_base_dir / missing_path).resolve()
                else:
                    missing_path = (config_file.parent / missing_path).resolve()
            resolved_missing_paths.append(str(missing_path))
        user_layout.missing_associations = resolved_missing_paths

        return config
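
A minimal usage sketch (illustrative paths; assumes a dataset.json5 that declares a UserDataLayout whose data_file path is relative to the config file):

    from pathlib import Path

    from dsgrid.config.dataset_config import DatasetConfig

    # Hypothetical layout on disk:
    #   my_dataset/dataset.json5
    #   my_dataset/load_data.parquet
    config = DatasetConfig.load_from_user_path(Path("my_dataset/dataset.json5"))
    print(config.config_id)              # the dataset_id from the config
    print(config.data_file_schema.path)  # resolved to an absolute path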
    @property
    def has_user_layout(self) -> bool:
        """Return True if this config has a UserDataLayout with file paths."""
        return isinstance(self.model.data_layout, UserDataLayout)

    @property
    def data_file_schema(self) -> FileSchema | None:
        """Return the data file schema if available."""
        if self.model.data_layout is not None:
            return self.model.data_layout.data_file
        return None

    @property
    def lookup_file_schema(self) -> FileSchema | None:
        """Return the lookup file schema if available."""
        if self.model.data_layout is not None:
            return self.model.data_layout.lookup_data_file
        return None

    @property
    def missing_associations_paths(self) -> list[Path]:
        """Return the list of missing associations paths if available."""
        if self.model.data_layout is not None:
            return [Path(p) for p in self.model.data_layout.missing_associations]
        return []

    def update_dimensions(self, dimensions):
        """Update all dataset dimensions."""
        self._dimensions.update(dimensions)

    @property
    def dimensions(self):
        return self._dimensions

    def get_dimension(self, dimension_type: DimensionType) -> DimensionBaseConfig | None:
        """Return the dimension matching dimension_type."""
        for dim_config in self.dimensions.values():
            if dim_config.model.dimension_type == dimension_type:
                return dim_config
        return None

    def get_time_dimension(self) -> TimeDimensionBaseConfig | None:
        """Return the time dimension of the dataset."""
        dim = self.get_dimension(DimensionType.TIME)
        assert dim is None or isinstance(dim, TimeDimensionBaseConfig)
        return dim

    def get_dimension_with_records(
        self, dimension_type: DimensionType
    ) -> DimensionBaseConfigWithFiles | None:
        """Return the dimension matching dimension_type."""
        for dim_config in self.dimensions.values():
            if dim_config.model.dimension_type == dimension_type and isinstance(
                dim_config, DimensionBaseConfigWithFiles
            ):
                return dim_config
        return None

    def get_pivoted_dimension_type(self) -> DimensionType | None:
        """Return the table's pivoted dimension type or None if the table isn't pivoted."""
        if self.get_value_format() != ValueFormat.PIVOTED:
            return None
        if self.model.data_layout is not None:
            return self.model.data_layout.pivoted_dimension_type
        if self.model.registry_data_layout is not None:
            return self.model.registry_data_layout.pivoted_dimension_type
        return None

    def get_pivoted_dimension_columns(self) -> list[str]:
        """Return the table's pivoted dimension columns or an empty list if the table isn't
        pivoted.
        """
        if self.get_value_format() != ValueFormat.PIVOTED:
            return []
        dim_type = self.get_pivoted_dimension_type()
        if dim_type is None:
            return []
        dim = self.get_dimension_with_records(dim_type)
        assert dim is not None
        return sorted(list(dim.get_unique_ids()))

    def get_value_columns(self) -> list[str]:
        """Return the table's columns that contain values."""
        match self.get_value_format():
            case ValueFormat.PIVOTED:
                return self.get_pivoted_dimension_columns()
            case ValueFormat.STACKED:
                return [VALUE_COLUMN]
            case _:
                raise NotImplementedError(str(self.get_value_format()))

    def get_table_format(self) -> TableFormat:
        """Return the table format (one_table or two_table)."""
        if self.model.data_layout is not None:
            return self.model.data_layout.table_format
        if self.model.registry_data_layout is not None:
            return self.model.registry_data_layout.table_format
        msg = "Neither data_layout nor registry_data_layout is set"
        raise DSGInvalidDataset(msg)

    def get_value_format(self) -> ValueFormat:
        """Return the value format (stacked or pivoted)."""
        if self.model.data_layout is not None:
            return self.model.data_layout.value_format
        if self.model.registry_data_layout is not None:
            return self.model.registry_data_layout.value_format
        msg = "Neither data_layout nor registry_data_layout is set"
        raise DSGInvalidDataset(msg)

    def add_trivial_dimensions(self, df: DataFrame):
        """Add trivial 1-element dimensions to load_data_lookup."""
        for dim in self._dimensions.values():
            if dim.model.dimension_type in self.model.trivial_dimensions:
                self._check_trivial_record_length(dim.model.records)
                val = dim.model.records[0].id
                col = dim.model.dimension_type.value
                df = df.withColumn(col, F.lit(val))
        return df

    def remove_trivial_dimensions(self, df):
        trivial_cols = {d.value for d in self.model.trivial_dimensions}
        select_cols = [col for col in df.columns if col not in trivial_cols]
        return df[select_cols]

    def _check_trivial_record_length(self, records):
        """Check that trivial dimensions have only 1 record."""
        if len(records) > 1:
            msg = f"Trivial dimensions must have only 1 record but {len(records)} records found for dimension: {records}"
            raise DSGInvalidDimension(msg)

def get_unique_dimension_record_ids(
    path: Path,
    table_format: TableFormat,
    pivoted_dimension_type: DimensionType | None,
    time_columns: set[str],
) -> dict[DimensionType, list[str]]:
    """Get the unique dimension record IDs from a table."""
    if table_format == TableFormat.TWO_TABLE:
        ld = read_dataframe(check_load_data_filename(path))
        lk = read_dataframe(check_load_data_lookup_filename(path))
        df = ld.join(lk, on="id").drop("id")
    elif table_format == TableFormat.ONE_TABLE:
        ld_path = check_load_data_filename(path)
        df = read_dataframe(ld_path)
    else:
        msg = f"Unsupported table format: {table_format}"
        raise NotImplementedError(msg)

    ids_by_dimension_type: dict[DimensionType, list[str]] = {}
    for dimension_type in DimensionType:
        if dimension_type.value in df.columns:
            ids_by_dimension_type[dimension_type] = sorted(
                get_unique_values(df, dimension_type.value)
            )
    if pivoted_dimension_type is not None:
        if pivoted_dimension_type.value in df.columns:
            msg = f"{pivoted_dimension_type=} cannot be in the dataframe columns."
            raise DSGInvalidParameter(msg)
        dimension_type_columns = {x.value for x in DimensionType}
        dimension_type_columns.update(time_columns)
        dimension_type_columns.update({"id", SCALING_FACTOR_COLUMN})
        pivoted_columns = set(df.columns) - dimension_type_columns
        ids_by_dimension_type[pivoted_dimension_type] = sorted(pivoted_columns)

    return ids_by_dimension_type