dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,1462 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import logging
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Annotated, Any, Generator, Iterable, Type
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from pydantic import field_validator, model_validator, Field
|
|
9
|
+
|
|
10
|
+
from dsgrid.config.common import make_base_dimension_template
|
|
11
|
+
from dsgrid.config.dataset_config import DatasetConfig
|
|
12
|
+
from dsgrid.config.dimension_config import (
|
|
13
|
+
DimensionBaseConfig,
|
|
14
|
+
DimensionBaseConfigWithFiles,
|
|
15
|
+
)
|
|
16
|
+
from dsgrid.config.mapping_tables import MappingTableConfig
|
|
17
|
+
from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
|
|
18
|
+
from dsgrid.data_models import DSGBaseModel, DSGBaseDatabaseModel, make_model_config
|
|
19
|
+
from dsgrid.dimension.base_models import (
|
|
20
|
+
check_required_dimensions,
|
|
21
|
+
check_timezone_in_geography,
|
|
22
|
+
DimensionCategory,
|
|
23
|
+
DimensionType,
|
|
24
|
+
)
|
|
25
|
+
from dsgrid.dimension.time import TimeDimensionType
|
|
26
|
+
from dsgrid.exceptions import (
|
|
27
|
+
DSGInvalidDataset,
|
|
28
|
+
DSGInvalidField,
|
|
29
|
+
DSGInvalidDimension,
|
|
30
|
+
DSGInvalidOperation,
|
|
31
|
+
DSGInvalidParameter,
|
|
32
|
+
DSGValueNotRegistered,
|
|
33
|
+
)
|
|
34
|
+
from dsgrid.registry.common import (
|
|
35
|
+
ConfigKey,
|
|
36
|
+
ProjectRegistryStatus,
|
|
37
|
+
DatasetRegistryStatus,
|
|
38
|
+
check_config_id_strict,
|
|
39
|
+
)
|
|
40
|
+
from dsgrid.spark.types import (
|
|
41
|
+
DataFrame,
|
|
42
|
+
)
|
|
43
|
+
from dsgrid.utils.scratch_dir_context import ScratchDirContext
|
|
44
|
+
from dsgrid.utils.spark import (
|
|
45
|
+
cross_join_dfs,
|
|
46
|
+
create_dataframe_from_product,
|
|
47
|
+
)
|
|
48
|
+
from dsgrid.utils.timing import timer_stats_collector, track_timing
|
|
49
|
+
from dsgrid.utils.utilities import check_uniqueness
|
|
50
|
+
from dsgrid.config.config_base import ConfigBase
|
|
51
|
+
from dsgrid.config.dataset_config import InputDatasetType
|
|
52
|
+
from dsgrid.config.supplemental_dimension import SupplementalDimensionModel
|
|
53
|
+
from dsgrid.config.dimension_mapping_base import DimensionMappingReferenceModel
|
|
54
|
+
from dsgrid.config.dimensions import (
|
|
55
|
+
DimensionsListModel,
|
|
56
|
+
DimensionReferenceModel,
|
|
57
|
+
DimensionModel,
|
|
58
|
+
)
|
|
59
|
+
from dsgrid.dimension.time import (
|
|
60
|
+
TimeBasedDataAdjustmentModel,
|
|
61
|
+
DaylightSavingSpringForwardType,
|
|
62
|
+
DaylightSavingFallBackType,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# Module-level logger named after this module, per standard logging convention.
logger = logging.getLogger(__name__)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class SubsetDimensionSelectorModel(DSGBaseModel):
    """Defines a subset dimension selector inclusive of the subset's records and information
    required to define the selector as a record within the supplemental dimension defined by the
    subset dimension group.
    """

    # Selector name; unique within the project (enforced by DimensionsModel.check_dimension_names).
    name: str
    # Human-readable description of what this selector represents.
    description: str
    column_values: dict[str, str] = Field(
        title="column_values",
        description="Optional columns to populate in the subset dimension group's supplemental "
        "dimension records table. For example, if each selector in the group defines the end "
        "uses for one sector (e.g., commercial_end_uses, transportation_end_uses), the "
        "supplemental dimension records table needs to define the 'fuel_id' and 'unit' fields of "
        "the EnergyEndUse data model.",
        default={},
    )
    # Internal field: filled by SubsetDimensionGroupModel.load_records from the parent group's
    # records file. The dsgrid_internal flag presumably hides it from user-facing docs — confirm.
    records: list[str] = Field(
        title="records",
        description="Table of values populated by reading the parent subset dimension records "
        "file. Should not be populated by the user.",
        default=[],
        json_schema_extra={
            "dsgrid_internal": True,
        },
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class SubsetDimensionGroupModel(DSGBaseModel):
    """Defines one or more subset dimension selectors for a dimension type.

    During initial registration the records file named by ``filename`` is loaded and its
    per-selector record lists are distributed to the individual selectors (see
    :meth:`load_records`); afterwards ``filename`` is cleared.
    """

    # Group name; also registered as a dimension name (must be unique in the project).
    name: str
    description: str
    dimension_type: DimensionType = Field(
        title="dimension_type",
        alias="type",
        description="Type of the dimension",
        json_schema_extra={
            "options": DimensionType.format_for_docs(),
        },
    )
    filename: str | None = Field(
        default=None,
        title="filename",
        alias="file",
        description="Filename containing dimension records. Only populated for initial "
        "registration. Each selector's records are stored as JSON objects in the dsgrid registry.",
    )
    selectors: list[SubsetDimensionSelectorModel] = Field(
        title="selectors",
        description="Dimension selectors",
    )
    # Auto-populated by dsgrid during registration; not supplied by the user.
    # BUGFIX: title was copy-pasted as "selectors"; corrected to match the field name.
    selector_references: list[DimensionReferenceModel] = Field(
        title="selector_references",
        description="References to the subset dimensions generated by dsgrid during registration.",
        default=[],
    )
    create_supplemental_dimension: bool = Field(
        title="create_supplemental_dimension",
        # BUGFIX: fixed typo "aggregrations" in the user-facing description.
        description="Auto-generate supplemental dimensions in order to allow aggregations on "
        "the subsets.",
        default=True,
    )
    base_dimension_name: str | None = Field(
        default=None,
        title="base_dimension_name",
        description="Name of base dimension for the supplemental dimension mapping, if "
        "create_supplemental_dimension is true. Required if there are multiple base dimensions "
        "for this type.",
    )
    # Union of all record IDs read from the records file; populated by load_records.
    # (Pydantic v2 deep-copies mutable defaults per instance, so the shared literal is safe.)
    record_ids: set[str] = set()

    @field_validator("selectors")
    @classmethod
    def check_selectors(cls, selectors: list[SubsetDimensionSelectorModel]):
        """Check that the selectors are defined consistently.

        All selectors in a group must declare the same set of column_values keys so that they
        can share one supplemental dimension records table.
        """
        if len(selectors) > 1:
            first = sorted(selectors[0].column_values.keys())
            for selector in selectors[1:]:
                columns = sorted(selector.column_values.keys())
                if columns != first:
                    msg = f"All selectors must define the same columns: {first=} {columns=}"
                    raise ValueError(msg)

        return selectors

    @model_validator(mode="after")
    def load_records(self) -> "SubsetDimensionGroupModel":
        """Load the records for each subset dimension selector.

        No-op when filename is unset (i.e., the model was loaded from the registry rather than
        from a user file). Raises ValueError if the selector names do not exactly match the
        column names in the records file.
        """
        if self.filename is None:
            return self

        record_ids, mappings = load_subset_dimensions(Path(self.filename))
        self.record_ids.update(record_ids)
        selector_names = check_uniqueness(
            [x.name for x in self.selectors], "subset dimension selector"
        )

        # Selector names and records-file columns must match one-to-one.
        diff = selector_names.symmetric_difference(mappings)
        if diff:
            msg = f"subset dimension {self.name} selectors have a mismatch with the records file column names: {diff}"
            raise ValueError(msg)

        for dim in self.selectors:
            dim.records = mappings[dim.name]

        # Clear so the file is only consumed once; records now live on the selectors.
        self.filename = None
        return self
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class SubsetDimensionGroupListModel(DSGBaseModel):
    """Defines a list of subset dimensions."""

    # At least one group is required (min_length=1).
    subset_dimensions: Annotated[list[SubsetDimensionGroupModel], Field(min_length=1)] = Field(
        description="List of subset dimensions to be registered",
    )
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class DimensionsModel(DSGBaseModel):
    """Contains dimensions defined by a project.

    Base dimensions may be given either inline (``base_dimensions``, registered during project
    registration) or as registry references (``base_dimension_references``); at least one of the
    two must be non-empty (see :meth:`pre_check_values`).
    """

    base_dimensions: DimensionsListModel = Field(
        title="base_dimensions",
        description="List of dimensions for a project's base dimensions. They will be "
        "automatically registered during project registration and then converted to "
        "base_dimension_references.",
        default=[],
    )
    # BUGFIX: title was copy-pasted as "base_dimensions"; corrected to match the field name.
    base_dimension_references: list[DimensionReferenceModel] = Field(
        title="base_dimension_references",
        description="List of registry references (``DimensionReferenceModel``) for a project's "
        "base dimensions.",
        default=[],
    )
    subset_dimensions: list[SubsetDimensionGroupModel] = Field(
        title="subset_dimensions",
        description="List of subset dimension groups. "
        "Subset dimension groups are used to specify subsets of base dimension records that a "
        "dataset must support, dimensionality of derived datasets, and query filters. "
        "Subset dimension groups also define a new supplemental dimension whose records "
        "correspond to the table columns/subset selectors, such that defining a subset "
        "dimension group can be a convenient way to define reporting at a different level of "
        "aggregation as compared to the project's base dimensions.",
        default=[],
    )
    supplemental_dimensions: list[SupplementalDimensionModel] = Field(
        title="supplemental_dimensions",
        # BUGFIX: removed stray period ("registered. during") and fixed typo "disgaggregations"
        # in the user-facing description.
        description="List of supplemental dimensions. They will be automatically registered "
        "during project registration and then converted to supplemental_dimension_references. "
        "Supplemental dimensions are used to support additional querying and transformations "
        "(e.g., aggregations, disaggregations, filtering, scaling, etc.) of the project's "
        "base data.",
        default=[],
    )
    supplemental_dimension_references: list[DimensionReferenceModel] = Field(
        title="supplemental_dimension_references",
        description="List of registry references for a project's supplemental dimensions. "
        "Dimensions references of the same :class:`dsgrid.dimensions.base_model.DimensionType` "
        "are allowed for supplemental dimension references (i.e., multiple `Geography` types "
        "are allowed).",
        default=[],
    )

    @model_validator(mode="after")
    def check_dimensions(self) -> "DimensionsModel":
        """Validate that the dimensions are complete and consistent."""
        # Inline dimensions and references together must cover all required dimension types.
        dimensions = itertools.chain(self.base_dimensions, self.base_dimension_references)
        check_required_dimensions(dimensions, "project base dimensions")
        return self

    @model_validator(mode="before")
    @classmethod
    def pre_check_values(cls, values: dict) -> dict:
        """Checks that base dimensions are defined."""
        if not values.get("base_dimensions", []) and not values.get(
            "base_dimension_references", []
        ):
            msg = "Either base_dimensions or base_dimension_references must be defined"
            raise ValueError(msg)

        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_files(cls, values: list) -> list:
        """Validate dimension files are unique across all dimensions."""
        check_uniqueness(
            (
                x.filename
                for x in values
                # Only DimensionModel instances carry record files; time dimensions do not.
                if isinstance(x, DimensionModel) and x.filename is not None
            ),
            "dimension record filename",
        )
        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_names(cls, values: list) -> list:
        """Validate dimension names are unique across all dimensions."""
        check_uniqueness(
            [dim.name for dim in values],
            "dimension record name",
        )
        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_time_zone(cls, values: list) -> list:
        """Validate the time zone column in geography records."""
        for dimension in values:
            if dimension.dimension_type == DimensionType.GEOGRAPHY:
                check_timezone_in_geography(
                    dimension,
                    err_msg="Project geography dimension records must include a time_zone column",
                )
        return values

    @field_validator("subset_dimensions")
    @classmethod
    def check_subset_dimensions(cls, subset_dimensions):
        """Check that each subset dimension has a unique name."""
        check_uniqueness([x.name for x in subset_dimensions], "subset dimensions name")
        return subset_dimensions

    @model_validator(mode="after")
    def check_dimension_names(self) -> "DimensionsModel":
        """Check that all dimension query names are unique.

        Uniqueness is enforced across base dimensions, supplemental dimensions, subset
        dimension group names, and every selector within each group.
        """
        names: set[str] = set()

        def add_name(name):
            if name in names:
                msg = f"dimension_name={name} is not unique in the project"
                raise ValueError(msg)
            names.add(name)

        for dim in self.base_dimensions:
            add_name(dim.name)
        for dim in self.supplemental_dimensions:
            add_name(dim.name)
        for group in self.subset_dimensions:
            add_name(group.name)
            for selector in group.selectors:
                add_name(selector.name)

        return self
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
class RequiredSubsetDimensionRecordsModel(DSGBaseModel):
    # Requirement expressed against a project subset dimension group.
    name: str = Field(description="Name of a subset dimension")
    selectors: list[str] = Field(description="One or more selectors in the subset dimension")
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
class RequiredSupplementalDimensionRecordsModel(DSGBaseModel):
    # NOTE(review): supplemental requirements are deprecated — see
    # RequiredDimensionRecordsByTypeModel.handle_legacy_format, which strips "supplemental"
    # keys from old files. This model appears to be kept for backwards compatibility.
    name: str = Field(description="Name of a supplemental dimension")
    record_ids: list[str] = Field(
        description="One or more record IDs in the supplemental dimension"
    )
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
class RequiredBaseDimensionModel(DSGBaseModel):
    # Record IDs required from (or, for base_missing usage, absent from) a base dimension.
    # ["__all__"] is a shorthand meaning "all project base records" (see RequiredDimensionsModel).
    record_ids: list[str] = []
    dimension_name: str | None = Field(
        default=None,
        description="Identifies which base dimension contains the record IDs. Required if there "
        "is more than one base dimension for a given dimension type.",
    )
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class RequiredDimensionRecordsByTypeModel(DSGBaseModel):
    # Record IDs the dataset must provide, drawn from a project base dimension.
    base: RequiredBaseDimensionModel = RequiredBaseDimensionModel()
    # Record IDs the dataset is known to lack; mutually exclusive with base (see check_base).
    base_missing: RequiredBaseDimensionModel = RequiredBaseDimensionModel()
    # Requirements expressed via subset dimension selectors.
    subset: list[RequiredSubsetDimensionRecordsModel] = []

    @model_validator(mode="before")
    @classmethod
    def handle_legacy_format(cls, values: dict[str, Any]) -> dict[str, Any]:
        # Backwards compatibility with old files and databases:
        # 1. "base"/"base_missing" used to be a plain list[str], from when only a single base
        #    dimension was allowed.
        # 2. "supplemental" dimension requirements used to be allowed and are now dropped.
        # Remove this once existing dsgrid project repositories have been updated.
        for key in ("base", "base_missing"):
            legacy = values.get(key)
            if isinstance(legacy, list):
                logger.warning(f"Fixing up {key} to conform to new format")
                values[key] = {"record_ids": legacy}

        if "supplemental" in values:
            logger.warning(
                "Removing deprecated supplemental dimension requirements from the project config."
            )
            values.pop("supplemental")

        return values

    @model_validator(mode="after")
    def check_base(self) -> "RequiredDimensionRecordsByTypeModel":
        # A dataset may state what it has (base) or what it lacks (base_missing), never both.
        if not (self.base.record_ids and self.base_missing.record_ids):
            return self
        msg = f"base and base_missing cannot both contain record_ids: {self.base=} {self.base_missing=}"
        raise ValueError(msg)

    def defines_dimension_requirement(self) -> bool:
        """Return True if any base, base_missing, or subset requirement is populated."""
        return any((self.base.record_ids, self.base_missing.record_ids, self.subset))
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
class RequiredDimensionRecordsModel(DSGBaseModel):
    # This is here because Pydantic doesn't like fields that start with 'model_'
    # (model_year would otherwise collide with the protected "model_" namespace).
    model_config = make_model_config(protected_namespaces=())

    # One requirement bucket per dimension type.
    # time is excluded (RequiredDimensionsModel.check_for_duplicates also skips
    # DimensionType.TIME when back-filling defaults).
    geography: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    metric: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    model_year: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    scenario: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    sector: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    subsector: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    weather_year: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
class RequiredDimensionsModel(DSGBaseModel):
    """Defines required record IDs that must exist for each dimension in a dataset.
    Record IDs can reside in the project's base or subset dimensions.

    Requirements can be specified for a single dimension or a combination of dimensions.
    For example, if a project includes commercial, residential, and transportation sectors but the
    dataset has only transportation sector records, it should specify a single_dimensional
    requirement that is a subset of the project's base dimension.
    `{"single_dimensional": "sector": {"base": {"record_ids": ["transportation"]}}}`.

    If a dataset's requirements span multiple dimensions, such as if it does not have some
    metric records for some geography records, then a multi_dimensional requirement should be
    specified. (By default, a full cross join is assumed to be present.)
    `{"multi_dimensional": {
    "geography": {"base": {"record_ids": ["12345"]}}
    "metric": {"base": {"record_ids": ["electricity_cooling"]}}
    }
    }`

    If a dataset specifies a dimension type within a multi_dimensional section and wants to use
    all records from a project base dimension, it can specify `base.record_ids = ["__all__"]
    as a shorthand notation.

    Requirements for a dimension cannot be defined in both single_dimensional and multi_dimensional
    sections.

    If no records are listed for a dimension then all project base records are required.

    It might be easier for a dataset to specify what it does not have rather than what it does have.
    In that case, it is recommended to use the RequiredDimensionRecordsModel.base_missing field.
    dsgrid will compute the difference of the base dimension records and the base_missing records
    to determine the dataset's required records.

    If a project has multiple base dimensions of the same type, the
    RequiredDimensionRecordsModel.dimension_name must be specified to identify the base
    dimension that contains the record IDs.

    If a dataset contains a subset of project base dimension records that are defined in the
    project's subset dimensions, it is recommended to use that specification. dsgrid will
    substitute base records for mapped subset records at runtime.
    """

    single_dimensional: RequiredDimensionRecordsModel = Field(
        description="Required records for a single dimension.",
        default=RequiredDimensionRecordsModel(),
    )
    multi_dimensional: list[RequiredDimensionRecordsModel] = Field(
        description="Required records for a combination of dimensions. For example, there may be "
        "a dataset requirement for only one subsector for a given sector instead of a cross "
        "product.",
        default=[],
    )

    @model_validator(mode="after")
    def check_for_duplicates(self) -> "RequiredDimensionsModel":
        """
        1. Ensure that the same dimension does not have requirements in both single and multi
        dimensional sections.
        2. Set any dimensions that do not have specifications to require all base dimension
        records (as long as there is only one project base dimension).
        """
        single_dimensional: set[str] = set()
        multi_dimensional: set[str] = set()

        # Collect the dimension types that carry a single-dimensional requirement.
        for field in RequiredDimensionRecordsModel.model_fields:
            req = getattr(self.single_dimensional, field)
            if req.defines_dimension_requirement():
                single_dimensional.add(field)

        # Each multi_dimensional item must name >= 2 dimensions, none of which may also
        # appear in the single_dimensional section.
        dim_combos: set[tuple[str, ...]] = set()
        for item in self.multi_dimensional:
            dims = []
            for field in RequiredDimensionRecordsModel.model_fields:
                req = getattr(item, field)
                if req.defines_dimension_requirement():
                    if field in single_dimensional:
                        msg = (
                            "dimensions cannot be defined in both single_dimensional and "
                            f"multi_dimensional sections: {field}"
                        )
                        raise ValueError(msg)
                    dims.append(field)
                    multi_dimensional.add(field)

            if len(dims) < 2:
                msg = (
                    "A multi_dimensional dimension requirement must contain at least two "
                    f"dimensions: {item}"
                )
                raise ValueError(msg)

            # Distinct combos must either be identical or fully disjoint: a partial overlap
            # with a previously seen combo is rejected.
            dim_combo = tuple(sorted(dims))
            if dim_combo not in dim_combos:
                for other in dim_combos:
                    if set(dim_combo).intersection(other):
                        msg = (
                            "All descriptors in the multi-dimensional requirements with an "
                            "intersection of dimensions must have a full intersection. "
                            f"dimension_set1 = {other} dimension_set2 = {dim_combo}"
                        )
                        raise ValueError(msg)
                dim_combos.add(dim_combo)

        # Any dimension type (except time) with no requirement defaults to "all base records".
        not_covered = (
            set([x.value for x in DimensionType]) - multi_dimensional - single_dimensional
        )
        for field in not_covered:
            if field != DimensionType.TIME.value:
                getattr(self.single_dimensional, field).base.record_ids = ["__all__"]
        return self
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
class DatasetBaseDimensionNamesModel(DSGBaseModel):
    """Defines the query names for project base dimensions to which datasets will be mapped.
    This is important for cases where a project has multiple base dimensions of the same type.
    """

    # This is here because Pydantic doesn't like fields that start with 'model_'
    # (model_year would otherwise collide with the protected "model_" namespace).
    model_config = make_model_config(protected_namespaces=())

    # One entry per dimension type; None means not yet assigned. Per
    # InputDatasetModel.base_dimension_names, these are auto-populated during submission.
    geography: str | None = None
    metric: str | None = None
    model_year: str | None = None
    scenario: str | None = None
    sector: str | None = None
    subsector: str | None = None
    time: str | None = None
    weather_year: str | None = None
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
class InputDatasetModel(DSGBaseModel):
    """Defines an input dataset for the project config."""

    dataset_id: str = Field(
        title="dataset_id",
        description="Unique dataset identifier.",
        json_schema_extra={
            "updateable": False,
        },
    )
    dataset_type: InputDatasetType = Field(
        title="dataset_type",
        description="Dataset type.",
        json_schema_extra={
            "options": InputDatasetType.format_for_docs(),
            "updateable": False,
        },
    )
    version: str | None = Field(
        title="version",
        description="Version of the registered dataset. "
        "The version specification is optional. If no version is supplied, then the latest "
        "version in the registry is assumed. "
        "The version string must be in semver format (e.g., '1.0.0') and it must be a "
        "valid/existing version in the registry.",
        default=None,
    )
    required_dimensions: RequiredDimensionsModel = Field(
        title="required_dimensions",
        description="Defines required record IDs that must exist for each dimension.",
        default=RequiredDimensionsModel(),
    )
    # Populated by dsgrid when the dataset is submitted to the project.
    mapping_references: list[DimensionMappingReferenceModel] = Field(
        title="mapping_references",
        description="Defines how to map the dataset dimensions to the project. "
        "Auto-populated during submission.",
        default=[],
    )
    # Populated by dsgrid when the dataset is submitted to the project.
    base_dimension_names: DatasetBaseDimensionNamesModel = Field(
        title="base_dimension_names",
        description="Defines the project base dimensions to which the dataset will map itself. "
        "Auto-populated during submission.",
        default=DatasetBaseDimensionNamesModel(),
    )
    status: DatasetRegistryStatus = Field(
        title="status",
        description="Registration status of the dataset, added by dsgrid.",
        default=DatasetRegistryStatus.UNREGISTERED,
        json_schema_extra={
            "dsgrid_internal": True,
            # NOTE(review): this note text looks truncated ("status is ") — confirm
            # the intended content.
            "notes": ("status is "),
            "updateable": False,
        },
    )
    wrap_time_allowed: bool = Field(
        title="wrap_time_allowed",
        description="Whether to allow dataset time to be wrapped to project time if different",
        default=False,
    )
    time_based_data_adjustment: TimeBasedDataAdjustmentModel = Field(
        title="time_based_data_adjustment",
        description="Defines how the rest of the dataframe is adjusted with respect to time. "
        "E.g., when drop associated data when dropping a leap day timestamp.",
        default=TimeBasedDataAdjustmentModel(),
    )

    @field_validator("time_based_data_adjustment")
    @classmethod
    def check_data_adjustment(cls, time_based_data_adjustment):
        """Check daylight saving adjustment.

        spring_forward_hour and fall_back_hour must be set together: either both
        are NONE or both are non-NONE; any other combination raises ValueError.
        """
        sfh = time_based_data_adjustment.daylight_saving_adjustment.spring_forward_hour
        fbh = time_based_data_adjustment.daylight_saving_adjustment.fall_back_hour
        # Both unset: no adjustment requested.
        if fbh == DaylightSavingFallBackType.NONE and sfh == DaylightSavingSpringForwardType.NONE:
            return time_based_data_adjustment
        # Both set: a complete adjustment specification.
        if fbh != DaylightSavingFallBackType.NONE and sfh != DaylightSavingSpringForwardType.NONE:
            return time_based_data_adjustment
        msg = f"mismatch between spring_forward_hour and fall_back_hour, {time_based_data_adjustment=}."
        raise ValueError(msg)

    # TODO: write validation that if daylight_saving_adjustment is specified, dataset time config must be IndexTimeDimensionConfig
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
class DimensionMappingsModel(DSGBaseModel):
    """Defines all dimension mappings associated with a dsgrid project,
    including base-to-supplemental mappings and dataset-to-project mappings.
    """

    base_to_supplemental_references: list[DimensionMappingReferenceModel] = Field(
        title="base_to_supplemental_references",
        description="Base dimension to supplemental dimension mappings (e.g., county-to-state)"
        " used to support various queries and dimension transformations.",
        default=[],
    )
    # Keyed by dataset_id; filled in by ProjectConfig.add_dataset_dimension_mappings()
    # as datasets are submitted to the project.
    dataset_to_project: dict[str, list[DimensionMappingReferenceModel]] = Field(
        title="dataset_to_project",
        description="Dataset-to-project mappings map dataset dimensions to project dimensions. "
        "Once a dataset is submitted to a project, dsgrid adds the dataset-to-project mappings "
        "to the project config. "
        "Some projects may not have any dataset-to-project mappings. Dataset-to-project "
        " mappings are only supplied if a dataset's dimensions do not match the project's "
        "dimension.",
        default={},
        # TODO: need to document missing dimension records, fill values, etc. DSGRID-191.
    )
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
class ProjectConfigModel(DSGBaseDatabaseModel):
    """Represents project configurations"""

    project_id: str = Field(
        title="project_id",
        description="A unique project identifier that is project-specific (e.g., "
        "'standard-scenarios-2021').",
    )
    name: str = Field(
        title="name",
        description="A project name to accompany the ID.",
    )
    description: str = Field(
        title="description",
        description="Detailed project description.",
    )
    status: ProjectRegistryStatus = Field(
        title="status",
        description="project registry status",
        default=ProjectRegistryStatus.INITIAL_REGISTRATION,
        json_schema_extra={
            "dsgrid_internal": True,
            "updateable": False,
        },
    )
    datasets: list[InputDatasetModel] = Field(
        title="datasets",
        description="List of input datasets for the project.",
    )
    dimensions: DimensionsModel = Field(
        title="dimensions",
        description="List of `base` and `supplemental` dimensions.",
    )
    dimension_mappings: DimensionMappingsModel = Field(
        title="dimension_mappings",
        description="List of project mappings. Initialized with base-to-base and"
        " base-to-supplemental mappings. dataset-to-project mappings are added by dsgrid as"
        " datasets get registered with the project.",
        default=DimensionMappingsModel(),
    )

    @field_validator("project_id")
    @classmethod
    def check_project_id_handle(cls, project_id):
        """Check for valid characters in project id.

        Hyphens are rejected explicitly with a friendlier message; all remaining
        character rules are enforced by check_config_id_strict().
        """
        if "-" in project_id:
            msg = 'invalid character "-" in project id'
            raise ValueError(msg)

        check_config_id_strict(project_id, "Project")
        return project_id
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def make_unvalidated_project_config(
    project_id: str,
    dataset_ids: Iterable[str],
    metric_types: Iterable[str],
    name: str | None = None,
    description: str | None = None,
    time_type: TimeDimensionType = TimeDimensionType.DATETIME,
) -> dict[str, Any]:
    """Create a project config as a dictionary, skipping validation.

    The returned dictionary is a template: dataset entries carry empty
    type/version fields and the dimensions section holds only a base-dimension
    template produced from metric_types.
    """
    dimensions_section = {
        "base_dimensions": make_base_dimension_template(metric_types, time_type=time_type),
        "subset_dimensions": [],
        "supplemental_dimensions": [],
    }
    dataset_entries = [
        {
            "dataset_id": dataset_id,
            "dataset_type": "",
            "version": "",
            "required_dimensions": {},
        }
        for dataset_id in dataset_ids
    ]
    return {
        "project_id": project_id,
        "name": name or "",
        "description": description or "",
        "dimensions": dimensions_section,
        "datasets": dataset_entries,
    }
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
class DimensionsByCategoryModel(DSGBaseModel):
    """Defines the query names by base and supplemental category."""

    # Query names of base dimensions.
    base: list[str]
    # Query names of subset dimensions.
    subset: list[str]
    # Query names of supplemental dimensions.
    supplemental: list[str]
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
class ProjectDimensionNamesModel(DSGBaseModel):
    """Defines the query names for all base and supplemental dimensions in the project."""

    # This is here because Pydantic doesn't like fields that start with 'model_'
    model_config = make_model_config(protected_namespaces=())

    # One entry per DimensionType; built by ProjectConfig.get_dimension_names_model().
    geography: DimensionsByCategoryModel
    metric: DimensionsByCategoryModel
    model_year: DimensionsByCategoryModel
    scenario: DimensionsByCategoryModel
    sector: DimensionsByCategoryModel
    subsector: DimensionsByCategoryModel
    time: DimensionsByCategoryModel
    weather_year: DimensionsByCategoryModel
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
class ProjectConfig(ConfigBase):
    """Provides an interface to a ProjectConfigModel."""

    def __init__(self, model: ProjectConfigModel):
        """Store the validated model and initialize empty dimension caches.

        The caches are populated later via set_dimensions() and
        set_dimension_mappings(); they are keyed by ConfigKey
        (dimension ID + version).
        """
        super().__init__(model)
        # Base dimensions keyed by (dimension_id, version).
        self._base_dimensions: dict[ConfigKey, DimensionBaseConfig] = {}
        # Subset dimensions: dimension type -> group name -> ConfigKey -> config.
        self._subset_dimensions: dict[
            DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfigWithFiles]]
        ] = {}
        self._supplemental_dimensions: dict[ConfigKey, DimensionBaseConfig] = {}
        self._base_to_supplemental_mappings: dict[ConfigKey, MappingTableConfig] = {}
        # Flat lookup of every dimension (all categories) by unique query name.
        self._dimensions_by_name: dict[str, DimensionBaseConfig] = {}
|
|
747
|
+
|
|
748
|
+
@staticmethod
def model_class() -> Type:
    """Return the Pydantic model class wrapped by this config."""
    return ProjectConfigModel

@staticmethod
def config_filename() -> str:
    """Return the canonical config file name for a project."""
    return "project.json5"
|
|
755
|
+
|
|
756
|
+
def get_base_dimension(
    self, dimension_type: DimensionType, dimension_name: str | None = None
) -> DimensionBaseConfig:
    """Return the base dimension matching dimension_type.

    If there is more than one base dimension of the given type, dimension_name is
    required.

    See also
    --------
    list_base_dimensions
    """
    if dimension_name is None:
        return self._get_single_base_dimension(dimension_type)
    candidates = (
        dim
        for dim in self._iter_base_dimensions()
        if dim.model.dimension_type == dimension_type and dim.model.name == dimension_name
    )
    found = next(candidates, None)
    if found is None:
        msg = f"Did not find a dimension of {dimension_type=} with {dimension_name=}"
        raise DSGValueNotRegistered(msg)
    return found
|
|
774
|
+
|
|
775
|
+
def get_base_time_dimension(self) -> TimeDimensionBaseConfig:
    """Return the base dimension for time."""
    time_dim = self._get_single_base_dimension(DimensionType.TIME)
    # Narrow the type for callers; a TIME base dimension is always a time config.
    assert isinstance(time_dim, TimeDimensionBaseConfig)
    return time_dim
|
|
780
|
+
|
|
781
|
+
def _get_single_base_dimension(self, dimension_type: DimensionType) -> DimensionBaseConfig:
    """Return the single base dimension of dimension_type; raise if zero or many."""
    matches = [
        dim
        for dim in self._iter_base_dimensions()
        if dim.model.dimension_type == dimension_type
    ]
    if not matches:
        msg = f"base dimension {dimension_type=} not found"
        raise DSGValueNotRegistered(msg)

    if len(matches) > 1:
        qnames = " ".join(dim.model.name for dim in matches)
        msg = (
            f"Found multiple base dimensions for {dimension_type=}: {qnames}. "
            "Call get_base_dimension() with a specific name."
        )
        raise DSGInvalidDimension(msg)
    return matches[0]
|
|
798
|
+
|
|
799
|
+
def get_base_dimension_and_version(
    self, dimension_type: DimensionType, dimension_name: str | None = None
) -> tuple[DimensionBaseConfig, str]:
    """Return the base dimension and version matching dimension_type.

    dimension_name disambiguates when the project has multiple base dimensions
    of the same type.
    """
    matches = [
        (dim, key.version)
        for key, dim in self.base_dimensions.items()
        if dim.model.dimension_type == dimension_type
        and (dimension_name is None or dim.model.name == dimension_name)
    ]
    if len(matches) > 1:
        msg = (
            f"Found multiple base dimensions for {dimension_type=}. "
            "You must specify a dimension query name to remove ambiguity."
        )
        raise DSGInvalidOperation(msg)
    if not matches:
        msg = f"Did not find a dimension with {dimension_type=} {dimension_name=}"
        raise DSGValueNotRegistered(msg)
    return matches[0]
|
|
819
|
+
|
|
820
|
+
def get_dimension(self, name: str) -> DimensionBaseConfig:
    """Return the dimension with name."""
    found = self._dimensions_by_name.get(name)
    if found is None:
        msg = f"dimension_name={name} is not stored"
        raise DSGValueNotRegistered(msg)
    return found
|
|
827
|
+
|
|
828
|
+
def get_time_dimension(self, name: str) -> TimeDimensionBaseConfig:
    """Return the time dimension with dimension_name."""
    candidate = self.get_dimension(name)
    if isinstance(candidate, TimeDimensionBaseConfig):
        return candidate
    msg = f"{candidate.model.label} is not a time dimension"
    raise DSGInvalidParameter(msg)
|
|
835
|
+
|
|
836
|
+
def get_dimension_by_name(self, name: str) -> DimensionBaseConfig:
    """Return the base dimension with the given query name.

    Note: unlike get_dimension(), this searches base dimensions only.
    """
    found = next(
        (dim for dim in self._iter_base_dimensions() if dim.model.name == name),
        None,
    )
    if found is None:
        msg = f"No base dimension with {name=} is stored."
        raise DSGValueNotRegistered(msg)
    return found
|
|
844
|
+
|
|
845
|
+
def get_dimension_with_records(self, name: str) -> DimensionBaseConfigWithFiles:
    """Return a dimension config matching name that has records."""
    found = self._dimensions_by_name.get(name)
    if found is None:
        msg = f"{name=} is not stored"
        raise DSGInvalidDimension(msg)
    if not isinstance(found, DimensionBaseConfigWithFiles):
        # Time dimensions, for example, have no record files.
        msg = f"{found.model.label} does not have records"
        raise DSGInvalidParameter(msg)
    return found
|
|
855
|
+
|
|
856
|
+
def get_dimension_records(self, name: str) -> DataFrame:
    """Return a DataFrame containing the records for a dimension."""
    dim_config = self.get_dimension_with_records(name)
    return dim_config.get_records_dataframe()
|
|
859
|
+
|
|
860
|
+
def get_dimension_record_ids(self, name: str) -> set[str]:
    """Return the record IDs for the dimension identified by name."""
    dim_config = self.get_dimension_with_records(name)
    return dim_config.get_unique_ids()
|
|
863
|
+
|
|
864
|
+
def get_dimension_reference(self, dimension_id: str) -> DimensionReferenceModel:
    """Return the reference of the dimension matching dimension_id."""
    all_refs = itertools.chain(
        self.model.dimensions.base_dimension_references,
        self.model.dimensions.supplemental_dimension_references,
    )
    for candidate in all_refs:
        if candidate.dimension_id == dimension_id:
            return candidate

    msg = f"{dimension_id} is not stored"
    raise DSGInvalidDimension(msg)
|
|
875
|
+
|
|
876
|
+
def list_base_dimensions(
    self, dimension_type: DimensionType | None = None
) -> list[DimensionBaseConfig]:
    """Return all base dimensions, optionally filtering to the dimension_type.

    See also
    --------
    get_base_dimension
    """
    dims = self._iter_base_dimensions()
    if dimension_type is not None:
        dims = (d for d in dims if d.model.dimension_type == dimension_type)
    return list(dims)
|
|
890
|
+
|
|
891
|
+
def list_base_dimensions_with_records(
    self, dimension_type: DimensionType
) -> list[DimensionBaseConfigWithFiles]:
    """Return all base dimensions of the given dimension_type that have record files.

    See also
    --------
    get_base_dimension
    """
    return [
        dim
        for dim in self._iter_base_dimensions()
        if isinstance(dim, DimensionBaseConfigWithFiles)
        and dim.model.dimension_type == dimension_type
    ]
|
|
906
|
+
|
|
907
|
+
def list_supplemental_dimensions(
    self, dimension_type: DimensionType, sort_by=None
) -> list[DimensionBaseConfigWithFiles]:
    """Return the supplemental dimensions matching dimension (if any).

    Parameters
    ----------
    dimension_type : DimensionType
    sort_by : str | None
        If set, sort the dimensions by this dimension attribute.
    """
    matches = [
        dim
        for dim in self.supplemental_dimensions.values()
        if dim.model.dimension_type == dimension_type
    ]
    if sort_by is None:
        return matches
    return sorted(matches, key=lambda dim: getattr(dim.model, sort_by))
|
|
926
|
+
|
|
927
|
+
def get_matching_subset_dimension(
    self, dimension_type: DimensionType, unique_data_records: set[str]
) -> DimensionReferenceModel | None:
    """Return a dimension reference if there is a matching subset dimension, otherwise None.

    A subset dimension matches when its record IDs equal unique_data_records exactly.
    """
    for group in self.model.dimensions.subset_dimensions:
        if group.dimension_type != dimension_type:
            continue
        for ref in group.selector_references:
            key = ConfigKey(ref.dimension_id, ref.version)
            subset_config = self._subset_dimensions[dimension_type][group.name][key]
            # Set equality == empty symmetric difference.
            if subset_config.get_unique_ids() == unique_data_records:
                logger.info("Found matching subset dimension: %s", group.name)
                return ref
    return None
|
|
942
|
+
|
|
943
|
+
def get_base_to_supplemental_dimension_mappings_by_types(
    self, dimension_type: DimensionType
) -> list[MappingTableConfig]:
    """Return the base-to-supplemental dimension mappings for the dimension (if any)."""
    mappings = self._base_to_supplemental_mappings.values()
    return [
        m for m in mappings if m.model.from_dimension.dimension_type == dimension_type
    ]
|
|
952
|
+
|
|
953
|
+
def get_base_to_supplemental_config(
    self, base_dim: DimensionBaseConfigWithFiles, supp_dim: DimensionBaseConfigWithFiles
) -> MappingTableConfig:
    """Return the project's base-to-supplemental dimension mapping config for the given
    base and supplemental dimensions.
    """
    # Passing a base dimension as the "supplemental" side is a caller error.
    self._check_not_base_dimension(supp_dim)

    wanted = (base_dim.model.dimension_id, supp_dim.model.dimension_id)
    for mapping in self._base_to_supplemental_mappings.values():
        pair = (
            mapping.model.from_dimension.dimension_id,
            mapping.model.to_dimension.dimension_id,
        )
        if pair == wanted:
            return mapping

    msg = f"No mapping is stored for base = {base_dim.model.label}, supplemental = {supp_dim.model.label}"
    raise DSGValueNotRegistered(msg)
|
|
970
|
+
|
|
971
|
+
def get_base_to_supplemental_mapping_records(
    self, base_dim: DimensionBaseConfigWithFiles, supp_dim: DimensionBaseConfigWithFiles
) -> DataFrame:
    """Return the project's base-to-supplemental dimension mapping records.

    Excludes rows with NULL to_id values.
    """
    mapping_config = self.get_base_to_supplemental_config(base_dim, supp_dim)
    records = mapping_config.get_records_dataframe()
    return records.filter("to_id is not NULL")
|
|
979
|
+
|
|
980
|
+
def has_base_to_supplemental_dimension_mapping_types(self, dimension_type) -> bool:
    """Return True if the config has these base-to-supplemental mappings."""
    mappings = self._base_to_supplemental_mappings
    return self._has_mapping(dimension_type, dimension_type, mappings)
|
|
987
|
+
|
|
988
|
+
def get_base_dimension_by_id(self, dimension_id: str) -> DimensionBaseConfig:
    """Return the base dimension with dimension_id."""
    found = next(
        (d for d in self._iter_base_dimensions() if d.model.dimension_id == dimension_id),
        None,
    )
    if found is None:
        msg = f"Did not find a base dimension with {dimension_id=}"
        raise DSGValueNotRegistered(msg)
    return found
|
|
995
|
+
|
|
996
|
+
def get_base_dimension_records_by_id(self, dimension_id: str) -> DataFrame:
    """Return the records for the base dimension with dimension_id."""
    dim = self.get_base_dimension_by_id(dimension_id)
    if isinstance(dim, DimensionBaseConfigWithFiles):
        return dim.get_records_dataframe()
    msg = f"{dim.model.label} does not have records"
    raise DSGInvalidParameter(msg)
|
|
1003
|
+
|
|
1004
|
+
def _check_not_base_dimension(self, dim: DimensionBaseConfig) -> None:
    """Raise DSGInvalidParameter if dim is one of the project's base dimensions."""
    base_ids = {
        base.model.dimension_id
        for base in self.list_base_dimensions(dimension_type=dim.model.dimension_type)
    }
    if dim.model.dimension_id in base_ids:
        msg = f"Cannot pass base dimension: {dim.model.label}"
        raise DSGInvalidParameter(msg)
|
|
1010
|
+
|
|
1011
|
+
@staticmethod
def _has_mapping(
    from_dimension_type: DimensionType, to_dimension_type: DimensionType, mapping: dict
) -> bool:
    """Return True if any stored mapping goes from from_dimension_type to to_dimension_type."""
    return any(
        cfg.model.from_dimension.dimension_type == from_dimension_type
        and cfg.model.to_dimension.dimension_type == to_dimension_type
        for cfg in mapping.values()
    )
|
|
1022
|
+
|
|
1023
|
+
def list_dimension_names(self, category: DimensionCategory | None = None) -> list[str]:
    """Return query names for all dimensions in the project.

    Parameters
    ----------
    category : DimensionCategory | None
        Optionally, filter return by category.
    """
    if category is None:
        return sorted(self._dimensions_by_name)

    # Dispatch table instead of match/case; unknown categories fall through.
    iterators = {
        DimensionCategory.BASE: self._iter_base_dimensions,
        DimensionCategory.SUBSET: self._iter_subset_dimensions,
        DimensionCategory.SUPPLEMENTAL: self._iter_supplemental_dimensions,
    }
    try:
        iter_method = iterators[category]
    except KeyError:
        msg = f"{category=}"
        raise NotImplementedError(msg) from None

    return sorted(dim.model.name for dim in iter_method())
|
|
1046
|
+
|
|
1047
|
+
def list_dimension_names_by_type(self, dimension_type: DimensionType) -> list[str]:
    """List the query names available for a dimension type."""
    names = []
    for dim in self.iter_dimensions():
        if dim.model.dimension_type == dimension_type:
            names.append(dim.model.name)
    return names
|
|
1054
|
+
|
|
1055
|
+
def get_dimension_names_mapped_to_type(self) -> dict[str, DimensionType]:
    """Return a dict of query names mapped to their dimension type."""
    mapping: dict[str, DimensionType] = {}
    for dim in self.iter_dimensions():
        mapping[dim.model.name] = dim.model.dimension_type
    return mapping
|
|
1058
|
+
|
|
1059
|
+
def get_dimension_type_to_base_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Return a mapping of DimensionType to query names for base dimensions."""
    return {
        dt: [dim.model.name for dim in self.list_base_dimensions(dimension_type=dt)]
        for dt in DimensionType
    }
|
|
1067
|
+
|
|
1068
|
+
def get_subset_dimension_to_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Return a mapping of DimensionType to query names for subset dimensions.

    Every DimensionType is present as a key; types with no subset dimensions map
    to an empty list. This matches the base/supplemental counterparts.
    (Previously this returned a defaultdict that omitted types without subset
    dimensions and silently created entries on lookup.)
    """
    query_names: dict[DimensionType, list[str]] = {dt: [] for dt in DimensionType}
    for dimension_type, groups in self._subset_dimensions.items():
        for selectors in groups.values():
            query_names[dimension_type].extend(
                dim.model.name for dim in selectors.values()
            )
    return query_names
|
|
1077
|
+
|
|
1078
|
+
def get_supplemental_dimension_to_name_mapping(self) -> dict[DimensionType, list[str]]:
|
|
1079
|
+
"""Return a mapping of DimensionType to query name for supplemental dimensions."""
|
|
1080
|
+
query_names = {}
|
|
1081
|
+
for dimension_type in DimensionType:
|
|
1082
|
+
query_names[dimension_type] = [
|
|
1083
|
+
x.model.name
|
|
1084
|
+
for x in self.list_supplemental_dimensions(dimension_type, sort_by="name")
|
|
1085
|
+
]
|
|
1086
|
+
return query_names
|
|
1087
|
+
|
|
1088
|
+
def get_dimension_names_model(self) -> ProjectDimensionNamesModel:
    """Return an instance of ProjectDimensionNamesModel for the project."""
    base = self.get_dimension_type_to_base_name_mapping()
    subset = self.get_subset_dimension_to_name_mapping()
    supplemental = self.get_supplemental_dimension_to_name_mapping()
    contents: dict[str, Any] = {
        dt.value: {
            "base": base[dt],
            "subset": subset[dt],
            "supplemental": supplemental[dt],
        }
        for dt in DimensionType
    }
    return ProjectDimensionNamesModel(**contents)
|
|
1101
|
+
|
|
1102
|
+
def set_dimensions(
    self,
    base_dimensions: dict[ConfigKey, DimensionBaseConfig],
    subset_dimensions: dict[
        DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfigWithFiles]]
    ],
    supplemental_dimensions: dict[ConfigKey, DimensionBaseConfig],
) -> None:
    """Replace the stored dimensions and rebuild the by-name lookup table.

    Raises DSGInvalidDimension if two dimensions share a query name.
    """
    for cache, replacement in (
        (self._base_dimensions, base_dimensions),
        (self._subset_dimensions, subset_dimensions),
        (self._supplemental_dimensions, supplemental_dimensions),
    ):
        cache.clear()
        cache.update(replacement)

    self._dimensions_by_name.clear()
    for dim in self.iter_dimensions():
        query_name = dim.model.name
        if query_name in self._dimensions_by_name:
            msg = f"name={query_name} exists multiple times in project {self.config_id}"
            raise DSGInvalidDimension(msg)
        self._dimensions_by_name[query_name] = dim
|
|
1122
|
+
|
|
1123
|
+
def set_dimension_mappings(
    self, base_to_supplemental_mappings: dict[ConfigKey, MappingTableConfig]
):
    """Replace the stored base-to-supplemental mapping configs."""
    # TODO: Once we start using these we may need to store by (from, to) as key instead.
    cache = self._base_to_supplemental_mappings
    cache.clear()
    cache.update(base_to_supplemental_mappings)
|
|
1129
|
+
|
|
1130
|
+
def add_dataset_dimension_mappings(
    self, dataset_config: DatasetConfig, references: list[DimensionMappingReferenceModel]
):
    """Add a dataset's dimension mappings to the project.

    References whose mapping_id is already stored for the dataset are skipped.

    Raises
    ------
    DSGInvalidDimensionMapping
        Raised if a requirement is violated.
    """
    dataset_id = dataset_config.model.dataset_id
    mappings = self.model.dimension_mappings.dataset_to_project.setdefault(dataset_id, [])
    existing_ids = {ref.mapping_id for ref in mappings}
    for reference in references:
        if reference.mapping_id in existing_ids:
            continue
        mappings.append(reference)
        logger.info(
            "Added dimension mapping for dataset=%s: %s",
            dataset_id,
            reference.mapping_id,
        )
|
|
1154
|
+
|
|
1155
|
+
def add_dataset_base_dimension_names(
    self, dataset_id: str, base_dimension_names: DatasetBaseDimensionNamesModel
):
    """Add project base dimension query names represented in the dataset.

    Every field of base_dimension_names must be set (non-None).
    """
    for field in type(base_dimension_names).model_fields:
        value = getattr(base_dimension_names, field)
        if value is None:
            msg = f"DatasetBaseDimensionNamesModel {field} cannot be None"
            raise DSGInvalidParameter(msg)
    self.get_dataset(dataset_id).base_dimension_names = base_dimension_names
|
|
1165
|
+
|
|
1166
|
+
def get_dataset_base_dimension_names(self, dataset_id: str) -> DatasetBaseDimensionNamesModel:
    """Return the project base dimension query names represented in the dataset."""
    dataset = self.get_dataset(dataset_id)
    return dataset.base_dimension_names
|
|
1169
|
+
|
|
1170
|
+
@property
def config_id(self) -> str:
    """Return the project ID, which identifies this config in the registry."""
    return self._model.project_id
|
|
1173
|
+
|
|
1174
|
+
def get_dataset(self, dataset_id: str) -> InputDatasetModel:
    """Return a dataset by ID."""
    found = next(
        (ds for ds in self.model.datasets if ds.dataset_id == dataset_id), None
    )
    if found is None:
        msg = f"project_id={self._model.project_id} does not have dataset_id={dataset_id}"
        raise DSGInvalidField(msg)
    return found
|
|
1182
|
+
|
|
1183
|
+
def has_dataset(self, dataset_id: str, status: DatasetRegistryStatus | None) -> bool:
    """Return True if the dataset_id is present in the configuration.

    Parameters
    ----------
    dataset_id : str
    status : None | DatasetRegistryStatus
        If set, only return True if the status matches.
    """
    for dataset in self.iter_datasets():
        if dataset.dataset_id == dataset_id:
            # Dataset found; the result depends only on the status filter.
            # (Replaces the previous redundant/unreachable double `return False`.)
            return status is None or dataset.status == status
    return False
|
|
1199
|
+
|
|
1200
|
+
def get_load_data_time_columns(self, name: str) -> list[str]:
    """Return the time dimension columns expected in the load data table for this query name."""
    return self.get_time_dimension(name).get_load_data_time_columns()
|
|
1205
|
+
|
|
1206
|
+
def iter_datasets(self) -> Generator[InputDatasetModel, None, None]:
    """Yield every input dataset defined in the project config."""
    yield from self.model.datasets
|
|
1209
|
+
|
|
1210
|
+
def _iter_base_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
|
|
1211
|
+
yield from self._base_dimensions.values()
|
|
1212
|
+
|
|
1213
|
+
def _iter_subset_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
|
|
1214
|
+
for x in self._subset_dimensions.values():
|
|
1215
|
+
for y in x.values():
|
|
1216
|
+
for z in y.values():
|
|
1217
|
+
yield z
|
|
1218
|
+
|
|
1219
|
+
def _iter_supplemental_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
|
|
1220
|
+
yield from self._supplemental_dimensions.values()
|
|
1221
|
+
|
|
1222
|
+
def iter_dimensions(self) -> Iterable[DimensionBaseConfig]:
    """Return an iterator over all dimensions of the project.

    Yields
    ------
    DimensionConfig

    """
    # Order matters for callers that iterate: base, then subset, then supplemental.
    parts = [
        self._iter_base_dimensions(),
        self._iter_subset_dimensions(),
        self._iter_supplemental_dimensions(),
    ]
    return itertools.chain.from_iterable(parts)
|
|
1235
|
+
|
|
1236
|
+
def list_registered_dataset_ids(self) -> list[str]:
    """List registered datasets associated with the project."""
    registered = DatasetRegistryStatus.REGISTERED
    return [ds.dataset_id for ds in self._iter_datasets_by_status(registered)]
|
|
1240
|
+
|
|
1241
|
+
def list_unregistered_dataset_ids(self) -> list[str]:
    """List unregistered datasets associated with project registry."""
    unregistered = DatasetRegistryStatus.UNREGISTERED
    return [ds.dataset_id for ds in self._iter_datasets_by_status(unregistered)]
|
|
1245
|
+
|
|
1246
|
+
def _iter_datasets_by_status(
|
|
1247
|
+
self, status: DatasetRegistryStatus
|
|
1248
|
+
) -> Generator[InputDatasetModel, None, None]:
|
|
1249
|
+
for dataset in self.iter_datasets():
|
|
1250
|
+
if dataset.status == status:
|
|
1251
|
+
yield dataset
|
|
1252
|
+
|
|
1253
|
+
def get_required_dimension_record_ids(
    self, dataset_id: str, dimension_type: DimensionType
) -> set[str]:
    """Return the required base dimension record IDs for the dataset and dimension type."""
    dataset = self.get_dataset(dataset_id)
    field = dimension_type.value

    # Start with the single-dimensional requirements for this type ...
    single_req = getattr(dataset.required_dimensions.single_dimensional, field)
    record_ids = self._get_required_dimension_record_ids(single_req)

    # ... then fold in every multi-dimensional requirement that mentions it.
    for multi_req in dataset.required_dimensions.multi_dimensional:
        multi = getattr(multi_req, field)
        record_ids.update(self._get_required_dimension_record_ids(multi))

    return record_ids
|
|
1265
|
+
|
|
1266
|
+
def _build_multi_dim_requirement_associations(
    self, multi_dim_reqs: list[RequiredDimensionRecordsModel], context: ScratchDirContext
) -> list[DataFrame]:
    """Build one dataframe of required record-ID combinations per unique combination of
    dimension types present in the multi-dimensional requirements.

    Parameters
    ----------
    multi_dim_reqs : list[RequiredDimensionRecordsModel]
    context : ScratchDirContext
        Passed through to create_dataframe_from_product for scratch storage.
    """
    dfs_by_dim_combo: dict[tuple[str, ...], DataFrame] = {}

    # Example: Partial sector and subsector combinations are required.
    # [
    #     {{"sector": {"base": ["com"]},
    #       "subsector": "supplemental":
    #           {"name": "commercial-subsectors",
    #            "record_ids": ["commercial_subsectors"]}},
    #     {"sector": {"base": ["res"]}, "subsector": {"base": ["MidriseApartment"]}},
    # ]
    # This code will replace supplemental records with base records and return a list of
    # dataframes of those combinations - one per unique combination of dimensions.

    for multi_req in multi_dim_reqs:
        dim_combo = []
        columns = {}
        # Iterate fields in sorted order so the dim_combo keys are deterministic.
        for field in sorted(RequiredDimensionRecordsModel.model_fields):
            dim_type = DimensionType(field)
            req = getattr(multi_req, field)
            record_ids = self._get_required_dimension_record_ids(req)
            if record_ids:
                columns[field] = list(record_ids)
                dim_combo.append(dim_type.value)

        # Cross product of the per-dimension record IDs for this one requirement.
        df = create_dataframe_from_product(columns, context)
        # Sort the columns so tables with the same dimension combo align for union().
        df = df.select(*sorted(df.columns))

        dim_combo_tp = tuple(sorted(dim_combo))
        if dim_combo_tp in dfs_by_dim_combo:
            # Same dimension combination seen before: merge the rows.
            dfs_by_dim_combo[dim_combo_tp] = dfs_by_dim_combo[dim_combo_tp].union(df)
        else:
            dfs_by_dim_combo[dim_combo_tp] = df

    return list(dfs_by_dim_combo.values())
|
|
1303
|
+
|
|
1304
|
+
def _get_required_dimension_record_ids(
|
|
1305
|
+
self, reqs: RequiredDimensionRecordsByTypeModel
|
|
1306
|
+
) -> set[str]:
|
|
1307
|
+
"""Return the required record IDs for a dimension based on the specification in the
|
|
1308
|
+
project config.
|
|
1309
|
+
"""
|
|
1310
|
+
record_ids = self._get_required_base_dimension_record_ids(reqs)
|
|
1311
|
+
record_ids.update(self._get_required_record_ids_from_subsets(reqs))
|
|
1312
|
+
return record_ids
|
|
1313
|
+
|
|
1314
|
+
def _get_required_base_dimension_record_ids(
|
|
1315
|
+
self, reqs: RequiredDimensionRecordsByTypeModel
|
|
1316
|
+
) -> set[str]:
|
|
1317
|
+
"""Return the required record IDs for a base dimension based on the specification in the
|
|
1318
|
+
project config.
|
|
1319
|
+
"""
|
|
1320
|
+
record_ids: set[str] = set()
|
|
1321
|
+
if not reqs.base.record_ids and not reqs.base_missing.record_ids:
|
|
1322
|
+
return record_ids
|
|
1323
|
+
|
|
1324
|
+
base_dim_query_name = reqs.base.dimension_name or reqs.base_missing.dimension_name
|
|
1325
|
+
assert base_dim_query_name is not None
|
|
1326
|
+
all_base_record_ids = self.get_dimension_record_ids(base_dim_query_name)
|
|
1327
|
+
|
|
1328
|
+
if reqs.base.record_ids == ["__all__"]:
|
|
1329
|
+
assert reqs.base.dimension_name is not None
|
|
1330
|
+
record_ids = all_base_record_ids
|
|
1331
|
+
elif reqs.base.record_ids:
|
|
1332
|
+
record_ids = set(reqs.base.record_ids)
|
|
1333
|
+
if diff := record_ids - all_base_record_ids:
|
|
1334
|
+
msg = (
|
|
1335
|
+
"The project config requires these these record IDs in the dataset's 'base' "
|
|
1336
|
+
"field, but they are not in the base dimension records: "
|
|
1337
|
+
f"name={base_dim_query_name}: {diff=}"
|
|
1338
|
+
)
|
|
1339
|
+
raise DSGInvalidDataset(msg)
|
|
1340
|
+
elif reqs.base_missing.record_ids:
|
|
1341
|
+
assert reqs.base_missing.dimension_name is not None
|
|
1342
|
+
missing_ids = set(reqs.base_missing.record_ids)
|
|
1343
|
+
if diff := missing_ids - all_base_record_ids:
|
|
1344
|
+
msg = (
|
|
1345
|
+
"The project config requires these these record IDs in the dataset's "
|
|
1346
|
+
"'base_missing' field, but they are not in the base dimension "
|
|
1347
|
+
f"name={base_dim_query_name}: {diff=}"
|
|
1348
|
+
)
|
|
1349
|
+
raise DSGInvalidDataset(msg)
|
|
1350
|
+
record_ids = all_base_record_ids - missing_ids
|
|
1351
|
+
|
|
1352
|
+
return record_ids
|
|
1353
|
+
|
|
1354
|
+
def _get_subset_dimension_records(self, name: str, selector_name: str) -> set[str]:
    """Return the unique record IDs of the subset dimension selector.

    Raises
    ------
    DSGInvalidDimension
        Raised if no selector with selector_name exists in the group named name.
    """
    for group in self.model.dimensions.subset_dimensions:
        if group.name != name:
            continue
        for ref in group.selector_references:
            key = ConfigKey(ref.dimension_id, ref.version)
            dim = self._subset_dimensions[group.dimension_type][group.name][key]
            if dim.model.name == selector_name:
                assert isinstance(dim, DimensionBaseConfigWithFiles)
                return dim.get_unique_ids()

    msg = f"subset dimension selector not found: {name=} {selector_name=}"
    raise DSGInvalidDimension(msg)
|
|
1366
|
+
|
|
1367
|
+
def _get_required_record_ids_from_subsets(
|
|
1368
|
+
self, req: RequiredDimensionRecordsByTypeModel
|
|
1369
|
+
) -> set[str]:
|
|
1370
|
+
record_ids = set()
|
|
1371
|
+
for subset in req.subset:
|
|
1372
|
+
for selector_name in subset.selectors:
|
|
1373
|
+
record_ids.update(self._get_subset_dimension_records(subset.name, selector_name))
|
|
1374
|
+
return record_ids
|
|
1375
|
+
|
|
1376
|
+
@track_timing(timer_stats_collector)
def make_dimension_association_table(
    self, dataset_id: str, context: ScratchDirContext
) -> DataFrame:
    """Build a table that includes all combinations of dimension records that must be provided
    by the dataset.

    Parameters
    ----------
    dataset_id : str
    context : ScratchDirContext
        Passed through to the dataframe-building helpers for scratch storage.
    """
    required_dimensions = self.get_dataset(dataset_id).required_dimensions
    # One dataframe per unique combination of dimension types appearing in the
    # multi-dimensional requirements.
    multi_dfs = self._build_multi_dim_requirement_associations(
        required_dimensions.multi_dimensional, context
    )

    # Project config construction asserts that there is no intersection of dimensions in
    # multi and single.
    existing = set()
    for df in multi_dfs:
        existing.update(set(df.columns))

    # The remaining dimension types come from the single-dimensional requirements and are
    # combined as an independent cross product.
    single_dfs: dict[str, list[str]] = {}
    for field in (x for x in RequiredDimensionRecordsModel.model_fields if x not in existing):
        req = getattr(required_dimensions.single_dimensional, field)
        record_ids = self._get_required_dimension_record_ids(req)
        single_dfs[field] = list(record_ids)

    single_df = create_dataframe_from_product(single_dfs, context)
    # Cross join everything: multi tables have disjoint columns from single_df.
    return cross_join_dfs(multi_dfs + [single_df])
|
|
1402
|
+
|
|
1403
|
+
def are_all_datasets_submitted(self) -> bool:
    """Return True if all datasets have been submitted."""
    unregistered = self.list_unregistered_dataset_ids()
    return len(unregistered) == 0
|
|
1406
|
+
|
|
1407
|
+
def set_status(self, status: ProjectRegistryStatus) -> None:
    """Set the project status to the given value and log the change."""
    # Mutate the model first so the log reflects the committed state.
    self.model.status = status
    logger.info("Set project_id=%s status=%s", self.config_id, status)
|
|
1411
|
+
|
|
1412
|
+
def set_dataset_status(self, dataset_id: str, status: DatasetRegistryStatus):
    """Set the dataset status to the given value.

    Raises
    ------
    DSGInvalidField
        Raised if dataset_id is not stored. (Docstring fix: get_dataset raises
        DSGInvalidField, not ValueError as previously documented.)
    """
    dataset = self.get_dataset(dataset_id)
    dataset.status = status
    logger.info(
        "Set dataset_id=%s status=%s for project_id=%s",
        dataset_id,
        status,
        self._model.project_id,
    )
|
|
1428
|
+
|
|
1429
|
+
@property
def base_dimensions(self) -> dict:
    """Return the base dimensions.

    Returns
    -------
    dict
        dict of DimensionConfig keyed by ConfigKey

    """
    return self._base_dimensions
|
|
1440
|
+
|
|
1441
|
+
@property
def supplemental_dimensions(self) -> dict:
    """Return the supplemental dimensions.

    Returns
    -------
    dict
        dict of DimensionConfig keyed by ConfigKey

    """
    return self._supplemental_dimensions
|
|
1452
|
+
|
|
1453
|
+
|
|
1454
|
+
def load_subset_dimensions(filename: Path) -> tuple[set[str], dict[str, list[str]]]:
    """Load subset dimension membership from a CSV records file.

    Parameters
    ----------
    filename : Path
        CSV file with an 'id' index column; each remaining column is a subset dimension
        name whose non-null cells mark the member record IDs.

    Returns
    -------
    tuple[set[str], dict[str, list[str]]]
        All record IDs in the file and a mapping of subset dimension name to its member
        record IDs. (Docstring fix: the function returns a tuple, not just the mapping.)

    Raises
    ------
    DSGInvalidDimension
        Raised if the file contains no dimension columns.
    """
    df = pd.read_csv(filename, index_col="id")
    if len(df.columns) == 0:
        # Fixed typo: message previously read "must at least one".
        msg = "A subset dimension records file must have at least one dimension column."
        raise DSGInvalidDimension(msg)
    record_ids = set(df.index.values)
    # A record belongs to a subset dimension when its cell in that column is non-null.
    subset_by_dim_name = {x: df[x].dropna().index.to_list() for x in df.columns}
    return record_ids, subset_by_dim_name
|