dsgrid-toolkit 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dsgrid-toolkit might be problematic. Click here for more details.

Files changed (152) hide show
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,1457 @@
1
+ import itertools
2
+ import logging
3
+ from collections import defaultdict
4
+ from pathlib import Path
5
+ from typing import Annotated, Any, Generator, Iterable, Type
6
+
7
+ import pandas as pd
8
+ from pydantic import field_validator, model_validator, Field
9
+
10
+ from dsgrid.config.common import make_base_dimension_template
11
+ from dsgrid.config.dataset_config import DatasetConfig
12
+ from dsgrid.config.dimension_config import (
13
+ DimensionBaseConfig,
14
+ DimensionBaseConfigWithFiles,
15
+ )
16
+ from dsgrid.config.mapping_tables import MappingTableConfig
17
+ from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
18
+ from dsgrid.data_models import DSGBaseModel, DSGBaseDatabaseModel, make_model_config
19
+ from dsgrid.dimension.base_models import (
20
+ check_required_dimensions,
21
+ check_timezone_in_geography,
22
+ DimensionCategory,
23
+ DimensionType,
24
+ )
25
+ from dsgrid.dimension.time import TimeDimensionType
26
+ from dsgrid.exceptions import (
27
+ DSGInvalidDataset,
28
+ DSGInvalidField,
29
+ DSGInvalidDimension,
30
+ DSGInvalidOperation,
31
+ DSGInvalidParameter,
32
+ DSGValueNotRegistered,
33
+ )
34
+ from dsgrid.registry.common import (
35
+ ConfigKey,
36
+ ProjectRegistryStatus,
37
+ DatasetRegistryStatus,
38
+ check_config_id_strict,
39
+ )
40
+ from dsgrid.spark.types import (
41
+ DataFrame,
42
+ )
43
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
44
+ from dsgrid.utils.spark import (
45
+ cross_join_dfs,
46
+ create_dataframe_from_product,
47
+ )
48
+ from dsgrid.utils.timing import timer_stats_collector, track_timing
49
+ from dsgrid.utils.utilities import check_uniqueness
50
+ from dsgrid.config.config_base import ConfigBase
51
+ from dsgrid.config.dataset_config import InputDatasetType
52
+ from dsgrid.config.supplemental_dimension import SupplementalDimensionModel
53
+ from dsgrid.config.dimension_mapping_base import DimensionMappingReferenceModel
54
+ from dsgrid.config.dimensions import (
55
+ DimensionsListModel,
56
+ DimensionReferenceModel,
57
+ DimensionModel,
58
+ )
59
+ from dsgrid.dimension.time import (
60
+ TimeBasedDataAdjustmentModel,
61
+ DaylightSavingSpringForwardType,
62
+ DaylightSavingFallBackType,
63
+ )
64
+
65
+
66
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
67
+
68
+
69
class SubsetDimensionSelectorModel(DSGBaseModel):
    """Defines a subset dimension selector inclusive of the subset's records and information
    required to define the selector as a record within the supplemental dimension defined by the
    subset dimension group.
    """

    # Selector name; SubsetDimensionGroupModel.load_records uses it as the key to look up the
    # selector's records.
    name: str
    description: str
    # Column name -> value pairs. SubsetDimensionGroupModel.check_selectors requires every
    # selector in a group to define the same set of column names.
    column_values: dict[str, str] = Field(
        title="column_values",
        description="Optional columns to populate in the subset dimension group's supplemental "
        "dimension records table. For example, if each selector in the group defines the end "
        "uses for one sector (e.g., commercial_end_uses, transportation_end_uses), the "
        "supplemental dimension records table needs to define the 'fuel_id' and 'unit' fields of "
        "the EnergyEndUse data model.",
        default={},
    )
    # Assigned by SubsetDimensionGroupModel.load_records; marked dsgrid_internal in the schema.
    records: list[str] = Field(
        title="records",
        description="Table of values populated by reading the parent subset dimension records "
        "file. Should not be populated by the user.",
        default=[],
        json_schema_extra={
            "dsgrid_internal": True,
        },
    )
95
+
96
+
97
class SubsetDimensionGroupModel(DSGBaseModel):
    """Defines one or more subset dimension selectors for a dimension type.

    When ``filename`` is set, the ``load_records`` validator reads the records file, assigns
    each selector its records, and then clears ``filename``.
    """

    name: str
    description: str
    dimension_type: DimensionType = Field(
        title="dimension_type",
        alias="type",
        description="Type of the dimension",
        json_schema_extra={
            "options": DimensionType.format_for_docs(),
        },
    )
    filename: str | None = Field(
        default=None,
        title="filename",
        alias="file",
        description="Filename containing dimension records. Only populated for initial "
        "registration. Each selector's records are stored as JSON objects in the dsgrid registry.",
    )
    selectors: list[SubsetDimensionSelectorModel] = Field(
        title="selectors",
        description="Dimension selectors",
    )
    selector_references: list[DimensionReferenceModel] = Field(
        # The title must match the field name; it previously duplicated "selectors".
        title="selector_references",
        description="References to the subset dimensions generated by dsgrid during registration.",
        default=[],
    )
    create_supplemental_dimension: bool = Field(
        title="create_supplemental_dimension",
        description="Auto-generate supplemental dimensions in order to allow aggregations on "
        "the subsets.",
        default=True,
    )
    base_dimension_name: str | None = Field(
        default=None,
        title="base_dimension_name",
        description="Name of base dimension for the supplemental dimension mapping, if "
        "create_supplemental_dimension is true. Required if there are multiple base dimensions "
        "for this type.",
    )
    # Record IDs read from the records file; extended by load_records.
    record_ids: set[str] = set()

    @field_validator("selectors")
    @classmethod
    def check_selectors(cls, selectors):
        """Check that the selectors are defined consistently.

        Raises
        ------
        ValueError
            Raised if any selector defines a different set of column_values keys than the
            first selector.
        """
        if len(selectors) > 1:
            first = sorted(selectors[0].column_values.keys())
            for selector in selectors[1:]:
                columns = sorted(selector.column_values.keys())
                if columns != first:
                    msg = f"All selectors must define the same columns: {first=} {columns=}"
                    raise ValueError(msg)

        return selectors

    @model_validator(mode="after")
    def load_records(self) -> "SubsetDimensionGroupModel":
        """Load the records for each subset dimension selector.

        Does nothing if ``filename`` is None.

        Raises
        ------
        ValueError
            Raised if the selector names do not exactly match the names returned from the
            records file.
        """
        if self.filename is None:
            return self

        # NOTE(review): mappings is indexed by selector name below; presumably a dict of
        # selector name -> list of record IDs. Confirm against load_subset_dimensions.
        record_ids, mappings = load_subset_dimensions(Path(self.filename))
        self.record_ids.update(record_ids)
        selector_names = check_uniqueness(
            [x.name for x in self.selectors], "subset dimension selector"
        )

        # Any name present on only one side is an error.
        diff = selector_names.symmetric_difference(mappings)
        if diff:
            msg = f"subset dimension {self.name} selectors have a mismatch with the records file column names: {diff}"
            raise ValueError(msg)

        for dim in self.selectors:
            dim.records = mappings[dim.name]

        # The records now live on the selectors, so the file reference is dropped.
        self.filename = None
        return self
177
+
178
+
179
class SubsetDimensionGroupListModel(DSGBaseModel):
    """Defines a list of subset dimensions."""

    # Field(min_length=1) in the Annotated metadata requires at least one group in the list.
    subset_dimensions: Annotated[list[SubsetDimensionGroupModel], Field(min_length=1)] = Field(
        description="List of subset dimensions to be registered",
    )
185
+
186
+
187
class DimensionsModel(DSGBaseModel):
    """Contains dimensions defined by a project"""

    base_dimensions: DimensionsListModel = Field(
        title="base_dimensions",
        description="List of dimensions for a project's base dimensions. They will be "
        "automatically registered during project registration and then converted to "
        "base_dimension_references.",
        default=[],
    )
    base_dimension_references: list[DimensionReferenceModel] = Field(
        # The title must match the field name; it previously duplicated "base_dimensions".
        title="base_dimension_references",
        description="List of registry references (``DimensionReferenceModel``) for a project's "
        "base dimensions.",
        default=[],
    )
    subset_dimensions: list[SubsetDimensionGroupModel] = Field(
        title="subset_dimensions",
        description="List of subset dimension groups. "
        "Subset dimension groups are used to specify subsets of base dimension records that a "
        "dataset must support, dimensionality of derived datasets, and query filters. "
        "Subset dimension groups also define a new supplemental dimension whose records "
        "correspond to the table columns/subset selectors, such that defining a subset "
        "dimension group can be a convenient way to define reporting at a different level of "
        "aggregation as compared to the project's base dimensions.",
        default=[],
    )
    supplemental_dimensions: list[SupplementalDimensionModel] = Field(
        title="supplemental_dimensions",
        description="List of supplemental dimensions. They will be automatically registered "
        "during project registration and then converted to supplemental_dimension_references. "
        "Supplemental dimensions are used to support additional querying and transformations "
        "(e.g., aggregations, disaggregations, filtering, scaling, etc.) of the project's "
        "base data.",
        default=[],
    )
    supplemental_dimension_references: list[DimensionReferenceModel] = Field(
        title="supplemental_dimension_references",
        description="List of registry references for a project's supplemental dimensions. "
        "Dimensions references of the same :class:`dsgrid.dimension.base_models.DimensionType` "
        "are allowed for supplemental dimension references (i.e., multiple `Geography` types "
        "are allowed).",
        default=[],
    )

    @model_validator(mode="after")
    def check_dimensions(self) -> "DimensionsModel":
        """Validate that the dimensions are complete and consistent."""
        # Base dimensions may be given inline, as registry references, or both.
        dimensions = itertools.chain(self.base_dimensions, self.base_dimension_references)
        check_required_dimensions(dimensions, "project base dimensions")
        return self

    @model_validator(mode="before")
    @classmethod
    def pre_check_values(cls, values: Any) -> Any:
        """Checks that base dimensions are defined.

        Raises
        ------
        ValueError
            Raised if neither base_dimensions nor base_dimension_references is populated.
        """
        # A "before" validator receives the raw input, which is not guaranteed to be a dict
        # (e.g., an existing model instance). Only dict input can be inspected here.
        if not isinstance(values, dict):
            return values

        if not values.get("base_dimensions", []) and not values.get(
            "base_dimension_references", []
        ):
            msg = "Either base_dimensions or base_dimension_references must be defined"
            raise ValueError(msg)

        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_files(cls, values: list) -> list:
        """Validate dimension files are unique across all dimensions"""
        # Only DimensionModel entries with a filename set take part in the check.
        check_uniqueness(
            (
                x.filename
                for x in values
                if isinstance(x, DimensionModel) and x.filename is not None
            ),
            "dimension record filename",
        )
        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_names(cls, values: list) -> list:
        """Validate dimension names are unique across all dimensions."""
        check_uniqueness(
            [dim.name for dim in values],
            "dimension record name",
        )
        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_time_zone(cls, values: list) -> list:
        """Validate the time zone column in geography records."""
        for dimension in values:
            if dimension.dimension_type == DimensionType.GEOGRAPHY:
                check_timezone_in_geography(
                    dimension,
                    err_msg="Project geography dimension records must include a time_zone column",
                )
        return values

    @field_validator("subset_dimensions")
    @classmethod
    def check_subset_dimensions(cls, subset_dimensions):
        """Check that each subset dimension has a unique name."""
        check_uniqueness([x.name for x in subset_dimensions], "subset dimensions name")
        return subset_dimensions

    @model_validator(mode="after")
    def check_dimension_names(self) -> "DimensionsModel":
        """Check that all dimension query names are unique.

        Names are collected from base dimensions, supplemental dimensions, subset dimension
        groups, and the selectors within each group.

        Raises
        ------
        ValueError
            Raised if any name occurs more than once.
        """
        names: set[str] = set()

        def add_name(name):
            if name in names:
                msg = f"dimension_name={name} is not unique in the project"
                raise ValueError(msg)
            names.add(name)

        for dim in self.base_dimensions:
            add_name(dim.name)
        for dim in self.supplemental_dimensions:
            add_name(dim.name)
        for group in self.subset_dimensions:
            add_name(group.name)
            for selector in group.selectors:
                add_name(selector.name)

        return self
315
+
316
+
317
class RequiredSubsetDimensionRecordsModel(DSGBaseModel):
    """Names a subset dimension and the selectors within it that are required."""

    name: str = Field(description="Name of a subset dimension")
    selectors: list[str] = Field(description="One or more selectors in the subset dimension")
320
+
321
+
322
class RequiredSupplementalDimensionRecordsModel(DSGBaseModel):
    """Names a supplemental dimension and the record IDs within it that are required."""

    name: str = Field(description="Name of a supplemental dimension")
    record_ids: list[str] = Field(
        description="One or more record IDs in the supplemental dimension"
    )
327
+
328
+
329
class RequiredBaseDimensionModel(DSGBaseModel):
    """Lists record IDs in a base dimension, optionally naming which base dimension."""

    # Empty by default. RequiredDimensionsModel.check_for_duplicates assigns ["__all__"] for
    # dimension types that have no explicit requirement.
    record_ids: list[str] = []
    dimension_name: str | None = Field(
        default=None,
        description="Identifies which base dimension contains the record IDs. Required if there "
        "is more than one base dimension for a given dimension type.",
    )
336
+
337
+
338
class RequiredDimensionRecordsByTypeModel(DSGBaseModel):
    """Defines the required records for one dimension type.

    Records can be listed directly (``base``), listed by exclusion (``base_missing``), or
    referenced through subset dimension selectors (``subset``). ``base`` and ``base_missing``
    cannot both contain record IDs.
    """

    base: RequiredBaseDimensionModel = RequiredBaseDimensionModel()
    base_missing: RequiredBaseDimensionModel = RequiredBaseDimensionModel()
    subset: list[RequiredSubsetDimensionRecordsModel] = []

    @model_validator(mode="before")
    @classmethod
    def handle_legacy_format(cls, values: Any) -> Any:
        """Convert legacy input into the current format.

        1. base and base_missing used to be list[str] because we used to allow a single base
           dimension.
        2. We used to allow supplemental dimension requirements.

        This allows backwards compatibility with old files and databases.
        This can be removed once we've updated existing dsgrid project repositories.
        """
        # A "before" validator receives the raw input, which is not guaranteed to be a dict
        # (e.g., an existing model instance). Only dict input can hold the legacy layout.
        if not isinstance(values, dict):
            return values

        # Work on a shallow copy so that the caller's dict is not modified.
        values = dict(values)
        for field in ("base", "base_missing"):
            if isinstance(values.get(field), list):
                values[field] = {"record_ids": values[field]}

        values.pop("supplemental", None)
        return values

    @model_validator(mode="after")
    def check_base(self) -> "RequiredDimensionRecordsByTypeModel":
        """Check that base and base_missing do not both contain record IDs.

        Raises
        ------
        ValueError
            Raised if both base.record_ids and base_missing.record_ids are non-empty.
        """
        if self.base.record_ids and self.base_missing.record_ids:
            msg = f"base and base_missing cannot both contain record_ids: {self.base=} {self.base_missing=}"
            raise ValueError(msg)
        return self

    def defines_dimension_requirement(self) -> bool:
        """Returns True if the model defines a dimension requirement."""
        return (
            bool(self.base.record_ids) or bool(self.base_missing.record_ids) or bool(self.subset)
        )
370
+
371
+
372
class RequiredDimensionRecordsModel(DSGBaseModel):
    """Holds one RequiredDimensionRecordsByTypeModel per dimension type, excluding time."""

    # This is here because Pydantic doesn't like fields that start with 'model_'
    model_config = make_model_config(protected_namespaces=())

    # time is excluded
    # RequiredDimensionsModel.check_for_duplicates looks these fields up by DimensionType value,
    # so the field names need to match those values.
    geography: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    metric: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    model_year: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    scenario: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    sector: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    subsector: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    weather_year: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
384
+
385
+
386
class RequiredDimensionsModel(DSGBaseModel):
    """Defines required record IDs that must exist for each dimension in a dataset.
    Record IDs can reside in the project's base or subset dimensions.

    Requirements can be specified for a single dimension or a combination of dimensions.
    For example, if a project includes commercial, residential, and transportation sectors but the
    dataset has only transportation sector records, it should specify a single_dimensional
    requirement that is a subset of the project's base dimension.
    `{"single_dimensional": {"sector": {"base": {"record_ids": ["transportation"]}}}}`.

    If a dataset's requirements span multiple dimensions, such as if it does not have some
    metric records for some geography records, then a multi_dimensional requirement should be
    specified. (By default, a full cross join is assumed to be present.)
    `{"multi_dimensional": [
        {
          "geography": {"base": {"record_ids": ["12345"]}},
          "metric": {"base": {"record_ids": ["electricity_cooling"]}}
        }
      ]
    }`

    If a dataset specifies a dimension type within a multi_dimensional section and wants to use
    all records from a project base dimension, it can specify `base.record_ids = ["__all__"]`
    as a shorthand notation.

    Requirements for a dimension cannot be defined in both single_dimensional and multi_dimensional
    sections.

    If no records are listed for a dimension then all project base records are required.

    It might be easier for a dataset to specify what it does not have rather than what it does have.
    In that case, it is recommended to use the RequiredDimensionRecordsByTypeModel.base_missing
    field. dsgrid will compute the difference of the base dimension records and the base_missing
    records to determine the dataset's required records.

    If a project has multiple base dimensions of the same type, the
    RequiredBaseDimensionModel.dimension_name must be specified to identify the base
    dimension that contains the record IDs.

    If a dataset contains a subset of project base dimension records that are defined in the
    project's subset dimensions, it is recommended to use that specification. dsgrid will
    substitute base records for mapped subset records at runtime.
    """

    single_dimensional: RequiredDimensionRecordsModel = Field(
        description="Required records for a single dimension.",
        default=RequiredDimensionRecordsModel(),
    )
    multi_dimensional: list[RequiredDimensionRecordsModel] = Field(
        description="Required records for a combination of dimensions. For example, there may be "
        "a dataset requirement for only one subsector for a given sector instead of a cross "
        "product.",
        default=[],
    )

    @model_validator(mode="after")
    def check_for_duplicates(self) -> "RequiredDimensionsModel":
        """
        1. Ensure that the same dimension does not have requirements in both single and multi
           dimensional sections.
        2. Ensure that each multi_dimensional item covers at least two dimensions and that any
           two items either cover the same set of dimensions or disjoint sets.
        3. Set any dimensions that do not have specifications to require all base dimension
           records (as long as there is only one project base dimension).

        Raises
        ------
        ValueError
            Raised if check 1 or 2 fails.
        """
        single_dimensional: set[str] = set()
        multi_dimensional: set[str] = set()

        for field in RequiredDimensionRecordsModel.model_fields:
            req = getattr(self.single_dimensional, field)
            if req.defines_dimension_requirement():
                single_dimensional.add(field)

        dim_combos: set[tuple[str, ...]] = set()
        for item in self.multi_dimensional:
            dims = []
            for field in RequiredDimensionRecordsModel.model_fields:
                req = getattr(item, field)
                if req.defines_dimension_requirement():
                    if field in single_dimensional:
                        msg = (
                            "dimensions cannot be defined in both single_dimensional and "
                            f"multi_dimensional sections: {field}"
                        )
                        raise ValueError(msg)
                    dims.append(field)
                    multi_dimensional.add(field)

            if len(dims) < 2:
                msg = (
                    "A multi_dimensional dimension requirement must contain at least two "
                    f"dimensions: {item}"
                )
                raise ValueError(msg)

            # Sorted tuple so that the same set of dimensions always compares equal.
            dim_combo = tuple(sorted(dims))
            if dim_combo not in dim_combos:
                # A new combination must not partially overlap one that was already seen.
                for other in dim_combos:
                    if set(dim_combo).intersection(other):
                        msg = (
                            "All descriptors in the multi-dimensional requirements with an "
                            "intersection of dimensions must have a full intersection. "
                            f"dimension_set1 = {other} dimension_set2 = {dim_combo}"
                        )
                        raise ValueError(msg)
                dim_combos.add(dim_combo)

        not_covered = {x.value for x in DimensionType} - multi_dimensional - single_dimensional
        for field in not_covered:
            # RequiredDimensionRecordsModel has no time field, so time is skipped.
            if field != DimensionType.TIME.value:
                getattr(self.single_dimensional, field).base.record_ids = ["__all__"]
        return self
496
+
497
+
498
class DatasetBaseDimensionNamesModel(DSGBaseModel):
    """Defines the query names for project base dimensions to which datasets will be mapped.
    This is important for cases where a project has multiple base dimensions of the same type.
    """

    # This is here because Pydantic doesn't like fields that start with 'model_'
    model_config = make_model_config(protected_namespaces=())

    # One optional name per dimension type; every field defaults to None.
    geography: str | None = None
    metric: str | None = None
    model_year: str | None = None
    scenario: str | None = None
    sector: str | None = None
    subsector: str | None = None
    time: str | None = None
    weather_year: str | None = None
514
+
515
+
516
class InputDatasetModel(DSGBaseModel):
    """Describes one input dataset as it is listed in a project config."""

    dataset_id: str = Field(
        title="dataset_id",
        json_schema_extra={
            "updateable": False,
        },
        description="Unique dataset identifier.",
    )
    dataset_type: InputDatasetType = Field(
        title="dataset_type",
        json_schema_extra={
            "options": InputDatasetType.format_for_docs(),
            "updateable": False,
        },
        description="Dataset type.",
    )
    version: str | None = Field(
        default=None,
        title="version",
        description="Version of the registered dataset. "
        "The version specification is optional. If no version is supplied, then the latest "
        "version in the registry is assumed. "
        "The version string must be in semver format (e.g., '1.0.0') and it must be a "
        "valid/existing version in the registry.",
    )
    required_dimensions: RequiredDimensionsModel = Field(
        default=RequiredDimensionsModel(),
        title="required_dimensions",
        description="Defines required record IDs that must exist for each dimension.",
    )
    mapping_references: list[DimensionMappingReferenceModel] = Field(
        default=[],
        title="mapping_references",
        description="Defines how to map the dataset dimensions to the project. "
        "Auto-populated during submission.",
    )
    base_dimension_names: DatasetBaseDimensionNamesModel = Field(
        default=DatasetBaseDimensionNamesModel(),
        title="base_dimension_names",
        description="Defines the project base dimensions to which the dataset will map itself. "
        "Auto-populated during submission.",
    )
    status: DatasetRegistryStatus = Field(
        default=DatasetRegistryStatus.UNREGISTERED,
        title="status",
        description="Registration status of the dataset, added by dsgrid.",
        json_schema_extra={
            "dsgrid_internal": True,
            "notes": ("status is "),
            "updateable": False,
        },
    )
    wrap_time_allowed: bool = Field(
        default=False,
        title="wrap_time_allowed",
        description="Whether to allow dataset time to be wrapped to project time if different",
    )
    time_based_data_adjustment: TimeBasedDataAdjustmentModel = Field(
        default=TimeBasedDataAdjustmentModel(),
        title="time_based_data_adjustment",
        description="Defines how the rest of the dataframe is adjusted with respect to time. "
        "E.g., when drop associated data when dropping a leap day timestamp.",
    )

    @field_validator("time_based_data_adjustment")
    @classmethod
    def check_data_adjustment(cls, time_based_data_adjustment):
        """Require the spring-forward and fall-back adjustments to be set, or unset, together.

        Raises
        ------
        ValueError
            Raised if exactly one of the two daylight saving settings is NONE.
        """
        dst_adjustment = time_based_data_adjustment.daylight_saving_adjustment
        fall_back_unset = dst_adjustment.fall_back_hour == DaylightSavingFallBackType.NONE
        spring_forward_unset = (
            dst_adjustment.spring_forward_hour == DaylightSavingSpringForwardType.NONE
        )
        if fall_back_unset != spring_forward_unset:
            msg = f"mismatch between spring_forward_hour and fall_back_hour, {time_based_data_adjustment=}."
            raise ValueError(msg)
        return time_based_data_adjustment

    # TODO: write validation that if daylight_saving_adjustment is specified, dataset time config must be IndexTimeDimensionConfig
596
+
597
+
598
class DimensionMappingsModel(DSGBaseModel):
    """Holds the dimension mapping references attached to a dsgrid project.

    Two groups are stored: base-to-supplemental mapping references, and
    dataset-to-project mapping references keyed by a string.
    """

    base_to_supplemental_references: list[DimensionMappingReferenceModel] = Field(
        default=[],
        title="base_to_supplemental_references",
        description=(
            "Base dimension to supplemental dimension mappings (e.g., county-to-state) "
            "used to support various queries and dimension transformations."
        ),
    )
    dataset_to_project: dict[str, list[DimensionMappingReferenceModel]] = Field(
        default={},
        title="dataset_to_project",
        description=(
            "Dataset-to-project mappings map dataset dimensions to project dimensions. "
            "Once a dataset is submitted to a project, dsgrid adds the "
            "dataset-to-project mappings to the project config. "
            "Some projects may not have any dataset-to-project mappings. "
            "Dataset-to-project  mappings are only supplied if a dataset's dimensions "
            "do not match the project's dimension."
        ),
        # TODO: need to document missing dimension records, fill values, etc. DSGRID-191.
    )
+
621
+
622
class ProjectConfigModel(DSGBaseDatabaseModel):
    """Represents project configurations.

    The project_id field is validated by check_project_id_handle, which rejects
    IDs containing "-" and then delegates to check_config_id_strict.
    """

    project_id: str = Field(
        title="project_id",
        # The example must satisfy check_project_id_handle, which rejects "-".
        description="A unique project identifier that is project-specific (e.g., "
        "'standard_scenarios_2021').",
    )
    name: str = Field(
        title="name",
        description="A project name to accompany the ID.",
    )
    description: str = Field(
        title="description",
        description="Detailed project description.",
    )
    status: ProjectRegistryStatus = Field(
        title="status",
        description="project registry status",
        default=ProjectRegistryStatus.INITIAL_REGISTRATION,
        json_schema_extra={
            "dsgrid_internal": True,
            "updateable": False,
        },
    )
    datasets: list[InputDatasetModel] = Field(
        title="datasets",
        description="List of input datasets for the project.",
    )
    dimensions: DimensionsModel = Field(
        title="dimensions",
        description="List of `base` and `supplemental` dimensions.",
    )
    dimension_mappings: DimensionMappingsModel = Field(
        title="dimension_mappings",
        description="List of project mappings. Initialized with base-to-base and"
        " base-to-supplemental mappings. dataset-to-project mappings are added by dsgrid as"
        " datasets get registered with the project.",
        default=DimensionMappingsModel(),
    )

    @field_validator("project_id")
    @classmethod
    def check_project_id_handle(cls, project_id):
        """Check for valid characters in project id.

        Raises
        ------
        ValueError
            Raised if project_id contains "-".
        """
        if "-" in project_id:
            msg = 'invalid character "-" in project id'
            raise ValueError(msg)

        check_config_id_strict(project_id, "Project")
        return project_id
+
674
+
675
def make_unvalidated_project_config(
    project_id: str,
    dataset_ids: Iterable[str],
    metric_types: Iterable[str],
    name: str | None = None,
    description: str | None = None,
    time_type: TimeDimensionType = TimeDimensionType.DATETIME,
) -> dict[str, Any]:
    """Build a project config dictionary without running model validation.

    A missing name or description becomes an empty string. Base dimensions come
    from make_base_dimension_template. Each dataset ID gets a placeholder entry
    with empty dataset_type, version and required_dimensions.
    """
    config: dict[str, Any] = {
        "project_id": project_id,
        "name": name if name else "",
        "description": description if description else "",
    }
    config["dimensions"] = {
        "base_dimensions": make_base_dimension_template(metric_types, time_type=time_type),
        "subset_dimensions": [],
        "supplemental_dimensions": [],
    }
    placeholders = []
    for dataset_id in dataset_ids:
        placeholders.append(
            {
                "dataset_id": dataset_id,
                "dataset_type": "",
                "version": "",
                "required_dimensions": {},
            }
        )
    config["datasets"] = placeholders
    return config
+
704
+
705
class DimensionsByCategoryModel(DSGBaseModel):
    """Groups dimension query names into base, subset and supplemental lists."""

    # Query names of base dimensions.
    base: list[str]
    # Query names of subset dimensions.
    subset: list[str]
    # Query names of supplemental dimensions.
    supplemental: list[str]
+
712
+
713
class ProjectDimensionNamesModel(DSGBaseModel):
    """Query names of the project's dimensions, one entry per dimension type.

    Each field holds a DimensionsByCategoryModel for that dimension type.
    """

    # Pydantic objects to field names starting with 'model_' (model_year below),
    # so the protected namespaces are cleared.
    model_config = make_model_config(protected_namespaces=())

    geography: DimensionsByCategoryModel
    metric: DimensionsByCategoryModel
    model_year: DimensionsByCategoryModel
    scenario: DimensionsByCategoryModel
    sector: DimensionsByCategoryModel
    subsector: DimensionsByCategoryModel
    time: DimensionsByCategoryModel
    weather_year: DimensionsByCategoryModel
+
728
+
729
+ class ProjectConfig(ConfigBase):
730
+ """Provides an interface to a ProjectConfigModel."""
731
+
732
def __init__(self, model: ProjectConfigModel):
    """Wrap the model and create empty dimension and mapping containers.

    The containers are filled by set_dimensions and set_dimension_mappings.
    """
    super().__init__(model)
    # Lookup of every dimension by its name.
    self._dimensions_by_name: dict[str, DimensionBaseConfig] = {}
    # Dimensions per category, keyed by ConfigKey.
    self._base_dimensions: dict[ConfigKey, DimensionBaseConfig] = {}
    self._supplemental_dimensions: dict[ConfigKey, DimensionBaseConfig] = {}
    # Subset dimensions: dimension type -> group name -> ConfigKey -> selector config.
    self._subset_dimensions: dict[
        DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfigWithFiles]]
    ] = {}
    self._base_to_supplemental_mappings: dict[ConfigKey, MappingTableConfig] = {}
+
742
@staticmethod
def model_class() -> Type:
    """Return the model class wrapped by this config: ProjectConfigModel."""
    return ProjectConfigModel
+
746
@staticmethod
def config_filename() -> str:
    """Return the file name used for a project config."""
    filename = "project.json5"
    return filename
+
750
def get_base_dimension(
    self, dimension_type: DimensionType, dimension_name: str | None = None
) -> DimensionBaseConfig:
    """Look up a base dimension by type and, optionally, by name.

    Without a dimension_name the lookup is delegated to
    _get_single_base_dimension, which requires exactly one base dimension of
    the given type.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no base dimension has both the given type and name.

    See also
    --------
    list_base_dimensions
    """
    if dimension_name is None:
        return self._get_single_base_dimension(dimension_type)

    found = next(
        (
            candidate
            for candidate in self._iter_base_dimensions()
            if candidate.model.dimension_type == dimension_type
            and candidate.model.name == dimension_name
        ),
        None,
    )
    if found is None:
        msg = f"Did not find a dimension of {dimension_type=} with {dimension_name=}"
        raise DSGValueNotRegistered(msg)
    return found
+
769
def get_base_time_dimension(self) -> TimeDimensionBaseConfig:
    """Return the project's single base dimension of type TIME."""
    time_dim = self._get_single_base_dimension(DimensionType.TIME)
    # Narrow the type for callers; the lookup itself returns DimensionBaseConfig.
    assert isinstance(time_dim, TimeDimensionBaseConfig)
    return time_dim
+
775
def _get_single_base_dimension(self, dimension_type: DimensionType) -> DimensionBaseConfig:
    """Return the only base dimension of the given type.

    Raises
    ------
    DSGValueNotRegistered
        Raised if there is no base dimension of that type.
    DSGInvalidDimension
        Raised if there is more than one base dimension of that type.
    """
    matches = []
    for candidate in self._iter_base_dimensions():
        if candidate.model.dimension_type == dimension_type:
            matches.append(candidate)

    if len(matches) == 1:
        return matches[0]

    if not matches:
        msg = f"base dimension {dimension_type=} not found"
        raise DSGValueNotRegistered(msg)

    names = " ".join(match.model.name for match in matches)
    msg = (
        f"Found multiple base dimensions for {dimension_type=}: {names}. "
        "Call get_base_dimension() with a specific name."
    )
    raise DSGInvalidDimension(msg)
+
793
def get_base_dimension_and_version(
    self, dimension_type: DimensionType, dimension_name: str | None = None
) -> tuple[DimensionBaseConfig, str]:
    """Return the matching base dimension together with the version of its key.

    A base dimension matches when its type equals dimension_type and, if
    dimension_name is given, its name equals dimension_name.

    Raises
    ------
    DSGInvalidOperation
        Raised if more than one base dimension matches.
    DSGValueNotRegistered
        Raised if no base dimension matches.
    """
    found: tuple[DimensionBaseConfig, str] | None = None
    for config_key, candidate in self.base_dimensions.items():
        if candidate.model.dimension_type != dimension_type:
            continue
        if dimension_name is not None and candidate.model.name != dimension_name:
            continue
        if found is not None:
            msg = (
                f"Found multiple base dimensions for {dimension_type=}. "
                "You must specify a dimension query name to remove ambiguity."
            )
            raise DSGInvalidOperation(msg)
        found = (candidate, config_key.version)

    if found is None:
        msg = f"Did not find a dimension with {dimension_type=} {dimension_name=}"
        raise DSGValueNotRegistered(msg)
    return found
+
814
def get_dimension(self, name: str) -> DimensionBaseConfig:
    """Look up a dimension of any category by its name.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no dimension is stored under name.
    """
    if (found := self._dimensions_by_name.get(name)) is not None:
        return found
    msg = f"dimension_name={name} is not stored"
    raise DSGValueNotRegistered(msg)
+
822
def get_time_dimension(self, name: str) -> TimeDimensionBaseConfig:
    """Look up a dimension by name and require it to be a time dimension.

    Raises
    ------
    DSGInvalidParameter
        Raised if the dimension is not a TimeDimensionBaseConfig.
    """
    dim = self.get_dimension(name)
    if isinstance(dim, TimeDimensionBaseConfig):
        return dim
    msg = f"{dim.model.label} is not a time dimension"
    raise DSGInvalidParameter(msg)
+
830
def get_dimension_by_name(self, name: str) -> DimensionBaseConfig:
    """Return the first base dimension whose name equals name.

    Only base dimensions are searched.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no base dimension has that name.
    """
    found = next(
        (candidate for candidate in self._iter_base_dimensions() if candidate.model.name == name),
        None,
    )
    if found is None:
        msg = f"No base dimension with {name=} is stored."
        raise DSGValueNotRegistered(msg)
    return found
+
839
def get_dimension_with_records(self, name: str) -> DimensionBaseConfigWithFiles:
    """Look up a dimension by name and require it to carry records.

    Raises
    ------
    DSGInvalidDimension
        Raised if no dimension is stored under name.
    DSGInvalidParameter
        Raised if the dimension is not a DimensionBaseConfigWithFiles.
    """
    candidate = self._dimensions_by_name.get(name)
    if candidate is None:
        msg = f"{name=} is not stored"
        raise DSGInvalidDimension(msg)
    if isinstance(candidate, DimensionBaseConfigWithFiles):
        return candidate
    msg = f"{candidate.model.label} does not have records"
    raise DSGInvalidParameter(msg)
+
850
def get_dimension_records(self, name: str) -> DataFrame:
    """Return the records DataFrame of the dimension stored under name."""
    dim = self.get_dimension_with_records(name)
    return dim.get_records_dataframe()
+
854
def get_dimension_record_ids(self, name: str) -> set[str]:
    """Return the unique record IDs of the dimension stored under name."""
    dim = self.get_dimension_with_records(name)
    return dim.get_unique_ids()
+
858
def get_dimension_reference(self, dimension_id: str) -> DimensionReferenceModel:
    """Find the dimension reference with the given dimension_id.

    Base dimension references are searched first, then supplemental ones.

    Raises
    ------
    DSGInvalidDimension
        Raised if neither list contains the ID.
    """
    dimensions = self.model.dimensions
    reference_lists = (
        dimensions.base_dimension_references,
        dimensions.supplemental_dimension_references,
    )
    for references in reference_lists:
        for reference in references:
            if reference.dimension_id == dimension_id:
                return reference

    msg = f"{dimension_id} is not stored"
    raise DSGInvalidDimension(msg)
+
870
def list_base_dimensions(
    self, dimension_type: DimensionType | None = None
) -> list[DimensionBaseConfig]:
    """List the base dimensions, restricted to dimension_type when one is given.

    See also
    --------
    get_base_dimension
    """
    candidates = self._iter_base_dimensions()
    if dimension_type is not None:
        candidates = (dim for dim in candidates if dim.model.dimension_type == dimension_type)
    return list(candidates)
+
885
def list_base_dimensions_with_records(
    self, dimension_type: DimensionType
) -> list[DimensionBaseConfigWithFiles]:
    """List the base dimensions of dimension_type that are DimensionBaseConfigWithFiles.

    See also
    --------
    get_base_dimension
    """
    matches = []
    for candidate in self._iter_base_dimensions():
        if candidate.model.dimension_type != dimension_type:
            continue
        if isinstance(candidate, DimensionBaseConfigWithFiles):
            matches.append(candidate)
    return matches
+
901
def list_supplemental_dimensions(
    self, dimension_type: DimensionType, sort_by=None
) -> list[DimensionBaseConfigWithFiles]:
    """List the supplemental dimensions of the given type; may be empty.

    Parameters
    ----------
    dimension_type : DimensionType
    sort_by : str | None
        Name of an attribute on each dimension's model. If set, the result is
        sorted by that attribute.
    """
    matches = [
        candidate
        for candidate in self.supplemental_dimensions.values()
        if candidate.model.dimension_type == dimension_type
    ]
    if sort_by is None:
        return matches
    return sorted(matches, key=lambda candidate: getattr(candidate.model, sort_by))
+
921
def get_matching_subset_dimension(
    self, dimension_type: DimensionType, unique_data_records: set[str]
) -> DimensionReferenceModel | None:
    """Find a subset dimension selector whose record IDs equal unique_data_records.

    Only subset dimension groups of dimension_type are considered. Returns the
    first matching selector reference, or None when nothing matches.
    """
    for group in self.model.dimensions.subset_dimensions:
        if group.dimension_type != dimension_type:
            continue
        for ref in group.selector_references:
            selector_key = ConfigKey(ref.dimension_id, ref.version)
            selector = self._subset_dimensions[dimension_type][group.name][selector_key]
            selector_records = selector.get_unique_ids()
            # An empty symmetric difference means both sides hold the same IDs.
            if unique_data_records.symmetric_difference(selector_records):
                continue
            logger.info("Found matching subset dimension: %s", group.name)
            return ref
    return None
+
937
def get_base_to_supplemental_dimension_mappings_by_types(
    self, dimension_type: DimensionType
) -> list[MappingTableConfig]:
    """List base-to-supplemental mappings whose from-dimension has dimension_type."""
    selected = []
    for mapping in self._base_to_supplemental_mappings.values():
        if mapping.model.from_dimension.dimension_type == dimension_type:
            selected.append(mapping)
    return selected
+
947
def get_base_to_supplemental_config(
    self, base_dim: DimensionBaseConfigWithFiles, supp_dim: DimensionBaseConfigWithFiles
) -> MappingTableConfig:
    """Find the stored mapping config that goes from base_dim to supp_dim.

    Mappings are matched on the dimension IDs of their from- and to-dimensions.
    supp_dim is first checked with _check_not_base_dimension.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no stored mapping connects the two dimensions.
    """
    self._check_not_base_dimension(supp_dim)

    base_id = base_dim.model.dimension_id
    supp_id = supp_dim.model.dimension_id
    for candidate in self._base_to_supplemental_mappings.values():
        if candidate.model.from_dimension.dimension_id != base_id:
            continue
        if candidate.model.to_dimension.dimension_id == supp_id:
            return candidate

    msg = f"No mapping is stored for base = {base_dim.model.label}, supplemental = {supp_dim.model.label}"
    raise DSGValueNotRegistered(msg)
+
965
def get_base_to_supplemental_mapping_records(
    self, base_dim: DimensionBaseConfigWithFiles, supp_dim: DimensionBaseConfigWithFiles
) -> DataFrame:
    """Return the mapping records between base_dim and supp_dim.

    Rows whose to_id is NULL are filtered out.
    """
    mapping_config = self.get_base_to_supplemental_config(base_dim, supp_dim)
    records = mapping_config.get_records_dataframe()
    return records.filter("to_id is not NULL")
+
974
def has_base_to_supplemental_dimension_mapping_types(self, dimension_type) -> bool:
    """Report whether a base-to-supplemental mapping exists for dimension_type.

    The same dimension type is used for both the from- and to-side of the check.
    """
    stored_mappings = self._base_to_supplemental_mappings
    return self._has_mapping(dimension_type, dimension_type, stored_mappings)
+
982
def get_base_dimension_by_id(self, dimension_id: str) -> DimensionBaseConfig:
    """Find the base dimension with the given dimension_id.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no base dimension has that ID.
    """
    found = next(
        (
            candidate
            for candidate in self._iter_base_dimensions()
            if candidate.model.dimension_id == dimension_id
        ),
        None,
    )
    if found is None:
        msg = f"Did not find a base dimension with {dimension_id=}"
        raise DSGValueNotRegistered(msg)
    return found
+
990
def get_base_dimension_records_by_id(self, dimension_id: str) -> DataFrame:
    """Return the records DataFrame of the base dimension with dimension_id.

    Raises
    ------
    DSGInvalidParameter
        Raised if the dimension is not a DimensionBaseConfigWithFiles.
    """
    base_dim = self.get_base_dimension_by_id(dimension_id)
    if isinstance(base_dim, DimensionBaseConfigWithFiles):
        return base_dim.get_records_dataframe()
    msg = f"{base_dim.model.label} does not have records"
    raise DSGInvalidParameter(msg)
+
998
def _check_not_base_dimension(self, dim: DimensionBaseConfig) -> None:
    """Reject dim if its ID matches a base dimension of the same dimension type.

    Raises
    ------
    DSGInvalidParameter
        Raised if dim is one of the project's base dimensions.
    """
    same_type_base_dims = self.list_base_dimensions(dimension_type=dim.model.dimension_type)
    is_base = any(
        dim.model.dimension_id == base_dim.model.dimension_id
        for base_dim in same_type_base_dims
    )
    if is_base:
        msg = f"Cannot pass base dimension: {dim.model.label}"
        raise DSGInvalidParameter(msg)
+
1005
@staticmethod
def _has_mapping(
    from_dimension_type: DimensionType, to_dimension_type: DimensionType, mapping: dict
) -> bool:
    """Report whether any config in mapping connects the two dimension types.

    A config matches when its from-dimension has from_dimension_type and its
    to-dimension has to_dimension_type.
    """
    return any(
        config.model.from_dimension.dimension_type == from_dimension_type
        and config.model.to_dimension.dimension_type == to_dimension_type
        for config in mapping.values()
    )
+
1017
def list_dimension_names(self, category: DimensionCategory | None = None) -> list[str]:
    """List the project's dimension query names in sorted order.

    Parameters
    ----------
    category : DimensionCategory | None
        If set, only names of dimensions in that category are returned.

    Raises
    ------
    NotImplementedError
        Raised if category is not BASE, SUBSET or SUPPLEMENTAL.
    """
    if category is None:
        return sorted(self._dimensions_by_name.keys())

    if category == DimensionCategory.BASE:
        dims = self._iter_base_dimensions()
    elif category == DimensionCategory.SUBSET:
        dims = self._iter_subset_dimensions()
    elif category == DimensionCategory.SUPPLEMENTAL:
        dims = self._iter_supplemental_dimensions()
    else:
        msg = f"{category=}"
        raise NotImplementedError(msg)

    names = [dim.model.name for dim in dims]
    names.sort()
    return names
+
1041
def list_dimension_names_by_type(self, dimension_type: DimensionType) -> list[str]:
    """List the names of all project dimensions that have dimension_type."""
    names = []
    for dim in self.iter_dimensions():
        if dim.model.dimension_type == dimension_type:
            names.append(dim.model.name)
    return names
+
1049
def get_dimension_names_mapped_to_type(self) -> dict[str, DimensionType]:
    """Map each project dimension's name to its dimension type."""
    type_by_name = {}
    for dim in self.iter_dimensions():
        type_by_name[dim.model.name] = dim.model.dimension_type
    return type_by_name
+
1053
def get_dimension_type_to_base_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Map every DimensionType to the names of its base dimensions.

    Every member of DimensionType gets an entry.
    """
    return {
        dimension_type: [
            dim.model.name for dim in self.list_base_dimensions(dimension_type=dimension_type)
        ]
        for dimension_type in DimensionType
    }
+
1062
def get_subset_dimension_to_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Map each DimensionType to the names of its subset dimension selectors.

    The result is a defaultdict(list). A dimension type only gets an explicit
    entry when at least one selector exists for it.
    """
    names_by_type = defaultdict(list)
    for dimension_type in DimensionType:
        if dimension_type not in self._subset_dimensions:
            continue
        groups = self._subset_dimensions[dimension_type]
        for selectors in groups.values():
            for selector in selectors.values():
                names_by_type[dimension_type].append(selector.model.name)
    return names_by_type
+
1072
def get_supplemental_dimension_to_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Map every DimensionType to the names of its supplemental dimensions.

    Names within each list are ordered by sorting the dimensions on "name".
    """
    return {
        dimension_type: [
            dim.model.name
            for dim in self.list_supplemental_dimensions(dimension_type, sort_by="name")
        ]
        for dimension_type in DimensionType
    }
+
1082
def get_dimension_names_model(self) -> ProjectDimensionNamesModel:
    """Build a ProjectDimensionNamesModel from the project's dimension names.

    For each DimensionType, the base, subset and supplemental names are
    collected under the key given by the type's value.
    """
    base_names = self.get_dimension_type_to_base_name_mapping()
    subset_names = self.get_subset_dimension_to_name_mapping()
    supplemental_names = self.get_supplemental_dimension_to_name_mapping()
    fields: dict[str, Any] = {
        dimension_type.value: {
            "base": base_names[dimension_type],
            "subset": subset_names[dimension_type],
            "supplemental": supplemental_names[dimension_type],
        }
        for dimension_type in DimensionType
    }
    return ProjectDimensionNamesModel(**fields)
+
1096
def set_dimensions(
    self,
    base_dimensions: dict[ConfigKey, DimensionBaseConfig],
    subset_dimensions: dict[
        DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfigWithFiles]]
    ],
    supplemental_dimensions: dict[ConfigKey, DimensionBaseConfig],
) -> None:
    """Replace the stored dimensions and rebuild the by-name lookup.

    The three containers are cleared first and then refilled from the
    arguments, so the container objects themselves are kept.

    Raises
    ------
    DSGInvalidDimension
        Raised if two dimensions share the same name.
    """
    containers = (
        self._base_dimensions,
        self._subset_dimensions,
        self._supplemental_dimensions,
    )
    replacements = (base_dimensions, subset_dimensions, supplemental_dimensions)
    for container in containers:
        container.clear()
    for container, replacement in zip(containers, replacements):
        container.update(replacement)

    self._dimensions_by_name.clear()
    for dim in self.iter_dimensions():
        dim_name = dim.model.name
        if dim_name in self._dimensions_by_name:
            msg = f"name={dim.model.name} exists multiple times in project {self.config_id}"
            raise DSGInvalidDimension(msg)
        self._dimensions_by_name[dim_name] = dim
+
1117
def set_dimension_mappings(
    self, base_to_supplemental_mappings: dict[ConfigKey, MappingTableConfig]
):
    """Replace the stored base-to-supplemental mappings with the given ones.

    The existing container is emptied and refilled rather than rebound.
    """
    stored = self._base_to_supplemental_mappings
    stored.clear()
    stored.update(base_to_supplemental_mappings)
    # TODO: Once we start using these we may need to store by (from, to) as key instead.
+
1124
def add_dataset_dimension_mappings(
    self, dataset_config: DatasetConfig, references: list[DimensionMappingReferenceModel]
):
    """Add a dataset's dimension mapping references to the project.

    References are stored in the project's dataset_to_project mappings under
    the dataset's ID. A reference is skipped when its mapping_id is already
    stored for the dataset, including when the same mapping_id appears more
    than once in references.

    Parameters
    ----------
    dataset_config : DatasetConfig
        Config of the dataset that the mappings belong to.
    references : list[DimensionMappingReferenceModel]
        Mapping references to add.
    """
    dataset_id = dataset_config.model.dataset_id
    mappings = self.model.dimension_mappings.dataset_to_project.setdefault(dataset_id, [])
    existing_ids = {x.mapping_id for x in mappings}
    for reference in references:
        if reference.mapping_id in existing_ids:
            continue
        mappings.append(reference)
        # Track the new ID so a duplicate later in this call is not added again.
        existing_ids.add(reference.mapping_id)
        logger.info(
            "Added dimension mapping for dataset=%s: %s",
            dataset_id,
            reference.mapping_id,
        )
+
1149
def add_dataset_base_dimension_names(
    self, dataset_id: str, base_dimension_names: DatasetBaseDimensionNamesModel
):
    """Store the project base dimension query names for a dataset.

    Every model field of base_dimension_names must be set before the value is
    assigned to the dataset.

    Raises
    ------
    DSGInvalidParameter
        Raised if any field of base_dimension_names is None.
    """
    for field_name in type(base_dimension_names).model_fields:
        if getattr(base_dimension_names, field_name) is not None:
            continue
        msg = f"DatasetBaseDimensionNamesModel {field_name} cannot be None"
        raise DSGInvalidParameter(msg)
    self.get_dataset(dataset_id).base_dimension_names = base_dimension_names
+
1160
def get_dataset_base_dimension_names(self, dataset_id: str) -> DatasetBaseDimensionNamesModel:
    """Return the base_dimension_names stored on the dataset with dataset_id."""
    dataset = self.get_dataset(dataset_id)
    return dataset.base_dimension_names
+
1164
@property
def config_id(self) -> str:
    """Return the project ID, which serves as this config's ID."""
    model = self._model
    return model.project_id
+
1168
def get_dataset(self, dataset_id: str) -> InputDatasetModel:
    """Find the project's dataset entry with the given ID.

    Raises
    ------
    DSGInvalidField
        Raised if the project has no dataset with dataset_id.
    """
    found = next(
        (entry for entry in self.model.datasets if entry.dataset_id == dataset_id),
        None,
    )
    if found is None:
        msg = f"project_id={self._model.project_id} does not have dataset_id={dataset_id}"
        raise DSGInvalidField(msg)
    return found
+
1177
def has_dataset(self, dataset_id: str, status: DatasetRegistryStatus | None = None) -> bool:
    """Return True if the dataset_id is present in the configuration.

    The first dataset whose ID matches decides the result.

    Parameters
    ----------
    dataset_id : str
    status : None | DatasetRegistryStatus
        If set, only return True if the status matches. Defaults to None,
        which accepts any status.
    """
    for dataset in self.iter_datasets():
        if dataset.dataset_id == dataset_id:
            return status is None or dataset.status == status

    # TODO: what about benchmark and historical?
    return False
+
1195
def get_load_data_time_columns(self, name: str) -> list[str]:
    """Return the load data time columns of the time dimension stored under name."""
    return self.get_time_dimension(name).get_load_data_time_columns()
+
1201
def iter_datasets(self) -> Generator[InputDatasetModel, None, None]:
    """Yield each dataset entry of the project model in stored order."""
    yield from self.model.datasets
+
1205
def _iter_base_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
    """Yield every stored base dimension config."""
    for base_dim in self._base_dimensions.values():
        yield base_dim
+
1208
def _iter_subset_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
    """Yield every stored subset dimension selector config.

    Walks the three nesting levels of _subset_dimensions in stored order.
    """
    for groups in self._subset_dimensions.values():
        for selectors in groups.values():
            yield from selectors.values()
+
1214
def _iter_supplemental_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
    """Yield every stored supplemental dimension config."""
    for supplemental_dim in self._supplemental_dimensions.values():
        yield supplemental_dim
+
1217
def iter_dimensions(self) -> Iterable[DimensionBaseConfig]:
    """Return an iterator over all dimensions of the project.

    Base dimensions come first, then subset dimensions, then supplemental
    dimensions.
    """
    base_dims = self._iter_base_dimensions()
    subset_dims = self._iter_subset_dimensions()
    supplemental_dims = self._iter_supplemental_dimensions()
    return itertools.chain(base_dims, subset_dims, supplemental_dims)
+
1231
def list_registered_dataset_ids(self) -> list[str]:
    """List the IDs of the project's datasets whose status is REGISTERED."""
    registered = self._iter_datasets_by_status(DatasetRegistryStatus.REGISTERED)
    return [dataset.dataset_id for dataset in registered]
+
1236
def list_unregistered_dataset_ids(self) -> list[str]:
    """List the IDs of the project's datasets whose status is UNREGISTERED."""
    unregistered = self._iter_datasets_by_status(DatasetRegistryStatus.UNREGISTERED)
    return [dataset.dataset_id for dataset in unregistered]
+
1241
def _iter_datasets_by_status(
    self, status: DatasetRegistryStatus
) -> Generator[InputDatasetModel, None, None]:
    """Yield the project's datasets whose status equals status."""
    yield from (entry for entry in self.iter_datasets() if entry.status == status)
+
1248
def get_required_dimension_record_ids(
    self, dataset_id: str, dimension_type: DimensionType
) -> set[str]:
    """Collect the record IDs a dataset must provide for one dimension type.

    Combines the IDs from the dataset's single-dimensional requirement with
    those from each of its multi-dimensional requirements.
    """
    requirements = self.get_dataset(dataset_id).required_dimensions
    attr_name = dimension_type.value
    collected = self._get_required_dimension_record_ids(
        getattr(requirements.single_dimensional, attr_name)
    )
    for multi_req in requirements.multi_dimensional:
        collected.update(self._get_required_dimension_record_ids(getattr(multi_req, attr_name)))
    return collected
+
1261
def _build_multi_dim_requirement_associations(
    self, multi_dim_reqs: list[RequiredDimensionRecordsModel], context: ScratchDirContext
) -> list[DataFrame]:
    """Build one DataFrame per unique combination of dimensions in multi_dim_reqs.

    For each requirement, the record IDs of every dimension that has any are
    resolved, and a DataFrame is created from their product. DataFrames that
    cover the same set of dimensions are unioned together.

    Example: partial sector and subsector combinations are required.
    [
        {{"sector": {"base": ["com"]},
         "subsector": "supplemental":
            {"name": "commercial-subsectors",
             "record_ids": ["commercial_subsectors"]}},
        {"sector": {"base": ["res"]}, "subsector": {"base": ["MidriseApartment"]}},
    ]
    """
    tables_by_combo: dict[tuple[str, ...], DataFrame] = {}

    for requirement in multi_dim_reqs:
        combo_names = []
        ids_by_field = {}
        for field in sorted(RequiredDimensionRecordsModel.model_fields):
            dim_type = DimensionType(field)
            field_ids = self._get_required_dimension_record_ids(getattr(requirement, field))
            if not field_ids:
                # Dimensions without required records are not part of this combination.
                continue
            ids_by_field[field] = list(field_ids)
            combo_names.append(dim_type.value)

        table = create_dataframe_from_product(ids_by_field, context)
        # Sort the columns so tables with the same dimensions can be unioned.
        table = table.select(*sorted(table.columns))

        combo_key = tuple(sorted(combo_names))
        if combo_key not in tables_by_combo:
            tables_by_combo[combo_key] = table
        else:
            tables_by_combo[combo_key] = tables_by_combo[combo_key].union(table)

    return list(tables_by_combo.values())
+
1299
def _get_required_dimension_record_ids(
    self, reqs: RequiredDimensionRecordsByTypeModel
) -> set[str]:
    """Resolve a dimension requirement into the record IDs it demands.

    The result combines the IDs from the base specification with the IDs from
    the subset selectors.
    """
    required = self._get_required_base_dimension_record_ids(reqs)
    subset_ids = self._get_required_record_ids_from_subsets(reqs)
    required.update(subset_ids)
    return required
+
1309
def _get_required_base_dimension_record_ids(
    self, reqs: RequiredDimensionRecordsByTypeModel
) -> set[str]:
    """Return the required record IDs for a base dimension based on the specification in the
    project config.

    Three forms are handled:
    - reqs.base.record_ids == ["__all__"]: every record of the base dimension.
    - reqs.base.record_ids with explicit IDs: exactly those IDs.
    - reqs.base_missing.record_ids: every record of the base dimension except those IDs.
    If neither list has entries, an empty set is returned.

    The returned set is always a new object, so callers may modify it.

    Raises
    ------
    DSGInvalidDataset
        Raised if a listed record ID is not in the base dimension's records.
    """
    record_ids: set[str] = set()
    if not reqs.base.record_ids and not reqs.base_missing.record_ids:
        return record_ids

    base_dim_query_name = reqs.base.dimension_name or reqs.base_missing.dimension_name
    assert base_dim_query_name is not None
    all_base_record_ids = self.get_dimension_record_ids(base_dim_query_name)

    if reqs.base.record_ids == ["__all__"]:
        assert reqs.base.dimension_name is not None
        # Copy: callers update the result in place and must not alter the
        # set handed out by get_dimension_record_ids.
        record_ids = set(all_base_record_ids)
    elif reqs.base.record_ids:
        record_ids = set(reqs.base.record_ids)
        if diff := record_ids - all_base_record_ids:
            msg = (
                "The project config requires these record IDs in the dataset's 'base' "
                "field, but they are not in the base dimension records: "
                f"name={base_dim_query_name}: {diff=}"
            )
            raise DSGInvalidDataset(msg)
    elif reqs.base_missing.record_ids:
        assert reqs.base_missing.dimension_name is not None
        missing_ids = set(reqs.base_missing.record_ids)
        if diff := missing_ids - all_base_record_ids:
            msg = (
                "The project config requires these record IDs in the dataset's "
                "'base_missing' field, but they are not in the base dimension "
                f"name={base_dim_query_name}: {diff=}"
            )
            raise DSGInvalidDataset(msg)
        record_ids = all_base_record_ids - missing_ids

    return record_ids
+
1349
def _get_subset_dimension_records(self, name: str, selector_name: str) -> set[str]:
    """Return the record IDs of one selector of a subset dimension group.

    Parameters
    ----------
    name : str
        Name of the subset dimension group.
    selector_name : str
        Name of the selector dimension inside that group.

    Raises
    ------
    DSGInvalidDimension
        Raised if no selector matches both names.
    """
    for group in self.model.dimensions.subset_dimensions:
        if group.name != name:
            continue
        for ref in group.selector_references:
            selector_key = ConfigKey(ref.dimension_id, ref.version)
            selector = self._subset_dimensions[group.dimension_type][group.name][selector_key]
            if selector.model.name != selector_name:
                continue
            assert isinstance(selector, DimensionBaseConfigWithFiles)
            return selector.get_unique_ids()

    msg = f"subset dimension selector not found: {name=} {selector_name=}"
    raise DSGInvalidDimension(msg)
+
1362
def _get_required_record_ids_from_subsets(
    self, req: RequiredDimensionRecordsByTypeModel
) -> set[str]:
    """Collect the record IDs of every selector listed in req.subset."""
    collected = set()
    for subset in req.subset:
        for selector_name in subset.selectors:
            selector_ids = self._get_subset_dimension_records(subset.name, selector_name)
            collected.update(selector_ids)
    return collected
+
1371
@track_timing(timer_stats_collector)
def make_dimension_association_table(
    self, dataset_id: str, context: ScratchDirContext
) -> DataFrame:
    """Build the table of all dimension record combinations the dataset must provide.

    Multi-dimensional requirements are turned into DataFrames first. Every
    dimension they do not cover is taken from the single-dimensional
    requirements, and all tables are cross-joined.
    """
    requirements = self.get_dataset(dataset_id).required_dimensions
    multi_dfs = self._build_multi_dim_requirement_associations(
        requirements.multi_dimensional, context
    )

    # Project config construction asserts that there is no intersection of dimensions in
    # multi and single.
    covered = set()
    for multi_df in multi_dfs:
        covered.update(set(multi_df.columns))

    single_records: dict[str, list[str]] = {}
    for field in RequiredDimensionRecordsModel.model_fields:
        if field in covered:
            continue
        single_req = getattr(requirements.single_dimensional, field)
        single_records[field] = list(self._get_required_dimension_record_ids(single_req))

    single_df = create_dataframe_from_product(single_records, context)
    return cross_join_dfs(multi_dfs + [single_df])
+
1398
def are_all_datasets_submitted(self) -> bool:
    """Return True if ``list_unregistered_dataset_ids()`` reports nothing outstanding."""
    unregistered_ids = self.list_unregistered_dataset_ids()
    return not unregistered_ids
1401
+
1402
def set_status(self, status: ProjectRegistryStatus) -> None:
    """Store ``status`` on the project model and log the change.

    Parameters
    ----------
    status : ProjectRegistryStatus
        New status value assigned to ``self.model.status``.
    """
    self.model.status = status
    project_id = self.config_id
    logger.info("Set project_id=%s status=%s", project_id, status)
1406
+
1407
def set_dataset_status(self, dataset_id: str, status: DatasetRegistryStatus):
    """Store ``status`` on the dataset returned by ``get_dataset`` and log the change.

    Parameters
    ----------
    dataset_id : str
        ID of the dataset to update.
    status : DatasetRegistryStatus
        New status value assigned to the dataset.

    Raises
    ------
    ValueError
        Raised if dataset_id is not stored. Not raised directly here; presumably
        propagated from ``get_dataset`` -- confirm against that method.
    """
    self.get_dataset(dataset_id).status = status
    project_id = self._model.project_id
    logger.info(
        "Set dataset_id=%s status=%s for project_id=%s", dataset_id, status, project_id
    )
1423
+
1424
@property
def base_dimensions(self) -> dict:
    """Expose the stored base dimensions.

    Returns
    -------
    dict
        dict of DimensionConfig keyed by ConfigKey. This is the internal
        ``_base_dimensions`` object itself, not a copy.

    """
    dimensions = self._base_dimensions
    return dimensions
1435
+
1436
@property
def supplemental_dimensions(self) -> dict:
    """Expose the stored supplemental dimensions.

    Returns
    -------
    dict
        dict of DimensionConfig keyed by ConfigKey. This is the internal
        ``_supplemental_dimensions`` object itself, not a copy.

    """
    dimensions = self._supplemental_dimensions
    return dimensions
1447
+
1448
+
1449
def load_subset_dimensions(filename: Path) -> tuple[set[str], dict[str, list[str]]]:
    """Load a subset dimension records file.

    The CSV file must contain an ``id`` column plus at least one other column. Every other
    column is treated as one subset dimension name; a record belongs to it when the
    record's cell in that column is non-null.

    Parameters
    ----------
    filename : Path
        Path to the CSV file.

    Returns
    -------
    tuple[set[str], dict[str, list[str]]]
        All record IDs found in the file, and a mapping of each non-``id`` column name to
        the list of record IDs that have a value in that column.

    Raises
    ------
    DSGInvalidDimension
        Raised if the file has no columns other than ``id``.
    """
    # Force the IDs to be read as strings. Without this pandas infers a numeric dtype for
    # numeric-looking IDs, which drops leading zeros (e.g. "01" -> 1) and returns
    # non-string IDs that cannot match string record IDs.
    df = pd.read_csv(filename, index_col="id", dtype={"id": str})
    if df.columns.empty:
        msg = "A subset dimension records file must have at least one dimension column."
        raise DSGInvalidDimension(msg)
    record_ids = set(df.index)
    # A null cell means the record is not a member of that column's subset.
    subset_by_dim_name = {name: df[name].dropna().index.to_list() for name in df.columns}
    return record_ids, subset_by_dim_name