dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,1462 @@
1
+ import itertools
2
+ import logging
3
+ from collections import defaultdict
4
+ from pathlib import Path
5
+ from typing import Annotated, Any, Generator, Iterable, Type
6
+
7
+ import pandas as pd
8
+ from pydantic import field_validator, model_validator, Field
9
+
10
+ from dsgrid.config.common import make_base_dimension_template
11
+ from dsgrid.config.dataset_config import DatasetConfig
12
+ from dsgrid.config.dimension_config import (
13
+ DimensionBaseConfig,
14
+ DimensionBaseConfigWithFiles,
15
+ )
16
+ from dsgrid.config.mapping_tables import MappingTableConfig
17
+ from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
18
+ from dsgrid.data_models import DSGBaseModel, DSGBaseDatabaseModel, make_model_config
19
+ from dsgrid.dimension.base_models import (
20
+ check_required_dimensions,
21
+ check_timezone_in_geography,
22
+ DimensionCategory,
23
+ DimensionType,
24
+ )
25
+ from dsgrid.dimension.time import TimeDimensionType
26
+ from dsgrid.exceptions import (
27
+ DSGInvalidDataset,
28
+ DSGInvalidField,
29
+ DSGInvalidDimension,
30
+ DSGInvalidOperation,
31
+ DSGInvalidParameter,
32
+ DSGValueNotRegistered,
33
+ )
34
+ from dsgrid.registry.common import (
35
+ ConfigKey,
36
+ ProjectRegistryStatus,
37
+ DatasetRegistryStatus,
38
+ check_config_id_strict,
39
+ )
40
+ from dsgrid.spark.types import (
41
+ DataFrame,
42
+ )
43
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
44
+ from dsgrid.utils.spark import (
45
+ cross_join_dfs,
46
+ create_dataframe_from_product,
47
+ )
48
+ from dsgrid.utils.timing import timer_stats_collector, track_timing
49
+ from dsgrid.utils.utilities import check_uniqueness
50
+ from dsgrid.config.config_base import ConfigBase
51
+ from dsgrid.config.dataset_config import InputDatasetType
52
+ from dsgrid.config.supplemental_dimension import SupplementalDimensionModel
53
+ from dsgrid.config.dimension_mapping_base import DimensionMappingReferenceModel
54
+ from dsgrid.config.dimensions import (
55
+ DimensionsListModel,
56
+ DimensionReferenceModel,
57
+ DimensionModel,
58
+ )
59
+ from dsgrid.dimension.time import (
60
+ TimeBasedDataAdjustmentModel,
61
+ DaylightSavingSpringForwardType,
62
+ DaylightSavingFallBackType,
63
+ )
64
+
65
+
66
+ logger = logging.getLogger(__name__)
67
+
68
+
69
class SubsetDimensionSelectorModel(DSGBaseModel):
    """Defines a subset dimension selector inclusive of the subset's records and information
    required to define the selector as a record within the supplemental dimension defined by the
    subset dimension group.
    """

    # Identifying name and free-form description of the selector.
    name: str
    description: str

    # Optional extra column values; defaults to an empty mapping.
    column_values: dict[str, str] = Field(
        default={},
        title="column_values",
        description=(
            "Optional columns to populate in the subset dimension group's supplemental "
            "dimension records table. For example, if each selector in the group defines the end "
            "uses for one sector (e.g., commercial_end_uses, transportation_end_uses), the "
            "supplemental dimension records table needs to define the 'fuel_id' and 'unit' "
            "fields of the EnergyEndUse data model."
        ),
    )

    # Flagged as dsgrid-internal in the JSON schema; users are not expected to fill this in.
    records: list[str] = Field(
        default=[],
        title="records",
        description=(
            "Table of values populated by reading the parent subset dimension records "
            "file. Should not be populated by the user."
        ),
        json_schema_extra={"dsgrid_internal": True},
    )
95
+
96
+
97
class SubsetDimensionGroupModel(DSGBaseModel):
    """Defines one or more subset dimension selectors for a dimension type."""

    name: str
    description: str
    dimension_type: DimensionType = Field(
        title="dimension_type",
        alias="type",
        description="Type of the dimension",
        json_schema_extra={
            "options": DimensionType.format_for_docs(),
        },
    )
    filename: str | None = Field(
        default=None,
        title="filename",
        alias="file",
        description="Filename containing dimension records. Only populated for initial "
        "registration. Each selector's records are stored as JSON objects in the dsgrid registry.",
    )
    selectors: list[SubsetDimensionSelectorModel] = Field(
        title="selectors",
        description="Dimension selectors",
    )
    selector_references: list[DimensionReferenceModel] = Field(
        # The title previously duplicated the "selectors" field's title (copy/paste slip).
        title="selector_references",
        description="References to the subset dimensions generated by dsgrid during registration.",
        default=[],
    )
    create_supplemental_dimension: bool = Field(
        title="create_supplemental_dimension",
        description="Auto-generate supplemental dimensions in order to allow aggregations on "
        "the subsets.",
        default=True,
    )
    base_dimension_name: str | None = Field(
        default=None,
        title="base_dimension_name",
        description="Name of base dimension for the supplemental dimension mapping, if "
        "create_supplemental_dimension is true. Required if there are multiple base dimensions "
        "for this type.",
    )
    # Record IDs read from the records file; filled in by load_records.
    record_ids: set[str] = set()

    @field_validator("selectors")
    @classmethod
    def check_selectors(cls, selectors):
        """Check that the selectors are defined consistently.

        Every selector must define the same set of ``column_values`` keys as the first one.

        Raises
        ------
        ValueError
            Raised if any selector's column names differ from the first selector's.
        """
        if len(selectors) > 1:
            first = sorted(selectors[0].column_values)
            for selector in selectors[1:]:
                columns = sorted(selector.column_values)
                if columns != first:
                    msg = f"All selectors must define the same columns: {first=} {columns=}"
                    raise ValueError(msg)

        return selectors

    @model_validator(mode="after")
    def load_records(self) -> "SubsetDimensionGroupModel":
        """Load the records for each subset dimension selector.

        Does nothing when ``filename`` is None. Otherwise reads the records file, stores the
        record IDs on the group, assigns each selector its records, and clears ``filename``.

        Raises
        ------
        ValueError
            Raised if the selector names do not match the names found in the records file.
        """
        if self.filename is None:
            return self

        record_ids, mappings = load_subset_dimensions(Path(self.filename))
        self.record_ids.update(record_ids)
        selector_names = check_uniqueness(
            [x.name for x in self.selectors], "subset dimension selector"
        )

        # Names must match in both directions: no selector without records and no records
        # without a selector.
        diff = selector_names.symmetric_difference(mappings)
        if diff:
            msg = f"subset dimension {self.name} selectors have a mismatch with the records file column names: {diff}"
            raise ValueError(msg)

        for dim in self.selectors:
            dim.records = mappings[dim.name]

        # The records now live on the selectors, so drop the file reference.
        self.filename = None
        return self
177
+
178
+
179
class SubsetDimensionGroupListModel(DSGBaseModel):
    """Defines a list of subset dimensions."""

    # The Annotated constraint (min_length=1) rejects an empty list of groups.
    subset_dimensions: Annotated[
        list[SubsetDimensionGroupModel],
        Field(min_length=1),
    ] = Field(description="List of subset dimensions to be registered")
185
+
186
+
187
class DimensionsModel(DSGBaseModel):
    """Contains dimensions defined by a project"""

    base_dimensions: DimensionsListModel = Field(
        title="base_dimensions",
        description="List of dimensions for a project's base dimensions. They will be "
        "automatically registered during project registration and then converted to "
        "base_dimension_references.",
        default=[],
    )
    base_dimension_references: list[DimensionReferenceModel] = Field(
        # The title previously duplicated the "base_dimensions" field's title.
        title="base_dimension_references",
        description="List of registry references (``DimensionReferenceModel``) for a project's "
        "base dimensions.",
        default=[],
    )
    subset_dimensions: list[SubsetDimensionGroupModel] = Field(
        title="subset_dimensions",
        description="List of subset dimension groups. "
        "Subset dimension groups are used to specify subsets of base dimension records that a "
        "dataset must support, dimensionality of derived datasets, and query filters. "
        "Subset dimension groups also define a new supplemental dimension whose records "
        "correspond to the table columns/subset selectors, such that defining a subset "
        "dimension group can be a convenient way to define reporting at a different level of "
        "aggregation as compared to the project's base dimensions.",
        default=[],
    )
    supplemental_dimensions: list[SupplementalDimensionModel] = Field(
        title="supplemental_dimensions",
        description="List of supplemental dimensions. They will be automatically registered "
        "during project registration and then converted to supplemental_dimension_references. "
        "Supplemental dimensions are used to support additional querying and transformations "
        "(e.g., aggregations, disaggregations, filtering, scaling, etc.) of the project's "
        "base data.",
        default=[],
    )
    supplemental_dimension_references: list[DimensionReferenceModel] = Field(
        title="supplemental_dimension_references",
        description="List of registry references for a project's supplemental dimensions. "
        "Dimension references of the same :class:`dsgrid.dimension.base_models.DimensionType` "
        "are allowed for supplemental dimension references (i.e., multiple `Geography` types "
        "are allowed).",
        default=[],
    )

    @model_validator(mode="after")
    def check_dimensions(self) -> "DimensionsModel":
        """Validate that the dimensions are complete and consistent.

        Passes the base dimensions and base dimension references together to
        ``check_required_dimensions``.
        """
        dimensions = itertools.chain(self.base_dimensions, self.base_dimension_references)
        check_required_dimensions(dimensions, "project base dimensions")
        return self

    @model_validator(mode="before")
    @classmethod
    def pre_check_values(cls, values: dict) -> dict:
        """Checks that base dimensions are defined.

        Raises
        ------
        ValueError
            Raised if neither base_dimensions nor base_dimension_references is populated.
        """
        # A "before" validator receives the raw input, which is not guaranteed to be a dict
        # (for example, an already-constructed model instance). Only inspect dict input.
        if not isinstance(values, dict):
            return values

        if not values.get("base_dimensions", []) and not values.get(
            "base_dimension_references", []
        ):
            msg = "Either base_dimensions or base_dimension_references must be defined"
            raise ValueError(msg)

        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_files(cls, values: list) -> list:
        """Validate that record filenames are unique across the base dimensions.

        Only ``DimensionModel`` entries that define a filename are considered.
        """
        check_uniqueness(
            (
                x.filename
                for x in values
                if isinstance(x, DimensionModel) and x.filename is not None
            ),
            "dimension record filename",
        )
        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_names(cls, values: list) -> list:
        """Validate that names are unique across the base dimensions."""
        check_uniqueness(
            [dim.name for dim in values],
            "dimension record name",
        )
        return values

    @field_validator("base_dimensions")
    @classmethod
    def check_time_zone(cls, values: list) -> list:
        """Validate the time zone column in geography records."""
        for dimension in values:
            if dimension.dimension_type == DimensionType.GEOGRAPHY:
                check_timezone_in_geography(
                    dimension,
                    err_msg="Project geography dimension records must include a time_zone column",
                )
        return values

    @field_validator("subset_dimensions")
    @classmethod
    def check_subset_dimensions(cls, subset_dimensions):
        """Check that each subset dimension has a unique name."""
        check_uniqueness([x.name for x in subset_dimensions], "subset dimensions name")
        return subset_dimensions

    @model_validator(mode="after")
    def check_dimension_names(self) -> "DimensionsModel":
        """Check that all dimension query names are unique.

        Names are collected from the base dimensions, the supplemental dimensions, the subset
        dimension groups, and every selector inside those groups.

        Raises
        ------
        ValueError
            Raised on the first name that appears more than once.
        """
        names: set[str] = set()

        def add_name(name):
            if name in names:
                msg = f"dimension_name={name} is not unique in the project"
                raise ValueError(msg)
            names.add(name)

        for dim in self.base_dimensions:
            add_name(dim.name)
        for dim in self.supplemental_dimensions:
            add_name(dim.name)
        for group in self.subset_dimensions:
            add_name(group.name)
            for selector in group.selectors:
                add_name(selector.name)

        return self
315
+
316
+
317
class RequiredSubsetDimensionRecordsModel(DSGBaseModel):
    # Pairs a subset dimension name with the selectors required from it.
    name: str = Field(
        description="Name of a subset dimension",
    )
    selectors: list[str] = Field(
        description="One or more selectors in the subset dimension",
    )
320
+
321
+
322
class RequiredSupplementalDimensionRecordsModel(DSGBaseModel):
    # Pairs a supplemental dimension name with the record IDs required from it.
    name: str = Field(
        description="Name of a supplemental dimension",
    )
    record_ids: list[str] = Field(description="One or more record IDs in the supplemental dimension")
327
+
328
+
329
class RequiredBaseDimensionModel(DSGBaseModel):
    # Record IDs for one base dimension; empty by default.
    record_ids: list[str] = []
    # Optional; only needed to disambiguate between base dimensions of the same type.
    dimension_name: str | None = Field(
        description=(
            "Identifies which base dimension contains the record IDs. Required if there "
            "is more than one base dimension for a given dimension type."
        ),
        default=None,
    )
336
+
337
+
338
class RequiredDimensionRecordsByTypeModel(DSGBaseModel):
    # Records that must be present (base), records that are absent (base_missing), and
    # requirements expressed through subset dimension selectors (subset).
    base: RequiredBaseDimensionModel = RequiredBaseDimensionModel()
    base_missing: RequiredBaseDimensionModel = RequiredBaseDimensionModel()
    subset: list[RequiredSubsetDimensionRecordsModel] = []

    @model_validator(mode="before")
    @classmethod
    def handle_legacy_format(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Convert legacy input into the current format.

        1. base and base_missing used to be list[str] because we used to allow a single base
           dimension. A list is wrapped as ``{"record_ids": [...]}``.
        2. We used to allow supplemental dimension requirements. That key is dropped.

        This allows backwards compatibility with old files and databases.
        This can be removed once we've updated existing dsgrid project repositories.

        The input is never modified; a fixed-up shallow copy is returned instead.
        """
        # A "before" validator receives the raw input, which is not guaranteed to be a dict.
        if not isinstance(values, dict):
            return values

        # Work on a copy so the caller's dict is not altered as a side effect of validation.
        updated = dict(values)
        for field in ("base", "base_missing"):
            if isinstance(updated.get(field), list):
                logger.warning("Fixing up %s to conform to new format", field)
                updated[field] = {"record_ids": updated[field]}

        if "supplemental" in updated:
            logger.warning(
                "Removing deprecated supplemental dimension requirements from the project config."
            )
            del updated["supplemental"]

        return updated

    @model_validator(mode="after")
    def check_base(self) -> "RequiredDimensionRecordsByTypeModel":
        """Reject models that list record IDs in both base and base_missing.

        Raises
        ------
        ValueError
            Raised if both base.record_ids and base_missing.record_ids are non-empty.
        """
        if self.base.record_ids and self.base_missing.record_ids:
            msg = f"base and base_missing cannot both contain record_ids: {self.base=} {self.base_missing=}"
            raise ValueError(msg)
        return self

    def defines_dimension_requirement(self) -> bool:
        """Returns True if the model defines a dimension requirement."""
        return (
            bool(self.base.record_ids) or bool(self.base_missing.record_ids) or bool(self.subset)
        )
376
+
377
+
378
class RequiredDimensionRecordsModel(DSGBaseModel):
    # Clearing protected_namespaces is needed because Pydantic doesn't like fields that
    # start with 'model_' (see model_year below).
    model_config = make_model_config(protected_namespaces=())

    # One requirement container per dimension type; time is excluded.
    geography: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    metric: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    model_year: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    scenario: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    sector: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    subsector: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
    weather_year: RequiredDimensionRecordsByTypeModel = RequiredDimensionRecordsByTypeModel()
390
+
391
+
392
class RequiredDimensionsModel(DSGBaseModel):
    """Defines required record IDs that must exist for each dimension in a dataset.
    Record IDs can reside in the project's base or subset dimensions.

    Requirements can be specified for a single dimension or a combination of dimensions.
    For example, if a project includes commercial, residential, and transportation sectors but the
    dataset has only transportation sector records, it should specify a single_dimensional
    requirement that is a subset of the project's base dimension.
    `{"single_dimensional": {"sector": {"base": {"record_ids": ["transportation"]}}}}`.

    If a dataset's requirements span multiple dimensions, such as if it does not have some
    metric records for some geography records, then a multi_dimensional requirement should be
    specified. (By default, a full cross join is assumed to be present.)
    `{"multi_dimensional": [
        {
          "geography": {"base": {"record_ids": ["12345"]}},
          "metric": {"base": {"record_ids": ["electricity_cooling"]}}
        }
      ]
    }`

    If a dataset specifies a dimension type within a multi_dimensional section and wants to use
    all records from a project base dimension, it can specify `base.record_ids = ["__all__"]`
    as a shorthand notation.

    Requirements for a dimension cannot be defined in both single_dimensional and multi_dimensional
    sections.

    If no records are listed for a dimension then all project base records are required.

    It might be easier for a dataset to specify what it does not have rather than what it does have.
    In that case, it is recommended to use the RequiredDimensionRecordsByTypeModel.base_missing
    field. dsgrid will compute the difference of the base dimension records and the base_missing
    records to determine the dataset's required records.

    If a project has multiple base dimensions of the same type, the
    RequiredBaseDimensionModel.dimension_name must be specified to identify the base
    dimension that contains the record IDs.

    If a dataset contains a subset of project base dimension records that are defined in the
    project's subset dimensions, it is recommended to use that specification. dsgrid will
    substitute base records for mapped subset records at runtime.
    """

    single_dimensional: RequiredDimensionRecordsModel = Field(
        description="Required records for a single dimension.",
        default=RequiredDimensionRecordsModel(),
    )
    multi_dimensional: list[RequiredDimensionRecordsModel] = Field(
        description="Required records for a combination of dimensions. For example, there may be "
        "a dataset requirement for only one subsector for a given sector instead of a cross "
        "product.",
        default=[],
    )

    @model_validator(mode="after")
    def check_for_duplicates(self) -> "RequiredDimensionsModel":
        """Validate the requirement sections and fill in defaults.

        1. Ensure that the same dimension does not have requirements in both single and multi
           dimensional sections.
        2. Ensure that every multi_dimensional item covers at least two dimensions and that
           any two items sharing a dimension cover exactly the same set of dimensions.
        3. Set any dimensions (other than time) that do not have specifications to require
           all base dimension records by assigning ``["__all__"]`` to
           single_dimensional.<dimension>.base.record_ids.

        Raises
        ------
        ValueError
            Raised if any of the checks in items 1 and 2 fails.
        """
        single_dimensional: set[str] = set()
        multi_dimensional: set[str] = set()

        for field in RequiredDimensionRecordsModel.model_fields:
            req = getattr(self.single_dimensional, field)
            if req.defines_dimension_requirement():
                single_dimensional.add(field)

        # Distinct (sorted) dimension combinations seen so far in multi_dimensional items.
        dim_combos: set[tuple[str, ...]] = set()
        for item in self.multi_dimensional:
            dims = []
            for field in RequiredDimensionRecordsModel.model_fields:
                req = getattr(item, field)
                if req.defines_dimension_requirement():
                    if field in single_dimensional:
                        msg = (
                            "dimensions cannot be defined in both single_dimensional and "
                            f"multi_dimensional sections: {field}"
                        )
                        raise ValueError(msg)
                    dims.append(field)
                    multi_dimensional.add(field)

            if len(dims) < 2:
                msg = (
                    "A multi_dimensional dimension requirement must contain at least two "
                    f"dimensions: {item}"
                )
                raise ValueError(msg)

            dim_combo = tuple(sorted(dims))
            if dim_combo not in dim_combos:
                # A new combination must not partially overlap one that was already seen.
                for other in dim_combos:
                    if set(dim_combo).intersection(other):
                        msg = (
                            "All descriptors in the multi-dimensional requirements with an "
                            "intersection of dimensions must have a full intersection. "
                            f"dimension_set1 = {other} dimension_set2 = {dim_combo}"
                        )
                        raise ValueError(msg)
                dim_combos.add(dim_combo)

        not_covered = {x.value for x in DimensionType} - multi_dimensional - single_dimensional
        for field in not_covered:
            if field != DimensionType.TIME.value:
                getattr(self.single_dimensional, field).base.record_ids = ["__all__"]
        return self
502
+
503
+
504
class DatasetBaseDimensionNamesModel(DSGBaseModel):
    """Defines the query names for project base dimensions to which datasets will be mapped.
    This is important for cases where a project has multiple base dimensions of the same type.
    """

    # Clearing protected_namespaces is needed because Pydantic doesn't like fields that
    # start with 'model_' (see model_year below).
    model_config = make_model_config(protected_namespaces=())

    # One optional name per dimension type; each defaults to None.
    geography: str | None = None
    metric: str | None = None
    model_year: str | None = None
    scenario: str | None = None
    sector: str | None = None
    subsector: str | None = None
    time: str | None = None
    weather_year: str | None = None
520
+
521
+
522
class InputDatasetModel(DSGBaseModel):
    """Defines an input dataset for the project config."""

    dataset_id: str = Field(
        title="dataset_id",
        description="Unique dataset identifier.",
        json_schema_extra={"updateable": False},
    )
    dataset_type: InputDatasetType = Field(
        title="dataset_type",
        description="Dataset type.",
        json_schema_extra={
            "options": InputDatasetType.format_for_docs(),
            "updateable": False,
        },
    )
    version: str | None = Field(
        default=None,
        title="version",
        description=(
            "Version of the registered dataset. "
            "The version specification is optional. If no version is supplied, then the latest "
            "version in the registry is assumed. "
            "The version string must be in semver format (e.g., '1.0.0') and it must be a "
            "valid/existing version in the registry."
        ),
    )
    required_dimensions: RequiredDimensionsModel = Field(
        default=RequiredDimensionsModel(),
        title="required_dimensions",
        description="Defines required record IDs that must exist for each dimension.",
    )
    mapping_references: list[DimensionMappingReferenceModel] = Field(
        default=[],
        title="mapping_references",
        description=(
            "Defines how to map the dataset dimensions to the project. "
            "Auto-populated during submission."
        ),
    )
    base_dimension_names: DatasetBaseDimensionNamesModel = Field(
        default=DatasetBaseDimensionNamesModel(),
        title="base_dimension_names",
        description=(
            "Defines the project base dimensions to which the dataset will map itself. "
            "Auto-populated during submission."
        ),
    )
    status: DatasetRegistryStatus = Field(
        default=DatasetRegistryStatus.UNREGISTERED,
        title="status",
        description="Registration status of the dataset, added by dsgrid.",
        json_schema_extra={
            "dsgrid_internal": True,
            "notes": ("status is "),
            "updateable": False,
        },
    )
    wrap_time_allowed: bool = Field(
        default=False,
        title="wrap_time_allowed",
        description="Whether to allow dataset time to be wrapped to project time if different",
    )
    time_based_data_adjustment: TimeBasedDataAdjustmentModel = Field(
        default=TimeBasedDataAdjustmentModel(),
        title="time_based_data_adjustment",
        description=(
            "Defines how the rest of the dataframe is adjusted with respect to time. "
            "E.g., when drop associated data when dropping a leap day timestamp."
        ),
    )

    @field_validator("time_based_data_adjustment")
    @classmethod
    def check_data_adjustment(cls, time_based_data_adjustment):
        """Check daylight saving adjustment

        The spring-forward and fall-back settings must either both be NONE or both be set.
        A ValueError is raised when only one of the two is NONE.
        """
        dst = time_based_data_adjustment.daylight_saving_adjustment
        fall_back_unset = dst.fall_back_hour == DaylightSavingFallBackType.NONE
        spring_forward_unset = dst.spring_forward_hour == DaylightSavingSpringForwardType.NONE
        if fall_back_unset == spring_forward_unset:
            return time_based_data_adjustment

        msg = f"mismatch between spring_forward_hour and fall_back_hour, {time_based_data_adjustment=}."
        raise ValueError(msg)

    # TODO: write validation that if daylight_saving_adjustment is specified, dataset time config must be IndexTimeDimensionConfig
602
+
603
+
604
class DimensionMappingsModel(DSGBaseModel):
    """Defines all dimension mappings associated with a dsgrid project,
    including base-to-supplemental mappings and dataset-to-project mappings.
    """

    # References to the project's base-to-supplemental mapping tables.
    base_to_supplemental_references: list[DimensionMappingReferenceModel] = Field(
        title="base_to_supplemental_references",
        description="Base dimension to supplemental dimension mappings (e.g., county-to-state)"
        " used to support various queries and dimension transformations.",
        default=[],
    )
    # Mapping references per dataset, keyed by dataset ID.
    dataset_to_project: dict[str, list[DimensionMappingReferenceModel]] = Field(
        title="dataset_to_project",
        # The fragments are joined by implicit concatenation; each one carries its own
        # trailing space so that no doubled spaces appear in the rendered text.
        description="Dataset-to-project mappings map dataset dimensions to project dimensions. "
        "Once a dataset is submitted to a project, dsgrid adds the dataset-to-project mappings "
        "to the project config. "
        "Some projects may not have any dataset-to-project mappings. Dataset-to-project "
        "mappings are only supplied if a dataset's dimensions do not match the project's "
        "dimensions.",
        default={},
        # TODO: need to document missing dimension records, fill values, etc. DSGRID-191.
    )
626
+
627
+
628
class ProjectConfigModel(DSGBaseDatabaseModel):
    """Represents project configurations"""

    # NOTE: the example must not contain "-"; check_project_id_handle rejects it.
    project_id: str = Field(
        title="project_id",
        description="A unique project identifier that is project-specific (e.g., "
        "'standard_scenarios_2021').",
    )
    name: str = Field(
        title="name",
        description="A project name to accompany the ID.",
    )
    description: str = Field(
        title="description",
        description="Detailed project description.",
    )
    status: ProjectRegistryStatus = Field(
        title="status",
        description="project registry status",
        default=ProjectRegistryStatus.INITIAL_REGISTRATION,
        json_schema_extra={
            "dsgrid_internal": True,
            "updateable": False,
        },
    )
    datasets: list[InputDatasetModel] = Field(
        title="datasets",
        description="List of input datasets for the project.",
    )
    dimensions: DimensionsModel = Field(
        title="dimensions",
        description="List of `base` and `supplemental` dimensions.",
    )
    dimension_mappings: DimensionMappingsModel = Field(
        title="dimension_mappings",
        description="List of project mappings. Initialized with base-to-base and"
        " base-to-supplemental mappings. dataset-to-project mappings are added by dsgrid as"
        " datasets get registered with the project.",
        default=DimensionMappingsModel(),
    )

    @field_validator("project_id")
    @classmethod
    def check_project_id_handle(cls, project_id):
        """Check for valid characters in project id.

        Raises
        ------
        ValueError
            Raised if the ID contains a "-". Further checks are delegated to
            check_config_id_strict.
        """
        if "-" in project_id:
            msg = 'invalid character "-" in project id'
            raise ValueError(msg)

        check_config_id_strict(project_id, "Project")
        return project_id
679
+
680
+
681
def make_unvalidated_project_config(
    project_id: str,
    dataset_ids: Iterable[str],
    metric_types: Iterable[str],
    name: str | None = None,
    description: str | None = None,
    time_type: TimeDimensionType = TimeDimensionType.DATETIME,
) -> dict[str, Any]:
    """Build a project config skeleton as a plain dictionary.

    Nothing is validated here. A missing name or description becomes an empty
    string, and every dataset entry gets placeholder values.
    """
    dimensions = {
        "base_dimensions": make_base_dimension_template(metric_types, time_type=time_type),
        "subset_dimensions": [],
        "supplemental_dimensions": [],
    }

    datasets = []
    for dataset_id in dataset_ids:
        datasets.append(
            {
                "dataset_id": dataset_id,
                "dataset_type": "",
                "version": "",
                "required_dimensions": {},
            }
        )

    config: dict[str, Any] = {
        "project_id": project_id,
        "name": name or "",
        "description": description or "",
        "dimensions": dimensions,
        "datasets": datasets,
    }
    return config
709
+
710
+
711
class DimensionsByCategoryModel(DSGBaseModel):
    """Holds dimension query names grouped by category: base, subset, and supplemental."""

    base: list[str]
    subset: list[str]
    supplemental: list[str]
717
+
718
+
719
class ProjectDimensionNamesModel(DSGBaseModel):
    """Holds the project's dimension query names, one entry per dimension type.

    Each entry splits the names into base, subset, and supplemental categories.
    """

    # Pydantic reserves the 'model_' prefix by default; clear the protected
    # namespaces so that the 'model_year' field is accepted.
    model_config = make_model_config(protected_namespaces=())

    geography: DimensionsByCategoryModel
    metric: DimensionsByCategoryModel
    model_year: DimensionsByCategoryModel
    scenario: DimensionsByCategoryModel
    sector: DimensionsByCategoryModel
    subsector: DimensionsByCategoryModel
    time: DimensionsByCategoryModel
    weather_year: DimensionsByCategoryModel
733
+
734
+
735
+ class ProjectConfig(ConfigBase):
736
+ """Provides an interface to a ProjectConfigModel."""
737
+
738
def __init__(self, model: ProjectConfigModel):
    """Wrap the model and start with empty dimension and mapping containers.

    The containers are populated later through set_dimensions and
    set_dimension_mappings.
    """
    super().__init__(model)
    # Base and supplemental dimensions are keyed by ConfigKey.
    self._base_dimensions: dict[ConfigKey, DimensionBaseConfig] = {}
    self._supplemental_dimensions: dict[ConfigKey, DimensionBaseConfig] = {}
    # Subset dimensions are nested: dimension type -> group name -> ConfigKey.
    self._subset_dimensions: dict[
        DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfigWithFiles]]
    ] = {}
    self._base_to_supplemental_mappings: dict[ConfigKey, MappingTableConfig] = {}
    # Index of every dimension by its name.
    self._dimensions_by_name: dict[str, DimensionBaseConfig] = {}
747
+
748
@staticmethod
def model_class() -> Type:
    """Return the model class wrapped by this config (ProjectConfigModel)."""
    return ProjectConfigModel
751
+
752
+ @staticmethod
753
+ def config_filename() -> str:
754
+ return "project.json5"
755
+
756
def get_base_dimension(
    self, dimension_type: DimensionType, dimension_name: str | None = None
) -> DimensionBaseConfig:
    """Return the base dimension matching dimension_type.

    When dimension_name is omitted, the project must have exactly one base
    dimension of that type. Otherwise, the first base dimension matching both
    the type and the name is returned.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no base dimension matches the type and name.

    See also
    --------
    list_base_dimensions
    """
    if dimension_name is None:
        return self._get_single_base_dimension(dimension_type)

    matches = (
        x
        for x in self._iter_base_dimensions()
        if x.model.dimension_type == dimension_type and x.model.name == dimension_name
    )
    found = next(matches, None)
    if found is not None:
        return found

    msg = f"Did not find a dimension of {dimension_type=} with {dimension_name=}"
    raise DSGValueNotRegistered(msg)
774
+
775
def get_base_time_dimension(self) -> TimeDimensionBaseConfig:
    """Return the project's single base time dimension."""
    time_dim = self._get_single_base_dimension(DimensionType.TIME)
    # Narrow the type for static checkers; a time dimension is expected here.
    assert isinstance(time_dim, TimeDimensionBaseConfig)
    return time_dim
780
+
781
def _get_single_base_dimension(self, dimension_type: DimensionType) -> DimensionBaseConfig:
    """Return the only base dimension of the given type.

    Raises
    ------
    DSGValueNotRegistered
        Raised if there is no base dimension of that type.
    DSGInvalidDimension
        Raised if there is more than one base dimension of that type.
    """
    matches = []
    for candidate in self._iter_base_dimensions():
        if candidate.model.dimension_type == dimension_type:
            matches.append(candidate)

    if len(matches) == 1:
        return matches[0]

    if not matches:
        msg = f"base dimension {dimension_type=} not found"
        raise DSGValueNotRegistered(msg)

    qnames = " ".join([x.model.name for x in matches])
    msg = (
        f"Found multiple base dimensions for {dimension_type=}: {qnames}. "
        "Call get_base_dimension() with a specific name."
    )
    raise DSGInvalidDimension(msg)
798
+
799
def get_base_dimension_and_version(
    self, dimension_type: DimensionType, dimension_name: str | None = None
) -> tuple[DimensionBaseConfig, str]:
    """Return the base dimension matching dimension_type along with its version.

    The version is taken from the key under which the dimension is stored.

    Raises
    ------
    DSGInvalidOperation
        Raised if more than one base dimension matches.
    DSGValueNotRegistered
        Raised if no base dimension matches.
    """
    candidates = [
        (dim, key.version)
        for key, dim in self.base_dimensions.items()
        if dim.model.dimension_type == dimension_type
        and (dimension_name is None or dim.model.name == dimension_name)
    ]

    if len(candidates) > 1:
        msg = (
            f"Found multiple base dimensions for {dimension_type=}. "
            "You must specify a dimension query name to remove ambiguity."
        )
        raise DSGInvalidOperation(msg)

    if not candidates:
        msg = f"Did not find a dimension with {dimension_type=} {dimension_name=}"
        raise DSGValueNotRegistered(msg)

    return candidates[0]
819
+
820
def get_dimension(self, name: str) -> DimensionBaseConfig:
    """Look up a dimension of any category by its name.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no dimension is stored under that name.
    """
    found = self._dimensions_by_name.get(name)
    if found is not None:
        return found

    msg = f"dimension_name={name} is not stored"
    raise DSGValueNotRegistered(msg)
827
+
828
def get_time_dimension(self, name: str) -> TimeDimensionBaseConfig:
    """Look up a dimension by name and require that it is a time dimension.

    Raises
    ------
    DSGInvalidParameter
        Raised if the dimension is not a TimeDimensionBaseConfig.
    """
    dim = self.get_dimension(name)
    if isinstance(dim, TimeDimensionBaseConfig):
        return dim

    msg = f"{dim.model.label} is not a time dimension"
    raise DSGInvalidParameter(msg)
835
+
836
def get_dimension_by_name(self, name: str) -> DimensionBaseConfig:
    """Return the first base dimension whose name matches.

    Only base dimensions are searched.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no base dimension has that name.
    """
    found = next((x for x in self._iter_base_dimensions() if x.model.name == name), None)
    if found is not None:
        return found

    msg = f"No base dimension with {name=} is stored."
    raise DSGValueNotRegistered(msg)
844
+
845
def get_dimension_with_records(self, name: str) -> DimensionBaseConfigWithFiles:
    """Look up a dimension by name and require that it has records.

    Raises
    ------
    DSGInvalidDimension
        Raised if no dimension is stored under that name.
    DSGInvalidParameter
        Raised if the dimension is not a DimensionBaseConfigWithFiles.
    """
    found = self._dimensions_by_name.get(name)
    if found is None:
        msg = f"{name=} is not stored"
        raise DSGInvalidDimension(msg)

    if isinstance(found, DimensionBaseConfigWithFiles):
        return found

    msg = f"{found.model.label} does not have records"
    raise DSGInvalidParameter(msg)
855
+
856
def get_dimension_records(self, name: str) -> DataFrame:
    """Return the records of the named dimension as a DataFrame."""
    dim = self.get_dimension_with_records(name)
    return dim.get_records_dataframe()
859
+
860
def get_dimension_record_ids(self, name: str) -> set[str]:
    """Return the unique record IDs of the named dimension."""
    dim = self.get_dimension_with_records(name)
    return dim.get_unique_ids()
863
+
864
def get_dimension_reference(self, dimension_id: str) -> DimensionReferenceModel:
    """Return the reference whose dimension_id matches.

    The base dimension references are searched first, then the supplemental ones.

    Raises
    ------
    DSGInvalidDimension
        Raised if neither list contains the dimension ID.
    """
    dimensions = self.model.dimensions
    reference_lists = (
        dimensions.base_dimension_references,
        dimensions.supplemental_dimension_references,
    )
    for references in reference_lists:
        for ref in references:
            if ref.dimension_id == dimension_id:
                return ref

    msg = f"{dimension_id} is not stored"
    raise DSGInvalidDimension(msg)
875
+
876
def list_base_dimensions(
    self, dimension_type: DimensionType | None = None
) -> list[DimensionBaseConfig]:
    """List the base dimensions.

    Parameters
    ----------
    dimension_type : DimensionType | None
        If set, only dimensions of this type are returned.

    See also
    --------
    get_base_dimension
    """
    dims = list(self._iter_base_dimensions())
    if dimension_type is not None:
        dims = [x for x in dims if x.model.dimension_type == dimension_type]
    return dims
890
+
891
def list_base_dimensions_with_records(
    self, dimension_type: DimensionType
) -> list[DimensionBaseConfigWithFiles]:
    """List the base dimensions of the given type that have records.

    Dimensions that are not instances of DimensionBaseConfigWithFiles are skipped.

    See also
    --------
    get_base_dimension
    """
    with_records = []
    for dim in self._iter_base_dimensions():
        if dim.model.dimension_type != dimension_type:
            continue
        if isinstance(dim, DimensionBaseConfigWithFiles):
            with_records.append(dim)
    return with_records
906
+
907
def list_supplemental_dimensions(
    self, dimension_type: DimensionType, sort_by=None
) -> list[DimensionBaseConfigWithFiles]:
    """List the supplemental dimensions of the given type (possibly none).

    Parameters
    ----------
    dimension_type : DimensionType
    sort_by : str | None
        If set, sort the dimensions by this attribute of each dimension's model.
    """
    dims = []
    for dim in self.supplemental_dimensions.values():
        if dim.model.dimension_type == dimension_type:
            dims.append(dim)

    if sort_by is None:
        return dims
    return sorted(dims, key=lambda x: getattr(x.model, sort_by))
926
+
927
def get_matching_subset_dimension(
    self, dimension_type: DimensionType, unique_data_records: set[str]
) -> DimensionReferenceModel | None:
    """Find a subset dimension selector whose record IDs equal unique_data_records.

    Only subset dimension groups of the given type are considered. Returns the
    selector's dimension reference on the first exact match, otherwise None.
    """
    for group in self.model.dimensions.subset_dimensions:
        if group.dimension_type != dimension_type:
            continue
        for ref in group.selector_references:
            key = ConfigKey(ref.dimension_id, ref.version)
            selector = self._subset_dimensions[dimension_type][group.name][key]
            records = selector.get_unique_ids()
            # An empty symmetric difference means the two sets hold the same IDs.
            if unique_data_records.symmetric_difference(records):
                continue
            logger.info("Found matching subset dimension: %s", group.name)
            return ref
    return None
942
+
943
def get_base_to_supplemental_dimension_mappings_by_types(
    self, dimension_type: DimensionType
) -> list[MappingTableConfig]:
    """List the base-to-supplemental mappings whose from-dimension has the given type."""
    matches = []
    for mapping in self._base_to_supplemental_mappings.values():
        if mapping.model.from_dimension.dimension_type == dimension_type:
            matches.append(mapping)
    return matches
952
+
953
def get_base_to_supplemental_config(
    self, base_dim: DimensionBaseConfigWithFiles, supp_dim: DimensionBaseConfigWithFiles
) -> MappingTableConfig:
    """Return the mapping config that maps base_dim to supp_dim.

    Raises
    ------
    DSGInvalidParameter
        Raised by the base-dimension check if supp_dim is a base dimension.
    DSGValueNotRegistered
        Raised if no mapping is stored for the pair.
    """
    self._check_not_base_dimension(supp_dim)

    wanted = (base_dim.model.dimension_id, supp_dim.model.dimension_id)
    for mapping in self._base_to_supplemental_mappings.values():
        pair = (
            mapping.model.from_dimension.dimension_id,
            mapping.model.to_dimension.dimension_id,
        )
        if pair == wanted:
            return mapping

    msg = f"No mapping is stored for base = {base_dim.model.label}, supplemental = {supp_dim.model.label}"
    raise DSGValueNotRegistered(msg)
970
+
971
def get_base_to_supplemental_mapping_records(
    self, base_dim: DimensionBaseConfigWithFiles, supp_dim: DimensionBaseConfigWithFiles
) -> DataFrame:
    """Return the mapping records between base_dim and supp_dim.

    Rows whose to_id is NULL are filtered out.
    """
    mapping_config = self.get_base_to_supplemental_config(base_dim, supp_dim)
    records = mapping_config.get_records_dataframe()
    return records.filter("to_id is not NULL")
979
+
980
def has_base_to_supplemental_dimension_mapping_types(self, dimension_type) -> bool:
    """Return True if a base-to-supplemental mapping exists for the dimension type.

    The same type is used for both the from-side and the to-side of the mapping.
    """
    mappings = self._base_to_supplemental_mappings
    return self._has_mapping(dimension_type, dimension_type, mappings)
987
+
988
def get_base_dimension_by_id(self, dimension_id: str) -> DimensionBaseConfig:
    """Return the first base dimension whose dimension_id matches.

    Raises
    ------
    DSGValueNotRegistered
        Raised if no base dimension has that ID.
    """
    found = next(
        (x for x in self._iter_base_dimensions() if x.model.dimension_id == dimension_id),
        None,
    )
    if found is not None:
        return found

    msg = f"Did not find a base dimension with {dimension_id=}"
    raise DSGValueNotRegistered(msg)
995
+
996
def get_base_dimension_records_by_id(self, dimension_id: str) -> DataFrame:
    """Return the records of the base dimension with the given dimension_id.

    Raises
    ------
    DSGInvalidParameter
        Raised if the dimension is not a DimensionBaseConfigWithFiles.
    """
    dim = self.get_base_dimension_by_id(dimension_id)
    if isinstance(dim, DimensionBaseConfigWithFiles):
        return dim.get_records_dataframe()

    msg = f"{dim.model.label} does not have records"
    raise DSGInvalidParameter(msg)
1003
+
1004
def _check_not_base_dimension(self, dim: DimensionBaseConfig) -> None:
    """Raise if dim is one of the project's base dimensions.

    Raises
    ------
    DSGInvalidParameter
        Raised if a base dimension of the same type has the same dimension_id.
    """
    same_type = self.list_base_dimensions(dimension_type=dim.model.dimension_type)
    if any(dim.model.dimension_id == base.model.dimension_id for base in same_type):
        msg = f"Cannot pass base dimension: {dim.model.label}"
        raise DSGInvalidParameter(msg)
1010
+
1011
+ @staticmethod
1012
+ def _has_mapping(
1013
+ from_dimension_type: DimensionType, to_dimension_type: DimensionType, mapping: dict
1014
+ ) -> bool:
1015
+ for config in mapping.values():
1016
+ if (
1017
+ config.model.from_dimension.dimension_type == from_dimension_type
1018
+ and config.model.to_dimension.dimension_type == to_dimension_type
1019
+ ):
1020
+ return True
1021
+ return False
1022
+
1023
def list_dimension_names(self, category: DimensionCategory | None = None) -> list[str]:
    """Return the sorted names of the project's dimensions.

    Parameters
    ----------
    category : DimensionCategory | None
        If set, only dimensions in this category are included.

    Raises
    ------
    NotImplementedError
        Raised if the category is not base, subset, or supplemental.
    """
    if category is None:
        return sorted(self._dimensions_by_name.keys())

    if category == DimensionCategory.BASE:
        dims = self._iter_base_dimensions()
    elif category == DimensionCategory.SUBSET:
        dims = self._iter_subset_dimensions()
    elif category == DimensionCategory.SUPPLEMENTAL:
        dims = self._iter_supplemental_dimensions()
    else:
        msg = f"{category=}"
        raise NotImplementedError(msg)

    return sorted(x.model.name for x in dims)
1046
+
1047
def list_dimension_names_by_type(self, dimension_type: DimensionType) -> list[str]:
    """List the names of all dimensions (any category) of the given type."""
    names = []
    for dim in self.iter_dimensions():
        if dim.model.dimension_type == dimension_type:
            names.append(dim.model.name)
    return names
1054
+
1055
def get_dimension_names_mapped_to_type(self) -> dict[str, DimensionType]:
    """Map the name of every dimension in the project to its dimension type."""
    types_by_name: dict[str, DimensionType] = {}
    for dim in self.iter_dimensions():
        types_by_name[dim.model.name] = dim.model.dimension_type
    return types_by_name
1058
+
1059
def get_dimension_type_to_base_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Map every DimensionType to the names of its base dimensions.

    Every dimension type gets an entry; the list is empty when there are none.
    """
    return {
        dimension_type: [
            x.model.name for x in self.list_base_dimensions(dimension_type=dimension_type)
        ]
        for dimension_type in DimensionType
    }
1067
+
1068
def get_subset_dimension_to_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Map dimension types to the names of their subset dimension selectors.

    The result is a defaultdict(list); a type only gets a stored entry when it
    has at least one selector.
    """
    query_names = defaultdict(list)
    for dimension_type in DimensionType:
        groups = self._subset_dimensions.get(dimension_type, {})
        names = [
            dim.model.name for selectors in groups.values() for dim in selectors.values()
        ]
        # Avoid touching the defaultdict for types without selectors so that no
        # empty entry is created for them.
        if names:
            query_names[dimension_type].extend(names)
    return query_names
1077
+
1078
def get_supplemental_dimension_to_name_mapping(self) -> dict[DimensionType, list[str]]:
    """Map every DimensionType to the names of its supplemental dimensions.

    The names for each type are sorted; every dimension type gets an entry.
    """
    return {
        dimension_type: [
            x.model.name
            for x in self.list_supplemental_dimensions(dimension_type, sort_by="name")
        ]
        for dimension_type in DimensionType
    }
1087
+
1088
def get_dimension_names_model(self) -> ProjectDimensionNamesModel:
    """Build a ProjectDimensionNamesModel from the project's dimension names.

    For each dimension type, the base, subset, and supplemental names are
    collected under the type's value.
    """
    base = self.get_dimension_type_to_base_name_mapping()
    subset = self.get_subset_dimension_to_name_mapping()
    supplemental = self.get_supplemental_dimension_to_name_mapping()
    model: dict[str, Any] = {
        dimension_type.value: {
            "base": base[dimension_type],
            "subset": subset[dimension_type],
            "supplemental": supplemental[dimension_type],
        }
        for dimension_type in DimensionType
    }
    return ProjectDimensionNamesModel(**model)
1101
+
1102
def set_dimensions(
    self,
    base_dimensions: dict[ConfigKey, DimensionBaseConfig],
    subset_dimensions: dict[
        DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfigWithFiles]]
    ],
    supplemental_dimensions: dict[ConfigKey, DimensionBaseConfig],
) -> None:
    """Replace the project's dimensions and rebuild the by-name index.

    The existing containers are updated in place, so references to them held
    elsewhere stay valid. Nothing is modified if validation fails.

    Raises
    ------
    DSGInvalidDimension
        Raised if two dimensions share the same name.
    """
    # Snapshot the inputs first. A caller may pass the config's own containers
    # back in; clearing before copying would silently discard them.
    new_base = dict(base_dimensions)
    new_subset = dict(subset_dimensions)
    new_supplemental = dict(supplemental_dimensions)

    # Build the name index before touching any state. The order matches
    # iter_dimensions: base, then subset, then supplemental.
    subset_dims = (
        dim
        for groups in new_subset.values()
        for selectors in groups.values()
        for dim in selectors.values()
    )
    by_name: dict[str, DimensionBaseConfig] = {}
    for dim in itertools.chain(new_base.values(), subset_dims, new_supplemental.values()):
        if dim.model.name in by_name:
            msg = f"name={dim.model.name} exists multiple times in project {self.config_id}"
            raise DSGInvalidDimension(msg)
        by_name[dim.model.name] = dim

    for target, source in (
        (self._base_dimensions, new_base),
        (self._subset_dimensions, new_subset),
        (self._supplemental_dimensions, new_supplemental),
        (self._dimensions_by_name, by_name),
    ):
        target.clear()
        target.update(source)
1122
+
1123
def set_dimension_mappings(
    self, base_to_supplemental_mappings: dict[ConfigKey, MappingTableConfig]
) -> None:
    """Replace the project's base-to-supplemental mapping configs.

    The existing container is updated in place.
    """
    # Copy first: if the caller passes the config's own container, clearing it
    # before the update would discard every mapping.
    new_mappings = dict(base_to_supplemental_mappings)
    self._base_to_supplemental_mappings.clear()
    self._base_to_supplemental_mappings.update(new_mappings)
    # TODO: Once we start using these we may need to store by (from, to) as key instead.
1129
+
1130
def add_dataset_dimension_mappings(
    self, dataset_config: DatasetConfig, references: list[DimensionMappingReferenceModel]
):
    """Add a dataset's dimension mappings to the project.

    References are stored in the project model under the dataset's ID. A
    reference is skipped if its mapping_id is already stored for the dataset,
    including when the same mapping_id appears more than once in references.
    """
    dataset_id = dataset_config.model.dataset_id
    mappings = self.model.dimension_mappings.dataset_to_project.setdefault(dataset_id, [])
    existing_ids = {x.mapping_id for x in mappings}
    for reference in references:
        if reference.mapping_id in existing_ids:
            continue
        mappings.append(reference)
        # Track the new ID so that a repeat later in the same call is ignored.
        existing_ids.add(reference.mapping_id)
        logger.info(
            "Added dimension mapping for dataset=%s: %s",
            dataset_id,
            reference.mapping_id,
        )
1154
+
1155
def add_dataset_base_dimension_names(
    self, dataset_id: str, base_dimension_names: DatasetBaseDimensionNamesModel
):
    """Store the project base dimension names that the dataset maps to.

    Raises
    ------
    DSGInvalidParameter
        Raised if any field of base_dimension_names is None.
    """
    field_names = type(base_dimension_names).model_fields
    unset = next(
        (name for name in field_names if getattr(base_dimension_names, name) is None),
        None,
    )
    if unset is not None:
        field = unset
        msg = f"DatasetBaseDimensionNamesModel {field} cannot be None"
        raise DSGInvalidParameter(msg)

    self.get_dataset(dataset_id).base_dimension_names = base_dimension_names
1165
+
1166
def get_dataset_base_dimension_names(self, dataset_id: str) -> DatasetBaseDimensionNamesModel:
    """Return the project base dimension names stored for the dataset."""
    dataset = self.get_dataset(dataset_id)
    return dataset.base_dimension_names
1169
+
1170
@property
def config_id(self) -> str:
    """Return the project ID, which serves as this config's ID."""
    model = self._model
    return model.project_id
1173
+
1174
def get_dataset(self, dataset_id: str) -> InputDatasetModel:
    """Return the first dataset in the project model with the given ID.

    Raises
    ------
    DSGInvalidField
        Raised if the project has no dataset with that ID.
    """
    found = next((x for x in self.model.datasets if x.dataset_id == dataset_id), None)
    if found is not None:
        return found

    msg = f"project_id={self._model.project_id} does not have dataset_id={dataset_id}"
    raise DSGInvalidField(msg)
1182
+
1183
def has_dataset(self, dataset_id: str, status: DatasetRegistryStatus | None = None) -> bool:
    """Return True if the dataset_id is present in the configuration.

    Parameters
    ----------
    dataset_id : str
    status : None | DatasetRegistryStatus
        If set, only return True if the status matches. Defaults to None, in
        which case any status is accepted.
    """
    for dataset in self.iter_datasets():
        if dataset.dataset_id == dataset_id:
            # The first dataset with a matching ID decides the outcome.
            return status is None or dataset.status == status
    return False
1199
+
1200
def get_load_data_time_columns(self, name: str) -> list[str]:
    """Return the load data time columns reported by the named time dimension."""
    return self.get_time_dimension(name).get_load_data_time_columns()
1205
+
1206
def iter_datasets(self) -> Generator[InputDatasetModel, None, None]:
    """Yield each dataset defined in the project model, in order."""
    yield from self.model.datasets
1209
+
1210
def _iter_base_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
    """Yield each stored base dimension."""
    for dim in self._base_dimensions.values():
        yield dim
1212
+
1213
def _iter_subset_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
    """Yield each stored subset dimension, flattening the nested containers."""
    for groups in self._subset_dimensions.values():
        for selectors in groups.values():
            yield from selectors.values()
1218
+
1219
def _iter_supplemental_dimensions(self) -> Generator[DimensionBaseConfig, None, None]:
    """Yield each stored supplemental dimension."""
    for dim in self._supplemental_dimensions.values():
        yield dim
1221
+
1222
def iter_dimensions(self) -> Iterable[DimensionBaseConfig]:
    """Return an iterator over all dimensions of the project.

    Base dimensions come first, followed by subset and then supplemental
    dimensions.
    """
    sources = (
        self._iter_base_dimensions(),
        self._iter_subset_dimensions(),
        self._iter_supplemental_dimensions(),
    )
    return itertools.chain.from_iterable(sources)
1235
+
1236
def list_registered_dataset_ids(self) -> list[str]:
    """List the IDs of the project's datasets whose status is REGISTERED."""
    registered = self._iter_datasets_by_status(DatasetRegistryStatus.REGISTERED)
    return [dataset.dataset_id for dataset in registered]
1240
+
1241
def list_unregistered_dataset_ids(self) -> list[str]:
    """List the IDs of the project's datasets whose status is UNREGISTERED."""
    unregistered = self._iter_datasets_by_status(DatasetRegistryStatus.UNREGISTERED)
    return [dataset.dataset_id for dataset in unregistered]
1245
+
1246
def _iter_datasets_by_status(
    self, status: DatasetRegistryStatus
) -> Generator[InputDatasetModel, None, None]:
    """Yield the project's datasets whose status equals the given status."""
    yield from (x for x in self.iter_datasets() if x.status == status)
1252
+
1253
def get_required_dimension_record_ids(
    self, dataset_id: str, dimension_type: DimensionType
) -> set[str]:
    """Return the required record IDs for the dataset and dimension type.

    The IDs from the single-dimensional requirement and from every
    multi-dimensional requirement are combined into one set.
    """
    field = dimension_type.value
    required = self.get_dataset(dataset_id).required_dimensions

    single_req = getattr(required.single_dimensional, field)
    record_ids = self._get_required_dimension_record_ids(single_req)

    for multi_req in required.multi_dimensional:
        extra_ids = self._get_required_dimension_record_ids(getattr(multi_req, field))
        record_ids.update(extra_ids)

    return record_ids
1265
+
1266
def _build_multi_dim_requirement_associations(
    self, multi_dim_reqs: list[RequiredDimensionRecordsModel], context: ScratchDirContext
) -> list[DataFrame]:
    """Build one table of required record combinations per dimension combination.

    For each multi-dimensional requirement, the required record IDs of every
    dimension that has any are expanded into a table with
    create_dataframe_from_product. Tables covering the same set of dimensions
    are unioned together.

    Example: partial sector and subsector combinations are required.
    [
        {"sector": {"base": ["com"]},
         "subsector": {"supplemental": {"name": "commercial-subsectors",
                                        "record_ids": ["commercial_subsectors"]}}},
        {"sector": {"base": ["res"]}, "subsector": {"base": ["MidriseApartment"]}},
    ]
    Both entries cover (sector, subsector), so they end up in one table.
    """
    grouped: dict[tuple[str, ...], DataFrame] = {}
    field_names = sorted(RequiredDimensionRecordsModel.model_fields)

    for multi_req in multi_dim_reqs:
        columns = {}
        dim_combo = []
        for field in field_names:
            dim_type = DimensionType(field)
            record_ids = self._get_required_dimension_record_ids(getattr(multi_req, field))
            if not record_ids:
                continue
            columns[field] = list(record_ids)
            dim_combo.append(dim_type.value)

        df = create_dataframe_from_product(columns, context)
        # Use a fixed column order so that tables for the same combination line up.
        df = df.select(*sorted(df.columns))

        combo_key = tuple(sorted(dim_combo))
        previous = grouped.get(combo_key)
        grouped[combo_key] = df if previous is None else previous.union(df)

    return list(grouped.values())
1303
+
1304
def _get_required_dimension_record_ids(
    self, reqs: RequiredDimensionRecordsByTypeModel
) -> set[str]:
    """Return the required record IDs for one dimension.

    Combines the IDs required through the base dimension with the IDs required
    through subset dimension selectors.
    """
    base_ids = self._get_required_base_dimension_record_ids(reqs)
    subset_ids = self._get_required_record_ids_from_subsets(reqs)
    base_ids.update(subset_ids)
    return base_ids
1313
+
1314
def _get_required_base_dimension_record_ids(
    self, reqs: RequiredDimensionRecordsByTypeModel
) -> set[str]:
    """Return the required record IDs for a base dimension based on the specification in the
    project config.

    Three forms are handled, in this order of precedence:
    - ``base.record_ids == ["__all__"]``: every record of the base dimension.
    - ``base.record_ids`` set: exactly those records.
    - ``base_missing.record_ids`` set: every record except those listed.

    The returned set is always a new object that the caller may modify.

    Raises
    ------
    DSGInvalidDataset
        Raised if a listed record ID is not in the base dimension's records.
    """
    record_ids: set[str] = set()
    if not reqs.base.record_ids and not reqs.base_missing.record_ids:
        return record_ids

    base_dim_query_name = reqs.base.dimension_name or reqs.base_missing.dimension_name
    assert base_dim_query_name is not None
    all_base_record_ids = self.get_dimension_record_ids(base_dim_query_name)

    if reqs.base.record_ids == ["__all__"]:
        assert reqs.base.dimension_name is not None
        # Copy: callers extend the result in place, which must not alter the
        # set handed out by the dimension.
        record_ids = set(all_base_record_ids)
    elif reqs.base.record_ids:
        record_ids = set(reqs.base.record_ids)
        if diff := record_ids - all_base_record_ids:
            msg = (
                "The project config requires these record IDs in the dataset's 'base' "
                "field, but they are not in the base dimension records: "
                f"name={base_dim_query_name}: {diff=}"
            )
            raise DSGInvalidDataset(msg)
    elif reqs.base_missing.record_ids:
        assert reqs.base_missing.dimension_name is not None
        missing_ids = set(reqs.base_missing.record_ids)
        if diff := missing_ids - all_base_record_ids:
            msg = (
                "The project config requires these record IDs in the dataset's "
                "'base_missing' field, but they are not in the base dimension "
                f"name={base_dim_query_name}: {diff=}"
            )
            raise DSGInvalidDataset(msg)
        record_ids = all_base_record_ids - missing_ids

    return record_ids
1353
+
1354
def _get_subset_dimension_records(self, name: str, selector_name: str) -> set[str]:
    """Return the record IDs of one selector within a subset dimension group.

    Parameters
    ----------
    name : str
        Name of the subset dimension group.
    selector_name : str
        Name of the selector dimension within that group.

    Raises
    ------
    DSGInvalidDimension
        Raised if no such group and selector combination exists.
    """
    for group in self.model.dimensions.subset_dimensions:
        if group.name != name:
            continue
        for ref in group.selector_references:
            key = ConfigKey(ref.dimension_id, ref.version)
            selector = self._subset_dimensions[group.dimension_type][group.name][key]
            if selector.model.name != selector_name:
                continue
            assert isinstance(selector, DimensionBaseConfigWithFiles)
            return selector.get_unique_ids()

    msg = f"subset dimension selector not found: {name=} {selector_name=}"
    raise DSGInvalidDimension(msg)
1366
+
1367
def _get_required_record_ids_from_subsets(
    self, req: RequiredDimensionRecordsByTypeModel
) -> set[str]:
    """Return the union of record IDs from every subset selector named in req."""
    selector_pairs = (
        (subset.name, selector_name)
        for subset in req.subset
        for selector_name in subset.selectors
    )
    record_ids = set()
    for group_name, selector_name in selector_pairs:
        record_ids.update(self._get_subset_dimension_records(group_name, selector_name))
    return record_ids
1375
+
1376
@track_timing(timer_stats_collector)
def make_dimension_association_table(
    self, dataset_id: str, context: ScratchDirContext
) -> DataFrame:
    """Build the table of dimension record combinations the dataset must provide.

    The multi-dimensional requirements are expanded into tables first. Every
    remaining dimension is taken from the single-dimensional requirements, and
    everything is cross-joined into one table.
    """
    required_dimensions = self.get_dataset(dataset_id).required_dimensions
    multi_dfs = self._build_multi_dim_requirement_associations(
        required_dimensions.multi_dimensional, context
    )

    # Project config construction asserts that there is no intersection of dimensions in
    # multi and single.
    covered: set[str] = set()
    for df in multi_dfs:
        covered.update(set(df.columns))

    single_columns: dict[str, list[str]] = {}
    for field in RequiredDimensionRecordsModel.model_fields:
        if field in covered:
            continue
        req = getattr(required_dimensions.single_dimensional, field)
        single_columns[field] = list(self._get_required_dimension_record_ids(req))

    single_df = create_dataframe_from_product(single_columns, context)
    return cross_join_dfs([*multi_dfs, single_df])
1402
+
1403
def are_all_datasets_submitted(self) -> bool:
    """Return True if no dataset IDs remain unregistered, otherwise False."""
    unregistered = self.list_unregistered_dataset_ids()
    return False if unregistered else True
1406
+
1407
def set_status(self, status: ProjectRegistryStatus) -> None:
    """Store ``status`` on the project model and log the change.

    Parameters
    ----------
    status : ProjectRegistryStatus
        New status for the project.
    """
    self.model.status = status
    logger.info(
        "Set project_id=%s status=%s",
        self.config_id,
        status,
    )
1411
+
1412
def set_dataset_status(self, dataset_id: str, status: DatasetRegistryStatus):
    """Store ``status`` on the given dataset and log the change.

    Parameters
    ----------
    dataset_id : str
        ID of the dataset to update; looked up through ``get_dataset``.
    status : DatasetRegistryStatus
        New status for the dataset.

    Raises
    ------
    ValueError
        Raised if dataset_id is not stored.
    """
    self.get_dataset(dataset_id).status = status
    project_id = self._model.project_id
    logger.info(
        "Set dataset_id=%s status=%s for project_id=%s", dataset_id, status, project_id
    )
1428
+
1429
@property
def base_dimensions(self) -> dict:
    """Base dimensions of the project.

    Returns
    -------
    dict
        The stored mapping itself (not a copy): DimensionConfig values keyed by ConfigKey.

    """
    return self._base_dimensions
1440
+
1441
@property
def supplemental_dimensions(self) -> dict:
    """Supplemental dimensions of the project.

    Returns
    -------
    dict
        The stored mapping itself (not a copy): DimensionConfig values keyed by ConfigKey.

    """
    return self._supplemental_dimensions
1452
+
1453
+
1454
def load_subset_dimensions(filename: Path) -> tuple[set[str], dict[str, list[str]]]:
    """Load a subset dimension records file.

    The CSV file must contain an ``id`` column plus at least one other column. Each
    other column names a subset dimension; a record belongs to that subset when its
    cell in the column is not empty.

    Parameters
    ----------
    filename : Path
        Path to the CSV file.

    Returns
    -------
    tuple[set[str], dict[str, list[str]]]
        All record IDs found in the file, and a mapping of subset dimension name to
        the record IDs that belong to it (in file order).

    Raises
    ------
    DSGInvalidDimension
        Raised if the file has no column other than ``id``.
    """
    # Read IDs as strings: with inferred dtypes, numeric-looking IDs such as "01" would
    # be parsed as integers, losing leading zeros and breaking the declared return type.
    df = pd.read_csv(filename, index_col="id", dtype={"id": str})
    if df.columns.empty:
        msg = "A subset dimension records file must have at least one dimension column."
        raise DSGInvalidDimension(msg)
    record_ids = set(df.index)
    subset_by_dim_name = {x: df[x].dropna().index.to_list() for x in df.columns}
    return record_ids, subset_by_dim_name