dsgrid_toolkit-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (152)
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
dsgrid/query/derived_dataset.py
@@ -0,0 +1,384 @@
+ import logging
+ from pathlib import Path
+
+ import chronify
+ import json5
+
+ from dsgrid.chronify import create_store
+ from dsgrid.common import VALUE_COLUMN
+ from dsgrid.config.dataset_config import (
+     DataClassificationType,
+     DataSchemaType,
+     InputDatasetType,
+ )
+ from dsgrid.config.dataset_config import DatasetConfig
+ from dsgrid.config.dimension_config import DimensionBaseConfigWithFiles, DimensionConfig
+ from dsgrid.config.dimensions import DimensionModel
+ from dsgrid.config.project_config import ProjectConfig
+ from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
+ from dsgrid.dataset.models import TableFormatType
+ from dsgrid.dimension.base_models import DimensionType, DimensionCategory
+ from dsgrid.dsgrid_rc import DsgridRuntimeConfig
+ from dsgrid.exceptions import DSGInvalidDataset
+ from dsgrid.query.models import ProjectQueryModel, DatasetMetadataModel, ColumnType
+ from dsgrid.query.query_submitter import QuerySubmitterBase
+ from dsgrid.registry.registry_manager import RegistryManager
+ from dsgrid.spark.functions import make_temp_view_name
+ from dsgrid.spark.types import DataFrame
+ from dsgrid.utils.files import dump_data
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
+ from dsgrid.utils.spark import read_dataframe, get_unique_values
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def create_derived_dataset_config_from_query(
+     query_path: Path, dst_path: Path, registry_manager: RegistryManager
+ ):
+     """Create a DatasetConfigModel and dimensions from a query result.
+
+     Parameters
+     ----------
+     query_path : Path
+         Output directory from a query.
+     dst_path : Path
+         Directory in which to create the dataset config files.
+     registry_manager : RegistryManager
+
+     Returns
+     -------
+     bool
+         Returns True if the operation is successful.
+     """
+     metadata_file = QuerySubmitterBase.metadata_filename(query_path)
+     query_file = QuerySubmitterBase.query_filename(query_path)
+     table_file = QuerySubmitterBase.table_filename(query_path)
+     if not metadata_file.exists() or not query_file.exists() or not table_file.exists():
+         logger.error("%s is not a valid query result directory", query_path)
+         return False
+
+     query = ProjectQueryModel.from_file(query_file)
+     if not does_query_support_a_derived_dataset(query):
+         return False
+
+     metadata = DatasetMetadataModel.from_file(metadata_file)
+     format_type = metadata.get_table_format_type()
+     table_format = {"format_type": format_type.value}
+     if format_type == TableFormatType.PIVOTED:
+         table_format["pivoted_dimension_type"] = metadata.table_format.pivoted_dimension_type.value
+
+     project = registry_manager.project_manager.load_project(
+         query.project.project_id, version=query.project.version
+     )
+     new_supplemental_dims_path = dst_path / "new_supplemental_dimensions"
+     df = read_dataframe(table_file)
+     # TODO: should there be a warning if the current project version is later?
+
+     # This code block compares the dimension records in the dataframe against the project's base
+     # and supplemental dimensions.
+     # If the records match an existing dimension, add a reference to that dimension in the
+     # dataset config.
+     # If the records don't match an existing dimension, create a new supplemental dimension and
+     # base-to-supplemental mapping that the user will need to register.
+     dimension_references = []
+     dimension_mapping_references = []
+     base_dim_query_names = set(
+         project.config.list_dimension_names(category=DimensionCategory.BASE)
+     )
+     num_new_supplemental_dimensions = 0
+     for dim_type in DimensionType:
+         dimension_names = metadata.dimensions.get_dimension_names(dim_type)
+         assert len(dimension_names) == 1, dimension_names
+         dim_query_name = next(iter(dimension_names))
+         if dim_type == DimensionType.TIME:
+             time_dim = project.config.get_time_dimension(dim_query_name)
+             is_valid = _does_time_dimension_match(time_dim, df, table_file)
+             if not is_valid:
+                 logger.warning(
+                     "The dataset does not match the project's time dimension. "
+                     "If this is expected, add a new time dimension to the dataset config file "
+                     "and create an appropriate dimension mapping."
+                 )
+             continue
+
+         dim = project.config.get_dimension_with_records(dim_query_name)
+         if (
+             format_type == TableFormatType.PIVOTED
+             and metadata.table_format.pivoted_dimension_type == dim_type
+         ):
+             unique_data_records = metadata.dimensions.get_column_names(dim_type)
+         else:
+             unique_data_records = _get_unique_data_records(df, dim.model, query.result.column_type)
+         is_valid = _is_dimension_valid_for_dataset(dim, unique_data_records)
+
+         if is_valid:
+             dimension_references.append(_get_dimension_reference(dim, project.config))
+             if dim_query_name not in base_dim_query_names:
+                 dimension_mapping_references.append(
+                     _get_supplemental_dimension_mapping_reference(dim, project.config, metadata)
+                 )
+         else:
+             subset_dim_ref = project.config.get_matching_subset_dimension(
+                 dim_type, unique_data_records
+             )
+             if subset_dim_ref is not None:
+                 dimension_references.append(subset_dim_ref.serialize())
+                 continue
+
+             supp_dim = _get_matching_supplemental_dimension(
+                 project.config, dim_type, unique_data_records
+             )
+             if supp_dim is None:
+                 assert dim_query_name in base_dim_query_names, dim_query_name
+                 _make_new_supplemental_dimension(
+                     dim, unique_data_records, new_supplemental_dims_path
+                 )
+                 num_new_supplemental_dimensions += 1
+             else:
+                 dimension_references.append(_get_dimension_reference(supp_dim, project.config))
+                 dimension_mapping_references.append(
+                     _get_supplemental_dimension_mapping_reference(
+                         supp_dim, project.config, metadata
+                     )
+                 )
+
+     if dimension_mapping_references:
+         _make_dimension_mapping_references_file(dimension_mapping_references, dst_path)
+
+     _make_dataset_config(
+         query.project.dataset.dataset_id,
+         table_format,
+         dimension_references,
+         dst_path,
+         num_new_supplemental_dimensions,
+     )
+     return True
+
+
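A minimal sketch of how this entry point can be driven from Python rather than the dsgrid CLI. The registry and query paths below are hypothetical, and RegistryManager.load is assumed to accept a registry location as shown; check the dsgrid documentation for the exact signature.

from pathlib import Path

from dsgrid.query.derived_dataset import create_derived_dataset_config_from_query
from dsgrid.registry.registry_manager import RegistryManager

# Hypothetical paths; substitute your own registry and query output directory.
registry_manager = RegistryManager.load("./dsgrid-registry")  # signature assumed
query_output = Path("query_output") / "my_project_query"
dst = Path("derived_dataset_config")
dst.mkdir(exist_ok=True)

if create_derived_dataset_config_from_query(query_output, dst, registry_manager):
    print(f"Wrote dataset config files to {dst}")
else:
    print("This query output cannot back a derived dataset; see the log for details.")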
+ def does_query_support_a_derived_dataset(query: ProjectQueryModel):
+     """Return True if a derived dataset can be created from a query.
+
+     Returns
+     -------
+     bool
+     """
+     is_valid = True
+     if query.result.column_type != ColumnType.DIMENSION_TYPES:
+         is_valid = False
+         logger.error(
+             "Cannot create a derived dataset from a query with column_type = %s. It must be %s.",
+             query.result.column_type.value,
+             ColumnType.DIMENSION_TYPES.value,
+         )
+     if query.result.replace_ids_with_names:
+         is_valid = False
+         logger.error("Cannot create a derived dataset from a query with replace_ids_with_names")
+
+     return is_valid
+
+
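To illustrate the gate above: only two fields of the query model are read, so duck-typed stand-ins (not real ProjectQueryModel instances) are enough to exercise the check.

from types import SimpleNamespace

from dsgrid.query.derived_dataset import does_query_support_a_derived_dataset
from dsgrid.query.models import ColumnType

# Stand-ins carrying only the fields the check reads.
ok = SimpleNamespace(
    result=SimpleNamespace(column_type=ColumnType.DIMENSION_TYPES, replace_ids_with_names=False)
)
bad = SimpleNamespace(
    result=SimpleNamespace(column_type=ColumnType.DIMENSION_TYPES, replace_ids_with_names=True)
)
assert does_query_support_a_derived_dataset(ok)
assert not does_query_support_a_derived_dataset(bad)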
+ def _does_time_dimension_match(dim_config: TimeDimensionBaseConfig, df: DataFrame, df_path: Path):
+     try:
+         if dim_config.supports_chronify():
+             _check_time_dimension_with_chronify(dim_config, df, df_path)
+         else:
+             dim_config.check_dataset_time_consistency(df, dim_config.get_load_data_time_columns())
+     except DSGInvalidDataset:
+         return False
+     return True
+
+
+ def _check_time_dimension_with_chronify(
+     dim_config: TimeDimensionBaseConfig, df: DataFrame, df_path: Path
+ ):
+     scratch_dir = DsgridRuntimeConfig.load().get_scratch_dir()
+     with ScratchDirContext(scratch_dir) as scratch_dir_context:
+         time_cols = dim_config.get_load_data_time_columns()
+         time_array_id_columns = [
+             x
+             for x in df.columns
+             # If there are multiple weather years:
+             # - that are continuous, weather year needs to be excluded (one overall range).
+             # - that are not continuous, weather year needs to be included and chronify
+             #   needs additional support. TODO: issue #340
+             if x != DimensionType.WEATHER_YEAR.value
+             and x in set(df.columns).difference(time_cols).difference({VALUE_COLUMN})
+         ]
+         schema = chronify.TableSchema(
+             name=make_temp_view_name(),
+             time_config=dim_config.to_chronify(),
+             time_array_id_columns=time_array_id_columns,
+             value_column=VALUE_COLUMN,
+         )
+         store_file = scratch_dir_context.get_temp_filename(suffix=".db")
+         with create_store(store_file) as store:
+             # This performs all of the checks.
+             store.create_view_from_parquet(df_path, schema)
+             store.drop_view(schema.name)
+
+
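The list comprehension above is the subtle part of this function: a column becomes a time-array id column only if it is not a time column, not the value column, and not weather_year. The same selection in plain Python, with made-up column names:

# Made-up columns illustrating the time_array_id_columns selection.
columns = ["timestamp", "geography", "sector", "weather_year", "value"]
time_cols = ["timestamp"]  # stands in for dim_config.get_load_data_time_columns()
value_column = "value"     # stands in for dsgrid's VALUE_COLUMN
time_array_id_columns = [
    x
    for x in columns
    if x != "weather_year"
    and x in set(columns).difference(time_cols).difference({value_column})
]
assert time_array_id_columns == ["geography", "sector"]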
+ def _is_dimension_valid_for_dataset(
+     dim_config: DimensionBaseConfigWithFiles, unique_data_records: set
+ ):
+     records = dim_config.get_records_dataframe()
+     dim_values = get_unique_values(records, "id")
+     diff = dim_values.symmetric_difference(unique_data_records)
+     if not diff:
+         return True
+
+     return False
+
+
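The validity test is exact-match: the dataset must contain every record id the dimension defines and nothing else, which is what an empty symmetric difference means. A quick worked example with plain sets:

dim_record_ids = {"res", "com", "ind"}  # the dimension's "id" column
assert not dim_record_ids.symmetric_difference({"ind", "com", "res"})    # valid: exact match
assert dim_record_ids.symmetric_difference({"res", "com"})               # invalid: missing "ind"
assert dim_record_ids.symmetric_difference({"res", "com", "ind", "ag"})  # invalid: extra "ag"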
+ def _get_matching_supplemental_dimension(
+     project_config: ProjectConfig,
+     dimension_type: DimensionType,
+     unique_data_records: set,
+ ) -> DimensionBaseConfigWithFiles | None:
+     for dim_config in project_config.list_supplemental_dimensions(dimension_type):
+         if _is_dimension_valid_for_dataset(dim_config, unique_data_records):
+             return dim_config
+
+     return None
+
+
+ def _make_dataset_config(
+     dataset_id,
+     table_format: dict[str, str],
+     dimension_references,
+     path: Path,
+     num_new_supplemental_dimensions,
+     data_classification=DataClassificationType.MODERATE.value,
+ ):
+     # Use dictionaries instead of DatasetConfigModel to avoid validation, which isn't possible
+     # here.
+     config = {
+         "dataset_id": dataset_id,
+         "dataset_type": InputDatasetType.MODELED.value,
+         "data_schema": {
+             "data_schema_type": DataSchemaType.ONE_TABLE.value,
+             "table_format": table_format,
+         },
+         "version": "1.0.0",
+         "description": "",
+         "origin_creator": "",
+         "origin_organization": "",
+         "origin_date": "",
+         "origin_project": "",
+         "origin_version": "",
+         "data_source": "",
+         "source": "",
+         "data_classification": data_classification,
+         "use_project_geography_time_zone": True,
+         "dimensions": [],
+         "dimension_references": dimension_references,
+     }
+     config_file = path / DatasetConfig.config_filename()
+     config_file.write_text(json5.dumps(config, indent=2))
+     if num_new_supplemental_dimensions > 0:
+         logger.info(
+             "Generated %s new supplemental dimensions. Review the records and fill out "
+             "the remaining fields, and then register them.",
+             num_new_supplemental_dimensions,
+         )
+     logger.info(
+         "Created %s with default information. Re-used %s project dimensions. "
+         "Examine %s, fill out the remaining fields, and register any new dimensions "
+         "before registering and submitting the dataset to the project.",
+         path,
+         len(dimension_references),
+         config_file,
+     )
+
+
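Because several fields are intentionally written as empty strings, the generated file must be edited before registration. A short follow-up sketch; the directory and field values are hypothetical, and the file name is whatever DatasetConfig.config_filename() returns.

import json5
from pathlib import Path

from dsgrid.config.dataset_config import DatasetConfig

config_file = Path("derived_dataset_config") / DatasetConfig.config_filename()
config = json5.loads(config_file.read_text())
# Fill in the blank fields; these values are examples only.
config["description"] = "County-level demand derived from a project query"
config["origin_creator"] = "Jane Analyst"
config["origin_organization"] = "Example Lab"
config_file.write_text(json5.dumps(config, indent=2))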
+ def _make_new_supplemental_dimension(orig_dim_config, unique_data_records, path: Path):
+     project_record_ids = orig_dim_config.get_unique_ids()
+     if not unique_data_records.issubset(project_record_ids):
+         diff = project_record_ids.difference(unique_data_records)
+         if diff:
+             msg = (
+                 f"The derived dataset records do not include some project base dimension "
+                 f"records. Dimension type = {orig_dim_config.model.dimension_type} {diff=}"
+             )
+             raise DSGInvalidDataset(msg)
+         assert unique_data_records.issuperset(project_record_ids)
+         diff = unique_data_records.difference(project_record_ids)
+         msg = (
+             f"The derived dataset records are a superset of the project base dimension "
+             f"records. Dimension type = {orig_dim_config.model.dimension_type} {diff=}"
+         )
+         raise DSGInvalidDataset(msg)
+
+     new_dim_path = path / orig_dim_config.model.dimension_type.value
+     new_dim_path.mkdir(parents=True)
+     orig_records = orig_dim_config.get_records_dataframe()
+     records = orig_records.filter(orig_records.id.isin(unique_data_records))
+     # TODO: AWS #186 - not an issue if registry is in a database instead of files
+     filename = new_dim_path / "records.csv"
+     # Use pandas because spark creates a directory.
+     records.toPandas().to_csv(filename, index=False)
+     # Use dictionaries instead of DimensionModel to avoid running the Pydantic validators.
+     # Some won't work, like loading the records. Others, like file_hash, shouldn't get set yet.
+     new_dim = {
+         "type": orig_dim_config.model.dimension_type.value,
+         "name": "",
+         "module": orig_dim_config.model.module,
+         "class_name": orig_dim_config.model.class_name,
+         "description": "",
+         "filename": filename.name,
+     }
+     dump_data(new_dim, new_dim_path / DimensionConfig.config_filename(), indent=2)
+     logger.warning(
+         "The derived dataset does not match any project dimension for dimension "
+         "type %s. Consider creating a new supplemental dimension out of the files in %s",
+         orig_dim_config.model.dimension_type.value,
+         new_dim_path,
+     )
+
+
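The guard at the top of this function allows only a strict subset of the project's base records; anything else raises DSGInvalidDataset. A worked example of the three outcomes with plain sets:

project_ids = {"res", "com", "ind"}

subset = {"res", "com"}                 # accepted: a new supplemental dimension is created
superset = {"res", "com", "ind", "ag"}  # raises: dataset adds records the project lacks
mixed = {"res", "ag"}                   # raises: missing records and extra records at once

assert subset.issubset(project_ids)
assert not superset.issubset(project_ids) and not project_ids.difference(superset)
assert not mixed.issubset(project_ids) and project_ids.difference(mixed)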
+ def _make_dimension_mapping_references_file(dimension_mapping_references, path: Path):
+     dim_mapping_ref_filename = path / "dimension_mapping_references.json5"
+     dim_mapping_ref_filename.write_text(
+         json5.dumps({"references": dimension_mapping_references}, indent=2)
+     )
+     logger.info(
+         "Wrote dimension mapping references file %s with %s references. "
+         "Specify that file when submitting the dataset to the project.",
+         dim_mapping_ref_filename,
+         len(dimension_mapping_references),
+     )
+
+
+ def _get_unique_data_records(df, dim_model: DimensionModel, column_type: ColumnType):
+     match column_type:
+         case ColumnType.DIMENSION_NAMES:
+             column = dim_model.name
+         case ColumnType.DIMENSION_TYPES:
+             column = dim_model.dimension_type.value
+         case _:
+             msg = f"BUG: unhandled: {column_type=}"
+             raise NotImplementedError(msg)
+
+     return get_unique_values(df, column)
+
+
+ def _get_dimension_reference(dim: DimensionBaseConfigWithFiles, project_config: ProjectConfig):
+     dim_ref = project_config.get_dimension_reference(dim.model.dimension_id)
+     return dim_ref.serialize()
+
+
+ def _get_supplemental_dimension_mapping_reference(
+     supp_dim: DimensionBaseConfigWithFiles,
+     project_config: ProjectConfig,
+     metadata: DatasetMetadataModel,
+ ):
+     base_dim_name = getattr(metadata.base_dimension_names, supp_dim.model.dimension_type.value)
+     base_dim = project_config.get_dimension_with_records(base_dim_name)
+     mapping_config = project_config.get_base_to_supplemental_config(base_dim, supp_dim)
+     # Use dictionaries to avoid validation and be consistent with dimension definition.
+     return {
+         "mapping_id": mapping_config.model.mapping_id,
+         "from_dimension_type": base_dim.model.dimension_type.value,
+         "to_dimension_type": supp_dim.model.dimension_type.value,
+         "version": str(mapping_config.model.version),
+     }
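Combined with _make_dimension_mapping_references_file above, these serialized references land in dimension_mapping_references.json5 with roughly the following shape; the mapping_id and version values here are invented for illustration.

import json5

references = {
    "references": [
        {
            "mapping_id": "county_to_state_mapping",  # invented id
            "from_dimension_type": "geography",
            "to_dimension_type": "geography",
            "version": "1.0.0",
        }
    ]
}
print(json5.dumps(references, indent=2))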