dsgrid-toolkit 0.3.3 (cp313-cp313-win_amd64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/query/derived_dataset.py
@@ -0,0 +1,388 @@
+import logging
+from pathlib import Path
+
+import chronify
+import json5
+
+from dsgrid.chronify import create_store
+from dsgrid.common import VALUE_COLUMN
+from dsgrid.config.dataset_config import (
+    DataClassificationType,
+    InputDatasetType,
+)
+from dsgrid.config.dataset_config import DatasetConfig
+from dsgrid.config.dimension_config import DimensionBaseConfigWithFiles, DimensionConfig
+from dsgrid.config.dimensions import DimensionModel
+from dsgrid.config.project_config import ProjectConfig
+from dsgrid.config.time_dimension_base_config import TimeDimensionBaseConfig
+from dsgrid.dataset.models import TableFormat, ValueFormat
+from dsgrid.dimension.base_models import DimensionType, DimensionCategory
+from dsgrid.dsgrid_rc import DsgridRuntimeConfig
+from dsgrid.exceptions import DSGInvalidDataset
+from dsgrid.query.models import ProjectQueryModel, DatasetMetadataModel, ColumnType
+from dsgrid.query.query_submitter import QuerySubmitterBase
+from dsgrid.registry.registry_manager import RegistryManager
+from dsgrid.spark.functions import make_temp_view_name
+from dsgrid.spark.types import DataFrame
+from dsgrid.utils.files import dump_data
+from dsgrid.utils.scratch_dir_context import ScratchDirContext
+from dsgrid.utils.spark import read_dataframe, get_unique_values
+
+
+logger = logging.getLogger(__name__)
+
+
+def create_derived_dataset_config_from_query(
+    query_path: Path, dst_path: Path, registry_manager: RegistryManager
+):
+    """Create a DatasetConfigModel and dimensions from a query result.
+
+    Parameters
+    ----------
+    query_path : Path
+        Output directory from a query.
+    dst_path : Path
+        Directory in which to create the dataset config files.
+    registry_manager : RegistryManager
+
+    Returns
+    -------
+    bool
+        Returns True if the operation is successful.
+    """
+    metadata_file = QuerySubmitterBase.metadata_filename(query_path)
+    query_file = QuerySubmitterBase.query_filename(query_path)
+    table_file = QuerySubmitterBase.table_filename(query_path)
+    if not metadata_file.exists() or not query_file.exists() or not table_file.exists():
+        logger.error("%s is not a valid query result directory", query_path)
+        return False
+
+    query = ProjectQueryModel.from_file(query_file)
+    if not does_query_support_a_derived_dataset(query):
+        return False
+
+    metadata = DatasetMetadataModel.from_file(metadata_file)
+    value_format = metadata.get_value_format()
+    value_format_dict = {"value_format": value_format.value}
+    if value_format == ValueFormat.PIVOTED:
+        value_format_dict[
+            "pivoted_dimension_type"
+        ] = metadata.table_format.pivoted_dimension_type.value
+
+    project = registry_manager.project_manager.load_project(
+        query.project.project_id, version=query.project.version
+    )
+    new_supplemental_dims_path = dst_path / "new_supplemental_dimensions"
+    df = read_dataframe(table_file)
+    # TODO: should there be a warning if the current project version is later?
+
+    # This code block compares the dimension records in the dataframe against the project's base
+    # and supplemental dimensions.
+    # If the records match an existing dimension, add a reference to that dimension in the
+    # dataset config.
+    # If the records don't match an existing dimension, create a new supplemental dimension and
+    # base-to-supplemental mapping that the user will need to register.
+    dimension_references = []
+    dimension_mapping_references = []
+    base_dim_query_names = set(
+        project.config.list_dimension_names(category=DimensionCategory.BASE)
+    )
+    num_new_supplemental_dimensions = 0
+    for dim_type in DimensionType:
+        dimension_names = metadata.dimensions.get_dimension_names(dim_type)
+        assert len(dimension_names) == 1, dimension_names
+        dim_query_name = next(iter(dimension_names))
+        if dim_type == DimensionType.TIME:
+            time_dim = project.config.get_time_dimension(dim_query_name)
+            is_valid = _does_time_dimension_match(time_dim, df, table_file)
+            if not is_valid:
+                logger.warning(
+                    "The dataset does not match the project's time dimension. "
+                    "If this is expected, add a new time dimension to the dataset config file "
+                    "and create an appropriate dimension mapping."
+                )
+            continue
+
+        dim = project.config.get_dimension_with_records(dim_query_name)
+        if (
+            value_format == ValueFormat.PIVOTED
+            and metadata.table_format.pivoted_dimension_type == dim_type
+        ):
+            unique_data_records = metadata.dimensions.get_column_names(dim_type)
+        else:
+            unique_data_records = _get_unique_data_records(df, dim.model, query.result.column_type)
+        is_valid = _is_dimension_valid_for_dataset(dim, unique_data_records)
+
+        if is_valid:
+            dimension_references.append(_get_dimension_reference(dim, project.config))
+            if dim_query_name not in base_dim_query_names:
+                dimension_mapping_references.append(
+                    _get_supplemental_dimension_mapping_reference(dim, project.config, metadata)
+                )
+        else:
+            subset_dim_ref = project.config.get_matching_subset_dimension(
+                dim_type, unique_data_records
+            )
+            if subset_dim_ref is not None:
+                dimension_references.append(subset_dim_ref.serialize())
+                continue
+
+            supp_dim = _get_matching_supplemental_dimension(
+                project.config, dim_type, unique_data_records
+            )
+            if supp_dim is None:
+                assert dim_query_name in base_dim_query_names, dim_query_name
+                _make_new_supplemental_dimension(
+                    dim, unique_data_records, new_supplemental_dims_path
+                )
+                num_new_supplemental_dimensions += 1
+            else:
+                dimension_references.append(_get_dimension_reference(supp_dim, project.config))
+                dimension_mapping_references.append(
+                    _get_supplemental_dimension_mapping_reference(
+                        supp_dim, project.config, metadata
+                    )
+                )
+
+    if dimension_mapping_references:
+        _make_dimension_mapping_references_file(dimension_mapping_references, dst_path)
+
+    _make_dataset_config(
+        query.project.dataset.dataset_id,
+        value_format_dict,
+        dimension_references,
+        dst_path,
+        num_new_supplemental_dimensions,
+    )
+    return True
+
+
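# A minimal usage sketch for the entry point above. The call signature comes
# from this file; the paths are hypothetical, and obtaining a connected
# RegistryManager is left out because its construction is not shown here.

from pathlib import Path

from dsgrid.query.derived_dataset import create_derived_dataset_config_from_query
from dsgrid.registry.registry_manager import RegistryManager


def make_derived_dataset_config(manager: RegistryManager) -> None:
    ok = create_derived_dataset_config_from_query(
        Path("query_output/my_query"),  # directory produced by a dsgrid query run
        Path("my_derived_dataset"),     # destination for the generated config files
        manager,
    )
    if not ok:
        raise RuntimeError("The query result cannot support a derived dataset.")
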
+def does_query_support_a_derived_dataset(query: ProjectQueryModel):
+    """Return True if a derived dataset can be created from a query.
+
+    Returns
+    -------
+    bool
+    """
+    is_valid = True
+    if query.result.column_type != ColumnType.DIMENSION_TYPES:
+        is_valid = False
+        logger.error(
+            "Cannot create a derived dataset from a query with column_type = %s. It must be %s.",
+            query.result.column_type.value,
+            ColumnType.DIMENSION_TYPES.value,
+        )
+    if query.result.replace_ids_with_names:
+        is_valid = False
+        logger.error("Cannot create a derived dataset from a query with replace_ids_with_names")
+
+    return is_valid
+
+
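# The two checks above correspond to these settings in the saved query file,
# sketched as a json5 fragment (field names from ProjectQueryModel; the
# serialized enum value "dimension_types" is an assumption):
#
#   result: {
#     column_type: "dimension_types",
#     replace_ids_with_names: false,
#   }
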
+def _does_time_dimension_match(dim_config: TimeDimensionBaseConfig, df: DataFrame, df_path: Path):
+    try:
+        if dim_config.supports_chronify():
+            _check_time_dimension_with_chronify(dim_config, df, df_path)
+        else:
+            dim_config.check_dataset_time_consistency(df, dim_config.get_load_data_time_columns())
+    except DSGInvalidDataset:
+        return False
+    return True
+
+
+def _check_time_dimension_with_chronify(
+    dim_config: TimeDimensionBaseConfig, df: DataFrame, df_path: Path
+):
+    scratch_dir = DsgridRuntimeConfig.load().get_scratch_dir()
+    with ScratchDirContext(scratch_dir) as scratch_dir_context:
+        time_cols = dim_config.get_load_data_time_columns()
+        time_array_id_columns = [
+            x
+            for x in df.columns
+            # If there are multiple weather years:
+            # - that are continuous, weather year needs to be excluded (one overall range).
+            # - that are not continuous, weather year needs to be included and chronify
+            #   needs additional support. TODO: issue #340
+            if x != DimensionType.WEATHER_YEAR.value
+            and x in set(df.columns).difference(time_cols).difference({VALUE_COLUMN})
+        ]
+        schema = chronify.TableSchema(
+            name=make_temp_view_name(),
+            time_config=dim_config.to_chronify(),
+            time_array_id_columns=time_array_id_columns,
+            value_column=VALUE_COLUMN,
+        )
+        store_file = scratch_dir_context.get_temp_filename(suffix=".db")
+        with create_store(store_file) as store:
+            # This performs all of the checks.
+            store.create_view_from_parquet(df_path, schema)
+            store.drop_view(schema.name)
+
+
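# A worked example of the time_array_id_columns filter above, assuming a table
# with these columns ("value" is dsgrid's VALUE_COLUMN; the others are
# hypothetical):
#
#   df.columns = ["timestamp", "geography", "weather_year", "value"]
#   time_cols  = ["timestamp"]
#
# The comprehension removes the time columns, the value column, and
# weather_year, leaving ["geography"] as the time-array id columns for the
# chronify schema.
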
+def _is_dimension_valid_for_dataset(
+    dim_config: DimensionBaseConfigWithFiles, unique_data_records: set[str]
+):
+    records = dim_config.get_records_dataframe()
+    dim_values = get_unique_values(records, "id")
+    diff = dim_values.symmetric_difference(unique_data_records)
+    if not diff:
+        return True
+
+    return False
+
+
+def _get_matching_supplemental_dimension(
+    project_config: ProjectConfig,
+    dimension_type: DimensionType,
+    unique_data_records: set[str],
+) -> DimensionBaseConfigWithFiles | None:
+    for dim_config in project_config.list_supplemental_dimensions(dimension_type):
+        if _is_dimension_valid_for_dataset(dim_config, unique_data_records):
+            return dim_config
+
+    return None
+
+
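# The check above demands an exact two-way match between the dimension's
# record ids and the ids observed in the data. A small sketch with
# hypothetical sector ids:

dim_values = {"com", "res", "ind"}  # ids defined by the dimension records
data_values = {"com", "res"}        # ids present in the dataset
# "ind" appears only on the dimension side, so the symmetric difference is
# non-empty and the dimension is not an exact match for the dataset.
assert dim_values.symmetric_difference(data_values) == {"ind"}
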
+def _make_dataset_config(
+    dataset_id,
+    value_format_dict: dict[str, str],
+    dimension_references,
+    path: Path,
+    num_new_supplemental_dimensions,
+    data_classification=DataClassificationType.MODERATE.value,
+):
+    # Use dictionaries instead of DatasetConfigModel to avoid validation, which isn't possible
+    # here.
+    config = {
+        "dataset_id": dataset_id,
+        "dataset_type": InputDatasetType.MODELED.value,
+        "data_layout": {
+            "table_format": TableFormat.ONE_TABLE.value,
+            "data_file": {
+                "path": "load_data.parquet",
+            },
+            **value_format_dict,
+        },
+        "version": "1.0.0",
+        "description": "",
+        "origin_creator": "",
+        "origin_organization": "",
+        "origin_date": "",
+        "origin_project": "",
+        "origin_version": "",
+        "data_source": "",
+        "source": "",
+        "data_classification": data_classification,
+        "use_project_geography_time_zone": True,
+        "dimensions": [],
+        "dimension_references": dimension_references,
+    }
+    config_file = path / DatasetConfig.config_filename()
+    config_file.write_text(json5.dumps(config, indent=2))
+    if num_new_supplemental_dimensions > 0:
+        logger.info(
+            "Generated %s new supplemental dimensions. Review the records and fill out "
+            "the remaining fields, and then register them.",
+            num_new_supplemental_dimensions,
+        )
+    logger.info(
+        "Created %s with default information. Re-used %s project dimensions. "
+        "Examine %s, fill out the remaining fields, and register any new dimensions "
+        "before registering and submitting the dataset to the project.",
+        path,
+        len(dimension_references),
+        config_file,
+    )
+
+
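# Sketch of the json5 file written by _make_dataset_config (field names are
# exactly those in the config dict; the dataset id and the rendered enum
# values are assumptions):
#
#   {
#     dataset_id: "my_derived_dataset",
#     dataset_type: "modeled",        // InputDatasetType.MODELED.value
#     data_layout: {
#       table_format: "one_table",    // TableFormat.ONE_TABLE.value
#       data_file: {path: "load_data.parquet"},
#       // plus the value_format entries from value_format_dict
#     },
#     version: "1.0.0",
#     // description and origin_* fields are empty strings for the user to fill in
#     dimension_references: [ /* serialized dimension references */ ],
#   }
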
+def _make_new_supplemental_dimension(orig_dim_config, unique_data_records, path: Path):
+    project_record_ids = orig_dim_config.get_unique_ids()
+    if not unique_data_records.issubset(project_record_ids):
+        diff = project_record_ids.difference(unique_data_records)
+        if diff:
+            msg = (
+                f"The derived dataset records do not include some project base dimension "
+                f"records. Dimension type = {orig_dim_config.model.dimension_type} {diff=}"
+            )
+            raise DSGInvalidDataset(msg)
+        assert unique_data_records.issuperset(project_record_ids)
+        diff = unique_data_records.difference(project_record_ids)
+        msg = (
+            f"The derived dataset records are a superset of the project base dimension "
+            f"records. Dimension type = {orig_dim_config.model.dimension_type} {diff=}"
+        )
+        raise DSGInvalidDataset(msg)
+
+    new_dim_path = path / orig_dim_config.model.dimension_type.value
+    new_dim_path.mkdir(parents=True)
+    orig_records = orig_dim_config.get_records_dataframe()
+    records = orig_records.filter(orig_records.id.isin(unique_data_records))
+    # TODO: AWS #186 - not an issue if registry is in a database instead of files
+    filename = new_dim_path / "records.csv"
+    # Use pandas because spark creates a directory.
+    records.toPandas().to_csv(filename, index=False)
+    # Use dictionaries instead of DimensionModel to avoid running the Pydantic validators.
+    # Some won't work, like loading the records. Others, like file_hash, shouldn't get set yet.
+    new_dim = {
+        "type": orig_dim_config.model.dimension_type.value,
+        "name": "",
+        "module": orig_dim_config.model.module,
+        "class_name": orig_dim_config.model.class_name,
+        "description": "",
+        "filename": filename.name,
+    }
+    dump_data(new_dim, new_dim_path / DimensionConfig.config_filename(), indent=2)
+    logger.warning(
+        "The derived dataset does not match any project dimension for dimension "
+        "type %s. Consider creating a new supplemental dimension out of the files in %s",
+        orig_dim_config.model.dimension_type.value,
+        new_dim_path,
+    )
+
+
+def _make_dimension_mapping_references_file(dimension_mapping_references, path: Path):
+    dim_mapping_ref_filename = path / "dimension_mapping_references.json5"
+    dim_mapping_ref_filename.write_text(
+        json5.dumps({"references": dimension_mapping_references}, indent=2)
+    )
+    logger.info(
+        "Wrote dimension mapping references file %s with %s references. "
+        "Specify that file when submitting the dataset to the project.",
+        dim_mapping_ref_filename,
+        len(dimension_mapping_references),
+    )
+
+
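# Example contents of dimension_mapping_references.json5 (ids and version are
# hypothetical; the keys match the dict built by
# _get_supplemental_dimension_mapping_reference at the end of this file):
#
#   {
#     references: [
#       {
#         mapping_id: "county_to_state__abc123",
#         from_dimension_type: "geography",
#         to_dimension_type: "geography",
#         version: "1.0.0",
#       },
#     ],
#   }
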
+def _get_unique_data_records(df, dim_model: DimensionModel, column_type: ColumnType):
+    match column_type:
+        case ColumnType.DIMENSION_NAMES:
+            column = dim_model.name
+        case ColumnType.DIMENSION_TYPES:
+            column = dim_model.dimension_type.value
+        case _:
+            msg = f"BUG: unhandled: {column_type=}"
+            raise NotImplementedError(msg)
+
+    return get_unique_values(df, column)
+
+
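# Column resolution in _get_unique_data_records: with
# ColumnType.DIMENSION_TYPES, a geography dimension's ids are read from a
# column literally named "geography" (the dimension type's value); with
# ColumnType.DIMENSION_NAMES, they are read from a column named after the
# dimension itself, e.g. a hypothetical dimension named "county".
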
+def _get_dimension_reference(dim: DimensionBaseConfigWithFiles, project_config: ProjectConfig):
+    dim_ref = project_config.get_dimension_reference(dim.model.dimension_id)
+    return dim_ref.serialize()
+
+
+def _get_supplemental_dimension_mapping_reference(
+    supp_dim: DimensionBaseConfigWithFiles,
+    project_config: ProjectConfig,
+    metadata: DatasetMetadataModel,
+):
+    base_dim_name = getattr(metadata.base_dimension_names, supp_dim.model.dimension_type.value)
+    base_dim = project_config.get_dimension_with_records(base_dim_name)
+    mapping_config = project_config.get_base_to_supplemental_config(base_dim, supp_dim)
+    # Use dictionaries to avoid validation and be consistent with the dimension definition.
+    return {
+        "mapping_id": mapping_config.model.mapping_id,
+        "from_dimension_type": base_dim.model.dimension_type.value,
+        "to_dimension_type": supp_dim.model.dimension_type.value,
+        "version": str(mapping_config.model.version),
+    }
+ }