dsgrid-toolkit 0.2.0 (dsgrid_toolkit-0.2.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dsgrid-toolkit might be problematic.
Files changed (152)
  1. dsgrid/__init__.py +22 -0
  2. dsgrid/api/__init__.py +0 -0
  3. dsgrid/api/api_manager.py +179 -0
  4. dsgrid/api/app.py +420 -0
  5. dsgrid/api/models.py +60 -0
  6. dsgrid/api/response_models.py +116 -0
  7. dsgrid/apps/__init__.py +0 -0
  8. dsgrid/apps/project_viewer/app.py +216 -0
  9. dsgrid/apps/registration_gui.py +444 -0
  10. dsgrid/chronify.py +22 -0
  11. dsgrid/cli/__init__.py +0 -0
  12. dsgrid/cli/common.py +120 -0
  13. dsgrid/cli/config.py +177 -0
  14. dsgrid/cli/download.py +13 -0
  15. dsgrid/cli/dsgrid.py +142 -0
  16. dsgrid/cli/dsgrid_admin.py +349 -0
  17. dsgrid/cli/install_notebooks.py +62 -0
  18. dsgrid/cli/query.py +711 -0
  19. dsgrid/cli/registry.py +1773 -0
  20. dsgrid/cloud/__init__.py +0 -0
  21. dsgrid/cloud/cloud_storage_interface.py +140 -0
  22. dsgrid/cloud/factory.py +31 -0
  23. dsgrid/cloud/fake_storage_interface.py +37 -0
  24. dsgrid/cloud/s3_storage_interface.py +156 -0
  25. dsgrid/common.py +35 -0
  26. dsgrid/config/__init__.py +0 -0
  27. dsgrid/config/annual_time_dimension_config.py +187 -0
  28. dsgrid/config/common.py +131 -0
  29. dsgrid/config/config_base.py +148 -0
  30. dsgrid/config/dataset_config.py +684 -0
  31. dsgrid/config/dataset_schema_handler_factory.py +41 -0
  32. dsgrid/config/date_time_dimension_config.py +108 -0
  33. dsgrid/config/dimension_config.py +54 -0
  34. dsgrid/config/dimension_config_factory.py +65 -0
  35. dsgrid/config/dimension_mapping_base.py +349 -0
  36. dsgrid/config/dimension_mappings_config.py +48 -0
  37. dsgrid/config/dimensions.py +775 -0
  38. dsgrid/config/dimensions_config.py +71 -0
  39. dsgrid/config/index_time_dimension_config.py +76 -0
  40. dsgrid/config/input_dataset_requirements.py +31 -0
  41. dsgrid/config/mapping_tables.py +209 -0
  42. dsgrid/config/noop_time_dimension_config.py +42 -0
  43. dsgrid/config/project_config.py +1457 -0
  44. dsgrid/config/registration_models.py +199 -0
  45. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  46. dsgrid/config/simple_models.py +49 -0
  47. dsgrid/config/supplemental_dimension.py +29 -0
  48. dsgrid/config/time_dimension_base_config.py +200 -0
  49. dsgrid/data_models.py +155 -0
  50. dsgrid/dataset/__init__.py +0 -0
  51. dsgrid/dataset/dataset.py +123 -0
  52. dsgrid/dataset/dataset_expression_handler.py +86 -0
  53. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  54. dsgrid/dataset/dataset_schema_handler_base.py +899 -0
  55. dsgrid/dataset/dataset_schema_handler_one_table.py +196 -0
  56. dsgrid/dataset/dataset_schema_handler_standard.py +303 -0
  57. dsgrid/dataset/growth_rates.py +162 -0
  58. dsgrid/dataset/models.py +44 -0
  59. dsgrid/dataset/table_format_handler_base.py +257 -0
  60. dsgrid/dataset/table_format_handler_factory.py +17 -0
  61. dsgrid/dataset/unpivoted_table.py +121 -0
  62. dsgrid/dimension/__init__.py +0 -0
  63. dsgrid/dimension/base_models.py +218 -0
  64. dsgrid/dimension/dimension_filters.py +308 -0
  65. dsgrid/dimension/standard.py +213 -0
  66. dsgrid/dimension/time.py +531 -0
  67. dsgrid/dimension/time_utils.py +88 -0
  68. dsgrid/dsgrid_rc.py +88 -0
  69. dsgrid/exceptions.py +105 -0
  70. dsgrid/filesystem/__init__.py +0 -0
  71. dsgrid/filesystem/cloud_filesystem.py +32 -0
  72. dsgrid/filesystem/factory.py +32 -0
  73. dsgrid/filesystem/filesystem_interface.py +136 -0
  74. dsgrid/filesystem/local_filesystem.py +74 -0
  75. dsgrid/filesystem/s3_filesystem.py +118 -0
  76. dsgrid/loggers.py +132 -0
  77. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +950 -0
  78. dsgrid/notebooks/registration.ipynb +48 -0
  79. dsgrid/notebooks/start_notebook.sh +11 -0
  80. dsgrid/project.py +451 -0
  81. dsgrid/query/__init__.py +0 -0
  82. dsgrid/query/dataset_mapping_plan.py +142 -0
  83. dsgrid/query/derived_dataset.py +384 -0
  84. dsgrid/query/models.py +726 -0
  85. dsgrid/query/query_context.py +287 -0
  86. dsgrid/query/query_submitter.py +847 -0
  87. dsgrid/query/report_factory.py +19 -0
  88. dsgrid/query/report_peak_load.py +70 -0
  89. dsgrid/query/reports_base.py +20 -0
  90. dsgrid/registry/__init__.py +0 -0
  91. dsgrid/registry/bulk_register.py +161 -0
  92. dsgrid/registry/common.py +287 -0
  93. dsgrid/registry/config_update_checker_base.py +63 -0
  94. dsgrid/registry/data_store_factory.py +34 -0
  95. dsgrid/registry/data_store_interface.py +69 -0
  96. dsgrid/registry/dataset_config_generator.py +156 -0
  97. dsgrid/registry/dataset_registry_manager.py +734 -0
  98. dsgrid/registry/dataset_update_checker.py +16 -0
  99. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  100. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  101. dsgrid/registry/dimension_registry_manager.py +413 -0
  102. dsgrid/registry/dimension_update_checker.py +16 -0
  103. dsgrid/registry/duckdb_data_store.py +185 -0
  104. dsgrid/registry/filesystem_data_store.py +141 -0
  105. dsgrid/registry/filter_registry_manager.py +123 -0
  106. dsgrid/registry/project_config_generator.py +57 -0
  107. dsgrid/registry/project_registry_manager.py +1616 -0
  108. dsgrid/registry/project_update_checker.py +48 -0
  109. dsgrid/registry/registration_context.py +223 -0
  110. dsgrid/registry/registry_auto_updater.py +316 -0
  111. dsgrid/registry/registry_database.py +662 -0
  112. dsgrid/registry/registry_interface.py +446 -0
  113. dsgrid/registry/registry_manager.py +544 -0
  114. dsgrid/registry/registry_manager_base.py +367 -0
  115. dsgrid/registry/versioning.py +92 -0
  116. dsgrid/spark/__init__.py +0 -0
  117. dsgrid/spark/functions.py +545 -0
  118. dsgrid/spark/types.py +50 -0
  119. dsgrid/tests/__init__.py +0 -0
  120. dsgrid/tests/common.py +139 -0
  121. dsgrid/tests/make_us_data_registry.py +204 -0
  122. dsgrid/tests/register_derived_datasets.py +103 -0
  123. dsgrid/tests/utils.py +25 -0
  124. dsgrid/time/__init__.py +0 -0
  125. dsgrid/time/time_conversions.py +80 -0
  126. dsgrid/time/types.py +67 -0
  127. dsgrid/units/__init__.py +0 -0
  128. dsgrid/units/constants.py +113 -0
  129. dsgrid/units/convert.py +71 -0
  130. dsgrid/units/energy.py +145 -0
  131. dsgrid/units/power.py +87 -0
  132. dsgrid/utils/__init__.py +0 -0
  133. dsgrid/utils/dataset.py +612 -0
  134. dsgrid/utils/files.py +179 -0
  135. dsgrid/utils/filters.py +125 -0
  136. dsgrid/utils/id_remappings.py +100 -0
  137. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  138. dsgrid/utils/py_expression_eval/README.md +8 -0
  139. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  140. dsgrid/utils/py_expression_eval/tests.py +283 -0
  141. dsgrid/utils/run_command.py +70 -0
  142. dsgrid/utils/scratch_dir_context.py +64 -0
  143. dsgrid/utils/spark.py +918 -0
  144. dsgrid/utils/spark_partition.py +98 -0
  145. dsgrid/utils/timing.py +239 -0
  146. dsgrid/utils/utilities.py +184 -0
  147. dsgrid/utils/versioning.py +36 -0
  148. dsgrid_toolkit-0.2.0.dist-info/METADATA +216 -0
  149. dsgrid_toolkit-0.2.0.dist-info/RECORD +152 -0
  150. dsgrid_toolkit-0.2.0.dist-info/WHEEL +4 -0
  151. dsgrid_toolkit-0.2.0.dist-info/entry_points.txt +4 -0
  152. dsgrid_toolkit-0.2.0.dist-info/licenses/LICENSE +29 -0
dsgrid/registry/project_registry_manager.py @@ -0,0 +1,1616 @@
+ """Manages the registry for dimension projects"""
+
+ import logging
+ import os
+ import tempfile
+ from collections import defaultdict
+ from pathlib import Path
+ from tempfile import TemporaryDirectory
+ from typing import Any, Type, Union
+
+ from dsgrid.utils.dataset import handle_dimension_association_errors
+ import json5
+ import pandas as pd
+ from prettytable import PrettyTable
+ from sqlalchemy import Connection
+
+ from dsgrid.config.dimension_config import (
+     DimensionBaseConfig,
+     DimensionBaseConfigWithFiles,
+ )
+ from dsgrid.dimension.base_models import DimensionType
+ from dsgrid.exceptions import (
+     DSGInvalidDataset,
+     DSGInvalidDimension,
+     DSGInvalidDimensionMapping,
+     DSGValueNotRegistered,
+     DSGDuplicateValueRegistered,
+     DSGInvalidParameter,
+ )
+ from dsgrid.config.dataset_schema_handler_factory import make_dataset_schema_handler
+ from dsgrid.config.dataset_config import DatasetConfig
+ from dsgrid.config.dimensions import DimensionModel
+ from dsgrid.config.dimensions_config import DimensionsConfig, DimensionsConfigModel
+ from dsgrid.config.dimension_mapping_base import (
+     DimensionReferenceModel,
+     DimensionMappingReferenceModel,
+     DimensionMappingReferenceListModel,
+     DimensionMappingType,
+ )
+ from dsgrid.config.dimension_mappings_config import (
+     DimensionMappingsConfig,
+     DimensionMappingsConfigModel,
+ )
+ from dsgrid.config.supplemental_dimension import (
+     SupplementalDimensionModel,
+     SupplementalDimensionsListModel,
+ )
+ from dsgrid.config.input_dataset_requirements import (
+     InputDatasetDimensionRequirementsListModel,
+     InputDatasetListModel,
+ )
+ from dsgrid.config.mapping_tables import (
+     MappingTableModel,
+     MappingTableByNameModel,
+     DatasetBaseToProjectMappingTableListModel,
+ )
+ from dsgrid.config.project_config import (
+     DatasetBaseDimensionNamesModel,
+     ProjectConfig,
+     ProjectConfigModel,
+     RequiredBaseDimensionModel,
+     RequiredDimensionRecordsByTypeModel,
+     RequiredDimensionRecordsModel,
+     SubsetDimensionGroupModel,
+     SubsetDimensionGroupListModel,
+ )
+ from dsgrid.project import Project
+ from dsgrid.registry.common import (
+     ConfigKey,
+     DatasetRegistryStatus,
+     ProjectRegistryStatus,
+     RegistryManagerParams,
+ )
+ from dsgrid.spark.functions import (
+     cache,
+     except_all,
+     is_dataframe_empty,
+     unpersist,
+ )
+ from dsgrid.spark.types import (
+     DataFrame,
+     F,
+     use_duckdb,
+ )
+ from dsgrid.utils.timing import track_timing, timer_stats_collector
+ from dsgrid.utils.files import load_data, in_other_dir
+ from dsgrid.utils.filters import transform_and_validate_filters, matches_filters
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
+ from dsgrid.utils.spark import (
+     models_to_dataframe,
+     get_unique_values,
+     persist_intermediate_table,
+     read_dataframe,
+ )
+ from dsgrid.utils.utilities import check_uniqueness, display_table
+ from dsgrid.registry.registry_interface import ProjectRegistryInterface
+ from .common import (
+     VersionUpdateType,
+     RegistryType,
+ )
+ from .registration_context import RegistrationContext
+ from .project_update_checker import ProjectUpdateChecker
+ from .dataset_registry_manager import DatasetRegistryManager
+ from .dimension_mapping_registry_manager import DimensionMappingRegistryManager
+ from .dimension_registry_manager import DimensionRegistryManager
+ from .registry_manager_base import RegistryManagerBase
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ProjectRegistryManager(RegistryManagerBase):
+     """Manages registered dimension projects."""
+
+     def __init__(
+         self,
+         path: Path,
+         params,
+         dataset_manager: DatasetRegistryManager,
+         dimension_manager: DimensionRegistryManager,
+         dimension_mapping_manager: DimensionMappingRegistryManager,
+         db: ProjectRegistryInterface,
+     ):
+         super().__init__(path, params)
+         self._projects: dict[ConfigKey, ProjectConfig] = {}
+         self._dataset_mgr = dataset_manager
+         self._dimension_mgr = dimension_manager
+         self._dimension_mapping_mgr = dimension_mapping_manager
+         self._db = db
+
+     @classmethod
+     def load(
+         cls,
+         path: Path,
+         params: RegistryManagerParams,
+         dataset_manager: DatasetRegistryManager,
+         dimension_manager: DimensionRegistryManager,
+         dimension_mapping_manager: DimensionMappingRegistryManager,
+         db: ProjectRegistryInterface,
+     ):
+         return cls._load(
+             path,
+             params,
+             dataset_manager,
+             dimension_manager,
+             dimension_mapping_manager,
+             db,
+         )
+
+     @staticmethod
+     def config_class() -> Type:
+         return ProjectConfig
+
+     @property
+     def db(self) -> ProjectRegistryInterface:
+         return self._db
+
+     @db.setter
+     def db(self, db: ProjectRegistryInterface):
+         self._db = db
+
+     @staticmethod
+     def name() -> str:
+         return "Projects"
+
+     @property
+     def dataset_manager(self) -> DatasetRegistryManager:
+         return self._dataset_mgr
+
+     @property
+     def dimension_manager(self) -> DimensionRegistryManager:
+         return self._dimension_mgr
+
+     @property
+     def dimension_mapping_manager(self) -> DimensionMappingRegistryManager:
+         return self._dimension_mapping_mgr
+
+     def get_by_id(
+         self,
+         project_id: str,
+         version: str | None = None,
+         conn: Connection | None = None,
+     ) -> ProjectConfig:
+         if version is None:
+             assert self._db is not None
+             version = self._db.get_latest_version(conn, project_id)
+
+         key = ConfigKey(project_id, version)
+         project = self._projects.get(key)
+         if project is not None:
+             return project
+
+         if version is None:
+             model = self.db.get_latest(conn, project_id)
+         else:
+             model = self.db.get_by_version(conn, project_id, version)
+
+         assert isinstance(model, ProjectConfigModel)
+         config = ProjectConfig(model)
+         self._update_dimensions_and_mappings(conn, config)
+         self._projects[key] = config
+         return config
+
+     def _update_dimensions_and_mappings(self, conn: Connection | None, config: ProjectConfig):
+         base_dimensions = self._dimension_mgr.load_dimensions(
+             config.model.dimensions.base_dimension_references, conn=conn
+         )
+         supplemental_dimensions = self._dimension_mgr.load_dimensions(
+             config.model.dimensions.supplemental_dimension_references, conn=conn
+         )
+         base_to_supp_mappings = self._dimension_mapping_mgr.load_dimension_mappings(
+             config.model.dimension_mappings.base_to_supplemental_references, conn=conn
+         )
+         subset_dimensions = self._get_subset_dimensions(conn, config)
+         config.set_dimensions(base_dimensions, subset_dimensions, supplemental_dimensions)
+         config.set_dimension_mappings(base_to_supp_mappings)
+
+     def _get_subset_dimensions(self, conn: Connection | None, config: ProjectConfig):
+         subset_dimensions: dict[DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfig]]] = (
+             defaultdict(dict)
+         )
+         for subset_dim in config.model.dimensions.subset_dimensions:
+             selectors = {
+                 ConfigKey(x.dimension_id, x.version): self._dimension_mgr.get_by_id(
+                     x.dimension_id, version=x.version, conn=conn
+                 )
+                 for x in subset_dim.selector_references
+             }
+             subset_dimensions[subset_dim.dimension_type][subset_dim.name] = selectors
+         return subset_dimensions
+
+     def load_project(
+         self,
+         project_id: str,
+         version: str | None = None,
+         conn: Connection | None = None,
+     ) -> Project:
+         """Load a project from the registry.
+
+         Parameters
+         ----------
+         project_id : str
+         version : str | None
+             Defaults to the latest registered version.
+
+         Returns
+         -------
+         Project
+         """
+         if conn is None:
+             with self.db.engine.connect() as conn:
+                 return self._load_project(conn, project_id, version=version)
+         else:
+             return self._load_project(conn, project_id, version=version)
+
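+     # Illustrative usage sketch (editor's addition, not part of the package diff).
+     # It assumes a RegistryManager already connected to a registry; the project ID
+     # "my_project" is hypothetical.
+     #
+     #     from dsgrid.registry.registry_manager import RegistryManager
+     #
+     #     manager = RegistryManager.load(...)  # connection details omitted
+     #     project = manager.project_manager.load_project("my_project")
+     #     # Or pin a specific registered version instead of the latest:
+     #     project_v1 = manager.project_manager.load_project("my_project", version="1.0.0")
+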
+     def _load_project(self, conn: Connection, project_id: str, version=None) -> Project:
+         dataset_manager = self._dataset_mgr
+         config = self.get_by_id(project_id, version=version, conn=conn)
+
+         dataset_configs = {}
+         for dataset_id in config.list_registered_dataset_ids():
+             dataset_config = dataset_manager.get_by_id(dataset_id, conn=conn)
+             dataset_configs[dataset_id] = dataset_config
+
+         return Project(
+             config,
+             config.model.version,
+             dataset_configs,
+             self._dimension_mgr,
+             self._dimension_mapping_mgr,
+             self._dataset_mgr,
+         )
+
+     def register(
+         self,
+         config_file: Path,
+         submitter: str,
+         log_message: str,
+     ) -> None:
+         """Register a project from a config file."""
+         with RegistrationContext(
+             self.db, log_message, VersionUpdateType.MAJOR, submitter
+         ) as context:
+             config = ProjectConfig.load(config_file)
+             src_dir = config_file.parent
+             self.register_from_config(config, src_dir, context)
+
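+     # Illustrative usage sketch (editor's addition; the config path and messages
+     # are hypothetical):
+     #
+     #     manager.project_manager.register(
+     #         Path("project/project.json5"),
+     #         submitter="username",
+     #         log_message="Register my_project",
+     #     )
+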
+     def register_from_config(
+         self,
+         config: ProjectConfig,
+         src_dir: Path,
+         context: RegistrationContext,
+     ):
+         """Register a project from an existing config."""
+         self._register_project_and_dimensions(
+             config,
+             src_dir,
+             context,
+         )
+
+     def _register_project_and_dimensions(
+         self,
+         config: ProjectConfig,
+         src_dir: Path,
+         context: RegistrationContext,
+     ):
+         model = config.model
+         logger.info("Start registration of project %s", model.project_id)
+         self._check_if_already_registered(context.connection, model.project_id)
+         if model.dimensions.base_dimensions:
+             logger.info("Register base dimensions")
+             for ref in self._register_dimensions_from_models(
+                 model.dimensions.base_dimensions,
+                 context,
+             ):
+                 model.dimensions.base_dimension_references.append(ref)
+             model.dimensions.base_dimensions.clear()
+         if model.dimensions.subset_dimensions:
+             self._register_subset_dimensions(
+                 model,
+                 model.dimensions.subset_dimensions,
+                 context,
+             )
+         if model.dimensions.supplemental_dimensions:
+             logger.info("Register supplemental dimensions")
+             self._register_supplemental_dimensions_from_models(
+                 src_dir,
+                 model,
+                 model.dimensions.supplemental_dimensions,
+                 context,
+             )
+             model.dimensions.supplemental_dimensions.clear()
+         logger.info("Register all-in-one supplemental dimensions")
+         self._register_all_in_one_dimensions(
+             src_dir,
+             model,
+             context,
+         )
+
+         self._update_dimensions_and_mappings(context.connection, config)
+         for subset_dimension in model.dimensions.subset_dimensions:
+             subset_dimension.selectors.clear()
+         self._register(config, context)
+         context.add_id(RegistryType.PROJECT, config.model.project_id, self)
+
+     def _register_dimensions_from_models(
+         self,
+         dimensions: list,
+         context: RegistrationContext,
+     ):
+         dim_model = DimensionsConfigModel(dimensions=dimensions)
+         dims_config = DimensionsConfig.load_from_model(dim_model)
+         dimension_ids = self._dimension_mgr.register_from_config(dims_config, context)
+         return self._dimension_mgr.make_dimension_references(context.connection, dimension_ids)
+
+     def _register_supplemental_dimensions_from_models(
+         self,
+         src_dir: Path,
+         model: ProjectConfigModel,
+         dimensions: list,
+         context: RegistrationContext,
+     ):
+         """Registers supplemental dimensions and creates base-to-supplemental mappings for those
+         new dimensions.
+         """
+         dims = []
+         for x in dimensions:
+             data = x.serialize()
+             data.pop("mapping", None)
+             dims.append(DimensionModel(**data))
+
+         refs = self._register_dimensions_from_models(dims, context)
+
+         model.dimensions.supplemental_dimension_references += refs
+         self._register_base_to_supplemental_mappings(
+             src_dir,
+             model,
+             dimensions,
+             refs,
+             context,
+         )
+
+     def _register_base_to_supplemental_mappings(
+         self,
+         src_dir: Path,
+         model: ProjectConfigModel,
+         dimensions: list[SupplementalDimensionModel],
+         dimension_references: list[DimensionReferenceModel],
+         context: RegistrationContext,
+     ):
+         conn = context.connection
+         base_dim_mapping = defaultdict(list)
+         base_dim_refs: dict[str, DimensionReferenceModel] = {}
+         for ref in model.dimensions.base_dimension_references:
+             dim = self._dimension_mgr.get_by_id(
+                 ref.dimension_id, version=ref.version, conn=context.connection
+             )
+             base_dim_mapping[ref.dimension_type].append(dim)
+             base_dim_refs[dim.model.dimension_id] = ref
+
+         mappings = []
+         if len(dimensions) != len(dimension_references):
+             msg = f"Bug: mismatch in sizes: {dimensions=} {dimension_references=}"
+             raise Exception(msg)
+
+         for dim, ref in zip(dimensions, dimension_references):
+             base_dim: DimensionBaseConfig | None = None
+             if dim.mapping.project_base_dimension_name is None:
+                 base_dims = base_dim_mapping[ref.dimension_type]
+                 if len(base_dims) > 1:
+                     msg = (
+                         "If there are multiple base dimensions for a dimension type, each "
+                         "supplemental dimension mapping must supply a project_base_dimension_name. "
+                         f"{dim.label}"
+                     )
+                     raise DSGInvalidDimensionMapping(msg)
+                 base_dim = base_dims[0]
+             else:
+                 for base_dim_ in base_dim_mapping[dim.dimension_type]:
+                     if base_dim_.model.name == dim.mapping.project_base_dimension_name:
+                         if base_dim is not None:
+                             msg = (
+                                 "A supplemental dimension can only be mapped to one base dimension:"
+                                 f" supplemental dimension = {dim.label} "
+                                 f"base dimensions = {base_dim.model.label} and "
+                                 f"{base_dim_.model.label}"
+                             )
+                             raise DSGInvalidDimensionMapping(msg)
+                         base_dim = base_dim_
+             if base_dim is None:
+                 msg = f"Bug: unable to find base dimension for {dim.mapping.project_base_dimension_name}"
+                 raise Exception(msg)
+             with in_other_dir(src_dir):
+                 assert base_dim is not None
+                 mapping_model = MappingTableModel.from_pre_registered_model(
+                     dim.mapping,
+                     base_dim_refs[base_dim.model.dimension_id],
+                     ref,
+                 )
+             mappings.append(mapping_model)
+
+         mapping_config = DimensionMappingsConfig.load_from_model(
+             DimensionMappingsConfigModel(mappings=mappings),
+         )
+         mapping_ids = self._dimension_mapping_mgr.register_from_config(mapping_config, context)
+         model.dimension_mappings.base_to_supplemental_references += (
+             self._dimension_mapping_mgr.make_dimension_mapping_references(mapping_ids, conn=conn)
+         )
+
+     def _register_subset_dimensions(
+         self,
+         model: ProjectConfigModel,
+         subset_dimensions: list[SubsetDimensionGroupModel],
+         context: RegistrationContext,
+     ):
+         logger.info("Register subset dimensions")
+         self._register_dimensions_from_subset_dimension_groups(
+             subset_dimensions,
+             model.dimensions.base_dimension_references,
+             context,
+         )
+         self._register_supplemental_dimensions_from_subset_dimensions(
+             model,
+             subset_dimensions,
+             context,
+         )
+
+     def _register_dimensions_from_subset_dimension_groups(
+         self,
+         subset_dimensions: list[SubsetDimensionGroupModel],
+         base_dimension_references: list[DimensionReferenceModel],
+         context: RegistrationContext,
+     ):
+         """Registers a dimension for each subset specified in the project config's subset
+         dimension groups. Appends references to those dimensions to subset_dimensions, which is
+         part of the project config.
+         """
+         conn = context.connection
+         with TemporaryDirectory() as tmpdir:
+             tmp_path = Path(tmpdir)
+             dimensions = []
+             subset_refs = {}
+             for subset_dimension in subset_dimensions:
+                 base_dim = None
+                 for ref in base_dimension_references:
+                     if ref.dimension_type == subset_dimension.dimension_type:
+                         base_dim = self._dimension_mgr.get_by_id(ref.dimension_id, conn=conn)
+                         break
+                 assert isinstance(base_dim, DimensionBaseConfigWithFiles), subset_dimension
+                 base_records = base_dim.get_records_dataframe()
+                 self._check_subset_dimension_consistency(subset_dimension, base_records)
+                 for selector in subset_dimension.selectors:
+                     new_records = base_records.filter(base_records["id"].isin(selector.records))
+                     filename = tmp_path / f"{subset_dimension.name}_{selector.name}.csv"
+                     new_records.toPandas().to_csv(filename, index=False)
+                     dim = DimensionModel(
+                         file=str(filename),
+                         name=selector.name,
+                         type=subset_dimension.dimension_type,
+                         module=base_dim.model.module,
+                         class_name=base_dim.model.class_name,
+                         description=selector.description,
+                     )
+                     dimensions.append(dim)
+                     key = (subset_dimension.dimension_type, selector.name)
+                     if key in subset_refs:
+                         msg = f"Bug: unhandled case of duplicate dimension name: {key=}"
+                         raise Exception(msg)
+                     subset_refs[key] = subset_dimension
+
+             dim_model = DimensionsConfigModel(dimensions=dimensions)
+             dims_config = DimensionsConfig.load_from_model(dim_model)
+             dimension_ids = self._dimension_mgr.register_from_config(dims_config, context)
+             for dimension_id in dimension_ids:
+                 dim = self._dimension_mgr.get_by_id(dimension_id, conn=conn)
+                 key = (dim.model.dimension_type, dim.model.name)
+                 subset_dim = subset_refs[key]
+                 subset_dim.selector_references.append(
+                     DimensionReferenceModel(
+                         dimension_id=dimension_id,
+                         type=subset_dim.dimension_type,
+                         version="1.0.0",
+                     )
+                 )
+
+     def _check_subset_dimension_consistency(
+         self,
+         subset_dimension: SubsetDimensionGroupModel,
+         base_records: DataFrame,
+     ) -> None:
+         base_record_ids = get_unique_values(base_records, "id")
+         diff = subset_dimension.record_ids.difference(base_record_ids)
+         if diff:
+             msg = (
+                 f"subset dimension {subset_dimension.name} "
+                 f"uses dimension records not present in the base dimension: {diff}"
+             )
+             raise DSGInvalidParameter(msg)
+
+         diff = base_record_ids.difference(subset_dimension.record_ids)
+         if diff:
+             msg = (
+                 f"subset dimension {subset_dimension.name} "
+                 f"does not list these base dimension records: {diff}"
+             )
+             raise DSGInvalidParameter(msg)
+
+     def _register_supplemental_dimensions_from_subset_dimensions(
+         self,
+         model: ProjectConfigModel,
+         subset_dimensions: list[SubsetDimensionGroupModel],
+         context: RegistrationContext,
+     ):
+         """Registers a supplemental dimension for each subset specified in the project config's
+         subset dimension groups. Also registers a mapping from the base dimension to each new
+         supplemental dimension. Appends references to those dimensions to the project config's
+         supplemental_dimension_references list.
+         """
+         conn = context.connection
+         with TemporaryDirectory() as tmpdir:
+             tmp_path = Path(tmpdir)
+             dimensions = []
+             for subset_dimension_group in subset_dimensions:
+                 if not subset_dimension_group.create_supplemental_dimension:
+                     continue
+                 dimension_type = subset_dimension_group.dimension_type
+                 base_dims: list[DimensionBaseConfigWithFiles] = []
+                 for ref in model.dimensions.base_dimension_references:
+                     if ref.dimension_type == dimension_type:
+                         base_dim = self._dimension_mgr.get_by_id(ref.dimension_id, conn=conn)
+                         if (
+                             subset_dimension_group.base_dimension_name is None
+                             or base_dim.model.name == subset_dimension_group.base_dimension_name
+                         ):
+                             base_dims.append(base_dim)
+                         break
+                 if len(base_dims) == 0:
+                     msg = f"Did not find a base dimension for {subset_dimension_group=}"
+                     raise Exception(msg)
+                 elif len(base_dims) > 1:
+                     msg = (
+                         f"Found multiple base dimensions for {dimension_type=}. Please specify "
+                         f"'base_dimension_name' in {subset_dimension_group=}"
+                     )
+                     raise DSGInvalidParameter(msg)
+                 base_dim = base_dims[0]
+                 records: dict[str, list[Any]] = {"id": [], "name": []}
+                 mapping_records = []
+                 dim_record_ids = set()
+                 # The pydantic validator has already checked consistency of these columns.
+                 for column in subset_dimension_group.selectors[0].column_values:
+                     records[column] = []
+                 for selector in subset_dimension_group.selectors:
+                     records["id"].append(selector.name)
+                     records["name"].append(selector.name)
+                     if selector.column_values:
+                         for column, value in selector.column_values.items():
+                             records[column].append(value)
+                     for record_id in selector.records:
+                         mapping_records.append({"from_id": record_id, "to_id": selector.name})
+                         dim_record_ids.add(record_id)
+
+                 filename = tmp_path / f"{subset_dimension_group.name}.csv"
+                 pd.DataFrame(records).to_csv(filename, index=False)
+
+                 for record_id in base_dim.get_unique_ids().difference(dim_record_ids):
+                     mapping_records.append({"from_id": record_id, "to_id": ""})
+                 map_record_file = tmp_path / f"{subset_dimension_group.name}_mapping.csv"
+                 pd.DataFrame.from_records(mapping_records).to_csv(map_record_file, index=False)
+
+                 dim = SupplementalDimensionModel(
+                     file=str(filename),
+                     name=subset_dimension_group.name,
+                     type=dimension_type,
+                     module=base_dim.model.module,
+                     class_name=base_dim.model.class_name,
+                     description=subset_dimension_group.description,
+                     mapping=MappingTableByNameModel(
+                         file=str(map_record_file),
+                         mapping_type=DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS,
+                         description=f"Aggregation map for {subset_dimension_group.name}",
+                         project_base_dimension_name=base_dim.model.name,
+                     ),
+                 )
+                 dimensions.append(dim)
+
+             self._register_supplemental_dimensions_from_models(
+                 tmp_path,
+                 model,
+                 dimensions,
+                 context,
+             )
+
+     def _register_all_in_one_dimensions(
+         self,
+         src_dir,
+         model,
+         context: RegistrationContext,
+     ):
+         with TemporaryDirectory() as tmpdir:
+             tmp_path = Path(tmpdir)
+             new_dimensions = []
+             dim_type_to_ref = {
+                 x.dimension_type: x for x in model.dimensions.base_dimension_references
+             }
+             # Metric is excluded because fuel_id and unit may not be the same for all records.
+             # Time doesn't have records.
+             exclude = {DimensionType.METRIC, DimensionType.TIME}
+             for dimension_type in (x for x in DimensionType if x not in exclude):
+                 dim_ref = dim_type_to_ref[dimension_type]
+                 dim_config = self._dimension_mgr.get_by_id(
+                     dim_ref.dimension_id, conn=context.connection
+                 )
+                 assert isinstance(dim_config, DimensionBaseConfigWithFiles)
+                 dt_str = dimension_type.value
+                 if dt_str.endswith("y"):
+                     dt_plural = dt_str[:-1] + "ies"
+                 else:
+                     dt_plural = dt_str + "s"
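+                 # e.g., "geography" -> "geographies", "sector" -> "sectors"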
+                 dt_all_plural = f"all_{dt_plural}"
+                 dim_name = f"all_{model.project_id}_{dt_plural}"
+                 dim_name_formal = f"All {dt_plural.title()}"
+                 dim_record_file = tmp_path / f"{dt_all_plural}.csv"
+                 dim_text = f"id,name\n{dt_all_plural},{dim_name_formal}\n"
+                 dim_record_file.write_text(dim_text)
+                 map_record_file = tmp_path / f"lookup_{dt_str}_to_{dt_all_plural}.csv"
+                 with open(map_record_file, "w") as f_out:
+                     f_out.write("from_id,to_id\n")
+                     for record in dim_config.get_unique_ids():
+                         f_out.write(record)
+                         f_out.write(",")
+                         f_out.write(dt_all_plural)
+                         f_out.write("\n")
+
+                 with in_other_dir(src_dir):
+                     new_dim = SupplementalDimensionModel(
+                         file=str(dim_record_file),
+                         name=dim_name,
+                         type=dimension_type,
+                         module="dsgrid.dimension.base_models",
+                         class_name="DimensionRecordBaseModel",
+                         description=dim_name_formal,
+                         mapping=MappingTableByNameModel(
+                             file=str(map_record_file),
+                             mapping_type=DimensionMappingType.MANY_TO_ONE_AGGREGATION,
+                             description=f"Aggregation map for all {dt_str}s",
+                         ),
+                     )
+                 new_dimensions.append(new_dim)
+
+             self._register_supplemental_dimensions_from_models(
+                 src_dir,
+                 model,
+                 new_dimensions,
+                 context,
+             )
+
+     def _register(self, config: ProjectConfig, context: RegistrationContext):
+         self._run_checks(config)
+
+         config.model.version = "1.0.0"
+         model = self.db.insert(context.connection, config.model, context.registration)
+         assert isinstance(model, ProjectConfigModel)
+         logger.info(
+             "%s Registered project %s with version=%s",
+             self._log_offline_mode_prefix(),
+             model.project_id,
+             config.model.version,
+         )
+
+     def _run_checks(self, config: ProjectConfig):
+         dims = [x for x in config.iter_dimensions()]
+         check_uniqueness((x.model.name for x in dims), "dimension name")
+         self._check_base_dimensions(config)
+
+         for dataset_id in config.list_unregistered_dataset_ids():
+             for field in RequiredDimensionRecordsModel.model_fields:
+                 # This will check that all dimension record IDs listed in the requirements
+                 # exist in the project.
+                 config.get_required_dimension_record_ids(dataset_id, DimensionType(field))
+
+     def _check_base_dimensions(self, config: ProjectConfig) -> None:
+         found_time = False
+         for dim in config.list_base_dimensions():
+             if dim.model.dimension_type == DimensionType.TIME:
+                 if found_time:
+                     msg = "Only one time dimension is allowed in a project."
+                     raise DSGInvalidDimension(msg)
+                 found_time = True
+
+         assert found_time
+         self._set_dataset_record_requirement_definitions_names(config)
+         self._check_dataset_record_requirement_definitions(config)
+
+     def _set_dataset_record_requirement_definitions_names(
+         self,
+         config: ProjectConfig,
+     ) -> None:
+         def set_dimension_name(req: RequiredBaseDimensionModel) -> None:
+             # Normalize the requirement's dimension name to the registered
+             # dimension's name.
+             if req.dimension_name is not None:
+                 dim = config.get_dimension_by_name(req.dimension_name)
+                 req.dimension_name = dim.model.name
+
+         for dataset in config.model.datasets:
+             dim_type_as_fields = RequiredDimensionRecordsModel.model_fields.keys()
+             for field in dim_type_as_fields:
+                 req = getattr(dataset.required_dimensions.single_dimensional, field)
+                 for base_field in ("base", "base_missing"):
+                     set_dimension_name(getattr(req, base_field))
+                 for multi_dim in dataset.required_dimensions.multi_dimensional:
+                     req = getattr(multi_dim, field)
+                     for base_field in ("base", "base_missing"):
+                         set_dimension_name(getattr(req, base_field))
+
+     def _check_dataset_record_requirement_definitions(
+         self,
+         config: ProjectConfig,
+     ) -> None:
+         for dataset in config.model.datasets:
+             dim_type_as_fields = RequiredDimensionRecordsModel.model_fields.keys()
+             for dim_type_as_field in dim_type_as_fields:
+                 dim_type = DimensionType(dim_type_as_field)
+                 required_dimension_records = getattr(
+                     dataset.required_dimensions.single_dimensional, dim_type_as_field
+                 )
+                 self._check_base_dimension_record_requirements(
+                     required_dimension_records, dim_type, config, dataset.dataset_id
+                 )
+                 for multi_dim in dataset.required_dimensions.multi_dimensional:
+                     required_dimension_records = getattr(multi_dim, dim_type_as_field)
+                     self._check_base_dimension_record_requirements(
+                         required_dimension_records, dim_type, config, dataset.dataset_id
+                     )
+
+     def _check_base_dimension_record_requirements(
+         self,
+         req_dim_records: RequiredDimensionRecordsByTypeModel,
+         dim_type: DimensionType,
+         config: ProjectConfig,
+         dataset_id: str,
+     ) -> None:
+         base_dims = config.list_base_dimensions(dimension_type=dim_type)
+         for base_field in ("base", "base_missing"):
+             reqs = getattr(req_dim_records, base_field)
+             if reqs.record_ids and reqs.dimension_name is None:
+                 if len(base_dims) == 1:
+                     reqs.dimension_name = base_dims[0].model.name
+                     logger.debug(
+                         "Assigned dimension_name=%s for %s dataset_id=%s",
+                         reqs.dimension_name,
+                         dim_type,
+                         dataset_id,
+                     )
+                 else:
+                     msg = (
+                         f"{dataset_id=} requires a base dimension name for "
+                         f"{dim_type} because the project has {len(base_dims)} base dimensions."
+                     )
+                     raise DSGInvalidDimensionMapping(msg)
+                 # Only one of base and base_missing can be set, and that was already checked.
+                 break
+
+     @track_timing(timer_stats_collector)
+     def register_and_submit_dataset(
+         self,
+         dataset_config_file: Path,
+         dataset_path: Path,
+         project_id: str,
+         submitter: str,
+         log_message: str,
+         dimension_mapping_file=None,
+         dimension_mapping_references_file=None,
+         autogen_reverse_supplemental_mappings=None,
+     ):
+         with RegistrationContext(
+             self.db, log_message, VersionUpdateType.MINOR, submitter
+         ) as context:
+             conn = context.connection
+             if not self.has_id(project_id, conn=conn):
+                 msg = f"{project_id=}"
+                 raise DSGValueNotRegistered(msg)
+
+             dataset_config = DatasetConfig.load_from_user_path(dataset_config_file, dataset_path)
+             dataset_id = dataset_config.model.dataset_id
+             config = self.get_by_id(project_id, conn=conn)
+             # This will raise an exception if the dataset_id is not part of the project or already
+             # registered.
+             self._raise_if_not_unregistered(config, dataset_id)
+
+             self._dataset_mgr.register(
+                 dataset_config_file,
+                 dataset_path,
+                 context=context,
+             )
+             self.submit_dataset(
+                 project_id,
+                 context.get_ids(RegistryType.DATASET)[0],
+                 dimension_mapping_file=dimension_mapping_file,
+                 dimension_mapping_references_file=dimension_mapping_references_file,
+                 autogen_reverse_supplemental_mappings=autogen_reverse_supplemental_mappings,
+                 context=context,
+             )
+
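+     # Illustrative usage sketch (editor's addition; paths and IDs are hypothetical):
+     #
+     #     manager.project_manager.register_and_submit_dataset(
+     #         Path("dataset/dataset.json5"),
+     #         Path("dataset/load_data"),
+     #         project_id="my_project",
+     #         submitter="username",
+     #         log_message="Register and submit my_dataset",
+     #         dimension_mapping_file=Path("dataset/dimension_mappings.json5"),
+     #     )
+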
+     @track_timing(timer_stats_collector)
+     def submit_dataset(
+         self,
+         project_id: str,
+         dataset_id: str,
+         submitter: str | None = None,
+         log_message: str | None = None,
+         dimension_mapping_file: Path | None = None,
+         dimension_mapping_references_file: Path | None = None,
+         autogen_reverse_supplemental_mappings: list[DimensionType] | None = None,
+         context: RegistrationContext | None = None,
+     ):
+         """Registers a dataset with a project. This can only be performed on the
+         latest version of the project.
+
+         Parameters
+         ----------
+         project_id : str
+         dataset_id : str
+         submitter : str
+             Submitter name
+         log_message : str
+         dimension_mapping_file : Path or None
+             Base-to-base dimension mapping file
+         dimension_mapping_references_file : Path or None
+         autogen_reverse_supplemental_mappings : list[DimensionType] or None
+             Dimensions for which to attempt to create reverse mappings from supplemental
+             dimensions.
+         context : None or RegistrationContext
+
+         Raises
+         ------
+         DSGValueNotRegistered
+             Raised if the project_id or dataset_id is not registered.
+         DSGDuplicateValueRegistered
+             Raised if the dataset is already registered with the project.
+         ValueError
+             Raised if the project does not contain this dataset.
+         """
+         if context is None:
+             assert submitter is not None
+             assert log_message is not None
+             with RegistrationContext(
+                 self.db, log_message, VersionUpdateType.MINOR, submitter
+             ) as context:
+                 config = self.get_by_id(project_id, conn=context.connection)
+                 self._submit_dataset_and_register_mappings(
+                     config,
+                     dataset_id,
+                     dimension_mapping_file,
+                     dimension_mapping_references_file,
+                     autogen_reverse_supplemental_mappings,
+                     context,
+                 )
+         else:
+             config = self.get_by_id(project_id, conn=context.connection)
+             self._submit_dataset_and_register_mappings(
+                 config,
+                 dataset_id,
+                 dimension_mapping_file,
+                 dimension_mapping_references_file,
+                 autogen_reverse_supplemental_mappings,
+                 context,
+             )
+
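+     # Illustrative usage sketch (editor's addition; IDs and the references file
+     # are hypothetical):
+     #
+     #     manager.project_manager.submit_dataset(
+     #         "my_project",
+     #         "my_dataset",
+     #         submitter="username",
+     #         log_message="Submit my_dataset to my_project",
+     #         dimension_mapping_references_file=Path("dataset/mapping_references.json5"),
+     #     )
+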
+     def register_subset_dimensions(
+         self,
+         project_id: str,
+         filename: Path,
+         submitter: str,
+         log_message: str,
+         update_type: VersionUpdateType,
+     ):
+         """Register new subset dimensions."""
+         with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+             config = self.get_by_id(project_id, conn=context.connection)
+             subset_model = SubsetDimensionGroupListModel.from_file(filename)
+             self._register_subset_dimensions(
+                 config.model,
+                 subset_model.subset_dimensions,
+                 context,
+             )
+             self._make_new_config(config, context)
+
+     def register_supplemental_dimensions(
+         self,
+         project_id: str,
+         filename: Path,
+         submitter: str,
+         log_message: str,
+         update_type: VersionUpdateType,
+     ):
+         """Register new supplemental dimensions."""
+         with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+             config = self.get_by_id(project_id, conn=context.connection)
+             model = SupplementalDimensionsListModel.from_file(filename)
+             self._register_supplemental_dimensions_from_models(
+                 filename.parent,
+                 config.model,
+                 model.supplemental_dimensions,
+                 context,
+             )
+             self._make_new_config(config, context)
+
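+     # Illustrative sketch of a supplemental-dimensions file as consumed above
+     # (editor's addition; the values are hypothetical, the field names mirror how
+     # SupplementalDimensionModel and MappingTableByNameModel are constructed in
+     # this module, and the mapping_type string is assumed to be the enum's value):
+     #
+     #     {
+     #         supplemental_dimensions: [
+     #             {
+     #                 name: "states",
+     #                 type: "geography",
+     #                 file: "dimensions/states.csv",
+     #                 description: "US states",
+     #                 module: "dsgrid.dimension.base_models",
+     #                 class_name: "DimensionRecordBaseModel",
+     #                 mapping: {
+     #                     file: "dimension_mappings/county_to_state.csv",
+     #                     mapping_type: "many_to_one_aggregation",
+     #                     description: "Maps counties to states",
+     #                 },
+     #             },
+     #         ],
+     #     }
+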
+     def add_dataset_requirements(
+         self,
+         project_id: str,
+         filename: Path,
+         submitter: str,
+         log_message: str,
+         update_type: VersionUpdateType,
+     ):
+         """Add requirements for one or more datasets to the project."""
+         with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+             config = self.get_by_id(project_id, conn=context.connection)
+             model = InputDatasetListModel.from_file(filename)
+             existing_ids = {x.dataset_id for x in config.model.datasets}
+             for dataset in model.datasets:
+                 if dataset.dataset_id in existing_ids:
+                     msg = f"{dataset.dataset_id} is already stored in the project"
+                     raise DSGInvalidParameter(msg)
+                 if dataset.status != DatasetRegistryStatus.UNREGISTERED:
+                     msg = f"New dataset {dataset.dataset_id} status must be unregistered: {dataset.status}"
+                     raise DSGInvalidParameter(msg)
+
+             config.model.datasets += model.datasets
+             self._make_new_config(config, context)
+
+     def replace_dataset_dimension_requirements(
+         self,
+         project_id: str,
+         filename: Path,
+         submitter: str,
+         log_message: str,
+         update_type: VersionUpdateType,
+     ):
+         """Replace dataset requirements in a project."""
+         with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+             config = self.get_by_id(project_id, conn=context.connection)
+             model = InputDatasetDimensionRequirementsListModel.from_file(filename)
+             for dataset in model.dataset_dimension_requirements:
+                 found = False
+                 for i in range(len(config.model.datasets)):
+                     if config.model.datasets[i].dataset_id == dataset.dataset_id:
+                         config.model.datasets[i].required_dimensions = dataset.required_dimensions
+                         if config.model.datasets[i].status == DatasetRegistryStatus.REGISTERED:
+                             config.model.datasets[i].status = DatasetRegistryStatus.UNREGISTERED
+                             logger.info(
+                                 "Changed dataset %s status to %s in project %s",
+                                 dataset.dataset_id,
+                                 config.model.datasets[i].status.value,
+                                 project_id,
+                             )
+                             # TODO: When issue #309 is addressed, we need to set all dependent
+                             # derived datasets to unregistered also.
+                         found = True
+                         break
+                 if not found:
+                     msg = f"{dataset.dataset_id} is not present in the project config"
+                     raise DSGInvalidParameter(msg)
+
+             self._make_new_config(config, context)
+
+     def _submit_dataset_and_register_mappings(
+         self,
+         project_config: ProjectConfig,
+         dataset_id: str,
+         dimension_mapping_file: Path | None,
+         dimension_mapping_references_file: Path | None,
+         autogen_reverse_supplemental_mappings: list[DimensionType] | None,
+         context: RegistrationContext,
+     ) -> None:
+         logger.info("Submit dataset=%s to project=%s.", dataset_id, project_config.config_id)
+         self._check_if_not_registered(context.connection, project_config.config_id)
+         self._raise_if_not_unregistered(project_config, dataset_id)
+         dataset_config = self._dataset_mgr.get_by_id(dataset_id, conn=context.connection)
+
+         references = []
+         if dimension_mapping_file is not None:
+             references += self._register_mappings_from_file(
+                 project_config,
+                 dataset_config,
+                 dimension_mapping_file,
+                 context,
+             )
+         if dimension_mapping_references_file is not None:
+             for ref in DimensionMappingReferenceListModel.load(
+                 dimension_mapping_references_file
+             ).references:
+                 if not self.dimension_mapping_manager.has_id(
+                     ref.mapping_id, version=ref.version, conn=context.connection
+                 ):
+                     msg = f"mapping_id={ref.mapping_id}"
+                     raise DSGValueNotRegistered(msg)
+                 references.append(ref)
+
+         if autogen_reverse_supplemental_mappings:
+             references += self._auto_register_reverse_supplemental_mappings(
+                 project_config,
+                 dataset_config,
+                 references,
+                 set((x.value for x in autogen_reverse_supplemental_mappings)),
+                 context,
+             )
+
+         self._submit_dataset(project_config, dataset_config, references, context)
+
+     def _raise_if_not_unregistered(self, project_config: ProjectConfig, dataset_id: str) -> None:
+         # This will raise if the dataset is not specified in the project.
+         dataset_model = project_config.get_dataset(dataset_id)
+         status = dataset_model.status
+         if status != DatasetRegistryStatus.UNREGISTERED:
+             msg = (
+                 f"{dataset_id=} cannot be submitted to project={project_config.config_id} with "
+                 f"{status=}"
+             )
+             raise DSGDuplicateValueRegistered(msg)
+
+     def _register_mappings_from_file(
+         self,
+         project_config: ProjectConfig,
+         dataset_config: DatasetConfig,
+         dimension_mapping_file: Path,
+         context: RegistrationContext,
+     ):
+         references = []
+         src_dir = dimension_mapping_file.parent
+         mappings = DatasetBaseToProjectMappingTableListModel(
+             **load_data(dimension_mapping_file)
+         ).mappings
+         dataset_mapping = {x.dimension_type: x for x in dataset_config.model.dimension_references}
+         project_mapping: dict[DimensionType, list[DimensionBaseConfig]] = defaultdict(list)
+         project_mapping_refs: dict[str, DimensionReferenceModel] = {}
+         for ref in project_config.model.dimensions.base_dimension_references:
+             dim = self._dimension_mgr.get_by_id(
+                 ref.dimension_id, version=ref.version, conn=context.connection
+             )
+             project_mapping[ref.dimension_type].append(dim)
+             project_mapping_refs[dim.model.dimension_id] = ref
+         mapping_tables = []
+         for mapping in mappings:
+             base_dim: DimensionBaseConfig | None = None
+             if mapping.project_base_dimension_name is None:
+                 base_dims = project_mapping[mapping.dimension_type]
+                 if len(base_dims) > 1:
+                     msg = (
+                         "If there are multiple project base dimensions for a dimension type, the "
+                         "dataset dimension mapping must supply a project_base_dimension_name. "
+                         f"{mapping}"
+                     )
+                     raise DSGInvalidDimensionMapping(msg)
+                 base_dim = base_dims[0]
+             else:
+                 for base_dim_ in project_mapping[mapping.dimension_type]:
+                     if base_dim_.model.name == mapping.project_base_dimension_name:
+                         base_dim = base_dim_
+             if base_dim is None:
+                 msg = f"Bug: unable to find base dimension for {mapping.project_base_dimension_name}"
+                 raise Exception(msg)
+             with in_other_dir(src_dir):
+                 assert base_dim is not None
+                 mapping_table = MappingTableModel.from_pre_registered_model(
+                     mapping,
+                     dataset_mapping[mapping.dimension_type],
+                     project_mapping_refs[base_dim.model.dimension_id],
+                 )
+             mapping_tables.append(mapping_table)
+
+         mappings_config = DimensionMappingsConfig.load_from_model(
+             DimensionMappingsConfigModel(mappings=mapping_tables)
+         )
+         mapping_ids = self._dimension_mapping_mgr.register_from_config(mappings_config, context)
+         for mapping_id in mapping_ids:
+             mapping_config = self._dimension_mapping_mgr.get_by_id(
+                 mapping_id, conn=context.connection
+             )
+             references.append(
+                 DimensionMappingReferenceModel(
+                     from_dimension_type=mapping_config.model.from_dimension.dimension_type,
+                     to_dimension_type=mapping_config.model.to_dimension.dimension_type,
+                     mapping_id=mapping_id,
+                     version=str(
+                         self._dimension_mapping_mgr.get_latest_version(
+                             mapping_id, conn=context.connection
+                         )
+                     ),
+                 )
+             )
+
+         return references
+
+     def _auto_register_reverse_supplemental_mappings(
+         self,
+         project_config: ProjectConfig,
+         dataset_config: DatasetConfig,
+         mapping_references: list[DimensionMappingReferenceModel],
+         autogen_reverse_supplemental_mappings: set[str],
+         context: RegistrationContext,
+     ):
+         conn = context.connection
+         references = []
+         p_model = project_config.model
+         p_supp_dim_ids = {
+             x.dimension_id for x in p_model.dimensions.supplemental_dimension_references
+         }
+         d_dim_from_ids = set()
+         for ref in mapping_references:
+             mapping_config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=conn)
+             d_dim_from_ids.add(mapping_config.model.from_dimension.dimension_id)
+
+         needs_mapping = []
+         for dim in dataset_config.model.dimension_references:
+             if (
+                 dim.dimension_type in autogen_reverse_supplemental_mappings
+                 and dim.dimension_id in p_supp_dim_ids
+                 and dim.dimension_id not in d_dim_from_ids
+             ):
+                 needs_mapping.append((dim.dimension_id, dim.version))
+             # else:
+             #     This dimension is the same as a project base dimension.
+             #     or
+             #     The dataset may only need to provide a subset of records, and those are
+             #     checked in the dimension association table.
+
+         if len(needs_mapping) != len(autogen_reverse_supplemental_mappings):
+             msg = (
+                 f"Mappings to autogen [{needs_mapping}] does not match user-specified "
+                 f"autogen_reverse_supplemental_mappings={autogen_reverse_supplemental_mappings}"
+             )
+             raise DSGInvalidDimensionMapping(msg)
+
+         new_mappings = []
+         for from_id, from_version in needs_mapping:
+             to_dim = self._dimension_mgr.get_by_id(from_id, version=from_version, conn=conn)
+             from_dim, to_version = project_config.get_base_dimension_and_version(
+                 to_dim.model.dimension_type
+             )
+             mapping, version = self._try_get_mapping(
+                 project_config, to_dim, from_version, from_dim, to_version, context
+             )
+             if mapping is None:
+                 p_mapping, _ = self._try_get_mapping(
+                     project_config, from_dim, to_version, to_dim, from_version, context
+                 )
+                 assert (
+                     p_mapping is not None
+                 ), f"from={from_dim.model.dimension_id} to={to_dim.model.dimension_id}"
+                 records = models_to_dataframe(p_mapping.model.records)
+                 fraction_vals = get_unique_values(records, "from_fraction")
+                 if len(fraction_vals) != 1 or next(iter(fraction_vals)) != 1.0:
+                     msg = (
+                         "Cannot auto-generate a dataset-to-project mapping from a project "
+                         "supplemental dimension unless the from_fraction column is empty or only "
+                         f"has values of 1.0: {p_mapping.model.mapping_id} - {fraction_vals}"
+                     )
+                     raise DSGInvalidDimensionMapping(msg)
+                 reverse_records = (
+                     records.drop("from_fraction")
+                     .select(F.col("to_id").alias("from_id"), F.col("from_id").alias("to_id"))
+                     .toPandas()
+                 )
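+                 # Editor's illustration: a base-to-supplemental row such as
+                 # (from_id="county_123", to_id="state_CO") becomes the reverse
+                 # dataset-to-project row (from_id="state_CO", to_id="county_123").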
1205
+ dst = Path(tempfile.gettempdir()) / f"reverse_{p_mapping.config_id}.csv"
1206
+ # Use pandas because spark creates a CSV directory.
1207
+ reverse_records.to_csv(dst, index=False)
1208
+ dimension_type = to_dim.model.dimension_type.value
1209
+ new_mappings.append(
1210
+ {
1211
+ "description": f"Maps {dataset_config.config_id} {dimension_type} to project",
1212
+ "dimension_type": dimension_type,
1213
+ "file": str(dst),
1214
+ "mapping_type": DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS.value,
1215
+ }
1216
+ )
1217
+ else:
1218
+ assert version is not None
1219
+ reference = DimensionMappingReferenceModel(
1220
+ from_dimension_type=to_dim.model.dimension_type,
1221
+ to_dimension_type=to_dim.model.dimension_type,
1222
+ mapping_id=mapping.model.mapping_id,
1223
+ version=version,
1224
+ )
1225
+ references.append(reference)
1226
+
1227
+ if new_mappings:
1228
+ # We don't currently have a way to register a single dimension mapping. It would be
1229
+ # better to register these mappings directly. But, this code was already here.
1230
+ mapping_file = Path(tempfile.gettempdir()) / "dimension_mappings.json5"
1231
+ mapping_file.write_text(json5.dumps({"mappings": new_mappings}, indent=2))
1232
+ to_delete = [mapping_file] + [x["file"] for x in new_mappings]
1233
+ try:
1234
+ references += self._register_mappings_from_file(
1235
+ project_config,
1236
+ dataset_config,
1237
+ mapping_file,
1238
+ context,
1239
+ )
1240
+ finally:
1241
+ for filename in to_delete:
1242
+ Path(filename).unlink()
1243
+
1244
+ return references
1245
+
1246
+ def _try_get_mapping(
1247
+ self,
1248
+ project_config: ProjectConfig,
1249
+ from_dim,
1250
+ from_version,
1251
+ to_dim,
1252
+ to_version,
1253
+ context: RegistrationContext,
1254
+ ):
1255
+ conn = context.connection
1256
+ dimension_type = from_dim.model.dimension_type
1257
+ for ref in project_config.model.dimension_mappings.base_to_supplemental_references:
1258
+ if (
1259
+ ref.from_dimension_type == dimension_type
1260
+ and ref.to_dimension_type == dimension_type
1261
+ ):
1262
+ mapping_config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=conn)
1263
+ if (
1264
+ mapping_config.model.from_dimension.dimension_id == from_dim.model.dimension_id
1265
+ and mapping_config.model.from_dimension.version == from_version
1266
+ and mapping_config.model.to_dimension.dimension_id == to_dim.model.dimension_id
1267
+ and mapping_config.model.to_dimension.version == to_version
1268
+ ):
1269
+ return mapping_config, ref.version
1270
+
1271
+ return None, None
1272
+
1273
+ def _submit_dataset(
1274
+ self,
1275
+ project_config: ProjectConfig,
1276
+ dataset_config: DatasetConfig,
1277
+ mapping_references: list[DimensionMappingReferenceModel],
1278
+ context: RegistrationContext,
1279
+ ):
1280
+ project_config.add_dataset_dimension_mappings(dataset_config, mapping_references)
1281
+ project_config.add_dataset_base_dimension_names(
1282
+ dataset_config.model.dataset_id,
1283
+ self._id_base_dimension_names_in_dataset(
1284
+ project_config, dataset_config, mapping_references
1285
+ ),
1286
+ )
1287
+ if os.environ.get("__DSGRID_SKIP_CHECK_DATASET_TO_PROJECT_MAPPING__") is not None:
1288
+ logger.warning("Skip dataset-to-project mapping checks")
1289
+ else:
1290
+ self._check_dataset_base_to_project_base_mappings(
1291
+ project_config,
1292
+ dataset_config,
1293
+ mapping_references,
1294
+ context,
1295
+ )
1296
+
1297
+ dataset_model = project_config.get_dataset(dataset_config.model.dataset_id)
1298
+
1299
+ dataset_model.mapping_references = mapping_references
1300
+ dataset_model.status = DatasetRegistryStatus.REGISTERED
1301
+ if project_config.are_all_datasets_submitted():
1302
+ new_status = ProjectRegistryStatus.COMPLETE
1303
+ else:
1304
+ new_status = ProjectRegistryStatus.IN_PROGRESS
1305
+ project_config.set_status(new_status)
1306
+ config = self.update_with_context(project_config, context)
1307
+ self._db.add_contains_dataset(context.connection, config.model, dataset_config.model)
1308
+
1309
+ logger.info(
1310
+ "%s Registered dataset %s with version=%s in project %s",
1311
+ self._log_offline_mode_prefix(),
1312
+ dataset_config.model.dataset_id,
1313
+ config.model.version,
1314
+ config.model.project_id,
1315
+ )
1316
+
+    @track_timing(timer_stats_collector)
+    def _check_dataset_base_to_project_base_mappings(
+        self,
+        project_config: ProjectConfig,
+        dataset_config: DatasetConfig,
+        mapping_references: list[DimensionMappingReferenceModel],
+        context: RegistrationContext,
+    ):
+        """Check that a dataset has all project-required dimension records."""
+        logger.info("Check dataset-base-to-project-base dimension mappings.")
+        data_store = self._dataset_mgr.store
+        handler = make_dataset_schema_handler(
+            context.connection,
+            dataset_config,
+            self._dimension_mgr,
+            self._dimension_mapping_mgr,
+            store=data_store,
+            mapping_references=mapping_references,
+        )
+        dataset_id = dataset_config.config_id
+
+        with ScratchDirContext(self._params.scratch_dir) as scontext:
+            mapped_dataset_table = handler.make_mapped_dimension_association_table(
+                data_store, scontext
+            )
+            project_table = self._make_dimension_associations(project_config, dataset_id, scontext)
+            cols = sorted(project_table.columns)
+            cache(mapped_dataset_table)
+            diff: DataFrame | None = None
+
+            try:
+                # This check is relatively short and shows the user clear errors.
+                _check_distinct_column_values(project_table, mapped_dataset_table)
+                # This check is long and produces a full table of differences.
+                # Interpreting it may require some effort from the user.
+                diff = except_all(project_table.select(*cols), mapped_dataset_table.select(*cols))
+                cache(diff)
+                if not is_dataframe_empty(diff):
+                    dataset_id = dataset_config.model.dataset_id
+                    handle_dimension_association_errors(diff, mapped_dataset_table, dataset_id)
+            finally:
+                unpersist(mapped_dataset_table)
+                if diff is not None:
+                    unpersist(diff)
+
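The check above runs in two stages: a cheap per-column comparison of distinct values that yields readable errors, then a full anti-join (`except_all`) that surfaces every missing dimension combination. A self-contained PySpark sketch of the same idea, with tiny hand-built tables standing in for the real association tables:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
project = spark.createDataFrame(
    [("01001", "com"), ("01003", "com"), ("01003", "res")], ["geography", "sector"]
)
dataset = spark.createDataFrame([("01001", "com"), ("01003", "com")], ["geography", "sector"])

# Stage 1: per-column distinct values; cheap and easy to read.
for column in project.columns:
    project_vals = {r[column] for r in project.select(column).distinct().collect()}
    dataset_vals = {r[column] for r in dataset.select(column).distinct().collect()}
    if missing := project_vals - dataset_vals:
        print(f"column={column} missing={missing}")  # sector is missing 'res'

# Stage 2: full anti-join; each surviving row is a required combination
# that the mapped dataset failed to provide.
diff = project.exceptAll(dataset)
diff.show()  # one row: ("01003", "res")
```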
+    def _id_base_dimension_names_in_dataset(
+        self,
+        project_config: ProjectConfig,
+        dataset_config: DatasetConfig,
+        mapping_references: list[DimensionMappingReferenceModel],
+    ) -> DatasetBaseDimensionNamesModel:
+        base_dimension_names: dict[DimensionType, str] = {}
+        for ref in mapping_references:
+            mapping = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, version=ref.version)
+            base_dim = self._dimension_mgr.get_by_id(
+                mapping.model.to_dimension.dimension_id,
+                version=mapping.model.to_dimension.version,
+            ).model
+            base_dimension_names[base_dim.dimension_type] = base_dim.name
+
+        project_base_dims_by_type: dict[DimensionType, list[DimensionBaseConfig]] = defaultdict(
+            list
+        )
+        for dim in project_config.list_base_dimensions():
+            project_base_dims_by_type[dim.model.dimension_type].append(dim)
+
+        dataset_id = dataset_config.model.dataset_id
+        for dim_type in DimensionType:
+            if dim_type == DimensionType.TIME:
+                assert len(project_base_dims_by_type[dim_type]) == 1
+                base_dimension_names[dim_type] = project_base_dims_by_type[dim_type][0].model.name
+                continue
+            if dim_type not in base_dimension_names:
+                project_base_dims = project_base_dims_by_type[dim_type]
+                if len(project_base_dims) > 1:
+                    dataset_dim = dataset_config.get_dimension_with_records(dim_type)
+                    assert dataset_dim is not None
+                    dataset_records = dataset_dim.get_records_dataframe()
+                    dataset_record_ids = get_unique_values(dataset_records, "id")
+                    for project_dim in project_base_dims:
+                        assert isinstance(project_dim, DimensionBaseConfigWithFiles)
+                        project_records = project_dim.get_records_dataframe()
+                        project_record_ids = get_unique_values(project_records, "id")
+                        if dataset_record_ids.issubset(project_record_ids):
+                            project_dim_name = project_dim.model.name
+                            if dim_type in base_dimension_names:
+                                msg = (
+                                    f"Found multiple project base dimensions for {dataset_id=} "
+                                    f"and {dim_type=}: {base_dimension_names[dim_type]} and "
+                                    f"{project_dim_name}. Please specify a mapping."
+                                )
+                                raise DSGInvalidDataset(msg)
+                            base_dimension_names[dim_type] = project_dim_name
+                    if dim_type not in base_dimension_names:
+                        msg = (
+                            f"Bug: {dim_type} has multiple base dimensions in the project, dataset "
+                            f"{dataset_id} does not specify a mapping, and dsgrid could not "
+                            "discern which base dimension to use."
+                        )
+                        raise DSGInvalidDataset(msg)
+                else:
+                    base_dimension_names[dim_type] = project_base_dims[0].model.name
+
+        data = {k.value: v for k, v in base_dimension_names.items()}
+        return DatasetBaseDimensionNamesModel(**data)
+
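When a dataset supplies no mapping and the project has several base dimensions of one type, `_id_base_dimension_names_in_dataset` disambiguates with a subset test: a project base dimension is a candidate only if its record ids contain every record id in the dataset, and exactly one candidate must survive. The core of that rule, reduced to plain sets:

```python
def pick_base_dimension(dataset_ids: set[str], candidates: dict[str, set[str]]) -> str:
    # A candidate qualifies only if it covers every dataset record id.
    matches = [name for name, ids in candidates.items() if dataset_ids.issubset(ids)]
    if len(matches) != 1:
        # Zero matches or an ambiguous result mirrors the DSGInvalidDataset errors above.
        raise ValueError(f"could not discern a unique base dimension: {matches}")
    return matches[0]


candidates = {
    "counties-2020": {"01001", "01003", "01005"},
    "counties-2010": {"01001", "01003"},
}
print(pick_base_dimension({"01001", "01005"}, candidates))  # counties-2020
```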
+    @track_timing(timer_stats_collector)
+    def _make_dimension_associations(
+        self,
+        config: ProjectConfig,
+        dataset_id: str,
+        context: ScratchDirContext,
+    ) -> DataFrame:
+        logger.info("Make dimension association table for %s", dataset_id)
+        df = config.make_dimension_association_table(dataset_id, context)
+        if use_duckdb():
+            df2 = df
+        else:
+            # This operation is slow with Spark. Persist the intermediate table and
+            # read it back so that the query is only evaluated once.
+            df2 = read_dataframe(persist_intermediate_table(df, context, "dimension_associations"))
+        logger.info("Created dimension associations for dataset %s", dataset_id)
+        return df2
+
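The duckdb/Spark split above exists because Spark re-evaluates a query plan each time the result is consumed. Writing the intermediate table out and reading it back truncates the lineage, so the expensive association query runs once. A minimal sketch of that persist-then-read pattern, assuming an active `SparkSession`:

```python
from pathlib import Path

from pyspark.sql import DataFrame, SparkSession


def persist_then_read(spark: SparkSession, df: DataFrame, path: Path) -> DataFrame:
    # Write once, then re-read: downstream consumers see a plain file scan
    # instead of re-running the original (slow) query plan.
    df.write.mode("overwrite").parquet(str(path))
    return spark.read.parquet(str(path))
```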
+    def update_from_file(
+        self,
+        config_file,
+        project_id: str,
+        submitter: str,
+        update_type: VersionUpdateType,
+        log_message: str,
+        version: str,
+    ) -> ProjectConfig:
+        with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+            config = ProjectConfig.load(config_file)
+            self._update_dimensions_and_mappings(context.connection, config)
+            self._check_update(context.connection, config, project_id, version)
+            return self.update_with_context(config, context)
+
+    @track_timing(timer_stats_collector)
+    def update(
+        self,
+        config: ProjectConfig,
+        update_type: VersionUpdateType,
+        log_message: str,
+        submitter: str | None = None,
+    ) -> ProjectConfig:
+        with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+            self._update_dimensions_and_mappings(context.connection, config)
+            return self.update_with_context(config, context)
+
+    def update_with_context(
+        self, config: ProjectConfig, context: RegistrationContext
+    ) -> ProjectConfig:
+        old_config = self.get_by_id(config.model.project_id, conn=context.connection)
+        checker = ProjectUpdateChecker(old_config.model, config.model)
+        checker.run()
+        self._run_checks(config)
+        return self._make_new_config(config, context)
+
+    def _make_new_config(
+        self, config: ProjectConfig, context: RegistrationContext
+    ) -> ProjectConfig:
+        old_version = config.model.version
+        old_key = ConfigKey(config.config_id, old_version)
+        model = self._update_config(config, context)
+        assert isinstance(model, ProjectConfigModel)
+        new_config = ProjectConfig(model)
+        self._update_dimensions_and_mappings(context.connection, new_config)
+        new_key = ConfigKey(new_config.model.project_id, new_config.model.version)
+        self._projects.pop(old_key, None)
+        self._projects[new_key] = new_config
+        return new_config
+
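`_make_new_config` also keeps the in-memory cache coherent: entries are keyed by (id, version), so an update pops the stale key and inserts the new one. The same idiom with a plain tuple standing in for `ConfigKey`:

```python
projects: dict[tuple[str, str], str] = {("my-project", "1.0.0"): "old config"}

old_key = ("my-project", "1.0.0")
new_key = ("my-project", "1.1.0")
projects.pop(old_key, None)  # tolerate a cache miss
projects[new_key] = "new config"

assert list(projects) == [("my-project", "1.1.0")]
```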
+    def finalize_registration(self, conn: Connection, config_ids: set[str], error_occurred: bool):
+        if error_occurred:
+            logger.info("Remove intermediate project after error")
+            for key in [x for x in self._projects if x.id in config_ids]:
+                self._projects.pop(key)
+
+    def remove(self, config_id: str, conn: Connection | None = None) -> None:
+        self.db.delete_all(conn, config_id)
+        for key in [x for x in self._projects if x.id == config_id]:
+            self._projects.pop(key)
+
+        logger.info("Removed %s from the registry.", config_id)
+
+    def show(
+        self,
+        conn: Connection | None = None,
+        filters: list[str] | None = None,
+        max_width: int | dict | None = None,
+        drop_fields: list[str] | None = None,
+        return_table: bool = False,
+        **kwargs,
+    ):
+        """Show the registry in a PrettyTable.
+
+        Parameters
+        ----------
+        filters : list or tuple
+            List of filter expressions for registry content
+            (e.g., filters=["Submitter==USER", "Description contains comstock"])
+        max_width
+            Max column width in PrettyTable; specify as a single value or as a dict of
+            values keyed by field name
+        drop_fields
+            List of field names not to show
+        return_table
+            If True, return the PrettyTable instead of displaying it.
+
+        """
+        if filters:
+            logger.info("List registry for: %s", filters)
+
+        table = PrettyTable(title=self.name())
+        all_field_names = (
+            "ID",
+            "Version",
+            "Status",
+            "Datasets",
+            "Date",
+            "Submitter",
+            "Description",
+        )
+        # TODO: may want dataset and dataset status to be separate columns
+        # TODO: this block can be refactored into the base class; the registry should be
+        # shown as an HTML table in notebooks.
+        if drop_fields is None:
+            table.field_names = all_field_names
+        else:
+            table.field_names = tuple(x for x in all_field_names if x not in drop_fields)
+
+        if max_width is None:
+            table._max_width = {
+                "ID": 20,
+                "Status": 12,
+                "Datasets": 30,
+                "Date": 10,
+                "Description": 30,
+            }
+        elif isinstance(max_width, int):
+            table.max_width = max_width
+        elif isinstance(max_width, dict):
+            table._max_width = max_width
+
+        transformed_filters = transform_and_validate_filters(filters) if filters else None
+        field_to_index = {x: i for i, x in enumerate(table.field_names)}
+        rows = []
+        for model in self.db.iter_models(conn):
+            assert isinstance(model, ProjectConfigModel)
+            registration = self.db.get_registration(conn, model)
+            all_fields = (
+                model.project_id,
+                model.version,
+                model.status.value,
+                ",\n".join([f"{x.dataset_id}: {x.status.value}" for x in model.datasets]),
+                registration.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
+                registration.submitter,
+                registration.log_message,
+            )
+            if drop_fields is None:
+                row = all_fields
+            else:
+                row = tuple(
+                    y for (x, y) in zip(all_field_names, all_fields) if x not in drop_fields
+                )
+
+            if not filters or matches_filters(row, field_to_index, transformed_filters):
+                rows.append(row)
+
+        rows.sort(key=lambda x: x[0])
+        table.add_rows(rows)
+        table.align = "l"
+        if return_table:
+            return table
+        display_table(table)
+
+
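A hedged usage sketch for `show`, with `mgr` standing in for a project registry manager instance; the filter strings follow the `Field==value` / `Field contains value` grammar shown in the docstring:

```python
table = mgr.show(
    filters=["Submitter==USER", "Description contains comstock"],
    max_width={"Description": 40},
    drop_fields=["Datasets"],
    return_table=True,  # return the PrettyTable instead of printing it
)
print(table.get_string())
```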
+def _check_distinct_column_values(project_table: DataFrame, mapped_dataset_table: DataFrame):
+    """Ensure that the mapped dataset covers the project's distinct values in every
+    column. This should be called before running a full comparison of the two tables.
+    """
+    has_mismatch = False
+    for column in project_table.columns:
+        project_distinct = {x[column] for x in project_table.select(column).distinct().collect()}
+        dataset_distinct = {
+            x[column] for x in mapped_dataset_table.select(column).distinct().collect()
+        }
+        if diff_values := project_distinct.difference(dataset_distinct):
+            has_mismatch = True
+            logger.error(
+                "The mapped dataset is missing distinct values required by the project "
+                "for column=%s: missing=%s",
+                column,
+                diff_values,
+            )
+
+    if has_mismatch:
+        msg = (
+            "The mapped dataset is missing required distinct values for one or more "
+            "columns. See the log file for the exact values."
+        )
+        raise DSGInvalidDataset(msg)