dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. build_backend.py +93 -0
  2. dsgrid/__init__.py +22 -0
  3. dsgrid/api/__init__.py +0 -0
  4. dsgrid/api/api_manager.py +179 -0
  5. dsgrid/api/app.py +419 -0
  6. dsgrid/api/models.py +60 -0
  7. dsgrid/api/response_models.py +116 -0
  8. dsgrid/apps/__init__.py +0 -0
  9. dsgrid/apps/project_viewer/app.py +216 -0
  10. dsgrid/apps/registration_gui.py +444 -0
  11. dsgrid/chronify.py +32 -0
  12. dsgrid/cli/__init__.py +0 -0
  13. dsgrid/cli/common.py +120 -0
  14. dsgrid/cli/config.py +176 -0
  15. dsgrid/cli/download.py +13 -0
  16. dsgrid/cli/dsgrid.py +157 -0
  17. dsgrid/cli/dsgrid_admin.py +92 -0
  18. dsgrid/cli/install_notebooks.py +62 -0
  19. dsgrid/cli/query.py +729 -0
  20. dsgrid/cli/registry.py +1862 -0
  21. dsgrid/cloud/__init__.py +0 -0
  22. dsgrid/cloud/cloud_storage_interface.py +140 -0
  23. dsgrid/cloud/factory.py +31 -0
  24. dsgrid/cloud/fake_storage_interface.py +37 -0
  25. dsgrid/cloud/s3_storage_interface.py +156 -0
  26. dsgrid/common.py +36 -0
  27. dsgrid/config/__init__.py +0 -0
  28. dsgrid/config/annual_time_dimension_config.py +194 -0
  29. dsgrid/config/common.py +142 -0
  30. dsgrid/config/config_base.py +148 -0
  31. dsgrid/config/dataset_config.py +907 -0
  32. dsgrid/config/dataset_schema_handler_factory.py +46 -0
  33. dsgrid/config/date_time_dimension_config.py +136 -0
  34. dsgrid/config/dimension_config.py +54 -0
  35. dsgrid/config/dimension_config_factory.py +65 -0
  36. dsgrid/config/dimension_mapping_base.py +350 -0
  37. dsgrid/config/dimension_mappings_config.py +48 -0
  38. dsgrid/config/dimensions.py +1025 -0
  39. dsgrid/config/dimensions_config.py +71 -0
  40. dsgrid/config/file_schema.py +190 -0
  41. dsgrid/config/index_time_dimension_config.py +80 -0
  42. dsgrid/config/input_dataset_requirements.py +31 -0
  43. dsgrid/config/mapping_tables.py +209 -0
  44. dsgrid/config/noop_time_dimension_config.py +42 -0
  45. dsgrid/config/project_config.py +1462 -0
  46. dsgrid/config/registration_models.py +188 -0
  47. dsgrid/config/representative_period_time_dimension_config.py +194 -0
  48. dsgrid/config/simple_models.py +49 -0
  49. dsgrid/config/supplemental_dimension.py +29 -0
  50. dsgrid/config/time_dimension_base_config.py +192 -0
  51. dsgrid/data_models.py +155 -0
  52. dsgrid/dataset/__init__.py +0 -0
  53. dsgrid/dataset/dataset.py +123 -0
  54. dsgrid/dataset/dataset_expression_handler.py +86 -0
  55. dsgrid/dataset/dataset_mapping_manager.py +121 -0
  56. dsgrid/dataset/dataset_schema_handler_base.py +945 -0
  57. dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
  58. dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
  59. dsgrid/dataset/growth_rates.py +162 -0
  60. dsgrid/dataset/models.py +51 -0
  61. dsgrid/dataset/table_format_handler_base.py +257 -0
  62. dsgrid/dataset/table_format_handler_factory.py +17 -0
  63. dsgrid/dataset/unpivoted_table.py +121 -0
  64. dsgrid/dimension/__init__.py +0 -0
  65. dsgrid/dimension/base_models.py +230 -0
  66. dsgrid/dimension/dimension_filters.py +308 -0
  67. dsgrid/dimension/standard.py +252 -0
  68. dsgrid/dimension/time.py +352 -0
  69. dsgrid/dimension/time_utils.py +103 -0
  70. dsgrid/dsgrid_rc.py +88 -0
  71. dsgrid/exceptions.py +105 -0
  72. dsgrid/filesystem/__init__.py +0 -0
  73. dsgrid/filesystem/cloud_filesystem.py +32 -0
  74. dsgrid/filesystem/factory.py +32 -0
  75. dsgrid/filesystem/filesystem_interface.py +136 -0
  76. dsgrid/filesystem/local_filesystem.py +74 -0
  77. dsgrid/filesystem/s3_filesystem.py +118 -0
  78. dsgrid/loggers.py +132 -0
  79. dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
  80. dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
  81. dsgrid/notebooks/registration.ipynb +48 -0
  82. dsgrid/notebooks/start_notebook.sh +11 -0
  83. dsgrid/project.py +451 -0
  84. dsgrid/query/__init__.py +0 -0
  85. dsgrid/query/dataset_mapping_plan.py +142 -0
  86. dsgrid/query/derived_dataset.py +388 -0
  87. dsgrid/query/models.py +728 -0
  88. dsgrid/query/query_context.py +287 -0
  89. dsgrid/query/query_submitter.py +994 -0
  90. dsgrid/query/report_factory.py +19 -0
  91. dsgrid/query/report_peak_load.py +70 -0
  92. dsgrid/query/reports_base.py +20 -0
  93. dsgrid/registry/__init__.py +0 -0
  94. dsgrid/registry/bulk_register.py +165 -0
  95. dsgrid/registry/common.py +287 -0
  96. dsgrid/registry/config_update_checker_base.py +63 -0
  97. dsgrid/registry/data_store_factory.py +34 -0
  98. dsgrid/registry/data_store_interface.py +74 -0
  99. dsgrid/registry/dataset_config_generator.py +158 -0
  100. dsgrid/registry/dataset_registry_manager.py +950 -0
  101. dsgrid/registry/dataset_update_checker.py +16 -0
  102. dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
  103. dsgrid/registry/dimension_mapping_update_checker.py +16 -0
  104. dsgrid/registry/dimension_registry_manager.py +413 -0
  105. dsgrid/registry/dimension_update_checker.py +16 -0
  106. dsgrid/registry/duckdb_data_store.py +207 -0
  107. dsgrid/registry/filesystem_data_store.py +150 -0
  108. dsgrid/registry/filter_registry_manager.py +123 -0
  109. dsgrid/registry/project_config_generator.py +57 -0
  110. dsgrid/registry/project_registry_manager.py +1623 -0
  111. dsgrid/registry/project_update_checker.py +48 -0
  112. dsgrid/registry/registration_context.py +223 -0
  113. dsgrid/registry/registry_auto_updater.py +316 -0
  114. dsgrid/registry/registry_database.py +667 -0
  115. dsgrid/registry/registry_interface.py +446 -0
  116. dsgrid/registry/registry_manager.py +558 -0
  117. dsgrid/registry/registry_manager_base.py +367 -0
  118. dsgrid/registry/versioning.py +92 -0
  119. dsgrid/rust_ext/__init__.py +14 -0
  120. dsgrid/rust_ext/find_minimal_patterns.py +129 -0
  121. dsgrid/spark/__init__.py +0 -0
  122. dsgrid/spark/functions.py +589 -0
  123. dsgrid/spark/types.py +110 -0
  124. dsgrid/tests/__init__.py +0 -0
  125. dsgrid/tests/common.py +140 -0
  126. dsgrid/tests/make_us_data_registry.py +265 -0
  127. dsgrid/tests/register_derived_datasets.py +103 -0
  128. dsgrid/tests/utils.py +25 -0
  129. dsgrid/time/__init__.py +0 -0
  130. dsgrid/time/time_conversions.py +80 -0
  131. dsgrid/time/types.py +67 -0
  132. dsgrid/units/__init__.py +0 -0
  133. dsgrid/units/constants.py +113 -0
  134. dsgrid/units/convert.py +71 -0
  135. dsgrid/units/energy.py +145 -0
  136. dsgrid/units/power.py +87 -0
  137. dsgrid/utils/__init__.py +0 -0
  138. dsgrid/utils/dataset.py +830 -0
  139. dsgrid/utils/files.py +179 -0
  140. dsgrid/utils/filters.py +125 -0
  141. dsgrid/utils/id_remappings.py +100 -0
  142. dsgrid/utils/py_expression_eval/LICENSE +19 -0
  143. dsgrid/utils/py_expression_eval/README.md +8 -0
  144. dsgrid/utils/py_expression_eval/__init__.py +847 -0
  145. dsgrid/utils/py_expression_eval/tests.py +283 -0
  146. dsgrid/utils/run_command.py +70 -0
  147. dsgrid/utils/scratch_dir_context.py +65 -0
  148. dsgrid/utils/spark.py +918 -0
  149. dsgrid/utils/spark_partition.py +98 -0
  150. dsgrid/utils/timing.py +239 -0
  151. dsgrid/utils/utilities.py +221 -0
  152. dsgrid/utils/versioning.py +36 -0
  153. dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
  154. dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
  155. dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
  156. dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
  157. dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
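
The largest new module, dsgrid/registry/project_registry_manager.py (entry 110, +1623 lines), is shown below. For orientation, here is a minimal sketch of how that manager is typically reached; the RegistryManager.load entry point and its project_manager property are assumptions based on the module names in this diff, not APIs the diff itself documents:

    # Hypothetical usage sketch; RegistryManager.load and .project_manager are
    # assumed from module names in this diff, not confirmed by it.
    from pathlib import Path

    from dsgrid.registry.registry_manager import RegistryManager

    # Connect to a local registry and reach the ProjectRegistryManager shown below.
    manager = RegistryManager.load(Path("~/.dsgrid-registry").expanduser())
    project_mgr = manager.project_manager
    project = project_mgr.load_project("my_project_id")  # signature shown in the hunk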
dsgrid/registry/project_registry_manager.py
@@ -0,0 +1,1623 @@
+ """Manages the registry for dimension projects"""
+ 
+ import logging
+ import os
+ import tempfile
+ from collections import defaultdict
+ from pathlib import Path
+ from tempfile import TemporaryDirectory
+ from typing import Any, Type, Union
+ 
+ from dsgrid.utils.dataset import handle_dimension_association_errors
+ import json5
+ import pandas as pd
+ from prettytable import PrettyTable
+ from sqlalchemy import Connection
+ 
+ from dsgrid.config.dimension_config import (
+     DimensionBaseConfig,
+     DimensionBaseConfigWithFiles,
+ )
+ from dsgrid.dimension.base_models import DimensionType
+ from dsgrid.exceptions import (
+     DSGInvalidDataset,
+     DSGInvalidDimension,
+     DSGInvalidDimensionMapping,
+     DSGValueNotRegistered,
+     DSGDuplicateValueRegistered,
+     DSGInvalidParameter,
+ )
+ from dsgrid.config.dataset_schema_handler_factory import make_dataset_schema_handler
+ from dsgrid.config.dataset_config import DatasetConfig
+ from dsgrid.config.dimensions import DimensionModel
+ from dsgrid.config.dimensions_config import DimensionsConfig, DimensionsConfigModel
+ from dsgrid.config.dimension_mapping_base import (
+     DimensionReferenceModel,
+     DimensionMappingReferenceModel,
+     DimensionMappingReferenceListModel,
+     DimensionMappingType,
+ )
+ from dsgrid.config.dimension_mappings_config import (
+     DimensionMappingsConfig,
+     DimensionMappingsConfigModel,
+ )
+ from dsgrid.config.supplemental_dimension import (
+     SupplementalDimensionModel,
+     SupplementalDimensionsListModel,
+ )
+ from dsgrid.config.input_dataset_requirements import (
+     InputDatasetDimensionRequirementsListModel,
+     InputDatasetListModel,
+ )
+ from dsgrid.config.mapping_tables import (
+     MappingTableModel,
+     MappingTableByNameModel,
+     DatasetBaseToProjectMappingTableListModel,
+ )
+ from dsgrid.config.project_config import (
+     DatasetBaseDimensionNamesModel,
+     ProjectConfig,
+     ProjectConfigModel,
+     RequiredBaseDimensionModel,
+     RequiredDimensionRecordsByTypeModel,
+     RequiredDimensionRecordsModel,
+     SubsetDimensionGroupModel,
+     SubsetDimensionGroupListModel,
+ )
+ from dsgrid.project import Project
+ from dsgrid.registry.common import (
+     ConfigKey,
+     DatasetRegistryStatus,
+     ProjectRegistryStatus,
+     RegistryManagerParams,
+ )
+ from dsgrid.spark.functions import (
+     cache,
+     except_all,
+     is_dataframe_empty,
+     unpersist,
+ )
+ from dsgrid.spark.types import (
+     DataFrame,
+     F,
+     use_duckdb,
+ )
+ from dsgrid.utils.timing import track_timing, timer_stats_collector
+ from dsgrid.utils.files import load_data, in_other_dir
+ from dsgrid.utils.filters import transform_and_validate_filters, matches_filters
+ from dsgrid.utils.scratch_dir_context import ScratchDirContext
+ from dsgrid.utils.spark import (
+     models_to_dataframe,
+     get_unique_values,
+     persist_table,
+     read_dataframe,
+ )
+ from dsgrid.utils.utilities import check_uniqueness, display_table
+ from dsgrid.registry.registry_interface import ProjectRegistryInterface
+ from .common import (
+     VersionUpdateType,
+     RegistryType,
+ )
+ from .registration_context import RegistrationContext
+ from .project_update_checker import ProjectUpdateChecker
+ from .dataset_registry_manager import DatasetRegistryManager
+ from .dimension_mapping_registry_manager import DimensionMappingRegistryManager
+ from .dimension_registry_manager import DimensionRegistryManager
+ from .registry_manager_base import RegistryManagerBase
+ 
+ 
+ logger = logging.getLogger(__name__)
+ 
+ 
+ class ProjectRegistryManager(RegistryManagerBase):
+     """Manages registered dimension projects."""
+ 
+     def __init__(
+         self,
+         path: Path,
+         params,
+         dataset_manager: DatasetRegistryManager,
+         dimension_manager: DimensionRegistryManager,
+         dimension_mapping_manager: DimensionMappingRegistryManager,
+         db: ProjectRegistryInterface,
+     ):
+         super().__init__(path, params)
+         self._projects: dict[ConfigKey, ProjectConfig] = {}
+         self._dataset_mgr = dataset_manager
+         self._dimension_mgr = dimension_manager
+         self._dimension_mapping_mgr = dimension_mapping_manager
+         self._db = db
+ 
+     @classmethod
+     def load(
+         cls,
+         path: Path,
+         params: RegistryManagerParams,
+         dataset_manager: DatasetRegistryManager,
+         dimension_manager: DimensionRegistryManager,
+         dimension_mapping_manager: DimensionMappingRegistryManager,
+         db: ProjectRegistryInterface,
+     ):
+         return cls._load(
+             path,
+             params,
+             dataset_manager,
+             dimension_manager,
+             dimension_mapping_manager,
+             db,
+         )
+ 
+     @staticmethod
+     def config_class() -> Type:
+         return ProjectConfig
+ 
+     @property
+     def db(self) -> ProjectRegistryInterface:
+         return self._db
+ 
+     @db.setter
+     def db(self, db: ProjectRegistryInterface):
+         self._db = db
+ 
+     @staticmethod
+     def name() -> str:
+         return "Projects"
+ 
+     @property
+     def dataset_manager(self) -> DatasetRegistryManager:
+         return self._dataset_mgr
+ 
+     @property
+     def dimension_manager(self) -> DimensionRegistryManager:
+         return self._dimension_mgr
+ 
+     @property
+     def dimension_mapping_manager(self) -> DimensionMappingRegistryManager:
+         return self._dimension_mapping_mgr
+ 
+     def get_by_id(
+         self,
+         project_id: str,
+         version: str | None = None,
+         conn: Connection | None = None,
+     ) -> ProjectConfig:
+         if version is None:
+             assert self._db is not None
+             version = self._db.get_latest_version(conn, project_id)
+ 
+         key = ConfigKey(project_id, version)
+         project = self._projects.get(key)
+         if project is not None:
+             return project
+ 
+         if version is None:
+             model = self.db.get_latest(conn, project_id)
+         else:
+             model = self.db.get_by_version(conn, project_id, version)
+ 
+         assert isinstance(model, ProjectConfigModel)
+         config = ProjectConfig(model)
+         self._update_dimensions_and_mappings(conn, config)
+         self._projects[key] = config
+         return config
+ 
+     def _update_dimensions_and_mappings(self, conn: Connection | None, config: ProjectConfig):
+         base_dimensions = self._dimension_mgr.load_dimensions(
+             config.model.dimensions.base_dimension_references, conn=conn
+         )
+         supplemental_dimensions = self._dimension_mgr.load_dimensions(
+             config.model.dimensions.supplemental_dimension_references, conn=conn
+         )
+         base_to_supp_mappings = self._dimension_mapping_mgr.load_dimension_mappings(
+             config.model.dimension_mappings.base_to_supplemental_references, conn=conn
+         )
+         subset_dimensions = self._get_subset_dimensions(conn, config)
+         config.set_dimensions(base_dimensions, subset_dimensions, supplemental_dimensions)
+         config.set_dimension_mappings(base_to_supp_mappings)
+ 
+     def _get_subset_dimensions(self, conn: Connection | None, config: ProjectConfig):
+         subset_dimensions: dict[
+             DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfig]]
+         ] = defaultdict(dict)
+         for subset_dim in config.model.dimensions.subset_dimensions:
+             selectors = {
+                 ConfigKey(x.dimension_id, x.version): self._dimension_mgr.get_by_id(
+                     x.dimension_id, version=x.version, conn=conn
+                 )
+                 for x in subset_dim.selector_references
+             }
+             subset_dimensions[subset_dim.dimension_type][subset_dim.name] = selectors
+         return subset_dimensions
+ 
+     def load_project(
+         self,
+         project_id: str,
+         version: str | None = None,
+         conn: Connection | None = None,
+     ) -> Project:
+         """Load a project from the registry.
+ 
+         Parameters
+         ----------
+         project_id : str
+         version : str
+ 
+         Returns
+         -------
+         Project
+         """
+         if conn is None:
+             with self.db.engine.connect() as conn:
+                 return self._load_project(conn, project_id, version=version)
+         else:
+             return self._load_project(conn, project_id, version=version)
+ 
+     def _load_project(self, conn: Connection, project_id: str, version=None) -> Project:
+         dataset_manager = self._dataset_mgr
+         config = self.get_by_id(project_id, version=version, conn=conn)
+ 
+         dataset_configs = {}
+         for dataset_id in config.list_registered_dataset_ids():
+             dataset_config = dataset_manager.get_by_id(dataset_id, conn=conn)
+             dataset_configs[dataset_id] = dataset_config
+ 
+         return Project(
+             config,
+             config.model.version,
+             dataset_configs,
+             self._dimension_mgr,
+             self._dimension_mapping_mgr,
+             self._dataset_mgr,
+         )
+ 
+     def register(
+         self,
+         config_file: Path,
+         submitter: str,
+         log_message: str,
+     ) -> None:
+         """Register a project from a config file."""
+         with RegistrationContext(
+             self.db, log_message, VersionUpdateType.MAJOR, submitter
+         ) as context:
+             config = ProjectConfig.load(config_file)
+             src_dir = config_file.parent
+             self.register_from_config(config, src_dir, context)
+ 
+     def register_from_config(
+         self,
+         config: ProjectConfig,
+         src_dir: Path,
+         context: RegistrationContext,
+     ):
+         """Register a project from an existing config."""
+         self._register_project_and_dimensions(
+             config,
+             src_dir,
+             context,
+         )
+ 
+     def _register_project_and_dimensions(
+         self,
+         config: ProjectConfig,
+         src_dir: Path,
+         context: RegistrationContext,
+     ):
+         model = config.model
+         logger.info("Start registration of project %s", model.project_id)
+         self._check_if_already_registered(context.connection, model.project_id)
+         if model.dimensions.base_dimensions:
+             logger.info("Register base dimensions")
+             for ref in self._register_dimensions_from_models(
+                 model.dimensions.base_dimensions,
+                 context,
+             ):
+                 model.dimensions.base_dimension_references.append(ref)
+             model.dimensions.base_dimensions.clear()
+         if model.dimensions.subset_dimensions:
+             self._register_subset_dimensions(
+                 model,
+                 model.dimensions.subset_dimensions,
+                 context,
+             )
+         if model.dimensions.supplemental_dimensions:
+             logger.info("Register supplemental dimensions")
+             self._register_supplemental_dimensions_from_models(
+                 src_dir,
+                 model,
+                 model.dimensions.supplemental_dimensions,
+                 context,
+             )
+             model.dimensions.supplemental_dimensions.clear()
+         logger.info("Register all-in-one supplemental dimensions")
+         self._register_all_in_one_dimensions(
+             src_dir,
+             model,
+             context,
+         )
+ 
+         self._update_dimensions_and_mappings(context.connection, config)
+         for subset_dimension in model.dimensions.subset_dimensions:
+             subset_dimension.selectors.clear()
+         self._register(config, context)
+         context.add_id(RegistryType.PROJECT, config.model.project_id, self)
+ 
+     def _register_dimensions_from_models(
+         self,
+         dimensions: list,
+         context: RegistrationContext,
+     ):
+         dim_model = DimensionsConfigModel(dimensions=dimensions)
+         dims_config = DimensionsConfig.load_from_model(dim_model)
+         dimension_ids = self._dimension_mgr.register_from_config(dims_config, context)
+         return self._dimension_mgr.make_dimension_references(context.connection, dimension_ids)
+ 
+     def _register_supplemental_dimensions_from_models(
+         self,
+         src_dir: Path,
+         model: ProjectConfigModel,
+         dimensions: list,
+         context: RegistrationContext,
+     ):
+         """Registers supplemental dimensions and creates base-to-supplemental mappings for those
+         new dimensions.
+         """
+         dims = []
+         for x in dimensions:
+             data = x.serialize()
+             data.pop("mapping", None)
+             dims.append(DimensionModel(**data))
+ 
+         refs = self._register_dimensions_from_models(dims, context)
+ 
+         model.dimensions.supplemental_dimension_references += refs
+         self._register_base_to_supplemental_mappings(
+             src_dir,
+             model,
+             dimensions,
+             refs,
+             context,
+         )
+ 
+     def _register_base_to_supplemental_mappings(
+         self,
+         src_dir: Path,
+         model: ProjectConfigModel,
+         dimensions: list[SupplementalDimensionModel],
+         dimension_references: list[DimensionReferenceModel],
+         context: RegistrationContext,
+     ):
+         conn = context.connection
+         base_dim_mapping = defaultdict(list)
+         base_dim_refs: dict[str, DimensionReferenceModel] = {}
+         for ref in model.dimensions.base_dimension_references:
+             dim = self._dimension_mgr.get_by_id(
+                 ref.dimension_id, version=ref.version, conn=context.connection
+             )
+             base_dim_mapping[ref.dimension_type].append(dim)
+             base_dim_refs[dim.model.dimension_id] = ref
+ 
+         mappings = []
+         if len(dimensions) != len(dimension_references):
+             msg = f"Bug: mismatch in sizes: {dimensions=} {dimension_references=}"
+             raise Exception(msg)
+ 
+         for dim, ref in zip(dimensions, dimension_references):
+             base_dim: DimensionBaseConfig | None = None
+             if dim.mapping.project_base_dimension_name is None:
+                 base_dims = base_dim_mapping[ref.dimension_type]
+                 if len(base_dims) > 1:
+                     msg = (
+                         "If there are multiple base dimensions for a dimension type, each "
+                         "supplemental dimension mapping must supply a project_base_dimension_name. "
+                         f"{dim.label}"
+                     )
+                     raise DSGInvalidDimensionMapping(msg)
+                 base_dim = base_dims[0]
+             else:
+                 for base_dim_ in base_dim_mapping[dim.dimension_type]:
+                     if base_dim_.model.name == dim.mapping.project_base_dimension_name:
+                         if base_dim is not None:
+                             msg = (
+                                 "A supplemental dimension can only be mapped to one base dimension:"
+                                 f" supplemental dimension = {dim.label} "
+                                 f"base dimensions = {base_dim.model.label} and "
+                                 f"{base_dim_.model.label}"
+                             )
+                             raise DSGInvalidDimensionMapping(msg)
+                         base_dim = base_dim_
+                 if base_dim is None:
+                     msg = f"Bug: unable to find base dimension for {dim.mapping.project_base_dimension_name}"
+                     raise Exception(msg)
+             with in_other_dir(src_dir):
+                 assert base_dim is not None
+                 mapping_model = MappingTableModel.from_pre_registered_model(
+                     dim.mapping,
+                     base_dim_refs[base_dim.model.dimension_id],
+                     ref,
+                 )
+             mappings.append(mapping_model)
+ 
+         mapping_config = DimensionMappingsConfig.load_from_model(
+             DimensionMappingsConfigModel(mappings=mappings),
+         )
+         mapping_ids = self._dimension_mapping_mgr.register_from_config(mapping_config, context)
+         model.dimension_mappings.base_to_supplemental_references += (
+             self._dimension_mapping_mgr.make_dimension_mapping_references(mapping_ids, conn=conn)
+         )
+ 
+     def _register_subset_dimensions(
+         self,
+         model: ProjectConfigModel,
+         subset_dimensions: list[SubsetDimensionGroupModel],
+         context: RegistrationContext,
+     ):
+         logger.info("Register subset dimensions")
+         self._register_dimensions_from_subset_dimension_groups(
+             subset_dimensions,
+             model.dimensions.base_dimension_references,
+             context,
+         )
+         self._register_supplemental_dimensions_from_subset_dimensions(
+             model,
+             subset_dimensions,
+             context,
+         )
+ 
+     def _register_dimensions_from_subset_dimension_groups(
+         self,
+         subset_dimensions: list[SubsetDimensionGroupModel],
+         base_dimension_references: list[DimensionReferenceModel],
+         context: RegistrationContext,
+     ):
+         """Registers a dimension for each subset specified in the project config's subset
+         dimension groups. Appends references to those dimensions to subset_dimensions, which is
+         part of the project config.
+         """
+         conn = context.connection
+         with TemporaryDirectory() as tmpdir:
+             tmp_path = Path(tmpdir)
+             dimensions = []
+             subset_refs = {}
+             for subset_dimension in subset_dimensions:
+                 base_dim = None
+                 for ref in base_dimension_references:
+                     if ref.dimension_type == subset_dimension.dimension_type:
+                         base_dim = self._dimension_mgr.get_by_id(ref.dimension_id, conn=conn)
+                         break
+                 assert isinstance(base_dim, DimensionBaseConfigWithFiles), subset_dimension
+                 base_records = base_dim.get_records_dataframe()
+                 self._check_subset_dimension_consistency(subset_dimension, base_records)
+                 for selector in subset_dimension.selectors:
+                     new_records = base_records.filter(base_records["id"].isin(selector.records))
+                     filename = tmp_path / f"{subset_dimension.name}_{selector.name}.csv"
+                     new_records.toPandas().to_csv(filename, index=False)
+                     dim = DimensionModel(
+                         file=str(filename),
+                         name=selector.name,
+                         type=subset_dimension.dimension_type,
+                         module=base_dim.model.module,
+                         class_name=base_dim.model.class_name,
+                         description=selector.description,
+                     )
+                     dimensions.append(dim)
+                     key = (subset_dimension.dimension_type, selector.name)
+                     if key in subset_refs:
+                         msg = f"Bug: unhandled case of duplicate dimension name: {key=}"
+                         raise Exception(msg)
+                     subset_refs[key] = subset_dimension
+ 
+             dim_model = DimensionsConfigModel(dimensions=dimensions)
+             dims_config = DimensionsConfig.load_from_model(dim_model)
+             dimension_ids = self._dimension_mgr.register_from_config(dims_config, context)
+             for dimension_id in dimension_ids:
+                 dim = self._dimension_mgr.get_by_id(dimension_id, conn=conn)
+                 key = (dim.model.dimension_type, dim.model.name)
+                 subset_dim = subset_refs[key]
+                 subset_dim.selector_references.append(
+                     DimensionReferenceModel(
+                         dimension_id=dimension_id,
+                         type=subset_dim.dimension_type,
+                         version="1.0.0",
+                     )
+                 )
+ 
+     def _check_subset_dimension_consistency(
+         self,
+         subset_dimension: SubsetDimensionGroupModel,
+         base_records: DataFrame,
+     ) -> None:
+         base_record_ids = get_unique_values(base_records, "id")
+         diff = subset_dimension.record_ids.difference(base_record_ids)
+         if diff:
+             msg = (
+                 f"subset dimension {subset_dimension.name} "
+                 f"uses dimension records not present in the base dimension: {diff}"
+             )
+             raise DSGInvalidParameter(msg)
+ 
+         diff = base_record_ids.difference(subset_dimension.record_ids)
+         if diff:
+             msg = (
+                 f"subset dimension {subset_dimension.name} "
+                 f"does not list these base dimension records: {diff}"
+             )
+             raise DSGInvalidParameter(msg)
+ 
+     def _register_supplemental_dimensions_from_subset_dimensions(
+         self,
+         model: ProjectConfigModel,
+         subset_dimensions: list[SubsetDimensionGroupModel],
+         context: RegistrationContext,
+     ):
+         """Registers a supplemental dimension for each subset specified in the project config's
+         subset dimension groups. Also registers a mapping from the base dimension to each new
+         supplemental dimension. Appends references to those dimensions to the project config's
+         supplemental_dimension_references list.
+         """
+         conn = context.connection
+         with TemporaryDirectory() as tmpdir:
+             tmp_path = Path(tmpdir)
+             dimensions = []
+             for subset_dimension_group in subset_dimensions:
+                 if not subset_dimension_group.create_supplemental_dimension:
+                     continue
+                 dimension_type = subset_dimension_group.dimension_type
+                 base_dims: list[DimensionBaseConfigWithFiles] = []
+                 for ref in model.dimensions.base_dimension_references:
+                     if ref.dimension_type == dimension_type:
+                         base_dim = self._dimension_mgr.get_by_id(ref.dimension_id, conn=conn)
+                         if (
+                             subset_dimension_group.base_dimension_name is None
+                             or base_dim.model.name == subset_dimension_group.base_dimension_name
+                         ):
+                             base_dims.append(base_dim)
+                             break
+                 if len(base_dims) == 0:
+                     msg = f"Did not find a base dimension for {subset_dimension_group=}"
+                     raise Exception(msg)
+                 elif len(base_dims) > 1:
+                     msg = (
+                         f"Found multiple base dimensions for {dimension_type=}. Please specify "
+                         f"'base_dimension_name' in {subset_dimension_group=}"
+                     )
+                     raise DSGInvalidParameter(msg)
+                 base_dim = base_dims[0]
+                 records: dict[str, list[Any]] = {"id": [], "name": []}
+                 mapping_records = []
+                 dim_record_ids = set()
+                 # The pydantic validator has already checked consistency of these columns.
+                 for column in subset_dimension_group.selectors[0].column_values:
+                     records[column] = []
+                 for selector in subset_dimension_group.selectors:
+                     records["id"].append(selector.name)
+                     records["name"].append(selector.name)
+                     if selector.column_values:
+                         for column, value in selector.column_values.items():
+                             records[column].append(value)
+                     for record_id in selector.records:
+                         mapping_records.append({"from_id": record_id, "to_id": selector.name})
+                         dim_record_ids.add(record_id)
+ 
+                 filename = tmp_path / f"{subset_dimension_group.name}.csv"
+                 pd.DataFrame(records).to_csv(filename, index=False)
+ 
+                 for record_id in base_dim.get_unique_ids().difference(dim_record_ids):
+                     mapping_records.append({"from_id": record_id, "to_id": ""})
+                 map_record_file = tmp_path / f"{subset_dimension_group.name}_mapping.csv"
+                 pd.DataFrame.from_records(mapping_records).to_csv(map_record_file, index=False)
+ 
+                 dim = SupplementalDimensionModel(
+                     file=str(filename),
+                     name=subset_dimension_group.name,
+                     type=dimension_type,
+                     module=base_dim.model.module,
+                     class_name=base_dim.model.class_name,
+                     description=subset_dimension_group.description,
+                     mapping=MappingTableByNameModel(
+                         file=str(map_record_file),
+                         mapping_type=DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS,
+                         description=f"Aggregation map for {subset_dimension_group.name}",
+                         project_base_dimension_name=base_dim.model.name,
+                     ),
+                 )
+                 dimensions.append(dim)
+ 
+             self._register_supplemental_dimensions_from_models(
+                 tmp_path,
+                 model,
+                 dimensions,
+                 context,
+             )
+ 
+     def _register_all_in_one_dimensions(
+         self,
+         src_dir,
+         model,
+         context: RegistrationContext,
+     ):
+         with TemporaryDirectory() as tmpdir:
+             tmp_path = Path(tmpdir)
+             new_dimensions = []
+             dim_type_to_ref = {
+                 x.dimension_type: x for x in model.dimensions.base_dimension_references
+             }
+             # Metric is excluded because fuel_id and unit may not be the same for all records.
+             # Time doesn't have records.
+             exclude = {DimensionType.METRIC, DimensionType.TIME}
+             for dimension_type in (x for x in DimensionType if x not in exclude):
+                 dim_ref = dim_type_to_ref[dimension_type]
+                 dim_config = self._dimension_mgr.get_by_id(
+                     dim_ref.dimension_id, conn=context.connection
+                 )
+                 assert isinstance(dim_config, DimensionBaseConfigWithFiles)
+                 dt_str = dimension_type.value
+                 if dt_str.endswith("y"):
+                     dt_plural = dt_str[:-1] + "ies"
+                 else:
+                     dt_plural = dt_str + "s"
+                 dt_all_plural = f"all_{dt_plural}"
+                 dim_name = f"all_{model.project_id}_{dt_plural}"
+                 dim_name_formal = f"All {dt_plural.title()}"
+                 dim_record_file = tmp_path / f"{dt_all_plural}.csv"
+                 dim_text = f"id,name\n{dt_all_plural},{dim_name_formal}\n"
+                 dim_record_file.write_text(dim_text)
+                 map_record_file = tmp_path / f"lookup_{dt_str}_to_{dt_all_plural}.csv"
+                 with open(map_record_file, "w") as f_out:
+                     f_out.write("from_id,to_id\n")
+                     for record in dim_config.get_unique_ids():
+                         f_out.write(record)
+                         f_out.write(",")
+                         f_out.write(dt_all_plural)
+                         f_out.write("\n")
+ 
+                 with in_other_dir(src_dir):
+                     new_dim = SupplementalDimensionModel(
+                         file=str(dim_record_file),
+                         name=dim_name,
+                         type=dimension_type,
+                         module="dsgrid.dimension.base_models",
+                         class_name="DimensionRecordBaseModel",
+                         description=dim_name_formal,
+                         mapping=MappingTableByNameModel(
+                             file=str(map_record_file),
+                             mapping_type=DimensionMappingType.MANY_TO_ONE_AGGREGATION,
+                             description=f"Aggregation map for all {dt_str}s",
+                         ),
+                     )
+                 new_dimensions.append(new_dim)
+ 
+             self._register_supplemental_dimensions_from_models(
+                 src_dir,
+                 model,
+                 new_dimensions,
+                 context,
+             )
+ 
+     def _register(self, config: ProjectConfig, context: RegistrationContext):
+         self._run_checks(config)
+ 
+         config.model.version = "1.0.0"
+         model = self.db.insert(context.connection, config.model, context.registration)
+         assert isinstance(model, ProjectConfigModel)
+         logger.info(
+             "%s Registered project %s with version=%s",
+             self._log_offline_mode_prefix(),
+             model.project_id,
+             config.model.version,
+         )
+ 
+     def _run_checks(self, config: ProjectConfig):
+         dims = [x for x in config.iter_dimensions()]
+         check_uniqueness((x.model.name for x in dims), "dimension name")
+         self._check_base_dimensions(config)
+ 
+         for dataset_id in config.list_unregistered_dataset_ids():
+             for field in RequiredDimensionRecordsModel.model_fields:
+                 # This will check that all dimension record IDs listed in the requirements
+                 # exist in the project.
+                 config.get_required_dimension_record_ids(dataset_id, DimensionType(field))
+ 
+     def _check_base_dimensions(self, config: ProjectConfig) -> None:
+         found_time = False
+         for dim in config.list_base_dimensions():
+             if dim.model.dimension_type == DimensionType.TIME:
+                 if found_time:
+                     msg = "Only one time dimension is allowed in a project."
+                     raise DSGInvalidDimension(msg)
+                 found_time = True
+ 
+         assert found_time
+         self._set_dataset_record_requirement_definitions_names(config)
+         self._check_dataset_record_requirement_definitions(config)
+ 
+     def _set_dataset_record_requirement_definitions_names(
+         self,
+         config: ProjectConfig,
+     ) -> None:
+         def set_dimension_name(req: RequiredBaseDimensionModel) -> None:
+             if req.dimension_name is None and req.dimension_name is not None:
+                 dim = config.get_dimension_by_name(req.dimension_name)
+                 req.dimension_name = None
+                 req.dimension_name = dim.model.name
+ 
+         for dataset in config.model.datasets:
+             dim_type_as_fields = RequiredDimensionRecordsModel.model_fields.keys()
+             for field in dim_type_as_fields:
+                 req = getattr(dataset.required_dimensions.single_dimensional, field)
+                 for base_field in ("base", "base_missing"):
+                     set_dimension_name(getattr(req, base_field))
+                 for multi_dim in dataset.required_dimensions.multi_dimensional:
+                     req = getattr(multi_dim, field)
+                     for base_field in ("base", "base_missing"):
+                         set_dimension_name(getattr(req, base_field))
+ 
+     def _check_dataset_record_requirement_definitions(
+         self,
+         config: ProjectConfig,
+     ) -> None:
+         for dataset in config.model.datasets:
+             dim_type_as_fields = RequiredDimensionRecordsModel.model_fields.keys()
+             for dim_type_as_field in dim_type_as_fields:
+                 dim_type = DimensionType(dim_type_as_field)
+                 required_dimension_records = getattr(
+                     dataset.required_dimensions.single_dimensional, dim_type_as_field
+                 )
+                 self._check_base_dimension_record_requirements(
+                     required_dimension_records, dim_type, config, dataset.dataset_id
+                 )
+                 for multi_dim in dataset.required_dimensions.multi_dimensional:
+                     required_dimension_records = getattr(multi_dim, dim_type_as_field)
+                     self._check_base_dimension_record_requirements(
+                         required_dimension_records, dim_type, config, dataset.dataset_id
+                     )
+ 
+     def _check_base_dimension_record_requirements(
+         self,
+         req_dim_records: RequiredDimensionRecordsByTypeModel,
+         dim_type: DimensionType,
+         config: ProjectConfig,
+         dataset_id: str,
+     ) -> None:
+         base_dims = config.list_base_dimensions(dimension_type=dim_type)
+         for base_field in ("base", "base_missing"):
+             reqs = getattr(req_dim_records, base_field)
+             if reqs.record_ids and reqs.dimension_name is None:
+                 if len(base_dims) == 1:
+                     reqs.dimension_name = base_dims[0].model.name
+                     logger.debug(
+                         "Assigned dimension_name=%s for %s dataset_id=%s",
+                         reqs.dimension_name,
+                         dim_type,
+                         dataset_id,
+                     )
+                 else:
+                     msg = (
+                         f"{dataset_id=} requires a base dimension name for "
+                         f"{dim_type} because the project has {len(base_dims)} base dimensions."
+                     )
+                     raise DSGInvalidDimensionMapping(msg)
+                 # Only one of base and base_missing can be set, and that was already checked.
+                 break
+ 
+     @track_timing(timer_stats_collector)
+     def register_and_submit_dataset(
+         self,
+         dataset_config_file: Path,
+         project_id: str,
+         submitter: str,
+         log_message: str,
+         dimension_mapping_file=None,
+         dimension_mapping_references_file=None,
+         autogen_reverse_supplemental_mappings=None,
+         data_base_dir: Path | None = None,
+         missing_associations_base_dir: Path | None = None,
+     ):
+         with RegistrationContext(
+             self.db, log_message, VersionUpdateType.MINOR, submitter
+         ) as context:
+             conn = context.connection
+             if not self.has_id(project_id, conn=conn):
+                 msg = f"{project_id=}"
+                 raise DSGValueNotRegistered(msg)
+ 
+             dataset_config = DatasetConfig.load_from_user_path(
+                 dataset_config_file,
+                 data_base_dir=data_base_dir,
+                 missing_associations_base_dir=missing_associations_base_dir,
+             )
+             dataset_id = dataset_config.model.dataset_id
+             config = self.get_by_id(project_id, conn=conn)
+             # This will raise an exception if the dataset_id is not part of the project or already
+             # registered.
+             self._raise_if_not_unregistered(config, dataset_id)
+ 
+             self._dataset_mgr.register(
+                 dataset_config_file,
+                 context=context,
+                 data_base_dir=data_base_dir,
+                 missing_associations_base_dir=missing_associations_base_dir,
+             )
+             self.submit_dataset(
+                 project_id,
+                 context.get_ids(RegistryType.DATASET)[0],
+                 dimension_mapping_file=dimension_mapping_file,
+                 dimension_mapping_references_file=dimension_mapping_references_file,
+                 autogen_reverse_supplemental_mappings=autogen_reverse_supplemental_mappings,
+                 context=context,
+             )
+ 
+     @track_timing(timer_stats_collector)
+     def submit_dataset(
+         self,
+         project_id: str,
+         dataset_id: str,
+         submitter: str | None = None,
+         log_message: str | None = None,
+         dimension_mapping_file: Path | None = None,
+         dimension_mapping_references_file: Path | None = None,
+         autogen_reverse_supplemental_mappings: list[DimensionType] | None = None,
+         context: RegistrationContext | None = None,
+     ):
+         """Registers a dataset with a project. This can only be performed on the
+         latest version of the project.
+ 
+         Parameters
+         ----------
+         project_id : str
+         dataset_id : str
+         dimension_mapping_file : Path or None
+             Base-to-base dimension mapping file
+         dimension_mapping_references_file : Path or None
+         autogen_reverse_supplemental_mappings : list[DimensionType] or None
+ autogen_reverse_supplemental_mappings : list[DimensionType] or None
873
+ Dimensions on which to attempt create reverse mappings from supplemental dimensions.
874
+ submitter : str
875
+ Submitter name
876
+ log_message : str
877
+ context : None or RegistrationContext
878
+
879
+ Raises
880
+ ------
881
+ DSGValueNotRegistered
882
+ Raised if the project_id or dataset_id is not registered.
883
+ DSGDuplicateValueRegistered
884
+ Raised if the dataset is already registered with the project.
885
+ ValueError
886
+ Raised if the project does not contain this dataset.
887
+
888
+ """
889
+ if context is None:
890
+ assert submitter is not None
891
+ assert log_message is not None
892
+ with RegistrationContext(
893
+ self.db, log_message, VersionUpdateType.MINOR, submitter
894
+ ) as context:
895
+ config = self.get_by_id(project_id, conn=context.connection)
896
+ self._submit_dataset_and_register_mappings(
897
+ config,
898
+ dataset_id,
899
+ dimension_mapping_file,
900
+ dimension_mapping_references_file,
901
+ autogen_reverse_supplemental_mappings,
902
+ context,
903
+ )
904
+ else:
905
+ config = self.get_by_id(project_id, conn=context.connection)
906
+ self._submit_dataset_and_register_mappings(
907
+ config,
908
+ dataset_id,
909
+ dimension_mapping_file,
910
+ dimension_mapping_references_file,
911
+ autogen_reverse_supplemental_mappings,
912
+ context,
913
+ )
914
+
915
+ def register_subset_dimensions(
916
+ self,
917
+ project_id: str,
918
+ filename: Path,
919
+ submitter: str,
920
+ log_message: str,
921
+ update_type: VersionUpdateType,
922
+ ):
923
+ """Register new subset dimensions."""
924
+ with RegistrationContext(self.db, log_message, update_type, submitter) as context:
925
+ config = self.get_by_id(project_id, conn=context.connection)
926
+ subset_model = SubsetDimensionGroupListModel.from_file(filename)
927
+ self._register_subset_dimensions(
928
+ config.model,
929
+ subset_model.subset_dimensions,
930
+ context,
931
+ )
932
+ self._make_new_config(config, context)
933
+
934
+ def register_supplemental_dimensions(
935
+ self,
936
+ project_id: str,
937
+ filename: Path,
938
+ submitter: str,
939
+ log_message: str,
940
+ update_type: VersionUpdateType,
941
+ ):
942
+ """Register new supplemental dimensions."""
943
+ with RegistrationContext(self.db, log_message, update_type, submitter) as context:
944
+ config = self.get_by_id(project_id, conn=context.connection)
945
+ model = SupplementalDimensionsListModel.from_file(filename)
946
+ self._register_supplemental_dimensions_from_models(
947
+ filename.parent,
948
+ config.model,
949
+ model.supplemental_dimensions,
950
+ context,
951
+ )
952
+ self._make_new_config(config, context)
953
+
954
+ def add_dataset_requirements(
955
+ self,
956
+ project_id: str,
957
+ filename: Path,
958
+ submitter: str,
959
+ log_message: str,
960
+ update_type: VersionUpdateType,
961
+ ):
962
+ """Add requirements for one or more datasets to the project."""
963
+ with RegistrationContext(self.db, log_message, update_type, submitter) as context:
964
+ config = self.get_by_id(project_id, conn=context.connection)
965
+ model = InputDatasetListModel.from_file(filename)
966
+ existing_ids = {x.dataset_id for x in config.model.datasets}
967
+ for dataset in model.datasets:
968
+ if dataset.dataset_id in existing_ids:
969
+ msg = f"{dataset.dataset_id} is already stored in the project"
970
+ raise DSGInvalidParameter(msg)
971
+ if dataset.status != DatasetRegistryStatus.UNREGISTERED:
972
+ msg = f"New dataset {dataset.dataset_id} status must be unregistered: {dataset.status}"
973
+ raise DSGInvalidParameter(msg)
974
+
975
+ config.model.datasets += model.datasets
976
+ self._make_new_config(config, context)
977
+
978
+ def replace_dataset_dimension_requirements(
979
+ self,
980
+ project_id: str,
981
+ filename: Path,
982
+ submitter: str,
983
+ log_message: str,
984
+ update_type: VersionUpdateType,
985
+ ):
986
+ """Replace dataset requirements in a project."""
987
+ with RegistrationContext(self.db, log_message, update_type, submitter) as context:
988
+ config = self.get_by_id(project_id, conn=context.connection)
989
+ model = InputDatasetDimensionRequirementsListModel.from_file(filename)
990
+ for dataset in model.dataset_dimension_requirements:
991
+ found = False
992
+ for i in range(len(config.model.datasets)):
993
+ if config.model.datasets[i].dataset_id == dataset.dataset_id:
994
+ config.model.datasets[i].required_dimensions = dataset.required_dimensions
995
+ if config.model.datasets[i].status == DatasetRegistryStatus.REGISTERED:
996
+ config.model.datasets[i].status = DatasetRegistryStatus.UNREGISTERED
997
+ logger.info(
998
+ "Changed dataset %s status to %s in project %s",
999
+ dataset.dataset_id,
1000
+ config.model.datasets[i].status.value,
1001
+ project_id,
1002
+ )
1003
+ # TODO: When issue #309 is addressed, we need to set all dependent
1004
+ # derived datasets to unregistered also.
1005
+ found = True
1006
+ break
1007
+ if not found:
1008
+                     msg = f"{dataset.dataset_id} is not present in the project config"
+                     raise DSGInvalidParameter(msg)
+ 
+             self._make_new_config(config, context)
+ 
+     def _submit_dataset_and_register_mappings(
+         self,
+         project_config: ProjectConfig,
+         dataset_id: str,
+         dimension_mapping_file: Path | None,
+         dimension_mapping_references_file: Path | None,
+         autogen_reverse_supplemental_mappings: list[DimensionType] | None,
+         context: RegistrationContext,
+     ) -> None:
+         logger.info("Submit dataset=%s to project=%s.", dataset_id, project_config.config_id)
+         self._check_if_not_registered(context.connection, project_config.config_id)
+         self._raise_if_not_unregistered(project_config, dataset_id)
+         dataset_config = self._dataset_mgr.get_by_id(dataset_id, conn=context.connection)
+ 
+         references = []
+         if dimension_mapping_file is not None:
+             references += self._register_mappings_from_file(
+                 project_config,
+                 dataset_config,
+                 dimension_mapping_file,
+                 context,
+             )
+         if dimension_mapping_references_file is not None:
+             for ref in DimensionMappingReferenceListModel.load(
+                 dimension_mapping_references_file
+             ).references:
+                 if not self.dimension_mapping_manager.has_id(
+                     ref.mapping_id, version=ref.version, conn=context.connection
+                 ):
+                     msg = f"mapping_id={ref.mapping_id}"
+                     raise DSGValueNotRegistered(msg)
+                 references.append(ref)
+ 
+         if autogen_reverse_supplemental_mappings:
+             references += self._auto_register_reverse_supplemental_mappings(
+                 project_config,
+                 dataset_config,
+                 references,
+                 set((x.value for x in autogen_reverse_supplemental_mappings)),
+                 context,
+             )
+ 
+         self._submit_dataset(project_config, dataset_config, references, context)
+ 
+     def _raise_if_not_unregistered(self, project_config: ProjectConfig, dataset_id: str) -> None:
+         # This will raise if the dataset is not specified in the project.
+         dataset_model = project_config.get_dataset(dataset_id)
+         status = dataset_model.status
+         if status != DatasetRegistryStatus.UNREGISTERED:
+             msg = (
+                 f"{dataset_id=} cannot be submitted to project={project_config.config_id} with "
+                 f"{status=}"
+             )
+             raise DSGDuplicateValueRegistered(msg)
+ 
+     def _register_mappings_from_file(
+         self,
+         project_config: ProjectConfig,
+         dataset_config: DatasetConfig,
+         dimension_mapping_file: Path,
+         context: RegistrationContext,
+     ):
+         references = []
+         src_dir = dimension_mapping_file.parent
+         mappings = DatasetBaseToProjectMappingTableListModel(
+             **load_data(dimension_mapping_file)
+         ).mappings
+         dataset_mapping = {x.dimension_type: x for x in dataset_config.model.dimension_references}
+         project_mapping: dict[DimensionType, list[DimensionBaseConfig]] = defaultdict(list)
+         project_mapping_refs: dict[str, DimensionReferenceModel] = {}
+         for ref in project_config.model.dimensions.base_dimension_references:
+             dim = self._dimension_mgr.get_by_id(
+                 ref.dimension_id, version=ref.version, conn=context.connection
+             )
+             project_mapping[ref.dimension_type].append(dim)
+             project_mapping_refs[dim.model.dimension_id] = ref
+         mapping_tables = []
+         for mapping in mappings:
+             base_dim: DimensionBaseConfig | None = None
+             if mapping.project_base_dimension_name is None:
+                 base_dims = project_mapping[mapping.dimension_type]
+                 if len(base_dims) > 1:
+                     msg = (
+                         "If there are multiple project base dimensions for a dimension type, the "
+                         "dataset dimension mapping must supply a project_base_dimension_name. "
+                         f"{mapping}"
+                     )
+                     raise DSGInvalidDimensionMapping(msg)
+                 base_dim = base_dims[0]
+             else:
+                 for base_dim_ in project_mapping[mapping.dimension_type]:
+                     if base_dim_.model.name == mapping.project_base_dimension_name:
+                         base_dim = base_dim_
+                 if base_dim is None:
+                     msg = f"Bug: unable to find base dimension for {mapping.project_base_dimension_name}"
+                     raise Exception(msg)
+             with in_other_dir(src_dir):
+                 assert base_dim is not None
+                 mapping_table = MappingTableModel.from_pre_registered_model(
+                     mapping,
+                     dataset_mapping[mapping.dimension_type],
+                     project_mapping_refs[base_dim.model.dimension_id],
+                 )
+             mapping_tables.append(mapping_table)
+ 
+         mappings_config = DimensionMappingsConfig.load_from_model(
+             DimensionMappingsConfigModel(mappings=mapping_tables)
+         )
+         mapping_ids = self._dimension_mapping_mgr.register_from_config(mappings_config, context)
+         for mapping_id in mapping_ids:
+             mapping_config = self._dimension_mapping_mgr.get_by_id(
+                 mapping_id, conn=context.connection
+             )
+             references.append(
+                 DimensionMappingReferenceModel(
+                     from_dimension_type=mapping_config.model.from_dimension.dimension_type,
+                     to_dimension_type=mapping_config.model.to_dimension.dimension_type,
+                     mapping_id=mapping_id,
+                     version=str(
+                         self._dimension_mapping_mgr.get_latest_version(
+                             mapping_id, conn=context.connection
+                         )
+                     ),
+                 )
+             )
+ 
+         return references
+ 
+     def _auto_register_reverse_supplemental_mappings(
+         self,
+         project_config: ProjectConfig,
+         dataset_config: DatasetConfig,
+         mapping_references: list[DimensionMappingReferenceModel],
+         autogen_reverse_supplemental_mappings: set[str],
+         context: RegistrationContext,
+     ):
+         conn = context.connection
+         references = []
+         p_model = project_config.model
+         p_supp_dim_ids = {
+             x.dimension_id for x in p_model.dimensions.supplemental_dimension_references
+         }
+         d_dim_from_ids = set()
+         for ref in mapping_references:
+             mapping_config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=conn)
+             d_dim_from_ids.add(mapping_config.model.from_dimension.dimension_id)
+ 
+         needs_mapping = []
+         for dim in dataset_config.model.dimension_references:
+             if (
+                 dim.dimension_type in autogen_reverse_supplemental_mappings
+                 and dim.dimension_id in p_supp_dim_ids
+                 and dim.dimension_id not in d_dim_from_ids
+             ):
+                 needs_mapping.append((dim.dimension_id, dim.version))
+             # else:
+             # This dimension is the same as a project base dimension.
+             # or
+             # The dataset may only need to provide a subset of records, and those are
+             # checked in the dimension association table.
+ 
+         if len(needs_mapping) != len(autogen_reverse_supplemental_mappings):
+             msg = (
+                 f"Mappings to autogen [{needs_mapping}] does not match user-specified "
+                 f"autogen_reverse_supplemental_mappings={autogen_reverse_supplemental_mappings}"
+             )
+             raise DSGInvalidDimensionMapping(msg)
+ 
+         new_mappings = []
+         for from_id, from_version in needs_mapping:
+             to_dim = self._dimension_mgr.get_by_id(from_id, version=from_version, conn=conn)
+             from_dim, to_version = project_config.get_base_dimension_and_version(
+                 to_dim.model.dimension_type
+             )
+             mapping, version = self._try_get_mapping(
+                 project_config, to_dim, from_version, from_dim, to_version, context
+             )
+             if mapping is None:
+                 p_mapping, _ = self._try_get_mapping(
+                     project_config, from_dim, to_version, to_dim, from_version, context
+                 )
+                 assert (
+                     p_mapping is not None
+                 ), f"from={from_dim.model.dimension_id} to={to_dim.model.dimension_id}"
+                 records = models_to_dataframe(p_mapping.model.records)
+                 fraction_vals = get_unique_values(records, "from_fraction")
+                 if len(fraction_vals) != 1 and next(iter(fraction_vals)) != 1.0:
+                     msg = (
+                         f"Cannot auto-generate a dataset-to-project mapping from a project "
+                         "supplemental dimension unless the from_fraction column is empty or only "
+                         f"has values of 1.0: {p_mapping.model.mapping_id} - {fraction_vals}"
+                     )
+ raise DSGInvalidDimensionMapping(msg)
1206
+ reverse_records = (
1207
+ records.drop("from_fraction")
1208
+ .select(F.col("to_id").alias("from_id"), F.col("from_id").alias("to_id"))
1209
+ .toPandas()
1210
+ )
1211
+ dst = Path(tempfile.gettempdir()) / f"reverse_{p_mapping.config_id}.csv"
1212
+ # Use pandas because spark creates a CSV directory.
1213
+ reverse_records.to_csv(dst, index=False)
1214
+ dimension_type = to_dim.model.dimension_type.value
1215
+ new_mappings.append(
1216
+ {
1217
+ "description": f"Maps {dataset_config.config_id} {dimension_type} to project",
1218
+ "dimension_type": dimension_type,
1219
+ "file": str(dst),
1220
+ "mapping_type": DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS.value,
1221
+ }
1222
+ )
1223
+ else:
1224
+ assert version is not None
1225
+ reference = DimensionMappingReferenceModel(
1226
+ from_dimension_type=to_dim.model.dimension_type,
1227
+ to_dimension_type=to_dim.model.dimension_type,
1228
+ mapping_id=mapping.model.mapping_id,
1229
+ version=version,
1230
+ )
1231
+ references.append(reference)
1232
+
1233
+ if new_mappings:
1234
+ # We don't currently have a way to register a single dimension mapping. It would be
1235
+ # better to register these mappings directly. But, this code was already here.
1236
+ mapping_file = Path(tempfile.gettempdir()) / "dimension_mappings.json5"
1237
+ mapping_file.write_text(json5.dumps({"mappings": new_mappings}, indent=2))
1238
+ to_delete = [mapping_file] + [x["file"] for x in new_mappings]
1239
+ try:
1240
+ references += self._register_mappings_from_file(
1241
+ project_config,
1242
+ dataset_config,
1243
+ mapping_file,
1244
+ context,
1245
+ )
1246
+ finally:
1247
+ for filename in to_delete:
1248
+ Path(filename).unlink()
1249
+
1250
+ return references
1251
+
1252
+ def _try_get_mapping(
1253
+ self,
1254
+ project_config: ProjectConfig,
1255
+ from_dim,
1256
+ from_version,
1257
+ to_dim,
1258
+ to_version,
1259
+ context: RegistrationContext,
1260
+ ):
1261
+ conn = context.connection
1262
+ dimension_type = from_dim.model.dimension_type
1263
+ for ref in project_config.model.dimension_mappings.base_to_supplemental_references:
1264
+ if (
1265
+ ref.from_dimension_type == dimension_type
1266
+ and ref.to_dimension_type == dimension_type
1267
+ ):
1268
+ mapping_config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=conn)
1269
+ if (
1270
+ mapping_config.model.from_dimension.dimension_id == from_dim.model.dimension_id
1271
+ and mapping_config.model.from_dimension.version == from_version
1272
+ and mapping_config.model.to_dimension.dimension_id == to_dim.model.dimension_id
1273
+ and mapping_config.model.to_dimension.version == to_version
1274
+ ):
1275
+ return mapping_config, ref.version
1276
+
1277
+ return None, None
1278
+
+     def _submit_dataset(
+         self,
+         project_config: ProjectConfig,
+         dataset_config: DatasetConfig,
+         mapping_references: list[DimensionMappingReferenceModel],
+         context: RegistrationContext,
+     ):
+         project_config.add_dataset_dimension_mappings(dataset_config, mapping_references)
+         project_config.add_dataset_base_dimension_names(
+             dataset_config.model.dataset_id,
+             self._id_base_dimension_names_in_dataset(
+                 project_config, dataset_config, mapping_references
+             ),
+         )
+         if os.environ.get("__DSGRID_SKIP_CHECK_DATASET_TO_PROJECT_MAPPING__") is not None:
+             logger.warning("Skip dataset-to-project mapping checks")
+         else:
+             self._check_dataset_base_to_project_base_mappings(
+                 project_config,
+                 dataset_config,
+                 mapping_references,
+                 context,
+             )
+
+         dataset_model = project_config.get_dataset(dataset_config.model.dataset_id)
+         dataset_model.mapping_references = mapping_references
+         dataset_model.status = DatasetRegistryStatus.REGISTERED
+         if project_config.are_all_datasets_submitted():
+             new_status = ProjectRegistryStatus.COMPLETE
+         else:
+             new_status = ProjectRegistryStatus.IN_PROGRESS
+         project_config.set_status(new_status)
+         config = self.update_with_context(project_config, context)
+         self._db.add_contains_dataset(context.connection, config.model, dataset_config.model)
+
+         logger.info(
+             "%s Registered dataset %s with version=%s in project %s",
+             self._log_offline_mode_prefix(),
+             dataset_config.model.dataset_id,
+             config.model.version,
+             config.model.project_id,
+         )
+
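Note the internal escape hatch: the mapping check is skipped whenever `__DSGRID_SKIP_CHECK_DATASET_TO_PROJECT_MAPPING__` is present in the environment, regardless of its value, because the code only tests `is not None`. A hedged illustration (presumably intended for testing and debugging, given the dunder naming and the warning log):

```python
import os

# Any value works; only the variable's presence is checked.
os.environ["__DSGRID_SKIP_CHECK_DATASET_TO_PROJECT_MAPPING__"] = "1"
# ... submit the dataset; the dataset-to-project mapping check is skipped ...
del os.environ["__DSGRID_SKIP_CHECK_DATASET_TO_PROJECT_MAPPING__"]
```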
+     @track_timing(timer_stats_collector)
+     def _check_dataset_base_to_project_base_mappings(
+         self,
+         project_config: ProjectConfig,
+         dataset_config: DatasetConfig,
+         mapping_references: list[DimensionMappingReferenceModel],
+         context: RegistrationContext,
+     ):
+         """Check that a dataset has all project-required dimension records."""
+         logger.info("Check dataset-base-to-project-base dimension mappings.")
+         data_store = self._dataset_mgr.store
+         handler = make_dataset_schema_handler(
+             context.connection,
+             dataset_config,
+             self._dimension_mgr,
+             self._dimension_mapping_mgr,
+             store=data_store,
+             mapping_references=mapping_references,
+         )
+         dataset_id = dataset_config.config_id
+
+         with ScratchDirContext(self._params.scratch_dir) as scontext:
+             project_table = self._make_dimension_associations(project_config, dataset_id, scontext)
+             mapped_dataset_table = handler.make_mapped_dimension_association_table(scontext)
+             project_table = handler.remove_expected_missing_mapped_associations(
+                 data_store, project_table, scontext
+             )
+             cols = sorted(project_table.columns)
+             cache(mapped_dataset_table)
+             diff: DataFrame | None = None
+
+             try:
+                 # This check is relatively short and will show the user clear errors.
+                 _check_distinct_column_values(project_table, mapped_dataset_table)
+                 # This check is long and will produce a full table of differences.
+                 # Interpreting it may require some effort from the user.
+                 diff = except_all(project_table.select(*cols), mapped_dataset_table.select(*cols))
+                 cache(diff)
+                 if not is_dataframe_empty(diff):
+                     dataset_id = dataset_config.model.dataset_id
+                     handle_dimension_association_errors(diff, mapped_dataset_table, dataset_id)
+             finally:
+                 unpersist(mapped_dataset_table)
+                 if diff is not None:
+                     unpersist(diff)
+
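The validation above reduces to set algebra: every dimension-record combination the project requires must appear in the mapped dataset, and any rows remaining after the `except_all` indicate missing data. The same idea with plain Python sets (hypothetical rows):

```python
# Combinations required by the project vs. combinations present in the mapped dataset.
project_rows = {("08001", "com"), ("08001", "res"), ("08003", "com")}
dataset_rows = {("08001", "com"), ("08001", "res")}

# Analogous to except_all(project_table, mapped_dataset_table).
missing = project_rows - dataset_rows
if missing:
    print(f"Dataset is missing required associations: {missing}")  # {('08003', 'com')}
```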
+     def _id_base_dimension_names_in_dataset(
+         self,
+         project_config: ProjectConfig,
+         dataset_config: DatasetConfig,
+         mapping_references: list[DimensionMappingReferenceModel],
+     ) -> DatasetBaseDimensionNamesModel:
+         base_dimension_names: dict[DimensionType, str] = {}
+         for ref in mapping_references:
+             mapping = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, version=ref.version)
+             base_dim = self._dimension_mgr.get_by_id(
+                 mapping.model.to_dimension.dimension_id,
+                 version=mapping.model.to_dimension.version,
+             ).model
+             base_dimension_names[base_dim.dimension_type] = base_dim.name
+
+         project_base_dims_by_type: dict[DimensionType, list[DimensionBaseConfig]] = defaultdict(
+             list
+         )
+         for dim in project_config.list_base_dimensions():
+             project_base_dims_by_type[dim.model.dimension_type].append(dim)
+
+         dataset_id = dataset_config.model.dataset_id
+         for dim_type in DimensionType:
+             if dim_type == DimensionType.TIME:
+                 assert len(project_base_dims_by_type[dim_type]) == 1
+                 base_dimension_names[dim_type] = project_base_dims_by_type[dim_type][0].model.name
+                 continue
+             if dim_type not in base_dimension_names:
+                 project_base_dims = project_base_dims_by_type[dim_type]
+                 if len(project_base_dims) > 1:
+                     for project_dim in project_base_dims:
+                         assert isinstance(project_dim, DimensionBaseConfigWithFiles)
+                         project_records = project_dim.get_records_dataframe()
+                         project_record_ids = get_unique_values(project_records, "id")
+                         dataset_dim = dataset_config.get_dimension_with_records(dim_type)
+                         assert dataset_dim is not None
+                         dataset_records = dataset_dim.get_records_dataframe()
+                         dataset_record_ids = get_unique_values(dataset_records, "id")
+                         if dataset_record_ids.issubset(project_record_ids):
+                             project_dim_name = project_dim.model.name
+                             if dim_type in base_dimension_names:
+                                 msg = (
+                                     f"Found multiple project base dimensions for {dataset_id=} "
+                                     f"and {dim_type=}: {base_dimension_names[dim_type]} and "
+                                     f"{project_dim_name}. Please specify a mapping."
+                                 )
+                                 raise DSGInvalidDataset(msg)
+
+                             base_dimension_names[dim_type] = project_dim_name
+                     if dim_type not in base_dimension_names:
+                         msg = (
+                             f"Bug: {dim_type} has multiple base dimensions in the project, dataset "
+                             f"{dataset_id} does not specify a mapping, and dsgrid could not "
+                             "discern which base dimension to use."
+                         )
+                         raise DSGInvalidDataset(msg)
+                 else:
+                     base_dimension_names[dim_type] = project_base_dims[0].model.name
+
+         data = {k.value: v for k, v in base_dimension_names.items()}
+         return DatasetBaseDimensionNamesModel(**data)
+
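When a project defines multiple base dimensions of one type and the dataset names no mapping for it, the method above selects the base dimension whose record ids cover the dataset's record ids, and errors if zero candidates or more than one candidate qualifies. A compact sketch of that selection rule (hypothetical names and ids; the real code raises `DSGInvalidDataset` with distinct messages for the two failure modes):

```python
def pick_base_dimension(dataset_ids: set[str], candidates: dict[str, set[str]]) -> str:
    """Return the one candidate whose record ids are a superset of the dataset's ids."""
    matches = [name for name, ids in candidates.items() if dataset_ids <= ids]
    if len(matches) != 1:
        raise ValueError(f"expected exactly one covering base dimension, found {matches}")
    return matches[0]

candidates = {"counties_2010": {"08001"}, "counties_2020": {"08001", "08059"}}
print(pick_base_dimension({"08001", "08059"}, candidates))  # counties_2020
```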
+     @track_timing(timer_stats_collector)
+     def _make_dimension_associations(
+         self,
+         config: ProjectConfig,
+         dataset_id: str,
+         context: ScratchDirContext,
+     ) -> DataFrame:
+         logger.info("Make dimension association table for %s", dataset_id)
+         df = config.make_dimension_association_table(dataset_id, context)
+         if use_duckdb():
+             df2 = df
+         else:
+             # This operation is slow with Spark. Ensure that we only evaluate the query once.
+             df2 = read_dataframe(persist_table(df, context, "dimension_associations"))
+         logger.info("Wrote dimension associations for dataset %s", dataset_id)
+         return df2
+
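On the Spark backend, the association table is persisted to scratch storage and read back so that later actions reuse the materialized result instead of re-running the query plan; the DuckDB path returns the table as-is. The general pattern, sketched with plain PySpark (assuming `pyspark` is installed; dsgrid's `persist_table`/`read_dataframe` helpers are replaced with direct calls):

```python
import tempfile
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(10)  # stand-in for the expensive association query

path = tempfile.mkdtemp() + "/dimension_associations"
df.write.mode("overwrite").parquet(path)  # evaluate the plan exactly once
df2 = spark.read.parquet(path)            # downstream actions read the file
print(df2.count())
```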
+     def update_from_file(
+         self,
+         config_file,
+         project_id: str,
+         submitter: str,
+         update_type: VersionUpdateType,
+         log_message: str,
+         version: str,
+     ) -> ProjectConfig:
+         with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+             config = ProjectConfig.load(config_file)
+             self._update_dimensions_and_mappings(context.connection, config)
+             self._check_update(context.connection, config, project_id, version)
+             return self.update_with_context(config, context)
+
+     @track_timing(timer_stats_collector)
+     def update(
+         self,
+         config: ProjectConfig,
+         update_type: VersionUpdateType,
+         log_message: str,
+         submitter: str | None = None,
+     ) -> ProjectConfig:
+         with RegistrationContext(self.db, log_message, update_type, submitter) as context:
+             self._update_dimensions_and_mappings(context.connection, config)
+             return self.update_with_context(config, context)
+
+     def update_with_context(
+         self, config: ProjectConfig, context: RegistrationContext
+     ) -> ProjectConfig:
+         old_config = self.get_by_id(config.model.project_id, conn=context.connection)
+         checker = ProjectUpdateChecker(old_config.model, config.model)
+         checker.run()
+         self._run_checks(config)
+         return self._make_new_config(config, context)
+
+     def _make_new_config(
+         self, config: ProjectConfig, context: RegistrationContext
+     ) -> ProjectConfig:
+         old_version = config.model.version
+         old_key = ConfigKey(config.config_id, old_version)
+         model = self._update_config(config, context)
+         assert isinstance(model, ProjectConfigModel)
+         new_config = ProjectConfig(model)
+         self._update_dimensions_and_mappings(context.connection, new_config)
+         new_key = ConfigKey(new_config.model.project_id, new_config.model.version)
+         self._projects.pop(old_key, None)
+         self._projects[new_key] = new_config
+         return new_config
+
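`_make_new_config` keeps the in-memory cache consistent with the version bump: configs are keyed by (id, version), the stale key is popped, and the new one is installed. The same bookkeeping with plain tuples (hypothetical values):

```python
cache: dict[tuple[str, str], str] = {("my_project", "1.0.0"): "old config"}

def swap(cache: dict, config_id: str, old_version: str, new_version: str, new_config: str) -> None:
    cache.pop((config_id, old_version), None)  # tolerate an entry that was never cached
    cache[(config_id, new_version)] = new_config

swap(cache, "my_project", "1.0.0", "1.1.0", "new config")
print(cache)  # {('my_project', '1.1.0'): 'new config'}
```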
+     def finalize_registration(self, conn: Connection, config_ids: set[str], error_occurred: bool):
+         if error_occurred:
+             logger.info("Remove intermediate project after error")
+             for key in [x for x in self._projects if x.id in config_ids]:
+                 self._projects.pop(key)
+
+     def remove(self, config_id: str, conn: Connection | None = None) -> None:
+         self.db.delete_all(conn, config_id)
+         for key in [x for x in self._projects if x.id == config_id]:
+             self._projects.pop(key)
+
+         logger.info("Removed %s from the registry.", config_id)
+
+     def show(
+         self,
+         conn: Connection | None = None,
+         filters: list[str] | None = None,
+         max_width: Union[int, dict] | None = None,
+         drop_fields: list[str] | None = None,
+         return_table: bool = False,
+         **kwargs,
+     ):
+         """Show the registry in a PrettyTable.
+
+         Parameters
+         ----------
+         filters : list or tuple
+             List of filter expressions for registry content (e.g., filters=["Submitter==USER", "Description contains comstock"])
+         max_width
+             Max column width in PrettyTable; specify as a single value or as a dict of values by field name
+         drop_fields
+             List of field names not to show
+
+         """
+
+         if filters:
+             logger.info("List registry for: %s", filters)
+
+         table = PrettyTable(title=self.name())
+         all_field_names = (
+             "ID",
+             "Version",
+             "Status",
+             "Datasets",
+             "Date",
+             "Submitter",
+             "Description",
+         )
+         # TODO: may want dataset and dataset status to be separate columns
+         # TODO: this block can be refactored into the base class; the registry should be shown
+         # as an HTML table in notebooks.
+         if drop_fields is None:
+             table.field_names = all_field_names
+         else:
+             table.field_names = tuple(x for x in all_field_names if x not in drop_fields)
+
+         if max_width is None:
+             table._max_width = {
+                 "ID": 20,
+                 "Status": 12,
+                 "Datasets": 30,
+                 "Date": 10,
+                 "Description": 30,
+             }
+         elif isinstance(max_width, int):
+             table.max_width = max_width
+         elif isinstance(max_width, dict):
+             table._max_width = max_width
+
+         transformed_filters = transform_and_validate_filters(filters) if filters else None
+         field_to_index = {x: i for i, x in enumerate(table.field_names)}
+         rows = []
+         for model in self.db.iter_models(conn):
+             assert isinstance(model, ProjectConfigModel)
+             registration = self.db.get_registration(conn, model)
+             all_fields = (
+                 model.project_id,
+                 model.version,
+                 model.status.value,
+                 ",\n".join([f"{x.dataset_id}: {x.status.value}" for x in model.datasets]),
+                 registration.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
+                 registration.submitter,
+                 registration.log_message,
+             )
+             if drop_fields is None:
+                 row = all_fields
+             else:
+                 row = tuple(
+                     y for (x, y) in zip(all_field_names, all_fields) if x not in drop_fields
+                 )
+
+             if not filters or matches_filters(row, field_to_index, transformed_filters):
+                 rows.append(row)
+
+         rows.sort(key=lambda x: x[0])
+         table.add_rows(rows)
+         table.align = "l"
+         if return_table:
+             return table
+         display_table(table)
+
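The filter expressions accepted by `show` (e.g., "Submitter==USER" or "Description contains comstock") are resolved against the rendered rows by `transform_and_validate_filters` and `matches_filters`. A hypothetical re-implementation of the matching idea, not the real helpers:

```python
def matches(row: tuple, field_to_index: dict[str, int], expr: str) -> bool:
    """Support "Field==value" and "Field contains value" expressions against a row tuple."""
    if " contains " in expr:
        field, value = expr.split(" contains ", 1)
        return value.lower() in str(row[field_to_index[field]]).lower()
    field, value = expr.split("==", 1)
    return str(row[field_to_index[field]]) == value

row = ("my_project", "1.0.0", "complete", "comstock: registered",
       "2024-01-01", "USER", "ComStock data")
idx = {"ID": 0, "Version": 1, "Status": 2, "Datasets": 3,
       "Date": 4, "Submitter": 5, "Description": 6}
print(all(matches(row, idx, f)
          for f in ["Submitter==USER", "Description contains comstock"]))  # True
```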
+
+
+ def _check_distinct_column_values(project_table: DataFrame, mapped_dataset_table: DataFrame):
+     """Ensure that the mapped dataset has the same distinct values as the project for all
+     columns. This should be called before running a full comparison of the two tables.
+     """
+     has_mismatch = False
+     for column in project_table.columns:
+         project_distinct = {x[column] for x in project_table.select(column).distinct().collect()}
+         dataset_distinct = {
+             x[column] for x in mapped_dataset_table.select(column).distinct().collect()
+         }
+         if diff_values := project_distinct.difference(dataset_distinct):
+             has_mismatch = True
+             logger.error(
+                 "The mapped dataset has different distinct values than the project "
+                 "for column=%s: diff=%s",
+                 column,
+                 diff_values,
+             )
+
+     if has_mismatch:
+         msg = (
+             "The mapped dataset has different distinct values than the project for one or "
+             "more columns. Please look in the log file for the exact records."
+         )
+         raise DSGInvalidDataset(msg)
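`_check_distinct_column_values` exists because per-column distinct comparisons are cheap and yield readable errors, while the row-level `except_all` used earlier is expensive but catches combinations that per-column checks cannot. Both phases in miniature (plain Python, hypothetical rows):

```python
project = [("a", 1), ("a", 2), ("b", 2)]
dataset = [("a", 1), ("a", 2), ("b", 1)]

# Phase 1: per-column distinct values match here, so this prints nothing.
for col in range(2):
    missing = {row[col] for row in project} - {row[col] for row in dataset}
    if missing:
        print(f"column {col} is missing values: {missing}")

# Phase 2: the full row-level difference still finds a missing combination.
print(set(project) - set(dataset))  # {('b', 2)}
```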