dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
|
@@ -0,0 +1,1623 @@
|
|
|
1
|
+
"""Manages the registry for dimension projects"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from tempfile import TemporaryDirectory
|
|
9
|
+
from typing import Any, Type, Union
|
|
10
|
+
|
|
11
|
+
from dsgrid.utils.dataset import handle_dimension_association_errors
|
|
12
|
+
import json5
|
|
13
|
+
import pandas as pd
|
|
14
|
+
from prettytable import PrettyTable
|
|
15
|
+
from sqlalchemy import Connection
|
|
16
|
+
|
|
17
|
+
from dsgrid.config.dimension_config import (
|
|
18
|
+
DimensionBaseConfig,
|
|
19
|
+
DimensionBaseConfigWithFiles,
|
|
20
|
+
)
|
|
21
|
+
from dsgrid.dimension.base_models import DimensionType
|
|
22
|
+
from dsgrid.exceptions import (
|
|
23
|
+
DSGInvalidDataset,
|
|
24
|
+
DSGInvalidDimension,
|
|
25
|
+
DSGInvalidDimensionMapping,
|
|
26
|
+
DSGValueNotRegistered,
|
|
27
|
+
DSGDuplicateValueRegistered,
|
|
28
|
+
DSGInvalidParameter,
|
|
29
|
+
)
|
|
30
|
+
from dsgrid.config.dataset_schema_handler_factory import make_dataset_schema_handler
|
|
31
|
+
from dsgrid.config.dataset_config import DatasetConfig
|
|
32
|
+
from dsgrid.config.dimensions import DimensionModel
|
|
33
|
+
from dsgrid.config.dimensions_config import DimensionsConfig, DimensionsConfigModel
|
|
34
|
+
from dsgrid.config.dimension_mapping_base import (
|
|
35
|
+
DimensionReferenceModel,
|
|
36
|
+
DimensionMappingReferenceModel,
|
|
37
|
+
DimensionMappingReferenceListModel,
|
|
38
|
+
DimensionMappingType,
|
|
39
|
+
)
|
|
40
|
+
from dsgrid.config.dimension_mappings_config import (
|
|
41
|
+
DimensionMappingsConfig,
|
|
42
|
+
DimensionMappingsConfigModel,
|
|
43
|
+
)
|
|
44
|
+
from dsgrid.config.supplemental_dimension import (
|
|
45
|
+
SupplementalDimensionModel,
|
|
46
|
+
SupplementalDimensionsListModel,
|
|
47
|
+
)
|
|
48
|
+
from dsgrid.config.input_dataset_requirements import (
|
|
49
|
+
InputDatasetDimensionRequirementsListModel,
|
|
50
|
+
InputDatasetListModel,
|
|
51
|
+
)
|
|
52
|
+
from dsgrid.config.mapping_tables import (
|
|
53
|
+
MappingTableModel,
|
|
54
|
+
MappingTableByNameModel,
|
|
55
|
+
DatasetBaseToProjectMappingTableListModel,
|
|
56
|
+
)
|
|
57
|
+
from dsgrid.config.project_config import (
|
|
58
|
+
DatasetBaseDimensionNamesModel,
|
|
59
|
+
ProjectConfig,
|
|
60
|
+
ProjectConfigModel,
|
|
61
|
+
RequiredBaseDimensionModel,
|
|
62
|
+
RequiredDimensionRecordsByTypeModel,
|
|
63
|
+
RequiredDimensionRecordsModel,
|
|
64
|
+
SubsetDimensionGroupModel,
|
|
65
|
+
SubsetDimensionGroupListModel,
|
|
66
|
+
)
|
|
67
|
+
from dsgrid.project import Project
|
|
68
|
+
from dsgrid.registry.common import (
|
|
69
|
+
ConfigKey,
|
|
70
|
+
DatasetRegistryStatus,
|
|
71
|
+
ProjectRegistryStatus,
|
|
72
|
+
RegistryManagerParams,
|
|
73
|
+
)
|
|
74
|
+
from dsgrid.spark.functions import (
|
|
75
|
+
cache,
|
|
76
|
+
except_all,
|
|
77
|
+
is_dataframe_empty,
|
|
78
|
+
unpersist,
|
|
79
|
+
)
|
|
80
|
+
from dsgrid.spark.types import (
|
|
81
|
+
DataFrame,
|
|
82
|
+
F,
|
|
83
|
+
use_duckdb,
|
|
84
|
+
)
|
|
85
|
+
from dsgrid.utils.timing import track_timing, timer_stats_collector
|
|
86
|
+
from dsgrid.utils.files import load_data, in_other_dir
|
|
87
|
+
from dsgrid.utils.filters import transform_and_validate_filters, matches_filters
|
|
88
|
+
from dsgrid.utils.scratch_dir_context import ScratchDirContext
|
|
89
|
+
from dsgrid.utils.spark import (
|
|
90
|
+
models_to_dataframe,
|
|
91
|
+
get_unique_values,
|
|
92
|
+
persist_table,
|
|
93
|
+
read_dataframe,
|
|
94
|
+
)
|
|
95
|
+
from dsgrid.utils.utilities import check_uniqueness, display_table
|
|
96
|
+
from dsgrid.registry.registry_interface import ProjectRegistryInterface
|
|
97
|
+
from .common import (
|
|
98
|
+
VersionUpdateType,
|
|
99
|
+
RegistryType,
|
|
100
|
+
)
|
|
101
|
+
from .registration_context import RegistrationContext
|
|
102
|
+
from .project_update_checker import ProjectUpdateChecker
|
|
103
|
+
from .dataset_registry_manager import DatasetRegistryManager
|
|
104
|
+
from .dimension_mapping_registry_manager import DimensionMappingRegistryManager
|
|
105
|
+
from .dimension_registry_manager import DimensionRegistryManager
|
|
106
|
+
from .registry_manager_base import RegistryManagerBase
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
logger = logging.getLogger(__name__)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ProjectRegistryManager(RegistryManagerBase):
|
|
113
|
+
"""Manages registered dimension projects."""
|
|
114
|
+
|
|
115
|
+
    def __init__(
        self,
        path: Path,
        params: RegistryManagerParams,
        dataset_manager: DatasetRegistryManager,
        dimension_manager: DimensionRegistryManager,
        dimension_mapping_manager: DimensionMappingRegistryManager,
        db: ProjectRegistryInterface,
    ):
        """Construct the project registry manager.

        Parameters
        ----------
        path : Path
            Local registry path, forwarded to the base class.
        params : RegistryManagerParams
            Registry manager parameters, forwarded to the base class.
        dataset_manager : DatasetRegistryManager
            Manager used to resolve dataset configs for projects.
        dimension_manager : DimensionRegistryManager
            Manager used to resolve dimension configs.
        dimension_mapping_manager : DimensionMappingRegistryManager
            Manager used to resolve dimension mappings.
        db : ProjectRegistryInterface
            Database interface for project registry records.
        """
        super().__init__(path, params)
        # Cache of loaded project configs keyed by (project_id, version).
        self._projects: dict[ConfigKey, ProjectConfig] = {}
        self._dataset_mgr = dataset_manager
        self._dimension_mgr = dimension_manager
        self._dimension_mapping_mgr = dimension_mapping_manager
        self._db = db
|
|
130
|
+
|
|
131
|
+
    @classmethod
    def load(
        cls,
        path: Path,
        params: RegistryManagerParams,
        dataset_manager: DatasetRegistryManager,
        dimension_manager: DimensionRegistryManager,
        dimension_mapping_manager: DimensionMappingRegistryManager,
        db: ProjectRegistryInterface,
    ):
        """Create a ProjectRegistryManager by delegating to the base-class loader.

        All arguments are forwarded unchanged to ``cls._load``.
        """
        return cls._load(
            path,
            params,
            dataset_manager,
            dimension_manager,
            dimension_mapping_manager,
            db,
        )
|
|
149
|
+
|
|
150
|
+
    @staticmethod
    def config_class() -> Type:
        """Return the config class managed by this registry manager."""
        return ProjectConfig
|
|
153
|
+
|
|
154
|
+
    @property
    def db(self) -> ProjectRegistryInterface:
        """Return the project registry database interface."""
        return self._db
|
|
157
|
+
|
|
158
|
+
    @db.setter
    def db(self, db: ProjectRegistryInterface):
        """Set the project registry database interface."""
        self._db = db
|
|
161
|
+
|
|
162
|
+
    @staticmethod
    def name() -> str:
        """Return the display name for this registry type."""
        return "Projects"
|
|
165
|
+
|
|
166
|
+
    @property
    def dataset_manager(self) -> DatasetRegistryManager:
        """Return the dataset registry manager."""
        return self._dataset_mgr
|
|
169
|
+
|
|
170
|
+
    @property
    def dimension_manager(self) -> DimensionRegistryManager:
        """Return the dimension registry manager."""
        return self._dimension_mgr
|
|
173
|
+
|
|
174
|
+
    @property
    def dimension_mapping_manager(self) -> DimensionMappingRegistryManager:
        """Return the dimension mapping registry manager."""
        return self._dimension_mapping_mgr
|
|
177
|
+
|
|
178
|
+
    def get_by_id(
        self,
        project_id: str,
        version: str | None = None,
        conn: Connection | None = None,
    ) -> ProjectConfig:
        """Return the project config for project_id, using a cached copy when available.

        Parameters
        ----------
        project_id : str
        version : str | None
            If None, resolve to the latest registered version.
        conn : Connection | None
            Optional open database connection to reuse.
        """
        if version is None:
            assert self._db is not None
            version = self._db.get_latest_version(conn, project_id)

        key = ConfigKey(project_id, version)
        project = self._projects.get(key)
        if project is not None:
            # Cache hit: reuse the previously constructed config.
            return project

        # NOTE(review): version was resolved above, so this branch only runs if
        # get_latest_version can return None — confirm whether it is reachable.
        if version is None:
            model = self.db.get_latest(conn, project_id)
        else:
            model = self.db.get_by_version(conn, project_id, version)

        assert isinstance(model, ProjectConfigModel)
        config = ProjectConfig(model)
        # Attach registered dimensions/mappings before caching so the cached
        # object is fully usable.
        self._update_dimensions_and_mappings(conn, config)
        self._projects[key] = config
        return config
|
|
203
|
+
|
|
204
|
+
def _update_dimensions_and_mappings(self, conn: Connection | None, config: ProjectConfig):
|
|
205
|
+
base_dimensions = self._dimension_mgr.load_dimensions(
|
|
206
|
+
config.model.dimensions.base_dimension_references, conn=conn
|
|
207
|
+
)
|
|
208
|
+
supplemental_dimensions = self._dimension_mgr.load_dimensions(
|
|
209
|
+
config.model.dimensions.supplemental_dimension_references, conn=conn
|
|
210
|
+
)
|
|
211
|
+
base_to_supp_mappings = self._dimension_mapping_mgr.load_dimension_mappings(
|
|
212
|
+
config.model.dimension_mappings.base_to_supplemental_references, conn=conn
|
|
213
|
+
)
|
|
214
|
+
subset_dimensions = self._get_subset_dimensions(conn, config)
|
|
215
|
+
config.set_dimensions(base_dimensions, subset_dimensions, supplemental_dimensions)
|
|
216
|
+
config.set_dimension_mappings(base_to_supp_mappings)
|
|
217
|
+
|
|
218
|
+
def _get_subset_dimensions(self, conn: Connection | None, config: ProjectConfig):
|
|
219
|
+
subset_dimensions: dict[
|
|
220
|
+
DimensionType, dict[str, dict[ConfigKey, DimensionBaseConfig]]
|
|
221
|
+
] = defaultdict(dict)
|
|
222
|
+
for subset_dim in config.model.dimensions.subset_dimensions:
|
|
223
|
+
selectors = {
|
|
224
|
+
ConfigKey(x.dimension_id, x.version): self._dimension_mgr.get_by_id(
|
|
225
|
+
x.dimension_id, version=x.version, conn=conn
|
|
226
|
+
)
|
|
227
|
+
for x in subset_dim.selector_references
|
|
228
|
+
}
|
|
229
|
+
subset_dimensions[subset_dim.dimension_type][subset_dim.name] = selectors
|
|
230
|
+
return subset_dimensions
|
|
231
|
+
|
|
232
|
+
def load_project(
|
|
233
|
+
self,
|
|
234
|
+
project_id: str,
|
|
235
|
+
version: str | None = None,
|
|
236
|
+
conn: Connection | None = None,
|
|
237
|
+
) -> Project:
|
|
238
|
+
"""Load a project from the registry.
|
|
239
|
+
|
|
240
|
+
Parameters
|
|
241
|
+
----------
|
|
242
|
+
project_id : str
|
|
243
|
+
version : str
|
|
244
|
+
|
|
245
|
+
Returns
|
|
246
|
+
-------
|
|
247
|
+
Project
|
|
248
|
+
"""
|
|
249
|
+
if conn is None:
|
|
250
|
+
with self.db.engine.connect() as conn:
|
|
251
|
+
return self._load_project(conn, project_id, version=version)
|
|
252
|
+
else:
|
|
253
|
+
return self._load_project(conn, project_id, version=version)
|
|
254
|
+
|
|
255
|
+
def _load_project(self, conn: Connection, project_id: str, version=None) -> Project:
|
|
256
|
+
dataset_manager = self._dataset_mgr
|
|
257
|
+
config = self.get_by_id(project_id, version=version, conn=conn)
|
|
258
|
+
|
|
259
|
+
dataset_configs = {}
|
|
260
|
+
for dataset_id in config.list_registered_dataset_ids():
|
|
261
|
+
dataset_config = dataset_manager.get_by_id(dataset_id, conn=conn)
|
|
262
|
+
dataset_configs[dataset_id] = dataset_config
|
|
263
|
+
|
|
264
|
+
return Project(
|
|
265
|
+
config,
|
|
266
|
+
config.model.version,
|
|
267
|
+
dataset_configs,
|
|
268
|
+
self._dimension_mgr,
|
|
269
|
+
self._dimension_mapping_mgr,
|
|
270
|
+
self._dataset_mgr,
|
|
271
|
+
)
|
|
272
|
+
|
|
273
|
+
def register(
|
|
274
|
+
self,
|
|
275
|
+
config_file: Path,
|
|
276
|
+
submitter: str,
|
|
277
|
+
log_message: str,
|
|
278
|
+
) -> None:
|
|
279
|
+
"""Register a project from a config file."""
|
|
280
|
+
with RegistrationContext(
|
|
281
|
+
self.db, log_message, VersionUpdateType.MAJOR, submitter
|
|
282
|
+
) as context:
|
|
283
|
+
config = ProjectConfig.load(config_file)
|
|
284
|
+
src_dir = config_file.parent
|
|
285
|
+
self.register_from_config(config, src_dir, context)
|
|
286
|
+
|
|
287
|
+
    def register_from_config(
        self,
        config: ProjectConfig,
        src_dir: Path,
        context: RegistrationContext,
    ):
        """Register a project from an existing config.

        Parameters
        ----------
        config : ProjectConfig
            Project config to register.
        src_dir : Path
            Directory that relative file paths inside the config are resolved against.
        context : RegistrationContext
            Active registration context.
        """
        self._register_project_and_dimensions(
            config,
            src_dir,
            context,
        )
|
|
299
|
+
|
|
300
|
+
    def _register_project_and_dimensions(
        self,
        config: ProjectConfig,
        src_dir: Path,
        context: RegistrationContext,
    ):
        """Register a project along with any dimensions defined inline in its config.

        Inline dimension definitions (base, subset, supplemental) are registered
        first and replaced with registry references; the project itself is
        registered last.
        """
        model = config.model
        logger.info("Start registration of project %s", model.project_id)
        self._check_if_already_registered(context.connection, model.project_id)
        if model.dimensions.base_dimensions:
            logger.info("Register base dimensions")
            for ref in self._register_dimensions_from_models(
                model.dimensions.base_dimensions,
                context,
            ):
                model.dimensions.base_dimension_references.append(ref)
            # Inline definitions are replaced by the references appended above.
            model.dimensions.base_dimensions.clear()
        if model.dimensions.subset_dimensions:
            self._register_subset_dimensions(
                model,
                model.dimensions.subset_dimensions,
                context,
            )
        if model.dimensions.supplemental_dimensions:
            logger.info("Register supplemental dimensions")
            self._register_supplemental_dimensions_from_models(
                src_dir,
                model,
                model.dimensions.supplemental_dimensions,
                context,
            )
            model.dimensions.supplemental_dimensions.clear()
        logger.info("Register all-in-one supplemental dimensions")
        self._register_all_in_one_dimensions(
            src_dir,
            model,
            context,
        )

        # Attach the now-registered dimensions/mappings to the config object.
        self._update_dimensions_and_mappings(context.connection, config)
        # Selector records were converted to registered dimensions above; drop
        # the inline selectors before persisting the project.
        for subset_dimension in model.dimensions.subset_dimensions:
            subset_dimension.selectors.clear()
        self._register(config, context)
        context.add_id(RegistryType.PROJECT, config.model.project_id, self)
|
|
344
|
+
|
|
345
|
+
def _register_dimensions_from_models(
|
|
346
|
+
self,
|
|
347
|
+
dimensions: list,
|
|
348
|
+
context: RegistrationContext,
|
|
349
|
+
):
|
|
350
|
+
dim_model = DimensionsConfigModel(dimensions=dimensions)
|
|
351
|
+
dims_config = DimensionsConfig.load_from_model(dim_model)
|
|
352
|
+
dimension_ids = self._dimension_mgr.register_from_config(dims_config, context)
|
|
353
|
+
return self._dimension_mgr.make_dimension_references(context.connection, dimension_ids)
|
|
354
|
+
|
|
355
|
+
def _register_supplemental_dimensions_from_models(
|
|
356
|
+
self,
|
|
357
|
+
src_dir: Path,
|
|
358
|
+
model: ProjectConfigModel,
|
|
359
|
+
dimensions: list,
|
|
360
|
+
context: RegistrationContext,
|
|
361
|
+
):
|
|
362
|
+
"""Registers supplemental dimensions and creates base-to-supplemental mappings for those
|
|
363
|
+
new dimensions.
|
|
364
|
+
"""
|
|
365
|
+
dims = []
|
|
366
|
+
for x in dimensions:
|
|
367
|
+
data = x.serialize()
|
|
368
|
+
data.pop("mapping", None)
|
|
369
|
+
dims.append(DimensionModel(**data))
|
|
370
|
+
|
|
371
|
+
refs = self._register_dimensions_from_models(dims, context)
|
|
372
|
+
|
|
373
|
+
model.dimensions.supplemental_dimension_references += refs
|
|
374
|
+
self._register_base_to_supplemental_mappings(
|
|
375
|
+
src_dir,
|
|
376
|
+
model,
|
|
377
|
+
dimensions,
|
|
378
|
+
refs,
|
|
379
|
+
context,
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
def _register_base_to_supplemental_mappings(
|
|
383
|
+
self,
|
|
384
|
+
src_dir: Path,
|
|
385
|
+
model: ProjectConfigModel,
|
|
386
|
+
dimensions: list[SupplementalDimensionModel],
|
|
387
|
+
dimension_references: list[DimensionReferenceModel],
|
|
388
|
+
context: RegistrationContext,
|
|
389
|
+
):
|
|
390
|
+
conn = context.connection
|
|
391
|
+
base_dim_mapping = defaultdict(list)
|
|
392
|
+
base_dim_refs: dict[str, DimensionReferenceModel] = {}
|
|
393
|
+
for ref in model.dimensions.base_dimension_references:
|
|
394
|
+
dim = self._dimension_mgr.get_by_id(
|
|
395
|
+
ref.dimension_id, version=ref.version, conn=context.connection
|
|
396
|
+
)
|
|
397
|
+
base_dim_mapping[ref.dimension_type].append(dim)
|
|
398
|
+
base_dim_refs[dim.model.dimension_id] = ref
|
|
399
|
+
|
|
400
|
+
mappings = []
|
|
401
|
+
if len(dimensions) != len(dimension_references):
|
|
402
|
+
msg = f"Bug: mismatch in sizes: {dimensions=} {dimension_references=}"
|
|
403
|
+
raise Exception(msg)
|
|
404
|
+
|
|
405
|
+
for dim, ref in zip(dimensions, dimension_references):
|
|
406
|
+
base_dim: DimensionBaseConfig | None = None
|
|
407
|
+
if dim.mapping.project_base_dimension_name is None:
|
|
408
|
+
base_dims = base_dim_mapping[ref.dimension_type]
|
|
409
|
+
if len(base_dims) > 1:
|
|
410
|
+
msg = (
|
|
411
|
+
"If there are multiple base dimenions for a dimension type, each "
|
|
412
|
+
"supplemental dimension mapping must supply a project_base_dimension_name. "
|
|
413
|
+
f"{dim.label}"
|
|
414
|
+
)
|
|
415
|
+
raise DSGInvalidDimensionMapping(msg)
|
|
416
|
+
base_dim = base_dims[0]
|
|
417
|
+
else:
|
|
418
|
+
for base_dim_ in base_dim_mapping[dim.dimension_type]:
|
|
419
|
+
if base_dim_.model.name == dim.mapping.project_base_dimension_name:
|
|
420
|
+
if base_dim is not None:
|
|
421
|
+
msg = (
|
|
422
|
+
"A supplemental dimension can only be mapped to one base dimension:"
|
|
423
|
+
f" supplemental dimension = {dim.label} "
|
|
424
|
+
f"base dimensions = {base_dim.model.label} and "
|
|
425
|
+
f"{base_dim_.model.label}"
|
|
426
|
+
)
|
|
427
|
+
raise DSGInvalidDimensionMapping(msg)
|
|
428
|
+
base_dim = base_dim_
|
|
429
|
+
if base_dim is None:
|
|
430
|
+
msg = f"Bug: unable to find base dimension for {dim.mapping.project_base_dimension_name}"
|
|
431
|
+
raise Exception(msg)
|
|
432
|
+
with in_other_dir(src_dir):
|
|
433
|
+
assert base_dim is not None
|
|
434
|
+
mapping_model = MappingTableModel.from_pre_registered_model(
|
|
435
|
+
dim.mapping,
|
|
436
|
+
base_dim_refs[base_dim.model.dimension_id],
|
|
437
|
+
ref,
|
|
438
|
+
)
|
|
439
|
+
mappings.append(mapping_model)
|
|
440
|
+
|
|
441
|
+
mapping_config = DimensionMappingsConfig.load_from_model(
|
|
442
|
+
DimensionMappingsConfigModel(mappings=mappings),
|
|
443
|
+
)
|
|
444
|
+
mapping_ids = self._dimension_mapping_mgr.register_from_config(mapping_config, context)
|
|
445
|
+
model.dimension_mappings.base_to_supplemental_references += (
|
|
446
|
+
self._dimension_mapping_mgr.make_dimension_mapping_references(mapping_ids, conn=conn)
|
|
447
|
+
)
|
|
448
|
+
|
|
449
|
+
def _register_subset_dimensions(
|
|
450
|
+
self,
|
|
451
|
+
model: ProjectConfigModel,
|
|
452
|
+
subset_dimensions: list[SubsetDimensionGroupModel],
|
|
453
|
+
context: RegistrationContext,
|
|
454
|
+
):
|
|
455
|
+
logger.info("Register subset dimensions")
|
|
456
|
+
self._register_dimensions_from_subset_dimension_groups(
|
|
457
|
+
subset_dimensions,
|
|
458
|
+
model.dimensions.base_dimension_references,
|
|
459
|
+
context,
|
|
460
|
+
)
|
|
461
|
+
self._register_supplemental_dimensions_from_subset_dimensions(
|
|
462
|
+
model,
|
|
463
|
+
subset_dimensions,
|
|
464
|
+
context,
|
|
465
|
+
)
|
|
466
|
+
|
|
467
|
+
    def _register_dimensions_from_subset_dimension_groups(
        self,
        subset_dimensions: list[SubsetDimensionGroupModel],
        base_dimension_references: list[DimensionReferenceModel],
        context: RegistrationContext,
    ):
        """Registers a dimension for each subset specified in the project config's subset
        dimension groups. Appends references to those dimensions to subset_dimensions, which is
        part of the project config.
        """
        conn = context.connection
        with TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            dimensions = []
            # Maps (dimension_type, selector name) back to the owning subset group.
            subset_refs = {}
            for subset_dimension in subset_dimensions:
                # Find the base dimension of the same type as this subset group.
                base_dim = None
                for ref in base_dimension_references:
                    if ref.dimension_type == subset_dimension.dimension_type:
                        base_dim = self._dimension_mgr.get_by_id(ref.dimension_id, conn=conn)
                        break
                assert isinstance(base_dim, DimensionBaseConfigWithFiles), subset_dimension
                base_records = base_dim.get_records_dataframe()
                self._check_subset_dimension_consistency(subset_dimension, base_records)
                for selector in subset_dimension.selectors:
                    # Write the selected subset of base records to a CSV and
                    # create a dimension model backed by that file.
                    new_records = base_records.filter(base_records["id"].isin(selector.records))
                    filename = tmp_path / f"{subset_dimension.name}_{selector.name}.csv"
                    new_records.toPandas().to_csv(filename, index=False)
                    dim = DimensionModel(
                        file=str(filename),
                        name=selector.name,
                        type=subset_dimension.dimension_type,
                        module=base_dim.model.module,
                        class_name=base_dim.model.class_name,
                        description=selector.description,
                    )
                    dimensions.append(dim)
                    key = (subset_dimension.dimension_type, selector.name)
                    if key in subset_refs:
                        msg = f"Bug: unhandled case of duplicate dimension name: {key=}"
                        raise Exception(msg)
                    subset_refs[key] = subset_dimension

            dim_model = DimensionsConfigModel(dimensions=dimensions)
            dims_config = DimensionsConfig.load_from_model(dim_model)
            dimension_ids = self._dimension_mgr.register_from_config(dims_config, context)
            # Record a reference to each newly registered dimension on its
            # subset group.
            for dimension_id in dimension_ids:
                dim = self._dimension_mgr.get_by_id(dimension_id, conn=conn)
                key = (dim.model.dimension_type, dim.model.name)
                subset_dim = subset_refs[key]
                subset_dim.selector_references.append(
                    DimensionReferenceModel(
                        dimension_id=dimension_id,
                        type=subset_dim.dimension_type,
                        # NOTE(review): assumes a just-registered dimension is at
                        # version 1.0.0 — confirm against the versioning scheme.
                        version="1.0.0",
                    )
                )
|
|
524
|
+
|
|
525
|
+
def _check_subset_dimension_consistency(
|
|
526
|
+
self,
|
|
527
|
+
subset_dimension: SubsetDimensionGroupModel,
|
|
528
|
+
base_records: DataFrame,
|
|
529
|
+
) -> None:
|
|
530
|
+
base_record_ids = get_unique_values(base_records, "id")
|
|
531
|
+
diff = subset_dimension.record_ids.difference(base_record_ids)
|
|
532
|
+
if diff:
|
|
533
|
+
msg = (
|
|
534
|
+
f"subset dimension {subset_dimension.name} "
|
|
535
|
+
f"uses dimension records not present in the base dimension: {diff}"
|
|
536
|
+
)
|
|
537
|
+
raise DSGInvalidParameter(msg)
|
|
538
|
+
|
|
539
|
+
diff = base_record_ids.difference(subset_dimension.record_ids)
|
|
540
|
+
if diff:
|
|
541
|
+
msg = (
|
|
542
|
+
f"subset dimension {subset_dimension.name} "
|
|
543
|
+
f"does not list these base dimension records: {diff}"
|
|
544
|
+
)
|
|
545
|
+
raise DSGInvalidParameter(msg)
|
|
546
|
+
|
|
547
|
+
    def _register_supplemental_dimensions_from_subset_dimensions(
        self,
        model: ProjectConfigModel,
        subset_dimensions: list[SubsetDimensionGroupModel],
        context: RegistrationContext,
    ):
        """Registers a supplemental dimension for each subset specified in the project config's
        subset dimension groups. Also registers a mapping from the base dimension to each new
        supplemental dimension. Appends references to those dimensions to the project config's
        supplemental_dimension_references list.
        """
        conn = context.connection
        with TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            dimensions = []
            for subset_dimension_group in subset_dimensions:
                if not subset_dimension_group.create_supplemental_dimension:
                    continue
                dimension_type = subset_dimension_group.dimension_type
                # Resolve the base dimension this group maps from, optionally
                # constrained by base_dimension_name.
                base_dims: list[DimensionBaseConfigWithFiles] = []
                for ref in model.dimensions.base_dimension_references:
                    if ref.dimension_type == dimension_type:
                        base_dim = self._dimension_mgr.get_by_id(ref.dimension_id, conn=conn)
                        if (
                            subset_dimension_group.base_dimension_name is None
                            or base_dim.model.name == subset_dimension_group.base_dimension_name
                        ):
                            base_dims.append(base_dim)
                            break
                if len(base_dims) == 0:
                    msg = f"Did not find a base dimension for {subset_dimension_group=}"
                    raise Exception(msg)
                # NOTE(review): the loop above breaks after the first match, so
                # this branch looks unreachable — confirm the intended break level.
                elif len(base_dims) > 1:
                    msg = (
                        f"Found multiple base dimensions for {dimension_type=}. Please specify "
                        f"'base_dimension_name' in {subset_dimension_group=}"
                    )
                    raise DSGInvalidParameter(msg)
                base_dim = base_dims[0]
                # One record per selector: the selector becomes a supplemental
                # dimension record, and its member records become mapping rows.
                records: dict[str, list[Any]] = {"id": [], "name": []}
                mapping_records = []
                dim_record_ids = set()
                # The pydantic validator has already checked consistency of these columns.
                for column in subset_dimension_group.selectors[0].column_values:
                    records[column] = []
                for selector in subset_dimension_group.selectors:
                    records["id"].append(selector.name)
                    records["name"].append(selector.name)
                    if selector.column_values:
                        for column, value in selector.column_values.items():
                            records[column].append(value)
                    for record_id in selector.records:
                        mapping_records.append({"from_id": record_id, "to_id": selector.name})
                        dim_record_ids.add(record_id)

                filename = tmp_path / f"{subset_dimension_group.name}.csv"
                pd.DataFrame(records).to_csv(filename, index=False)

                # Base records not covered by any selector map to an empty to_id.
                for record_id in base_dim.get_unique_ids().difference(dim_record_ids):
                    mapping_records.append({"from_id": record_id, "to_id": ""})
                map_record_file = tmp_path / f"{subset_dimension_group.name}_mapping.csv"
                pd.DataFrame.from_records(mapping_records).to_csv(map_record_file, index=False)

                dim = SupplementalDimensionModel(
                    file=str(filename),
                    name=subset_dimension_group.name,
                    type=dimension_type,
                    module=base_dim.model.module,
                    class_name=base_dim.model.class_name,
                    description=subset_dimension_group.description,
                    mapping=MappingTableByNameModel(
                        file=str(map_record_file),
                        mapping_type=DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS,
                        description=f"Aggregation map for {subset_dimension_group.name}",
                        project_base_dimension_name=base_dim.model.name,
                    ),
                )
                dimensions.append(dim)

            self._register_supplemental_dimensions_from_models(
                tmp_path,
                model,
                dimensions,
                context,
            )
|
|
632
|
+
|
|
633
|
+
def _register_all_in_one_dimensions(
|
|
634
|
+
self,
|
|
635
|
+
src_dir,
|
|
636
|
+
model,
|
|
637
|
+
context: RegistrationContext,
|
|
638
|
+
):
|
|
639
|
+
with TemporaryDirectory() as tmpdir:
|
|
640
|
+
tmp_path = Path(tmpdir)
|
|
641
|
+
new_dimensions = []
|
|
642
|
+
dim_type_to_ref = {
|
|
643
|
+
x.dimension_type: x for x in model.dimensions.base_dimension_references
|
|
644
|
+
}
|
|
645
|
+
# Metric is excluded because fuel_id and unit may not be the same for all records.
|
|
646
|
+
# Time doesn't have records.
|
|
647
|
+
exclude = {DimensionType.METRIC, DimensionType.TIME}
|
|
648
|
+
for dimension_type in (x for x in DimensionType if x not in exclude):
|
|
649
|
+
dim_ref = dim_type_to_ref[dimension_type]
|
|
650
|
+
dim_config = self._dimension_mgr.get_by_id(
|
|
651
|
+
dim_ref.dimension_id, conn=context.connection
|
|
652
|
+
)
|
|
653
|
+
assert isinstance(dim_config, DimensionBaseConfigWithFiles)
|
|
654
|
+
dt_str = dimension_type.value
|
|
655
|
+
if dt_str.endswith("y"):
|
|
656
|
+
dt_plural = dt_str[:-1] + "ies"
|
|
657
|
+
else:
|
|
658
|
+
dt_plural = dt_str + "s"
|
|
659
|
+
dt_all_plural = f"all_{dt_plural}"
|
|
660
|
+
dim_name = f"all_{model.project_id}_{dt_plural}"
|
|
661
|
+
dim_name_formal = f"All {dt_plural.title()}"
|
|
662
|
+
dim_record_file = tmp_path / f"{dt_all_plural}.csv"
|
|
663
|
+
dim_text = f"id,name\n{dt_all_plural},{dim_name_formal}\n"
|
|
664
|
+
dim_record_file.write_text(dim_text)
|
|
665
|
+
map_record_file = tmp_path / f"lookup_{dt_str}_to_{dt_all_plural}.csv"
|
|
666
|
+
with open(map_record_file, "w") as f_out:
|
|
667
|
+
f_out.write("from_id,to_id\n")
|
|
668
|
+
for record in dim_config.get_unique_ids():
|
|
669
|
+
f_out.write(record)
|
|
670
|
+
f_out.write(",")
|
|
671
|
+
f_out.write(dt_all_plural)
|
|
672
|
+
f_out.write("\n")
|
|
673
|
+
|
|
674
|
+
with in_other_dir(src_dir):
|
|
675
|
+
new_dim = SupplementalDimensionModel(
|
|
676
|
+
file=str(dim_record_file),
|
|
677
|
+
name=dim_name,
|
|
678
|
+
type=dimension_type,
|
|
679
|
+
module="dsgrid.dimension.base_models",
|
|
680
|
+
class_name="DimensionRecordBaseModel",
|
|
681
|
+
description=dim_name_formal,
|
|
682
|
+
mapping=MappingTableByNameModel(
|
|
683
|
+
file=str(map_record_file),
|
|
684
|
+
mapping_type=DimensionMappingType.MANY_TO_ONE_AGGREGATION,
|
|
685
|
+
description=f"Aggregation map for all {dt_str}s",
|
|
686
|
+
),
|
|
687
|
+
)
|
|
688
|
+
new_dimensions.append(new_dim)
|
|
689
|
+
|
|
690
|
+
self._register_supplemental_dimensions_from_models(
|
|
691
|
+
src_dir,
|
|
692
|
+
model,
|
|
693
|
+
new_dimensions,
|
|
694
|
+
context,
|
|
695
|
+
)
|
|
696
|
+
|
|
697
|
+
def _register(self, config: ProjectConfig, context: RegistrationContext):
|
|
698
|
+
self._run_checks(config)
|
|
699
|
+
|
|
700
|
+
config.model.version = "1.0.0"
|
|
701
|
+
model = self.db.insert(context.connection, config.model, context.registration)
|
|
702
|
+
assert isinstance(model, ProjectConfigModel)
|
|
703
|
+
logger.info(
|
|
704
|
+
"%s Registered project %s with version=%s",
|
|
705
|
+
self._log_offline_mode_prefix(),
|
|
706
|
+
model.project_id,
|
|
707
|
+
config.model.version,
|
|
708
|
+
)
|
|
709
|
+
|
|
710
|
+
def _run_checks(self, config: ProjectConfig):
|
|
711
|
+
dims = [x for x in config.iter_dimensions()]
|
|
712
|
+
check_uniqueness((x.model.name for x in dims), "dimension name")
|
|
713
|
+
self._check_base_dimensions(config)
|
|
714
|
+
|
|
715
|
+
for dataset_id in config.list_unregistered_dataset_ids():
|
|
716
|
+
for field in RequiredDimensionRecordsModel.model_fields:
|
|
717
|
+
# This will check that all dimension record IDs listed in the requirements
|
|
718
|
+
# exist in the project.
|
|
719
|
+
config.get_required_dimension_record_ids(dataset_id, DimensionType(field))
|
|
720
|
+
|
|
721
|
+
def _check_base_dimensions(self, config: ProjectConfig) -> None:
|
|
722
|
+
found_time = False
|
|
723
|
+
for dim in config.list_base_dimensions():
|
|
724
|
+
if dim.model.dimension_type == DimensionType.TIME:
|
|
725
|
+
if found_time:
|
|
726
|
+
msg = "Only one time dimension is allowed in a project."
|
|
727
|
+
raise DSGInvalidDimension(msg)
|
|
728
|
+
found_time = True
|
|
729
|
+
|
|
730
|
+
assert found_time
|
|
731
|
+
self._set_dataset_record_requirement_definitions_names(config)
|
|
732
|
+
self._check_dataset_record_requirement_definitions(config)
|
|
733
|
+
|
|
734
|
+
    def _set_dataset_record_requirement_definitions_names(
        self,
        config: ProjectConfig,
    ) -> None:
        """Resolve dimension names on each dataset's record-requirement entries.

        Walks every dataset's single-dimensional and multi-dimensional
        requirements and applies ``set_dimension_name`` to the ``base`` and
        ``base_missing`` entries of each dimension-type field.
        """

        def set_dimension_name(req: RequiredBaseDimensionModel) -> None:
            # NOTE(review): this condition is contradictory (`is None and is not
            # None` on the same attribute) and is therefore always False, making
            # this helper a no-op. It looks like it was meant to migrate a
            # deprecated alias field (e.g. a legacy query-name attribute) into
            # ``dimension_name`` -- confirm the intended source field and fix.
            if req.dimension_name is None and req.dimension_name is not None:
                dim = config.get_dimension_by_name(req.dimension_name)
                req.dimension_name = None
                req.dimension_name = dim.model.name

        for dataset in config.model.datasets:
            # The requirements model's fields are keyed by dimension-type name.
            dim_type_as_fields = RequiredDimensionRecordsModel.model_fields.keys()
            for field in dim_type_as_fields:
                req = getattr(dataset.required_dimensions.single_dimensional, field)
                for base_field in ("base", "base_missing"):
                    set_dimension_name(getattr(req, base_field))
                for multi_dim in dataset.required_dimensions.multi_dimensional:
                    req = getattr(multi_dim, field)
                    for base_field in ("base", "base_missing"):
                        set_dimension_name(getattr(req, base_field))
|
|
754
|
+
|
|
755
|
+
def _check_dataset_record_requirement_definitions(
|
|
756
|
+
self,
|
|
757
|
+
config: ProjectConfig,
|
|
758
|
+
) -> None:
|
|
759
|
+
for dataset in config.model.datasets:
|
|
760
|
+
dim_type_as_fields = RequiredDimensionRecordsModel.model_fields.keys()
|
|
761
|
+
for dim_type_as_field in dim_type_as_fields:
|
|
762
|
+
dim_type = DimensionType(dim_type_as_field)
|
|
763
|
+
required_dimension_records = getattr(
|
|
764
|
+
dataset.required_dimensions.single_dimensional, dim_type_as_field
|
|
765
|
+
)
|
|
766
|
+
self._check_base_dimension_record_requirements(
|
|
767
|
+
required_dimension_records, dim_type, config, dataset.dataset_id
|
|
768
|
+
)
|
|
769
|
+
for multi_dim in dataset.required_dimensions.multi_dimensional:
|
|
770
|
+
required_dimension_records = getattr(multi_dim, dim_type_as_field)
|
|
771
|
+
self._check_base_dimension_record_requirements(
|
|
772
|
+
required_dimension_records, dim_type, config, dataset.dataset_id
|
|
773
|
+
)
|
|
774
|
+
|
|
775
|
+
    def _check_base_dimension_record_requirements(
        self,
        req_dim_records: RequiredDimensionRecordsByTypeModel,
        dim_type: DimensionType,
        config: ProjectConfig,
        dataset_id: str,
    ) -> None:
        """Ensure record requirements name a base dimension where one is needed.

        If a requirement lists record IDs without naming a base dimension,
        assign the project's sole base dimension of that type, or raise
        DSGInvalidDimensionMapping when there is more than one and the choice
        is ambiguous.
        """
        base_dims = config.list_base_dimensions(dimension_type=dim_type)
        for base_field in ("base", "base_missing"):
            reqs = getattr(req_dim_records, base_field)
            if reqs.record_ids and reqs.dimension_name is None:
                if len(base_dims) == 1:
                    # Unambiguous: default to the only base dimension of this type.
                    reqs.dimension_name = base_dims[0].model.name
                    logger.debug(
                        "Assigned dimension_name=%s for %s dataset_id=%s",
                        reqs.dimension_name,
                        dim_type,
                        dataset_id,
                    )
                else:
                    msg = (
                        f"{dataset_id=} requires a base dimension name for "
                        f"{dim_type} because the project has {len(base_dims)} base dimensions."
                    )
                    raise DSGInvalidDimensionMapping(msg)
                # Only one of base and base_missing can be set, and that was already checked.
                break
|
|
802
|
+
|
|
803
|
+
    @track_timing(timer_stats_collector)
    def register_and_submit_dataset(
        self,
        dataset_config_file: Path,
        project_id: str,
        submitter: str,
        log_message: str,
        dimension_mapping_file=None,
        dimension_mapping_references_file=None,
        autogen_reverse_supplemental_mappings=None,
        data_base_dir: Path | None = None,
        missing_associations_base_dir: Path | None = None,
    ):
        """Register a dataset and submit it to a project in one registration context.

        Raises DSGValueNotRegistered if the project is not registered, and
        DSGDuplicateValueRegistered (via _raise_if_not_unregistered) if the
        dataset is already registered with the project.
        """
        with RegistrationContext(
            self.db, log_message, VersionUpdateType.MINOR, submitter
        ) as context:
            conn = context.connection
            if not self.has_id(project_id, conn=conn):
                msg = f"{project_id=}"
                raise DSGValueNotRegistered(msg)

            dataset_config = DatasetConfig.load_from_user_path(
                dataset_config_file,
                data_base_dir=data_base_dir,
                missing_associations_base_dir=missing_associations_base_dir,
            )
            dataset_id = dataset_config.model.dataset_id
            config = self.get_by_id(project_id, conn=conn)
            # This will raise an exception if the dataset_id is not part of the project or already
            # registered.
            self._raise_if_not_unregistered(config, dataset_id)

            self._dataset_mgr.register(
                dataset_config_file,
                context=context,
                data_base_dir=data_base_dir,
                missing_associations_base_dir=missing_associations_base_dir,
            )
            # The context accumulated the new dataset's registry ID during register().
            self.submit_dataset(
                project_id,
                context.get_ids(RegistryType.DATASET)[0],
                dimension_mapping_file=dimension_mapping_file,
                dimension_mapping_references_file=dimension_mapping_references_file,
                autogen_reverse_supplemental_mappings=autogen_reverse_supplemental_mappings,
                context=context,
            )
|
|
849
|
+
|
|
850
|
+
@track_timing(timer_stats_collector)
|
|
851
|
+
def submit_dataset(
|
|
852
|
+
self,
|
|
853
|
+
project_id: str,
|
|
854
|
+
dataset_id: str,
|
|
855
|
+
submitter: str | None = None,
|
|
856
|
+
log_message: str | None = None,
|
|
857
|
+
dimension_mapping_file: Path | None = None,
|
|
858
|
+
dimension_mapping_references_file: Path | None = None,
|
|
859
|
+
autogen_reverse_supplemental_mappings: list[DimensionType] | None = None,
|
|
860
|
+
context: RegistrationContext | None = None,
|
|
861
|
+
):
|
|
862
|
+
"""Registers a dataset with a project. This can only be performed on the
|
|
863
|
+
latest version of the project.
|
|
864
|
+
|
|
865
|
+
Parameters
|
|
866
|
+
----------
|
|
867
|
+
project_id : str
|
|
868
|
+
dataset_id : str
|
|
869
|
+
dimension_mapping_file : Path or None
|
|
870
|
+
Base-to-base dimension mapping file
|
|
871
|
+
dimension_mapping_references_file : Path or None
|
|
872
|
+
autogen_reverse_supplemental_mappings : list[DimensionType] or None
|
|
873
|
+
Dimensions on which to attempt create reverse mappings from supplemental dimensions.
|
|
874
|
+
submitter : str
|
|
875
|
+
Submitter name
|
|
876
|
+
log_message : str
|
|
877
|
+
context : None or RegistrationContext
|
|
878
|
+
|
|
879
|
+
Raises
|
|
880
|
+
------
|
|
881
|
+
DSGValueNotRegistered
|
|
882
|
+
Raised if the project_id or dataset_id is not registered.
|
|
883
|
+
DSGDuplicateValueRegistered
|
|
884
|
+
Raised if the dataset is already registered with the project.
|
|
885
|
+
ValueError
|
|
886
|
+
Raised if the project does not contain this dataset.
|
|
887
|
+
|
|
888
|
+
"""
|
|
889
|
+
if context is None:
|
|
890
|
+
assert submitter is not None
|
|
891
|
+
assert log_message is not None
|
|
892
|
+
with RegistrationContext(
|
|
893
|
+
self.db, log_message, VersionUpdateType.MINOR, submitter
|
|
894
|
+
) as context:
|
|
895
|
+
config = self.get_by_id(project_id, conn=context.connection)
|
|
896
|
+
self._submit_dataset_and_register_mappings(
|
|
897
|
+
config,
|
|
898
|
+
dataset_id,
|
|
899
|
+
dimension_mapping_file,
|
|
900
|
+
dimension_mapping_references_file,
|
|
901
|
+
autogen_reverse_supplemental_mappings,
|
|
902
|
+
context,
|
|
903
|
+
)
|
|
904
|
+
else:
|
|
905
|
+
config = self.get_by_id(project_id, conn=context.connection)
|
|
906
|
+
self._submit_dataset_and_register_mappings(
|
|
907
|
+
config,
|
|
908
|
+
dataset_id,
|
|
909
|
+
dimension_mapping_file,
|
|
910
|
+
dimension_mapping_references_file,
|
|
911
|
+
autogen_reverse_supplemental_mappings,
|
|
912
|
+
context,
|
|
913
|
+
)
|
|
914
|
+
|
|
915
|
+
def register_subset_dimensions(
|
|
916
|
+
self,
|
|
917
|
+
project_id: str,
|
|
918
|
+
filename: Path,
|
|
919
|
+
submitter: str,
|
|
920
|
+
log_message: str,
|
|
921
|
+
update_type: VersionUpdateType,
|
|
922
|
+
):
|
|
923
|
+
"""Register new subset dimensions."""
|
|
924
|
+
with RegistrationContext(self.db, log_message, update_type, submitter) as context:
|
|
925
|
+
config = self.get_by_id(project_id, conn=context.connection)
|
|
926
|
+
subset_model = SubsetDimensionGroupListModel.from_file(filename)
|
|
927
|
+
self._register_subset_dimensions(
|
|
928
|
+
config.model,
|
|
929
|
+
subset_model.subset_dimensions,
|
|
930
|
+
context,
|
|
931
|
+
)
|
|
932
|
+
self._make_new_config(config, context)
|
|
933
|
+
|
|
934
|
+
def register_supplemental_dimensions(
|
|
935
|
+
self,
|
|
936
|
+
project_id: str,
|
|
937
|
+
filename: Path,
|
|
938
|
+
submitter: str,
|
|
939
|
+
log_message: str,
|
|
940
|
+
update_type: VersionUpdateType,
|
|
941
|
+
):
|
|
942
|
+
"""Register new supplemental dimensions."""
|
|
943
|
+
with RegistrationContext(self.db, log_message, update_type, submitter) as context:
|
|
944
|
+
config = self.get_by_id(project_id, conn=context.connection)
|
|
945
|
+
model = SupplementalDimensionsListModel.from_file(filename)
|
|
946
|
+
self._register_supplemental_dimensions_from_models(
|
|
947
|
+
filename.parent,
|
|
948
|
+
config.model,
|
|
949
|
+
model.supplemental_dimensions,
|
|
950
|
+
context,
|
|
951
|
+
)
|
|
952
|
+
self._make_new_config(config, context)
|
|
953
|
+
|
|
954
|
+
def add_dataset_requirements(
|
|
955
|
+
self,
|
|
956
|
+
project_id: str,
|
|
957
|
+
filename: Path,
|
|
958
|
+
submitter: str,
|
|
959
|
+
log_message: str,
|
|
960
|
+
update_type: VersionUpdateType,
|
|
961
|
+
):
|
|
962
|
+
"""Add requirements for one or more datasets to the project."""
|
|
963
|
+
with RegistrationContext(self.db, log_message, update_type, submitter) as context:
|
|
964
|
+
config = self.get_by_id(project_id, conn=context.connection)
|
|
965
|
+
model = InputDatasetListModel.from_file(filename)
|
|
966
|
+
existing_ids = {x.dataset_id for x in config.model.datasets}
|
|
967
|
+
for dataset in model.datasets:
|
|
968
|
+
if dataset.dataset_id in existing_ids:
|
|
969
|
+
msg = f"{dataset.dataset_id} is already stored in the project"
|
|
970
|
+
raise DSGInvalidParameter(msg)
|
|
971
|
+
if dataset.status != DatasetRegistryStatus.UNREGISTERED:
|
|
972
|
+
msg = f"New dataset {dataset.dataset_id} status must be unregistered: {dataset.status}"
|
|
973
|
+
raise DSGInvalidParameter(msg)
|
|
974
|
+
|
|
975
|
+
config.model.datasets += model.datasets
|
|
976
|
+
self._make_new_config(config, context)
|
|
977
|
+
|
|
978
|
+
def replace_dataset_dimension_requirements(
|
|
979
|
+
self,
|
|
980
|
+
project_id: str,
|
|
981
|
+
filename: Path,
|
|
982
|
+
submitter: str,
|
|
983
|
+
log_message: str,
|
|
984
|
+
update_type: VersionUpdateType,
|
|
985
|
+
):
|
|
986
|
+
"""Replace dataset requirements in a project."""
|
|
987
|
+
with RegistrationContext(self.db, log_message, update_type, submitter) as context:
|
|
988
|
+
config = self.get_by_id(project_id, conn=context.connection)
|
|
989
|
+
model = InputDatasetDimensionRequirementsListModel.from_file(filename)
|
|
990
|
+
for dataset in model.dataset_dimension_requirements:
|
|
991
|
+
found = False
|
|
992
|
+
for i in range(len(config.model.datasets)):
|
|
993
|
+
if config.model.datasets[i].dataset_id == dataset.dataset_id:
|
|
994
|
+
config.model.datasets[i].required_dimensions = dataset.required_dimensions
|
|
995
|
+
if config.model.datasets[i].status == DatasetRegistryStatus.REGISTERED:
|
|
996
|
+
config.model.datasets[i].status = DatasetRegistryStatus.UNREGISTERED
|
|
997
|
+
logger.info(
|
|
998
|
+
"Changed dataset %s status to %s in project %s",
|
|
999
|
+
dataset.dataset_id,
|
|
1000
|
+
config.model.datasets[i].status.value,
|
|
1001
|
+
project_id,
|
|
1002
|
+
)
|
|
1003
|
+
# TODO: When issue #309 is addressed, we need to set all dependent
|
|
1004
|
+
# derived datasets to unregistered also.
|
|
1005
|
+
found = True
|
|
1006
|
+
break
|
|
1007
|
+
if not found:
|
|
1008
|
+
msg = f"{dataset.dataset_type} is not present in the project config"
|
|
1009
|
+
raise DSGInvalidParameter(msg)
|
|
1010
|
+
|
|
1011
|
+
self._make_new_config(config, context)
|
|
1012
|
+
|
|
1013
|
+
    def _submit_dataset_and_register_mappings(
        self,
        project_config: ProjectConfig,
        dataset_id: str,
        dimension_mapping_file: Path | None,
        dimension_mapping_references_file: Path | None,
        autogen_reverse_supplemental_mappings: list[DimensionType] | None,
        context: RegistrationContext,
    ) -> None:
        """Collect/register dimension-mapping references, then submit the dataset.

        References come from up to three sources: a base-to-base mapping file
        (registered here), a file of pre-registered mapping references
        (validated here), and auto-generated reverse supplemental mappings.
        """
        logger.info("Submit dataset=%s to project=%s.", dataset_id, project_config.config_id)
        self._check_if_not_registered(context.connection, project_config.config_id)
        self._raise_if_not_unregistered(project_config, dataset_id)
        dataset_config = self._dataset_mgr.get_by_id(dataset_id, conn=context.connection)

        references = []
        if dimension_mapping_file is not None:
            references += self._register_mappings_from_file(
                project_config,
                dataset_config,
                dimension_mapping_file,
                context,
            )
        if dimension_mapping_references_file is not None:
            for ref in DimensionMappingReferenceListModel.load(
                dimension_mapping_references_file
            ).references:
                # Referenced mappings must already exist in the registry.
                if not self.dimension_mapping_manager.has_id(
                    ref.mapping_id, version=ref.version, conn=context.connection
                ):
                    msg = f"mapping_id={ref.mapping_id}"
                    raise DSGValueNotRegistered(msg)
                references.append(ref)

        if autogen_reverse_supplemental_mappings:
            references += self._auto_register_reverse_supplemental_mappings(
                project_config,
                dataset_config,
                references,
                set((x.value for x in autogen_reverse_supplemental_mappings)),
                context,
            )

        self._submit_dataset(project_config, dataset_config, references, context)
|
|
1056
|
+
|
|
1057
|
+
def _raise_if_not_unregistered(self, project_config: ProjectConfig, dataset_id: str) -> None:
|
|
1058
|
+
# This will raise if the dataset is not specified in the project.
|
|
1059
|
+
dataset_model = project_config.get_dataset(dataset_id)
|
|
1060
|
+
status = dataset_model.status
|
|
1061
|
+
if status != DatasetRegistryStatus.UNREGISTERED:
|
|
1062
|
+
msg = (
|
|
1063
|
+
f"{dataset_id=} cannot be submitted to project={project_config.config_id} with "
|
|
1064
|
+
f"{status=}"
|
|
1065
|
+
)
|
|
1066
|
+
raise DSGDuplicateValueRegistered(msg)
|
|
1067
|
+
|
|
1068
|
+
    def _register_mappings_from_file(
        self,
        project_config: ProjectConfig,
        dataset_config: DatasetConfig,
        dimension_mapping_file: Path,
        context: RegistrationContext,
    ):
        """Register dataset-to-project dimension mappings defined in a file.

        Resolves each mapping's project base dimension (explicitly by
        ``project_base_dimension_name``, or implicitly when the project has
        exactly one base dimension of that type), registers the mapping
        tables, and returns the resulting DimensionMappingReferenceModel list.
        """
        references = []
        src_dir = dimension_mapping_file.parent
        mappings = DatasetBaseToProjectMappingTableListModel(
            **load_data(dimension_mapping_file)
        ).mappings
        dataset_mapping = {x.dimension_type: x for x in dataset_config.model.dimension_references}
        project_mapping: dict[DimensionType, list[DimensionBaseConfig]] = defaultdict(list)
        project_mapping_refs: dict[str, DimensionReferenceModel] = {}
        for ref in project_config.model.dimensions.base_dimension_references:
            dim = self._dimension_mgr.get_by_id(
                ref.dimension_id, version=ref.version, conn=context.connection
            )
            project_mapping[ref.dimension_type].append(dim)
            project_mapping_refs[dim.model.dimension_id] = ref
        mapping_tables = []
        for mapping in mappings:
            base_dim: DimensionBaseConfig | None = None
            if mapping.project_base_dimension_name is None:
                base_dims = project_mapping[mapping.dimension_type]
                if len(base_dims) > 1:
                    msg = (
                        "If there are multiple project base dimensions for a dimension type, the "
                        "dataset dimension mapping must supply a project_base_dimension_name. "
                        f"{mapping}"
                    )
                    raise DSGInvalidDimensionMapping(msg)
                base_dim = base_dims[0]
            else:
                # NOTE(review): no break here -- if names were duplicated, the
                # last match would win; dimension names are expected unique.
                for base_dim_ in project_mapping[mapping.dimension_type]:
                    if base_dim_.model.name == mapping.project_base_dimension_name:
                        base_dim = base_dim_
                if base_dim is None:
                    msg = f"Bug: unable to find base dimension for {mapping.project_base_dimension_name}"
                    raise Exception(msg)
            # Mapping record file paths are relative to the mapping file's directory.
            with in_other_dir(src_dir):
                assert base_dim is not None
                mapping_table = MappingTableModel.from_pre_registered_model(
                    mapping,
                    dataset_mapping[mapping.dimension_type],
                    project_mapping_refs[base_dim.model.dimension_id],
                )
            mapping_tables.append(mapping_table)

        mappings_config = DimensionMappingsConfig.load_from_model(
            DimensionMappingsConfigModel(mappings=mapping_tables)
        )
        mapping_ids = self._dimension_mapping_mgr.register_from_config(mappings_config, context)
        for mapping_id in mapping_ids:
            mapping_config = self._dimension_mapping_mgr.get_by_id(
                mapping_id, conn=context.connection
            )
            references.append(
                DimensionMappingReferenceModel(
                    from_dimension_type=mapping_config.model.from_dimension.dimension_type,
                    to_dimension_type=mapping_config.model.to_dimension.dimension_type,
                    mapping_id=mapping_id,
                    version=str(
                        self._dimension_mapping_mgr.get_latest_version(
                            mapping_id, conn=context.connection
                        )
                    ),
                )
            )

        return references
|
|
1140
|
+
|
|
1141
|
+
def _auto_register_reverse_supplemental_mappings(
|
|
1142
|
+
self,
|
|
1143
|
+
project_config: ProjectConfig,
|
|
1144
|
+
dataset_config: DatasetConfig,
|
|
1145
|
+
mapping_references: list[DimensionMappingReferenceModel],
|
|
1146
|
+
autogen_reverse_supplemental_mappings: set[str],
|
|
1147
|
+
context: RegistrationContext,
|
|
1148
|
+
):
|
|
1149
|
+
conn = context.connection
|
|
1150
|
+
references = []
|
|
1151
|
+
p_model = project_config.model
|
|
1152
|
+
p_supp_dim_ids = {
|
|
1153
|
+
x.dimension_id for x in p_model.dimensions.supplemental_dimension_references
|
|
1154
|
+
}
|
|
1155
|
+
d_dim_from_ids = set()
|
|
1156
|
+
for ref in mapping_references:
|
|
1157
|
+
mapping_config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=conn)
|
|
1158
|
+
d_dim_from_ids.add(mapping_config.model.from_dimension.dimension_id)
|
|
1159
|
+
|
|
1160
|
+
needs_mapping = []
|
|
1161
|
+
for dim in dataset_config.model.dimension_references:
|
|
1162
|
+
if (
|
|
1163
|
+
dim.dimension_type in autogen_reverse_supplemental_mappings
|
|
1164
|
+
and dim.dimension_id in p_supp_dim_ids
|
|
1165
|
+
and dim.dimension_id not in d_dim_from_ids
|
|
1166
|
+
):
|
|
1167
|
+
needs_mapping.append((dim.dimension_id, dim.version))
|
|
1168
|
+
# else:
|
|
1169
|
+
# This dimension is the same as a project base dimension.
|
|
1170
|
+
# or
|
|
1171
|
+
# The dataset may only need to provide a subset of records, and those are
|
|
1172
|
+
# checked in the dimension association table.
|
|
1173
|
+
|
|
1174
|
+
if len(needs_mapping) != len(autogen_reverse_supplemental_mappings):
|
|
1175
|
+
msg = (
|
|
1176
|
+
f"Mappings to autgen [{needs_mapping}] does not match user-specified "
|
|
1177
|
+
f"autogen_reverse_supplemental_mappings={autogen_reverse_supplemental_mappings}"
|
|
1178
|
+
)
|
|
1179
|
+
raise DSGInvalidDimensionMapping(msg)
|
|
1180
|
+
|
|
1181
|
+
new_mappings = []
|
|
1182
|
+
for from_id, from_version in needs_mapping:
|
|
1183
|
+
to_dim = self._dimension_mgr.get_by_id(from_id, version=from_version, conn=conn)
|
|
1184
|
+
from_dim, to_version = project_config.get_base_dimension_and_version(
|
|
1185
|
+
to_dim.model.dimension_type
|
|
1186
|
+
)
|
|
1187
|
+
mapping, version = self._try_get_mapping(
|
|
1188
|
+
project_config, to_dim, from_version, from_dim, to_version, context
|
|
1189
|
+
)
|
|
1190
|
+
if mapping is None:
|
|
1191
|
+
p_mapping, _ = self._try_get_mapping(
|
|
1192
|
+
project_config, from_dim, to_version, to_dim, from_version, context
|
|
1193
|
+
)
|
|
1194
|
+
assert (
|
|
1195
|
+
p_mapping is not None
|
|
1196
|
+
), f"from={from_dim.model.dimension_id} to={to_dim.model.dimension_id}"
|
|
1197
|
+
records = models_to_dataframe(p_mapping.model.records)
|
|
1198
|
+
fraction_vals = get_unique_values(records, "from_fraction")
|
|
1199
|
+
if len(fraction_vals) != 1 and next(iter(fraction_vals)) != 1.0:
|
|
1200
|
+
msg = (
|
|
1201
|
+
f"Cannot auto-generate a dataset-to-project mapping from from a project "
|
|
1202
|
+
"supplemental dimension unless the from_fraction column is empty or only "
|
|
1203
|
+
f"has values of 1.0: {p_mapping.model.mapping_id} - {fraction_vals}"
|
|
1204
|
+
)
|
|
1205
|
+
raise DSGInvalidDimensionMapping(msg)
|
|
1206
|
+
reverse_records = (
|
|
1207
|
+
records.drop("from_fraction")
|
|
1208
|
+
.select(F.col("to_id").alias("from_id"), F.col("from_id").alias("to_id"))
|
|
1209
|
+
.toPandas()
|
|
1210
|
+
)
|
|
1211
|
+
dst = Path(tempfile.gettempdir()) / f"reverse_{p_mapping.config_id}.csv"
|
|
1212
|
+
# Use pandas because spark creates a CSV directory.
|
|
1213
|
+
reverse_records.to_csv(dst, index=False)
|
|
1214
|
+
dimension_type = to_dim.model.dimension_type.value
|
|
1215
|
+
new_mappings.append(
|
|
1216
|
+
{
|
|
1217
|
+
"description": f"Maps {dataset_config.config_id} {dimension_type} to project",
|
|
1218
|
+
"dimension_type": dimension_type,
|
|
1219
|
+
"file": str(dst),
|
|
1220
|
+
"mapping_type": DimensionMappingType.MANY_TO_MANY_EXPLICIT_MULTIPLIERS.value,
|
|
1221
|
+
}
|
|
1222
|
+
)
|
|
1223
|
+
else:
|
|
1224
|
+
assert version is not None
|
|
1225
|
+
reference = DimensionMappingReferenceModel(
|
|
1226
|
+
from_dimension_type=to_dim.model.dimension_type,
|
|
1227
|
+
to_dimension_type=to_dim.model.dimension_type,
|
|
1228
|
+
mapping_id=mapping.model.mapping_id,
|
|
1229
|
+
version=version,
|
|
1230
|
+
)
|
|
1231
|
+
references.append(reference)
|
|
1232
|
+
|
|
1233
|
+
if new_mappings:
|
|
1234
|
+
# We don't currently have a way to register a single dimension mapping. It would be
|
|
1235
|
+
# better to register these mappings directly. But, this code was already here.
|
|
1236
|
+
mapping_file = Path(tempfile.gettempdir()) / "dimension_mappings.json5"
|
|
1237
|
+
mapping_file.write_text(json5.dumps({"mappings": new_mappings}, indent=2))
|
|
1238
|
+
to_delete = [mapping_file] + [x["file"] for x in new_mappings]
|
|
1239
|
+
try:
|
|
1240
|
+
references += self._register_mappings_from_file(
|
|
1241
|
+
project_config,
|
|
1242
|
+
dataset_config,
|
|
1243
|
+
mapping_file,
|
|
1244
|
+
context,
|
|
1245
|
+
)
|
|
1246
|
+
finally:
|
|
1247
|
+
for filename in to_delete:
|
|
1248
|
+
Path(filename).unlink()
|
|
1249
|
+
|
|
1250
|
+
return references
|
|
1251
|
+
|
|
1252
|
+
def _try_get_mapping(
|
|
1253
|
+
self,
|
|
1254
|
+
project_config: ProjectConfig,
|
|
1255
|
+
from_dim,
|
|
1256
|
+
from_version,
|
|
1257
|
+
to_dim,
|
|
1258
|
+
to_version,
|
|
1259
|
+
context: RegistrationContext,
|
|
1260
|
+
):
|
|
1261
|
+
conn = context.connection
|
|
1262
|
+
dimension_type = from_dim.model.dimension_type
|
|
1263
|
+
for ref in project_config.model.dimension_mappings.base_to_supplemental_references:
|
|
1264
|
+
if (
|
|
1265
|
+
ref.from_dimension_type == dimension_type
|
|
1266
|
+
and ref.to_dimension_type == dimension_type
|
|
1267
|
+
):
|
|
1268
|
+
mapping_config = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, conn=conn)
|
|
1269
|
+
if (
|
|
1270
|
+
mapping_config.model.from_dimension.dimension_id == from_dim.model.dimension_id
|
|
1271
|
+
and mapping_config.model.from_dimension.version == from_version
|
|
1272
|
+
and mapping_config.model.to_dimension.dimension_id == to_dim.model.dimension_id
|
|
1273
|
+
and mapping_config.model.to_dimension.version == to_version
|
|
1274
|
+
):
|
|
1275
|
+
return mapping_config, ref.version
|
|
1276
|
+
|
|
1277
|
+
return None, None
|
|
1278
|
+
|
|
1279
|
+
    def _submit_dataset(
        self,
        project_config: ProjectConfig,
        dataset_config: DatasetConfig,
        mapping_references: list[DimensionMappingReferenceModel],
        context: RegistrationContext,
    ):
        """Attach a registered dataset to the project and persist the new config.

        Adds the dataset's dimension mappings and base-dimension names to the
        project, optionally verifies dataset-to-project mappings, marks the
        dataset registered, updates the project status, and records the
        project-contains-dataset relationship in the database.
        """
        project_config.add_dataset_dimension_mappings(dataset_config, mapping_references)
        project_config.add_dataset_base_dimension_names(
            dataset_config.model.dataset_id,
            self._id_base_dimension_names_in_dataset(
                project_config, dataset_config, mapping_references
            ),
        )
        # Escape hatch used by tests/tooling to skip the expensive mapping checks.
        if os.environ.get("__DSGRID_SKIP_CHECK_DATASET_TO_PROJECT_MAPPING__") is not None:
            logger.warning("Skip dataset-to-project mapping checks")
        else:
            self._check_dataset_base_to_project_base_mappings(
                project_config,
                dataset_config,
                mapping_references,
                context,
            )

        dataset_model = project_config.get_dataset(dataset_config.model.dataset_id)

        dataset_model.mapping_references = mapping_references
        dataset_model.status = DatasetRegistryStatus.REGISTERED
        # The project is complete only when every declared dataset is submitted.
        if project_config.are_all_datasets_submitted():
            new_status = ProjectRegistryStatus.COMPLETE
        else:
            new_status = ProjectRegistryStatus.IN_PROGRESS
        project_config.set_status(new_status)
        config = self.update_with_context(project_config, context)
        self._db.add_contains_dataset(context.connection, config.model, dataset_config.model)

        logger.info(
            "%s Registered dataset %s with version=%s in project %s",
            self._log_offline_mode_prefix(),
            dataset_config.model.dataset_id,
            config.model.version,
            config.model.project_id,
        )
|
|
1322
|
+
|
|
1323
|
+
@track_timing(timer_stats_collector)
def _check_dataset_base_to_project_base_mappings(
    self,
    project_config: ProjectConfig,
    dataset_config: DatasetConfig,
    mapping_references: list[DimensionMappingReferenceModel],
    context: RegistrationContext,
):
    """Check that a dataset has all project-required dimension records.

    Builds the project's expected dimension-association table and the dataset's
    mapped association table, then compares them. Raises (via
    _check_distinct_column_values or handle_dimension_association_errors) when
    the mapped dataset does not cover the project's required records.

    Parameters
    ----------
    project_config : ProjectConfig
        Project whose requirements the dataset must satisfy.
    dataset_config : DatasetConfig
        Dataset being submitted to the project.
    mapping_references : list[DimensionMappingReferenceModel]
        Dataset-to-project dimension mappings to apply before comparing.
    context : RegistrationContext
        Active registration context; supplies the database connection.
    """
    logger.info("Check dataset-base-to-project-base dimension mappings.")
    data_store = self._dataset_mgr.store
    handler = make_dataset_schema_handler(
        context.connection,
        dataset_config,
        self._dimension_mgr,
        self._dimension_mapping_mgr,
        store=data_store,
        mapping_references=mapping_references,
    )
    dataset_id = dataset_config.config_id

    # Scratch space for intermediate tables; cleaned up when the context exits.
    with ScratchDirContext(self._params.scratch_dir) as scontext:
        project_table = self._make_dimension_associations(project_config, dataset_id, scontext)
        mapped_dataset_table = handler.make_mapped_dimension_association_table(scontext)
        # Drop associations the dataset is allowed to omit before comparing.
        project_table = handler.remove_expected_missing_mapped_associations(
            data_store, project_table, scontext
        )
        # Sort columns so both sides of except_all select in the same order.
        cols = sorted(project_table.columns)
        cache(mapped_dataset_table)
        diff: DataFrame | None = None

        try:
            # This check is relatively short and will show the user clear errors.
            _check_distinct_column_values(project_table, mapped_dataset_table)
            # This check is long and will produce a full table of differences.
            # It may require some effort from the user.
            diff = except_all(project_table.select(*cols), mapped_dataset_table.select(*cols))
            cache(diff)
            if not is_dataframe_empty(diff):
                # NOTE(review): dataset_id is rebound here from model.dataset_id;
                # presumably identical to config_id above — confirm.
                dataset_id = dataset_config.model.dataset_id
                handle_dimension_association_errors(diff, mapped_dataset_table, dataset_id)
        finally:
            # Release cached tables even when the checks raise.
            unpersist(mapped_dataset_table)
            if diff is not None:
                unpersist(diff)
|
|
1369
|
+
def _id_base_dimension_names_in_dataset(
    self,
    project_config: ProjectConfig,
    dataset_config: DatasetConfig,
    mapping_references: list[DimensionMappingReferenceModel],
) -> DatasetBaseDimensionNamesModel:
    """Identify, per dimension type, which project base dimension the dataset maps to.

    Resolution order per dimension type:
    1. If a mapping reference targets a base dimension of that type, use it.
    2. TIME always uses the project's single base time dimension.
    3. Otherwise, if the project has multiple base dimensions of the type,
       pick the unique one whose record IDs are a superset of the dataset's;
       ambiguity or no match raises DSGInvalidDataset.
    4. If the project has exactly one base dimension of the type, use it.

    Parameters
    ----------
    project_config : ProjectConfig
    dataset_config : DatasetConfig
    mapping_references : list[DimensionMappingReferenceModel]
        Dataset-to-project mappings; their to_dimension identifies a base dimension.

    Returns
    -------
    DatasetBaseDimensionNamesModel
        Base dimension name keyed by dimension type value.

    Raises
    ------
    DSGInvalidDataset
        If the base dimension for a type is ambiguous or cannot be determined.
    """
    base_dimension_names: dict[DimensionType, str] = {}
    # Step 1: mappings explicitly identify the base dimension they target.
    for ref in mapping_references:
        mapping = self._dimension_mapping_mgr.get_by_id(ref.mapping_id, version=ref.version)
        base_dim = self._dimension_mgr.get_by_id(
            mapping.model.to_dimension.dimension_id,
            version=mapping.model.to_dimension.version,
        ).model
        base_dimension_names[base_dim.dimension_type] = base_dim.name

    # Group the project's base dimensions by type for lookup below.
    project_base_dims_by_type: dict[DimensionType, list[DimensionBaseConfig]] = defaultdict(
        list
    )
    for dim in project_config.list_base_dimensions():
        project_base_dims_by_type[dim.model.dimension_type].append(dim)

    dataset_id = dataset_config.model.dataset_id
    for dim_type in DimensionType:
        if dim_type == DimensionType.TIME:
            # Projects are expected to define exactly one base time dimension.
            assert len(project_base_dims_by_type[dim_type]) == 1
            base_dimension_names[dim_type] = project_base_dims_by_type[dim_type][0].model.name
            continue
        if dim_type not in base_dimension_names:
            project_base_dims = project_base_dims_by_type[dim_type]
            if len(project_base_dims) > 1:
                # No mapping given: infer the base dimension by record-ID containment.
                for project_dim in project_base_dims:
                    assert isinstance(project_dim, DimensionBaseConfigWithFiles)
                    project_records = project_dim.get_records_dataframe()
                    project_record_ids = get_unique_values(project_records, "id")
                    dataset_dim = dataset_config.get_dimension_with_records(dim_type)
                    assert dataset_dim is not None
                    dataset_records = dataset_dim.get_records_dataframe()
                    dataset_record_ids = get_unique_values(dataset_records, "id")
                    if dataset_record_ids.issubset(project_record_ids):
                        project_dim_name = project_dim.model.name
                        if dim_type in base_dimension_names:
                            # A second candidate matched: ambiguous without a mapping.
                            msg = (
                                f"Found multiple project base dimensions for {dataset_id=} "
                                f"and {dim_type=}: {base_dimension_names[dim_type]} and "
                                f"{project_dim_name}. Please specify a mapping."
                            )
                            raise DSGInvalidDataset(msg)

                        base_dimension_names[dim_type] = project_dim_name
                if dim_type not in base_dimension_names:
                    # No candidate matched at all.
                    msg = (
                        f"Bug: {dim_type} has multiple base dimensions in the project, dataset "
                        f"{dataset_id} does not specify a mapping, and dsgrid could not "
                        "discern which base dimension to use."
                    )
                    raise DSGInvalidDataset(msg)
            else:
                # Single base dimension of this type: no ambiguity.
                base_dimension_names[dim_type] = project_base_dims[0].model.name

    # Convert enum keys to their string values for the pydantic model.
    data = {k.value: v for k, v in base_dimension_names.items()}
    return DatasetBaseDimensionNamesModel(**data)
|
|
1431
|
+
@track_timing(timer_stats_collector)
def _make_dimension_associations(
    self,
    config: ProjectConfig,
    dataset_id: str,
    context: ScratchDirContext,
) -> DataFrame:
    """Build the project's dimension-association table for one dataset.

    Parameters
    ----------
    config : ProjectConfig
        Project that defines the required dimension associations.
    dataset_id : str
        ID of the dataset the associations apply to.
    context : ScratchDirContext
        Scratch directory used to persist the intermediate table.

    Returns
    -------
    DataFrame
        The association table; persisted and re-read when running on Spark.
    """
    logger.info("Make dimension association table for %s", dataset_id)
    associations = config.make_dimension_association_table(dataset_id, context)
    if not use_duckdb():
        # Spark evaluates lazily and this query is expensive; persisting the
        # table and reading it back guarantees a single evaluation.
        associations = read_dataframe(
            persist_table(associations, context, "dimension_associations")
        )
    logger.info("Wrote dimension associations for dataset %s", dataset_id)
    return associations
|
|
1448
|
+
def update_from_file(
    self,
    config_file,
    project_id: str,
    submitter: str,
    update_type: VersionUpdateType,
    log_message: str,
    version: str,
) -> ProjectConfig:
    """Update a registered project from a config file.

    Parameters
    ----------
    config_file
        Path to the project config file to load.
    project_id : str
        ID of the project being updated; checked against the loaded config.
    submitter : str
        Name of the person performing the update.
    update_type : VersionUpdateType
        Which version component to bump.
    log_message : str
        Message recorded with the registration.
    version : str
        Expected current version of the project.

    Returns
    -------
    ProjectConfig
        The updated project config.
    """
    with RegistrationContext(self.db, log_message, update_type, submitter) as reg_context:
        loaded = ProjectConfig.load(config_file)
        self._update_dimensions_and_mappings(reg_context.connection, loaded)
        self._check_update(reg_context.connection, loaded, project_id, version)
        return self.update_with_context(loaded, reg_context)
|
|
1463
|
+
@track_timing(timer_stats_collector)
def update(
    self,
    config: ProjectConfig,
    update_type: VersionUpdateType,
    log_message: str,
    submitter: str | None = None,
) -> ProjectConfig:
    """Update a registered project from an in-memory config.

    Parameters
    ----------
    config : ProjectConfig
        The project config to store as the new version.
    update_type : VersionUpdateType
        Which version component to bump.
    log_message : str
        Message recorded with the registration.
    submitter : str, optional
        Name of the person performing the update.

    Returns
    -------
    ProjectConfig
        The updated project config.
    """
    with RegistrationContext(self.db, log_message, update_type, submitter) as reg_context:
        self._update_dimensions_and_mappings(reg_context.connection, config)
        return self.update_with_context(config, reg_context)
|
|
1475
|
+
def update_with_context(
    self, config: ProjectConfig, context: RegistrationContext
) -> ProjectConfig:
    """Validate an update against the stored config and apply it.

    Parameters
    ----------
    config : ProjectConfig
        The proposed new project config.
    context : RegistrationContext
        Active registration context; supplies the database connection.

    Returns
    -------
    ProjectConfig
        The newly stored project config.
    """
    current = self.get_by_id(config.model.project_id, conn=context.connection)
    # Reject changes that are not allowed between versions.
    ProjectUpdateChecker(current.model, config.model).run()
    self._run_checks(config)
    return self._make_new_config(config, context)
|
|
1484
|
+
def _make_new_config(
    self, config: ProjectConfig, context: RegistrationContext
) -> ProjectConfig:
    """Store the updated config and refresh the in-memory project cache.

    Parameters
    ----------
    config : ProjectConfig
        The config being stored; its current version identifies the stale cache entry.
    context : RegistrationContext
        Active registration context; supplies the database connection.

    Returns
    -------
    ProjectConfig
        A fresh ProjectConfig built from the stored model.
    """
    # Capture the pre-update key before _update_config bumps the version.
    stale_key = ConfigKey(config.config_id, config.model.version)
    updated_model = self._update_config(config, context)
    assert isinstance(updated_model, ProjectConfigModel)
    updated = ProjectConfig(updated_model)
    self._update_dimensions_and_mappings(context.connection, updated)
    # Replace the cached entry for the old version with the new one.
    self._projects.pop(stale_key, None)
    self._projects[ConfigKey(updated.model.project_id, updated.model.version)] = updated
    return updated
|
|
1498
|
+
def finalize_registration(self, conn: Connection, config_ids: set[str], error_occurred: bool):
    """Finish a registration, discarding cached projects if an error occurred.

    Parameters
    ----------
    conn : Connection
        Database connection (unused here; part of the manager interface).
    config_ids : set[str]
        IDs of the projects involved in the registration.
    error_occurred : bool
        Whether the registration failed; if so, evict the intermediate entries.
    """
    if not error_occurred:
        return
    logger.info("Remove intermediate project after error")
    stale_keys = [key for key in self._projects if key.id in config_ids]
    for key in stale_keys:
        self._projects.pop(key)
|
|
1504
|
+
def remove(self, config_id: str, conn: Connection | None = None) -> None:
    """Delete a project from the registry and evict it from the cache.

    Parameters
    ----------
    config_id : str
        ID of the project to remove.
    conn : Connection, optional
        Database connection; None uses the database default.
    """
    self.db.delete_all(conn, config_id)
    cached_keys = [key for key in self._projects if key.id == config_id]
    for key in cached_keys:
        self._projects.pop(key)
    logger.info("Removed %s from the registry.", config_id)
|
|
1511
|
+
def show(
    self,
    conn: Connection | None = None,
    filters: list[str] | None = None,
    max_width: Union[int, dict] | None = None,
    drop_fields: list[str] | None = None,
    return_table: bool = False,
    **kwargs,
):
    """Show registry in PrettyTable

    Parameters
    ----------
    conn : Connection, optional
        Database connection; None uses the database default.
    filters : list or tuple
        List of filter expressions for registry content (e.g., filters=["Submitter==USER", "Description contains comstock"])
    max_width
        Max column width in PrettyTable, specify as a single value or as a dict of values by field name
    drop_fields
        List of field names not to show
    return_table : bool
        If True, return the PrettyTable instead of displaying it.
    **kwargs
        Accepted for interface compatibility; unused.

    Returns
    -------
    PrettyTable or None
        The table when return_table is True, otherwise None after displaying it.
    """

    if filters:
        logger.info("List registry for: %s", filters)

    table = PrettyTable(title=self.name())
    all_field_names = (
        "ID",
        "Version",
        "Status",
        "Datasets",
        "Date",
        "Submitter",
        "Description",
    )
    # TODO: may want dataset and dataset status to be separate columns
    # TODO: this block can be refactored into base, registry should be in HTML table for notebook.
    if drop_fields is None:
        table.field_names = all_field_names
    else:
        table.field_names = tuple(x for x in all_field_names if x not in drop_fields)

    if max_width is None:
        # Default per-column widths keep the table readable in a terminal.
        table._max_width = {
            "ID": 20,
            "Status": 12,
            "Datasets": 30,
            "Date": 10,
            "Description": 30,
        }
    if isinstance(max_width, int):
        table.max_width = max_width
    elif isinstance(max_width, dict):
        table._max_width = max_width

    transformed_filters = transform_and_validate_filters(filters) if filters else None
    field_to_index = {x: i for i, x in enumerate(table.field_names)}
    rows = []
    for model in self.db.iter_models(conn):
        assert isinstance(model, ProjectConfigModel)
        registration = self.db.get_registration(conn, model)
        all_fields = (
            model.project_id,
            model.version,
            model.status.value,
            ",\n".join([f"{x.dataset_id}: {x.status.value}" for x in model.datasets]),
            registration.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            registration.submitter,
            registration.log_message,
        )
        if drop_fields is None:
            row = all_fields
        else:
            # Keep field values aligned with the retained field names.
            row = tuple(
                y for (x, y) in zip(all_field_names, all_fields) if x not in drop_fields
            )

        if not filters or matches_filters(row, field_to_index, transformed_filters):
            rows.append(row)

    # Sort by project ID for stable output.
    rows.sort(key=lambda x: x[0])
    table.add_rows(rows)
    table.align = "l"
    if return_table:
        return table
    display_table(table)
|
|
1598
|
+
|
|
1599
|
+
def _check_distinct_column_values(project_table: DataFrame, mapped_dataset_table: DataFrame):
|
|
1600
|
+
"""Ensure that the mapped dataset has the same distinct values as the project for all
|
|
1601
|
+
columns. This should be called before running a full comparison of the two tables.
|
|
1602
|
+
"""
|
|
1603
|
+
has_mismatch = False
|
|
1604
|
+
for column in project_table.columns:
|
|
1605
|
+
project_distinct = {x[column] for x in project_table.select(column).distinct().collect()}
|
|
1606
|
+
dataset_distinct = {
|
|
1607
|
+
x[column] for x in mapped_dataset_table.select(column).distinct().collect()
|
|
1608
|
+
}
|
|
1609
|
+
if diff_values := project_distinct.difference(dataset_distinct):
|
|
1610
|
+
has_mismatch = True
|
|
1611
|
+
logger.error(
|
|
1612
|
+
"The mapped dataset has different distinct values than the project "
|
|
1613
|
+
"for column=%s: diff=%s",
|
|
1614
|
+
column,
|
|
1615
|
+
diff_values,
|
|
1616
|
+
)
|
|
1617
|
+
|
|
1618
|
+
if has_mismatch:
|
|
1619
|
+
msg = (
|
|
1620
|
+
"The mapped dataset has different distinct values than the project for one or "
|
|
1621
|
+
"more columns. Please look in the log file for the exact records."
|
|
1622
|
+
)
|
|
1623
|
+
raise DSGInvalidDataset(msg)
|