climate-ref 0.6.5__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. climate_ref/cli/__init__.py +12 -3
  2. climate_ref/cli/_utils.py +56 -2
  3. climate_ref/cli/datasets.py +48 -9
  4. climate_ref/cli/executions.py +351 -24
  5. climate_ref/cli/providers.py +1 -2
  6. climate_ref/config.py +4 -4
  7. climate_ref/database.py +62 -4
  8. climate_ref/dataset_registry/obs4ref_reference.txt +0 -9
  9. climate_ref/dataset_registry/sample_data.txt +269 -107
  10. climate_ref/datasets/__init__.py +3 -3
  11. climate_ref/datasets/base.py +121 -20
  12. climate_ref/datasets/cmip6.py +2 -0
  13. climate_ref/datasets/obs4mips.py +26 -15
  14. climate_ref/executor/__init__.py +8 -1
  15. climate_ref/executor/hpc.py +7 -1
  16. climate_ref/executor/result_handling.py +151 -64
  17. climate_ref/migrations/env.py +12 -10
  18. climate_ref/migrations/versions/2025-07-20T1521_94beace57a9c_cmip6_finalised.py +1 -1
  19. climate_ref/migrations/versions/2025-08-05T0327_a1b2c3d4e5f6_finalised_on_base_dataset.py +1 -1
  20. climate_ref/migrations/versions/2025-09-05T2019_8d28e5e0f9c3_add_indexes.py +108 -0
  21. climate_ref/migrations/versions/2025-09-10T1358_2f6e36738e06_use_version_as_version_facet_for_.py +35 -0
  22. climate_ref/migrations/versions/2025-09-22T2359_20cd136a5b04_add_pmp_version.py +35 -0
  23. climate_ref/models/__init__.py +1 -6
  24. climate_ref/models/base.py +4 -18
  25. climate_ref/models/dataset.py +10 -6
  26. climate_ref/models/diagnostic.py +2 -1
  27. climate_ref/models/execution.py +225 -12
  28. climate_ref/models/metric_value.py +27 -112
  29. climate_ref/models/mixins.py +144 -0
  30. climate_ref/models/provider.py +2 -1
  31. climate_ref/provider_registry.py +4 -4
  32. climate_ref/slurm.py +2 -2
  33. climate_ref/testing.py +1 -1
  34. {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/METADATA +2 -2
  35. climate_ref-0.7.0.dist-info/RECORD +58 -0
  36. climate_ref-0.6.5.dist-info/RECORD +0 -54
  37. {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/WHEEL +0 -0
  38. {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/entry_points.txt +0 -0
  39. {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/licenses/LICENCE +0 -0
  40. {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/licenses/NOTICE +0 -0

climate_ref/datasets/base.py

@@ -2,16 +2,36 @@ from pathlib import Path
  from typing import Any, Protocol, cast

  import pandas as pd
+ from attrs import define
  from loguru import logger
  from sqlalchemy.orm import joinedload

  from climate_ref.config import Config
- from climate_ref.database import Database
+ from climate_ref.database import Database, ModelState
  from climate_ref.datasets.utils import validate_path
  from climate_ref.models.dataset import Dataset, DatasetFile
  from climate_ref_core.exceptions import RefException


+ @define
+ class DatasetRegistrationResult:
+     """
+     Result of registering a dataset, containing information about file changes
+     """
+
+     dataset: Dataset
+     dataset_state: ModelState | None
+     files_added: list[str]
+     files_updated: list[str]
+     files_removed: list[str]
+     files_unchanged: list[str]
+
+     @property
+     def total_changes(self) -> int:
+         """Total number of file changes (added + updated + removed)"""
+         return len(self.files_added) + len(self.files_updated) + len(self.files_removed)
+
+
  def _log_duplicate_metadata(
      data_catalog: pd.DataFrame, unique_metadata: pd.DataFrame, slug_column: str
  ) -> None:
@@ -26,7 +46,8 @@ def _log_duplicate_metadata(
          invalid_dataset_columns = invalid_dataset_nunique[invalid_dataset_nunique.gt(1)].index.tolist()

          # Include time_range in the list of invalid columns to make debugging easier
-         invalid_dataset_columns.append("time_range")
+         if "time_range" in data_catalog.columns and "time_range" not in invalid_dataset_columns:
+             invalid_dataset_columns.append("time_range")

          data_catalog_subset = data_catalog[data_catalog[slug_column] == instance_id]

@@ -169,9 +190,9 @@ class DatasetAdapter(Protocol):

          return data_catalog

-     def register_dataset(
+     def register_dataset(  # noqa: PLR0915
          self, config: Config, db: Database, data_catalog_dataset: pd.DataFrame
-     ) -> Dataset | None:
+     ) -> DatasetRegistrationResult:
          """
          Register a dataset in the database using the data catalog

@@ -187,7 +208,7 @@ class DatasetAdapter(Protocol):
          Returns
          -------
          :
-             Registered dataset if successful, else None
+             Registration result with dataset and file change information
          """
          DatasetModel = self.dataset_cls

@@ -197,24 +218,104 @@ class DatasetAdapter(Protocol):
              raise RefException(f"Found multiple datasets in the same directory: {unique_slugs}")
          slug = unique_slugs[0]

+         # Upsert the dataset (create a new dataset or update the metadata)
          dataset_metadata = data_catalog_dataset[list(self.dataset_specific_metadata)].iloc[0].to_dict()
-         dataset, created = db.get_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
-         if not created:
-             logger.warning(f"{dataset} already exists in the database. Skipping")
-             return None
+         dataset, dataset_state = db.update_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
+         if dataset_state == ModelState.CREATED:
+             logger.info(f"Created new dataset: {dataset}")
+         elif dataset_state == ModelState.UPDATED:
+             logger.info(f"Updating existing dataset: {dataset}")
          db.session.flush()
-         for dataset_file in data_catalog_dataset.to_dict(orient="records"):
-             path = validate_path(dataset_file.pop("path"))
-
-             db.session.add(
-                 DatasetFile(
-                     path=str(path),
-                     dataset_id=dataset.id,
-                     start_time=dataset_file.pop("start_time"),
-                     end_time=dataset_file.pop("end_time"),
+
+         # Initialize result tracking
+         files_added = []
+         files_updated = []
+         files_removed = []
+         files_unchanged = []
+
+         # Get current files for this dataset
+         current_files = db.session.query(DatasetFile).filter_by(dataset_id=dataset.id).all()
+         current_file_paths = {f.path: f for f in current_files}
+
+         # Get new file data from data catalog
+         new_file_data = data_catalog_dataset.to_dict(orient="records")
+         new_file_lookup = {}
+         for dataset_file in new_file_data:
+             file_path = str(validate_path(dataset_file["path"]))
+             new_file_lookup[file_path] = {
+                 "start_time": dataset_file["start_time"],
+                 "end_time": dataset_file["end_time"],
+             }
+
+         new_file_paths = set(new_file_lookup.keys())
+         existing_file_paths = set(current_file_paths.keys())
+
+         # TODO: support removing files that are no longer present
+         # We want to keep a record of the dataset if it was used by a diagnostic in the past
+         files_to_remove = existing_file_paths - new_file_paths
+         if files_to_remove:
+             files_removed = list(files_to_remove)
+             logger.warning(f"Files to remove: {files_removed}")
+             raise NotImplementedError("Removing files is not yet supported")
+
+         # Update existing files if start/end times have changed
+         for file_path, existing_file in current_file_paths.items():
+             if file_path in new_file_lookup:
+                 new_times = new_file_lookup[file_path]
+                 if (
+                     existing_file.start_time != new_times["start_time"]
+                     or existing_file.end_time != new_times["end_time"]
+                 ):
+                     logger.warning(f"Updating file times for {file_path}")
+                     existing_file.start_time = new_times["start_time"]
+                     existing_file.end_time = new_times["end_time"]
+                     files_updated.append(file_path)
+                 else:
+                     files_unchanged.append(file_path)
+
+         # Add new files (batch operation)
+         files_to_add = new_file_paths - existing_file_paths
+         if files_to_add:
+             files_added = list(files_to_add)
+             new_dataset_files = []
+             for file_path in files_to_add:
+                 file_times = new_file_lookup[file_path]
+                 new_dataset_files.append(
+                     DatasetFile(
+                         path=file_path,
+                         dataset_id=dataset.id,
+                         start_time=file_times["start_time"],
+                         end_time=file_times["end_time"],
+                     )
                  )
-             )
-         return dataset
+             db.session.add_all(new_dataset_files)
+
+         # Determine final dataset state
+         # If dataset metadata changed, use that state
+         # If no metadata changed but files changed, consider it updated
+         # If nothing changed, keep the original state (None for existing, CREATED for new)
+         final_dataset_state = dataset_state
+         if dataset_state is None and (files_added or files_updated or files_removed):
+             final_dataset_state = ModelState.UPDATED
+
+         result = DatasetRegistrationResult(
+             dataset=dataset,
+             dataset_state=final_dataset_state,
+             files_added=files_added,
+             files_updated=files_updated,
+             files_removed=files_removed,
+             files_unchanged=files_unchanged,
+         )
+         change_message = f": ({final_dataset_state.name})" if final_dataset_state else ""
+         logger.debug(
+             f"Dataset registration complete for {dataset.slug}{change_message} "
+             f"{len(files_added)} files added, "
+             f"{len(files_updated)} files updated, "
+             f"{len(files_removed)} files removed, "
+             f"{len(files_unchanged)} files unchanged"
+         )
+
+         return result

      def _get_dataset_files(self, db: Database, limit: int | None = None) -> pd.DataFrame:
          dataset_type = self.dataset_cls.__mapper_args__["polymorphic_identity"]
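
Note on the `register_dataset` change above: the method now always returns a `DatasetRegistrationResult` rather than `Dataset | None`, so callers that previously checked for `None` should inspect the result object instead. A minimal sketch of how a caller might consume the new return type (the `ingest_and_report` helper and its argument names are illustrative, not part of the package):

    import pandas as pd

    from climate_ref.config import Config
    from climate_ref.database import Database
    from climate_ref.datasets.base import DatasetAdapter, DatasetRegistrationResult


    def ingest_and_report(
        adapter: DatasetAdapter, config: Config, db: Database, catalog: pd.DataFrame
    ) -> DatasetRegistrationResult:
        """Register a single dataset and print a summary of the file changes."""
        result = adapter.register_dataset(config, db, catalog)
        if result.total_changes == 0:
            print(f"{result.dataset.slug}: no file changes")
        else:
            print(
                f"{result.dataset.slug}: {len(result.files_added)} added, "
                f"{len(result.files_updated)} updated, {len(result.files_removed)} removed"
            )
        return result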

climate_ref/datasets/cmip6.py

@@ -119,6 +119,8 @@ class CMIP6DatasetAdapter(DatasetAdapter):
      file_specific_metadata = ("start_time", "end_time", "path")

      version_metadata = "version"
+     # See https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf
+     # under "Directory structure template"
      dataset_id_metadata = (
          "activity_id",
          "institution_id",

climate_ref/datasets/obs4mips.py

@@ -7,7 +7,6 @@ from typing import Any
  import pandas as pd
  import xarray as xr
  from ecgtools import Builder
- from ecgtools.parsers.utilities import extract_attr_with_regex  # type: ignore
  from loguru import logger

  from climate_ref.datasets.base import DatasetAdapter
@@ -15,7 +14,7 @@ from climate_ref.datasets.cmip6 import _parse_datetime
  from climate_ref.models.dataset import Dataset, Obs4MIPsDataset


- def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
+ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:  # noqa: PLR0912
      """
      Parser for obs4mips

@@ -41,6 +40,7 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
                  "source_type",
                  "variable_id",
                  "variant_label",
+                 "source_version_number",
              }
          )
      )
@@ -48,6 +48,10 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
      try:
          time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
          with xr.open_dataset(file, chunks={}, decode_times=time_coder) as ds:
+             if ds.attrs.get("activity_id", "") != "obs4MIPs":
+                 traceback_message = f"{file} is not an obs4MIPs dataset"
+                 raise TypeError(traceback_message)
+
              has_none_value = any(ds.attrs.get(key) is None for key in keys)
              if has_none_value:
                  missing_fields = [key for key in keys if ds.attrs.get(key) is None]
@@ -55,10 +59,6 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
                  raise AttributeError(traceback_message)
              info = {key: ds.attrs.get(key) for key in keys}

-             if info["activity_id"] != "obs4MIPs":
-                 traceback_message = f"{file} is not an obs4MIPs dataset"
-                 raise TypeError(traceback_message)
-
              variable_id = info["variable_id"]

              if variable_id:
@@ -86,12 +86,12 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
              else:
                  info["time_range"] = f"{start_time}-{end_time}"
              info["path"] = str(file)
-             info["source_version_number"] = (
-                 extract_attr_with_regex(
-                     str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}", strip_chars=None, ignore_case=True
-                 )
-                 or "v0"
-             )
+             # Parsing the version like for CMIP6 fails because some obs4REF paths
+             # do not include "v" in the version directory name.
+             # TODO: fix obs4REF paths
+             info["version"] = Path(file).parent.name
+             if not info["version"].startswith("v"):  # type: ignore[union-attr]
+                 info["version"] = "v{version}".format(**info)
              return info

      except (TypeError, AttributeError) as err:
@@ -99,7 +99,7 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
              logger.warning(str(err.args[0]))
          else:
              logger.warning(str(err.args))
-         return {"INVALID_ASSET": file, "TRACEBACK": traceback_message}
+         return {"INVALID_ASSET": file, "TRACEBACK": str(err)}
      except Exception:
          logger.warning(traceback.format_exc())
          return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
@@ -129,18 +129,22 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
          "variant_label",
          "long_name",
          "units",
+         "version",
          "vertical_levels",
          "source_version_number",
          slug_column,
      )

      file_specific_metadata = ("start_time", "end_time", "path")
-     version_metadata = "source_version_number"
+     version_metadata = "version"
+     # See ODS2.5 at https://doi.org/10.5281/zenodo.11500474 under "Directory structure template"
      dataset_id_metadata = (
          "activity_id",
          "institution_id",
          "source_id",
+         "frequency",
          "variable_id",
+         "nominal_resolution",
          "grid_label",
      )

@@ -186,7 +190,14 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
              self.version_metadata,
          ]
          datasets["instance_id"] = datasets.apply(
-             lambda row: "obs4MIPs." + ".".join([row[item] for item in drs_items]), axis=1
+             lambda row: "obs4MIPs."
+             + ".".join(
+                 [
+                     row[item].replace(" ", "") if item == "nominal_resolution" else row[item]
+                     for item in drs_items
+                 ]
+             ),
+             axis=1,
          )
          datasets["finalised"] = True
          return datasets
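
The instance_id construction above now folds `frequency` and `nominal_resolution` into the obs4MIPs DRS and strips spaces from `nominal_resolution` (e.g. "250 km" becomes "250km"). A standalone sketch of the join applied per row (the facet list is abbreviated for illustration; the real list comes from the adapter's `dataset_id_metadata` plus `version_metadata`):

    def join_drs_facets(row: dict[str, str], drs_items: list[str]) -> str:
        """Join DRS facets with '.', removing spaces from nominal_resolution."""
        return "obs4MIPs." + ".".join(
            row[item].replace(" ", "") if item == "nominal_resolution" else row[item]
            for item in drs_items
        )

    # Hypothetical usage: a row with nominal_resolution "250 km" contributes "250km"
    # to the dotted identifier, so the instance_id contains no whitespace.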

climate_ref/executor/__init__.py

@@ -9,7 +9,14 @@ The simplest executor is the `LocalExecutor`, which runs the diagnostic in the s
  This is useful for local testing and debugging.
  """

- from .hpc import HPCExecutor
+ from climate_ref_core.exceptions import InvalidExecutorException
+
+ try:
+     from .hpc import HPCExecutor
+ except InvalidExecutorException as exc:
+     # This exception is reraised when importing the executor as `climate_ref.executors.HPCExecutor`
+     HPCExecutor = exc  # type: ignore
+
  from .local import LocalExecutor
  from .result_handling import handle_execution_result
  from .synchronous import SynchronousExecutor
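
Note on the guarded import above: when `parsl` is missing, the name `climate_ref.executor.HPCExecutor` is bound to the captured `InvalidExecutorException` instance rather than the executor class, so the import error is deferred until the HPC executor is actually requested. A sketch of how calling code could surface that deferred error (assumed usage, not taken from the package):

    from climate_ref.executor import HPCExecutor

    if isinstance(HPCExecutor, Exception):
        # parsl is not installed (e.g. on Windows); re-raise the deferred import error
        raise HPCExecutor
    # otherwise HPCExecutor is the usual executor class and can be used as before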

climate_ref/executor/hpc.py

@@ -5,12 +5,18 @@ If you want to
  - run REF under the HPC workflows
  - run REF in multiple nodes

+ The `HPCExecutor` requires the optional `parsl` dependency.
+ This dependency (and therefore this executor) is not available on Windows.
  """

  try:
      import parsl
  except ImportError:  # pragma: no cover
-     raise ImportError("The HPCExecutor requires the `parsl` package")
+     from climate_ref_core.exceptions import InvalidExecutorException
+
+     raise InvalidExecutorException(
+         "climate_ref_core.executor.hpc.HPCExecutor", "The HPCExecutor requires the `parsl` package"
+     )

  import os
  import time

climate_ref/executor/result_handling.py

@@ -17,11 +17,12 @@ from loguru import logger
  from sqlalchemy import insert

  from climate_ref.database import Database
- from climate_ref.models import ScalarMetricValue
+ from climate_ref.models import ScalarMetricValue, SeriesMetricValue
  from climate_ref.models.execution import Execution, ExecutionOutput, ResultOutputType
  from climate_ref_core.diagnostics import ExecutionResult, ensure_relative_path
  from climate_ref_core.exceptions import ResultValidationError
  from climate_ref_core.logging import EXECUTION_LOG_FILENAME
+ from climate_ref_core.metric_values import SeriesMetricValue as TSeries
  from climate_ref_core.pycmec.controlled_vocabulary import CV
  from climate_ref_core.pycmec.metric import CMECMetric
  from climate_ref_core.pycmec.output import CMECOutput, OutputDict
@@ -65,6 +66,113 @@ def _copy_file_to_results(
      shutil.copy(input_directory / filename, output_filename)


+ def _process_execution_scalar(
+     database: Database,
+     result: "ExecutionResult",
+     execution: Execution,
+     cv: CV,
+ ) -> None:
+     """
+     Process the scalar values from the execution result and store them in the database
+
+     This also validates the scalar values against the controlled vocabulary
+     """
+     # Load the metric bundle from the file
+     cmec_metric_bundle = CMECMetric.load_from_json(result.to_output_path(result.metric_bundle_filename))
+
+     # Check that the diagnostic values conform with the controlled vocabulary
+     try:
+         cv.validate_metrics(cmec_metric_bundle)
+     except (ResultValidationError, AssertionError):
+         # TODO: Remove once we have settled on a controlled vocabulary
+         logger.exception("Diagnostic values do not conform with the controlled vocabulary")
+         # execution.mark_failed()
+
+     # Perform a bulk insert of scalar values
+     # The current implementation will swallow the exception, but display a log message
+     try:
+         scalar_values = [
+             {
+                 "execution_id": execution.id,
+                 "value": result.value,
+                 "attributes": result.attributes,
+                 **result.dimensions,
+             }
+             for result in cmec_metric_bundle.iter_results()
+         ]
+         logger.debug(f"Ingesting {len(scalar_values)} scalar values for execution {execution.id}")
+         if scalar_values:
+             # Perform this in a nested transaction to rollback if something goes wrong
+             # We will lose the metric values for a given execution, but not the whole execution
+             with database.session.begin_nested():
+                 database.session.execute(
+                     insert(ScalarMetricValue),
+                     scalar_values,
+                 )
+     # This is a broad exception catch to ensure we log any issues
+     except Exception:
+         logger.exception("Something went wrong when ingesting diagnostic values")
+
+
+ def _process_execution_series(
+     config: "Config",
+     database: Database,
+     result: "ExecutionResult",
+     execution: Execution,
+     cv: CV,
+ ) -> None:
+     """
+     Process the series values from the execution result and store them in the database
+
+     This also copies the series values file from the scratch directory to the results directory
+     and validates the series values against the controlled vocabulary.
+     """
+     assert result.series_filename, "Series filename must be set in the result"
+
+     _copy_file_to_results(
+         config.paths.scratch,
+         config.paths.results,
+         execution.output_fragment,
+         result.series_filename,
+     )
+
+     # Load the series values from the file
+     series_values_path = result.to_output_path(result.series_filename)
+     series_values = TSeries.load_from_json(series_values_path)
+
+     try:
+         cv.validate_metrics(series_values)
+     except (ResultValidationError, AssertionError):
+         # TODO: Remove once we have settled on a controlled vocabulary
+         logger.exception("Diagnostic values do not conform with the controlled vocabulary")
+         # execution.mark_failed()
+
+     # Perform a bulk insert of series values
+     try:
+         series_values_content = [
+             {
+                 "execution_id": execution.id,
+                 "values": series_result.values,
+                 "attributes": series_result.attributes,
+                 "index": series_result.index,
+                 "index_name": series_result.index_name,
+                 **series_result.dimensions,
+             }
+             for series_result in series_values
+         ]
+         logger.debug(f"Ingesting {len(series_values)} series values for execution {execution.id}")
+         if series_values:
+             # Perform this in a nested transaction to rollback if something goes wrong
+             # We will lose the metric values for a given execution, but not the whole execution
+             with database.session.begin_nested():
+                 database.session.execute(
+                     insert(SeriesMetricValue),
+                     series_values_content,
+                 )
+     except Exception:
+         logger.exception("Something went wrong when ingesting diagnostic series values")
+
+
  def handle_execution_result(
      config: "Config",
      database: Database,
@@ -88,7 +196,7 @@ def handle_execution_result(
      result
          The result of the diagnostic execution, either successful or failed
      """
-     # Always copy log data
+     # Always copy log data to the results directory
      _copy_file_to_results(
          config.paths.scratch,
          config.paths.results,
@@ -96,74 +204,52 @@ def handle_execution_result(
          EXECUTION_LOG_FILENAME,
      )

-     if result.successful and result.metric_bundle_filename is not None:
-         logger.info(f"{execution} successful")
+     if not result.successful or result.metric_bundle_filename is None:
+         logger.error(f"{execution} failed")
+         execution.mark_failed()
+         return
+
+     logger.info(f"{execution} successful")
+
+     _copy_file_to_results(
+         config.paths.scratch,
+         config.paths.results,
+         execution.output_fragment,
+         result.metric_bundle_filename,
+     )

+     if result.output_bundle_filename:
          _copy_file_to_results(
              config.paths.scratch,
              config.paths.results,
              execution.output_fragment,
-             result.metric_bundle_filename,
+             result.output_bundle_filename,
+         )
+         _handle_output_bundle(
+             config,
+             database,
+             execution,
+             result.to_output_path(result.output_bundle_filename),
          )
-         execution.mark_successful(result.as_relative_path(result.metric_bundle_filename))
-
-         if result.output_bundle_filename:
-             _copy_file_to_results(
-                 config.paths.scratch,
-                 config.paths.results,
-                 execution.output_fragment,
-                 result.output_bundle_filename,
-             )
-             _handle_output_bundle(
-                 config,
-                 database,
-                 execution,
-                 result.to_output_path(result.output_bundle_filename),
-             )

-         cmec_metric_bundle = CMECMetric.load_from_json(result.to_output_path(result.metric_bundle_filename))
-
-         # Check that the diagnostic values conform with the controlled vocabulary
-         try:
-             cv = CV.load_from_file(config.paths.dimensions_cv)
-             cv.validate_metrics(cmec_metric_bundle)
-         except (ResultValidationError, AssertionError):
-             logger.exception("Diagnostic values do not conform with the controlled vocabulary")
-             # execution.mark_failed()
-
-         # Perform a bulk insert of scalar values
-         # The current implementation will swallow the exception, but display a log message
-         try:
-             scalar_values = [
-                 {
-                     "execution_id": execution.id,
-                     "value": result.value,
-                     "attributes": result.attributes,
-                     **result.dimensions,
-                 }
-                 for result in cmec_metric_bundle.iter_results()
-             ]
-             if scalar_values:
-                 # Perform this in a nested transaction to rollback if something goes wrong
-                 # We will lose the metric values for a given execution, but not the whole execution
-                 with database.session.begin_nested():
-                     database.session.execute(
-                         insert(ScalarMetricValue),
-                         scalar_values,
-                     )
-         except Exception:
-             # TODO: Remove once we have settled on a controlled vocabulary
-             logger.exception("Something went wrong when ingesting diagnostic values")
-
-         # TODO Ingest the series values
-
-         # TODO: This should check if the result is the most recent for the execution,
-         # if so then update the dirty fields
-         # i.e. if there are outstanding executions don't make as clean
-         execution.execution_group.dirty = False
-     else:
-         logger.error(f"{execution} failed")
-         execution.mark_failed()
+     cv = CV.load_from_file(config.paths.dimensions_cv)
+
+     if result.series_filename:
+         # Process the series values if they are present
+         # This will ingest the series values into the database
+         _process_execution_series(config=config, database=database, result=result, execution=execution, cv=cv)
+
+     # Process the scalar values
+     # This will ingest the scalar values into the database
+     _process_execution_scalar(database=database, result=result, execution=execution, cv=cv)
+
+     # TODO: This should check if the result is the most recent for the execution,
+     # if so then update the dirty fields
+     # i.e. if there are outstanding executions don't make as clean
+     execution.execution_group.dirty = False
+
+     # Finally, mark the execution as successful
+     execution.mark_successful(result.as_relative_path(result.metric_bundle_filename))


  def _handle_output_bundle(
@@ -220,12 +306,13 @@ def _handle_outputs(
              filename,
          )
          database.session.add(
-             ExecutionOutput(
+             ExecutionOutput.build(
                  execution_id=execution.id,
                  output_type=output_type,
                  filename=str(filename),
                  description=output_info.description,
                  short_name=key,
                  long_name=output_info.long_name,
+                 dimensions=output_info.dimensions or {},
              )
          )
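
For reference, the series ingestion added above bulk-inserts one row per series via `insert(SeriesMetricValue)`. A sketch of the record shape it builds (field names follow the diff; the values and the trailing dimension keys are invented examples, since the real dimension columns come from the controlled vocabulary):

    example_series_row = {
        "execution_id": 42,
        "values": [0.81, 0.79, 0.84],
        "index": [1850, 1851, 1852],
        "index_name": "year",
        "attributes": {"units": "K"},
        # plus one key per CV dimension taken from series_result.dimensions, e.g.:
        "region": "global",
        "statistic": "rmse",
    }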

climate_ref/migrations/env.py

@@ -4,7 +4,10 @@ from sqlalchemy import Connection, inspect

  from climate_ref.config import Config
  from climate_ref.database import Database
- from climate_ref.models import Base, MetricValue
+ from climate_ref.models import Base
+ from climate_ref.models.execution import ExecutionOutput
+ from climate_ref.models.metric_value import MetricValue
+ from climate_ref.models.mixins import DimensionMixin
  from climate_ref_core.logging import capture_logging
  from climate_ref_core.pycmec.controlled_vocabulary import CV

@@ -33,7 +36,7 @@ target_metadata = Base.metadata
  # Custom migration functions that are run on every migration


- def _add_metric_value_columns(connection: Connection) -> None:
+ def _add_dimension_columns(connection: Connection, table: str, Cls: type[DimensionMixin]) -> None:
      """
      Add any missing columns in the current CV to the database

@@ -44,27 +47,25 @@ def _add_metric_value_columns(connection: Connection) -> None:
      connection
          Open connection to the database
      """
-     metric_value_table = "metric_value"
-
      inspector = inspect(connection)

      # Check if table already exists
      # Skip if it doesn't
      tables = inspector.get_table_names()
-     if metric_value_table not in tables:
-         logger.warning(f"No table named {metric_value_table!r} found")
+     if table not in tables:
+         logger.warning(f"No table named {table!r} found")
          return

      # Extract the current columns in the DB
-     existing_columns = [c["name"] for c in inspector.get_columns(metric_value_table)]
+     existing_columns = [c["name"] for c in inspector.get_columns(table)]

      cv_file = ref_config.paths.dimensions_cv
      cv = CV.load_from_file(cv_file)

      for dimension in cv.dimensions:
          if dimension.name not in existing_columns:
-             logger.info(f"Adding missing metric value dimension: {dimension.name!r}")
-             op.add_column(metric_value_table, MetricValue.build_dimension_column(dimension))
+             logger.info(f"Adding missing value dimension: {dimension.name!r}")
+             op.add_column(table, Cls.build_dimension_column(dimension))


  def include_object(object_, name: str, type_, reflected, compare_to) -> bool:
@@ -134,7 +135,8 @@ def run_migrations_online() -> None:
          # Set up the Operations context
          # This is needed to alter the tables
          with op.Operations.context(context.get_context()):  # type: ignore
-             _add_metric_value_columns(connection)
+             _add_dimension_columns(connection, "metric_value", MetricValue)
+             _add_dimension_columns(connection, "execution_output", ExecutionOutput)


  if context.is_offline_mode():
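
The migration hook above is now parameterised by table name and the model class that provides `build_dimension_column`, so the same per-migration check runs for both the `metric_value` and `execution_output` tables. A conceptual sketch of what it does for one table (the dimension and column names here are invented examples, not the package's CV):

    cv_dimensions = ["statistic", "region", "season"]       # dimensions declared in the configured CV
    existing_columns = {"id", "execution_id", "statistic"}  # reflected from the table via the inspector
    missing = [d for d in cv_dimensions if d not in existing_columns]
    # -> ["region", "season"]; each is then added with
    #    op.add_column(table, Cls.build_dimension_column(dimension))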