climate-ref 0.6.5__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- climate_ref/cli/__init__.py +12 -3
- climate_ref/cli/_utils.py +56 -2
- climate_ref/cli/datasets.py +48 -9
- climate_ref/cli/executions.py +351 -24
- climate_ref/cli/providers.py +1 -2
- climate_ref/config.py +4 -4
- climate_ref/database.py +62 -4
- climate_ref/dataset_registry/obs4ref_reference.txt +0 -9
- climate_ref/dataset_registry/sample_data.txt +269 -107
- climate_ref/datasets/__init__.py +3 -3
- climate_ref/datasets/base.py +121 -20
- climate_ref/datasets/cmip6.py +2 -0
- climate_ref/datasets/obs4mips.py +26 -15
- climate_ref/executor/__init__.py +8 -1
- climate_ref/executor/hpc.py +7 -1
- climate_ref/executor/result_handling.py +151 -64
- climate_ref/migrations/env.py +12 -10
- climate_ref/migrations/versions/2025-07-20T1521_94beace57a9c_cmip6_finalised.py +1 -1
- climate_ref/migrations/versions/2025-08-05T0327_a1b2c3d4e5f6_finalised_on_base_dataset.py +1 -1
- climate_ref/migrations/versions/2025-09-05T2019_8d28e5e0f9c3_add_indexes.py +108 -0
- climate_ref/migrations/versions/2025-09-10T1358_2f6e36738e06_use_version_as_version_facet_for_.py +35 -0
- climate_ref/migrations/versions/2025-09-22T2359_20cd136a5b04_add_pmp_version.py +35 -0
- climate_ref/models/__init__.py +1 -6
- climate_ref/models/base.py +4 -18
- climate_ref/models/dataset.py +10 -6
- climate_ref/models/diagnostic.py +2 -1
- climate_ref/models/execution.py +225 -12
- climate_ref/models/metric_value.py +27 -112
- climate_ref/models/mixins.py +144 -0
- climate_ref/models/provider.py +2 -1
- climate_ref/provider_registry.py +4 -4
- climate_ref/slurm.py +2 -2
- climate_ref/testing.py +1 -1
- {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/METADATA +2 -2
- climate_ref-0.7.0.dist-info/RECORD +58 -0
- climate_ref-0.6.5.dist-info/RECORD +0 -54
- {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/WHEEL +0 -0
- {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/entry_points.txt +0 -0
- {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/licenses/LICENCE +0 -0
- {climate_ref-0.6.5.dist-info → climate_ref-0.7.0.dist-info}/licenses/NOTICE +0 -0
climate_ref/datasets/base.py
CHANGED

@@ -2,16 +2,36 @@ from pathlib import Path
 from typing import Any, Protocol, cast

 import pandas as pd
+from attrs import define
 from loguru import logger
 from sqlalchemy.orm import joinedload

 from climate_ref.config import Config
-from climate_ref.database import Database
+from climate_ref.database import Database, ModelState
 from climate_ref.datasets.utils import validate_path
 from climate_ref.models.dataset import Dataset, DatasetFile
 from climate_ref_core.exceptions import RefException


+@define
+class DatasetRegistrationResult:
+    """
+    Result of registering a dataset, containing information about file changes
+    """
+
+    dataset: Dataset
+    dataset_state: ModelState | None
+    files_added: list[str]
+    files_updated: list[str]
+    files_removed: list[str]
+    files_unchanged: list[str]
+
+    @property
+    def total_changes(self) -> int:
+        """Total number of file changes (added + updated + removed)"""
+        return len(self.files_added) + len(self.files_updated) + len(self.files_removed)
+
+
 def _log_duplicate_metadata(
     data_catalog: pd.DataFrame, unique_metadata: pd.DataFrame, slug_column: str
 ) -> None:

@@ -26,7 +46,8 @@ def _log_duplicate_metadata(
         invalid_dataset_columns = invalid_dataset_nunique[invalid_dataset_nunique.gt(1)].index.tolist()

         # Include time_range in the list of invalid columns to make debugging easier
-
+        if "time_range" in data_catalog.columns and "time_range" not in invalid_dataset_columns:
+            invalid_dataset_columns.append("time_range")

         data_catalog_subset = data_catalog[data_catalog[slug_column] == instance_id]

@@ -169,9 +190,9 @@ class DatasetAdapter(Protocol):

         return data_catalog

-    def register_dataset(
+    def register_dataset(  # noqa: PLR0915
         self, config: Config, db: Database, data_catalog_dataset: pd.DataFrame
-    ) ->
+    ) -> DatasetRegistrationResult:
         """
         Register a dataset in the database using the data catalog

@@ -187,7 +208,7 @@ class DatasetAdapter(Protocol):
         Returns
         -------
         :
-
+            Registration result with dataset and file change information
         """
         DatasetModel = self.dataset_cls

@@ -197,24 +218,104 @@ class DatasetAdapter(Protocol):
            raise RefException(f"Found multiple datasets in the same directory: {unique_slugs}")
        slug = unique_slugs[0]

+        # Upsert the dataset (create a new dataset or update the metadata)
        dataset_metadata = data_catalog_dataset[list(self.dataset_specific_metadata)].iloc[0].to_dict()
-        dataset,
-        if
-            logger.
-
+        dataset, dataset_state = db.update_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
+        if dataset_state == ModelState.CREATED:
+            logger.info(f"Created new dataset: {dataset}")
+        elif dataset_state == ModelState.UPDATED:
+            logger.info(f"Updating existing dataset: {dataset}")
        db.session.flush()
-
-
-
-
-
-
-
-
-
+
+        # Initialize result tracking
+        files_added = []
+        files_updated = []
+        files_removed = []
+        files_unchanged = []
+
+        # Get current files for this dataset
+        current_files = db.session.query(DatasetFile).filter_by(dataset_id=dataset.id).all()
+        current_file_paths = {f.path: f for f in current_files}
+
+        # Get new file data from data catalog
+        new_file_data = data_catalog_dataset.to_dict(orient="records")
+        new_file_lookup = {}
+        for dataset_file in new_file_data:
+            file_path = str(validate_path(dataset_file["path"]))
+            new_file_lookup[file_path] = {
+                "start_time": dataset_file["start_time"],
+                "end_time": dataset_file["end_time"],
+            }
+
+        new_file_paths = set(new_file_lookup.keys())
+        existing_file_paths = set(current_file_paths.keys())
+
+        # TODO: support removing files that are no longer present
+        # We want to keep a record of the dataset if it was used by a diagnostic in the past
+        files_to_remove = existing_file_paths - new_file_paths
+        if files_to_remove:
+            files_removed = list(files_to_remove)
+            logger.warning(f"Files to remove: {files_removed}")
+            raise NotImplementedError("Removing files is not yet supported")
+
+        # Update existing files if start/end times have changed
+        for file_path, existing_file in current_file_paths.items():
+            if file_path in new_file_lookup:
+                new_times = new_file_lookup[file_path]
+                if (
+                    existing_file.start_time != new_times["start_time"]
+                    or existing_file.end_time != new_times["end_time"]
+                ):
+                    logger.warning(f"Updating file times for {file_path}")
+                    existing_file.start_time = new_times["start_time"]
+                    existing_file.end_time = new_times["end_time"]
+                    files_updated.append(file_path)
+                else:
+                    files_unchanged.append(file_path)
+
+        # Add new files (batch operation)
+        files_to_add = new_file_paths - existing_file_paths
+        if files_to_add:
+            files_added = list(files_to_add)
+            new_dataset_files = []
+            for file_path in files_to_add:
+                file_times = new_file_lookup[file_path]
+                new_dataset_files.append(
+                    DatasetFile(
+                        path=file_path,
+                        dataset_id=dataset.id,
+                        start_time=file_times["start_time"],
+                        end_time=file_times["end_time"],
+                    )
                )
-            )
-
+            db.session.add_all(new_dataset_files)
+
+        # Determine final dataset state
+        # If dataset metadata changed, use that state
+        # If no metadata changed but files changed, consider it updated
+        # If nothing changed, keep the original state (None for existing, CREATED for new)
+        final_dataset_state = dataset_state
+        if dataset_state is None and (files_added or files_updated or files_removed):
+            final_dataset_state = ModelState.UPDATED
+
+        result = DatasetRegistrationResult(
+            dataset=dataset,
+            dataset_state=final_dataset_state,
+            files_added=files_added,
+            files_updated=files_updated,
+            files_removed=files_removed,
+            files_unchanged=files_unchanged,
+        )
+        change_message = f": ({final_dataset_state.name})" if final_dataset_state else ""
+        logger.debug(
+            f"Dataset registration complete for {dataset.slug}{change_message} "
+            f"{len(files_added)} files added, "
+            f"{len(files_updated)} files updated, "
+            f"{len(files_removed)} files removed, "
+            f"{len(files_unchanged)} files unchanged"
+        )
+
+        return result

     def _get_dataset_files(self, db: Database, limit: int | None = None) -> pd.DataFrame:
         dataset_type = self.dataset_cls.__mapper_args__["polymorphic_identity"]
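The new `register_dataset` computes the per-file delta by comparing the set of paths already stored for the dataset with the paths in the incoming catalog: new paths are added, overlapping paths whose start/end times changed are updated in place, and paths that disappeared are collected but currently rejected with `NotImplementedError`. Below is a standalone sketch of that set arithmetic; the `FileDelta`/`diff_files` names and the `{path: (start_time, end_time)}` shape are illustrative and are not part of the climate_ref API.

```python
from dataclasses import dataclass, field


@dataclass
class FileDelta:
    added: list[str] = field(default_factory=list)
    updated: list[str] = field(default_factory=list)
    removed: list[str] = field(default_factory=list)
    unchanged: list[str] = field(default_factory=list)


def diff_files(
    existing: dict[str, tuple[str, str]],
    incoming: dict[str, tuple[str, str]],
) -> FileDelta:
    """Compare {path: (start_time, end_time)} mappings, mirroring the bookkeeping above."""
    delta = FileDelta()
    delta.added = sorted(set(incoming) - set(existing))
    # register_dataset currently raises NotImplementedError when this set is non-empty
    delta.removed = sorted(set(existing) - set(incoming))
    for path in sorted(set(existing) & set(incoming)):
        if existing[path] != incoming[path]:
            delta.updated.append(path)  # start/end times changed -> update in place
        else:
            delta.unchanged.append(path)
    return delta


if __name__ == "__main__":
    existing = {"a.nc": ("2000", "2010"), "b.nc": ("2000", "2010")}
    incoming = {"a.nc": ("2000", "2014"), "b.nc": ("2000", "2010"), "c.nc": ("2011", "2020")}
    print(diff_files(existing, incoming))
    # FileDelta(added=['c.nc'], updated=['a.nc'], removed=[], unchanged=['b.nc'])
```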
climate_ref/datasets/cmip6.py
CHANGED

@@ -119,6 +119,8 @@ class CMIP6DatasetAdapter(DatasetAdapter):
     file_specific_metadata = ("start_time", "end_time", "path")

     version_metadata = "version"
+    # See https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf
+    # under "Directory structure template"
     dataset_id_metadata = (
         "activity_id",
         "institution_id",
climate_ref/datasets/obs4mips.py
CHANGED

@@ -7,7 +7,6 @@ from typing import Any
 import pandas as pd
 import xarray as xr
 from ecgtools import Builder
-from ecgtools.parsers.utilities import extract_attr_with_regex  # type: ignore
 from loguru import logger

 from climate_ref.datasets.base import DatasetAdapter

@@ -15,7 +14,7 @@ from climate_ref.datasets.cmip6 import _parse_datetime
 from climate_ref.models.dataset import Dataset, Obs4MIPsDataset


-def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
+def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:  # noqa: PLR0912
     """
     Parser for obs4mips

@@ -41,6 +40,7 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
                 "source_type",
                 "variable_id",
                 "variant_label",
+                "source_version_number",
            }
        )
    )

@@ -48,6 +48,10 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
    try:
        time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
        with xr.open_dataset(file, chunks={}, decode_times=time_coder) as ds:
+            if ds.attrs.get("activity_id", "") != "obs4MIPs":
+                traceback_message = f"{file} is not an obs4MIPs dataset"
+                raise TypeError(traceback_message)
+
            has_none_value = any(ds.attrs.get(key) is None for key in keys)
            if has_none_value:
                missing_fields = [key for key in keys if ds.attrs.get(key) is None]

@@ -55,10 +59,6 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
                raise AttributeError(traceback_message)
            info = {key: ds.attrs.get(key) for key in keys}

-            if info["activity_id"] != "obs4MIPs":
-                traceback_message = f"{file} is not an obs4MIPs dataset"
-                raise TypeError(traceback_message)
-
            variable_id = info["variable_id"]

            if variable_id:

@@ -86,12 +86,12 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
            else:
                info["time_range"] = f"{start_time}-{end_time}"
            info["path"] = str(file)
-
-
-
-
-
-
+            # Parsing the version like for CMIP6 fails because some obs4REF paths
+            # do not include "v" in the version directory name.
+            # TODO: fix obs4REF paths
+            info["version"] = Path(file).parent.name
+            if not info["version"].startswith("v"):  # type: ignore[union-attr]
+                info["version"] = "v{version}".format(**info)
            return info

    except (TypeError, AttributeError) as err:

@@ -99,7 +99,7 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
            logger.warning(str(err.args[0]))
        else:
            logger.warning(str(err.args))
-        return {"INVALID_ASSET": file, "TRACEBACK":
+        return {"INVALID_ASSET": file, "TRACEBACK": str(err)}
    except Exception:
        logger.warning(traceback.format_exc())
        return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}

@@ -129,18 +129,22 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
        "variant_label",
        "long_name",
        "units",
+        "version",
        "vertical_levels",
        "source_version_number",
        slug_column,
    )

    file_specific_metadata = ("start_time", "end_time", "path")
-    version_metadata = "
+    version_metadata = "version"
+    # See ODS2.5 at https://doi.org/10.5281/zenodo.11500474 under "Directory structure template"
    dataset_id_metadata = (
        "activity_id",
        "institution_id",
        "source_id",
+        "frequency",
        "variable_id",
+        "nominal_resolution",
        "grid_label",
    )

@@ -186,7 +190,14 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
            self.version_metadata,
        ]
        datasets["instance_id"] = datasets.apply(
-            lambda row: "obs4MIPs."
+            lambda row: "obs4MIPs."
+            + ".".join(
+                [
+                    row[item].replace(" ", "") if item == "nominal_resolution" else row[item]
+                    for item in drs_items
+                ]
+            ),
+            axis=1,
        )
        datasets["finalised"] = True
        return datasets
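The rewritten `instance_id` construction joins the DRS facets with "." and strips the space from `nominal_resolution` so the identifier contains no whitespace. The following minimal pandas sketch shows the same expression; the facet list and sample values are assumptions for illustration and may not match the adapter's exact `drs_items`.

```python
import pandas as pd

# Hypothetical subset of DRS facets, in join order
drs_items = [
    "institution_id", "source_id", "frequency", "variable_id",
    "nominal_resolution", "grid_label", "version",
]

datasets = pd.DataFrame(
    [
        {
            "institution_id": "NASA-JPL",
            "source_id": "AIRS-2-1",
            "frequency": "mon",
            "variable_id": "ta",
            "nominal_resolution": "250 km",
            "grid_label": "gn",
            "version": "v20200101",
        }
    ]
)

# Spaces are removed from nominal_resolution ("250 km" -> "250km") before joining
datasets["instance_id"] = datasets.apply(
    lambda row: "obs4MIPs."
    + ".".join(
        row[item].replace(" ", "") if item == "nominal_resolution" else row[item]
        for item in drs_items
    ),
    axis=1,
)
print(datasets["instance_id"].iloc[0])
# obs4MIPs.NASA-JPL.AIRS-2-1.mon.ta.250km.gn.v20200101
```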
climate_ref/executor/__init__.py
CHANGED

@@ -9,7 +9,14 @@ The simplest executor is the `LocalExecutor`, which runs the diagnostic in the s
 This is useful for local testing and debugging.
 """

-from .
+from climate_ref_core.exceptions import InvalidExecutorException
+
+try:
+    from .hpc import HPCExecutor
+except InvalidExecutorException as exc:
+    # This exception is reraised when importing the executor as `climate_ref.executors.HPCExecutor`
+    HPCExecutor = exc  # type: ignore
+
 from .local import LocalExecutor
 from .result_handling import handle_execution_result
 from .synchronous import SynchronousExecutor
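The try/except around the `HPCExecutor` import keeps `climate_ref.executor` importable when the optional executor cannot be loaded; the caught exception is bound to the `HPCExecutor` name so the failure only surfaces when the executor is actually used. Below is a generic sketch of a closely related "defer the import error" pattern; the module and class names are placeholders, and the placeholder-class variant shown here is not the library's exact behaviour.

```python
class OptionalDependencyError(ImportError):
    """Stands in for InvalidExecutorException in this sketch."""


try:
    # Hypothetical optional backend; the import fails if it is not installed
    from some_optional_backend import Executor as OptionalExecutor
except ImportError as exc:
    _import_error = OptionalDependencyError(str(exc))

    class OptionalExecutor:  # type: ignore[no-redef]
        """Placeholder that only fails when the executor is actually used."""

        def __init__(self, *args, **kwargs):
            raise _import_error


if __name__ == "__main__":
    try:
        OptionalExecutor()
    except OptionalDependencyError as err:
        print(f"Optional executor unavailable: {err}")
```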
climate_ref/executor/hpc.py
CHANGED

@@ -5,12 +5,18 @@ If you want to
 - run REF under the HPC workflows
 - run REF in multiple nodes

+The `HPCExecutor` requires the optional `parsl` dependency.
+This dependency (and therefore this executor) is not available on Windows.
 """

 try:
     import parsl
 except ImportError:  # pragma: no cover
-
+    from climate_ref_core.exceptions import InvalidExecutorException
+
+    raise InvalidExecutorException(
+        "climate_ref_core.executor.hpc.HPCExecutor", "The HPCExecutor requires the `parsl` package"
+    )

 import os
 import time
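Because importing `climate_ref.executor.hpc` now raises `InvalidExecutorException` when `parsl` is missing, callers can probe availability simply by attempting the import. A hedged usage sketch, assuming only the module path shown in this diff:

```python
def hpc_executor_available() -> bool:
    """Return True if the optional HPCExecutor can be imported on this platform."""
    try:
        from climate_ref.executor.hpc import HPCExecutor  # noqa: F401
    except Exception:
        # Covers both a missing `parsl` install and platforms (e.g. Windows)
        # where the dependency is unavailable.
        return False
    return True


if __name__ == "__main__":
    print("HPCExecutor available:", hpc_executor_available())
```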
climate_ref/executor/result_handling.py
CHANGED

@@ -17,11 +17,12 @@ from loguru import logger
 from sqlalchemy import insert

 from climate_ref.database import Database
-from climate_ref.models import ScalarMetricValue
+from climate_ref.models import ScalarMetricValue, SeriesMetricValue
 from climate_ref.models.execution import Execution, ExecutionOutput, ResultOutputType
 from climate_ref_core.diagnostics import ExecutionResult, ensure_relative_path
 from climate_ref_core.exceptions import ResultValidationError
 from climate_ref_core.logging import EXECUTION_LOG_FILENAME
+from climate_ref_core.metric_values import SeriesMetricValue as TSeries
 from climate_ref_core.pycmec.controlled_vocabulary import CV
 from climate_ref_core.pycmec.metric import CMECMetric
 from climate_ref_core.pycmec.output import CMECOutput, OutputDict

@@ -65,6 +66,113 @@ def _copy_file_to_results(
     shutil.copy(input_directory / filename, output_filename)


+def _process_execution_scalar(
+    database: Database,
+    result: "ExecutionResult",
+    execution: Execution,
+    cv: CV,
+) -> None:
+    """
+    Process the scalar values from the execution result and store them in the database
+
+    This also validates the scalar values against the controlled vocabulary
+    """
+    # Load the metric bundle from the file
+    cmec_metric_bundle = CMECMetric.load_from_json(result.to_output_path(result.metric_bundle_filename))
+
+    # Check that the diagnostic values conform with the controlled vocabulary
+    try:
+        cv.validate_metrics(cmec_metric_bundle)
+    except (ResultValidationError, AssertionError):
+        # TODO: Remove once we have settled on a controlled vocabulary
+        logger.exception("Diagnostic values do not conform with the controlled vocabulary")
+        # execution.mark_failed()
+
+    # Perform a bulk insert of scalar values
+    # The current implementation will swallow the exception, but display a log message
+    try:
+        scalar_values = [
+            {
+                "execution_id": execution.id,
+                "value": result.value,
+                "attributes": result.attributes,
+                **result.dimensions,
+            }
+            for result in cmec_metric_bundle.iter_results()
+        ]
+        logger.debug(f"Ingesting {len(scalar_values)} scalar values for execution {execution.id}")
+        if scalar_values:
+            # Perform this in a nested transaction to rollback if something goes wrong
+            # We will lose the metric values for a given execution, but not the whole execution
+            with database.session.begin_nested():
+                database.session.execute(
+                    insert(ScalarMetricValue),
+                    scalar_values,
+                )
+    # This is a broad exception catch to ensure we log any issues
+    except Exception:
+        logger.exception("Something went wrong when ingesting diagnostic values")
+
+
+def _process_execution_series(
+    config: "Config",
+    database: Database,
+    result: "ExecutionResult",
+    execution: Execution,
+    cv: CV,
+) -> None:
+    """
+    Process the series values from the execution result and store them in the database
+
+    This also copies the series values file from the scratch directory to the results directory
+    and validates the series values against the controlled vocabulary.
+    """
+    assert result.series_filename, "Series filename must be set in the result"
+
+    _copy_file_to_results(
+        config.paths.scratch,
+        config.paths.results,
+        execution.output_fragment,
+        result.series_filename,
+    )
+
+    # Load the series values from the file
+    series_values_path = result.to_output_path(result.series_filename)
+    series_values = TSeries.load_from_json(series_values_path)
+
+    try:
+        cv.validate_metrics(series_values)
+    except (ResultValidationError, AssertionError):
+        # TODO: Remove once we have settled on a controlled vocabulary
+        logger.exception("Diagnostic values do not conform with the controlled vocabulary")
+        # execution.mark_failed()
+
+    # Perform a bulk insert of series values
+    try:
+        series_values_content = [
+            {
+                "execution_id": execution.id,
+                "values": series_result.values,
+                "attributes": series_result.attributes,
+                "index": series_result.index,
+                "index_name": series_result.index_name,
+                **series_result.dimensions,
+            }
+            for series_result in series_values
+        ]
+        logger.debug(f"Ingesting {len(series_values)} series values for execution {execution.id}")
+        if series_values:
+            # Perform this in a nested transaction to rollback if something goes wrong
+            # We will lose the metric values for a given execution, but not the whole execution
+            with database.session.begin_nested():
+                database.session.execute(
+                    insert(SeriesMetricValue),
+                    series_values_content,
+                )
+    except Exception:
+        logger.exception("Something went wrong when ingesting diagnostic series values")
+
+
 def handle_execution_result(
     config: "Config",
     database: Database,

@@ -88,7 +196,7 @@ def handle_execution_result(
     result
         The result of the diagnostic execution, either successful or failed
     """
-    # Always copy log data
+    # Always copy log data to the results directory
     _copy_file_to_results(
         config.paths.scratch,
         config.paths.results,

@@ -96,74 +204,52 @@ def handle_execution_result(
        EXECUTION_LOG_FILENAME,
    )

-    if result.successful
-        logger.
+    if not result.successful or result.metric_bundle_filename is None:
+        logger.error(f"{execution} failed")
+        execution.mark_failed()
+        return
+
+    logger.info(f"{execution} successful")
+
+    _copy_file_to_results(
+        config.paths.scratch,
+        config.paths.results,
+        execution.output_fragment,
+        result.metric_bundle_filename,
+    )

+    if result.output_bundle_filename:
        _copy_file_to_results(
            config.paths.scratch,
            config.paths.results,
            execution.output_fragment,
-            result.
+            result.output_bundle_filename,
+        )
+        _handle_output_bundle(
+            config,
+            database,
+            execution,
+            result.to_output_path(result.output_bundle_filename),
        )
-        execution.mark_successful(result.as_relative_path(result.metric_bundle_filename))
-
-        if result.output_bundle_filename:
-            _copy_file_to_results(
-                config.paths.scratch,
-                config.paths.results,
-                execution.output_fragment,
-                result.output_bundle_filename,
-            )
-            _handle_output_bundle(
-                config,
-                database,
-                execution,
-                result.to_output_path(result.output_bundle_filename),
-            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                **result.dimensions,
-            }
-            for result in cmec_metric_bundle.iter_results()
-        ]
-        if scalar_values:
-            # Perform this in a nested transaction to rollback if something goes wrong
-            # We will lose the metric values for a given execution, but not the whole execution
-            with database.session.begin_nested():
-                database.session.execute(
-                    insert(ScalarMetricValue),
-                    scalar_values,
-                )
-    except Exception:
-        # TODO: Remove once we have settled on a controlled vocabulary
-        logger.exception("Something went wrong when ingesting diagnostic values")
-
-        # TODO Ingest the series values
-
-        # TODO: This should check if the result is the most recent for the execution,
-        # if so then update the dirty fields
-        # i.e. if there are outstanding executions don't make as clean
-        execution.execution_group.dirty = False
-    else:
-        logger.error(f"{execution} failed")
-        execution.mark_failed()
+    cv = CV.load_from_file(config.paths.dimensions_cv)
+
+    if result.series_filename:
+        # Process the series values if they are present
+        # This will ingest the series values into the database
+        _process_execution_series(config=config, database=database, result=result, execution=execution, cv=cv)
+
+    # Process the scalar values
+    # This will ingest the scalar values into the database
+    _process_execution_scalar(database=database, result=result, execution=execution, cv=cv)
+
+    # TODO: This should check if the result is the most recent for the execution,
+    # if so then update the dirty fields
+    # i.e. if there are outstanding executions don't make as clean
+    execution.execution_group.dirty = False
+
+    # Finally, mark the execution as successful
+    execution.mark_successful(result.as_relative_path(result.metric_bundle_filename))


 def _handle_output_bundle(

@@ -220,12 +306,13 @@ def _handle_outputs(
             filename,
         )
         database.session.add(
-            ExecutionOutput(
+            ExecutionOutput.build(
                 execution_id=execution.id,
                 output_type=output_type,
                 filename=str(filename),
                 description=output_info.description,
                 short_name=key,
                 long_name=output_info.long_name,
+                dimensions=output_info.dimensions or {},
             )
         )
climate_ref/migrations/env.py
CHANGED

@@ -4,7 +4,10 @@ from sqlalchemy import Connection, inspect

 from climate_ref.config import Config
 from climate_ref.database import Database
-from climate_ref.models import Base
+from climate_ref.models import Base
+from climate_ref.models.execution import ExecutionOutput
+from climate_ref.models.metric_value import MetricValue
+from climate_ref.models.mixins import DimensionMixin
 from climate_ref_core.logging import capture_logging
 from climate_ref_core.pycmec.controlled_vocabulary import CV

@@ -33,7 +36,7 @@ target_metadata = Base.metadata
 # Custom migration functions that are run on every migration


-def
+def _add_dimension_columns(connection: Connection, table: str, Cls: type[DimensionMixin]) -> None:
     """
     Add any missing columns in the current CV to the database

@@ -44,27 +47,25 @@ def _add_metric_value_columns(connection: Connection) -> None:
    connection
        Open connection to the database
    """
-    metric_value_table = "metric_value"
-
    inspector = inspect(connection)

    # Check if table already exists
    # Skip if it doesn't
    tables = inspector.get_table_names()
-    if
-        logger.warning(f"No table named {
+    if table not in tables:
+        logger.warning(f"No table named {table!r} found")
        return

    # Extract the current columns in the DB
-    existing_columns = [c["name"] for c in inspector.get_columns(
+    existing_columns = [c["name"] for c in inspector.get_columns(table)]

    cv_file = ref_config.paths.dimensions_cv
    cv = CV.load_from_file(cv_file)

    for dimension in cv.dimensions:
        if dimension.name not in existing_columns:
-            logger.info(f"Adding missing
-            op.add_column(
+            logger.info(f"Adding missing value dimension: {dimension.name!r}")
+            op.add_column(table, Cls.build_dimension_column(dimension))


 def include_object(object_, name: str, type_, reflected, compare_to) -> bool:

@@ -134,7 +135,8 @@ def run_migrations_online() -> None:
        # Set up the Operations context
        # This is needed to alter the tables
        with op.Operations.context(context.get_context()):  # type: ignore
-
+            _add_dimension_columns(connection, "metric_value", MetricValue)
+            _add_dimension_columns(connection, "execution_output", ExecutionOutput)


 if context.is_offline_mode():