climate-ref 0.6.4__py3-none-any.whl → 0.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- climate_ref/config.py +11 -1
- climate_ref/datasets/base.py +28 -2
- climate_ref/datasets/cmip6.py +54 -100
- climate_ref/datasets/cmip6_parsers.py +189 -0
- climate_ref/datasets/obs4mips.py +14 -3
- climate_ref/executor/hpc.py +49 -18
- climate_ref/executor/pbs_scheduler.py +152 -0
- climate_ref/migrations/versions/2025-07-20T1521_94beace57a9c_cmip6_finalised.py +57 -0
- climate_ref/migrations/versions/2025-08-05T0327_a1b2c3d4e5f6_finalised_on_base_dataset.py +57 -0
- climate_ref/models/dataset.py +23 -15
- {climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/METADATA +1 -1
- {climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/RECORD +16 -12
- {climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/WHEEL +0 -0
- {climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/entry_points.txt +0 -0
- {climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/licenses/LICENCE +0 -0
- {climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/licenses/NOTICE +0 -0
climate_ref/config.py
CHANGED
@@ -17,7 +17,7 @@ which always take precedence over any other configuration values.
 import importlib.resources
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal
 
 import tomlkit
 from attr import Factory
@@ -354,6 +354,16 @@ class Config:
     [loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html#module-loguru._logger).
     """
 
+    cmip6_parser: Literal["drs", "complete"] = env_field("CMIP6_PARSER", default="complete")
+    """
+    Parser to use for CMIP6 datasets
+
+    This can be either `drs` or `complete`.
+
+    - `drs`: Use the DRS parser, which parses the dataset based on the DRS naming conventions.
+    - `complete`: Use the complete parser, which parses the dataset based on all available metadata.
+    """
+
     paths: PathConfig = Factory(PathConfig)  # noqa
     db: DbConfig = Factory(DbConfig)  # noqa
     executor: ExecutorConfig = Factory(ExecutorConfig)  # noqa
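For reference, a minimal sketch of selecting the new `cmip6_parser` option at runtime. The exact environment-variable name produced by `env_field("CMIP6_PARSER", ...)` is not shown in this diff, so the `REF_CMIP6_PARSER` name below is an assumption; check `climate_ref._config_helpers` for the real prefix.

```python
import os

# Assumed variable name; env_field("CMIP6_PARSER", ...) defines the real one.
os.environ["REF_CMIP6_PARSER"] = "drs"

from climate_ref.config import Config

config = Config.default()
print(config.cmip6_parser)  # "drs" here; the packaged default is "complete"
```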
climate_ref/datasets/base.py
CHANGED
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Protocol, cast
+from typing import Any, Protocol, cast
 
 import pandas as pd
 from loguru import logger
@@ -35,6 +35,31 @@ def _log_duplicate_metadata(
     )
 
 
+class DatasetParsingFunction(Protocol):
+    """
+    Protocol for a function that parses metadata from a file or directory
+    """
+
+    def __call__(self, file: str, **kwargs: Any) -> dict[str, Any]:
+        """
+        Parse a file or directory and return metadata for the dataset
+
+        Parameters
+        ----------
+        file
+            File or directory to parse
+
+        kwargs
+            Additional keyword arguments to pass to the parsing function.
+
+        Returns
+        -------
+        :
+            Data catalog containing the metadata for the dataset
+        """
+        ...
+
+
 class DatasetAdapter(Protocol):
     """
     An adapter to provide a common interface for different dataset types
@@ -173,7 +198,7 @@ class DatasetAdapter(Protocol):
         slug = unique_slugs[0]
 
         dataset_metadata = data_catalog_dataset[list(self.dataset_specific_metadata)].iloc[0].to_dict()
-        dataset, created = db.get_or_create(DatasetModel, slug=slug
+        dataset, created = db.get_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
         if not created:
             logger.warning(f"{dataset} already exists in the database. Skipping")
             return None
@@ -212,6 +237,7 @@ class DatasetAdapter(Protocol):
                 {
                     **{k: getattr(file, k) for k in self.file_specific_metadata},
                     **{k: getattr(file.dataset, k) for k in self.dataset_specific_metadata},
+                    "finalised": file.dataset.finalised,
                 }
                 for file in result
             ],
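A minimal sketch of a callable satisfying the new `DatasetParsingFunction` protocol; `toy_parser` is illustrative only and not part of climate_ref. Because the protocol is structural, no subclassing is needed — any callable with this signature (such as the real `parse_cmip6_drs` or `parse_cmip6_complete`) can be handed to `Builder.build(parsing_func=...)`.

```python
from typing import Any

from climate_ref.datasets.base import DatasetParsingFunction


def toy_parser(file: str, **kwargs: Any) -> dict[str, Any]:
    # Any mapping of metadata columns works; the real parsers fill the full catalog schema.
    return {"path": file, "finalised": False}


parser: DatasetParsingFunction = toy_parser  # satisfied structurally
```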
climate_ref/datasets/cmip6.py
CHANGED
@@ -1,18 +1,17 @@
 from __future__ import annotations
 
-import traceback
 import warnings
 from datetime import datetime
 from pathlib import Path
 from typing import Any
 
 import pandas as pd
-import xarray as xr
 from ecgtools import Builder
-from ecgtools.parsers.utilities import extract_attr_with_regex  # type: ignore
 from loguru import logger
 
-from climate_ref.
+from climate_ref.config import Config
+from climate_ref.datasets.base import DatasetAdapter, DatasetParsingFunction
+from climate_ref.datasets.cmip6_parsers import parse_cmip6_complete, parse_cmip6_drs
 from climate_ref.models.dataset import CMIP6Dataset
 
 
@@ -22,16 +21,19 @@ def _parse_datetime(dt_str: pd.Series[str]) -> pd.Series[datetime | Any]:
     """
 
     def _inner(date_string: str | None) -> datetime | None:
-        if not date_string:
+        if not date_string or pd.isnull(date_string):
             return None
 
         # Try to parse the date string with and without milliseconds
-
-
-
-
+        for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"):
+            try:
+                return datetime.strptime(date_string, fmt)
+            except ValueError:
+                continue
 
-        return
+        # If all parsing attempts fail, log an error and return None
+        logger.error(f"Failed to parse date string: {date_string}")
+        return None
 
     return pd.Series(
         [_inner(dt) for dt in dt_str],
@@ -44,15 +46,16 @@ def _apply_fixes(data_catalog: pd.DataFrame) -> pd.DataFrame:
     def _fix_parent_variant_label(group: pd.DataFrame) -> pd.DataFrame:
         if group["parent_variant_label"].nunique() == 1:
             return group
-        group["parent_variant_label"] = group["
+        group["parent_variant_label"] = group["parent_variant_label"].iloc[0]
 
         return group
 
-
-    data_catalog
-
-
-
+    if "parent_variant_label" in data_catalog:
+        data_catalog = (
+            data_catalog.groupby("instance_id")
+            .apply(_fix_parent_variant_label, include_groups=False)
+            .reset_index(level="instance_id")
+        )
 
     if "branch_time_in_child" in data_catalog:
         data_catalog["branch_time_in_child"] = _clean_branch_time(data_catalog["branch_time_in_child"])
@@ -68,88 +71,6 @@ def _clean_branch_time(branch_time: pd.Series[str]) -> pd.Series[float]:
     return pd.to_numeric(branch_time.astype(str).str.replace("D", ""), errors="coerce")
 
 
-def parse_cmip6(file: str) -> dict[str, Any]:
-    """
-    Parser for CMIP6
-
-    This function parses the CMIP6 dataset and returns a dictionary with the metadata.
-    This was copied from the ecgtools package, but we want to log the exception when it fails.
-    """
-    keys = sorted(
-        {
-            "activity_id",
-            "branch_method",
-            "branch_time_in_child",
-            "branch_time_in_parent",
-            "experiment",
-            "experiment_id",
-            "frequency",
-            "grid",
-            "grid_label",
-            "institution_id",
-            "nominal_resolution",
-            "parent_activity_id",
-            "parent_experiment_id",
-            "parent_source_id",
-            "parent_time_units",
-            "parent_variant_label",
-            "realm",
-            "product",
-            "source_id",
-            "source_type",
-            "sub_experiment",
-            "sub_experiment_id",
-            "table_id",
-            "variable_id",
-            "variant_label",
-        }
-    )
-
-    try:
-        with xr.open_dataset(file, chunks={}, use_cftime=True) as ds:
-            info = {key: ds.attrs.get(key) for key in keys}
-            info["member_id"] = info["variant_label"]
-
-            variable_id = info["variable_id"]
-            if variable_id:  # pragma: no branch
-                attrs = ds[variable_id].attrs
-                for attr in ["standard_name", "long_name", "units"]:
-                    info[attr] = attrs.get(attr)
-
-            # Set the default of # of vertical levels to 1
-            vertical_levels = 1
-            start_time, end_time = None, None
-            init_year = None
-            try:
-                vertical_levels = ds[ds.cf["vertical"].name].size
-            except (KeyError, AttributeError, ValueError):
-                ...
-
-            try:
-                start_time, end_time = str(ds.cf["T"][0].data), str(ds.cf["T"][-1].data)
-            except (KeyError, AttributeError, ValueError):
-                ...
-            if info.get("sub_experiment_id"):  # pragma: no branch
-                init_year = extract_attr_with_regex(info["sub_experiment_id"], r"\d{4}")
-                if init_year:  # pragma: no cover
-                    init_year = int(init_year)
-            info["vertical_levels"] = vertical_levels
-            info["init_year"] = init_year
-            info["start_time"] = start_time
-            info["end_time"] = end_time
-            if not (start_time and end_time):
-                info["time_range"] = None
-            else:
-                info["time_range"] = f"{start_time}-{end_time}"
-            info["path"] = str(file)
-            info["version"] = extract_attr_with_regex(str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}") or "v0"
-            return info
-
-    except Exception:
-        logger.exception(f"Failed to parse {file}")
-        return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
-
-
 class CMIP6DatasetAdapter(DatasetAdapter):
     """
     Adapter for CMIP6 datasets
@@ -191,6 +112,7 @@ class CMIP6DatasetAdapter(DatasetAdapter):
         "standard_name",
         "long_name",
         "units",
+        "finalised",
         slug_column,
     )
 
@@ -208,8 +130,30 @@ class CMIP6DatasetAdapter(DatasetAdapter):
         "grid_label",
     )
 
-    def __init__(self, n_jobs: int = 1):
+    def __init__(self, n_jobs: int = 1, config: Config | None = None):
         self.n_jobs = n_jobs
+        self.config = config or Config.default()
+
+    def get_parsing_function(self) -> DatasetParsingFunction:
+        """
+        Get the parsing function for CMIP6 datasets based on configuration
+
+        The parsing function used is determined by the `cmip6_parser` configuration value:
+        - "drs": Use the DRS parser (default)
+        - "complete": Use the complete parser that extracts all available metadata
+
+        Returns
+        -------
+        :
+            The appropriate parsing function based on configuration
+        """
+        parser_type = self.config.cmip6_parser
+        if parser_type == "complete":
+            logger.info("Using complete CMIP6 parser")
+            return parse_cmip6_complete
+        else:
+            logger.info(f"Using DRS CMIP6 parser (config value: {parser_type})")
+            return parse_cmip6_drs
 
     def find_local_datasets(self, file_or_directory: Path) -> pd.DataFrame:
         """
@@ -228,6 +172,8 @@ class CMIP6DatasetAdapter(DatasetAdapter):
         :
            Data catalog containing the metadata for the dataset
         """
+        parsing_function = self.get_parsing_function()
+
         with warnings.catch_warnings():
             # Ignore the DeprecationWarning from xarray
             warnings.simplefilter("ignore", DeprecationWarning)
@@ -237,7 +183,7 @@ class CMIP6DatasetAdapter(DatasetAdapter):
                 depth=10,
                 include_patterns=["*.nc"],
                 joblib_parallel_kwargs={"n_jobs": self.n_jobs},
-            ).build(parsing_func=
+            ).build(parsing_func=parsing_function)
 
         datasets: pd.DataFrame = builder.df.drop(["init_year"], axis=1)
 
@@ -254,6 +200,14 @@ class CMIP6DatasetAdapter(DatasetAdapter):
             lambda row: "CMIP6." + ".".join([row[item] for item in drs_items]), axis=1
         )
 
+        # Add in any missing metadata columns
+        missing_columns = set(self.dataset_specific_metadata + self.file_specific_metadata) - set(
+            datasets.columns
+        )
+        if missing_columns:
+            for column in missing_columns:
+                datasets[column] = pd.NA
+
         # Temporary fix for some datasets
         # TODO: Replace with a standalone package that contains metadata fixes for CMIP6 datasets
         datasets = _apply_fixes(datasets)
climate_ref/datasets/cmip6_parsers.py
ADDED
@@ -0,0 +1,189 @@
+"""
+CMIP6 parser functions for extracting metadata from netCDF files
+
+Additional non-official DRS's may be added in the future.
+"""
+
+import traceback
+from typing import Any
+
+import xarray as xr
+from ecgtools.parsers.cmip import parse_cmip6_using_directories  # type: ignore
+from ecgtools.parsers.utilities import extract_attr_with_regex  # type: ignore
+from loguru import logger
+
+
+def _parse_daterange(date_range: str) -> tuple[str | None, str | None]:
+    """
+    Parse a date range string into start and end dates
+
+    The output from this is an estimated date range until the file is completely parsed.
+
+    Parameters
+    ----------
+    date_range
+        Date range string in the format "YYYYMM-YYYYMM"
+
+    Returns
+    -------
+    :
+        Tuple containing start and end dates as strings in the format "YYYY-MM-DD"
+    """
+    try:
+        start, end = date_range.split("-")
+        if len(start) != 6 or len(end) != 6:  # noqa: PLR2004
+            raise ValueError("Date range must be in the format 'YYYYMM-YYYYMM'")
+
+        start = f"{start[:4]}-{start[4:6]}-01"
+        # Up to the 30th of the month, assuming a 30-day month
+        # These values will be corrected later when the file is parsed
+        end = f"{end[:4]}-{end[4:6]}-30"
+
+        return start, end
+    except ValueError:
+        logger.error(f"Invalid date range format: {date_range}")
+        return None, None
+
+
+def parse_cmip6_complete(file: str, **kwargs: Any) -> dict[str, Any]:
+    """
+    Complete parser for CMIP6 files
+
+    This parser loads each file and extracts all available metadata.
+
+    For some filesystems this may be slow, as it involves a lot of I/O operations.
+
+    Parameters
+    ----------
+    file
+        File to parse
+    kwargs
+        Additional keyword arguments (not used, but required for compatibility)
+
+    Returns
+    -------
+    :
+        Dictionary with extracted metadata
+    """
+    keys = sorted(
+        {
+            "activity_id",
+            "branch_method",
+            "branch_time_in_child",
+            "branch_time_in_parent",
+            "experiment",
+            "experiment_id",
+            "frequency",
+            "grid",
+            "grid_label",
+            "institution_id",
+            "nominal_resolution",
+            "parent_activity_id",
+            "parent_experiment_id",
+            "parent_source_id",
+            "parent_time_units",
+            "parent_variant_label",
+            "realm",
+            "product",
+            "source_id",
+            "source_type",
+            "sub_experiment",
+            "sub_experiment_id",
+            "table_id",
+            "variable_id",
+            "variant_label",
+        }
+    )
+
+    try:
+        with xr.open_dataset(file, chunks={}, use_cftime=True) as ds:
+            info = {key: ds.attrs.get(key) for key in keys}
+            info["member_id"] = info["variant_label"]
+
+            variable_id = info["variable_id"]
+            if variable_id:  # pragma: no branch
+                attrs = ds[variable_id].attrs
+                for attr in ["standard_name", "long_name", "units"]:
+                    info[attr] = attrs.get(attr)
+
+            # Set the default of # of vertical levels to 1
+            vertical_levels = 1
+            start_time, end_time = None, None
+            init_year = None
+            try:
+                vertical_levels = ds[ds.cf["vertical"].name].size
+            except (KeyError, AttributeError, ValueError):
+                ...
+
+            try:
+                start_time, end_time = str(ds.cf["T"][0].data), str(ds.cf["T"][-1].data)
+            except (KeyError, AttributeError, ValueError):
+                ...
+            if info.get("sub_experiment_id"):  # pragma: no branch
+                init_year = extract_attr_with_regex(info["sub_experiment_id"], r"\d{4}")
+                if init_year:  # pragma: no cover
+                    init_year = int(init_year)
+            info["vertical_levels"] = vertical_levels
+            info["init_year"] = init_year
+            info["start_time"] = start_time
+            info["end_time"] = end_time
+            if not (start_time and end_time):
+                info["time_range"] = None
+            else:
+                info["time_range"] = f"{start_time}-{end_time}"
+            info["path"] = str(file)
+            info["version"] = extract_attr_with_regex(str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}") or "v0"
+
+            # Mark the dataset as finalised
+            # This is used to indicate that the dataset has been fully parsed and is ready for use
+            info["finalised"] = True
+
+            return info
+
+    except Exception:
+        logger.exception(f"Failed to parse {file}")
+        return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
+
+
+def parse_cmip6_drs(file: str, **kwargs: Any) -> dict[str, Any]:
+    """
+    DRS parser for CMIP6 files
+
+    This parser extracts metadata according to the CMIP6 Data Reference Syntax (DRS).
+    This includes the essential metadata required to identify the dataset and is included in the filename.
+
+    Parameters
+    ----------
+    file
+        File to parse
+    kwargs
+        Additional keyword arguments (not used, but required for compatibility)
+
+    Returns
+    -------
+    :
+        Dictionary with extracted metadata
+    """
+    info: dict[str, Any] = parse_cmip6_using_directories(file)
+
+    if "INVALID_ASSET" in info:
+        logger.warning(f"Failed to parse {file}: {info['INVALID_ASSET']}")
+        return info
+
+    # The member_id is technically incorrect
+    # but for simplicity we are going to ignore sub-experiments for the DRS parser
+    info["variant_label"] = info["member_id"]
+
+    # Rename the `dcpp_init_year` key to `init_year` if it exists
+    if "dcpp_init_year" in info:
+        info["init_year"] = info.pop("dcpp_init_year")
+
+    if info.get("time_range"):
+        # Parse the time range if it exists
+        start_time, end_time = _parse_daterange(info["time_range"])
+        info["start_time"] = start_time
+        info["end_time"] = end_time
+
+    info["finalised"] = False
+
+    return info
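A minimal sketch tying the adapter and the new parser module together, assuming the attrs-based `Config` object is mutable and using a placeholder file path:

```python
from climate_ref.config import Config
from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter

config = Config.default()
config.cmip6_parser = "drs"  # cheap, path-based parsing; datasets start unfinalised

adapter = CMIP6DatasetAdapter(n_jobs=4, config=config)
parse = adapter.get_parsing_function()  # returns parse_cmip6_drs for this config
# Placeholder path; a real CMIP6 DRS directory layout is expected here.
info = parse("/data/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas.nc")
# info.get("finalised") is False until the complete parser has seen the file
```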
climate_ref/datasets/obs4mips.py
CHANGED
@@ -15,8 +15,17 @@ from climate_ref.datasets.cmip6 import _parse_datetime
 from climate_ref.models.dataset import Dataset, Obs4MIPsDataset
 
 
-def parse_obs4mips(file: str) -> dict[str, Any
-    """
+def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
+    """
+    Parser for obs4mips
+
+    Parameters
+    ----------
+    file
+        File to parse
+    kwargs
+        Additional keyword arguments (not used, but required for protocol compatibility)
+    """
     keys = sorted(
         list(
             {
@@ -106,6 +115,7 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
 
     dataset_specific_metadata = (
         "activity_id",
+        "finalised",
         "frequency",
         "grid",
         "grid_label",
@@ -159,7 +169,7 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
                 depth=10,
                 include_patterns=["*.nc"],
                 joblib_parallel_kwargs={"n_jobs": self.n_jobs},
-            ).build(parsing_func=parse_obs4mips)
+            ).build(parsing_func=parse_obs4mips)
 
         datasets = builder.df
         if datasets.empty:
@@ -178,4 +188,5 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
         datasets["instance_id"] = datasets.apply(
             lambda row: "obs4MIPs." + ".".join([row[item] for item in drs_items]), axis=1
         )
+        datasets["finalised"] = True
         return datasets
climate_ref/executor/hpc.py
CHANGED
@@ -21,7 +21,7 @@ from loguru import logger
 from parsl import python_app
 from parsl.config import Config as ParslConfig
 from parsl.executors import HighThroughputExecutor
-from parsl.launchers import SrunLauncher
+from parsl.launchers import SimpleLauncher, SrunLauncher
 from parsl.providers import SlurmProvider
 from tqdm import tqdm
 
@@ -34,6 +34,7 @@ from climate_ref_core.exceptions import DiagnosticError, ExecutionError
 from climate_ref_core.executor import execute_locally
 
 from .local import ExecutionFuture, process_result
+from .pbs_scheduler import SmartPBSProvider
 
 
 @python_app
@@ -96,8 +97,9 @@ class HPCExecutor:
         self.account = str(executor_config.get("account", os.environ.get("USER")))
         self.username = executor_config.get("username", os.environ.get("USER"))
         self.partition = str(executor_config.get("partition")) if executor_config.get("partition") else None
+        self.queue = str(executor_config.get("queue")) if executor_config.get("queue") else None
         self.qos = str(executor_config.get("qos")) if executor_config.get("qos") else None
-        self.req_nodes = int(executor_config.get("req_nodes", 1))
+        self.req_nodes = int(executor_config.get("req_nodes", 1)) if self.scheduler == "slurm" else 1
         self.walltime = str(executor_config.get("walltime", "00:10:00"))
         self.log_dir = str(executor_config.get("log_dir", "runinfo"))
 
@@ -181,21 +183,47 @@ class HPCExecutor:
     def _initialize_parsl(self) -> None:
         executor_config = self.config.executor.config
 
-        provider
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        provider: SlurmProvider | SmartPBSProvider
+        if self.scheduler == "slurm":
+            provider = SlurmProvider(
+                account=self.account,
+                partition=self.partition,
+                qos=self.qos,
+                nodes_per_block=self.req_nodes,
+                max_blocks=int(executor_config.get("max_blocks", 1)),
+                scheduler_options=executor_config.get("scheduler_options", "#SBATCH -C cpu"),
+                worker_init=executor_config.get("worker_init", "source .venv/bin/activate"),
+                launcher=SrunLauncher(
+                    debug=True,
+                    overrides=executor_config.get("overrides", ""),
+                ),
+                walltime=self.walltime,
+                cmd_timeout=int(executor_config.get("cmd_timeout", 120)),
+            )
+
+        elif self.scheduler == "pbs":
+            provider = SmartPBSProvider(
+                account=self.account,
+                queue=self.queue,
+                worker_init=executor_config.get("worker_init", "source .venv/bin/activate"),
+                nodes_per_block=_to_int(executor_config.get("nodes_per_block", 1)),
+                cpus_per_node=_to_int(executor_config.get("cpus_per_node", None)),
+                ncpus=_to_int(executor_config.get("ncpus", None)),
+                mem=executor_config.get("mem", "4GB"),
+                jobfs=executor_config.get("jobfs", "10GB"),
+                storage=executor_config.get("storage", ""),
+                init_blocks=executor_config.get("init_blocks", 1),
+                min_blocks=executor_config.get("min_blocks", 0),
+                max_blocks=executor_config.get("max_blocks", 1),
+                parallelism=executor_config.get("parallelism", 1),
+                scheduler_options=executor_config.get("scheduler_options", ""),
+                launcher=SimpleLauncher(),
+                walltime=self.walltime,
+                cmd_timeout=int(executor_config.get("cmd_timeout", 120)),
+            )
+        else:
+            raise ValueError(f"Unsupported scheduler: {self.scheduler}")
+
         executor = HighThroughputExecutor(
             label="ref_hpc_executor",
             cores_per_worker=self.cores_per_worker if self.cores_per_worker else 1,
@@ -206,8 +234,11 @@ class HPCExecutor:
         )
 
         hpc_config = ParslConfig(
-            run_dir=self.log_dir,
+            run_dir=self.log_dir,
+            executors=[executor],
+            retries=int(executor_config.get("retries", 2)),
         )
+
        parsl.load(hpc_config)
 
     def run(
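A hedged sketch of the flat executor options that the new PBS branch reads via `executor_config.get(...)`. How the scheduler name reaches `self.scheduler` is configured elsewhere in the package, so the `"scheduler"` key below is an assumption; the resource values are placeholders.

```python
# Options consumed by HPCExecutor._initialize_parsl for the "pbs" branch above.
pbs_executor_config = {
    "scheduler": "pbs",        # assumed key; this diff only shows self.scheduler being read
    "account": "my_project",   # placeholder project/account
    "queue": "normal",
    "walltime": "01:00:00",
    "ncpus": 4,
    "mem": "16GB",
    "jobfs": "10GB",
    "storage": "gdata/ab12",
    "worker_init": "source .venv/bin/activate",
    "max_blocks": 1,
    "retries": 2,
}
```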
climate_ref/executor/pbs_scheduler.py
ADDED
@@ -0,0 +1,152 @@
+import re
+import shutil
+import subprocess
+import textwrap
+from typing import Any
+
+from parsl.launchers import SimpleLauncher
+from parsl.providers import PBSProProvider
+
+
+class SmartPBSProvider(PBSProProvider):
+    """
+    A PBSProProvider subclass that adapts to systems where `-l select` is not supported.
+
+    Falls back to individual resource requests (ncpus, mem, jobfs, storage) if needed.
+    """
+
+    def __init__(  # noqa: PLR0913
+        self,
+        account: str | None = None,
+        queue: str | None = None,
+        scheduler_options: str = "",
+        worker_init: str = "",
+        nodes_per_block: int | None = 1,
+        cpus_per_node: int | None = 1,
+        ncpus: int | None = None,
+        mem: str = "4GB",
+        jobfs: str = "10GB",
+        storage: str = "",
+        init_blocks: int = 1,
+        min_blocks: int = 0,
+        max_blocks: int = 1,
+        parallelism: int = 1,
+        launcher: SimpleLauncher = SimpleLauncher(),
+        walltime: str = "00:20:00",
+        cmd_timeout: int = 120,
+    ) -> None:
+        self.ncpus = ncpus
+        self.mem = mem
+        self.jobfs = jobfs
+        self.storage = storage
+        self._select_supported = self._detect_select_support()
+
+        # Prepare fallback resource dictionary
+        self._fallback_resources = {"mem": mem, "jobfs": jobfs, "storage": storage}
+
+        # Parse and strip select if present in scheduler_options
+        if not self._select_supported and "-l select=" in scheduler_options:
+            scheduler_options = self._parse_select_from_scheduler_options(scheduler_options)
+
+        # Determine fallback ncpus
+        if "ncpus" not in self._fallback_resources:
+            self._fallback_resources["ncpus"] = str(ncpus if ncpus is not None else (cpus_per_node or 1))
+
+        # Map ncpus to cpus_per_node if needed (select mode only)
+        if self._select_supported:
+            if not ncpus and cpus_per_node:
+                cpus_per_node = ncpus
+            elif ncpus and cpus_per_node and int(ncpus) != int(cpus_per_node):
+                print(f"Warning: ncpus={ncpus} and cpus_per_node={cpus_per_node} differ.")
+                print(f"Using cpus_per_node={cpus_per_node}.")
+        else:
+            cpus_per_node = int(self._fallback_resources["ncpus"])
+
+        super().__init__(
+            account=account,
+            queue=queue,
+            scheduler_options=scheduler_options,
+            select_options="",  # Not used; we handle resources ourselves
+            worker_init=worker_init,
+            nodes_per_block=nodes_per_block,
+            cpus_per_node=cpus_per_node,
+            init_blocks=init_blocks,
+            min_blocks=min_blocks,
+            max_blocks=max_blocks,
+            parallelism=parallelism,
+            launcher=launcher,
+            walltime=walltime,
+            cmd_timeout=cmd_timeout,
+        )  # type: ignore
+
+        if not self._select_supported:
+            self.template_string = self._fallback_template()
+
+    def _detect_select_support(self) -> bool:
+        """Detect whether `-l select` is supported by the underlying PBS system."""
+        qsub_path = shutil.which("qsub")
+        if qsub_path is None:
+            raise RuntimeError("qsub command not found. Ensure PBS is installed and in PATH.")
+
+        try:
+            result = subprocess.run(  # noqa: S603
+                [qsub_path, "-l", "wd,select=1:ncpus=1", "--version"],
+                capture_output=True,
+                timeout=5,
+                check=False,
+            )
+            stderr = result.stderr.decode().lower()
+            return "unknown" not in stderr and result.returncode == 0
+        except Exception:
+            return False
+
+    def _parse_select_from_scheduler_options(self, scheduler_options: str) -> str:
+        """
+        Parse `-l select=...` from scheduler_options and update fallback resources.
+
+        Removes the select line from scheduler_options.
+        """
+        select_pattern = r"-l\s+select=([^\s]+)"
+        match = re.search(select_pattern, scheduler_options)
+        if match:
+            select_string = match.group(1)
+            scheduler_options = re.sub(select_pattern, "", scheduler_options).strip()
+
+            parts = select_string.split(":")[1:]  # skip the initial `select=1`
+            for part in parts:
+                if "=" in part:
+                    key, val = part.split("=")
+                    self._fallback_resources[key.strip()] = val.strip()
+        return scheduler_options
+
+    def _fallback_template(self) -> str:
+        """Submit script template used if `select` is not supported."""
+        return textwrap.dedent("""\
+            #!/bin/bash
+            #PBS -N ${jobname}
+            #PBS -l ncpus=${ncpus}
+            #PBS -l mem=${mem}
+            #PBS -l jobfs=${jobfs}
+            #PBS -l walltime=${walltime}
+            #PBS -l storage=${storage}
+            #PBS -o ${job_stdout_path}
+            #PBS -e ${job_stderr_path}
+            ${scheduler_options}
+
+            ${worker_init}
+
+            export JOBNAME="${jobname}"
+            ${user_script}
+
+        """)
+
+    def _write_submit_script(
+        self, template: str, script_filename: str, job_name: str, configs: dict[str, Any]
+    ) -> str:
+        """Inject fallback values into the submit script if `select` is not supported."""
+        if not self._select_supported:
+            configs.setdefault("ncpus", self._fallback_resources.get("ncpus", "1"))
+            configs.setdefault("mem", self._fallback_resources.get("mem", "4GB"))
+            configs.setdefault("jobfs", self._fallback_resources.get("jobfs", "10GB"))
+            configs.setdefault("storage", self._fallback_resources.get("storage", "gdata1"))
+        return super()._write_submit_script(template, script_filename, job_name, configs)  # type: ignore
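A minimal sketch of constructing the new provider directly, assuming a host where `qsub` is on `PATH` (otherwise `_detect_select_support` raises `RuntimeError`); the account, queue, and storage values are placeholders.

```python
from climate_ref.executor.pbs_scheduler import SmartPBSProvider

provider = SmartPBSProvider(
    account="my_project",
    queue="normal",
    ncpus=4,
    mem="16GB",
    jobfs="10GB",
    storage="gdata/ab12",
    walltime="00:30:00",
)
# On systems that reject `-l select`, the provider strips any select clause from
# scheduler_options and switches to the per-resource submit template shown above.
```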
climate_ref/migrations/versions/2025-07-20T1521_94beace57a9c_cmip6_finalised.py
ADDED
@@ -0,0 +1,57 @@
+"""cmip6-finalised
+
+Revision ID: 94beace57a9c
+Revises: 795c1e6cf496
+Create Date: 2025-07-20 15:21:17.132458
+
+"""
+
+from collections.abc import Sequence
+from typing import Union
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "94beace57a9c"
+down_revision: Union[str, None] = "795c1e6cf496"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("cmip6_dataset", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("finalised", sa.Boolean(), nullable=False))
+        batch_op.alter_column("experiment", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("frequency", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("grid", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("nominal_resolution", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("realm", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("product", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("standard_name", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("source_type", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("sub_experiment", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("sub_experiment_id", existing_type=sa.VARCHAR(), nullable=True)
+        batch_op.alter_column("units", existing_type=sa.VARCHAR(), nullable=True)
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("cmip6_dataset", schema=None) as batch_op:
+        batch_op.alter_column("units", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("sub_experiment_id", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("sub_experiment", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("source_type", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("standard_name", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("product", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("realm", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("nominal_resolution", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("grid", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("frequency", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.alter_column("experiment", existing_type=sa.VARCHAR(), nullable=False)
+        batch_op.drop_column("finalised")
+
+    # ### end Alembic commands ###
climate_ref/migrations/versions/2025-08-05T0327_a1b2c3d4e5f6_finalised_on_base_dataset.py
ADDED
@@ -0,0 +1,57 @@
+"""finalised-on-base-dataset
+
+Move finalised from cmip6_dataset to base dataset table and default all existing rows to True.
+
+Revision ID: a1b2c3d4e5f6
+Revises: 94beace57a9c
+Create Date: 2025-08-05 03:27:00
+
+"""
+
+from collections.abc import Sequence
+from typing import Union
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "ba5e"
+down_revision: Union[str, None] = "94beace57a9c"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # Add finalised to base dataset with default True, non-null
+    with op.batch_alter_table("dataset", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("finalised", sa.Boolean(), nullable=True, server_default=sa.text("true"))
+        )
+
+    # Backfill: ensure all existing rows are True
+    op.execute("UPDATE dataset SET finalised = TRUE WHERE finalised IS NULL")
+
+    # Enforce NOT NULL after backfill
+    with op.batch_alter_table("dataset", schema=None) as batch_op:
+        batch_op.alter_column("finalised", nullable=False)
+
+    # Drop column from cmip6_dataset if it exists
+    conn = op.get_bind()
+    inspector = sa.inspect(conn)
+    cmip6_cols = {col["name"] for col in inspector.get_columns("cmip6_dataset")}
+    if "finalised" in cmip6_cols:
+        with op.batch_alter_table("cmip6_dataset", schema=None) as batch_op:
+            batch_op.drop_column("finalised")
+
+
+def downgrade() -> None:
+    # Re-create cmip6_dataset.finalised as non-nullable boolean default False
+    # Note: Original migration 94beace57a9c added cmip6_dataset.finalised NOT NULL, with no default.
+    with op.batch_alter_table("cmip6_dataset", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("finalised", sa.Boolean(), nullable=False, server_default=sa.text("false"))
+        )
+
+    # Drop base dataset finalised
+    with op.batch_alter_table("dataset", schema=None) as batch_op:
+        batch_op.drop_column("finalised")
climate_ref/models/dataset.py
CHANGED
@@ -45,6 +45,16 @@ class Dataset(Base):
     Updating a dataset will trigger a new diagnostic calculation.
     """
 
+    # Universal finalisation flag for all dataset types
+    # Only CMIP6 currently uses unfinalised datasets in practice; other types should be finalised on creation.
+    finalised: Mapped[bool] = mapped_column(default=True, nullable=False)
+    """
+    Whether the complete set of metadata for the dataset has been finalised.
+
+    For CMIP6, ingestion may initially create unfinalised datasets (False) until all metadata is extracted.
+    For other dataset types (e.g., obs4MIPs, PMP climatology), this should be True upon creation.
+    """
+
     def __repr__(self) -> str:
         return f"<Dataset slug={self.slug} dataset_type={self.dataset_type} >"
 
@@ -90,9 +100,7 @@ class CMIP6Dataset(Dataset):
     """
     Represents a CMIP6 dataset
 
-    Fields that are not
-    https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf
-    are optional.
+    Fields that are not in the DRS are marked optional.
     """
 
     __tablename__ = "cmip6_dataset"
@@ -102,29 +110,29 @@ class CMIP6Dataset(Dataset):
     branch_method: Mapped[str] = mapped_column(nullable=True)
     branch_time_in_child: Mapped[float] = mapped_column(nullable=True)
     branch_time_in_parent: Mapped[float] = mapped_column(nullable=True)
-    experiment: Mapped[str] = mapped_column()
+    experiment: Mapped[str] = mapped_column(nullable=True)
     experiment_id: Mapped[str] = mapped_column()
-    frequency: Mapped[str] = mapped_column()
-    grid: Mapped[str] = mapped_column()
+    frequency: Mapped[str] = mapped_column(nullable=True)
+    grid: Mapped[str] = mapped_column(nullable=True)
     grid_label: Mapped[str] = mapped_column()
     institution_id: Mapped[str] = mapped_column()
     long_name: Mapped[str] = mapped_column(nullable=True)
     member_id: Mapped[str] = mapped_column()
-    nominal_resolution: Mapped[str] = mapped_column()
+    nominal_resolution: Mapped[str] = mapped_column(nullable=True)
     parent_activity_id: Mapped[str] = mapped_column(nullable=True)
     parent_experiment_id: Mapped[str] = mapped_column(nullable=True)
     parent_source_id: Mapped[str] = mapped_column(nullable=True)
     parent_time_units: Mapped[str] = mapped_column(nullable=True)
     parent_variant_label: Mapped[str] = mapped_column(nullable=True)
-    realm: Mapped[str] = mapped_column()
-    product: Mapped[str] = mapped_column()
+    realm: Mapped[str] = mapped_column(nullable=True)
+    product: Mapped[str] = mapped_column(nullable=True)
     source_id: Mapped[str] = mapped_column()
-    standard_name: Mapped[str] = mapped_column()
-    source_type: Mapped[str] = mapped_column()
-    sub_experiment: Mapped[str] = mapped_column()
-    sub_experiment_id: Mapped[str] = mapped_column()
+    standard_name: Mapped[str] = mapped_column(nullable=True)
+    source_type: Mapped[str] = mapped_column(nullable=True)
+    sub_experiment: Mapped[str] = mapped_column(nullable=True)
+    sub_experiment_id: Mapped[str] = mapped_column(nullable=True)
     table_id: Mapped[str] = mapped_column()
-    units: Mapped[str] = mapped_column()
+    units: Mapped[str] = mapped_column(nullable=True)
     variable_id: Mapped[str] = mapped_column()
     variant_label: Mapped[str] = mapped_column()
     vertical_levels: Mapped[int] = mapped_column(nullable=True)
@@ -132,7 +140,7 @@ class CMIP6Dataset(Dataset):
 
     instance_id: Mapped[str] = mapped_column()
     """
-    Unique identifier for the dataset.
+    Unique identifier for the dataset (including the version).
     """
 
     __mapper_args__: ClassVar[Any] = {"polymorphic_identity": SourceDatasetType.CMIP6}  # type: ignore
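A hedged sketch of querying the new column with SQLAlchemy 2.0; the engine URL and session handling below are illustrative and not the package's own database helper.

```python
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session

from climate_ref.models.dataset import CMIP6Dataset

engine = create_engine("sqlite:///climate_ref.db")  # placeholder database URL
with Session(engine) as session:
    # Datasets ingested with the DRS parser remain unfinalised until fully parsed
    unfinalised = session.scalars(
        select(CMIP6Dataset).where(CMIP6Dataset.finalised.is_(False))
    ).all()
```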
{climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climate-ref
-Version: 0.6.4
+Version: 0.6.5
 Summary: Application which runs the CMIP Rapid Evaluation Framework
 Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
 License-Expression: Apache-2.0
{climate_ref-0.6.4.dist-info → climate_ref-0.6.5.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 climate_ref/__init__.py,sha256=M45QGfl0KCPK48A8MjI08weNvZHMYH__GblraQMxsoM,808
 climate_ref/_config_helpers.py,sha256=-atI5FX7SukhLE_jz_rL-EHQ7s0YYqKu3dSFYWxSyMU,6632
 climate_ref/alembic.ini,sha256=WRvbwSIFuZ7hWNMnR2-yHPJAwYUnwhvRYBzkJhtpGdg,3535
-climate_ref/config.py,sha256=
+climate_ref/config.py,sha256=WW6R7RLwEDuI11XYLYO57FwvmQz1psq9bNM3WVL3e_s,17481
 climate_ref/constants.py,sha256=9RaNLgUSuQva7ki4eRW3TjOKeVP6T81QNiu0veB1zVk,111
 climate_ref/database.py,sha256=b_6XHdr78Mo7KeLqQJ5DjLsySHPdQE83P8dRpdMfzfM,8661
 climate_ref/provider_registry.py,sha256=dyfj4vU6unKHNXtT03HafQtAi3LilL37uvu3paCnmNY,4159
@@ -19,14 +19,16 @@ climate_ref/cli/solve.py,sha256=ZTXrwDFDXNrX5GLMJTN9tFnpV3zlcZbEu2aF3JDJVxI,2367
 climate_ref/dataset_registry/obs4ref_reference.txt,sha256=2zJMbsAsQ49KaWziX3CqrlILq9yN7S2ygmfV3V5rsnw,8395
 climate_ref/dataset_registry/sample_data.txt,sha256=3JAHy14pRbLlo9-oNxUXLgZ_QOFJXUieEftBbapSY8E,20124
 climate_ref/datasets/__init__.py,sha256=PV3u5ZmhyfcHbKqySgwVA8m4-naZgxzydLXSBqdTGLM,1171
-climate_ref/datasets/base.py,sha256=
-climate_ref/datasets/cmip6.py,sha256=
-climate_ref/datasets/
+climate_ref/datasets/base.py,sha256=uZ55u625ckRNjsn-AqJg4_xO5uvHchqYvwBZIt4iHtY,11017
+climate_ref/datasets/cmip6.py,sha256=KO761ConHvX40n9X0xLrxjhzN7wmighNWL2JyYygRAA,7049
+climate_ref/datasets/cmip6_parsers.py,sha256=wH4WKQAR2_aniXwsW7nch6nIpXk2pSpPxkT4unjV4hQ,6041
+climate_ref/datasets/obs4mips.py,sha256=q0_erQb4k5KBaGMvEGgUtVSDvXQjuftqDmvW4QZpWZI,6138
 climate_ref/datasets/pmp_climatology.py,sha256=goHDc_3B2Wdiy_hmpERNvWDdDYZACPOyFDt3Du6nGc0,534
 climate_ref/datasets/utils.py,sha256=iLJO7h4G3DWsRe9hIC4qkIyi5_zIW1ZMw-FDASLujtM,359
 climate_ref/executor/__init__.py,sha256=PYtJs3oBS_GiUHbt8BF-6wJibpF6_vREm1Cg9TxVbLI,648
-climate_ref/executor/hpc.py,sha256=
+climate_ref/executor/hpc.py,sha256=ZhGtzM0skH_ojnkSc6UNYIetXoyBRCwfXJusuezBZGw,13876
 climate_ref/executor/local.py,sha256=65LUl41YtURFb87YTWZQHjDpIRlIKJ5Ny51c9DZjy0s,8582
+climate_ref/executor/pbs_scheduler.py,sha256=WoH1sTmDl7bdmYodpcxZjkUSvInYUcWR4x7buIgBxqk,5807
 climate_ref/executor/result_handling.py,sha256=i7ZMX5vvyPY5gW-WWd-JHLi1BLviB9FXhn4FE8C9d4w,7787
 climate_ref/executor/synchronous.py,sha256=o4TndsoKMu9AzJYLkusU9lRkgHCy6HcCP46tEs6o86U,1895
 climate_ref/migrations/README,sha256=xM5osYbyEbEFA2eh5kwary_oh-5VFWtDubA-vgWwvlE,935
@@ -35,16 +37,18 @@ climate_ref/migrations/script.py.mako,sha256=MEqL-2qATlST9TAOeYgscMn1uy6HUS9NFvD
 climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py,sha256=S8Q4THCI4TPnlaQHgQJUCiNW5LAyQClaiTB-0dwhtXU,14050
 climate_ref/migrations/versions/2025-05-09T2032_03dbb4998e49_series_metric_value.py,sha256=s9nZ_l64pSF7sWN53rRPCQlqW_xHqR8tlWhU-ovmsME,2043
 climate_ref/migrations/versions/2025-07-03T1505_795c1e6cf496_drop_unique_requirement_on_slug.py,sha256=TfBHJkm3oPlz0P5Z1tiY6LBp2B1oDvdyL_OOYoV-OiI,984
+climate_ref/migrations/versions/2025-07-20T1521_94beace57a9c_cmip6_finalised.py,sha256=NSCMMV65v48B8_OoEf4X4bRthAlhzbDo0UlC6nqW3qs,2908
+climate_ref/migrations/versions/2025-08-05T0327_a1b2c3d4e5f6_finalised_on_base_dataset.py,sha256=G-SZKdU9dx9WyMh4JLwPKcud4gtFrxu-tULXG9vXGAU,2034
 climate_ref/models/__init__.py,sha256=rUDKRANeAEAHVOrzJVIZoZ99dDG5O4AGzHmOpC876Nc,801
 climate_ref/models/base.py,sha256=YMyovT2Z_tRv59zz6qC9YCCDodhO3x6OLnFdBtPJkho,1271
-climate_ref/models/dataset.py,sha256=
+climate_ref/models/dataset.py,sha256=in9FNLR4K_bpVSlWlk6A6IyFtkFy2v8ZFNcDXbwSMWI,8078
 climate_ref/models/diagnostic.py,sha256=0mKVvASEWNxx41R2Y-5VxplarZ4JAP6q0oaO14FKZuk,1751
 climate_ref/models/execution.py,sha256=lRCpaKLSR7rZbuoL94GW76tm9wLMsSDoIOA7bIa6xgY,9848
 climate_ref/models/metric_value.py,sha256=44OLcZz-qLx-p_9w7YWDKpD5S7Y9HyTKKsvSb77RBro,10190
 climate_ref/models/provider.py,sha256=RAE2qAAxwObu-72CdK4kt5ACMmKYEn07WJm7DU9hF28,990
-climate_ref-0.6.
-climate_ref-0.6.
-climate_ref-0.6.
-climate_ref-0.6.
-climate_ref-0.6.
-climate_ref-0.6.
+climate_ref-0.6.5.dist-info/METADATA,sha256=uKVSOC5iN1SGV3eoQ9uClB1UC_FpDbbM2ArYL0EHd0U,4505
+climate_ref-0.6.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+climate_ref-0.6.5.dist-info/entry_points.txt,sha256=IaggEJlDIhoYWXdXJafacWbWtCcoEqUKceP1qD7_7vU,44
+climate_ref-0.6.5.dist-info/licenses/LICENCE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+climate_ref-0.6.5.dist-info/licenses/NOTICE,sha256=4qTlax9aX2-mswYJuVrLqJ9jK1IkN5kSBqfVvYLF3Ws,128
+climate_ref-0.6.5.dist-info/RECORD,,
The remaining four files (WHEEL, entry_points.txt, licenses/LICENCE, licenses/NOTICE) are unchanged between 0.6.4 and 0.6.5.