climate-ref-ilamb 0.6.5__tar.gz → 0.7.0__tar.gz

Files changed (19)
  1. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/PKG-INFO +2 -2
  2. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/pyproject.toml +3 -3
  3. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/configure/ilamb.yaml +1 -1
  4. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/configure/iomb.yaml +6 -6
  5. climate_ref_ilamb-0.7.0/src/climate_ref_ilamb/standard.py +467 -0
  6. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/tests/integration/test_diagnostics.py +7 -4
  7. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/tests/unit/test_standard_metrics.py +44 -21
  8. climate_ref_ilamb-0.6.5/src/climate_ref_ilamb/standard.py +0 -295
  9. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/.gitignore +0 -0
  10. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/LICENCE +0 -0
  11. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/NOTICE +0 -0
  12. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/README.md +0 -0
  13. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/__init__.py +0 -0
  14. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/dataset_registry/ilamb.txt +0 -0
  15. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/dataset_registry/iomb.txt +0 -0
  16. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/dataset_registry/test.txt +0 -0
  17. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/datasets.py +0 -0
  18. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/src/climate_ref_ilamb/py.typed +0 -0
  19. {climate_ref_ilamb-0.6.5 → climate_ref_ilamb-0.7.0}/tests/unit/test_provider.py +0 -0
--- climate_ref_ilamb-0.6.5/PKG-INFO
+++ climate_ref_ilamb-0.7.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: climate-ref-ilamb
-Version: 0.6.5
+Version: 0.7.0
 Summary: ILAMB diagnostic provider for the Rapid Evaluation Framework
 Author-email: Nathan Collier <nathaniel.collier@gmail.com>, Jared Lewis <jared.lewis@climate-resource.com>
 License-Expression: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering
 Requires-Python: >=3.11
 Requires-Dist: climate-ref-core
-Requires-Dist: ilamb3>=2025.5.20
+Requires-Dist: ilamb3>=2025.9.9
 Requires-Dist: scipy<1.16
 Description-Content-Type: text/markdown

--- climate_ref_ilamb-0.6.5/pyproject.toml
+++ climate_ref_ilamb-0.7.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "climate-ref-ilamb"
-version = "0.6.5"
+version = "0.7.0"
 description = "ILAMB diagnostic provider for the Rapid Evaluation Framework"
 readme = "README.md"
 authors = [
@@ -24,8 +24,8 @@ classifiers = [
 ]
 dependencies = [
     "climate-ref-core",
-    "ilamb3>=2025.5.20",
-    "scipy<1.16",  # https://github.com/statsmodels/statsmodels/issues/9584
+    "ilamb3>=2025.9.9",
+    "scipy<1.16",  # https://github.com/statsmodels/statsmodels/issues/9584
 ]

 [project.entry-points."climate-ref.providers"]
 ilamb = "climate_ref_ilamb:provider"
--- climate_ref_ilamb-0.6.5/src/climate_ref_ilamb/configure/ilamb.yaml
+++ climate_ref_ilamb-0.7.0/src/climate_ref_ilamb/configure/ilamb.yaml
@@ -25,7 +25,7 @@ mrsos-WangMao:
     mrsol: ilamb/mrsol/WangMao/mrsol_olc.nc
   alternate_vars:
     - mrsos
-  transform:
+  transforms:
     - select_depth:
         value: 0
     - soil_moisture_to_vol_fraction
--- climate_ref_ilamb-0.6.5/src/climate_ref_ilamb/configure/iomb.yaml
+++ climate_ref_ilamb-0.7.0/src/climate_ref_ilamb/configure/iomb.yaml
@@ -5,7 +5,7 @@ thetao-WOA2023-surface:
     # TODO: Update to use the obs4REF equiv
     thetao: ilamb/WOA/thetao_mon_WOA_A5B4_gn_200501-201412.nc
   variable_cmap: Reds
-  transform:
+  transforms:
     - select_depth:
         value: 0
   alternate_vars:
@@ -15,7 +15,7 @@ so-WOA2023-surface:
   sources:
     # TODO: Update to use the obs4REF equiv
     so: ilamb/WOA/so_mon_WOA_A5B4_gn_200501-201412.nc
-  transform:
+  transforms:
     - select_depth:
         value: 0
   variable_cmap: YlGn
@@ -27,7 +27,7 @@ amoc-RAPID:
     - timeseries
   related_vars:
     - msftmz
-  transform:
+  transforms:
     - msftmz_to_rapid
   sources:
     # TODO: Update to use the obs4REF equiv
@@ -39,10 +39,10 @@ ohc-NOAA:
   related_vars:
     - thetao
     - volcello
-  transform:
+  transforms:
     - select_depth:
-        min: 0
-        max: 2000
+        vmin: 0
+        vmax: 2000
     - ocean_heat_content:
         reference_year: 2005
   analyses:
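Both configuration files pick up the same rename: each diagnostic's `transform` key becomes `transforms`, and the `select_depth` bounds move from `min`/`max` to `vmin`/`vmax`. A minimal sketch (assuming PyYAML is available; the entry is abridged from the ohc-NOAA block above) of the shape the updated config parses to:

# Illustrative only: shows the renamed keys, not how climate-ref loads its configs.
import yaml

config = yaml.safe_load(
    """
ohc-NOAA:
  transforms:
    - select_depth:
        vmin: 0
        vmax: 2000
    - ocean_heat_content:
        reference_year: 2005
"""
)
# "transforms" is now a list of transform entries
assert config["ohc-NOAA"]["transforms"][0]["select_depth"] == {"vmin": 0, "vmax": 2000}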
--- /dev/null
+++ climate_ref_ilamb-0.7.0/src/climate_ref_ilamb/standard.py
@@ -0,0 +1,467 @@
+from pathlib import Path
+from typing import Any
+
+import dask.config
+import ilamb3
+import ilamb3.regions as ilr
+import pandas as pd
+import pooch
+import xarray as xr
+from ilamb3 import run
+
+from climate_ref_core.constraints import AddSupplementaryDataset, RequireFacets
+from climate_ref_core.dataset_registry import dataset_registry_manager
+from climate_ref_core.datasets import FacetFilter, SourceDatasetType
+from climate_ref_core.diagnostics import (
+    DataRequirement,
+    Diagnostic,
+    ExecutionDefinition,
+    ExecutionResult,
+)
+from climate_ref_core.metric_values.typing import SeriesMetricValue
+from climate_ref_core.pycmec.metric import CMECMetric
+from climate_ref_core.pycmec.output import CMECOutput, OutputCV
+from climate_ref_ilamb.datasets import (
+    registry_to_collection,
+)
+
+
+def format_cmec_output_bundle(
+    dataset: pd.DataFrame,
+    dimensions: list[str],
+    metadata_columns: list[str],
+    value_column: str = "value",
+) -> dict[str, Any]:
+    """
+    Create a CMEC output bundle for the dataset.
+
+    Parameters
+    ----------
+    dataset
+        Processed dataset
+    dimensions
+        The dimensions of the dataset (e.g., ["source_id", "member_id", "region"])
+    metadata_columns
+        The columns to be used as metadata (e.g., ["Description", "LongName"])
+    value_column
+        The column containing the values
+
+    Returns
+    -------
+    A CMEC output bundle ready to be written to disk
+    """
+    # Validate that all required columns exist
+    required_columns = set(dimensions) | {value_column} | set(metadata_columns)
+    missing_columns = required_columns - set(dataset.columns)
+    if missing_columns:
+        raise ValueError(f"Missing required columns: {missing_columns}")
+
+    # Build the dimensions section
+    dimensions_dict: dict[str, dict[str, dict[str, str]]] = {}
+
+    # For each dimension, create a dictionary of unique values and their metadata
+    for dim in dimensions:
+        unique_values = dataset[dim].unique()
+        dim_dict: dict[str, dict[str, str]] = {}
+
+        for val in unique_values:
+            dim_dict[str(val)] = {}
+
+            if dim == dimensions[-1]:
+                # If this is the last dimension, add the metadata columns for this value
+                dim_dict[str(val)] = dataset[dataset[dim] == val].iloc[0][metadata_columns].to_dict()
+
+        dimensions_dict[dim] = dim_dict
+
+    # Build the results section - create a nested structure based on dimensions
+    def nest_results(df: pd.DataFrame, dims: list[str]) -> dict[str, Any] | float:
+        if not dims:
+            return float(df[value_column].iloc[0].item())
+
+        current_dim = dims[0]
+        remaining_dims = dims[1:]
+
+        return {
+            str(group_name): nest_results(group_df, remaining_dims)
+            for group_name, group_df in df.groupby(current_dim)
+        }
+
+    results = nest_results(dataset, list(dimensions))
+
+    return {"DIMENSIONS": {"json_structure": list(dimensions), **dimensions_dict}, "RESULTS": results}
+
+
+def _build_cmec_bundle(df: pd.DataFrame) -> dict[str, Any]:
+    """
+    Build a CMEC bundle from information in the dataframe.
+    """
+    # TODO: Handle the reference data
+    # reference_df = df[df["source"] == "Reference"]
+    model_df = df[df["source"] != "Reference"]
+
+    # Strip out units from the name (available in the attributes)
+    extracted_source = model_df.name.str.extract(r"(.*)\s\[.*\]")
+    model_df.loc[:, "name"] = extracted_source[0]
+
+    model_df = model_df.rename(
+        columns={
+            "analysis": "metric",
+            "name": "statistic",
+        }
+    )
+
+    # Convert the value column to numeric, coercing errors to NaN
+    model_df.loc[:, "value"] = pd.to_numeric(model_df["value"], errors="coerce")
+    model_df = model_df.astype({"value": "float64"})
+
+    dimensions = ["experiment_id", "source_id", "member_id", "grid_label", "region", "metric", "statistic"]
+    attributes = ["type", "units"]
+
+    bundle = format_cmec_output_bundle(
+        model_df,
+        dimensions=dimensions,
+        metadata_columns=attributes,
+        value_column="value",
+    )
+
+    ilamb_regions = ilr.Regions()
+    for region, region_info in bundle["DIMENSIONS"]["region"].items():
+        if region == "None":
+            region_info["LongName"] = "None"
+            region_info["Description"] = "Reference data extents"
+            region_info["Generator"] = "N/A"
+        else:
+            region_info["LongName"] = ilamb_regions.get_name(region)
+            region_info["Description"] = ilamb_regions.get_name(region)
+            region_info["Generator"] = ilamb_regions.get_source(region)
+
+    return bundle
+
+
+def _set_ilamb3_options(registry: pooch.Pooch, registry_file: str) -> None:
+    """
+    Set options for ILAMB based on which registry file is being used.
+    """
+    ilamb3.conf.reset()  # type: ignore
+    ilamb_regions = ilr.Regions()
+    if registry_file == "ilamb":
+        ilamb_regions.add_netcdf(registry.fetch("ilamb/regions/GlobalLand.nc"))
+        ilamb_regions.add_netcdf(registry.fetch("ilamb/regions/Koppen_coarse.nc"))
+        ilamb3.conf.set(regions=["global", "tropical"])
+    # REF's data requirement will correctly add measure data from another
+    # ensemble, but internally ilamb3 also applies a groupby. Since REF only
+    # provides one source_id/member_id/grid_label at a time, relax the groupby
+    # option here so these measures are part of the dataframe in ilamb3.
+    ilamb3.conf.set(comparison_groupby=["source_id", "grid_label"])
+
+
+def _load_csv_and_merge(output_directory: Path) -> pd.DataFrame:
+    """
+    Load individual csv scalar data and merge into a dataframe.
+    """
+    df = pd.concat(
+        [pd.read_csv(f, keep_default_na=False, na_values=["NaN"]) for f in output_directory.glob("*.csv")]
+    ).drop_duplicates(subset=["source", "region", "analysis", "name"])
+    return df
+
+
+class ILAMBStandard(Diagnostic):
+    """
+    Apply the standard ILAMB analysis with respect to a given reference dataset.
+    """
+
+    def __init__(
+        self,
+        registry_file: str,
+        metric_name: str,
+        sources: dict[str, str],
+        **ilamb_kwargs: Any,
+    ):
+        # Set up the diagnostic
+        if len(sources) != 1:
+            raise ValueError("Only single source ILAMB diagnostics have been implemented.")
+        self.variable_id = next(iter(sources.keys()))
+        if "sources" not in ilamb_kwargs:  # pragma: no cover
+            ilamb_kwargs["sources"] = sources
+        if "relationships" not in ilamb_kwargs:
+            ilamb_kwargs["relationships"] = {}
+        self.ilamb_kwargs = ilamb_kwargs
+
+        # REF stuff
+        self.name = metric_name
+        self.slug = self.name.lower().replace(" ", "-")
+        self.data_requirements = (
+            DataRequirement(
+                source_type=SourceDatasetType.CMIP6,
+                filters=(
+                    FacetFilter(
+                        facets={
+                            "variable_id": (
+                                self.variable_id,
+                                *ilamb_kwargs.get("alternate_vars", []),
+                                *ilamb_kwargs.get("related_vars", []),
+                                *ilamb_kwargs.get("relationships", {}).keys(),
+                            ),
+                            "frequency": "mon",
+                            "experiment_id": ("historical", "land-hist"),
+                            "table_id": (
+                                "AERmonZ",
+                                "Amon",
+                                "CFmon",
+                                "Emon",
+                                "EmonZ",
+                                "LImon",
+                                "Lmon",
+                                "Omon",
+                                "SImon",
+                            ),
+                        }
+                    ),
+                ),
+                constraints=(
+                    RequireFacets(
+                        "variable_id",
+                        (
+                            self.variable_id,
+                            *ilamb_kwargs.get("alternate_vars", []),
+                            *ilamb_kwargs.get("related_vars", []),
+                        ),
+                        operator="any",
+                    ),
+                    *(
+                        [
+                            RequireFacets(
+                                "variable_id",
+                                required_facets=tuple(ilamb_kwargs.get("relationships", {}).keys()),
+                            )
+                        ]
+                        if "relationships" in ilamb_kwargs
+                        else []
+                    ),
+                    *(
+                        (
+                            AddSupplementaryDataset.from_defaults("areacella", SourceDatasetType.CMIP6),
+                            AddSupplementaryDataset.from_defaults("sftlf", SourceDatasetType.CMIP6),
+                        )
+                        if registry_file == "ilamb"
+                        else (
+                            AddSupplementaryDataset.from_defaults("volcello", SourceDatasetType.CMIP6),
+                            AddSupplementaryDataset.from_defaults("areacello", SourceDatasetType.CMIP6),
+                            AddSupplementaryDataset.from_defaults("sftof", SourceDatasetType.CMIP6),
+                        )
+                    ),
+                ),
+                group_by=("experiment_id", "source_id", "member_id", "grid_label"),
+            ),
+        )
+
+        self.facets = (
+            "experiment_id",
+            "source_id",
+            "member_id",
+            "grid_label",
+            "region",
+            "metric",
+            "statistic",
+        )
+
+        # Set up ILAMB data and options
+        self.registry_file = registry_file
+        self.registry = dataset_registry_manager[self.registry_file]
+        self.ilamb_data = registry_to_collection(
+            dataset_registry_manager[self.registry_file],
+        )
+
+    def execute(self, definition: ExecutionDefinition) -> None:
+        """
+        Run the ILAMB standard analysis.
+        """
+        _set_ilamb3_options(self.registry, self.registry_file)
+        ref_datasets = self.ilamb_data.datasets.set_index(self.ilamb_data.slug_column)
+
+        # Run ILAMB in single-threaded mode to avoid issues with multithreading (#394)
+        with dask.config.set(scheduler="synchronous"):
+            run.run_single_block(
+                self.slug,
+                ref_datasets,
+                definition.datasets[SourceDatasetType.CMIP6].datasets,
+                definition.output_directory,
+                **self.ilamb_kwargs,
+            )
+
+    def build_execution_result(self, definition: ExecutionDefinition) -> ExecutionResult:
+        """
+        Build the diagnostic result after running ILAMB.
+
+        Parameters
+        ----------
+        definition
+            The definition of the diagnostic execution
+
+        Returns
+        -------
+        An execution result object
+        """
+        _set_ilamb3_options(self.registry, self.registry_file)
+        # ILAMB saves scalars in CSV files in the output directory. To be
+        # compatible with the REF system we need to add the metadata that is
+        # associated with the execution group, called the selector.
+        df = _load_csv_and_merge(definition.output_directory)
+        selectors = definition.datasets[SourceDatasetType.CMIP6].selector_dict()
+
+        # TODO: Fix reference data once we are using the obs4MIPs dataset
+        dataset_source = self.name.split("-")[1] if "-" in self.name else "None"
+        common_dimensions = {**selectors, "reference_source_id": dataset_source}
+        for key, value in common_dimensions.items():
+            df[key] = value
+        metric_bundle = CMECMetric.model_validate(_build_cmec_bundle(df))
+
+        # Add each png plot file to the output
+        output_bundle = CMECOutput.create_template()
+        for plotfile in definition.output_directory.glob("*.png"):
+            relative_path = str(definition.as_relative_path(plotfile))
+            caption, figure_dimensions = _caption_from_filename(plotfile, common_dimensions)
+
+            output_bundle[OutputCV.PLOTS.value][relative_path] = {
+                OutputCV.FILENAME.value: relative_path,
+                OutputCV.LONG_NAME.value: caption,
+                OutputCV.DESCRIPTION.value: "",
+                OutputCV.DIMENSIONS.value: figure_dimensions,
+            }
+
+        # Add the html page to the output
+        index_html = definition.to_output_path("index.html")
+        if index_html.exists():
+            relative_path = str(definition.as_relative_path(index_html))
+            output_bundle[OutputCV.HTML.value][relative_path] = {
+                OutputCV.FILENAME.value: relative_path,
+                OutputCV.LONG_NAME.value: "Results page",
+                OutputCV.DESCRIPTION.value: "Page displaying scalars and plots from the ILAMB execution.",
+                OutputCV.DIMENSIONS.value: common_dimensions,
+            }
+            output_bundle[OutputCV.INDEX.value] = relative_path
+
+        # Add series to the output based on the time traces we find in the
+        # output files
+        series = []
+        for ncfile in definition.output_directory.glob("*.nc"):
+            ds = xr.open_dataset(ncfile, use_cftime=True)
+            for name, da in ds.items():
+                # Only create series for 1d DataArrays with these dimensions
+                if not (da.ndim == 1 and set(da.dims).intersection(["time", "month"])):
+                    continue
+                # Convert dimension values
+                attrs = {
+                    "units": da.attrs.get("units", ""),
+                    "long_name": da.attrs.get("long_name", str(name)),
+                    "standard_name": da.attrs.get("standard_name", ""),
+                }
+                str_name = str(name)
+                index_name = str(da.dims[0])
+                index = ds[index_name].values.tolist()
+                if hasattr(index[0], "isoformat"):
+                    index = [v.isoformat() for v in index]
+                if hasattr(index[0], "calendar"):
+                    attrs["calendar"] = index[0].calendar
+
+                # Parse out some dimensions
+                if ncfile.stem == "Reference":
+                    dimensions = {
+                        "source_id": "Reference",
+                        "metric": str_name,
+                    }
+                else:
+                    dimensions = {"metric": str_name, **common_dimensions}
+
+                # Split the metric into metric and region if possible
+                if "_" in str_name:
+                    dimensions["metric"] = str_name.split("_")[0]
+                    dimensions["region"] = str_name.split("_")[1]
+                else:
+                    dimensions["region"] = "None"
+
+                series.append(
+                    SeriesMetricValue(
+                        dimensions=dimensions,
+                        values=da.values.tolist(),
+                        index=index,
+                        index_name=index_name,
+                        attributes=attrs,
+                    )
+                )
+
+        return ExecutionResult.build_from_output_bundle(
+            definition, cmec_output_bundle=output_bundle, cmec_metric_bundle=metric_bundle, series=series
+        )
+
+
+def _caption_from_filename(filename: Path, common_dimensions: dict[str, str]) -> tuple[str, dict[str, str]]:
+    source, region, plot = filename.stem.split("_")
+    plot_texts = {
+        "bias": "bias",
+        "biasscore": "bias score",
+        "cycle": "annual cycle",
+        "cyclescore": "annual cycle score",
+        "mean": "period mean",
+        "rmse": "RMSE",
+        "rmsescore": "RMSE score",
+        "shift": "shift in maximum month",
+        "tmax": "maximum month",
+        "trace": "regional mean",
+        "taylor": "Taylor diagram",
+        "distribution": "distribution",
+        "response": "response",
+    }
+    # Name of the statistics dimension in the CMEC output
+    plot_statistics = {
+        "bias": "Bias",
+        "biasscore": "Bias score",
+        "cycle": "Annual cycle",
+        "cyclescore": "Annual cycle score",
+        "mean": "Period Mean",
+        "rmse": "RMSE",
+        "rmsescore": "RMSE score",
+        "shift": "Shift in maximum month",
+        "tmax": "Maximum month",
+        "trace": "Regional mean",
+        "taylor": "Taylor diagram",
+        "distribution": "Distribution",
+        "response": "Response",
+    }
+    figure_dimensions = {
+        "region": region,
+    }
+    plot_option = None
+    # Some plots have options appended with a dash (distribution-pr, response-tas)
+    if "-" in plot:
+        plot, plot_option = plot.split("-", 1)
+
+    if plot not in plot_texts:
+        return "", figure_dimensions
+
+    # Build the caption
+    caption = f"The {plot_texts.get(plot)}"
+    if plot_option is not None:
+        caption += f" of {plot_option}"
+    if source != "None":
+        caption += f" for {'the reference data' if source == 'Reference' else source}"
+    if region.lower() != "none":
+        caption += f" over the {ilr.Regions().get_name(region)} region."
+
+    # Use the statistic dimension to determine what is being plotted
+    if plot_statistics.get(plot) is not None:
+        figure_dimensions["statistic"] = plot_statistics[plot]
+        if plot_option is not None:
+            figure_dimensions["statistic"] += f"|{plot_option}"
+
+    # If the source is the reference, some dimensions are not applicable
+    if source == "Reference":
+        figure_dimensions["source_id"] = "Reference"
+    else:
+        figure_dimensions = {**common_dimensions, **figure_dimensions}
+
+    return caption, figure_dimensions
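The rewritten module keeps `format_cmec_output_bundle` as the public entry point for assembling the CMEC bundle. A toy sketch of the nesting it produces, using an invented two-row dataframe (the column values are placeholders, not real REF output):

# Illustration only: shows the DIMENSIONS/RESULTS structure the function returns.
import pandas as pd

from climate_ref_ilamb.standard import format_cmec_output_bundle

toy = pd.DataFrame(
    {
        "source_id": ["CanESM5", "CanESM5"],  # hypothetical model
        "region": ["global", "tropical"],
        "units": ["1", "1"],
        "value": [0.91, 0.88],
    }
)
bundle = format_cmec_output_bundle(toy, dimensions=["source_id", "region"], metadata_columns=["units"])
# bundle["DIMENSIONS"]["json_structure"] == ["source_id", "region"]
# Metadata columns attach only to the last dimension's entries:
# bundle["DIMENSIONS"]["region"]["global"] == {"units": "1"}
# bundle["RESULTS"] == {"CanESM5": {"global": 0.91, "tropical": 0.88}}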
--- climate_ref_ilamb-0.6.5/tests/integration/test_diagnostics.py
+++ climate_ref_ilamb-0.7.0/tests/integration/test_diagnostics.py
@@ -3,17 +3,20 @@ from climate_ref_ilamb import provider as ilamb_provider

 from climate_ref_core.diagnostics import Diagnostic

-skipped_diagnostics = [
+xfail_diagnostics = [
     "ohc-noaa",  # Missing sample data
 ]
+skipped_diagnostics = []
+

 diagnostics = [
     pytest.param(
         diagnostic,
         id=diagnostic.slug,
-        marks=[pytest.mark.xfail(reason="Expected failure")]
-        if diagnostic.slug in skipped_diagnostics
-        else [],
+        marks=[
+            *([pytest.mark.xfail(reason="Expected failure")] if diagnostic.slug in xfail_diagnostics else []),
+            *([pytest.mark.skip(reason="Problem test")] if diagnostic.slug in skipped_diagnostics else []),
+        ],
     )
     for diagnostic in ilamb_provider.diagnostics()
 ]
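The integration test now composes marks per parameter instead of using a single conditional expression, so a diagnostic can be xfailed and skipped independently. A standalone sketch of the same pattern, with placeholder slugs rather than the provider's real diagnostics:

# Illustrative pattern only; "tas-demo" is a made-up slug.
import pytest

xfail = {"ohc-noaa"}
skip: set[str] = set()

params = [
    pytest.param(
        slug,
        id=slug,
        marks=[
            *([pytest.mark.xfail(reason="Expected failure")] if slug in xfail else []),
            *([pytest.mark.skip(reason="Problem test")] if slug in skip else []),
        ],
    )
    for slug in ("ohc-noaa", "tas-demo")
]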
--- climate_ref_ilamb-0.6.5/tests/unit/test_standard_metrics.py
+++ climate_ref_ilamb-0.7.0/tests/unit/test_standard_metrics.py
@@ -13,18 +13,27 @@ def test_standard_site(cmip6_data_catalog, definition_factory):
     diagnostic = ILAMBStandard(
         registry_file="ilamb-test", metric_name="test-site-tas", sources={"tas": "ilamb/test/Site/tas.nc"}
     )
-    ds = (
-        cmip6_data_catalog[
-            (cmip6_data_catalog["experiment_id"] == "historical")
-            & (cmip6_data_catalog["variable_id"] == "tas")
-        ]
-        .groupby("instance_id")
-        .first()
+    _, ds = next(
+        iter(
+            cmip6_data_catalog[
+                (cmip6_data_catalog["experiment_id"] == "historical")
+                & (cmip6_data_catalog["variable_id"] == "tas")
+            ].groupby("instance_id")
+        )
     )
-
     definition = definition_factory(
         diagnostic=diagnostic,
-        cmip6=DatasetCollection(ds, "instance_id", selector=(("experiment_id", "historical"),)),
+        cmip6=DatasetCollection(
+            ds,
+            "instance_id",
+            selector=(
+                ("experiment_id", "historical"),
+                ("variable_id", "tas"),
+                ("source_id", "CanESM5"),
+                ("member_id", "r1i1p1f1"),
+                ("grid_label", "gn"),
+            ),
+        ),
     )
     definition.output_directory.mkdir(parents=True, exist_ok=True)

@@ -54,15 +63,28 @@ def test_standard_grid(cmip6_data_catalog, definition_factory):
         sources={"gpp": "ilamb/test/Grid/gpp.nc"},
         relationships={"pr": "ilamb/test/Grid/pr.nc"},
     )
-    grp = cmip6_data_catalog[
-        (cmip6_data_catalog["experiment_id"] == "historical")
-        & ((cmip6_data_catalog["variable_id"] == "gpp") | (cmip6_data_catalog["variable_id"] == "pr"))
-    ].groupby(["source_id", "member_id", "grid_label"])
-    _, ds = next(iter(grp))
+    _, ds = next(
+        iter(
+            cmip6_data_catalog[
+                (cmip6_data_catalog["experiment_id"] == "historical")
+                & ((cmip6_data_catalog["variable_id"] == "gpp") | (cmip6_data_catalog["variable_id"] == "pr"))
+            ].groupby(["source_id", "member_id", "grid_label"])
+        )
+    )

     definition = definition_factory(
         diagnostic=diagnostic,
-        cmip6=DatasetCollection(ds, "instance_id", selector=(("experiment_id", "historical"),)),
+        cmip6=DatasetCollection(
+            ds,
+            "instance_id",
+            selector=(
+                ("experiment_id", "historical"),
+                ("variable_id", "tas"),
+                ("source_id", "CanESM5"),
+                ("member_id", "r1i1p1f1"),
+                ("grid_label", "gn"),
+            ),
+        ),
     )
     definition.output_directory.mkdir(parents=True, exist_ok=True)

@@ -129,17 +151,18 @@ def test_expected_executions():
         ),
     }
     executions = list(solve_executions(data_catalog, diagnostic, provider=ilamb_provider))
-    assert len(executions) == 1
-
-    # ts
-    assert executions[0].datasets[SourceDatasetType.CMIP6].selector == (("experiment_id", "historical"),)
+    assert len(executions) == 2
+    assert executions[0].datasets[SourceDatasetType.CMIP6].selector == (
+        ("experiment_id", "historical"),
+        ("grid_label", "gn"),
+        ("member_id", "r1i1p1f1"),
+        ("source_id", "ACCESS-ESM1-5"),
+    )
     assert executions[0].datasets[SourceDatasetType.CMIP6].datasets["variable_id"].tolist() == [
-        "cSoil",
         "cSoil",
         "areacella",
     ]
     assert executions[0].datasets[SourceDatasetType.CMIP6].datasets["member_id"].tolist() == [
         "r1i1p1f1",
-        "r2i1p1f1",
         "r1i1p1f1",
     ]
--- climate_ref_ilamb-0.6.5/src/climate_ref_ilamb/standard.py
+++ /dev/null
@@ -1,295 +0,0 @@
-from pathlib import Path
-from typing import Any
-
-import ilamb3  # type: ignore
-import ilamb3.regions as ilr  # type: ignore
-import matplotlib.pyplot as plt
-import pandas as pd
-import pooch
-from ilamb3 import run
-
-from climate_ref_core.constraints import AddSupplementaryDataset
-from climate_ref_core.dataset_registry import dataset_registry_manager
-from climate_ref_core.datasets import FacetFilter, SourceDatasetType
-from climate_ref_core.diagnostics import (
-    DataRequirement,
-    Diagnostic,
-    ExecutionDefinition,
-    ExecutionResult,
-)
-from climate_ref_core.pycmec.metric import CMECMetric
-from climate_ref_core.pycmec.output import CMECOutput
-from climate_ref_ilamb.datasets import (
-    registry_to_collection,
-)
-
-
-def format_cmec_output_bundle(
-    dataset: pd.DataFrame,
-    dimensions: list[str],
-    metadata_columns: list[str],
-    value_column: str = "value",
-) -> dict[str, Any]:
-    """
-    Create a CMEC output bundle for the dataset.
-
-    Parameters
-    ----------
-    dataset
-        Processed dataset
-    dimensions
-        The dimensions of the dataset (e.g., ["source_id", "member_id", "region"])
-    metadata_columns
-        The columns to be used as metadata (e.g., ["Description", "LongName"])
-    value_column
-        The column containing the values
-
-    Returns
-    -------
-    A CMEC output bundle ready to be written to disk
-    """
-    # Validate that all required columns exist
-    required_columns = set(dimensions) | {value_column} | set(metadata_columns)
-    missing_columns = required_columns - set(dataset.columns)
-    if missing_columns:
-        raise ValueError(f"Missing required columns: {missing_columns}")
-
-    # Build the dimensions section
-    dimensions_dict: dict[str, dict[str, dict[str, str]]] = {}
-
-    # For each dimension, create a dictionary of unique values and their metadata
-    for dim in dimensions:
-        unique_values = dataset[dim].unique()
-        dim_dict: dict[str, dict[str, str]] = {}
-
-        for val in unique_values:
-            dim_dict[str(val)] = {}
-
-            if dim == dimensions[-1]:
-                # If this is the last dimension, add the metadata columns for this value
-                dim_dict[str(val)] = dataset[dataset[dim] == val].iloc[0][metadata_columns].to_dict()
-
-        dimensions_dict[dim] = dim_dict
-
-    # Build the results section - create a nested structure based on dimensions
-    def nest_results(df: pd.DataFrame, dims: list[str]) -> dict[str, Any] | float:
-        if not dims:
-            return float(df[value_column].iloc[0].item())
-
-        current_dim = dims[0]
-        remaining_dims = dims[1:]
-
-        return {
-            str(group_name): nest_results(group_df, remaining_dims)
-            for group_name, group_df in df.groupby(current_dim)
-        }
-
-    results = nest_results(dataset, list(dimensions))
-
-    return {"DIMENSIONS": {"json_structure": list(dimensions), **dimensions_dict}, "RESULTS": results}
-
-
-def _build_cmec_bundle(df: pd.DataFrame) -> dict[str, Any]:
-    """
-    Build a CMEC bundle from information in the dataframe.
-    """
-    # TODO: Handle the reference data
-    # reference_df = df[df["source"] == "Reference"]
-    model_df = df[df["source"] != "Reference"]
-
-    # Source is formatted as "ACCESS-ESM1-5-r1i1p1f1-gn"
-    # This assumes that the member_id and grid_label are always the last two parts of the source string
-    # and don't contain '-'
-    extracted_source = model_df.source.str.extract(r"([\w-]+)-([\w\d]+)-([\w\d]+)")
-    model_df.loc[:, "source_id"] = extracted_source[0]
-    model_df.loc[:, "member_id"] = extracted_source[1]
-    model_df.loc[:, "grid_label"] = extracted_source[2]
-
-    # Strip out units from the name
-    # These are available in the attributes
-    extracted_source = model_df.name.str.extract(r"(.*)\s\[.*\]")
-    model_df.loc[:, "name"] = extracted_source[0]
-
-    model_df = model_df.rename(
-        columns={
-            "analysis": "metric",
-            "name": "statistic",
-        }
-    )
-
-    # Convert the value column to numeric, coercing errors to NaN
-    model_df.loc[:, "value"] = pd.to_numeric(model_df["value"], errors="coerce")
-    model_df = model_df.astype({"value": "float64"})
-
-    dimensions = ["experiment_id", "source_id", "member_id", "grid_label", "region", "metric", "statistic"]
-    attributes = ["type", "units"]
-
-    bundle = format_cmec_output_bundle(
-        model_df,
-        dimensions=dimensions,
-        metadata_columns=attributes,
-        value_column="value",
-    )
-
-    ilamb_regions = ilr.Regions()
-    for region, region_info in bundle["DIMENSIONS"]["region"].items():
-        if region == "None":
-            region_info["LongName"] = "None"
-            region_info["Description"] = "Reference data extents"
-            region_info["Generator"] = "N/A"
-        else:
-            region_info["LongName"] = ilamb_regions.get_name(region)
-            region_info["Description"] = ilamb_regions.get_name(region)
-            region_info["Generator"] = ilamb_regions.get_source(region)
-
-    return bundle
-
-
-def _form_bundles(df: pd.DataFrame) -> tuple[CMECMetric, CMECOutput]:
-    """
-    Create the output bundles (really a lift to make Ruff happy with the size of run()).
-    """
-    metric_bundle = _build_cmec_bundle(df)
-    output_bundle = CMECOutput.create_template()
-    return CMECMetric.model_validate(metric_bundle), CMECOutput.model_validate(output_bundle)
-
-
-def _set_ilamb3_options(registry: pooch.Pooch, registry_file: str) -> None:
-    """
-    Set options for ILAMB based on which registry file is being used.
-    """
-    ilamb3.conf.reset()
-    ilamb_regions = ilr.Regions()
-    if registry_file == "ilamb":
-        ilamb_regions.add_netcdf(registry.fetch("ilamb/regions/GlobalLand.nc"))
-        ilamb_regions.add_netcdf(registry.fetch("ilamb/regions/Koppen_coarse.nc"))
-        ilamb3.conf.set(regions=["global", "tropical"])
-
-
-def _load_csv_and_merge(output_directory: Path) -> pd.DataFrame:
-    """
-    Load individual csv scalar data and merge into a dataframe.
-    """
-    df = pd.concat(
-        [pd.read_csv(f, keep_default_na=False, na_values=["NaN"]) for f in output_directory.glob("*.csv")]
-    ).drop_duplicates(subset=["source", "region", "analysis", "name"])
-    return df
-
-
-class ILAMBStandard(Diagnostic):
-    """
-    Apply the standard ILAMB analysis with respect to a given reference dataset.
-    """
-
-    def __init__(
-        self,
-        registry_file: str,
-        metric_name: str,
-        sources: dict[str, str],
-        **ilamb_kwargs: Any,
-    ):
-        # Set up the diagnostic
-        if len(sources) != 1:
-            raise ValueError("Only single source ILAMB diagnostics have been implemented.")
-        self.variable_id = next(iter(sources.keys()))
-        if "sources" not in ilamb_kwargs:  # pragma: no cover
-            ilamb_kwargs["sources"] = sources
-        if "relationships" not in ilamb_kwargs:
-            ilamb_kwargs["relationships"] = {}
-        self.ilamb_kwargs = ilamb_kwargs
-
-        # REF stuff
-        self.name = metric_name
-        self.slug = self.name.lower().replace(" ", "-")
-        self.data_requirements = (
-            DataRequirement(
-                source_type=SourceDatasetType.CMIP6,
-                filters=(
-                    FacetFilter(
-                        facets={
-                            "variable_id": (
-                                self.variable_id,
-                                *ilamb_kwargs.get("relationships", {}).keys(),
-                                *ilamb_kwargs.get("alternate_vars", []),
-                                *ilamb_kwargs.get("related_vars", []),
-                            )
-                        }
-                    ),
-                    FacetFilter(facets={"frequency": ("mon",)}),
-                    FacetFilter(facets={"experiment_id": ("historical", "land-hist")}),
-                    # Exclude unneeded snc tables
-                    FacetFilter(facets={"table_id": ("ImonAnt", "ImonGre")}, keep=False),
-                ),
-                constraints=(
-                    AddSupplementaryDataset.from_defaults("areacella", SourceDatasetType.CMIP6),
-                    AddSupplementaryDataset.from_defaults("sftlf", SourceDatasetType.CMIP6),
-                )
-                if registry_file == "ilamb"
-                else (
-                    AddSupplementaryDataset.from_defaults("areacello", SourceDatasetType.CMIP6),
-                    AddSupplementaryDataset.from_defaults("sftof", SourceDatasetType.CMIP6),
-                ),
-                group_by=("experiment_id",),
-            ),
-        )
-        self.facets = (
-            "experiment_id",
-            "source_id",
-            "member_id",
-            "grid_label",
-            "region",
-            "metric",
-            "statistic",
-        )
-
-        # Set up ILAMB data and options
-        self.registry_file = registry_file
-        self.registry = dataset_registry_manager[self.registry_file]
-        self.ilamb_data = registry_to_collection(
-            dataset_registry_manager[self.registry_file],
-        )
-
-    def execute(self, definition: ExecutionDefinition) -> None:
-        """
-        Run the ILAMB standard analysis.
-        """
-        plt.rcParams.update({"figure.max_open_warning": 0})
-        _set_ilamb3_options(self.registry, self.registry_file)
-        ref_datasets = self.ilamb_data.datasets.set_index(self.ilamb_data.slug_column)
-        run.run_simple(
-            ref_datasets,
-            self.slug,
-            definition.datasets[SourceDatasetType.CMIP6].datasets,
-            definition.output_directory,
-            **self.ilamb_kwargs,
-        )
-
-    def build_execution_result(self, definition: ExecutionDefinition) -> ExecutionResult:
-        """
-        Build the diagnostic result after running ILAMB.
-
-        Parameters
-        ----------
-        definition
-            The definition of the diagnostic execution
-
-        Returns
-        -------
-        An execution result object
-        """
-        selectors = definition.datasets[SourceDatasetType.CMIP6].selector_dict()
-        _set_ilamb3_options(self.registry, self.registry_file)
-
-        df = _load_csv_and_merge(definition.output_directory)
-        # Add the selectors to the dataframe
-        for key, value in selectors.items():
-            df[key] = value
-        metric_bundle, output_bundle = _form_bundles(df)
-
-        return ExecutionResult.build_from_output_bundle(
-            definition, cmec_output_bundle=output_bundle, cmec_metric_bundle=metric_bundle
-        )