climate-ref-core 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,598 @@
1
+ """
2
+ CMIP6 to CMIP7 format converter.
3
+
4
+ This module provides utilities to convert CMIP6 xarray datasets to CMIP7 format,
5
+ following the CMIP7 Global Attributes V1.0 specification (DOI: 10.5281/zenodo.17250297).
6
+
7
+
8
+ Key differences between CMIP6 and CMIP7
9
+ ---------------------------------------
10
+ - Variable naming: CMIP7 uses branded names like `tas_tavg-h2m-hxy-u` instead of `tas`
11
+ - Branding suffix: `<temporal>-<vertical>-<horizontal>-<area>` labels (e.g., `tavg-h2m-hxy-u`)
12
+ - Variant indices: Changed from integers to prefixed strings (1 -> "r1", "i1", "p1", "f1")
13
+ - New mandatory attributes: license_id
14
+ - table_id: CMOR table names (e.g. `Amon`) are replaced by a `realm` attribute (e.g. `atmos`); `table_id` itself is dropped
15
+ - Directory structure: MIP-DRS7 specification
16
+ - Filename format: Includes branding suffix, region, and grid_label
17
+ - Removed CMIP6 attributes: further_info_url, grid, member_id, sub_experiment, sub_experiment_id
18
+
19
+ References
20
+ ----------
21
+ - CMIP7 Global Attributes V1.0: https://doi.org/10.5281/zenodo.17250297
22
+ - CMIP7 CVs: https://github.com/WCRP-CMIP/CMIP7_CVs
23
+ - CMIP7 Guidance: https://wcrp-cmip.github.io/cmip7-guidance/
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import re
29
+ import uuid
30
+ from dataclasses import dataclass, field
31
+ from datetime import datetime, timezone
32
+ from typing import TYPE_CHECKING, Any
33
+
34
+ if TYPE_CHECKING:
35
+ import xarray as xr
36
+
37
+
38
# CMIP6 table_id -> CMIP7 realm mapping.
# get_realm_from_table falls back to "atmos" for any table that is not listed here,
# so every non-atmosphere table has to be spelled out explicitly.
TABLE_TO_REALM = {
    "Amon": "atmos",
    "Omon": "ocean",
    "Lmon": "land",
    "LImon": "landIce",
    "SImon": "seaIce",
    "AERmon": "aerosol",
    "Oday": "ocean",
    "day": "atmos",
    "Aday": "atmos",
    "Eday": "atmos",
    "CFday": "atmos",
    "3hr": "atmos",
    "6hrLev": "atmos",
    "6hrPlev": "atmos",
    "6hrPlevPt": "atmos",
    "fx": "atmos",  # Fixed fields default to atmos
    "Ofx": "ocean",
    "Efx": "atmos",
    "Lfx": "land",
    # Non-atmosphere CMIP6 tables that would otherwise be reported as "atmos"
    "SIday": "seaIce",
    "AERday": "aerosol",
    "AERhr": "aerosol",
    "AERmonZ": "aerosol",
    "Oclim": "ocean",
    "Odec": "ocean",
    "ImonAnt": "landIce",
    "ImonGre": "landIce",
    "IyrAnt": "landIce",
    "IyrGre": "landIce",
    "IfxAnt": "landIce",
    "IfxGre": "landIce",
}
60
+
61
# CMIP6 frequency labels; each label maps to itself.
FREQUENCY_MAP = {
    frequency: frequency
    for frequency in ("mon", "day", "3hr", "6hr", "1hr", "yr", "fx")
}
71
+
72
# Global attributes that only exist in CMIP6.
# They are not part of the CMIP7 Global Attributes specification (V1.0), and
# convert_cmip6_to_cmip7_attrs drops every name listed here from its result.
CMIP6_ONLY_ATTRIBUTES = {
    # CMIP6-specific URL format, replaced by different mechanism in CMIP7
    "further_info_url",
    # Replaced by grid_label in CMIP7
    "grid",
    # Redundant with variant_label, not in CMIP7 spec
    "member_id",
    # Not in CMIP7 spec
    "sub_experiment",
    "sub_experiment_id",
    "table_id",
}
83
+
84
+
85
@dataclass
class BrandingSuffix:
    """
    The four labels that make up a CMIP7 branding suffix.

    ``str()`` renders them as
    ``<temporal_label>-<vertical_label>-<horizontal_label>-<area_label>``,
    for example ``tavg-h2m-hxy-u``.
    """

    # Temporal label: tavg, tpt, tmax, tmin, tsum, tclm, ti
    temporal_label: str = "tavg"
    # Vertical label: h2m, h10m, u (unspecified), p19, etc.
    vertical_label: str = "u"
    # Horizontal label: hxy (gridded), hm (mean), hy (zonal), etc.
    horizontal_label: str = "hxy"
    # Area label: u (unmasked), lnd, sea, si, etc.
    area_label: str = "u"

    def __str__(self) -> str:
        labels = (
            self.temporal_label,
            self.vertical_label,
            self.horizontal_label,
            self.area_label,
        )
        return "{}-{}-{}-{}".format(*labels)
101
+
102
+
103
# Branding suffixes for commonly used variables, keyed by CMIP6 variable ID.
# The label tuples are (temporal, vertical, horizontal, area) and are based on
# typical CMIP6 variable definitions.
VARIABLE_BRANDING: dict[str, BrandingSuffix] = {
    variable: BrandingSuffix(temporal, vertical, horizontal, area)
    for variable, (temporal, vertical, horizontal, area) in {
        # Atmosphere 2D variables
        "tas": ("tavg", "h2m", "hxy", "u"),
        "tasmax": ("tmax", "h2m", "hxy", "u"),
        "tasmin": ("tmin", "h2m", "hxy", "u"),
        "pr": ("tavg", "u", "hxy", "u"),
        "psl": ("tavg", "u", "hxy", "u"),
        "ps": ("tavg", "u", "hxy", "u"),
        "uas": ("tavg", "h10m", "hxy", "u"),
        "vas": ("tavg", "h10m", "hxy", "u"),
        "sfcWind": ("tavg", "h10m", "hxy", "u"),
        "hurs": ("tavg", "h2m", "hxy", "u"),
        "huss": ("tavg", "h2m", "hxy", "u"),
        "clt": ("tavg", "u", "hxy", "u"),
        "rsds": ("tavg", "u", "hxy", "u"),
        "rsus": ("tavg", "u", "hxy", "u"),
        "rlds": ("tavg", "u", "hxy", "u"),
        "rlus": ("tavg", "u", "hxy", "u"),
        "rsdt": ("tavg", "u", "hxy", "u"),
        "rsut": ("tavg", "u", "hxy", "u"),
        "rlut": ("tavg", "u", "hxy", "u"),
        "evspsbl": ("tavg", "u", "hxy", "u"),
        "tauu": ("tavg", "u", "hxy", "u"),
        "tauv": ("tavg", "u", "hxy", "u"),
        # Ocean 2D variables
        "tos": ("tavg", "d0m", "hxy", "sea"),
        "sos": ("tavg", "d0m", "hxy", "sea"),
        "zos": ("tavg", "u", "hxy", "sea"),
        "mlotst": ("tavg", "u", "hxy", "sea"),
        # Sea ice variables
        "siconc": ("tavg", "u", "hxy", "u"),
        "sithick": ("tavg", "u", "hxy", "si"),
        "sisnthick": ("tavg", "u", "hxy", "si"),
        # Land variables
        "mrso": ("tavg", "u", "hxy", "lnd"),
        "mrsos": ("tavg", "d10cm", "hxy", "lnd"),
        "mrro": ("tavg", "u", "hxy", "lnd"),
        "snw": ("tavg", "u", "hxy", "lnd"),
        "lai": ("tavg", "u", "hxy", "lnd"),
        "gpp": ("tavg", "u", "hxy", "lnd"),
        "npp": ("tavg", "u", "hxy", "lnd"),
        "nbp": ("tavg", "u", "hxy", "lnd"),
        "cVeg": ("tavg", "u", "hxy", "lnd"),
        "cSoil": ("tavg", "u", "hxy", "lnd"),
        "treeFrac": ("tavg", "u", "hxy", "lnd"),
        "vegFrac": ("tavg", "u", "hxy", "lnd"),
        # Fixed fields
        "areacella": ("ti", "u", "hxy", "u"),
        "areacello": ("ti", "u", "hxy", "u"),
        "sftlf": ("ti", "u", "hxy", "u"),
        "sftof": ("ti", "u", "hxy", "u"),
        "orog": ("ti", "u", "hxy", "u"),
    }.items()
}
158
+
159
+
160
def get_branding_suffix(variable_id: str, cell_methods: str | None = None) -> BrandingSuffix:
    """
    Look up, or infer, the CMIP7 branding suffix of a variable.

    Variables listed in ``VARIABLE_BRANDING`` return their predefined suffix
    (the stored instance itself, not a copy). For any other variable the default
    suffix is used, with the temporal label switched to ``tmax``/``tmin`` when the
    variable name ends in ``max``/``min`` or ``cell_methods`` mentions
    ``maximum``/``minimum``. The maximum check takes precedence over the minimum one.

    Parameters
    ----------
    variable_id
        The CMIP6 variable ID (e.g., "tas", "pr")
    cell_methods
        Optional cell_methods attribute; only consulted for variables without a
        predefined suffix

    Returns
    -------
    BrandingSuffix
        The branding suffix components
    """
    predefined = VARIABLE_BRANDING.get(variable_id)
    if predefined is not None:
        return predefined

    # No predefined entry: fall back to name / cell_methods heuristics
    methods = cell_methods or ""

    if variable_id.endswith("max") or "maximum" in methods:
        return BrandingSuffix(temporal_label="tmax")
    if variable_id.endswith("min") or "minimum" in methods:
        return BrandingSuffix(temporal_label="tmin")

    return BrandingSuffix()
190
+
191
+
192
def get_cmip7_variable_name(variable_id: str, branding: BrandingSuffix | None = None) -> str:
    """
    Build the CMIP7 branded name ``<variable_id>_<branding suffix>``.

    Parameters
    ----------
    variable_id
        The CMIP6 variable ID (e.g., "tas")
    branding
        Branding suffix to append; when None it is obtained from
        ``get_branding_suffix(variable_id)``

    Returns
    -------
    str
        The CMIP7 variable name (e.g., "tas_tavg-h2m-hxy-u")
    """
    suffix = get_branding_suffix(variable_id) if branding is None else branding
    return f"{variable_id}_{suffix}"
211
+
212
+
213
def get_frequency_from_table(table_id: str) -> str:  # noqa: PLR0911
    """
    Extract frequency from CMIP6 table_id.

    The table name is matched case-insensitively against known frequency
    fragments, in this order: ``mon``, ``day``, ``yr``, ``dec``, ``<N>hr``, ``fx``.
    The first fragment found wins.

    Parameters
    ----------
    table_id
        CMIP6 table identifier (e.g., "Amon", "Oday", "3hr", "IfxAnt")

    Returns
    -------
    str
        Frequency string (e.g., "mon", "day", "3hr", "fx").
        Tables that match no fragment are reported as "mon".
    """
    # Normalise once instead of lower-casing in every branch
    table = table_id.lower()

    if "mon" in table:
        return "mon"
    if "day" in table:
        return "day"
    if "yr" in table:
        return "yr"
    if "dec" in table:
        # Decadal tables such as "Odec"
        return "dec"
    if "hr" in table:
        # Use the hour count when present ("6hrPlev" -> "6hr"); tables without a
        # count (e.g. "AERhr") are treated as hourly
        match = re.search(r"(\d+)hr", table)
        return f"{match.group(1)}hr" if match else "1hr"
    if "fx" in table:
        # Substring match so that ice-sheet tables ("IfxAnt", "IfxGre") are
        # recognised as fixed fields too, not only "fx" / "Ofx" / "Efx"
        return "fx"

    return "mon"  # Default
244
+
245
+
246
def get_realm_from_table(table_id: str) -> str:
    """
    Map a CMIP6 table_id onto a CMIP7 realm.

    Parameters
    ----------
    table_id
        CMIP6 table identifier (e.g., "Amon", "Omon")

    Returns
    -------
    str
        CMIP7 realm (e.g., "atmos", "ocean"); "atmos" when the table is not
        listed in ``TABLE_TO_REALM``
    """
    try:
        return TABLE_TO_REALM[table_id]
    except KeyError:
        # Unlisted tables are assumed to be atmospheric
        return "atmos"
261
+
262
+
263
def convert_variant_index(value: int | str, prefix: str) -> str:
    """
    Convert CMIP6 numeric variant index to CMIP7 string format.

    In CMIP6, indices like realization_index were integers (e.g., 1).
    In CMIP7, they are strings with a prefix (e.g., "r1").

    Parameters
    ----------
    value
        The index value. Accepts Python ints, any other integral number type
        (for example NumPy integer scalars), whole-valued floats, and strings
        with or without the prefix.
    prefix
        The prefix to use ("r", "i", "p", or "f")

    Returns
    -------
    str
        The CMIP7 format index (e.g., "r1", "i1", "p1", "f1").
        Values of an unsupported type fall back to ``"<prefix>1"``.
    """
    # Local import keeps the block self-contained
    from numbers import Integral

    if isinstance(value, str):
        # Already has prefix
        if value.startswith(prefix):
            return value
        # Normalise purely numeric strings ("01" -> 1); keep anything else verbatim
        try:
            return f"{prefix}{int(value)}"
        except ValueError:
            return f"{prefix}{value}"

    # Integral (rather than int) also covers NumPy integer scalars, which are not
    # instances of the builtin int and previously fell through to "<prefix>1"
    if isinstance(value, Integral):
        return f"{prefix}{int(value)}"

    # Whole-valued floats (e.g. 2.0) carry an unambiguous index
    if isinstance(value, float) and value.is_integer():
        return f"{prefix}{int(value)}"

    return f"{prefix}1"
295
+
296
+
297
@dataclass
class CMIP7Metadata:
    """
    Attribute values that the converter writes for CMIP7 output.

    Holds the fixed CMIP7 defaults together with the four branding labels, and
    derives ``branding_suffix`` from those labels on construction.
    Based on CMIP7 Global Attributes V1.0 (DOI: 10.5281/zenodo.17250297).
    """

    # Required new attributes
    mip_era: str = "CMIP7"
    region: str = "glb"
    drs_specs: str = "MIP-DRS7"
    data_specs_version: str = "MIP-DS7.1.0.0"
    product: str = "model-output"
    license_id: str = "CC-BY-4.0"

    # Branding labels; joined with "-" to form branding_suffix
    temporal_label: str = "tavg"
    vertical_label: str = "u"
    horizontal_label: str = "hxy"
    area_label: str = "u"

    # Computed in __post_init__, so it cannot be passed to the constructor
    branding_suffix: str = field(init=False)

    def __post_init__(self) -> None:
        labels = (
            self.temporal_label,
            self.vertical_label,
            self.horizontal_label,
            self.area_label,
        )
        self.branding_suffix = "{}-{}-{}-{}".format(*labels)

    @classmethod
    def from_branding(cls, branding: BrandingSuffix, **kwargs: Any) -> CMIP7Metadata:
        """Build metadata whose labels are copied from ``branding``; ``kwargs`` set the other fields."""
        label_names = ("temporal_label", "vertical_label", "horizontal_label", "area_label")
        labels = {label: getattr(branding, label) for label in label_names}
        return cls(**labels, **kwargs)
338
+
339
+
340
def convert_cmip6_to_cmip7_attrs(
    cmip6_attrs: dict[str, Any],
    variable_id: str | None = None,
    branding: BrandingSuffix | None = None,
) -> dict[str, Any]:
    """
    Convert CMIP6 global attributes to CMIP7 format.

    Based on CMIP7 Global Attributes V1.0 (DOI: 10.5281/zenodo.17250297).

    The input dictionary is not modified. A new ``tracking_id`` and
    ``creation_date`` are generated on every call, so two calls with the same
    input do not return identical dictionaries.

    Parameters
    ----------
    cmip6_attrs
        Dictionary of CMIP6 global attributes
    variable_id
        Variable ID for determining branding suffix. Defaults to
        ``cmip6_attrs["variable_id"]``, or "unknown" if that is missing.
    branding
        Optional explicit branding suffix. When None it is derived from
        ``variable_id`` and the ``cell_methods`` attribute, if any.

    Returns
    -------
    dict
        Dictionary of CMIP7 global attributes
    """
    # Work on a shallow copy so the caller's dictionary stays untouched
    attrs = dict(cmip6_attrs)

    if variable_id is None:
        variable_id = attrs.get("variable_id", "unknown")

    if branding is None:
        branding = get_branding_suffix(variable_id, attrs.get("cell_methods"))

    cmip7_meta = CMIP7Metadata.from_branding(branding)

    # Update mip_era; an existing parent_mip_era is preserved
    attrs["mip_era"] = cmip7_meta.mip_era
    attrs["parent_mip_era"] = attrs.get("parent_mip_era", "CMIP6")

    # New/updated CMIP7 attributes
    attrs["region"] = cmip7_meta.region
    attrs["drs_specs"] = cmip7_meta.drs_specs
    attrs["data_specs_version"] = cmip7_meta.data_specs_version
    attrs["product"] = cmip7_meta.product
    attrs["license_id"] = cmip7_meta.license_id

    # Add tracking_id with CMIP7 handle prefix
    attrs["tracking_id"] = f"hdl:21.14107/{uuid.uuid4()}"

    # Add creation_date in ISO format (UTC)
    attrs["creation_date"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Add label attributes
    attrs["temporal_label"] = cmip7_meta.temporal_label
    attrs["vertical_label"] = cmip7_meta.vertical_label
    attrs["horizontal_label"] = cmip7_meta.horizontal_label
    attrs["area_label"] = cmip7_meta.area_label
    attrs["branding_suffix"] = cmip7_meta.branding_suffix

    # Add branded_variable (required in CMIP7)
    attrs["branded_variable"] = f"{variable_id}_{cmip7_meta.branding_suffix}"

    # Convert variant indices from CMIP6 integer to CMIP7 string format and rebuild
    # variant_label from them. When an index attribute is missing, fall back to the
    # matching component of the existing variant_label so that e.g. "r2i1p1f1" is
    # not silently replaced by "r1i1p1f1"; only then default to 1.
    existing_label = re.fullmatch(
        r"r(\d+)i(\d+)p(\d+)f(\d+)", str(attrs.get("variant_label", ""))
    )
    index_attributes = (
        ("realization_index", "r"),
        ("initialization_index", "i"),
        ("physics_index", "p"),
        ("forcing_index", "f"),
    )
    label_parts = []
    for group, (index_name, prefix) in enumerate(index_attributes, start=1):
        if index_name in attrs:
            attrs[index_name] = convert_variant_index(attrs[index_name], prefix)
            label_parts.append(str(attrs[index_name]))
        elif existing_label:
            label_parts.append(f"{prefix}{int(existing_label.group(group))}")
        else:
            label_parts.append(f"{prefix}1")
    attrs["variant_label"] = "".join(label_parts)

    # Derive realm (and frequency, if absent) from the CMIP6 table_id
    if "table_id" in attrs:
        old_table_id = attrs["table_id"]
        attrs["realm"] = get_realm_from_table(old_table_id)
        if "frequency" not in attrs:
            attrs["frequency"] = get_frequency_from_table(old_table_id)
        # Store legacy CMIP6 compound name for reference (optional but recommended)
        attrs["cmip6_compound_name"] = f"{old_table_id}.{variable_id}"

    # Update Conventions (CF version only, per CMIP7 spec)
    attrs["Conventions"] = "CF-1.12"

    # Remove CMIP6-only attributes that are not in CMIP7 spec.
    # This must run last: table_id is read above and is itself in this set.
    for attr in CMIP6_ONLY_ATTRIBUTES:
        attrs.pop(attr, None)

    return attrs
441
+
442
+
443
def convert_cmip6_dataset(
    ds: xr.Dataset,
    inplace: bool = False,
) -> xr.Dataset:
    """
    Convert a CMIP6 xarray Dataset to CMIP7 format in-memory.

    Only the global attributes are rewritten; data variables keep their names
    and values.

    Parameters
    ----------
    ds
        The CMIP6 xarray Dataset to convert
    inplace
        If True, modify the dataset in place; otherwise work on a shallow copy
        (``ds.copy(deep=False)``) and leave the attributes of ``ds`` untouched

    Returns
    -------
    xr.Dataset
        The converted CMIP7-style dataset
    """
    if not inplace:
        ds = ds.copy(deep=False)

    # Candidate primary variables (skip coordinates/bounds)
    data_vars = [str(v) for v in ds.data_vars if not str(v).endswith("_bnds") and v not in ds.coords]

    # Prefer the declared variable_id, otherwise use the first candidate
    variable_id = ds.attrs.get("variable_id")
    if variable_id is None and data_vars:
        variable_id = data_vars[0]

    branding = None
    if variable_id:
        # cell_methods is stored on the data variable rather than in the global
        # attributes, so read it from there to let the branding fallback detect
        # maximum/minimum statistics. A global value, if any, is the fallback.
        cell_methods = ds.attrs.get("cell_methods")
        if variable_id in ds.data_vars:
            cell_methods = ds[variable_id].attrs.get("cell_methods", cell_methods)
        branding = get_branding_suffix(variable_id, cell_methods)

    ds.attrs = convert_cmip6_to_cmip7_attrs(ds.attrs, variable_id=variable_id, branding=branding)

    return ds
480
+
481
+
482
def create_cmip7_filename(
    attrs: dict[str, Any],
    time_range: str | None = None,
) -> str:
    """
    Assemble a CMIP7 filename from global attributes.

    The layout follows the MIP-DRS7 specification (V1.0):
    <variable_id>_<branding_suffix>_<frequency>_<region>_<grid_label>_<source_id>_<experiment_id>_<variant_label>[_<timeRangeDD>].nc

    Missing attributes are replaced by defaults: "mon" for frequency, "glb" for
    region, "gn" for grid_label, and an empty string for everything else.

    Parameters
    ----------
    attrs
        Dictionary containing CMIP7 attributes
    time_range
        Optional time range string (e.g., "190001-190912").
        Format depends on frequency: "YYYY" for yearly, "YYYYMM" for monthly, "YYYYMMDD" for daily.
        Omit for fixed/time-independent variables.

    Returns
    -------
    str
        The CMIP7 filename

    Examples
    --------
    >>> attrs = {
    ...     "variable_id": "tas",
    ...     "branding_suffix": "tavg-h2m-hxy-u",
    ...     "frequency": "mon",
    ...     "region": "glb",
    ...     "grid_label": "g13s",
    ...     "source_id": "CanESM6-MR",
    ...     "experiment_id": "historical",
    ...     "variant_label": "r2i1p1f1",
    ... }
    >>> create_cmip7_filename(attrs, "190001-190912")
    'tas_tavg-h2m-hxy-u_mon_glb_g13s_CanESM6-MR_historical_r2i1p1f1_190001-190912.nc'
    """
    # (attribute name, value used when the attribute is missing), in filename order
    fields = (
        ("variable_id", ""),
        ("branding_suffix", ""),
        ("frequency", "mon"),
        ("region", "glb"),
        ("grid_label", "gn"),
        ("source_id", ""),
        ("experiment_id", ""),
        ("variant_label", ""),
    )
    parts = [str(attrs.get(key, fallback)) for key, fallback in fields]

    # The time range is only appended when given (fixed fields have none)
    if time_range:
        parts.append(f"{time_range}")

    return "_".join(parts) + ".nc"
539
+
540
+
541
def create_cmip7_path(attrs: dict[str, Any], version: str | None = None) -> str:
    """
    Assemble a CMIP7 directory path from global attributes.

    The layout follows the MIP-DRS7 specification (V1.0):
    <drs_specs>/<mip_era>/<activity_id>/<institution_id>/<source_id>/<experiment_id>/
    <variant_label>/<region>/<frequency>/<variable_id>/<branding_suffix>/<grid_label>/<version>

    Missing attributes are replaced by defaults ("MIP-DRS7", "CMIP7", "CMIP",
    "glb", "mon" and "gn" for the respective fields, an empty string otherwise).

    Parameters
    ----------
    attrs
        Dictionary containing CMIP7 attributes
    version
        Optional version string (e.g., "v20250622"). If not provided, uses attrs["version"]
        or defaults to "v1".

    Returns
    -------
    str
        The CMIP7 directory path

    Examples
    --------
    >>> attrs = {
    ...     "drs_specs": "MIP-DRS7",
    ...     "mip_era": "CMIP7",
    ...     "activity_id": "CMIP",
    ...     "institution_id": "CCCma",
    ...     "source_id": "CanESM6-MR",
    ...     "experiment_id": "historical",
    ...     "variant_label": "r2i1p1f1",
    ...     "region": "glb",
    ...     "frequency": "mon",
    ...     "variable_id": "tas",
    ...     "branding_suffix": "tavg-h2m-hxy-u",
    ...     "grid_label": "g13s",
    ... }
    >>> create_cmip7_path(attrs, "v20250622")
    'MIP-DRS7/CMIP7/CMIP/CCCma/CanESM6-MR/historical/r2i1p1f1/glb/mon/tas/tavg-h2m-hxy-u/g13s/v20250622'
    """
    # (attribute name, value used when the attribute is missing), in path order
    fields = (
        ("drs_specs", "MIP-DRS7"),
        ("mip_era", "CMIP7"),
        ("activity_id", "CMIP"),
        ("institution_id", ""),
        ("source_id", ""),
        ("experiment_id", ""),
        ("variant_label", ""),
        ("region", "glb"),
        ("frequency", "mon"),
        ("variable_id", ""),
        ("branding_suffix", ""),
        ("grid_label", "gn"),
    )

    # An explicit version wins over the attribute, which wins over "v1"
    final_segment = version or attrs.get("version", "v1")

    segments = [str(attrs.get(key, fallback)) for key, fallback in fields]
    segments.append(str(final_segment))
    return "/".join(segments)
@@ -68,6 +68,49 @@ def _verify_hash_matches(fname: str | pathlib.Path, known_hash: str) -> bool:
68
68
  return matches
69
69
 
70
70
 
71
def validate_registry_cache(
    registry: pooch.Pooch,
    name: str,
) -> list[str]:
    """
    Validate that all files in a registry are cached and have correct checksums.

    Nothing is downloaded: files that are missing from the cache directory are
    reported as errors.

    Parameters
    ----------
    registry
        Pooch registry to validate.
    name
        Name of the registry (for error messages).

    Returns
    -------
    list[str]
        List of error messages for any validation failures.
        Empty list if all files are valid.
    """
    errors: list[str] = []

    for key, expected_hash in registry.registry.items():
        if not isinstance(expected_hash, str) or not expected_hash:  # pragma: no cover
            errors.append(f"{name}: No hash defined for {key}")
            continue

        # Check if file exists in cache
        cached_path = registry.abspath / key  # type: ignore[attr-defined]
        if not cached_path.exists():
            errors.append(f"{name}: File not cached: {key}")
            continue

        # Verify checksum. _verify_hash_matches is annotated as returning a bool, so
        # a False result is treated as a failure as well as a raised ValueError.
        # NOTE(review): confirm whether the helper raises on mismatch; if it always
        # does, the explicit check below is merely redundant.
        try:
            matches = _verify_hash_matches(cached_path, expected_hash)
        except ValueError as e:
            errors.append(f"{name}: {e}")
            continue

        if not matches:
            errors.append(f"{name}: Checksum mismatch for {key}")

    return errors
112
+
113
+
71
114
  def fetch_all_files(
72
115
  registry: pooch.Pooch,
73
116
  name: str,
@@ -20,6 +20,7 @@ from climate_ref_core.pycmec.output import CMECOutput
20
20
 
21
21
  if TYPE_CHECKING:
22
22
  from climate_ref_core.providers import CommandLineDiagnosticProvider, DiagnosticProvider
23
+ from climate_ref_core.testing import TestDataSpecification
23
24
 
24
25
 
25
26
  def ensure_relative_path(path: pathlib.Path | str, root_directory: pathlib.Path) -> pathlib.Path:
@@ -459,6 +460,14 @@ class AbstractDiagnostic(Protocol):
459
460
  The provider that provides the diagnostic.
460
461
  """
461
462
 
463
+ test_data_spec: TestDataSpecification | None
464
+ """
465
+ Optional specification of test data and test cases for this diagnostic.
466
+
467
+ If provided, defines how to fetch test data from ESGF
468
+ and what test cases are available for testing this diagnostic.
469
+ """
470
+
462
471
  def execute(self, definition: ExecutionDefinition) -> None:
463
472
  """
464
473
  Execute the diagnostic on the given configuration.
@@ -516,6 +525,7 @@ class Diagnostic(AbstractDiagnostic):
516
525
  """
517
526
 
518
527
  series: Sequence[SeriesDefinition] = tuple()
528
+ test_data_spec: TestDataSpecification | None = None
519
529
 
520
530
  def __init__(self) -> None:
521
531
  super().__init__()