climate-ref 0.6.6__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. climate_ref/cli/__init__.py +12 -3
  2. climate_ref/cli/_utils.py +56 -2
  3. climate_ref/cli/datasets.py +49 -12
  4. climate_ref/cli/executions.py +333 -24
  5. climate_ref/cli/providers.py +1 -2
  6. climate_ref/config.py +67 -4
  7. climate_ref/database.py +62 -4
  8. climate_ref/dataset_registry/obs4ref_reference.txt +0 -9
  9. climate_ref/dataset_registry/sample_data.txt +10 -19
  10. climate_ref/datasets/__init__.py +3 -3
  11. climate_ref/datasets/base.py +121 -20
  12. climate_ref/datasets/cmip6.py +2 -0
  13. climate_ref/datasets/obs4mips.py +26 -15
  14. climate_ref/executor/hpc.py +149 -53
  15. climate_ref/executor/local.py +1 -2
  16. climate_ref/executor/result_handling.py +17 -7
  17. climate_ref/migrations/env.py +12 -10
  18. climate_ref/migrations/versions/2025-09-10T1358_2f6e36738e06_use_version_as_version_facet_for_.py +35 -0
  19. climate_ref/migrations/versions/2025-09-22T2359_20cd136a5b04_add_pmp_version.py +35 -0
  20. climate_ref/models/__init__.py +1 -6
  21. climate_ref/models/base.py +4 -20
  22. climate_ref/models/dataset.py +2 -0
  23. climate_ref/models/diagnostic.py +2 -1
  24. climate_ref/models/execution.py +219 -7
  25. climate_ref/models/metric_value.py +25 -110
  26. climate_ref/models/mixins.py +144 -0
  27. climate_ref/models/provider.py +2 -1
  28. climate_ref/provider_registry.py +4 -4
  29. climate_ref/slurm.py +2 -2
  30. climate_ref/solver.py +17 -6
  31. climate_ref/testing.py +1 -1
  32. {climate_ref-0.6.6.dist-info → climate_ref-0.8.0.dist-info}/METADATA +1 -1
  33. climate_ref-0.8.0.dist-info/RECORD +58 -0
  34. {climate_ref-0.6.6.dist-info → climate_ref-0.8.0.dist-info}/WHEEL +1 -1
  35. climate_ref-0.6.6.dist-info/RECORD +0 -55
  36. {climate_ref-0.6.6.dist-info → climate_ref-0.8.0.dist-info}/entry_points.txt +0 -0
  37. {climate_ref-0.6.6.dist-info → climate_ref-0.8.0.dist-info}/licenses/LICENCE +0 -0
  38. {climate_ref-0.6.6.dist-info → climate_ref-0.8.0.dist-info}/licenses/NOTICE +0 -0
@@ -7,14 +7,12 @@ from typing import Annotated
7
7
  import pandas as pd
8
8
  import typer
9
9
  from loguru import logger
10
- from rich.console import Console
11
10
 
12
11
  from climate_ref.cli._utils import pretty_print_df
13
12
  from climate_ref.provider_registry import ProviderRegistry
14
13
  from climate_ref_core.providers import CondaDiagnosticProvider, DiagnosticProvider
15
14
 
16
15
  app = typer.Typer(help=__doc__)
17
- console = Console()
18
16
 
19
17
 
20
18
  @app.command(name="list")
@@ -24,6 +22,7 @@ def list_(ctx: typer.Context) -> None:
24
22
  """
25
23
  config = ctx.obj.config
26
24
  db = ctx.obj.database
25
+ console = ctx.obj.console
27
26
  provider_registry = ProviderRegistry.build_from_config(config, db)
28
27
 
29
28
  def get_env(provider: DiagnosticProvider) -> str:
climate_ref/config.py CHANGED
@@ -14,11 +14,14 @@ which always take precedence over any other configuration values.
14
14
  # `esgpull` configuration management system with some of the extra complexity removed.
15
15
  # https://github.com/ESGF/esgf-download/blob/main/esgpull/config.py
16
16
 
17
+ import datetime
17
18
  import importlib.resources
18
19
  import os
19
20
  from pathlib import Path
20
21
  from typing import TYPE_CHECKING, Any, Literal
21
22
 
23
+ import platformdirs
24
+ import requests
22
25
  import tomlkit
23
26
  from attr import Factory
24
27
  from attrs import define, field
@@ -334,6 +337,46 @@ def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config":
334
337
  return _converter_defaults_relaxed.structure(doc, Config)
335
338
 
336
339
 
340
# Maximum age of the cached ignore datasets file before a new download is attempted
DEFAULT_IGNORE_DATASETS_MAX_AGE = datetime.timedelta(hours=6)
# Location the default ignore datasets file is downloaded from
DEFAULT_IGNORE_DATASETS_URL = (
    "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/default_ignore_datasets.yaml"
)


def _get_default_ignore_datasets_file() -> Path:
    """
    Get the path to the default ignore datasets file

    The file is stored in the user cache directory for ``climate_ref``.
    It is downloaded from `DEFAULT_IGNORE_DATASETS_URL` if it does not exist yet
    or if its modification time is older than `DEFAULT_IGNORE_DATASETS_MAX_AGE`.

    Any failure to download the file (connection error, timeout or HTTP error status)
    is logged as a warning instead of being raised.
    In that case the cached file is touched, which creates it if it is missing and
    otherwise leaves its content in place.

    Returns
    -------
    :
        Path to the cached ignore datasets file
    """
    cache_dir = platformdirs.user_cache_path("climate_ref")
    cache_dir.mkdir(parents=True, exist_ok=True)
    ignore_datasets_file = cache_dir / "default_ignore_datasets.yaml"

    download = True
    if ignore_datasets_file.exists():
        # Only update if the ignore datasets file is older than `DEFAULT_IGNORE_DATASETS_MAX_AGE`.
        modification_time = datetime.datetime.fromtimestamp(ignore_datasets_file.stat().st_mtime)
        age = datetime.datetime.now() - modification_time
        if age < DEFAULT_IGNORE_DATASETS_MAX_AGE:
            download = False

    if download:
        logger.info(
            f"Downloading default ignore datasets file from {DEFAULT_IGNORE_DATASETS_URL} "
            f"to {ignore_datasets_file}"
        )
        try:
            # The request itself must be inside the `try`:
            # connection errors and timeouts are raised by `requests.get`,
            # not by `raise_for_status`.
            response = requests.get(DEFAULT_IGNORE_DATASETS_URL, timeout=120)
            response.raise_for_status()
        except requests.RequestException as exc:
            logger.warning(f"Failed to download default ignore datasets file: {exc}")
            # Touching refreshes the modification time, so the age check above
            # will skip the download until the file is considered stale again.
            ignore_datasets_file.touch(exist_ok=True)
        else:
            with ignore_datasets_file.open(mode="wb") as file:
                file.write(response.content)

    return ignore_datasets_file
378
+
379
+
337
380
  @define(auto_attribs=True)
338
381
  class Config:
339
382
  """
@@ -364,10 +407,30 @@ class Config:
364
407
  - `complete`: Use the complete parser, which parses the dataset based on all available metadata.
365
408
  """
366
409
 
367
- paths: PathConfig = Factory(PathConfig) # noqa
368
- db: DbConfig = Factory(DbConfig) # noqa
369
- executor: ExecutorConfig = Factory(ExecutorConfig) # noqa
370
- diagnostic_providers: list[DiagnosticProviderConfig] = Factory(default_providers) # noqa
410
+ ignore_datasets_file: Path = field(factory=_get_default_ignore_datasets_file)
411
+ """
412
+ Path to the file containing the ignore datasets
413
+
414
+ This file is a YAML file that contains a list of facets to ignore per diagnostic.
415
+
416
+ The format is:
417
+ ```yaml
418
+ provider:
419
+ diagnostic:
420
+ source_type:
421
+ - facet: value
422
+ - another_facet: [another_value1, another_value2]
423
+ ```
424
+
425
+ If this is not specified, a default ignore datasets file will be used.
426
+ The default file is downloaded from the Climate-REF GitHub repository
427
+ if it does not exist or is older than 6 hours.
428
+ """
429
+
430
+ paths: PathConfig = Factory(PathConfig)
431
+ db: DbConfig = Factory(DbConfig)
432
+ executor: ExecutorConfig = Factory(ExecutorConfig)
433
+ diagnostic_providers: list[DiagnosticProviderConfig] = Factory(default_providers) # noqa: RUF009, RUF100
371
434
  _raw: TOMLDocument | None = field(init=False, default=None, repr=False)
372
435
  _config_file: Path | None = field(init=False, default=None, repr=False)
373
436
 
climate_ref/database.py CHANGED
@@ -8,6 +8,7 @@ The `Database` class is the main entry point for interacting with the database.
8
8
  It provides a session object that can be used to interact with the database and run queries.
9
9
  """
10
10
 
11
+ import enum
11
12
  import importlib.resources
12
13
  import shutil
13
14
  from datetime import datetime
@@ -23,6 +24,7 @@ from loguru import logger
23
24
  from sqlalchemy.orm import Session
24
25
 
25
26
  from climate_ref.models import MetricValue, Table
27
+ from climate_ref.models.execution import ExecutionOutput
26
28
  from climate_ref_core.pycmec.controlled_vocabulary import CV
27
29
 
28
30
  if TYPE_CHECKING:
@@ -135,6 +137,16 @@ def validate_database_url(database_url: str) -> str:
135
137
  return database_url
136
138
 
137
139
 
140
class ModelState(enum.Enum):
    """
    Describes what happened to a model instance during a database operation
    """

    # A new instance was created
    CREATED = "created"

    # An existing instance was modified
    UPDATED = "updated"

    # An instance was deleted
    DELETED = "deleted"
148
+
149
+
138
150
  class Database:
139
151
  """
140
152
  Manage the database connection and migrations
@@ -234,11 +246,57 @@ class Database:
234
246
  # This will add new columns to the db if the CVs have changed
235
247
  MetricValue.register_cv_dimensions(cv)
236
248
 
249
+ # Register the CV dimensions with the ExecutionOutput model
250
+ # This enables dimension-based filtering of outputs
251
+ ExecutionOutput.register_cv_dimensions(cv)
252
+
237
253
  return db
238
254
 
255
def update_or_create(
    self, model: type[Table], defaults: dict[str, Any] | None = None, **kwargs: Any
) -> tuple[Table, ModelState | None]:
    """
    Fetch the instance matching the filters and update it, or create it if there is no match

    The transaction is not committed here.
    Call `session.commit()` afterwards or wrap the call in a transaction context manager.

    Parameters
    ----------
    model
        The model class to query and, if required, instantiate
    defaults
        Values applied on top of the filter parameters when creating a new instance,
        or written onto an existing instance where they differ from its current values
    kwargs
        The filter parameters used to look up an existing instance

    Returns
    -------
    :
        A tuple of the instance and its state:
        `ModelState.CREATED` for a new instance,
        `ModelState.UPDATED` if at least one attribute of an existing instance was changed,
        or `None` if the existing instance was left untouched
    """
    found = self.session.query(model).filter_by(**kwargs).first()

    if not found:
        # No match: build a new instance from the filters, with the defaults taking precedence
        created = model(**{**kwargs, **(defaults or {})})
        self.session.add(created)
        return created, ModelState.CREATED

    # A match exists: only write attributes whose value actually differs
    outcome: ModelState | None = None
    for attr_name, wanted in (defaults or {}).items():
        if getattr(found, attr_name) != wanted:
            logger.debug(f"Updating {model.__name__} {attr_name} to {wanted}")
            setattr(found, attr_name, wanted)
            outcome = ModelState.UPDATED
    return found, outcome
296
+
239
297
  def get_or_create(
240
298
  self, model: type[Table], defaults: dict[str, Any] | None = None, **kwargs: Any
241
- ) -> tuple[Table, bool]:
299
+ ) -> tuple[Table, ModelState | None]:
242
300
  """
243
301
  Get or create an instance of a model
244
302
 
@@ -258,13 +316,13 @@ class Database:
258
316
  Returns
259
317
  -------
260
318
  :
261
- A tuple containing the instance and a boolean indicating if the instance was created
319
+ A tuple containing the instance and enum indicating if the instance was created
262
320
  """
263
321
  instance = self.session.query(model).filter_by(**kwargs).first()
264
322
  if instance:
265
- return instance, False
323
+ return instance, None
266
324
  else:
267
325
  params = {**kwargs, **(defaults or {})}
268
326
  instance = model(**params)
269
327
  self.session.add(instance)
270
- return instance, True
328
+ return instance, ModelState.CREATED
@@ -5,15 +5,6 @@ obs4REF/ColumbiaU/WECANN-1-0/mon/hfls/gn/20250516/hfls_mon_WECANN-1-0_REF_gn_200
5
5
  obs4REF/ColumbiaU/WECANN-1-0/mon/hfss/gn/20250516/hfss_mon_WECANN-1-0_REF_gn_200701-201512.nc md5:b7a911e0fc164d07d3ab42a86d09b18b
6
6
  obs4REF/ECMWF/ERA-20C/mon/psl/gn/v20210727/psl_mon_ERA-20C_PCMDI_gn_190001-201012.nc md5:c100cf25d5681c375cd6c1ee60b678ba
7
7
  obs4REF/ECMWF/ERA-20C/mon/ts/gn/v20210727/ts_mon_ERA-20C_PCMDI_gn_190001-201012.nc md5:9ed8dfbb805ed4caa282ed70f873a3a0
8
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_200701-200712.nc md5:695633a2b401cfb66c8addbf58073dbc
9
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_200801-200812.nc md5:404f1e1f111859be06c00bcb8d740ff2
10
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_200901-200912.nc md5:a1bb8584d60cdd71154c01a692fa1fb4
11
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201001-201012.nc md5:b78016a3c61d99dc0fd29563aa344ca1
12
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201101-201112.nc md5:d64c231a7f798a255997ffe196613ea1
13
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201201-201212.nc md5:7d90ce60b872dc4f044b9b0101114983
14
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201301-201312.nc md5:2fc032707cb8a31ac60fa4abe9efe183
15
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201401-201412.nc md5:6022d17e11df7818f5b0429d6e401d17
16
- obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201501-201512.nc md5:c68fdabf6eeb4813befceace089c9494
17
8
  obs4REF/ECMWF/ERA-INT/mon/hfls/gn/v20210727/hfls_mon_ERA-INT_PCMDI_gn_197901-201903.nc md5:1ae4587143f05ee81432b3d9960aab63
18
9
  obs4REF/ECMWF/ERA-INT/mon/hfss/gn/v20210727/hfss_mon_ERA-INT_PCMDI_gn_197901-201903.nc md5:261f02b8cbce18486548882a11f9aa34
19
10
  obs4REF/ECMWF/ERA-INT/mon/hur/gn/v20210727/hur_mon_ERA-INT_PCMDI_gn_198901-201001.nc md5:56fcd2df8ed2879f18b5e8c78134a148
@@ -68,16 +68,16 @@ CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_
68
68
  CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_189001-190912.nc 5ce6e74fb80748e34a567b2895f029131c5980a292c744fbbf555c2235afe77f
69
69
  CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_191001-191512.nc f4a83f01af6563a63f43e4497ba0ea6e90297fb076fbcc8d63ac0105e6450ab5
70
70
  CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/fx/areacella/gn/v20190815/areacella_fx_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn.nc b67f7d92ee13d5f0fabc5397e8ba5743f11cb062fd2f761e42ae5ac8438e69a4
71
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/cli/gn/v20190308/cli_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc 579cdab588f2bfdc501fb296af15b5ff578bc0b05c65a5ed15848cdf96f4c5bd
72
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/clivi/gn/v20190308/clivi_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc 84cffb47d106ead933f174da8dd1fdff55c7672b28204b012adbb73eb0b59d8f
73
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/clt/gn/v20190308/clt_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc ba35c7ce1b221626cbb6363648e2de10a4bb403e214d6933ce650ed895c3f29e
74
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/clwvi/gn/v20190308/clwvi_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc bd15aff61ce2c9e2a17d1bc417a6ca0103ae3799d45f6a688d1ac1e994f85155
75
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/pr/gn/v20190401/pr_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc b6aae96e7bc02b20dbba58c43f6976700e9147c7adb6f72c123cde99d250bc74
76
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rlut/gn/v20190308/rlut_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc af072cf5e065b18207f140045f1a260d03ea85763319e9cf41ace71394d55478
77
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rlutcs/gn/v20190308/rlutcs_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc 469b9fcab4d7fb09426f2c5c67f8acf50a0904c9d8c6ef857abe933fac31a211
78
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rsut/gn/v20190308/rsut_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc 634e496041968a53223bc7ca709bd891222925253ebf14ce2d6b19f3bbd039e0
79
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rsutcs/gn/v20190308/rsutcs_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc 9ec147f4c44acb8de4d06b25bc4a890a9090726ea75b2dc855d5b3aa96f174f0
80
- CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/ta/gn/v20190308/ta_Amon_CESM2_historical_r1i1p1f1_gn_200701-201412.nc 08865486d7f8e2f086a957b002257b65e03a1b332540da484423bdce652af873
71
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/cli/gn/v20190308/cli_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc a3cf533720f63ad0cf7ae6668649df4b43169dfee8783ffea7889c285c6df925
72
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/clivi/gn/v20190308/clivi_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc 564c22fa5c94fe257a0bf613d4674b69e505b3c2967e69d0cf529654256bb5fd
73
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/clt/gn/v20190308/clt_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc bb1e70874e7589f98ed6c9a02bc0a0612050ca373bc66670ae404e0a2d0a138b
74
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/clwvi/gn/v20190308/clwvi_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc 7194d9f605e11f4f54ca93e08c045f3889f8d8d63ecebe40a78cf07913647d7a
75
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/pr/gn/v20190401/pr_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc cb5c48b389bcce3af009b32a7100c5669da0f79bd2058ebc3711489ca7ccbfb7
76
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rlut/gn/v20190308/rlut_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc 948760afc79c7f9401d5ed1b94bded8919aed9297dc672f45917b2f9e0228973
77
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rlutcs/gn/v20190308/rlutcs_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc 855961882d420cc2ddecd573c6b64c027c822402d57c3d157832fef42de0247b
78
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rsut/gn/v20190308/rsut_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc 7f3cea7e273ad8b593b00fd0bee865949f20a11bf468df4ac91ca80657e8e37d
79
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/rsutcs/gn/v20190308/rsutcs_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc 0f7cd2a564cbc6382e35d4bbbd67fed3b4c337f8a926dd56222e1fbc7b77380d
80
+ CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/Amon/ta/gn/v20190308/ta_Amon_CESM2_historical_r1i1p1f1_gn_199601-201412.nc c3cab5bbcd4cbf3563271e4e8b634b6a849182ae391e5dde21865ec7b22061ba
81
81
  CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/ImonAnt/snc/gn/v20190308/snc_ImonAnt_CESM2_historical_r1i1p1f1_gn_200001-201412.nc 3c933b6aaf471b170d6498fed4e01fa73e45169cb8e0790ed70051b69107482f
82
82
  CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/ImonGre/snc/gn/v20190308/snc_ImonGre_CESM2_historical_r1i1p1f1_gn_200001-201412.nc 6b6d84052cd6283663a60f416cc8e80cccfa0f4a2c963b5256ad208f567bbfde
83
83
  CMIP6/CMIP/NCAR/CESM2/historical/r1i1p1f1/LImon/snc/gn/v20190308/snc_LImon_CESM2_historical_r1i1p1f1_gn_200001-201412.nc b0f91e0eb9eef40ffa0ff6b57b8016bf2d3c25e6b3fdc8d12a8aca4d27e692a3
@@ -216,15 +216,6 @@ obs4REF/obs4REF/ColumbiaU/WECANN-1-0/mon/hfls/gn/20250516/hfls_mon_WECANN-1-0_RE
216
216
  obs4REF/obs4REF/ColumbiaU/WECANN-1-0/mon/hfss/gn/20250516/hfss_mon_WECANN-1-0_REF_gn_200701-201512.nc 14bdeae9e0b4b7bfe849c97dbdd29eae87f27d9464e8b3795d815369b13ffd0c
217
217
  obs4REF/obs4REF/ECMWF/ERA-20C/mon/psl/gn/v20210727/psl_mon_ERA-20C_PCMDI_gn_190001-201012.nc 53262d8f9076f233399d149810a644464d3bb36ae0f131fd55f164bc623b78da
218
218
  obs4REF/obs4REF/ECMWF/ERA-20C/mon/ts/gn/v20210727/ts_mon_ERA-20C_PCMDI_gn_190001-201012.nc 95bf8da4b8a071464688b527e822724c33c2794d100052eb12eb2804219ddb94
219
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_200701-200712.nc 36bd5cbda06258fb6aafd9fb2ccb79b4d08574116a6ebe8ccc48b6462bdb6419
220
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_200801-200812.nc 9b7802f845ca67f6b4d4bd0a73e0bce1c5042ecf3e7b209a5e470fd084ead238
221
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_200901-200912.nc 208a988bc440699beda1738342e7571c28dd2c3b2d169e0770c1764996bd41a4
222
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201001-201012.nc 3bfb4dec6966cea160af4ce872302af4d84ee2bd8bd3bba91468a424e17d9eae
223
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201101-201112.nc da16b7d20e764e25af3c6b834376bed5041872a0b11fab59234eca5cf1124495
224
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201201-201212.nc 08ae50141a576dfcbba0a9cf15a32653f48fa88d58406b60d21383e50dd309f0
225
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201301-201312.nc 488e55c4f6c858301abb957a5fb7de866e93fa54b234dbce08df652fad634649
226
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201401-201412.nc 9c5c4656b929d1c6dba5d83d5459db61d7d543182e58e29168eacdb7f151b125
227
- obs4REF/obs4REF/ECMWF/ERA-5/mon/ta/gn/v20250220/ta_mon_ERA-5_PCMDI_gn_201501-201512.nc 98e254f10b15c4d90dd258f66b8352f6e8b758f9bd64f435c90cb3bdd99c7086
228
219
  obs4REF/obs4REF/ECMWF/ERA-INT/mon/hfls/gn/v20210727/hfls_mon_ERA-INT_PCMDI_gn_197901-201903.nc 50d2b48789dcd642641b30ab52cc0f3ad161c057220cda52788080b2be2b927e
229
220
  obs4REF/obs4REF/ECMWF/ERA-INT/mon/hfss/gn/v20210727/hfss_mon_ERA-INT_PCMDI_gn_197901-201903.nc 72f15a671e88cb0ec239af9e8c1a608bdf2837c884efde9721213481bcfa02a0
230
221
  obs4REF/obs4REF/ECMWF/ERA-INT/mon/hur/gn/v20210727/hur_mon_ERA-INT_PCMDI_gn_198901-201001.nc 54c939a1a461930230a1ae1423856c1929d5dd6bab72cbdad1fe24c5da579908
@@ -25,15 +25,15 @@ def get_dataset_adapter(source_type: str, **kwargs: Any) -> "DatasetAdapter":
25
25
  DatasetAdapter instance
26
26
  """
27
27
  if source_type.lower() == SourceDatasetType.CMIP6.value:
28
- from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter
28
+ from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter # noqa: PLC0415
29
29
 
30
30
  return CMIP6DatasetAdapter(**kwargs)
31
31
  elif source_type.lower() == SourceDatasetType.obs4MIPs.value.lower():
32
- from climate_ref.datasets.obs4mips import Obs4MIPsDatasetAdapter
32
+ from climate_ref.datasets.obs4mips import Obs4MIPsDatasetAdapter # noqa: PLC0415
33
33
 
34
34
  return Obs4MIPsDatasetAdapter(**kwargs)
35
35
  elif source_type.lower() == SourceDatasetType.PMPClimatology.value.lower():
36
- from climate_ref.datasets.pmp_climatology import PMPClimatologyDatasetAdapter
36
+ from climate_ref.datasets.pmp_climatology import PMPClimatologyDatasetAdapter # noqa: PLC0415
37
37
 
38
38
  return PMPClimatologyDatasetAdapter(**kwargs)
39
39
  else:
@@ -2,16 +2,36 @@ from pathlib import Path
2
2
  from typing import Any, Protocol, cast
3
3
 
4
4
  import pandas as pd
5
+ from attrs import define
5
6
  from loguru import logger
6
7
  from sqlalchemy.orm import joinedload
7
8
 
8
9
  from climate_ref.config import Config
9
- from climate_ref.database import Database
10
+ from climate_ref.database import Database, ModelState
10
11
  from climate_ref.datasets.utils import validate_path
11
12
  from climate_ref.models.dataset import Dataset, DatasetFile
12
13
  from climate_ref_core.exceptions import RefException
13
14
 
14
15
 
16
@define
class DatasetRegistrationResult:
    """
    Outcome of registering a dataset

    Holds the dataset, its resulting state and the files grouped by the kind of change.
    """

    # The dataset that was registered
    dataset: Dataset
    # State of the dataset after registration, or None
    dataset_state: ModelState | None
    # Files grouped by what happened to them during registration
    files_added: list[str]
    files_updated: list[str]
    files_removed: list[str]
    files_unchanged: list[str]

    @property
    def total_changes(self) -> int:
        """Number of files that were added, updated or removed (unchanged files are not counted)"""
        changed_groups = (self.files_added, self.files_updated, self.files_removed)
        return sum(len(group) for group in changed_groups)
33
+
34
+
15
35
  def _log_duplicate_metadata(
16
36
  data_catalog: pd.DataFrame, unique_metadata: pd.DataFrame, slug_column: str
17
37
  ) -> None:
@@ -26,7 +46,8 @@ def _log_duplicate_metadata(
26
46
  invalid_dataset_columns = invalid_dataset_nunique[invalid_dataset_nunique.gt(1)].index.tolist()
27
47
 
28
48
  # Include time_range in the list of invalid columns to make debugging easier
29
- invalid_dataset_columns.append("time_range")
49
+ if "time_range" in data_catalog.columns and "time_range" not in invalid_dataset_columns:
50
+ invalid_dataset_columns.append("time_range")
30
51
 
31
52
  data_catalog_subset = data_catalog[data_catalog[slug_column] == instance_id]
32
53
 
@@ -169,9 +190,9 @@ class DatasetAdapter(Protocol):
169
190
 
170
191
  return data_catalog
171
192
 
172
- def register_dataset(
193
+ def register_dataset( # noqa: PLR0915
173
194
  self, config: Config, db: Database, data_catalog_dataset: pd.DataFrame
174
- ) -> Dataset | None:
195
+ ) -> DatasetRegistrationResult:
175
196
  """
176
197
  Register a dataset in the database using the data catalog
177
198
 
@@ -187,7 +208,7 @@ class DatasetAdapter(Protocol):
187
208
  Returns
188
209
  -------
189
210
  :
190
- Registered dataset if successful, else None
211
+ Registration result with dataset and file change information
191
212
  """
192
213
  DatasetModel = self.dataset_cls
193
214
 
@@ -197,24 +218,104 @@ class DatasetAdapter(Protocol):
197
218
  raise RefException(f"Found multiple datasets in the same directory: {unique_slugs}")
198
219
  slug = unique_slugs[0]
199
220
 
221
+ # Upsert the dataset (create a new dataset or update the metadata)
200
222
  dataset_metadata = data_catalog_dataset[list(self.dataset_specific_metadata)].iloc[0].to_dict()
201
- dataset, created = db.get_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
202
- if not created:
203
- logger.warning(f"{dataset} already exists in the database. Skipping")
204
- return None
223
+ dataset, dataset_state = db.update_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
224
+ if dataset_state == ModelState.CREATED:
225
+ logger.info(f"Created new dataset: {dataset}")
226
+ elif dataset_state == ModelState.UPDATED:
227
+ logger.info(f"Updating existing dataset: {dataset}")
205
228
  db.session.flush()
206
- for dataset_file in data_catalog_dataset.to_dict(orient="records"):
207
- path = validate_path(dataset_file.pop("path"))
208
-
209
- db.session.add(
210
- DatasetFile(
211
- path=str(path),
212
- dataset_id=dataset.id,
213
- start_time=dataset_file.pop("start_time"),
214
- end_time=dataset_file.pop("end_time"),
229
+
230
+ # Initialize result tracking
231
+ files_added = []
232
+ files_updated = []
233
+ files_removed = []
234
+ files_unchanged = []
235
+
236
+ # Get current files for this dataset
237
+ current_files = db.session.query(DatasetFile).filter_by(dataset_id=dataset.id).all()
238
+ current_file_paths = {f.path: f for f in current_files}
239
+
240
+ # Get new file data from data catalog
241
+ new_file_data = data_catalog_dataset.to_dict(orient="records")
242
+ new_file_lookup = {}
243
+ for dataset_file in new_file_data:
244
+ file_path = str(validate_path(dataset_file["path"]))
245
+ new_file_lookup[file_path] = {
246
+ "start_time": dataset_file["start_time"],
247
+ "end_time": dataset_file["end_time"],
248
+ }
249
+
250
+ new_file_paths = set(new_file_lookup.keys())
251
+ existing_file_paths = set(current_file_paths.keys())
252
+
253
+ # TODO: support removing files that are no longer present
254
+ # We want to keep a record of the dataset if it was used by a diagnostic in the past
255
+ files_to_remove = existing_file_paths - new_file_paths
256
+ if files_to_remove:
257
+ files_removed = list(files_to_remove)
258
+ logger.warning(f"Files to remove: {files_removed}")
259
+ raise NotImplementedError("Removing files is not yet supported")
260
+
261
+ # Update existing files if start/end times have changed
262
+ for file_path, existing_file in current_file_paths.items():
263
+ if file_path in new_file_lookup:
264
+ new_times = new_file_lookup[file_path]
265
+ if (
266
+ existing_file.start_time != new_times["start_time"]
267
+ or existing_file.end_time != new_times["end_time"]
268
+ ):
269
+ logger.warning(f"Updating file times for {file_path}")
270
+ existing_file.start_time = new_times["start_time"]
271
+ existing_file.end_time = new_times["end_time"]
272
+ files_updated.append(file_path)
273
+ else:
274
+ files_unchanged.append(file_path)
275
+
276
+ # Add new files (batch operation)
277
+ files_to_add = new_file_paths - existing_file_paths
278
+ if files_to_add:
279
+ files_added = list(files_to_add)
280
+ new_dataset_files = []
281
+ for file_path in files_to_add:
282
+ file_times = new_file_lookup[file_path]
283
+ new_dataset_files.append(
284
+ DatasetFile(
285
+ path=file_path,
286
+ dataset_id=dataset.id,
287
+ start_time=file_times["start_time"],
288
+ end_time=file_times["end_time"],
289
+ )
215
290
  )
216
- )
217
- return dataset
291
+ db.session.add_all(new_dataset_files)
292
+
293
+ # Determine final dataset state
294
+ # If dataset metadata changed, use that state
295
+ # If no metadata changed but files changed, consider it updated
296
+ # If nothing changed, keep the original state (None for existing, CREATED for new)
297
+ final_dataset_state = dataset_state
298
+ if dataset_state is None and (files_added or files_updated or files_removed):
299
+ final_dataset_state = ModelState.UPDATED
300
+
301
+ result = DatasetRegistrationResult(
302
+ dataset=dataset,
303
+ dataset_state=final_dataset_state,
304
+ files_added=files_added,
305
+ files_updated=files_updated,
306
+ files_removed=files_removed,
307
+ files_unchanged=files_unchanged,
308
+ )
309
+ change_message = f": ({final_dataset_state.name})" if final_dataset_state else ""
310
+ logger.debug(
311
+ f"Dataset registration complete for {dataset.slug}{change_message} "
312
+ f"{len(files_added)} files added, "
313
+ f"{len(files_updated)} files updated, "
314
+ f"{len(files_removed)} files removed, "
315
+ f"{len(files_unchanged)} files unchanged"
316
+ )
317
+
318
+ return result
218
319
 
219
320
  def _get_dataset_files(self, db: Database, limit: int | None = None) -> pd.DataFrame:
220
321
  dataset_type = self.dataset_cls.__mapper_args__["polymorphic_identity"]
@@ -119,6 +119,8 @@ class CMIP6DatasetAdapter(DatasetAdapter):
119
119
  file_specific_metadata = ("start_time", "end_time", "path")
120
120
 
121
121
  version_metadata = "version"
122
+ # See https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf
123
+ # under "Directory structure template"
122
124
  dataset_id_metadata = (
123
125
  "activity_id",
124
126
  "institution_id",
@@ -7,7 +7,6 @@ from typing import Any
7
7
  import pandas as pd
8
8
  import xarray as xr
9
9
  from ecgtools import Builder
10
- from ecgtools.parsers.utilities import extract_attr_with_regex # type: ignore
11
10
  from loguru import logger
12
11
 
13
12
  from climate_ref.datasets.base import DatasetAdapter
@@ -15,7 +14,7 @@ from climate_ref.datasets.cmip6 import _parse_datetime
15
14
  from climate_ref.models.dataset import Dataset, Obs4MIPsDataset
16
15
 
17
16
 
18
- def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
17
+ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]: # noqa: PLR0912
19
18
  """
20
19
  Parser for obs4mips
21
20
 
@@ -41,6 +40,7 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
41
40
  "source_type",
42
41
  "variable_id",
43
42
  "variant_label",
43
+ "source_version_number",
44
44
  }
45
45
  )
46
46
  )
@@ -48,6 +48,10 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
48
48
  try:
49
49
  time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
50
50
  with xr.open_dataset(file, chunks={}, decode_times=time_coder) as ds:
51
+ if ds.attrs.get("activity_id", "") != "obs4MIPs":
52
+ traceback_message = f"{file} is not an obs4MIPs dataset"
53
+ raise TypeError(traceback_message)
54
+
51
55
  has_none_value = any(ds.attrs.get(key) is None for key in keys)
52
56
  if has_none_value:
53
57
  missing_fields = [key for key in keys if ds.attrs.get(key) is None]
@@ -55,10 +59,6 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
55
59
  raise AttributeError(traceback_message)
56
60
  info = {key: ds.attrs.get(key) for key in keys}
57
61
 
58
- if info["activity_id"] != "obs4MIPs":
59
- traceback_message = f"{file} is not an obs4MIPs dataset"
60
- raise TypeError(traceback_message)
61
-
62
62
  variable_id = info["variable_id"]
63
63
 
64
64
  if variable_id:
@@ -86,12 +86,12 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
86
86
  else:
87
87
  info["time_range"] = f"{start_time}-{end_time}"
88
88
  info["path"] = str(file)
89
- info["source_version_number"] = (
90
- extract_attr_with_regex(
91
- str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}", strip_chars=None, ignore_case=True
92
- )
93
- or "v0"
94
- )
89
+ # Parsing the version like for CMIP6 fails because some obs4REF paths
90
+ # do not include "v" in the version directory name.
91
+ # TODO: fix obs4REF paths
92
+ info["version"] = Path(file).parent.name
93
+ if not info["version"].startswith("v"): # type: ignore[union-attr]
94
+ info["version"] = "v{version}".format(**info)
95
95
  return info
96
96
 
97
97
  except (TypeError, AttributeError) as err:
@@ -99,7 +99,7 @@ def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
99
99
  logger.warning(str(err.args[0]))
100
100
  else:
101
101
  logger.warning(str(err.args))
102
- return {"INVALID_ASSET": file, "TRACEBACK": traceback_message}
102
+ return {"INVALID_ASSET": file, "TRACEBACK": str(err)}
103
103
  except Exception:
104
104
  logger.warning(traceback.format_exc())
105
105
  return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
@@ -129,18 +129,22 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
129
129
  "variant_label",
130
130
  "long_name",
131
131
  "units",
132
+ "version",
132
133
  "vertical_levels",
133
134
  "source_version_number",
134
135
  slug_column,
135
136
  )
136
137
 
137
138
  file_specific_metadata = ("start_time", "end_time", "path")
138
- version_metadata = "source_version_number"
139
+ version_metadata = "version"
140
+ # See ODS2.5 at https://doi.org/10.5281/zenodo.11500474 under "Directory structure template"
139
141
  dataset_id_metadata = (
140
142
  "activity_id",
141
143
  "institution_id",
142
144
  "source_id",
145
+ "frequency",
143
146
  "variable_id",
147
+ "nominal_resolution",
144
148
  "grid_label",
145
149
  )
146
150
 
@@ -186,7 +190,14 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
186
190
  self.version_metadata,
187
191
  ]
188
192
  datasets["instance_id"] = datasets.apply(
189
- lambda row: "obs4MIPs." + ".".join([row[item] for item in drs_items]), axis=1
193
+ lambda row: "obs4MIPs."
194
+ + ".".join(
195
+ [
196
+ row[item].replace(" ", "") if item == "nominal_resolution" else row[item]
197
+ for item in drs_items
198
+ ]
199
+ ),
200
+ axis=1,
190
201
  )
191
202
  datasets["finalised"] = True
192
203
  return datasets