climate-ref 0.7.0__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {climate_ref-0.7.0 → climate_ref-0.8.1}/PKG-INFO +1 -1
  2. {climate_ref-0.7.0 → climate_ref-0.8.1}/pyproject.toml +1 -1
  3. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/cli/datasets.py +1 -3
  4. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/config.py +63 -0
  5. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/executor/hpc.py +149 -53
  6. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/executor/local.py +1 -2
  7. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/executor/result_handling.py +13 -6
  8. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/solver.py +17 -6
  9. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_datasets.py +1 -3
  10. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/executor/test_hpc_executor.py +76 -11
  11. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/test_config.py +52 -1
  12. {climate_ref-0.7.0 → climate_ref-0.8.1}/.gitignore +0 -0
  13. {climate_ref-0.7.0 → climate_ref-0.8.1}/Dockerfile +0 -0
  14. {climate_ref-0.7.0 → climate_ref-0.8.1}/LICENCE +0 -0
  15. {climate_ref-0.7.0 → climate_ref-0.8.1}/NOTICE +0 -0
  16. {climate_ref-0.7.0 → climate_ref-0.8.1}/README.md +0 -0
  17. {climate_ref-0.7.0 → climate_ref-0.8.1}/conftest.py +0 -0
  18. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/__init__.py +0 -0
  19. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/_config_helpers.py +0 -0
  20. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/alembic.ini +0 -0
  21. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/cli/__init__.py +0 -0
  22. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/cli/_utils.py +0 -0
  23. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/cli/config.py +0 -0
  24. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/cli/executions.py +0 -0
  25. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/cli/providers.py +0 -0
  26. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/cli/solve.py +0 -0
  27. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/constants.py +0 -0
  28. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/database.py +0 -0
  29. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/dataset_registry/obs4ref_reference.txt +0 -0
  30. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/dataset_registry/sample_data.txt +0 -0
  31. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/datasets/__init__.py +0 -0
  32. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/datasets/base.py +0 -0
  33. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/datasets/cmip6.py +0 -0
  34. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/datasets/cmip6_parsers.py +0 -0
  35. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/datasets/obs4mips.py +0 -0
  36. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/datasets/pmp_climatology.py +0 -0
  37. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/datasets/utils.py +0 -0
  38. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/executor/__init__.py +0 -0
  39. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/executor/pbs_scheduler.py +0 -0
  40. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/executor/synchronous.py +0 -0
  41. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/README +0 -0
  42. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/env.py +0 -0
  43. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/script.py.mako +0 -0
  44. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +0 -0
  45. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-05-09T2032_03dbb4998e49_series_metric_value.py +0 -0
  46. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-07-03T1505_795c1e6cf496_drop_unique_requirement_on_slug.py +0 -0
  47. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-07-20T1521_94beace57a9c_cmip6_finalised.py +0 -0
  48. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-08-05T0327_a1b2c3d4e5f6_finalised_on_base_dataset.py +0 -0
  49. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-09-05T2019_8d28e5e0f9c3_add_indexes.py +0 -0
  50. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-09-10T1358_2f6e36738e06_use_version_as_version_facet_for_.py +0 -0
  51. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/migrations/versions/2025-09-22T2359_20cd136a5b04_add_pmp_version.py +0 -0
  52. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/__init__.py +0 -0
  53. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/base.py +0 -0
  54. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/dataset.py +0 -0
  55. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/diagnostic.py +0 -0
  56. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/execution.py +0 -0
  57. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/metric_value.py +0 -0
  58. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/mixins.py +0 -0
  59. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/models/provider.py +0 -0
  60. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/provider_registry.py +0 -0
  61. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/py.typed +0 -0
  62. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/slurm.py +0 -0
  63. {climate_ref-0.7.0 → climate_ref-0.8.1}/src/climate_ref/testing.py +0 -0
  64. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_config.py +0 -0
  65. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_executions/test_inspect.txt +0 -0
  66. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_executions.py +0 -0
  67. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_providers.py +0 -0
  68. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_root.py +0 -0
  69. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_solve.py +0 -0
  70. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/cli/test_utils.py +0 -0
  71. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/conftest.py +0 -0
  72. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_db.yml +0 -0
  73. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_db_complete.yml +0 -0
  74. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_db_drs.yml +0 -0
  75. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_local_complete.yml +0 -0
  76. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_cmip6/cmip6_catalog_local_drs.yml +0 -0
  77. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_cmip6.py +0 -0
  78. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_datasets.py +0 -0
  79. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_db.yml +0 -0
  80. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_local.yml +0 -0
  81. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_obs4mips.py +0 -0
  82. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_pmp_climatology/pmp_catalog_local.yml +0 -0
  83. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_pmp_climatology.py +0 -0
  84. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/datasets/test_utils.py +0 -0
  85. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/executor/test_local_executor.py +0 -0
  86. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/executor/test_result_handling.py +0 -0
  87. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/executor/test_synchronous_executor.py +0 -0
  88. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/models/test_metric_execution.py +0 -0
  89. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/models/test_metric_value.py +0 -0
  90. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/test_database.py +0 -0
  91. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/test_pbssmartprovider.py +0 -0
  92. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/test_provider_registry.py +0 -0
  93. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/test_slurm.py +0 -0
  94. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/test_solver/test_solve_metrics.yml +0 -0
  95. {climate_ref-0.7.0 → climate_ref-0.8.1}/tests/unit/test_solver.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: climate-ref
3
- Version: 0.7.0
3
+ Version: 0.8.1
4
4
  Summary: Application which runs the CMIP Rapid Evaluation Framework
5
5
  Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
6
6
  License-Expression: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "climate-ref"
3
- version = "0.7.0"
3
+ version = "0.8.1"
4
4
  description = "Application which runs the CMIP Rapid Evaluation Framework"
5
5
  readme = "README.md"
6
6
  authors = [
@@ -6,8 +6,6 @@ which executions are required for a given diagnostic without having to re-parse
6
6
 
7
7
  """
8
8
 
9
- import errno
10
- import os
11
9
  import shutil
12
10
  from collections.abc import Iterable
13
11
  from pathlib import Path
@@ -133,7 +131,7 @@ def ingest( # noqa
133
131
 
134
132
  if not _dir.exists():
135
133
  logger.error(f"File or directory {_dir} does not exist")
136
- raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), _dir)
134
+ continue
137
135
 
138
136
  # TODO: This assumes that all datasets are nc files.
139
137
  # THis is true for CMIP6 and obs4MIPs but may not be true for other dataset types in the future.
@@ -14,11 +14,14 @@ which always take precedence over any other configuration values.
14
14
  # `esgpull` configuration management system with some of the extra complexity removed.
15
15
  # https://github.com/ESGF/esgf-download/blob/main/esgpull/config.py
16
16
 
17
+ import datetime
17
18
  import importlib.resources
18
19
  import os
19
20
  from pathlib import Path
20
21
  from typing import TYPE_CHECKING, Any, Literal
21
22
 
23
+ import platformdirs
24
+ import requests
22
25
  import tomlkit
23
26
  from attr import Factory
24
27
  from attrs import define, field
@@ -334,6 +337,46 @@ def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config":
334
337
  return _converter_defaults_relaxed.structure(doc, Config)
335
338
 
336
339
 
340
+ DEFAULT_IGNORE_DATASETS_MAX_AGE = datetime.timedelta(hours=6)
341
+ DEFAULT_IGNORE_DATASETS_URL = (
342
+ "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/default_ignore_datasets.yaml"
343
+ )
344
+
345
+
346
+ def _get_default_ignore_datasets_file() -> Path:
347
+ """
348
+ Get the path to the ignore datasets file
349
+ """
350
+ cache_dir = platformdirs.user_cache_path("climate_ref")
351
+ cache_dir.mkdir(parents=True, exist_ok=True)
352
+ ignore_datasets_file = cache_dir / "default_ignore_datasets.yaml"
353
+
354
+ download = True
355
+ if ignore_datasets_file.exists():
356
+ # Only update if the ignore datasets file is older than `DEFAULT_IGNORE_DATASETS_MAX_AGE`.
357
+ modification_time = datetime.datetime.fromtimestamp(ignore_datasets_file.stat().st_mtime)
358
+ age = datetime.datetime.now() - modification_time
359
+ if age < DEFAULT_IGNORE_DATASETS_MAX_AGE:
360
+ download = False
361
+
362
+ if download:
363
+ logger.info(
364
+ f"Downloading default ignore datasets file from {DEFAULT_IGNORE_DATASETS_URL} "
365
+ f"to {ignore_datasets_file}"
366
+ )
367
+ response = requests.get(DEFAULT_IGNORE_DATASETS_URL, timeout=120)
368
+ try:
369
+ response.raise_for_status()
370
+ except requests.RequestException as exc:
371
+ logger.warning(f"Failed to download default ignore datasets file: {exc}")
372
+ ignore_datasets_file.touch(exist_ok=True)
373
+ else:
374
+ with ignore_datasets_file.open(mode="wb") as file:
375
+ file.write(response.content)
376
+
377
+ return ignore_datasets_file
378
+
379
+
337
380
  @define(auto_attribs=True)
338
381
  class Config:
339
382
  """
@@ -364,6 +407,26 @@ class Config:
364
407
  - `complete`: Use the complete parser, which parses the dataset based on all available metadata.
365
408
  """
366
409
 
410
+ ignore_datasets_file: Path = field(factory=_get_default_ignore_datasets_file)
411
+ """
412
+ Path to the file containing the ignore datasets
413
+
414
+ This file is a YAML file that contains a list of facets to ignore per diagnostic.
415
+
416
+ The format is:
417
+ ```yaml
418
+ provider:
419
+ diagnostic:
420
+ source_type:
421
+ - facet: value
422
+ - another_facet: [another_value1, another_value2]
423
+ ```
424
+
425
+ If this is not specified, a default ignore datasets file will be used.
426
+ The default file is downloaded from the Climate-REF GitHub repository
427
+ if it does not exist or is older than 6 hours.
428
+ """
429
+
367
430
  paths: PathConfig = Factory(PathConfig)
368
431
  db: DbConfig = Factory(DbConfig)
369
432
  executor: ExecutorConfig = Factory(ExecutorConfig)
@@ -19,8 +19,9 @@ except ImportError: # pragma: no cover
19
19
  )
20
20
 
21
21
  import os
22
+ import re
22
23
  import time
23
- from typing import Any
24
+ from typing import Annotated, Any, Literal
24
25
 
25
26
  import parsl
26
27
  from loguru import logger
@@ -29,6 +30,7 @@ from parsl.config import Config as ParslConfig
29
30
  from parsl.executors import HighThroughputExecutor
30
31
  from parsl.launchers import SimpleLauncher, SrunLauncher
31
32
  from parsl.providers import SlurmProvider
33
+ from pydantic import BaseModel, Field, StrictBool, field_validator, model_validator
32
34
  from tqdm import tqdm
33
35
 
34
36
  from climate_ref.config import Config
@@ -43,6 +45,72 @@ from .local import ExecutionFuture, process_result
43
45
  from .pbs_scheduler import SmartPBSProvider
44
46
 
45
47
 
48
+ class SlurmConfig(BaseModel):
49
+ """Slurm Configurations"""
50
+
51
+ scheduler: Literal["slurm"]
52
+ account: str
53
+ username: str
54
+ partition: str | None = None
55
+ log_dir: str = "runinfo"
56
+ qos: str | None = None
57
+ req_nodes: Annotated[int, Field(strict=True, ge=1, le=1000)] = 1
58
+ cores_per_worker: Annotated[int, Field(strict=True, ge=1, le=1000)] = 1
59
+ mem_per_worker: Annotated[float, Field(strict=True, gt=0, lt=1000.0)] | None = None
60
+ max_workers_per_node: Annotated[int, Field(strict=True, ge=1, le=1000)] = 16
61
+ validation: StrictBool = False
62
+ walltime: str = "00:30:00"
63
+ scheduler_options: str = ""
64
+ retries: Annotated[int, Field(strict=True, ge=1, le=3)] = 2
65
+ max_blocks: Annotated[int, Field(strict=True, ge=1)] = 1 # one block mean one job?
66
+ worker_init: str = ""
67
+ overrides: str = ""
68
+ cmd_timeout: Annotated[int, Field(strict=True, ge=0)] = 120
69
+ cpu_affinity: str = "none"
70
+
71
+ @model_validator(mode="before")
72
+ def _check_parition_qos(cls, data: Any) -> Any:
73
+ if not ("partition" in data or "qos" in data):
74
+ raise ValueError("partition or qos is needed")
75
+ return data
76
+
77
+ @field_validator("scheduler_options")
78
+ def _validate_sbatch_syntax(cls, v: str | None) -> Any:
79
+ if not v:
80
+ return v
81
+
82
+ sbatch_pattern = re.compile(
83
+ r"^\s*#SBATCH\s+" # Start with #SBATCH
84
+ r"(?:-\w+\s+[^\s]+" # Option-value pairs
85
+ r"(?:\s+-\w+\s+[^\s]+)*)" # Additional options
86
+ r"\s*$",
87
+ re.IGNORECASE | re.MULTILINE,
88
+ )
89
+
90
+ invalid_lines = [
91
+ line
92
+ for line in v.split("\n")
93
+ if not (line.strip().upper().startswith("#SBATCH") and sbatch_pattern.match(line.strip()))
94
+ ]
95
+
96
+ if invalid_lines:
97
+ error_msg = (
98
+ "Invalid SBATCH directives:\n"
99
+ + "\n".join(invalid_lines)
100
+ + "\n"
101
+ + "Expected format: '#SBATCH -option value [-option value ...]'"
102
+ )
103
+ raise ValueError(error_msg)
104
+ return v
105
+
106
+ @field_validator("walltime")
107
+ def _validate_walltime(cls, v: str) -> str:
108
+ pattern = r"^(\d+-)?\d{1,5}:[0-5][0-9]:[0-5][0-9]$"
109
+ if not re.match(pattern, v):
110
+ raise ValueError("Walltime must be in `D-HH:MM:SS/HH:MM:SS` format")
111
+ return v
112
+
113
+
46
114
  @python_app
47
115
  def _process_run(definition: ExecutionDefinition, log_level: str) -> ExecutionResult:
48
116
  """Run the function on computer nodes"""
@@ -112,13 +180,18 @@ class HPCExecutor:
112
180
  self.cores_per_worker = _to_int(executor_config.get("cores_per_worker"))
113
181
  self.mem_per_worker = _to_float(executor_config.get("mem_per_worker"))
114
182
 
115
- hours, minutes, seconds = map(int, self.walltime.split(":"))
183
+ if self.scheduler == "slurm":
184
+ self.slurm_config = SlurmConfig.model_validate(executor_config)
185
+ hours, minutes, seconds = map(int, self.slurm_config.walltime.split(":"))
186
+
187
+ if self.slurm_config.validation and HAS_REAL_SLURM:
188
+ self._validate_slurm_params()
189
+ else:
190
+ hours, minutes, seconds = map(int, self.walltime.split(":"))
191
+
116
192
  total_minutes = hours * 60 + minutes + seconds / 60
117
193
  self.total_minutes = total_minutes
118
194
 
119
- if executor_config.get("validation") and HAS_REAL_SLURM:
120
- self._validate_slurm_params()
121
-
122
195
  self._initialize_parsl()
123
196
 
124
197
  self.parsl_results: list[ExecutionFuture] = []
@@ -131,45 +204,52 @@ class HPCExecutor:
131
204
  ValueError: If account, partition or QOS are invalid or inaccessible.
132
205
  """
133
206
  slurm_checker = SlurmChecker()
134
- if self.account and not slurm_checker.get_account_info(self.account):
135
- raise ValueError(f"Account: {self.account} not valid")
207
+ if self.slurm_config.account and not slurm_checker.get_account_info(self.slurm_config.account):
208
+ raise ValueError(f"Account: {self.slurm_config.account} not valid")
136
209
 
137
210
  partition_limits = None
138
211
  node_info = None
139
212
 
140
- if self.partition:
141
- if not slurm_checker.get_partition_info(self.partition):
142
- raise ValueError(f"Partition: {self.partition} not valid")
213
+ if self.slurm_config.partition:
214
+ if not slurm_checker.get_partition_info(self.slurm_config.partition):
215
+ raise ValueError(f"Partition: {self.slurm_config.partition} not valid")
143
216
 
144
- if not slurm_checker.can_account_use_partition(self.account, self.partition):
145
- raise ValueError(f"Account: {self.account} cannot access partiton: {self.partition}")
217
+ if not slurm_checker.can_account_use_partition(
218
+ self.slurm_config.account, self.slurm_config.partition
219
+ ):
220
+ raise ValueError(
221
+ f"Account: {self.slurm_config.account}"
222
+ f" cannot access partiton: {self.slurm_config.partition}"
223
+ )
146
224
 
147
- partition_limits = slurm_checker.get_partition_limits(self.partition)
148
- node_info = slurm_checker.get_node_from_partition(self.partition)
225
+ partition_limits = slurm_checker.get_partition_limits(self.slurm_config.partition)
226
+ node_info = slurm_checker.get_node_from_partition(self.slurm_config.partition)
149
227
 
150
228
  qos_limits = None
151
- if self.qos:
152
- if not slurm_checker.get_qos_info(self.qos):
153
- raise ValueError(f"QOS: {self.qos} not valid")
229
+ if self.slurm_config.qos:
230
+ if not slurm_checker.get_qos_info(self.slurm_config.qos):
231
+ raise ValueError(f"QOS: {self.slurm_config.qos} not valid")
154
232
 
155
- if not slurm_checker.can_account_use_qos(self.account, self.qos):
156
- raise ValueError(f"Account: {self.account} cannot access qos: {self.qos}")
233
+ if not slurm_checker.can_account_use_qos(self.slurm_config.account, self.slurm_config.qos):
234
+ raise ValueError(
235
+ f"Account: {self.slurm_config.account} cannot access qos: {self.slurm_config.qos}"
236
+ )
157
237
 
158
- qos_limits = slurm_checker.get_qos_limits(self.qos)
238
+ qos_limits = slurm_checker.get_qos_limits(self.slurm_config.qos)
159
239
 
160
240
  max_cores_per_node = int(node_info["cpus"]) if node_info else None
161
- if max_cores_per_node and self.cores_per_worker:
162
- if self.cores_per_worker > max_cores_per_node:
241
+ if max_cores_per_node and self.slurm_config.cores_per_worker:
242
+ if self.slurm_config.cores_per_worker > max_cores_per_node:
163
243
  raise ValueError(
164
- f"cores_per_work:{self.cores_per_worker}"
244
+ f"cores_per_work:{self.slurm_config.cores_per_worker}"
165
245
  f"larger than the maximum in a node {max_cores_per_node}"
166
246
  )
167
247
 
168
248
  max_mem_per_node = float(node_info["real_memory"]) if node_info else None
169
- if max_mem_per_node and self.mem_per_worker:
170
- if self.mem_per_worker > max_mem_per_node:
249
+ if max_mem_per_node and self.slurm_config.mem_per_worker:
250
+ if self.slurm_config.mem_per_worker > max_mem_per_node:
171
251
  raise ValueError(
172
- f"mem_per_work:{self.mem_per_worker}"
252
+ f"mem_per_work:{self.slurm_config.mem_per_worker}"
173
253
  f"larger than the maximum mem in a node {max_mem_per_node}"
174
254
  )
175
255
 
@@ -182,8 +262,8 @@ class HPCExecutor:
182
262
 
183
263
  if self.total_minutes > float(max_walltime_minutes):
184
264
  raise ValueError(
185
- f"Walltime: {self.walltime} exceed the maximum time "
186
- f"{max_walltime_minutes} allowed by {self.partition} and {self.qos}"
265
+ f"Walltime: {self.slurm_config.walltime} exceed the maximum time "
266
+ f"{max_walltime_minutes} allowed by {self.slurm_config.partition} and {self.slurm_config.qos}"
187
267
  )
188
268
 
189
269
  def _initialize_parsl(self) -> None:
@@ -192,19 +272,34 @@ class HPCExecutor:
192
272
  provider: SlurmProvider | SmartPBSProvider
193
273
  if self.scheduler == "slurm":
194
274
  provider = SlurmProvider(
195
- account=self.account,
196
- partition=self.partition,
197
- qos=self.qos,
198
- nodes_per_block=self.req_nodes,
199
- max_blocks=int(executor_config.get("max_blocks", 1)),
200
- scheduler_options=executor_config.get("scheduler_options", "#SBATCH -C cpu"),
201
- worker_init=executor_config.get("worker_init", "source .venv/bin/activate"),
275
+ account=self.slurm_config.account,
276
+ partition=self.slurm_config.partition,
277
+ qos=self.slurm_config.qos,
278
+ nodes_per_block=self.slurm_config.req_nodes,
279
+ max_blocks=self.slurm_config.max_blocks,
280
+ scheduler_options=self.slurm_config.scheduler_options,
281
+ worker_init=self.slurm_config.worker_init,
202
282
  launcher=SrunLauncher(
203
283
  debug=True,
204
- overrides=executor_config.get("overrides", ""),
284
+ overrides=self.slurm_config.overrides,
205
285
  ),
206
- walltime=self.walltime,
207
- cmd_timeout=int(executor_config.get("cmd_timeout", 120)),
286
+ walltime=self.slurm_config.walltime,
287
+ cmd_timeout=self.slurm_config.cmd_timeout,
288
+ )
289
+
290
+ executor = HighThroughputExecutor(
291
+ label="ref_hpc_executor",
292
+ cores_per_worker=self.slurm_config.cores_per_worker,
293
+ mem_per_worker=self.slurm_config.mem_per_worker,
294
+ max_workers_per_node=self.slurm_config.max_workers_per_node,
295
+ cpu_affinity=self.slurm_config.cpu_affinity,
296
+ provider=provider,
297
+ )
298
+
299
+ hpc_config = ParslConfig(
300
+ run_dir=self.slurm_config.log_dir,
301
+ executors=[executor],
302
+ retries=self.slurm_config.retries,
208
303
  )
209
304
 
210
305
  elif self.scheduler == "pbs":
@@ -227,23 +322,24 @@ class HPCExecutor:
227
322
  walltime=self.walltime,
228
323
  cmd_timeout=int(executor_config.get("cmd_timeout", 120)),
229
324
  )
230
- else:
231
- raise ValueError(f"Unsupported scheduler: {self.scheduler}")
232
325
 
233
- executor = HighThroughputExecutor(
234
- label="ref_hpc_executor",
235
- cores_per_worker=self.cores_per_worker if self.cores_per_worker else 1,
236
- mem_per_worker=self.mem_per_worker,
237
- max_workers_per_node=_to_int(executor_config.get("max_workers_per_node", 16)),
238
- cpu_affinity=str(executor_config.get("cpu_affinity")),
239
- provider=provider,
240
- )
326
+ executor = HighThroughputExecutor(
327
+ label="ref_hpc_executor",
328
+ cores_per_worker=self.cores_per_worker if self.cores_per_worker else 1,
329
+ mem_per_worker=self.mem_per_worker,
330
+ max_workers_per_node=_to_int(executor_config.get("max_workers_per_node", 16)),
331
+ cpu_affinity=str(executor_config.get("cpu_affinity")),
332
+ provider=provider,
333
+ )
241
334
 
242
- hpc_config = ParslConfig(
243
- run_dir=self.log_dir,
244
- executors=[executor],
245
- retries=int(executor_config.get("retries", 2)),
246
- )
335
+ hpc_config = ParslConfig(
336
+ run_dir=self.log_dir,
337
+ executors=[executor],
338
+ retries=int(executor_config.get("retries", 2)),
339
+ )
340
+
341
+ else:
342
+ raise ValueError(f"Unsupported scheduler: {self.scheduler}")
247
343
 
248
344
  parsl.load(hpc_config)
249
345
 
@@ -88,8 +88,7 @@ def _process_run(definition: ExecutionDefinition, log_level: str) -> ExecutionRe
88
88
  except Exception: # pragma: no cover
89
89
  # This isn't expected but if it happens we want to log the error before the process exits
90
90
  logger.exception("Error running diagnostic")
91
- # This will kill the process pool
92
- raise
91
+ return ExecutionResult.build_from_failure(definition)
93
92
 
94
93
 
95
94
  class LocalExecutor:
@@ -197,12 +197,19 @@ def handle_execution_result(
197
197
  The result of the diagnostic execution, either successful or failed
198
198
  """
199
199
  # Always copy log data to the results directory
200
- _copy_file_to_results(
201
- config.paths.scratch,
202
- config.paths.results,
203
- execution.output_fragment,
204
- EXECUTION_LOG_FILENAME,
205
- )
200
+ try:
201
+ _copy_file_to_results(
202
+ config.paths.scratch,
203
+ config.paths.results,
204
+ execution.output_fragment,
205
+ EXECUTION_LOG_FILENAME,
206
+ )
207
+ except FileNotFoundError:
208
+ logger.error(
209
+ f"Could not find log file {EXECUTION_LOG_FILENAME} in scratch directory: {config.paths.scratch}"
210
+ )
211
+ execution.mark_failed()
212
+ return
206
213
 
207
214
  if not result.successful or result.metric_bundle_filename is None:
208
215
  logger.error(f"{execution} failed")
@@ -353,7 +353,7 @@ class ExecutionSolver:
353
353
  yield from solve_executions(self.data_catalog, diagnostic, provider)
354
354
 
355
355
 
356
- def solve_required_executions( # noqa: PLR0913
356
+ def solve_required_executions( # noqa: PLR0912, PLR0913
357
357
  db: Database,
358
358
  dry_run: bool = False,
359
359
  execute: bool = True,
@@ -396,7 +396,14 @@ def solve_required_executions( # noqa: PLR0913
396
396
  f"for {potential_execution.diagnostic.full_slug()}"
397
397
  )
398
398
 
399
+ if potential_execution.provider.slug not in provider_count:
400
+ provider_count[potential_execution.provider.slug] = 0
401
+ if potential_execution.diagnostic.full_slug() not in diagnostic_count:
402
+ diagnostic_count[potential_execution.diagnostic.full_slug()] = 0
403
+
399
404
  if dry_run:
405
+ provider_count[potential_execution.provider.slug] += 1
406
+ diagnostic_count[potential_execution.diagnostic.full_slug()] += 1
400
407
  continue
401
408
 
402
409
  # Use a transaction to make sure that the models
@@ -421,11 +428,6 @@ def solve_required_executions( # noqa: PLR0913
421
428
  },
422
429
  )
423
430
 
424
- if diagnostic.provider.slug not in provider_count:
425
- provider_count[diagnostic.provider.slug] = 0
426
- if diagnostic.full_slug() not in diagnostic_count:
427
- diagnostic_count[diagnostic.full_slug()] = 0
428
-
429
431
  if created:
430
432
  logger.info(f"Created new execution group: {potential_execution.execution_slug()!r}")
431
433
  db.session.flush()
@@ -471,5 +473,14 @@ def solve_required_executions( # noqa: PLR0913
471
473
 
472
474
  provider_count[diagnostic.provider.slug] += 1
473
475
  diagnostic_count[diagnostic.full_slug()] += 1
476
+
477
+ logger.info("Solve complete")
478
+ logger.info(f"Found {sum(diagnostic_count.values())} new executions")
479
+ for diag, count in diagnostic_count.items():
480
+ logger.info(f" {diag}: {count} new executions")
481
+ for prov, count in provider_count.items():
482
+ logger.info(f" {prov}: {count} new executions")
483
+
474
484
  if timeout > 0:
475
485
  executor.join(timeout=timeout)
486
+ logger.info("All executions complete")
@@ -142,11 +142,9 @@ class TestIngest:
142
142
  "--source-type",
143
143
  "cmip6",
144
144
  ],
145
- expected_exit_code=1,
146
145
  )
147
- assert isinstance(result.exception, FileNotFoundError)
148
- assert result.exception.filename == sample_data_dir / "missing"
149
146
 
147
+ # Continues past the missing directory
150
148
  assert f"File or directory {sample_data_dir / 'missing'} does not exist" in result.stderr
151
149
 
152
150
  def test_ingest_dryrun(self, sample_data_dir, db, invoke_cli):
@@ -4,8 +4,9 @@ from unittest.mock import MagicMock, patch
4
4
  import parsl
5
5
  import pytest
6
6
  from parsl.dataflow import futures
7
+ from pydantic import ValidationError
7
8
 
8
- from climate_ref.executor.hpc import HPCExecutor, execute_locally
9
+ from climate_ref.executor.hpc import HPCExecutor, SlurmConfig, execute_locally
9
10
  from climate_ref.executor.local import ExecutionFuture
10
11
  from climate_ref_core.diagnostics import ExecutionResult
11
12
  from climate_ref_core.exceptions import DiagnosticError
@@ -27,14 +28,26 @@ def test_execute_locally_failed(definition_factory, mock_diagnostic):
27
28
 
28
29
 
29
30
  class TestHPCExecutor:
30
- def test_is_executor(self, tmp_path):
31
- executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
31
+ @pytest.fixture
32
+ def base_config(self, tmp_path):
33
+ """Shared config dictionary for all tests in this class."""
34
+ return {
35
+ "scheduler": "slurm",
36
+ "account": "myaccount",
37
+ "username": "myname",
38
+ "qos": "myqos",
39
+ "partition": "mypartition",
40
+ "log_dir": str(tmp_path / "parsl_runinfo"),
41
+ }
42
+
43
+ def test_is_executor(self, base_config):
44
+ executor = HPCExecutor(**base_config)
32
45
 
33
46
  assert executor.name == "hpc"
34
47
  assert isinstance(executor, Executor)
35
48
  parsl.dfk().cleanup()
36
49
 
37
- def test_run_metric(self, metric_definition, provider, mock_diagnostic, mocker, caplog, tmp_path):
50
+ def test_run_metric(self, metric_definition, provider, mock_diagnostic, mocker, caplog, base_config):
38
51
  with patch.object(HPCExecutor, "run", autospec=True) as mock_run:
39
52
  # Configure the mock to behave similarly to the original
40
53
  mock_run.side_effect = lambda self, definition, execution=None: (
@@ -47,7 +60,7 @@ class TestHPCExecutor:
47
60
  )
48
61
  )
49
62
 
50
- executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
63
+ executor = HPCExecutor(**base_config)
51
64
 
52
65
  # shall have the SerializationError, but not raised
53
66
  executor.run(metric_definition, None)
@@ -57,8 +70,8 @@ class TestHPCExecutor:
57
70
 
58
71
  parsl.dfk().cleanup()
59
72
 
60
- def test_join(self, metric_definition, tmp_path):
61
- executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
73
+ def test_join(self, metric_definition, base_config):
74
+ executor = HPCExecutor(**base_config)
62
75
  future = futures.AppFuture(1)
63
76
  executor.parsl_results = [ExecutionFuture(future, definition=metric_definition, execution_id=None)]
64
77
 
@@ -74,8 +87,8 @@ class TestHPCExecutor:
74
87
 
75
88
  assert len(executor.parsl_results) == 0
76
89
 
77
- def test_join_diagnostic_exception(self, metric_definition, tmp_path):
78
- executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
90
+ def test_join_diagnostic_exception(self, metric_definition, base_config):
91
+ executor = HPCExecutor(**base_config)
79
92
  future = futures.AppFuture(1)
80
93
  executor.parsl_results = [ExecutionFuture(future, definition=metric_definition, execution_id=None)]
81
94
 
@@ -99,8 +112,8 @@ class TestHPCExecutor:
99
112
  )
100
113
  assert len(executor.parsl_results) == 0
101
114
 
102
- def test_join_other_exception(self, metric_definition, tmp_path):
103
- executor = HPCExecutor(log_dir=tmp_path / "parsl_runinfo")
115
+ def test_join_other_exception(self, metric_definition, base_config):
116
+ executor = HPCExecutor(**base_config)
104
117
  future = futures.AppFuture(1)
105
118
  executor.parsl_results = [ExecutionFuture(future, definition=metric_definition, execution_id=None)]
106
119
 
@@ -108,3 +121,55 @@ class TestHPCExecutor:
108
121
 
109
122
  with pytest.raises(AssertionError, match=re.escape("Execution result should not be None")):
110
123
  executor.join(0.1)
124
+
125
@pytest.mark.parametrize(
    "field_name, invalid_value",
    [
        ("scheduler", "pbs"),
        ("account", 1234),
        ("username", 0.001),
        ("log_dir", True),
        ("qos", 1234),
        ("req_nodes", 1001),
        ("cores_per_worker", 1001),
        ("mem_per_worker", -1),
        ("max_workers_per_node", 1001),
        ("validation", "true"),
        ("walltime", "3"),
        ("scheduler_options", 10),
        ("scheduler_options", "#SABTCH -C cpu\n$LLLL -C"),
        ("retries", "2"),
        ("max_blocks", "1"),
        ("worker_init", 1),
        ("overrides", 0),
        ("cmd_timeout", -1),
        ("cpu_affinity", 1),
    ],
)
def test_hpc_slurm_error_config(self, field_name, invalid_value):
    """Overriding any single field with an invalid value must fail validation."""
    valid_config = {
        "scheduler": "slurm",
        "qos": "myqos",
        "account": "myaccount",
        "username": "myname",
        "req_nodes": 3,
        "scheduler_options": "#SBATCH -C cpu",
    }
    # Merge the bad value over the known-good baseline rather than
    # mutating the baseline in place.
    with pytest.raises(ValidationError):
        SlurmConfig.model_validate({**valid_config, field_name: invalid_value})
161
+
162
@pytest.mark.parametrize(
    "missing_config",
    [
        ["scheduler"],
        ["account"],
        ["username"],
        ["partition", "qos"],
    ],
)
def test_hpc_slurm_missing_required_config(self, missing_config, base_config):
    """Validation must fail when any required field is absent from the config.

    ``partition``/``qos`` are removed together since either one alone may
    satisfy the schema.
    """
    # Copy the shared fixture so this test does not mutate it, and use a
    # plain for-loop — a list comprehension executed only for its
    # ``pop`` side effects is an anti-pattern.
    slurm_cfg_dict = dict(base_config)
    for field in missing_config:
        slurm_cfg_dict.pop(field)
    with pytest.raises(ValidationError):
        SlurmConfig.model_validate(slurm_cfg_dict)
@@ -1,13 +1,23 @@
1
1
  import importlib.metadata
2
2
  import logging
3
3
  import sys
4
+ from datetime import timedelta
4
5
  from pathlib import Path
5
6
 
7
+ import platformdirs
6
8
  import pytest
9
+ import requests
7
10
  from attr import evolve
8
11
  from cattrs import IterableValidationError
9
12
 
10
- from climate_ref.config import DEFAULT_LOG_FORMAT, Config, PathConfig, transform_error
13
+ import climate_ref.config
14
+ from climate_ref.config import (
15
+ DEFAULT_LOG_FORMAT,
16
+ Config,
17
+ PathConfig,
18
+ _get_default_ignore_datasets_file,
19
+ transform_error,
20
+ )
11
21
  from climate_ref_core.exceptions import InvalidExecutorException
12
22
  from climate_ref_core.executor import Executor
13
23
 
@@ -141,6 +151,9 @@ filename = "sqlite://climate_ref.db"
141
151
  without_defaults = cfg.dump(defaults=False)
142
152
 
143
153
  assert without_defaults == {
154
+ "ignore_datasets_file": str(
155
+ platformdirs.user_cache_path("climate_ref") / "default_ignore_datasets.yaml"
156
+ ),
144
157
  "log_level": "INFO",
145
158
  "log_format": DEFAULT_LOG_FORMAT,
146
159
  "cmip6_parser": "complete",
@@ -149,6 +162,9 @@ filename = "sqlite://climate_ref.db"
149
162
  ],
150
163
  }
151
164
  assert with_defaults == {
165
+ "ignore_datasets_file": str(
166
+ platformdirs.user_cache_path("climate_ref") / "default_ignore_datasets.yaml"
167
+ ),
152
168
  "log_level": "INFO",
153
169
  "log_format": DEFAULT_LOG_FORMAT,
154
170
  "cmip6_parser": "complete",
@@ -245,3 +261,38 @@ def test_transform_error():
245
261
 
246
262
  err = IterableValidationError("Validation error", [ValueError("Test error"), KeyError()], Config)
247
263
  assert transform_error(err, "test") == ["invalid value @ test", "required field missing @ test"]
264
+
265
+
266
@pytest.mark.parametrize("status", ["fresh", "stale", "missing"])
def test_get_default_ignore_datasets_file(mocker, tmp_path, status):
    """A fresh cache file is kept; a stale or missing one is (re)downloaded."""
    # Redirect the cache directory and stub out the network call
    mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path)
    mocker.patch.object(
        climate_ref.config.requests,
        "get",
        return_value=mocker.MagicMock(status_code=200, content=b"downloaded"),
    )
    expected_path = tmp_path / "default_ignore_datasets.yaml"
    if status != "missing":
        expected_path.write_text("existing", encoding="utf-8")
    if status == "stale":
        # A negative max age makes any existing file count as stale
        mocker.patch.object(climate_ref.config, "DEFAULT_IGNORE_DATASETS_MAX_AGE", timedelta(seconds=-1))

    # Call via the module so the patched attributes above take effect
    path = climate_ref.config._get_default_ignore_datasets_file()

    # Reuse expected_path instead of rebuilding the same expression
    assert path == expected_path
    if status == "fresh":
        # Fresh cache: no download, pre-existing content retained
        assert path.read_text(encoding="utf-8") == "existing"
    else:
        # Stale or missing cache: content comes from the mocked download
        assert path.read_text(encoding="utf-8") == "downloaded"
287
+
288
+
289
+ def test_get_default_ignore_datasets_file_fail(mocker, tmp_path):
290
+ mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path)
291
+ result = mocker.MagicMock(status_code=404, content=b"{}")
292
+ result.raise_for_status.side_effect = requests.RequestException
293
+ mocker.patch.object(climate_ref.config.requests, "get", return_value=result)
294
+
295
+ path = _get_default_ignore_datasets_file()
296
+ assert path == tmp_path / "default_ignore_datasets.yaml"
297
+ assert path.parent.exists()
298
+ assert path.read_text(encoding="utf-8") == ""
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes