climate_ref-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- climate_ref/__init__.py +30 -0
- climate_ref/_config_helpers.py +214 -0
- climate_ref/alembic.ini +114 -0
- climate_ref/cli/__init__.py +138 -0
- climate_ref/cli/_utils.py +68 -0
- climate_ref/cli/config.py +28 -0
- climate_ref/cli/datasets.py +205 -0
- climate_ref/cli/executions.py +201 -0
- climate_ref/cli/providers.py +84 -0
- climate_ref/cli/solve.py +23 -0
- climate_ref/config.py +475 -0
- climate_ref/constants.py +8 -0
- climate_ref/database.py +223 -0
- climate_ref/dataset_registry/obs4ref_reference.txt +2 -0
- climate_ref/dataset_registry/sample_data.txt +60 -0
- climate_ref/datasets/__init__.py +40 -0
- climate_ref/datasets/base.py +214 -0
- climate_ref/datasets/cmip6.py +202 -0
- climate_ref/datasets/obs4mips.py +224 -0
- climate_ref/datasets/pmp_climatology.py +15 -0
- climate_ref/datasets/utils.py +16 -0
- climate_ref/executor/__init__.py +274 -0
- climate_ref/executor/local.py +89 -0
- climate_ref/migrations/README +22 -0
- climate_ref/migrations/env.py +139 -0
- climate_ref/migrations/script.py.mako +26 -0
- climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +292 -0
- climate_ref/models/__init__.py +33 -0
- climate_ref/models/base.py +42 -0
- climate_ref/models/dataset.py +206 -0
- climate_ref/models/diagnostic.py +61 -0
- climate_ref/models/execution.py +306 -0
- climate_ref/models/metric_value.py +195 -0
- climate_ref/models/provider.py +39 -0
- climate_ref/provider_registry.py +146 -0
- climate_ref/py.typed +0 -0
- climate_ref/solver.py +395 -0
- climate_ref/testing.py +109 -0
- climate_ref-0.5.0.dist-info/METADATA +97 -0
- climate_ref-0.5.0.dist-info/RECORD +44 -0
- climate_ref-0.5.0.dist-info/WHEEL +4 -0
- climate_ref-0.5.0.dist-info/entry_points.txt +2 -0
- climate_ref-0.5.0.dist-info/licenses/LICENCE +201 -0
- climate_ref-0.5.0.dist-info/licenses/NOTICE +3 -0
climate_ref/solver.py
ADDED
@@ -0,0 +1,395 @@
"""
Solver to determine which diagnostics need to be calculated

This module provides a solver to determine which diagnostics need to be calculated.
"""

import itertools
import pathlib
import typing
from collections.abc import Sequence

import pandas as pd
from attrs import define, frozen
from loguru import logger

from climate_ref.config import Config
from climate_ref.database import Database
from climate_ref.datasets import get_dataset_adapter
from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter
from climate_ref.datasets.obs4mips import Obs4MIPsDatasetAdapter
from climate_ref.datasets.pmp_climatology import PMPClimatologyDatasetAdapter
from climate_ref.models import Diagnostic as DiagnosticModel
from climate_ref.models import ExecutionGroup
from climate_ref.models import Provider as ProviderModel
from climate_ref.models.execution import Execution
from climate_ref.provider_registry import ProviderRegistry
from climate_ref_core.constraints import apply_constraint
from climate_ref_core.datasets import DatasetCollection, ExecutionDatasetCollection, SourceDatasetType
from climate_ref_core.diagnostics import DataRequirement, Diagnostic, ExecutionDefinition
from climate_ref_core.exceptions import InvalidDiagnosticException
from climate_ref_core.providers import DiagnosticProvider

SelectorKey = tuple[tuple[str, str], ...]
"""
Type describing the key used to identify a group of datasets

This is a tuple of tuples, where each inner tuple contains a metadata dimension and the value
that was used to group the datasets together.

This SelectorKey type must be hashable, as it is used as a key in a dictionary.
"""


@frozen
class DiagnosticExecution:
    """
    Class to hold information about the execution of a diagnostic

    This is a temporary class used by the solver to hold information about an execution that might
    be required.
    """

    provider: DiagnosticProvider
    diagnostic: Diagnostic
    datasets: ExecutionDatasetCollection

    @property
    def dataset_key(self) -> str:
        """
        Key used to uniquely identify the execution group

        This key is unique to an execution group and uses the unique set of metadata (selectors)
        that defines the group.
        This key combines the selectors from each source dataset type into a single key
        and should be stable if new datasets are added or removed.
        """
        key_values = []

        for source_type in SourceDatasetType.ordered():
            # Ensure the selector is sorted using the dimension names
            # This will ensure a stable key even if the groupby order changes
            if source_type not in self.datasets:
                continue

            selector = self.datasets[source_type].selector
            selector_sorted = sorted(selector, key=lambda item: item[0])

            source_key = f"{source_type.value}_" + "_".join(value for _, value in selector_sorted)
            key_values.append(source_key)

        return "__".join(key_values)

    @property
    def selectors(self) -> dict[str, SelectorKey]:
        """
        Collection of selectors used to identify the datasets

        These are the key, value pairs that were selected during the initial group-by,
        for each data requirement.
        """
        # The "value" of SourceType is used here so this can be stored in the db
        s = {}
        for source_type in SourceDatasetType.ordered():
            if source_type not in self.datasets:
                continue
            s[source_type.value] = self.datasets[source_type].selector
        return s

    def build_execution_definition(self, output_root: pathlib.Path) -> ExecutionDefinition:
        """
        Build the execution definition for the current diagnostic execution
        """
        # Ensure that the output root is always an absolute path
        output_root = output_root.resolve()

        # This is the desired path relative to the output directory
        fragment = pathlib.Path() / self.provider.slug / self.diagnostic.slug / self.datasets.hash

        return ExecutionDefinition(
            root_directory=output_root,
            output_directory=output_root / fragment,
            key=self.dataset_key,
            datasets=self.datasets,
        )


def extract_covered_datasets(
    data_catalog: pd.DataFrame, requirement: DataRequirement
) -> dict[SelectorKey, pd.DataFrame]:
    """
    Determine the different diagnostic executions that should be performed with the current data catalog
    """
    if len(data_catalog) == 0:
        logger.error(f"No datasets found in the data catalog: {requirement.source_type.value}")
        return {}

    subset = requirement.apply_filters(data_catalog)

    if len(subset) == 0:
        logger.debug(f"No datasets found for requirement {requirement}")
        return {}

    if requirement.group_by is None:
        # Use a single group
        groups = [((), subset)]
    else:
        groups = list(subset.groupby(list(requirement.group_by)))

    results = {}

    for name, group in groups:
        if requirement.group_by is None:
            assert len(groups) == 1  # noqa: S101
            group_keys: SelectorKey = ()
        else:
            group_keys = tuple(zip(requirement.group_by, name))
        constrained_group = _process_group_constraints(data_catalog, group, requirement)

        if constrained_group is not None:
            results[group_keys] = constrained_group

    return results


def _process_group_constraints(
    data_catalog: pd.DataFrame, group: pd.DataFrame, requirement: DataRequirement
) -> pd.DataFrame | None:
    for constraint in requirement.constraints or []:
        constrained_group = apply_constraint(group, constraint, data_catalog)
        if constrained_group is None:
            return None

        group = constrained_group
    return group


def solve_executions(
    data_catalog: dict[SourceDatasetType, pd.DataFrame], diagnostic: Diagnostic, provider: DiagnosticProvider
) -> typing.Generator["DiagnosticExecution", None, None]:
    """
    Calculate the diagnostic executions that need to be performed for a given diagnostic

    Parameters
    ----------
    data_catalog
        Data catalogs for each source dataset type
    diagnostic
        Diagnostic of interest
    provider
        Provider of the diagnostic

    Returns
    -------
    :
        A generator that yields the diagnostic executions that need to be performed

    """
    if not diagnostic.data_requirements:
        raise ValueError(f"Diagnostic {diagnostic.slug!r} has no data requirements")

    first_item = next(iter(diagnostic.data_requirements))

    if isinstance(first_item, DataRequirement):
        # We have a single collection of data requirements
        yield from _solve_from_data_requirements(
            data_catalog,
            diagnostic,
            typing.cast(Sequence[DataRequirement], diagnostic.data_requirements),
            provider,
        )
    elif isinstance(first_item, Sequence):
        # We have a sequence of collections of data requirements
        for requirement_collection in diagnostic.data_requirements:
            if not isinstance(requirement_collection, Sequence):
                raise TypeError(f"Expected a sequence of DataRequirement, got {type(requirement_collection)}")
            yield from _solve_from_data_requirements(
                data_catalog, diagnostic, requirement_collection, provider
            )
    else:
        raise TypeError(f"Expected a DataRequirement, got {type(first_item)}")


def _solve_from_data_requirements(
    data_catalog: dict[SourceDatasetType, pd.DataFrame],
    diagnostic: Diagnostic,
    data_requirements: Sequence[DataRequirement],
    provider: DiagnosticProvider,
) -> typing.Generator["DiagnosticExecution", None, None]:
    # Collect up the different data groups that can be used to calculate the diagnostic
    dataset_groups = {}

    for requirement in data_requirements:
        if not isinstance(requirement, DataRequirement):
            raise TypeError(f"Expected a DataRequirement, got {type(requirement)}")
        if requirement.source_type not in data_catalog:
            raise InvalidDiagnosticException(
                diagnostic, f"No data catalog for source type {requirement.source_type}"
            )

        dataset_groups[requirement.source_type] = extract_covered_datasets(
            data_catalog[requirement.source_type], requirement
        )

    # Calculate the product across each of the source types
    for items in itertools.product(*dataset_groups.values()):
        yield DiagnosticExecution(
            provider=provider,
            diagnostic=diagnostic,
            datasets=ExecutionDatasetCollection(
                {
                    key: DatasetCollection(
                        datasets=dataset_groups[key][dataset_group_key],
                        slug_column=get_dataset_adapter(key.value).slug_column,
                        selector=dataset_group_key,
                    )
                    for key, dataset_group_key in zip(dataset_groups.keys(), items)
                }
            ),
        )


@define
class ExecutionSolver:
    """
    A solver to determine which executions need to be calculated.
    """

    provider_registry: ProviderRegistry
    data_catalog: dict[SourceDatasetType, pd.DataFrame]

    @staticmethod
    def build_from_db(config: Config, db: Database) -> "ExecutionSolver":
        """
        Initialise the solver using information from the database

        Parameters
        ----------
        db
            Database instance

        Returns
        -------
        :
            A new ExecutionSolver instance
        """
        return ExecutionSolver(
            provider_registry=ProviderRegistry.build_from_config(config, db),
            data_catalog={
                SourceDatasetType.CMIP6: CMIP6DatasetAdapter().load_catalog(db),
                SourceDatasetType.obs4MIPs: Obs4MIPsDatasetAdapter().load_catalog(db),
                SourceDatasetType.PMPClimatology: PMPClimatologyDatasetAdapter().load_catalog(db),
            },
        )

    def solve(self) -> typing.Generator[DiagnosticExecution, None, None]:
        """
        Solve which executions need to be calculated for a dataset

        The solving scheme is iterative:
        for each iteration we find all diagnostics that can be solved and calculate them.
        After each iteration we check if there are any more diagnostics to solve.

        Yields
        ------
        DiagnosticExecution
            A class containing the information related to the execution of a diagnostic
        """
        for provider in self.provider_registry.providers:
            for diagnostic in provider.diagnostics():
                yield from solve_executions(self.data_catalog, diagnostic, provider)


def solve_required_executions(
    db: Database,
    dry_run: bool = False,
    solver: ExecutionSolver | None = None,
    config: Config | None = None,
    timeout: int = 60,
) -> None:
    """
    Solve for executions that require recalculation

    This may trigger a number of additional calculations depending on what data has been ingested
    since the last solve.

    Raises
    ------
    TimeoutError
        If the execution isn't completed within the specified timeout
    """
    if config is None:
        config = Config.default()
    if solver is None:
        solver = ExecutionSolver.build_from_db(config, db)

    logger.info("Solving for diagnostics that require recalculation...")

    executor = config.executor.build(config, db)

    for potential_execution in solver.solve():
        # The diagnostic output is first written to the scratch directory
        definition = potential_execution.build_execution_definition(output_root=config.paths.scratch)

        logger.debug(
            f"Identified candidate execution {definition.key} "
            f"for {potential_execution.diagnostic.full_slug()}"
        )

        if dry_run:
            continue

        # Use a transaction to make sure that the models
        # are created correctly before potentially executing out of process
        with db.session.begin(nested=True):
            diagnostic = (
                db.session.query(DiagnosticModel)
                .join(DiagnosticModel.provider)
                .filter(
                    ProviderModel.slug == potential_execution.provider.slug,
                    ProviderModel.version == potential_execution.provider.version,
                    DiagnosticModel.slug == potential_execution.diagnostic.slug,
                )
                .one()
            )
            execution_group, created = db.get_or_create(
                ExecutionGroup,
                key=definition.key,
                diagnostic_id=diagnostic.id,
                defaults={
                    "selectors": potential_execution.selectors,
                    "dirty": True,
                },
            )

            if created:
                logger.info(
                    f"Created new execution group: "
                    f"{definition.key!r} for {potential_execution.diagnostic.full_slug()}"
                )
                db.session.flush()

            if execution_group.should_run(definition.datasets.hash):
                logger.info(
                    f"Running new execution for execution group: "
                    f"{definition.key!r} for {potential_execution.diagnostic.full_slug()}"
                )
                execution = Execution(
                    execution_group=execution_group,
                    dataset_hash=definition.datasets.hash,
                    output_fragment=str(definition.output_fragment()),
                )
                db.session.add(execution)
                db.session.flush()

                # Add links to the datasets used in the execution
                execution.register_datasets(db, definition.datasets)

                executor.run(
                    provider=potential_execution.provider,
                    diagnostic=potential_execution.diagnostic,
                    definition=definition,
                    execution=execution,
                )
    if timeout > 0:
        executor.join(timeout=timeout)
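`solve_required_executions` is the high-level entry point of this module (presumably what the `ref solve` CLI command wraps). A minimal sketch of driving it directly from Python, assuming a default configuration and an initialised database; only `Config.default()`, `Database.from_config()` and `solve_required_executions()` are taken from the code in this package, the dry-run-first workflow is an assumption:

```python
# Minimal sketch, not part of the package.
from climate_ref.config import Config
from climate_ref.database import Database
from climate_ref.solver import solve_required_executions

config = Config.default()
db = Database.from_config(config)

# With dry_run=True the solver only logs candidate executions; no
# ExecutionGroup/Execution rows are created and no diagnostics are run.
solve_required_executions(db, dry_run=True, config=config)

# A real run submits work to the configured executor and waits up to
# `timeout` seconds for it to finish.
solve_required_executions(db, config=config, timeout=600)
```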
climate_ref/testing.py
ADDED
@@ -0,0 +1,109 @@
"""
Testing utilities
"""

import shutil
from pathlib import Path

from loguru import logger

from climate_ref.config import Config
from climate_ref.database import Database
from climate_ref.executor import handle_execution_result
from climate_ref.models import Execution
from climate_ref_core.dataset_registry import dataset_registry_manager, fetch_all_files
from climate_ref_core.diagnostics import Diagnostic, ExecutionResult
from climate_ref_core.pycmec.metric import CMECMetric
from climate_ref_core.pycmec.output import CMECOutput


def _determine_test_directory() -> Path | None:
    expected = Path(__file__).parents[4] / "tests" / "test-data"

    if not expected.exists():  # pragma: no cover
        return None
    return expected


TEST_DATA_DIR = _determine_test_directory()
SAMPLE_DATA_VERSION = "v0.5.0"


def fetch_sample_data(force_cleanup: bool = False, symlink: bool = False) -> None:
    """
    Fetch the sample data for the given version.

    The sample data is produced in the [Climate-REF/ref-sample-data](https://github.com/Climate-REF/ref-sample-data)
    repository.
    This repository contains decimated versions of key datasets used by the diagnostics packages.
    Decimating these data greatly reduces the data volumes needed to run the test-suite.

    Parameters
    ----------
    force_cleanup
        If True, remove any existing files
    symlink
        If True, symlink in the data otherwise copy the files

        The symlink approach is faster, but will fail when running with a non-local executor
        because the symlinks can't be followed.
    """

    if TEST_DATA_DIR is None:  # pragma: no cover
        logger.warning("Test data directory not found, skipping sample data fetch")
        return

    sample_data_registry = dataset_registry_manager["sample-data"]

    output_dir = TEST_DATA_DIR / "sample-data"
    version_file = output_dir / "version.txt"
    existing_version = None

    if output_dir.exists():  # pragma: no branch
        if version_file.exists():  # pragma: no branch
            with open(version_file) as fh:
                existing_version = fh.read().strip()

        if force_cleanup or existing_version != SAMPLE_DATA_VERSION:  # pragma: no branch
            logger.warning("Removing existing sample data")
            shutil.rmtree(output_dir)

    fetch_all_files(sample_data_registry, "sample", output_dir, symlink)

    # Write out the current sample data version to mark the copying as complete
    with open(output_dir / "version.txt", "w") as fh:
        fh.write(SAMPLE_DATA_VERSION)


def validate_result(diagnostic: Diagnostic, config: Config, result: ExecutionResult) -> None:
    """
    Asserts the correctness of the result of a diagnostic execution

    This should only be used by the test suite as it will create a fake
    database entry for the diagnostic execution result.
    """
    # Add a fake item in the Database
    database = Database.from_config(config)
    execution = Execution(
        execution_group_id=1,
        dataset_hash=result.definition.datasets.hash,
        output_fragment=str(result.definition.output_fragment()),
    )
    database.session.add(execution)
    database.session.flush()

    assert result.successful

    # Validate bundles
    metric_bundle = CMECMetric.load_from_json(result.to_output_path(result.metric_bundle_filename))
    assert diagnostic.facets == tuple(metric_bundle.DIMENSIONS.root["json_structure"]), (
        metric_bundle.DIMENSIONS.root["json_structure"]
    )
    CMECOutput.load_from_json(result.to_output_path(result.output_bundle_filename))

    # Create a fake log file if one doesn't exist
    if not result.to_output_path("out.log").exists():
        result.to_output_path("out.log").touch()

    # This checks if the bundles are valid
    handle_execution_result(config, database=database, execution=execution, result=result)
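These helpers target the project's own test suite. A minimal sketch of wiring `fetch_sample_data` into a pytest session fixture, assuming the `tests/test-data` checkout layout that `_determine_test_directory` expects; the fixture itself is illustrative and not part of the package:

```python
# Minimal sketch (assumed usage, not part of the package).
import pytest

from climate_ref.testing import TEST_DATA_DIR, fetch_sample_data


@pytest.fixture(scope="session")
def sample_data_dir():
    # Copy rather than symlink so that non-local executors can follow the paths.
    fetch_sample_data(force_cleanup=False, symlink=False)
    if TEST_DATA_DIR is None:
        pytest.skip("tests/test-data directory not found")
    return TEST_DATA_DIR / "sample-data"
```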
climate_ref-0.5.0.dist-info/METADATA
ADDED
@@ -0,0 +1,97 @@
Metadata-Version: 2.4
Name: climate-ref
Version: 0.5.0
Summary: Application which runs the CMIP Rapid Evaluation Framework
Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
License: Apache-2.0
License-File: LICENCE
License-File: NOTICE
Classifier: Development Status :: 2 - Pre-Alpha
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering
Requires-Python: >=3.11
Requires-Dist: alembic>=1.13.3
Requires-Dist: attrs>=24.2.0
Requires-Dist: cattrs>=24.1.2
Requires-Dist: climate-ref-core
Requires-Dist: ecgtools>=2024.7.31
Requires-Dist: environs>=11.0.0
Requires-Dist: loguru>=0.7.2
Requires-Dist: platformdirs>=4.3.6
Requires-Dist: setuptools>=75.8.0
Requires-Dist: sqlalchemy>=2.0.36
Requires-Dist: tomlkit>=0.13.2
Requires-Dist: typer>=0.12.5
Provides-Extra: celery
Requires-Dist: climate-ref-celery>=0.5.0; extra == 'celery'
Provides-Extra: metrics
Requires-Dist: climate-ref-esmvaltool>=0.5.0; extra == 'metrics'
Requires-Dist: climate-ref-ilamb>=0.5.0; extra == 'metrics'
Requires-Dist: climate-ref-pmp>=0.5.0; extra == 'metrics'
Provides-Extra: postgres
Requires-Dist: psycopg2-binary>=2.9.2; extra == 'postgres'
Description-Content-Type: text/markdown

# Climate REF (Rapid Evaluation Framework)

[PyPI](https://badge.fury.io/py/climate-ref)
[Documentation](https://climate-ref.readthedocs.io/en/latest/?badge=latest)
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
[Python](https://www.python.org/downloads/)

**Status**: This project is in active development. We expect to be ready for beta releases in Q2 2025.

The Rapid Evaluation Framework (REF) is a set of Python packages that provide the ability to manage the execution of calculations against climate datasets.
The aim is to be able to evaluate climate data against a set of reference data in near-real time as datasets are published,
and to update any produced data and figures as new datasets become available.
This is somewhat analogous to a CI/CD pipeline for climate data.

## Installation

```bash
pip install climate-ref
```

If you want to use the diagnostic providers for the Assessment Fast Track, you can install them with:

```bash
pip install climate-ref[metrics]
```

## Quick Start

```bash
# Ingest some observation datasets
ref datasets fetch-data --registry obs4ref --output-dir datasets/obs4ref
ref datasets fetch-data --registry sample-data --output-dir datasets/sample-data

# Run metrics against your climate data
ref solve
```

## Features

- Real-time evaluation of climate datasets
- Support for multiple metrics providers (PMP, ILAMB, ESMValTool)
- Distributed processing capabilities
- Extensible architecture for adding new metrics providers
- Command-line interface for easy interaction

## Documentation

For detailed documentation, please visit [https://climate-ref.readthedocs.io/](https://climate-ref.readthedocs.io/)

## Contributing

REF is a community project, and we welcome contributions from anyone. Please see our [Contributing Guide](https://climate-ref.readthedocs.io/en/latest/contributing/) for more information.

## License

This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
climate_ref-0.5.0.dist-info/RECORD
ADDED
@@ -0,0 +1,44 @@
climate_ref/__init__.py,sha256=OJl5EnjLyEoCQpa0zQ8edV8EcU2YxBJ0xjermIlm9Bw,820
climate_ref/_config_helpers.py,sha256=-atI5FX7SukhLE_jz_rL-EHQ7s0YYqKu3dSFYWxSyMU,6632
climate_ref/alembic.ini,sha256=WRvbwSIFuZ7hWNMnR2-yHPJAwYUnwhvRYBzkJhtpGdg,3535
climate_ref/config.py,sha256=QW1HOLajC2Gc5xZnrGQ8YLAver3BAlVBrfl1kVd_IyM,15072
climate_ref/constants.py,sha256=rFk3XxNuP0lkzTvUneIhNLq16uadXsT45aUFIlSiBmg,111
climate_ref/database.py,sha256=RCffNHbJcxxukN6PIOXBTW9TALE2rRsxU0chJHxyNK4,7257
climate_ref/provider_registry.py,sha256=P35H4VFcsxJ8-Ly4Czwi_gelDU_nF5RiqSC-iBcx_Ws,4116
climate_ref/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
climate_ref/solver.py,sha256=Hpb_3g-hYWopuBYyqaEtOgfuLOHfZpWbwP1rco802uE,14310
climate_ref/testing.py,sha256=rUdtU8a4p2OV35IO87veW0mEB9C4Bqwe7BBQU6BILhs,3889
climate_ref/cli/__init__.py,sha256=RSzaFkgwn1qWRBVbWlDKtNrIxLvQ2T7IWDBIqptpjjU,3658
climate_ref/cli/_utils.py,sha256=6bIb8zEVvzXyKpv8MG58T-T2L2jH-G8WNrOOGpz3uCw,1918
climate_ref/cli/config.py,sha256=8I6CLdqKgTu6yaASy-qG0T839Fc0lDZtLSZ6YCc4wOY,520
climate_ref/cli/datasets.py,sha256=SLl88S3BxKPRbHy9OJ1ymhMnxDmkadFO_BZTIeLR0k8,7367
climate_ref/cli/executions.py,sha256=6cnMxPK4ZydscUw_Mk9RMISNjP2Yr98BgsOsei8fQ7w,6799
climate_ref/cli/providers.py,sha256=XVZQsZoEqiCBvgSmp6cNf0mCTxeq_Ycoc6DwVxWDYKg,2521
climate_ref/cli/solve.py,sha256=D6rAivfm_yl1TTey_zc4KKwZ96LGGF8N1wHjcJ_0XpE,703
climate_ref/dataset_registry/obs4ref_reference.txt,sha256=1NodZd3tOS9Z1Afpb_Oq4obp4OGAFDSAwEl3FssPwAQ,251
climate_ref/dataset_registry/sample_data.txt,sha256=aKl9tfO4vknZ5X2mmdyxKOv-nyWhkPDXnpDoNLLTzE8,11892
climate_ref/datasets/__init__.py,sha256=PV3u5ZmhyfcHbKqySgwVA8m4-naZgxzydLXSBqdTGLM,1171
climate_ref/datasets/base.py,sha256=XplxCu4bfFmNHp2q8tHT26lB0RHv5swK0QqfUmuMO-c,8154
climate_ref/datasets/cmip6.py,sha256=Dhq97ow8OmTshDCaL7vfrwn83Nfi6SY8uxJHeY4ZDHk,6083
climate_ref/datasets/obs4mips.py,sha256=PQhI3QKlYA9L2d_MpnlcVrUn4irMG7Iu-II8l1ncjUs,7032
climate_ref/datasets/pmp_climatology.py,sha256=goHDc_3B2Wdiy_hmpERNvWDdDYZACPOyFDt3Du6nGc0,534
climate_ref/datasets/utils.py,sha256=iLJO7h4G3DWsRe9hIC4qkIyi5_zIW1ZMw-FDASLujtM,359
climate_ref/executor/__init__.py,sha256=vUkE5Izfietvc57gA8LTdaD5IErKVebcE6qO7M7sCRo,9286
climate_ref/executor/local.py,sha256=3icom02FCHiN0tIpsXR9tvn8-cQrUyoY-LlbHapbTx4,2920
climate_ref/migrations/README,sha256=xM5osYbyEbEFA2eh5kwary_oh-5VFWtDubA-vgWwvlE,935
climate_ref/migrations/env.py,sha256=b8om-LvFhVo_2BgaRsR8LcPQ-YcevjWikaWE6uhScAs,4213
climate_ref/migrations/script.py.mako,sha256=MEqL-2qATlST9TAOeYgscMn1uy6HUS9NFvDgl93dMj8,635
climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py,sha256=349kbd58NdFHqqUAPDX1kR9RUOcxT2zXh9v9yg9-Je8,15533
climate_ref/models/__init__.py,sha256=dFyqfhTffZz4m06xD4SyvRL9kIBRyVYetHwOxFGy4VM,713
climate_ref/models/base.py,sha256=cMjNpGNU7pxRi9A5KXEmQIA9pvQDwqGCwo539yndpGY,1199
climate_ref/models/dataset.py,sha256=Rpwrx0HqOJBHs4Sb4n6B0In__Uo0PqXSZKvZR-juGCg,7491
climate_ref/models/diagnostic.py,sha256=YB6xzbEXdpz2j-Ddf19RV8mAiWBrkmtRmiAEUV3tl4Q,1762
climate_ref/models/execution.py,sha256=lRCpaKLSR7rZbuoL94GW76tm9wLMsSDoIOA7bIa6xgY,9848
climate_ref/models/metric_value.py,sha256=Sfjem65ih9g6WDpjGsiOphSjhYQ1ZAYUPZmsKyb_psU,6452
climate_ref/models/provider.py,sha256=RAE2qAAxwObu-72CdK4kt5ACMmKYEn07WJm7DU9hF28,990
climate_ref-0.5.0.dist-info/METADATA,sha256=pTjBsQveKvV8KGgvD9fy8LoxQ5CS-1ruBoFM4ReeLvY,4028
climate_ref-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
climate_ref-0.5.0.dist-info/entry_points.txt,sha256=IaggEJlDIhoYWXdXJafacWbWtCcoEqUKceP1qD7_7vU,44
climate_ref-0.5.0.dist-info/licenses/LICENCE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
climate_ref-0.5.0.dist-info/licenses/NOTICE,sha256=4qTlax9aX2-mswYJuVrLqJ9jK1IkN5kSBqfVvYLF3Ws,128
climate_ref-0.5.0.dist-info/RECORD,,