climate-ref 0.6.3__tar.gz → 0.6.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {climate_ref-0.6.3 → climate_ref-0.6.5}/PKG-INFO +1 -1
- {climate_ref-0.6.3 → climate_ref-0.6.5}/pyproject.toml +1 -1
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/config.py +19 -7
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/base.py +28 -2
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/cmip6.py +54 -100
- climate_ref-0.6.5/src/climate_ref/datasets/cmip6_parsers.py +189 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/obs4mips.py +14 -3
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/executor/hpc.py +49 -18
- climate_ref-0.6.5/src/climate_ref/executor/pbs_scheduler.py +152 -0
- climate_ref-0.6.5/src/climate_ref/migrations/versions/2025-07-03T1505_795c1e6cf496_drop_unique_requirement_on_slug.py +34 -0
- climate_ref-0.6.5/src/climate_ref/migrations/versions/2025-07-20T1521_94beace57a9c_cmip6_finalised.py +57 -0
- climate_ref-0.6.5/src/climate_ref/migrations/versions/2025-08-05T0327_a1b2c3d4e5f6_finalised_on_base_dataset.py +57 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/models/dataset.py +23 -15
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/models/diagnostic.py +1 -1
- climate_ref-0.6.5/tests/unit/datasets/test_cmip6/cmip6_catalog_db_complete.yml +2565 -0
- climate_ref-0.6.5/tests/unit/datasets/test_cmip6/cmip6_catalog_db_drs.yml +2565 -0
- climate_ref-0.6.3/tests/unit/datasets/test_cmip6/cmip6_catalog_local.yml → climate_ref-0.6.5/tests/unit/datasets/test_cmip6/cmip6_catalog_local_complete.yml +71 -0
- climate_ref-0.6.5/tests/unit/datasets/test_cmip6/cmip6_catalog_local_drs.yml +2627 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_cmip6.py +46 -16
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_db.yml +36 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_obs4mips/obs4mips_catalog_local.yml +36 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_pmp_climatology/pmp_catalog_local.yml +36 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/test_config.py +17 -12
- climate_ref-0.6.5/tests/unit/test_pbssmartprovider.py +69 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/.gitignore +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/Dockerfile +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/LICENCE +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/NOTICE +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/README.md +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/conftest.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/__init__.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/_config_helpers.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/alembic.ini +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/cli/__init__.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/cli/_utils.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/cli/config.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/cli/datasets.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/cli/executions.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/cli/providers.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/cli/solve.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/constants.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/database.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/dataset_registry/obs4ref_reference.txt +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/dataset_registry/sample_data.txt +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/__init__.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/pmp_climatology.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/datasets/utils.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/executor/__init__.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/executor/local.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/executor/result_handling.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/executor/synchronous.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/migrations/README +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/migrations/env.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/migrations/script.py.mako +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/migrations/versions/2025-05-09T2032_03dbb4998e49_series_metric_value.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/models/__init__.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/models/base.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/models/execution.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/models/metric_value.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/models/provider.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/provider_registry.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/py.typed +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/slurm.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/solver.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/src/climate_ref/testing.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/cli/test_config.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/cli/test_datasets.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/cli/test_executions/test_inspect.txt +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/cli/test_executions.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/cli/test_providers.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/cli/test_root.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/cli/test_solve.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/conftest.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_cmip6/cmip6_catalog_db.yml +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_datasets.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_obs4mips.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_pmp_climatology.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/datasets/test_utils.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/executor/test_hpc_executor.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/executor/test_local_executor.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/executor/test_result_handling.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/executor/test_synchronous_executor.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/models/test_metric_execution.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/models/test_metric_value.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/test_database.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/test_provider_registry.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/test_slurm.py +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/test_solver/test_solve_metrics.yml +0 -0
- {climate_ref-0.6.3 → climate_ref-0.6.5}/tests/unit/test_solver.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: climate-ref
|
|
3
|
-
Version: 0.6.3
|
|
3
|
+
Version: 0.6.5
|
|
4
4
|
Summary: Application which runs the CMIP Rapid Evaluation Framework
|
|
5
5
|
Author-email: Jared Lewis <jared.lewis@climate-resource.com>, Mika Pflueger <mika.pflueger@climate-resource.com>, Bouwe Andela <b.andela@esciencecenter.nl>, Jiwoo Lee <lee1043@llnl.gov>, Min Xu <xum1@ornl.gov>, Nathan Collier <collierno@ornl.gov>, Dora Hegedus <dora.hegedus@stfc.ac.uk>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -17,7 +17,7 @@ which always take precedence over any other configuration values.
|
|
|
17
17
|
import importlib.resources
|
|
18
18
|
import os
|
|
19
19
|
from pathlib import Path
|
|
20
|
-
from typing import TYPE_CHECKING, Any
|
|
20
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
21
21
|
|
|
22
22
|
import tomlkit
|
|
23
23
|
from attr import Factory
|
|
@@ -215,17 +215,17 @@ class DiagnosticProviderConfig:
|
|
|
215
215
|
|
|
216
216
|
```toml
|
|
217
217
|
[[diagnostic_providers]]
|
|
218
|
-
provider = "climate_ref_esmvaltool.provider"
|
|
218
|
+
provider = "climate_ref_esmvaltool:provider"
|
|
219
219
|
|
|
220
220
|
[diagnostic_providers.config]
|
|
221
221
|
|
|
222
222
|
[[diagnostic_providers]]
|
|
223
|
-
provider = "climate_ref_ilamb.provider"
|
|
223
|
+
provider = "climate_ref_ilamb:provider"
|
|
224
224
|
|
|
225
225
|
[diagnostic_providers.config]
|
|
226
226
|
|
|
227
227
|
[[diagnostic_providers]]
|
|
228
|
-
provider = "climate_ref_pmp.provider"
|
|
228
|
+
provider = "climate_ref_pmp:provider"
|
|
229
229
|
|
|
230
230
|
[diagnostic_providers.config]
|
|
231
231
|
```
|
|
@@ -311,10 +311,12 @@ def default_providers() -> list[DiagnosticProviderConfig]:
|
|
|
311
311
|
if env_providers:
|
|
312
312
|
return [DiagnosticProviderConfig(provider=provider) for provider in env_providers]
|
|
313
313
|
|
|
314
|
+
# Refer to https://setuptools.pypa.io/en/latest/userguide/entry_point.html#entry-points-for-plugins
|
|
315
|
+
# and https://packaging.python.org/en/latest/specifications/entry-points/
|
|
316
|
+
# to learn more about entry points.
|
|
314
317
|
return [
|
|
315
|
-
DiagnosticProviderConfig(provider=
|
|
316
|
-
|
|
317
|
-
DiagnosticProviderConfig(provider="climate_ref_pmp.provider", config={}),
|
|
318
|
+
DiagnosticProviderConfig(provider=entry_point.value, config={})
|
|
319
|
+
for entry_point in importlib.metadata.entry_points(group="climate-ref.providers")
|
|
318
320
|
]
|
|
319
321
|
|
|
320
322
|
|
|
@@ -352,6 +354,16 @@ class Config:
|
|
|
352
354
|
[loguru documentation](https://loguru.readthedocs.io/en/stable/api/logger.html#module-loguru._logger).
|
|
353
355
|
"""
|
|
354
356
|
|
|
357
|
+
cmip6_parser: Literal["drs", "complete"] = env_field("CMIP6_PARSER", default="complete")
|
|
358
|
+
"""
|
|
359
|
+
Parser to use for CMIP6 datasets
|
|
360
|
+
|
|
361
|
+
This can be either `drs` or `complete`.
|
|
362
|
+
|
|
363
|
+
- `drs`: Use the DRS parser, which parses the dataset based on the DRS naming conventions.
|
|
364
|
+
- `complete`: Use the complete parser, which parses the dataset based on all available metadata.
|
|
365
|
+
"""
|
|
366
|
+
|
|
355
367
|
paths: PathConfig = Factory(PathConfig) # noqa
|
|
356
368
|
db: DbConfig = Factory(DbConfig) # noqa
|
|
357
369
|
executor: ExecutorConfig = Factory(ExecutorConfig) # noqa
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from typing import Protocol, cast
|
|
2
|
+
from typing import Any, Protocol, cast
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
from loguru import logger
|
|
@@ -35,6 +35,31 @@ def _log_duplicate_metadata(
|
|
|
35
35
|
)
|
|
36
36
|
|
|
37
37
|
|
|
38
|
+
class DatasetParsingFunction(Protocol):
|
|
39
|
+
"""
|
|
40
|
+
Protocol for a function that parses metadata from a file or directory
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
def __call__(self, file: str, **kwargs: Any) -> dict[str, Any]:
|
|
44
|
+
"""
|
|
45
|
+
Parse a file or directory and return metadata for the dataset
|
|
46
|
+
|
|
47
|
+
Parameters
|
|
48
|
+
----------
|
|
49
|
+
file
|
|
50
|
+
File or directory to parse
|
|
51
|
+
|
|
52
|
+
kwargs
|
|
53
|
+
Additional keyword arguments to pass to the parsing function.
|
|
54
|
+
|
|
55
|
+
Returns
|
|
56
|
+
-------
|
|
57
|
+
:
|
|
58
|
+
Data catalog containing the metadata for the dataset
|
|
59
|
+
"""
|
|
60
|
+
...
|
|
61
|
+
|
|
62
|
+
|
|
38
63
|
class DatasetAdapter(Protocol):
|
|
39
64
|
"""
|
|
40
65
|
An adapter to provide a common interface for different dataset types
|
|
@@ -173,7 +198,7 @@ class DatasetAdapter(Protocol):
|
|
|
173
198
|
slug = unique_slugs[0]
|
|
174
199
|
|
|
175
200
|
dataset_metadata = data_catalog_dataset[list(self.dataset_specific_metadata)].iloc[0].to_dict()
|
|
176
|
-
dataset, created = db.get_or_create(DatasetModel, slug=slug
|
|
201
|
+
dataset, created = db.get_or_create(DatasetModel, defaults=dataset_metadata, slug=slug)
|
|
177
202
|
if not created:
|
|
178
203
|
logger.warning(f"{dataset} already exists in the database. Skipping")
|
|
179
204
|
return None
|
|
@@ -212,6 +237,7 @@ class DatasetAdapter(Protocol):
|
|
|
212
237
|
{
|
|
213
238
|
**{k: getattr(file, k) for k in self.file_specific_metadata},
|
|
214
239
|
**{k: getattr(file.dataset, k) for k in self.dataset_specific_metadata},
|
|
240
|
+
"finalised": file.dataset.finalised,
|
|
215
241
|
}
|
|
216
242
|
for file in result
|
|
217
243
|
],
|
|
@@ -1,18 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import traceback
|
|
4
3
|
import warnings
|
|
5
4
|
from datetime import datetime
|
|
6
5
|
from pathlib import Path
|
|
7
6
|
from typing import Any
|
|
8
7
|
|
|
9
8
|
import pandas as pd
|
|
10
|
-
import xarray as xr
|
|
11
9
|
from ecgtools import Builder
|
|
12
|
-
from ecgtools.parsers.utilities import extract_attr_with_regex # type: ignore
|
|
13
10
|
from loguru import logger
|
|
14
11
|
|
|
15
|
-
from climate_ref.
|
|
12
|
+
from climate_ref.config import Config
|
|
13
|
+
from climate_ref.datasets.base import DatasetAdapter, DatasetParsingFunction
|
|
14
|
+
from climate_ref.datasets.cmip6_parsers import parse_cmip6_complete, parse_cmip6_drs
|
|
16
15
|
from climate_ref.models.dataset import CMIP6Dataset
|
|
17
16
|
|
|
18
17
|
|
|
@@ -22,16 +21,19 @@ def _parse_datetime(dt_str: pd.Series[str]) -> pd.Series[datetime | Any]:
|
|
|
22
21
|
"""
|
|
23
22
|
|
|
24
23
|
def _inner(date_string: str | None) -> datetime | None:
|
|
25
|
-
if not date_string:
|
|
24
|
+
if not date_string or pd.isnull(date_string):
|
|
26
25
|
return None
|
|
27
26
|
|
|
28
27
|
# Try to parse the date string with and without milliseconds
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
28
|
+
for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"):
|
|
29
|
+
try:
|
|
30
|
+
return datetime.strptime(date_string, fmt)
|
|
31
|
+
except ValueError:
|
|
32
|
+
continue
|
|
33
33
|
|
|
34
|
-
return
|
|
34
|
+
# If all parsing attempts fail, log an error and return None
|
|
35
|
+
logger.error(f"Failed to parse date string: {date_string}")
|
|
36
|
+
return None
|
|
35
37
|
|
|
36
38
|
return pd.Series(
|
|
37
39
|
[_inner(dt) for dt in dt_str],
|
|
@@ -44,15 +46,16 @@ def _apply_fixes(data_catalog: pd.DataFrame) -> pd.DataFrame:
|
|
|
44
46
|
def _fix_parent_variant_label(group: pd.DataFrame) -> pd.DataFrame:
|
|
45
47
|
if group["parent_variant_label"].nunique() == 1:
|
|
46
48
|
return group
|
|
47
|
-
group["parent_variant_label"] = group["
|
|
49
|
+
group["parent_variant_label"] = group["parent_variant_label"].iloc[0]
|
|
48
50
|
|
|
49
51
|
return group
|
|
50
52
|
|
|
51
|
-
|
|
52
|
-
data_catalog
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
53
|
+
if "parent_variant_label" in data_catalog:
|
|
54
|
+
data_catalog = (
|
|
55
|
+
data_catalog.groupby("instance_id")
|
|
56
|
+
.apply(_fix_parent_variant_label, include_groups=False)
|
|
57
|
+
.reset_index(level="instance_id")
|
|
58
|
+
)
|
|
56
59
|
|
|
57
60
|
if "branch_time_in_child" in data_catalog:
|
|
58
61
|
data_catalog["branch_time_in_child"] = _clean_branch_time(data_catalog["branch_time_in_child"])
|
|
@@ -68,88 +71,6 @@ def _clean_branch_time(branch_time: pd.Series[str]) -> pd.Series[float]:
|
|
|
68
71
|
return pd.to_numeric(branch_time.astype(str).str.replace("D", ""), errors="coerce")
|
|
69
72
|
|
|
70
73
|
|
|
71
|
-
def parse_cmip6(file: str) -> dict[str, Any]:
|
|
72
|
-
"""
|
|
73
|
-
Parser for CMIP6
|
|
74
|
-
|
|
75
|
-
This function parses the CMIP6 dataset and returns a dictionary with the metadata.
|
|
76
|
-
This was copied from the ecgtools package, but we want to log the exception when it fails.
|
|
77
|
-
"""
|
|
78
|
-
keys = sorted(
|
|
79
|
-
{
|
|
80
|
-
"activity_id",
|
|
81
|
-
"branch_method",
|
|
82
|
-
"branch_time_in_child",
|
|
83
|
-
"branch_time_in_parent",
|
|
84
|
-
"experiment",
|
|
85
|
-
"experiment_id",
|
|
86
|
-
"frequency",
|
|
87
|
-
"grid",
|
|
88
|
-
"grid_label",
|
|
89
|
-
"institution_id",
|
|
90
|
-
"nominal_resolution",
|
|
91
|
-
"parent_activity_id",
|
|
92
|
-
"parent_experiment_id",
|
|
93
|
-
"parent_source_id",
|
|
94
|
-
"parent_time_units",
|
|
95
|
-
"parent_variant_label",
|
|
96
|
-
"realm",
|
|
97
|
-
"product",
|
|
98
|
-
"source_id",
|
|
99
|
-
"source_type",
|
|
100
|
-
"sub_experiment",
|
|
101
|
-
"sub_experiment_id",
|
|
102
|
-
"table_id",
|
|
103
|
-
"variable_id",
|
|
104
|
-
"variant_label",
|
|
105
|
-
}
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
try:
|
|
109
|
-
with xr.open_dataset(file, chunks={}, use_cftime=True) as ds:
|
|
110
|
-
info = {key: ds.attrs.get(key) for key in keys}
|
|
111
|
-
info["member_id"] = info["variant_label"]
|
|
112
|
-
|
|
113
|
-
variable_id = info["variable_id"]
|
|
114
|
-
if variable_id: # pragma: no branch
|
|
115
|
-
attrs = ds[variable_id].attrs
|
|
116
|
-
for attr in ["standard_name", "long_name", "units"]:
|
|
117
|
-
info[attr] = attrs.get(attr)
|
|
118
|
-
|
|
119
|
-
# Set the default of # of vertical levels to 1
|
|
120
|
-
vertical_levels = 1
|
|
121
|
-
start_time, end_time = None, None
|
|
122
|
-
init_year = None
|
|
123
|
-
try:
|
|
124
|
-
vertical_levels = ds[ds.cf["vertical"].name].size
|
|
125
|
-
except (KeyError, AttributeError, ValueError):
|
|
126
|
-
...
|
|
127
|
-
|
|
128
|
-
try:
|
|
129
|
-
start_time, end_time = str(ds.cf["T"][0].data), str(ds.cf["T"][-1].data)
|
|
130
|
-
except (KeyError, AttributeError, ValueError):
|
|
131
|
-
...
|
|
132
|
-
if info.get("sub_experiment_id"): # pragma: no branch
|
|
133
|
-
init_year = extract_attr_with_regex(info["sub_experiment_id"], r"\d{4}")
|
|
134
|
-
if init_year: # pragma: no cover
|
|
135
|
-
init_year = int(init_year)
|
|
136
|
-
info["vertical_levels"] = vertical_levels
|
|
137
|
-
info["init_year"] = init_year
|
|
138
|
-
info["start_time"] = start_time
|
|
139
|
-
info["end_time"] = end_time
|
|
140
|
-
if not (start_time and end_time):
|
|
141
|
-
info["time_range"] = None
|
|
142
|
-
else:
|
|
143
|
-
info["time_range"] = f"{start_time}-{end_time}"
|
|
144
|
-
info["path"] = str(file)
|
|
145
|
-
info["version"] = extract_attr_with_regex(str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}") or "v0"
|
|
146
|
-
return info
|
|
147
|
-
|
|
148
|
-
except Exception:
|
|
149
|
-
logger.exception(f"Failed to parse {file}")
|
|
150
|
-
return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
|
|
151
|
-
|
|
152
|
-
|
|
153
74
|
class CMIP6DatasetAdapter(DatasetAdapter):
|
|
154
75
|
"""
|
|
155
76
|
Adapter for CMIP6 datasets
|
|
@@ -191,6 +112,7 @@ class CMIP6DatasetAdapter(DatasetAdapter):
|
|
|
191
112
|
"standard_name",
|
|
192
113
|
"long_name",
|
|
193
114
|
"units",
|
|
115
|
+
"finalised",
|
|
194
116
|
slug_column,
|
|
195
117
|
)
|
|
196
118
|
|
|
@@ -208,8 +130,30 @@ class CMIP6DatasetAdapter(DatasetAdapter):
|
|
|
208
130
|
"grid_label",
|
|
209
131
|
)
|
|
210
132
|
|
|
211
|
-
def __init__(self, n_jobs: int = 1):
|
|
133
|
+
def __init__(self, n_jobs: int = 1, config: Config | None = None):
|
|
212
134
|
self.n_jobs = n_jobs
|
|
135
|
+
self.config = config or Config.default()
|
|
136
|
+
|
|
137
|
+
def get_parsing_function(self) -> DatasetParsingFunction:
|
|
138
|
+
"""
|
|
139
|
+
Get the parsing function for CMIP6 datasets based on configuration
|
|
140
|
+
|
|
141
|
+
The parsing function used is determined by the `cmip6_parser` configuration value:
|
|
142
|
+
- "drs": Use the DRS parser
|
|
143
|
+
- "complete": Use the complete parser that extracts all available metadata (default)
|
|
144
|
+
|
|
145
|
+
Returns
|
|
146
|
+
-------
|
|
147
|
+
:
|
|
148
|
+
The appropriate parsing function based on configuration
|
|
149
|
+
"""
|
|
150
|
+
parser_type = self.config.cmip6_parser
|
|
151
|
+
if parser_type == "complete":
|
|
152
|
+
logger.info("Using complete CMIP6 parser")
|
|
153
|
+
return parse_cmip6_complete
|
|
154
|
+
else:
|
|
155
|
+
logger.info(f"Using DRS CMIP6 parser (config value: {parser_type})")
|
|
156
|
+
return parse_cmip6_drs
|
|
213
157
|
|
|
214
158
|
def find_local_datasets(self, file_or_directory: Path) -> pd.DataFrame:
|
|
215
159
|
"""
|
|
@@ -228,6 +172,8 @@ class CMIP6DatasetAdapter(DatasetAdapter):
|
|
|
228
172
|
:
|
|
229
173
|
Data catalog containing the metadata for the dataset
|
|
230
174
|
"""
|
|
175
|
+
parsing_function = self.get_parsing_function()
|
|
176
|
+
|
|
231
177
|
with warnings.catch_warnings():
|
|
232
178
|
# Ignore the DeprecationWarning from xarray
|
|
233
179
|
warnings.simplefilter("ignore", DeprecationWarning)
|
|
@@ -237,7 +183,7 @@ class CMIP6DatasetAdapter(DatasetAdapter):
|
|
|
237
183
|
depth=10,
|
|
238
184
|
include_patterns=["*.nc"],
|
|
239
185
|
joblib_parallel_kwargs={"n_jobs": self.n_jobs},
|
|
240
|
-
).build(parsing_func=
|
|
186
|
+
).build(parsing_func=parsing_function)
|
|
241
187
|
|
|
242
188
|
datasets: pd.DataFrame = builder.df.drop(["init_year"], axis=1)
|
|
243
189
|
|
|
@@ -254,6 +200,14 @@ class CMIP6DatasetAdapter(DatasetAdapter):
|
|
|
254
200
|
lambda row: "CMIP6." + ".".join([row[item] for item in drs_items]), axis=1
|
|
255
201
|
)
|
|
256
202
|
|
|
203
|
+
# Add in any missing metadata columns
|
|
204
|
+
missing_columns = set(self.dataset_specific_metadata + self.file_specific_metadata) - set(
|
|
205
|
+
datasets.columns
|
|
206
|
+
)
|
|
207
|
+
if missing_columns:
|
|
208
|
+
for column in missing_columns:
|
|
209
|
+
datasets[column] = pd.NA
|
|
210
|
+
|
|
257
211
|
# Temporary fix for some datasets
|
|
258
212
|
# TODO: Replace with a standalone package that contains metadata fixes for CMIP6 datasets
|
|
259
213
|
datasets = _apply_fixes(datasets)
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CMIP6 parser functions for extracting metadata from netCDF files
|
|
3
|
+
|
|
4
|
+
Additional non-official DRS's may be added in the future.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import traceback
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import xarray as xr
|
|
11
|
+
from ecgtools.parsers.cmip import parse_cmip6_using_directories # type: ignore
|
|
12
|
+
from ecgtools.parsers.utilities import extract_attr_with_regex # type: ignore
|
|
13
|
+
from loguru import logger
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _parse_daterange(date_range: str) -> tuple[str | None, str | None]:
|
|
17
|
+
"""
|
|
18
|
+
Parse a date range string into start and end dates
|
|
19
|
+
|
|
20
|
+
The output from this is an estimated date range until the file is completely parsed.
|
|
21
|
+
|
|
22
|
+
Parameters
|
|
23
|
+
----------
|
|
24
|
+
date_range
|
|
25
|
+
Date range string in the format "YYYYMM-YYYYMM"
|
|
26
|
+
|
|
27
|
+
Returns
|
|
28
|
+
-------
|
|
29
|
+
:
|
|
30
|
+
Tuple containing start and end dates as strings in the format "YYYY-MM-DD"
|
|
31
|
+
"""
|
|
32
|
+
try:
|
|
33
|
+
start, end = date_range.split("-")
|
|
34
|
+
if len(start) != 6 or len(end) != 6: # noqa: PLR2004
|
|
35
|
+
raise ValueError("Date range must be in the format 'YYYYMM-YYYYMM'")
|
|
36
|
+
|
|
37
|
+
start = f"{start[:4]}-{start[4:6]}-01"
|
|
38
|
+
# Up to the 30th of the month, assuming a 30-day month
|
|
39
|
+
# These values will be corrected later when the file is parsed
|
|
40
|
+
end = f"{end[:4]}-{end[4:6]}-30"
|
|
41
|
+
|
|
42
|
+
return start, end
|
|
43
|
+
except ValueError:
|
|
44
|
+
logger.error(f"Invalid date range format: {date_range}")
|
|
45
|
+
return None, None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_cmip6_complete(file: str, **kwargs: Any) -> dict[str, Any]:
|
|
49
|
+
"""
|
|
50
|
+
Complete parser for CMIP6 files
|
|
51
|
+
|
|
52
|
+
This parser loads each file and extracts all available metadata.
|
|
53
|
+
|
|
54
|
+
For some filesystems this may be slow, as it involves a lot of I/O operations.
|
|
55
|
+
|
|
56
|
+
Parameters
|
|
57
|
+
----------
|
|
58
|
+
file
|
|
59
|
+
File to parse
|
|
60
|
+
kwargs
|
|
61
|
+
Additional keyword arguments (not used, but required for compatibility)
|
|
62
|
+
|
|
63
|
+
Returns
|
|
64
|
+
-------
|
|
65
|
+
:
|
|
66
|
+
Dictionary with extracted metadata
|
|
67
|
+
"""
|
|
68
|
+
keys = sorted(
|
|
69
|
+
{
|
|
70
|
+
"activity_id",
|
|
71
|
+
"branch_method",
|
|
72
|
+
"branch_time_in_child",
|
|
73
|
+
"branch_time_in_parent",
|
|
74
|
+
"experiment",
|
|
75
|
+
"experiment_id",
|
|
76
|
+
"frequency",
|
|
77
|
+
"grid",
|
|
78
|
+
"grid_label",
|
|
79
|
+
"institution_id",
|
|
80
|
+
"nominal_resolution",
|
|
81
|
+
"parent_activity_id",
|
|
82
|
+
"parent_experiment_id",
|
|
83
|
+
"parent_source_id",
|
|
84
|
+
"parent_time_units",
|
|
85
|
+
"parent_variant_label",
|
|
86
|
+
"realm",
|
|
87
|
+
"product",
|
|
88
|
+
"source_id",
|
|
89
|
+
"source_type",
|
|
90
|
+
"sub_experiment",
|
|
91
|
+
"sub_experiment_id",
|
|
92
|
+
"table_id",
|
|
93
|
+
"variable_id",
|
|
94
|
+
"variant_label",
|
|
95
|
+
}
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
try:
|
|
99
|
+
with xr.open_dataset(file, chunks={}, use_cftime=True) as ds:
|
|
100
|
+
info = {key: ds.attrs.get(key) for key in keys}
|
|
101
|
+
info["member_id"] = info["variant_label"]
|
|
102
|
+
|
|
103
|
+
variable_id = info["variable_id"]
|
|
104
|
+
if variable_id: # pragma: no branch
|
|
105
|
+
attrs = ds[variable_id].attrs
|
|
106
|
+
for attr in ["standard_name", "long_name", "units"]:
|
|
107
|
+
info[attr] = attrs.get(attr)
|
|
108
|
+
|
|
109
|
+
# Set the default of # of vertical levels to 1
|
|
110
|
+
vertical_levels = 1
|
|
111
|
+
start_time, end_time = None, None
|
|
112
|
+
init_year = None
|
|
113
|
+
try:
|
|
114
|
+
vertical_levels = ds[ds.cf["vertical"].name].size
|
|
115
|
+
except (KeyError, AttributeError, ValueError):
|
|
116
|
+
...
|
|
117
|
+
|
|
118
|
+
try:
|
|
119
|
+
start_time, end_time = str(ds.cf["T"][0].data), str(ds.cf["T"][-1].data)
|
|
120
|
+
except (KeyError, AttributeError, ValueError):
|
|
121
|
+
...
|
|
122
|
+
if info.get("sub_experiment_id"): # pragma: no branch
|
|
123
|
+
init_year = extract_attr_with_regex(info["sub_experiment_id"], r"\d{4}")
|
|
124
|
+
if init_year: # pragma: no cover
|
|
125
|
+
init_year = int(init_year)
|
|
126
|
+
info["vertical_levels"] = vertical_levels
|
|
127
|
+
info["init_year"] = init_year
|
|
128
|
+
info["start_time"] = start_time
|
|
129
|
+
info["end_time"] = end_time
|
|
130
|
+
if not (start_time and end_time):
|
|
131
|
+
info["time_range"] = None
|
|
132
|
+
else:
|
|
133
|
+
info["time_range"] = f"{start_time}-{end_time}"
|
|
134
|
+
info["path"] = str(file)
|
|
135
|
+
info["version"] = extract_attr_with_regex(str(file), regex=r"v\d{4}\d{2}\d{2}|v\d{1}") or "v0"
|
|
136
|
+
|
|
137
|
+
# Mark the dataset as finalised
|
|
138
|
+
# This is used to indicate that the dataset has been fully parsed and is ready for use
|
|
139
|
+
info["finalised"] = True
|
|
140
|
+
|
|
141
|
+
return info
|
|
142
|
+
|
|
143
|
+
except Exception:
|
|
144
|
+
logger.exception(f"Failed to parse {file}")
|
|
145
|
+
return {"INVALID_ASSET": file, "TRACEBACK": traceback.format_exc()}
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def parse_cmip6_drs(file: str, **kwargs: Any) -> dict[str, Any]:
|
|
149
|
+
"""
|
|
150
|
+
DRS parser for CMIP6 files
|
|
151
|
+
|
|
152
|
+
This parser extracts metadata according to the CMIP6 Data Reference Syntax (DRS).
|
|
153
|
+
This includes the essential metadata required to identify the dataset and is included in the filename.
|
|
154
|
+
|
|
155
|
+
Parameters
|
|
156
|
+
----------
|
|
157
|
+
file
|
|
158
|
+
File to parse
|
|
159
|
+
kwargs
|
|
160
|
+
Additional keyword arguments (not used, but required for compatibility)
|
|
161
|
+
|
|
162
|
+
Returns
|
|
163
|
+
-------
|
|
164
|
+
:
|
|
165
|
+
Dictionary with extracted metadata
|
|
166
|
+
"""
|
|
167
|
+
info: dict[str, Any] = parse_cmip6_using_directories(file)
|
|
168
|
+
|
|
169
|
+
if "INVALID_ASSET" in info:
|
|
170
|
+
logger.warning(f"Failed to parse {file}: {info['INVALID_ASSET']}")
|
|
171
|
+
return info
|
|
172
|
+
|
|
173
|
+
# The member_id is technically incorrect
|
|
174
|
+
# but for simplicity we are going to ignore sub-experiments for the DRS parser
|
|
175
|
+
info["variant_label"] = info["member_id"]
|
|
176
|
+
|
|
177
|
+
# Rename the `dcpp_init_year` key to `init_year` if it exists
|
|
178
|
+
if "dcpp_init_year" in info:
|
|
179
|
+
info["init_year"] = info.pop("dcpp_init_year")
|
|
180
|
+
|
|
181
|
+
if info.get("time_range"):
|
|
182
|
+
# Parse the time range if it exists
|
|
183
|
+
start_time, end_time = _parse_daterange(info["time_range"])
|
|
184
|
+
info["start_time"] = start_time
|
|
185
|
+
info["end_time"] = end_time
|
|
186
|
+
|
|
187
|
+
info["finalised"] = False
|
|
188
|
+
|
|
189
|
+
return info
|
|
@@ -15,8 +15,17 @@ from climate_ref.datasets.cmip6 import _parse_datetime
|
|
|
15
15
|
from climate_ref.models.dataset import Dataset, Obs4MIPsDataset
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
def parse_obs4mips(file: str) -> dict[str, Any]:
|
|
19
|
-
"""
|
|
18
|
+
def parse_obs4mips(file: str, **kwargs: Any) -> dict[str, Any]:
|
|
19
|
+
"""
|
|
20
|
+
Parser for obs4mips
|
|
21
|
+
|
|
22
|
+
Parameters
|
|
23
|
+
----------
|
|
24
|
+
file
|
|
25
|
+
File to parse
|
|
26
|
+
kwargs
|
|
27
|
+
Additional keyword arguments (not used, but required for protocol compatibility)
|
|
28
|
+
"""
|
|
20
29
|
keys = sorted(
|
|
21
30
|
list(
|
|
22
31
|
{
|
|
@@ -106,6 +115,7 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
|
|
|
106
115
|
|
|
107
116
|
dataset_specific_metadata = (
|
|
108
117
|
"activity_id",
|
|
118
|
+
"finalised",
|
|
109
119
|
"frequency",
|
|
110
120
|
"grid",
|
|
111
121
|
"grid_label",
|
|
@@ -159,7 +169,7 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
|
|
|
159
169
|
depth=10,
|
|
160
170
|
include_patterns=["*.nc"],
|
|
161
171
|
joblib_parallel_kwargs={"n_jobs": self.n_jobs},
|
|
162
|
-
).build(parsing_func=parse_obs4mips)
|
|
172
|
+
).build(parsing_func=parse_obs4mips)
|
|
163
173
|
|
|
164
174
|
datasets = builder.df
|
|
165
175
|
if datasets.empty:
|
|
@@ -178,4 +188,5 @@ class Obs4MIPsDatasetAdapter(DatasetAdapter):
|
|
|
178
188
|
datasets["instance_id"] = datasets.apply(
|
|
179
189
|
lambda row: "obs4MIPs." + ".".join([row[item] for item in drs_items]), axis=1
|
|
180
190
|
)
|
|
191
|
+
datasets["finalised"] = True
|
|
181
192
|
return datasets
|