climate-ref-core 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
climate_ref_core/env.py CHANGED
@@ -32,4 +32,41 @@ def get_env() -> Env:
     return env
 
 
+def get_available_cpu_count() -> int:
+    """
+    Detect the number of CPU cores available considering cgroup limitations.
+
+    Returns
+    -------
+    :
+        The number of allocated CPUs or total cpu count if not running in a cgroup-limited environment.
+    """
+    try:
+        # Check for CPU quota
+        with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") as f:
+            quota = int(f.read())
+        with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us") as f:
+            period = int(f.read())
+
+        if quota > 0 and period > 0:
+            return quota // period
+
+        # If no quota, check for cpuset
+        with open("/sys/fs/cgroup/cpuset/cpuset.cpus") as f:
+            cpuset = f.read().strip()
+        # Parse the cpuset string (e.g., "0-3", "0,2")
+        count = 0
+        for part in cpuset.split(","):
+            if "-" in part:
+                start, end = map(int, part.split("-"))
+                count += end - start + 1
+            else:
+                count += 1
+        return count
+
+    except FileNotFoundError:
+        # Not running in a cgroup-limited environment or cgroup files not found
+        return os.cpu_count() or 1
+
+
 env = get_env()
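For context, a minimal sketch of how the new helper might be used to size a worker pool. The pool-sizing code is illustrative only and not part of this diff:

    from concurrent.futures import ProcessPoolExecutor

    from climate_ref_core.env import get_available_cpu_count

    # Unlike os.cpu_count(), this respects cgroup CPU quotas inside containers
    n_workers = get_available_cpu_count()
    pool = ProcessPoolExecutor(max_workers=n_workers)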
climate_ref_core/esgf/__init__.py ADDED
@@ -0,0 +1,21 @@
+"""
+ESGF dataset fetching
+
+This module provides classes for searching and fetching datasets from ESGF
+(Earth System Grid Federation) and other data registries.
+"""
+
+from climate_ref_core.esgf.base import ESGFRequest, IntakeESGFMixin
+from climate_ref_core.esgf.cmip6 import CMIP6Request
+from climate_ref_core.esgf.fetcher import ESGFFetcher
+from climate_ref_core.esgf.obs4mips import Obs4MIPsRequest
+from climate_ref_core.esgf.registry import RegistryRequest
+
+__all__ = [
+    "CMIP6Request",
+    "ESGFFetcher",
+    "ESGFRequest",
+    "IntakeESGFMixin",
+    "Obs4MIPsRequest",
+    "RegistryRequest",
+]
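Taken together, the new esgf subpackage exposes a small public API. A minimal usage sketch; the facet values and slug are made-up examples, not defaults shipped with the package:

    from climate_ref_core.esgf import CMIP6Request, ESGFFetcher

    # Hypothetical request: the facets below are illustrative
    request = CMIP6Request(
        slug="tas-historical",
        facets={
            "source_id": "ACCESS-ESM1-5",
            "variable_id": "tas",
            "table_id": "Amon",
            "experiment_id": "historical",
        },
        remove_ensembles=True,
        time_span=("2000-01", "2014-12"),
    )

    fetcher = ESGFFetcher()
    files_df = fetcher.fetch_request(request)  # one row per locally available file, with a 'path' column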
climate_ref_core/esgf/base.py ADDED
@@ -0,0 +1,122 @@
+"""
+Base classes and protocols for ESGF data requests.
+
+This module provides the infrastructure for fetching datasets from ESGF
+using the intake-esgf package.
+"""
+
+from typing import Any, Protocol, runtime_checkable
+
+import pandas as pd
+from intake_esgf import ESGFCatalog
+
+
+@runtime_checkable
+class ESGFRequest(Protocol):
+    """
+    Protocol for ESGF dataset requests.
+
+    Implementations provide the logic for searching ESGF and generating
+    output paths for downloaded datasets.
+    """
+
+    slug: str
+    """Unique identifier for this request."""
+
+    source_type: str
+    """Type of dataset (e.g., 'CMIP6', 'obs4MIPs')."""
+
+    time_span: tuple[str, str] | None
+    """Optional time range to filter datasets (start, end)."""
+
+    def fetch_datasets(self) -> pd.DataFrame:
+        """
+        Fetch dataset metadata from ESGF.
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame containing dataset metadata and file paths.
+            Must contain at minimum:
+            - key: A unique identifier for the dataset
+            - files: A list of files for the dataset
+        """
+        ...
+
+
+def _deduplicate_datasets(datasets: pd.DataFrame) -> pd.DataFrame:
+    """
+    Deduplicate a dataset collection.
+
+    Uses the metadata from the first dataset in each group,
+    but expands the time range to the min/max timespan of the group.
+
+    Parameters
+    ----------
+    datasets
+        The dataset collection
+
+    Returns
+    -------
+    pd.DataFrame
+        The deduplicated dataset collection spanning the times requested
+    """
+
+    def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
+        first = group.iloc[[0]].copy()
+        if "time_start" in first.columns:
+            first["time_start"] = group["time_start"].min()
+        if "time_end" in first.columns:
+            first["time_end"] = group["time_end"].max()
+        return first
+
+    result: pd.DataFrame = (
+        datasets.groupby("key")
+        .apply(_deduplicate_group, include_groups=False)  # type: ignore[call-overload]
+        .reset_index()
+    )
+    return result
+
+
+class IntakeESGFMixin:
+    """
+    Mixin that fetches datasets from ESGF using intake-esgf.
+
+    Subclasses must define:
+    - facets: dict[str, str | tuple[str, ...]]
+    - remove_ensembles: bool
+    - time_span: tuple[str, str] | None
+    """
+
+    facets: dict[str, str | tuple[str, ...]]
+    remove_ensembles: bool
+    time_span: tuple[str, str] | None
+
+    def fetch_datasets(self) -> pd.DataFrame:
+        """Fetch dataset metadata from ESGF."""
+        facets: dict[str, Any] = dict(self.facets)
+        if self.time_span:
+            facets["file_start"] = self.time_span[0]
+            facets["file_end"] = self.time_span[1]
+
+        # Convert tuples to lists for intake-esgf compatibility
+        for key, value in facets.items():
+            if isinstance(value, tuple):
+                facets[key] = list(value)
+
+        cat = ESGFCatalog()  # type: ignore[no-untyped-call]
+        cat.search(**facets)
+
+        if self.remove_ensembles:
+            cat.remove_ensembles()
+
+        path_dict = cat.to_path_dict(prefer_streaming=False, minimal_keys=False, quiet=True)
+        if cat.df is None or cat.df.empty:
+            raise ValueError("No datasets found for the given ESGF request")
+        merged_df = cat.df.merge(pd.Series(path_dict, name="files"), left_on="key", right_index=True)
+
+        if self.time_span:
+            merged_df["time_start"] = self.time_span[0]
+            merged_df["time_end"] = self.time_span[1]
+
+        return _deduplicate_datasets(merged_df)
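The private _deduplicate_datasets helper collapses rows that share a key, widening the time range to span the whole group. A small illustration of the intended behaviour; the input data below is made up:

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["tas.ACCESS-ESM1-5", "tas.ACCESS-ESM1-5"],
            "time_start": ["2000-01", "2005-01"],
            "time_end": ["2004-12", "2009-12"],
            "files": [["a.nc"], ["b.nc"]],
        }
    )
    # After deduplication the two rows collapse into one, keeping the first row's
    # metadata but with time_start == "2000-01" and time_end == "2009-12"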
climate_ref_core/esgf/cmip6.py ADDED
@@ -0,0 +1,119 @@
+"""
+CMIP6 dataset request implementation.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from climate_ref_core.esgf.base import IntakeESGFMixin
+
+if TYPE_CHECKING:
+    import xarray as xr
+
+
+def prefix_to_filename(ds: xr.Dataset, filename_prefix: str) -> str:
+    """
+    Create a filename from a dataset and a prefix.
+
+    Optionally includes the time range of the dataset if it has a time dimension.
+
+    Parameters
+    ----------
+    ds
+        Dataset
+    filename_prefix
+        Prefix for the filename (includes the different facets of the dataset)
+
+    Returns
+    -------
+    str
+        Filename for the dataset
+    """
+    if "time" in ds.dims:
+        time_range = f"{ds.time.min().dt.strftime('%Y%m').item()}-{ds.time.max().dt.strftime('%Y%m').item()}"
+        filename = f"{filename_prefix}_{time_range}.nc"
+    else:
+        filename = f"{filename_prefix}.nc"
+    return filename
+
+
+class CMIP6Request(IntakeESGFMixin):
+    """
+    Represents a CMIP6 dataset request.
+
+    These data are fetched from ESGF based on the provided facets.
+    """
+
+    source_type = "CMIP6"
+
+    cmip6_path_items = (
+        "mip_era",
+        "activity_drs",
+        "institution_id",
+        "source_id",
+        "experiment_id",
+        "member_id",
+        "table_id",
+        "variable_id",
+        "grid_label",
+    )
+
+    cmip6_filename_paths = (
+        "variable_id",
+        "table_id",
+        "source_id",
+        "experiment_id",
+        "member_id",
+        "grid_label",
+    )
+
+    available_facets = (
+        "mip_era",
+        "activity_drs",
+        "institution_id",
+        "source_id",
+        "experiment_id",
+        "member_id",
+        "table_id",
+        "variable_id",
+        "grid_label",
+        "version",
+        "data_node",
+    )
+
+    def __init__(
+        self,
+        slug: str,
+        facets: dict[str, Any],
+        remove_ensembles: bool = False,
+        time_span: tuple[str, str] | None = None,
+    ):
+        """
+        Initialize a CMIP6 request.
+
+        Parameters
+        ----------
+        slug
+            Unique identifier for this request
+        facets
+            ESGF search facets (e.g., source_id, variable_id, experiment_id)
+        remove_ensembles
+            If True, keep only one ensemble member per model
+        time_span
+            Optional time range filter (start, end) in YYYY-MM format
+        """
+        self.slug = slug
+        self.facets = facets
+        self.remove_ensembles = remove_ensembles
+        self.time_span = time_span
+
+        for key in self.cmip6_path_items:
+            if key not in self.available_facets:
+                raise ValueError(f"Path item {key!r} not in available facets")
+        for key in self.cmip6_filename_paths:
+            if key not in self.available_facets:
+                raise ValueError(f"Filename path {key!r} not in available facets")
+
+    def __repr__(self) -> str:
+        return f"CMIP6Request(slug={self.slug!r}, facets={self.facets!r})"
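A minimal sketch of the filename helper in use; the dataset and prefix below are made-up examples rather than output produced by the package:

    import pandas as pd
    import xarray as xr

    from climate_ref_core.esgf.cmip6 import prefix_to_filename

    # Illustrative dataset with a monthly time axis; the prefix mirrors cmip6_filename_paths
    ds = xr.Dataset(
        {"tas": ("time", [280.0, 281.0])},
        coords={"time": pd.date_range("2000-01", periods=2, freq="MS")},
    )
    prefix_to_filename(ds, "tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn")
    # -> 'tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-200002.nc'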
climate_ref_core/esgf/fetcher.py ADDED
@@ -0,0 +1,138 @@
+"""
+ESGF dataset fetcher for downloading test data.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pandas as pd
+from loguru import logger
+
+from climate_ref_core.esgf.base import ESGFRequest
+
+if TYPE_CHECKING:
+    from climate_ref_core.diagnostics import AbstractDiagnostic as Diagnostic
+
+
+class ESGFFetcher:
+    """
+    Fetches datasets from ESGF and returns metadata with file paths.
+
+    Uses intake-esgf to search and download datasets.
+    Files that cannot be found locally are stored in intake-esgf's cache directory.
+    """
+
+    def fetch_request(self, request: ESGFRequest) -> pd.DataFrame:
+        """
+        Fetch datasets for a single ESGF request.
+
+        Parameters
+        ----------
+        request
+            The ESGF request specifying what to fetch
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame containing dataset metadata and file paths.
+            Each row represents one file, with a 'path' column pointing
+            to the file (either in intake-esgf's cache or one of the root data locations).
+
+            This format is not identical to the DataCatalog, but it is broadly compatible.
+        """
+        logger.info(f"Fetching datasets for request: {request.slug}")
+
+        # Search ESGF for matching datasets
+        datasets_df = request.fetch_datasets()
+
+        if datasets_df.empty:
+            logger.warning(f"No datasets found for request: {request.slug}")
+            return pd.DataFrame()
+
+        logger.info(f"Found {len(datasets_df)} datasets for request: {request.slug}")
+
+        # Expand files column - each file becomes a row with a 'path' column
+        rows = []
+        for _, row in datasets_df.iterrows():
+            files = row.get("files", [])
+            if not files:
+                logger.warning(f"No files for dataset: {row.get('key', 'unknown')}")
+                continue
+
+            for file_path in files:
+                if not Path(file_path).exists():
+                    logger.warning(f"File not found (may need to download from ESGF): {file_path}")
+                    continue
+
+                row_copy = row.to_dict()
+                row_copy["path"] = str(file_path)
+                rows.append(row_copy)
+
+        if not rows:
+            logger.warning(f"No files found for request: {request.slug}")
+            return pd.DataFrame()
+
+        result = pd.DataFrame(rows)
+        result["source_type"] = request.source_type
+
+        logger.info(f"Fetched {len(result)} files for request: {request.slug}")
+        return result
+
+    def fetch_for_test_case(
+        self,
+        requests: tuple[ESGFRequest, ...] | None,
+    ) -> pd.DataFrame:
+        """
+        Fetch all data for a test case's requests.
+
+        Parameters
+        ----------
+        requests
+            The ESGF requests from the test case
+
+        Returns
+        -------
+        pd.DataFrame
+            Combined DataFrame with all datasets, grouped by source_type
+        """
+        if not requests:
+            return pd.DataFrame()
+
+        dfs = []
+        for request in requests:
+            df = self.fetch_request(request)
+            if not df.empty:
+                dfs.append(df)
+
+        if not dfs:
+            return pd.DataFrame()
+
+        return pd.concat(dfs, ignore_index=True)
+
+    def list_requests_for_diagnostic(self, diagnostic: Diagnostic) -> list[tuple[str, ESGFRequest]]:
+        """
+        List all ESGF requests for a diagnostic across all test cases.
+
+        Parameters
+        ----------
+        diagnostic
+            The diagnostic to list requests for
+
+        Returns
+        -------
+        list[tuple[str, ESGFRequest]]
+            List of (test_case_name, request) tuples
+        """
+        if diagnostic.test_data_spec is None:
+            return []
+
+        results: list[tuple[str, ESGFRequest]] = []
+
+        for test_case in diagnostic.test_data_spec.test_cases:
+            if test_case.requests:
+                for request in test_case.requests:
+                    results.append((test_case.name, request))
+
+        return results
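A short sketch of combining several requests with fetch_for_test_case and regrouping the resulting per-file table by source type; the requests and facet values below are hypothetical:

    from climate_ref_core.esgf import CMIP6Request, ESGFFetcher

    # Hypothetical requests; in practice these would come from a diagnostic's test cases
    requests = (
        CMIP6Request(slug="tas", facets={"variable_id": "tas", "table_id": "Amon"}),
        CMIP6Request(slug="pr", facets={"variable_id": "pr", "table_id": "Amon"}),
    )

    combined = ESGFFetcher().fetch_for_test_case(requests)
    if not combined.empty:
        # One row per locally available file; collect the paths by source type
        paths_by_source = combined.groupby("source_type")["path"].apply(list).to_dict()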
climate_ref_core/esgf/obs4mips.py ADDED
@@ -0,0 +1,94 @@
+"""
+Obs4MIPs dataset request implementation.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pandas as pd
+
+from climate_ref_core.esgf.base import IntakeESGFMixin
+
+if TYPE_CHECKING:
+    pass
+
+
+class Obs4MIPsRequest(IntakeESGFMixin):
+    """
+    Represents an Obs4MIPs dataset request.
+
+    These data are fetched from ESGF based on the provided facets.
+    """
+
+    source_type = "obs4MIPs"
+
+    obs4mips_path_items = (
+        "activity_id",
+        "institution_id",
+        "source_id",
+        "variable_id",
+        "grid_label",
+    )
+
+    obs4mips_filename_paths = (
+        "variable_id",
+        "source_id",
+        "grid_label",
+    )
+
+    avail_facets = (
+        "activity_id",
+        "institution_id",
+        "source_id",
+        "frequency",
+        "variable_id",
+        "grid_label",
+        "version",
+        "data_node",
+        "project",
+    )
+
+    def __init__(
+        self,
+        slug: str,
+        facets: dict[str, Any],
+        remove_ensembles: bool = False,
+        time_span: tuple[str, str] | None = None,
+    ):
+        """
+        Initialize an Obs4MIPs request.
+
+        Parameters
+        ----------
+        slug
+            Unique identifier for this request
+        facets
+            ESGF search facets (e.g., source_id, variable_id)
+        remove_ensembles
+            If True, keep only one ensemble member (typically not relevant for obs)
+        time_span
+            Optional time range filter (start, end) in YYYY-MM format
+        """
+        self.slug = slug
+        self.facets = facets
+        self.remove_ensembles = remove_ensembles
+        self.time_span = time_span
+
+        for key in self.obs4mips_path_items:
+            if key not in self.avail_facets:
+                raise ValueError(f"Path item {key!r} not in available facets")
+        for key in self.obs4mips_filename_paths:
+            if key not in self.avail_facets:
+                raise ValueError(f"Filename path {key!r} not in available facets")
+
+    def fetch_datasets(self) -> pd.DataFrame:
+        """Fetch dataset metadata from ESGF with project=obs4MIPs."""
+        # Ensure project facet is set to obs4MIPs
+        if "project" not in self.facets:
+            self.facets["project"] = "obs4MIPs"
+
+        return super().fetch_datasets()
+
+    def __repr__(self) -> str:
+        return f"Obs4MIPsRequest(slug={self.slug!r}, facets={self.facets!r})"
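A minimal construction sketch; the slug and facet values are illustrative only. Note that fetch_datasets() adds project="obs4MIPs" automatically when the facet is not supplied:

    from climate_ref_core.esgf import Obs4MIPsRequest

    # Hypothetical observational request; "GPCP-V2.3" and "pr" are example facets
    request = Obs4MIPsRequest(
        slug="gpcp-pr",
        facets={"source_id": "GPCP-V2.3", "variable_id": "pr"},
        time_span=("2000-01", "2009-12"),
    )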