cdo-toolkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdo_toolkit/__init__.py +47 -0
- cdo_toolkit/__main__.py +6 -0
- cdo_toolkit/api.py +573 -0
- cdo_toolkit/cli.py +166 -0
- cdo_toolkit/cmip.py +61 -0
- cdo_toolkit/constants.py +9 -0
- cdo_toolkit/errors.py +79 -0
- cdo_toolkit/memory.py +22 -0
- cdo_toolkit/paths.py +30 -0
- cdo_toolkit/pipeline.py +2230 -0
- cdo_toolkit/resolution.py +19 -0
- cdo_toolkit/timing.py +36 -0
- cdo_toolkit/ui.py +650 -0
- cdo_toolkit/workers.py +277 -0
- cdo_toolkit-0.1.0.dist-info/METADATA +78 -0
- cdo_toolkit-0.1.0.dist-info/RECORD +19 -0
- cdo_toolkit-0.1.0.dist-info/WHEEL +4 -0
- cdo_toolkit-0.1.0.dist-info/entry_points.txt +2 -0
- cdo_toolkit-0.1.0.dist-info/licenses/LICENSE +28 -0
cdo_toolkit/cli.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Command-line interface for CDO regridding."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
|
|
8
|
+
from cdo_toolkit.api import (
|
|
9
|
+
filter_files_by_variables,
|
|
10
|
+
get_cmip_variable_name,
|
|
11
|
+
parse_variable_list,
|
|
12
|
+
regrid_directory,
|
|
13
|
+
regrid_directory_both_levels,
|
|
14
|
+
regrid_single_file,
|
|
15
|
+
regrid_single_file_extreme_levels,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def main() -> None:
    """Run the command-line regridding pipeline.

    Parses the command line and dispatches on the ``input`` path:

    * a single file goes to ``regrid_single_file`` (or to
      ``regrid_single_file_extreme_levels`` when ``--extreme-levels`` is set);
    * a directory goes to ``regrid_directory`` (or to
      ``regrid_directory_both_levels`` when ``--extreme-levels`` is set).

    Raises:
        SystemExit: status 0 when a single input file is skipped by the
            ``--variable`` filter; status 1 when a single-file regrid fails or
            when at least one file of a directory run fails; status 2 (via
            ``parser.error``) when the input path does not exist.
    """
    parser = argparse.ArgumentParser(description="CDO-based NetCDF regridding pipeline")
    parser.add_argument("input", type=Path, help="Input file or directory")
    parser.add_argument("-o", "--output", type=Path, help="Output file or directory")
    parser.add_argument("-r", "--resolution", nargs=2, type=float, default=[1.0, 1.0],
                        help="Target resolution (lon_res lat_res)")
    parser.add_argument("-p", "--pattern", default="*.nc", help="File pattern (for directories)")
    parser.add_argument(
        "--variable", "-V", nargs="+", default=None,
        help="CMIP variable prefix(es) to regrid (e.g. tos or tos uo). Comma-separated values allowed.",
    )
    # BooleanOptionalAction keeps the historical spelling and default (True)
    # while also accepting --no-include-subdirectories; a plain store_true
    # with default=True could never be switched off.
    parser.add_argument("--include-subdirectories", action=argparse.BooleanOptionalAction, default=True,
                        help="Include subdirectories")
    parser.add_argument("--extract-surface", action="store_true", default=False,
                        help="Extract top level only and regrid that (surface)")
    parser.add_argument("--extract-seafloor", action="store_true", default=False,
                        help="Extract seafloor values and regrid only that")
    parser.add_argument("--extreme-levels", action="store_true", default=False,
                        help="Extract and regrid both surface (top level) and seafloor for each file")
    parser.add_argument("--no-regrid-cache", action="store_true", default=False,
                        help="Do not reuse regrid weight cache (regenerate weights each time)")
    parser.add_argument("--no-seafloor-cache", action="store_true", default=False,
                        help="Do not reuse seafloor depth indices cache")
    # Verbose is on by default; --quiet is the switch that turns it off.
    parser.add_argument("-v", "--verbose", action="store_true", default=True, help="Verbose output (progress UI)")
    parser.add_argument("--verbose-max", action="store_true", default=False,
                        help="Maximum verbosity: print Grid type, File size, Large file messages")
    parser.add_argument("--quiet", action="store_true", help="Disable verbose output")
    parser.add_argument("--max-workers", "-w", default=4, type=int, help="Maximum parallel workers")
    # NOTE: --chunk-size-gb, --max-memory-gb, --no-chunking and
    # --unlink-unprocessed are parsed but not forwarded by this function.
    parser.add_argument("--chunk-size-gb", type=float, default=2.0,
                        help="Maximum chunk size in GB")
    parser.add_argument("--max-memory-gb", default=8.0, type=float, help="Maximum memory usage in GB")
    parser.add_argument("--no-parallel", action="store_true", default=False, help="Disable parallel processing")
    parser.add_argument("--no-chunking", action="store_true", default=False, help="Disable chunked processing")
    # Same reasoning as --include-subdirectories: allow --no-use-ui.
    parser.add_argument("--use-ui", action=argparse.BooleanOptionalAction, default=True,
                        help="Use UI for processing")
    # NOTE: a --cleanup option (removing *_top_level / *_chunk_* leftovers
    # before processing) is currently disabled.
    parser.add_argument("--unlink-unprocessed", action="store_true", default=False,
                        help="Unlink unprocessed files after processing")
    parser.add_argument("--overwrite", action="store_true", default=False, help="Overwrite existing output files")

    args = parser.parse_args()

    # Fail early with a clear message instead of treating a missing path as
    # a directory to scan.
    if not args.input.exists():
        parser.error(f"Input path does not exist: {args.input}")

    # --quiet wins over the (default-on) verbose flag.
    verbose = args.verbose and not args.quiet
    verbose_diagnostics = args.verbose_max
    use_regrid_cache = not args.no_regrid_cache
    use_seafloor_cache = not args.no_seafloor_cache
    target_resolution = tuple(args.resolution)
    variables = args.variable

    if args.input.is_file():
        # Honour the variable filter for a single file as well.
        if variables and not filter_files_by_variables([args.input], variables):
            allowed = sorted(parse_variable_list(variables) or [])
            print(f"Skipping {args.input.name}: variable '{get_cmip_variable_name(args.input)}' not in {allowed}")
            raise SystemExit(0)

        if args.extreme_levels:
            status = regrid_single_file_extreme_levels(
                input_path=args.input,
                output_dir=args.output,
                target_resolution=target_resolution,
                use_regrid_cache=use_regrid_cache,
                use_seafloor_cache=use_seafloor_cache,
                verbose=verbose,
                overwrite=args.overwrite,
            )
            # Both products have to succeed for the run to count as a success.
            success = status["top_level"] and status["seafloor"]
        else:
            # An existing directory given via -o is an output directory;
            # anything else is taken as the output file path.
            out_path = args.output
            out_dir = out_path if (out_path and out_path.is_dir()) else None
            success = regrid_single_file(
                input_path=args.input,
                output_path=None if out_dir else out_path,
                output_dir=out_dir,
                target_resolution=target_resolution,
                extract_surface=args.extract_surface,
                extract_seafloor=args.extract_seafloor,
                use_regrid_cache=use_regrid_cache,
                use_seafloor_cache=use_seafloor_cache,
                verbose=verbose,
                verbose_diagnostics=verbose_diagnostics,
                use_ui=args.use_ui,
                overwrite=args.overwrite,
            )

        if success:
            print("Regridding successful!")
        else:
            print("Regridding failed!")
            # Let shells and schedulers see the failure.
            raise SystemExit(1)
    else:
        if args.extreme_levels:
            results = regrid_directory_both_levels(
                input_dir=args.input,
                output_dir=args.output,
                include_subdirectories=args.include_subdirectories,
                target_resolution=target_resolution,
                file_pattern=args.pattern,
                variables=variables,
                verbose=verbose,
                overwrite=args.overwrite,
                max_workers=args.max_workers,
                enable_parallel=not args.no_parallel,
            )
        else:
            results = regrid_directory(
                input_dir=args.input,
                output_dir=args.output,
                include_subdirectories=args.include_subdirectories,
                target_resolution=target_resolution,
                file_pattern=args.pattern,
                variables=variables,
                extract_surface=args.extract_surface,
                extract_seafloor=args.extract_seafloor,
                use_regrid_cache=use_regrid_cache,
                use_seafloor_cache=use_seafloor_cache,
                verbose=verbose,
                verbose_diagnostics=verbose_diagnostics,
                max_workers=args.max_workers,
                enable_parallel=not args.no_parallel,
                use_ui=args.use_ui,
                overwrite=args.overwrite,
            )

        # Summarise the batch run.
        console = Console()
        console.print(f"\n[green]Successful: {len(results['successful'])}[/green]")
        console.print(f"[red]Failed: {len(results['failed'])}[/red]")
        console.print(f"[yellow]Skipped: {len(results['skipped'])}[/yellow]")

        if results["failed"]:
            # Non-zero status when any file of the batch failed.
            raise SystemExit(1)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
if __name__ == "__main__":
|
|
166
|
+
main()
|
cdo_toolkit/cmip.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Optional CMIP6 filename conventions and filters."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import xarray as xa
|
|
7
|
+
|
|
8
|
+
def parse_variable_list(variables: Optional[list[str] | str]) -> Optional[set[str]]:
|
|
9
|
+
"""Parse CLI/YAML variable filter into a lowercase set, or None if unset."""
|
|
10
|
+
if not variables:
|
|
11
|
+
return None
|
|
12
|
+
tokens: list[str] = []
|
|
13
|
+
if isinstance(variables, str):
|
|
14
|
+
tokens = variables.split(",")
|
|
15
|
+
else:
|
|
16
|
+
for item in variables:
|
|
17
|
+
tokens.extend(item.split(","))
|
|
18
|
+
parsed = {token.strip().lower() for token in tokens if token.strip()}
|
|
19
|
+
return parsed or None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_cmip_variable_name(file_path: Path) -> str:
    """Return the lowercase part of the file stem before its first underscore.

    For a name such as ``tos_Omon_....nc`` this yields ``"tos"``; a stem
    without any underscore is returned whole (lowercased).
    """
    prefix, _, _ = file_path.stem.partition("_")
    return prefix.lower()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def filter_files_by_variables(
    files: list[Path],
    variables: Optional[list[str] | str],
) -> list[Path]:
    """Restrict *files* to those whose CMIP variable prefix is requested.

    When *variables* parses to nothing (unset or empty filter) the input
    list is handed back unchanged; otherwise a new list is built that keeps
    the original order.
    """
    wanted = parse_variable_list(variables)
    if wanted:
        kept: list[Path] = []
        for candidate in files:
            if get_cmip_variable_name(candidate) in wanted:
                kept.append(candidate)
        return kept
    # No active filter: every file passes.
    return files
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def pick_representative_file(input_files: list[Path]) -> Optional[Path]:
    """Choose the file that will stand in for a whole group.

    The first file whose global attributes include ``nominal_resolution``
    wins. Files that cannot be opened or inspected are skipped. If no file
    qualifies, the first file of the group is returned; an empty group
    yields ``None``.
    """
    if not input_files:
        return None

    for candidate in input_files:
        try:
            with xa.open_dataset(candidate, decode_times=False) as dataset:
                has_resolution = "nominal_resolution" in dataset.attrs
        except Exception:
            # Best effort: an unreadable file simply cannot be the
            # representative, so move on to the next one.
            continue
        if has_resolution:
            return candidate

    # Nothing carried the metadata; fall back to the first entry.
    return input_files[0]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def representative_files_by_directory(files: list[Path]) -> dict[Path, Path]:
    """Build a ``{parent directory: representative file}`` mapping.

    Files are grouped by their parent directory (first-seen order), and
    ``pick_representative_file`` selects one file per group. Groups for
    which no representative is returned are left out.
    """
    grouped: dict[Path, list[Path]] = {}
    for path in files:
        folder = path.parent
        if folder not in grouped:
            grouped[folder] = []
        grouped[folder].append(path)

    chosen: dict[Path, Path] = {}
    for folder, members in grouped.items():
        representative = pick_representative_file(members)
        if representative is not None:
            chosen[folder] = representative
    return chosen
|
cdo_toolkit/constants.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Shared constants for the CDO regrid package."""
|
|
2
|
+
|
|
3
|
+
# Name of the logger that receives regridding errors.
REGRID_ERROR_LOGGER_NAME = "cdo_toolkit.errors"

# Recognised encoding keys, listed alphabetically.
# NOTE(review): presumably the per-variable encoding options accepted when
# writing NetCDF4 output - confirm against the code that consumes this set.
NC4_ENCODING_KEYS = frozenset((
    "_FillValue",
    "blosc_shuffle",
    "chunksizes",
    "complevel",
    "compression",
    "contiguous",
    "dtype",
    "endian",
    "fletcher32",
    "least_significant_digit",
    "quantize_mode",
    "shuffle",
    "significant_digits",
    "szip_coding",
    "szip_pixels_per_block",
    "zlib",
))
|
cdo_toolkit/errors.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Error logging and file locking for regridding."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import traceback
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import fcntl
|
|
13
|
+
except ImportError:
|
|
14
|
+
fcntl = None
|
|
15
|
+
|
|
16
|
+
from cdo_toolkit.constants import REGRID_ERROR_LOGGER_NAME
|
|
17
|
+
|
|
18
|
+
_regrid_error_log_path: Optional[Path] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def default_log_dir() -> Path:
    """Return the ``logs`` directory under the current working directory.

    The path is only computed here; nothing is created on disk.
    """
    working_dir = Path.cwd()
    return working_dir.joinpath("logs")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def init_regrid_error_log(path: Optional[Path] = None) -> Path:
    """Set up the regrid error logger and return the log file path.

    The chosen path is remembered in the module-level
    ``_regrid_error_log_path`` so later calls without *path* reuse it.

    Args:
        path: Explicit log file location. When given it replaces the
            remembered path. When omitted and no path has been chosen yet,
            a timestamped ``regrid_errors_<timestamp>.log`` file inside
            ``default_log_dir()`` is used.

    Returns:
        The path of the log file the logger writes to.
    """
    global _regrid_error_log_path
    if path is not None:
        _regrid_error_log_path = Path(path)
        # FileHandler opens the file immediately and does not create missing
        # directories, so make sure the parent exists.
        _regrid_error_log_path.parent.mkdir(parents=True, exist_ok=True)
    elif _regrid_error_log_path is None:
        log_dir = default_log_dir()
        log_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        _regrid_error_log_path = log_dir / f"regrid_errors_{timestamp}.log"

    logger = logging.getLogger(REGRID_ERROR_LOGGER_NAME)
    logger.setLevel(logging.WARNING)
    # Keep regrid errors out of the root logger's handlers.
    logger.propagate = False

    # FileHandler stores os.path.abspath(filename) in ``baseFilename``.
    # Compare against the same normalisation: Path.resolve() also follows
    # symlinks, which would never match on a symlinked path and would attach
    # a fresh handler (duplicate lines, leaked descriptors) on every call.
    target = os.path.abspath(_regrid_error_log_path)
    already_attached = any(
        isinstance(h, logging.FileHandler)
        and getattr(h, "baseFilename", None) == target
        for h in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(_regrid_error_log_path)
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        logger.addHandler(handler)
    return _regrid_error_log_path
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def log_regrid_error(message: str, exc: Optional[BaseException] = None) -> None:
    """Write an error entry to the regrid error log.

    Args:
        message: Text describing what failed.
        exc: Optional exception whose traceback is appended to the entry.

    The logger is initialised on demand and all of its handlers are flushed
    afterwards so the entry reaches the file right away.
    """
    init_regrid_error_log()
    logger = logging.getLogger(REGRID_ERROR_LOGGER_NAME)
    if exc is not None:
        # Format the exception that was actually passed in.
        # traceback.format_exc() reports whatever exception is currently
        # being handled, which is "NoneType: None" outside an except block
        # and may be an unrelated exception inside one.
        details = "".join(
            traceback.format_exception(type(exc), exc, exc.__traceback__)
        )
        logger.error("%s\n%s", message, details)
    else:
        logger.error(message)
    for handler in logger.handlers:
        handler.flush()
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@contextmanager
def weight_file_lock(weight_path: Path):
    """Hold an exclusive lock tied to *weight_path* for the ``with`` body.

    The lock is taken with ``fcntl.flock`` on a sibling file named
    ``<weight file name>.lock``, whose directory is created if needed.
    When ``fcntl`` is unavailable the body runs without any locking.
    """
    if fcntl is not None:
        lock_file = weight_path.parent / (weight_path.name + ".lock")
        lock_file.parent.mkdir(parents=True, exist_ok=True)
        descriptor = os.open(str(lock_file), os.O_CREAT | os.O_RDWR, 0o644)
        try:
            # Blocks until the exclusive lock is granted.
            fcntl.flock(descriptor, fcntl.LOCK_EX)
            yield
        finally:
            fcntl.flock(descriptor, fcntl.LOCK_UN)
            os.close(descriptor)
    else:
        # No fcntl on this platform: proceed unlocked.
        yield
|
cdo_toolkit/memory.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Memory usage monitoring."""
|
|
2
|
+
|
|
3
|
+
import psutil
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class MemoryMonitor:
    """Remember the highest resident memory observed for this process."""

    def __init__(self):
        # Highest value seen so far by update_peak(), in GiB.
        self.peak_memory_gb = 0.0

    def get_memory_usage_gb(self) -> float:
        """Return the current process's resident set size in GiB."""
        bytes_per_gib = 1024**3
        resident_bytes = psutil.Process().memory_info().rss
        return resident_bytes / bytes_per_gib

    def update_peak(self):
        """Sample current usage and raise the recorded peak if it is higher."""
        self.peak_memory_gb = max(self.peak_memory_gb, self.get_memory_usage_gb())

    def get_peak_memory_gb(self) -> float:
        """Return the highest usage recorded so far, in GiB."""
        return self.peak_memory_gb
|
cdo_toolkit/paths.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Path helpers for weight caches and intermediate files."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def weight_cache_dir_for_input(input_path: Path) -> Path:
    """Return the ``cdo_weights`` directory next to *input_path*.

    The cache lives in the input file's own parent directory, so files in
    different directories get separate caches.
    """
    containing_dir = input_path.parent
    return containing_dir.joinpath("cdo_weights")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def is_weights_or_cache_file(path: Path) -> bool:
    """Tell whether *path* is a weight file or sits inside a weight cache.

    True when the file stem starts with ``weights_`` (case-insensitive) or
    when any path component is exactly ``cdo_weights``.
    """
    named_like_weights = path.stem.lower().startswith("weights_")
    return named_like_weights or "cdo_weights" in path.parts
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def is_intermediate_nc(path: Path) -> bool:
    """Tell whether *path* is an intermediate or product file to skip.

    Weight and cache files always count. Otherwise the lowercase file stem
    is checked for one of the markers ``_top_level``, ``_regridded``,
    ``_seafloor`` or ``_chunk_``.
    """
    if not is_weights_or_cache_file(path):
        lowered_stem = path.stem.lower()
        markers = ("_top_level", "_regridded", "_seafloor", "_chunk_")
        return any(marker in lowered_stem for marker in markers)
    return True
|