cdo-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cdo_toolkit/cli.py ADDED
@@ -0,0 +1,166 @@
1
+ """Command-line interface for CDO regridding."""
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from rich.console import Console
7
+
8
+ from cdo_toolkit.api import (
9
+ filter_files_by_variables,
10
+ get_cmip_variable_name,
11
+ parse_variable_list,
12
+ regrid_directory,
13
+ regrid_directory_both_levels,
14
+ regrid_single_file,
15
+ regrid_single_file_extreme_levels,
16
+ )
17
+
18
+
19
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the regridding command-line interface."""
    parser = argparse.ArgumentParser(description="CDO-based NetCDF regridding pipeline")
    parser.add_argument("input", type=Path, help="Input file or directory")
    parser.add_argument("-o", "--output", type=Path, help="Output file or directory")
    parser.add_argument("-r", "--resolution", nargs=2, type=float, default=[1.0, 1.0],
                        help="Target resolution (lon_res lat_res)")
    parser.add_argument("-p", "--pattern", default="*.nc", help="File pattern (for directories)")
    parser.add_argument(
        "--variable", "-V", nargs="+", default=None,
        help="CMIP variable prefix(es) to regrid (e.g. tos or tos uo). Comma-separated values allowed.",
    )
    # BooleanOptionalAction keeps the old flag (and its default of True) while
    # adding a --no-... form; with plain store_true + default=True the option
    # could never be turned off.
    parser.add_argument("--include-subdirectories", action=argparse.BooleanOptionalAction, default=True,
                        help="Include subdirectories")
    parser.add_argument("--extract-surface", action="store_true", default=False,
                        help="Extract top level only and regrid that (surface)")
    parser.add_argument("--extract-seafloor", action="store_true", default=False,
                        help="Extract seafloor values and regrid only that")
    parser.add_argument("--extreme-levels", action="store_true", default=False,
                        help="Extract and regrid both surface (top level) and seafloor for each file")
    parser.add_argument("--no-regrid-cache", action="store_true", default=False,
                        help="Do not reuse regrid weight cache (regenerate weights each time)")
    parser.add_argument("--no-seafloor-cache", action="store_true", default=False,
                        help="Do not reuse seafloor depth indices cache")
    # Verbose is on by default; --quiet is the way to turn it off.
    parser.add_argument("-v", "--verbose", action="store_true", default=True, help="Verbose output (progress UI)")
    parser.add_argument("--verbose-max", action="store_true", default=False,
                        help="Maximum verbosity: print Grid type, File size, Large file messages")
    parser.add_argument("--quiet", action="store_true", help="Disable verbose output")
    parser.add_argument("--max-workers", "-w", default=4, type=int, help="Maximum parallel workers")
    # NOTE(review): --chunk-size-gb, --max-memory-gb, --no-chunking and
    # --unlink-unprocessed are parsed but never forwarded to an API call in
    # this module; they are kept so existing command lines keep working.
    parser.add_argument("--chunk-size-gb", type=float, default=2.0,
                        help="Maximum chunk size in GB")
    parser.add_argument("--max-memory-gb", default=8.0, type=float, help="Maximum memory usage in GB")
    parser.add_argument("--no-parallel", action="store_true", default=False, help="Disable parallel processing")
    parser.add_argument("--no-chunking", action="store_true", default=False, help="Disable chunked processing")
    parser.add_argument("--use-ui", action=argparse.BooleanOptionalAction, default=True,
                        help="Use UI for processing")
    # TODO: a --cleanup mode (removing *_top_level / *_chunk_* leftovers before
    # processing) is currently disabled.
    parser.add_argument("--unlink-unprocessed", action="store_true", default=False,
                        help="Unlink unprocessed files after processing")
    parser.add_argument("--overwrite", action="store_true", default=False, help="Overwrite existing output files")
    return parser


def main() -> None:
    """Parse command-line arguments and run single-file or directory regridding.

    A file input is regridded on its own (optionally skipped when it does not
    match ``--variable``); any other input is handed to the directory routines
    and a summary of successful/failed/skipped counts is printed.

    Raises:
        SystemExit: with status 0 when a single file is skipped by the
            variable filter, status 1 when regridding fails (single file) or
            when at least one file failed (directory), and status 2 (via
            ``parser.error``) when the input path does not exist.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Fail early with a usage error instead of treating a missing path as a directory.
    if not args.input.exists():
        parser.error(f"input path does not exist: {args.input}")

    # --quiet overrides the (default-on) verbose flag.
    verbose = args.verbose and not args.quiet
    verbose_diagnostics = args.verbose_max
    use_regrid_cache = not args.no_regrid_cache
    use_seafloor_cache = not args.no_seafloor_cache
    variables = args.variable
    target_resolution = tuple(args.resolution)

    if args.input.is_file():
        # An empty filter result means this file's variable prefix was not requested.
        if variables and not filter_files_by_variables([args.input], variables):
            allowed = sorted(parse_variable_list(variables) or [])
            print(f"Skipping {args.input.name}: variable '{get_cmip_variable_name(args.input)}' not in {allowed}")
            raise SystemExit(0)

        if args.extreme_levels:
            status = regrid_single_file_extreme_levels(
                input_path=args.input,
                output_dir=args.output,
                target_resolution=target_resolution,
                use_regrid_cache=use_regrid_cache,
                use_seafloor_cache=use_seafloor_cache,
                verbose=verbose,
                overwrite=args.overwrite,
            )
            # Both products must have been produced for the run to count as a success.
            success = status["top_level"] and status["seafloor"]
        else:
            out_path = args.output
            # An existing directory is passed as output_dir; anything else as output_path.
            out_dir = out_path if (out_path and out_path.is_dir()) else None
            success = regrid_single_file(
                input_path=args.input,
                output_path=None if out_dir else out_path,
                output_dir=out_dir,
                target_resolution=target_resolution,
                extract_surface=args.extract_surface,
                extract_seafloor=args.extract_seafloor,
                use_regrid_cache=use_regrid_cache,
                use_seafloor_cache=use_seafloor_cache,
                verbose=verbose,
                verbose_diagnostics=verbose_diagnostics,
                use_ui=args.use_ui,
                overwrite=args.overwrite,
            )

        if success:
            print("Regridding successful!")
            return
        print("Regridding failed!")
        # Signal the failure to calling shells/scripts.
        raise SystemExit(1)

    # Directory processing.
    if args.extreme_levels:
        results = regrid_directory_both_levels(
            input_dir=args.input,
            output_dir=args.output,
            include_subdirectories=args.include_subdirectories,
            target_resolution=target_resolution,
            file_pattern=args.pattern,
            variables=variables,
            verbose=verbose,
            overwrite=args.overwrite,
            max_workers=args.max_workers,
            enable_parallel=not args.no_parallel,
        )
    else:
        results = regrid_directory(
            input_dir=args.input,
            output_dir=args.output,
            include_subdirectories=args.include_subdirectories,
            target_resolution=target_resolution,
            file_pattern=args.pattern,
            variables=variables,
            extract_surface=args.extract_surface,
            extract_seafloor=args.extract_seafloor,
            use_regrid_cache=use_regrid_cache,
            use_seafloor_cache=use_seafloor_cache,
            verbose=verbose,
            verbose_diagnostics=verbose_diagnostics,
            max_workers=args.max_workers,
            enable_parallel=not args.no_parallel,
            use_ui=args.use_ui,
            overwrite=args.overwrite,
        )

    # Summary of the run.
    console = Console()
    console.print(f"\n[green]Successful: {len(results['successful'])}[/green]")
    console.print(f"[red]Failed: {len(results['failed'])}[/red]")
    console.print(f"[yellow]Skipped: {len(results['skipped'])}[/yellow]")

    if results["failed"]:
        # Any failed file makes the whole run exit non-zero.
        raise SystemExit(1)


if __name__ == "__main__":
    main()
cdo_toolkit/cmip.py ADDED
@@ -0,0 +1,61 @@
1
+ """Optional CMIP6 filename conventions and filters."""
2
+
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ import xarray as xa
7
+
8
+ def parse_variable_list(variables: Optional[list[str] | str]) -> Optional[set[str]]:
9
+ """Parse CLI/YAML variable filter into a lowercase set, or None if unset."""
10
+ if not variables:
11
+ return None
12
+ tokens: list[str] = []
13
+ if isinstance(variables, str):
14
+ tokens = variables.split(",")
15
+ else:
16
+ for item in variables:
17
+ tokens.extend(item.split(","))
18
+ parsed = {token.strip().lower() for token in tokens if token.strip()}
19
+ return parsed or None
20
+
21
+
22
def get_cmip_variable_name(file_path: Path) -> str:
    """Return the lowercase part of the file stem before its first underscore.

    For a name such as ``tos_Omon_...`` this yields ``tos``. A stem without
    any underscore is returned whole (lowercased).
    """
    prefix, _, _ = file_path.stem.partition("_")
    return prefix.lower()
25
+
26
+
27
def filter_files_by_variables(
    files: list[Path],
    variables: Optional[list[str] | str],
) -> list[Path]:
    """Select the files whose variable prefix appears in *variables*.

    When *variables* parses to no names at all, the input list is returned
    unchanged (the same object, not a copy).
    """
    wanted = parse_variable_list(variables)
    if wanted:
        return [path for path in files if get_cmip_variable_name(path) in wanted]
    return files
36
+
37
+
38
def pick_representative_file(input_files: list[Path]) -> Optional[Path]:
    """Choose one file to stand for a group.

    The first file whose dataset attributes contain ``nominal_resolution`` is
    preferred; if none qualifies, the first file of the list is used.

    Returns:
        The chosen path, or ``None`` when *input_files* is empty.
    """
    if not input_files:
        return None

    def _has_nominal_resolution(candidate: Path) -> bool:
        # Any problem while opening or inspecting a file just disqualifies it.
        try:
            with xa.open_dataset(candidate, decode_times=False) as dataset:
                return "nominal_resolution" in dataset.attrs
        except Exception:
            return False

    return next(
        (candidate for candidate in input_files if _has_nominal_resolution(candidate)),
        input_files[0],
    )
50
+
51
+
52
def representative_files_by_directory(files: list[Path]) -> dict[Path, Path]:
    """Group files by parent directory and pick one representative per group.

    Returns:
        A mapping from each parent directory to the file chosen by
        ``pick_representative_file`` for that directory; directories for which
        no file is chosen are left out.
    """
    grouped: dict[Path, list[Path]] = {}
    for path in files:
        directory = path.parent
        if directory not in grouped:
            grouped[directory] = []
        grouped[directory].append(path)

    representatives: dict[Path, Path] = {}
    for directory, members in grouped.items():
        chosen = pick_representative_file(members)
        if chosen is not None:
            representatives[directory] = chosen
    return representatives
cdo_toolkit/constants.py ADDED
@@ -0,0 +1,9 @@
1
+ """Shared constants for the CDO regrid package."""
2
+
3
# Name of the logger used for regrid error reporting.
REGRID_ERROR_LOGGER_NAME = "cdo_toolkit.errors"

# Recognised encoding keys, listed alphabetically.
# NOTE(review): presumably the per-variable encoding options accepted when
# writing NetCDF4 files — confirm against the code that consumes this set.
NC4_ENCODING_KEYS = frozenset(
    (
        "_FillValue",
        "blosc_shuffle",
        "chunksizes",
        "complevel",
        "compression",
        "contiguous",
        "dtype",
        "endian",
        "fletcher32",
        "least_significant_digit",
        "quantize_mode",
        "shuffle",
        "significant_digits",
        "szip_coding",
        "szip_pixels_per_block",
        "zlib",
    )
)
cdo_toolkit/errors.py ADDED
@@ -0,0 +1,79 @@
1
+ """Error logging and file locking for regridding."""
2
+
3
+ import logging
4
+ import os
5
+ import traceback
6
+ from contextlib import contextmanager
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ try:
12
+ import fcntl
13
+ except ImportError:
14
+ fcntl = None
15
+
16
+ from cdo_toolkit.constants import REGRID_ERROR_LOGGER_NAME
17
+
18
+ _regrid_error_log_path: Optional[Path] = None
19
+
20
+
21
def default_log_dir() -> Path:
    """Return the ``logs`` directory under the current working directory."""
    return Path.cwd().joinpath("logs")
24
+
25
+
26
def init_regrid_error_log(path: Optional[Path] = None) -> Path:
    """Set up the regrid error logger and return the log file path.

    Args:
        path: Explicit log file to use. When omitted, the path chosen by an
            earlier call is reused; if there is none yet, a timestamped file
            under ``default_log_dir()`` is selected.

    Returns:
        The path of the active regrid error log.

    The call is idempotent: a file handler for the active path is attached to
    the logger only once, however often this function is invoked.
    """
    global _regrid_error_log_path
    if path is not None:
        _regrid_error_log_path = Path(path)
    elif _regrid_error_log_path is None:
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        _regrid_error_log_path = default_log_dir() / f"regrid_errors_{timestamp}.log"

    logger = logging.getLogger(REGRID_ERROR_LOGGER_NAME)
    logger.setLevel(logging.WARNING)
    logger.propagate = False

    # logging.FileHandler stores baseFilename as os.path.abspath(filename).
    # Compare against the same normalisation: Path.resolve() also follows
    # symlinks, so it can differ from baseFilename and make every call attach
    # yet another handler (duplicated log lines, leaked file descriptors).
    target = os.path.abspath(os.fspath(_regrid_error_log_path))
    already_attached = any(
        isinstance(h, logging.FileHandler)
        and getattr(h, "baseFilename", None) == target
        for h in logger.handlers
    )
    if not already_attached:
        # FileHandler opens the file right away, so its directory must exist,
        # for explicitly supplied paths as well as for the default location.
        _regrid_error_log_path.parent.mkdir(parents=True, exist_ok=True)
        handler = logging.FileHandler(_regrid_error_log_path)
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        logger.addHandler(handler)
    return _regrid_error_log_path
51
+
52
+
53
def log_regrid_error(message: str, exc: Optional[BaseException] = None) -> None:
    """Write an error entry to the regrid error log.

    Args:
        message: Text of the log entry.
        exc: Optional exception whose traceback is appended to the entry.

    The logger is initialised on demand, and every handler is flushed after
    the entry has been emitted.
    """
    init_regrid_error_log()
    logger = logging.getLogger(REGRID_ERROR_LOGGER_NAME)
    if exc is not None:
        # Format the exception that was passed in. traceback.format_exc()
        # only reports the exception currently being handled, which yields
        # "NoneType: None" when this is called outside an except block.
        details = "".join(
            traceback.format_exception(type(exc), exc, exc.__traceback__)
        )
        logger.error("%s\n%s", message, details)
    else:
        logger.error(message)
    for handler in logger.handlers:
        handler.flush()
63
+
64
+
65
@contextmanager
def weight_file_lock(weight_path: Path):
    """Hold an exclusive ``flock`` on a lock file next to *weight_path*.

    The lock file is ``<weight file name>.lock`` in the weight file's
    directory, which is created if missing. When ``fcntl`` is unavailable
    (``fcntl is None``) the context manager does no locking at all.

    Args:
        weight_path: Path of the weight file to guard.
    """
    if fcntl is None:
        yield
        return

    lock_path = weight_path.parent / f"{weight_path.name}.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR, 0o644)
    try:
        fcntl.flock(fd, fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.flock(fd, fcntl.LOCK_UN)
    finally:
        # Always close the descriptor, even if locking or unlocking raised,
        # so it cannot leak.
        os.close(fd)
cdo_toolkit/memory.py ADDED
@@ -0,0 +1,22 @@
1
+ """Memory usage monitoring."""
2
+
3
+ import psutil
4
+
5
+
6
class MemoryMonitor:
    """Remember the highest resident memory seen across ``update_peak`` calls."""

    def __init__(self):
        # Highest value observed so far, in GiB (bytes / 1024**3).
        self.peak_memory_gb = 0.0

    def get_memory_usage_gb(self) -> float:
        """Return the current process's resident set size in GiB."""
        rss_bytes = psutil.Process().memory_info().rss
        return rss_bytes / (1024**3)

    def update_peak(self):
        """Sample current usage and raise the stored peak if it was exceeded."""
        self.peak_memory_gb = max(self.peak_memory_gb, self.get_memory_usage_gb())

    def get_peak_memory_gb(self) -> float:
        """Return the highest usage recorded so far, in GiB."""
        return self.peak_memory_gb
cdo_toolkit/paths.py ADDED
@@ -0,0 +1,30 @@
1
+ """Path helpers for weight caches and intermediate files."""
2
+
3
+ from pathlib import Path
4
+
5
+
6
def weight_cache_dir_for_input(input_path: Path) -> Path:
    """Return the ``cdo_weights`` directory located beside *input_path*.

    Each input directory gets its own cache directory.
    """
    return input_path.parent.joinpath("cdo_weights")
9
+
10
+
11
def is_weights_or_cache_file(path: Path) -> bool:
    """Tell whether *path* is a weight file or sits inside a weight cache.

    True when the file stem starts with ``weights_`` (case-insensitive) or
    when any component of the path is exactly ``cdo_weights``.
    """
    named_like_weights = path.stem.lower().startswith("weights_")
    return named_like_weights or "cdo_weights" in path.parts
18
+
19
+
20
def is_intermediate_nc(path: Path) -> bool:
    """Tell whether *path* is an intermediate or derived file to leave alone.

    True for weight/cache files and for any file whose lowercase stem
    contains ``_top_level``, ``_regridded``, ``_seafloor`` or ``_chunk_``.
    """
    if is_weights_or_cache_file(path):
        return True
    lowered_stem = path.stem.lower()
    markers = ("_top_level", "_regridded", "_seafloor", "_chunk_")
    return any(marker in lowered_stem for marker in markers)