cdo-toolkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cdo_toolkit/__init__.py +47 -0
- cdo_toolkit/__main__.py +6 -0
- cdo_toolkit/api.py +573 -0
- cdo_toolkit/cli.py +166 -0
- cdo_toolkit/cmip.py +61 -0
- cdo_toolkit/constants.py +9 -0
- cdo_toolkit/errors.py +79 -0
- cdo_toolkit/memory.py +22 -0
- cdo_toolkit/paths.py +30 -0
- cdo_toolkit/pipeline.py +2230 -0
- cdo_toolkit/resolution.py +19 -0
- cdo_toolkit/timing.py +36 -0
- cdo_toolkit/ui.py +650 -0
- cdo_toolkit/workers.py +277 -0
- cdo_toolkit-0.1.0.dist-info/METADATA +78 -0
- cdo_toolkit-0.1.0.dist-info/RECORD +19 -0
- cdo_toolkit-0.1.0.dist-info/WHEEL +4 -0
- cdo_toolkit-0.1.0.dist-info/entry_points.txt +2 -0
- cdo_toolkit-0.1.0.dist-info/licenses/LICENSE +28 -0
cdo_toolkit/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""CDO-based NetCDF regridding toolkit."""
|
|
2
|
+
|
|
3
|
+
from cdo_toolkit.api import (
|
|
4
|
+
extract_seafloor_single_file,
|
|
5
|
+
regrid_directory,
|
|
6
|
+
regrid_directory_both_levels,
|
|
7
|
+
regrid_large_files,
|
|
8
|
+
regrid_single_file,
|
|
9
|
+
regrid_single_file_both_levels,
|
|
10
|
+
regrid_single_file_extreme_levels,
|
|
11
|
+
)
|
|
12
|
+
from cdo_toolkit.cmip import (
|
|
13
|
+
filter_files_by_variables,
|
|
14
|
+
get_cmip_variable_name,
|
|
15
|
+
parse_variable_list,
|
|
16
|
+
pick_representative_file,
|
|
17
|
+
representative_files_by_directory,
|
|
18
|
+
)
|
|
19
|
+
from cdo_toolkit.errors import default_log_dir, init_regrid_error_log, log_regrid_error
|
|
20
|
+
from cdo_toolkit.paths import is_intermediate_nc, is_weights_or_cache_file, weight_cache_dir_for_input
|
|
21
|
+
from cdo_toolkit.pipeline import CDORegridPipeline
|
|
22
|
+
from cdo_toolkit.workers import process_single_file_standalone
|
|
23
|
+
|
|
24
|
+
# Public API of the package, kept in alphabetical order so additions are easy
# to place and diffs stay minimal.
__all__ = [
    "CDORegridPipeline",
    "default_log_dir",
    "extract_seafloor_single_file",
    "filter_files_by_variables",
    "get_cmip_variable_name",
    "init_regrid_error_log",
    "is_intermediate_nc",
    "is_weights_or_cache_file",
    "log_regrid_error",
    "parse_variable_list",
    "pick_representative_file",
    "process_single_file_standalone",
    "regrid_directory",
    "regrid_directory_both_levels",
    "regrid_large_files",
    "regrid_single_file",
    "regrid_single_file_both_levels",
    "regrid_single_file_extreme_levels",
    "representative_files_by_directory",
    "weight_cache_dir_for_input",
]
|
|
46
|
+
|
|
47
|
+
__version__ = "0.1.0"
|
cdo_toolkit/__main__.py
ADDED
cdo_toolkit/api.py
ADDED
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
"""High-level regridding API."""
|
|
2
|
+
|
|
3
|
+
import multiprocessing as mp
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
import xarray as xa
|
|
11
|
+
|
|
12
|
+
from cdo_toolkit.cmip import (
|
|
13
|
+
filter_files_by_variables,
|
|
14
|
+
get_cmip_variable_name,
|
|
15
|
+
parse_variable_list,
|
|
16
|
+
pick_representative_file,
|
|
17
|
+
representative_files_by_directory,
|
|
18
|
+
)
|
|
19
|
+
from cdo_toolkit.errors import init_regrid_error_log
|
|
20
|
+
from cdo_toolkit.paths import is_intermediate_nc
|
|
21
|
+
from cdo_toolkit.pipeline import CDORegridPipeline
|
|
22
|
+
from cdo_toolkit.timing import format_processing_time, get_processing_time, print_timestamp
|
|
23
|
+
from cdo_toolkit.ui import RegridProgressUI
|
|
24
|
+
|
|
25
|
+
def extract_seafloor_single_file(
    input_path: Path,
    output_path: Optional[Path] = None,
    verbose: bool = True,
    overwrite: bool = False,
) -> Path:
    """
    Extract the seafloor values of one file using a seafloor-only pipeline.

    Args:
    - input_path (Path): File to extract seafloor values from
    - output_path (Path): Expected output location; when None it defaults to
      ``<stem>_seafloor<suffix>`` next to ``input_path``
    - verbose (bool): Print status messages on the pipeline console
    - overwrite (bool): When False, an already existing ``output_path`` is returned as-is

    Returns (Path): The pre-existing ``output_path`` when extraction is skipped,
    otherwise the path returned by the pipeline's seafloor extraction

    Raises:
    - Exception: Any error raised by the extraction is re-raised unchanged
      (after an optional console message)
    """
    pipeline = CDORegridPipeline(extract_seafloor=True, verbose=verbose)

    expected = output_path
    if expected is None:
        default_name = f"{input_path.stem}_seafloor{input_path.suffix}"
        expected = input_path.parent / default_name

    # Skip the work entirely when the result is already on disk.
    if expected.exists() and not overwrite:
        if verbose:
            pipeline.console.print(f"[yellow]Seafloor file already exists: {expected.name}[/yellow]")
        return expected

    # NOTE(review): only ``input_path`` is handed to the extraction call, so a
    # caller-supplied ``output_path`` is used for the existence check above but
    # is not forwarded here - confirm against ``_extract_seafloor_values``.
    try:
        produced = pipeline._extract_seafloor_values(input_path)
        if verbose:
            pipeline.console.print(f"[green]Seafloor file created: {produced}[/green]")
        return produced
    except Exception as exc:
        if verbose:
            pipeline.console.print(f"[red]Failed to extract seafloor: {exc}[/red]")
        raise
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def regrid_single_file(
    input_path: Path,
    output_path: Optional[Path] = None,
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    extract_surface: bool = False,
    extract_seafloor: bool = False,
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    verbose_diagnostics: bool = False,
    cleanup_weights: bool = False,
    overwrite: bool = False,
    use_ui: bool = True,
) -> bool:
    """
    Convenience function to regrid a single file.

    Pipeline: if neither extract_surface nor extract_seafloor, regrid whole file;
    if extract_surface, extract top level and regrid; if extract_seafloor, extract
    seafloor and regrid. For both surface and seafloor use regrid_single_file_extreme_levels.

    Args:
    - input_path (Path): Path to input file
    - output_path (Path): Path to output file
    - output_dir (Path): Directory for the output; only used when ``output_path``
      is None, in which case the filename is generated by the pipeline
    - target_resolution (tuple): Target resolution as (lon_res, lat_res)
    - extract_surface (bool): Extract top level only and regrid that
    - extract_seafloor (bool): Extract seafloor values and regrid only that
    - use_regrid_cache (bool): Reuse existing regrid weight files when present
    - use_seafloor_cache (bool): Reuse seafloor depth indices cache
    - verbose (bool): Enable verbose output (progress UI)
    - verbose_diagnostics (bool): If True, print Grid type, File size, Large file (max verbosity)
    - cleanup_weights (bool): Clean up weights after processing
    - overwrite (bool): If True, overwrite existing output files
    - use_ui (bool): Use rich progress UI (only effective together with ``verbose``)

    Returns (bool): The result of ``pipeline.regrid_file`` (True if successful, False otherwise)
    """
    # Local import: ``time`` is not part of this module's top-level imports.
    import time

    pipeline = CDORegridPipeline(
        target_resolution=target_resolution,
        extract_surface=extract_surface,
        extract_seafloor=extract_seafloor,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        verbose_diagnostics=verbose_diagnostics,
        cleanup_weights=cleanup_weights,
    )

    # Derive the output path from the output directory when only the latter is given.
    if output_dir is not None and output_path is None:
        has_level = pipeline._has_level_lightweight(input_path)
        filename = pipeline._generate_output_filename(
            input_path, has_level, extract_surface, extract_seafloor
        )
        output_path = output_dir / filename

    error_log_path = init_regrid_error_log()
    pipeline.set_error_log_path(error_log_path)

    # Initialize UI if requested
    ui = None
    if use_ui and verbose:
        ui = RegridProgressUI(
            [input_path],
            verbose=verbose,
            verbose_diagnostics=pipeline.verbose_diagnostics,
            log_file=error_log_path,
        )
        ui.__enter__()

    # Everything after ``ui.__enter__()`` runs inside the try so the UI is
    # always torn down, even if the START timestamp or the regrid itself fails.
    timing_started = False
    start_time = None
    try:
        start_time = print_timestamp(pipeline.console, "START") if verbose else time.localtime()
        timing_started = True
        return pipeline.regrid_file(input_path, output_path, overwrite=overwrite, ui=ui)
    finally:
        if ui is not None:
            try:
                # The UI only exists when ``verbose`` is set, so the END
                # timestamp is always printed on this path. Timing is skipped
                # if the START timestamp was never obtained.
                if timing_started:
                    end_time = print_timestamp(pipeline.console, "END")
                    pipeline.stats['processing_time'] = format_processing_time(
                        get_processing_time(start_time, end_time)
                    )
                ui._update_stats(pipeline.stats)
                ui.print_summary()
            finally:
                # Guarantee the UI is exited even if the summary fails.
                ui.__exit__(None, None, None)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def regrid_single_file_both_levels(
    input_path: Path,
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    cleanup_weights: bool = False,
    overwrite: bool = False,
    weight_cache_dir: Optional[Path] = None,
) -> dict[str, bool]:
    """
    Run the seafloor stream and then the surface (top-level) stream for one file.

    Two independent pipelines are built - one with ``extract_seafloor=True`` and
    one with ``extract_surface=True`` - and each regrids ``input_path`` in turn,
    seafloor first.

    Parameters
    ----------
    input_path : Path
        Input NetCDF file.
    output_dir : Path, optional
        Passed to each pipeline's ``regrid_file`` as its ``output_path`` argument.
    target_resolution : tuple[float, float]
        Target grid resolution.
    use_regrid_cache : bool
        Forwarded to both pipelines (reuse of regrid weight files).
    use_seafloor_cache : bool
        Forwarded to both pipelines (reuse of seafloor depth indices cache).
    verbose : bool
        Forwarded to both pipelines.
    cleanup_weights : bool
        Forwarded to both pipelines.
    overwrite : bool
        Forwarded to ``regrid_file``; when True, an existing
        ``<stem>_seafloor<suffix>`` file next to the input is deleted first.
    weight_cache_dir : Path, optional
        Forwarded to both pipelines as the weight cache directory.

    Returns
    -------
    dict[str, bool]
        ``{'top_level': ..., 'seafloor': ...}`` holding the value returned by
        ``regrid_file`` for each stream.
    """
    # Options that are identical for the two pipelines.
    common = dict(
        target_resolution=target_resolution,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        cleanup_weights=cleanup_weights,
        weight_cache_dir=weight_cache_dir,
    )

    # --- Stream 1: seafloor -------------------------------------------------
    seafloor_pipeline = CDORegridPipeline(
        extract_surface=False, extract_seafloor=True, **common
    )
    if overwrite:
        # Drop a stale intermediate so the seafloor extraction starts fresh.
        stale = input_path.parent / f"{input_path.stem}_seafloor{input_path.suffix}"
        if stale.exists():
            stale.unlink()
    seafloor_success = seafloor_pipeline.regrid_file(
        input_path=input_path, output_path=output_dir, overwrite=overwrite, ui=None
    )

    # --- Stream 2: surface (top level) --------------------------------------
    top_pipeline = CDORegridPipeline(
        extract_surface=True, extract_seafloor=False, **common
    )
    top_success = top_pipeline.regrid_file(
        input_path=input_path, output_path=output_dir, overwrite=overwrite, ui=None
    )

    return {"top_level": top_success, "seafloor": seafloor_success}


# Alias for CLI --extreme-levels
regrid_single_file_extreme_levels = regrid_single_file_both_levels
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def process_directory_both_levels(
    dir_path: Path,
    file_list: list[Path],
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    overwrite: bool = False,
) -> list[tuple[Path, dict[str, bool]]]:
    """
    Regrid seafloor and surface streams for every file of a single directory.

    One seafloor pipeline and one surface pipeline are created up front and
    reused for all files; both use ``<dir_path>/cdo_weights`` as weight cache
    directory and never clean up weights. Files are handled one after another,
    seafloor first, then surface.

    Returns a list of ``(file, {'top_level': ..., 'seafloor': ...})`` tuples in
    the order of ``file_list``.
    """
    shared_weights = dir_path / "cdo_weights"

    def _make_pipeline(surface: bool, seafloor: bool) -> CDORegridPipeline:
        # Both pipelines differ only in which extraction flag is switched on.
        return CDORegridPipeline(
            target_resolution=target_resolution,
            extract_surface=surface,
            extract_seafloor=seafloor,
            use_regrid_cache=use_regrid_cache,
            use_seafloor_cache=use_seafloor_cache,
            verbose=verbose,
            cleanup_weights=False,
            weight_cache_dir=shared_weights,
        )

    seafloor_pipeline = _make_pipeline(surface=False, seafloor=True)
    top_pipeline = _make_pipeline(surface=True, seafloor=False)

    def _both_streams(source: Path) -> tuple[Path, dict[str, bool]]:
        # Seafloor runs before the surface stream for each file.
        seafloor_ok = seafloor_pipeline.regrid_file(
            input_path=source, output_path=output_dir, overwrite=overwrite, ui=None
        )
        top_ok = top_pipeline.regrid_file(
            input_path=source, output_path=output_dir, overwrite=overwrite, ui=None
        )
        return (source, {"top_level": top_ok, "seafloor": seafloor_ok})

    return [_both_streams(source) for source in file_list]
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _worker_process_directory_both_levels(
    args: tuple,
) -> list[tuple[Path, dict[str, bool]]]:
    """Unpack a 6-tuple of job arguments and run ``process_directory_both_levels`` on it.

    ``args`` is ``(dir_path, file_list, output_dir, target_resolution, verbose,
    overwrite)``; taking a single tuple lets the function be used with
    ``executor.map``.
    """
    directory, files, out_dir, resolution, be_verbose, force = args
    return process_directory_both_levels(
        directory,
        files,
        out_dir,
        resolution,
        verbose=be_verbose,
        overwrite=force,
    )
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _worker_both_levels(
    args: tuple,
) -> tuple[Path, dict[str, bool]]:
    """Run ``regrid_single_file_both_levels`` for one file from a packed argument tuple.

    ``args`` is ``(input_path, output_dir, target_resolution, verbose, overwrite)``.
    The weight cache is placed in a temp-dir folder named after the current
    process id, so separate worker processes do not share weight files.
    Returns ``(input_path, status)``.
    """
    source, out_dir, resolution, be_verbose, force = args

    # One weight directory per process id keeps parallel workers apart.
    per_process_weights = Path(tempfile.gettempdir()) / f"cdo_weights_{os.getpid()}"

    outcome = regrid_single_file_both_levels(
        input_path=source,
        output_dir=out_dir,
        target_resolution=resolution,
        verbose=be_verbose,
        cleanup_weights=False,
        overwrite=force,
        weight_cache_dir=per_process_weights,
    )
    return (source, outcome)
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def regrid_directory(
    input_dir: Path,
    include_subdirectories: bool = False,
    output_dir: Optional[Path] = None,
    target_resolution: Optional[tuple[float, float]] = None,
    file_pattern: str = "*.nc",
    variables: Optional[list[str] | str] = None,
    extract_surface: bool = False,
    extract_seafloor: bool = False,
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    verbose_diagnostics: bool = False,
    max_workers: Optional[int] = 4,
    enable_parallel: bool = True,
    overwrite: bool = False,
    use_ui: bool = True,
) -> dict[str, list[Path]]:
    """
    Convenience function to regrid all files in a directory.

    Args:
    - input_dir (Path): Input directory containing NetCDF files
    - include_subdirectories (bool): If True, search ``input_dir`` recursively
    - output_dir (Path): Output directory for regridded files
    - target_resolution (tuple, optional): Target resolution as (lon_res, lat_res).
      Defaults to None and is forwarded unchanged to the pipeline
      (NOTE(review): sibling functions default to (1.0, 1.0) - confirm how the
      pipeline interprets None)
    - file_pattern (str): File pattern to match (e.g., "*.nc", "*.nc4")
    - variables (list[str] | str, optional): CMIP variable prefix(es) to process (e.g. ``tos`` or ``tos,uo``)
    - extract_surface (bool): Extract top level only and regrid that
    - extract_seafloor (bool): Extract seafloor values and regrid only that
    - use_regrid_cache (bool): Reuse existing regrid weight files
    - use_seafloor_cache (bool): Reuse seafloor depth indices cache
    - verbose (bool): Enable verbose output (progress UI)
    - verbose_diagnostics (bool): If True, print Grid type, File size, Large file (max verbosity)
    - max_workers (int): Maximum number of parallel workers
    - enable_parallel (bool): Enable parallel processing
    - overwrite (bool): If True, overwrite existing output files
    - use_ui (bool): Use rich progress UI

    Returns (dict[str, list[Path]]): Dictionary mapping status to list of file paths
    (empty ``successful``/``failed``/``skipped`` lists when nothing matches)
    """
    # find all matching files (exclude intermediates: _top_level, _regridded, _seafloor, _chunk_)
    find = input_dir.rglob if include_subdirectories else input_dir.glob
    input_files = [p for p in find(file_pattern) if not is_intermediate_nc(p)]
    input_files = filter_files_by_variables(input_files, variables)

    if not input_files:
        var_msg = f" and variables {sorted(parse_variable_list(variables) or [])}" if variables else ""
        print(f"No files found matching pattern '{file_pattern}'{var_msg} in {input_dir}")
        return {"successful": [], "failed": [], "skipped": []}

    error_log_path = init_regrid_error_log()
    if verbose:
        print(f"Regrid errors log: {error_log_path}")

    # create pipeline
    pipeline = CDORegridPipeline(
        target_resolution=target_resolution,
        extract_surface=extract_surface,
        extract_seafloor=extract_seafloor,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        verbose_diagnostics=verbose_diagnostics,
        max_workers=max_workers,
        enable_parallel=enable_parallel,
    )
    pipeline.set_error_log_path(error_log_path)

    # process files, then print statistics
    results = pipeline.regrid_batch(input_files, output_dir, overwrite=overwrite, use_ui=use_ui)
    pipeline.print_statistics()

    return results
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def regrid_directory_both_levels(
    input_dir: Path,
    include_subdirectories: bool = False,
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    file_pattern: str = "*.nc",
    variables: Optional[list[str] | str] = None,
    verbose: bool = True,
    overwrite: bool = False,
    max_workers: Optional[int] = 4,
    enable_parallel: bool = True,
) -> dict[str, list[Path]]:
    """
    Regrid both the top level and the seafloor values for all files
    in a directory.

    Matching files are grouped by their parent directory and each group is
    handed to :func:`process_directory_both_levels`, which processes the files
    of that directory sequentially with shared pipelines and the
    ``<dir>/cdo_weights`` weight cache. When ``enable_parallel`` is True, there
    are at least two directories and ``max_workers`` is greater than 1, the
    directories are processed in parallel (one job per directory); otherwise
    they are processed one after another.

    A directory whose processing raises is not fatal in either mode: all files
    of that directory are reported as failed and the remaining directories are
    still processed.

    Parameters
    ----------
    input_dir : Path
        Directory containing input NetCDF files.
    include_subdirectories : bool
        Recurse into subdirectories.
    output_dir : Path, optional
        Directory for outputs (defaults to alongside inputs when None).
    target_resolution : tuple[float, float]
        Target grid resolution.
    file_pattern : str
        Glob pattern for selecting input files.
    variables : list[str] | str, optional
        CMIP variable prefix(es) used to filter the matched files.
    verbose : bool
        Enable verbose logging.
    overwrite : bool
        Overwrite existing outputs.
    max_workers : int, optional
        Maximum parallel workers (default 4). Used only if enable_parallel is True.
    enable_parallel : bool
        Process multiple directories in parallel (default True).

    Returns
    -------
    dict[str, list[Path]]
        Dictionary with ``'successful'``, ``'failed'`` and ``'skipped'``
        keys mapping to lists of input file paths. A file is successful only
        when both its top-level and seafloor streams succeeded; ``'skipped'``
        is never populated by this function.
    """
    # Only process source files; exclude intermediates (_top_level, _regridded, _seafloor, _chunk_)
    find = input_dir.rglob if include_subdirectories else input_dir.glob
    input_files = [p for p in find(file_pattern) if not is_intermediate_nc(p)]
    input_files = filter_files_by_variables(input_files, variables)

    if not input_files:
        var_msg = f" and variables {sorted(parse_variable_list(variables) or [])}" if variables else ""
        print(f"No files found matching pattern '{file_pattern}'{var_msg} in {input_dir}")
        return {"successful": [], "failed": [], "skipped": []}

    results: dict[str, list[Path]] = {
        "successful": [],
        "failed": [],
        "skipped": [],
    }

    def _record(path: Path, status: dict[str, bool]) -> None:
        # A file counts as successful only if BOTH streams succeeded; a partial
        # success is reported as failed, exactly like a total failure.
        both_ok = status["top_level"] and status["seafloor"]
        results["successful" if both_ok else "failed"].append(path)

    # Group files by directory so one worker owns each directory (reuses seafloor cache + weight cache)
    by_dir: dict[Path, list[Path]] = {}
    for fp in input_files:
        by_dir.setdefault(fp.parent, []).append(fp)
    dir_jobs = [(d, sorted(fs)) for d, fs in by_dir.items()]

    use_parallel = (
        enable_parallel
        and len(dir_jobs) >= 2
        and max_workers is not None
        and max_workers > 1
    )

    if use_parallel:
        n_workers = min(max_workers, len(dir_jobs), mp.cpu_count())
        if verbose:
            print(f"Processing {len(dir_jobs)} directories in parallel with {n_workers} workers (files within each directory processed sequentially with shared cache).")
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            # Submit one job per directory and keep the file list next to its
            # future so a crashed worker can be mapped back to its files.
            pending = [
                (
                    file_list,
                    executor.submit(
                        _worker_process_directory_both_levels,
                        (dir_path, file_list, output_dir, target_resolution, verbose, overwrite),
                    ),
                )
                for dir_path, file_list in dir_jobs
            ]
            # Collect in submission order so results keep the directory order.
            for file_list, future in pending:
                try:
                    file_results = future.result()
                except Exception:
                    # Mirror the sequential branch: a failing directory marks
                    # its files as failed instead of aborting the whole run.
                    results["failed"].extend(file_list)
                    continue
                for fp, status in file_results:
                    _record(fp, status)
    else:
        for dir_path, file_list in dir_jobs:
            try:
                file_results = process_directory_both_levels(
                    dir_path=dir_path,
                    file_list=file_list,
                    output_dir=output_dir,
                    target_resolution=target_resolution,
                    verbose=verbose,
                    overwrite=overwrite,
                )
            except Exception:
                results["failed"].extend(file_list)
                continue
            for fp, status in file_results:
                _record(fp, status)

    return results
|
|
536
|
+
|
|
537
|
+
def regrid_large_files(
    input_files: list[Path],
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    chunk_size_gb: float = 2.0,
    max_memory_gb: float = 8.0,
    verbose: bool = True,
    overwrite: bool = False,
) -> dict[str, list[Path]]:
    """
    Batch-regrid files with a pipeline that has chunking and memory monitoring enabled.

    Args:
    - input_files (list[Path]): Files to regrid
    - output_dir (Path): Output directory handed to the batch run
    - target_resolution (tuple): Target resolution as (lon_res, lat_res)
    - chunk_size_gb (float): Chunk size limit in GB, forwarded to the pipeline
    - max_memory_gb (float): Memory limit in GB, forwarded to the pipeline
    - verbose (bool): Enable verbose output
    - overwrite (bool): If True, overwrite existing output files

    Returns (dict[str, list[Path]]): The result of ``pipeline.regrid_batch``
    (status mapped to lists of file paths)
    """
    pipeline_options = {
        "target_resolution": target_resolution,
        "chunk_size_gb": chunk_size_gb,
        "max_memory_gb": max_memory_gb,
        "verbose": verbose,
        # These two switches are what distinguish this helper from the
        # other convenience functions.
        "enable_chunking": True,
        "memory_monitoring": True,
    }
    large_file_pipeline = CDORegridPipeline(**pipeline_options)

    batch_outcome = large_file_pipeline.regrid_batch(
        input_files, output_dir, overwrite=overwrite
    )
    # Statistics are printed after the batch finished, before returning.
    large_file_pipeline.print_statistics()

    return batch_outcome
|
|
573
|
+
|