cdo-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ """CDO-based NetCDF regridding toolkit."""
2
+
3
+ from cdo_toolkit.api import (
4
+ extract_seafloor_single_file,
5
+ regrid_directory,
6
+ regrid_directory_both_levels,
7
+ regrid_large_files,
8
+ regrid_single_file,
9
+ regrid_single_file_both_levels,
10
+ regrid_single_file_extreme_levels,
11
+ )
12
+ from cdo_toolkit.cmip import (
13
+ filter_files_by_variables,
14
+ get_cmip_variable_name,
15
+ parse_variable_list,
16
+ pick_representative_file,
17
+ representative_files_by_directory,
18
+ )
19
+ from cdo_toolkit.errors import default_log_dir, init_regrid_error_log, log_regrid_error
20
+ from cdo_toolkit.paths import is_intermediate_nc, is_weights_or_cache_file, weight_cache_dir_for_input
21
+ from cdo_toolkit.pipeline import CDORegridPipeline
22
+ from cdo_toolkit.workers import process_single_file_standalone
23
+
24
# Public API of the package. Every name listed here is imported at the top of
# this module, so it is what ``from cdo_toolkit import *`` re-exports.
__all__ = [
    "CDORegridPipeline",
    "default_log_dir",
    "extract_seafloor_single_file",
    "filter_files_by_variables",
    "get_cmip_variable_name",
    "init_regrid_error_log",
    "is_intermediate_nc",
    "is_weights_or_cache_file",
    "log_regrid_error",
    "parse_variable_list",
    "pick_representative_file",
    "process_single_file_standalone",
    "representative_files_by_directory",
    "regrid_directory",
    "regrid_directory_both_levels",
    "regrid_large_files",
    "regrid_single_file",
    "regrid_single_file_both_levels",
    "regrid_single_file_extreme_levels",
    "weight_cache_dir_for_input",
]

# Package version string exposed as ``cdo_toolkit.__version__``.
__version__ = "0.1.0"
@@ -0,0 +1,6 @@
1
+ """Allow ``python -m cdo_toolkit``."""
2
+
3
+ from cdo_toolkit.cli import main
4
+
5
# Invoke the CLI entry point only when this module is executed directly
# (``python -m cdo_toolkit``); importing the module has no side effect.
if __name__ == "__main__":
    main()
cdo_toolkit/api.py ADDED
@@ -0,0 +1,573 @@
1
+ """High-level regridding API."""
2
+
3
+ import multiprocessing as mp
4
+ import os
5
+ import tempfile
6
+ from concurrent.futures import ProcessPoolExecutor
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import xarray as xa
11
+
12
+ from cdo_toolkit.cmip import (
13
+ filter_files_by_variables,
14
+ get_cmip_variable_name,
15
+ parse_variable_list,
16
+ pick_representative_file,
17
+ representative_files_by_directory,
18
+ )
19
+ from cdo_toolkit.errors import init_regrid_error_log
20
+ from cdo_toolkit.paths import is_intermediate_nc
21
+ from cdo_toolkit.pipeline import CDORegridPipeline
22
+ from cdo_toolkit.timing import format_processing_time, get_processing_time, print_timestamp
23
+ from cdo_toolkit.ui import RegridProgressUI
24
+
25
def extract_seafloor_single_file(
    input_path: Path,
    output_path: Optional[Path] = None,
    verbose: bool = True,
    overwrite: bool = False,
) -> Path:
    """
    Convenience function to extract seafloor values from a single file.

    Args:
    - input_path (Path): Path to input file
    - output_path (Path): Where the seafloor file should end up. If None,
      ``<stem>_seafloor<suffix>`` next to ``input_path`` is used.
    - verbose (bool): Enable verbose output
    - overwrite (bool): If False and ``output_path`` already exists, that file
      is returned untouched. If True, an existing ``<stem>_seafloor<suffix>``
      intermediate next to the input is removed first (the same thing
      ``regrid_single_file_both_levels`` does) so it is regenerated rather
      than reused.

    Returns (Path): Path to the seafloor-extracted file. This is
    ``output_path`` whenever the pipeline wrote its result somewhere else and
    the file had to be relocated.

    Raises:
    - Exception: any error raised during extraction or relocation is reported
      (when verbose) and re-raised unchanged.
    """
    import shutil  # local import: only needed to relocate the pipeline's result

    pipeline = CDORegridPipeline(
        extract_seafloor=True,
        verbose=verbose,
    )

    # Default location of the seafloor intermediate, next to the input file.
    default_path = input_path.parent / f"{input_path.stem}_seafloor{input_path.suffix}"
    if output_path is None:
        output_path = default_path

    if output_path.exists() and not overwrite:
        if verbose:
            pipeline.console.print(f"[yellow]Seafloor file already exists: {output_path.name}[/yellow]")
        return output_path

    try:
        # Drop a stale intermediate so an overwrite request really regenerates it.
        if overwrite and default_path.exists():
            default_path.unlink()

        # NOTE(review): _extract_seafloor_values only receives the input path, so
        # it presumably writes to the default location -- confirm in the pipeline.
        seafloor_path = Path(pipeline._extract_seafloor_values(input_path))

        # Honour a caller-supplied output_path: previously it was only used for
        # the existence check and the result stayed wherever the pipeline put it.
        if seafloor_path.resolve() != output_path.resolve():
            output_path.parent.mkdir(parents=True, exist_ok=True)
            if output_path.exists():
                # Only reachable with overwrite=True (see the early return above).
                output_path.unlink()
            shutil.move(str(seafloor_path), str(output_path))
            seafloor_path = output_path

        if verbose:
            pipeline.console.print(f"[green]Seafloor file created: {seafloor_path}[/green]")
        return seafloor_path
    except Exception as e:
        if verbose:
            pipeline.console.print(f"[red]Failed to extract seafloor: {e}[/red]")
        raise
67
+
68
+
69
def regrid_single_file(
    input_path: Path,
    output_path: Optional[Path] = None,
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    extract_surface: bool = False,
    extract_seafloor: bool = False,
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    verbose_diagnostics: bool = False,
    cleanup_weights: bool = False,
    overwrite: bool = False,
    use_ui: bool = True,
) -> bool:
    """
    Convenience function to regrid a single file.

    Pipeline: if neither extract_surface nor extract_seafloor, regrid whole file;
    if extract_surface, extract top level and regrid; if extract_seafloor, extract
    seafloor and regrid. For both surface and seafloor use regrid_single_file_extreme_levels.

    Args:
    - input_path (Path): Path to input file
    - output_path (Path): Path to output file; forwarded to the pipeline as-is
      (may be None)
    - output_dir (Path): Used only when ``output_path`` is None: the output
      file name is generated by the pipeline and placed in this directory
    - target_resolution (tuple): Target resolution as (lon_res, lat_res)
    - extract_surface (bool): Extract top level only and regrid that
    - extract_seafloor (bool): Extract seafloor values and regrid only that
    - use_regrid_cache (bool): Reuse existing regrid weight files when present
    - use_seafloor_cache (bool): Reuse seafloor depth indices cache
    - verbose (bool): Enable verbose output (progress UI)
    - verbose_diagnostics (bool): If True, print Grid type, File size, Large file (max verbosity)
    - cleanup_weights (bool): Clean up weights after processing
    - overwrite (bool): If True, overwrite existing output files
    - use_ui (bool): Use rich progress UI (only takes effect when ``verbose`` is True)

    Returns (bool): The result of ``pipeline.regrid_file`` (True if successful,
    False otherwise)
    """
    pipeline = CDORegridPipeline(
        target_resolution=target_resolution,
        extract_surface=extract_surface,
        extract_seafloor=extract_seafloor,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        verbose_diagnostics=verbose_diagnostics,
        cleanup_weights=cleanup_weights,
    )

    # Derive the output file from output_dir only when no explicit path was given.
    if output_dir is not None and output_path is None:
        has_level = pipeline._has_level_lightweight(input_path)
        filename = pipeline._generate_output_filename(
            input_path, has_level, extract_surface, extract_seafloor
        )
        output_path = output_dir / filename

    error_log_path = init_regrid_error_log()
    pipeline.set_error_log_path(error_log_path)

    # The progress UI exists only when both use_ui and verbose are set, so every
    # "ui is not None" branch below implies verbose is True.
    ui = None
    if use_ui and verbose:
        ui = RegridProgressUI(
            [input_path],
            verbose=verbose,
            verbose_diagnostics=pipeline.verbose_diagnostics,
            log_file=error_log_path,
        )
        ui.__enter__()

    # Outer try/finally: once the UI has been entered it must always be exited,
    # even if the timestamp, the regrid call or the summary itself raises.
    try:
        start_time = print_timestamp(pipeline.console, "START") if verbose else None
        try:
            return pipeline.regrid_file(input_path, output_path, overwrite=overwrite, ui=ui)
        finally:
            if ui is not None:
                # Add timing to stats and show the summary while the UI is still live.
                end_time = print_timestamp(pipeline.console, "END")
                pipeline.stats['processing_time'] = format_processing_time(
                    get_processing_time(start_time, end_time)
                )
                ui._update_stats(pipeline.stats)
                ui.print_summary()
    finally:
        if ui is not None:
            ui.__exit__(None, None, None)
151
+
152
+
153
def regrid_single_file_both_levels(
    input_path: Path,
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    cleanup_weights: bool = False,
    overwrite: bool = False,
    weight_cache_dir: Optional[Path] = None,
) -> dict[str, bool]:
    """
    Regrid the seafloor values and the top (surface) level of one file.

    Two pipelines are run one after the other on the same input: first one
    configured with ``extract_seafloor=True``, then one configured with
    ``extract_surface=True``. No progress UI is attached to either run.

    Parameters
    ----------
    input_path : Path
        Input NetCDF file.
    output_dir : Path, optional
        Forwarded to each pipeline's ``regrid_file`` as its ``output_path``
        argument.
    target_resolution : tuple[float, float]
        Target grid resolution.
    use_regrid_cache : bool
        Reuse existing regrid weight files when present.
    use_seafloor_cache : bool
        Reuse seafloor depth indices cache.
    verbose : bool
        Enable verbose logging.
    cleanup_weights : bool
        Clean up weights after processing.
    overwrite : bool
        Overwrite existing outputs. When True, an existing
        ``<stem>_seafloor<suffix>`` file next to the input is deleted before
        the seafloor run.
    weight_cache_dir : Path, optional
        Directory for the regrid weight cache, passed to both pipelines.

    Returns
    -------
    dict[str, bool]
        ``{'top_level': ..., 'seafloor': ...}`` holding the value returned by
        ``regrid_file`` for each stream.
    """
    # Options that are identical for the seafloor and the surface pipeline.
    shared_options = dict(
        target_resolution=target_resolution,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        cleanup_weights=cleanup_weights,
        weight_cache_dir=weight_cache_dir,
    )
    run_options = dict(input_path=input_path, output_path=output_dir, overwrite=overwrite, ui=None)

    # --- seafloor stream -------------------------------------------------
    seafloor_runner = CDORegridPipeline(
        extract_surface=False,
        extract_seafloor=True,
        **shared_options,
    )
    if overwrite:
        # Remove a leftover seafloor intermediate sitting beside the input.
        stale_intermediate = input_path.parent / f"{input_path.stem}_seafloor{input_path.suffix}"
        if stale_intermediate.exists():
            stale_intermediate.unlink()
    seafloor_ok = seafloor_runner.regrid_file(**run_options)

    # --- surface stream --------------------------------------------------
    surface_runner = CDORegridPipeline(
        extract_surface=True,
        extract_seafloor=False,
        **shared_options,
    )
    surface_ok = surface_runner.regrid_file(**run_options)

    return {"top_level": surface_ok, "seafloor": seafloor_ok}
237
+
238
+
239
# Alias for CLI --extreme-levels: a second name bound to the very same
# function object as regrid_single_file_both_levels.
regrid_single_file_extreme_levels = regrid_single_file_both_levels
241
+
242
+
243
def process_directory_both_levels(
    dir_path: Path,
    file_list: list[Path],
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    overwrite: bool = False,
) -> list[tuple[Path, dict[str, bool]]]:
    """
    Regrid seafloor and surface streams for every file of one directory.

    One seafloor pipeline and one surface pipeline are built once and reused
    for all files; both use ``<dir_path>/cdo_weights`` as weight cache and never
    clean up weights. For each file the seafloor run happens first, then the
    surface run. Intended to be called by a single worker per directory.

    Returns a list of ``(file, {'top_level': ..., 'seafloor': ...})`` tuples in
    the order of ``file_list``.
    """
    # Settings common to both pipelines, including the per-directory weight cache.
    common = dict(
        target_resolution=target_resolution,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        cleanup_weights=False,
        weight_cache_dir=dir_path / "cdo_weights",
    )
    seafloor_runner = CDORegridPipeline(extract_surface=False, extract_seafloor=True, **common)
    surface_runner = CDORegridPipeline(extract_surface=True, extract_seafloor=False, **common)

    def _run(runner, source: Path):
        # Same call shape for both streams; no progress UI is attached.
        return runner.regrid_file(
            input_path=source,
            output_path=output_dir,
            overwrite=overwrite,
            ui=None,
        )

    outcomes: list[tuple[Path, dict[str, bool]]] = []
    for source in file_list:
        seafloor_ok = _run(seafloor_runner, source)
        surface_ok = _run(surface_runner, source)
        outcomes.append((source, {"top_level": surface_ok, "seafloor": seafloor_ok}))
    return outcomes
294
+
295
+
296
def _worker_process_directory_both_levels(
    args: tuple,
) -> list[tuple[Path, dict[str, bool]]]:
    """Picklable worker: unpack one job tuple and process that directory.

    ``args`` must hold exactly six items, in this order: directory, file list,
    output directory, target resolution, verbose flag, overwrite flag. They are
    handed to ``process_directory_both_levels``, whose result is returned.
    """
    (
        directory,
        files,
        destination,
        resolution,
        be_verbose,
        replace_existing,
    ) = args
    job_options = {
        "dir_path": directory,
        "file_list": files,
        "output_dir": destination,
        "target_resolution": resolution,
        "verbose": be_verbose,
        "overwrite": replace_existing,
    }
    return process_directory_both_levels(**job_options)
309
+
310
+
311
def _worker_both_levels(
    args: tuple,
) -> tuple[Path, dict[str, bool]]:
    """Picklable worker around regrid_single_file_both_levels (for ProcessPoolExecutor).

    ``args`` must hold exactly five items: input path, output directory, target
    resolution, verbose flag, overwrite flag. The weight cache directory is
    ``<system temp dir>/cdo_weights_<pid>``, i.e. keyed by the current process
    id so that parallel workers do not share weight files. Weights are not
    cleaned up afterwards.

    Returns ``(input_path, status_dict)``.
    """
    source, destination, resolution, be_verbose, replace_existing = args

    # One weight directory per worker process (avoids races on weight files).
    private_weight_dir = Path(tempfile.gettempdir(), f"cdo_weights_{os.getpid()}")

    return (
        source,
        regrid_single_file_both_levels(
            input_path=source,
            output_dir=destination,
            target_resolution=resolution,
            verbose=be_verbose,
            cleanup_weights=False,
            overwrite=replace_existing,
            weight_cache_dir=private_weight_dir,
        ),
    )
331
+
332
+
333
def regrid_directory(
    input_dir: Path,
    include_subdirectories: bool = False,
    output_dir: Optional[Path] = None,
    target_resolution: Optional[tuple[float, float]] = None,
    file_pattern: str = "*.nc",
    variables: Optional[list[str] | str] = None,
    extract_surface: bool = False,
    extract_seafloor: bool = False,
    use_regrid_cache: bool = True,
    use_seafloor_cache: bool = True,
    verbose: bool = True,
    verbose_diagnostics: bool = False,
    max_workers: Optional[int] = 4,
    enable_parallel: bool = True,
    overwrite: bool = False,
    use_ui: bool = True,
) -> dict[str, list[Path]]:
    """
    Convenience function to regrid all files in a directory.

    Args:
    - input_dir (Path): Input directory containing NetCDF files
    - include_subdirectories (bool): If True, search ``input_dir`` recursively
      (``rglob``); otherwise only its direct children (``glob``)
    - output_dir (Path): Output directory for regridded files
    - target_resolution (tuple, optional): Target resolution as (lon_res, lat_res).
      The default ``None`` is forwarded unchanged to ``CDORegridPipeline``
      (NOTE(review): presumably the pipeline then applies its own default --
      confirm; the sibling helpers default to ``(1.0, 1.0)`` instead)
    - file_pattern (str): File pattern to match (e.g., "*.nc", "*.nc4")
    - variables (list[str] | str, optional): CMIP variable prefix(es) to process (e.g. ``tos`` or ``tos,uo``)
    - extract_surface (bool): Extract top level only and regrid that
    - extract_seafloor (bool): Extract seafloor values and regrid only that
    - use_regrid_cache (bool): Reuse existing regrid weight files
    - use_seafloor_cache (bool): Reuse seafloor depth indices cache
    - verbose (bool): Enable verbose output (progress UI)
    - verbose_diagnostics (bool): If True, print Grid type, File size, Large file (max verbosity)
    - max_workers (int): Maximum number of parallel workers
    - enable_parallel (bool): Enable parallel processing
    - overwrite (bool): If True, overwrite existing output files
    - use_ui (bool): Use rich progress UI

    Returns (dict[str, list[Path]]): Dictionary mapping status to list of file
    paths. When no input file matches, the three keys ``successful``,
    ``failed`` and ``skipped`` are returned with empty lists.
    """
    # Find all matching files, leaving out intermediates (as decided by
    # is_intermediate_nc), then narrow down to the requested variables.
    finder = input_dir.rglob if include_subdirectories else input_dir.glob
    input_files = [p for p in finder(file_pattern) if not is_intermediate_nc(p)]
    input_files = filter_files_by_variables(input_files, variables)

    if not input_files:
        var_msg = f" and variables {sorted(parse_variable_list(variables) or [])}" if variables else ""
        print(f"No files found matching pattern '{file_pattern}'{var_msg} in {input_dir}")
        return {"successful": [], "failed": [], "skipped": []}

    error_log_path = init_regrid_error_log()
    if verbose:
        print(f"Regrid errors log: {error_log_path}")

    # Create the pipeline and point it at the error log.
    pipeline = CDORegridPipeline(
        target_resolution=target_resolution,
        extract_surface=extract_surface,
        extract_seafloor=extract_seafloor,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        verbose_diagnostics=verbose_diagnostics,
        max_workers=max_workers,
        enable_parallel=enable_parallel,
    )
    pipeline.set_error_log_path(error_log_path)

    # Process the files, then print the pipeline's statistics.
    results = pipeline.regrid_batch(input_files, output_dir, overwrite=overwrite, use_ui=use_ui)
    pipeline.print_statistics()

    return results
410
+
411
+
412
def _record_both_levels_results(
    results: dict[str, list[Path]],
    file_results: list[tuple[Path, dict[str, bool]]],
) -> None:
    """File each input under 'successful' (both streams succeeded) or 'failed'."""
    for path, status in file_results:
        both_ok = status["top_level"] and status["seafloor"]
        results["successful" if both_ok else "failed"].append(path)


def regrid_directory_both_levels(
    input_dir: Path,
    include_subdirectories: bool = False,
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    file_pattern: str = "*.nc",
    variables: Optional[list[str] | str] = None,
    verbose: bool = True,
    overwrite: bool = False,
    max_workers: Optional[int] = 4,
    enable_parallel: bool = True,
) -> dict[str, list[Path]]:
    """
    Regrid both the top level and the seafloor values for all files
    in a directory.

    Matching files are grouped by their parent directory and each group is
    handed to :func:`process_directory_both_levels`, which processes the files
    of one directory sequentially with shared pipelines (so the per-directory
    weight cache ``<dir>/cdo_weights`` is reused). When ``enable_parallel`` is
    True, ``max_workers`` is greater than 1 and there are at least two
    directories, directories are processed in parallel, one worker per
    directory; otherwise they are processed one after the other.

    A failure while processing one directory never aborts the others: in both
    the parallel and the sequential mode every file of the failing directory is
    reported as failed and processing continues.

    Parameters
    ----------
    input_dir : Path
        Directory containing input NetCDF files.
    include_subdirectories : bool
        Recurse into subdirectories.
    output_dir : Path, optional
        Directory for outputs, forwarded to the per-directory processing.
    target_resolution : tuple[float, float]
        Target grid resolution.
    file_pattern : str
        Glob pattern for selecting input files.
    variables : list[str] | str, optional
        Variable selection passed to ``filter_files_by_variables``.
    verbose : bool
        Enable verbose logging.
    overwrite : bool
        Overwrite existing outputs.
    max_workers : int, optional
        Maximum parallel workers (default 4). Used only if enable_parallel is True.
        The effective count is also capped by the number of directories and CPUs.
    enable_parallel : bool
        Process multiple directories in parallel (default True).

    Returns
    -------
    dict[str, list[Path]]
        Dictionary with ``'successful'``, ``'failed'`` and ``'skipped'``
        keys mapping to lists of input file paths. A file is successful only
        when both its top-level and its seafloor stream succeeded; this
        function never adds anything to ``'skipped'``.
    """
    # Only process source files; intermediates (per is_intermediate_nc) are excluded.
    finder = input_dir.rglob if include_subdirectories else input_dir.glob
    input_files = [p for p in finder(file_pattern) if not is_intermediate_nc(p)]
    input_files = filter_files_by_variables(input_files, variables)

    if not input_files:
        var_msg = f" and variables {sorted(parse_variable_list(variables) or [])}" if variables else ""
        print(f"No files found matching pattern '{file_pattern}'{var_msg} in {input_dir}")
        return {"successful": [], "failed": [], "skipped": []}

    results: dict[str, list[Path]] = {
        "successful": [],
        "failed": [],
        "skipped": [],
    }

    # Group files by directory so one worker owns each directory and its caches.
    by_dir: dict[Path, list[Path]] = {}
    for fp in input_files:
        by_dir.setdefault(fp.parent, []).append(fp)
    dir_jobs = [(d, sorted(fs)) for d, fs in by_dir.items()]

    def _mark_directory_failed(dir_path: Path, file_list: list[Path], exc: Exception) -> None:
        # Report instead of swallowing silently, then fail the directory's files.
        if verbose:
            print(f"Failed to process directory {dir_path}: {exc}")
        results["failed"].extend(file_list)

    use_parallel = (
        enable_parallel
        and len(dir_jobs) >= 2
        and max_workers is not None
        and max_workers > 1
    )

    if use_parallel:
        n_workers = min(max_workers, len(dir_jobs), mp.cpu_count())
        if verbose:
            print(f"Processing {len(dir_jobs)} directories in parallel with {n_workers} workers (files within each directory processed sequentially with shared cache).")
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            # submit() rather than map(): with map() the first worker exception
            # aborts the iteration and the results of every other directory
            # are lost.
            pending = [
                (
                    dir_path,
                    file_list,
                    executor.submit(
                        _worker_process_directory_both_levels,
                        (dir_path, file_list, output_dir, target_resolution, verbose, overwrite),
                    ),
                )
                for dir_path, file_list in dir_jobs
            ]
            # Collect in submission order so the result lists stay deterministic.
            for dir_path, file_list, future in pending:
                try:
                    file_results = future.result()
                except Exception as exc:
                    _mark_directory_failed(dir_path, file_list, exc)
                else:
                    _record_both_levels_results(results, file_results)
    else:
        for dir_path, file_list in dir_jobs:
            try:
                file_results = process_directory_both_levels(
                    dir_path=dir_path,
                    file_list=file_list,
                    output_dir=output_dir,
                    target_resolution=target_resolution,
                    verbose=verbose,
                    overwrite=overwrite,
                )
            except Exception as exc:
                _mark_directory_failed(dir_path, file_list, exc)
            else:
                _record_both_levels_results(results, file_results)

    return results
536
+
537
def regrid_large_files(
    input_files: list[Path],
    output_dir: Optional[Path] = None,
    target_resolution: tuple[float, float] = (1.0, 1.0),
    chunk_size_gb: float = 2.0,
    max_memory_gb: float = 8.0,
    verbose: bool = True,
    overwrite: bool = False,
) -> dict[str, list[Path]]:
    """
    Batch-regrid files with chunking and memory monitoring switched on.

    Args:
    - input_files (list[Path]): Files to regrid
    - output_dir (Path): Output directory, forwarded to the batch call
    - target_resolution (tuple): Target resolution as (lon_res, lat_res)
    - chunk_size_gb (float): Chunk size limit in GB handed to the pipeline
    - max_memory_gb (float): Memory limit in GB handed to the pipeline
    - verbose (bool): Enable verbose output
    - overwrite (bool): If True, overwrite existing output files

    Returns (dict[str, list[Path]]): Whatever ``regrid_batch`` returns (status
    mapped to file paths); statistics are printed before returning.
    """
    # Chunking and memory monitoring are always enabled for this entry point.
    pipeline_options = {
        "target_resolution": target_resolution,
        "chunk_size_gb": chunk_size_gb,
        "max_memory_gb": max_memory_gb,
        "verbose": verbose,
        "enable_chunking": True,
        "memory_monitoring": True,
    }
    chunked_pipeline = CDORegridPipeline(**pipeline_options)

    batch_outcome = chunked_pipeline.regrid_batch(input_files, output_dir, overwrite=overwrite)
    chunked_pipeline.print_statistics()
    return batch_outcome
573
+