cdo-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cdo_toolkit/workers.py ADDED
@@ -0,0 +1,277 @@
1
+ """Parallel worker entrypoints (picklable for ProcessPoolExecutor)."""
2
+
3
+ import os
4
+ import tempfile
5
+ import threading
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Optional
8
+
9
+ try:
10
+ import fcntl
11
+ except ImportError:
12
+ fcntl = None
13
+
14
+ from cdo_toolkit.errors import init_regrid_error_log, log_regrid_error
15
+
16
+ if TYPE_CHECKING:
17
+ from cdo_toolkit.ui import BatchRegridUI
18
+
19
+
20
def process_chunk_standalone(args):
    """Regrid one chunk with pre-computed weights inside a worker process.

    Defined at module level so that ProcessPoolExecutor can pickle it.

    Args:
        args: Tuple of (chunk_idx, chunk_file, tmpdir, grid_file, weight_path).
            The regridded chunk is written to ``<tmpdir>/chunk_<idx>.nc`` with
            the index zero-padded to three digits.

    Returns:
        Tuple of (chunk_idx, chunk_output, success, error_message).
        ``error_message`` is None on success and ``str(exception)`` on failure.
    """
    chunk_idx, chunk_file, tmpdir, grid_file, weight_path = args
    chunk_output = Path(tmpdir).joinpath(f"chunk_{chunk_idx:03d}.nc")

    try:
        # Imported lazily and instantiated here so each call gets its own
        # CDO instance rather than sharing one across workers.
        from cdo import Cdo

        Cdo().remap(
            str(grid_file),
            str(weight_path),
            input=str(chunk_file),
            output=str(chunk_output),
        )
    except Exception as e:
        # Failures are logged and reported through the return value instead
        # of being raised to the caller.
        log_regrid_error(
            f"Chunk {chunk_idx} remap failed ({chunk_file}): {e}",
            exc=e,
        )
        return chunk_idx, chunk_output, False, str(e)

    return chunk_idx, chunk_output, True, None
51
+
52
+
53
def poll_batch_progress(
    ui: "BatchRegridUI",
    progress_state,
    stop_event: threading.Event,
    interval: float = 0.4,
) -> None:
    """Push snapshots of the shared progress state to the UI until stopped.

    Intended to run as the target of a background thread.

    Args:
        ui: Object exposing ``sync_from_progress_state(mapping)``.
        progress_state: Mapping updated by the workers; a plain ``dict`` copy
            of it is handed to the UI on every tick.
        stop_event: Set this event to make the loop exit.
        interval: Seconds to wait between refreshes.
    """
    while True:
        if stop_event.is_set():
            return
        try:
            snapshot = dict(progress_state)
            ui.sync_from_progress_state(snapshot)
        except Exception:
            # Best-effort refresh: a failed UI update must not kill the
            # polling thread, so the error is dropped and we retry next tick.
            pass
        # wait() doubles as the sleep and returns True as soon as the event
        # is set, which ends the loop without waiting out the full interval.
        if stop_event.wait(interval):
            return
67
+
68
+
69
+
70
def _empty_worker_stats(errors: int = 0) -> dict[str, object]:
    """Return a fresh, zeroed per-worker statistics dict with ``errors`` set."""
    return {
        'files_processed': 0,
        'weights_reused': 0,
        'weights_generated': 0,
        'chunks_processed': 0,
        'errors': errors,
        'total_size_gb': 0.0,
        'memory_peak_gb': 0.0,
        'grid_types': {},
    }


def _worker_result(file_path, *, success, skipped, message, stats=None) -> dict[str, object]:
    """Build the result dict returned to the parent process (zeroed stats by default)."""
    return {
        'success': success,
        'file_path': file_path,
        'skipped': skipped,
        'message': message,
        'stats': _empty_worker_stats() if stats is None else stats,
    }


def _try_acquire_input_lock(lock_path: Path) -> tuple[Optional[int], bool]:
    """Try to take a non-blocking exclusive ``flock`` on ``lock_path``.

    Returns:
        ``(fd, False)``   - lock acquired; the caller must release it.
        ``(None, True)``  - the lock is held elsewhere (contention).
        ``(None, False)`` - locking is unavailable (no ``fcntl``, or the lock
        file could not be created or locked for another reason); the caller
        proceeds without a lock, exactly as on platforms without ``fcntl``.
    """
    if fcntl is None:
        return None, False
    try:
        fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR, 0o644)
    except OSError:
        # e.g. read-only input directory: this is not contention, so the
        # file must not be reported as "being processed by another worker".
        return None, False
    try:
        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except (BlockingIOError, PermissionError):
        contended = True
    except OSError:
        contended = False
    else:
        return fd, False
    try:
        os.close(fd)
    except OSError:
        pass
    return None, contended


def _release_input_lock(lock_fd: int, lock_path: Path) -> None:
    """Remove the lock file, then release the lock by closing its descriptor."""
    # Unlink while the lock is still held so the path never names an
    # unlocked-but-present lock file that a newcomer could lock just before
    # it is deleted.
    try:
        lock_path.unlink(missing_ok=True)
    except OSError:
        pass
    # Closing the descriptor drops the flock; doing it in its own try block
    # guarantees the descriptor is not leaked in a long-lived pool worker.
    try:
        os.close(lock_fd)
    except OSError:
        pass


def process_single_file_standalone(
    file_path: Path,
    output_dir: Optional[Path],
    target_resolution: tuple[float, float],
    target_grid: str,
    weight_cache_dir: Path,
    extract_surface: bool,
    extract_seafloor: bool,
    use_regrid_cache: bool,
    use_seafloor_cache: bool,
    max_memory_gb: float,
    chunk_size_gb: float,
    enable_chunking: bool,
    overwrite: bool = False,
    representative_file: Optional[Path] = None,
    verbose: bool = False,
    error_log_path: Optional[Path] = None,
    progress_state=None,
) -> dict[str, object]:
    """
    Standalone function for processing a single file in parallel.
    Creates its own pipeline instance to avoid pickle issues.

    Args:
    - file_path (Path): Path to the input file
    - output_dir (Optional[Path]): Output directory for regridded files; when
      falsy the output is written next to the input file
    - target_resolution (tuple[float, float]): Target resolution as (lon_res, lat_res)
    - target_grid (str): Target grid type ('lonlat', 'gaussian', etc.)
    - weight_cache_dir (Path): Directory to cache regrid weights
    - extract_surface (bool): If True, extract top level only and regrid that
    - extract_seafloor (bool): If True, extract seafloor values and regrid only that
    - use_regrid_cache (bool): If True, reuse existing regrid weight files
    - use_seafloor_cache (bool): If True, reuse seafloor depth indices cache
    - max_memory_gb (float): Maximum memory usage in GB
    - chunk_size_gb (float): Maximum chunk size in GB
    - enable_chunking (bool): If True, chunk the file for processing
    - overwrite (bool): If True, overwrite existing output files
    - representative_file (Optional[Path]): Representative file for resolution calculation
    - verbose (bool): Forwarded to the pipeline constructor
    - error_log_path (Optional[Path]): Passed to ``init_regrid_error_log``; the
      returned value is stored on the pipeline as its error-log path
    - progress_state: Optional mutable mapping shared with the parent
      (presumably a multiprocessing manager dict - confirm against the caller).
      When given, ``progress_state[str(file_path)]`` is set to a "starting"
      entry before regridding and to a "done" entry afterwards.

    Returns:
    - dict[str, object]: Dictionary containing the result of the regridding
        - 'success': Boolean indicating if the regridding was successful
        - 'file_path': Path to the input file
        - 'skipped': Boolean indicating if the file was skipped (output already
          exists, or another worker holds the input lock)
        - 'message': Message indicating the result of the regridding
        - 'stats': A copy of the worker pipeline's ``stats`` after a regrid
          attempt; otherwise a zeroed dict with the keys 'files_processed',
          'weights_reused', 'weights_generated', 'chunks_processed', 'errors'
          (1 when an exception was caught), 'total_size_gb', 'memory_peak_gb'
          and 'grid_types'

    Exceptions raised while importing or constructing the pipeline propagate to
    the caller; exceptions raised afterwards are converted into a failed result.
    """
    # create a new pipeline instance for a specific worker
    from cdo_toolkit.pipeline import CDORegridPipeline

    pipeline = CDORegridPipeline(
        target_resolution=target_resolution,
        target_grid=target_grid,
        weight_cache_dir=weight_cache_dir,
        extract_surface=extract_surface,
        extract_seafloor=extract_seafloor,
        use_regrid_cache=use_regrid_cache,
        use_seafloor_cache=use_seafloor_cache,
        verbose=verbose,
        max_memory_gb=max_memory_gb,
        chunk_size_gb=chunk_size_gb,
        max_workers=1,
        enable_parallel=False,
        enable_chunking=enable_chunking,
        memory_monitoring=False,
    )
    error_log = init_regrid_error_log(error_log_path)
    if representative_file:
        pipeline._representative_file = representative_file
    pipeline._error_log_path = error_log
    progress_key = str(file_path)
    pipeline._progress_state = progress_state
    pipeline._progress_key = progress_key if progress_state is not None else None

    try:
        # Use lightweight check for has_level to avoid expensive full file analysis
        has_level = pipeline._has_level_lightweight(file_path)

        # determine output path (next to the input when no output_dir is given)
        output_filename = pipeline._generate_output_filename(
            file_path, has_level, extract_surface, extract_seafloor
        )
        target_dir = output_dir if output_dir else file_path.parent
        output_path = target_dir / output_filename

        # check if output already exists (unless overwrite is True)
        if output_path.exists() and not overwrite:
            # skipped files don't count as processed
            return _worker_result(
                file_path,
                success=True,
                skipped=True,
                message='File already exists',
            )

        # Exclusive lock on the input file so only one worker processes it (avoids duplicate
        # work and races on the same output when the same path is submitted from multiple batches).
        lock_path = file_path.parent / (file_path.name + ".regrid_lock")
        lock_fd, contended = _try_acquire_input_lock(lock_path)
        if contended:
            return _worker_result(
                file_path,
                success=False,
                skipped=True,
                message='File already being processed by another worker',
            )

        try:
            if progress_state is not None:
                progress_state[progress_key] = {
                    "phase": "starting",
                    "chunks_done": 0,
                    "chunks_total": 0,
                }
            try:
                success = pipeline.regrid_file(file_path, output_path, overwrite=overwrite)
            finally:
                if progress_state is not None:
                    progress_state[progress_key] = {"phase": "done", "chunks_done": 0, "chunks_total": 0}
        finally:
            # Outermost finally: the lock is released even if a progress
            # update itself fails.
            if lock_fd is not None:
                _release_input_lock(lock_fd, lock_path)

        # collect statistics from the worker's pipeline
        worker_stats = pipeline.stats.copy()
        regridded = bool(success)
        return _worker_result(
            file_path,
            success=regridded,
            skipped=False,
            message='Successfully regridded' if regridded else 'Regridding failed',
            stats=worker_stats,
        )

    except Exception as e:
        return _worker_result(
            file_path,
            success=False,
            skipped=False,
            message=f'Error: {str(e)}',
            stats=_empty_worker_stats(errors=1),
        )
@@ -0,0 +1,78 @@
1
+ Metadata-Version: 2.4
2
+ Name: cdo-toolkit
3
+ Version: 0.1.0
4
+ Summary: CDO-based NetCDF regridding with chunking, parallel workers, and optional CMIP filename helpers
5
+ Project-URL: Repository, https://github.com/orlando-code/cdo-toolkit
6
+ Project-URL: Documentation, https://github.com/orlando-code/cdo-toolkit#readme
7
+ Project-URL: Issues, https://github.com/orlando-code/cdo-toolkit/issues
8
+ Author-email: Orlando Timmerman <rt582@cam.ac.uk>
9
+ License: BSD-3-Clause
10
+ License-File: LICENSE
11
+ Keywords: cdo,climate,netcdf,regridding,xarray
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: BSD License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: cdo>=1.6.1
22
+ Requires-Dist: numpy>=1.24.0
23
+ Requires-Dist: psutil>=7.2.2
24
+ Requires-Dist: rich>=12.6.0
25
+ Requires-Dist: xarray>=2023.1.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.3.3; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # cdo-toolkit
31
+
32
+ CDO-based NetCDF regridding toolkit: weight caching, time-chunked processing, parallel workers, Rich progress UI, and optional surface/seafloor level extraction.
33
+
34
+ Works with **general NetCDF files**. CMIP6 filename helpers (`tos_Omon_...` variable filtering, `nominal_resolution` metadata) live in `cdo_toolkit.cmip` but are optional.
35
+
36
+ ## Requirements
37
+
38
+ - Python ≥ 3.10
39
+ - [CDO](https://code.mpimet.mpg.de/projects/cdo) binary on `PATH` (e.g. `conda install -c conda-forge cdo`)
40
+
41
+ ## Install
42
+
43
+ ```bash
44
+ pip install cdo-toolkit
45
+ # editable from source
46
+ pip install -e .
47
+ ```
48
+
49
+ ## CLI
50
+
51
+ ```bash
52
+ cdo-toolkit /path/to/data -o /path/to/out -r 1.0 1.0 -w 4
53
+ python -m cdo_toolkit /path/to/file.nc --extract-surface
54
+ ```
55
+
56
+ ## Python API
57
+
58
+ ```python
59
+ from pathlib import Path
60
+ from cdo_toolkit import CDORegridPipeline, regrid_directory, regrid_single_file
61
+
62
+ regrid_directory(Path("data/"), target_resolution=(1.0, 1.0), max_workers=8)
63
+ ```
64
+
65
+ ## Package layout
66
+
67
+ | Module | Role |
68
+ |--------|------|
69
+ | `pipeline` | `CDORegridPipeline` core |
70
+ | `api` | High-level `regrid_directory`, `regrid_single_file`, … |
71
+ | `workers` | Process-pool worker entrypoints |
72
+ | `ui` | Rich progress bars |
73
+ | `cmip` | Optional CMIP6 filename/metadata helpers |
74
+ | `paths` | Weight-cache and intermediate-file path helpers |
75
+
76
+ ## License
77
+
78
+ BSD-3-Clause
@@ -0,0 +1,19 @@
1
+ cdo_toolkit/__init__.py,sha256=YRWXaVjIFA2_Q7cHXD4AwI8t0C-bzY5_mbikxYhwWUE,1439
2
+ cdo_toolkit/__main__.py,sha256=EppaGAY8zJyWM5G-JfQa2MGhFJJ0JDiD7tlN4oEk7qg,112
3
+ cdo_toolkit/api.py,sha256=26WTsMTlS5chZAXEsKeIbXa92V9o1k0JIGbb494g4-g,21648
4
+ cdo_toolkit/cli.py,sha256=4SBcS-3zoGDegByiA8pCFAYPuloDltOVH4njJdgc2Is,8524
5
+ cdo_toolkit/cmip.py,sha256=DusL55OtYIkMVPCTTaga9_L9RnSh27WqoPdP3HkapQ0,2105
6
+ cdo_toolkit/constants.py,sha256=I_Nqb1N-qlxxhouUbPVVd-N4kZwcVIKuq-3yS2ThWho,393
7
+ cdo_toolkit/errors.py,sha256=Kt5A5scphaD6SbsiKrwad2SIyOSZ4aQLJNfG8rmXcxg,2491
8
+ cdo_toolkit/memory.py,sha256=8UDgxrSZK2up7gbqA5wVSWZaUajbguMvwGM07xSbqFE,550
9
+ cdo_toolkit/paths.py,sha256=bVtPCCulySmZwfAG6Dlyc0jxBbOHAHCsfuMMezZXBfk,976
10
+ cdo_toolkit/pipeline.py,sha256=QV5_Flw_oU58QKyczYzlgftap1my9DllmM1E2Z25ENM,104530
11
+ cdo_toolkit/resolution.py,sha256=wkAyl0VSyCTBHNpdvG4AC7yPdU6TwITuaGC3eKaMfv0,601
12
+ cdo_toolkit/timing.py,sha256=YPFJJ07Ge2SjsCiAGaWYIxoE9hu-CbB8Gu-CogOfnsU,1217
13
+ cdo_toolkit/ui.py,sha256=f_RAqpg5dFIedIES_mNnB54PlKu3tIQyNd1S1yOOFuc,27126
14
+ cdo_toolkit/workers.py,sha256=tj4BN_XceqIHx49qAAPSin0tuOCYqg0FkQxh39OtqMQ,10537
15
+ cdo_toolkit-0.1.0.dist-info/METADATA,sha256=ir9F1MAsBg7swtQP3jIAji0Sijo0F5m_O7uxM3dwKyQ,2511
16
+ cdo_toolkit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
17
+ cdo_toolkit-0.1.0.dist-info/entry_points.txt,sha256=q1DlSLt2a4omyK0lYfBhlHUi-HHG8VUXjkPqERyUXrc,53
18
+ cdo_toolkit-0.1.0.dist-info/licenses/LICENSE,sha256=lUqGPGWDHHxjkUDuYgjLLY2XQXXn_EHU7fnrQWHGugc,1540
19
+ cdo_toolkit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ cdo-toolkit = cdo_toolkit.cli:main
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2023, Institut Pierre-Simon Laplace (IPSL) and contributors
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.