PyPI - cdo-toolkit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

cdo-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

cdo_toolkit/__init__.py +47 -0
cdo_toolkit/__main__.py +6 -0
cdo_toolkit/api.py +573 -0
cdo_toolkit/cli.py +166 -0
cdo_toolkit/cmip.py +61 -0
cdo_toolkit/constants.py +9 -0
cdo_toolkit/errors.py +79 -0
cdo_toolkit/memory.py +22 -0
cdo_toolkit/paths.py +30 -0
cdo_toolkit/pipeline.py +2230 -0
cdo_toolkit/resolution.py +19 -0
cdo_toolkit/timing.py +36 -0
cdo_toolkit/ui.py +650 -0
cdo_toolkit/workers.py +277 -0
cdo_toolkit-0.1.0.dist-info/METADATA +78 -0
cdo_toolkit-0.1.0.dist-info/RECORD +19 -0
cdo_toolkit-0.1.0.dist-info/WHEEL +4 -0
cdo_toolkit-0.1.0.dist-info/entry_points.txt +2 -0
cdo_toolkit-0.1.0.dist-info/licenses/LICENSE +28 -0

cdo_toolkit/workers.py ADDED Viewed

@@ -0,0 +1,277 @@
+"""Parallel worker entrypoints (picklable for ProcessPoolExecutor)."""
+import os
+import tempfile
+import threading
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
+try:
+    import fcntl
+except ImportError:
+    fcntl = None
+from cdo_toolkit.errors import init_regrid_error_log, log_regrid_error
+if TYPE_CHECKING:
+    from cdo_toolkit.ui import BatchRegridUI
+def process_chunk_standalone(args):
+    """Regrid one chunk file with CDO using a precomputed weight file.
+    Defined at module level so ProcessPoolExecutor can pickle it.
+    Args:
+        args: Tuple of (chunk_idx, chunk_file, tmpdir, grid_file, weight_path).
+            The regridded chunk is written to ``<tmpdir>/chunk_<idx:03d>.nc``.
+    Returns:
+        Tuple of (chunk_idx, chunk_output, success, error_message).
+        ``error_message`` is None on success and ``str(exception)`` on failure;
+        ``chunk_output`` is returned in both cases.
+    """
+    chunk_idx, chunk_file, tmpdir, grid_file, weight_path = args
+    output_name = f"chunk_{chunk_idx:03d}.nc"
+    chunk_output = Path(tmpdir).joinpath(output_name)
+    try:
+        # Imported and instantiated here so every worker gets its own Cdo object.
+        from cdo import Cdo
+        # remap takes the target grid and the existing weights as operator arguments.
+        Cdo().remap(
+            str(grid_file),
+            str(weight_path),
+            input=str(chunk_file),
+            output=str(chunk_output),
+        )
+    except Exception as e:
+        # Record the failure and report it to the caller instead of raising.
+        log_regrid_error(
+            f"Chunk {chunk_idx} remap failed ({chunk_file}): {e}",
+            exc=e,
+        )
+        return (chunk_idx, chunk_output, False, str(e))
+    else:
+        return (chunk_idx, chunk_output, True, None)
+def poll_batch_progress(
+    ui: "BatchRegridUI",
+    progress_state,
+    stop_event: threading.Event,
+    interval: float = 0.4,
+) -> None:
+    """Thread target that pushes snapshots of the shared progress state to the UI.
+    Args:
+        ui: Object exposing ``sync_from_progress_state``; it receives a plain
+            ``dict`` copy of ``progress_state`` on every tick.
+        progress_state: Mapping written by the workers; copied with ``dict(...)``.
+        stop_event: Set by the caller to end the loop.
+        interval: Seconds to wait between refreshes.
+    """
+    while True:
+        if stop_event.is_set():
+            return
+        try:
+            snapshot = dict(progress_state)
+            ui.sync_from_progress_state(snapshot)
+        except Exception:
+            # Best effort: a failed refresh must not end the polling thread.
+            pass
+        # wait() doubles as the sleep and returns True as soon as the event is set.
+        stopped = stop_event.wait(interval)
+        if stopped:
+            return
+def _empty_worker_stats(errors: int = 0) -> dict[str, Any]:
+    """Return a zeroed per-worker stats dict, optionally with an error count."""
+    return {
+        'files_processed': 0,  # skipped or failed files don't count as processed
+        'weights_reused': 0,
+        'weights_generated': 0,
+        'chunks_processed': 0,
+        'errors': errors,
+        'total_size_gb': 0.0,
+        'memory_peak_gb': 0.0,
+        'grid_types': {},
+    }
+def _worker_result(file_path: Path, success: bool, skipped: bool, message: str, stats: dict[str, Any]) -> dict[str, Any]:
+    """Assemble the result dict that a worker hands back to its caller."""
+    return {
+        'success': success,
+        'file_path': file_path,
+        'skipped': skipped,
+        'message': message,
+        'stats': stats,
+    }
+def _try_lock_input(lock_path: Path) -> tuple[Optional[int], bool]:
+    """Try to take a non-blocking exclusive flock on ``lock_path``.
+    Returns:
+        ``(fd, contended)``. ``fd`` is the open descriptor holding the lock, or
+        None when no lock is held. ``contended`` is True only when the lock is
+        held by someone else; ``(None, False)`` means locking is unavailable
+        and the caller should proceed unlocked.
+    """
+    if fcntl is None:
+        return None, False
+    try:
+        fd = os.open(str(lock_path), os.O_CREAT | os.O_RDWR, 0o644)
+    except OSError:
+        # The lock file could not be created (e.g. read-only input directory).
+        # That is not contention, so fall back to unlocked processing.
+        return None, False
+    try:
+        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+    except (BlockingIOError, PermissionError):
+        # EAGAIN/EWOULDBLOCK (or EACCES on fcntl-emulated flock): lock is held elsewhere.
+        contended = True
+    except OSError:
+        # Locking is not supported on this filesystem; proceed unlocked.
+        contended = False
+    else:
+        return fd, False
+    try:
+        os.close(fd)
+    except OSError:
+        pass
+    return None, contended
+def _release_input_lock(lock_fd: Optional[int], lock_path: Path) -> None:
+    """Release and remove a lock taken by ``_try_lock_input`` (no-op when ``lock_fd`` is None)."""
+    if lock_fd is None:
+        return
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_UN)
+    except OSError:
+        pass
+    finally:
+        # Always close the descriptor so it cannot leak if the explicit unlock fails.
+        try:
+            os.close(lock_fd)
+        except OSError:
+            pass
+    try:
+        lock_path.unlink(missing_ok=True)
+    except OSError:
+        pass
+def process_single_file_standalone(
+    file_path: Path,
+    output_dir: Optional[Path],
+    target_resolution: tuple[float, float],
+    target_grid: str,
+    weight_cache_dir: Path,
+    extract_surface: bool,
+    extract_seafloor: bool,
+    use_regrid_cache: bool,
+    use_seafloor_cache: bool,
+    max_memory_gb: float,
+    chunk_size_gb: float,
+    enable_chunking: bool,
+    overwrite: bool = False,
+    representative_file: Optional[Path] = None,
+    verbose: bool = False,
+    error_log_path: Optional[Path] = None,
+    progress_state=None,
+) -> dict[str, Any]:
+    """
+    Standalone function for processing a single file in parallel.
+    Creates its own single-worker pipeline instance (parallelism and memory
+    monitoring disabled) to avoid pickle issues. Exceptions are not propagated:
+    every outcome is reported through the returned dict.
+    Args:
+    - file_path (Path): Path to the input file
+    - output_dir (Optional[Path]): Output directory for regridded files; when falsy the output is written next to the input
+    - target_resolution (tuple[float, float]): Target resolution as (lon_res, lat_res)
+    - target_grid (str): Target grid type ('lonlat', 'gaussian', etc.)
+    - weight_cache_dir (Path): Directory to cache regrid weights
+    - extract_surface (bool): If True, extract top level only and regrid that
+    - extract_seafloor (bool): If True, extract seafloor values and regrid only that
+    - use_regrid_cache (bool): If True, reuse existing regrid weight files
+    - use_seafloor_cache (bool): If True, reuse seafloor depth indices cache
+    - max_memory_gb (float): Maximum memory usage in GB
+    - chunk_size_gb (float): Maximum chunk size in GB
+    - enable_chunking (bool): If True, chunk the file for processing
+    - overwrite (bool): If True, overwrite existing output files
+    - representative_file (Optional[Path]): Representative file for resolution calculation
+    - verbose (bool): Forwarded to the pipeline's ``verbose`` option
+    - error_log_path (Optional[Path]): If set, pipeline errors in this worker are written here instead of stderr
+    - progress_state: Optional shared mapping; when given, this worker stores a
+      ``{"phase", "chunks_done", "chunks_total"}`` entry under ``str(file_path)``
+      ("starting" before regridding, "done" afterwards)
+    Returns:
+    - dict[str, Any]: Dictionary containing the result of the regridding
+        - 'success': Boolean indicating if the regridding was successful
+        - 'file_path': Path to the input file
+        - 'skipped': Boolean indicating if the file was skipped (output already
+          exists, or another worker holds the lock on this input)
+        - 'message': Message indicating the result of the regridding
+        - 'stats': Dictionary containing the statistics of the regridding
+            - 'files_processed': Number of files processed
+            - 'weights_reused': Number of weights reused
+            - 'weights_generated': Number of weights generated
+            - 'chunks_processed': Number of chunks processed
+            - 'errors': Number of errors
+            - 'total_size_gb': Total size of the regridded files in GB
+            - 'memory_peak_gb': Peak memory usage in GB
+            - 'grid_types': Dictionary containing the grid types of the regridded files
+                - 'structured': Number of structured grids
+                - 'curvilinear': Number of curvilinear grids
+                - 'unstructured_ncells': Number of unstructured grids
+                - 'unknown': Number of unknown grids
+    """
+    # create a new pipeline instance for a specific worker
+    from cdo_toolkit.pipeline import CDORegridPipeline
+    pipeline = CDORegridPipeline(
+        target_resolution=target_resolution,
+        target_grid=target_grid,
+        weight_cache_dir=weight_cache_dir,
+        extract_surface=extract_surface,
+        extract_seafloor=extract_seafloor,
+        use_regrid_cache=use_regrid_cache,
+        use_seafloor_cache=use_seafloor_cache,
+        verbose=verbose,
+        max_memory_gb=max_memory_gb,
+        chunk_size_gb=chunk_size_gb,
+        max_workers=1,
+        enable_parallel=False,
+        enable_chunking=enable_chunking,
+        memory_monitoring=False,
+    )
+    error_log = init_regrid_error_log(error_log_path)
+    if representative_file:
+        pipeline._representative_file = representative_file
+    pipeline._error_log_path = error_log
+    progress_key = str(file_path)
+    pipeline._progress_state = progress_state
+    pipeline._progress_key = progress_key if progress_state is not None else None
+    try:
+        # Use lightweight check for has_level to avoid expensive full file analysis
+        has_level = pipeline._has_level_lightweight(file_path)
+        # determine output path: explicit output_dir, otherwise next to the input
+        output_filename = pipeline._generate_output_filename(file_path, has_level, extract_surface, extract_seafloor)
+        target_dir = output_dir if output_dir else file_path.parent
+        output_path = target_dir / output_filename
+        # check if output already exists (unless overwrite is True)
+        if output_path.exists() and not overwrite:
+            return _worker_result(file_path, True, True, 'File already exists', _empty_worker_stats())
+        # Exclusive lock on the input file so only one worker processes it (avoids duplicate
+        # work and races on the same output when the same path is submitted from multiple batches).
+        lock_path = file_path.parent / (file_path.name + ".regrid_lock")
+        lock_fd, contended = _try_lock_input(lock_path)
+        if contended:
+            return _worker_result(
+                file_path,
+                False,
+                True,
+                'File already being processed by another worker',
+                _empty_worker_stats(),
+            )
+        try:
+            # Everything after acquisition sits inside this try so the lock is
+            # released even if publishing progress raises.
+            if progress_state is not None:
+                progress_state[progress_key] = {
+                    "phase": "starting",
+                    "chunks_done": 0,
+                    "chunks_total": 0,
+                }
+            try:
+                success = pipeline.regrid_file(file_path, output_path, overwrite=overwrite)
+            finally:
+                if progress_state is not None:
+                    progress_state[progress_key] = {"phase": "done", "chunks_done": 0, "chunks_total": 0}
+        finally:
+            _release_input_lock(lock_fd, lock_path)
+        # collect statistics from the worker's pipeline
+        worker_stats = pipeline.stats.copy()
+        message = 'Successfully regridded' if success else 'Regridding failed'
+        return _worker_result(file_path, bool(success), False, message, worker_stats)
+    except Exception as e:
+        # Worker boundary: report the failure in the result instead of raising.
+        return _worker_result(file_path, False, False, f'Error: {str(e)}', _empty_worker_stats(errors=1))

cdo_toolkit-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,78 @@
+Metadata-Version: 2.4
+Name: cdo-toolkit
+Version: 0.1.0
+Summary: CDO-based NetCDF regridding with chunking, parallel workers, and optional CMIP filename helpers
+Project-URL: Repository, https://github.com/orlando-code/cdo-toolkit
+Project-URL: Documentation, https://github.com/orlando-code/cdo-toolkit#readme
+Project-URL: Issues, https://github.com/orlando-code/cdo-toolkit/issues
+Author-email: Orlando Timmerman <rt582@cam.ac.uk>
+License: BSD-3-Clause
+License-File: LICENSE
+Keywords: cdo,climate,netcdf,regridding,xarray
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=3.10
+Requires-Dist: cdo>=1.6.1
+Requires-Dist: numpy>=1.24.0
+Requires-Dist: psutil>=7.2.2
+Requires-Dist: rich>=12.6.0
+Requires-Dist: xarray>=2023.1.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.3.3; extra == 'dev'
+Description-Content-Type: text/markdown
+# cdo-toolkit
+CDO-based NetCDF regridding toolkit: weight caching, time-chunked processing, parallel workers, Rich progress UI, and optional surface/seafloor level extraction.
+Works with **general NetCDF files**. CMIP6 filename helpers (`tos_Omon_...` variable filtering, `nominal_resolution` metadata) live in `cdo_toolkit.cmip` but are optional.
+## Requirements
+- Python ≥ 3.10
+- [CDO](https://code.mpimet.mpg.de/projects/cdo) binary on `PATH` (e.g. `conda install -c conda-forge cdo`)
+## Install
+```bash
+pip install cdo-toolkit
+# editable from source
+pip install -e .
+```
+## CLI
+```bash
+cdo-toolkit /path/to/data -o /path/to/out -r 1.0 1.0 -w 4
+python -m cdo_toolkit /path/to/file.nc --extract-surface
+```
+## Python API
+```python
+from pathlib import Path
+from cdo_toolkit import CDORegridPipeline, regrid_directory, regrid_single_file
+regrid_directory(Path("data/"), target_resolution=(1.0, 1.0), max_workers=8)
+```
+## Package layout
+| Module | Role |
+|--------|------|
+| `pipeline` | `CDORegridPipeline` core |
+| `api` | High-level `regrid_directory`, `regrid_single_file`, … |
+| `workers` | Process-pool worker entrypoints |
+| `ui` | Rich progress bars |
+| `cmip` | Optional CMIP6 filename/metadata helpers |
+| `paths` | Weight-cache and intermediate-file path helpers |
+## License
+BSD-3-Clause

cdo_toolkit-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,19 @@
+cdo_toolkit/__init__.py,sha256=YRWXaVjIFA2_Q7cHXD4AwI8t0C-bzY5_mbikxYhwWUE,1439
+cdo_toolkit/__main__.py,sha256=EppaGAY8zJyWM5G-JfQa2MGhFJJ0JDiD7tlN4oEk7qg,112
+cdo_toolkit/api.py,sha256=26WTsMTlS5chZAXEsKeIbXa92V9o1k0JIGbb494g4-g,21648
+cdo_toolkit/cli.py,sha256=4SBcS-3zoGDegByiA8pCFAYPuloDltOVH4njJdgc2Is,8524
+cdo_toolkit/cmip.py,sha256=DusL55OtYIkMVPCTTaga9_L9RnSh27WqoPdP3HkapQ0,2105
+cdo_toolkit/constants.py,sha256=I_Nqb1N-qlxxhouUbPVVd-N4kZwcVIKuq-3yS2ThWho,393
+cdo_toolkit/errors.py,sha256=Kt5A5scphaD6SbsiKrwad2SIyOSZ4aQLJNfG8rmXcxg,2491
+cdo_toolkit/memory.py,sha256=8UDgxrSZK2up7gbqA5wVSWZaUajbguMvwGM07xSbqFE,550
+cdo_toolkit/paths.py,sha256=bVtPCCulySmZwfAG6Dlyc0jxBbOHAHCsfuMMezZXBfk,976
+cdo_toolkit/pipeline.py,sha256=QV5_Flw_oU58QKyczYzlgftap1my9DllmM1E2Z25ENM,104530
+cdo_toolkit/resolution.py,sha256=wkAyl0VSyCTBHNpdvG4AC7yPdU6TwITuaGC3eKaMfv0,601
+cdo_toolkit/timing.py,sha256=YPFJJ07Ge2SjsCiAGaWYIxoE9hu-CbB8Gu-CogOfnsU,1217
+cdo_toolkit/ui.py,sha256=f_RAqpg5dFIedIES_mNnB54PlKu3tIQyNd1S1yOOFuc,27126
+cdo_toolkit/workers.py,sha256=tj4BN_XceqIHx49qAAPSin0tuOCYqg0FkQxh39OtqMQ,10537
+cdo_toolkit-0.1.0.dist-info/METADATA,sha256=ir9F1MAsBg7swtQP3jIAji0Sijo0F5m_O7uxM3dwKyQ,2511
+cdo_toolkit-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+cdo_toolkit-0.1.0.dist-info/entry_points.txt,sha256=q1DlSLt2a4omyK0lYfBhlHUi-HHG8VUXjkPqERyUXrc,53
+cdo_toolkit-0.1.0.dist-info/licenses/LICENSE,sha256=lUqGPGWDHHxjkUDuYgjLLY2XQXXn_EHU7fnrQWHGugc,1540
+cdo_toolkit-0.1.0.dist-info/RECORD,,

cdo_toolkit-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

cdo_toolkit-0.1.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+cdo-toolkit = cdo_toolkit.cli:main

cdo_toolkit-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,28 @@
+BSD 3-Clause License
+Copyright (c) 2023, Institut Pierre-Simon Laplace (IPSL) and contributors
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.