ngiab-data-preprocess 4.2.2__tar.gz → 4.3.0__tar.gz
This diff shows the content of publicly released package versions as it appears in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/.gitignore +2 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/PKG-INFO +15 -11
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/README.md +13 -9
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/create_realization.py +15 -21
- ngiab_data_preprocess-4.3.0/modules/data_processing/dask_utils.py +92 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/dataset_utils.py +127 -44
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/datasets.py +18 -29
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/file_paths.py +7 -7
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/forcings.py +40 -38
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/gpkg_utils.py +13 -13
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/graph_utils.py +4 -4
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/s3fs_utils.py +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/subset.py +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/source_validation.py +57 -32
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/__main__.py +3 -2
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/css/main.css +14 -3
- ngiab_data_preprocess-4.3.0/modules/map_app/static/js/main.js +280 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/templates/index.html +10 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/views.py +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/__main__.py +31 -28
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/arguments.py +0 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/forcing_cli.py +10 -19
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/PKG-INFO +15 -11
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/SOURCES.txt +3 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/requires.txt +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/pyproject.toml +10 -3
- ngiab_data_preprocess-4.3.0/tests/test_nan_impute.py +200 -0
- ngiab_data_preprocess-4.2.2/modules/map_app/static/js/main.js +0 -161
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/.github/workflows/build_only.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/.github/workflows/publish.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/LICENSE +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/cfe-nowpm-realization-template.json +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/cfe-template.ini +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/em-catchment-template.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/em-config.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/em-realization-template.json +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/forcing_template.nc +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/ngen-routing-template.yaml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/noah-owp-modular-init.namelist.input +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/template.sql +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/triggers.sql +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/__init__.py +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/css/console.css +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/css/toggle.css +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/js/console.js +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/js/data_processing.js +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/resources/loading.gif +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/resources/screenshot.jpg +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/custom_logging.py +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/dependency_links.txt +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/entry_points.txt +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/top_level.txt +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/setup.cfg +0 -0
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ngiab_data_preprocess
-Version: 4.2.2
+Version: 4.3.0
 Summary: Graphical Tools for creating Next Gen Water model input data.
 Author-email: Josh Cunningham <jcunningham8@ua.edu>
 Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -15,7 +15,7 @@ Requires-Dist: pyogrio>=0.7.2
 Requires-Dist: pyproj>=3.6.1
 Requires-Dist: Flask==3.0.2
 Requires-Dist: geopandas>=1.0.0
-Requires-Dist: requests==2.32.
+Requires-Dist: requests==2.32.4
 Requires-Dist: igraph==0.11.4
 Requires-Dist: s3fs==2024.3.1
 Requires-Dist: xarray==2024.2.0
@@ -47,15 +47,19 @@ This repository contains tools for preparing data to run a [next gen](https://gi
 ## Table of Contents

 1. [What does this tool do?](#what-does-this-tool-do)
-2. [
-
-
-
-
+2. [What does it not do?](#what-does-it-not-do)
+   - [Evaluation](#evaluation)
+   - [Visualisation](#visualisation)
+3. [Requirements](#requirements)
+4. [Installation and Running](#installation-and-running)
+   - [Running without install](#running-without-install)
+5. [For legacy pip installation](#for-legacy-pip-installation)
+6. [Development Installation](#development-installation)
+7. [Usage](#usage)
+8. [CLI Documentation](#cli-documentation)
    - [Arguments](#arguments)
+   - [Usage Notes](#usage-notes)
    - [Examples](#examples)
-   - [File Formats](#file-formats)
-   - [Output](#output)

 ## What does this tool do?

@@ -229,12 +233,12 @@ Once all the steps are finished, you can run NGIAB on the folder shown underneat

 3. Create realization using a lat/lon pair and output to a named folder:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -r --start 2022-01-01 --end 2022-02-28 -o custom_output
    ```

 4. Perform all operations using a lat/lon pair:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -s -f -r --start 2022-01-01 --end 2022-02-28
    ```

 5. Subset hydrofabric using gage ID:
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/README.md
RENAMED

@@ -7,15 +7,19 @@ This repository contains tools for preparing data to run a [next gen](https://gi
 ## Table of Contents

 1. [What does this tool do?](#what-does-this-tool-do)
-2. [
-
-
-
-
+2. [What does it not do?](#what-does-it-not-do)
+   - [Evaluation](#evaluation)
+   - [Visualisation](#visualisation)
+3. [Requirements](#requirements)
+4. [Installation and Running](#installation-and-running)
+   - [Running without install](#running-without-install)
+5. [For legacy pip installation](#for-legacy-pip-installation)
+6. [Development Installation](#development-installation)
+7. [Usage](#usage)
+8. [CLI Documentation](#cli-documentation)
    - [Arguments](#arguments)
+   - [Usage Notes](#usage-notes)
    - [Examples](#examples)
-   - [File Formats](#file-formats)
-   - [Output](#output)

 ## What does this tool do?

@@ -189,12 +193,12 @@ Once all the steps are finished, you can run NGIAB on the folder shown underneat

 3. Create realization using a lat/lon pair and output to a named folder:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -r --start 2022-01-01 --end 2022-02-28 -o custom_output
    ```

 4. Perform all operations using a lat/lon pair:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -s -f -r --start 2022-01-01 --end 2022-02-28
    ```

 5. Subset hydrofabric using gage ID:
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/create_realization.py
RENAMED

@@ -3,15 +3,15 @@ import logging
 import multiprocessing
 import shutil
 import sqlite3
-from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
+from typing import Dict, Optional

 import pandas
 import requests
 import s3fs
 import xarray as xr
-from
+from data_processing.dask_utils import temp_cluster
 from data_processing.file_paths import file_paths
 from data_processing.gpkg_utils import (
     GeoPackage,
@@ -25,7 +25,8 @@ from tqdm.rich import tqdm
 logger = logging.getLogger(__name__)


-def get_approximate_gw_storage(paths: file_paths, start_date: datetime):
+@temp_cluster
+def get_approximate_gw_storage(paths: file_paths, start_date: datetime) -> Dict[str, int]:
     # get the gw levels from the NWM output on a given start date
     # this kind of works in place of warmstates for now
     year = start_date.strftime("%Y")
@@ -35,17 +36,10 @@ def get_approximate_gw_storage(paths: file_paths, start_date: datetime):
     fs = s3fs.S3FileSystem(anon=True)
     nc_url = f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/netcdf/GWOUT/{year}/{formatted_dt}.GWOUT_DOMAIN1"

-    # make sure there's a dask cluster running
-    try:
-        client = Client.current()
-    except ValueError:
-        cluster = LocalCluster()
-        client = Client(cluster)
-
     with fs.open(nc_url) as file_obj:
-        ds = xr.open_dataset(file_obj)
+        ds = xr.open_dataset(file_obj)  # type: ignore

-    water_levels = dict()
+    water_levels: Dict[str, int] = dict()
     for cat, feature in tqdm(cat_to_feature.items()):
         # this value is in CM, we need meters to match max_gw_depth
         # xarray says it's in mm, with 0.1 scale factor. calling .values doesn't apply the scale
@@ -114,13 +108,13 @@ def make_noahowp_config(
             lon=divide_conf_df.loc[divide, "longitude"],
             terrain_slope=divide_conf_df.loc[divide, "mean.slope_1km"],
             azimuth=divide_conf_df.loc[divide, "circ_mean.aspect"],
-            ISLTYP=int(divide_conf_df.loc[divide, "mode.ISLTYP"]),
-            IVGTYP=int(divide_conf_df.loc[divide, "mode.IVGTYP"]),
+            ISLTYP=int(divide_conf_df.loc[divide, "mode.ISLTYP"]),  # type: ignore
+            IVGTYP=int(divide_conf_df.loc[divide, "mode.IVGTYP"]),  # type: ignore
         )
     )


-def get_model_attributes_modspatialite(hydrofabric: Path):
+def get_model_attributes_modspatialite(hydrofabric: Path) -> pandas.DataFrame:
     # modspatialite is faster than pyproj but can't be added as a pip dependency
     # This incantation took a while
     with GeoPackage(hydrofabric) as conn:
@@ -151,7 +145,7 @@ def get_model_attributes_modspatialite(hydrofabric: Path):
     return divide_conf_df


-def get_model_attributes_pyproj(hydrofabric: Path):
+def get_model_attributes_pyproj(hydrofabric: Path) -> pandas.DataFrame:
     # if modspatialite is not available, use pyproj
     with sqlite3.connect(hydrofabric) as conn:
         sql = """
@@ -185,7 +179,7 @@ def get_model_attributes_pyproj(hydrofabric: Path):
     return divide_conf_df


-def get_model_attributes(hydrofabric: Path):
+def get_model_attributes(hydrofabric: Path) -> pandas.DataFrame:
     try:
         with GeoPackage(hydrofabric) as conn:
             conf_df = pandas.read_sql_query(
@@ -259,7 +253,7 @@ def make_em_config(

 def configure_troute(
     cat_id: str, config_dir: Path, start_time: datetime, end_time: datetime
-) ->
+) -> None:
     with open(file_paths.template_troute_config, "r") as file:
         troute_template = file.read()
     time_step_size = 300
@@ -316,7 +310,7 @@ def create_realization(
     start_time: datetime,
     end_time: datetime,
     use_nwm_gw: bool = False,
-    gage_id: str = None,
+    gage_id: Optional[str] = None,
 ):
     paths = file_paths(cat_id)

@@ -354,12 +348,12 @@ def create_realization(
     create_partitions(paths)


-def create_partitions(paths:
+def create_partitions(paths: file_paths, num_partitions: Optional[int] = None) -> None:
     if num_partitions is None:
         num_partitions = multiprocessing.cpu_count()

     cat_to_nex_pairs = get_cat_to_nex_flowpairs(hydrofabric=paths.geopackage_path)
-    nexus = defaultdict(list)
+    # nexus = defaultdict(list)

     # for cat, nex in cat_to_nex_pairs:
     #     nexus[nex].append(cat)
ngiab_data_preprocess-4.3.0/modules/data_processing/dask_utils.py
ADDED

@@ -0,0 +1,92 @@
+import logging
+
+from dask.distributed import Client
+
+logger = logging.getLogger(__name__)
+
+
+def shutdown_cluster():
+    try:
+        client = Client.current()
+        client.shutdown()
+    except ValueError:
+        logger.debug("No cluster found to shutdown")
+
+
+def no_cluster(func):
+    """
+    Decorator that ensures the wrapped function runs with no active Dask cluster.
+
+    This decorator attempts to shut down any existing Dask cluster before
+    executing the wrapped function. If no cluster is found, it logs a debug message
+    and continues execution.
+
+    Parameters:
+        func: The function to be executed without a Dask cluster
+
+    Returns:
+        wrapper: The wrapped function that will be executed without a Dask cluster
+    """
+
+    def wrapper(*args, **kwargs):
+        shutdown_cluster()
+        result = func(*args, **kwargs)
+        return result
+
+    return wrapper
+
+
+def use_cluster(func):
+    """
+    Decorator that ensures the wrapped function has access to a Dask cluster.
+
+    If a Dask cluster is already running, it uses the existing one.
+    If no cluster is available, it creates a new one before executing the function.
+    The cluster remains active after the function completes.
+
+    Parameters:
+        func: The function to be executed with a Dask cluster
+
+    Returns:
+        wrapper: The wrapped function with access to a Dask cluster
+    """
+
+    def wrapper(*args, **kwargs):
+        try:
+            client = Client.current()
+        except ValueError:
+            client = Client()
+        result = func(*args, **kwargs)
+        return result
+
+    return wrapper
+
+
+def temp_cluster(func):
+    """
+    Decorator that provides a temporary Dask cluster for the wrapped function.
+
+    If a Dask cluster is already running, it uses the existing one and leaves it running.
+    If no cluster exists, it creates a temporary one and shuts it down after
+    the function completes.
+
+    Parameters:
+        func: The function to be executed with a Dask cluster
+
+    Returns:
+        wrapper: The wrapped function with access to a Dask cluster
+    """
+
+    def wrapper(*args, **kwargs):
+        cluster_was_running = True
+        try:
+            client = Client.current()
+        except ValueError:
+            cluster_was_running = False
+            client = Client()
+        result = func(*args, **kwargs)
+        if not cluster_was_running:
+            client.shutdown()
+        return result
+
+    return wrapper
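The new dask_utils.py centralizes the cluster bootstrapping that create_realization.py previously did inline (the removed try/except LocalCluster block): get_approximate_gw_storage is now wrapped with @temp_cluster, and dataset_utils.py wraps save_dataset and save_to_cache with @use_cluster. A minimal usage sketch, assuming ngiab_data_preprocess 4.3.0 is installed so data_processing.dask_utils imports; the decorated functions below are hypothetical examples, not part of the package:

```python
from dask.distributed import Client

from data_processing.dask_utils import temp_cluster, use_cluster


@temp_cluster
def summarize_once(values):
    # Hypothetical example: a Dask client is guaranteed inside the wrapper; if
    # none was running before the call, temp_cluster creates one and shuts it
    # down after the function returns.
    return Client.current().submit(sum, values).result()


@use_cluster
def summarize_and_keep_cluster(values):
    # Same guarantee, but a cluster created here stays up for later calls.
    return Client.current().submit(sum, values).result()


if __name__ == "__main__":
    print(summarize_once([1, 2, 3]))              # temporary cluster
    print(summarize_and_keep_cluster([4, 5, 6]))  # cluster left running
```

Note that the wrappers do not apply functools.wraps, so decorated functions lose their original __name__ and docstring.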
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/dataset_utils.py
RENAMED
@@ -1,19 +1,22 @@
 import logging
 import os
+from datetime import datetime
 from pathlib import Path
-from typing import Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union

 import geopandas as gpd
 import numpy as np
 import xarray as xr
-from
-import
+from xarray.core.types import InterpOptions
+from dask.distributed import Client, progress, Future
+from data_processing.dask_utils import use_cluster

 logger = logging.getLogger(__name__)

 # known ngen variable names
 # https://github.com/CIROH-UA/ngen/blob/4fb5bb68dc397298bca470dfec94db2c1dcb42fe/include/forcing/AorcForcing.hpp#L77

+
 def validate_dataset_format(dataset: xr.Dataset) -> None:
     """
     Validate the format of the dataset.
@@ -41,8 +44,9 @@ def validate_dataset_format(dataset: xr.Dataset) -> None:
     if "name" not in dataset.attrs:
         raise ValueError("Dataset must have a name attribute to identify it")

+
 def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]:
-
+    """
     Ensure that all selected times are in the passed dataset.

     Parameters
@@ -60,7 +64,7 @@ def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) ->
         start_time, or if not available, earliest available timestep in dataset.
     str
         end_time, or if not available, latest available timestep in dataset.
-
+    """
     end_time_in_dataset = dataset.time.isel(time=-1).values
     start_time_in_dataset = dataset.time.isel(time=0).values
     if np.datetime64(start_time) < start_time_in_dataset:
@@ -77,7 +81,10 @@

 def clip_dataset_to_bounds(
-    dataset: xr.Dataset,
+    dataset: xr.Dataset,
+    bounds: Tuple[float, float, float, float] | np.ndarray[tuple[int], np.dtype[np.float64]],
+    start_time: str,
+    end_time: str,
 ) -> xr.Dataset:
     """
     Clip the dataset to specified geographical bounds.
@@ -86,14 +93,14 @@
     ----------
     dataset : xr.Dataset
         Dataset to be clipped.
-    bounds : tuple[float, float, float, float]
-        Corners of bounding box. bounds[0] is x_min, bounds[1] is y_min,
+    bounds : tuple[float, float, float, float] | np.ndarray[tuple[int], np.dtype[np.float64]]
+        Corners of bounding box. bounds[0] is x_min, bounds[1] is y_min,
         bounds[2] is x_max, bounds[3] is y_max.
     start_time : str
         Desired start time in YYYY/MM/DD HH:MM:SS format.
     end_time : str
         Desired end time in YYYY/MM/DD HH:MM:SS format.
-
+
     Returns
     -------
     xr.Dataset
@@ -110,33 +117,103 @@
     return dataset


-def
-
-
+def interpolate_nan_values(
+    dataset: xr.Dataset,
+    variables: Optional[List[str]] = None,
+    dim: str = "time",
+    method: InterpOptions = "nearest",
+    fill_value: str = "extrapolate",
+) -> None:
+    """
+    Interpolates NaN values in specified (or all numeric time-dependent)
+    variables of an xarray.Dataset. Operates inplace on the dataset.

-
-
+    Parameters
+    ----------
+    dataset : xr.Dataset
+        The input dataset.
+    variables : Optional[List[str]], optional
+        A list of variable names to process. If None (default),
+        all numeric variables containing the specified dimension will be processed.
+    dim : str, optional
+        The dimension along which to interpolate (default is "time").
+    method : str, optional
+        Interpolation method to use (e.g., "linear", "nearest", "cubic").
+        Default is "nearest".
+    fill_value : str, optional
+        Method for filling NaNs at the start/end of the series after interpolation.
+        Set to "extrapolate" to fill with the nearest valid value when using 'nearest' or 'linear'.
+        Default is "extrapolate".
+    """
+    for name, var in dataset.data_vars.items():
+        # if the variable is non-numeric, skip
+        if not np.issubdtype(var.dtype, np.number):
+            continue
+        # if there are no NANs, skip
+        if not var.isnull().any().compute():
+            continue
+
+        dataset[name] = var.interpolate_na(
+            dim=dim,
+            method=method,
+            fill_value=fill_value if method in ["nearest", "linear"] else None,
+        )

-    # sort of terrible work around for half downloaded files
-    temp_path = cached_nc_path.with_suffix(".downloading.nc")
-    if os.path.exists(temp_path):
-        os.remove(temp_path)

-
-
-
-
+@use_cluster
+def save_dataset(
+    ds_to_save: xr.Dataset,
+    target_path: Path,
+    engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
+):
+    """
+    Helper function to compute and save an xarray.Dataset to a NetCDF file.
+    Uses a temporary file and rename for atomicity.
+    """
+    if not target_path.parent.exists():
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+    temp_file_path = target_path.with_name(target_path.name + ".saving.nc")
+    if temp_file_path.exists():
+        os.remove(temp_file_path)

     client = Client.current()
-    future = client.compute(
-
+    future: Future = client.compute(
+        ds_to_save.to_netcdf(temp_file_path, engine=engine, compute=False)
+    )  # type: ignore
+    logger.debug(
+        f"NetCDF write task submitted to Dask. Waiting for completion to {temp_file_path}..."
+    )
     progress(future)
     future.result()
+    os.rename(str(temp_file_path), str(target_path))
+    logger.info(f"Successfully saved data to: {target_path}")
+
+
+@use_cluster
+def save_to_cache(
+    stores: xr.Dataset, cached_nc_path: Path, interpolate_nans: bool = True
+) -> xr.Dataset:
+    """
+    Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth.
+    """
+    logger.info(f"Processing dataset for caching. Final cache target: {cached_nc_path}")

-
+    # lasily cast all numbers to f32
+    for name, var in stores.data_vars.items():
+        if np.issubdtype(var.dtype, np.number):
+            stores[name] = var.astype("float32", casting="same_kind")

-
-
+    # save dataset locally before manipulating it
+    save_dataset(stores, cached_nc_path)
+    stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+
+    if interpolate_nans:
+        interpolate_nan_values(dataset=stores)
+        save_dataset(stores, cached_nc_path)
+        stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+
+    return stores


 def check_local_cache(
@@ -144,9 +221,8 @@ def check_local_cache(
     start_time: str,
     end_time: str,
     gdf: gpd.GeoDataFrame,
-    remote_dataset: xr.Dataset
+    remote_dataset: xr.Dataset,
 ) -> Union[xr.Dataset, None]:
-
     merged_data = None

     if not os.path.exists(cached_nc_path):
@@ -155,9 +231,7 @@ def check_local_cache(

     logger.info("Found cached nc file")
     # open the cached file and check that the time range is correct
-    cached_data = xr.open_mfdataset(
-        cached_nc_path, parallel=True, engine="h5netcdf"
-    )
+    cached_data = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")

     if "name" not in cached_data.attrs or "name" not in remote_dataset.attrs:
         logger.warning("No name attribute found to compare datasets")
@@ -166,9 +240,9 @@ def check_local_cache(
         logger.warning("Cached data from different source, .name attr doesn't match")
         return

-    range_in_cache = cached_data.time[0].values <= np.datetime64(
-
-
+    range_in_cache = cached_data.time[0].values <= np.datetime64(start_time) and cached_data.time[
+        -1
+    ].values >= np.datetime64(end_time)

     if not range_in_cache:
         # the cache does not contain the desired time range
@@ -186,10 +260,8 @@ def check_local_cache(
     if range_in_cache:
         logger.info("Time range is within cached data")
         logger.debug(f"Opened cached nc file: [{cached_nc_path}]")
-        merged_data = clip_dataset_to_bounds(
-
-        )
-        logger.debug("Clipped stores")
+        merged_data = clip_dataset_to_bounds(cached_data, gdf.total_bounds, start_time, end_time)
+        logger.debug("Clipped stores")

     return merged_data

@@ -197,16 +269,27 @@ def check_local_cache(
 def save_and_clip_dataset(
     dataset: xr.Dataset,
     gdf: gpd.GeoDataFrame,
-    start_time: datetime
-    end_time: datetime
+    start_time: datetime,
+    end_time: datetime,
     cache_location: Path,
 ) -> xr.Dataset:
     """convenience function clip the remote dataset, and either load from cache or save to cache if it's not present"""
     gdf = gdf.to_crs(dataset.crs)

-    cached_data = check_local_cache(
+    cached_data = check_local_cache(
+        cache_location,
+        start_time,  # type: ignore
+        end_time,  # type: ignore
+        gdf,
+        dataset,
+    )

     if not cached_data:
-        clipped_data = clip_dataset_to_bounds(
+        clipped_data = clip_dataset_to_bounds(
+            dataset,
+            gdf.total_bounds,
+            start_time,  # type: ignore
+            end_time,  # type: ignore
+        )
         cached_data = save_to_cache(clipped_data, cache_location)
-    return cached_data
+    return cached_data