ngiab-data-preprocess 4.2.2__py3-none-any.whl → 4.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,17 @@ import logging
  import multiprocessing
  import shutil
  import sqlite3
- from collections import defaultdict
  from datetime import datetime
  from pathlib import Path
+ from typing import Dict, Optional
+ import psutil
+ import os

  import pandas
  import requests
  import s3fs
  import xarray as xr
- from dask.distributed import Client, LocalCluster
+ from data_processing.dask_utils import temp_cluster
  from data_processing.file_paths import file_paths
  from data_processing.gpkg_utils import (
  GeoPackage,
@@ -25,7 +27,8 @@ from tqdm.rich import tqdm
  logger = logging.getLogger(__name__)


- def get_approximate_gw_storage(paths: file_paths, start_date: datetime):
+ @temp_cluster
+ def get_approximate_gw_storage(paths: file_paths, start_date: datetime) -> Dict[str, int]:
  # get the gw levels from the NWM output on a given start date
  # this kind of works in place of warmstates for now
  year = start_date.strftime("%Y")
@@ -35,17 +38,10 @@ def get_approximate_gw_storage(paths: file_paths, start_date: datetime):
  fs = s3fs.S3FileSystem(anon=True)
  nc_url = f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/netcdf/GWOUT/{year}/{formatted_dt}.GWOUT_DOMAIN1"

- # make sure there's a dask cluster running
- try:
- client = Client.current()
- except ValueError:
- cluster = LocalCluster()
- client = Client(cluster)
-
  with fs.open(nc_url) as file_obj:
- ds = xr.open_dataset(file_obj)
+ ds = xr.open_dataset(file_obj) # type: ignore

- water_levels = dict()
+ water_levels: Dict[str, int] = dict()
  for cat, feature in tqdm(cat_to_feature.items()):
  # this value is in CM, we need meters to match max_gw_depth
  # xarray says it's in mm, with 0.1 scale factor. calling .values doesn't apply the scale
@@ -114,13 +110,13 @@ def make_noahowp_config(
  lon=divide_conf_df.loc[divide, "longitude"],
  terrain_slope=divide_conf_df.loc[divide, "mean.slope_1km"],
  azimuth=divide_conf_df.loc[divide, "circ_mean.aspect"],
- ISLTYP=int(divide_conf_df.loc[divide, "mode.ISLTYP"]),
- IVGTYP=int(divide_conf_df.loc[divide, "mode.IVGTYP"]),
+ ISLTYP=int(divide_conf_df.loc[divide, "mode.ISLTYP"]), # type: ignore
+ IVGTYP=int(divide_conf_df.loc[divide, "mode.IVGTYP"]), # type: ignore
  )
  )


- def get_model_attributes_modspatialite(hydrofabric: Path):
+ def get_model_attributes_modspatialite(hydrofabric: Path) -> pandas.DataFrame:
  # modspatialite is faster than pyproj but can't be added as a pip dependency
  # This incantation took a while
  with GeoPackage(hydrofabric) as conn:
@@ -151,7 +147,7 @@ def get_model_attributes_modspatialite(hydrofabric: Path):
  return divide_conf_df


- def get_model_attributes_pyproj(hydrofabric: Path):
+ def get_model_attributes_pyproj(hydrofabric: Path) -> pandas.DataFrame:
  # if modspatialite is not available, use pyproj
  with sqlite3.connect(hydrofabric) as conn:
  sql = """
@@ -185,7 +181,7 @@ def get_model_attributes_pyproj(hydrofabric: Path):
  return divide_conf_df


- def get_model_attributes(hydrofabric: Path):
+ def get_model_attributes(hydrofabric: Path) -> pandas.DataFrame:
  try:
  with GeoPackage(hydrofabric) as conn:
  conf_df = pandas.read_sql_query(
@@ -259,11 +255,31 @@ def make_em_config(

  def configure_troute(
  cat_id: str, config_dir: Path, start_time: datetime, end_time: datetime
- ) -> int:
+ ) -> None:
  with open(file_paths.template_troute_config, "r") as file:
  troute_template = file.read()
  time_step_size = 300
+ gpkg_file_path = f"{config_dir}/{cat_id}_subset.gpkg"
  nts = (end_time - start_time).total_seconds() / time_step_size
+ with sqlite3.connect(gpkg_file_path) as conn:
+ ncats_df = pandas.read_sql_query("SELECT COUNT(id) FROM 'divides';", conn)
+ ncats = ncats_df['COUNT(id)'][0]
+
+ est_bytes_required = nts * ncats * 45 # extremely rough calculation based on about 3 tests :)
+ local_ram_available = 0.8 * psutil.virtual_memory().available # buffer to not accidentally explode machine
+
+ if est_bytes_required > local_ram_available:
+ max_loop_size = nts // (est_bytes_required // local_ram_available)
+ binary_nexus_file_folder_comment = ""
+ parent_dir = config_dir.parent
+ output_parquet_path = Path(f"{parent_dir}/outputs/parquet/")
+
+ if not output_parquet_path.exists():
+ os.makedirs(output_parquet_path)
+ else:
+ max_loop_size = nts
+ binary_nexus_file_folder_comment = "#"
+
  filled_template = troute_template.format(
  # hard coded to 5 minutes
  time_step_size=time_step_size,
@@ -272,7 +288,8 @@ def configure_troute(
  geo_file_path=f"./config/{cat_id}_subset.gpkg",
  start_datetime=start_time.strftime("%Y-%m-%d %H:%M:%S"),
  nts=nts,
- max_loop_size=nts,
+ max_loop_size=max_loop_size,
+ binary_nexus_file_folder_comment=binary_nexus_file_folder_comment
  )

  with open(config_dir / "troute.yaml", "w") as file:
@@ -316,7 +333,7 @@ def create_realization(
  start_time: datetime,
  end_time: datetime,
  use_nwm_gw: bool = False,
- gage_id: str = None,
+ gage_id: Optional[str] = None,
  ):
  paths = file_paths(cat_id)

@@ -354,12 +371,12 @@ def create_realization(
  create_partitions(paths)


- def create_partitions(paths: Path, num_partitions: int = None) -> None:
+ def create_partitions(paths: file_paths, num_partitions: Optional[int] = None) -> None:
  if num_partitions is None:
  num_partitions = multiprocessing.cpu_count()

  cat_to_nex_pairs = get_cat_to_nex_flowpairs(hydrofabric=paths.geopackage_path)
- nexus = defaultdict(list)
+ # nexus = defaultdict(list)

  # for cat, nex in cat_to_nex_pairs:
  # nexus[nex].append(cat)
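
The new memory check in configure_troute estimates the t-route output footprint as nts * ncats * 45 bytes and compares it to 80% of the currently available RAM; if the estimate does not fit, the run is broken into smaller loops and the parquet output folder is enabled by un-commenting the binary nexus file folder line in the template. A rough worked example of the same arithmetic, using a made-up run length and divide count rather than values from the package:

    import psutil

    # illustration only: a one-year run at 300 s timesteps over 500 divides
    nts = 365 * 24 * 3600 / 300                    # 105,120 routing timesteps
    ncats = 500                                    # divides counted from the subset geopackage
    est_bytes_required = nts * ncats * 45          # ~2.4 GB with the rough 45-bytes-per-value guess
    local_ram_available = 0.8 * psutil.virtual_memory().available

    if est_bytes_required > local_ram_available:
        # split the run into loops small enough to fit, as configure_troute does
        max_loop_size = nts // (est_bytes_required // local_ram_available)
    else:
        max_loop_size = nts

For the numbers above the estimate is about 2.4 GB, so any machine with more than roughly 3 GB of free RAM keeps max_loop_size equal to nts and skips the parquet fallback.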
@@ -0,0 +1,92 @@
+ import logging
+
+ from dask.distributed import Client
+
+ logger = logging.getLogger(__name__)
+
+
+ def shutdown_cluster():
+ try:
+ client = Client.current()
+ client.shutdown()
+ except ValueError:
+ logger.debug("No cluster found to shutdown")
+
+
+ def no_cluster(func):
+ """
+ Decorator that ensures the wrapped function runs with no active Dask cluster.
+
+ This decorator attempts to shut down any existing Dask cluster before
+ executing the wrapped function. If no cluster is found, it logs a debug message
+ and continues execution.
+
+ Parameters:
+ func: The function to be executed without a Dask cluster
+
+ Returns:
+ wrapper: The wrapped function that will be executed without a Dask cluster
+ """
+
+ def wrapper(*args, **kwargs):
+ shutdown_cluster()
+ result = func(*args, **kwargs)
+ return result
+
+ return wrapper
+
+
+ def use_cluster(func):
+ """
+ Decorator that ensures the wrapped function has access to a Dask cluster.
+
+ If a Dask cluster is already running, it uses the existing one.
+ If no cluster is available, it creates a new one before executing the function.
+ The cluster remains active after the function completes.
+
+ Parameters:
+ func: The function to be executed with a Dask cluster
+
+ Returns:
+ wrapper: The wrapped function with access to a Dask cluster
+ """
+
+ def wrapper(*args, **kwargs):
+ try:
+ client = Client.current()
+ except ValueError:
+ client = Client()
+ result = func(*args, **kwargs)
+ return result
+
+ return wrapper
+
+
+ def temp_cluster(func):
+ """
+ Decorator that provides a temporary Dask cluster for the wrapped function.
+
+ If a Dask cluster is already running, it uses the existing one and leaves it running.
+ If no cluster exists, it creates a temporary one and shuts it down after
+ the function completes.
+
+ Parameters:
+ func: The function to be executed with a Dask cluster
+
+ Returns:
+ wrapper: The wrapped function with access to a Dask cluster
+ """
+
+ def wrapper(*args, **kwargs):
+ cluster_was_running = True
+ try:
+ client = Client.current()
+ except ValueError:
+ cluster_was_running = False
+ client = Client()
+ result = func(*args, **kwargs)
+ if not cluster_was_running:
+ client.shutdown()
+ return result
+
+ return wrapper
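
The three decorators in the new dask_utils module encode a simple cluster policy: use_cluster guarantees a Dask client exists and leaves it running for later calls, temp_cluster creates one only if needed and shuts it down again afterwards, and no_cluster shuts any running client down before work that should stay in-process. A minimal usage sketch (the decorated functions below are hypothetical, not part of the package):

    from data_processing.dask_utils import no_cluster, temp_cluster, use_cluster

    @use_cluster
    def heavy_zarr_read():
        # runs with a shared Dask client that stays alive for later calls
        ...

    @temp_cluster
    def one_off_download():
        # gets a client if none exists and shuts it down again afterwards
        ...

    @no_cluster
    def local_only_postprocess():
        # any running cluster is shut down first; the work stays in-process
        ...

Because use_cluster leaves the client alive, repeated loader calls reuse the same scheduler instead of paying the startup cost on every call.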
@@ -1,19 +1,22 @@
  import logging
  import os
+ from datetime import datetime
  from pathlib import Path
- from typing import Tuple, Union
+ from typing import List, Literal, Optional, Tuple, Union

  import geopandas as gpd
  import numpy as np
  import xarray as xr
- from dask.distributed import Client, progress
- import datetime
+ from dask.distributed import Client, Future, progress
+ from data_processing.dask_utils import no_cluster, temp_cluster
+ from xarray.core.types import InterpOptions

  logger = logging.getLogger(__name__)

  # known ngen variable names
  # https://github.com/CIROH-UA/ngen/blob/4fb5bb68dc397298bca470dfec94db2c1dcb42fe/include/forcing/AorcForcing.hpp#L77

+
  def validate_dataset_format(dataset: xr.Dataset) -> None:
  """
  Validate the format of the dataset.
@@ -41,8 +44,9 @@ def validate_dataset_format(dataset: xr.Dataset) -> None:
  if "name" not in dataset.attrs:
  raise ValueError("Dataset must have a name attribute to identify it")

+
  def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]:
- '''
+ """
  Ensure that all selected times are in the passed dataset.

  Parameters
@@ -60,7 +64,7 @@ def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) ->
  start_time, or if not available, earliest available timestep in dataset.
  str
  end_time, or if not available, latest available timestep in dataset.
- '''
+ """
  end_time_in_dataset = dataset.time.isel(time=-1).values
  start_time_in_dataset = dataset.time.isel(time=0).values
  if np.datetime64(start_time) < start_time_in_dataset:
@@ -77,7 +81,10 @@ def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) ->


  def clip_dataset_to_bounds(
- dataset: xr.Dataset, bounds: Tuple[float, float, float, float], start_time: str, end_time: str
+ dataset: xr.Dataset,
+ bounds: Tuple[float, float, float, float] | np.ndarray[tuple[int], np.dtype[np.float64]],
+ start_time: str,
+ end_time: str,
  ) -> xr.Dataset:
  """
  Clip the dataset to specified geographical bounds.
@@ -86,14 +93,14 @@ def clip_dataset_to_bounds(
  ----------
  dataset : xr.Dataset
  Dataset to be clipped.
- bounds : tuple[float, float, float, float]
- Corners of bounding box. bounds[0] is x_min, bounds[1] is y_min,
+ bounds : tuple[float, float, float, float] | np.ndarray[tuple[int], np.dtype[np.float64]]
+ Corners of bounding box. bounds[0] is x_min, bounds[1] is y_min,
  bounds[2] is x_max, bounds[3] is y_max.
  start_time : str
  Desired start time in YYYY/MM/DD HH:MM:SS format.
  end_time : str
  Desired end time in YYYY/MM/DD HH:MM:SS format.
-
+
  Returns
  -------
  xr.Dataset
@@ -110,33 +117,137 @@ def clip_dataset_to_bounds(
  return dataset


- def save_to_cache(stores: xr.Dataset, cached_nc_path: Path) -> xr.Dataset:
- """Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth."""
- logger.info("Downloading and caching forcing data, this may take a while")
+ @no_cluster
+ def interpolate_nan_values(
+ dataset: xr.Dataset,
+ variables: Optional[List[str]] = None,
+ dim: str = "time",
+ method: InterpOptions = "nearest",
+ fill_value: str = "extrapolate",
+ ) -> bool:
+ """
+ Interpolates NaN values in specified (or all numeric time-dependent)
+ variables of an xarray.Dataset. Operates in place on the dataset.
+
+ Parameters
+ ----------
+ dataset : xr.Dataset
+ The input dataset.
+ variables : Optional[List[str]], optional
+ A list of variable names to process. If None (default),
+ all numeric variables containing the specified dimension will be processed.
+ dim : str, optional
+ The dimension along which to interpolate (default is "time").
+ method : str, optional
+ Interpolation method to use (e.g., "linear", "nearest", "cubic").
+ Default is "nearest".
+ fill_value : str, optional
+ Method for filling NaNs at the start/end of the series after interpolation.
+ Set to "extrapolate" to fill with the nearest valid value when using 'nearest' or 'linear'.
+ Default is "extrapolate".
+ """
+ interpolation_used = False
+ for name, var in dataset.data_vars.items():
+ # if the variable is non-numeric, skip
+ if not np.issubdtype(var.dtype, np.number):
+ continue
+ # if there are no NaNs, skip
+ if not var.isnull().any().compute():
+ continue
+
+ dataset[name] = var.interpolate_na(
+ dim=dim,
+ method=method,
+ fill_value=fill_value if method in ["nearest", "linear"] else None,
+ )
+ interpolation_used = True
+ return interpolation_used
+
+
+ @no_cluster
+ def save_dataset_no_cluster(
+ ds_to_save: xr.Dataset,
+ target_path: Path,
+ engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
+ ):
+ """
+ This explicitly does not use dask distributed.
+ Helper function to compute and save an xarray.Dataset to a NetCDF file.
+ Uses a temporary file and rename to avoid leaving a half-written file.
+ """
+ if not target_path.parent.exists():
+ target_path.parent.mkdir(parents=True, exist_ok=True)

- if not cached_nc_path.parent.exists():
- cached_nc_path.parent.mkdir(parents=True)
+ temp_file_path = target_path.with_name(target_path.name + ".saving.nc")
+ if temp_file_path.exists():
+ os.remove(temp_file_path)

- # sort of terrible work around for half downloaded files
- temp_path = cached_nc_path.with_suffix(".downloading.nc")
- if os.path.exists(temp_path):
- os.remove(temp_path)
+ ds_to_save.to_netcdf(temp_file_path, engine=engine, compute=True)

- ## Cast every single variable to float32 to save space to save a lot of memory issues later
- ## easier to do it now in this slow download step than later in the steps without dask
- for var in stores.data_vars:
- stores[var] = stores[var].astype("float32")
+ os.rename(str(temp_file_path), str(target_path))
+ logger.info(f"Successfully saved data to: {target_path}")
+
+
+ @temp_cluster
+ def save_dataset(
+ ds_to_save: xr.Dataset,
+ target_path: Path,
+ engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
+ ):
+ """
+ Helper function to compute and save an xarray.Dataset to a NetCDF file.
+ Uses a temporary file and rename for atomicity.
+ """
+ if not target_path.parent.exists():
+ target_path.parent.mkdir(parents=True, exist_ok=True)
+
+ temp_file_path = target_path.with_name(target_path.name + ".saving.nc")
+ if temp_file_path.exists():
+ os.remove(temp_file_path)

  client = Client.current()
- future = client.compute(stores.to_netcdf(temp_path, compute=False))
- # Display progress bar
+ future: Future = client.compute(
+ ds_to_save.to_netcdf(temp_file_path, engine=engine, compute=False)
+ ) # type: ignore
+ logger.debug(
+ f"NetCDF write task submitted to Dask. Waiting for completion to {temp_file_path}..."
+ )
+ logger.info("For more detailed progress, see the Dask dashboard http://localhost:8787/status")
  progress(future)
  future.result()
+ os.rename(str(temp_file_path), str(target_path))
+ logger.info(f"Successfully saved data to: {target_path}")
+

- os.rename(temp_path, cached_nc_path)
+ @no_cluster
+ def save_to_cache(
+ stores: xr.Dataset, cached_nc_path: Path, interpolate_nans: bool = True
+ ) -> xr.Dataset:
+ """
+ Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth.
+ """
+ logger.debug(f"Processing dataset for caching. Final cache target: {cached_nc_path}")

- data = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
- return data
+ # lazily cast all numbers to f32
+ for name, var in stores.data_vars.items():
+ if np.issubdtype(var.dtype, np.number):
+ stores[name] = var.astype("float32", casting="same_kind")
+
+ # save dataset locally before manipulating it
+ save_dataset(stores, cached_nc_path)
+
+ if interpolate_nans:
+ stores = xr.open_mfdataset(
+ cached_nc_path,
+ parallel=True,
+ engine="h5netcdf",
+ )
+ was_interpolated = interpolate_nan_values(dataset=stores)
+ if was_interpolated:
+ save_dataset_no_cluster(stores, cached_nc_path)
+
+ stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+ return stores


  def check_local_cache(
@@ -144,9 +255,8 @@ def check_local_cache(
  start_time: str,
  end_time: str,
  gdf: gpd.GeoDataFrame,
- remote_dataset: xr.Dataset
+ remote_dataset: xr.Dataset,
  ) -> Union[xr.Dataset, None]:
-
  merged_data = None

  if not os.path.exists(cached_nc_path):
@@ -155,9 +265,7 @@ def check_local_cache(

  logger.info("Found cached nc file")
  # open the cached file and check that the time range is correct
- cached_data = xr.open_mfdataset(
- cached_nc_path, parallel=True, engine="h5netcdf"
- )
+ cached_data = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")

  if "name" not in cached_data.attrs or "name" not in remote_dataset.attrs:
  logger.warning("No name attribute found to compare datasets")
@@ -166,9 +274,9 @@ def check_local_cache(
  logger.warning("Cached data from different source, .name attr doesn't match")
  return

- range_in_cache = cached_data.time[0].values <= np.datetime64(
- start_time
- ) and cached_data.time[-1].values >= np.datetime64(end_time)
+ range_in_cache = cached_data.time[0].values <= np.datetime64(start_time) and cached_data.time[
+ -1
+ ].values >= np.datetime64(end_time)

  if not range_in_cache:
  # the cache does not contain the desired time range
@@ -186,10 +294,8 @@ def check_local_cache(
  if range_in_cache:
  logger.info("Time range is within cached data")
  logger.debug(f"Opened cached nc file: [{cached_nc_path}]")
- merged_data = clip_dataset_to_bounds(
- cached_data, gdf.total_bounds, start_time, end_time
- )
- logger.debug("Clipped stores")
+ merged_data = clip_dataset_to_bounds(cached_data, gdf.total_bounds, start_time, end_time)
+ logger.debug("Clipped stores")

  return merged_data

@@ -197,16 +303,27 @@ def check_local_cache(
  def save_and_clip_dataset(
  dataset: xr.Dataset,
  gdf: gpd.GeoDataFrame,
- start_time: datetime.datetime,
- end_time: datetime.datetime,
+ start_time: datetime,
+ end_time: datetime,
  cache_location: Path,
  ) -> xr.Dataset:
  """convenience function clip the remote dataset, and either load from cache or save to cache if it's not present"""
  gdf = gdf.to_crs(dataset.crs)

- cached_data = check_local_cache(cache_location, start_time, end_time, gdf, dataset)
+ cached_data = check_local_cache(
+ cache_location,
+ start_time, # type: ignore
+ end_time, # type: ignore
+ gdf,
+ dataset,
+ )

  if not cached_data:
- clipped_data = clip_dataset_to_bounds(dataset, gdf.total_bounds, start_time, end_time)
+ clipped_data = clip_dataset_to_bounds(
+ dataset,
+ gdf.total_bounds,
+ start_time, # type: ignore
+ end_time, # type: ignore
+ )
  cached_data = save_to_cache(clipped_data, cache_location)
- return cached_data
+ return cached_data
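
save_to_cache now splits its work across the new helpers: numeric variables are cast to float32, save_dataset computes and writes the data on a temporary Dask cluster through a .saving.nc file that is renamed into place, and the cache is then reopened so NaN gaps can be interpolated and, if anything changed, rewritten without a cluster. The write-then-rename step on its own looks roughly like this sketch (the toy dataset and cache path are illustrative, not from the package):

    import os
    from pathlib import Path

    import numpy as np
    import xarray as xr

    # toy dataset standing in for clipped forcing data
    ds = xr.Dataset({"t2d": ("time", np.arange(24, dtype="float32"))})

    target = Path("cache/forcing.nc")  # hypothetical cache location
    target.parent.mkdir(parents=True, exist_ok=True)

    # write to a sibling temp file first, then rename, so a crash mid-write
    # never leaves a truncated file at the final path
    tmp = target.with_name(target.name + ".saving.nc")
    if tmp.exists():
        os.remove(tmp)
    ds.to_netcdf(tmp, engine="h5netcdf")
    os.rename(tmp, target)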
@@ -1,35 +1,31 @@
1
1
  import logging
2
+ from typing import Optional
2
3
 
3
4
  import s3fs
4
- from data_processing.s3fs_utils import S3ParallelFileSystem
5
5
  import xarray as xr
6
- from dask.distributed import Client, LocalCluster
6
+ from data_processing.dask_utils import use_cluster
7
7
  from data_processing.dataset_utils import validate_dataset_format
8
-
8
+ from data_processing.s3fs_utils import S3ParallelFileSystem
9
9
 
10
10
  logger = logging.getLogger(__name__)
11
11
 
12
12
 
13
- def load_v3_retrospective_zarr(forcing_vars: list[str] = None) -> xr.Dataset:
13
+ @use_cluster
14
+ def load_v3_retrospective_zarr(forcing_vars: Optional[list[str]] = None) -> xr.Dataset:
14
15
  """Load zarr datasets from S3 within the specified time range."""
15
16
  # if a LocalCluster is not already running, start one
16
17
  if not forcing_vars:
17
18
  forcing_vars = ["lwdown", "precip", "psfc", "q2d", "swdown", "t2d", "u2d", "v2d"]
18
- try:
19
- client = Client.current()
20
- except ValueError:
21
- cluster = LocalCluster()
22
- client = Client(cluster)
19
+
23
20
  s3_urls = [
24
- f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/{var}.zarr"
25
- for var in forcing_vars
21
+ f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/{var}.zarr" for var in forcing_vars
26
22
  ]
27
23
  # default cache is readahead which is detrimental to performance in this case
28
24
  fs = S3ParallelFileSystem(anon=True, default_cache_type="none") # default_block_size
29
25
  s3_stores = [s3fs.S3Map(url, s3=fs) for url in s3_urls]
30
26
  # the cache option here just holds accessed data in memory to prevent s3 being queried multiple times
31
27
  # most of the data is read once and written to disk but some of the coordinate data is read multiple times
32
- dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)
28
+ dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True) # type: ignore
33
29
 
34
30
  # set the crs attribute to conform with the format
35
31
  esri_pe_string = dataset.crs.esri_pe_string
@@ -54,7 +50,8 @@ def load_v3_retrospective_zarr(forcing_vars: list[str] = None) -> xr.Dataset:
54
50
  return dataset
55
51
 
56
52
 
57
- def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
53
+ @use_cluster
54
+ def load_aorc_zarr(start_year: Optional[int] = None, end_year: Optional[int] = None) -> xr.Dataset:
58
55
  """Load the aorc zarr dataset from S3."""
59
56
  if not start_year or not end_year:
60
57
  logger.warning("No start or end year provided, defaulting to 1979-2023")
@@ -63,11 +60,6 @@ def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
63
60
  start_year = 1979
64
61
  if not end_year:
65
62
  end_year = 2023
66
- try:
67
- client = Client.current()
68
- except ValueError:
69
- cluster = LocalCluster()
70
- client = Client(cluster)
71
63
 
72
64
  logger.info(f"Loading AORC zarr datasets from {start_year} to {end_year}")
73
65
  estimated_time_s = ((end_year - start_year) * 2.5) + 3.5
@@ -75,9 +67,9 @@ def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
75
67
  logger.info(f"This should take roughly {estimated_time_s} seconds")
76
68
  fs = S3ParallelFileSystem(anon=True, default_cache_type="none")
77
69
  s3_url = "s3://noaa-nws-aorc-v1-1-1km/"
78
- urls = [f"{s3_url}{i}.zarr" for i in range(start_year, end_year+1)]
70
+ urls = [f"{s3_url}{i}.zarr" for i in range(start_year, end_year + 1)]
79
71
  filestores = [s3fs.S3Map(url, s3=fs) for url in urls]
80
- dataset = xr.open_mfdataset(filestores, parallel=True, engine="zarr", cache=True)
72
+ dataset = xr.open_mfdataset(filestores, parallel=True, engine="zarr", cache=True) # type: ignore
81
73
  dataset.attrs["crs"] = "+proj=longlat +datum=WGS84 +no_defs"
82
74
  dataset.attrs["name"] = "aorc_1km_zarr"
83
75
  # rename latitude and longitude to x and y
@@ -87,32 +79,29 @@ def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
87
79
  return dataset
88
80
 
89
81
 
82
+ @use_cluster
90
83
  def load_swe_zarr() -> xr.Dataset:
91
84
  """Load the swe zarr dataset from S3."""
92
- s3_urls = [
93
- f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/ldasout.zarr"
94
- ]
85
+ s3_urls = ["s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/ldasout.zarr"]
95
86
  # default cache is readahead which is detrimental to performance in this case
96
87
  fs = S3ParallelFileSystem(anon=True, default_cache_type="none") # default_block_size
97
88
  s3_stores = [s3fs.S3Map(url, s3=fs) for url in s3_urls]
98
89
  # the cache option here just holds accessed data in memory to prevent s3 being queried multiple times
99
90
  # most of the data is read once and written to disk but some of the coordinate data is read multiple times
100
- dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)
101
-
91
+ dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True) # type: ignore
92
+
102
93
  # set the crs attribute to conform with the format
103
94
  esri_pe_string = dataset.crs.esri_pe_string
104
95
  dataset = dataset.drop_vars(["crs"])
105
96
  dataset.attrs["crs"] = esri_pe_string
106
97
  # drop everything except SNEQV
107
98
  vars_to_drop = list(dataset.data_vars)
108
- vars_to_drop.remove('SNEQV')
99
+ vars_to_drop.remove("SNEQV")
109
100
  dataset = dataset.drop_vars(vars_to_drop)
110
101
  dataset.attrs["name"] = "v3_swe_zarr"
111
102
 
112
103
  # rename the data vars to work with ngen
113
- variables = {
114
- "SNEQV": "swe"
115
- }
104
+ variables = {"SNEQV": "swe"}
116
105
  dataset = dataset.rename_vars(variables)
117
106
 
118
107
  validate_dataset_format(dataset)