ngiab-data-preprocess 4.2.1__py3-none-any.whl → 4.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,35 +1,31 @@
  import logging
+ from typing import Optional

  import s3fs
- from data_processing.s3fs_utils import S3ParallelFileSystem
  import xarray as xr
- from dask.distributed import Client, LocalCluster
+ from data_processing.dask_utils import use_cluster
  from data_processing.dataset_utils import validate_dataset_format
-
+ from data_processing.s3fs_utils import S3ParallelFileSystem

  logger = logging.getLogger(__name__)


- def load_v3_retrospective_zarr(forcing_vars: list[str] = None) -> xr.Dataset:
+ @use_cluster
+ def load_v3_retrospective_zarr(forcing_vars: Optional[list[str]] = None) -> xr.Dataset:
      """Load zarr datasets from S3 within the specified time range."""
      # if a LocalCluster is not already running, start one
      if not forcing_vars:
          forcing_vars = ["lwdown", "precip", "psfc", "q2d", "swdown", "t2d", "u2d", "v2d"]
-     try:
-         client = Client.current()
-     except ValueError:
-         cluster = LocalCluster()
-         client = Client(cluster)
+
      s3_urls = [
-         f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/{var}.zarr"
-         for var in forcing_vars
+         f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/{var}.zarr" for var in forcing_vars
      ]
      # default cache is readahead which is detrimental to performance in this case
      fs = S3ParallelFileSystem(anon=True, default_cache_type="none")  # default_block_size
      s3_stores = [s3fs.S3Map(url, s3=fs) for url in s3_urls]
      # the cache option here just holds accessed data in memory to prevent s3 being queried multiple times
      # most of the data is read once and written to disk but some of the coordinate data is read multiple times
-     dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)
+     dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)  # type: ignore

      # set the crs attribute to conform with the format
      esri_pe_string = dataset.crs.esri_pe_string
@@ -54,7 +50,8 @@ def load_v3_retrospective_zarr(forcing_vars: list[str] = None) -> xr.Dataset:
      return dataset


- def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
+ @use_cluster
+ def load_aorc_zarr(start_year: Optional[int] = None, end_year: Optional[int] = None) -> xr.Dataset:
      """Load the aorc zarr dataset from S3."""
      if not start_year or not end_year:
          logger.warning("No start or end year provided, defaulting to 1979-2023")
@@ -63,11 +60,6 @@ def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
          start_year = 1979
      if not end_year:
          end_year = 2023
-     try:
-         client = Client.current()
-     except ValueError:
-         cluster = LocalCluster()
-         client = Client(cluster)

      logger.info(f"Loading AORC zarr datasets from {start_year} to {end_year}")
      estimated_time_s = ((end_year - start_year) * 2.5) + 3.5
@@ -75,9 +67,9 @@ def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
      logger.info(f"This should take roughly {estimated_time_s} seconds")
      fs = S3ParallelFileSystem(anon=True, default_cache_type="none")
      s3_url = "s3://noaa-nws-aorc-v1-1-1km/"
-     urls = [f"{s3_url}{i}.zarr" for i in range(start_year, end_year+1)]
+     urls = [f"{s3_url}{i}.zarr" for i in range(start_year, end_year + 1)]
      filestores = [s3fs.S3Map(url, s3=fs) for url in urls]
-     dataset = xr.open_mfdataset(filestores, parallel=True, engine="zarr", cache=True)
+     dataset = xr.open_mfdataset(filestores, parallel=True, engine="zarr", cache=True)  # type: ignore
      dataset.attrs["crs"] = "+proj=longlat +datum=WGS84 +no_defs"
      dataset.attrs["name"] = "aorc_1km_zarr"
      # rename latitude and longitude to x and y
@@ -87,32 +79,29 @@ def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
      return dataset


+ @use_cluster
  def load_swe_zarr() -> xr.Dataset:
      """Load the swe zarr dataset from S3."""
-     s3_urls = [
-         f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/ldasout.zarr"
-     ]
+     s3_urls = ["s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/ldasout.zarr"]
      # default cache is readahead which is detrimental to performance in this case
      fs = S3ParallelFileSystem(anon=True, default_cache_type="none")  # default_block_size
      s3_stores = [s3fs.S3Map(url, s3=fs) for url in s3_urls]
      # the cache option here just holds accessed data in memory to prevent s3 being queried multiple times
      # most of the data is read once and written to disk but some of the coordinate data is read multiple times
-     dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)
-
+     dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)  # type: ignore
+
      # set the crs attribute to conform with the format
      esri_pe_string = dataset.crs.esri_pe_string
      dataset = dataset.drop_vars(["crs"])
      dataset.attrs["crs"] = esri_pe_string
      # drop everything except SNEQV
      vars_to_drop = list(dataset.data_vars)
-     vars_to_drop.remove('SNEQV')
+     vars_to_drop.remove("SNEQV")
      dataset = dataset.drop_vars(vars_to_drop)
      dataset.attrs["name"] = "v3_swe_zarr"

      # rename the data vars to work with ngen
-     variables = {
-         "SNEQV": "swe"
-     }
+     variables = {"SNEQV": "swe"}
      dataset = dataset.rename_vars(variables)

      validate_dataset_format(dataset)
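
Each of these loaders previously spun up its own Dask client inline with a try/except around Client.current(); in 4.3.0 that boilerplate moves into the @use_cluster decorator imported from data_processing.dask_utils, a module whose contents are not part of this diff. A minimal sketch of what such a decorator could look like, assuming it simply wraps the removed inline logic around the decorated function:

    import functools

    from dask.distributed import Client, LocalCluster


    def use_cluster(func):
        """Hypothetical sketch: make sure a Dask client exists before calling func."""

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                client = Client.current()  # reuse an already-running client if there is one
            except ValueError:
                client = Client(LocalCluster())  # otherwise start a LocalCluster, mirroring the removed code
            result = func(*args, **kwargs)  # the local reference keeps the client alive during the call
            return result

        return wrapper

Whatever the real implementation looks like, the visible effect in this file is that the three load_* functions no longer manage the cluster lifecycle themselves.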
@@ -1,11 +1,13 @@
  from pathlib import Path
-
+ from typing import Optional
+ from datetime import datetime

  class file_paths:
      """
      This class contains all of the file paths used in the data processing
      workflow.
      """
+
      config_file = Path("~/.ngiab/preprocessor").expanduser()
      hydrofabric_dir = Path("~/.ngiab/hydrofabric/v2.2").expanduser()
      hydrofabric_download_log = Path("~/.ngiab/hydrofabric/v2.2/download_log.json").expanduser()
@@ -31,7 +33,7 @@ class file_paths:
      template_em_config = data_sources / "em-catchment-template.yml"
      template_em_model_config = data_sources / "em-config.yml"

-     def __init__(self, folder_name: str = None, output_dir: Path = None):
+     def __init__(self, folder_name: Optional[str] = None, output_dir: Optional[Path] = None):
          """
          Initialize the file_paths class with a the name of the output subfolder.
          OR the path to the output folder you want to use.
@@ -53,7 +55,7 @@ class file_paths:
          self.cache_dir.mkdir(parents=True, exist_ok=True)

      @classmethod
-     def get_working_dir(cls) -> Path:
+     def get_working_dir(cls) -> Path | None:
          try:
              with open(cls.config_file, "r") as f:
                  return Path(f.readline().strip()).expanduser()
@@ -67,9 +69,7 @@ class file_paths:
 
      @classmethod
      def root_output_dir(cls) -> Path:
-         if cls.get_working_dir() is not None:
-             return cls.get_working_dir()
-         return Path(__file__).parent.parent.parent / "output"
+         return cls.get_working_dir() or Path(__file__).parent.parent.parent / "output"

      @property
      def subset_dir(self) -> Path:
@@ -102,7 +102,7 @@ class file_paths:
      def append_cli_command(self, command: list[str]) -> None:
          current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
          command_string = " ".join(command)
-         history_file = self.metadata_dir / "cli_commands_history.txt"
+         history_file = self.metadata_dir / "cli_commands_history.txt"
          if not history_file.parent.exists():
              history_file.parent.mkdir(parents=True, exist_ok=True)
          with open(self.metadata_dir / "cli_commands_history.txt", "a") as f:
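
In this file, get_working_dir now advertises that it can return None (Path | None), and root_output_dir collapses the old is-not-None branch into a single expression. Because pathlib.Path objects are always truthy and only None is falsy, the `or` fallback behaves the same as the removed check while also reading the config file only once. A small illustration with a hypothetical helper (pick_output_dir is not part of the package):

    from pathlib import Path
    from typing import Optional


    def pick_output_dir(working_dir: Optional[Path], default: Path) -> Path:
        # Path instances are always truthy, so `or` only falls back when working_dir is None,
        # which matches the removed `if ... is not None` branch.
        return working_dir or default


    assert pick_output_dir(None, Path("output")) == Path("output")
    assert pick_output_dir(Path("/data/ngiab"), Path("output")) == Path("/data/ngiab")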
@@ -3,32 +3,29 @@ import multiprocessing
  import os
  import time
  import warnings
- from datetime import datetime
  from functools import partial
  from math import ceil
  from multiprocessing import shared_memory
  from pathlib import Path
-
- from dask.distributed import Client, LocalCluster
+ from typing import List, Tuple

  import geopandas as gpd
  import numpy as np
  import pandas as pd
  import psutil
  import xarray as xr
- from data_processing.file_paths import file_paths
+ from data_processing.dask_utils import no_cluster, use_cluster
  from data_processing.dataset_utils import validate_dataset_format
+ from data_processing.file_paths import file_paths
  from exactextract import exact_extract
  from exactextract.raster import NumPyRasterSource
  from rich.progress import (
-     Progress,
      BarColumn,
+     Progress,
      TextColumn,
      TimeElapsedColumn,
      TimeRemainingColumn,
  )
- from typing import Tuple
-

  logger = logging.getLogger(__name__)
  # Suppress the specific warning from numpy to keep the cli output clean
@@ -40,13 +37,13 @@ warnings.filterwarnings(
  )


- def weighted_sum_of_cells(flat_raster: np.ndarray,
-                           cell_ids: np.ndarray,
-                           factors: np.ndarray) -> np.ndarray:
-     '''
+ def weighted_sum_of_cells(
+     flat_raster: np.ndarray, cell_ids: np.ndarray, factors: np.ndarray
+ ) -> np.ndarray:
+     """
      Take an average of each forcing variable in a catchment. Create an output
-     array initialized with zeros, and then sum up the forcing variable and
-     divide by the sum of the cell weights to get an averaged forcing variable
+     array initialized with zeros, and then sum up the forcing variable and
+     divide by the sum of the cell weights to get an averaged forcing variable
      for the entire catchment.

      Parameters
@@ -65,7 +62,7 @@ def weighted_sum_of_cells(flat_raster: np.ndarray,
          An one-dimensional array, where each element corresponds to a timestep.
          Each element contains the averaged forcing value for the whole catchment
          over one timestep.
-     '''
+     """
      result = np.zeros(flat_raster.shape[0])
      result = np.sum(flat_raster[:, cell_ids] * factors, axis=1)
      sum_of_weights = np.sum(factors)
@@ -73,12 +70,10 @@ def weighted_sum_of_cells(flat_raster: np.ndarray,
      return result


- def get_cell_weights(raster: xr.Dataset,
-                      gdf: gpd.GeoDataFrame,
-                      wkt: str) -> pd.DataFrame:
-     '''
-     Get the cell weights (coverage) for each cell in a divide. Coverage is
-     defined as the fraction (a float in [0,1]) of a raster cell that overlaps
+ def get_cell_weights(raster: xr.Dataset, gdf: gpd.GeoDataFrame, wkt: str) -> pd.DataFrame:
+     """
+     Get the cell weights (coverage) for each cell in a divide. Coverage is
+     defined as the fraction (a float in [0,1]) of a raster cell that overlaps
      with the polygon in the passed gdf.

      Parameters
@@ -96,35 +91,37 @@ def get_cell_weights(raster: xr.Dataset,
      pd.DataFrame
          DataFrame indexed by divide_id that contains information about coverage
          for each raster cell in gridded forcing file.
-     '''
-     xmin = raster.x[0]
-     xmax = raster.x[-1]
-     ymin = raster.y[0]
-     ymax = raster.y[-1]
+     """
+     xmin = min(raster.x)
+     xmax = max(raster.x)
+     ymin = min(raster.y)
+     ymax = max(raster.y)
      data_vars = list(raster.data_vars)
      rastersource = NumPyRasterSource(
          raster[data_vars[0]], srs_wkt=wkt, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax
      )
-     output = exact_extract(
+     output: pd.DataFrame = exact_extract(
          rastersource,
          gdf,
          ["cell_id", "coverage"],
          include_cols=["divide_id"],
          output="pandas",
-     )
+     )  # type: ignore
      return output.set_index("divide_id")


  def add_APCP_SURFACE_to_dataset(dataset: xr.Dataset) -> xr.Dataset:
-     '''Convert precipitation value to correct units.'''
+     """Convert precipitation value to correct units."""
      # precip_rate is mm/s
      # cfe says input atmosphere_water__liquid_equivalent_precipitation_rate is mm/h
      # nom says prcpnonc input is mm/s
      # technically should be kg/m^2/s at 1kg = 1l it equates to mm/s
      # nom says qinsur output is m/s, hopefully qinsur is converted to mm/h by ngen
      dataset["APCP_surface"] = dataset["precip_rate"] * 3600
-     dataset["APCP_surface"].attrs["units"] = "mm h^-1" # ^-1 notation copied from source data
-     dataset["APCP_surface"].attrs["source_note"] = "This is just the precip_rate variable converted to mm/h by multiplying by 3600"
+     dataset["APCP_surface"].attrs["units"] = "mm h^-1"  # ^-1 notation copied from source data
+     dataset["APCP_surface"].attrs["source_note"] = (
+         "This is just the precip_rate variable converted to mm/h by multiplying by 3600"
+     )
      return dataset


@@ -132,14 +129,14 @@ def add_precip_rate_to_dataset(dataset: xr.Dataset) -> xr.Dataset:
      # the inverse of the function above
      dataset["precip_rate"] = dataset["APCP_surface"] / 3600
      dataset["precip_rate"].attrs["units"] = "mm s^-1"
-     dataset["precip_rate"].attrs[
-         "source_note"
-     ] = "This is just the APCP_surface variable converted to mm/s by dividing by 3600"
+     dataset["precip_rate"].attrs["source_note"] = (
+         "This is just the APCP_surface variable converted to mm/s by dividing by 3600"
+     )
      return dataset


  def get_index_chunks(data: xr.DataArray) -> list[tuple[int, int]]:
-     '''
+     """
      Take a DataArray and calculate the start and end index for each chunk based
      on the available memory.

@@ -153,7 +150,7 @@ def get_index_chunks(data: xr.DataArray) -> list[tuple[int, int]]:
      list[Tuple[int, int]]
          Each element in the list represents a chunk of data. The tuple within
          the chunk indicates the start index and end index of the chunk.
-     '''
+     """
      array_memory_usage = data.nbytes
      free_memory = psutil.virtual_memory().available * 0.8  # 80% of available memory
      # limit the chunk to 20gb, makes things more stable
@@ -166,15 +163,13 @@ def get_index_chunks(data: xr.DataArray) -> list[tuple[int, int]]:
      return index_chunks


- def create_shared_memory(lazy_array: xr.Dataset) -> Tuple[
-         shared_memory.SharedMemory,
-         np.dtype,
-         np.dtype
- ]:
-     '''
-     Create a shared memory object so that multiple processes can access loaded
+ def create_shared_memory(
+     lazy_array: xr.DataArray,
+ ) -> Tuple[shared_memory.SharedMemory, Tuple[int, ...], np.dtype]:
+     """
+     Create a shared memory object so that multiple processes can access loaded
      data.
-
+
      Parameters
      ----------
      lazy_array : xr.Dataset
@@ -183,22 +178,22 @@ def create_shared_memory(lazy_array: xr.Dataset) -> Tuple[
      Returns
      -------
      shared_memory.SharedMemory
-         A specific block of memory allocated by the OS of the size of
+         A specific block of memory allocated by the OS of the size of
          lazy_array.
-     np.dtype.shape
+     Tuple[int, ...]
          A shape object with dimensions (# timesteps, # of raster cells) in
          reference to lazy_array.
      np.dtype
          Data type of objects in lazy_array.
-     '''
-     logger.debug(f"Creating shared memory size {lazy_array.nbytes/ 10**6} Mb.")
+     """
+     logger.debug(f"Creating shared memory size {lazy_array.nbytes / 10**6} Mb.")
      shm = shared_memory.SharedMemory(create=True, size=lazy_array.nbytes)
      shared_array = np.ndarray(lazy_array.shape, dtype=np.float32, buffer=shm.buf)
      # if your data is not float32, xarray will do an automatic conversion here
      # which consumes a lot more memory, forcings downloaded with this tool will work
      for start, end in get_index_chunks(lazy_array):
-         # copy data from lazy to shared memory one chunk at a time
-         shared_array[start:end] = lazy_array[start:end]
+         # copy data from lazy to shared memory one chunk at a time
+         shared_array[start:end] = lazy_array[start:end]

      time, x, y = shared_array.shape
      shared_array = shared_array.reshape(time, -1)
@@ -206,14 +201,16 @@ def create_shared_memory(lazy_array: xr.Dataset) -> Tuple[
      return shm, shared_array.shape, shared_array.dtype


- def process_chunk_shared(variable: str,
-                          times: np.ndarray,
-                          shm_name: str,
-                          shape: np.dtype.shape,
-                          dtype: np.dtype,
-                          chunk: gpd.GeoDataFrame) -> xr.DataArray:
-     '''
-     Process the gridded forcings chunk loaded into a SharedMemory block.
+ def process_chunk_shared(
+     variable: str,
+     times: np.ndarray,
+     shm_name: str,
+     shape: Tuple[int, ...],
+     dtype: np.dtype,
+     chunk: pd.DataFrame,
+ ) -> xr.DataArray:
+     """
+     Process the gridded forcings chunk loaded into a SharedMemory block.

      Parameters
      ----------
@@ -235,7 +232,7 @@ def process_chunk_shared(variable: str,
      -------
      xr.DataArray
          Averaged forcings data for each timestep for each catchment.
-     '''
+     """
      existing_shm = shared_memory.SharedMemory(name=shm_name)
      raster = np.ndarray(shape, dtype=dtype, buffer=existing_shm.buf)
      results = []
@@ -256,10 +253,10 @@ def process_chunk_shared(variable: str,
      return xr.concat(results, dim="catchment")


- def get_cell_weights_parallel(gdf: gpd.GeoDataFrame,
-                               input_forcings: xr.Dataset,
-                               num_partitions: int) -> pd.DataFrame:
-     '''
+ def get_cell_weights_parallel(
+     gdf: gpd.GeoDataFrame, input_forcings: xr.Dataset, num_partitions: int
+ ) -> pd.DataFrame:
+     """
      Execute get_cell_weights with multiprocessing, with chunking for the passed
      GeoDataFrame to conserve memory usage.

@@ -277,29 +274,30 @@ def get_cell_weights_parallel(gdf: gpd.GeoDataFrame,
      pd.DataFrame
          DataFrame indexed by divide_id that contains information about coverage
          for each raster cell and each timestep in gridded forcing file.
-     '''
+     """
      gdf_chunks = np.array_split(gdf, num_partitions)
-     wkt = gdf.crs.to_wkt()
+     wkt = gdf.crs.to_wkt()  # type: ignore
      one_timestep = input_forcings.isel(time=0).compute()
      with multiprocessing.Pool() as pool:
          args = [(one_timestep, gdf_chunk, wkt) for gdf_chunk in gdf_chunks]
          catchments = pool.starmap(get_cell_weights, args)
      return pd.concat(catchments)

+
  def get_units(dataset: xr.Dataset) -> dict:
-     '''
+     """
      Return dictionary of units for each variable in dataset.
-
+
      Parameters
      ----------
      dataset : xr.Dataset
          Dataset with variables and units.
-
+
      Returns
      -------
-     dict
+     dict
          {variable name: unit}
-     '''
+     """
      units = {}
      for var in dataset.data_vars:
          if dataset[var].attrs["units"]:
@@ -307,12 +305,13 @@ def get_units(dataset: xr.Dataset) -> dict:
      return units


+ @no_cluster
  def compute_zonal_stats(
      gdf: gpd.GeoDataFrame, gridded_data: xr.Dataset, forcings_dir: Path
  ) -> None:
-     '''
-     Compute zonal statistics in parallel for all timesteps over all desired
-     catchments. Create chunks of catchments and within those, chunks of
+     """
+     Compute zonal statistics in parallel for all timesteps over all desired
+     catchments. Create chunks of catchments and within those, chunks of
      timesteps for memory management.

      Parameters
@@ -323,7 +322,7 @@ def compute_zonal_stats(
          Gridded forcing data that intersects with desired catchments.
      forcings_dir : Path
          Path to directory where outputs are to be stored.
-     '''
+     """
      logger.info("Computing zonal stats in parallel for all timesteps")
      timer_start = time.time()
      num_partitions = multiprocessing.cpu_count() - 1
@@ -333,7 +332,7 @@ def compute_zonal_stats(
      catchments = get_cell_weights_parallel(gdf, gridded_data, num_partitions)
      units = get_units(gridded_data)

-     cat_chunks = np.array_split(catchments, num_partitions)
+     cat_chunks: List[pd.DataFrame] = np.array_split(catchments, num_partitions)  # type: ignore

      progress = Progress(
          TextColumn("[progress.description]{task.description}"),
@@ -352,25 +351,28 @@ def compute_zonal_stats(
          "[cyan]Processing variables...", total=len(gridded_data.data_vars), elapsed=0
      )
      progress.start()
-     for variable in list(gridded_data.data_vars):
+     for data_var_name in list(gridded_data.data_vars):
+         data_var_name: str
          progress.update(variable_task, advance=1)
-         progress.update(variable_task, description=f"Processing {variable}")
+         progress.update(variable_task, description=f"Processing {data_var_name}")

          # to make sure this fits in memory, we need to chunk the data
-         time_chunks = get_index_chunks(gridded_data[variable])
+         time_chunks = get_index_chunks(gridded_data[data_var_name])
          chunk_task = progress.add_task("[purple] processing chunks", total=len(time_chunks))
          for i, times in enumerate(time_chunks):
              progress.update(chunk_task, advance=1)
              start, end = times
              # select the chunk of time we want to process
-             data_chunk = gridded_data[variable].isel(time=slice(start, end))
+             data_chunk = gridded_data[data_var_name].isel(time=slice(start, end))
              # put it in shared memory
              shm, shape, dtype = create_shared_memory(data_chunk)
              times = data_chunk.time.values
              # create a partial function to pass to the multiprocessing pool
-             partial_process_chunk = partial(process_chunk_shared,variable,times,shm.name,shape,dtype)
+             partial_process_chunk = partial(
+                 process_chunk_shared, data_var_name, times, shm.name, shape, dtype
+             )

-             logger.debug(f"Processing variable: {variable}")
+             logger.debug(f"Processing variable: {data_var_name}")
              # process the chunks of catchments in parallel
              with multiprocessing.Pool(num_partitions) as pool:
                  variable_data = pool.map(partial_process_chunk, cat_chunks)
@@ -378,24 +380,24 @@ def compute_zonal_stats(
              # clean up the shared memory
              shm.close()
              shm.unlink()
-             logger.debug(f"Processed variable: {variable}")
+             logger.debug(f"Processed variable: {data_var_name}")
              concatenated_da = xr.concat(variable_data, dim="catchment")
              # delete the data to free up memory
              del variable_data
-             logger.debug(f"Concatenated variable: {variable}")
+             logger.debug(f"Concatenated variable: {data_var_name}")
              # write this to disk now to save memory
              # xarray will monitor memory usage, but it doesn't account for the shared memory used to store the raster
              # This reduces memory usage by about 60%
-             concatenated_da.to_dataset(name=variable).to_netcdf(
-                 forcings_dir / "temp" / f"{variable}_timechunk_{i}.nc"
+             concatenated_da.to_dataset(name=data_var_name).to_netcdf(
+                 forcings_dir / "temp" / f"{data_var_name}_timechunk_{i}.nc"
              )
          # Merge the chunks back together
          datasets = [
-             xr.open_dataset(forcings_dir / "temp" / f"{variable}_timechunk_{i}.nc")
+             xr.open_dataset(forcings_dir / "temp" / f"{data_var_name}_timechunk_{i}.nc")
              for i in range(len(time_chunks))
          ]
          result = xr.concat(datasets, dim="time")
-         result.to_netcdf(forcings_dir / "temp" / f"{variable}.nc")
+         result.to_netcdf(forcings_dir / "temp" / f"{data_var_name}.nc")
          # close the datasets
          result.close()
          _ = [dataset.close() for dataset in datasets]
@@ -413,8 +415,9 @@ def compute_zonal_stats(
      write_outputs(forcings_dir, units)


+ @use_cluster
  def write_outputs(forcings_dir: Path, units: dict) -> None:
-     '''
+     """
      Write outputs to disk in the form of a NetCDF file, using dask clusters to
      facilitate parallel computing.

@@ -423,20 +426,13 @@ def write_outputs(forcings_dir: Path, units: dict) -> None:
      forcings_dir : Path
          Path to directory where outputs are to be stored.
      variables : dict
-         Preset dictionary where the keys are forcing variable names and the
+         Preset dictionary where the keys are forcing variable names and the
          values are units.
      units : dict
-         Dictionary where the keys are forcing variable names and the values are
+         Dictionary where the keys are forcing variable names and the values are
          units. Differs from variables, as this dictionary depends on the gridded
          forcing dataset.
-     '''
-
-     # start a dask cluster if there isn't one already running
-     try:
-         client = Client.current()
-     except ValueError:
-         cluster = LocalCluster()
-         client = Client(cluster)
+     """
      temp_forcings_dir = forcings_dir / "temp"
      # Combine all variables into a single dataset using dask
      results = [xr.open_dataset(file, chunks="auto") for file in temp_forcings_dir.glob("*.nc")]
@@ -473,14 +469,18 @@ def write_outputs(forcings_dir: Path, units: dict) -> None:
      time_array = (
          final_ds.time.astype("datetime64[s]").astype(np.int64).values // 10**9
      )  ## convert from ns to s
-     time_array = time_array.astype(np.int32) ## convert to int32 to save space
-     final_ds = final_ds.drop_vars(["catchment", "time"]) ## drop the original time and catchment vars
-     final_ds = final_ds.rename_dims({"catchment": "catchment-id"}) # rename the catchment dimension
+     time_array = time_array.astype(np.int32)  ## convert to int32 to save space
+     final_ds = final_ds.drop_vars(
+         ["catchment", "time"]
+     )  ## drop the original time and catchment vars
+     final_ds = final_ds.rename_dims({"catchment": "catchment-id"})  # rename the catchment dimension
      # add the time as a 2d data var, yes this is wasting disk space.
      final_ds["Time"] = (("catchment-id", "time"), [time_array for _ in range(len(final_ds["ids"]))])
      # set the time unit
      final_ds["Time"].attrs["units"] = "s"
-     final_ds["Time"].attrs["epoch_start"] = "01/01/1970 00:00:00" # not needed but suppresses the ngen warning
+     final_ds["Time"].attrs["epoch_start"] = (
+         "01/01/1970 00:00:00"  # not needed but suppresses the ngen warning
+     )

      final_ds.to_netcdf(forcings_dir / "forcings.nc", engine="netcdf4")
      # close the datasets
@@ -508,7 +508,7 @@ def setup_directories(cat_id: str) -> file_paths:
  def create_forcings(dataset: xr.Dataset, output_folder_name: str) -> None:
      validate_dataset_format(dataset)
      forcing_paths = setup_directories(output_folder_name)
-     print(f"forcing path {output_folder_name} {forcing_paths.forcings_dir}")
+     logger.debug(f"forcing path {output_folder_name} {forcing_paths.forcings_dir}")
      gdf = gpd.read_file(forcing_paths.geopackage_path, layer="divides")
      logger.debug(f"gdf bounds: {gdf.total_bounds}")
      gdf = gdf.to_crs(dataset.crs)
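
In the forcings module, compute_zonal_stats does its parallelism with multiprocessing.Pool and shared memory, so it is now marked @no_cluster, while write_outputs keeps @use_cluster for its dask-backed open_dataset/to_netcdf work. Since data_processing.dask_utils is not included in this diff, the following is only one plausible reading of no_cluster, shown for illustration: a decorator that shuts down any active Dask client before the wrapped function runs, so dask workers do not compete with the process pool for CPU and memory.

    import functools

    from dask.distributed import Client


    def no_cluster(func):
        """Hypothetical sketch: close any running Dask client before calling func."""

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                client = Client.current()
            except ValueError:
                client = None  # no client is running, nothing to shut down
            if client is not None:
                client.shutdown()  # stop the scheduler and its workers
            return func(*args, **kwargs)

        return wrapper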