ngiab-data-preprocess 4.3.0__py3-none-any.whl → 4.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data_processing/create_realization.py CHANGED
@@ -6,6 +6,8 @@ import sqlite3
 from datetime import datetime
 from pathlib import Path
 from typing import Dict, Optional
+import psutil
+import os
 
 import pandas
 import requests
@@ -257,7 +259,27 @@ def configure_troute(
     with open(file_paths.template_troute_config, "r") as file:
         troute_template = file.read()
     time_step_size = 300
+    gpkg_file_path = f"{config_dir}/{cat_id}_subset.gpkg"
     nts = (end_time - start_time).total_seconds() / time_step_size
+    with sqlite3.connect(gpkg_file_path) as conn:
+        ncats_df = pandas.read_sql_query("SELECT COUNT(id) FROM 'divides';", conn)
+        ncats = ncats_df['COUNT(id)'][0]
+
+    est_bytes_required = nts * ncats * 45  # extremely rough calculation based on about 3 tests :)
+    local_ram_available = 0.8 * psutil.virtual_memory().available  # buffer to not accidentally explode machine
+
+    if est_bytes_required > local_ram_available:
+        max_loop_size = nts // (est_bytes_required // local_ram_available)
+        binary_nexus_file_folder_comment = ""
+        parent_dir = config_dir.parent
+        output_parquet_path = Path(f"{parent_dir}/outputs/parquet/")
+
+        if not output_parquet_path.exists():
+            os.makedirs(output_parquet_path)
+    else:
+        max_loop_size = nts
+        binary_nexus_file_folder_comment = "#"
+
     filled_template = troute_template.format(
         # hard coded to 5 minutes
         time_step_size=time_step_size,
@@ -266,7 +288,8 @@ def configure_troute(
         geo_file_path=f"./config/{cat_id}_subset.gpkg",
         start_datetime=start_time.strftime("%Y-%m-%d %H:%M:%S"),
         nts=nts,
-        max_loop_size=nts,
+        max_loop_size=max_loop_size,
+        binary_nexus_file_folder_comment=binary_nexus_file_folder_comment
     )
 
     with open(config_dir / "troute.yaml", "w") as file:
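Note: the new block caps t-route's `max_loop_size` so the routing buffers fit in RAM, and enables parquet output only when chunked looping is needed. A worked sketch of the same arithmetic (the 45 bytes/timestep/catchment constant is taken from the diff above; the sample sizes are hypothetical):

```python
import psutil

time_step_size = 300                    # seconds, fixed by the template
nts = 30 * 24 * 3600 / time_step_size   # 30-day run -> 8640 routing timesteps
ncats = 50_000                          # hypothetical number of divides in the subset

est_bytes_required = nts * ncats * 45   # ~19.4 GB with these numbers
local_ram_available = 0.8 * psutil.virtual_memory().available

if est_bytes_required > local_ram_available:
    # e.g. with 8 GB usable: 19.4e9 // 8e9 == 2.0, so route in 4320-step chunks
    max_loop_size = nts // (est_bytes_required // local_ram_available)
else:
    max_loop_size = nts
```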
data_processing/dataset_utils.py CHANGED
@@ -7,9 +7,9 @@ from typing import List, Literal, Optional, Tuple, Union
 import geopandas as gpd
 import numpy as np
 import xarray as xr
+from dask.distributed import Client, Future, progress
+from data_processing.dask_utils import no_cluster, temp_cluster
 from xarray.core.types import InterpOptions
-from dask.distributed import Client, progress, Future
-from data_processing.dask_utils import use_cluster
 
 logger = logging.getLogger(__name__)
 
@@ -117,13 +117,14 @@ def clip_dataset_to_bounds(
     return dataset
 
 
+@no_cluster
 def interpolate_nan_values(
     dataset: xr.Dataset,
     variables: Optional[List[str]] = None,
     dim: str = "time",
     method: InterpOptions = "nearest",
     fill_value: str = "extrapolate",
-) -> None:
+) -> bool:
     """
     Interpolates NaN values in specified (or all numeric time-dependent)
     variables of an xarray.Dataset. Operates inplace on the dataset.
@@ -145,6 +146,7 @@ def interpolate_nan_values(
         Set to "extrapolate" to fill with the nearest valid value when using 'nearest' or 'linear'.
         Default is "extrapolate".
     """
+    interpolation_used = False
     for name, var in dataset.data_vars.items():
         # if the variable is non-numeric, skip
         if not np.issubdtype(var.dtype, np.number):
@@ -158,9 +160,35 @@ def interpolate_nan_values(
             method=method,
             fill_value=fill_value if method in ["nearest", "linear"] else None,
         )
+        interpolation_used = True
+    return interpolation_used
 
 
-@use_cluster
+@no_cluster
+def save_dataset_no_cluster(
+    ds_to_save: xr.Dataset,
+    target_path: Path,
+    engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
+):
+    """
+    This explicitly does not use dask distributed.
+    Helper function to compute and save an xarray.Dataset to a NetCDF file.
+    Uses a temporary file and rename to avoid leaving a half-written file.
+    """
+    if not target_path.parent.exists():
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+    temp_file_path = target_path.with_name(target_path.name + ".saving.nc")
+    if temp_file_path.exists():
+        os.remove(temp_file_path)
+
+    ds_to_save.to_netcdf(temp_file_path, engine=engine, compute=True)
+
+    os.rename(str(temp_file_path), str(target_path))
+    logger.info(f"Successfully saved data to: {target_path}")
+
+
+@temp_cluster
 def save_dataset(
     ds_to_save: xr.Dataset,
     target_path: Path,
@@ -184,20 +212,21 @@ def save_dataset(
     logger.debug(
         f"NetCDF write task submitted to Dask. Waiting for completion to {temp_file_path}..."
     )
+    logger.info("For more detailed progress, see the Dask dashboard http://localhost:8787/status")
     progress(future)
     future.result()
     os.rename(str(temp_file_path), str(target_path))
     logger.info(f"Successfully saved data to: {target_path}")
 
 
-@use_cluster
+@no_cluster
 def save_to_cache(
     stores: xr.Dataset, cached_nc_path: Path, interpolate_nans: bool = True
 ) -> xr.Dataset:
     """
     Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth.
     """
-    logger.info(f"Processing dataset for caching. Final cache target: {cached_nc_path}")
+    logger.debug(f"Processing dataset for caching. Final cache target: {cached_nc_path}")
 
     # lazily cast all numbers to f32
     for name, var in stores.data_vars.items():
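Note: `data_processing/dask_utils.py` itself is unchanged in this release (its RECORD hash below is identical), so only the new decorator names are visible here. A minimal sketch of the semantics these call sites appear to rely on — an assumption, not the package's actual implementation:

```python
from functools import wraps

from dask.distributed import Client


def no_cluster(func):
    # assumed: run the wrapped function without a distributed cluster,
    # leaving any dask work on the default scheduler
    @wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    return wrapper


def temp_cluster(func):
    # assumed: spin up a short-lived local cluster for this call only
    @wraps(func)
    def wrapper(*args, **kwargs):
        client = Client()
        try:
            return func(*args, **kwargs)
        finally:
            client.close()

    return wrapper
```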
@@ -206,13 +235,18 @@ def save_to_cache(
 
     # save dataset locally before manipulating it
     save_dataset(stores, cached_nc_path)
-    stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
 
     if interpolate_nans:
-        interpolate_nan_values(dataset=stores)
-        save_dataset(stores, cached_nc_path)
-        stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+        stores = xr.open_mfdataset(
+            cached_nc_path,
+            parallel=True,
+            engine="h5netcdf",
+        )
+        was_interpolated = interpolate_nan_values(dataset=stores)
+        if was_interpolated:
+            save_dataset_no_cluster(stores, cached_nc_path)
 
+    stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
     return stores
 
 
data_processing/graph_utils.py CHANGED
@@ -169,7 +169,6 @@ def get_upstream_cats(names: Union[str, List[str]]) -> Set[str]:
             node_index = graph.vs.find(cat=name).index
         else:
             node_index = graph.vs.find(name=name).index
-        node_index = graph.vs.find(cat=name).index
         upstream_nodes = graph.subcomponent(node_index, mode="IN")
         for node in upstream_nodes:
             parent_ids.add(graph.vs[node]["name"])
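Note: the deleted line above was a leftover that re-ran the `cat=` lookup even after the `else` branch had already resolved a plain `name` lookup, which raises for `wb-*` inputs. A toy illustration of igraph's attribute lookup (not the real hydrofabric graph):

```python
import igraph as ig

graph = ig.Graph(directed=True)
graph.add_vertices(["wb-1", "wb-2"])   # sets the vertex "name" attribute
graph.vs["cat"] = ["cat-1", "cat-2"]   # separate "cat" attribute

graph.vs.find(name="wb-2").index       # 1 -- the else-branch lookup succeeds
graph.vs.find(cat="wb-2")              # raises ValueError: no vertex has cat == "wb-2"
```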
@@ -178,7 +177,6 @@ def get_upstream_cats(names: Union[str, List[str]]) -> Set[str]:
             logger.critical(f"Catchment {name} not found in the hydrofabric graph.")
         except ValueError:
             logger.critical(f"Catchment {name} not found in the hydrofabric graph.")
-
     # sometimes returns None, which isn't helpful
     if None in cat_ids:
         cat_ids.remove(None)
data_processing/subset.py CHANGED
@@ -12,9 +12,11 @@ from data_processing.gpkg_utils import (
     update_geopackage_metadata,
 )
 from data_processing.graph_utils import get_upstream_ids
+from rich.console import Console
+from rich.prompt import Prompt
 
 logger = logging.getLogger(__name__)
-
+console = Console()
 subset_tables = [
     "divides",
     "divide-attributes",  # requires divides
@@ -30,15 +32,33 @@ subset_tables = [
 
 
 def create_subset_gpkg(
-    ids: Union[List[str], str], hydrofabric: Path, output_gpkg_path: Path, is_vpu: bool = False
+    ids: Union[List[str], str],
+    hydrofabric: Path,
+    output_gpkg_path: Path,
+    is_vpu: bool = False,
+    override_gpkg: bool = True,
 ):
     # ids is a list of nexus and wb ids, or a single vpu id
     if not isinstance(ids, list):
         ids = [ids]
     output_gpkg_path.parent.mkdir(parents=True, exist_ok=True)
 
-    if os.path.exists(output_gpkg_path):
-        os.remove(output_gpkg_path)
+    if not override_gpkg:
+        if os.path.exists(output_gpkg_path):
+            response = Prompt.ask(
+                f"Subset geopackage at {output_gpkg_path} already exists. Are you sure you want to overwrite it?",
+                default="n",
+                choices=["y", "n"],
+            )
+            if response == "y":
+                console.print(f"Removing {output_gpkg_path}...", style="yellow")
+                os.remove(output_gpkg_path)
+            else:
+                console.print("Exiting...", style="bold red")
+                exit()
+    else:
+        if os.path.exists(output_gpkg_path):
+            os.remove(output_gpkg_path)
 
     create_empty_gpkg(output_gpkg_path)
     logger.info(f"Subsetting tables: {subset_tables}")
@@ -55,8 +75,18 @@ def create_subset_gpkg(
 def subset_vpu(
     vpu_id: str, output_gpkg_path: Path, hydrofabric: Path = file_paths.conus_hydrofabric
 ):
-    if output_gpkg_path.exists():
-        os.remove(output_gpkg_path)
+    if os.path.exists(output_gpkg_path):
+        response = Prompt.ask(
+            f"Subset geopackage at {output_gpkg_path} already exists. Are you sure you want to overwrite it?",
+            default="n",
+            choices=["y", "n"],
+        )
+        if response == "y":
+            console.print(f"Removing {output_gpkg_path}...", style="yellow")
+            os.remove(output_gpkg_path)
+        else:
+            console.print("Exiting...", style="bold red")
+            exit()
 
     create_subset_gpkg(vpu_id, hydrofabric, output_gpkg_path=output_gpkg_path, is_vpu=True)
     logger.info(f"Subset complete for VPU {vpu_id}")
@@ -68,6 +98,7 @@ def subset(
     hydrofabric: Path = file_paths.conus_hydrofabric,
     output_gpkg_path: Path = Path(),
     include_outlet: bool = True,
+    override_gpkg: bool = True,
 ):
     upstream_ids = list(get_upstream_ids(cat_ids, include_outlet))
 
@@ -78,6 +109,6 @@ def subset(
     paths = file_paths(output_folder_name)
     output_gpkg_path = paths.geopackage_path
 
-    create_subset_gpkg(upstream_ids, hydrofabric, output_gpkg_path)
+    create_subset_gpkg(upstream_ids, hydrofabric, output_gpkg_path, override_gpkg=override_gpkg)
     logger.info(f"Subset complete for {len(upstream_ids)} features (catchments + nexuses)")
     logger.debug(f"Subset complete for {upstream_ids} catchments")
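Note: callers now choose between silent replacement (the old behavior, still the default) and the interactive prompt. A usage sketch of the new keyword (the output path is hypothetical):

```python
from pathlib import Path

from data_processing.subset import subset

gpkg = Path.home() / "ngiab_preprocess_output" / "cat-5173" / "config" / "cat-5173_subset.gpkg"

# default: an existing geopackage at the output path is silently replaced
subset(["cat-5173"], output_gpkg_path=gpkg)

# opt in to the rich y/n prompt before anything is deleted
subset(["cat-5173"], output_gpkg_path=gpkg, override_gpkg=False)
```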
data_sources/ngen-routing-template.yaml CHANGED
@@ -62,7 +62,7 @@ compute_parameters:
     qlat_input_folder: ./outputs/ngen/
     qlat_file_pattern_filter: "nex-*"
 
-    #binary_nexus_file_folder: ./outputs/parquet/ # if nexus_file_pattern_filter="nex-*" and you want it to reformat them as parquet, you need this
+    {binary_nexus_file_folder_comment}binary_nexus_file_folder: ./outputs/parquet/ # if nexus_file_pattern_filter="nex-*" and you want it to reformat them as parquet, you need this
     #coastal_boundary_input_file : channel_forcing/schout_1.nc
     nts: {nts} #288 for 1day
     max_loop_size: {max_loop_size} # [number of timesteps]
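Note: `configure_troute` now interpolates either an empty string or a `#` into the new placeholder, so one template serves both the in-memory and parquet-output cases. In miniature:

```python
template = "{binary_nexus_file_folder_comment}binary_nexus_file_folder: ./outputs/parquet/"

print(template.format(binary_nexus_file_folder_comment=""))
# binary_nexus_file_folder: ./outputs/parquet/      <- parquet output enabled
print(template.format(binary_nexus_file_folder_comment="#"))
# #binary_nexus_file_folder: ./outputs/parquet/     <- line stays commented out
```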
data_sources/source_validation.py CHANGED
@@ -141,6 +141,20 @@ def download_and_update_hf():
         bucket="communityhydrofabric",
         key="hydrofabrics/community/conus_nextgen.tar.gz",
     )
+
+    if file_paths.hydrofabric_graph.is_file():
+        console.print(
+            f"Hydrofabric graph already exists at {file_paths.hydrofabric_graph}, removing it to download the latest version.",
+            style="bold yellow",
+        )
+        file_paths.hydrofabric_graph.unlink()
+
+    download_from_s3(
+        file_paths.hydrofabric_graph,
+        bucket="communityhydrofabric",
+        key="hydrofabrics/community/conus_igraph_network.gpickle"
+    )
+
     status, headers = get_headers()
 
     if status == 200:
@@ -153,11 +167,10 @@ def download_and_update_hf():
         file_paths.conus_hydrofabric.parent,
     )
 
-
 def validate_hydrofabric():
     if not file_paths.conus_hydrofabric.is_file():
         response = Prompt.ask(
-            "Hydrofabric is missing. Would you like to download it now?",
+            "Hydrofabric files are missing. Would you like to download them now?",
             default="y",
             choices=["y", "n"],
         )
map_app/static/js/data_processing.js CHANGED
@@ -4,67 +4,44 @@ async function subset() {
         alert('Please select at least one basin in the map before subsetting');
         return;
     }
-    console.log('subsetting');
-    document.getElementById('subset-button').disabled = true;
-    document.getElementById('subset-loading').style.visibility = "visible";
-    const startTime = performance.now(); // Start the timer
-    document.getElementById('output-path').innerHTML = "Subsetting...";
-    fetch('/subset', {
+    fetch('/subset_check', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify([cat_id]),
    })
-        .then(response => response.text())
-        .then(filename => {
-            console.log(filename);
-            const endTime = performance.now(); // Stop the timer
-            const duration = endTime - startTime; // Calculate the duration in milliseconds
-            console.log('Request took ' + duration / 1000 + ' milliseconds');
-            document.getElementById('output-path').innerHTML = "Done in " + duration / 1000 + "s, subset to <a href='file://" + filename + "'>" + filename + "</a>";
-        })
-        .catch(error => {
-            console.error('Error:', error);
-        }).finally(() => {
-            document.getElementById('subset-button').disabled = false;
-            document.getElementById('subset-loading').style.visibility = "hidden";
-        });
+        .then((response) => {
+            // 409 if that subset gpkg path already exists
+            if (response.status == 409) {
+                console.log("check response")
+                if (!confirm('A geopackage already exists with that catchment name. Overwrite?')) {
+                    alert("Subset canceled.");
+                    return;
+                }
+            }
+            const startTime = performance.now(); // Start the timer
+            fetch('/subset', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify([cat_id]),
+            })
+                .then(response => response.text())
+                .then(filename => {
+                    console.log(filename);
+                    const endTime = performance.now(); // Stop the timer
+                    const duration = endTime - startTime; // Calculate the duration in milliseconds
+                    console.log('Request took ' + duration / 1000 + ' milliseconds');
+                    document.getElementById('output-path').innerHTML = "Done in " + (duration / 1000).toFixed(2) + "s, subset to <a href='file://" + filename + "'>" + filename + "</a>";
+                })
+                .catch(error => {
+                    console.error('Error:', error);
+                }).finally(() => {
+                    document.getElementById('subset-button').disabled = false;
+                    document.getElementById('subset-loading').style.visibility = "hidden";
+                });
+        });
 }
 
-
-// async function subset_to_file() {
-//     if (Object.keys(cat_id_dict).length === 0) {
-//         alert('Please select at least one basin in the map before subsetting');
-//         return;
-//     }
-//     console.log('subsetting to file');
-//     document.getElementById('subset-to-file-button').disabled = true;
-//     document.getElementById('subset-to-file-loading').style.visibility = "visible";
-//     const startTime = performance.now(); // Start the timer
-//     document.getElementById('output-path').innerHTML = "Subsetting...";
-//     fetch('/subset_to_file', {
-//         method: 'POST',
-//         headers: { 'Content-Type': 'application/json' },
-//         body: JSON.stringify(cat_id_dict),
-//     })
-//         .then(response => response.text())
-//         .then(filename => {
-//             console.log(filename);
-//             const endTime = performance.now(); // Stop the timer
-//             const duration = endTime - startTime; // Calculate the duration in milliseconds
-//             console.log('Request took ' + duration / 1000 + ' milliseconds');
-//             document.getElementById('output-path').innerHTML = "Done in " + duration / 1000 + "s, subset to <a href='file://" + filename + "'>" + filename + "</a>";
-//         })
-//         .catch(error => {
-//             console.error('Error:', error);
-//         }).finally(() => {
-//             document.getElementById('subset-to-file-button').disabled = false;
-//             document.getElementById('subset-to-file-loading').style.visibility = "hidden";
-//         });
-// }
-
 async function forcings() {
-
-
     if (document.getElementById('output-path').textContent === '') {
         alert('Please subset the data before getting forcings');
         return;
@@ -139,6 +116,5 @@ async function realization() {
 
 // These functions are exported by data_processing.js
 document.getElementById('subset-button').addEventListener('click', subset);
-// document.getElementById('subset-to-file-button').addEventListener('click', subset_to_file);
 document.getElementById('forcings-button').addEventListener('click', forcings);
 document.getElementById('realization-button').addEventListener('click', realization);
map_app/static/js/main.js CHANGED
@@ -133,7 +133,6 @@ function update_map(cat_id, e) {
     $('#selected-basins').text(cat_id)
     map.setFilter('selected-catchments', ['any', ['in', 'divide_id', cat_id]]);
     map.setFilter('upstream-catchments', ['any', ['in', 'divide_id', ""]])
-
     fetch('/get_upstream_catids', {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
map_app/views.py CHANGED
@@ -27,7 +27,9 @@ def index():
 @main.route("/get_upstream_catids", methods=["POST"])
 def get_upstream_catids():
     cat_id = json.loads(request.data.decode("utf-8"))
-    upstream_cats = get_upstream_cats(cat_id)
+    # give wb_id to get_upstream_cats because the graph search is 1000x faster
+    wb_id = "wb-" + cat_id.split("-")[-1]
+    upstream_cats = get_upstream_cats(wb_id)
     if cat_id in upstream_cats:
         upstream_cats.remove(cat_id)
     return list(upstream_cats), 200
@@ -41,13 +43,25 @@ def get_upstream_wbids():
     return [id for id in upstream_ids if id.startswith("wb")], 200
 
 
+@main.route("/subset_check", methods=["POST"])
+def subset_check():
+    cat_ids = list(json.loads(request.data.decode("utf-8")))
+    logger.info(cat_ids)
+    subset_name = cat_ids[0]
+    run_paths = file_paths(subset_name)
+    if run_paths.geopackage_path.exists():
+        return "check required", 409
+    else:
+        return "success", 200
+
+
 @main.route("/subset", methods=["POST"])
 def subset_selection():
     cat_ids = list(json.loads(request.data.decode("utf-8")))
     logger.info(cat_ids)
     subset_name = cat_ids[0]
     run_paths = file_paths(subset_name)
-    subset(cat_ids, output_gpkg_path=run_paths.geopackage_path)
+    subset(cat_ids, output_gpkg_path=run_paths.geopackage_path, override_gpkg=True)
     return str(run_paths.geopackage_path), 200
 
 
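Note: together with the `data_processing.js` change above, this adds a two-step overwrite handshake: probe `/subset_check`, treat a 409 as "ask the user", then POST `/subset`. The same flow from Python (`requests` and the Flask dev-server address are assumptions, not part of the package):

```python
import requests

base = "http://127.0.0.1:5000"   # assumed map_app address
payload = ["cat-5173"]           # hypothetical catchment selection

check = requests.post(f"{base}/subset_check", json=payload)
if check.status_code == 409:
    # a geopackage already exists under this name; confirm before overwriting
    if input("Overwrite existing geopackage? [y/n] ").strip().lower() != "y":
        raise SystemExit("Subset canceled.")

resp = requests.post(f"{base}/subset", json=payload)
print("subset written to", resp.text)
```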
ngiab_data_cli/__main__.py CHANGED
@@ -1,13 +1,13 @@
 from typing import Tuple
+
 import rich.status
 
 # add a status bar for these imports so the cli feels more responsive
-with rich.status.Status("Initializing...") as status:
+with rich.status.Status("loading") as status:
     import argparse
     import logging
     import subprocess
     import time
-    from typing import List
 
     import geopandas as gpd
     from data_processing.create_realization import create_em_realization, create_realization
@@ -19,7 +19,7 @@ with rich.status.Status("Initializing...") as status:
     from data_processing.gpkg_utils import get_cat_from_gage_id, get_catid_from_point
     from data_processing.graph_utils import get_upstream_cats
     from data_processing.subset import subset, subset_vpu
-    from data_sources.source_validation import validate_output_dir, validate_hydrofabric
+    from data_sources.source_validation import validate_hydrofabric, validate_output_dir
     from ngiab_data_cli.arguments import parse_arguments
     from ngiab_data_cli.custom_logging import set_logging_to_critical_only, setup_logging
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ngiab_data_preprocess
-Version: 4.3.0
+Version: 4.4.0
 Summary: Graphical Tools for creating Next Gen Water model input data.
 Author-email: Josh Cunningham <jcunningham8@ua.edu>
 Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -32,6 +32,7 @@ Requires-Dist: colorama==0.4.6
 Requires-Dist: bokeh==3.5.1
 Requires-Dist: boto3
 Requires-Dist: numcodecs<0.16.0
+Requires-Dist: scipy>=1.15.3
 Provides-Extra: eval
 Requires-Dist: ngiab_eval; extra == "eval"
 Provides-Extra: plot
@@ -40,55 +41,92 @@ Dynamic: license-file
 
 # NGIAB Data Preprocess
 
-This repository contains tools for preparing data to run a [next gen](https://github.com/NOAA-OWP/ngen) simulation using [NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra). The tools allow you to select a catchment of interest on an interactive map, choose a date range, and prepare the data with just a few clicks!
+This repository contains tools for preparing data to run a [NextGen](https://github.com/NOAA-OWP/ngen)-based simulation using [NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra). The tools allow you to select a catchment of interest on an interactive map, choose a date range, and prepare the data with just a few clicks!
 
 ![map screenshot](https://github.com/CIROH-UA/NGIAB_data_preprocess/blob/main/modules/map_app/static/resources/screenshot.jpg)
 
+| | |
+| --- | --- |
+| ![CIROH Logo](./ciroh-bgsafe.png) | Funding for this project was provided by the National Oceanic & Atmospheric Administration (NOAA), awarded to the Cooperative Institute for Research to Operations in Hydrology (CIROH) through the NOAA Cooperative Agreement with The University of Alabama (NA22NWS4320003). |
+
 ## Table of Contents
 
 1. [What does this tool do?](#what-does-this-tool-do)
-2. [What does it not do?](#what-does-it-not-do)
+2. [Limitations](#limitations)
+   - [Custom realizations](#custom-realizations)
+   - [Calibration](#calibration)
    - [Evaluation](#evaluation)
    - [Visualisation](#visualisation)
 3. [Requirements](#requirements)
-4. [Installation and Running](#installation-and-running)
+4. [Installation and running](#installation-and-running)
    - [Running without install](#running-without-install)
-5. [For legacy pip installation](#for-legacy-pip-installation)
-6. [Development Installation](#development-installation)
-7. [Usage](#usage)
-8. [CLI Documentation](#cli-documentation)
+   - [For uv installation](#for-uv-installation)
+   - [For legacy pip installation](#for-legacy-pip-installation)
+   - [Development installation](#development-installation)
+5. [Map interface documentation](#map-interface-documentation)
+   - [Running the map interface app](#running-the-map-interface-app)
+   - [Using the map interface](#using-the-map-interface)
+6. [CLI documentation](#cli-documentation)
+   - [Running the CLI](#running-the-cli)
    - [Arguments](#arguments)
-   - [Usage Notes](#usage-notes)
+   - [Usage notes](#usage-notes)
   - [Examples](#examples)
+7. [Realization information](#realization-information)
+   - [NOAH + CFE](#noah--cfe)
 
 ## What does this tool do?
 
-This tool prepares data to run a next gen simulation by creating a run package that can be used with NGIAB.
+This tool prepares data to run a NextGen-based simulation by creating a run package that can be used with NGIAB.
 It uses geometry and model attributes from the [v2.2 hydrofabric](https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.2/conus/conus_nextgen.gpkg) more information on [all data sources here](https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.2/hfv2.2-data_model.html).
 The raw forcing data is [nwm retrospective v3 forcing](https://noaa-nwm-retrospective-3-0-pds.s3.amazonaws.com/index.html#CONUS/zarr/forcing/) data or the [AORC 1km gridded data](https://noaa-nws-aorc-v1-1-1km.s3.amazonaws.com/index.html) depending on user input
 
-1. **Subset** (delineate) everything upstream of your point of interest (catchment, gage, flowpath etc). Outputs as a geopackage.
-2. **Calculates** Forcings as a weighted mean of the gridded AORC forcings. Weights are calculated using [exact extract](https://isciences.github.io/exactextract/) and computed with numpy.
-3. Creates **configuration files** needed to run nextgen.
+1. **Subsets** (delineates) everything upstream of your point of interest (catchment, gage, flowpath etc) from the hydrofabric. This subset is output as a geopackage (.gpkg).
+2. Calculates **forcings** as a weighted mean of the gridded NWM or AORC forcings. Weights are calculated using [exact extract](https://isciences.github.io/exactextract/) and computed with numpy.
+3. Creates **configuration files** for a default NGIAB model run.
    - realization.json - ngen model configuration
    - troute.yaml - routing configuration.
    - **per catchment** model configuration
-4. Optionally Runs a non-interactive [Next gen in a box](https://github.com/CIROH-UA/NGIAB-CloudInfra).
+4. Optionally performs a non-interactive [Docker-based NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra) run.
+
+## Limitations
+This tool cannot do the following:
+
+### Custom realizations
+This tool currently only outputs a single, default realization, which is described in "[Realization information](#realization-information)". Support for additional model configurations is planned, but not currently available.
+
+### Calibration
+If available, this repository will download [calibrated parameters](https://communityhydrofabric.s3.us-east-1.amazonaws.com/index.html#hydrofabrics/community/gage_parameters/) from the [Community Hydrofabric](https://github.com/CIROH-UA/community_hf_patcher) AWS S3 bucket.
+However, many gages and catchments will not have such parameters available. In these cases, Data Preprocess will output realizations with default values.
 
-## What does it not do?
+For automatic calibration, please see [ngiab-cal](https://github.com/CIROH-UA/ngiab-cal), which is under active development.
 
 ### Evaluation
-For automatic evaluation using [Teehr](https://github.com/RTIInternational/teehr), please run [NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra) interactively using the `guide.sh` script.
+For automatic evaluation using [TEEHR](https://github.com/RTIInternational/teehr), please run [NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra) interactively using the `guide.sh` script.
 
 ### Visualisation
 For automatic interactive visualisation, please run [NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra) interactively using the `guide.sh` script
 
-## Requirements
+# Requirements
 
-* This tool is officially supported on macOS or Ubuntu (tested on 22.04 & 24.04). To use it on Windows, please install [WSL](https://learn.microsoft.com/en-us/windows/wsl/install).
+This tool is **officially supported** on **macOS** and **Ubuntu** (tested on 22.04 & 24.04). To use it on Windows, please install [**WSL**](https://learn.microsoft.com/en-us/windows/wsl/install).
 
-## Installation and Running
-It is highly recommended to use [Astral UV](https://docs.astral.sh/uv/) to install and run this tool. It works similarly to pip and conda, and I would also recommend you use it for other python projects as it is so useful.
+It is also **highly recommended** to use [Astral UV](https://docs.astral.sh/uv/) to install and run this tool. Installing the project via `pip` without the use of a virtual environment creates a **severe risk** of dependency conflicts.
+
+# Installation and running
+
+### Running without install
+This package supports pipx and uvx, which means you can run the tool without installing it. No virtual environment needed, just UV.
+```bash
+# Run these from anywhere!
+uvx --from ngiab-data-preprocess cli --help # Running the CLI
+uvx ngiab-prep --help # Alias for the CLI
+uvx --from ngiab-data-preprocess map_app # Running the map interface
+```
+
+### For uv installation
+
+<details>
+<summary>Click here to expand</summary>
 
 ```bash
 # Install UV
@@ -111,16 +149,10 @@ uv run map_app
 
 UV automatically detects any virtual environments in the current directory and will use them when you use `uv run`.
 
-### Running without install
-This package supports pipx and uvx which means you can run the tool without installing it. No virtual environment needed, just UV.
-```bash
-# run this from anywhere
-uvx --from ngiab_data_preprocess cli --help
-# for the map
-uvx --from ngiab_data_preprocess map_app
-```
+</details>
+
+### For legacy pip installation
 
-## For legacy pip installation
 <details>
 <summary>Click here to expand</summary>
 
@@ -142,7 +174,7 @@ python -m map_app
 ```
 </details>
 
-## Development Installation
+### Development installation
 
 <details>
 <summary>Click to expand installation steps</summary>
@@ -168,11 +200,17 @@ To install and run the tool, follow these steps:
 ```
 </details>
 
-## Usage
+# Map interface documentation
+
+## Running the map interface app
 
-Running the command `uv run map_app` will open the app in a new browser tab.
+Running the `map_app` tool will open the app in a new browser tab.
+
+Install-free: `uvx --from ngiab-data-preprocess map_app`
+Installed with uv: `uv run map_app`
+
+## Using the map interface
 
-To use the tool:
 1. Select the catchment you're interested in on the map.
 2. Pick the time period you want to simulate.
 3. Click the following buttons in order:
@@ -184,7 +222,12 @@ Once all the steps are finished, you can run NGIAB on the folder shown underneat
 
 **Note:** When using the tool, the default output will be stored in the `~/ngiab_preprocess_output/<your-input-feature>/` folder. There is no overwrite protection on the folders.
 
-# CLI Documentation
+# CLI documentation
+
+## Running the CLI
+
+Install-free: `uvx ngiab-prep`
+Installed with uv: `uv run cli`
 
 ## Arguments
 
@@ -201,11 +244,11 @@
 - `-o OUTPUT_NAME`, `--output_name OUTPUT_NAME`: Name of the output folder.
 - `--source` : The datasource you want to use, either `nwm` for retrospective v3 or `aorc`. Default is `nwm`
 - `-D`, `--debug`: Enable debug logging.
-- `--run`: Automatically run Next Gen against the output folder.
-- `--validate`: Run every missing step required to run ngiab.
-- `-a`, `--all`: Run all operations: subset, forcings, realization, run Next Gen
+- `--run`: Automatically run [NGIAB's docker distribution](https://github.com/CIROH-UA/NGIAB-CloudInfra) against the output folder.
+- `--validate`: Run every missing step required to run NGIAB.
+- `-a`, `--all`: Run all operations. Equivalent to `-sfr` and `--run`.
 
-## Usage Notes
+## Usage notes
 - If your input has a prefix of `gage-`, you do not need to pass `-g`.
 - The `-l`, `-g`, `-s`, `-f`, `-r` flags can be combined like normal CLI flags. For example, to subset, generate forcings, and create a realization, you can use `-sfr` or `-s -f -r`.
 - When using the `--all` flag, it automatically sets `subset`, `forcings`, `realization`, and `run` to `True`.
@@ -213,50 +256,53 @@
 
 ## Examples
 
-0. Prepare everything for a nextgen run at a given gage:
+1. Prepare everything for an NGIAB run at a given gage:
 ```bash
-python -m ngiab_data_cli -i gage-10154200 -sfr --start 2022-01-01 --end 2022-02-28
-# add --run or replace -sfr with --all to run nextgen in a box too
+uvx ngiab-prep -i gage-10154200 -sfr --start 2022-01-01 --end 2022-02-28
+# add --run or replace -sfr with --all to run NGIAB, too
 # to name the folder, add -o folder_name
 ```
 
-1. Subset hydrofabric using catchment ID or VPU:
+2. Subset the hydrofabric using a catchment ID or VPU:
 ```bash
-python -m ngiab_data_cli -i cat-7080 -s
-python -m ngiab_data_cli --vpu 01 -s
+uvx ngiab-prep -i cat-7080 -s
+uvx ngiab-prep --vpu 01 -s
 ```
 
-2. Generate forcings using a single catchment ID:
+3. Generate forcings using a single catchment ID:
 ```bash
-python -m ngiab_data_cli -i cat-5173 -f --start 2022-01-01 --end 2022-02-28
+uvx ngiab-prep -i cat-5173 -f --start 2022-01-01 --end 2022-02-28
 ```
 
-3. Create realization using a lat/lon pair and output to a named folder:
+4. Create realization using a latitude/longitude pair and output to a named folder:
 ```bash
-python -m ngiab_data_cli -i 33.22,-87.54 -l -r --start 2022-01-01 --end 2022-02-28 -o custom_output
+uvx ngiab-prep -i 33.22,-87.54 -l -r --start 2022-01-01 --end 2022-02-28 -o custom_output
 ```
 
-4. Perform all operations using a lat/lon pair:
+5. Perform all operations using a latitude/longitude pair:
 ```bash
-python -m ngiab_data_cli -i 33.22,-87.54 -l -s -f -r --start 2022-01-01 --end 2022-02-28
+uvx ngiab-prep -i 33.22,-87.54 -l -s -f -r --start 2022-01-01 --end 2022-02-28
 ```
 
-5. Subset hydrofabric using gage ID:
+6. Subset the hydrofabric using a gage ID:
 ```bash
-python -m ngiab_data_cli -i 10154200 -g -s
+uvx ngiab-prep -i 10154200 -g -s
 # or
-python -m ngiab_data_cli -i gage-10154200 -s
+uvx ngiab-prep -i gage-10154200 -s
 ```
 
-6. Generate forcings using a single gage ID:
+7. Generate forcings using a single gage ID:
 ```bash
-python -m ngiab_data_cli -i 01646500 -g -f --start 2022-01-01 --end 2022-02-28
+uvx ngiab-prep -i 01646500 -g -f --start 2022-01-01 --end 2022-02-28
 ```
 
-7. Run all operations, including Next Gen and evaluation/plotting:
-```bash
-python -m ngiab_data_cli -i cat-5173 -a --start 2022-01-01 --end 2022-02-28
-```
+# Realization information
 
+This tool currently offers one default realization.
 
+## NOAH + CFE
 
+[This realization](https://github.com/CIROH-UA/NGIAB_data_preprocess/blob/main/modules/data_sources/cfe-nowpm-realization-template.json) is intended to be roughly comparable to earlier versions of the National Water Model.
+- [NOAH-OWP-Modular](https://github.com/NOAA-OWP/NOAH-OWP-Modular): A refactoring of Noah-MP, a land-surface model. Used to model groundwater properties.
+- [Conceptual Functional Equivalent (CFE)](https://github.com/NOAA-OWP/CFE): A simplified conceptual approximation of versions 1.2, 2.0, and 2.1 of the National Water Model. Used to model precipitation and evaporation.
+- [SLoTH](https://github.com/NOAA-OWP/SLoTH): A module used to feed through unchanged values. In this default configuration, it simply forces certain soil moisture and ice fraction properties to zero.
@@ -1,43 +1,43 @@
-data_processing/create_realization.py,sha256=WZCnYps-d3xd6_F4-Fy95nyXoh3GX4DzpUBWXvSvzKY,14953
+data_processing/create_realization.py,sha256=mdse8W2DgPg5Lj2_ErUsLJh-touTmShKwQrrOWO0jlY,15958
 data_processing/dask_utils.py,sha256=A2IP94WAz8W9nek3etXKEKTOxGPf0NWSFLh8cZ5S-xU,2454
-data_processing/dataset_utils.py,sha256=CMDy-YfjFQ9FM_BbRHnRKUFwERWK9ATJ0wn4wI0gUwY,10024
+data_processing/dataset_utils.py,sha256=AJOxE2nRfZnWYon_qqGcfkpRZuRW8Yy8YI86SxVDU3M,11168
 data_processing/datasets.py,sha256=_EJ1uZSWTU1HWpvF7TQSikneJqWZFikTrdo9usCV8A0,4665
 data_processing/file_paths.py,sha256=l2iCUFt_pk-jjzl7OS7npROAnQxwqFfZ7b2wRjViqiU,4720
 data_processing/forcings.py,sha256=k-JhBncTnXcdjSieam1Q2cDx5Xt9hH5Aywv0gDY4O2U,19010
 data_processing/gpkg_utils.py,sha256=tSSIMlHeqqgxTJQyF3X9tPmunQTJYx0xrCNHqUBQxkg,20590
-data_processing/graph_utils.py,sha256=-0vmLZvuhi9jLFSUfA-3Lo-wGfX4hMfB2QQ6A2D2FO8,8362
+data_processing/graph_utils.py,sha256=qvHw6JlzQxLi--eMsGgC_rUBP4nDatl6X9mSa03Xxyo,8306
 data_processing/s3fs_utils.py,sha256=ki1EmA0ezV0r26re6dRWIGzL5FudGdwF9Qw1eVLR0Bc,2747
-data_processing/subset.py,sha256=15rzjKlTPAHtFYZusKDtb4-zhG-8sTKU68ou9BA-_9Q,2610
+data_processing/subset.py,sha256=XoojOgWCwxOi5Q4KXHXARNQeoZlobJp-mqhIIvTRtTw,3793
 data_sources/cfe-nowpm-realization-template.json,sha256=8an6q1drWD8wU1ocvdPab-GvZDvlQ-0di_-NommH3QI,3528
 data_sources/cfe-template.ini,sha256=6e5-usqjWtm3MWVvtm8CTeZTJJMxO1ZswkOXq0L9mnc,2033
 data_sources/em-catchment-template.yml,sha256=M08ixazEUHYI2PNavtI0xPZeSzcQ9bg2g0XzNT-8_u4,292
 data_sources/em-config.yml,sha256=y0J8kEA70rxLWXJjz-CQ7sawcVyhQcayofeLlq4Svbo,1330
 data_sources/em-realization-template.json,sha256=DJvB7N8lCeS2vLFenmbTzysBDR-xPaJ09XA8heu1ijY,1466
 data_sources/forcing_template.nc,sha256=uRuVAqX3ngdlougZINavtwl_wC2VLD8fHqG7_CLim1s,85284
-data_sources/ngen-routing-template.yaml,sha256=RV28MAbyQNx9U8FAYmZhD2Fv8Yu6o_08Ekoc77KNdH4,4622
+data_sources/ngen-routing-template.yaml,sha256=wM5v6jj0kwcJBVatLFuy2big6g8nlSXxzc8a23nwI5s,4655
 data_sources/noah-owp-modular-init.namelist.input,sha256=Vb7mp40hFpJogruOrXrDHwVW1bKi9h1ciDNyDvTzn20,3045
-data_sources/source_validation.py,sha256=vrCuh2nFy9x-8MKqbUtxpdWCm3ohKK6UFcGR87n4I7I,9029
+data_sources/source_validation.py,sha256=RmvyPLjuDetpuNOUqCclgDfe8zd_Ojr7pfbUoUya2pQ,9498
 data_sources/template.sql,sha256=ZnFqAqleEq9wgmAhNO90Wue_L9k0JAn8KF99DYtcxgs,10457
 data_sources/triggers.sql,sha256=G0d_175eNsamKAFhsbphPATvzMPuPL_iCleIhlToduQ,14906
 map_app/__init__.py,sha256=OarJao9X98kcbLyiwewN4ObWNAYkKDichcxbuWywTsA,818
 map_app/__main__.py,sha256=Uj7__cJUyPQkZo2tNQ2x2g6rwizsyg1DcNtJkQniHzY,1650
-map_app/views.py,sha256=SMrnXDjoIMk8yMrBsrif41GLS-QLuN79cWYbA-uqKX8,5138
+map_app/views.py,sha256=ajU_QSd-Oa7UrRQEZPX4rmOlaKwo76Q8UPQNXtt-e2k,5622
 map_app/static/css/console.css,sha256=xN6G2MMFyKc9YW9HEVpUUTUjx2o2nokBR4nCX5c18UM,803
 map_app/static/css/main.css,sha256=HmRIfhWeHTrNLOCHGpaKuzwGj05LkkUiQy538D-ZRLY,6464
 map_app/static/css/toggle.css,sha256=Ep6tXT7gCrPRRITuEMpXyisuiTQgiLIEKFFTWRmC82o,1913
 map_app/static/js/console.js,sha256=BnG0pED5B9d563sLWshDNt_q-SyoTY48sETvVoOVJkU,1377
-map_app/static/js/data_processing.js,sha256=X6NSuggOGNIJUF-LEyGGYJjtiA5J29xmkXgFFmfBw18,6711
-map_app/static/js/main.js,sha256=JkvZqDuzYQaNtVmGeOdg0Za6OUBIG7hGOR3CB-uoviQ,9691
+map_app/static/js/data_processing.js,sha256=wXv0p_bPmNOrSpU_p6Yqtfd17vqOFRJFAmLdUUWLF7s,5486
+map_app/static/js/main.js,sha256=_Yq1tuzyREqWU24rFQJSh5zIaXtAXEGlfZPo36QLHvI,9690
 map_app/static/resources/loading.gif,sha256=ggdkZf1AD7rSwIpSJwfiIqANgmVV1WHlxGuKxQKv7uY,72191
 map_app/static/resources/screenshot.jpg,sha256=Ia358aX-OHM9BP4B8lX05cLnguF2fHUIimno9bnFLYw,253730
 map_app/templates/index.html,sha256=Jy2k1Ob2_et--BPpfmTYO22Yin3vrG6IOeNlwzUoEqY,7878
-ngiab_data_cli/__main__.py,sha256=LIRuzYCT2bF1eeW51hJIrAeeMmyHL7MevpTftcWbvR0,10605
+ngiab_data_cli/__main__.py,sha256=13W3RnD73weQNYZdq6munx_0oMBgzc-yzluKEm5nSxg,10570
 ngiab_data_cli/arguments.py,sha256=yBULJnFgUvgP4YZmZ5HhR7g0EfdMtBCdQuDkDuYSXCQ,4322
 ngiab_data_cli/custom_logging.py,sha256=iS2XozaxudcxQj17qAsrCgbVK9LJAYAPmarJuVWJo1k,1280
 ngiab_data_cli/forcing_cli.py,sha256=eIWRxRWUwPqR16fihFDEIV4VzGlNuvcD6lJW5VYjkPU,3635
-ngiab_data_preprocess-4.3.0.dist-info/licenses/LICENSE,sha256=6dMSprwwnsRzEm02mEDbKHD9dUbL8bPIt9Vhrhb0Ulk,1081
-ngiab_data_preprocess-4.3.0.dist-info/METADATA,sha256=zzIirFNOmhxVhaYD09onH14VbLTRV3EUIVxdCnh1EdA,10465
-ngiab_data_preprocess-4.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-ngiab_data_preprocess-4.3.0.dist-info/entry_points.txt,sha256=spwlhKEJ3ZnNETQsJGeTjD7Vwy8O_zGHb9GdX8ACCtw,128
-ngiab_data_preprocess-4.3.0.dist-info/top_level.txt,sha256=CjhYAUZrdveR2fOK6rxffU09VIN2IuPD7hk4V3l3pV0,52
-ngiab_data_preprocess-4.3.0.dist-info/RECORD,,
+ngiab_data_preprocess-4.4.0.dist-info/licenses/LICENSE,sha256=6dMSprwwnsRzEm02mEDbKHD9dUbL8bPIt9Vhrhb0Ulk,1081
+ngiab_data_preprocess-4.4.0.dist-info/METADATA,sha256=8PlfoGwOJIpuKhFwtfWmfxdMaDeXBfFRz9CAeZ3sZKk,13344
+ngiab_data_preprocess-4.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ngiab_data_preprocess-4.4.0.dist-info/entry_points.txt,sha256=spwlhKEJ3ZnNETQsJGeTjD7Vwy8O_zGHb9GdX8ACCtw,128
+ngiab_data_preprocess-4.4.0.dist-info/top_level.txt,sha256=CjhYAUZrdveR2fOK6rxffU09VIN2IuPD7hk4V3l3pV0,52
+ngiab_data_preprocess-4.4.0.dist-info/RECORD,,