ngiab-data-preprocess 4.3.0__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,10 @@ import sqlite3
  from datetime import datetime
  from pathlib import Path
  from typing import Dict, Optional
+ import psutil
+ import os
 
+ import numpy as np
  import pandas
  import requests
  import s3fs
@@ -14,8 +17,6 @@ import xarray as xr
  from data_processing.dask_utils import temp_cluster
  from data_processing.file_paths import file_paths
  from data_processing.gpkg_utils import (
- GeoPackage,
- get_cat_to_nex_flowpairs,
  get_cat_to_nhd_feature_id,
  get_table_crs_short,
  )
@@ -89,7 +90,6 @@ def make_cfe_config(
  def make_noahowp_config(
  base_dir: Path, divide_conf_df: pandas.DataFrame, start_time: datetime, end_time: datetime
  ) -> None:
- divide_conf_df.set_index("divide_id", inplace=True)
  start_datetime = start_time.strftime("%Y%m%d%H%M")
  end_datetime = end_time.strftime("%Y%m%d%H%M")
  with open(file_paths.template_noahowp_config, "r") as file:
@@ -98,155 +98,78 @@ def make_noahowp_config(
  cat_config_dir = base_dir / "cat_config" / "NOAH-OWP-M"
  cat_config_dir.mkdir(parents=True, exist_ok=True)
 
- for divide in divide_conf_df.index:
- with open(cat_config_dir / f"{divide}.input", "w") as file:
+ for _, row in divide_conf_df.iterrows():
+ with open(cat_config_dir / f"{row['divide_id']}.input", "w") as file:
  file.write(
  template.format(
  start_datetime=start_datetime,
  end_datetime=end_datetime,
- lat=divide_conf_df.loc[divide, "latitude"],
- lon=divide_conf_df.loc[divide, "longitude"],
- terrain_slope=divide_conf_df.loc[divide, "mean.slope_1km"],
- azimuth=divide_conf_df.loc[divide, "circ_mean.aspect"],
- ISLTYP=int(divide_conf_df.loc[divide, "mode.ISLTYP"]), # type: ignore
- IVGTYP=int(divide_conf_df.loc[divide, "mode.IVGTYP"]), # type: ignore
+ lat=row["latitude"],
+ lon=row["longitude"],
+ terrain_slope=row["mean.slope_1km"],
+ azimuth=row["circ_mean.aspect"],
+ ISLTYP=int(row["mode.ISLTYP"]), # type: ignore
+ IVGTYP=int(row["mode.IVGTYP"]), # type: ignore
  )
  )
 
 
- def get_model_attributes_modspatialite(hydrofabric: Path) -> pandas.DataFrame:
- # modspatialite is faster than pyproj but can't be added as a pip dependency
- # This incantation took a while
- with GeoPackage(hydrofabric) as conn:
- sql = """WITH source_crs AS (
- SELECT organization || ':' || organization_coordsys_id AS crs_string
- FROM gpkg_spatial_ref_sys
- WHERE srs_id = (
- SELECT srs_id
- FROM gpkg_geometry_columns
- WHERE table_name = 'divides'
- )
- )
- SELECT
- d.divide_id,
- d.areasqkm,
- da."mean.slope",
- da."mean.slope_1km",
- da."mean.elevation",
- ST_X(Transform(MakePoint(da.centroid_x, da.centroid_y), 4326, NULL,
- (SELECT crs_string FROM source_crs), 'EPSG:4326')) AS longitude,
- ST_Y(Transform(MakePoint(da.centroid_x, da.centroid_y), 4326, NULL,
- (SELECT crs_string FROM source_crs), 'EPSG:4326')) AS latitude
- FROM divides AS d
- JOIN 'divide-attributes' AS da ON d.divide_id = da.divide_id
- """
- divide_conf_df = pandas.read_sql_query(sql, conn)
- divide_conf_df.set_index("divide_id", inplace=True)
- return divide_conf_df
-
-
- def get_model_attributes_pyproj(hydrofabric: Path) -> pandas.DataFrame:
- # if modspatialite is not available, use pyproj
+ def get_model_attributes(hydrofabric: Path) -> pandas.DataFrame:
  with sqlite3.connect(hydrofabric) as conn:
- sql = """
- SELECT
- d.divide_id,
- d.areasqkm,
- da."mean.slope",
- da."mean.slope_1km",
- da."mean.elevation",
- da.centroid_x,
- da.centroid_y
- FROM divides AS d
- JOIN 'divide-attributes' AS da ON d.divide_id = da.divide_id
- """
- divide_conf_df = pandas.read_sql_query(sql, conn)
-
+ conf_df = pandas.read_sql_query(
+ """
+ SELECT
+ d.areasqkm,
+ da.*
+ FROM divides AS d
+ JOIN 'divide-attributes' AS da ON d.divide_id = da.divide_id
+ """,
+ conn,
+ )
  source_crs = get_table_crs_short(hydrofabric, "divides")
-
  transformer = Transformer.from_crs(source_crs, "EPSG:4326", always_xy=True)
-
- lon, lat = transformer.transform(
- divide_conf_df["centroid_x"].values, divide_conf_df["centroid_y"].values
- )
-
- divide_conf_df["longitude"] = lon
- divide_conf_df["latitude"] = lat
-
- divide_conf_df.drop(columns=["centroid_x", "centroid_y"], axis=1, inplace=True)
- divide_conf_df.set_index("divide_id", inplace=True)
-
- return divide_conf_df
-
-
- def get_model_attributes(hydrofabric: Path) -> pandas.DataFrame:
- try:
- with GeoPackage(hydrofabric) as conn:
- conf_df = pandas.read_sql_query(
- """WITH source_crs AS (
- SELECT organization || ':' || organization_coordsys_id AS crs_string
- FROM gpkg_spatial_ref_sys
- WHERE srs_id = (
- SELECT srs_id
- FROM gpkg_geometry_columns
- WHERE table_name = 'divides'
- )
- )
- SELECT
- *,
- ST_X(Transform(MakePoint(centroid_x, centroid_y), 4326, NULL,
- (SELECT crs_string FROM source_crs), 'EPSG:4326')) AS longitude,
- ST_Y(Transform(MakePoint(centroid_x, centroid_y), 4326, NULL,
- (SELECT crs_string FROM source_crs), 'EPSG:4326')) AS latitude FROM 'divide-attributes';""",
- conn,
- )
- except sqlite3.OperationalError:
- with sqlite3.connect(hydrofabric) as conn:
- conf_df = pandas.read_sql_query(
- "SELECT* FROM 'divide-attributes';",
- conn,
- )
- source_crs = get_table_crs_short(hydrofabric, "divides")
- transformer = Transformer.from_crs(source_crs, "EPSG:4326", always_xy=True)
- lon, lat = transformer.transform(conf_df["centroid_x"].values, conf_df["centroid_y"].values)
- conf_df["longitude"] = lon
- conf_df["latitude"] = lat
-
- conf_df.drop(columns=["centroid_x", "centroid_y"], axis=1, inplace=True)
+ lon, lat = transformer.transform(conf_df["centroid_x"].values, conf_df["centroid_y"].values)
+ conf_df["longitude"] = lon
+ conf_df["latitude"] = lat
  return conf_df
 
 
- def make_em_config(
+ def make_lstm_config(
  hydrofabric: Path,
  output_dir: Path,
- template_path: Path = file_paths.template_em_config,
+ template_path: Path = file_paths.template_lstm_config,
  ):
  # test if modspatialite is available
- try:
- divide_conf_df = get_model_attributes_modspatialite(hydrofabric)
- except Exception as e:
- logger.warning(f"mod_spatialite not available, using pyproj instead: {e}")
- logger.warning("Install mod_spatialite for improved performance")
- divide_conf_df = get_model_attributes_pyproj(hydrofabric)
-
- cat_config_dir = output_dir / "cat_config" / "empirical_model"
+
+ divide_conf_df = get_model_attributes(hydrofabric)
+
+ cat_config_dir = output_dir / "cat_config" / "lstm"
  if cat_config_dir.exists():
  shutil.rmtree(cat_config_dir)
  cat_config_dir.mkdir(parents=True, exist_ok=True)
 
+ # convert the mean.slope from degrees 0-90 where 90 is flat and 0 is vertical to m/km
+ # flip 0 and 90 degree values
+ divide_conf_df["flipped_mean_slope"] = abs(divide_conf_df["mean.slope"] - 90)
+ # Convert degrees to meters per kilometer
+ divide_conf_df["mean_slope_mpkm"] = (
+ np.tan(np.radians(divide_conf_df["flipped_mean_slope"])) * 1000
+ )
+
  with open(template_path, "r") as file:
  template = file.read()
 
- for divide in divide_conf_df.index:
+ for _, row in divide_conf_df.iterrows():
+ divide = row["divide_id"]
  with open(cat_config_dir / f"{divide}.yml", "w") as file:
  file.write(
  template.format(
- area_sqkm=divide_conf_df.loc[divide, "areasqkm"],
+ area_sqkm=row["areasqkm"],
  divide_id=divide,
- lat=divide_conf_df.loc[divide, "latitude"],
- lon=divide_conf_df.loc[divide, "longitude"],
- slope_mean=divide_conf_df.loc[divide, "mean.slope"],
- elevation_mean=divide_conf_df.loc[divide, "mean.slope"],
+ lat=row["latitude"],
+ lon=row["longitude"],
+ slope_mean=row["mean_slope_mpkm"],
+ elevation_mean=row["mean.elevation"] / 1000, # convert mm in hf to m
  )
  )
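Note on the slope conversion added to make_lstm_config above: the hydrofabric's mean.slope is read as degrees with 90 meaning flat and 0 meaning vertical, so the value is flipped before tan() turns it into a rise in metres per kilometre. A standalone sketch of the same arithmetic (the helper name and the 45-degree check are illustrative, not part of the package):

    import numpy as np

    def slope_deg_to_m_per_km(slope_deg_flat90: float) -> float:
        # 90 = flat -> 0 degrees of incline, 0 = vertical -> 90 degrees of incline
        flipped = abs(slope_deg_flat90 - 90)
        # rise in metres over a 1 km horizontal run
        return float(np.tan(np.radians(flipped)) * 1000)

    # a surface 45 degrees from flat rises roughly 1000 m over 1 km
    assert round(slope_deg_to_m_per_km(45)) == 1000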
 
@@ -257,7 +180,27 @@ def configure_troute(
  with open(file_paths.template_troute_config, "r") as file:
  troute_template = file.read()
  time_step_size = 300
+ gpkg_file_path=f"{config_dir}/{cat_id}_subset.gpkg"
  nts = (end_time - start_time).total_seconds() / time_step_size
+ with sqlite3.connect(gpkg_file_path) as conn:
+ ncats_df = pandas.read_sql_query("SELECT COUNT(id) FROM 'divides';", conn)
+ ncats = ncats_df['COUNT(id)'][0]
+
+ est_bytes_required = nts * ncats * 45 # extremely rough calculation based on about 3 tests :)
+ local_ram_available = 0.8 * psutil.virtual_memory().available # buffer to not accidentally explode machine
+
+ if est_bytes_required > local_ram_available:
+ max_loop_size = nts // (est_bytes_required // local_ram_available)
+ binary_nexus_file_folder_comment = ""
+ parent_dir = config_dir.parent
+ output_parquet_path = Path(f"{parent_dir}/outputs/parquet/")
+
+ if not output_parquet_path.exists():
+ os.makedirs(output_parquet_path)
+ else:
+ max_loop_size = nts
+ binary_nexus_file_folder_comment = "#"
+
  filled_template = troute_template.format(
  # hard coded to 5 minutes
  time_step_size=time_step_size,
@@ -266,7 +209,8 @@ def configure_troute(
  geo_file_path=f"./config/{cat_id}_subset.gpkg",
  start_datetime=start_time.strftime("%Y-%m-%d %H:%M:%S"),
  nts=nts,
- max_loop_size=nts,
+ max_loop_size=max_loop_size,
+ binary_nexus_file_folder_comment=binary_nexus_file_folder_comment
  )
 
  with open(config_dir / "troute.yaml", "w") as file:
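Note: the new configure_troute logic above estimates the routed output size as nts * ncats * 45 bytes (a rough constant, per the in-code comment), compares it against 80% of the RAM psutil reports as available, and shrinks t-route's max_loop_size when the estimate does not fit, enabling the parquet output folder at the same time. A condensed sketch of that decision, using the same constants (the helper name is illustrative):

    import psutil

    def choose_max_loop_size(nts: int, ncats: int, bytes_per_step: int = 45) -> tuple:
        """Return (max_loop_size, comment_prefix) following the logic in configure_troute."""
        est_bytes_required = nts * ncats * bytes_per_step
        local_ram_available = 0.8 * psutil.virtual_memory().available
        if est_bytes_required > local_ram_available:
            # break the run into RAM-sized chunks and uncomment the parquet output line
            return int(nts // (est_bytes_required // local_ram_available)), ""
        # everything fits: one loop, keep binary_nexus_file_folder commented out
        return nts, "#"

The two returned values are dropped into the troute.yaml template through the new binary_nexus_file_folder_comment placeholder shown at the bottom of this diff.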
@@ -287,22 +231,14 @@ def make_ngen_realization_json(
  json.dump(realization, file, indent=4)
 
 
- def create_em_realization(cat_id: str, start_time: datetime, end_time: datetime):
+ def create_lstm_realization(cat_id: str, start_time: datetime, end_time: datetime):
  paths = file_paths(cat_id)
- template_path = file_paths.template_em_realization_config
- em_config = file_paths.template_em_model_config
- # move em_config to paths.config_dir
- with open(em_config, "r") as f:
- em_config = f.read()
- with open(paths.config_dir / "em-config.yml", "w") as f:
- f.write(em_config)
-
+ template_path = file_paths.template_lstm_realization_config
  configure_troute(cat_id, paths.config_dir, start_time, end_time)
  make_ngen_realization_json(paths.config_dir, template_path, start_time, end_time)
- make_em_config(paths.geopackage_path, paths.config_dir)
+ make_lstm_config(paths.geopackage_path, paths.config_dir)
  # create some partitions for parallelization
  paths.setup_run_folders()
- create_partitions(paths)
 
 
  def create_realization(
@@ -345,48 +281,3 @@ def create_realization(
 
  # create some partitions for parallelization
  paths.setup_run_folders()
- create_partitions(paths)
-
-
- def create_partitions(paths: file_paths, num_partitions: Optional[int] = None) -> None:
- if num_partitions is None:
- num_partitions = multiprocessing.cpu_count()
-
- cat_to_nex_pairs = get_cat_to_nex_flowpairs(hydrofabric=paths.geopackage_path)
- # nexus = defaultdict(list)
-
- # for cat, nex in cat_to_nex_pairs:
- # nexus[nex].append(cat)
-
- num_partitions = min(num_partitions, len(cat_to_nex_pairs))
- # partition_size = ceil(len(nexus) / num_partitions)
- # num_nexus = len(nexus)
- # nexus = list(nexus.items())
- # partitions = []
- # for i in range(0, num_nexus, partition_size):
- # part = {}
- # part["id"] = i // partition_size
- # part["cat-ids"] = []
- # part["nex-ids"] = []
- # part["remote-connections"] = []
- # for j in range(i, i + partition_size):
- # if j < num_nexus:
- # part["cat-ids"].extend(nexus[j][1])
- # part["nex-ids"].append(nexus[j][0])
- # partitions.append(part)
-
- # with open(paths.subset_dir / f"partitions_{num_partitions}.json", "w") as f:
- # f.write(json.dumps({"partitions": partitions}, indent=4))
-
- # write this to a metadata file to save on repeated file io to recalculate
- with open(paths.metadata_dir / "num_partitions", "w") as f:
- f.write(str(num_partitions))
-
-
- if __name__ == "__main__":
- cat_id = "cat-1643991"
- start_time = datetime(2010, 1, 1, 0, 0, 0)
- end_time = datetime(2010, 1, 2, 0, 0, 0)
- # output_interval = 3600
- # nts = 2592
- create_realization(cat_id, start_time, end_time)
@@ -7,9 +7,9 @@ from typing import List, Literal, Optional, Tuple, Union
  import geopandas as gpd
  import numpy as np
  import xarray as xr
+ from dask.distributed import Client, Future, progress
+ from data_processing.dask_utils import no_cluster, temp_cluster
  from xarray.core.types import InterpOptions
- from dask.distributed import Client, progress, Future
- from data_processing.dask_utils import use_cluster
 
  logger = logging.getLogger(__name__)
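Note: the use_cluster decorator is replaced by two imports, no_cluster and temp_cluster, from data_processing.dask_utils. That module is not part of this diff, so the sketch below is only a guess at the usual shape of such decorators: temp_cluster runs the wrapped call inside a short-lived local Dask cluster, while no_cluster ensures the call runs without a distributed client.

    # Hypothetical sketch; the real implementations live in data_processing/dask_utils.py,
    # which is unchanged in this diff.
    import functools
    from dask.distributed import Client, LocalCluster

    def temp_cluster(func):
        """Run the wrapped function with a temporary local Dask cluster."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with LocalCluster() as cluster, Client(cluster):
                return func(*args, **kwargs)
        return wrapper

    def no_cluster(func):
        """Marker: the wrapped function must not depend on a distributed client."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        return wrapper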
 
@@ -117,13 +117,14 @@ def clip_dataset_to_bounds(
  return dataset
 
 
+ @no_cluster
  def interpolate_nan_values(
  dataset: xr.Dataset,
  variables: Optional[List[str]] = None,
  dim: str = "time",
  method: InterpOptions = "nearest",
  fill_value: str = "extrapolate",
- ) -> None:
+ ) -> bool:
  """
  Interpolates NaN values in specified (or all numeric time-dependent)
  variables of an xarray.Dataset. Operates inplace on the dataset.
@@ -145,6 +146,7 @@
  Set to "extrapolate" to fill with the nearest valid value when using 'nearest' or 'linear'.
  Default is "extrapolate".
  """
+ interpolation_used = False
  for name, var in dataset.data_vars.items():
  # if the variable is non-numeric, skip
  if not np.issubdtype(var.dtype, np.number):
@@ -158,9 +160,35 @@
  method=method,
  fill_value=fill_value if method in ["nearest", "linear"] else None,
  )
+ interpolation_used = True
+ return interpolation_used
 
 
- @use_cluster
+ @no_cluster
+ def save_dataset_no_cluster(
+ ds_to_save: xr.Dataset,
+ target_path: Path,
+ engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
+ ):
+ """
+ This explicitly does not use dask distributed.
+ Helper function to compute and save an xarray.Dataset to a NetCDF file.
+ Uses a temporary file and rename to avoid leaving a half-written file.
+ """
+ if not target_path.parent.exists():
+ target_path.parent.mkdir(parents=True, exist_ok=True)
+
+ temp_file_path = target_path.with_name(target_path.name + ".saving.nc")
+ if temp_file_path.exists():
+ os.remove(temp_file_path)
+
+ ds_to_save.to_netcdf(temp_file_path, engine=engine, compute=True)
+
+ os.rename(str(temp_file_path), str(target_path))
+ logger.info(f"Successfully saved data to: {target_path}")
+
+
+ @temp_cluster
  def save_dataset(
  ds_to_save: xr.Dataset,
  target_path: Path,
@@ -184,20 +212,21 @@ def save_dataset(
  logger.debug(
  f"NetCDF write task submitted to Dask. Waiting for completion to {temp_file_path}..."
  )
+ logger.info("For more detailed progress, see the Dask dashboard http://localhost:8787/status")
  progress(future)
  future.result()
  os.rename(str(temp_file_path), str(target_path))
  logger.info(f"Successfully saved data to: {target_path}")
 
 
- @use_cluster
+ @no_cluster
  def save_to_cache(
  stores: xr.Dataset, cached_nc_path: Path, interpolate_nans: bool = True
  ) -> xr.Dataset:
  """
  Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth.
  """
- logger.info(f"Processing dataset for caching. Final cache target: {cached_nc_path}")
+ logger.debug(f"Processing dataset for caching. Final cache target: {cached_nc_path}")
 
  # lazily cast all numbers to f32
  for name, var in stores.data_vars.items():
@@ -206,13 +235,18 @@ def save_to_cache(
 
  # save dataset locally before manipulating it
  save_dataset(stores, cached_nc_path)
- stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
 
  if interpolate_nans:
- interpolate_nan_values(dataset=stores)
- save_dataset(stores, cached_nc_path)
- stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+ stores = xr.open_mfdataset(
+ cached_nc_path,
+ parallel=True,
+ engine="h5netcdf",
+ )
+ was_interpolated = interpolate_nan_values(dataset=stores)
+ if was_interpolated:
+ save_dataset_no_cluster(stores, cached_nc_path)
 
+ stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
  return stores
 
 
@@ -1,6 +1,7 @@
+ from datetime import datetime
  from pathlib import Path
  from typing import Optional
- from datetime import datetime
+
 
  class file_paths:
  """
@@ -27,11 +28,10 @@ class file_paths:
  dev_file = Path(__file__).parent.parent.parent / ".dev"
  template_troute_config = data_sources / "ngen-routing-template.yaml"
  template_cfe_nowpm_realization_config = data_sources / "cfe-nowpm-realization-template.json"
- template_em_realization_config = data_sources / "em-realization-template.json"
+ template_lstm_realization_config = data_sources / "lstm-realization-template.json"
  template_noahowp_config = data_sources / "noah-owp-modular-init.namelist.input"
  template_cfe_config = data_sources / "cfe-template.ini"
- template_em_config = data_sources / "em-catchment-template.yml"
- template_em_model_config = data_sources / "em-config.yml"
+ template_lstm_config = data_sources / "lstm-catchment-template.yml"
 
  def __init__(self, folder_name: Optional[str] = None, output_dir: Optional[Path] = None):
  """
@@ -169,7 +169,6 @@ def get_upstream_cats(names: Union[str, List[str]]) -> Set[str]:
  node_index = graph.vs.find(cat=name).index
  else:
  node_index = graph.vs.find(name=name).index
- node_index = graph.vs.find(cat=name).index
  upstream_nodes = graph.subcomponent(node_index, mode="IN")
  for node in upstream_nodes:
  parent_ids.add(graph.vs[node]["name"])
@@ -178,7 +177,6 @@ def get_upstream_cats(names: Union[str, List[str]]) -> Set[str]:
  logger.critical(f"Catchment {name} not found in the hydrofabric graph.")
  except ValueError:
  logger.critical(f"Catchment {name} not found in the hydrofabric graph.")
-
  # sometimes returns None, which isn't helpful
  if None in cat_ids:
  cat_ids.remove(None)
data_processing/subset.py CHANGED
@@ -12,9 +12,11 @@ from data_processing.gpkg_utils import (
  update_geopackage_metadata,
  )
  from data_processing.graph_utils import get_upstream_ids
+ from rich.console import Console
+ from rich.prompt import Prompt
 
  logger = logging.getLogger(__name__)
-
+ console = Console()
  subset_tables = [
  "divides",
  "divide-attributes", # requires divides
@@ -30,15 +32,33 @@ subset_tables = [
 
 
  def create_subset_gpkg(
- ids: Union[List[str], str], hydrofabric: Path, output_gpkg_path: Path, is_vpu: bool = False
+ ids: Union[List[str], str],
+ hydrofabric: Path,
+ output_gpkg_path: Path,
+ is_vpu: bool = False,
+ override_gpkg: bool = True,
  ):
  # ids is a list of nexus and wb ids, or a single vpu id
  if not isinstance(ids, list):
  ids = [ids]
  output_gpkg_path.parent.mkdir(parents=True, exist_ok=True)
 
- if os.path.exists(output_gpkg_path):
- os.remove(output_gpkg_path)
+ if not override_gpkg:
+ if os.path.exists(output_gpkg_path):
+ response = Prompt.ask(
+ f"Subset geopackage at {output_gpkg_path} already exists. Are you sure you want to overwrite it?",
+ default="n",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ console.print(f"Removing {output_gpkg_path}...", style="yellow")
+ os.remove(output_gpkg_path)
+ else:
+ console.print("Exiting...", style="bold red")
+ exit()
+ else:
+ if os.path.exists(output_gpkg_path):
+ os.remove(output_gpkg_path)
 
  create_empty_gpkg(output_gpkg_path)
  logger.info(f"Subsetting tables: {subset_tables}")
@@ -55,8 +75,18 @@
  def subset_vpu(
  vpu_id: str, output_gpkg_path: Path, hydrofabric: Path = file_paths.conus_hydrofabric
  ):
- if output_gpkg_path.exists():
- os.remove(output_gpkg_path)
+ if os.path.exists(output_gpkg_path):
+ response = Prompt.ask(
+ f"Subset geopackage at {output_gpkg_path} already exists. Are you sure you want to overwrite it?",
+ default="n",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ console.print(f"Removing {output_gpkg_path}...", style="yellow")
+ os.remove(output_gpkg_path)
+ else:
+ console.print("Exiting...", style="bold red")
+ exit()
 
  create_subset_gpkg(vpu_id, hydrofabric, output_gpkg_path=output_gpkg_path, is_vpu=True)
  logger.info(f"Subset complete for VPU {vpu_id}")
@@ -68,6 +98,7 @@ def subset(
  hydrofabric: Path = file_paths.conus_hydrofabric,
  output_gpkg_path: Path = Path(),
  include_outlet: bool = True,
+ override_gpkg: bool = True,
  ):
  upstream_ids = list(get_upstream_ids(cat_ids, include_outlet))
 
@@ -78,6 +109,6 @@
  paths = file_paths(output_folder_name)
  output_gpkg_path = paths.geopackage_path
 
- create_subset_gpkg(upstream_ids, hydrofabric, output_gpkg_path)
+ create_subset_gpkg(upstream_ids, hydrofabric, output_gpkg_path, override_gpkg=override_gpkg)
  logger.info(f"Subset complete for {len(upstream_ids)} features (catchments + nexuses)")
  logger.debug(f"Subset complete for {upstream_ids} catchments")
@@ -0,0 +1,17 @@
+ time_step: "1 hour"
+ area_sqkm: {area_sqkm} # areasqkm
+ basin_id: {divide_id}
+ basin_name: {divide_id}
+ elev_mean: {elevation_mean} # mean.elevation
+ initial_state: zero
+ lat: {lat} # needs calculating
+ lon: {lon} # needs calculating
+ slope_mean: {slope_mean} # mean.slope
+ train_cfg_file:
+ - /ngen/ngen/extern/lstm/trained_neuralhydrology_models/nh_AORC_hourly_25yr_1210_112435_7/config.yml
+ - /ngen/ngen/extern/lstm/trained_neuralhydrology_models/nh_AORC_hourly_25yr_1210_112435_8/config.yml
+ - /ngen/ngen/extern/lstm/trained_neuralhydrology_models/nh_AORC_hourly_25yr_1210_112435_9/config.yml
+ - /ngen/ngen/extern/lstm/trained_neuralhydrology_models/nh_AORC_hourly_25yr_seq999_seed101_0701_143442/config.yml
+ - /ngen/ngen/extern/lstm/trained_neuralhydrology_models/nh_AORC_hourly_25yr_seq999_seed103_2701_171540/config.yml
+ - /ngen/ngen/extern/lstm/trained_neuralhydrology_models/nh_AORC_hourly_slope_elev_precip_temp_seq999_seed101_2801_191806/config.yml
+ verbose: 0
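Note: this new per-catchment template is filled by make_lstm_config (earlier in this diff) with plain str.format, producing one YAML file per divide_id. Roughly (the values and paths are illustrative):

    # Sketch of how the template above is consumed.
    with open("lstm-catchment-template.yml") as f:
        template = f.read()

    rendered = template.format(
        area_sqkm=12.3,
        divide_id="cat-1643991",
        elevation_mean=0.25,  # mean.elevation / 1000
        lat=35.0,
        lon=-85.0,
        slope_mean=42.0,      # mean_slope_mpkm
    )
    with open("cat-1643991.yml", "w") as f:
        f.write(rendered)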
@@ -5,25 +5,22 @@
  "name": "bmi_multi",
  "params": {
  "name": "bmi_multi",
- "model_type_name": "empirical_model",
+ "model_type_name": "lstm",
  "forcing_file": "",
  "init_config": "",
  "allow_exceed_end_time": true,
  "main_output_variable": "land_surface_water__runoff_depth",
- "modules": [
+ "modules": [
  {
  "name": "bmi_python",
  "params": {
  "name": "bmi_python",
  "python_type": "lstm.bmi_lstm.bmi_LSTM",
- "model_type_name": "bmi_empirical_model",
- "init_config": "./config/cat_config/empirical_model/{{id}}.yml",
+ "model_type_name": "bmi_lstm",
+ "init_config": "./config/cat_config/lstm/{{id}}.yml",
  "allow_exceed_end_time": true,
  "main_output_variable": "land_surface_water__runoff_depth",
- "uses_forcing_file": false,
- "variables_names_map": {
- "atmosphere_water__liquid_equivalent_precipitation_rate": "APCP_surface"
- }
+ "uses_forcing_file": false
  }
  }
  ]
@@ -62,7 +62,7 @@ compute_parameters:
  qlat_input_folder: ./outputs/ngen/
  qlat_file_pattern_filter: "nex-*"
 
- #binary_nexus_file_folder: ./outputs/parquet/ # if nexus_file_pattern_filter="nex-*" and you want it to reformat them as parquet, you need this
+ {binary_nexus_file_folder_comment}binary_nexus_file_folder: ./outputs/parquet/ # if nexus_file_pattern_filter="nex-*" and you want it to reformat them as parquet, you need this
  #coastal_boundary_input_file : channel_forcing/schout_1.nc
  nts: {nts} #288 for 1day
  max_loop_size: {max_loop_size} # [number of timesteps]
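Note: the routing template now exposes a {binary_nexus_file_folder_comment} placeholder so configure_troute can toggle the parquet output line by substituting either an empty string or "#":

    line = "{binary_nexus_file_folder_comment}binary_nexus_file_folder: ./outputs/parquet/"
    print(line.format(binary_nexus_file_folder_comment=""))   # binary_nexus_file_folder: ./outputs/parquet/
    print(line.format(binary_nexus_file_folder_comment="#"))  # #binary_nexus_file_folder: ./outputs/parquet/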