ngiab-data-preprocess 4.5.0__py3-none-any.whl → 4.6.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,21 +1,21 @@
  import json
  import logging
  import multiprocessing
+ import os
  import shutil
  import sqlite3
  from datetime import datetime
  from pathlib import Path
  from typing import Dict, Optional
- import psutil
- import os

  import numpy as np
  import pandas
+ import psutil
  import requests
  import s3fs
  import xarray as xr
  from data_processing.dask_utils import temp_cluster
- from data_processing.file_paths import file_paths
+ from data_processing.file_paths import FilePaths
  from data_processing.gpkg_utils import (
      get_cat_to_nhd_feature_id,
      get_table_crs_short,
@@ -27,7 +27,7 @@ logger = logging.getLogger(__name__)


  @temp_cluster
- def get_approximate_gw_storage(paths: file_paths, start_date: datetime) -> Dict[str, int]:
+ def get_approximate_gw_storage(paths: FilePaths, start_date: datetime) -> Dict[str, int]:
      # get the gw levels from the NWM output on a given start date
      # this kind of works in place of warmstates for now
      year = start_date.strftime("%Y")
@@ -50,11 +50,9 @@ def get_approximate_gw_storage(paths: file_paths, start_date: datetime) -> Dict[
      return water_levels


- def make_cfe_config(
-     divide_conf_df: pandas.DataFrame, files: file_paths, water_levels: dict
- ) -> None:
+ def make_cfe_config(divide_conf_df: pandas.DataFrame, files: FilePaths, water_levels: dict) -> None:
      """Parses parameters from NOAHOWP_CFE DataFrame and returns a dictionary of catchment configurations."""
-     with open(file_paths.template_cfe_config, "r") as f:
+     with open(FilePaths.template_cfe_config, "r") as f:
          cfe_template = f.read()
      cat_config_dir = files.config_dir / "cat_config" / "CFE"
      cat_config_dir.mkdir(parents=True, exist_ok=True)
@@ -92,7 +90,7 @@ def make_noahowp_config(
  ) -> None:
      start_datetime = start_time.strftime("%Y%m%d%H%M")
      end_datetime = end_time.strftime("%Y%m%d%H%M")
-     with open(file_paths.template_noahowp_config, "r") as file:
+     with open(FilePaths.template_noahowp_config, "r") as file:
          template = file.read()

      cat_config_dir = base_dir / "cat_config" / "NOAH-OWP-M"
@@ -137,7 +135,7 @@ def get_model_attributes(hydrofabric: Path) -> pandas.DataFrame:
  def make_lstm_config(
      hydrofabric: Path,
      output_dir: Path,
-     template_path: Path = file_paths.template_lstm_config,
+     template_path: Path = FilePaths.template_lstm_config,
  ):
      # test if modspatialite is available

@@ -169,7 +167,7 @@ def make_lstm_config(
                  lat=row["latitude"],
                  lon=row["longitude"],
                  slope_mean=row["mean_slope_mpkm"],
-                 elevation_mean=row["mean.elevation"] / 1000, # convert mm in hf to m
+                 elevation_mean=row["mean.elevation"] / 100,  # convert cm in hf to m
              )
          )

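The divisor change above tracks the units of mean.elevation in the hydrofabric: 4.6.0 treats the value as centimeters rather than millimeters. A quick check with an illustrative, made-up value:

    elevation_cm = 123456.0            # hypothetical mean.elevation read from the hydrofabric
    elevation_m = elevation_cm / 100   # new divisor: 1234.56 m
    # the old divisor of 1000 would have yielded 123.456 m, ten times too small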
@@ -177,17 +175,19 @@
  def configure_troute(
      cat_id: str, config_dir: Path, start_time: datetime, end_time: datetime
  ) -> None:
-     with open(file_paths.template_troute_config, "r") as file:
+     with open(FilePaths.template_troute_config, "r") as file:
          troute_template = file.read()
      time_step_size = 300
-     gpkg_file_path=f"{config_dir}/{cat_id}_subset.gpkg"
+     gpkg_file_path = f"{config_dir}/{cat_id}_subset.gpkg"
      nts = (end_time - start_time).total_seconds() / time_step_size
      with sqlite3.connect(gpkg_file_path) as conn:
          ncats_df = pandas.read_sql_query("SELECT COUNT(id) FROM 'divides';", conn)
-         ncats = ncats_df['COUNT(id)'][0]
+         ncats = ncats_df["COUNT(id)"][0]

-     est_bytes_required = nts * ncats * 45 # extremely rough calculation based on about 3 tests :)
-     local_ram_available = 0.8 * psutil.virtual_memory().available # buffer to not accidentally explode machine
+     est_bytes_required = nts * ncats * 45  # extremely rough calculation based on about 3 tests :)
+     local_ram_available = (
+         0.8 * psutil.virtual_memory().available
+     )  # buffer to not accidentally explode machine

      if est_bytes_required > local_ram_available:
          max_loop_size = nts // (est_bytes_required // local_ram_available)
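The reformatted block keeps the 4.5.0 sizing logic: estimate the routing output size as timesteps x catchments x ~45 bytes, compare it against 80% of free RAM, and shrink the loop size when the estimate does not fit. A standalone sketch with made-up numbers (the else branch here is an assumption, it is not shown in this hunk):

    import psutil

    nts = (365 * 24 * 3600) / 300            # a year of 300-second routing timesteps
    ncats = 50_000                           # hypothetical number of divides
    est_bytes_required = nts * ncats * 45    # ~45 bytes per catchment-timestep
    local_ram_available = 0.8 * psutil.virtual_memory().available

    if est_bytes_required > local_ram_available:
        # split the run into smaller loops so each loop's output fits in RAM
        max_loop_size = nts // (est_bytes_required // local_ram_available)
    else:
        max_loop_size = nts                  # assumption: everything fits in one loop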
@@ -210,7 +210,7 @@ def configure_troute(
          start_datetime=start_time.strftime("%Y-%m-%d %H:%M:%S"),
          nts=nts,
          max_loop_size=max_loop_size,
-         binary_nexus_file_folder_comment=binary_nexus_file_folder_comment
+         binary_nexus_file_folder_comment=binary_nexus_file_folder_comment,
      )

      with open(config_dir / "troute.yaml", "w") as file:
@@ -231,11 +231,26 @@ def make_ngen_realization_json(
          json.dump(realization, file, indent=4)


- def create_lstm_realization(cat_id: str, start_time: datetime, end_time: datetime):
-     paths = file_paths(cat_id)
-     template_path = file_paths.template_lstm_realization_config
+ def create_lstm_realization(
+     cat_id: str, start_time: datetime, end_time: datetime, use_rust: bool = False
+ ):
+     paths = FilePaths(cat_id)
+     realization_path = paths.config_dir / "realization.json"
      configure_troute(cat_id, paths.config_dir, start_time, end_time)
-     make_ngen_realization_json(paths.config_dir, template_path, start_time, end_time)
+     # python version of the lstm
+     python_template_path = FilePaths.template_lstm_realization_config
+     make_ngen_realization_json(paths.config_dir, python_template_path, start_time, end_time)
+     realization_path.rename(paths.config_dir / "python_lstm_real.json")
+     # rust version of the lstm
+     rust_template_path = FilePaths.template_lstm_rust_realization_config
+     make_ngen_realization_json(paths.config_dir, rust_template_path, start_time, end_time)
+     realization_path.rename(paths.config_dir / "rust_lstm_real.json")
+
+     if use_rust:
+         (paths.config_dir / "rust_lstm_real.json").rename(realization_path)
+     else:
+         (paths.config_dir / "python_lstm_real.json").rename(realization_path)
+
      make_lstm_config(paths.geopackage_path, paths.config_dir)
      # create some partitions for parallelization
      paths.setup_run_folders()
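create_lstm_realization now renders both LSTM realization templates and keeps whichever one use_rust selects as config/realization.json; the non-selected variant stays in the config directory under its own name. A hypothetical call (argument values are made up):

    from datetime import datetime

    create_lstm_realization(
        "cat-1496145",
        start_time=datetime(2022, 1, 1),
        end_time=datetime(2022, 2, 1),
        use_rust=True,   # rust_lstm_real.json becomes realization.json
    )
    # python_lstm_real.json is left alongside it, so switching back is a file rename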
@@ -248,7 +263,7 @@ def create_realization(
      use_nwm_gw: bool = False,
      gage_id: Optional[str] = None,
  ):
-     paths = file_paths(cat_id)
+     paths = FilePaths(cat_id)

      template_path = paths.template_cfe_nowpm_realization_config

@@ -263,6 +278,8 @@ def create_realization(
              with open(template_path, "w") as f:
                  json.dump(new_template, f)
              logger.info(f"downloaded calibrated parameters for {gage_id}")
+         else:
+             logger.warning(f"could not download parameters for {gage_id}, using default template")

      conf_df = get_model_attributes(paths.geopackage_path)

@@ -2,14 +2,13 @@ import logging
  import os
  from datetime import datetime
  from pathlib import Path
- from typing import List, Literal, Optional, Tuple, Union
+ from typing import Literal, Tuple, Union

  import geopandas as gpd
  import numpy as np
  import xarray as xr
  from dask.distributed import Client, Future, progress
  from data_processing.dask_utils import no_cluster, temp_cluster
- from xarray.core.types import InterpOptions

  logger = logging.getLogger(__name__)

@@ -116,78 +115,6 @@ def clip_dataset_to_bounds(
      logger.info("Selected time range and clipped to bounds")
      return dataset

-
- @no_cluster
- def interpolate_nan_values(
-     dataset: xr.Dataset,
-     variables: Optional[List[str]] = None,
-     dim: str = "time",
-     method: InterpOptions = "nearest",
-     fill_value: str = "extrapolate",
- ) -> bool:
-     """
-     Interpolates NaN values in specified (or all numeric time-dependent)
-     variables of an xarray.Dataset. Operates inplace on the dataset.
-
-     Parameters
-     ----------
-     dataset : xr.Dataset
-         The input dataset.
-     variables : Optional[List[str]], optional
-         A list of variable names to process. If None (default),
-         all numeric variables containing the specified dimension will be processed.
-     dim : str, optional
-         The dimension along which to interpolate (default is "time").
-     method : str, optional
-         Interpolation method to use (e.g., "linear", "nearest", "cubic").
-         Default is "nearest".
-     fill_value : str, optional
-         Method for filling NaNs at the start/end of the series after interpolation.
-         Set to "extrapolate" to fill with the nearest valid value when using 'nearest' or 'linear'.
-         Default is "extrapolate".
-     """
-     interpolation_used = False
-     for name, var in dataset.data_vars.items():
-         # if the variable is non-numeric, skip
-         if not np.issubdtype(var.dtype, np.number):
-             continue
-         # if there are no NANs, skip
-         if not var.isnull().any().compute():
-             continue
-
-         dataset[name] = var.interpolate_na(
-             dim=dim,
-             method=method,
-             fill_value=fill_value if method in ["nearest", "linear"] else None,
-         )
-         interpolation_used = True
-     return interpolation_used
-
-
- @no_cluster
- def save_dataset_no_cluster(
-     ds_to_save: xr.Dataset,
-     target_path: Path,
-     engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
- ):
-     """
-     This explicitly does not use dask distributed.
-     Helper function to compute and save an xarray.Dataset to a NetCDF file.
-     Uses a temporary file and rename for avoid leaving a half written file.
-     """
-     if not target_path.parent.exists():
-         target_path.parent.mkdir(parents=True, exist_ok=True)
-
-     temp_file_path = target_path.with_name(target_path.name + ".saving.nc")
-     if temp_file_path.exists():
-         os.remove(temp_file_path)
-
-     ds_to_save.to_netcdf(temp_file_path, engine=engine, compute=True)
-
-     os.rename(str(temp_file_path), str(target_path))
-     logger.info(f"Successfully saved data to: {target_path}")
-
-
  @temp_cluster
  def save_dataset(
      ds_to_save: xr.Dataset,
@@ -195,7 +122,8 @@ def save_dataset(
      engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
  ):
      """
-     Helper function to compute and save an xarray.Dataset to a NetCDF file.
+     Helper function to compute and save an xarray.Dataset (specifically, the raw
+     forcing data) to a NetCDF file.
      Uses a temporary file and rename for atomicity.
      """
      if not target_path.parent.exists():
@@ -221,7 +149,7 @@

  @no_cluster
  def save_to_cache(
-     stores: xr.Dataset, cached_nc_path: Path, interpolate_nans: bool = True
+     stores: xr.Dataset, cached_nc_path: Path
  ) -> xr.Dataset:
      """
      Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth.
@@ -236,16 +164,6 @@
      # save dataset locally before manipulating it
      save_dataset(stores, cached_nc_path)

-     if interpolate_nans:
-         stores = xr.open_mfdataset(
-             cached_nc_path,
-             parallel=True,
-             engine="h5netcdf",
-         )
-         was_interpolated = interpolate_nan_values(dataset=stores)
-         if was_interpolated:
-             save_dataset_no_cluster(stores, cached_nc_path)
-
      stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
      return stores

@@ -3,7 +3,7 @@ from pathlib import Path
  from typing import Optional


- class file_paths:
+ class FilePaths:
      """
      This class contains all of the file paths used in the data processing
      workflow.
@@ -29,13 +29,14 @@ class file_paths:
      template_troute_config = data_sources / "ngen-routing-template.yaml"
      template_cfe_nowpm_realization_config = data_sources / "cfe-nowpm-realization-template.json"
      template_lstm_realization_config = data_sources / "lstm-realization-template.json"
+     template_lstm_rust_realization_config = data_sources / "lstm-rust-realization-template.json"
      template_noahowp_config = data_sources / "noah-owp-modular-init.namelist.input"
      template_cfe_config = data_sources / "cfe-template.ini"
      template_lstm_config = data_sources / "lstm-catchment-template.yml"

      def __init__(self, folder_name: Optional[str] = None, output_dir: Optional[Path] = None):
          """
-         Initialize the file_paths class with a the name of the output subfolder.
+         Initialize the FilePaths class with a the name of the output subfolder.
          OR the path to the output folder you want to use.
          use one or the other, not both

@@ -49,8 +50,8 @@ class file_paths:
              self.folder_name = folder_name
              self.output_dir = self.root_output_dir() / folder_name
          if output_dir:
-             self.output_dir = output_dir
-             self.folder_name = str(output_dir.stem)
+             self.output_dir = Path(output_dir)
+             self.folder_name = self.output_dir.stem

          self.cache_dir.mkdir(parents=True, exist_ok=True)

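Wrapping output_dir in Path() means a plain string is now accepted, and folder_name is derived from the coerced path. A small sketch (paths are illustrative, and instantiating FilePaths creates its cache directory as a side effect):

    p1 = FilePaths(output_dir=Path("/data/ngiab/cat-1496145"))
    p2 = FilePaths(output_dir="/data/ngiab/cat-1496145")   # str is coerced to Path
    assert p1.folder_name == p2.folder_name == "cat-1496145"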
@@ -89,7 +90,13 @@

      @property
      def metadata_dir(self) -> Path:
-         return self.subset_dir / "metadata"
+         meta_dir = self.subset_dir / "metadata"
+         meta_dir.mkdir(parents=True, exist_ok=True)
+         return meta_dir
+
+     @property
+     def forcing_progress_file(self) -> Path:
+         return self.metadata_dir / "forcing_progress.json"

      @property
      def geopackage_path(self) -> Path:
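The new forcing_progress_file property builds on metadata_dir, which now creates itself on first access, so callers can write the progress file without an explicit mkdir. A short sketch (the folder name is hypothetical):

    paths = FilePaths("cat-1496145")
    progress_path = paths.forcing_progress_file   # .../metadata/forcing_progress.json under the subset dir
    assert progress_path.parent.is_dir()          # metadata/ was created by the property access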
@@ -1,3 +1,4 @@
+ import json
  import logging
  import multiprocessing
  import os
@@ -16,7 +17,7 @@ import psutil
  import xarray as xr
  from data_processing.dask_utils import no_cluster, use_cluster
  from data_processing.dataset_utils import validate_dataset_format
- from data_processing.file_paths import file_paths
+ from data_processing.file_paths import FilePaths
  from exactextract import exact_extract
  from exactextract.raster import NumPyRasterSource
  from rich.progress import (
@@ -26,6 +27,7 @@ from rich.progress import (
      TimeElapsedColumn,
      TimeRemainingColumn,
  )
+ from xarray.core.types import InterpOptions

  logger = logging.getLogger(__name__)
  # Suppress the specific warning from numpy to keep the cli output clean
@@ -63,10 +65,16 @@ def weighted_sum_of_cells(
      Each element contains the averaged forcing value for the whole catchment
      over one timestep.
      """
-     result = np.zeros(flat_raster.shape[0])
+     # early exit for divide by zero
+     if np.all(factors == 0):
+         return np.zeros(flat_raster.shape[0])
+
+     selected_cells = flat_raster[:, cell_ids]
+     has_nan = np.isnan(selected_cells).any(axis=1)
      result = np.sum(flat_raster[:, cell_ids] * factors, axis=1)
      sum_of_weights = np.sum(factors)
      result /= sum_of_weights
+     result[has_nan] = np.nan
      return result


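With the change above, any timestep whose selected cells contain a NaN now comes out of the weighted average as NaN (to be filled in later by interpolation) instead of a value silently computed around the gap. A tiny illustration with made-up numbers:

    import numpy as np

    flat_raster = np.array([[1.0, 2.0, 3.0],
                            [4.0, np.nan, 6.0]])   # 2 timesteps x 3 cells
    cell_ids = np.array([0, 1])
    factors = np.array([0.25, 0.75])               # coverage-weighted fractions

    selected = flat_raster[:, cell_ids]
    has_nan = np.isnan(selected).any(axis=1)
    result = np.sum(selected * factors, axis=1) / np.sum(factors)
    result[has_nan] = np.nan
    # result -> [1.75, nan]; the second timestep is flagged rather than averaged around the gap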
@@ -305,6 +313,46 @@ def get_units(dataset: xr.Dataset) -> dict:
      return units


+ def interpolate_nan_values(
+     dataset: xr.Dataset,
+     dim: str = "time",
+     method: InterpOptions = "linear",
+     fill_value: str = "extrapolate",
+ ) -> bool:
+     """
+     Interpolates NaN values in specified (or all numeric time-dependent)
+     variables of an xarray.Dataset. Operates inplace on the dataset.
+
+     Parameters
+     ----------
+     dataset : xr.Dataset
+         The input dataset.
+     dim : str, optional
+         The dimension along which to interpolate (default is "time").
+     method : str, optional
+         Interpolation method to use (e.g., "linear", "nearest", "cubic").
+         Default is "linear".
+     fill_value : str, optional
+         Method for filling NaNs at the start/end of the series after interpolation.
+         Set to "extrapolate" to fill with the nearest valid value when using 'nearest' or 'linear'.
+         Default is "extrapolate".
+     """
+     for name, var in dataset.data_vars.items():
+         # if the variable is non-numeric, skip
+         if not np.issubdtype(var.dtype, np.number):
+             continue
+         # if there are no NANs, skip
+         if not var.isnull().any().compute():
+             continue
+         logger.info("Interpolating NaN values in %s", name)
+
+         dataset[name] = var.interpolate_na(
+             dim=dim,
+             method=method,
+             fill_value=fill_value if method in ["nearest", "linear"] else None,
+         )
+
+
  @no_cluster
  def compute_zonal_stats(
      gdf: gpd.GeoDataFrame, gridded_data: xr.Dataset, forcings_dir: Path
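interpolate_nan_values now lives in the forcings module and defaults to linear interpolation with extrapolation at the series edges. A toy example of the underlying interpolate_na call it wraps (not package data; the extrapolating linear fill relies on scipy being available):

    import numpy as np
    import pandas as pd
    import xarray as xr

    ds = xr.Dataset(
        {"temperature": ("time", [10.0, np.nan, 14.0, np.nan])},
        coords={"time": pd.date_range("2022-01-01", periods=4, freq="h")},
    )
    ds["temperature"] = ds["temperature"].interpolate_na(
        dim="time", method="linear", fill_value="extrapolate"
    )
    # -> [10.0, 12.0, 14.0, 16.0]; the interior gap is interpolated, the trailing NaN extrapolated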
@@ -334,6 +382,17 @@ def compute_zonal_stats(

      cat_chunks: List[pd.DataFrame] = np.array_split(catchments, num_partitions)  # type: ignore

+     progress_file = FilePaths(output_dir=forcings_dir.parent.stem).forcing_progress_file
+     ex_var_name = list(gridded_data.data_vars)[0]
+     example_time_chunks = get_index_chunks(gridded_data[ex_var_name])
+     all_steps = len(example_time_chunks) * len(gridded_data.data_vars)
+     logger.info(
+         f"Total steps: {all_steps}, Number of time chunks: {len(example_time_chunks)}, Number of variables: {len(gridded_data.data_vars)}"
+     )
+     steps_completed = 0
+     with open(progress_file, "w") as f:
+         json.dump({"total_steps": all_steps, "steps_completed": steps_completed}, f)
+
      progress = Progress(
          TextColumn("[progress.description]{task.description}"),
          BarColumn(),
@@ -391,6 +450,9 @@ def compute_zonal_stats(
              concatenated_da.to_dataset(name=data_var_name).to_netcdf(
                  forcings_dir / "temp" / f"{data_var_name}_timechunk_{i}.nc"
              )
+             steps_completed += 1
+             with open(progress_file, "w") as f:
+                 json.dump({"total_steps": all_steps, "steps_completed": steps_completed}, f)
      # Merge the chunks back together
      datasets = [
          xr.open_dataset(forcings_dir / "temp" / f"{data_var_name}_timechunk_{i}.nc")
@@ -413,6 +475,8 @@ def compute_zonal_stats(
          f"Forcing generation complete! Zonal stats computed in {time.time() - timer_start:2f} seconds"
      )
      write_outputs(forcings_dir, units)
+     time.sleep(1)  # wait for progress bar to update
+     progress_file.unlink()


  @use_cluster
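The forcing_progress.json file written during zonal-stats computation (and removed once write_outputs finishes) gives external tools something to poll. A hypothetical reader, not part of the package:

    import json
    from pathlib import Path

    def read_forcing_progress(progress_file: Path) -> float | None:
        # Return percent complete, or None once the file is gone (run finished or not started).
        if not progress_file.exists():
            return None
        with open(progress_file) as f:
            state = json.load(f)
        return 100 * state["steps_completed"] / state["total_steps"]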
@@ -455,7 +519,6 @@ def write_outputs(forcings_dir: Path, units: dict) -> None:
      for var in final_ds.data_vars:
          final_ds[var] = final_ds[var].astype(np.float32)

-     logger.info("Saving to disk")
      # The format for the netcdf is to support a legacy format
      # which is why it's a little "unorthodox"
      # There are no coordinates, just dimensions, catchment ids are stored in a 1d data var
@@ -481,7 +544,9 @@ def write_outputs(forcings_dir: Path, units: dict) -> None:
      final_ds["Time"].attrs["epoch_start"] = (
          "01/01/1970 00:00:00"  # not needed but suppresses the ngen warning
      )
+     interpolate_nan_values(final_ds)

+     logger.info("Saving to disk")
      final_ds.to_netcdf(forcings_dir / "forcings.nc", engine="netcdf4")
      # close the datasets
      _ = [result.close() for result in results]
@@ -493,8 +558,8 @@ def write_outputs(forcings_dir: Path, units: dict) -> None:
      temp_forcings_dir.rmdir()


- def setup_directories(cat_id: str) -> file_paths:
-     forcing_paths = file_paths(cat_id)
+ def setup_directories(cat_id: str) -> FilePaths:
+     forcing_paths = FilePaths(cat_id)
      # delete everything in the forcing folder except the cached nc file
      for file in forcing_paths.forcings_dir.glob("*.*"):
          if file != forcing_paths.cached_nc_file:
@@ -2,10 +2,10 @@ import logging
  import sqlite3
  import struct
  from pathlib import Path
- from typing import List, Tuple, Dict
+ from typing import Dict, List, Tuple

  import pyproj
- from data_processing.file_paths import file_paths
+ from data_processing.file_paths import FilePaths
  from shapely.geometry import Point
  from shapely.geometry.base import BaseGeometry
  from shapely.ops import transform
@@ -28,7 +28,7 @@ class GeoPackage:
          self.conn.close()


- def verify_indices(gpkg: Path = file_paths.conus_hydrofabric) -> None:
+ def verify_indices(gpkg: Path = FilePaths.conus_hydrofabric) -> None:
      """
      Verify that the indices in the specified geopackage are correct.
      If they are not, create the correct indices.
@@ -74,7 +74,7 @@ def create_empty_gpkg(gpkg: Path) -> None:
      """
      Create an empty geopackage with the necessary tables and indices.
      """
-     with open(file_paths.template_sql) as f:
+     with open(FilePaths.template_sql) as f:
          sql_script = f.read()

      with sqlite3.connect(gpkg) as conn:
@@ -85,7 +85,7 @@ def add_triggers_to_gpkg(gpkg: Path) -> None:
      """
      Adds geopackage triggers required to maintain spatial index integrity
      """
-     with open(file_paths.triggers_sql) as f:
+     with open(FilePaths.triggers_sql) as f:
          triggers = f.read()
      with sqlite3.connect(gpkg) as conn:
          conn.executescript(triggers)
@@ -93,8 +93,6 @@ def add_triggers_to_gpkg(gpkg: Path) -> None:
      logger.debug(f"Added triggers to subset gpkg {gpkg}")


-
-
  def blob_to_geometry(blob: bytes) -> BaseGeometry | None:
      """
      Convert a blob to a geometry.
@@ -178,7 +176,7 @@ def get_catid_from_point(coords: Dict[str, float]) -> str:

      """
      logger.info(f"Getting catid for {coords}")
-     q = file_paths.conus_hydrofabric
+     q = FilePaths.conus_hydrofabric
      point = Point(coords["lng"], coords["lat"])
      point = convert_to_5070(point)
      with sqlite3.connect(q) as con:
@@ -261,7 +259,7 @@ def update_geopackage_metadata(gpkg: Path) -> None:
      Update the contents of the gpkg_contents table in the specified geopackage.
      """
      # table_name, data_type, identifier, description, last_change, min_x, min_y, max_x, max_y, srs_id
-     tables = get_feature_tables(file_paths.conus_hydrofabric)
+     tables = get_feature_tables(FilePaths.conus_hydrofabric)
      con = sqlite3.connect(gpkg)
      for table in tables:
          min_x = con.execute(f"SELECT MIN(minx) FROM rtree_{table}_geom").fetchone()[0]
@@ -336,7 +334,7 @@ def subset_table_by_vpu(table: str, vpu: str, hydrofabric: Path, subset_gpkg_nam

      insert_data(dest_db, table, contents)

-     if table in get_feature_tables(file_paths.conus_hydrofabric):
+     if table in get_feature_tables(FilePaths.conus_hydrofabric):
          fids = [str(x[0]) for x in contents]
          copy_rTree_tables(table, fids, source_db, dest_db)

@@ -389,7 +387,7 @@ def subset_table(table: str, ids: List[str], hydrofabric: Path, subset_gpkg_name

      insert_data(dest_db, table, contents)

-     if table in get_feature_tables(file_paths.conus_hydrofabric):
+     if table in get_feature_tables(FilePaths.conus_hydrofabric):
          fids = [str(x[0]) for x in contents]
          copy_rTree_tables(table, fids, source_db, dest_db)

@@ -436,7 +434,7 @@ def get_table_crs(gpkg: str, table: str) -> str:
      return crs


- def get_cat_from_gage_id(gage_id: str, gpkg: Path = file_paths.conus_hydrofabric) -> str:
+ def get_cat_from_gage_id(gage_id: str, gpkg: Path = FilePaths.conus_hydrofabric) -> str:
      """
      Get the catchment id associated with a gage id.

@@ -476,7 +474,7 @@ def get_cat_from_gage_id(gage_id: str, gpkg: Path = file_paths.conus_hydrofabric
      return cat_id


- def get_cat_to_nex_flowpairs(hydrofabric: Path = file_paths.conus_hydrofabric) -> List[Tuple]:
+ def get_cat_to_nex_flowpairs(hydrofabric: Path = FilePaths.conus_hydrofabric) -> List[Tuple]:
      """
      Retrieves the from and to IDs from the specified hydrofabric.

@@ -484,7 +482,7 @@ def get_cat_to_nex_flowpairs(hydrofabric: Path = file_paths.conus_hydrofabric) -
      The true network flows catchment to waterbody to nexus, this bypasses the waterbody and returns catchment to nexus.

      Args:
-         hydrofabric (Path, optional): The file path to the hydrofabric. Defaults to file_paths.conus_hydrofabric.
+         hydrofabric (Path, optional): The file path to the hydrofabric. Defaults to FilePaths.conus_hydrofabric.
      Returns:
          List[tuple]: A list of tuples containing the from and to IDs.
      """
@@ -518,7 +516,7 @@ def get_available_tables(gpkg: Path) -> List[str]:
      return tables


- def get_cat_to_nhd_feature_id(gpkg: Path = file_paths.conus_hydrofabric) -> Dict[str, int]:
+ def get_cat_to_nhd_feature_id(gpkg: Path = FilePaths.conus_hydrofabric) -> Dict[str, int]:
      available_tables = get_available_tables(gpkg)
      possible_tables = ["flowpath_edge_list", "network"]

@@ -5,13 +5,13 @@ from pathlib import Path
  from typing import List, Optional, Set, Union

  import igraph as ig
- from data_processing.file_paths import file_paths
+ from data_processing.file_paths import FilePaths

  logger = logging.getLogger(__name__)


  def get_from_to_id_pairs(
-     hydrofabric: Path = file_paths.conus_hydrofabric, ids: Optional[Set | List] = None
+     hydrofabric: Path = FilePaths.conus_hydrofabric, ids: Optional[Set | List] = None
  ) -> List[tuple]:
      """
      Retrieves the from and to IDs from the specified hydrofabric.
@@ -19,7 +19,7 @@ def get_from_to_id_pairs(
      This function reads the from and to IDs from the specified hydrofabric and returns them as a list of tuples.

      Args:
-         hydrofabric (Path, optional): The file path to the hydrofabric. Defaults to file_paths.conus_hydrofabric.
+         hydrofabric (Path, optional): The file path to the hydrofabric. Defaults to FilePaths.conus_hydrofabric.
          ids (Set, optional): A set of IDs to filter the results. Defaults to None.
      Returns:
          List[tuple]: A list of tuples containing the from and to IDs.
@@ -96,10 +96,10 @@ def get_graph() -> ig.Graph:
      Returns:
          ig.Graph: The hydrological network graph.
      """
-     pickled_graph_path = file_paths.hydrofabric_graph
+     pickled_graph_path = FilePaths.hydrofabric_graph
      if not pickled_graph_path.exists():
          logger.debug("Graph pickle does not exist, creating a new graph.")
-         network_graph = create_graph_from_gpkg(file_paths.conus_hydrofabric)
+         network_graph = create_graph_from_gpkg(FilePaths.conus_hydrofabric)
          network_graph.write_pickle(pickled_graph_path)
      else:
          try:
data_processing/subset.py CHANGED
@@ -3,7 +3,7 @@ import os
  from pathlib import Path
  from typing import List, Union

- from data_processing.file_paths import file_paths
+ from data_processing.file_paths import FilePaths
  from data_processing.gpkg_utils import (
      add_triggers_to_gpkg,
      create_empty_gpkg,
@@ -73,7 +73,7 @@ def create_subset_gpkg(


  def subset_vpu(
-     vpu_id: str, output_gpkg_path: Path, hydrofabric: Path = file_paths.conus_hydrofabric
+     vpu_id: str, output_gpkg_path: Path, hydrofabric: Path = FilePaths.conus_hydrofabric
  ):
      if os.path.exists(output_gpkg_path):
          response = Prompt.ask(
@@ -95,7 +95,7 @@

  def subset(
      cat_ids: str | List[str],
-     hydrofabric: Path = file_paths.conus_hydrofabric,
+     hydrofabric: Path = FilePaths.conus_hydrofabric,
      output_gpkg_path: Path = Path(),
      include_outlet: bool = True,
      override_gpkg: bool = True,
@@ -106,7 +106,7 @@
          # if the name isn't provided, use the first upstream id
          upstream_ids = sorted(upstream_ids)
          output_folder_name = upstream_ids[0]
-         paths = file_paths(output_folder_name)
+         paths = FilePaths(output_folder_name)
          output_gpkg_path = paths.geopackage_path

      create_subset_gpkg(upstream_ids, hydrofabric, output_gpkg_path, override_gpkg=override_gpkg)