ngiab-data-preprocess 3.0.3__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
data_processing/file_paths.py CHANGED
@@ -8,6 +8,8 @@ class file_paths:
  """
  config_file = Path("~/.ngiab/preprocessor").expanduser()
  hydrofabric_dir = Path("~/.ngiab/hydrofabric/v2.2").expanduser()
+ hydrofabric_download_log = Path("~/.ngiab/hydrofabric/v2.2/download_log.json").expanduser()
+ no_update_hf = Path("~/.ngiab/hydrofabric/v2.2/no_update").expanduser()
  cache_dir = Path("~/.ngiab/zarr_cache").expanduser()
  output_dir = None
  data_sources = Path(__file__).parent.parent / "data_sources"
data_processing/forcings.py CHANGED
@@ -17,6 +17,15 @@ import xarray as xr
  from data_processing.file_paths import file_paths
  from data_processing.zarr_utils import get_forcing_data
  from exactextract import exact_extract
+ from exactextract.raster import NumPyRasterSource
+ from rich.progress import (
+ Progress,
+ BarColumn,
+ TextColumn,
+ TimeElapsedColumn,
+ TimeRemainingColumn,
+ )
+

  logger = logging.getLogger(__name__)
  # Suppress the specific warning from numpy to keep the cli output clean
@@ -27,6 +36,7 @@ warnings.filterwarnings(
  "ignore", message="'GeoDataFrame.swapaxes' is deprecated", category=FutureWarning
  )

+
  def weighted_sum_of_cells(flat_raster: np.ndarray, cell_ids: np.ndarray , factors: np.ndarray):
  # Create an output array initialized with zeros
  # dimensions are raster[time][x*y]
@@ -37,10 +47,17 @@ def weighted_sum_of_cells(flat_raster: np.ndarray, cell_ids: np.ndarray , factor
  return result


- def get_cell_weights(raster, gdf):
+ def get_cell_weights(raster, gdf, wkt):
  # Get the cell weights for each divide
+ xmin = raster.x[0]
+ xmax = raster.x[-1]
+ ymin = raster.y[0]
+ ymax = raster.y[-1]
+ rastersource = NumPyRasterSource(
+ raster["RAINRATE"], srs_wkt=wkt, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax
+ )
  output = exact_extract(
- raster["RAINRATE"],
+ rastersource,
  gdf,
  ["cell_id", "coverage"],
  include_cols=["divide_id"],
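
The rewritten get_cell_weights wraps the in-memory forcing grid in an exactextract NumPyRasterSource, apparently so the CRS WKT and grid bounds can be supplied explicitly rather than via rioxarray (which this release drops from Requires-Dist). A minimal sketch of that pattern with a synthetic raster and polygon (all names and values below are illustrative, not taken from the package):

    import geopandas as gpd
    import numpy as np
    from exactextract import exact_extract
    from exactextract.raster import NumPyRasterSource
    from shapely.geometry import box

    # Hypothetical 10x10 grid spanning x, y in [0, 10] and one polygon covering its corner.
    gdf = gpd.GeoDataFrame({"divide_id": ["cat-1"]}, geometry=[box(0, 0, 5, 5)], crs="EPSG:5070")
    wkt = gdf.crs.to_wkt()

    data = np.arange(100, dtype="float32").reshape(10, 10)
    raster = NumPyRasterSource(data, srs_wkt=wkt, xmin=0, xmax=10, ymin=0, ymax=10)

    # Per feature: the ids of intersected cells and the fraction of each cell covered.
    weights = exact_extract(raster, gdf, ["cell_id", "coverage"], include_cols=["divide_id"], output="pandas")
    print(weights)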
@@ -109,11 +126,11 @@ def process_chunk_shared(variable, times, shm_name, shape, dtype, chunk):

  def get_cell_weights_parallel(gdf, input_forcings, num_partitions):
  gdf_chunks = np.array_split(gdf, num_partitions)
+ wkt = gdf.crs.to_wkt()
  one_timestep = input_forcings.isel(time=0).compute()
  with multiprocessing.Pool() as pool:
- args = [(one_timestep, gdf_chunk) for gdf_chunk in gdf_chunks]
+ args = [(one_timestep, gdf_chunk, wkt) for gdf_chunk in gdf_chunks]
  catchments = pool.starmap(get_cell_weights, args)
-
  return pd.concat(catchments)


@@ -139,11 +156,28 @@ def compute_zonal_stats(
  "V2D": "VGRD_10maboveground",
  }

- results = []
  cat_chunks = np.array_split(catchments, num_partitions)
- forcing_times = merged_data.time.values

+ progress = Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ "[progress.percentage]{task.percentage:>3.0f}%",
+ TextColumn("{task.completed}/{task.total}"),
+ "•",
+ TextColumn(" Elapsed Time:"),
+ TimeElapsedColumn(),
+ TextColumn(" Remaining Time:"),
+ TimeRemainingColumn(),
+ )
+
+ timer = time.perf_counter()
+ variable_task = progress.add_task(
+ "[cyan]Processing variables...", total=len(variables), elapsed=0
+ )
+ progress.start()
  for variable in variables.keys():
+ progress.update(variable_task, advance=1)
+ progress.update(variable_task, description=f"Processing {variable}")

  if variable not in merged_data.data_vars:
  logger.warning(f"Variable {variable} not in forcings, skipping")
@@ -151,8 +185,9 @@ def compute_zonal_stats(

  # to make sure this fits in memory, we need to chunk the data
  time_chunks = get_index_chunks(merged_data[variable])
-
+ chunk_task = progress.add_task("[purple] processing chunks", total=len(time_chunks))
  for i, times in enumerate(time_chunks):
+ progress.update(chunk_task, advance=1)
  start, end = times
  # select the chunk of time we want to process
  data_chunk = merged_data[variable].isel(time=slice(start,end))
@@ -184,8 +219,14 @@ def compute_zonal_stats(
  xr.concat(datasets, dim="time").to_netcdf(forcings_dir / f"{variable}.nc")
  for file in forcings_dir.glob("temp/*.nc"):
  file.unlink()
+ progress.remove_task(chunk_task)
+ progress.update(
+ variable_task,
+ description=f"Forcings processed in {time.perf_counter() - timer:2f} seconds",
+ )
+ progress.stop()
  logger.info(
- f"Forcing generation complete! Zonal stats computed in {time.time() - timer_start} seconds"
+ f"Forcing generation complete! Zonal stats computed in {time.time() - timer_start:2f} seconds"
  )
  write_outputs(forcings_dir, variables)

data_processing/gpkg_utils.py CHANGED
@@ -32,7 +32,7 @@ def verify_indices(gpkg: str = file_paths.conus_hydrofabric) -> None:
  Verify that the indices in the specified geopackage are correct.
  If they are not, create the correct indices.
  """
- logger.info("Building database indices")
+ logger.debug("Building database indices")
  new_indicies = [
  'CREATE INDEX "diid" ON "divides" ( "divide_id" ASC );',
  'CREATE INDEX "ditid" ON "divides" ( "toid" ASC );',
@@ -55,6 +55,9 @@ def verify_indices(gpkg: str = file_paths.conus_hydrofabric) -> None:
  con = sqlite3.connect(gpkg)
  indices = con.execute("SELECT name FROM sqlite_master WHERE type = 'index'").fetchall()
  indices = [x[0] for x in indices]
+ missing = [x for x in new_indicies if x.split('"')[1] not in indices]
+ if len(missing) > 0:
+ logger.info("Creating indices")
  for index in new_indicies:
  if index.split('"')[1] not in indices:
  logger.info(f"Creating index {index}")
@@ -299,7 +302,7 @@ def subset_table(table: str, ids: List[str], hydrofabric: str, subset_gpkg_name:
  subset_gpkg_name (str): The name of the subset geopackage.
  """
  logger.info(f"Subsetting {table} in {subset_gpkg_name}")
- source_db = sqlite3.connect(hydrofabric)
+ source_db = sqlite3.connect(f"file:{hydrofabric}?mode=ro", uri=True)
  dest_db = sqlite3.connect(subset_gpkg_name)

  table_keys = {"divides": "toid", "divide-attributes": "divide_id", "lakes": "poi_id"}
data_processing/s3fs_utils.py ADDED
@@ -0,0 +1,77 @@
+ from s3fs import S3FileSystem
+ from s3fs.core import _error_wrapper, version_id_kw
+ from typing import Optional
+ import asyncio
+
+
+ class S3ParallelFileSystem(S3FileSystem):
+ """S3FileSystem subclass that supports parallel downloads"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ async def _cat_file(
+ self,
+ path: str,
+ version_id: Optional[str] = None,
+ start: Optional[int] = None,
+ end: Optional[int] = None,
+ ) -> bytes:
+ bucket, key, vers = self.split_path(path)
+ version_kw = version_id_kw(version_id or vers)
+
+ # If start/end specified, use single range request
+ if start is not None or end is not None:
+ head = {"Range": await self._process_limits(path, start, end)}
+ return await self._download_chunk(bucket, key, head, version_kw)
+
+ # For large files, use parallel downloads
+ try:
+ obj_size = (
+ await self._call_s3(
+ "head_object", Bucket=bucket, Key=key, **version_kw, **self.req_kw
+ )
+ )["ContentLength"]
+ except Exception as e:
+ # Fall back to single request if HEAD fails
+ return await self._download_chunk(bucket, key, {}, version_kw)
+
+ CHUNK_SIZE = 1 * 1024 * 1024 # 1MB chunks
+ if obj_size <= CHUNK_SIZE:
+ return await self._download_chunk(bucket, key, {}, version_kw)
+
+ # Calculate chunks for parallel download
+ chunks = []
+ for start in range(0, obj_size, CHUNK_SIZE):
+ end = min(start + CHUNK_SIZE - 1, obj_size - 1)
+ range_header = f"bytes={start}-{end}"
+ chunks.append({"Range": range_header})
+
+ # Download chunks in parallel
+ async def download_all_chunks():
+ tasks = [
+ self._download_chunk(bucket, key, chunk_head, version_kw) for chunk_head in chunks
+ ]
+ chunks_data = await asyncio.gather(*tasks)
+ return b"".join(chunks_data)
+
+ return await _error_wrapper(download_all_chunks, retries=self.retries)
+
+ async def _download_chunk(self, bucket: str, key: str, head: dict, version_kw: dict) -> bytes:
+ """Helper function to download a single chunk"""
+
+ async def _call_and_read():
+ resp = await self._call_s3(
+ "get_object",
+ Bucket=bucket,
+ Key=key,
+ **version_kw,
+ **head,
+ **self.req_kw,
+ )
+ try:
+ return await resp["Body"].read()
+ finally:
+ resp["Body"].close()
+
+ return await _error_wrapper(_call_and_read, retries=self.retries)
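
Because S3ParallelFileSystem only overrides _cat_file, it can stand in anywhere a plain s3fs.S3FileSystem is used; a minimal sketch of opening one forcing zarr store with it (the store name below is illustrative; the real list is built in load_zarr_datasets in zarr_utils.py, shown next):

    import s3fs
    import xarray as xr
    from data_processing.s3fs_utils import S3ParallelFileSystem

    # Anonymous access; per-file readahead caching disabled since most bytes are read once.
    fs = S3ParallelFileSystem(anon=True, default_cache_type="none")
    store = s3fs.S3Map("s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr", s3=fs)
    ds = xr.open_dataset(store, engine="zarr")
    print(ds)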
data_processing/zarr_utils.py CHANGED
@@ -6,16 +6,15 @@ from typing import Tuple
  import geopandas as gpd
  import numpy as np
  import s3fs
+ from data_processing.s3fs_utils import S3ParallelFileSystem
  import xarray as xr
  from dask.distributed import Client, LocalCluster, progress
  from data_processing.file_paths import file_paths
  from fsspec.mapping import FSMap

+
  logger = logging.getLogger(__name__)

- def open_s3_store(url: str) -> FSMap:
- """Open an s3 store from a given url."""
- return s3fs.S3Map(url, s3=s3fs.S3FileSystem(anon=True))

  def load_zarr_datasets() -> xr.Dataset:
  """Load zarr datasets from S3 within the specified time range."""
@@ -30,14 +29,18 @@ def load_zarr_datasets() -> xr.Dataset:
  f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/{var}.zarr"
  for var in forcing_vars
  ]
- s3_stores = [open_s3_store(url) for url in s3_urls]
- dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr")
+ # default cache is readahead which is detrimental to performance in this case
+ fs = S3ParallelFileSystem(anon=True, default_cache_type="none") # default_block_size
+ s3_stores = [s3fs.S3Map(url, s3=fs) for url in s3_urls]
+ # the cache option here just holds accessed data in memory to prevent s3 being queried multiple times
+ # most of the data is read once and written to disk but some of the coordinate data is read multiple times
+ dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)
  return dataset


  def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]:
- end_time_in_dataset = dataset.time[-1].values
- start_time_in_dataset = dataset.time[0].values
+ end_time_in_dataset = dataset.time.isel(time=-1).values
+ start_time_in_dataset = dataset.time.isel(time=0).values
  if np.datetime64(start_time) < start_time_in_dataset:
  logger.warning(
  f"provided start {start_time} is before the start of the dataset {start_time_in_dataset}, selecting from {start_time_in_dataset}"
@@ -130,11 +133,13 @@ def get_forcing_data(

  if merged_data is None:
  logger.info("Loading zarr stores")
+ # create new event loop
  lazy_store = load_zarr_datasets()
  logger.debug("Got zarr stores")
  clipped_store = clip_dataset_to_bounds(lazy_store, gdf.total_bounds, start_time, end_time)
  logger.info("Clipped forcing data to bounds")
  merged_data = compute_store(clipped_store, forcing_paths.cached_nc_file)
  logger.info("Forcing data loaded and cached")
+ # close the event loop

  return merged_data
data_sources/source_validation.py CHANGED
@@ -2,75 +2,222 @@ import gzip
  import os
  import tarfile
  import warnings
-
+ import json
+ from concurrent.futures import ThreadPoolExecutor
  import requests
  from data_processing.file_paths import file_paths
  from tqdm import TqdmExperimentalWarning
  from tqdm.rich import tqdm
+ from time import sleep
+ from rich.console import Console
+ from rich.prompt import Prompt
+ from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, SpinnerColumn, TimeRemainingColumn, DownloadColumn, TransferSpeedColumn
+ import threading
+ import psutil

  warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)

+ console = Console()

- def decompress_gzip_tar(file_path, output_dir):
- # Get the total size of the compressed file
- total_size = os.path.getsize(file_path)

+ def decompress_gzip_tar(file_path, output_dir):
+ # use rich to display "decompressing" message with a progress bar that just counts down from 30s
+ # actually measuring this is hard and it usually takes ~20s to decompress
+ progress = Progress(
+ SpinnerColumn(),
+ TextColumn("[progress.description]{task.description}"),
+ TimeElapsedColumn(),
+ )
+ task = progress.add_task("Decompressing", total=1)
+ progress.start()
  with gzip.open(file_path, "rb") as f_in:
- # Create a tqdm progress bar
- with tqdm(total=total_size, unit="MB", unit_scale=True, desc=f"Decompressing") as pbar:
- # Open the tar archive
- with tarfile.open(fileobj=f_in) as tar:
- # Extract all contents
- for member in tar:
- tar.extract(member, path=output_dir)
- # Update the progress bar
- pbar.update(member.size)
-
-
- def download_file(url, save_path):
+ with tarfile.open(fileobj=f_in) as tar:
+ # Extract all contents
+ for member in tar:
+ tar.extract(member, path=output_dir)
+ # Update the progress bar
+ progress.update(task, completed=1)
+ progress.stop()
+
+
+
+
+ def download_chunk(url, start, end, index, save_path):
+ headers = {"Range": f"bytes={start}-{end}"}
+ response = requests.get(url, headers=headers, stream=True)
+ chunk_path = f"{save_path}.part{index}"
+ with open(chunk_path, "wb") as f_out:
+ for chunk in response.iter_content(chunk_size=8192):
+ if chunk:
+ f_out.write(chunk)
+ return chunk_path
+
+ def download_progress_estimate(progress, task, total_size):
+ network_bytes_start = psutil.net_io_counters().bytes_recv
+ # make a new progress bar that will be updated by a separate thread
+ progress.start()
+ interval = 0.5
+ while not progress.finished:
+ current_downloaded = psutil.net_io_counters().bytes_recv
+ total_downloaded = current_downloaded - network_bytes_start
+ progress.update(task, completed=total_downloaded)
+ sleep(interval)
+ if total_downloaded >= total_size or progress.finished:
+ break
+ progress.stop()
+
+
+
+ def download_file(url, save_path, num_threads=150):
  if not os.path.exists(os.path.dirname(save_path)):
  os.makedirs(os.path.dirname(save_path))
- response = requests.get(url, stream=True)
+
+ response = requests.head(url)
  total_size = int(response.headers.get("content-length", 0))
- bytes_downloaded = 0
- chunk_size = 1048576
- with open(save_path, "wb") as f:
- for data in tqdm(
- response.iter_content(chunk_size=chunk_size),
- total=total_size / chunk_size,
- unit="B",
- unit_scale=True,
- desc=f"Downloading",
- ):
- bytes_downloaded += len(data)
- f.write(data)
+ chunk_size = total_size // num_threads
+
+ progress = Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ DownloadColumn(),
+ TransferSpeedColumn(),
+ TextColumn(" Elapsed Time:"),
+ TimeElapsedColumn(),
+ TextColumn(" Remaining Time:"),
+ TimeRemainingColumn(),
+ )
+ task = progress.add_task("Downloading", total=total_size)
+
+ download_progress_thread = threading.Thread(target=download_progress_estimate, args=(progress, task ,total_size))
+ download_progress_thread.start()
+
+ with ThreadPoolExecutor(max_workers=num_threads) as executor:
+ futures = []
+ for i in range(num_threads):
+ start = i * chunk_size
+ end = start + chunk_size - 1 if i < num_threads - 1 else total_size - 1
+ futures.append(executor.submit(download_chunk, url, start, end, i, save_path))
+
+ chunk_paths = [
+ future.result() for future in futures
+ ]
+
+
+ with open(save_path, "wb") as f_out:
+ for chunk_path in chunk_paths:
+ with open(chunk_path, "rb") as f_in:
+ f_out.write(f_in.read())
+ os.remove(chunk_path)
+
+ progress.update(task, completed=total_size)
+ download_progress_thread.join()
+
+
+ hydrofabric_url = "https://communityhydrofabric.s3.us-east-1.amazonaws.com/hydrofabrics/community/conus_nextgen.tar.gz"
+

+ def get_headers():
+ # for versioning
+ # Useful Headers: { 'Last-Modified': 'Wed, 20 Nov 2024 18:45:59 GMT', 'ETag': '"cc1452838886a7ab3065a61073fa991b-207"'}
+ response = requests.head(hydrofabric_url)
+ return response.status_code, response.headers

- hydrofabric_url = "https://communityhydrofabric.s3.us-east-1.amazonaws.com/conus_nextgen.gpkg"
+
+ def download_and_update_hf():
+ download_file(hydrofabric_url, file_paths.conus_hydrofabric.with_suffix(".tar.gz"))
+ status, headers = get_headers()
+
+ if status == 200:
+ # write headers to a file
+ with open(file_paths.hydrofabric_download_log, "w") as f:
+ json.dump(dict(headers), f)
+
+ decompress_gzip_tar(
+ file_paths.conus_hydrofabric.with_suffix(".tar.gz"),
+ file_paths.conus_hydrofabric.parent,
+ )


  def validate_hydrofabric():
  if not file_paths.conus_hydrofabric.is_file():
- # alert the user that the hydrofabric is missing
- print("Hydrofabric is missing. Would you like to download it now? (Y/n)")
- response = input()
- if response == "" or response.lower() == "y":
- download_file(hydrofabric_url, file_paths.conus_hydrofabric)
+ response = Prompt.ask(
+ "Hydrofabric is missing. Would you like to download it now?",
+ default="y",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ download_and_update_hf()
  else:
- print("Exiting...")
+ console.print("Exiting...", style="bold red")
  exit()

+ if file_paths.no_update_hf.exists():
+ # skip the updates
+ return
+
+ if not file_paths.hydrofabric_download_log.is_file():
+ response = Prompt.ask(
+ "Hydrofabric version information unavailable, Would you like to fetch the updated version?",
+ default="y",
+ style="bold yellow",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ download_and_update_hf()
+ else:
+ console.print("Continuing... ", style="bold yellow")
+ console.print(
+ f"To disable this warning, create an empty file called {file_paths.no_update_hf.resolve()}",
+ style="bold yellow",
+ )
+ sleep(2)
+ return
+
+ with open(file_paths.hydrofabric_download_log, "r") as f:
+ content = f.read()
+ headers = json.loads(content)
+
+ status, latest_headers = get_headers()
+
+ if status != 200:
+ console.print(
+ "Unable to contact servers, proceeding without updating hydrofabric", style="bold red"
+ )
+ sleep(2)
+
+ if headers.get("ETag", "") != latest_headers.get("ETag", ""):
+ console.print("Local and remote Hydrofabric Differ", style="bold yellow")
+ console.print(
+ f"Local last updated at {headers.get('Last-Modified', 'NA')}, remote last updated at {latest_headers.get('Last-Modified', 'NA')}",
+ style="bold yellow",
+ )
+ response = Prompt.ask(
+ "Would you like to fetch the updated version?",
+ default="y",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ download_and_update_hf()
+ else:
+ console.print("Continuing... ", style="bold yellow")
+ console.print(
+ f"To disable this warning, create an empty file called {file_paths.no_update_hf.resolve()}",
+ style="bold yellow",
+ )
+ sleep(2)
+ return
+

  def validate_output_dir():
  if not file_paths.config_file.is_file():
- # prompt the user to set the working directory
- print(
- "Output directory is not set. Would you like to set it now? Defaults to ~/ngiab_preprocess_output/ (y/N)"
+ response = Prompt.ask(
+ "Output directory is not set. Would you like to use the default? ~/ngiab_preprocess_output/",
+ default="y",
+ choices=["y", "n"],
  )
- response = input()
- if response.lower() == "y":
- response = input("Enter the path to the working directory: ")
- if response == "" or response.lower() == "n":
+ if response.lower() == "n":
+ response = Prompt.ask("Enter the path to the working directory")
+ if response == "" or response.lower() == "y":
  response = "~/ngiab_preprocess_output/"
  file_paths.set_working_dir(response)

@@ -78,3 +225,7 @@ def validate_output_dir():
  def validate_all():
  validate_hydrofabric()
  validate_output_dir()
+
+
+ if __name__ == "__main__":
+ validate_all()
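
The new download_file above splits the file into num_threads contiguous byte ranges, one HTTP Range request per worker, with the last range absorbing any remainder; a quick check of that arithmetic with hypothetical numbers:

    def byte_ranges(total_size: int, num_threads: int):
        # Mirrors the range arithmetic in download_file above.
        chunk_size = total_size // num_threads
        ranges = []
        for i in range(num_threads):
            start = i * chunk_size
            end = start + chunk_size - 1 if i < num_threads - 1 else total_size - 1
            ranges.append((start, end))
        return ranges

    # A hypothetical 10-byte file split across 3 workers tiles bytes 0-9 exactly.
    print(byte_ranges(10, 3))  # [(0, 2), (3, 5), (6, 9)]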
data_sources/template.sql CHANGED
@@ -151,6 +151,7 @@ CREATE TABLE IF NOT EXISTS "hydrolocations" (
  "hl_x" REAL,
  "hl_y" REAL,
  "vpuid" TEXT,
+ "geom" POINT,
  PRIMARY KEY("fid" AUTOINCREMENT)
  );
  CREATE TABLE IF NOT EXISTS "flowpath-attributes" (
@@ -291,7 +292,7 @@ INSERT INTO "gpkg_contents" VALUES
  -- ('divides','features','divides','','2024-10-02T21:40:02.814Z',-2356125.0012,209715.0003,2258234.9955,3506235.0003,5070),
  -- ('lakes','features','lakes','','2024-10-02T21:40:03.033Z',-2306232.84864919,329124.789725057,2240264.19930738,3149850.04714446,5070),
  ('pois','attributes','pois','','2024-10-02T21:40:34.220Z',NULL,NULL,NULL,NULL,0),
- ('hydrolocations','attributes','hydrolocations','','2024-10-02T21:40:35.451Z',NULL,NULL,NULL,NULL,0),
+ -- ('hydrolocations','attributes','hydrolocations','','2024-10-02T21:40:35.451Z',NULL,NULL,NULL,NULL,0),
  ('flowpath-attributes','attributes','flowpath-attributes','','2024-10-02T21:40:43.663Z',NULL,NULL,NULL,NULL,0),
  ('flowpath-attributes-ml','attributes','flowpath-attributes-ml','','2024-10-02T21:40:53.358Z',NULL,NULL,NULL,NULL,0),
  ('network','attributes','network','','2024-10-02T21:42:24.445Z',NULL,NULL,NULL,NULL,0),
@@ -300,9 +301,11 @@ INSERT INTO "gpkg_contents" VALUES
  INSERT INTO "gpkg_geometry_columns" VALUES ('flowpaths','geom','GEOMETRY',5070,0,0),
  ('divides','geom','POLYGON',5070,0,0),
  ('lakes','geom','POINT',5070,0,0),
- ('nexus','geom','POINT',5070,0,0);
+ ('nexus','geom','POINT',5070,0,0),
+ ('hydrolocations','geom','POINT',5070,0,0);
  INSERT INTO "gpkg_extensions" VALUES ('flowpaths','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
  ('divides','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
  ('lakes','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
- ('nexus','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only');
+ ('nexus','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
+ ('hydrolocations','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only');
  COMMIT;
data_sources/triggers.sql CHANGED
@@ -51,4 +51,40 @@ CREATE TRIGGER "rtree_nexus_geom_delete" AFTER DELETE ON "nexus" WHEN old."geom"
  CREATE TRIGGER "trigger_insert_feature_count_nexus" AFTER INSERT ON "nexus" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count + 1 WHERE lower(table_name) = lower('nexus'); END;
  CREATE TRIGGER "trigger_delete_feature_count_nexus" AFTER DELETE ON "nexus" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count - 1 WHERE lower(table_name) = lower('nexus'); END;
  CREATE TRIGGER "trigger_insert_feature_count_divide-attributes" AFTER INSERT ON "divide-attributes" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count + 1 WHERE lower(table_name) = lower('divide-attributes'); END;
- CREATE TRIGGER "trigger_delete_feature_count_divide-attributes" AFTER DELETE ON "divide-attributes" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count - 1 WHERE lower(table_name) = lower('divide-attributes'); END;
+ CREATE TRIGGER "trigger_delete_feature_count_divide-attributes" AFTER DELETE ON "divide-attributes" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count - 1 WHERE lower(table_name) = lower('divide-attributes'); END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_insert"
+ AFTER INSERT ON "hydrolocations"
+ WHEN (new."geom" NOT NULL AND NOT ST_IsEmpty(NEW."geom"))
+ BEGIN
+ INSERT OR REPLACE INTO "rtree_hydrolocations_geom" VALUES (NEW.ROWID, ST_MinX(NEW."geom"), ST_MaxX(NEW."geom"), ST_MinY(NEW."geom"), ST_MaxY(NEW."geom"));
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update1"
+ AFTER UPDATE OF "geom" ON "hydrolocations"
+ WHEN OLD.ROWID = NEW.ROWID AND (NEW."geom" NOT NULL AND NOT ST_IsEmpty(NEW."geom"))
+ BEGIN
+ INSERT OR REPLACE INTO "rtree_hydrolocations_geom" VALUES (NEW.ROWID, ST_MinX(NEW."geom"), ST_MaxX(NEW."geom"), ST_MinY(NEW."geom"), ST_MaxY(NEW."geom"));
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update2"
+ AFTER UPDATE OF "geom" ON "hydrolocations"
+ WHEN OLD.ROWID = NEW.ROWID AND (NEW."geom" IS NULL OR ST_IsEmpty(NEW."geom"))
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id = OLD.ROWID;
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update3"
+ AFTER UPDATE OF "geom" ON "hydrolocations"
+ WHEN OLD.ROWID != NEW.ROWID AND (NEW."geom" NOT NULL AND NOT ST_IsEmpty(NEW."geom"))
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id = OLD.ROWID;
+ INSERT OR REPLACE INTO "rtree_hydrolocations_geom" VALUES (NEW.ROWID, ST_MinX(NEW."geom"), ST_MaxX(NEW."geom"), ST_MinY(NEW."geom"), ST_MaxY(NEW."geom"));
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update4"
+ AFTER UPDATE ON "hydrolocations"
+ WHEN OLD.ROWID != NEW.ROWID AND (NEW."geom" IS NULL OR ST_IsEmpty(NEW."geom"))
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id IN (OLD.ROWID, NEW.ROWID);
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_delete"
+ AFTER DELETE ON "hydrolocations"WHEN old."geom" NOT NULL
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id = OLD.ROWID;
+ END;
ngiab_data_cli/__main__.py CHANGED
@@ -1,21 +1,22 @@
- import argparse
- import logging
- import time
- from typing import List
- import subprocess
-
- from dask.distributed import Client
-
- from data_processing.file_paths import file_paths
- from data_processing.gpkg_utils import get_catid_from_point, get_cat_from_gage_id
- from data_processing.subset import subset
- from data_processing.forcings import create_forcings
- from data_processing.create_realization import create_realization, create_dd_realization
- from data_sources.source_validation import validate_all
-
- from ngiab_data_cli.custom_logging import setup_logging, set_logging_to_critical_only
- from ngiab_data_cli.arguments import parse_arguments
-
+ import rich.status
+
+ # add a status bar for these imports so the cli feels more responsive
+ with rich.status.Status("Initializing...") as status:
+ from data_sources.source_validation import validate_all
+ from ngiab_data_cli.custom_logging import setup_logging, set_logging_to_critical_only
+ from ngiab_data_cli.arguments import parse_arguments
+ from data_processing.file_paths import file_paths
+ import argparse
+ import logging
+ import time
+ from typing import List
+ import subprocess
+ import time
+ from dask.distributed import Client
+ from data_processing.gpkg_utils import get_catid_from_point, get_cat_from_gage_id
+ from data_processing.subset import subset
+ from data_processing.forcings import create_forcings
+ from data_processing.create_realization import create_realization, create_dd_realization

  def validate_input(args: argparse.Namespace) -> None:
  """Validate input arguments."""
@@ -119,7 +120,7 @@ def main() -> None:
  cat_to_subset, output_folder = validate_input(args)
  paths = file_paths(output_folder)
  args = set_dependent_flags(args, paths) # --validate
- logging.info(f"Using output folder: {paths.subset_dir}")
+ logging.info(f"Using output folder: {paths.subset_dir}")

  if args.subset:
  logging.info(f"Subsetting hydrofabric")
ngiab_data_preprocess-3.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ngiab_data_preprocess
- Version: 3.0.3
+ Version: 3.1.0
  Summary: Graphical Tools for creating Next Gen Water model input data.
  Author-email: Josh Cunningham <jcunningham8@ua.edu>
  Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -19,7 +19,6 @@ Requires-Dist: requests==2.32.2
  Requires-Dist: igraph==0.11.4
  Requires-Dist: s3fs==2024.3.1
  Requires-Dist: xarray==2024.2.0
- Requires-Dist: rioxarray==0.15.1
  Requires-Dist: zarr==2.17.1
  Requires-Dist: netCDF4==1.6.5
  Requires-Dist: dask==2024.4.1
ngiab_data_preprocess-3.1.0.dist-info/RECORD CHANGED
@@ -1,10 +1,11 @@
  data_processing/create_realization.py,sha256=2-w-TfJ6e5SFYchDZNAlOIEVK_iP79-EUC_jQ2Un1jk,10893
- data_processing/file_paths.py,sha256=QWjtRDSqJi8Cu0_EG_sssrxmdJBaz-hMfng5CJxyhf8,4005
- data_processing/forcings.py,sha256=bK1o7PTBXPUYCIK1hT-ccBpOxygrlYIsaCtdGmQugNM,11154
- data_processing/gpkg_utils.py,sha256=SJ3IHTGnI6nLIw78Am9wrNO3jZ1OOpuyjkpBvQlnLko,17098
+ data_processing/file_paths.py,sha256=jyiN3hCK3H_Upt8C4NoTInMrpcCZaTflAv41Oh1K6a8,4177
+ data_processing/forcings.py,sha256=ADQOgCWUMGuiPvD5WUnJ94CqILM6akwE-xUlmckksik,12542
+ data_processing/gpkg_utils.py,sha256=pMmuJT_iHSt7Caw_JMxui5DtIzvL8HSseAelGgB41_I,17266
  data_processing/graph_utils.py,sha256=uN2MoUFHQoeQEqYtf3o45QrRaTgqKoi0n3ZrB6XOh6Y,7611
+ data_processing/s3fs_utils.py,sha256=DisX_PqIPn48EltmE85m4hJdcxaC6r4Mb-deptRT9O0,2752
  data_processing/subset.py,sha256=bXDDoDmp8TtfbHvx0rztNWDmqiri8LeSkn7hBe4kb-4,2876
- data_processing/zarr_utils.py,sha256=kx8M_9QZ6IwdLEmPn3LzRDi01JkKMDDBBukuXaOjkDo,5641
+ data_processing/zarr_utils.py,sha256=F-ukVuJFXamgU77k0e8lSfUPA-AOd2Zq8uTWADfykxA,6047
  data_sources/cfe-nowpm-realization-template.json,sha256=jBv6jGuHtAVFC7X2KaUdvxrhCopgnBiCuW3pS8Jng0w,3541
  data_sources/cfe-template.ini,sha256=SGq7bShD0Z83C0n4mztmzz3GnFdP_uJXPhheOizNpzc,1973
  data_sources/dd-catchment-template.yml,sha256=7lzpUB0o4bFuD4qgFnsNOUw-ZtwxZ_dNTMhB6sDMklA,292
@@ -13,9 +14,9 @@ data_sources/dd-realization-template.json,sha256=xt3BgzAEqn3eERO1lODdWdbV0T9UuQG
  data_sources/forcing_template.nc,sha256=uRuVAqX3ngdlougZINavtwl_wC2VLD8fHqG7_CLim1s,85284
  data_sources/ngen-routing-template.yaml,sha256=8xTTLRHAbXS0QN6C0cO0Mt_orwqKpD2LWz9Bq3sQGuA,4649
  data_sources/noah-owp-modular-init.namelist.input,sha256=ssEcr_hPfRmslcpXbKJqzas4aSuDY-qd_k6zfxKbvhA,3045
- data_sources/source_validation.py,sha256=5i0fM0ejxQTDH-B1TIk45_fqenypCDdFHYhVfzsCdC0,2685
- data_sources/template.sql,sha256=5qr3FsaxtPGxmLBM2Z0UFynql37A0oCBxvk_YrExIuE,10281
- data_sources/triggers.sql,sha256=-UQej1rjao8N4IlWN4S3ZfLWLUOg5VL1t5pbwIqttsQ,13172
+ data_sources/source_validation.py,sha256=G_qrh6PaCgZ6wgPJ3UdE2lAQQhSPMEYGTVbyXax2J4M,7872
+ data_sources/template.sql,sha256=ZnFqAqleEq9wgmAhNO90Wue_L9k0JAn8KF99DYtcxgs,10457
+ data_sources/triggers.sql,sha256=G0d_175eNsamKAFhsbphPATvzMPuPL_iCleIhlToduQ,14906
  map_app/__main__.py,sha256=m9UpLD0oihMeJa2nTPewDYda-vpm3aP1_AZOhp6SuQk,2351
  map_app/views.py,sha256=Az5BLXXlbTWzGA7y_vLHWQi-aAUE13X_YuwUr-fkz_w,4183
  map_app/static/css/console.css,sha256=xN6G2MMFyKc9YW9HEVpUUTUjx2o2nokBR4nCX5c18UM,803
@@ -28,11 +29,11 @@ map_app/static/resources/light-style.json,sha256=DaE52qKpAkjiWSKY_z7LxreqA2rW4Zy
  map_app/static/resources/loading.gif,sha256=ggdkZf1AD7rSwIpSJwfiIqANgmVV1WHlxGuKxQKv7uY,72191
  map_app/static/resources/screenshot.png,sha256=-sl_R9_WJShjZ52Djz5ZxLxbsh1VgdCeHmPGebIxEOA,1412174
  map_app/templates/index.html,sha256=3TFbV0dO33UrYNps2CTLocEN6Z-DHDfQTvwFfRpOY0I,6005
- ngiab_data_cli/__main__.py,sha256=b2bHuzMGNiA5idx_PYuurup4PzrjLQTUCi5jUqop-5Q,8419
+ ngiab_data_cli/__main__.py,sha256=xNkZL2YziMYznAMGom-8gvbFTaHwYA96cSkSc5RmtfM,8639
  ngiab_data_cli/arguments.py,sha256=6CkA9_-hGPqj0yedhcf9G2DbWgJcn2t3oLodKWY7r-E,3402
  ngiab_data_cli/custom_logging.py,sha256=iS2XozaxudcxQj17qAsrCgbVK9LJAYAPmarJuVWJo1k,1280
- ngiab_data_preprocess-3.0.3.dist-info/LICENSE,sha256=6dMSprwwnsRzEm02mEDbKHD9dUbL8bPIt9Vhrhb0Ulk,1081
- ngiab_data_preprocess-3.0.3.dist-info/METADATA,sha256=uwbrkIA6PsGnyjvmk8k4aiXriF5nIvh5eMNyg9gXOIo,9020
- ngiab_data_preprocess-3.0.3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- ngiab_data_preprocess-3.0.3.dist-info/top_level.txt,sha256=CjhYAUZrdveR2fOK6rxffU09VIN2IuPD7hk4V3l3pV0,52
- ngiab_data_preprocess-3.0.3.dist-info/RECORD,,
+ ngiab_data_preprocess-3.1.0.dist-info/LICENSE,sha256=6dMSprwwnsRzEm02mEDbKHD9dUbL8bPIt9Vhrhb0Ulk,1081
+ ngiab_data_preprocess-3.1.0.dist-info/METADATA,sha256=vlVgzkAnT36hXGBDZmoha91BLSESovcKkkTMwfdVQfM,8987
+ ngiab_data_preprocess-3.1.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ ngiab_data_preprocess-3.1.0.dist-info/top_level.txt,sha256=CjhYAUZrdveR2fOK6rxffU09VIN2IuPD7hk4V3l3pV0,52
+ ngiab_data_preprocess-3.1.0.dist-info/RECORD,,