ngiab-data-preprocess 3.0.3__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
data_processing/file_paths.py CHANGED
@@ -8,6 +8,8 @@ class file_paths:
  """
  config_file = Path("~/.ngiab/preprocessor").expanduser()
  hydrofabric_dir = Path("~/.ngiab/hydrofabric/v2.2").expanduser()
+ hydrofabric_download_log = Path("~/.ngiab/hydrofabric/v2.2/download_log.json").expanduser()
+ no_update_hf = Path("~/.ngiab/hydrofabric/v2.2/no_update").expanduser()
  cache_dir = Path("~/.ngiab/zarr_cache").expanduser()
  output_dir = None
  data_sources = Path(__file__).parent.parent / "data_sources"
data_processing/forcings.py CHANGED
@@ -17,6 +17,15 @@ import xarray as xr
  from data_processing.file_paths import file_paths
  from data_processing.zarr_utils import get_forcing_data
  from exactextract import exact_extract
+ from exactextract.raster import NumPyRasterSource
+ from rich.progress import (
+ Progress,
+ BarColumn,
+ TextColumn,
+ TimeElapsedColumn,
+ TimeRemainingColumn,
+ )
+

  logger = logging.getLogger(__name__)
  # Suppress the specific warning from numpy to keep the cli output clean
@@ -27,6 +36,7 @@ warnings.filterwarnings(
  "ignore", message="'GeoDataFrame.swapaxes' is deprecated", category=FutureWarning
  )

+
  def weighted_sum_of_cells(flat_raster: np.ndarray, cell_ids: np.ndarray , factors: np.ndarray):
  # Create an output array initialized with zeros
  # dimensions are raster[time][x*y]
@@ -37,10 +47,17 @@ def weighted_sum_of_cells(flat_raster: np.ndarray, cell_ids: np.ndarray , factor
  return result


- def get_cell_weights(raster, gdf):
+ def get_cell_weights(raster, gdf, wkt):
  # Get the cell weights for each divide
+ xmin = raster.x[0]
+ xmax = raster.x[-1]
+ ymin = raster.y[0]
+ ymax = raster.y[-1]
+ rastersource = NumPyRasterSource(
+ raster["RAINRATE"], srs_wkt=wkt, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax
+ )
  output = exact_extract(
- raster["RAINRATE"],
+ rastersource,
  gdf,
  ["cell_id", "coverage"],
  include_cols=["divide_id"],
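
The rewritten get_cell_weights wraps the in-memory forcing grid in an exactextract NumPyRasterSource, apparently so the CRS WKT and grid bounds can be supplied explicitly rather than via rioxarray (which this release drops from Requires-Dist). A minimal sketch of that pattern with a synthetic raster and polygon (all names and values below are illustrative, not taken from the package):

    import geopandas as gpd
    import numpy as np
    from exactextract import exact_extract
    from exactextract.raster import NumPyRasterSource
    from shapely.geometry import box

    # Hypothetical 10x10 grid spanning x, y in [0, 10] and one polygon covering its corner.
    gdf = gpd.GeoDataFrame({"divide_id": ["cat-1"]}, geometry=[box(0, 0, 5, 5)], crs="EPSG:5070")
    wkt = gdf.crs.to_wkt()

    data = np.arange(100, dtype="float32").reshape(10, 10)
    raster = NumPyRasterSource(data, srs_wkt=wkt, xmin=0, xmax=10, ymin=0, ymax=10)

    # Per feature: the ids of intersected cells and the fraction of each cell covered.
    weights = exact_extract(raster, gdf, ["cell_id", "coverage"], include_cols=["divide_id"], output="pandas")
    print(weights)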
@@ -109,11 +126,11 @@ def process_chunk_shared(variable, times, shm_name, shape, dtype, chunk):

  def get_cell_weights_parallel(gdf, input_forcings, num_partitions):
  gdf_chunks = np.array_split(gdf, num_partitions)
+ wkt = gdf.crs.to_wkt()
  one_timestep = input_forcings.isel(time=0).compute()
  with multiprocessing.Pool() as pool:
- args = [(one_timestep, gdf_chunk) for gdf_chunk in gdf_chunks]
+ args = [(one_timestep, gdf_chunk, wkt) for gdf_chunk in gdf_chunks]
  catchments = pool.starmap(get_cell_weights, args)
-
  return pd.concat(catchments)


@@ -139,11 +156,28 @@ def compute_zonal_stats(
  "V2D": "VGRD_10maboveground",
  }

- results = []
  cat_chunks = np.array_split(catchments, num_partitions)
- forcing_times = merged_data.time.values

+ progress = Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ "[progress.percentage]{task.percentage:>3.0f}%",
+ TextColumn("{task.completed}/{task.total}"),
+ "•",
+ TextColumn(" Elapsed Time:"),
+ TimeElapsedColumn(),
+ TextColumn(" Remaining Time:"),
+ TimeRemainingColumn(),
+ )
+
+ timer = time.perf_counter()
+ variable_task = progress.add_task(
+ "[cyan]Processing variables...", total=len(variables), elapsed=0
+ )
+ progress.start()
  for variable in variables.keys():
+ progress.update(variable_task, advance=1)
+ progress.update(variable_task, description=f"Processing {variable}")

  if variable not in merged_data.data_vars:
  logger.warning(f"Variable {variable} not in forcings, skipping")
@@ -151,8 +185,9 @@ def compute_zonal_stats(

  # to make sure this fits in memory, we need to chunk the data
  time_chunks = get_index_chunks(merged_data[variable])
-
+ chunk_task = progress.add_task("[purple] processing chunks", total=len(time_chunks))
  for i, times in enumerate(time_chunks):
+ progress.update(chunk_task, advance=1)
  start, end = times
  # select the chunk of time we want to process
  data_chunk = merged_data[variable].isel(time=slice(start,end))
@@ -184,8 +219,14 @@ def compute_zonal_stats(
  xr.concat(datasets, dim="time").to_netcdf(forcings_dir / f"{variable}.nc")
  for file in forcings_dir.glob("temp/*.nc"):
  file.unlink()
+ progress.remove_task(chunk_task)
+ progress.update(
+ variable_task,
+ description=f"Forcings processed in {time.perf_counter() - timer:2f} seconds",
+ )
+ progress.stop()
  logger.info(
- f"Forcing generation complete! Zonal stats computed in {time.time() - timer_start} seconds"
+ f"Forcing generation complete! Zonal stats computed in {time.time() - timer_start:2f} seconds"
  )
  write_outputs(forcings_dir, variables)

data_processing/gpkg_utils.py CHANGED
@@ -32,7 +32,7 @@ def verify_indices(gpkg: str = file_paths.conus_hydrofabric) -> None:
  Verify that the indices in the specified geopackage are correct.
  If they are not, create the correct indices.
  """
- logger.info("Building database indices")
+ logger.debug("Building database indices")
  new_indicies = [
  'CREATE INDEX "diid" ON "divides" ( "divide_id" ASC );',
  'CREATE INDEX "ditid" ON "divides" ( "toid" ASC );',
@@ -55,6 +55,9 @@ def verify_indices(gpkg: str = file_paths.conus_hydrofabric) -> None:
  con = sqlite3.connect(gpkg)
  indices = con.execute("SELECT name FROM sqlite_master WHERE type = 'index'").fetchall()
  indices = [x[0] for x in indices]
+ missing = [x for x in new_indicies if x.split('"')[1] not in indices]
+ if len(missing) > 0:
+ logger.info("Creating indices")
  for index in new_indicies:
  if index.split('"')[1] not in indices:
  logger.info(f"Creating index {index}")
@@ -299,7 +302,7 @@ def subset_table(table: str, ids: List[str], hydrofabric: str, subset_gpkg_name:
  subset_gpkg_name (str): The name of the subset geopackage.
  """
  logger.info(f"Subsetting {table} in {subset_gpkg_name}")
- source_db = sqlite3.connect(hydrofabric)
+ source_db = sqlite3.connect(f"file:{hydrofabric}?mode=ro", uri=True)
  dest_db = sqlite3.connect(subset_gpkg_name)

  table_keys = {"divides": "toid", "divide-attributes": "divide_id", "lakes": "poi_id"}
data_processing/s3fs_utils.py ADDED
@@ -0,0 +1,77 @@
+ from s3fs import S3FileSystem
+ from s3fs.core import _error_wrapper, version_id_kw
+ from typing import Optional
+ import asyncio
+
+
+ class S3ParallelFileSystem(S3FileSystem):
+ """S3FileSystem subclass that supports parallel downloads"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ async def _cat_file(
+ self,
+ path: str,
+ version_id: Optional[str] = None,
+ start: Optional[int] = None,
+ end: Optional[int] = None,
+ ) -> bytes:
+ bucket, key, vers = self.split_path(path)
+ version_kw = version_id_kw(version_id or vers)
+
+ # If start/end specified, use single range request
+ if start is not None or end is not None:
+ head = {"Range": await self._process_limits(path, start, end)}
+ return await self._download_chunk(bucket, key, head, version_kw)
+
+ # For large files, use parallel downloads
+ try:
+ obj_size = (
+ await self._call_s3(
+ "head_object", Bucket=bucket, Key=key, **version_kw, **self.req_kw
+ )
+ )["ContentLength"]
+ except Exception as e:
+ # Fall back to single request if HEAD fails
+ return await self._download_chunk(bucket, key, {}, version_kw)
+
+ CHUNK_SIZE = 1 * 1024 * 1024 # 1MB chunks
+ if obj_size <= CHUNK_SIZE:
+ return await self._download_chunk(bucket, key, {}, version_kw)
+
+ # Calculate chunks for parallel download
+ chunks = []
+ for start in range(0, obj_size, CHUNK_SIZE):
+ end = min(start + CHUNK_SIZE - 1, obj_size - 1)
+ range_header = f"bytes={start}-{end}"
+ chunks.append({"Range": range_header})
+
+ # Download chunks in parallel
+ async def download_all_chunks():
+ tasks = [
+ self._download_chunk(bucket, key, chunk_head, version_kw) for chunk_head in chunks
+ ]
+ chunks_data = await asyncio.gather(*tasks)
+ return b"".join(chunks_data)
+
+ return await _error_wrapper(download_all_chunks, retries=self.retries)
+
+ async def _download_chunk(self, bucket: str, key: str, head: dict, version_kw: dict) -> bytes:
+ """Helper function to download a single chunk"""
+
+ async def _call_and_read():
+ resp = await self._call_s3(
+ "get_object",
+ Bucket=bucket,
+ Key=key,
+ **version_kw,
+ **head,
+ **self.req_kw,
+ )
+ try:
+ return await resp["Body"].read()
+ finally:
+ resp["Body"].close()
+
+ return await _error_wrapper(_call_and_read, retries=self.retries)
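
Because S3ParallelFileSystem only overrides _cat_file, it can stand in anywhere a plain s3fs.S3FileSystem is used; a minimal sketch of opening one forcing zarr store with it (the store name below is illustrative; the real list is built in load_zarr_datasets in zarr_utils.py, shown next):

    import s3fs
    import xarray as xr
    from data_processing.s3fs_utils import S3ParallelFileSystem

    # Anonymous access; per-file readahead caching disabled since most bytes are read once.
    fs = S3ParallelFileSystem(anon=True, default_cache_type="none")
    store = s3fs.S3Map("s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/precip.zarr", s3=fs)
    ds = xr.open_dataset(store, engine="zarr")
    print(ds)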
data_processing/zarr_utils.py CHANGED
@@ -6,16 +6,15 @@ from typing import Tuple
  import geopandas as gpd
  import numpy as np
  import s3fs
+ from data_processing.s3fs_utils import S3ParallelFileSystem
  import xarray as xr
  from dask.distributed import Client, LocalCluster, progress
  from data_processing.file_paths import file_paths
  from fsspec.mapping import FSMap

+
  logger = logging.getLogger(__name__)

- def open_s3_store(url: str) -> FSMap:
- """Open an s3 store from a given url."""
- return s3fs.S3Map(url, s3=s3fs.S3FileSystem(anon=True))

  def load_zarr_datasets() -> xr.Dataset:
  """Load zarr datasets from S3 within the specified time range."""
@@ -30,14 +29,18 @@ def load_zarr_datasets() -> xr.Dataset:
  f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/{var}.zarr"
  for var in forcing_vars
  ]
- s3_stores = [open_s3_store(url) for url in s3_urls]
- dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr")
+ # default cache is readahead which is detrimental to performance in this case
+ fs = S3ParallelFileSystem(anon=True, default_cache_type="none") # default_block_size
+ s3_stores = [s3fs.S3Map(url, s3=fs) for url in s3_urls]
+ # the cache option here just holds accessed data in memory to prevent s3 being queried multiple times
+ # most of the data is read once and written to disk but some of the coordinate data is read multiple times
+ dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)
  return dataset


  def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]:
- end_time_in_dataset = dataset.time[-1].values
- start_time_in_dataset = dataset.time[0].values
+ end_time_in_dataset = dataset.time.isel(time=-1).values
+ start_time_in_dataset = dataset.time.isel(time=0).values
  if np.datetime64(start_time) < start_time_in_dataset:
  logger.warning(
  f"provided start {start_time} is before the start of the dataset {start_time_in_dataset}, selecting from {start_time_in_dataset}"
@@ -130,11 +133,13 @@ def get_forcing_data(

  if merged_data is None:
  logger.info("Loading zarr stores")
+ # create new event loop
  lazy_store = load_zarr_datasets()
  logger.debug("Got zarr stores")
  clipped_store = clip_dataset_to_bounds(lazy_store, gdf.total_bounds, start_time, end_time)
  logger.info("Clipped forcing data to bounds")
  merged_data = compute_store(clipped_store, forcing_paths.cached_nc_file)
  logger.info("Forcing data loaded and cached")
+ # close the event loop

  return merged_data
data_sources/source_validation.py CHANGED
@@ -2,75 +2,222 @@ import gzip
  import os
  import tarfile
  import warnings
-
+ import json
+ from concurrent.futures import ThreadPoolExecutor
  import requests
  from data_processing.file_paths import file_paths
  from tqdm import TqdmExperimentalWarning
  from tqdm.rich import tqdm
+ from time import sleep
+ from rich.console import Console
+ from rich.prompt import Prompt
+ from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, SpinnerColumn, TimeRemainingColumn, DownloadColumn, TransferSpeedColumn
+ import threading
+ import psutil

  warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)

+ console = Console()

- def decompress_gzip_tar(file_path, output_dir):
- # Get the total size of the compressed file
- total_size = os.path.getsize(file_path)

+ def decompress_gzip_tar(file_path, output_dir):
+ # use rich to display "decompressing" message with a progress bar that just counts down from 30s
+ # actually measuring this is hard and it usually takes ~20s to decompress
+ progress = Progress(
+ SpinnerColumn(),
+ TextColumn("[progress.description]{task.description}"),
+ TimeElapsedColumn(),
+ )
+ task = progress.add_task("Decompressing", total=1)
+ progress.start()
  with gzip.open(file_path, "rb") as f_in:
- # Create a tqdm progress bar
- with tqdm(total=total_size, unit="MB", unit_scale=True, desc=f"Decompressing") as pbar:
- # Open the tar archive
- with tarfile.open(fileobj=f_in) as tar:
- # Extract all contents
- for member in tar:
- tar.extract(member, path=output_dir)
- # Update the progress bar
- pbar.update(member.size)
-
-
- def download_file(url, save_path):
+ with tarfile.open(fileobj=f_in) as tar:
+ # Extract all contents
+ for member in tar:
+ tar.extract(member, path=output_dir)
+ # Update the progress bar
+ progress.update(task, completed=1)
+ progress.stop()
+
+
+
+
+ def download_chunk(url, start, end, index, save_path):
+ headers = {"Range": f"bytes={start}-{end}"}
+ response = requests.get(url, headers=headers, stream=True)
+ chunk_path = f"{save_path}.part{index}"
+ with open(chunk_path, "wb") as f_out:
+ for chunk in response.iter_content(chunk_size=8192):
+ if chunk:
+ f_out.write(chunk)
+ return chunk_path
+
+ def download_progress_estimate(progress, task, total_size):
+ network_bytes_start = psutil.net_io_counters().bytes_recv
+ # make a new progress bar that will be updated by a separate thread
+ progress.start()
+ interval = 0.5
+ while not progress.finished:
+ current_downloaded = psutil.net_io_counters().bytes_recv
+ total_downloaded = current_downloaded - network_bytes_start
+ progress.update(task, completed=total_downloaded)
+ sleep(interval)
+ if total_downloaded >= total_size or progress.finished:
+ break
+ progress.stop()
+
+
+
+ def download_file(url, save_path, num_threads=150):
  if not os.path.exists(os.path.dirname(save_path)):
  os.makedirs(os.path.dirname(save_path))
- response = requests.get(url, stream=True)
+
+ response = requests.head(url)
  total_size = int(response.headers.get("content-length", 0))
- bytes_downloaded = 0
- chunk_size = 1048576
- with open(save_path, "wb") as f:
- for data in tqdm(
- response.iter_content(chunk_size=chunk_size),
- total=total_size / chunk_size,
- unit="B",
- unit_scale=True,
- desc=f"Downloading",
- ):
- bytes_downloaded += len(data)
- f.write(data)
+ chunk_size = total_size // num_threads
+
+ progress = Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ DownloadColumn(),
+ TransferSpeedColumn(),
+ TextColumn(" Elapsed Time:"),
+ TimeElapsedColumn(),
+ TextColumn(" Remaining Time:"),
+ TimeRemainingColumn(),
+ )
+ task = progress.add_task("Downloading", total=total_size)
+
+ download_progress_thread = threading.Thread(target=download_progress_estimate, args=(progress, task ,total_size))
+ download_progress_thread.start()
+
+ with ThreadPoolExecutor(max_workers=num_threads) as executor:
+ futures = []
+ for i in range(num_threads):
+ start = i * chunk_size
+ end = start + chunk_size - 1 if i < num_threads - 1 else total_size - 1
+ futures.append(executor.submit(download_chunk, url, start, end, i, save_path))
+
+ chunk_paths = [
+ future.result() for future in futures
+ ]
+
+
+ with open(save_path, "wb") as f_out:
+ for chunk_path in chunk_paths:
+ with open(chunk_path, "rb") as f_in:
+ f_out.write(f_in.read())
+ os.remove(chunk_path)
+
+ progress.update(task, completed=total_size)
+ download_progress_thread.join()
+
+
+ hydrofabric_url = "https://communityhydrofabric.s3.us-east-1.amazonaws.com/hydrofabrics/community/conus_nextgen.tar.gz"
+

+ def get_headers():
+ # for versioning
+ # Useful Headers: { 'Last-Modified': 'Wed, 20 Nov 2024 18:45:59 GMT', 'ETag': '"cc1452838886a7ab3065a61073fa991b-207"'}
+ response = requests.head(hydrofabric_url)
+ return response.status_code, response.headers

- hydrofabric_url = "https://communityhydrofabric.s3.us-east-1.amazonaws.com/conus_nextgen.gpkg"
+
+ def download_and_update_hf():
+ download_file(hydrofabric_url, file_paths.conus_hydrofabric.with_suffix(".tar.gz"))
+ status, headers = get_headers()
+
+ if status == 200:
+ # write headers to a file
+ with open(file_paths.hydrofabric_download_log, "w") as f:
+ json.dump(dict(headers), f)
+
+ decompress_gzip_tar(
+ file_paths.conus_hydrofabric.with_suffix(".tar.gz"),
+ file_paths.conus_hydrofabric.parent,
+ )


  def validate_hydrofabric():
  if not file_paths.conus_hydrofabric.is_file():
- # alert the user that the hydrofabric is missing
- print("Hydrofabric is missing. Would you like to download it now? (Y/n)")
- response = input()
- if response == "" or response.lower() == "y":
- download_file(hydrofabric_url, file_paths.conus_hydrofabric)
+ response = Prompt.ask(
+ "Hydrofabric is missing. Would you like to download it now?",
+ default="y",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ download_and_update_hf()
  else:
- print("Exiting...")
+ console.print("Exiting...", style="bold red")
  exit()

+ if file_paths.no_update_hf.exists():
+ # skip the updates
+ return
+
+ if not file_paths.hydrofabric_download_log.is_file():
+ response = Prompt.ask(
+ "Hydrofabric version information unavailable, Would you like to fetch the updated version?",
+ default="y",
+ style="bold yellow",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ download_and_update_hf()
+ else:
+ console.print("Continuing... ", style="bold yellow")
+ console.print(
+ f"To disable this warning, create an empty file called {file_paths.no_update_hf.resolve()}",
+ style="bold yellow",
+ )
+ sleep(2)
+ return
+
+ with open(file_paths.hydrofabric_download_log, "r") as f:
+ content = f.read()
+ headers = json.loads(content)
+
+ status, latest_headers = get_headers()
+
+ if status != 200:
+ console.print(
+ "Unable to contact servers, proceeding without updating hydrofabric", style="bold red"
+ )
+ sleep(2)
+
+ if headers.get("ETag", "") != latest_headers.get("ETag", ""):
+ console.print("Local and remote Hydrofabric Differ", style="bold yellow")
+ console.print(
+ f"Local last updated at {headers.get('Last-Modified', 'NA')}, remote last updated at {latest_headers.get('Last-Modified', 'NA')}",
+ style="bold yellow",
+ )
+ response = Prompt.ask(
+ "Would you like to fetch the updated version?",
+ default="y",
+ choices=["y", "n"],
+ )
+ if response == "y":
+ download_and_update_hf()
+ else:
+ console.print("Continuing... ", style="bold yellow")
+ console.print(
+ f"To disable this warning, create an empty file called {file_paths.no_update_hf.resolve()}",
+ style="bold yellow",
+ )
+ sleep(2)
+ return
+

  def validate_output_dir():
  if not file_paths.config_file.is_file():
- # prompt the user to set the working directory
- print(
- "Output directory is not set. Would you like to set it now? Defaults to ~/ngiab_preprocess_output/ (y/N)"
+ response = Prompt.ask(
+ "Output directory is not set. Would you like to use the default? ~/ngiab_preprocess_output/",
+ default="y",
+ choices=["y", "n"],
  )
- response = input()
- if response.lower() == "y":
- response = input("Enter the path to the working directory: ")
- if response == "" or response.lower() == "n":
+ if response.lower() == "n":
+ response = Prompt.ask("Enter the path to the working directory")
+ if response == "" or response.lower() == "y":
  response = "~/ngiab_preprocess_output/"
  file_paths.set_working_dir(response)

@@ -78,3 +225,7 @@ def validate_output_dir():
  def validate_all():
  validate_hydrofabric()
  validate_output_dir()
+
+
+ if __name__ == "__main__":
+ validate_all()
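
The new download_file above splits the file into num_threads contiguous byte ranges, one HTTP Range request per worker, with the last range absorbing any remainder; a quick check of that arithmetic with hypothetical numbers:

    def byte_ranges(total_size: int, num_threads: int):
        # Mirrors the range arithmetic in download_file above.
        chunk_size = total_size // num_threads
        ranges = []
        for i in range(num_threads):
            start = i * chunk_size
            end = start + chunk_size - 1 if i < num_threads - 1 else total_size - 1
            ranges.append((start, end))
        return ranges

    # A hypothetical 10-byte file split across 3 workers tiles bytes 0-9 exactly.
    print(byte_ranges(10, 3))  # [(0, 2), (3, 5), (6, 9)]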
data_sources/template.sql CHANGED
@@ -151,6 +151,7 @@ CREATE TABLE IF NOT EXISTS "hydrolocations" (
  "hl_x" REAL,
  "hl_y" REAL,
  "vpuid" TEXT,
+ "geom" POINT,
  PRIMARY KEY("fid" AUTOINCREMENT)
  );
  CREATE TABLE IF NOT EXISTS "flowpath-attributes" (
@@ -291,7 +292,7 @@ INSERT INTO "gpkg_contents" VALUES
  -- ('divides','features','divides','','2024-10-02T21:40:02.814Z',-2356125.0012,209715.0003,2258234.9955,3506235.0003,5070),
  -- ('lakes','features','lakes','','2024-10-02T21:40:03.033Z',-2306232.84864919,329124.789725057,2240264.19930738,3149850.04714446,5070),
  ('pois','attributes','pois','','2024-10-02T21:40:34.220Z',NULL,NULL,NULL,NULL,0),
- ('hydrolocations','attributes','hydrolocations','','2024-10-02T21:40:35.451Z',NULL,NULL,NULL,NULL,0),
+ -- ('hydrolocations','attributes','hydrolocations','','2024-10-02T21:40:35.451Z',NULL,NULL,NULL,NULL,0),
  ('flowpath-attributes','attributes','flowpath-attributes','','2024-10-02T21:40:43.663Z',NULL,NULL,NULL,NULL,0),
  ('flowpath-attributes-ml','attributes','flowpath-attributes-ml','','2024-10-02T21:40:53.358Z',NULL,NULL,NULL,NULL,0),
  ('network','attributes','network','','2024-10-02T21:42:24.445Z',NULL,NULL,NULL,NULL,0),
@@ -300,9 +301,11 @@ INSERT INTO "gpkg_contents" VALUES
  INSERT INTO "gpkg_geometry_columns" VALUES ('flowpaths','geom','GEOMETRY',5070,0,0),
  ('divides','geom','POLYGON',5070,0,0),
  ('lakes','geom','POINT',5070,0,0),
- ('nexus','geom','POINT',5070,0,0);
+ ('nexus','geom','POINT',5070,0,0),
+ ('hydrolocations','geom','POINT',5070,0,0);
  INSERT INTO "gpkg_extensions" VALUES ('flowpaths','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
  ('divides','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
  ('lakes','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
- ('nexus','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only');
+ ('nexus','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only'),
+ ('hydrolocations','geom','gpkg_rtree_index','http://www.geopackage.org/spec120/#extension_rtree','write-only');
  COMMIT;
data_sources/triggers.sql CHANGED
@@ -51,4 +51,40 @@ CREATE TRIGGER "rtree_nexus_geom_delete" AFTER DELETE ON "nexus" WHEN old."geom"
  CREATE TRIGGER "trigger_insert_feature_count_nexus" AFTER INSERT ON "nexus" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count + 1 WHERE lower(table_name) = lower('nexus'); END;
  CREATE TRIGGER "trigger_delete_feature_count_nexus" AFTER DELETE ON "nexus" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count - 1 WHERE lower(table_name) = lower('nexus'); END;
  CREATE TRIGGER "trigger_insert_feature_count_divide-attributes" AFTER INSERT ON "divide-attributes" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count + 1 WHERE lower(table_name) = lower('divide-attributes'); END;
- CREATE TRIGGER "trigger_delete_feature_count_divide-attributes" AFTER DELETE ON "divide-attributes" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count - 1 WHERE lower(table_name) = lower('divide-attributes'); END;
+ CREATE TRIGGER "trigger_delete_feature_count_divide-attributes" AFTER DELETE ON "divide-attributes" BEGIN UPDATE gpkg_ogr_contents SET feature_count = feature_count - 1 WHERE lower(table_name) = lower('divide-attributes'); END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_insert"
+ AFTER INSERT ON "hydrolocations"
+ WHEN (new."geom" NOT NULL AND NOT ST_IsEmpty(NEW."geom"))
+ BEGIN
+ INSERT OR REPLACE INTO "rtree_hydrolocations_geom" VALUES (NEW.ROWID, ST_MinX(NEW."geom"), ST_MaxX(NEW."geom"), ST_MinY(NEW."geom"), ST_MaxY(NEW."geom"));
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update1"
+ AFTER UPDATE OF "geom" ON "hydrolocations"
+ WHEN OLD.ROWID = NEW.ROWID AND (NEW."geom" NOT NULL AND NOT ST_IsEmpty(NEW."geom"))
+ BEGIN
+ INSERT OR REPLACE INTO "rtree_hydrolocations_geom" VALUES (NEW.ROWID, ST_MinX(NEW."geom"), ST_MaxX(NEW."geom"), ST_MinY(NEW."geom"), ST_MaxY(NEW."geom"));
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update2"
+ AFTER UPDATE OF "geom" ON "hydrolocations"
+ WHEN OLD.ROWID = NEW.ROWID AND (NEW."geom" IS NULL OR ST_IsEmpty(NEW."geom"))
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id = OLD.ROWID;
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update3"
+ AFTER UPDATE OF "geom" ON "hydrolocations"
+ WHEN OLD.ROWID != NEW.ROWID AND (NEW."geom" NOT NULL AND NOT ST_IsEmpty(NEW."geom"))
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id = OLD.ROWID;
+ INSERT OR REPLACE INTO "rtree_hydrolocations_geom" VALUES (NEW.ROWID, ST_MinX(NEW."geom"), ST_MaxX(NEW."geom"), ST_MinY(NEW."geom"), ST_MaxY(NEW."geom"));
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_update4"
+ AFTER UPDATE ON "hydrolocations"
+ WHEN OLD.ROWID != NEW.ROWID AND (NEW."geom" IS NULL OR ST_IsEmpty(NEW."geom"))
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id IN (OLD.ROWID, NEW.ROWID);
+ END;
+ CREATE TRIGGER "rtree_hydrolocations_geom_delete"
+ AFTER DELETE ON "hydrolocations"WHEN old."geom" NOT NULL
+ BEGIN
+ DELETE FROM "rtree_hydrolocations_geom" WHERE id = OLD.ROWID;
+ END;
ngiab_data_cli/__main__.py CHANGED
@@ -1,21 +1,22 @@
- import argparse
- import logging
- import time
- from typing import List
- import subprocess
-
- from dask.distributed import Client
-
- from data_processing.file_paths import file_paths
- from data_processing.gpkg_utils import get_catid_from_point, get_cat_from_gage_id
- from data_processing.subset import subset
- from data_processing.forcings import create_forcings
- from data_processing.create_realization import create_realization, create_dd_realization
- from data_sources.source_validation import validate_all
-
- from ngiab_data_cli.custom_logging import setup_logging, set_logging_to_critical_only
- from ngiab_data_cli.arguments import parse_arguments
-
+ import rich.status
+
+ # add a status bar for these imports so the cli feels more responsive
+ with rich.status.Status("Initializing...") as status:
+ from data_sources.source_validation import validate_all
+ from ngiab_data_cli.custom_logging import setup_logging, set_logging_to_critical_only
+ from ngiab_data_cli.arguments import parse_arguments
+ from data_processing.file_paths import file_paths
+ import argparse
+ import logging
+ import time
+ from typing import List
+ import subprocess
+ import time
+ from dask.distributed import Client
+ from data_processing.gpkg_utils import get_catid_from_point, get_cat_from_gage_id
+ from data_processing.subset import subset
+ from data_processing.forcings import create_forcings
+ from data_processing.create_realization import create_realization, create_dd_realization

  def validate_input(args: argparse.Namespace) -> None:
  """Validate input arguments."""
@@ -119,7 +120,7 @@ def main() -> None:
  cat_to_subset, output_folder = validate_input(args)
  paths = file_paths(output_folder)
  args = set_dependent_flags(args, paths) # --validate
- logging.info(f"Using output folder: {paths.subset_dir}")
+ logging.info(f"Using output folder: {paths.subset_dir}")

  if args.subset:
  logging.info(f"Subsetting hydrofabric")
ngiab_data_preprocess-3.1.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ngiab_data_preprocess
- Version: 3.0.3
+ Version: 3.1.0
  Summary: Graphical Tools for creating Next Gen Water model input data.
  Author-email: Josh Cunningham <jcunningham8@ua.edu>
  Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -19,7 +19,6 @@ Requires-Dist: requests==2.32.2
  Requires-Dist: igraph==0.11.4
  Requires-Dist: s3fs==2024.3.1
  Requires-Dist: xarray==2024.2.0
- Requires-Dist: rioxarray==0.15.1
  Requires-Dist: zarr==2.17.1
  Requires-Dist: netCDF4==1.6.5
  Requires-Dist: dask==2024.4.1
ngiab_data_preprocess-3.1.0.dist-info/RECORD CHANGED
@@ -1,10 +1,11 @@
  data_processing/create_realization.py,sha256=2-w-TfJ6e5SFYchDZNAlOIEVK_iP79-EUC_jQ2Un1jk,10893
- data_processing/file_paths.py,sha256=QWjtRDSqJi8Cu0_EG_sssrxmdJBaz-hMfng5CJxyhf8,4005
- data_processing/forcings.py,sha256=bK1o7PTBXPUYCIK1hT-ccBpOxygrlYIsaCtdGmQugNM,11154
- data_processing/gpkg_utils.py,sha256=SJ3IHTGnI6nLIw78Am9wrNO3jZ1OOpuyjkpBvQlnLko,17098
+ data_processing/file_paths.py,sha256=jyiN3hCK3H_Upt8C4NoTInMrpcCZaTflAv41Oh1K6a8,4177
+ data_processing/forcings.py,sha256=ADQOgCWUMGuiPvD5WUnJ94CqILM6akwE-xUlmckksik,12542
+ data_processing/gpkg_utils.py,sha256=pMmuJT_iHSt7Caw_JMxui5DtIzvL8HSseAelGgB41_I,17266
  data_processing/graph_utils.py,sha256=uN2MoUFHQoeQEqYtf3o45QrRaTgqKoi0n3ZrB6XOh6Y,7611
+ data_processing/s3fs_utils.py,sha256=DisX_PqIPn48EltmE85m4hJdcxaC6r4Mb-deptRT9O0,2752
  data_processing/subset.py,sha256=bXDDoDmp8TtfbHvx0rztNWDmqiri8LeSkn7hBe4kb-4,2876
- data_processing/zarr_utils.py,sha256=kx8M_9QZ6IwdLEmPn3LzRDi01JkKMDDBBukuXaOjkDo,5641
+ data_processing/zarr_utils.py,sha256=F-ukVuJFXamgU77k0e8lSfUPA-AOd2Zq8uTWADfykxA,6047
  data_sources/cfe-nowpm-realization-template.json,sha256=jBv6jGuHtAVFC7X2KaUdvxrhCopgnBiCuW3pS8Jng0w,3541
  data_sources/cfe-template.ini,sha256=SGq7bShD0Z83C0n4mztmzz3GnFdP_uJXPhheOizNpzc,1973
  data_sources/dd-catchment-template.yml,sha256=7lzpUB0o4bFuD4qgFnsNOUw-ZtwxZ_dNTMhB6sDMklA,292
@@ -13,9 +14,9 @@ data_sources/dd-realization-template.json,sha256=xt3BgzAEqn3eERO1lODdWdbV0T9UuQG
  data_sources/forcing_template.nc,sha256=uRuVAqX3ngdlougZINavtwl_wC2VLD8fHqG7_CLim1s,85284
  data_sources/ngen-routing-template.yaml,sha256=8xTTLRHAbXS0QN6C0cO0Mt_orwqKpD2LWz9Bq3sQGuA,4649
  data_sources/noah-owp-modular-init.namelist.input,sha256=ssEcr_hPfRmslcpXbKJqzas4aSuDY-qd_k6zfxKbvhA,3045
- data_sources/source_validation.py,sha256=5i0fM0ejxQTDH-B1TIk45_fqenypCDdFHYhVfzsCdC0,2685
- data_sources/template.sql,sha256=5qr3FsaxtPGxmLBM2Z0UFynql37A0oCBxvk_YrExIuE,10281
- data_sources/triggers.sql,sha256=-UQej1rjao8N4IlWN4S3ZfLWLUOg5VL1t5pbwIqttsQ,13172
+ data_sources/source_validation.py,sha256=G_qrh6PaCgZ6wgPJ3UdE2lAQQhSPMEYGTVbyXax2J4M,7872
+ data_sources/template.sql,sha256=ZnFqAqleEq9wgmAhNO90Wue_L9k0JAn8KF99DYtcxgs,10457
+ data_sources/triggers.sql,sha256=G0d_175eNsamKAFhsbphPATvzMPuPL_iCleIhlToduQ,14906
  map_app/__main__.py,sha256=m9UpLD0oihMeJa2nTPewDYda-vpm3aP1_AZOhp6SuQk,2351
  map_app/views.py,sha256=Az5BLXXlbTWzGA7y_vLHWQi-aAUE13X_YuwUr-fkz_w,4183
  map_app/static/css/console.css,sha256=xN6G2MMFyKc9YW9HEVpUUTUjx2o2nokBR4nCX5c18UM,803
@@ -28,11 +29,11 @@ map_app/static/resources/light-style.json,sha256=DaE52qKpAkjiWSKY_z7LxreqA2rW4Zy
  map_app/static/resources/loading.gif,sha256=ggdkZf1AD7rSwIpSJwfiIqANgmVV1WHlxGuKxQKv7uY,72191
  map_app/static/resources/screenshot.png,sha256=-sl_R9_WJShjZ52Djz5ZxLxbsh1VgdCeHmPGebIxEOA,1412174
  map_app/templates/index.html,sha256=3TFbV0dO33UrYNps2CTLocEN6Z-DHDfQTvwFfRpOY0I,6005
- ngiab_data_cli/__main__.py,sha256=b2bHuzMGNiA5idx_PYuurup4PzrjLQTUCi5jUqop-5Q,8419
+ ngiab_data_cli/__main__.py,sha256=xNkZL2YziMYznAMGom-8gvbFTaHwYA96cSkSc5RmtfM,8639
  ngiab_data_cli/arguments.py,sha256=6CkA9_-hGPqj0yedhcf9G2DbWgJcn2t3oLodKWY7r-E,3402
  ngiab_data_cli/custom_logging.py,sha256=iS2XozaxudcxQj17qAsrCgbVK9LJAYAPmarJuVWJo1k,1280
- ngiab_data_preprocess-3.0.3.dist-info/LICENSE,sha256=6dMSprwwnsRzEm02mEDbKHD9dUbL8bPIt9Vhrhb0Ulk,1081
- ngiab_data_preprocess-3.0.3.dist-info/METADATA,sha256=uwbrkIA6PsGnyjvmk8k4aiXriF5nIvh5eMNyg9gXOIo,9020
- ngiab_data_preprocess-3.0.3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- ngiab_data_preprocess-3.0.3.dist-info/top_level.txt,sha256=CjhYAUZrdveR2fOK6rxffU09VIN2IuPD7hk4V3l3pV0,52
- ngiab_data_preprocess-3.0.3.dist-info/RECORD,,
+ ngiab_data_preprocess-3.1.0.dist-info/LICENSE,sha256=6dMSprwwnsRzEm02mEDbKHD9dUbL8bPIt9Vhrhb0Ulk,1081
+ ngiab_data_preprocess-3.1.0.dist-info/METADATA,sha256=vlVgzkAnT36hXGBDZmoha91BLSESovcKkkTMwfdVQfM,8987
+ ngiab_data_preprocess-3.1.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ ngiab_data_preprocess-3.1.0.dist-info/top_level.txt,sha256=CjhYAUZrdveR2fOK6rxffU09VIN2IuPD7hk4V3l3pV0,52
+ ngiab_data_preprocess-3.1.0.dist-info/RECORD,,