ngiab-data-preprocess 4.2.2-py3-none-any.whl → 4.3.0-py3-none-any.whl
- data_processing/create_realization.py +15 -21
- data_processing/dask_utils.py +92 -0
- data_processing/dataset_utils.py +127 -44
- data_processing/datasets.py +18 -29
- data_processing/file_paths.py +7 -7
- data_processing/forcings.py +40 -38
- data_processing/gpkg_utils.py +13 -13
- data_processing/graph_utils.py +4 -4
- data_processing/s3fs_utils.py +1 -1
- data_processing/subset.py +1 -1
- data_sources/source_validation.py +57 -32
- map_app/__main__.py +3 -2
- map_app/static/css/main.css +14 -3
- map_app/static/js/main.js +225 -106
- map_app/templates/index.html +10 -1
- map_app/views.py +1 -1
- ngiab_data_cli/__main__.py +31 -28
- ngiab_data_cli/arguments.py +0 -1
- ngiab_data_cli/forcing_cli.py +10 -19
- {ngiab_data_preprocess-4.2.2.dist-info → ngiab_data_preprocess-4.3.0.dist-info}/METADATA +15 -11
- ngiab_data_preprocess-4.3.0.dist-info/RECORD +43 -0
- {ngiab_data_preprocess-4.2.2.dist-info → ngiab_data_preprocess-4.3.0.dist-info}/WHEEL +1 -1
- ngiab_data_preprocess-4.2.2.dist-info/RECORD +0 -42
- {ngiab_data_preprocess-4.2.2.dist-info → ngiab_data_preprocess-4.3.0.dist-info}/entry_points.txt +0 -0
- {ngiab_data_preprocess-4.2.2.dist-info → ngiab_data_preprocess-4.3.0.dist-info}/licenses/LICENSE +0 -0
- {ngiab_data_preprocess-4.2.2.dist-info → ngiab_data_preprocess-4.3.0.dist-info}/top_level.txt +0 -0
data_processing/forcings.py
CHANGED
@@ -7,14 +7,14 @@ from functools import partial
 from math import ceil
 from multiprocessing import shared_memory
 from pathlib import Path
-from typing import Tuple
+from typing import List, Tuple
 
 import geopandas as gpd
 import numpy as np
 import pandas as pd
 import psutil
 import xarray as xr
-from
+from data_processing.dask_utils import no_cluster, use_cluster
 from data_processing.dataset_utils import validate_dataset_format
 from data_processing.file_paths import file_paths
 from exactextract import exact_extract
@@ -92,21 +92,21 @@ def get_cell_weights(raster: xr.Dataset, gdf: gpd.GeoDataFrame, wkt: str) -> pd.
         DataFrame indexed by divide_id that contains information about coverage
         for each raster cell in gridded forcing file.
     """
-    xmin = raster.x
-    xmax = raster.x
-    ymin = raster.y
-    ymax = raster.y
+    xmin = min(raster.x)
+    xmax = max(raster.x)
+    ymin = min(raster.y)
+    ymax = max(raster.y)
     data_vars = list(raster.data_vars)
     rastersource = NumPyRasterSource(
         raster[data_vars[0]], srs_wkt=wkt, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax
     )
-    output = exact_extract(
+    output: pd.DataFrame = exact_extract(
         rastersource,
         gdf,
         ["cell_id", "coverage"],
         include_cols=["divide_id"],
         output="pandas",
-    )
+    )  # type: ignore
     return output.set_index("divide_id")
 
 
@@ -164,8 +164,8 @@ def get_index_chunks(data: xr.DataArray) -> list[tuple[int, int]]:
 
 
 def create_shared_memory(
-    lazy_array: xr.
-) -> Tuple[shared_memory.SharedMemory,
+    lazy_array: xr.DataArray,
+) -> Tuple[shared_memory.SharedMemory, Tuple[int, ...], np.dtype]:
     """
     Create a shared memory object so that multiple processes can access loaded
     data.
@@ -180,7 +180,7 @@ def create_shared_memory(
     shared_memory.SharedMemory
         A specific block of memory allocated by the OS of the size of
         lazy_array.
-
+    Tuple[int, ...]
         A shape object with dimensions (# timesteps, # of raster cells) in
         reference to lazy_array.
     np.dtype
@@ -205,9 +205,9 @@ def process_chunk_shared(
     variable: str,
     times: np.ndarray,
     shm_name: str,
-    shape:
+    shape: Tuple[int, ...],
     dtype: np.dtype,
-    chunk:
+    chunk: pd.DataFrame,
 ) -> xr.DataArray:
     """
     Process the gridded forcings chunk loaded into a SharedMemory block.
@@ -276,7 +276,7 @@ def get_cell_weights_parallel(
     for each raster cell and each timestep in gridded forcing file.
     """
     gdf_chunks = np.array_split(gdf, num_partitions)
-    wkt = gdf.crs.to_wkt()
+    wkt = gdf.crs.to_wkt()  # type: ignore
     one_timestep = input_forcings.isel(time=0).compute()
     with multiprocessing.Pool() as pool:
         args = [(one_timestep, gdf_chunk, wkt) for gdf_chunk in gdf_chunks]
@@ -305,6 +305,7 @@ def get_units(dataset: xr.Dataset) -> dict:
     return units
 
 
+@no_cluster
 def compute_zonal_stats(
     gdf: gpd.GeoDataFrame, gridded_data: xr.Dataset, forcings_dir: Path
 ) -> None:
@@ -331,7 +332,7 @@ def compute_zonal_stats(
     catchments = get_cell_weights_parallel(gdf, gridded_data, num_partitions)
     units = get_units(gridded_data)
 
-    cat_chunks = np.array_split(catchments, num_partitions)
+    cat_chunks: List[pd.DataFrame] = np.array_split(catchments, num_partitions)  # type: ignore
 
     progress = Progress(
         TextColumn("[progress.description]{task.description}"),
@@ -350,25 +351,28 @@ def compute_zonal_stats(
         "[cyan]Processing variables...", total=len(gridded_data.data_vars), elapsed=0
     )
     progress.start()
-    for
+    for data_var_name in list(gridded_data.data_vars):
+        data_var_name: str
         progress.update(variable_task, advance=1)
-        progress.update(variable_task, description=f"Processing {
+        progress.update(variable_task, description=f"Processing {data_var_name}")
 
         # to make sure this fits in memory, we need to chunk the data
-        time_chunks = get_index_chunks(gridded_data[
+        time_chunks = get_index_chunks(gridded_data[data_var_name])
         chunk_task = progress.add_task("[purple] processing chunks", total=len(time_chunks))
         for i, times in enumerate(time_chunks):
             progress.update(chunk_task, advance=1)
             start, end = times
             # select the chunk of time we want to process
-            data_chunk = gridded_data[
+            data_chunk = gridded_data[data_var_name].isel(time=slice(start, end))
             # put it in shared memory
             shm, shape, dtype = create_shared_memory(data_chunk)
             times = data_chunk.time.values
             # create a partial function to pass to the multiprocessing pool
-            partial_process_chunk = partial(
+            partial_process_chunk = partial(
+                process_chunk_shared, data_var_name, times, shm.name, shape, dtype
+            )
 
-            logger.debug(f"Processing variable: {
+            logger.debug(f"Processing variable: {data_var_name}")
             # process the chunks of catchments in parallel
             with multiprocessing.Pool(num_partitions) as pool:
                 variable_data = pool.map(partial_process_chunk, cat_chunks)
@@ -376,24 +380,24 @@ compute_zonal_stats(
             # clean up the shared memory
             shm.close()
             shm.unlink()
-            logger.debug(f"Processed variable: {
+            logger.debug(f"Processed variable: {data_var_name}")
             concatenated_da = xr.concat(variable_data, dim="catchment")
             # delete the data to free up memory
            del variable_data
-            logger.debug(f"Concatenated variable: {
+            logger.debug(f"Concatenated variable: {data_var_name}")
            # write this to disk now to save memory
            # xarray will monitor memory usage, but it doesn't account for the shared memory used to store the raster
            # This reduces memory usage by about 60%
-            concatenated_da.to_dataset(name=
-                forcings_dir / "temp" / f"{
+            concatenated_da.to_dataset(name=data_var_name).to_netcdf(
+                forcings_dir / "temp" / f"{data_var_name}_timechunk_{i}.nc"
             )
         # Merge the chunks back together
         datasets = [
-            xr.open_dataset(forcings_dir / "temp" / f"{
+            xr.open_dataset(forcings_dir / "temp" / f"{data_var_name}_timechunk_{i}.nc")
             for i in range(len(time_chunks))
         ]
         result = xr.concat(datasets, dim="time")
-        result.to_netcdf(forcings_dir / "temp" / f"{
+        result.to_netcdf(forcings_dir / "temp" / f"{data_var_name}.nc")
         # close the datasets
         result.close()
         _ = [dataset.close() for dataset in datasets]
@@ -411,6 +415,7 @@ compute_zonal_stats(
     write_outputs(forcings_dir, units)
 
 
+@use_cluster
 def write_outputs(forcings_dir: Path, units: dict) -> None:
     """
     Write outputs to disk in the form of a NetCDF file, using dask clusters to
@@ -428,13 +433,6 @@ def write_outputs(forcings_dir: Path, units: dict) -> None:
         units. Differs from variables, as this dictionary depends on the gridded
         forcing dataset.
     """
-
-    # start a dask cluster if there isn't one already running
-    try:
-        client = Client.current()
-    except ValueError:
-        cluster = LocalCluster()
-        client = Client(cluster)
     temp_forcings_dir = forcings_dir / "temp"
     # Combine all variables into a single dataset using dask
     results = [xr.open_dataset(file, chunks="auto") for file in temp_forcings_dir.glob("*.nc")]
@@ -471,14 +469,18 @@ def write_outputs(forcings_dir: Path, units: dict) -> None:
     time_array = (
         final_ds.time.astype("datetime64[s]").astype(np.int64).values // 10**9
     )  ## convert from ns to s
-    time_array = time_array.astype(np.int32)
-    final_ds = final_ds.drop_vars(
-
+    time_array = time_array.astype(np.int32)  ## convert to int32 to save space
+    final_ds = final_ds.drop_vars(
+        ["catchment", "time"]
+    )  ## drop the original time and catchment vars
+    final_ds = final_ds.rename_dims({"catchment": "catchment-id"})  # rename the catchment dimension
     # add the time as a 2d data var, yes this is wasting disk space.
     final_ds["Time"] = (("catchment-id", "time"), [time_array for _ in range(len(final_ds["ids"]))])
     # set the time unit
     final_ds["Time"].attrs["units"] = "s"
-    final_ds["Time"].attrs["epoch_start"] =
+    final_ds["Time"].attrs["epoch_start"] = (
+        "01/01/1970 00:00:00"  # not needed but suppresses the ngen warning
+    )
 
     final_ds.to_netcdf(forcings_dir / "forcings.nc", engine="netcdf4")
     # close the datasets
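The `@no_cluster` and `@use_cluster` decorators applied above come from the new `data_processing/dask_utils.py` module (+92 lines, not shown in this section). A minimal sketch of what such decorators might look like, assuming they only manage a Dask client/LocalCluster around the wrapped call in the same way as the `Client.current()` fallback that was removed from `write_outputs`:

# Hypothetical sketch only: the real data_processing/dask_utils.py is not shown in this diff.
from functools import wraps

from dask.distributed import Client, LocalCluster


def use_cluster(func):
    """Make sure a Dask client/cluster is available while func runs."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            Client.current()        # reuse an already-running client
        except ValueError:
            Client(LocalCluster())  # otherwise start a local cluster
        return func(*args, **kwargs)
    return wrapper


def no_cluster(func):
    """Close any active Dask client so func's multiprocessing does not contend with it."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            Client.current().close()
        except ValueError:
            pass                    # nothing to shut down
        return func(*args, **kwargs)
    return wrapper

The actual module may differ; this is only meant to illustrate why the manual cluster setup could be deleted from `write_outputs`.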
data_processing/gpkg_utils.py
CHANGED
@@ -2,11 +2,12 @@ import logging
 import sqlite3
 import struct
 from pathlib import Path
-from typing import List, Tuple,
+from typing import List, Tuple, Dict
 
 import pyproj
 from data_processing.file_paths import file_paths
-from shapely.geometry import Point
+from shapely.geometry import Point
+from shapely.geometry.base import BaseGeometry
 from shapely.ops import transform
 from shapely.wkb import loads
 
@@ -27,7 +28,7 @@ class GeoPackage:
         self.conn.close()
 
 
-def verify_indices(gpkg:
+def verify_indices(gpkg: Path = file_paths.conus_hydrofabric) -> None:
     """
     Verify that the indices in the specified geopackage are correct.
     If they are not, create the correct indices.
@@ -92,12 +93,9 @@ def add_triggers_to_gpkg(gpkg: Path) -> None:
     logger.debug(f"Added triggers to subset gpkg {gpkg}")
 
 
-# whenever this is imported, check if the indices are correct
-if file_paths.conus_hydrofabric.is_file():
-    verify_indices()
 
 
-def blob_to_geometry(blob: bytes) ->
+def blob_to_geometry(blob: bytes) -> BaseGeometry | None:
     """
     Convert a blob to a geometry.
     from http://www.geopackage.org/spec/#gpb_format
@@ -120,7 +118,7 @@ def blob_to_geometry(blob: bytes) -> Union[Point, Polygon]:
     return geometry
 
 
-def blob_to_centre_point(blob: bytes) -> Point:
+def blob_to_centre_point(blob: bytes) -> Point | None:
     """
     Convert a blob to a geometry.
     from http://www.geopackage.org/spec/#gpb_format
@@ -151,7 +149,7 @@ def blob_to_centre_point(blob: bytes) -> Point:
     return Point(x, y)
 
 
-def convert_to_5070(shapely_geometry):
+def convert_to_5070(shapely_geometry: Point) -> Point:
     # convert to web mercator
     if shapely_geometry.is_empty:
         return shapely_geometry
@@ -164,7 +162,7 @@ def convert_to_5070(shapely_geometry):
     return new_geometry
 
 
-def get_catid_from_point(coords):
+def get_catid_from_point(coords: Dict[str, float]) -> str:
     """
     Retrieves the watershed boundary ID (catid) of the watershed that contains the given point.
 
@@ -196,6 +194,8 @@ def get_catid_from_point(coords):
     # check the geometries to see which one contains the point
     for result in results:
         geom = blob_to_geometry(result[1])
+        if geom is None:
+            continue
         if geom.contains(point):
             return result[0]
     return results[0][0]
@@ -398,7 +398,7 @@ def subset_table(table: str, ids: List[str], hydrofabric: Path, subset_gpkg_name
     dest_db.close()
 
 
-def get_table_crs_short(gpkg: str, table: str) -> str:
+def get_table_crs_short(gpkg: str | Path, table: str) -> str:
     """
     Gets the CRS of the specified table in the specified geopackage as a short string. e.g. EPSG:5070
 
@@ -518,7 +518,7 @@ def get_available_tables(gpkg: Path) -> List[str]:
     return tables
 
 
-def get_cat_to_nhd_feature_id(gpkg: Path = file_paths.conus_hydrofabric) ->
+def get_cat_to_nhd_feature_id(gpkg: Path = file_paths.conus_hydrofabric) -> Dict[str, int]:
     available_tables = get_available_tables(gpkg)
     possible_tables = ["flowpath_edge_list", "network"]
 
@@ -535,7 +535,7 @@ def get_cat_to_nhd_feature_id(gpkg: Path = file_paths.conus_hydrofabric) -> dict
     sql_query = f"SELECT divide_id, hf_id FROM {table_name} WHERE divide_id IS NOT NULL AND hf_id IS NOT NULL"
 
     with sqlite3.connect(gpkg) as conn:
-        result = conn.execute(sql_query).fetchall()
+        result: List[Tuple[str, str]] = conn.execute(sql_query).fetchall()
 
     mapping = {}
     for cat, feature in result:
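`blob_to_geometry` and `blob_to_centre_point` decode GeoPackage geometry blobs per the spec linked in their docstrings, and the new `BaseGeometry | None` / `Point | None` annotations make the failure path explicit. As an illustration only (this is not the module's implementation), skipping the GeoPackage binary header before handing the payload to `shapely.wkb.loads` looks roughly like this:

# Illustrative only: a rough outline of the GeoPackage binary header described at
# http://www.geopackage.org/spec/#gpb_format, showing why a None return is possible
# for malformed blobs. Not the gpkg_utils implementation.
import struct

from shapely.wkb import loads


def wkb_payload(blob: bytes):
    magic, _version, flags = struct.unpack("<2sBB", blob[:4])
    if magic != b"GP":
        return None                            # not a GeoPackage geometry blob
    envelope_indicator = (flags >> 1) & 0b111  # bits 1-3 select the envelope size
    envelope_bytes = {0: 0, 1: 32, 2: 48, 3: 48, 4: 64}.get(envelope_indicator)
    if envelope_bytes is None:
        return None                            # reserved/invalid envelope indicator
    header_length = 8 + envelope_bytes         # magic + version + flags + 4-byte srs_id + envelope
    return loads(blob[header_length:])         # remaining bytes are standard WKB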
data_processing/graph_utils.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 import sqlite3
 from functools import cache
 from pathlib import Path
-from typing import List, Set, Union
+from typing import List, Optional, Set, Union
 
 import igraph as ig
 from data_processing.file_paths import file_paths
@@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
 
 
 def get_from_to_id_pairs(
-    hydrofabric: Path = file_paths.conus_hydrofabric, ids: Set = None
+    hydrofabric: Path = file_paths.conus_hydrofabric, ids: Optional[Set | List] = None
 ) -> List[tuple]:
     """
     Retrieves the from and to IDs from the specified hydrofabric.
@@ -112,7 +112,7 @@ def get_graph() -> ig.Graph:
     return network_graph
 
 
-def get_outlet_id(wb_or_cat_id: str) -> str:
+def get_outlet_id(wb_or_cat_id: str) -> str | None:
     """
     Retrieves the ID of the node downstream of the given node in the hydrological network.
 
@@ -209,7 +209,7 @@ def get_upstream_ids(names: Union[str, List[str]], include_outlet: bool = True)
         if name in parent_ids:
             continue
         try:
-            if "cat" in name:
+            if "cat" in name:  # type: ignore # If name is None, this will raise an error, which is handled below
                 node_index = graph.vs.find(cat=name).index
             else:
                 node_index = graph.vs.find(name=name).index
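The widened annotations mainly affect callers and type checkers. A hypothetical call site (the IDs below are made up, and the CONUS hydrofabric must already be present locally):

from data_processing.graph_utils import get_from_to_id_pairs, get_outlet_id

# ids may now be a Set or a List (or omitted entirely)
pairs = get_from_to_id_pairs(ids=["wb-1000", "wb-1001"])

# get_outlet_id is annotated as returning str | None, so callers must handle None
outlet = get_outlet_id("cat-1000")
if outlet is None:
    raise ValueError("no downstream outlet found for cat-1000")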
data_processing/s3fs_utils.py
CHANGED
@@ -32,7 +32,7 @@ class S3ParallelFileSystem(S3FileSystem):
                     "head_object", Bucket=bucket, Key=key, **version_kw, **self.req_kw
                 )
             )["ContentLength"]
-        except Exception
+        except Exception:
             # Fall back to single request if HEAD fails
             return await self._download_chunk(bucket, key, {}, version_kw)
 
data_sources/source_validation.py
CHANGED
@@ -1,19 +1,29 @@
 import gzip
+import json
 import os
 import tarfile
 import warnings
-import json
-import requests
-from data_processing.file_paths import file_paths
-from tqdm import TqdmExperimentalWarning
 from time import sleep
+
 import boto3
-
+import psutil
+import requests
 from boto3.s3.transfer import TransferConfig
+from botocore.exceptions import ClientError
+import botocore
+from data_processing.file_paths import file_paths
 from rich.console import Console
+from rich.progress import (Progress,
+                           SpinnerColumn,
+                           TextColumn,
+                           TimeElapsedColumn,
+                           BarColumn,
+                           DownloadColumn,
+                           TransferSpeedColumn)
 from rich.prompt import Prompt
-from
-import
+from tqdm import TqdmExperimentalWarning
+from data_processing.gpkg_utils import verify_indices
+import sqlite3
 
 warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
 
@@ -23,25 +33,22 @@ S3_KEY = "hydrofabrics/community/conus_nextgen.tar.gz"
 S3_REGION = "us-east-1"
 hydrofabric_url = f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com/{S3_KEY}"
 
+
 def decompress_gzip_tar(file_path, output_dir):
-    # use rich to display "decompressing" message with a progress bar that just counts down from 30s
-    # actually measuring this is hard and it usually takes ~20s to decompress
     console.print("Decompressing Hydrofabric...", style="bold green")
     progress = Progress(
         SpinnerColumn(),
         TextColumn("[progress.description]{task.description}"),
-        TimeElapsedColumn(),
+        TimeElapsedColumn(),
     )
     task = progress.add_task("Decompressing", total=1)
-    progress
-
-
-
-
-
-
-    progress.update(task, completed=1)
-    progress.stop()
+    with progress:
+        with gzip.open(file_path, "rb") as f_in:
+            with tarfile.open(fileobj=f_in) as tar:
+                # Extract all contents
+                for member in tar:
+                    tar.extract(member, path=output_dir)
+                    progress.update(task, advance=1 / len(tar.getmembers()))
 
 
 def download_from_s3(save_path, bucket=S3_BUCKET, key=S3_KEY, region=S3_REGION):
@@ -53,10 +60,13 @@ def download_from_s3(save_path, bucket=S3_BUCKET, key=S3_KEY, region=S3_REGION):
     if os.path.exists(save_path):
         console.print(f"File already exists: {save_path}", style="bold yellow")
         os.remove(save_path)
-
+
+    client_config = botocore.config.Config(
+        max_pool_connections=75
+    )
     # Initialize S3 client
     s3_client = boto3.client(
-        "s3", aws_access_key_id="", aws_secret_access_key="", region_name=region
+        "s3", aws_access_key_id="", aws_secret_access_key="", region_name=region, config=client_config
     )
     # Disable request signing for public buckets
     s3_client._request_signer.sign = lambda *args, **kwargs: None
@@ -92,19 +102,15 @@ def download_from_s3(save_path, bucket=S3_BUCKET, key=S3_KEY, region=S3_REGION):
         use_threads=True,
     )
 
-    console.print(f"Downloading {key} to {save_path}...", style="bold green")
-    console.print(
-        f"The file downloads faster with no progress indicator, this should take around 30s",
-        style="bold yellow",
-    )
-    console.print(
-        f"Please use network monitoring on your computer if you wish to track the download",
-        style="green",
-    )
 
     try:
+        dl_progress = Progress(BarColumn(), DownloadColumn(), TransferSpeedColumn())
         # Download file using optimized transfer config
-
+        with dl_progress:
+            task = dl_progress.add_task("Downloading...", total=total_size)
+            s3_client.download_file(Bucket=bucket, Key=key, Filename=save_path, Config=config,
+                                    Callback=lambda bytes_downloaded: dl_progress.update(
+                                        task, advance=bytes_downloaded))
        return True
     except Exception as e:
         console.print(f"Error downloading file: {e}", style="bold red")
@@ -122,6 +128,14 @@ def get_headers():
 
 
 def download_and_update_hf():
+
+    if file_paths.conus_hydrofabric.is_file():
+        console.print(
+            f"Hydrofabric already exists at {file_paths.conus_hydrofabric}, removing it to download the latest version.",
+            style="bold yellow",
+        )
+        file_paths.conus_hydrofabric.unlink()
+
     download_from_s3(
         file_paths.conus_hydrofabric.with_suffix(".tar.gz"),
         bucket="communityhydrofabric",
@@ -207,6 +221,17 @@ def validate_hydrofabric():
         )
         sleep(2)
         return
+
+    # moved this from gpkg_utils to here to avoid potential nested rich live displays
+    if file_paths.conus_hydrofabric.is_file():
+        valid_hf = False
+        while not valid_hf:
+            try:
+                verify_indices()
+                valid_hf = True
+            except sqlite3.DatabaseError:
+                console.print(f"Hydrofabric {file_paths.conus_hydrofabric} is corrupted. Redownloading...", style="red")
+                download_and_update_hf()
 
 
 def validate_output_dir():
@@ -220,7 +245,7 @@ def validate_output_dir():
     response = Prompt.ask("Enter the path to the working directory")
     if response == "" or response.lower() == "y":
         response = "~/ngiab_preprocess_output/"
-    file_paths.set_working_dir(response)
+    file_paths.set_working_dir(response)  # type: ignore
 
 
 def validate_all():
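The pieces added to `download_from_s3` (a 75-connection pool, anonymous access, and a rich progress bar driven by boto3's transfer callback) combine into a pattern like the standalone sketch below. It uses botocore's UNSIGNED signature config instead of the signer-disabling trick in the diff, the local filename is arbitrary, and the bucket/key pairing is assumed from values that appear above:

import boto3
from boto3.s3.transfer import TransferConfig
from botocore import UNSIGNED
from botocore.config import Config
from rich.progress import BarColumn, DownloadColumn, Progress, TransferSpeedColumn

BUCKET = "communityhydrofabric"                            # assumed pairing, taken from this diff
KEY = "hydrofabrics/community/conus_nextgen.tar.gz"

# Anonymous client with a larger connection pool for multithreaded transfers
s3 = boto3.client(
    "s3",
    region_name="us-east-1",
    config=Config(signature_version=UNSIGNED, max_pool_connections=75),
)

total_size = s3.head_object(Bucket=BUCKET, Key=KEY)["ContentLength"]
progress = Progress(BarColumn(), DownloadColumn(), TransferSpeedColumn())
with progress:
    task = progress.add_task("Downloading...", total=total_size)
    s3.download_file(
        Bucket=BUCKET,
        Key=KEY,
        Filename="conus_nextgen.tar.gz",
        Config=TransferConfig(use_threads=True),
        # boto3 calls the callback with the number of bytes transferred in each chunk
        Callback=lambda n: progress.update(task, advance=n),
    )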
map_app/__main__.py
CHANGED
@@ -39,12 +39,13 @@ def main():
         Timer(2, set_logs_to_warning).start()
         with open("app.log", "a") as f:
             f.write("Running in debug mode\n")
-        app.run(debug=True, host="0.0.0.0", port="8080")
+        app.run(debug=True, host="0.0.0.0", port="8080")  # type: ignore
     else:
         Timer(1, open_browser).start()
         with open("app.log", "a") as f:
             f.write("Running in production mode\n")
-        app.run(host="0.0.0.0", port="0")
+        app.run(host="0.0.0.0", port="0")  # type: ignore
+
 
 if __name__ == "__main__":
     main()
map_app/static/css/main.css
CHANGED
@@ -28,10 +28,9 @@ main {
 
 .maplibregl-popup-content {
     background: var(--surface-color) !important;
-
 }
 
-#toggle-button {
+#toggle-button-gages, #toggle-button-camels, #toggle-button-nwm, #toggle-button-aorc {
     position: relative;
     top: 20px;
     left: 20px;
@@ -46,11 +45,23 @@ main {
     z-index: 1;
 }
 
-#toggle-button:hover {
+#toggle-button-gages:hover, #toggle-button-camels:hover, #toggle-button-nwm:hover, #toggle-button-aorc:hover {
     scale: 1.1;
     box-shadow: var(--shadow-md);
 }
 
+#toggle-button-camels {
+    left: 30px;
+}
+
+#toggle-button-nwm {
+    left: 40px;
+}
+
+#toggle-button-aorc {
+    left: 50px;
+}
+
 body {
     font-family: 'Inter', system-ui, -apple-system, sans-serif;
     margin: 0;