ngiab-data-preprocess 4.2.2__tar.gz → 4.3.0__tar.gz
This diff shows the content of publicly released package versions as it appears in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/.gitignore +2 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/PKG-INFO +15 -11
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/README.md +13 -9
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/create_realization.py +15 -21
- ngiab_data_preprocess-4.3.0/modules/data_processing/dask_utils.py +92 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/dataset_utils.py +127 -44
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/datasets.py +18 -29
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/file_paths.py +7 -7
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/forcings.py +40 -38
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/gpkg_utils.py +13 -13
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/graph_utils.py +4 -4
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/s3fs_utils.py +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/subset.py +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/source_validation.py +57 -32
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/__main__.py +3 -2
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/css/main.css +14 -3
- ngiab_data_preprocess-4.3.0/modules/map_app/static/js/main.js +280 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/templates/index.html +10 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/views.py +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/__main__.py +31 -28
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/arguments.py +0 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/forcing_cli.py +10 -19
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/PKG-INFO +15 -11
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/SOURCES.txt +3 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/requires.txt +1 -1
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/pyproject.toml +10 -3
- ngiab_data_preprocess-4.3.0/tests/test_nan_impute.py +200 -0
- ngiab_data_preprocess-4.2.2/modules/map_app/static/js/main.js +0 -161
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/.github/workflows/build_only.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/.github/workflows/publish.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/LICENSE +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/cfe-nowpm-realization-template.json +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/cfe-template.ini +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/em-catchment-template.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/em-config.yml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/em-realization-template.json +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/forcing_template.nc +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/ngen-routing-template.yaml +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/noah-owp-modular-init.namelist.input +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/template.sql +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_sources/triggers.sql +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/__init__.py +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/css/console.css +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/css/toggle.css +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/js/console.js +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/js/data_processing.js +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/resources/loading.gif +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/map_app/static/resources/screenshot.jpg +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_cli/custom_logging.py +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/dependency_links.txt +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/entry_points.txt +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/ngiab_data_preprocess.egg-info/top_level.txt +0 -0
- {ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/setup.cfg +0 -0
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ngiab_data_preprocess
-Version: 4.2.2
+Version: 4.3.0
 Summary: Graphical Tools for creating Next Gen Water model input data.
 Author-email: Josh Cunningham <jcunningham8@ua.edu>
 Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -15,7 +15,7 @@ Requires-Dist: pyogrio>=0.7.2
 Requires-Dist: pyproj>=3.6.1
 Requires-Dist: Flask==3.0.2
 Requires-Dist: geopandas>=1.0.0
-Requires-Dist: requests==2.32.
+Requires-Dist: requests==2.32.4
 Requires-Dist: igraph==0.11.4
 Requires-Dist: s3fs==2024.3.1
 Requires-Dist: xarray==2024.2.0
@@ -47,15 +47,19 @@ This repository contains tools for preparing data to run a [next gen](https://gi
 ## Table of Contents

 1. [What does this tool do?](#what-does-this-tool-do)
-2. [
-
-
-
-
+2. [What does it not do?](#what-does-it-not-do)
+   - [Evaluation](#evaluation)
+   - [Visualisation](#visualisation)
+3. [Requirements](#requirements)
+4. [Installation and Running](#installation-and-running)
+   - [Running without install](#running-without-install)
+5. [For legacy pip installation](#for-legacy-pip-installation)
+6. [Development Installation](#development-installation)
+7. [Usage](#usage)
+8. [CLI Documentation](#cli-documentation)
    - [Arguments](#arguments)
+   - [Usage Notes](#usage-notes)
    - [Examples](#examples)
-   - [File Formats](#file-formats)
-   - [Output](#output)

 ## What does this tool do?

@@ -229,12 +233,12 @@ Once all the steps are finished, you can run NGIAB on the folder shown underneat

 3. Create realization using a lat/lon pair and output to a named folder:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -r --start 2022-01-01 --end 2022-02-28 -o custom_output
    ```

 4. Perform all operations using a lat/lon pair:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -s -f -r --start 2022-01-01 --end 2022-02-28
    ```

 5. Subset hydrofabric using gage ID:
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/README.md
RENAMED

@@ -7,15 +7,19 @@ This repository contains tools for preparing data to run a [next gen](https://gi
 ## Table of Contents

 1. [What does this tool do?](#what-does-this-tool-do)
-2. [
-
-
-
-
+2. [What does it not do?](#what-does-it-not-do)
+   - [Evaluation](#evaluation)
+   - [Visualisation](#visualisation)
+3. [Requirements](#requirements)
+4. [Installation and Running](#installation-and-running)
+   - [Running without install](#running-without-install)
+5. [For legacy pip installation](#for-legacy-pip-installation)
+6. [Development Installation](#development-installation)
+7. [Usage](#usage)
+8. [CLI Documentation](#cli-documentation)
    - [Arguments](#arguments)
+   - [Usage Notes](#usage-notes)
    - [Examples](#examples)
-   - [File Formats](#file-formats)
-   - [Output](#output)

 ## What does this tool do?

@@ -189,12 +193,12 @@ Once all the steps are finished, you can run NGIAB on the folder shown underneat

 3. Create realization using a lat/lon pair and output to a named folder:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -r --start 2022-01-01 --end 2022-02-28 -o custom_output
    ```

 4. Perform all operations using a lat/lon pair:
    ```bash
-   python -m ngiab_data_cli -i
+   python -m ngiab_data_cli -i 33.22,-87.54 -l -s -f -r --start 2022-01-01 --end 2022-02-28
    ```

 5. Subset hydrofabric using gage ID:
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/create_realization.py
RENAMED

@@ -3,15 +3,15 @@ import logging
 import multiprocessing
 import shutil
 import sqlite3
-from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
+from typing import Dict, Optional

 import pandas
 import requests
 import s3fs
 import xarray as xr
-from
+from data_processing.dask_utils import temp_cluster
 from data_processing.file_paths import file_paths
 from data_processing.gpkg_utils import (
     GeoPackage,
@@ -25,7 +25,8 @@ from tqdm.rich import tqdm
 logger = logging.getLogger(__name__)


-def get_approximate_gw_storage(paths: file_paths, start_date: datetime):
+@temp_cluster
+def get_approximate_gw_storage(paths: file_paths, start_date: datetime) -> Dict[str, int]:
     # get the gw levels from the NWM output on a given start date
     # this kind of works in place of warmstates for now
     year = start_date.strftime("%Y")
@@ -35,17 +36,10 @@ def get_approximate_gw_storage(paths: file_paths, start_date: datetime):
     fs = s3fs.S3FileSystem(anon=True)
     nc_url = f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/netcdf/GWOUT/{year}/{formatted_dt}.GWOUT_DOMAIN1"

-    # make sure there's a dask cluster running
-    try:
-        client = Client.current()
-    except ValueError:
-        cluster = LocalCluster()
-        client = Client(cluster)
-
     with fs.open(nc_url) as file_obj:
-        ds = xr.open_dataset(file_obj)
+        ds = xr.open_dataset(file_obj)  # type: ignore

-    water_levels = dict()
+    water_levels: Dict[str, int] = dict()
     for cat, feature in tqdm(cat_to_feature.items()):
         # this value is in CM, we need meters to match max_gw_depth
         # xarray says it's in mm, with 0.1 scale factor. calling .values doesn't apply the scale
@@ -114,13 +108,13 @@ def make_noahowp_config(
             lon=divide_conf_df.loc[divide, "longitude"],
             terrain_slope=divide_conf_df.loc[divide, "mean.slope_1km"],
             azimuth=divide_conf_df.loc[divide, "circ_mean.aspect"],
-            ISLTYP=int(divide_conf_df.loc[divide, "mode.ISLTYP"]),
-            IVGTYP=int(divide_conf_df.loc[divide, "mode.IVGTYP"]),
+            ISLTYP=int(divide_conf_df.loc[divide, "mode.ISLTYP"]),  # type: ignore
+            IVGTYP=int(divide_conf_df.loc[divide, "mode.IVGTYP"]),  # type: ignore
         )
     )


-def get_model_attributes_modspatialite(hydrofabric: Path):
+def get_model_attributes_modspatialite(hydrofabric: Path) -> pandas.DataFrame:
     # modspatialite is faster than pyproj but can't be added as a pip dependency
     # This incantation took a while
     with GeoPackage(hydrofabric) as conn:
@@ -151,7 +145,7 @@ def get_model_attributes_modspatialite(hydrofabric: Path):
     return divide_conf_df


-def get_model_attributes_pyproj(hydrofabric: Path):
+def get_model_attributes_pyproj(hydrofabric: Path) -> pandas.DataFrame:
     # if modspatialite is not available, use pyproj
     with sqlite3.connect(hydrofabric) as conn:
         sql = """
@@ -185,7 +179,7 @@ def get_model_attributes_pyproj(hydrofabric: Path):
     return divide_conf_df


-def get_model_attributes(hydrofabric: Path):
+def get_model_attributes(hydrofabric: Path) -> pandas.DataFrame:
     try:
         with GeoPackage(hydrofabric) as conn:
             conf_df = pandas.read_sql_query(
@@ -259,7 +253,7 @@ def make_em_config(

 def configure_troute(
     cat_id: str, config_dir: Path, start_time: datetime, end_time: datetime
-) ->
+) -> None:
     with open(file_paths.template_troute_config, "r") as file:
         troute_template = file.read()
     time_step_size = 300
@@ -316,7 +310,7 @@ def create_realization(
     start_time: datetime,
     end_time: datetime,
     use_nwm_gw: bool = False,
-    gage_id: str = None,
+    gage_id: Optional[str] = None,
 ):
     paths = file_paths(cat_id)

@@ -354,12 +348,12 @@ def create_realization(
     create_partitions(paths)


-def create_partitions(paths:
+def create_partitions(paths: file_paths, num_partitions: Optional[int] = None) -> None:
     if num_partitions is None:
         num_partitions = multiprocessing.cpu_count()

     cat_to_nex_pairs = get_cat_to_nex_flowpairs(hydrofabric=paths.geopackage_path)
-    nexus = defaultdict(list)
+    # nexus = defaultdict(list)

     # for cat, nex in cat_to_nex_pairs:
     #     nexus[nex].append(cat)
ngiab_data_preprocess-4.3.0/modules/data_processing/dask_utils.py
ADDED

@@ -0,0 +1,92 @@
+import logging
+
+from dask.distributed import Client
+
+logger = logging.getLogger(__name__)
+
+
+def shutdown_cluster():
+    try:
+        client = Client.current()
+        client.shutdown()
+    except ValueError:
+        logger.debug("No cluster found to shutdown")
+
+
+def no_cluster(func):
+    """
+    Decorator that ensures the wrapped function runs with no active Dask cluster.
+
+    This decorator attempts to shut down any existing Dask cluster before
+    executing the wrapped function. If no cluster is found, it logs a debug message
+    and continues execution.
+
+    Parameters:
+        func: The function to be executed without a Dask cluster
+
+    Returns:
+        wrapper: The wrapped function that will be executed without a Dask cluster
+    """
+
+    def wrapper(*args, **kwargs):
+        shutdown_cluster()
+        result = func(*args, **kwargs)
+        return result
+
+    return wrapper
+
+
+def use_cluster(func):
+    """
+    Decorator that ensures the wrapped function has access to a Dask cluster.
+
+    If a Dask cluster is already running, it uses the existing one.
+    If no cluster is available, it creates a new one before executing the function.
+    The cluster remains active after the function completes.
+
+    Parameters:
+        func: The function to be executed with a Dask cluster
+
+    Returns:
+        wrapper: The wrapped function with access to a Dask cluster
+    """
+
+    def wrapper(*args, **kwargs):
+        try:
+            client = Client.current()
+        except ValueError:
+            client = Client()
+        result = func(*args, **kwargs)
+        return result
+
+    return wrapper
+
+
+def temp_cluster(func):
+    """
+    Decorator that provides a temporary Dask cluster for the wrapped function.
+
+    If a Dask cluster is already running, it uses the existing one and leaves it running.
+    If no cluster exists, it creates a temporary one and shuts it down after
+    the function completes.
+
+    Parameters:
+        func: The function to be executed with a Dask cluster
+
+    Returns:
+        wrapper: The wrapped function with access to a Dask cluster
+    """
+
+    def wrapper(*args, **kwargs):
+        cluster_was_running = True
+        try:
+            client = Client.current()
+        except ValueError:
+            cluster_was_running = False
+            client = Client()
+        result = func(*args, **kwargs)
+        if not cluster_was_running:
+            client.shutdown()
+        return result
+
+    return wrapper
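The new dask_utils.py centralizes the cluster bootstrapping that create_realization.py previously did inline (the removed try/except LocalCluster block): get_approximate_gw_storage is now wrapped with @temp_cluster, and dataset_utils.py wraps save_dataset and save_to_cache with @use_cluster. A minimal usage sketch, assuming ngiab_data_preprocess 4.3.0 is installed so data_processing.dask_utils imports; the decorated functions below are hypothetical examples, not part of the package:

```python
from dask.distributed import Client

from data_processing.dask_utils import temp_cluster, use_cluster


@temp_cluster
def summarize_once(values):
    # Hypothetical example: a Dask client is guaranteed inside the wrapper; if
    # none was running before the call, temp_cluster creates one and shuts it
    # down after the function returns.
    return Client.current().submit(sum, values).result()


@use_cluster
def summarize_and_keep_cluster(values):
    # Same guarantee, but a cluster created here stays up for later calls.
    return Client.current().submit(sum, values).result()


if __name__ == "__main__":
    print(summarize_once([1, 2, 3]))              # temporary cluster
    print(summarize_and_keep_cluster([4, 5, 6]))  # cluster left running
```

Note that the wrappers do not apply functools.wraps, so decorated functions lose their original __name__ and docstring.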
{ngiab_data_preprocess-4.2.2 → ngiab_data_preprocess-4.3.0}/modules/data_processing/dataset_utils.py
RENAMED
@@ -1,19 +1,22 @@
 import logging
 import os
+from datetime import datetime
 from pathlib import Path
-from typing import Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union

 import geopandas as gpd
 import numpy as np
 import xarray as xr
-from
-import
+from xarray.core.types import InterpOptions
+from dask.distributed import Client, progress, Future
+from data_processing.dask_utils import use_cluster

 logger = logging.getLogger(__name__)

 # known ngen variable names
 # https://github.com/CIROH-UA/ngen/blob/4fb5bb68dc397298bca470dfec94db2c1dcb42fe/include/forcing/AorcForcing.hpp#L77

+
 def validate_dataset_format(dataset: xr.Dataset) -> None:
     """
     Validate the format of the dataset.
@@ -41,8 +44,9 @@ def validate_dataset_format(dataset: xr.Dataset) -> None:
     if "name" not in dataset.attrs:
         raise ValueError("Dataset must have a name attribute to identify it")

+
 def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]:
-
+    """
     Ensure that all selected times are in the passed dataset.

     Parameters
@@ -60,7 +64,7 @@ def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) ->
         start_time, or if not available, earliest available timestep in dataset.
     str
         end_time, or if not available, latest available timestep in dataset.
-
+    """
     end_time_in_dataset = dataset.time.isel(time=-1).values
     start_time_in_dataset = dataset.time.isel(time=0).values
     if np.datetime64(start_time) < start_time_in_dataset:
@@ -77,7 +81,10 @@

 def clip_dataset_to_bounds(
-    dataset: xr.Dataset,
+    dataset: xr.Dataset,
+    bounds: Tuple[float, float, float, float] | np.ndarray[tuple[int], np.dtype[np.float64]],
+    start_time: str,
+    end_time: str,
 ) -> xr.Dataset:
     """
     Clip the dataset to specified geographical bounds.
@@ -86,14 +93,14 @@
     ----------
     dataset : xr.Dataset
         Dataset to be clipped.
-    bounds : tuple[float, float, float, float]
-        Corners of bounding box. bounds[0] is x_min, bounds[1] is y_min,
+    bounds : tuple[float, float, float, float] | np.ndarray[tuple[int], np.dtype[np.float64]]
+        Corners of bounding box. bounds[0] is x_min, bounds[1] is y_min,
         bounds[2] is x_max, bounds[3] is y_max.
     start_time : str
         Desired start time in YYYY/MM/DD HH:MM:SS format.
     end_time : str
         Desired end time in YYYY/MM/DD HH:MM:SS format.
-
+
     Returns
     -------
     xr.Dataset
@@ -110,33 +117,103 @@
     return dataset


-def
-
-
+def interpolate_nan_values(
+    dataset: xr.Dataset,
+    variables: Optional[List[str]] = None,
+    dim: str = "time",
+    method: InterpOptions = "nearest",
+    fill_value: str = "extrapolate",
+) -> None:
+    """
+    Interpolates NaN values in specified (or all numeric time-dependent)
+    variables of an xarray.Dataset. Operates inplace on the dataset.

-
-
+    Parameters
+    ----------
+    dataset : xr.Dataset
+        The input dataset.
+    variables : Optional[List[str]], optional
+        A list of variable names to process. If None (default),
+        all numeric variables containing the specified dimension will be processed.
+    dim : str, optional
+        The dimension along which to interpolate (default is "time").
+    method : str, optional
+        Interpolation method to use (e.g., "linear", "nearest", "cubic").
+        Default is "nearest".
+    fill_value : str, optional
+        Method for filling NaNs at the start/end of the series after interpolation.
+        Set to "extrapolate" to fill with the nearest valid value when using 'nearest' or 'linear'.
+        Default is "extrapolate".
+    """
+    for name, var in dataset.data_vars.items():
+        # if the variable is non-numeric, skip
+        if not np.issubdtype(var.dtype, np.number):
+            continue
+        # if there are no NANs, skip
+        if not var.isnull().any().compute():
+            continue
+
+        dataset[name] = var.interpolate_na(
+            dim=dim,
+            method=method,
+            fill_value=fill_value if method in ["nearest", "linear"] else None,
+        )

-    # sort of terrible work around for half downloaded files
-    temp_path = cached_nc_path.with_suffix(".downloading.nc")
-    if os.path.exists(temp_path):
-        os.remove(temp_path)

-
-
-
-
+@use_cluster
+def save_dataset(
+    ds_to_save: xr.Dataset,
+    target_path: Path,
+    engine: Literal["netcdf4", "scipy", "h5netcdf"] = "h5netcdf",
+):
+    """
+    Helper function to compute and save an xarray.Dataset to a NetCDF file.
+    Uses a temporary file and rename for atomicity.
+    """
+    if not target_path.parent.exists():
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+    temp_file_path = target_path.with_name(target_path.name + ".saving.nc")
+    if temp_file_path.exists():
+        os.remove(temp_file_path)

     client = Client.current()
-    future = client.compute(
-
+    future: Future = client.compute(
+        ds_to_save.to_netcdf(temp_file_path, engine=engine, compute=False)
+    )  # type: ignore
+    logger.debug(
+        f"NetCDF write task submitted to Dask. Waiting for completion to {temp_file_path}..."
+    )
     progress(future)
     future.result()
+    os.rename(str(temp_file_path), str(target_path))
+    logger.info(f"Successfully saved data to: {target_path}")
+
+
+@use_cluster
+def save_to_cache(
+    stores: xr.Dataset, cached_nc_path: Path, interpolate_nans: bool = True
+) -> xr.Dataset:
+    """
+    Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth.
+    """
+    logger.info(f"Processing dataset for caching. Final cache target: {cached_nc_path}")

-
+    # lasily cast all numbers to f32
+    for name, var in stores.data_vars.items():
+        if np.issubdtype(var.dtype, np.number):
+            stores[name] = var.astype("float32", casting="same_kind")

-
-
+    # save dataset locally before manipulating it
+    save_dataset(stores, cached_nc_path)
+    stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+
+    if interpolate_nans:
+        interpolate_nan_values(dataset=stores)
+        save_dataset(stores, cached_nc_path)
+        stores = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+
+    return stores


 def check_local_cache(
@@ -144,9 +221,8 @@ def check_local_cache(
     start_time: str,
     end_time: str,
     gdf: gpd.GeoDataFrame,
-    remote_dataset: xr.Dataset
+    remote_dataset: xr.Dataset,
 ) -> Union[xr.Dataset, None]:
-
     merged_data = None

     if not os.path.exists(cached_nc_path):
@@ -155,9 +231,7 @@ def check_local_cache(

     logger.info("Found cached nc file")
     # open the cached file and check that the time range is correct
-    cached_data = xr.open_mfdataset(
-        cached_nc_path, parallel=True, engine="h5netcdf"
-    )
+    cached_data = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")

     if "name" not in cached_data.attrs or "name" not in remote_dataset.attrs:
         logger.warning("No name attribute found to compare datasets")
@@ -166,9 +240,9 @@ def check_local_cache(
         logger.warning("Cached data from different source, .name attr doesn't match")
         return

-    range_in_cache = cached_data.time[0].values <= np.datetime64(
-
-
+    range_in_cache = cached_data.time[0].values <= np.datetime64(start_time) and cached_data.time[
+        -1
+    ].values >= np.datetime64(end_time)

     if not range_in_cache:
         # the cache does not contain the desired time range
@@ -186,10 +260,8 @@ def check_local_cache(
     if range_in_cache:
         logger.info("Time range is within cached data")
         logger.debug(f"Opened cached nc file: [{cached_nc_path}]")
-        merged_data = clip_dataset_to_bounds(
-
-        )
-        logger.debug("Clipped stores")
+        merged_data = clip_dataset_to_bounds(cached_data, gdf.total_bounds, start_time, end_time)
+        logger.debug("Clipped stores")

     return merged_data

@@ -197,16 +269,27 @@ def check_local_cache(
 def save_and_clip_dataset(
     dataset: xr.Dataset,
     gdf: gpd.GeoDataFrame,
-    start_time: datetime
-    end_time: datetime
+    start_time: datetime,
+    end_time: datetime,
     cache_location: Path,
 ) -> xr.Dataset:
     """convenience function clip the remote dataset, and either load from cache or save to cache if it's not present"""
     gdf = gdf.to_crs(dataset.crs)

-    cached_data = check_local_cache(
+    cached_data = check_local_cache(
+        cache_location,
+        start_time,  # type: ignore
+        end_time,  # type: ignore
+        gdf,
+        dataset,
+    )

     if not cached_data:
-        clipped_data = clip_dataset_to_bounds(
+        clipped_data = clip_dataset_to_bounds(
+            dataset,
+            gdf.total_bounds,
+            start_time,  # type: ignore
+            end_time,  # type: ignore
+        )
         cached_data = save_to_cache(clipped_data, cache_location)
-    return cached_data
+    return cached_data