ngiab-data-preprocess 3.3.2__tar.gz → 4.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/PKG-INFO +4 -3
  2. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/README.md +3 -2
  3. ngiab_data_preprocess-4.0.0/modules/data_processing/dataset_utils.py +212 -0
  4. ngiab_data_preprocess-4.0.0/modules/data_processing/datasets.py +87 -0
  5. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_processing/forcings.py +230 -75
  6. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_processing/gpkg_utils.py +2 -2
  7. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_processing/subset.py +2 -2
  8. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/source_validation.py +4 -1
  9. ngiab_data_preprocess-4.0.0/modules/map_app/static/css/toggle.css +82 -0
  10. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/js/data_processing.js +10 -1
  11. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/js/main.js +17 -0
  12. ngiab_data_preprocess-4.0.0/modules/map_app/static/resources/screenshot.jpg +0 -0
  13. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/templates/index.html +15 -3
  14. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/views.py +18 -2
  15. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_cli/__main__.py +19 -15
  16. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_cli/arguments.py +7 -0
  17. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_cli/forcing_cli.py +36 -7
  18. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_preprocess.egg-info/PKG-INFO +4 -3
  19. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_preprocess.egg-info/SOURCES.txt +4 -2
  20. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/pyproject.toml +1 -1
  21. ngiab_data_preprocess-3.3.2/modules/data_processing/zarr_utils.py +0 -162
  22. ngiab_data_preprocess-3.3.2/modules/map_app/static/resources/screenshot.png +0 -0
  23. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/.github/workflows/build_only.yml +0 -0
  24. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/.github/workflows/publish.yml +0 -0
  25. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/.gitignore +0 -0
  26. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/LICENSE +0 -0
  27. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/map.html +0 -0
  28. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_processing/create_realization.py +0 -0
  29. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_processing/file_paths.py +0 -0
  30. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_processing/graph_utils.py +0 -0
  31. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_processing/s3fs_utils.py +0 -0
  32. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/cfe-nowpm-realization-template.json +0 -0
  33. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/cfe-template.ini +0 -0
  34. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/em-catchment-template.yml +0 -0
  35. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/em-config.yml +0 -0
  36. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/em-realization-template.json +0 -0
  37. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/forcing_template.nc +0 -0
  38. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/ngen-routing-template.yaml +0 -0
  39. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/noah-owp-modular-init.namelist.input +0 -0
  40. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/template.sql +0 -0
  41. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/data_sources/triggers.sql +0 -0
  42. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/__init__.py +0 -0
  43. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/__main__.py +0 -0
  44. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/css/console.css +0 -0
  45. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/css/main.css +0 -0
  46. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/js/console.js +0 -0
  47. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/resources/dark-style.json +0 -0
  48. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/resources/light-style.json +0 -0
  49. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/map_app/static/resources/loading.gif +0 -0
  50. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_cli/custom_logging.py +0 -0
  51. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_preprocess.egg-info/dependency_links.txt +0 -0
  52. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_preprocess.egg-info/entry_points.txt +0 -0
  53. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_preprocess.egg-info/requires.txt +0 -0
  54. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/modules/ngiab_data_preprocess.egg-info/top_level.txt +0 -0
  55. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/output/.gitkeep +0 -0
  56. {ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/setup.cfg +0 -0
{ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: ngiab_data_preprocess
- Version: 3.3.2
+ Version: 4.0.0
  Summary: Graphical Tools for creating Next Gen Water model input data.
  Author-email: Josh Cunningham <jcunningham8@ua.edu>
  Project-URL: Homepage, https://github.com/CIROH-UA/NGIAB_data_preprocess
@@ -41,7 +41,7 @@ Requires-Dist: ngiab_eval[plot]; extra == "plot"

  This repository contains tools for preparing data to run a [next gen](https://github.com/NOAA-OWP/ngen) simulation using [NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra). The tools allow you to select a catchment of interest on an interactive map, choose a date range, and prepare the data with just a few clicks!

- ![map screenshot](https://github.com/CIROH-UA/NGIAB_data_preprocess/blob/main/modules/map_app/static/resources/screenshot.png)
+ ![map screenshot](https://github.com/CIROH-UA/NGIAB_data_preprocess/blob/main/modules/map_app/static/resources/screenshot.jpg)

  ## Table of Contents

@@ -60,7 +60,7 @@ This repository contains tools for preparing data to run a [next gen](https://gi

  This tool prepares data to run a next gen simulation by creating a run package that can be used with NGIAB.
  It uses geometry and model attributes from the [v2.2 hydrofabric](https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.2/conus/conus_nextgen.gpkg) more information on [all data sources here](https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.2/hfv2.2-data_model.html).
- The raw forcing data is [nwm retrospective v3 forcing](https://noaa-nwm-retrospective-3-0-pds.s3.amazonaws.com/index.html#CONUS/zarr/forcing/) data.
+ The raw forcing data is [nwm retrospective v3 forcing](https://noaa-nwm-retrospective-3-0-pds.s3.amazonaws.com/index.html#CONUS/zarr/forcing/) data or the [AORC 1km gridded data](https://noaa-nws-aorc-v1-1-1km.s3.amazonaws.com/index.html) depending on user input

  1. **Subset** (delineate) everything upstream of your point of interest (catchment, gage, flowpath etc). Outputs as a geopackage.
  2. **Calculates** Forcings as a weighted mean of the gridded AORC forcings. Weights are calculated using [exact extract](https://isciences.github.io/exactextract/) and computed with numpy.
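
The weighted-mean step above is the core of the forcing regridding: exactextract supplies, for each catchment polygon, the set of overlapped grid cells and the fraction of each cell covered, and numpy reduces the gridded values to one time series per catchment. A minimal numpy sketch of that reduction, with hypothetical precomputed indices and weights (not the package's actual data structures):

```python
import numpy as np

# Hypothetical exactextract output for one catchment: flat indices of the
# grid cells the polygon overlaps, and the covered fraction of each cell.
cell_indices = np.array([1204, 1205, 1304])
coverage = np.array([0.10, 0.85, 0.40])

def catchment_mean(gridded: np.ndarray, idx: np.ndarray, w: np.ndarray) -> np.ndarray:
    """Coverage-weighted mean over a (time, n_flat_cells) forcing array."""
    values = gridded[:, idx]                    # (time, n_cells) for this catchment
    return (values * w).sum(axis=1) / w.sum()   # one value per timestep

forcing = np.random.rand(24, 2000).astype("float32")  # 24 hourly steps, flattened grid
print(catchment_mean(forcing, cell_indices, coverage).shape)  # (24,)
```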
@@ -161,6 +161,7 @@ Once all the steps are finished, you can run NGIAB on the folder shown underneat
  - `--start_date START_DATE`, `--start START_DATE`: Start date for forcings/realization (format YYYY-MM-DD).
  - `--end_date END_DATE`, `--end END_DATE`: End date for forcings/realization (format YYYY-MM-DD).
  - `-o OUTPUT_NAME`, `--output_name OUTPUT_NAME`: Name of the output folder.
+ - `--source` : The datasource you want to use, either `nwm` for retrospective v3 or `aorc`. Default is `nwm`
  - `-D`, `--debug`: Enable debug logging.
  - `--run`: Automatically run Next Gen against the output folder.
  - `--validate`: Run every missing step required to run ngiab.
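
The new `--source` flag documented above pairs with the +7 lines in `modules/ngiab_data_cli/arguments.py`. A hedged argparse sketch of how such a flag could be declared; this is illustrative only, not the package's exact wiring:

```python
import argparse

# Illustrative declaration of a source flag like the one documented above.
# The real definition lives in modules/ngiab_data_cli/arguments.py and may differ.
parser = argparse.ArgumentParser(prog="ngiab_data_cli")
parser.add_argument(
    "--source",
    choices=["nwm", "aorc"],
    default="nwm",
    help="Forcing datasource: 'nwm' retrospective v3 or 'aorc' 1km gridded",
)
args = parser.parse_args(["--source", "aorc"])
print(args.source)  # -> aorc
```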
{ngiab_data_preprocess-3.3.2 → ngiab_data_preprocess-4.0.0}/README.md
@@ -2,7 +2,7 @@

  This repository contains tools for preparing data to run a [next gen](https://github.com/NOAA-OWP/ngen) simulation using [NGIAB](https://github.com/CIROH-UA/NGIAB-CloudInfra). The tools allow you to select a catchment of interest on an interactive map, choose a date range, and prepare the data with just a few clicks!

- ![map screenshot](https://github.com/CIROH-UA/NGIAB_data_preprocess/blob/main/modules/map_app/static/resources/screenshot.png)
+ ![map screenshot](https://github.com/CIROH-UA/NGIAB_data_preprocess/blob/main/modules/map_app/static/resources/screenshot.jpg)

  ## Table of Contents

@@ -21,7 +21,7 @@ This repository contains tools for preparing data to run a [next gen](https://gi

  This tool prepares data to run a next gen simulation by creating a run package that can be used with NGIAB.
  It uses geometry and model attributes from the [v2.2 hydrofabric](https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.2/conus/conus_nextgen.gpkg) more information on [all data sources here](https://lynker-spatial.s3-us-west-2.amazonaws.com/hydrofabric/v2.2/hfv2.2-data_model.html).
- The raw forcing data is [nwm retrospective v3 forcing](https://noaa-nwm-retrospective-3-0-pds.s3.amazonaws.com/index.html#CONUS/zarr/forcing/) data.
+ The raw forcing data is [nwm retrospective v3 forcing](https://noaa-nwm-retrospective-3-0-pds.s3.amazonaws.com/index.html#CONUS/zarr/forcing/) data or the [AORC 1km gridded data](https://noaa-nws-aorc-v1-1-1km.s3.amazonaws.com/index.html) depending on user input

  1. **Subset** (delineate) everything upstream of your point of interest (catchment, gage, flowpath etc). Outputs as a geopackage.
  2. **Calculates** Forcings as a weighted mean of the gridded AORC forcings. Weights are calculated using [exact extract](https://isciences.github.io/exactextract/) and computed with numpy.
@@ -122,6 +122,7 @@ Once all the steps are finished, you can run NGIAB on the folder shown underneat
  - `--start_date START_DATE`, `--start START_DATE`: Start date for forcings/realization (format YYYY-MM-DD).
  - `--end_date END_DATE`, `--end END_DATE`: End date for forcings/realization (format YYYY-MM-DD).
  - `-o OUTPUT_NAME`, `--output_name OUTPUT_NAME`: Name of the output folder.
+ - `--source` : The datasource you want to use, either `nwm` for retrospective v3 or `aorc`. Default is `nwm`
  - `-D`, `--debug`: Enable debug logging.
  - `--run`: Automatically run Next Gen against the output folder.
  - `--validate`: Run every missing step required to run ngiab.
ngiab_data_preprocess-4.0.0/modules/data_processing/dataset_utils.py
@@ -0,0 +1,212 @@
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Tuple, Union
+
+ import geopandas as gpd
+ import numpy as np
+ import xarray as xr
+ from dask.distributed import Client, progress
+ import datetime
+
+ logger = logging.getLogger(__name__)
+
+ # known ngen variable names
+ # https://github.com/CIROH-UA/ngen/blob/4fb5bb68dc397298bca470dfec94db2c1dcb42fe/include/forcing/AorcForcing.hpp#L77
+
+ def validate_dataset_format(dataset: xr.Dataset) -> None:
+     """
+     Validate the format of the dataset.
+
+     Parameters
+     ----------
+     dataset : xr.Dataset
+         Dataset to be validated.
+
+     Raises
+     ------
+     ValueError
+         If the dataset is not in the correct format.
+     """
+     if "time" not in dataset.coords:
+         raise ValueError("Dataset must have a 'time' coordinate")
+     if not np.issubdtype(dataset.time.dtype, np.datetime64):
+         raise ValueError("Time coordinate must be a numpy datetime64 type")
+     if "x" not in dataset.coords:
+         raise ValueError("Dataset must have an 'x' coordinate")
+     if "y" not in dataset.coords:
+         raise ValueError("Dataset must have a 'y' coordinate")
+     if "crs" not in dataset.attrs:
+         raise ValueError("Dataset must have a 'crs' attribute")
+     if "name" not in dataset.attrs:
+         raise ValueError("Dataset must have a name attribute to identify it")
+
+ def validate_time_range(dataset: xr.Dataset, start_time: str, end_time: str) -> Tuple[str, str]:
+     '''
+     Ensure that all selected times are in the passed dataset.
+
+     Parameters
+     ----------
+     dataset : xr.Dataset
+         Dataset with a time coordinate.
+     start_time : str
+         Desired start time in YYYY/MM/DD HH:MM:SS format.
+     end_time : str
+         Desired end time in YYYY/MM/DD HH:MM:SS format.
+
+     Returns
+     -------
+     str
+         start_time, or if not available, earliest available timestep in dataset.
+     str
+         end_time, or if not available, latest available timestep in dataset.
+     '''
+     end_time_in_dataset = dataset.time.isel(time=-1).values
+     start_time_in_dataset = dataset.time.isel(time=0).values
+     if np.datetime64(start_time) < start_time_in_dataset:
+         logger.warning(
+             f"provided start {start_time} is before the start of the dataset {start_time_in_dataset}, selecting from {start_time_in_dataset}"
+         )
+         start_time = start_time_in_dataset
+     if np.datetime64(end_time) > end_time_in_dataset:
+         logger.warning(
+             f"provided end {end_time} is after the end of the dataset {end_time_in_dataset}, selecting until {end_time_in_dataset}"
+         )
+         end_time = end_time_in_dataset
+     return start_time, end_time
+
+
+ def clip_dataset_to_bounds(
+     dataset: xr.Dataset, bounds: Tuple[float, float, float, float], start_time: str, end_time: str
+ ) -> xr.Dataset:
+     """
+     Clip the dataset to specified geographical bounds.
+
+     Parameters
+     ----------
+     dataset : xr.Dataset
+         Dataset to be clipped.
+     bounds : tuple[float, float, float, float]
+         Corners of bounding box. bounds[0] is x_min, bounds[1] is y_min,
+         bounds[2] is x_max, bounds[3] is y_max.
+     start_time : str
+         Desired start time in YYYY/MM/DD HH:MM:SS format.
+     end_time : str
+         Desired end time in YYYY/MM/DD HH:MM:SS format.
+
+     Returns
+     -------
+     xr.Dataset
+         Clipped dataset.
+     """
+     # check time range here in case just this function is imported and not the whole module
+     start_time, end_time = validate_time_range(dataset, start_time, end_time)
+     dataset = dataset.sel(
+         x=slice(bounds[0], bounds[2]),
+         y=slice(bounds[1], bounds[3]),
+         time=slice(start_time, end_time),
+     )
+     logger.info("Selected time range and clipped to bounds")
+     return dataset
+
+
+ def save_to_cache(stores: xr.Dataset, cached_nc_path: Path) -> xr.Dataset:
+     """Compute the store and save it to a cached netCDF file. This is not required but will save time and bandwidth."""
+     logger.info("Downloading and caching forcing data, this may take a while")
+
+     if not cached_nc_path.parent.exists():
+         cached_nc_path.parent.mkdir(parents=True)
+
+     # sort of terrible work around for half downloaded files
+     temp_path = cached_nc_path.with_suffix(".downloading.nc")
+     if os.path.exists(temp_path):
+         os.remove(temp_path)
+
+     ## Cast every single variable to float32 to save space to save a lot of memory issues later
+     ## easier to do it now in this slow download step than later in the steps without dask
+     for var in stores.data_vars:
+         stores[var] = stores[var].astype("float32")
+
+     client = Client.current()
+     future = client.compute(stores.to_netcdf(temp_path, compute=False))
+     # Display progress bar
+     progress(future)
+     future.result()
+
+     os.rename(temp_path, cached_nc_path)
+
+     data = xr.open_mfdataset(cached_nc_path, parallel=True, engine="h5netcdf")
+     return data
+
+
+ def check_local_cache(
+     cached_nc_path: Path,
+     start_time: str,
+     end_time: str,
+     gdf: gpd.GeoDataFrame,
+     remote_dataset: xr.Dataset
+ ) -> Union[xr.Dataset, None]:
+
+     merged_data = None
+
+     if not os.path.exists(cached_nc_path):
+         logger.info("No cache found")
+         return
+
+     logger.info("Found cached nc file")
+     # open the cached file and check that the time range is correct
+     cached_data = xr.open_mfdataset(
+         cached_nc_path, parallel=True, engine="h5netcdf"
+     )
+
+     if "name" not in cached_data.attrs or "name" not in remote_dataset.attrs:
+         logger.warning("No name attribute found to compare datasets")
+         return
+     if cached_data.name != remote_dataset.name:
+         logger.warning("Cached data from different source, .name attr doesn't match")
+         return
+
+     range_in_cache = cached_data.time[0].values <= np.datetime64(
+         start_time
+     ) and cached_data.time[-1].values >= np.datetime64(end_time)
+
+     if not range_in_cache:
+         # the cache does not contain the desired time range
+         logger.warning("Requested time range not in cache")
+         return
+
+     cached_vars = cached_data.data_vars.keys()
+     forcing_vars = remote_dataset.data_vars.keys()
+     # replace rainrate with precip
+     missing_vars = set(forcing_vars) - set(cached_vars)
+     if len(missing_vars) > 0:
+         logger.warning(f"Missing forcing vars in cache: {missing_vars}")
+         return
+
+     if range_in_cache:
+         logger.info("Time range is within cached data")
+         logger.debug(f"Opened cached nc file: [{cached_nc_path}]")
+         merged_data = clip_dataset_to_bounds(
+             cached_data, gdf.total_bounds, start_time, end_time
+         )
+         logger.debug("Clipped stores")
+
+     return merged_data
+
+
+ def save_and_clip_dataset(
+     dataset: xr.Dataset,
+     gdf: gpd.GeoDataFrame,
+     start_time: datetime.datetime,
+     end_time: datetime.datetime,
+     cache_location: Path,
+ ) -> xr.Dataset:
+     """convenience function clip the remote dataset, and either load from cache or save to cache if it's not present"""
+     gdf = gdf.to_crs(dataset.crs)
+
+     cached_data = check_local_cache(cache_location, start_time, end_time, gdf, dataset)
+
+     if not cached_data:
+         clipped_data = clip_dataset_to_bounds(dataset, gdf.total_bounds, start_time, end_time)
+         cached_data = save_to_cache(clipped_data, cache_location)
+     return cached_data
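
The new `dataset_utils.py` module forms a load → validate → clip → cache pipeline, with `save_and_clip_dataset` as the convenience wrapper. A hedged end-to-end sketch of how these helpers compose; the geopackage path and layer name are placeholders, anonymous S3 access and a sizeable download are assumed, and `save_to_cache` requires a running Dask client:

```python
from pathlib import Path

import geopandas as gpd
from dask.distributed import Client

from data_processing.dataset_utils import save_and_clip_dataset
from data_processing.datasets import load_aorc_zarr

client = Client()  # save_to_cache calls Client.current(), so start one first

# Placeholder subset geopackage and layer name; substitute your own.
gdf = gpd.read_file("subset.gpkg", layer="divides")

dataset = load_aorc_zarr(start_year=2020, end_year=2020)
forcing = save_and_clip_dataset(
    dataset,
    gdf,                       # reprojected internally to the dataset's crs attr
    "2020-06-01T00:00:00",     # ISO strings compare cleanly via np.datetime64
    "2020-06-30T23:00:00",
    Path("cache/forcing.nc"),  # reused on later runs if the window still fits
)
print(forcing)
```

On a second run over the same window, `check_local_cache` short-circuits the download and clips the cached netCDF instead.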
ngiab_data_preprocess-4.0.0/modules/data_processing/datasets.py
@@ -0,0 +1,87 @@
+ import logging
+
+ import s3fs
+ from data_processing.s3fs_utils import S3ParallelFileSystem
+ import xarray as xr
+ from dask.distributed import Client, LocalCluster
+ from data_processing.dataset_utils import validate_dataset_format
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_v3_retrospective_zarr(forcing_vars: list[str] = None) -> xr.Dataset:
+     """Load zarr datasets from S3 within the specified time range."""
+     # if a LocalCluster is not already running, start one
+     if not forcing_vars:
+         forcing_vars = ["lwdown", "precip", "psfc", "q2d", "swdown", "t2d", "u2d", "v2d"]
+     try:
+         client = Client.current()
+     except ValueError:
+         cluster = LocalCluster()
+         client = Client(cluster)
+     s3_urls = [
+         f"s3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/forcing/{var}.zarr"
+         for var in forcing_vars
+     ]
+     # default cache is readahead which is detrimental to performance in this case
+     fs = S3ParallelFileSystem(anon=True, default_cache_type="none")  # default_block_size
+     s3_stores = [s3fs.S3Map(url, s3=fs) for url in s3_urls]
+     # the cache option here just holds accessed data in memory to prevent s3 being queried multiple times
+     # most of the data is read once and written to disk but some of the coordinate data is read multiple times
+     dataset = xr.open_mfdataset(s3_stores, parallel=True, engine="zarr", cache=True)
+
+     # set the crs attribute to conform with the format
+     esri_pe_string = dataset.crs.esri_pe_string
+     dataset = dataset.drop_vars(["crs"])
+     dataset.attrs["crs"] = esri_pe_string
+     dataset.attrs["name"] = "v3_retrospective_zarr"
+
+     # rename the data vars to work with ngen
+     variables = {
+         "LWDOWN": "DLWRF_surface",
+         "PSFC": "PRES_surface",
+         "Q2D": "SPFH_2maboveground",
+         "RAINRATE": "precip_rate",
+         "SWDOWN": "DSWRF_surface",
+         "T2D": "TMP_2maboveground",
+         "U2D": "UGRD_10maboveground",
+         "V2D": "VGRD_10maboveground",
+     }
+     dataset = dataset.rename_vars(variables)
+
+     validate_dataset_format(dataset)
+     return dataset
+
+
+ def load_aorc_zarr(start_year: int = None, end_year: int = None) -> xr.Dataset:
+     """Load the aorc zarr dataset from S3."""
+     if not start_year or not end_year:
+         logger.warning("No start or end year provided, defaulting to 1979-2023")
+         logger.warning("To reduce the time taken to load the data, provide a smaller range")
+     if not start_year:
+         start_year = 1979
+     if not end_year:
+         end_year = 2023
+     try:
+         client = Client.current()
+     except ValueError:
+         cluster = LocalCluster()
+         client = Client(cluster)
+
+     logger.info(f"Loading AORC zarr datasets from {start_year} to {end_year}")
+     estimated_time_s = ((end_year - start_year) * 2.5) + 3.5
+     # from testing, it's about 2.1s per year + 3.5s overhead
+     logger.info(f"This should take roughly {estimated_time_s} seconds")
+     fs = S3ParallelFileSystem(anon=True, default_cache_type="none")
+     s3_url = "s3://noaa-nws-aorc-v1-1-1km/"
+     urls = [f"{s3_url}{i}.zarr" for i in range(start_year, end_year+1)]
+     filestores = [s3fs.S3Map(url, s3=fs) for url in urls]
+     dataset = xr.open_mfdataset(filestores, parallel=True, engine="zarr", cache=True)
+     dataset.attrs["crs"] = "+proj=longlat +datum=WGS84 +no_defs"
+     dataset.attrs["name"] = "aorc_1km_zarr"
+     # rename latitude and longitude to x and y
+     dataset = dataset.rename({"latitude": "y", "longitude": "x"})
+
+     validate_dataset_format(dataset)
+     return dataset
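
Both loaders return a lazily opened dataset that already satisfies `validate_dataset_format` (x/y coordinates, datetime64 time, `crs` and `name` attrs). A hedged usage sketch, assuming anonymous access to the two public S3 buckets; opening is metadata-only, so nothing large is downloaded until values are actually computed:

```python
from data_processing.datasets import load_aorc_zarr, load_v3_retrospective_zarr

# Lazily open all eight NWM retrospective v3 forcing stores.
nwm = load_v3_retrospective_zarr()
print(sorted(nwm.data_vars))      # ngen names, e.g. DLWRF_surface, precip_rate

# Open a single year of AORC; latitude/longitude become y/x.
aorc = load_aorc_zarr(start_year=2021, end_year=2021)
print(aorc.attrs["name"], aorc.attrs["crs"])  # aorc_1km_zarr, +proj=longlat ...
```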