roms_tools-3.1.1-py3-none-any.whl → roms_tools-3.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- roms_tools/__init__.py +8 -1
- roms_tools/analysis/cdr_analysis.py +203 -0
- roms_tools/analysis/cdr_ensemble.py +198 -0
- roms_tools/analysis/roms_output.py +80 -46
- roms_tools/data/grids/GLORYS_global_grid.nc +0 -0
- roms_tools/download.py +4 -0
- roms_tools/plot.py +131 -30
- roms_tools/regrid.py +6 -1
- roms_tools/setup/boundary_forcing.py +94 -44
- roms_tools/setup/cdr_forcing.py +123 -15
- roms_tools/setup/cdr_release.py +161 -8
- roms_tools/setup/datasets.py +709 -341
- roms_tools/setup/grid.py +167 -139
- roms_tools/setup/initial_conditions.py +113 -48
- roms_tools/setup/mask.py +63 -7
- roms_tools/setup/nesting.py +67 -42
- roms_tools/setup/river_forcing.py +45 -19
- roms_tools/setup/surface_forcing.py +16 -10
- roms_tools/setup/tides.py +1 -2
- roms_tools/setup/topography.py +4 -4
- roms_tools/setup/utils.py +134 -22
- roms_tools/tests/test_analysis/test_cdr_analysis.py +144 -0
- roms_tools/tests/test_analysis/test_cdr_ensemble.py +202 -0
- roms_tools/tests/test_analysis/test_roms_output.py +61 -3
- roms_tools/tests/test_setup/test_boundary_forcing.py +111 -52
- roms_tools/tests/test_setup/test_cdr_forcing.py +54 -0
- roms_tools/tests/test_setup/test_cdr_release.py +118 -1
- roms_tools/tests/test_setup/test_datasets.py +458 -34
- roms_tools/tests/test_setup/test_grid.py +238 -121
- roms_tools/tests/test_setup/test_initial_conditions.py +94 -41
- roms_tools/tests/test_setup/test_surface_forcing.py +28 -3
- roms_tools/tests/test_setup/test_utils.py +91 -1
- roms_tools/tests/test_setup/test_validation.py +21 -15
- roms_tools/tests/test_setup/utils.py +71 -0
- roms_tools/tests/test_tiling/test_join.py +241 -0
- roms_tools/tests/test_tiling/test_partition.py +45 -0
- roms_tools/tests/test_utils.py +224 -2
- roms_tools/tiling/join.py +189 -0
- roms_tools/tiling/partition.py +44 -30
- roms_tools/utils.py +488 -161
- {roms_tools-3.1.1.dist-info → roms_tools-3.2.0.dist-info}/METADATA +15 -4
- {roms_tools-3.1.1.dist-info → roms_tools-3.2.0.dist-info}/RECORD +45 -37
- {roms_tools-3.1.1.dist-info → roms_tools-3.2.0.dist-info}/WHEEL +0 -0
- {roms_tools-3.1.1.dist-info → roms_tools-3.2.0.dist-info}/licenses/LICENSE +0 -0
- {roms_tools-3.1.1.dist-info → roms_tools-3.2.0.dist-info}/top_level.txt +0 -0
roms_tools/utils.py
CHANGED
@@ -1,26 +1,213 @@
 import glob
 import logging
 import re
+import textwrap
 import warnings
+from collections.abc import Callable, Iterable, Sequence
+from dataclasses import dataclass
 from importlib.util import find_spec
 from pathlib import Path
+from typing import TypeAlias

 import numpy as np
 import xarray as xr

 from roms_tools.constants import R_EARTH

+FilePaths: TypeAlias = str | Path | list[Path | str]

-
-
-
-
-
-
-
-
-
-
+
+def _path_list_from_input(files: FilePaths) -> list[Path]:
+    """Converts a generic user input to a list of Paths.
+
+    Takes a list of strings or paths, or wildcard pattern, and
+    returns a list of pathlib.Path objects
+
+    Parameters
+    ----------
+    files: FilePaths
+        A list of files (str, Path), single path as a str or Path, or a wildcard string
+
+    Returns
+    -------
+    List[Path]
+        A list of pathlib.Paths
+    """
+    if isinstance(files, str):
+        filepaths = sorted(Path(files).parent.glob(Path(files).name))
+        if not filepaths:
+            raise FileNotFoundError(f"No files matched: {files}")
+    elif isinstance(files, Path):
+        filepaths = [
+            files,
+        ]
+    elif isinstance(files, list):
+        filepaths = [Path(f) for f in files]
+    else:
+        raise TypeError("'files' should be str, Path, or List[Path | str]")
+
+    return filepaths
+
+
+@dataclass
+class FileMatchResult:
+    """The result of performing a wildcard search."""
+
+    contains_wildcard: bool
+    """Return `True` if the search contained a wildcard."""
+    matches: list[str]
+    """The items matching the wildcard search."""
+
+
+def _get_file_matches(
+    filename: str | Path | Sequence[str | Path],
+) -> FileMatchResult:
+    """Filter the filename using an optional wildcard search in the filename.
+
+    Parameters
+    ----------
+    filename : str | Path | Sequence[str | Path]
+        An item to search for matches.
+    """
+    # Precompile the regex for matching wildcard characters
+    wildcard_regex = re.compile(r"[\*\?\[\]]")
+
+    # Normalize input into a list of strings
+    if isinstance(filename, (str | Path)):
+        filenames: list[str] = [str(filename)]
+    elif isinstance(filename, Sequence):
+        filenames = [str(f) for f in filename]
+    else:
+        msg = "filename must be a string, Path, or a sequence of strings/Paths."
+        raise ValueError(msg)
+
+    contains_wildcard = any(wildcard_regex.search(f) for f in filenames)
+    matching_files: list[str] = []
+
+    for f in filenames:
+        if wildcard_regex.search(f):
+            files = glob.glob(f)
+            if not files:
+                raise FileNotFoundError(f"No files found matching the pattern '{f}'.")
+            matching_files.extend(files)
+        else:
+            matching_files.append(f)
+
+    return FileMatchResult(
+        contains_wildcard=contains_wildcard,
+        matches=sorted(matching_files),
+    )
+
+
+def _get_ds_combination_params(
+    force_combine_nested: bool,
+    dim_names: dict[str, str],
+    match_result: FileMatchResult,
+) -> dict[str, str]:
+    """Determine the non-base parameters for combining datasets.
+
+    Parameters
+    ----------
+    force_combine_nested: bool, optional
+        If True, forces the use of nested combination (`combine_nested`) regardless of whether wildcards are used.
+        Defaults to False.
+    dim_names : Dict[str, str], optional
+        Dictionary specifying the names of dimensions in the dataset.
+        Required only for lat-lon datasets to map dimension names like "latitude" and "longitude".
+        For ROMS datasets, this parameter can be omitted, as default ROMS dimensions ("eta_rho", "xi_rho", "s_rho") are assumed.
+    match_result : FileMatchResult
+        The result of an optional wildcard search of dataset filename(s).
+
+    Returns
+    -------
+    dict[str, str]
+        The default dataset combination parameters
+
+    """
+    if force_combine_nested:
+        load_kwargs = {"combine": "nested", "concat_dim": dim_names["time"]}
+    elif match_result.contains_wildcard or len(match_result.matches) == 1:
+        load_kwargs = {"combine": "by_coords"}
+    else:
+        load_kwargs = {"combine": "nested", "concat_dim": dim_names["time"]}
+
+    return load_kwargs
+
+
+def _get_ds_combine_base_params() -> dict[str, str]:
+    """Return the base parameters used when combining an xr.Dataset.
+
+    Returns
+    -------
+    dict[str, str]
+        The default dataset combination parameters
+
+    """
+    return {
+        "coords": "minimal",
+        "compat": "override",
+        "combine_attrs": "override",
+    }
+
+
+def get_dask_chunks(
+    dim_names: dict[str, str], time_chunking: bool = True
+) -> dict[str, int]:
+    """Return the default dask chunks for ROMS datasets.
+
+    Parameters
+    ----------
+    dim_names : dict[str, str]
+        Dictionary specifying the names of dimensions in the dataset.
+        - For lat-lon datasets, provide keys "latitude" and "longitude" (and optionally "depth" and "time").
+        - For ROMS datasets, the default ROMS dimensions are assumed ("eta_rho", "xi_rho", "s_rho", etc.).
+    time_chunking : bool, optional
+        Whether to chunk along the time dimension.
+        - True: chunk time dimension with size 1 (useful for processing large time-series data with Dask).
+        - False: do not explicitly chunk time; Dask will use default auto-chunking.
+        Defaults to True.
+
+    Returns
+    -------
+    dict[str, int]
+        The default dask chunks for ROMS datasets.
+    """
+    if "latitude" in dim_names and "longitude" in dim_names:
+        # for lat-lon datasets
+        chunks = {
+            dim_names["latitude"]: -1,
+            dim_names["longitude"]: -1,
+        }
+    else:
+        # For ROMS datasets
+        chunks = {
+            "eta_rho": -1,
+            "eta_v": -1,
+            "xi_rho": -1,
+            "xi_u": -1,
+            "s_rho": -1,
+        }
+
+    if "depth" in dim_names:
+        chunks[dim_names["depth"]] = -1
+    if "time" in dim_names and time_chunking:
+        chunks[dim_names["time"]] = 1
+    if "ntides" in dim_names:
+        chunks[dim_names["ntides"]] = 1
+
+    return chunks
+
+
+def _load_data_dask(
+    filenames: list[str],
+    dim_names: dict[str, str],
+    time_chunking: bool = True,
+    decode_times: bool = True,
+    decode_timedelta: bool = True,
+    read_zarr: bool = True,
+    load_kwargs: dict[str, str] | None = None,
+) -> xr.Dataset:
+    """Load dataset from the specified file using Dask.

     Parameters
     ----------
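This hunk factors the old inline wildcard and chunking logic into small, testable helpers. As a minimal sketch of how the new `get_dask_chunks` behaves for a lat-lon dataset (the dimension names here are illustrative, not taken from the package):

    from roms_tools.utils import get_dask_chunks

    # Horizontal dims are left unchunked (-1); time is chunked in single steps.
    chunks = get_dask_chunks(
        dim_names={"latitude": "lat", "longitude": "lon", "time": "time"},
        time_chunking=True,
    )
    assert chunks == {"lat": -1, "lon": -1, "time": 1}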
@@ -31,8 +218,6 @@ def _load_data(
         Dictionary specifying the names of dimensions in the dataset.
         Required only for lat-lon datasets to map dimension names like "latitude" and "longitude".
         For ROMS datasets, this parameter can be omitted, as default ROMS dimensions ("eta_rho", "xi_rho", "s_rho") are assumed.
-    use_dask: bool
-        Indicates whether to use dask for chunking. If True, data is loaded with dask; if False, data is loaded eagerly. Defaults to False.
     time_chunking : bool, optional
         If True and `use_dask=True`, the data will be chunked along the time dimension with a chunk size of 1.
         If False, the data will not be chunked explicitly along the time dimension, but will follow the default auto chunking scheme. This option is useful for ROMS restart files.
@@ -40,9 +225,9 @@ def _load_data(
     decode_times: bool, optional
         If True, decode times and timedeltas encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers.
         Defaults to True.
-
-        If True,
-        Defaults to
+    decode_timedelta: bool, optional
+        If True, decode timedeltas encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers.
+        Defaults to True.
     read_zarr: bool, optional
         If True, use the zarr engine to read the dataset, and don't use mfdataset.
         Defaults to False.
@@ -58,155 +243,230 @@ def _load_data(
         If the specified file does not exist.
     ValueError
         If a list of files is provided but dim_names["time"] is not available or use_dask=False.
+
     """
-
-    dim_names = {}
+    chunks = get_dask_chunks(dim_names, time_chunking)

-
-
-
-
-
-
-
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            category=UserWarning,
+            message=r"^The specified chunks separate.*",
+        )
+
+        if read_zarr:
+            return xr.open_zarr(
+                filenames[0],
+                decode_times=decode_times,
+                decode_timedelta=decode_timedelta,
+                chunks=chunks,
+                consolidated=None,
+                storage_options={"token": "anon"},
             )
+
+        kwargs = {**_get_ds_combine_base_params(), **(load_kwargs or {})}
+        return xr.open_mfdataset(
+            filenames,
+            decode_times=decode_times,
+            decode_timedelta=decode_timedelta,
+            chunks=chunks,
+            **kwargs,
+        )
+
+
+def _check_load_data_dask(use_dask: bool) -> None:
+    """Determine if dask is installed.
+
+    Parameters
+    ----------
+    use_dask: bool
+        Indicates whether to use dask for chunking. If True, data is loaded with dask; if False, data is loaded eagerly. Defaults to False.
+
+    Raises
+    ------
+    RuntimeError
+        If dask is requested but not installed.
+    """
+    if use_dask and not has_dask():
+        msg = (
+            "Dask is required but not installed. Install it with:\n"
+            " • `pip install roms-tools[dask]` or\n"
+            " • `conda install dask`\n"
+            "Alternatively, install `roms-tools` with conda to include all dependencies."
+        )
+        raise RuntimeError(msg)
+
+
+def _check_load_data_zarr(
+    use_dask: bool, read_zarr: bool, filename: str | Path | list[str | Path]
+) -> None:
+    """Determine if zarr streaming will conflict with the current request configuration.
+
+    Parameters
+    ----------
+    filename : Union[str, Path, List[Union[str, Path]]]
+        The path to the data file(s). Can be a single string (with or without wildcards), a single Path object,
+        or a list of strings or Path objects containing multiple files.
+    use_dask: bool
+        Indicates whether to use dask for chunking. If True, data is loaded with dask; if False, data is loaded eagerly. Defaults to False.
+    read_zarr: bool, optional
+        If True, use the zarr engine to read the dataset, and don't use mfdataset.
+        Defaults to False.
+
+    Raises
+    ------
+    RuntimeError
+        If read_zarr is requested, but:
+        - the request doesn't specify a dependency on dask
+        - the request includes a list of filenames
+
+    """
     if read_zarr:
         if isinstance(filename, list):
-
+            msg = "read_zarr requires a single path, not a list of paths"
+            raise ValueError(msg)
+
         if not use_dask:
-
+            msg = "read_zarr must be used with use_dask"
+            raise ValueError(msg)

-    # Precompile the regex for matching wildcard characters
-    wildcard_regex = re.compile(r"[\*\?\[\]]")

-
-
-
-
-        filename_str = [str(f) for f in filename]
-    else:
-        raise ValueError("filename must be a string, Path, or a list of strings/Paths.")
-
-    # Handle the case when filename is a string
-    contains_wildcard = False
-    if isinstance(filename_str, str):
-        contains_wildcard = bool(wildcard_regex.search(filename_str))
-        if contains_wildcard:
-            matching_files = glob.glob(filename_str)
-            if not matching_files:
-                raise FileNotFoundError(
-                    f"No files found matching the pattern '{filename_str}'."
-                )
-        else:
-            matching_files = [filename_str]
-
-    # Handle the case when filename is a list
-    elif isinstance(filename_str, list):
-        contains_wildcard = any(wildcard_regex.search(f) for f in filename_str)
-        if contains_wildcard:
-            matching_files = []
-            for f in filename_str:
-                files = glob.glob(f)
-                if not files:
-                    raise FileNotFoundError(
-                        f"No files found matching the pattern '{f}'."
-                    )
-                matching_files.extend(files)
-        else:
-            matching_files = filename_str
+def _check_load_data_filename(
+    filename: str | Path | list[str | Path], dim_names: Iterable[str]
+) -> None:
+    """Determine if time dimension is available when multiple files are provided.

-
-
+    Parameters
+    ----------
+    filename : Union[str, Path, List[Union[str, Path]]]
+        The path to the data file(s). Can be a single string (with or without wildcards), a single Path object,
+        or a list of strings or Path objects containing multiple files.
+    dim_names : Dict[str, str], optional
+        Dictionary specifying the names of dimensions in the dataset.
+        Required only for lat-lon datasets to map dimension names like "latitude" and "longitude".
+        For ROMS datasets, this parameter can be omitted, as default ROMS dimensions ("eta_rho", "xi_rho", "s_rho") are assumed.
+
+    Raises
+    ------
+    ValueError
+        If time dimension is not found and a list of files is provided.

-
-    if isinstance(
-
+    """
+    if isinstance(filename, list) and "time" not in dim_names:
+        msg = (
             "A list of files is provided, but time dimension is not available. "
             "A time dimension must be available to concatenate the files."
         )
+        raise ValueError(msg)

-    # Determine the kwargs for combining datasets
-    if force_combine_nested:
-        kwargs = {"combine": "nested", "concat_dim": dim_names["time"]}
-    elif contains_wildcard or len(matching_files) == 1:
-        kwargs = {"combine": "by_coords"}
-    else:
-        kwargs = {"combine": "nested", "concat_dim": dim_names["time"]}

-
-
-
-
-
-
+def load_data(
+    filename: str | Path | list[str | Path],
+    dim_names: dict[str, str] | None = None,
+    use_dask: bool = False,
+    time_chunking: bool = True,
+    decode_times: bool = True,
+    decode_timedelta: bool = True,
+    force_combine_nested: bool = False,
+    read_zarr: bool = False,
+    ds_loader_fn: Callable[[], xr.Dataset] | None = None,
+):
+    """Load dataset from the specified file.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            category=UserWarning,
-            message=r"^The specified chunks separate.*",
-        )
+    Parameters
+    ----------
+    filename : str | Path | list[str | Path]
+        The path to the data file(s). Can be a single string (with or without wildcards), a single Path object,
+        or a list of strings or Path objects containing multiple files.
+    dim_names : dict[str, str], optional
+        Dictionary specifying the names of dimensions in the dataset.
+        Required only for lat-lon datasets to map dimension names like "latitude" and "longitude".
+        For ROMS datasets, this parameter can be omitted, as default ROMS dimensions ("eta_rho", "xi_rho", "s_rho") are assumed.
+    use_dask: bool, optional
+        Indicates whether to use dask for chunking. If True, data is loaded with dask; if False, data is loaded eagerly. Defaults to False.
+    time_chunking : bool, optional
+        If True and `use_dask=True`, the data will be chunked along the time dimension with a chunk size of 1.
+        If False, the data will not be chunked explicitly along the time dimension, but will follow the default auto chunking scheme. This option is useful for ROMS restart files.
+        Defaults to True.
+    decode_times: bool, optional
+        If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers.
+        Defaults to True.
+    decode_timedelta: bool, optional
+        If True, decode timedeltas encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers.
+        Defaults to True.
+    force_combine_nested: bool, optional
+        If True, forces the use of nested combination (`combine_nested`) regardless of whether wildcards are used.
+        Defaults to False.
+    read_zarr: bool, optional
+        If True, use the zarr engine to read the dataset, and don't use mfdataset.
+        Defaults to False.

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Returns
+    -------
+    ds : xr.Dataset
+        The loaded xarray Dataset containing the forcing data.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the specified file does not exist.
+    ValueError
+        If a list of files is provided but dim_names["time"] is not available or use_dask=False.
+    RuntimeError
+        If loading the dataset fails
+    """
+    dim_names = dim_names or {}
+
+    _check_load_data_dask(use_dask)
+    _check_load_data_zarr(use_dask, read_zarr, filename)
+    _check_load_data_filename(filename, dim_names.keys())
+
+    match_result = _get_file_matches(filename)

+    load_kwargs = _get_ds_combination_params(
+        force_combine_nested,
+        dim_names,
+        match_result,
+    )
+
+    ds: xr.Dataset | xr.DataArray | None = None
+
+    if ds_loader_fn is not None:
+        ds = ds_loader_fn()
+    elif use_dask:
+        ds = _load_data_dask(
+            match_result.matches,
+            dim_names,
+            time_chunking,
+            decode_times,
+            decode_timedelta,
+            read_zarr,
+            load_kwargs,
+        )
     else:
         ds_list = []
-        for file in
+        for file in match_result.matches:
             ds = xr.open_dataset(
                 file,
                 decode_times=decode_times,
-                decode_timedelta=
+                decode_timedelta=decode_timedelta,
                 chunks=None,
             )
             ds_list.append(ds)

-
+        combine_kwargs = _get_ds_combine_base_params()
+
+        if load_kwargs["combine"] == "by_coords":
             ds = xr.combine_by_coords(ds_list, **combine_kwargs)
-        elif
+        elif load_kwargs["combine"] == "nested":
             ds = xr.combine_nested(
-                ds_list, concat_dim=
+                ds_list, concat_dim=load_kwargs["concat_dim"], **combine_kwargs
             )

+    if ds is None:
+        msg = "A dataset was not loaded."
+        raise RuntimeError(msg)
+
     if "time" in dim_names and dim_names["time"] not in ds.dims:
         ds = ds.expand_dims(dim_names["time"])

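This hunk completes the refactor: the monolithic `_load_data` becomes a public `load_data` that validates its inputs (`_check_load_data_dask`, `_check_load_data_zarr`, `_check_load_data_filename`), resolves wildcards once via `_get_file_matches`, and then dispatches to `_load_data_dask` or an eager `open_dataset` loop. A minimal usage sketch against the new signature (file names are hypothetical):

    from roms_tools.utils import load_data

    # Eager load of a single NetCDF file (no dask, no zarr).
    ds = load_data("grid.nc")

    # Dask-backed load of several files concatenated along time; a "time"
    # entry in dim_names is required whenever a list of files is passed.
    ds = load_data(
        ["forcing_2000.nc", "forcing_2001.nc"],
        dim_names={"time": "time"},
        use_dask=True,
    )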
@@ -366,7 +626,44 @@ def save_datasets(dataset_list, output_filenames, use_dask=False, verbose=True):
     List[Path]
         A list of Path objects for the filenames that were saved.
     """
+
+    def _patch_1d_encodings(dataset_list: list[xr.Dataset]) -> None:
+        """Replaces problematic encodings in 1D variables.
+
+        ROMS' Fortran-based tools fail with certain encoding types that are common
+        in roms-tools' exported 1D vars (e.g. `abs_time`, `river_name`). This function
+        replaces int64 -> int32 (for true integers), int64 -> float64
+        (for non-integer vars encoded as int64 on disk), and NC_STRING -> char.
+
+        Parameters
+        ----------
+        dataset_list: list[xr.Dataset]
+            List of datasets to be saved
+
+        """
+        for ds in dataset_list:
+            for name in ds.variables:
+                da = ds[name]
+                if da.ndim != 1:
+                    continue
+
+                enc_var = xr.conventions.encode_cf_variable(da.variable, name=name)
+                enc_dtype = enc_var.dtype
+
+                # NC_STRING → fixed-width char
+                if enc_dtype.kind in ("O", "U", "S"):
+                    da.encoding["dtype"] = "S1"
+                    continue
+
+                # NC_INT64 → int32 for true integers; float64 otherwise
+                if enc_dtype == np.int64:
+                    if da.dtype.kind in ("i", "u"):
+                        da.encoding["dtype"] = "int32"
+                    else:
+                        da.encoding["dtype"] = "float64"
+
     saved_filenames = []
+    _patch_1d_encodings(dataset_list)

     output_filenames = [f"{filename}.nc" for filename in output_filenames]
     if verbose:
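`save_datasets` now runs `_patch_1d_encodings` over every dataset before writing. A minimal sketch of the case it guards against, using the public xarray API the hunk itself calls (the variable name here is hypothetical):

    import numpy as np
    import xarray as xr

    # A 1D int64 variable would be written as NC_INT64, which ROMS'
    # Fortran-based tools cannot read.
    ds = xr.Dataset({"river_index": ("nriver", np.arange(5, dtype=np.int64))})

    enc = xr.conventions.encode_cf_variable(ds["river_index"].variable, name="river_index")
    assert enc.dtype == np.int64  # on-disk dtype before patching

    # The override _patch_1d_encodings would apply for a true integer:
    ds["river_index"].encoding["dtype"] = "int32"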
@@ -387,30 +684,7 @@ def save_datasets(dataset_list, output_filenames, use_dask=False, verbose=True):
     return saved_filenames


-def
-    """Returns the appropriate Dask chunking dictionary based on grid location.
-
-    Parameters
-    ----------
-    location : str
-        The grid location, one of "rho", "u", or "v".
-    chunk_size : int
-        The chunk size to apply.
-
-    Returns
-    -------
-    dict
-        Dictionary specifying the chunking strategy.
-    """
-    chunk_mapping = {
-        "rho": {"eta_rho": chunk_size, "xi_rho": chunk_size},
-        "u": {"eta_rho": chunk_size, "xi_u": chunk_size},
-        "v": {"eta_v": chunk_size, "xi_rho": chunk_size},
-    }
-    return chunk_mapping.get(location, {})
-
-
-def _generate_coordinate_range(min_val: float, max_val: float, resolution: float):
+def generate_coordinate_range(min_val: float, max_val: float, resolution: float):
     """Generate an array of target coordinates (e.g., latitude or longitude) within a
     specified range, with a resolution that is rounded to the nearest value of the form
     `1/n` (or integer).
@@ -472,7 +746,7 @@ def _generate_coordinate_range(min_val: float, max_val: float, resolution: float
     return target.astype(np.float32)


-def _generate_focused_coordinate_range(
+def generate_focused_coordinate_range(
     center: float,
     sc: float,
     min_val: float,
@@ -558,7 +832,7 @@ def _generate_focused_coordinate_range(
     return centers, faces


-def _remove_edge_nans(
+def remove_edge_nans(
     field: xr.DataArray, xdim: str, layer_depth: xr.DataArray | None = None
 ) -> tuple[xr.DataArray, xr.DataArray | None]:
     """Remove NaN-only slices at the edges of a specified dimension.
@@ -634,14 +908,42 @@ def _remove_edge_nans(
     return field, layer_depth


-def
+def has_dask() -> bool:
+    """Determine if the Dask package is installed.
+
+    Returns
+    -------
+    bool
+        `True` if package is found, `False` otherwise.
+
+    """
     return find_spec("dask") is not None


-def
+def has_gcsfs() -> bool:
+    """Determine if the GCSFS package is installed.
+
+    Returns
+    -------
+    bool
+        `True` if package is found, `False` otherwise.
+
+    """
     return find_spec("gcsfs") is not None


+def has_copernicus() -> bool:
+    """Determine if the Copernicus Marine Toolkit package is installed.
+
+    Returns
+    -------
+    bool
+        `True` if package is found, `False` otherwise.
+
+    """
+    return find_spec("copernicusmarine") is not None
+
+
 def normalize_longitude(lon: float, straddle: bool) -> float:
     """Normalize longitude to the appropriate range depending on whether the grid
     straddles the dateline.
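The feature probes above give callers one consistent way to gate optional dependencies. A minimal sketch (the surrounding logic is illustrative):

    from roms_tools.utils import has_copernicus, has_dask, has_gcsfs

    use_dask = has_dask()                # gate dask-backed lazy loading
    can_stream_gcs = has_gcsfs()         # gate Google Cloud Storage access
    can_stream_cmems = has_copernicus()  # gate Copernicus Marine downloads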
@@ -704,3 +1006,28 @@ def infer_nominal_horizontal_resolution(
     resolution_in_degrees = resolution_in_m / (meters_per_degree * np.cos(lat_rad))

     return float(resolution_in_degrees)
+
+
+def get_pkg_error_msg(purpose: str, package_name: str, option_name: str) -> str:
+    """Generate an error message indicating how to install an optional dependency.
+
+    Parameters
+    ----------
+    purpose : str
+        Description of the feature the package enables.
+    package_name : str
+        The package name
+    option_name : str
+        The optional dependency containing the package
+
+    Returns
+    -------
+    str
+        The formatted error message
+    """
+    return textwrap.dedent(f"""\
+        To use {purpose}, {package_name} is required but not installed. Install it with:
+        • `pip install roms-tools[{option_name}]` or
+        • `pip install {package_name}` or
+        • `conda install {package_name}`
+        Alternatively, install `roms-tools` with conda to include all dependencies.""")
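`get_pkg_error_msg` centralizes the install-hint text that `_check_load_data_dask` still builds inline earlier in this file. A sketch of the output, with illustrative argument values (not taken from the package) and line layout following the dedented template above:

    from roms_tools.utils import get_pkg_error_msg

    print(get_pkg_error_msg("streaming", "gcsfs", "streaming"))
    # To use streaming, gcsfs is required but not installed. Install it with:
    # • `pip install roms-tools[streaming]` or
    # • `pip install gcsfs` or
    # • `conda install gcsfs`
    # Alternatively, install `roms-tools` with conda to include all dependencies.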