eo-tides 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eo_tides/__init__.py +50 -0
- eo_tides/eo.py +532 -0
- eo_tides/model.py +825 -0
- eo_tides/stats.py +581 -0
- eo_tides/utils.py +705 -0
- eo_tides/validation.py +334 -0
- eo_tides-0.5.0.dist-info/LICENSE +201 -0
- eo_tides-0.5.0.dist-info/METADATA +118 -0
- eo_tides-0.5.0.dist-info/RECORD +11 -0
- eo_tides-0.5.0.dist-info/WHEEL +5 -0
- eo_tides-0.5.0.dist-info/top_level.txt +1 -0
eo_tides/model.py
ADDED
@@ -0,0 +1,825 @@
|
|
1
|
+
# Used to postpone evaluation of type annotations
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import os
|
5
|
+
import textwrap
|
6
|
+
from concurrent.futures import ProcessPoolExecutor
|
7
|
+
from concurrent.futures.process import BrokenProcessPool
|
8
|
+
from functools import partial
|
9
|
+
from typing import TYPE_CHECKING
|
10
|
+
|
11
|
+
import psutil
|
12
|
+
|
13
|
+
# Only import if running type checking
|
14
|
+
if TYPE_CHECKING:
|
15
|
+
import xarray as xr
|
16
|
+
|
17
|
+
import geopandas as gpd
|
18
|
+
import numpy as np
|
19
|
+
import pandas as pd
|
20
|
+
import pyproj
|
21
|
+
import pyTMD
|
22
|
+
from tqdm import tqdm
|
23
|
+
|
24
|
+
from .utils import DatetimeLike, _set_directory, _standardise_models, _standardise_time, idw
|
25
|
+
|
26
|
+
|
27
|
+
def _parallel_splits(
|
28
|
+
total_points: int,
|
29
|
+
model_count: int,
|
30
|
+
parallel_max: int | None = None,
|
31
|
+
min_points_per_split: int = 1000,
|
32
|
+
) -> int:
|
33
|
+
"""
|
34
|
+
Calculates the optimal number of parallel splits for data
|
35
|
+
processing based on system resources and processing constraints.
|
36
|
+
|
37
|
+
Parameters:
|
38
|
+
-----------
|
39
|
+
total_points : int
|
40
|
+
Total number of data points to process
|
41
|
+
model_count : int
|
42
|
+
Number of models that will be run in parallel
|
43
|
+
parallel_max : int, optional
|
44
|
+
Maximum number of parallel processes to use. If None, uses CPU core count
|
45
|
+
min_points_per_split : int, default=1000
|
46
|
+
Minimum number of points that should be processed in each split
|
47
|
+
"""
|
48
|
+
# Get available CPUs. First see if `CPU_GUARANTEE` exists in
|
49
|
+
# environment (if running in JupyterHub); if not use psutil
|
50
|
+
# followed by standard CPU count
|
51
|
+
if parallel_max is None:
|
52
|
+
# Take the first valid output
|
53
|
+
raw_value = os.environ.get("CPU_GUARANTEE") or psutil.cpu_count(logical=False) or os.cpu_count() or 1
|
54
|
+
|
55
|
+
# Convert to integer
|
56
|
+
if isinstance(raw_value, str):
|
57
|
+
parallel_max = int(float(raw_value))
|
58
|
+
else:
|
59
|
+
parallel_max = int(raw_value)
|
60
|
+
|
61
|
+
# Calculate optimal number of splits based on constraints
|
62
|
+
splits_by_size = total_points / min_points_per_split
|
63
|
+
splits_by_cpu = parallel_max / model_count
|
64
|
+
optimal_splits = min(splits_by_size, splits_by_cpu)
|
65
|
+
|
66
|
+
# Convert to integer and ensure at least 1 split
|
67
|
+
final_split_count = int(max(1, optimal_splits))
|
68
|
+
return final_split_count
|
69
|
+
|
70
|
+
|
71
|
+
def _model_tides(
    model,
    x,
    y,
    time,
    directory,
    crs,
    mode,
    output_units,
    method,
    extrapolate,
    cutoff,
    crop,
    crop_buffer,
):
    """Model tide heights for a single tide model.

    Worker function applied (optionally in parallel) by `model_tides`.
    Extracts tidal constituents for the requested points using `pyTMD`,
    predicts tide heights at the requested times, and returns the
    results as a long-format dataframe.

    Parameters
    ----------
    model : str
        Name of the tide model to run.
    x, y : array
        Coordinates in `crs`; both are flattened before reprojection.
    time : array
        Times to model; flattened before conversion to a `pyTMD`
        timescale.
    directory : str or os.PathLike
        Directory containing the tide model files.
    crs : str
        Coordinate reference system of `x` and `y`.
    mode : str
        "one-to-many" models every time at every point; any other
        value pairs each point with a single time.
    output_units : str
        "m" returns float32 metres; "cm" and "mm" return int16 values
        scaled by 100 and 1000 respectively.
    method, extrapolate, cutoff, crop, crop_buffer
        Passed through to `pyTMD`'s `extract_constants`
        (`crop_buffer` is supplied as `buffer`).

    Returns
    -------
    pandas.DataFrame
        Dataframe indexed by "time", "x" and "y", with "tide_model"
        and "tide_height" columns.

    Raises
    ------
    Exception
        If constituent extraction raises an `IndexError`; reported as
        the model files not covering the analysis extent.
    """
    # Look up the elevation model definition for the requested model
    pytmd_model = pyTMD.io.model(directory).elevation(model)

    # Convert input coordinates into longitude/latitude
    to_lonlat = pyproj.Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
    lon, lat = to_lonlat.transform(x.flatten(), y.flatten())

    # Convert input datetimes into a pyTMD timescale
    timescale = pyTMD.time.timescale().from_datetime(time.flatten())

    # Read tidal constants and interpolate them to the input points
    try:
        amp, ph, c = pytmd_model.extract_constants(
            lon,
            lat,
            type=pytmd_model.type,
            crop=crop,
            buffer=crop_buffer,
            method=method,
            extrapolate=extrapolate,
            cutoff=cutoff,
            append_node=False,
        )

    # Report a friendlier error if the constituent files do not cover
    # the analysis extent
    except IndexError:
        error_msg = f"""
            The {model} tide model constituent files do not cover the analysis extent
            ({min(lon):.2f}, {max(lon):.2f}, {min(lat):.2f}, {max(lat):.2f}).
            This can occur if you are using clipped model files to improve run times.
            Consider using model files that cover your entire analysis area, or set `crop=False`
            to reduce the extent of tide model constituent files that is loaded.
            """
        raise Exception(textwrap.dedent(error_msg).strip()) from None

    # Constituent oscillation: amplitude multiplied by the complex
    # exponential of the phase (degrees converted to radians)
    hc = amp * np.exp(-1j * ph * np.pi / 180.0)

    # Delta times depend on the model: these formats use a zero delta
    # time (at 2000.0) to match TMD outputs; all others use the
    # interpolated delta times from the timescale
    zero_delta = pytmd_model.corrections in ("OTIS", "ATLAS", "TMD3", "netcdf")
    deltat = np.zeros_like(timescale.tt_ut1) if zero_delta else timescale.tt_ut1

    # In "one-to-many" mode, constituents and timesteps are multiplied
    # out so every point is paired with every timestep. Otherwise each
    # point already has exactly one timestep, so nothing is repeated.
    if mode == "one-to-many":
        points_repeat = len(x)
        time_repeat = len(time)
    else:
        points_repeat = 1
        time_repeat = 1

    t = np.tile(timescale.tide, points_repeat)
    hc = hc.repeat(time_repeat, axis=0)
    deltat = np.tile(deltat, points_repeat)

    # Masked output array; a row is masked if any constituent is masked
    tide = np.ma.zeros((len(t)), fill_value=np.nan)
    tide.mask = np.any(hc.mask, axis=1)

    # Predict tide heights from the major constituents, then add the
    # inferred minor constituents
    tide.data[:] = pyTMD.predict.drift(
        t,
        hc,
        c,
        deltat=deltat,
        corrections=pytmd_model.corrections,
    )
    minor = pyTMD.predict.infer_minor(
        t,
        hc,
        c,
        deltat=deltat,
        corrections=pytmd_model.corrections,
        minor=pytmd_model.minor,
    )
    tide.data[:] += minor.data[:]

    # Overwrite masked (invalid) values with the fill value
    tide.data[tide.mask] = tide.fill_value

    # Assemble a dataframe indexed by the input time/x/y values
    tide_df = pd.DataFrame({
        "time": np.tile(time, points_repeat),
        "x": np.repeat(x, time_repeat),
        "y": np.repeat(y, time_repeat),
        "tide_model": model,
        "tide_height": tide,
    }).set_index(["time", "x", "y"])

    # Convert to the requested units; the integer options can save memory
    integer_scaling = {"cm": 100, "mm": 1000}
    if output_units == "m":
        tide_df["tide_height"] = tide_df.tide_height.astype(np.float32)
    elif output_units in integer_scaling:
        tide_df["tide_height"] = (tide_df.tide_height * integer_scaling[output_units]).astype(np.int16)

    return tide_df
|
201
|
+
|
202
|
+
|
203
|
+
def ensemble_tides(
    tide_df,
    crs,
    ensemble_models,
    ensemble_func=None,
    ensemble_top_n=3,
    ranking_points="https://dea-public-data-dev.s3-ap-southeast-2.amazonaws.com/derivative/dea_intertidal/supplementary/rankings_ensemble_2017-2019.fgb",
    ranking_valid_perc=0.02,
    **idw_kwargs,
):
    """Combine multiple tide models into a single locally optimised
    ensemble tide model using external model ranking data (e.g.
    satellite altimetry or NDWI-tide correlations along the coastline)
    to inform the selection of the best local models.

    This function performs the following steps:

    1. Takes a dataframe of tide heights from multiple tide models, as
       produced by `eo_tides.model.model_tides`
    2. Loads model ranking points from an external file, filters them
       based on the valid data percentage, and retains relevant columns
    3. Interpolates the model rankings into the coordinates of the
       original dataframe using Inverse Distance Weighted (IDW)
       interpolation
    4. Uses rankings to combine multiple tide models into a single
       optimised ensemble model (by default, by taking the mean of the
       top 3 ranked models)
    5. Returns a new dataframe with the combined ensemble model predictions

    Parameters
    ----------
    tide_df : pandas.DataFrame
        DataFrame produced by `eo_tides.model.model_tides`, containing
        tide model predictions in long format: "time", "x" and "y"
        index levels, plus "tide_height" and "tide_model" columns.
    crs : string
        Coordinate reference system for the "x" and "y" coordinates in
        `tide_df`. Ranking points are reprojected to this CRS before
        interpolation.
    ensemble_models : list
        A list of models to include in the ensemble modelling process.
        All values must exist as columns with the prefix "rank_" in
        `ranking_points`.
    ensemble_func : dict, optional
        By default, a simple ensemble model will be calculated by taking
        the mean of the `ensemble_top_n` tide models at each location.
        However, a dictionary containing more complex ensemble
        calculations can also be provided. Dictionary keys are used
        to name output ensemble models; each function is given the
        joined dataframe and should convert its "rank" column into a
        weighting, e.g.:
        `ensemble_func = {"ensemble-custom": lambda x: x["rank"] <= 3}`
    ensemble_top_n : int, optional
        If `ensemble_func` is None, this sets the number of top models
        to include in the mean ensemble calculation. Defaults to 3.
    ranking_points : str, optional
        Path to the file containing model ranking points. This dataset
        should include columns containing rankings for each tide
        model, named with the prefix "rank_". e.g. "rank_EOT20".
        Low values should represent high rankings (e.g. 1 = top ranked).
        The default value points to an example file covering Australia.
    ranking_valid_perc : float, optional
        Minimum percentage of valid data required to include a model
        rank point in the analysis, as defined in a column named
        "valid_perc". Defaults to 0.02.
    **idw_kwargs
        Optional keyword arguments to pass to the `idw` function used
        for interpolation. Useful values include `k` (number of nearest
        neighbours to use in interpolation), `max_dist` (maximum
        distance to nearest neighbours), and `k_min` (minimum number of
        neighbours required after `max_dist` is applied).

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the ensemble model predictions, matching
        the format of the input `tide_df` (i.e. "time", "x" and "y"
        index levels with "tide_height" and "tide_model" columns). By
        default the "tide_model" column will be labeled "ensemble" for
        the combined model predictions (but if a custom dictionary of
        ensemble functions is provided via `ensemble_func`, each
        ensemble will be named using the provided dictionary keys).

    Raises
    ------
    Exception
        If `tide_df` lacks the "tide_model" or "tide_height" columns,
        or if a `KeyError` occurs while loading and subsetting the
        ranking points (reported as missing "rank_" columns).
    """
    # Raise error if `tide_df` was provided in wide format. Both
    # columns named in the message are checked, as both are required
    # by the weighting calculation below
    if "tide_model" not in tide_df or "tide_height" not in tide_df:
        raise Exception(
            "`tide_df` does not contain the expected 'tide_model' and "
            "'tide_height' columns. Ensure that tides were modelled in "
            "long format (i.e. `output_format='long'` in `model_tides`)."
        )

    # Extract x and y coords from dataframe
    x = tide_df.index.get_level_values(level="x")
    y = tide_df.index.get_level_values(level="y")

    # Load model ranks points and reproject to same CRS as x and y
    model_ranking_cols = [f"rank_{m}" for m in ensemble_models]
    try:
        model_ranks_gdf = (
            gpd.read_file(ranking_points, engine="pyogrio")
            .to_crs(crs)
            .query(f"valid_perc > {ranking_valid_perc}")
            .dropna(how="all")[model_ranking_cols + ["geometry"]]
        )
    except KeyError:
        error_msg = f"""
            Not all of the expected "rank_" columns {model_ranking_cols} were
            found in the columns of the ranking points file ({ranking_points}).
            Consider passing a custom list of models using `ensemble_models`.
            """
        raise Exception(textwrap.dedent(error_msg).strip()) from None

    # Use points to interpolate model rankings into requested x and y
    id_kwargs_str = idw_kwargs if idw_kwargs else ""
    print(f"Interpolating model rankings using IDW interpolation {id_kwargs_str}")
    ensemble_ranks_df = (
        # Run IDW interpolation on subset of ranking columns
        pd.DataFrame(
            idw(
                input_z=model_ranks_gdf[model_ranking_cols],
                input_x=model_ranks_gdf.geometry.x,
                input_y=model_ranks_gdf.geometry.y,
                output_x=x,
                output_y=y,
                **idw_kwargs,
            ),
            columns=model_ranking_cols,
        )
        .assign(x=x, y=y)
        # Drop any duplicates then melt columns into long format
        .drop_duplicates()
        .melt(id_vars=["x", "y"], var_name="tide_model", value_name="rank")
        # Remove "rank_" prefix to get plain model names
        .replace({"^rank_": ""}, regex=True)
        # Set index columns and rank across groups
        .set_index(["tide_model", "x", "y"])
        .groupby(["x", "y"])
        .rank()
    )

    # If no custom ensemble funcs are provided, use a default ensemble
    # calculation that takes the mean of the top N tide models
    if ensemble_func is None:
        ensemble_func = {"ensemble": lambda x: x["rank"] <= ensemble_top_n}

    # Create output list to hold computed ensemble model outputs
    ensemble_list = []

    # Loop through all provided ensemble generation functions
    for ensemble_n, ensemble_f in ensemble_func.items():
        print(f"Combining models into single {ensemble_n} model")

        # Join ranks to input tide data, compute weightings and group
        grouped = (
            # Add tide model as an index so we can join with model ranks
            tide_df.set_index("tide_model", append=True)
            .join(ensemble_ranks_df)
            # Add temp columns containing weightings and weighted values
            .assign(
                weights=ensemble_f,  # use custom func to compute weights
                weighted=lambda i: i.tide_height * i.weights,
            )
            # Groupby is specified in a weird order here as this seems
            # to be the easiest way to preserve correct index sorting
            .groupby(["x", "y", "time"])
        )

        # Use weightings to combine multiple models into single ensemble
        ensemble_df = (
            # Calculate weighted mean and convert back to dataframe
            grouped.weighted.sum()
            .div(grouped.weights.sum())
            .to_frame("tide_height")
            # Label ensemble model and ensure indexes are in expected order
            .assign(tide_model=ensemble_n)
            .reorder_levels(["time", "x", "y"], axis=0)
        )

        ensemble_list.append(ensemble_df)

    # Combine all ensemble models and return as a single dataframe
    return pd.concat(ensemble_list)
|
384
|
+
|
385
|
+
|
386
|
+
def model_tides(
    x: float | list[float] | xr.DataArray,
    y: float | list[float] | xr.DataArray,
    time: DatetimeLike,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    crs: str = "EPSG:4326",
    mode: str = "one-to-many",
    output_format: str = "long",
    output_units: str = "m",
    method: str = "linear",
    extrapolate: bool = True,
    cutoff: float | None = None,
    crop: bool = True,
    crop_buffer: float | None = 5,
    parallel: bool = True,
    parallel_splits: int | str = "auto",
    parallel_max: int | None = None,
    ensemble_models: list[str] | None = None,
    **ensemble_kwargs,
) -> pd.DataFrame:
    """
    Model tide heights at multiple coordinates and/or timesteps
    using one or more ocean tide models.

    This function is parallelised to improve performance, and
    supports all tidal models supported by `pyTMD`, including:

    - Empirical Ocean Tide model (EOT20)
    - Finite Element Solution tide models (FES2022, FES2014, FES2012)
    - TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
    - Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
    - Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

    This function requires access to tide model data files.
    These should be placed in a folder with subfolders matching
    the structure required by `pyTMD`. For more details:
    <https://geoscienceaustralia.github.io/eo-tides/setup/>
    <https://pytmd.readthedocs.io/en/latest/getting_started/Getting-Started.html#directories>

    This function is a modification of the `pyTMD` package's
    `pyTMD.compute.tide_elevations` function. For more info:
    <https://pytmd.readthedocs.io/en/latest/api_reference/compute.html#pyTMD.compute.tide_elevations>

    Parameters
    ----------
    x, y : float or list of float
        One or more x and y coordinates used to define
        the location at which to model tides. By default these
        coordinates should be lat/lon; use "crs" if they
        are in a custom coordinate reference system.
    time : DatetimeLike
        Times at which to model tide heights (in UTC). Accepts
        any format that can be converted by `pandas.to_datetime()`;
        e.g. np.ndarray[datetime64], pd.DatetimeIndex, pd.Timestamp,
        datetime.datetime and strings (e.g. "2020-01-01 23:00").
        For example: `time=pd.date_range(start="2000", end="2001", freq="5h")`
    model : str or list of str, optional
        The tide model (or list of models) to use to model tides.
        Defaults to "EOT20"; specify "all" to use all models available
        in `directory`. For a full list of available and supported models,
        run `eo_tides.utils.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    crs : str, optional
        Input coordinate reference system for x and y coordinates.
        Defaults to "EPSG:4326" (WGS84; degrees latitude, longitude).
    mode : str, optional
        The analysis mode to use for tide modelling. Supports two options:

        - "one-to-many": Models tides for every timestep in "time" at
        every input x and y coordinate point. This is useful if you
        want to model tides for a specific list of timesteps across
        multiple spatial points (e.g. for the same set of satellite
        acquisition times at various locations across your study area).
        - "one-to-one": Model tides using a unique timestep for each
        set of x and y coordinates. In this mode, the number of x and
        y points must equal the number of timesteps provided in "time".
    output_format : str, optional
        Whether to return the output dataframe in long format (with
        results stacked vertically along "tide_model" and "tide_height"
        columns), or wide format (with a column for each tide model).
        Defaults to "long".
    output_units : str, optional
        Whether to return modelled tides in floating point metre units,
        or integer centimetre units (i.e. scaled by 100) or integer
        millimetre units (i.e. scaled by 1000). Returning outputs in
        integer units can be useful for reducing memory usage.
        Defaults to "m" for metres; set to "cm" for centimetres or "mm"
        for millimetres.
    method : str, optional
        Method used to interpolate tidal constituents
        from model files. Defaults to "linear"; options include:

        - "linear", "nearest": scipy regular grid interpolations
        - "spline": scipy bivariate spline interpolation
        - "bilinear": quick bilinear interpolation
    extrapolate : bool, optional
        Whether to extrapolate tides into x and y coordinates outside of
        the valid tide modelling domain using nearest-neighbor.
    cutoff : float, optional
        Extrapolation cutoff in kilometers. The default is None, which
        will extrapolate for all points regardless of distance from the
        valid tide modelling domain.
    crop : bool, optional
        Whether to crop tide model constituent files on-the-fly to
        improve performance. Defaults to True; use `crop_buffer`
        to customise the buffer distance used to crop the files.
    crop_buffer : int or float, optional
        The buffer distance in degrees used to crop tide model
        constituent files around the modelling area. Defaults to 5,
        which will crop constituents using a five degree buffer on either
        side of the analysis extent.
    parallel : bool, optional
        Whether to parallelise tide modelling. If multiple tide models are
        requested, these will be run in parallel using `concurrent.futures`.
        If enough workers are available, the analysis will also be split
        into spatial chunks for additional parallelisation (see "parallel_splits"
        below). Default is True.
    parallel_splits : str or int, optional
        Whether to split the input x and y coordinates into smaller,
        evenly-sized chunks that are processed in parallel. This can
        provide a large performance boost when processing large numbers
        of coordinates. The default is "auto", which will automatically
        attempt to determine optimal splits based on available CPUs,
        the number of input points, and the number of models.
    parallel_max : int, optional
        Maximum number of processes to run in parallel. The default of
        None will automatically determine this from your available CPUs.
    ensemble_models : list of str, optional
        An optional list of models used to generate the ensemble tide
        model if "ensemble" tide modelling is requested. Defaults to
        `["EOT20", "FES2012", "FES2014_extrapolated", "FES2022_extrapolated",
        "GOT4.10", "GOT5.5_extrapolated", "GOT5.6_extrapolated",
        "TPXO10-atlas-v2-nc", "TPXO8-atlas-nc", "TPXO9-atlas-v5-nc"]`.
    **ensemble_kwargs :
        Keyword arguments used to customise the generation of optional
        ensemble tide models if "ensemble" modelling is requested.
        These are passed to the underlying `ensemble_tides` function.
        Useful parameters include `ranking_points` (path to model
        rankings data), `k` (for controlling how model rankings are
        interpolated), and `ensemble_top_n` (how many top models to use
        in the ensemble calculation).

    Returns
    -------
    pandas.DataFrame
        A dataframe containing modelled tide heights.

    Raises
    ------
    AssertionError
        If any input argument fails validation.
    ValueError
        If `parallel_splits` is larger than the number of points.
    RuntimeError
        If the parallel worker pool breaks during modelling.

    """
    # Turn inputs into arrays for consistent handling
    x = np.atleast_1d(x)
    y = np.atleast_1d(y)
    time = _standardise_time(time)

    # Validate input arguments
    assert time is not None, "Times for modelling tides must be provided via `time`."
    assert method in ("bilinear", "spline", "linear", "nearest")
    assert output_units in (
        "m",
        "cm",
        "mm",
    ), "Output units must be either 'm', 'cm', or 'mm'."
    assert output_format in (
        "long",
        "wide",
    ), "Output format must be either 'long' or 'wide'."
    # An unrecognised mode previously left the parallel inputs
    # undefined, so reject it up front with the other checks
    assert mode in (
        "one-to-many",
        "one-to-one",
    ), "Mode must be either 'one-to-many' or 'one-to-one'."
    assert np.issubdtype(x.dtype, np.number), "`x` must contain only valid numeric values, and must not be None."
    assert np.issubdtype(y.dtype, np.number), "`y` must contain only valid numeric values, and must not be None.."
    assert len(x) == len(y), "x and y must be the same length."
    if mode == "one-to-one":
        assert len(x) == len(time), (
            "The number of supplied x and y points and times must be "
            "identical in 'one-to-one' mode. Use 'one-to-many' mode if "
            "you intended to model multiple timesteps at each point."
        )

    # Set tide modelling files directory. If no custom path is
    # provided, try global environment variable.
    directory = _set_directory(directory)

    # Standardise model list, handling "all" and "ensemble" functionality
    models_to_process, models_requested, ensemble_models = _standardise_models(
        model=model,
        directory=directory,
        ensemble_models=ensemble_models,
    )

    # Update tide modelling func to add default keyword arguments that
    # are used for every iteration during parallel processing
    iter_func = partial(
        _model_tides,
        directory=directory,
        crs=crs,
        mode=mode,
        output_units=output_units,
        method=method,
        extrapolate=extrapolate,
        cutoff=np.inf if cutoff is None else cutoff,
        crop=crop,
        crop_buffer=crop_buffer,
    )

    # If automatic parallel splits, calculate optimal value
    # based on available parallelisation, number of points
    # and number of models
    if parallel_splits == "auto":
        parallel_splits = _parallel_splits(
            total_points=len(x),
            model_count=len(models_to_process),
            parallel_max=parallel_max,
        )

    # Verify that parallel splits are not larger than number of points
    assert isinstance(parallel_splits, int)
    if parallel_splits > len(x):
        raise ValueError(f"Parallel splits ({parallel_splits}) cannot be larger than the number of points ({len(x)}).")

    # Parallelise if either multiple models or multiple splits requested.
    # Logical (not bitwise) operators are used so that any truthy value
    # of `parallel` enables parallelisation
    if parallel and (len(models_to_process) > 1 or parallel_splits > 1):
        with ProcessPoolExecutor(max_workers=parallel_max) as executor:
            print(
                f"Modelling tides with {', '.join(models_to_process)} in parallel (models: {len(models_to_process)}, splits: {parallel_splits})"
            )

            # Optionally split x/y points into `parallel_splits` chunks
            # that will be applied in parallel
            x_split = np.array_split(x, parallel_splits)
            y_split = np.array_split(y, parallel_splits)

            # Get every combination of models and x/y chunks, and
            # extract as iterables that can be passed to `executor.map()`
            # In "one-to-many" mode, pass entire set of timesteps to each
            # parallel iteration by repeating timesteps by number of total
            # parallel iterations. In "one-to-one" mode, split up
            # timesteps into smaller parallel chunks too.
            if mode == "one-to-many":
                model_iters, x_iters, y_iters = zip(
                    *[(m, x_split[i], y_split[i]) for m in models_to_process for i in range(parallel_splits)],
                )
                time_iters = [time] * len(model_iters)
            else:
                time_split = np.array_split(time, parallel_splits)
                model_iters, x_iters, y_iters, time_iters = zip(
                    *[
                        (m, x_split[i], y_split[i], time_split[i])
                        for m in models_to_process
                        for i in range(parallel_splits)
                    ],
                )

            # Apply func in parallel, iterating through each input param
            try:
                model_outputs = list(
                    tqdm(
                        executor.map(iter_func, model_iters, x_iters, y_iters, time_iters),
                        total=len(model_iters),
                    ),
                )
            except BrokenProcessPool as err:
                error_msg = (
                    "Parallelised tide modelling failed, likely due to an out-of-memory error. "
                    "Try reducing the size of your analysis, or set `parallel=False`."
                )
                raise RuntimeError(error_msg) from err

    # Model tides in series if parallelisation is off
    else:
        model_outputs = []

        for model_i in models_to_process:
            print(f"Modelling tides with {model_i}")
            tide_df = iter_func(model_i, x, y, time)
            model_outputs.append(tide_df)

    # Combine outputs into a single dataframe
    tide_df = pd.concat(model_outputs, axis=0)

    # Optionally compute ensemble model and add to dataframe
    if "ensemble" in models_requested:
        ensemble_df = ensemble_tides(tide_df, crs, ensemble_models, **ensemble_kwargs)

        # Update requested models with any custom ensemble models, then
        # filter the dataframe to keep only models originally requested
        models_requested = list(np.union1d(models_requested, ensemble_df.tide_model.unique()))
        tide_df = pd.concat([tide_df, ensemble_df]).query("tide_model in @models_requested")

    # Optionally convert to a wide format dataframe with a tide model in
    # each dataframe column
    if output_format == "wide":
        # Pivot into wide format with each tide model as a column
        print("Converting to a wide format dataframe")
        tide_df = tide_df.pivot(columns="tide_model", values="tide_height")

        # If in 'one-to-one' mode, reindex using our input time/x/y
        # values to ensure the output is sorted the same as our inputs
        if mode == "one-to-one":
            output_indices = pd.MultiIndex.from_arrays([time, x, y], names=["time", "x", "y"])
            tide_df = tide_df.reindex(output_indices)

    return tide_df
|
692
|
+
|
693
|
+
|
694
|
+
def model_phases(
    x: float | list[float] | xr.DataArray,
    y: float | list[float] | xr.DataArray,
    time: DatetimeLike,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    time_offset: str = "15 min",
    return_tides: bool = False,
    **model_tides_kwargs,
) -> pd.DataFrame:
    """
    Model tide phases (low-flow, high-flow, high-ebb, low-ebb)
    at multiple coordinates and/or timesteps using one
    or more ocean tide models.

    Ebb and flow phases are calculated by running the
    `eo_tides.model.model_tides` function twice, once for
    the requested timesteps, and again after subtracting a
    small time offset (by default, 15 minutes). If tides
    increased over this period, they are assigned as "flow";
    if they decreased, they are assigned as "ebb".
    Tides are considered "high" if equal or greater than 0
    metres tide height, otherwise "low".

    This function supports all parameters that are supported
    by `model_tides`.

    Parameters
    ----------
    x, y : float or list of float
        One or more x and y coordinates used to define
        the location at which to model tide phases. By default
        these coordinates should be lat/lon; use "crs" if they
        are in a custom coordinate reference system.
    time : DatetimeLike
        Times at which to model tide phases (in UTC). Accepts
        any format that can be converted by `pandas.to_datetime()`;
        e.g. np.ndarray[datetime64], pd.DatetimeIndex, pd.Timestamp,
        datetime.datetime and strings (e.g. "2020-01-01 23:00").
        For example: `time=pd.date_range(start="2000", end="2001", freq="5h")`
    model : str or list of str, optional
        The tide model (or list of models) to use to model tides.
        Defaults to "EOT20"; specify "all" to use all models available
        in `directory`. For a full list of available and supported models,
        run `eo_tides.utils.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    time_offset: str, optional
        The time offset/delta used to generate a time series of
        offset tide heights required for phase calculation. Defaults
        to "15 min"; can be any string passed to `pandas.Timedelta`.
    return_tides: bool, optional
        Whether to return intermediate modelled tide heights as a
        "tide_height" column in the output dataframe. Defaults to False.
    **model_tides_kwargs :
        Optional parameters passed to the `eo_tides.model.model_tides`
        function. Important parameters include `output_format` (e.g.
        whether to return results in wide or long format), `mode`
        (e.g. "one-to-many" or "one-to-one"), `crop` (whether to crop
        tide model constituent files on-the-fly to improve
        performance) etc.

    Returns
    -------
    pandas.DataFrame
        A dataframe containing modelled tide phases.

    """

    # `output_format` is handled here rather than by `model_tides`:
    # phases are derived from long-format tide heights, so any pivot
    # to wide format has to happen after the phase column is added
    output_format = model_tides_kwargs.pop("output_format", "long")

    # Read `mode` without removing it, so the requested mode is still
    # forwarded to both `model_tides` calls below
    mode = model_tides_kwargs.get("mode", "one-to-many")

    # Parse times once so that string or list inputs support both the
    # offset arithmetic and the one-to-one reindexing below
    time_parsed = pd.to_datetime(time)

    # Model tides
    tide_df = model_tides(
        x=x,
        y=y,
        time=time,
        model=model,
        directory=directory,
        **model_tides_kwargs,
    )

    # Model tides for a time shortly before (by default, 15 minutes)
    # each requested time. This allows us to compare tide heights
    # to see if they are rising or falling.
    pre_df = model_tides(
        x=x,
        y=y,
        time=time_parsed - pd.Timedelta(time_offset),
        model=model,
        directory=directory,
        **model_tides_kwargs,
    )

    # Compare tides computed for each timestep. If the previous tide
    # was higher than the current tide, the tide is 'ebbing'. If the
    # previous tide was lower, the tide is 'flowing'. The comparison is
    # positional (`.values`) because the two dataframes are indexed by
    # different times.
    ebb_flow = (tide_df.tide_height < pre_df.tide_height.values).replace({True: "ebb", False: "flow"})

    # If tides are equal or greater than 0, then "high", otherwise "low"
    high_low = (tide_df.tide_height >= 0).replace({True: "high", False: "low"})

    # Combine into one string and add to data
    tide_df["tide_phase"] = high_low.astype(str) + "-" + ebb_flow.astype(str)

    # Optionally convert to a wide format dataframe with a tide model in
    # each dataframe column
    if output_format == "wide":
        # Pivot into wide format with each tide model as a column
        print("Converting to a wide format dataframe")
        tide_df = tide_df.pivot(columns="tide_model")

        # If in 'one-to-one' mode, reindex using our input time/x/y
        # values to ensure the output is sorted the same as our inputs
        if mode == "one-to-one":
            output_indices = pd.MultiIndex.from_arrays([time_parsed, x, y], names=["time", "x", "y"])
            tide_df = tide_df.reindex(output_indices)

        # Optionally drop tide heights, leaving one phase column per model
        if not return_tides:
            return tide_df.drop("tide_height", axis=1)["tide_phase"]

        return tide_df

    # Long format: optionally drop tide heights
    if not return_tides:
        return tide_df.drop("tide_height", axis=1)

    return tide_df
|