eo-tides 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
eo_tides/model.py ADDED
@@ -0,0 +1,825 @@
1
+ # Used to postpone evaluation of type annotations
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import textwrap
6
+ from concurrent.futures import ProcessPoolExecutor
7
+ from concurrent.futures.process import BrokenProcessPool
8
+ from functools import partial
9
+ from typing import TYPE_CHECKING
10
+
11
+ import psutil
12
+
13
+ # Only import if running type checking
14
+ if TYPE_CHECKING:
15
+ import xarray as xr
16
+
17
+ import geopandas as gpd
18
+ import numpy as np
19
+ import pandas as pd
20
+ import pyproj
21
+ import pyTMD
22
+ from tqdm import tqdm
23
+
24
+ from .utils import DatetimeLike, _set_directory, _standardise_models, _standardise_time, idw
25
+
26
+
27
+ def _parallel_splits(
28
+ total_points: int,
29
+ model_count: int,
30
+ parallel_max: int | None = None,
31
+ min_points_per_split: int = 1000,
32
+ ) -> int:
33
+ """
34
+ Calculates the optimal number of parallel splits for data
35
+ processing based on system resources and processing constraints.
36
+
37
+ Parameters:
38
+ -----------
39
+ total_points : int
40
+ Total number of data points to process
41
+ model_count : int
42
+ Number of models that will be run in parallel
43
+ parallel_max : int, optional
44
+ Maximum number of parallel processes to use. If None, uses CPU core count
45
+ min_points_per_split : int, default=1000
46
+ Minimum number of points that should be processed in each split
47
+ """
48
+ # Get available CPUs. First see if `CPU_GUARANTEE` exists in
49
+ # environment (if running in JupyterHub); if not use psutil
50
+ # followed by standard CPU count
51
+ if parallel_max is None:
52
+ # Take the first valid output
53
+ raw_value = os.environ.get("CPU_GUARANTEE") or psutil.cpu_count(logical=False) or os.cpu_count() or 1
54
+
55
+ # Convert to integer
56
+ if isinstance(raw_value, str):
57
+ parallel_max = int(float(raw_value))
58
+ else:
59
+ parallel_max = int(raw_value)
60
+
61
+ # Calculate optimal number of splits based on constraints
62
+ splits_by_size = total_points / min_points_per_split
63
+ splits_by_cpu = parallel_max / model_count
64
+ optimal_splits = min(splits_by_size, splits_by_cpu)
65
+
66
+ # Convert to integer and ensure at least 1 split
67
+ final_split_count = int(max(1, optimal_splits))
68
+ return final_split_count
69
+
70
+
71
def _model_tides(
    model,
    x,
    y,
    time,
    directory,
    crs,
    mode,
    output_units,
    method,
    extrapolate,
    cutoff,
    crop,
    crop_buffer,
):
    """Model tide heights for a single tide model using `pyTMD`.

    Worker function applied (optionally in parallel) by `model_tides`.
    Extracts tidal constituents at the requested points, predicts tide
    heights (including inferred minor constituents) and returns them as
    a long-format `pandas.DataFrame` indexed by "time", "x" and "y",
    with "tide_model" and "tide_height" columns.

    Raises an `Exception` with an explanatory message if `pyTMD`
    raises `IndexError` while extracting constituents.
    """
    # Obtain model details
    pytmd_model = pyTMD.io.model(directory).elevation(model)

    # Reproject x, y to longitude/latitude (always_xy keeps lon first)
    to_lonlat = pyproj.Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
    lon, lat = to_lonlat.transform(x.flatten(), y.flatten())

    # Convert datetimes into a pyTMD timescale
    timescale = pyTMD.time.timescale().from_datetime(time.flatten())

    try:
        # Read tidal constants and interpolate to the requested points
        amplitude, phase, constituents = pytmd_model.extract_constants(
            lon,
            lat,
            type=pytmd_model.type,
            crop=crop,
            buffer=crop_buffer,
            method=method,
            extrapolate=extrapolate,
            cutoff=cutoff,
            append_node=False,
        )

    # Raise error if constituent files do not cover analysis extent
    except IndexError:
        error_msg = f"""
            The {model} tide model constituent files do not cover the analysis extent
            ({min(lon):.2f}, {max(lon):.2f}, {min(lat):.2f}, {max(lat):.2f}).
            This can occur if you are using clipped model files to improve run times.
            Consider using model files that cover your entire analysis area, or set `crop=False`
            to reduce the extent of tide model constituent files that is loaded.
            """
        raise Exception(textwrap.dedent(error_msg).strip()) from None

    # Constituent oscillation: amplitude multiplied by the complex
    # exponential of the phase (degrees converted to radians)
    hc = amplitude * np.exp(-1j * phase * np.pi / 180.0)

    # Compute delta times based on model: these correction types use a
    # zero delta time (to match TMD outputs); all others use the
    # interpolated delta times from the timescale
    uses_zero_deltat = pytmd_model.corrections in ("OTIS", "ATLAS", "TMD3", "netcdf")
    deltat = np.zeros_like(timescale.tt_ut1) if uses_zero_deltat else timescale.tt_ut1

    # In "one-to-many" mode every timestep is modelled at every point,
    # so timesteps are tiled once per point and constituents repeated
    # once per timestep. In any other mode inputs are already paired,
    # so nothing is repeated.
    if mode == "one-to-many":
        points_repeat, time_repeat = len(x), len(time)
    else:
        points_repeat, time_repeat = 1, 1

    t = np.tile(timescale.tide, points_repeat)
    hc = hc.repeat(time_repeat, axis=0)
    deltat = np.tile(deltat, points_repeat)

    # Create masked output array; a row is masked if any of its
    # constituents is masked
    tide = np.ma.zeros((len(t)), fill_value=np.nan)
    tide.mask = np.any(hc.mask, axis=1)

    # Predict tidal elevations at each time
    tide.data[:] = pyTMD.predict.drift(
        t,
        hc,
        constituents,
        deltat=deltat,
        corrections=pytmd_model.corrections,
    )

    # Infer minor constituent corrections and add them to the prediction
    minor = pyTMD.predict.infer_minor(
        t,
        hc,
        constituents,
        deltat=deltat,
        corrections=pytmd_model.corrections,
        minor=pytmd_model.minor,
    )
    tide.data[:] += minor.data[:]

    # Replace invalid (masked) values with the fill value
    tide.data[tide.mask] = tide.fill_value

    # Convert data to pandas.DataFrame, repeating the input time/x/y
    # values so they line up with the modelled heights
    tide_df = pd.DataFrame({
        "time": np.tile(time, points_repeat),
        "x": np.repeat(x, time_repeat),
        "y": np.repeat(y, time_repeat),
        "tide_model": model,
        "tide_height": tide,
    })
    tide_df = tide_df.set_index(["time", "x", "y"])

    # Optionally convert outputs to integer units (can save memory):
    # metres stay floating point, centimetres/millimetres are scaled
    # and stored as 16-bit integers
    integer_scales = {"cm": 100, "mm": 1000}
    if output_units == "m":
        tide_df["tide_height"] = tide_df.tide_height.astype(np.float32)
    elif output_units in integer_scales:
        tide_df["tide_height"] = (tide_df.tide_height * integer_scales[output_units]).astype(np.int16)

    return tide_df
201
+
202
+
203
def ensemble_tides(
    tide_df,
    crs,
    ensemble_models,
    ensemble_func=None,
    ensemble_top_n=3,
    ranking_points="https://dea-public-data-dev.s3-ap-southeast-2.amazonaws.com/derivative/dea_intertidal/supplementary/rankings_ensemble_2017-2019.fgb",
    ranking_valid_perc=0.02,
    **idw_kwargs,
):
    """Combine multiple tide models into a single locally optimised
    ensemble tide model using external model ranking data (e.g.
    satellite altimetry or NDWI-tide correlations along the coastline)
    to inform the selection of the best local models.

    This function performs the following steps:

    1. Takes a dataframe of tide heights from multiple tide models, as
       produced by `eo_tides.model.model_tides`
    2. Loads model ranking points from an external file, filters them
       based on the valid data percentage, and retains relevant columns
    3. Interpolates the model rankings into the coordinates of the
       original dataframe using Inverse Distance Weighted (IDW)
       interpolation
    4. Uses rankings to combine multiple tide models into a single
       optimised ensemble model (by default, by taking the mean of the
       top 3 ranked models)
    5. Returns a new dataframe with the combined ensemble model predictions

    Parameters
    ----------
    tide_df : pandas.DataFrame
        DataFrame produced by `eo_tides.model.model_tides`, containing
        tide model predictions in long format with columns:
        `["time", "x", "y", "tide_height", "tide_model"]`.
    crs : string
        Coordinate reference system for the "x" and "y" coordinates in
        `tide_df`. Used to ensure that interpolations are performed
        in the correct CRS.
    ensemble_models : list
        A list of models to include in the ensemble modelling process.
        All values must exist as columns with the prefix "rank_" in
        `ranking_points`.
    ensemble_func : dict, optional
        By default, a simple ensemble model will be calculated by taking
        the mean of the `ensemble_top_n` tide models at each location.
        However, a dictionary containing more complex ensemble
        calculations can also be provided. Dictionary keys are used
        to name output ensemble models; functions should take a column
        named "rank" and convert it to a weighting, e.g.:
        `ensemble_func = {"ensemble-custom": lambda x: x["rank"] <= 3}`
    ensemble_top_n : int, optional
        If `ensemble_func` is None, this sets the number of top models
        to include in the mean ensemble calculation. Defaults to 3.
    ranking_points : str, optional
        Path to the file containing model ranking points. This dataset
        should include columns containing rankings for each tide
        model, named with the prefix "rank_". e.g. "rank_EOT20".
        Low values should represent high rankings (e.g. 1 = top ranked).
        The default value points to an example file covering Australia.
    ranking_valid_perc : float, optional
        Minimum percentage of valid data required to include a model
        rank point in the analysis, as defined in a column named
        "valid_perc". Defaults to 0.02.
    **idw_kwargs
        Optional keyword arguments to pass to the `idw` function used
        for interpolation. Useful values include `k` (number of nearest
        neighbours to use in interpolation), `max_dist` (maximum
        distance to nearest neighbours), and `k_min` (minimum number of
        neighbours required after `max_dist` is applied).

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the ensemble model predictions, matching
        the format of the input `tide_df` (e.g. columns `["time", "x",
        "y", "tide_height", "tide_model"]`). By default the 'tide_model'
        column will be labeled "ensemble" for the combined model
        predictions (but if a custom dictionary of ensemble functions is
        provided via `ensemble_func`, each ensemble will be named using
        the provided dictionary keys).

    Raises
    ------
    Exception
        If `tide_df` lacks the long-format "tide_model" or "tide_height"
        columns, or if a `KeyError` is raised while loading and
        subsetting the ranking points to the expected "rank_" columns.

    """
    # Raise error if `tide_df` was provided in wide format. Both
    # long-format columns are required below, so check for both rather
    # than failing later with an unrelated error.
    if "tide_model" not in tide_df or "tide_height" not in tide_df:
        raise Exception(
            "`tide_df` does not contain the expected 'tide_model' and "
            "'tide_height' columns. Ensure that tides were modelled in "
            "long format (i.e. `output_format='long'` in `model_tides`)."
        )

    # Extract x and y coords from dataframe
    x = tide_df.index.get_level_values(level="x")
    y = tide_df.index.get_level_values(level="y")

    # Load model ranks points and reproject to same CRS as x and y
    model_ranking_cols = [f"rank_{m}" for m in ensemble_models]
    try:
        model_ranks_gdf = (
            gpd.read_file(ranking_points, engine="pyogrio")
            .to_crs(crs)
            .query(f"valid_perc > {ranking_valid_perc}")
            .dropna(how="all")[model_ranking_cols + ["geometry"]]
        )
    except KeyError:
        error_msg = f"""
            Not all of the expected "rank_" columns {model_ranking_cols} were
            found in the columns of the ranking points file ({ranking_points}).
            Consider passing a custom list of models using `ensemble_models`.
            """
        raise Exception(textwrap.dedent(error_msg).strip()) from None

    # Use points to interpolate model rankings into requested x and y
    id_kwargs_str = idw_kwargs if idw_kwargs else ""
    print(f"Interpolating model rankings using IDW interpolation {id_kwargs_str}")
    ensemble_ranks_df = (
        # Run IDW interpolation on subset of ranking columns
        pd.DataFrame(
            idw(
                input_z=model_ranks_gdf[model_ranking_cols],
                input_x=model_ranks_gdf.geometry.x,
                input_y=model_ranks_gdf.geometry.y,
                output_x=x,
                output_y=y,
                **idw_kwargs,
            ),
            columns=model_ranking_cols,
        )
        .assign(x=x, y=y)
        # Drop any duplicates then melt columns into long format
        .drop_duplicates()
        .melt(id_vars=["x", "y"], var_name="tide_model", value_name="rank")
        # Remove "rank_" prefix to get plain model names
        .replace({"^rank_": ""}, regex=True)
        # Set index columns and rank across groups
        .set_index(["tide_model", "x", "y"])
        .groupby(["x", "y"])
        .rank()
    )

    # If no custom ensemble funcs are provided, use a default ensemble
    # calculation that takes the mean of the top N tide models
    if ensemble_func is None:
        ensemble_func = {"ensemble": lambda df: df["rank"] <= ensemble_top_n}

    # Create output list to hold computed ensemble model outputs
    ensemble_list = []

    # Loop through all provided ensemble generation functions
    for ensemble_n, ensemble_f in ensemble_func.items():
        print(f"Combining models into single {ensemble_n} model")

        # Join ranks to input tide data, compute weightings and group
        grouped = (
            # Add tide model as an index so we can join with model ranks
            tide_df.set_index("tide_model", append=True)
            .join(ensemble_ranks_df)
            # Add temp columns containing weightings and weighted values
            .assign(
                weights=ensemble_f,  # use custom func to compute weights
                weighted=lambda i: i.tide_height * i.weights,
            )
            # Groupby is specified in a weird order here as this seems
            # to be the easiest way to preserve correct index sorting
            .groupby(["x", "y", "time"])
        )

        # Use weightings to combine multiple models into single ensemble
        ensemble_df = (
            # Calculate weighted mean and convert back to dataframe
            grouped.weighted.sum()
            .div(grouped.weights.sum())
            .to_frame("tide_height")
            # Label ensemble model and ensure indexes are in expected order
            .assign(tide_model=ensemble_n)
            .reorder_levels(["time", "x", "y"], axis=0)
        )

        ensemble_list.append(ensemble_df)

    # Combine all ensemble models and return as a single dataframe
    return pd.concat(ensemble_list)
384
+
385
+
386
def model_tides(
    x: float | list[float] | xr.DataArray,
    y: float | list[float] | xr.DataArray,
    time: DatetimeLike,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    crs: str = "EPSG:4326",
    mode: str = "one-to-many",
    output_format: str = "long",
    output_units: str = "m",
    method: str = "linear",
    extrapolate: bool = True,
    cutoff: float | None = None,
    crop: bool = True,
    crop_buffer: float | None = 5,
    parallel: bool = True,
    parallel_splits: int | str = "auto",
    parallel_max: int | None = None,
    ensemble_models: list[str] | None = None,
    **ensemble_kwargs,
) -> pd.DataFrame:
    """
    Model tide heights at multiple coordinates and/or timesteps
    using one or more ocean tide models.

    This function is parallelised to improve performance, and
    supports all tidal models supported by `pyTMD`, including:

    - Empirical Ocean Tide model (EOT20)
    - Finite Element Solution tide models (FES2022, FES2014, FES2012)
    - TOPEX/POSEIDON global tide models (TPXO10, TPXO9, TPXO8)
    - Global Ocean Tide models (GOT5.6, GOT5.5, GOT4.10, GOT4.8, GOT4.7)
    - Hamburg direct data Assimilation Methods for Tides models (HAMTIDE11)

    This function requires access to tide model data files.
    These should be placed in a folder with subfolders matching
    the structure required by `pyTMD`. For more details:
    <https://geoscienceaustralia.github.io/eo-tides/setup/>
    <https://pytmd.readthedocs.io/en/latest/getting_started/Getting-Started.html#directories>

    This function is a modification of the `pyTMD` package's
    `pyTMD.compute.tide_elevations` function. For more info:
    <https://pytmd.readthedocs.io/en/latest/api_reference/compute.html#pyTMD.compute.tide_elevations>

    Parameters
    ----------
    x, y : float or list of float
        One or more x and y coordinates used to define
        the location at which to model tides. By default these
        coordinates should be lat/lon; use "crs" if they
        are in a custom coordinate reference system.
    time : DatetimeLike
        Times at which to model tide heights (in UTC). Accepts
        any format that can be converted by `pandas.to_datetime()`;
        e.g. np.ndarray[datetime64], pd.DatetimeIndex, pd.Timestamp,
        datetime.datetime and strings (e.g. "2020-01-01 23:00").
        For example: `time=pd.date_range(start="2000", end="2001", freq="5h")`
    model : str or list of str, optional
        The tide model (or list of models) to use to model tides.
        Defaults to "EOT20"; specify "all" to use all models available
        in `directory`. For a full list of available and supported models,
        run `eo_tides.utils.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    crs : str, optional
        Input coordinate reference system for x and y coordinates.
        Defaults to "EPSG:4326" (WGS84; degrees latitude, longitude).
    mode : str, optional
        The analysis mode to use for tide modelling. Supports two options:

        - "one-to-many": Models tides for every timestep in "time" at
        every input x and y coordinate point. This is useful if you
        want to model tides for a specific list of timesteps across
        multiple spatial points (e.g. for the same set of satellite
        acquisition times at various locations across your study area).
        - "one-to-one": Model tides using a unique timestep for each
        set of x and y coordinates. In this mode, the number of x and
        y points must equal the number of timesteps provided in "time".
    output_format : str, optional
        Whether to return the output dataframe in long format (with
        results stacked vertically along "tide_model" and "tide_height"
        columns), or wide format (with a column for each tide model).
        Defaults to "long".
    output_units : str, optional
        Whether to return modelled tides in floating point metre units,
        or integer centimetre units (i.e. scaled by 100) or integer
        millimetre units (i.e. scaled by 1000). Returning outputs in
        integer units can be useful for reducing memory usage.
        Defaults to "m" for metres; set to "cm" for centimetres or "mm"
        for millimetres.
    method : str, optional
        Method used to interpolate tidal constituents
        from model files. Defaults to "linear"; options include:

        - "linear", "nearest": scipy regular grid interpolations
        - "spline": scipy bivariate spline interpolation
        - "bilinear": quick bilinear interpolation
    extrapolate : bool, optional
        Whether to extrapolate tides into x and y coordinates outside of
        the valid tide modelling domain using nearest-neighbor.
    cutoff : float, optional
        Extrapolation cutoff in kilometers. The default is None, which
        will extrapolate for all points regardless of distance from the
        valid tide modelling domain.
    crop : bool, optional
        Whether to crop tide model constituent files on-the-fly to
        improve performance. Defaults to True; use `crop_buffer`
        to customise the buffer distance used to crop the files.
    crop_buffer : int or float, optional
        The buffer distance in degrees used to crop tide model
        constituent files around the modelling area. Defaults to 5,
        which will crop constituents using a five degree buffer on either
        side of the analysis extent.
    parallel : bool, optional
        Whether to parallelise tide modelling. If multiple tide models are
        requested, these will be run in parallel using `concurrent.futures`.
        If enough workers are available, the analysis will also be split
        into spatial chunks for additional parallelisation (see "parallel_splits"
        below). Default is True.
    parallel_splits : str or int, optional
        Whether to split the input x and y coordinates into smaller,
        evenly-sized chunks that are processed in parallel. This can
        provide a large performance boost when processing large numbers
        of coordinates. The default is "auto", which will automatically
        attempt to determine optimal splits based on available CPUs,
        the number of input points, and the number of models.
    parallel_max : int, optional
        Maximum number of processes to run in parallel. The default of
        None will automatically determine this from your available CPUs.
    ensemble_models : list of str, optional
        An optional list of models used to generate the ensemble tide
        model if "ensemble" tide modelling is requested. Defaults to
        `["EOT20", "FES2012", "FES2014_extrapolated", "FES2022_extrapolated",
        "GOT4.10", "GOT5.5_extrapolated", "GOT5.6_extrapolated",
        "TPXO10-atlas-v2-nc", "TPXO8-atlas-nc", "TPXO9-atlas-v5-nc"]`.
    **ensemble_kwargs :
        Keyword arguments used to customise the generation of optional
        ensemble tide models if "ensemble" modelling is requested.
        These are passed to the underlying `ensemble_tides` function.
        Useful parameters include `ranking_points` (path to model
        rankings data), `k` (for controlling how model rankings are
        interpolated), and `ensemble_top_n` (how many top models to use
        in the ensemble calculation).

    Returns
    -------
    pandas.DataFrame
        A dataframe containing modelled tide heights.

    Raises
    ------
    AssertionError
        If any input argument fails validation.
    ValueError
        If `parallel_splits` is larger than the number of points.
    RuntimeError
        If the parallel worker pool breaks during modelling.

    """
    # Turn inputs into arrays for consistent handling
    x = np.atleast_1d(x)
    y = np.atleast_1d(y)
    time = _standardise_time(time)

    # Validate input arguments
    assert time is not None, "Times for modelling tides must be provided via `time`."
    assert method in ("bilinear", "spline", "linear", "nearest")
    assert output_units in (
        "m",
        "cm",
        "mm",
    ), "Output units must be either 'm', 'cm', or 'mm'."
    assert output_format in (
        "long",
        "wide",
    ), "Output format must be either 'long' or 'wide'."
    # An unrecognised mode previously slipped through here and only
    # failed later (with unbound variables) in the parallel branch
    assert mode in (
        "one-to-many",
        "one-to-one",
    ), "Mode must be either 'one-to-many' or 'one-to-one'."
    assert np.issubdtype(x.dtype, np.number), "`x` must contain only valid numeric values, and must not be None."
    assert np.issubdtype(y.dtype, np.number), "`y` must contain only valid numeric values, and must not be None.."
    assert len(x) == len(y), "x and y must be the same length."
    if mode == "one-to-one":
        assert len(x) == len(time), (
            "The number of supplied x and y points and times must be "
            "identical in 'one-to-one' mode. Use 'one-to-many' mode if "
            "you intended to model multiple timesteps at each point."
        )

    # Set tide modelling files directory. If no custom path is
    # provided, try global environment variable.
    directory = _set_directory(directory)

    # Standardise model list, handling "all" and "ensemble" functionality
    models_to_process, models_requested, ensemble_models = _standardise_models(
        model=model,
        directory=directory,
        ensemble_models=ensemble_models,
    )

    # Update tide modelling func to add default keyword arguments that
    # are used for every iteration during parallel processing
    iter_func = partial(
        _model_tides,
        directory=directory,
        crs=crs,
        mode=mode,
        output_units=output_units,
        method=method,
        extrapolate=extrapolate,
        cutoff=np.inf if cutoff is None else cutoff,
        crop=crop,
        crop_buffer=crop_buffer,
    )

    # If automatic parallel splits, calculate optimal value
    # based on available parallelisation, number of points
    # and number of models
    if parallel_splits == "auto":
        parallel_splits = _parallel_splits(
            total_points=len(x),
            model_count=len(models_to_process),
            parallel_max=parallel_max,
        )

    # Verify that parallel splits are not larger than number of points
    assert isinstance(parallel_splits, int)
    if parallel_splits > len(x):
        raise ValueError(f"Parallel splits ({parallel_splits}) cannot be larger than the number of points ({len(x)}).")

    # Parallelise if either multiple models or multiple splits requested
    if parallel and (len(models_to_process) > 1 or parallel_splits > 1):
        with ProcessPoolExecutor(max_workers=parallel_max) as executor:
            print(
                f"Modelling tides with {', '.join(models_to_process)} in parallel (models: {len(models_to_process)}, splits: {parallel_splits})"
            )

            # Optionally split lon/lat points into `parallel_splits`
            # chunks that will be applied in parallel
            x_split = np.array_split(x, parallel_splits)
            y_split = np.array_split(y, parallel_splits)

            # In "one-to-many" mode, pass the entire set of timesteps to
            # each parallel iteration. In "one-to-one" mode, split up
            # timesteps into the same chunks as the points.
            if mode == "one-to-many":
                time_split = [time] * parallel_splits
            else:
                time_split = np.array_split(time, parallel_splits)

            # Get every combination of models and point chunks, and
            # extract as iterables that can be passed to `executor.map()`
            model_iters, x_iters, y_iters, time_iters = zip(
                *[
                    (m, x_split[i], y_split[i], time_split[i])
                    for m in models_to_process
                    for i in range(parallel_splits)
                ],
            )

            # Apply func in parallel, iterating through each input param
            try:
                model_outputs = list(
                    tqdm(
                        executor.map(iter_func, model_iters, x_iters, y_iters, time_iters),
                        total=len(model_iters),
                    ),
                )
            except BrokenProcessPool as err:
                error_msg = (
                    "Parallelised tide modelling failed, likely due to an out-of-memory error. "
                    "Try reducing the size of your analysis, or set `parallel=False`."
                )
                raise RuntimeError(error_msg) from err

    # Model tides in series if parallelisation is off
    else:
        model_outputs = []

        for model_i in models_to_process:
            print(f"Modelling tides with {model_i}")
            tide_df = iter_func(model_i, x, y, time)
            model_outputs.append(tide_df)

    # Combine outputs into a single dataframe
    tide_df = pd.concat(model_outputs, axis=0)

    # Optionally compute ensemble model and add to dataframe
    if "ensemble" in models_requested:
        ensemble_df = ensemble_tides(tide_df, crs, ensemble_models, **ensemble_kwargs)

        # Update requested models with any custom ensemble models, then
        # filter the dataframe to keep only models originally requested.
        # NOTE: `models_requested` is referenced by name inside the
        # query string below, so the variable must keep this name.
        models_requested = list(np.union1d(models_requested, ensemble_df.tide_model.unique()))
        tide_df = pd.concat([tide_df, ensemble_df]).query("tide_model in @models_requested")

    # Optionally convert to a wide format dataframe with a tide model in
    # each dataframe column
    if output_format == "wide":
        # Pivot into wide format with each tide model as a column
        print("Converting to a wide format dataframe")
        tide_df = tide_df.pivot(columns="tide_model", values="tide_height")

        # If in 'one-to-one' mode, reindex using our input time/x/y
        # values to ensure the output is sorted the same as our inputs
        if mode == "one-to-one":
            output_indices = pd.MultiIndex.from_arrays([time, x, y], names=["time", "x", "y"])
            tide_df = tide_df.reindex(output_indices)

    return tide_df
692
+
693
+
694
def model_phases(
    x: float | list[float] | xr.DataArray,
    y: float | list[float] | xr.DataArray,
    time: DatetimeLike,
    model: str | list[str] = "EOT20",
    directory: str | os.PathLike | None = None,
    time_offset: str = "15 min",
    return_tides: bool = False,
    **model_tides_kwargs,
) -> pd.DataFrame:
    """
    Model tide phases (low-flow, high-flow, high-ebb, low-ebb)
    at multiple coordinates and/or timesteps using one
    or more ocean tide models.

    Ebb and flow phases are calculated by running the
    `eo_tides.model.model_tides` function twice, once for
    the requested timesteps, and again after subtracting a
    small time offset (by default, 15 minutes). If tides
    increased over this period, they are assigned as "flow";
    if they decreased, they are assigned as "ebb".
    Tides are considered "high" if equal or greater than 0
    metres tide height, otherwise "low".

    This function supports all parameters that are supported
    by `model_tides`.

    Parameters
    ----------
    x, y : float or list of float
        One or more x and y coordinates used to define
        the location at which to model tide phases. By default
        these coordinates should be lat/lon; use "crs" if they
        are in a custom coordinate reference system.
    time : DatetimeLike
        Times at which to model tide phases (in UTC). Accepts
        any format that can be converted by `pandas.to_datetime()`;
        e.g. np.ndarray[datetime64], pd.DatetimeIndex, pd.Timestamp,
        datetime.datetime and strings (e.g. "2020-01-01 23:00").
        For example: `time=pd.date_range(start="2000", end="2001", freq="5h")`
    model : str or list of str, optional
        The tide model (or list of models) to use to model tides.
        Defaults to "EOT20"; specify "all" to use all models available
        in `directory`. For a full list of available and supported models,
        run `eo_tides.utils.list_models`.
    directory : str, optional
        The directory containing tide model data files. If no path is
        provided, this will default to the environment variable
        `EO_TIDES_TIDE_MODELS` if set, or raise an error if not.
        Tide modelling files should be stored in sub-folders for each
        model that match the structure required by `pyTMD`
        (<https://geoscienceaustralia.github.io/eo-tides/setup/>).
    time_offset: str, optional
        The time offset/delta used to generate a time series of
        offset tide heights required for phase calculation. Defaults
        to "15 min"; can be any string passed to `pandas.Timedelta`.
    return_tides: bool, optional
        Whether to return intermediate modelled tide heights as a
        "tide_height" column in the output dataframe. Defaults to False.
    **model_tides_kwargs :
        Optional parameters passed to the `eo_tides.model.model_tides`
        function. Important parameters include `output_format` (e.g.
        whether to return results in wide or long format), `crop`
        (whether to crop tide model constituent files on-the-fly to
        improve performance) etc.

    Returns
    -------
    pandas.DataFrame
        A dataframe containing modelled tide phases.

    """
    # Turn inputs into arrays up front (the same standardisation
    # `model_tides` applies) so that the time offset arithmetic and the
    # 'one-to-one' reindex below work for every accepted input type
    # (e.g. strings, lists or scalars), not just datetime arrays
    x = np.atleast_1d(x)
    y = np.atleast_1d(y)
    time = _standardise_time(time)

    # Tides are always modelled in long format here because the phase
    # calculation needs the "tide_height" column; any wide conversion
    # is applied afterwards
    output_format = model_tides_kwargs.pop("output_format", "long")

    # Read (but do not remove) the mode so it is still forwarded to
    # `model_tides`; dropping it caused 'one-to-one' requests to be
    # modelled as 'one-to-many'
    mode = model_tides_kwargs.get("mode", "one-to-many")

    # Model tides
    tide_df = model_tides(
        x=x,
        y=y,
        time=time,
        model=model,
        directory=directory,
        **model_tides_kwargs,
    )

    # Model tides for a time offset (by default 15 minutes) prior to
    # each requested time. This allows us to compare tide heights to
    # see if they are rising or falling.
    pre_df = model_tides(
        x=x,
        y=y,
        time=time - pd.Timedelta(time_offset).to_timedelta64(),
        model=model,
        directory=directory,
        **model_tides_kwargs,
    )

    # Compare tides computed for each timestep. If the previous tide
    # was higher than the current tide, the tide is 'ebbing'. If the
    # previous tide was lower, the tide is 'flowing'. Rows are compared
    # positionally, relying on both runs returning the same row order.
    ebb_flow = (tide_df.tide_height < pre_df.tide_height.values).replace({True: "ebb", False: "flow"})

    # If tides are 0 or greater, then "high", otherwise "low"
    high_low = (tide_df.tide_height >= 0).replace({True: "high", False: "low"})

    # Combine into one string and add to data
    tide_df["tide_phase"] = high_low.astype(str) + "-" + ebb_flow.astype(str)

    # Optionally convert to a wide format dataframe with a tide model in
    # each dataframe column
    if output_format == "wide":
        # Pivot into wide format with each tide model as a column
        print("Converting to a wide format dataframe")
        tide_df = tide_df.pivot(columns="tide_model")

        # If in 'one-to-one' mode, reindex using our input time/x/y
        # values to ensure the output is sorted the same as our inputs
        if mode == "one-to-one":
            output_indices = pd.MultiIndex.from_arrays([time, x, y], names=["time", "x", "y"])
            tide_df = tide_df.reindex(output_indices)

        # Optionally drop tides; selecting "tide_phase" removes the
        # outer column level so one column per tide model remains
        if not return_tides:
            return tide_df.drop("tide_height", axis=1)["tide_phase"]

    # Optionally drop tide heights
    if not return_tides:
        return tide_df.drop("tide_height", axis=1)

    return tide_df