eo-tides 0.0.13-py3-none-any.whl → 0.0.15-py3-none-any.whl

This diff compares the content of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
eo_tides/stats.py ADDED
@@ -0,0 +1,15 @@
+ def tide_stats(a):
+     """
+     Test function.
+
+     Parameters
+     ----------
+     a : int
+         Test
+
+     Returns
+     -------
+     Test
+
+     """
+     return a
eo_tides/utils.py ADDED
@@ -0,0 +1,144 @@
+ import numpy as np
+ from scipy.spatial import cKDTree as KDTree
+
+
+ def idw(
+     input_z,
+     input_x,
+     input_y,
+     output_x,
+     output_y,
+     p=1,
+     k=10,
+     max_dist=None,
+     k_min=1,
+     epsilon=1e-12,
+ ):
+     """Perform Inverse Distance Weighting (IDW) interpolation.
+
+     This function performs fast IDW interpolation by creating a KDTree
+     from the input coordinates, then using it to find the `k` nearest
+     neighbours for each output point. Weights are calculated based on the
+     inverse distance to each neighbour, with weights decreasing as
+     distance increases.
+
+     Code inspired by: https://github.com/DahnJ/REM-xarray
+
+     Parameters
+     ----------
+     input_z : array-like
+         Array of values at the input points. This can be either a
+         1-dimensional array, or a 2-dimensional array where each column
+         (axis=1) represents a different set of values to be interpolated.
+     input_x : array-like
+         Array of x-coordinates of the input points.
+     input_y : array-like
+         Array of y-coordinates of the input points.
+     output_x : array-like
+         Array of x-coordinates where the interpolation is to be computed.
+     output_y : array-like
+         Array of y-coordinates where the interpolation is to be computed.
+     p : int or float, optional
+         Power function parameter defining how rapidly weightings should
+         decrease as distance increases. Higher values of `p` will cause
+         weights for distant points to decrease rapidly, resulting in
+         nearby points having more influence on predictions. Defaults to 1.
+     k : int, optional
+         Number of nearest neighbours to use for interpolation. `k=1` is
+         equivalent to "nearest" neighbour interpolation. Defaults to 10.
+     max_dist : int or float, optional
+         Restrict neighbouring points to less than this distance.
+         By default, no distance limit is applied.
+     k_min : int, optional
+         If `max_dist` is provided, some points may end up with fewer than
+         `k` nearest neighbours, potentially producing less reliable
+         interpolations. Use `k_min` to set any points with fewer than
+         `k_min` neighbours to NaN. Defaults to 1.
+     epsilon : float, optional
+         Small value added to distances to prevent division by zero
+         errors in the case that output coordinates are identical to
+         input coordinates. Defaults to 1e-12.
+
+     Returns
+     -------
+     interp_values : numpy.ndarray
+         Interpolated values at the output coordinates. If `input_z` is
+         1-dimensional, `interp_values` will also be 1-dimensional. If
+         `input_z` is 2-dimensional, `interp_values` will have the same
+         number of rows as `input_z`, with each column (axis=1)
+         representing interpolated values for one set of input data.
+
+     Examples
+     --------
+     >>> input_z = [1, 2, 3, 4, 5]
+     >>> input_x = [0, 1, 2, 3, 4]
+     >>> input_y = [0, 1, 2, 3, 4]
+     >>> output_x = [0.5, 1.5, 2.5]
+     >>> output_y = [0.5, 1.5, 2.5]
+     >>> idw(input_z, input_x, input_y, output_x, output_y, k=2)
+     array([1.5, 2.5, 3.5])
+
+     """
+     # Convert to numpy arrays
+     input_x = np.atleast_1d(input_x)
+     input_y = np.atleast_1d(input_y)
+     input_z = np.atleast_1d(input_z)
+     output_x = np.atleast_1d(output_x)
+     output_y = np.atleast_1d(output_y)
+
+     # Verify inputs and outputs have matching lengths
+     if not (input_z.shape[0] == len(input_x) == len(input_y)):
+         raise ValueError("All of `input_z`, `input_x` and `input_y` must be the same length.")
+     if not (len(output_x) == len(output_y)):
+         raise ValueError("Both `output_x` and `output_y` must be the same length.")
+
+     # Verify k is not larger than the total number of points, and is non-zero
+     if k > input_z.shape[0]:
+         raise ValueError(
+             f"The requested number of nearest neighbours (`k={k}`) "
+             f"is larger than the total number of points ({input_z.shape[0]}).",
+         )
+     if k == 0:
+         raise ValueError("Interpolation based on `k=0` nearest neighbours is not valid.")
+
+     # Create KDTree to efficiently find nearest neighbours
+     points_xy = np.column_stack((input_y, input_x))
+     tree = KDTree(points_xy)
+
+     # Determine nearest neighbours and distances to each
+     grid_stacked = np.column_stack((output_y, output_x))
+     distances, indices = tree.query(grid_stacked, k=k, workers=-1)
+
+     # If k == 1, add an additional axis for consistency
+     if k == 1:
+         distances = distances[..., np.newaxis]
+         indices = indices[..., np.newaxis]
+
+     # Add small epsilon to distances to prevent division by zero errors
+     # if output coordinates are the same as input coordinates
+     distances = np.maximum(distances, epsilon)
+
+     # Set distances above max to NaN if specified
+     if max_dist is not None:
+         distances[distances > max_dist] = np.nan
+
+     # Calculate weights based on distance to k nearest neighbours
+     weights = 1 / np.power(distances, p)
+     weights = weights / np.nansum(weights, axis=1).reshape(-1, 1)
+
+     # 1D case: Compute weighted sum of input_z values for each output point
+     if input_z.ndim == 1:
+         interp_values = np.nansum(weights * input_z[indices], axis=1)
+
+     # 2D case: Compute weighted sum for each set of input_z values
+     # (weights[..., np.newaxis] adds a dimension for broadcasting)
+     else:
+         interp_values = np.nansum(
+             weights[..., np.newaxis] * input_z[indices],
+             axis=1,
+         )
+
+     # Set any points with fewer than `k_min` valid weights to NaN
+     interp_values[np.isfinite(weights).sum(axis=1) < k_min] = np.nan
+
+     return interp_values
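Reviewer note: a minimal usage sketch of the new `idw` helper, based on the docstring example above. The coordinate and value arrays are illustrative only, not taken from the package:

    import numpy as np
    from eo_tides.utils import idw

    # Five input points along a diagonal, one value per point
    input_x = np.array([0, 1, 2, 3, 4])
    input_y = np.array([0, 1, 2, 3, 4])
    input_z = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

    # Interpolate onto three output points using the two nearest neighbours
    interp = idw(
        input_z,
        input_x,
        input_y,
        output_x=[0.5, 1.5, 2.5],
        output_y=[0.5, 1.5, 2.5],
        k=2,
    )
    print(interp)  # expected: array([1.5, 2.5, 3.5]), as in the docstring example

With the default `p=1`, each output value is simply the inverse-distance-weighted average of its `k` nearest input values.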
eo_tides/validation.py ADDED
@@ -0,0 +1,325 @@
+ import datetime
+ import glob
+ import warnings
+ from math import sqrt
+ from numbers import Number
+
+ import geopandas as gpd
+ import pandas as pd
+ from odc.geo.geom import BoundingBox
+ from pandas.tseries.offsets import MonthBegin, MonthEnd, YearBegin, YearEnd
+ from scipy import stats
+ from shapely.geometry import Point
+ from sklearn.metrics import mean_absolute_error, mean_squared_error
+
+
+ def eval_metrics(x, y, round=3, all_regress=False):
+     """
+     Calculate a set of common statistical metrics
+     based on two input actual and predicted vectors.
+
+     These include:
+
+     * Pearson correlation
+     * Root Mean Squared Error
+     * Mean Absolute Error
+     * R-squared
+     * Bias
+     * Linear regression parameters (slope, p-value, intercept, standard error)
+
+     Parameters
+     ----------
+     x : numpy.array
+         An array providing "actual" variable values.
+     y : numpy.array
+         An array providing "predicted" variable values.
+     round : int
+         Number of decimal places to round each metric
+         to. Defaults to 3.
+     all_regress : bool
+         Whether to return linear regression p-value,
+         intercept and standard error (in addition to
+         only the regression slope). Defaults to False.
+
+     Returns
+     -------
+     pandas.Series
+         A `pd.Series` containing all calculated metrics.
+     """
+
+     # Create dataframe and drop rows containing NaN in either column
+     xy_df = pd.DataFrame({"x": x, "y": y}).dropna()
+
+     # Compute linear regression
+     lin_reg = stats.linregress(x=xy_df.x, y=xy_df.y)
+
+     # Calculate statistics
+     stats_dict = {
+         "Correlation": xy_df.corr().iloc[0, 1],
+         "RMSE": sqrt(mean_squared_error(xy_df.x, xy_df.y)),
+         "MAE": mean_absolute_error(xy_df.x, xy_df.y),
+         "R-squared": lin_reg.rvalue**2,
+         "Bias": (xy_df.y - xy_df.x).mean(),
+         "Regression slope": lin_reg.slope,
+     }
+
+     # Additional regression params
+     if all_regress:
+         stats_dict.update({
+             "Regression p-value": lin_reg.pvalue,
+             "Regression intercept": lin_reg.intercept,
+             "Regression standard error": lin_reg.stderr,
+         })
+
+     # Return all metrics as a rounded pandas Series
+     return pd.Series(stats_dict).round(round)
+
+
+ def round_date_strings(date, round_type="end"):
+     """
+     Round a date string up or down to the start or end of a given time
+     period.
+
+     Parameters
+     ----------
+     date : str
+         Date string of variable precision (e.g. "2020", "2020-01",
+         "2020-01-01").
+     round_type : str, optional
+         Type of rounding to perform. Valid options are "start" or "end".
+         If "start", date is rounded down to the start of the time period.
+         If "end", date is rounded up to the end of the time period.
+         Default is "end".
+
+     Returns
+     -------
+     date_rounded : str
+         The rounded date string.
+
+     Examples
+     --------
+     >>> round_date_strings('2020')
+     '2020-12-31 00:00:00'
+
+     >>> round_date_strings('2020-01', round_type='start')
+     '2020-01-01 00:00:00'
+
+     >>> round_date_strings('2020-01', round_type='end')
+     '2020-01-31 00:00:00'
+     """
+
+     # Determine precision of input date string
+     date_segments = len(date.split("-"))
+
+     # If provided date has no "-", treat it as having year precision
+     if date_segments == 1 and round_type == "start":
+         date_rounded = str(pd.to_datetime(date) + YearBegin(0))
+     elif date_segments == 1 and round_type == "end":
+         date_rounded = str(pd.to_datetime(date) + YearEnd(0))
+
+     # If provided date has one "-", treat it as having month precision
+     elif date_segments == 2 and round_type == "start":
+         date_rounded = str(pd.to_datetime(date) + MonthBegin(0))
+     elif date_segments == 2 and round_type == "end":
+         date_rounded = str(pd.to_datetime(date) + MonthEnd(0))
+
+     # If more than one "-", then return date as-is
+     elif date_segments > 2:
+         date_rounded = date
+
+     return date_rounded
+
+
+ def _load_gauge_metadata(metadata_path):
+     # Load metadata
+     metadata_df = pd.read_csv(metadata_path)
+     metadata_df.columns = (
+         metadata_df.columns.str.replace(" ", "_", regex=False)
+         .str.replace("(", "", regex=False)
+         .str.replace(")", "", regex=False)
+         .str.replace("/", "_", regex=False)
+         .str.lower()
+     )
+     metadata_df = metadata_df.set_index("site_code")
+
+     # Convert metadata to GeoDataFrame
+     metadata_gdf = gpd.GeoDataFrame(
+         data=metadata_df,
+         geometry=gpd.points_from_xy(metadata_df.longitude, metadata_df.latitude),
+         crs="EPSG:4326",
+     )
+
+     return metadata_df, metadata_gdf
+
+
+ def _load_gesla_dataset(site, path, na_value):
+     gesla_df = (
+         pd.read_csv(
+             path,
+             skiprows=41,
+             names=["date", "time", "sea_level", "qc_flag", "use_flag"],
+             sep=r"\s+",
+             parse_dates=[[0, 1]],
+             index_col=0,
+             na_values=na_value,
+         )
+         .rename_axis("time")
+         .assign(site_code=site)
+     )
+
+     return gesla_df
+
+
+ def _nearest_row(gdf, x, y, max_distance=None):
+     # Create a point to find the nearest neighbour for
+     target_point = gpd.GeoDataFrame({"geometry": [Point(x, y)]}, crs="EPSG:4326")
+
+     # Use sjoin_nearest to find the closest point
+     return gpd.sjoin_nearest(target_point, gdf, how="left", max_distance=max_distance)
+
+
+ def load_gauge_gesla(
+     x=None,
+     y=None,
+     site_code=None,
+     time=("2018", "2020"),
+     max_distance=None,
+     correct_mean=False,
+     filter_use_flag=True,
+     site_metadata=True,
+     data_path="/gdata1/data/sea_level/gesla/",
+     metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv",
+ ):
+     """
+     Load and process all available Global Extreme Sea Level Analysis
+     (GESLA) tide gauge data with an `x, y, time` spatiotemporal query,
+     or from a list of specific tide gauges.
+
+     Can optionally filter by gauge quality and append detailed gauge metadata.
+
+     Modified from original code in <https://github.com/philiprt/GeslaDataset>.
+
+     Parameters
+     ----------
+     x, y : numeric or list/tuple, optional
+         Coordinates (in degrees longitude, latitude) used to load GESLA
+         tide gauge observations. If provided as singular values
+         (e.g. `x=150, y=-32`), then the nearest tide gauge will be returned.
+         If provided as a list or tuple (e.g. `x=(150, 152), y=(-32, -30)`),
+         then all gauges within the provided bounding box will be loaded.
+         Leave as `None` to return all available gauges, or if providing a
+         list of site codes using `site_code`.
+     site_code : str or list of str, optional
+         GESLA site code(s) for which to load data (e.g. `site_code="62650"`).
+         If `site_code` is provided, `x` and `y` will be ignored.
+     time : tuple or list of str, optional
+         Time range to consider, given as a tuple of start and end dates,
+         e.g. `time=("2020", "2021")`. Defaults to `("2018", "2020")`; pass
+         `None` to return all tide observations from the year 1800 onward.
+     max_distance : numeric, optional
+         Optional max distance within which to return the nearest tide gauge
+         when `x` and `y` are provided as singular coordinates. Defaults to
+         None, which will always return a tide gauge no matter how far away
+         it is located from `x` and `y`.
+     correct_mean : bool, optional
+         Whether to correct sea level measurements to a standardised mean
+         sea level by subtracting the mean of all observed sea levels.
+         This can be useful when GESLA tide heights come from different
+         or unknown tide datums. Note: the observed mean sea level
+         calculated here may differ from the true long-term/astronomical
+         Mean Sea Level (MSL) datum. Defaults to False.
+     filter_use_flag : bool, optional
+         Whether to filter out low quality observations with a "use_flag"
+         value of 0 (do not use). Defaults to True.
+     site_metadata : bool, optional
+         Whether to add tide gauge station metadata as additional columns
+         in the output DataFrame. Defaults to True.
+     data_path : str, optional
+         Path to the raw GESLA data files. Default is
+         `/gdata1/data/sea_level/gesla/`.
+     metadata_path : str, optional
+         Path to the GESLA station metadata file.
+         Default is `/gdata1/data/sea_level/GESLA3_ALL 2.csv`.
+
+     Returns
+     -------
+     pd.DataFrame
+         Processed GESLA data as a DataFrame with columns including:
+
+         - "time": Timestamps,
+         - "sea_level": Observed sea level (m),
+         - "qc_flag": Observed sea level QC flag,
+         - "use_flag": Use-in-analysis flag (1 = use, 0 = do not use),
+
+         ...and additional columns from station metadata.
+     """
+     # Load tide gauge metadata
+     metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path)
+
+     # Use supplied site codes if available
+     if site_code is not None:
+         site_code = [site_code] if not isinstance(site_code, list) else site_code
+
+     # If x and y are tuples, use xy bounds to identify sites
+     elif isinstance(x, (tuple, list)) & isinstance(y, (tuple, list)):
+         bbox = BoundingBox.from_xy(x, y)
+         site_code = metadata_gdf.cx[bbox.left : bbox.right, bbox.top : bbox.bottom].index
+
+     # If x and y are single numbers, select nearest row
+     elif isinstance(x, Number) & isinstance(y, Number):
+         site_code = _nearest_row(metadata_gdf, x, y, max_distance).site_code
+
+         # Raise exception if no valid tide gauges are found
+         if site_code.isnull().all():
+             raise Exception(f"No tide gauge found within {max_distance} degrees of {x}, {y}.")
+
+     # Otherwise if all are None, return all available site codes
+     elif (site_code is None) & (x is None) & (y is None):
+         site_code = metadata_df.index.to_list()
+
+     else:
+         raise TypeError(
+             "`x` and `y` must be provided as either singular coordinates (e.g. `x=150`), or as a tuple bounding box (e.g. `x=(150, 152)`)."
+         )
+
+     # Prepare times
+     if time is None:
+         time = ["1800", str(datetime.datetime.now().year)]
+     time = [time] if not isinstance(time, (list, tuple)) else time
+     start_time = round_date_strings(time[0], round_type="start")
+     end_time = round_date_strings(time[-1], round_type="end")
+
+     # Identify paths to load and nodata values for each site
+     metadata_df["file_name"] = data_path + metadata_df["file_name"]
+     paths_na = metadata_df.loc[site_code, ["file_name", "null_value"]]
+
+     # Load and combine into a single dataframe
+     data_df = (
+         pd.concat([_load_gesla_dataset(s, p, na_value=na) for s, p, na in paths_na.itertuples()])
+         .sort_index()
+         .loc[slice(start_time, end_time)]
+         .reset_index()
+         .set_index("site_code")
+     )
+
+     # Optionally filter by use flag column
+     if filter_use_flag:
+         data_df = data_df.loc[data_df.use_flag == 1]
+
+     # Optionally insert metadata into dataframe
+     if site_metadata:
+         data_df[metadata_df.columns] = metadata_df.loc[site_code]
+
+     # Add time to index and remove duplicates
+     data_df = data_df.set_index("time", append=True)
+     duplicates = data_df.index.duplicated()
+     if duplicates.sum() > 0:
+         warnings.warn("Duplicate timestamps were removed.")
+         data_df = data_df.loc[~duplicates]
+
+     # Remove observed mean sea level if requested
+     if correct_mean:
+         data_df["sea_level"] = data_df["sea_level"].sub(data_df.groupby("site_code")["sea_level"].transform("mean"))
+
+     # Return data
+     return data_df
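Reviewer note: a hedged sketch of how the new gauge loader and metrics helper might be combined to score modelled tides against GESLA observations. Only `load_gauge_gesla`, `eval_metrics` and the example site code "62650" come from the code above; `modelled_series` is a hypothetical placeholder produced elsewhere, and the default GESLA paths will need to point at a local copy of the dataset:

    from eo_tides.validation import eval_metrics, load_gauge_gesla

    # Load quality-filtered, mean-corrected observations for one gauge over 2018-2020
    gauge_df = load_gauge_gesla(
        site_code="62650",
        time=("2018", "2020"),
        correct_mean=True,
    )
    observed = gauge_df.droplevel("site_code")["sea_level"]

    # `modelled_series` is a hypothetical pandas Series of modelled tide heights
    # indexed by the same timestamps as `observed` (e.g. produced by eo_tides.model)
    # metrics = eval_metrics(x=observed, y=modelled_series)
    # print(metrics)  # Correlation, RMSE, MAE, R-squared, Bias, Regression slope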
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: eo-tides
- Version: 0.0.13
+ Version: 0.0.15
  Summary: Tide modelling tools for large-scale satellite earth observation analysis
  Author-email: Robbi Bishop-Taylor <Robbi.BishopTaylor@ga.gov.au>
  Project-URL: Homepage, https://GeoscienceAustralia.github.io/eo-tides/
@@ -0,0 +1,10 @@
+ eo_tides/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ eo_tides/model.py,sha256=Gx8oUAyo5kzyl7SZhqzS2QlnblXkubr_Wn2NijYzUlc,43811
+ eo_tides/stats.py,sha256=Lzo46pWUhox3ZUnMLtyLzqZ9FrCNG6nJ6iS5IpqEsy8,158
+ eo_tides/utils.py,sha256=l9VXJawQzaRBYaFMsP8VBeaN5VA3rFDdzcvF7Rk04Vc,5620
+ eo_tides/validation.py,sha256=kpYGHOeK-YP11c3tHt9l5_8IvOHF1SAJP79PXA7i-Vs,11434
+ eo_tides-0.0.15.dist-info/LICENSE,sha256=NYULqbFuDRV6CysPbkR2WZk863YwwHeftBtnsb4cWf8,1077
+ eo_tides-0.0.15.dist-info/METADATA,sha256=fbju5m5znWM4g24fB7L0LWSrV-torhoc51Pj3atSJG0,3585
+ eo_tides-0.0.15.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ eo_tides-0.0.15.dist-info/top_level.txt,sha256=lXZDUUM1DlLdKWHRn8zdmtW8Rx-eQOIWVvt0b8VGiyQ,9
+ eo_tides-0.0.15.dist-info/RECORD,,
@@ -1,5 +0,0 @@
- eo_tides-0.0.13.dist-info/LICENSE,sha256=NYULqbFuDRV6CysPbkR2WZk863YwwHeftBtnsb4cWf8,1077
- eo_tides-0.0.13.dist-info/METADATA,sha256=VWZoUeWqxrW1FWYBpUXZfL-ZQyC9UM4as89k1ndfWmU,3585
- eo_tides-0.0.13.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- eo_tides-0.0.13.dist-info/top_level.txt,sha256=lXZDUUM1DlLdKWHRn8zdmtW8Rx-eQOIWVvt0b8VGiyQ,9
- eo_tides-0.0.13.dist-info/RECORD,,