eo-tides 0.0.12__py3-none-any.whl → 0.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eo_tides/__init__.py +0 -0
- eo_tides/model.py +1104 -0
- eo_tides/stats.py +15 -0
- eo_tides/utils.py +144 -0
- eo_tides/validation.py +325 -0
- {eo_tides-0.0.12.dist-info → eo_tides-0.0.14.dist-info}/METADATA +1 -1
- eo_tides-0.0.14.dist-info/RECORD +10 -0
- eo_tides-0.0.12.dist-info/RECORD +0 -5
- {eo_tides-0.0.12.dist-info → eo_tides-0.0.14.dist-info}/LICENSE +0 -0
- {eo_tides-0.0.12.dist-info → eo_tides-0.0.14.dist-info}/WHEEL +0 -0
- {eo_tides-0.0.12.dist-info → eo_tides-0.0.14.dist-info}/top_level.txt +0 -0
eo_tides/stats.py
ADDED
eo_tides/utils.py
ADDED
@@ -0,0 +1,144 @@
import numpy as np
from scipy.spatial import cKDTree as KDTree


def idw(
    input_z,
    input_x,
    input_y,
    output_x,
    output_y,
    p=1,
    k=10,
    max_dist=None,
    k_min=1,
    epsilon=1e-12,
):
    """Perform Inverse Distance Weighting (IDW) interpolation.

    This function performs fast IDW interpolation by creating a KDTree
    from the input coordinates, then using it to find the `k` nearest
    neighbors for each output point. Weights are calculated based on the
    inverse distance to each neighbor, with weights decreasing with
    increasing distance.

    Code inspired by: https://github.com/DahnJ/REM-xarray

    Parameters
    ----------
    input_z : array-like
        Array of values at the input points. This can be either a
        1-dimensional array, or a 2-dimensional array where each column
        (axis=1) represents a different set of values to be interpolated.
    input_x : array-like
        Array of x-coordinates of the input points.
    input_y : array-like
        Array of y-coordinates of the input points.
    output_x : array-like
        Array of x-coordinates where the interpolation is to be computed.
    output_y : array-like
        Array of y-coordinates where the interpolation is to be computed.
    p : int or float, optional
        Power function parameter defining how rapidly weightings should
        decrease as distance increases. Higher values of `p` will cause
        weights for distant points to decrease rapidly, resulting in
        nearby points having more influence on predictions. Defaults to 1.
    k : int, optional
        Number of nearest neighbors to use for interpolation. `k=1` is
        equivalent to "nearest" neighbour interpolation. Defaults to 10.
    max_dist : int or float, optional
        Restrict neighbouring points to less than this distance.
        By default, no distance limit is applied.
    k_min : int, optional
        If `max_dist` is provided, some points may end up with less than
        `k` nearest neighbours, potentially producing less reliable
        interpolations. Set `k_min` to set any points with less than
        `k_min` neighbours to NaN. Defaults to 1.
    epsilon : float, optional
        Small value added to distances to prevent division by zero
        errors in the case that output coordinates are identical to
        input coordinates. Defaults to 1e-12.

    Returns
    -------
    interp_values : numpy.ndarray
        Interpolated values at the output coordinates. If `input_z` is
        1-dimensional, `interp_values` will also be 1-dimensional. If
        `input_z` is 2-dimensional, `interp_values` will have the same
        number of rows as `input_z`, with each column (axis=1)
        representing interpolated values for one set of input data.

    Examples
    --------
    >>> input_z = [1, 2, 3, 4, 5]
    >>> input_x = [0, 1, 2, 3, 4]
    >>> input_y = [0, 1, 2, 3, 4]
    >>> output_x = [0.5, 1.5, 2.5]
    >>> output_y = [0.5, 1.5, 2.5]
    >>> idw(input_z, input_x, input_y, output_x, output_y, k=2)
    array([1.5, 2.5, 3.5])

    """
    # Convert to numpy arrays
    input_x = np.atleast_1d(input_x)
    input_y = np.atleast_1d(input_y)
    input_z = np.atleast_1d(input_z)
    output_x = np.atleast_1d(output_x)
    output_y = np.atleast_1d(output_y)

    # Verify inputs and outputs have matching lengths
    if not (input_z.shape[0] == len(input_x) == len(input_y)):
        raise ValueError("All of `input_z`, `input_x` and `input_y` must be the same length.")
    if not (len(output_x) == len(output_y)):
        raise ValueError("Both `output_x` and `output_y` must be the same length.")

    # Verify k is smaller than total number of points, and non-zero
    if k > input_z.shape[0]:
        raise ValueError(
            f"The requested number of nearest neighbours (`k={k}`) "
            f"is larger than the total number of points ({input_z.shape[0]}).",
        )
    if k == 0:
        raise ValueError("Interpolation based on `k=0` nearest neighbours is not valid.")

    # Create KDTree to efficiently find nearest neighbours
    points_xy = np.column_stack((input_y, input_x))
    tree = KDTree(points_xy)

    # Determine nearest neighbours and distances to each
    grid_stacked = np.column_stack((output_y, output_x))
    distances, indices = tree.query(grid_stacked, k=k, workers=-1)

    # If k == 1, add an additional axis for consistency
    if k == 1:
        distances = distances[..., np.newaxis]
        indices = indices[..., np.newaxis]

    # Add small epsilon to distances to prevent division by zero errors
    # if output coordinates are the same as input coordinates
    distances = np.maximum(distances, epsilon)

    # Set distances above max to NaN if specified
    if max_dist is not None:
        distances[distances > max_dist] = np.nan

    # Calculate weights based on distance to k nearest neighbours.
    weights = 1 / np.power(distances, p)
    weights = weights / np.nansum(weights, axis=1).reshape(-1, 1)

    # 1D case: Compute weighted sum of input_z values for each output point
    if input_z.ndim == 1:
        interp_values = np.nansum(weights * input_z[indices], axis=1)

    # 2D case: Compute weighted sum for each set of input_z values
    # weights[..., np.newaxis] adds a dimension for broadcasting
    else:
        interp_values = np.nansum(
            weights[..., np.newaxis] * input_z[indices],
            axis=1,
        )

    # Set any points with less than `k_min` valid weights to NaN
    interp_values[np.isfinite(weights).sum(axis=1) < k_min] = np.nan

    return interp_values
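For orientation, here is a brief hypothetical usage sketch of `idw` (not part of the packaged files), illustrating the 2-dimensional `input_z` case and the `max_dist`/`k_min` options described in the docstring above; the coordinates and values are invented for illustration only.

import numpy as np

from eo_tides.utils import idw

# Five invented input points along a diagonal, each carrying two sets of
# values to interpolate (the columns of the 2-D `input_z` array)
input_x = [0, 1, 2, 3, 4]
input_y = [0, 1, 2, 3, 4]
input_z = np.column_stack([[1, 2, 3, 4, 5], [10, 20, 30, 40, 50]])

# Interpolate both value sets at two output locations using the two
# nearest neighbours, ignoring neighbours further than 2 units away and
# masking any output point with fewer than one valid neighbour
interp = idw(
    input_z,
    input_x,
    input_y,
    output_x=[0.5, 3.5],
    output_y=[0.5, 3.5],
    k=2,
    max_dist=2,
    k_min=1,
)
# Expected result (one row per output point, one column per value set):
# approximately [[1.5, 15.0], [4.5, 45.0]]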
eo_tides/validation.py
ADDED
@@ -0,0 +1,325 @@
import datetime
import glob
import warnings
from math import sqrt
from numbers import Number

import geopandas as gpd
import pandas as pd
from odc.geo.geom import BoundingBox
from pandas.tseries.offsets import MonthBegin, MonthEnd, YearBegin, YearEnd
from scipy import stats
from shapely.geometry import Point
from sklearn.metrics import mean_absolute_error, mean_squared_error


def eval_metrics(x, y, round=3, all_regress=False):
    """
    Calculate a set of common statistical metrics
    based on two input actual and predicted vectors.

    These include:

    * Pearson correlation
    * Root Mean Squared Error
    * Mean Absolute Error
    * R-squared
    * Bias
    * Linear regression parameters (slope, p-value, intercept, standard error)

    Parameters
    ----------
    x : numpy.array
        An array providing "actual" variable values.
    y : numpy.array
        An array providing "predicted" variable values.
    round : int
        Number of decimal places to round each metric
        to. Defaults to 3.
    all_regress : bool
        Whether to return linear regression p-value,
        intercept and standard error (in addition to
        only regression slope). Defaults to False.

    Returns
    -------
    pandas.Series
        A `pd.Series` containing all calculated metrics.
    """

    # Create dataframe to drop na
    xy_df = pd.DataFrame({"x": x, "y": y}).dropna()

    # Compute linear regression
    lin_reg = stats.linregress(x=xy_df.x, y=xy_df.y)

    # Calculate statistics
    stats_dict = {
        "Correlation": xy_df.corr().iloc[0, 1],
        "RMSE": sqrt(mean_squared_error(xy_df.x, xy_df.y)),
        "MAE": mean_absolute_error(xy_df.x, xy_df.y),
        "R-squared": lin_reg.rvalue**2,
        "Bias": (xy_df.y - xy_df.x).mean(),
        "Regression slope": lin_reg.slope,
    }

    # Additional regression params
    if all_regress:
        stats_dict.update({
            "Regression p-value": lin_reg.pvalue,
            "Regression intercept": lin_reg.intercept,
            "Regression standard error": lin_reg.stderr,
        })

    # Return as a pandas Series
    return pd.Series(stats_dict).round(round)


def round_date_strings(date, round_type="end"):
    """
    Round a date string up or down to the start or end of a given time
    period.

    Parameters
    ----------
    date : str
        Date string of variable precision (e.g. "2020", "2020-01",
        "2020-01-01").
    round_type : str, optional
        Type of rounding to perform. Valid options are "start" or "end".
        If "start", date is rounded down to the start of the time period.
        If "end", date is rounded up to the end of the time period.
        Default is "end".

    Returns
    -------
    date_rounded : str
        The rounded date string.

    Examples
    --------
    >>> round_date_strings('2020')
    '2020-12-31 00:00:00'

    >>> round_date_strings('2020-01', round_type='start')
    '2020-01-01 00:00:00'

    >>> round_date_strings('2020-01', round_type='end')
    '2020-01-31 00:00:00'
    """

    # Determine precision of input date string
    date_segments = len(date.split("-"))

    # If provided date has no "-", treat it as having year precision
    if date_segments == 1 and round_type == "start":
        date_rounded = str(pd.to_datetime(date) + YearBegin(0))
    elif date_segments == 1 and round_type == "end":
        date_rounded = str(pd.to_datetime(date) + YearEnd(0))

    # If provided date has one "-", treat it as having month precision
    elif date_segments == 2 and round_type == "start":
        date_rounded = str(pd.to_datetime(date) + MonthBegin(0))
    elif date_segments == 2 and round_type == "end":
        date_rounded = str(pd.to_datetime(date) + MonthEnd(0))

    # If more than one "-", then return date as-is
    elif date_segments > 2:
        date_rounded = date

    return date_rounded


def _load_gauge_metadata(metadata_path):
    # Load metadata
    metadata_df = pd.read_csv(metadata_path)
    metadata_df.columns = (
        metadata_df.columns.str.replace(" ", "_", regex=False)
        .str.replace("(", "", regex=False)
        .str.replace(")", "", regex=False)
        .str.replace("/", "_", regex=False)
        .str.lower()
    )
    metadata_df = metadata_df.set_index("site_code")

    # Convert metadata to GeoDataFrame
    metadata_gdf = gpd.GeoDataFrame(
        data=metadata_df,
        geometry=gpd.points_from_xy(metadata_df.longitude, metadata_df.latitude),
        crs="EPSG:4326",
    )

    return metadata_df, metadata_gdf


def _load_gesla_dataset(site, path, na_value):
    gesla_df = (
        pd.read_csv(
            path,
            skiprows=41,
            names=["date", "time", "sea_level", "qc_flag", "use_flag"],
            sep=r"\s+",  # sep="\s+",
            parse_dates=[[0, 1]],
            index_col=0,
            na_values=na_value,
        )
        .rename_axis("time")
        .assign(site_code=site)
    )

    return gesla_df


def _nearest_row(gdf, x, y, max_distance=None):
    # Create a point to find the nearest neighbor for
    target_point = gpd.GeoDataFrame({"geometry": [Point(x, y)]}, crs="EPSG:4326")

    # Use sjoin_nearest to find the closest point
    return gpd.sjoin_nearest(target_point, gdf, how="left", max_distance=max_distance)


def load_gauge_gesla(
    x=None,
    y=None,
    site_code=None,
    time=("2018", "2020"),
    max_distance=None,
    correct_mean=False,
    filter_use_flag=True,
    site_metadata=True,
    data_path="/gdata1/data/sea_level/gesla/",
    metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv",
):
    """
    Load and process all available Global Extreme Sea Level Analysis
    (GESLA) tide gauge data with an `x, y, time` spatiotemporal query,
    or from a list of specific tide gauges.

    Can optionally filter by gauge quality and append detailed gauge metadata.

    Modified from original code in <https://github.com/philiprt/GeslaDataset>.

    Parameters
    ----------
    x, y : numeric or list/tuple, optional
        Coordinates (in degrees longitude, latitude) used to load GESLA
        tide gauge observations. If provided as singular values
        (e.g. `x=150, y=-32`), then the nearest tide gauge will be returned.
        If provided as a list or tuple (e.g. `x=(150, 152), y=(-32, -30)`),
        then all gauges within the provided bounding box will be loaded.
        Leave as `None` to return all available gauges, or if providing a
        list of site codes using `site_code`.
    site_code : str or list of str, optional
        GESLA site code(s) for which to load data (e.g. `site_code="62650"`).
        If `site_code` is provided, `x` and `y` will be ignored.
    time : tuple or list of str, optional
        Time range to consider, given as a tuple of start and end dates,
        e.g. `time=("2020", "2021")`. The default of None will return all
        tide observations from the year 1800 onward.
    max_distance : numeric, optional
        Optional max distance within which to return the nearest tide gauge
        when `x` and `y` are provided as singular coordinates. Defaults to
        None, which will always return a tide gauge no matter how far away
        it is located from `x` and `y`.
    correct_mean : bool, optional
        Whether to correct sea level measurements to a standardised mean
        sea level by subtracting the mean of all observed sea level
        observations. This can be useful when GESLA tide heights come
        from different or unknown tide datums. Note: the observed mean
        sea level calculated here may differ from true long-term/
        astronomical Mean Sea Level (MSL) datum.
    filter_use_flag : bool, optional
        Whether to filter out low quality observations with a "use_flag"
        value of 0 (do not use). Defaults to True.
    site_metadata : bool, optional
        Whether to add tide gauge station metadata as additional columns
        in the output DataFrame. Defaults to True.
    data_path : str, optional
        Path to the raw GESLA data files. Default is
        `/gdata1/data/sea_level/gesla/`.
    metadata_path : str, optional
        Path to the GESLA station metadata file.
        Default is `/gdata1/data/sea_level/GESLA3_ALL 2.csv`.

    Returns
    -------
    pd.DataFrame
        Processed GESLA data as a DataFrame with columns including:

        - "time": Timestamps,
        - "sea_level": Observed sea level (m),
        - "qc_flag": Observed sea level QC flag,
        - "use_flag": Use-in-analysis flag (1 = use, 0 = do not use),

        ...and additional columns from station metadata.
    """
    # Load tide gauge metadata
    metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path)

    # Use supplied site codes if available
    if site_code is not None:
        site_code = [site_code] if not isinstance(site_code, list) else site_code

    # If x and y are tuples, use xy bounds to identify sites
    elif isinstance(x, (tuple, list)) & isinstance(y, (tuple, list)):
        bbox = BoundingBox.from_xy(x, y)
        site_code = metadata_gdf.cx[bbox.left : bbox.right, bbox.top : bbox.bottom].index

    # If x and y are single numbers, select nearest row
    elif isinstance(x, Number) & isinstance(y, Number):
        site_code = _nearest_row(metadata_gdf, x, y, max_distance).site_code

        # Raise exception if no valid tide gauges are found
        if site_code.isnull().all():
            raise Exception(f"No tide gauge found within {max_distance} degrees of {x}, {y}.")

    # Otherwise if all are None, return all available site codes
    elif (site_code is None) & (x is None) & (y is None):
        site_code = metadata_df.index.to_list()

    else:
        raise TypeError(
            "`x` and `y` must be provided as either singular coordinates (e.g. `x=150`), or as a tuple bounding box (e.g. `x=(150, 152)`)."
        )

    # Prepare times
    if time is None:
        time = ["1800", str(datetime.datetime.now().year)]
    time = [time] if not isinstance(time, (list, tuple)) else time
    start_time = round_date_strings(time[0], round_type="start")
    end_time = round_date_strings(time[-1], round_type="end")

    # Identify paths to load and nodata values for each site
    metadata_df["file_name"] = data_path + metadata_df["file_name"]
    paths_na = metadata_df.loc[site_code, ["file_name", "null_value"]]

    # Load and combine into a single dataframe
    data_df = (
        pd.concat([_load_gesla_dataset(s, p, na_value=na) for s, p, na in paths_na.itertuples()])
        .sort_index()
        .loc[slice(start_time, end_time)]
        .reset_index()
        .set_index("site_code")
    )

    # Optionally filter by use flag column
    if filter_use_flag:
        data_df = data_df.loc[data_df.use_flag == 1]

    # Optionally insert metadata into dataframe
    if site_metadata:
        data_df[metadata_df.columns] = metadata_df.loc[site_code]

    # Add time to index and remove duplicates
    data_df = data_df.set_index("time", append=True)
    duplicates = data_df.index.duplicated()
    if duplicates.sum() > 0:
        warnings.warn("Duplicate timestamps were removed.")
        data_df = data_df.loc[~duplicates]

    # Remove observed mean sea level if requested
    if correct_mean:
        data_df["sea_level"] = data_df["sea_level"].sub(data_df.groupby("site_code")["sea_level"].transform("mean"))

    # Return data
    return data_df
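As a further hypothetical sketch (again not part of the packaged files), the two public helpers above can be exercised with invented data; `load_gauge_gesla` is omitted here because it additionally requires local copies of the GESLA data and metadata files.

import numpy as np

from eo_tides.validation import eval_metrics, round_date_strings

# Invented "actual" vs "predicted" tide heights, for demonstration only
measured = np.array([0.1, 0.4, 0.8, 1.2, 0.9])
modelled = np.array([0.15, 0.38, 0.75, 1.30, 0.95])

# Returns a pandas Series of Correlation, RMSE, MAE, R-squared, Bias and
# regression slope; pass all_regress=True to also get p-value, intercept
# and standard error
metrics = eval_metrics(x=measured, y=modelled, round=3)
print(metrics)

# Expand partial date strings into explicit period bounds, e.g. when
# building a time=(start, end) query for load_gauge_gesla
start = round_date_strings("2020-01", round_type="start")  # '2020-01-01 00:00:00'
end = round_date_strings("2020-06", round_type="end")      # '2020-06-30 00:00:00'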
{eo_tides-0.0.12.dist-info → eo_tides-0.0.14.dist-info}/METADATA
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: eo-tides
- Version: 0.0.12
+ Version: 0.0.14
Summary: Tide modelling tools for large-scale satellite earth observation analysis
Author-email: Robbi Bishop-Taylor <Robbi.BishopTaylor@ga.gov.au>
Project-URL: Homepage, https://GeoscienceAustralia.github.io/eo-tides/
eo_tides-0.0.14.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
eo_tides/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
eo_tides/model.py,sha256=Gx8oUAyo5kzyl7SZhqzS2QlnblXkubr_Wn2NijYzUlc,43811
eo_tides/stats.py,sha256=Lzo46pWUhox3ZUnMLtyLzqZ9FrCNG6nJ6iS5IpqEsy8,158
eo_tides/utils.py,sha256=l9VXJawQzaRBYaFMsP8VBeaN5VA3rFDdzcvF7Rk04Vc,5620
eo_tides/validation.py,sha256=kpYGHOeK-YP11c3tHt9l5_8IvOHF1SAJP79PXA7i-Vs,11434
eo_tides-0.0.14.dist-info/LICENSE,sha256=NYULqbFuDRV6CysPbkR2WZk863YwwHeftBtnsb4cWf8,1077
eo_tides-0.0.14.dist-info/METADATA,sha256=wAsXUcLSqB9YVFNcO_YQpPuOBp1IDCRy8jBmTy3sdog,3585
eo_tides-0.0.14.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
eo_tides-0.0.14.dist-info/top_level.txt,sha256=lXZDUUM1DlLdKWHRn8zdmtW8Rx-eQOIWVvt0b8VGiyQ,9
eo_tides-0.0.14.dist-info/RECORD,,
eo_tides-0.0.12.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
eo_tides-0.0.12.dist-info/LICENSE,sha256=NYULqbFuDRV6CysPbkR2WZk863YwwHeftBtnsb4cWf8,1077
eo_tides-0.0.12.dist-info/METADATA,sha256=-wCBk9eKlPP5v59QDZPlW_9rqFQNqPK7IagpjdJ8tKQ,3585
eo_tides-0.0.12.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
eo_tides-0.0.12.dist-info/top_level.txt,sha256=lXZDUUM1DlLdKWHRn8zdmtW8Rx-eQOIWVvt0b8VGiyQ,9
eo_tides-0.0.12.dist-info/RECORD,,
File without changes
|
File without changes
|
File without changes
|