tobac-1.6.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tobac/__init__.py +112 -0
- tobac/analysis/__init__.py +31 -0
- tobac/analysis/cell_analysis.py +628 -0
- tobac/analysis/feature_analysis.py +212 -0
- tobac/analysis/spatial.py +619 -0
- tobac/centerofgravity.py +226 -0
- tobac/feature_detection.py +1758 -0
- tobac/merge_split.py +324 -0
- tobac/plotting.py +2321 -0
- tobac/segmentation/__init__.py +10 -0
- tobac/segmentation/watershed_segmentation.py +1316 -0
- tobac/testing.py +1179 -0
- tobac/tests/segmentation_tests/test_iris_xarray_segmentation.py +0 -0
- tobac/tests/segmentation_tests/test_segmentation.py +1183 -0
- tobac/tests/segmentation_tests/test_segmentation_time_pad.py +104 -0
- tobac/tests/test_analysis_spatial.py +1109 -0
- tobac/tests/test_convert.py +265 -0
- tobac/tests/test_datetime.py +216 -0
- tobac/tests/test_decorators.py +148 -0
- tobac/tests/test_feature_detection.py +1321 -0
- tobac/tests/test_generators.py +273 -0
- tobac/tests/test_import.py +24 -0
- tobac/tests/test_iris_xarray_match_utils.py +244 -0
- tobac/tests/test_merge_split.py +351 -0
- tobac/tests/test_pbc_utils.py +497 -0
- tobac/tests/test_sample_data.py +197 -0
- tobac/tests/test_testing.py +747 -0
- tobac/tests/test_tracking.py +714 -0
- tobac/tests/test_utils.py +650 -0
- tobac/tests/test_utils_bulk_statistics.py +789 -0
- tobac/tests/test_utils_coordinates.py +328 -0
- tobac/tests/test_utils_internal.py +97 -0
- tobac/tests/test_xarray_utils.py +232 -0
- tobac/tracking.py +613 -0
- tobac/utils/__init__.py +27 -0
- tobac/utils/bulk_statistics.py +360 -0
- tobac/utils/datetime.py +184 -0
- tobac/utils/decorators.py +540 -0
- tobac/utils/general.py +753 -0
- tobac/utils/generators.py +87 -0
- tobac/utils/internal/__init__.py +2 -0
- tobac/utils/internal/coordinates.py +430 -0
- tobac/utils/internal/iris_utils.py +462 -0
- tobac/utils/internal/label_props.py +82 -0
- tobac/utils/internal/xarray_utils.py +439 -0
- tobac/utils/mask.py +364 -0
- tobac/utils/periodic_boundaries.py +419 -0
- tobac/wrapper.py +244 -0
- tobac-1.6.2.dist-info/METADATA +154 -0
- tobac-1.6.2.dist-info/RECORD +53 -0
- tobac-1.6.2.dist-info/WHEEL +5 -0
- tobac-1.6.2.dist-info/licenses/LICENSE +29 -0
- tobac-1.6.2.dist-info/top_level.txt +1 -0
tobac/utils/bulk_statistics.py
ADDED

@@ -0,0 +1,360 @@

```python
"""
Support functions to compute bulk statistics of features, either as a postprocessing step
or within feature detection or segmentation.

"""

from __future__ import annotations
from datetime import timedelta
import logging
import warnings
from functools import partial
from typing import Callable, Optional, Union

import numpy as np

from tobac.utils.generators import field_and_features_over_time

# numpy renamed core to _core recently
try:
    from numpy._core import multiarray as mu
except ModuleNotFoundError:
    from numpy.core import multiarray as mu
import pandas as pd
import xarray as xr
from tobac.utils import decorators


def get_statistics(
    features: pd.DataFrame,
    labels: np.ndarray[int],
    *fields: tuple[xr.DataArray],
    statistic: dict[str, Union[Callable, tuple[Callable, dict]]] = {
        "ncells": np.count_nonzero
    },
    index: Union[None, list[int]] = None,
    default: Union[None, float] = None,
    id_column: str = "feature",
    collapse_axis: Union[None, int, list[int]] = None,
) -> pd.DataFrame:
    """Get bulk statistics for objects (e.g. features or segmented features)
    given a labelled mask of the objects and any input field with the same
    dimensions or that can be broadcast with labels according to numpy-like
    broadcasting rules.

    The statistics are added as a new column to the existing feature dataframe.
    Users can specify which statistics are computed by providing a dictionary
    with the column name of the metric and the respective function.

    Parameters
    ----------
    features: pd.DataFrame
        Dataframe with features or segmented features (output from feature
        detection or segmentation), which can be for the specific timestep or
        for the whole dataset

    labels : np.ndarray[int]
        Mask with labels of each regions to apply function to (e.g. output of
        segmentation for a specific timestep)

    *fields : tuple[xr.DataArray]
        Fields to give as arguments to each function call. If the shape does not
        match that of labels, numpy-style broadcasting will be applied.

    statistic: dict[str, Callable], optional (default: {'ncells':np.count_nonzero})
        Dictionary with function(s) to apply over each region as values and the
        name of the respective statistics as keys. Default is to just count the
        number of cells associated with each feature and write it to the feature
        dataframe.

    index: None | list[int], optional (default: None)
        list of indices of regions in labels to apply function to. If None, will
        default to all integer feature labels in labels.

    default: None | float, optional (default: None)
        default value to return in a region that has no values.

    id_column: str, optional (default: "feature")
        Name of the column in feature dataframe that contains IDs that match with
        the labels in mask. The default is the column "feature".

    collapse_axis: None | int | list[int], optional (default: None):
        Index or indices of axes of labels to collapse. This will reduce the dimensionality of labels
        while allowing labelled features to overlap. This can be used, for example, to calculate the
        footprint area (2D) of 3D labels


    Returns
    -------
    features: pd.DataFrame
        Updated feature dataframe with bulk statistics for each feature saved
        in a new column.
    """

    # if mask and input data dimensions do not match we can broadcast using numpy broadcasting rules
    if collapse_axis is not None:
        # Test if iterable and if not make a list
        try:
            collapse_axis = list(iter(collapse_axis))
        except TypeError:
            collapse_axis = [collapse_axis]

        # Normalise axes to handle negative axis number conventions
        ndim = len(labels.shape)
        collapse_axis = [mu.normalize_axis_index(axis, ndim) for axis in collapse_axis]
        uncollapsed_axes = [
            i for i, _ in enumerate(labels.shape) if i not in collapse_axis
        ]
        if not len(uncollapsed_axes):
            raise ValueError("Cannot collapse all axes of labels")
        collapsed_shape = tuple(
            [s for i, s in enumerate(labels.shape) if i not in collapse_axis]
        )
        broadcast_flag = any([collapsed_shape != field.shape for field in fields])
        if broadcast_flag:
            raise ValueError("Broadcasting not supported with collapse_axis")

    else:
        broadcast_flag = any([labels.shape != field.shape for field in fields])
        if broadcast_flag:
            # Broadcast input labels and fields to ensure they work according to numpy broadcasting rules
            broadcast_fields = np.broadcast_arrays(labels, *fields)
            labels = broadcast_fields[0]
            fields = broadcast_fields[1:]

    # mask must contain positive values to calculate statistics
    if np.any(labels > 0):
        if index is None:
            index = features[id_column].to_numpy().astype(int)
        else:
            # get the statistics only for specified feature objects
            if np.max(index) > np.max(labels):
                raise ValueError("Index contains values that are not in labels!")

        # Find which labels exist in features for output:
        index_in_features = np.isin(index, features[id_column])

        # set negative markers to 0 as they are unsegmented
        bins = np.cumsum(np.bincount(np.maximum(labels.ravel(), 0)))
        argsorted = np.argsort(labels.ravel())

        # Create lambdas to get (ravelled) label locations using argsorted and bins
        if collapse_axis is None:
            label_locs = lambda i: argsorted[bins[i - 1] : bins[i]]
        else:
            # Collapse ravelled locations to the remaining axes
            label_locs = lambda i: np.unique(
                np.ravel_multi_index(
                    np.array(
                        np.unravel_index(argsorted[bins[i - 1] : bins[i]], labels.shape)
                    )[uncollapsed_axes],
                    collapsed_shape,
                )
            )

        # apply each function given per statistic parameter for the labeled regions sorted in ascending order
        for stats_name in statistic.keys():
            # if function is given as a tuple, take the input parameters provided
            if type(statistic[stats_name]) is tuple:
                # assure that key word arguments are provided as dictionary
                if not type(statistic[stats_name][1]) is dict:
                    raise TypeError(
                        "Tuple must contain dictionary with key word arguments for function."
                    )

                func = partial(statistic[stats_name][0], **statistic[stats_name][1])
            else:
                func = statistic[stats_name]

            # default needs to be sequence when function output is array-like
            output = func(*([np.random.rand(10)] * len(fields)))
            if hasattr(output, "__len__"):
                default = np.full(output.shape, default)

            stats = np.array(
                [
                    (
                        func(*(field.ravel()[label_locs(i)] for field in fields))
                        if i < bins.size and bins[i] > bins[i - 1]
                        else default
                    )
                    for i in index
                ]
            )

            # add results of computed statistics to feature dataframe with column name given per statistic
            # initiate new column in feature dataframe if it does not already exist
            if stats_name not in features.columns:
                if default is not None and not hasattr(default, "__len__"):
                    # If result is a scalar value we can create an empty column with the correct dtype
                    features[stats_name] = np.full(
                        [len(features)], default, type(default)
                    )
                else:
                    features[stats_name] = np.full([len(features)], None, object)

            for idx, label in enumerate(index):
                if index_in_features[idx]:
                    # test if values are scalars
                    if not hasattr(stats[idx], "__len__"):
                        # if yes, we can just assign the value to the new column and row of the respective feature
                        features.loc[features[id_column] == label, stats_name] = stats[
                            idx
                        ]
                    # if stats output is array-like it has to be added in a different way
                    else:
                        df = pd.DataFrame({stats_name: [stats[idx]]})
                        # get row index rather than pd.Dataframe index value since we need to use .iloc indexing
                        row_idx = np.where(features[id_column] == label)[0]
                        features.iloc[
                            row_idx,
                            features.columns.get_loc(stats_name),
                        ] = df.apply(lambda r: tuple(r), axis=1)

    return features


@decorators.iris_to_xarray()
def get_statistics_from_mask(
    features: pd.DataFrame,
    segmentation_mask: xr.DataArray,
    *fields: tuple[xr.DataArray],
    statistic: dict[str, tuple[Callable]] = {"Mean": np.mean},
    index: Union[None, list[int]] = None,
    default: Union[None, float] = None,
    id_column: str = "feature",
    collapse_dim: Union[None, str, list[str]] = None,
    time_var_name: str = "time",
    time_padding: Optional[timedelta] = None,
) -> pd.DataFrame:
    """Derives bulk statistics for each object in the segmentation mask, and
    returns a features Dataframe with these properties for each feature.

    Parameters
    ----------
    features: pd.DataFrame
        Dataframe with segmented features (output from feature detection or
        segmentation). Timesteps must not be exactly the same as in segmentation
        mask but all labels in the mask need to be present in the feature
        dataframe.

    segmentation_mask : xr.DataArray
        Segmentation mask output

    *fields : tuple[xr.DataArray]
        Field(s) with input data. If field does not have a time dimension it
        will be considered time invariant, and the entire field will be passed
        for each time step in segmentation_mask. If the shape does not match
        that of labels, numpy-style broadcasting will be applied.

    statistic: dict[str, Callable], optional (default: {'ncells':np.count_nonzero})
        Dictionary with function(s) to apply over each region as values and the
        name of the respective statistics as keys. Default is to calculate the
        mean value of the field over each feature.

    index: None | list[int], optional (default: None)
        list of indexes of regions in labels to apply function to. If None, will
        default to all integers between 1 and the maximum value in labels

    default: None | float, optional (default: None)
        default value to return in a region that has no values

    id_column: str, optional (default: "feature")
        Name of the column in feature dataframe that contains IDs that match with the labels in mask. The default is the column "feature".

    collapse_dim: None | str | list[str], optional (default: None)
        Dimension names of labels to collapse, allowing, e.g. calulcation of statistics on 2D
        fields for the footprint of 3D objects

    time_var_name : str, optional (default: "time")
        The name of the time dimension in the input fields and the time column
        in features, by default "time"

    time_padding: timedelta, optional (default: None)
        If set, allows for statistics to be associated with a feature input
        timestep that is within time_padding off of the feature. Extremely useful when
        converting between micro- and nanoseconds, as is common when using Pandas
        dataframes.

    Returns
    -------
    features: pd.DataFrame
        Updated feature dataframe with bulk statistics for each feature saved in a new column
    """
    # warning when feature labels are not unique in dataframe
    if not features[id_column].is_unique:
        logging.warning(
            "Feature labels are not unique which may cause unexpected results for the computation of bulk statistics."
        )
    # extra warning when feature labels are not unique in timestep
    uniques = features.groupby("time")[id_column].value_counts().values
    if not uniques[uniques > 1].size == 0:
        logging.warning(
            "Note that non-unique feature labels occur also in the same timestep. This likely causes unexpected results for the computation of bulk statistics."
        )

    if collapse_dim is not None:
        if isinstance(collapse_dim, str):
            collapse_dim = [collapse_dim]
        non_time_dims = [dim for dim in segmentation_mask.dims if dim != "time"]
        collapse_axis = [
            i for i, dim in enumerate(non_time_dims) if dim in collapse_dim
        ]
        if len(collapse_dim) != len(collapse_axis):
            raise ValueError(
                "One or more of collapse_dim not found in dimensions of segmentation_mask"
            )
    else:
        collapse_axis = None

    # check if any of the feature dataframe input values match with segmentaion mask IDs
    if not np.any(np.isin(features[id_column], np.unique(segmentation_mask))):
        raise ValueError(
            "The labels of the segmentation mask and the feature dataframe do not seem to match. Please make sure you provide the correct input feature dataframe to calculate the bulk statistics."
        )

    # get bulk statistics for each timestep
    step_statistics = []

    for _, tt, segmentation_mask_t, features_t in field_and_features_over_time(
        segmentation_mask,
        features,
        time_var_name=time_var_name,
        time_padding=time_padding,
    ):
        # select specific timestep
        fields_t = (
            (
                field.sel(
                    {time_var_name: tt}, method="nearest", tolerance=time_padding
                ).values
                if time_var_name in field.coords
                else field.values
            )
            for field in fields
        )

        # make sure that the labels in the segmentation mask exist in feature dataframe
        # continue loop because not all timesteps might have matching IDs
        if not np.any(np.isin(features_t[id_column], np.unique(segmentation_mask_t))):
            warnings.warn("Not all timesteps have matching features", UserWarning)
            step_statistics.append(features_t.copy())
            continue
        else:
            # make sure that features are not double-defined
            step_statistics.append(
                get_statistics(
                    features_t.copy(),
                    segmentation_mask_t.values.astype(np.int64),
                    *fields_t,
                    statistic=statistic,
                    default=default,
                    index=index,
                    id_column=id_column,
                    collapse_axis=collapse_axis,
                )
            )

    features = pd.concat(step_statistics)

    return features
```
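The module above is normally driven by tobac's segmentation output, but `get_statistics` can also be exercised directly on a labelled array. The following is a minimal usage sketch, not part of the packaged file: the arrays, feature IDs, and the statistic names `mean_field` and `footprint_area` are invented for illustration, and it assumes tobac 1.6.2 is installed.

```python
import numpy as np
import pandas as pd
from tobac.utils.bulk_statistics import get_statistics

# Two labelled regions (0 = background) and an input field of the same shape.
# Plain numpy arrays are fine for the fields here; get_statistics only calls .ravel() on them.
labels = np.array(
    [[0, 1, 1],
     [0, 1, 2],
     [0, 2, 2]]
)
field = np.arange(9, dtype=float).reshape(3, 3)
features = pd.DataFrame({"feature": [1, 2]})

# Count cells and compute the mean of the field over each labelled region;
# results are written to new columns of the feature dataframe
features = get_statistics(
    features,
    labels,
    field,
    statistic={"ncells": np.count_nonzero, "mean_field": np.mean},
)

# Footprint example: collapse the leading (vertical) axis of a 3D label array so
# that the statistic is computed over the 2D footprint of each object
labels_3d = np.stack([labels, labels])   # shape (2, 3, 3)
cell_area = np.full((3, 3), 4.0)         # hypothetical grid-cell area per column
features = get_statistics(
    features,
    labels_3d,
    cell_area,
    statistic={"footprint_area": np.sum},
    collapse_axis=0,
)
print(features)
```

`get_statistics_from_mask` wraps the same computation: it iterates `field_and_features_over_time` so that each timestep of an xarray segmentation mask is matched with the corresponding rows of the feature dataframe, and it translates `collapse_dim` names into the `collapse_axis` indices used above.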
tobac/utils/datetime.py
ADDED
@@ -0,0 +1,184 @@

```python
"""Functions for converting between and working with different datetime formats"""

from typing import Union
import datetime
import numpy as np
import pandas as pd
import xarray as xr
import cftime


def to_cftime(
    dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime],
    calendar: str,
    align_on: str = "date",
) -> cftime.datetime:
    """Converts a provided datetime-like object to a cftime datetime with the
    given calendar

    Parameters
    ----------
    dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        A datetime-like object or array of datetime-like objects to be converted
    calendar : str
        The requested cftime calender
    align_on : str, optional
        The 'align-on' parameter required for 360-day, 365-day and 366-day
        cftime dates, by default "date"

    Returns
    -------
    cftime.datetime
        A cftime object or array of cftime objects in the requested calendar
    """
    dates_arr = np.atleast_1d(dates)
    if isinstance(dates_arr[0], cftime.datetime):
        cftime_dates = (
            xr.DataArray(dates_arr, {"time": dates_arr})
            .convert_calendar(calendar, use_cftime=True, align_on=align_on)
            .time.values
        )
    else:
        cftime_dates = (
            xr.DataArray(dates_arr, {"time": pd.to_datetime(dates_arr)})
            .convert_calendar(calendar, use_cftime=True, align_on=align_on)
            .time.values
        )
    if not hasattr(dates, "__iter__") or isinstance(dates, str) and len(cftime_dates):
        return cftime_dates[0]
    return cftime_dates


def to_timestamp(
    dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime],
) -> pd.Timestamp:
    """Converts a provided datetime-like object to a pandas timestamp

    Parameters
    ----------
    dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        A datetime-like object or array of datetime-like objects to be converted

    Returns
    -------
    pd.Timestamp
        A pandas timestamp or array of pandas timestamps
    """
    squeeze_output = False
    if not hasattr(dates, "__iter__") or isinstance(dates, str):
        dates = np.atleast_1d(dates)
        squeeze_output = True

    if isinstance(next(iter(dates)), cftime.datetime):
        pd_dates = xr.CFTimeIndex(dates).to_datetimeindex()
    else:
        pd_dates = pd.to_datetime(dates)

    if squeeze_output:
        return next(iter(pd_dates))
    return pd_dates


def to_datetime(
    dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime],
) -> datetime.datetime:
    """Converts a provided datetime-like object to python datetime objects

    Parameters
    ----------
    dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        A datetime-like object or array of datetime-like objects to be converted

    Returns
    -------
    datetime.datetime
        A python datetime or array of python datetimes
    """
    return to_timestamp(dates).to_pydatetime()


def to_datetime64(
    dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime],
) -> np.datetime64:
    """Converts a provided datetime-like object to numpy datetime64 objects

    Parameters
    ----------
    dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        A datetime-like object or array of datetime-like objects to be converted

    Returns
    -------
    np.datetime64
        A numpy datetime64 or array of numpy datetime64s
    """
    return to_timestamp(dates).to_numpy()


def to_datestr(
    dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime],
) -> str:
    """Converts a provided datetime-like object to ISO format date strings

    Parameters
    ----------
    dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        A datetime-like object or array of datetime-like objects to be converted

    Returns
    -------
    str
        A string or array of strings in ISO date format
    """
    dates = to_datetime64(dates)
    if hasattr(dates, "__iter__"):
        return dates.astype(str)
    return str(dates)


def match_datetime_format(
    dates: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime],
    target: Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime],
) -> Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]:
    """Converts the provided datetime-like objects to the same datetime format
    as the provided target

    Parameters
    ----------
    dates : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        A datetime-like object or array of datetime-like objects to be converted
    target : Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        A datetime-like object or array of datetime-like objects which the dates
        input will be converted to match

    Returns
    -------
    Union[str, datetime.datetime, np.datetime64, pd.Timestamp, cftime.datetime]
        The datetime-like values of the date parameter, converted to a format
        which matches that of the target input

    Raises
    ------
    ValueError
        If the target parameter provided is not a datetime-time object or array
        of datetime-like objects
    """
    if isinstance(target, str):
        return to_datestr(dates)
    if isinstance(target, xr.DataArray):
        target = target.values
    if isinstance(target, pd.Series):
        target = target.to_numpy()
    if hasattr(target, "__iter__"):
        target = target[0]
    if isinstance(target, str):
        return to_datestr(dates)
    if isinstance(target, cftime.datetime):
        return to_cftime(dates, target.calendar)
    if isinstance(target, pd.Timestamp):
        return to_timestamp(dates)
    if isinstance(target, np.datetime64):
        return to_datetime64(dates)
    if isinstance(target, datetime.datetime):
        return to_datetime(dates)
    raise ValueError("Target is not a valid datetime format")
```
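A short usage sketch of these converters follows. It is not part of the packaged file: the example dates are invented, and it assumes tobac 1.6.2 and cftime are installed.

```python
import numpy as np
import pandas as pd
from tobac.utils.datetime import (
    match_datetime_format,
    to_cftime,
    to_datestr,
    to_datetime64,
)

target = pd.Timestamp("2000-01-02 00:00")
date = np.datetime64("2000-01-01T12:00")

# Convert `date` to the same datetime flavour as `target` (here a pandas Timestamp)
print(match_datetime_format(date, target))

# Convert a date string to a cftime date in a non-standard calendar
print(to_cftime("2000-01-15", "360_day"))

# Conversions to numpy datetime64 and to an ISO date string
print(to_datetime64("2000-01-01 06:00"), to_datestr(date))
```

Note that `match_datetime_format` inspects only the first element of an array-like target (after unwrapping a DataArray or Series), so passing an array of times selects the output format in the same way as passing a single value.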