PyPI - downsampler - Versions diffs - 0.1.0__py3-none-any.whl - Mend

downsampler 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

downsampler/__init__.py +80 -0
downsampler/aggregators.py +338 -0
downsampler/config.py +72 -0
downsampler/core.py +166 -0
downsampler/deferred.py +357 -0
downsampler/edges.py +202 -0
downsampler/fidelity/__init__.py +23 -0
downsampler/fidelity/comparison.py +343 -0
downsampler/fidelity/metrics.py +212 -0
downsampler/fidelity/visualization.py +359 -0
downsampler/gaps.py +310 -0
downsampler/lttb.py +207 -0
downsampler/utils.py +150 -0
downsampler-0.1.0.dist-info/METADATA +246 -0
downsampler-0.1.0.dist-info/RECORD +18 -0
downsampler-0.1.0.dist-info/WHEEL +5 -0
downsampler-0.1.0.dist-info/licenses/LICENSE +21 -0
downsampler-0.1.0.dist-info/top_level.txt +1 -0

downsampler/__init__.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""downsampler - Timeseries DataFrame downsampling with LTTB, aggregation methods, and fidelity testing.
+This package provides tools for downsampling time series data in pandas DataFrames,
+with support for:
+- LTTB (Largest Triangle Three Buckets) algorithm for visual fidelity
+- Multiple aggregation methods (mean, median, min, max)
+- Gap-aware processing
+- Edge handling strategies
+- Deferred/lazy data fetching
+- Fidelity testing and comparison
+Example:
+    >>> import pandas as pd
+    >>> from downsampler import downsample, DownsampleConfig, AggregationMethod
+    >>>
+    >>> # Create sample data
+    >>> df = pd.DataFrame(
+    ...     {'value': [1, 2, 3, 4, 5]},
+    ...     index=pd.date_range('2024-01-01', periods=5, freq='1min')
+    ... )
+    >>>
+    >>> # Downsample using mean
+    >>> result = downsample(df, target_cadence='5min')
+    >>>
+    >>> # Downsample using LTTB
+    >>> config = DownsampleConfig(
+    ...     method=AggregationMethod.LTTB,
+    ...     lttb_target_column='value'
+    ... )
+    >>> result = downsample(df, target_cadence='5min', config=config)
+"""
+from downsampler.config import (
+    AggregationMethod,
+    EdgeHandling,
+    GapHandling,
+    DownsampleConfig,
+)
+from downsampler.core import downsample, downsample_multi_aggregate
+from downsampler.gaps import (
+    find_gap_indices,
+    groupby_gaps,
+    wrap_in_nans,
+    mark_gaps_in_dataframe,
+)
+from downsampler.lttb import downsample_lttb
+from downsampler.aggregators import (
+    downsample_mean,
+    downsample_median,
+    downsample_min,
+    downsample_max,
+)
+from downsampler.deferred import deferred_downsample
+# Package version string; the wheel metadata directory is named
+# downsampler-0.1.0.dist-info, matching this value.
+__version__ = "0.1.0"
+# Public API of the package. Every name listed here is imported at the top of
+# this module, so ``from downsampler import *`` exposes exactly these names.
+__all__ = [
+    # Config
+    "AggregationMethod",
+    "EdgeHandling",
+    "GapHandling",
+    "DownsampleConfig",
+    # Core
+    "downsample",
+    "downsample_multi_aggregate",
+    # Gaps
+    "find_gap_indices",
+    "groupby_gaps",
+    "wrap_in_nans",
+    "mark_gaps_in_dataframe",
+    # LTTB
+    "downsample_lttb",
+    # Aggregators
+    "downsample_mean",
+    "downsample_median",
+    "downsample_min",
+    "downsample_max",
+    # Deferred
+    "deferred_downsample",
+]

downsampler/aggregators.py ADDED Viewed

@@ -0,0 +1,338 @@
+"""Aggregation-based downsampling methods."""
+import pandas as pd
+import numpy as np
+from downsampler.config import DownsampleConfig, GapHandling, EdgeHandling, AggregationMethod
+from downsampler.gaps import split_at_gaps, mark_gaps_in_dataframe
+from downsampler.edges import apply_edge_handling
+from downsampler.utils import parse_cadence, get_numeric_columns
+def _apply_aggregation(
+    df: pd.DataFrame,
+    target_cadence: pd.Timedelta,
+    method: str,
+    columns: list[str] | None = None
+) -> pd.DataFrame:
+    """Apply an aggregation method to downsample a DataFrame.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence for resampling.
+        method: Aggregation method ('mean', 'median', 'min', 'max').
+        columns: Columns to aggregate. If None, all numeric columns.
+    Returns:
+        Aggregated DataFrame.
+    """
+    if columns is None:
+        columns = get_numeric_columns(df)
+    resampler = df[columns].resample(target_cadence, origin='epoch')
+    if method == 'mean':
+        return resampler.mean()
+    elif method == 'median':
+        return resampler.median()
+    elif method == 'min':
+        return resampler.min()
+    elif method == 'max':
+        return resampler.max()
+    else:
+        raise ValueError(f"Unknown aggregation method: {method}")
+def downsample_mean(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    columns: list[str] | None = None,
+    gap_threshold: pd.Timedelta | None = None,
+    mark_gaps: bool = True
+) -> pd.DataFrame:
+    """Downsample using mean aggregation.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence as ISO duration string or Timedelta.
+        columns: Columns to include. If None, all numeric columns.
+        gap_threshold: Minimum duration to consider as a gap.
+        mark_gaps: Whether to insert NaN markers at gaps.
+    Returns:
+        Downsampled DataFrame.
+    Example:
+        >>> df = pd.DataFrame(
+        ...     {'value': range(100)},
+        ...     index=pd.date_range('2024-01-01', periods=100, freq='1min')
+        ... )
+        >>> result = downsample_mean(df, '10min')
+        >>> len(result)
+        10
+    """
+    return _downsample_with_aggregation(
+        df, target_cadence, 'mean', columns, gap_threshold, mark_gaps
+    )
+def downsample_median(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    columns: list[str] | None = None,
+    gap_threshold: pd.Timedelta | None = None,
+    mark_gaps: bool = True
+) -> pd.DataFrame:
+    """Downsample using median aggregation.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence as ISO duration string or Timedelta.
+        columns: Columns to include. If None, all numeric columns.
+        gap_threshold: Minimum duration to consider as a gap.
+        mark_gaps: Whether to insert NaN markers at gaps.
+    Returns:
+        Downsampled DataFrame.
+    """
+    return _downsample_with_aggregation(
+        df, target_cadence, 'median', columns, gap_threshold, mark_gaps
+    )
+def downsample_min(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    columns: list[str] | None = None,
+    gap_threshold: pd.Timedelta | None = None,
+    mark_gaps: bool = True
+) -> pd.DataFrame:
+    """Downsample using minimum aggregation.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence as ISO duration string or Timedelta.
+        columns: Columns to include. If None, all numeric columns.
+        gap_threshold: Minimum duration to consider as a gap.
+        mark_gaps: Whether to insert NaN markers at gaps.
+    Returns:
+        Downsampled DataFrame.
+    """
+    return _downsample_with_aggregation(
+        df, target_cadence, 'min', columns, gap_threshold, mark_gaps
+    )
+def downsample_max(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    columns: list[str] | None = None,
+    gap_threshold: pd.Timedelta | None = None,
+    mark_gaps: bool = True
+) -> pd.DataFrame:
+    """Downsample using maximum aggregation.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence as ISO duration string or Timedelta.
+        columns: Columns to include. If None, all numeric columns.
+        gap_threshold: Minimum duration to consider as a gap.
+        mark_gaps: Whether to insert NaN markers at gaps.
+    Returns:
+        Downsampled DataFrame.
+    """
+    return _downsample_with_aggregation(
+        df, target_cadence, 'max', columns, gap_threshold, mark_gaps
+    )
+def _downsample_with_aggregation(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    method: str,
+    columns: list[str] | None = None,
+    gap_threshold: pd.Timedelta | None = None,
+    mark_gaps: bool = True
+) -> pd.DataFrame:
+    """Internal function for aggregation-based downsampling.
+    Args:
+        df: Input DataFrame.
+        target_cadence: Target cadence.
+        method: Aggregation method.
+        columns: Columns to include.
+        gap_threshold: Gap threshold.
+        mark_gaps: Whether to mark gaps.
+    Returns:
+        Downsampled DataFrame.
+    """
+    target_cadence = parse_cadence(target_cadence)
+    if gap_threshold is None:
+        gap_threshold = 2 * target_cadence
+    # Apply aggregation
+    result = _apply_aggregation(df, target_cadence, method, columns)
+    # Mark gaps if requested
+    if mark_gaps:
+        result = mark_gaps_in_dataframe(
+            result,
+            nominal_timedelta=target_cadence,
+            nominal_start_time=df.index[0] if len(df) > 0 else None,
+            nominal_end_time=df.index[-1] + target_cadence if len(df) > 0 else None
+        )
+    return result
+def downsample_multi_aggregate(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    variables: list[str],
+    aggregations: list[str] = ["min", "mean", "max"],
+    min_completeness: float = 0.9,
+    source_cadence: str | pd.Timedelta | None = None
+) -> pd.DataFrame:
+    """Create multiple aggregation columns for specified variables.
+    Produces columns like 'density_min', 'density_mean', 'density_max'
+    from a single 'density' column.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence as ISO duration string or Timedelta.
+        variables: List of column names to aggregate.
+        aggregations: List of aggregation methods to apply.
+        min_completeness: Minimum fraction of expected points required
+            for valid output (0.0 to 1.0).
+        source_cadence: Original cadence of the data for completeness
+            calculation. If None, estimated from data.
+    Returns:
+        DataFrame with aggregated columns named {variable}_{aggregation}.
+    Example:
+        >>> df = pd.DataFrame(
+        ...     {'density': np.random.randn(1000), 'velocity': np.random.randn(1000)},
+        ...     index=pd.date_range('2024-01-01', periods=1000, freq='1s')
+        ... )
+        >>> result = downsample_multi_aggregate(
+        ...     df, '1min', ['density', 'velocity'], ['min', 'mean', 'max']
+        ... )
+        >>> list(result.columns)
+        ['density_min', 'density_mean', 'density_max', 'velocity_min', 'velocity_mean', 'velocity_max', 'coverage']
+    """
+    target_cadence = parse_cadence(target_cadence)
+    # Estimate source cadence if not provided
+    if source_cadence is None:
+        from downsampler.utils import estimate_cadence
+        source_cadence = estimate_cadence(df)
+    else:
+        source_cadence = parse_cadence(source_cadence)
+    # Compute statistics with count
+    aggstats = [*aggregations, 'count']
+    df_agg = df[variables].resample(
+        target_cadence, label='left', origin='epoch'
+    ).agg(aggstats)
+    # Adjust index to middle of cadence (for proper time representation)
+    df_agg.index = df_agg.index + 0.5 * target_cadence
+    # Compute completeness/coverage
+    maxcount = target_cadence / source_cadence
+    coverage = df_agg[[(v, 'count') for v in variables]].apply(max, axis=1) / maxcount
+    # Set data to NaN if statistics are based on limited observations
+    for var in variables:
+        for aggstat in aggregations:
+            df_agg.loc[:, (var, aggstat)] = (
+                df_agg.loc[:, (var, aggstat)].where(
+                    df_agg.loc[:, (var, 'count')] > min_completeness * maxcount
+                )
+            )
+    # Remove count columns
+    for var in variables:
+        df_agg.drop((var, "count"), axis=1, inplace=True)
+    # Flatten multi-index columns to single index (e.g., "density_min")
+    df_agg.columns = ["_".join(col_name) for col_name in df_agg.columns.to_flat_index()]
+    # Add coverage column
+    df_agg['coverage'] = coverage
+    return df_agg
+def downsample_with_config(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    config: DownsampleConfig
+) -> pd.DataFrame:
+    """Apply aggregation-based downsampling with full configuration.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence.
+        config: Downsampling configuration.
+    Returns:
+        Downsampled DataFrame.
+    """
+    target_cadence = parse_cadence(target_cadence)
+    gap_threshold = config.get_gap_threshold(target_cadence)
+    # Determine columns to process
+    columns = config.include_columns if config.include_columns else None
+    # Map method to function
+    method_map = {
+        AggregationMethod.MEAN: 'mean',
+        AggregationMethod.MEDIAN: 'median',
+        AggregationMethod.MIN: 'min',
+        AggregationMethod.MAX: 'max',
+    }
+    method_str = method_map.get(config.method)
+    if method_str is None:
+        raise ValueError(f"Method {config.method} is not an aggregation method")
+    # Process based on gap handling
+    if config.gap_handling == GapHandling.SEGMENT:
+        segments = split_at_gaps(df, gap_threshold)
+        results = []
+        for segment in segments:
+            if len(segment) < config.min_points_per_segment:
+                continue
+            result = _apply_aggregation(segment, target_cadence, method_str, columns)
+            results.append(result)
+        if not results:
+            return pd.DataFrame(columns=df.columns if columns is None else columns)
+        result = pd.concat(results).sort_index()
+        result = mark_gaps_in_dataframe(result, nominal_timedelta=target_cadence)
+    else:
+        result = _apply_aggregation(df, target_cadence, method_str, columns)
+        result = mark_gaps_in_dataframe(result, nominal_timedelta=target_cadence)
+    # Apply edge handling
+    if len(result) > 0:
+        result = apply_edge_handling(
+            result,
+            config.edge_handling,
+            config.edge_window
+        )
+    # Filter out excluded columns
+    if config.exclude_columns:
+        cols_to_drop = [c for c in config.exclude_columns if c in result.columns]
+        result = result.drop(columns=cols_to_drop)
+    return result

downsampler/config.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""Configuration dataclasses and enums for downsampler."""
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Union
+import pandas as pd
+class AggregationMethod(str, Enum):
+    """Aggregation methods for downsampling.
+    Members are also ``str`` instances, so each one compares equal to its
+    lowercase string value and can be built from it, e.g.
+    ``AggregationMethod("mean")``.
+    """
+    # Per-bin reductions
+    MEAN = "mean"
+    MEDIAN = "median"
+    MIN = "min"
+    MAX = "max"
+    # Largest Triangle Three Buckets; not a per-bin reduction
+    LTTB = "lttb"
+class EdgeHandling(str, Enum):
+    """Strategies for handling edge points in downsampled data.
+    Members are also ``str`` instances and compare equal to their string
+    values.
+    """
+    DISCARD = "discard"     # Remove edge points
+    FLAG = "flag"           # Keep edges, add '_is_edge' column
+    KEEP = "keep"           # Keep as-is
+class GapHandling(str, Enum):
+    """Strategies for handling gaps in time series data.
+    Members are also ``str`` instances and compare equal to their string
+    values.
+    NOTE(review): in ``aggregators.downsample_with_config`` only SEGMENT gets
+    its own code path; INTERPOLATE and IGNORE share the other branch there.
+    """
+    SEGMENT = "segment"         # Split at gaps, process independently
+    INTERPOLATE = "interpolate" # Fill gaps first
+    IGNORE = "ignore"           # Treat as continuous
+@dataclass
+class DownsampleConfig:
+    """Configuration for downsampling operations.
+    Attributes:
+        method: The aggregation method to use for downsampling.
+        lttb_target_column: For LTTB, the column to optimize visual fidelity for.
+        include_columns: Columns to include in the output (empty means all).
+        exclude_columns: Columns to exclude from the output.
+        gap_handling: Strategy for handling gaps in the data.
+        gap_threshold: Minimum duration to consider as a gap.
+            "auto" means 2x the target cadence.
+        edge_handling: Strategy for handling edge points.
+        edge_window: Number of points at each edge to consider as edge points.
+        min_points_per_segment: Minimum points required in a segment for processing.
+    """
+    method: AggregationMethod = AggregationMethod.MEAN
+    lttb_target_column: str | None = None
+    include_columns: list[str] = field(default_factory=list)
+    exclude_columns: list[str] = field(default_factory=list)
+    gap_handling: GapHandling = GapHandling.SEGMENT
+    gap_threshold: Union[str, pd.Timedelta] = "auto"
+    edge_handling: EdgeHandling = EdgeHandling.FLAG
+    edge_window: int = 2
+    min_points_per_segment: int = 3
+    def get_gap_threshold(self, target_cadence: pd.Timedelta) -> pd.Timedelta:
+        """Get the gap threshold, computing auto value if needed.
+        Args:
+            target_cadence: The target cadence for downsampling.
+        Returns:
+            The gap threshold as a Timedelta.
+        """
+        if self.gap_threshold == "auto":
+            return 2 * target_cadence
+        elif isinstance(self.gap_threshold, str):
+            return pd.to_timedelta(self.gap_threshold)
+        return self.gap_threshold

downsampler/core.py ADDED Viewed

@@ -0,0 +1,166 @@
+"""Core downsampling functions."""
+import pandas as pd
+from downsampler.config import DownsampleConfig, AggregationMethod
+from downsampler.utils import parse_cadence
+from downsampler.lttb import downsample_lttb_with_config
+from downsampler.aggregators import downsample_with_config as aggregate_with_config
+from downsampler.aggregators import downsample_multi_aggregate as _downsample_multi_aggregate
+def downsample(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    config: DownsampleConfig | None = None,
+    **kwargs
+) -> pd.DataFrame:
+    """Downsample a DataFrame to a lower cadence.
+    This is the main entry point for downsampling operations. It supports
+    multiple methods including LTTB and various aggregation methods.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence as ISO duration string (e.g., "PT1H")
+            or pandas Timedelta.
+        config: Downsampling configuration. If None, uses default config
+            with mean aggregation.
+        **kwargs: Additional keyword arguments that override config settings.
+            Supported kwargs:
+            - method: AggregationMethod or string ('mean', 'lttb', etc.)
+            - lttb_target_column: Column to optimize for LTTB
+            - include_columns: Columns to include
+            - exclude_columns: Columns to exclude
+            - gap_threshold: Gap threshold
+            - edge_handling: Edge handling strategy
+            - edge_window: Edge window size
+    Returns:
+        Downsampled DataFrame.
+    Examples:
+        Basic mean downsampling:
+        >>> df = pd.DataFrame(
+        ...     {'value': range(100)},
+        ...     index=pd.date_range('2024-01-01', periods=100, freq='1min')
+        ... )
+        >>> result = downsample(df, '10min')
+        >>> len(result)
+        10
+        LTTB downsampling:
+        >>> from downsampler import AggregationMethod, DownsampleConfig
+        >>> config = DownsampleConfig(
+        ...     method=AggregationMethod.LTTB,
+        ...     lttb_target_column='value'
+        ... )
+        >>> result = downsample(df, '10min', config=config)
+        Using kwargs:
+        >>> result = downsample(df, '10min', method='max')
+    """
+    # Create config if not provided
+    if config is None:
+        config = DownsampleConfig()
+    # Apply kwargs overrides
+    if kwargs:
+        config = _apply_kwargs_to_config(config, kwargs)
+    target_cadence = parse_cadence(target_cadence)
+    # Route to appropriate implementation
+    if config.method == AggregationMethod.LTTB:
+        return downsample_lttb_with_config(df, target_cadence, config)
+    else:
+        return aggregate_with_config(df, target_cadence, config)
+def downsample_multi_aggregate(
+    df: pd.DataFrame,
+    target_cadence: str | pd.Timedelta,
+    variables: list[str],
+    aggregations: list[str] = ["min", "mean", "max"],
+    config: DownsampleConfig | None = None,
+    **kwargs
+) -> pd.DataFrame:
+    """Create columns like 'input_min', 'input_mean', 'input_max'.
+    This function creates multiple aggregated columns from each input
+    variable, useful for showing data ranges in visualizations.
+    Args:
+        df: Input DataFrame with DatetimeIndex.
+        target_cadence: Target cadence as ISO duration string or Timedelta.
+        variables: List of column names to aggregate.
+        aggregations: List of aggregation methods to apply.
+            Default: ["min", "mean", "max"]
+        config: Downsampling configuration (used for min_completeness if
+            specified in a future version).
+        **kwargs: Additional keyword arguments:
+            - min_completeness: Minimum fraction of expected points (0.0-1.0)
+            - source_cadence: Original cadence for completeness calculation
+    Returns:
+        DataFrame with aggregated columns named {variable}_{aggregation}.
+    Example:
+        >>> import numpy as np
+        >>> df = pd.DataFrame(
+        ...     {'density': np.random.randn(1000), 'velocity': np.random.randn(1000)},
+        ...     index=pd.date_range('2024-01-01', periods=1000, freq='1s')
+        ... )
+        >>> result = downsample_multi_aggregate(
+        ...     df, '1min', ['density', 'velocity']
+        ... )
+        >>> 'density_min' in result.columns
+        True
+        >>> 'density_mean' in result.columns
+        True
+        >>> 'density_max' in result.columns
+        True
+    """
+    min_completeness = kwargs.get('min_completeness', 0.9)
+    source_cadence = kwargs.get('source_cadence', None)
+    return _downsample_multi_aggregate(
+        df=df,
+        target_cadence=target_cadence,
+        variables=variables,
+        aggregations=aggregations,
+        min_completeness=min_completeness,
+        source_cadence=source_cadence
+    )
+def _apply_kwargs_to_config(
+    config: DownsampleConfig,
+    kwargs: dict
+) -> DownsampleConfig:
+    """Apply keyword arguments to a config, creating a new config.
+    Args:
+        config: Base configuration.
+        kwargs: Keyword arguments to apply.
+    Returns:
+        New configuration with kwargs applied.
+    """
+    from dataclasses import replace
+    # Map string method names to enum values
+    if 'method' in kwargs:
+        method = kwargs['method']
+        if isinstance(method, str):
+            kwargs['method'] = AggregationMethod(method)
+    # Filter to only valid config fields
+    valid_fields = {
+        'method', 'lttb_target_column', 'include_columns', 'exclude_columns',
+        'gap_handling', 'gap_threshold', 'edge_handling', 'edge_window',
+        'min_points_per_segment'
+    }
+    filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_fields}
+    return replace(config, **filtered_kwargs)