PyPI - pattern-fill - Versions diffs - 0.1.1__py3-none-any.whl - Mend

pattern-fill 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

pattern_fill/__init__.py +14 -0
pattern_fill/fitting.py +80 -0
pattern_fill/gap_fill.py +581 -0
pattern_fill/pattern.py +354 -0
pattern_fill/sine_fitting.py +179 -0
pattern_fill-0.1.1.dist-info/METADATA +226 -0
pattern_fill-0.1.1.dist-info/RECORD +9 -0
pattern_fill-0.1.1.dist-info/WHEEL +4 -0
pattern_fill-0.1.1.dist-info/licenses/LICENSE +1 -0

pattern_fill/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+from pattern_fill.pattern import DailyPattern, SineComponent
+from pattern_fill.fitting import extract_daily_profile, fit_pattern
+from pattern_fill.sine_fitting import fit_sine_pattern
+from pattern_fill.gap_fill import pattern_fill, pattern_fill_dataset
+# Public API of the package: every name listed here is imported above.
+__all__ = [
+    "DailyPattern",
+    "SineComponent",
+    "extract_daily_profile",
+    "fit_pattern",
+    "fit_sine_pattern",
+    "pattern_fill",
+    "pattern_fill_dataset",
+]

pattern_fill/fitting.py ADDED Viewed

@@ -0,0 +1,80 @@
+from __future__ import annotations
+import numpy as np
+import pandas as pd
+from pattern_fill.pattern import DailyPattern
+def extract_daily_profile(
+    series: pd.Series,
+    resolution_minutes: int = 15,
+    aggregation: str = "median",
+) -> pd.Series:
+    """Group a time series by fractional hour-of-day and aggregate.
+    Returns a Series indexed by fractional hour (e.g. 8.25 for 08:15)
+    with one value per bin.
+    """
+    if not isinstance(series.index, pd.DatetimeIndex):
+        raise TypeError("series must have a DatetimeIndex")
+    s = series.dropna()
+    fractional_hour = s.index.hour + s.index.minute / 60.0 + s.index.second / 3600.0
+    bin_edges = np.arange(0, 24 + resolution_minutes / 60, resolution_minutes / 60)
+    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.0
+    bin_idx = np.digitize(fractional_hour, bin_edges) - 1
+    bin_idx = np.clip(bin_idx, 0, len(bin_centers) - 1)
+    grouped = pd.Series(s.values, index=bin_idx)
+    if aggregation == "median":
+        profile_values = grouped.groupby(level=0).median()
+    elif aggregation == "mean":
+        profile_values = grouped.groupby(level=0).mean()
+    else:
+        raise ValueError(f"aggregation must be 'median' or 'mean', got {aggregation!r}")
+    result = pd.Series(
+        index=bin_centers[profile_values.index.values],
+        data=profile_values.values,
+        name="daily_profile",
+    )
+    result.index.name = "hour"
+    return result
+def fit_pattern(
+    series: pd.Series,
+    n_control_points: int = 8,
+    resolution_minutes: int = 15,
+    aggregation: str = "median",
+    name: str = "fitted",
+    day_type: str = "all",
+) -> DailyPattern:
+    """Fit a DailyPattern from observed time series data.
+    Extracts a daily profile, picks *n_control_points* evenly spaced along
+    the 0-24 h axis, normalizes values to 0-1, and returns a DailyPattern.
+    """
+    profile = extract_daily_profile(
+        series,
+        resolution_minutes=resolution_minutes,
+        aggregation=aggregation,
+    )
+    target_hours = np.linspace(0, 24, n_control_points + 1)[:-1]
+    cp_values = np.interp(target_hours, profile.index.values, profile.values)
+    v_min, v_max = cp_values.min(), cp_values.max()
+    if v_max - v_min > 0:
+        cp_norm = (cp_values - v_min) / (v_max - v_min)
+    else:
+        cp_norm = np.full_like(cp_values, 0.5)
+    return DailyPattern(
+        hours=target_hours.tolist(),
+        values=cp_norm.tolist(),
+        name=name,
+        periodic=True,
+        day_type=day_type,
+    )

pattern_fill/gap_fill.py ADDED Viewed

@@ -0,0 +1,581 @@
+from __future__ import annotations
+import datetime
+from typing import Any
+import numpy as np
+import pandas as pd
+from meteaudata.types import (
+    FunctionInfo,
+    Parameters,
+    ProcessingStep,
+    ProcessingType,
+    Signal,
+    TimeSeries,
+)
+from pattern_fill.fitting import extract_daily_profile
+from pattern_fill.pattern import DailyPattern
+def _find_nan_runs(mask: np.ndarray) -> list[tuple[int, int]]:
+    """Return (start, stop) index pairs for contiguous True runs in *mask*."""
+    if not mask.any():
+        return []
+    diff = np.diff(mask.astype(int))
+    starts = np.where(diff == 1)[0] + 1
+    ends = np.where(diff == -1)[0] + 1
+    if mask[0]:
+        starts = np.r_[0, starts]
+    if mask[-1]:
+        ends = np.r_[ends, len(mask)]
+    return list(zip(starts.tolist(), ends.tolist()))
+def _classify_runs(
+    runs: list[tuple[int, int]], series_len: int
+) -> list[tuple[int, int]]:
+    """Return only interior NaN runs, excluding leading and trailing."""
+    return [
+        (start, stop)
+        for start, stop in runs
+        if start != 0 and stop != series_len
+    ]
+def _select_pattern(
+    pattern: DailyPattern | dict[str, DailyPattern],
+    timestamp: pd.Timestamp,
+) -> DailyPattern:
+    """Pick the right pattern for a timestamp based on day_type."""
+    if isinstance(pattern, DailyPattern):
+        return pattern
+    dow = timestamp.dayofweek  # 0=Monday … 6=Sunday
+    key = "weekday" if dow < 5 else "weekend"
+    if key in pattern:
+        return pattern[key]
+    if "all" in pattern:
+        return pattern["all"]
+    raise KeyError(
+        f"No pattern for day_type={key!r} or 'all' in pattern dict "
+        f"(available keys: {list(pattern.keys())})"
+    )
+def _infer_freq_minutes(index: pd.DatetimeIndex) -> float:
+    """Infer the sampling frequency in minutes from a DatetimeIndex."""
+    freq = pd.infer_freq(index)
+    if freq is not None:
+        offset = pd.tseries.frequencies.to_offset(freq)
+        return offset.nanos / 1e9 / 60
+    # Fallback: median of first diffs
+    n = min(10, len(index))
+    diffs = pd.Series(index[:n]).diff().dropna()
+    if len(diffs) == 0:
+        return 15.0
+    return diffs.median().total_seconds() / 60
+def _smoothed_anchor(
+    col: pd.Series,
+    gap_edge_idx: int,
+    side: str,
+    blend_n: int,
+) -> float | None:
+    """Compute a noise-resistant anchor value near a gap edge.
+    Returns the weighted average of up to *blend_n* non-NaN points
+    adjacent to the gap.  Weights increase linearly toward the gap edge
+    (the closest point gets the highest weight).
+    """
+    if side == "left":
+        start = max(0, gap_edge_idx - blend_n + 1)
+        window = col.iloc[start : gap_edge_idx + 1]
+    else:
+        end = min(len(col), gap_edge_idx + blend_n)
+        window = col.iloc[gap_edge_idx:end]
+    valid = window.dropna()
+    if len(valid) == 0:
+        return None
+    k = len(valid)
+    if side == "left":
+        weights = np.arange(1, k + 1, dtype=float)
+    else:
+        weights = np.arange(k, 0, -1, dtype=float)
+    return float(np.average(valid.values, weights=weights))
+def _estimate_data_range(
+    col: pd.Series,
+    gap_start: int,
+    gap_stop: int,
+    window: str,
+    scaling: str,
+) -> tuple[float, float]:
+    """Estimate (data_min, data_max) for pattern scaling."""
+    if scaling == "none":
+        return 0.0, 1.0
+    if scaling == "global":
+        clean = col.dropna()
+        if len(clean) < 2:
+            return 0.0, 1.0
+        return float(clean.min()), float(clean.max())
+    if scaling != "local":
+        raise ValueError(
+            f"scaling must be 'local', 'global', or 'none', got {scaling!r}"
+        )
+    # Local: data within time window on each side of the gap
+    gap_left_ts = col.index[gap_start]
+    gap_right_ts = col.index[gap_stop - 1]
+    td = pd.Timedelta(window)
+    left_data = col.loc[: col.index[gap_start - 1]].dropna() if gap_start > 0 else pd.Series(dtype=float)
+    if len(left_data) > 0:
+        left_data = left_data.loc[left_data.index >= (gap_left_ts - td)]
+    right_data = col.loc[col.index[gap_stop] :].dropna() if gap_stop < len(col) else pd.Series(dtype=float)
+    if len(right_data) > 0:
+        right_data = right_data.loc[right_data.index <= (gap_right_ts + td)]
+    nearby = pd.concat([left_data, right_data])
+    if len(nearby) < 2:
+        clean = col.dropna()
+        if len(clean) < 2:
+            return 0.0, 1.0
+        return float(clean.min()), float(clean.max())
+    return float(nearby.min()), float(nearby.max())
+def _blend_fill(
+    p_scaled: np.ndarray,
+    left_anchor: float | None,
+    right_anchor: float | None,
+    blend_n: int,
+) -> np.ndarray:
+    """Apply cosine-decay boundary corrections to pre-scaled pattern values.
+    When both boundaries are available, linearly interpolates between the
+    anchors and blends with the scaled pattern using a cosine edge-weight
+    (1 at each edge, 0 at ``blend_n`` steps inward).  This guarantees exact
+    boundary continuity even when blend zones overlap in short gaps.
+    For single-boundary cases an additive correction is used instead.
+    """
+    N = len(p_scaled)
+    positions = np.arange(N, dtype=float)
+    if left_anchor is not None and right_anchor is not None:
+        # Linearly interpolate between anchors
+        alpha = positions / max(N - 1, 1)
+        anchor_interp = (1.0 - alpha) * left_anchor + alpha * right_anchor
+        # Cosine edge weight: 1 at the edges, 0 beyond blend_n from edge
+        edge_dist = np.minimum(positions, float(N - 1) - positions)
+        t = np.clip(edge_dist / max(blend_n, 1), 0.0, 1.0)
+        w = 0.5 * (1.0 + np.cos(np.pi * t))
+        return w * anchor_interp + (1.0 - w) * p_scaled
+    # Single-boundary: additive correction that decays inward
+    result = p_scaled.copy()
+    if left_anchor is not None:
+        r_L = left_anchor - p_scaled[0]
+        t_L = np.clip(positions / max(blend_n, 1), 0.0, 1.0)
+        w_L = 0.5 * (1.0 + np.cos(np.pi * t_L))
+        result += w_L * r_L
+    if right_anchor is not None:
+        r_R = right_anchor - p_scaled[-1]
+        dist_from_right = float(N - 1) - positions
+        t_R = np.clip(dist_from_right / max(blend_n, 1), 0.0, 1.0)
+        w_R = 0.5 * (1.0 + np.cos(np.pi * t_R))
+        result += w_R * r_R
+    return result
+def _compute_expected_area(
+    col: pd.Series,
+    gap_idx: pd.DatetimeIndex,
+    pattern: DailyPattern | dict[str, DailyPattern],
+) -> float | None:
+    """Expected sum of values across a gap, based on the daily profile of clean data.
+    Splits clean data by day type when *pattern* is a dict, so the expected
+    area respects weekday/weekend differences.
+    """
+    clean = col.dropna()
+    if len(clean) < 10:
+        return None
+    uses_day_types = isinstance(pattern, dict)
+    if uses_day_types:
+        profiles: dict[str, pd.Series] = {}
+        for dtype in ("weekday", "weekend"):
+            mask = (
+                clean.index.dayofweek < 5
+                if dtype == "weekday"
+                else clean.index.dayofweek >= 5
+            )
+            subset = clean[mask]
+            profiles[dtype] = extract_daily_profile(
+                subset if len(subset) > 10 else clean, aggregation="mean"
+            )
+    else:
+        profile_all = extract_daily_profile(clean, aggregation="mean")
+    expected = np.empty(len(gap_idx))
+    for i, ts in enumerate(gap_idx):
+        frac_h = ts.hour + ts.minute / 60.0 + ts.second / 3600.0
+        if uses_day_types:
+            dtype = "weekday" if ts.dayofweek < 5 else "weekend"
+            profile = profiles[dtype]
+        else:
+            profile = profile_all
+        expected[i] = np.interp(frac_h, profile.index.values, profile.values)
+    total = expected.sum()
+    return total if abs(total) > 1e-12 else None
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def pattern_fill(
+    input_series: list[pd.Series],
+    pattern: DailyPattern | dict[str, DailyPattern],
+    scaling: str = "local",
+    window: str = "24h",
+    blend_minutes: int = 60,
+    normalize_area: bool = False,
+    *args: Any,
+    **kwargs: Any,
+) -> list[tuple[pd.Series, list[ProcessingStep]]]:
+    """Fill interior NaN gaps using a daily diurnal pattern.
+    Conforms to metEAUdata's ``SignalTransformFunctionProtocol``.
+    Only NaN runs with valid data on both sides are filled; NaN runs that
+    touch the start or the end of a series are left untouched.  Gaps are
+    processed in order on a copy of each series, so data-range and anchor
+    estimates for a later gap can see values filled into earlier gaps.
+    Parameters
+    ----------
+    input_series : list[pd.Series]
+        Series to fill; each must be indexed by a ``pd.DatetimeIndex``.
+    pattern : DailyPattern or dict[str, DailyPattern]
+        A single pattern, or a dict looked up per timestamp with the keys
+        ``"weekday"`` / ``"weekend"`` and ``"all"`` as fallback.
+    scaling : str
+        ``"local"``, ``"global"`` or ``"none"``; selects how the data range
+        used to scale the pattern is estimated.
+    window : str
+        Time span (parsed with ``pd.Timedelta``) searched on each side of a
+        gap when ``scaling="local"``.
+    blend_minutes : int
+        Width (in minutes) of the smoothing window used for anchor
+        computation **and** the cosine blend zone inside each gap.
+    normalize_area : bool
+        When True, the fill's area is normalized to match the expected daily
+        profile computed from the clean portions of the series.
+    *args, **kwargs
+        Accepted but not used by this function.
+    Returns
+    -------
+    list[tuple[pd.Series, list[ProcessingStep]]]
+        One ``(filled_copy, [processing_step])`` pair per input series.  The
+        copy is renamed ``"<signal_name>_PAT-FILL"`` and all pairs share the
+        same ProcessingStep object.
+    Raises
+    ------
+    TypeError
+        If a series does not have a ``DatetimeIndex``.
+    """
+    # Serialize the pattern(s) so they can be recorded in the step metadata.
+    if isinstance(pattern, DailyPattern):
+        pattern_meta = pattern.to_dict()
+    else:
+        pattern_meta = {k: v.to_dict() for k, v in pattern.items()}
+    # NOTE(review): the recorded version is "0.2.0" although this wheel is
+    # published as 0.1.1 - confirm which one is intended.
+    func_info = FunctionInfo(
+        name="pattern_fill",
+        version="0.2.0",
+        author="pattern-fill",
+        reference="https://github.com/jeandavidt/pattern-fill",
+    )
+    parameters = Parameters(
+        pattern=pattern_meta,
+        scaling=scaling,
+        window=window,
+        blend_minutes=blend_minutes,
+        normalize_area=normalize_area,
+    )
+    processing_step = ProcessingStep(
+        type=ProcessingType.GAP_FILLING,
+        function_info=func_info,
+        parameters=parameters,
+        description=(
+            "Gap-filling using a daily diurnal pattern with "
+            "cosine-blended boundary matching"
+        ),
+        # Naive local time (no timezone attached).
+        run_datetime=datetime.datetime.now(),
+        requires_calibration=False,
+        input_series_names=[str(col.name) for col in input_series],
+        suffix="PAT-FILL",
+    )
+    outputs: list[tuple[pd.Series, list[ProcessingStep]]] = []
+    for col in input_series:
+        # Work on a copy so the caller's series is not modified.
+        col = col.copy()
+        signal_name, _, _ = Signal.extract_ts_base_and_number(str(col.name))
+        if not isinstance(col.index, pd.DatetimeIndex):
+            raise TypeError(
+                f"Series {col.name} must have a DatetimeIndex, "
+                f"got {type(col.index)}"
+            )
+        # Convert the blend width from minutes to a number of samples (>= 1).
+        freq_min = _infer_freq_minutes(col.index)
+        blend_n = max(1, int(round(blend_minutes / freq_min)))
+        # The gap list is computed once, before any fill is written.
+        nan_mask = col.isna().values
+        runs = _find_nan_runs(nan_mask)
+        interior_runs = _classify_runs(runs, len(col))
+        for start, stop in interior_runs:
+            gap_idx = col.index[start:stop]
+            # Evaluate pattern over the gap
+            pat_vals = np.empty(len(gap_idx))
+            for i, ts in enumerate(gap_idx):
+                # The pattern is chosen per timestamp so that a gap spanning
+                # a weekday/weekend boundary switches patterns.
+                pat = _select_pattern(pattern, ts)
+                frac_h = ts.hour + ts.minute / 60.0 + ts.second / 3600.0
+                pat_vals[i] = pat.evaluate(np.array([frac_h]))[0]
+            # Scale pattern to data range
+            data_min, data_max = _estimate_data_range(
+                col, start, stop, window, scaling
+            )
+            dr = data_max - data_min
+            # A (near-)zero range is replaced by 1.0 instead of collapsing
+            # the pattern onto data_min.
+            if abs(dr) < 1e-12:
+                dr = 1.0
+            p_scaled = pat_vals * dr + data_min
+            # Area normalization (opt-in)
+            if normalize_area:
+                expected = _compute_expected_area(col, gap_idx, pattern)
+                actual = p_scaled.sum()
+                # Skipped when no expected area is available or the current
+                # sum is numerically zero.
+                if expected is not None and abs(actual) > 1e-12:
+                    p_scaled *= expected / actual
+            # Smoothed boundary anchors
+            left_anchor = (
+                _smoothed_anchor(col, start - 1, "left", blend_n)
+                if start > 0
+                else None
+            )
+            right_anchor = (
+                _smoothed_anchor(col, stop, "right", blend_n)
+                if stop < len(col)
+                else None
+            )
+            filled = _blend_fill(p_scaled, left_anchor, right_anchor, blend_n)
+            # Written into the working copy, so later gaps see these values.
+            col.iloc[start:stop] = filled
+        col.name = f"{signal_name}_{processing_step.suffix}"
+        outputs.append((col, [processing_step]))
+    return outputs
+def pattern_fill_dataset(
+    input_signals: list[Signal],
+    input_series_names: list[str],
+    patterns: list[DailyPattern | dict[str, DailyPattern]],
+    mode: str = "load",
+    blend_minutes: int = 60,
+    scaling: str = "local",
+    window: str = "24h",
+    *args: Any,
+    **kwargs: Any,
+) -> list[Signal]:
+    """Fill NaN gaps with area normalization at the dataset level.
+    Conforms to metEAUdata's ``DatasetTransformFunctionProtocol``.
+    Parameters
+    ----------
+    input_signals : list[Signal]
+        One signal for the ``"concentration"`` / ``"flow"`` modes, two
+        (concentration first, flow second) for ``"load"``.
+    input_series_names : list[str]
+        Name of the time series to read from each signal, in the same order
+        as *input_signals*.
+    patterns : list[DailyPattern | dict[str, DailyPattern]]
+        One pattern (or day-type dict) per signal, in the same order.
+    mode : str
+        ``"concentration"`` or ``"flow"``: single signal, area normalized to
+        its own daily profile.  ``"load"``: two signals (concentration first,
+        flow second), both filled, then concentration normalized so that
+        ``conc * flow`` matches the expected daily load.
+    blend_minutes, scaling, window
+        Forwarded unchanged to ``pattern_fill``.
+    *args, **kwargs
+        Accepted but not used by this function.
+    Returns
+    -------
+    list[Signal]
+        New signals built from the filled series: one in the single-signal
+        modes, ``[concentration, flow]`` in load mode.
+    Raises
+    ------
+    ValueError
+        If *mode* is unknown, or the number of signals / patterns does not
+        match what the mode requires.
+    """
+    valid_modes = ("concentration", "flow", "load")
+    if mode not in valid_modes:
+        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")
+    if mode == "load":
+        if len(input_signals) != 2 or len(patterns) != 2:
+            raise ValueError(
+                "load mode requires exactly 2 signals and 2 patterns "
+                f"(concentration, flow); got {len(input_signals)} signals "
+                f"and {len(patterns)} patterns"
+            )
+    else:
+        if len(input_signals) != 1 or len(patterns) != 1:
+            raise ValueError(
+                f"{mode} mode requires exactly 1 signal and 1 pattern; "
+                f"got {len(input_signals)} signals and {len(patterns)} patterns"
+            )
+    # Only used by the load-mode processing step built further down.
+    func_info = FunctionInfo(
+        name="pattern_fill_dataset",
+        version="0.2.0",
+        author="pattern-fill",
+        reference="https://github.com/jeandavidt/pattern-fill",
+    )
+    # ---- concentration / flow modes (single signal, area-normalized) --------
+    if mode in ("concentration", "flow"):
+        signal = input_signals[0]
+        ts_name = input_series_names[0]
+        series = signal.time_series[ts_name].series
+        pat = patterns[0]
+        # Delegate entirely to pattern_fill with area normalization enabled.
+        results = pattern_fill(
+            [series],
+            pattern=pat,
+            scaling=scaling,
+            window=window,
+            blend_minutes=blend_minutes,
+            normalize_area=True,
+        )
+        filled_series, steps = results[0]
+        ts_obj = TimeSeries(series=filled_series, processing_steps=steps)
+        out_signal = Signal(
+            input_data=ts_obj,
+            name=Signal.extract_ts_base_and_number(str(filled_series.name))[0],
+            provenance=signal.provenance,
+            units=signal.units,
+        )
+        return [out_signal]
+    # ---- load mode (two signals: concentration + flow) ----------------------
+    conc_signal, flow_signal = input_signals
+    conc_ts_name, flow_ts_name = input_series_names
+    conc_series = conc_signal.time_series[conc_ts_name].series
+    flow_series = flow_signal.time_series[flow_ts_name].series
+    conc_pat, flow_pat = patterns
+    # Step 1: fill both signals (without area normalization)
+    conc_results = pattern_fill(
+        [conc_series],
+        pattern=conc_pat,
+        scaling=scaling,
+        window=window,
+        blend_minutes=blend_minutes,
+        normalize_area=False,
+    )
+    flow_results = pattern_fill(
+        [flow_series],
+        pattern=flow_pat,
+        scaling=scaling,
+        window=window,
+        blend_minutes=blend_minutes,
+        normalize_area=False,
+    )
+    filled_conc, conc_steps = conc_results[0]
+    filled_flow, flow_steps = flow_results[0]
+    # Step 2: compute daily load profile from clean data
+    conc_clean = conc_series.dropna()
+    flow_clean = flow_series.dropna()
+    # Align clean data to timestamps present in both
+    common_idx = conc_clean.index.intersection(flow_clean.index)
+    # Load normalization is skipped entirely when 20 or fewer timestamps
+    # have both a concentration and a flow value.
+    if len(common_idx) > 20:
+        load_clean = pd.Series(
+            conc_clean.loc[common_idx].values * flow_clean.loc[common_idx].values,
+            index=common_idx,
+            name="load",
+        )
+        # Day-type handling is decided by the concentration pattern only;
+        # the type of the flow pattern is not consulted here.
+        uses_day_types = isinstance(conc_pat, dict)
+        if uses_day_types:
+            load_profiles: dict[str, pd.Series] = {}
+            for dtype in ("weekday", "weekend"):
+                mask = (
+                    load_clean.index.dayofweek < 5
+                    if dtype == "weekday"
+                    else load_clean.index.dayofweek >= 5
+                )
+                subset = load_clean[mask]
+                # A day type with 10 or fewer samples reuses the profile of
+                # all clean load data.
+                load_profiles[dtype] = extract_daily_profile(
+                    subset if len(subset) > 10 else load_clean,
+                    aggregation="mean",
+                )
+        else:
+            load_profile_all = extract_daily_profile(
+                load_clean, aggregation="mean"
+            )
+        # Step 3: normalize concentration fills so load matches expected
+        # Gaps are located on the ORIGINAL concentration series; the same
+        # positions are then used to slice the filled flow series.
+        # NOTE(review): this assumes both series share the same index and
+        # length - confirm against callers.
+        nan_mask = conc_series.isna().values
+        runs = _find_nan_runs(nan_mask)
+        interior_runs = _classify_runs(runs, len(conc_series))
+        for gap_start, gap_stop in interior_runs:
+            gap_idx = conc_series.index[gap_start:gap_stop]
+            # Expected load for the gap period
+            expected_load = np.empty(len(gap_idx))
+            for i, ts in enumerate(gap_idx):
+                frac_h = ts.hour + ts.minute / 60.0 + ts.second / 3600.0
+                if uses_day_types:
+                    dtype = "weekday" if ts.dayofweek < 5 else "weekend"
+                    lp = load_profiles[dtype]
+                else:
+                    lp = load_profile_all
+                expected_load[i] = np.interp(
+                    frac_h, lp.index.values, lp.values
+                )
+            expected_sum = expected_load.sum()
+            # Actual load from fills
+            gap_conc = filled_conc.iloc[gap_start:gap_stop].values
+            gap_flow = filled_flow.iloc[gap_start:gap_stop].values
+            actual_sum = (gap_conc * gap_flow).sum()
+            # Rescale only when both sums are meaningfully non-zero; a NaN
+            # sum also fails this test and leaves the fill unchanged.
+            if abs(actual_sum) > 1e-12 and abs(expected_sum) > 1e-12:
+                ratio = expected_sum / actual_sum
+                filled_conc.iloc[gap_start:gap_stop] = gap_conc * ratio
+    # Build processing step for load normalization
+    # (appended to the concentration series even when the normalization
+    # above was skipped for lack of common data).
+    load_step = ProcessingStep(
+        type=ProcessingType.GAP_FILLING,
+        function_info=func_info,
+        parameters=Parameters(
+            mode=mode,
+            scaling=scaling,
+            window=window,
+            blend_minutes=blend_minutes,
+        ),
+        description=(
+            "Gap-filling with load-normalized daily pattern "
+            "(concentration adjusted so conc × flow matches expected load)"
+        ),
+        run_datetime=datetime.datetime.now(),
+        requires_calibration=False,
+        input_series_names=input_series_names,
+        suffix="PAT-FILL",
+    )
+    conc_ts = TimeSeries(
+        series=filled_conc,
+        processing_steps=conc_steps + [load_step],
+    )
+    # The flow series keeps only the steps produced by pattern_fill.
+    flow_ts = TimeSeries(
+        series=filled_flow,
+        processing_steps=flow_steps,
+    )
+    out_conc = Signal(
+        input_data=conc_ts,
+        name=Signal.extract_ts_base_and_number(str(filled_conc.name))[0],
+        provenance=conc_signal.provenance,
+        units=conc_signal.units,
+    )
+    out_flow = Signal(
+        input_data=flow_ts,
+        name=Signal.extract_ts_base_and_number(str(filled_flow.name))[0],
+        provenance=flow_signal.provenance,
+        units=flow_signal.units,
+    )
+    return [out_conc, out_flow]