downsampler 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- downsampler/__init__.py +80 -0
- downsampler/aggregators.py +338 -0
- downsampler/config.py +72 -0
- downsampler/core.py +166 -0
- downsampler/deferred.py +357 -0
- downsampler/edges.py +202 -0
- downsampler/fidelity/__init__.py +23 -0
- downsampler/fidelity/comparison.py +343 -0
- downsampler/fidelity/metrics.py +212 -0
- downsampler/fidelity/visualization.py +359 -0
- downsampler/gaps.py +310 -0
- downsampler/lttb.py +207 -0
- downsampler/utils.py +150 -0
- downsampler-0.1.0.dist-info/METADATA +246 -0
- downsampler-0.1.0.dist-info/RECORD +18 -0
- downsampler-0.1.0.dist-info/WHEEL +5 -0
- downsampler-0.1.0.dist-info/licenses/LICENSE +21 -0
- downsampler-0.1.0.dist-info/top_level.txt +1 -0
downsampler/lttb.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""LTTB (Largest Triangle Three Buckets) downsampling with gap handling."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
import lttb
|
|
7
|
+
|
|
8
|
+
from downsampler.config import DownsampleConfig, GapHandling, EdgeHandling
|
|
9
|
+
from downsampler.gaps import split_at_gaps
|
|
10
|
+
from downsampler.edges import apply_edge_handling
|
|
11
|
+
from downsampler.utils import parse_cadence, get_numeric_columns, compute_output_points
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def downsample_lttb(
    df_in: pd.DataFrame,
    target_column: str,
    target_cadence: str | pd.Timedelta,
    include_columns: list[str] | None = None,
    gap_threshold: pd.Timedelta | None = None,
    min_points_per_segment: int = 3
) -> pd.DataFrame:
    """Perform LTTB downsampling on a pandas DataFrame.

    LTTB (Largest Triangle Three Buckets) is a downsampling algorithm that
    preserves visual characteristics of the data by selecting points that
    maximize the area of triangles formed with adjacent buckets.

    The input is first split into segments with ``split_at_gaps``; each
    segment is downsampled on its own and the per-segment results are
    concatenated and sorted by index.

    Args:
        df_in: Input DataFrame with DatetimeIndex.
        target_column: Column to optimize visual fidelity for.
        target_cadence: Target cadence as ISO duration string or Timedelta.
        include_columns: Additional columns to include in output (interpolated
            to LTTB-selected time points). If None, includes all numeric columns.
        gap_threshold: Minimum duration to consider as a gap. If None, uses
            2x target_cadence.
        min_points_per_segment: Minimum points required per segment. Shorter
            segments are dropped from the output.

    Returns:
        DataFrame downsampled using LTTB algorithm. When no segment yields
        any output, an empty DataFrame with the input's columns and an empty
        index of the same kind as ``df_in.index`` is returned.

    Example:
        >>> df = pd.DataFrame(
        ...     {'signal': np.sin(np.linspace(0, 10*np.pi, 1000))},
        ...     index=pd.date_range('2024-01-01', periods=1000, freq='1s')
        ... )
        >>> result = downsample_lttb(df, 'signal', 'PT10S')
        >>> len(result) < len(df)
        True
    """
    target_cadence = parse_cadence(target_cadence)

    if gap_threshold is None:
        gap_threshold = 2 * target_cadence

    # Split at gaps and process each segment independently
    resampled_segments = []
    for segment in split_at_gaps(df_in, gap_threshold):
        if len(segment) < min_points_per_segment:
            continue

        resampled = _lttb_single_segment(
            segment,
            target_column,
            target_cadence,
            include_columns
        )
        # The helper returns None when a segment cannot be processed.
        if resampled is not None and len(resampled) > 0:
            resampled_segments.append(resampled)

    if not resampled_segments:
        # Reuse an empty slice of the input index so the empty result keeps
        # the same index type (and timezone/name) as a non-empty result.
        return pd.DataFrame(columns=df_in.columns, index=df_in.index[:0])

    return pd.concat(resampled_segments, axis=0).sort_index()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _lttb_single_segment(
    df: pd.DataFrame,
    target_column: str,
    target_cadence: pd.Timedelta,
    include_columns: list[str] | None = None
) -> pd.DataFrame | None:
    """Apply LTTB to a single contiguous segment.

    Time is converted to elapsed minutes since the first sample so that
    ``lttb.downsample`` can work on a purely numeric ``(x, y)`` array. Rows
    where the target column is missing are excluded from point selection.
    The selected points are mapped back to their original index labels, so
    output timestamps are exactly those of the input rows.

    Args:
        df: Input DataFrame (no gaps).
        target_column: Column to optimize for.
        target_cadence: Target cadence.
        include_columns: Additional columns to include. If None, every
            numeric column of ``df`` is included.

    Returns:
        Downsampled DataFrame containing ``target_column`` followed by the
        other included numeric columns (linearly interpolated at the selected
        time points), or None if the segment cannot be processed (empty
        segment, fewer than 3 requested output points, or fewer than 3
        non-missing target values).
    """
    if len(df) == 0:
        # Nothing to select from; avoids an IndexError on df.index[0].
        return None

    # Compute number of output points
    n_out = compute_output_points(df.index[0], df.index[-1], target_cadence)

    if n_out < 3:
        logging.warning("Cannot perform LTTB downsampling on less than 3 points")
        return None

    # Convert time to numeric (elapsed minutes) for the LTTB algorithm.
    # A standalone array is used so a user column is never overwritten.
    timeref = df.index[0]
    time_num = np.asarray(
        (df.index - timeref) / pd.to_timedelta('1min'),
        dtype=float
    )

    # Only rows with a usable target value take part in point selection.
    target = df[target_column]
    valid = target.notna().to_numpy()

    if valid.sum() < 3:
        logging.warning("Insufficient non-NaN data points for LTTB")
        return None

    data = np.column_stack(
        (time_num[valid], target.to_numpy()[valid].astype(float))
    )

    # lttb.downsample rejects an n_out larger than the number of input rows
    # (this happens when the target cadence is finer than the data or many
    # target values are missing), so never ask for more points than exist.
    n_out = min(n_out, len(data))

    # Apply LTTB downsampling
    selected = lttb.downsample(data, n_out)

    # LTTB returns rows taken from its input, so each selected x value can be
    # located in the (sorted) input x values. Looking the rows up avoids
    # rebuilding timestamps from floats, which can drift by nanoseconds.
    positions = np.searchsorted(data[:, 0], selected[:, 0])
    positions = np.minimum(positions, len(data) - 1)

    df_resampled = pd.DataFrame(
        {target_column: selected[:, 1]},
        index=df.index[valid][positions]
    )

    # Interpolate other columns to LTTB-selected time points
    for col in df.columns:
        if col in ('time', 'time_num', target_column):
            continue
        if include_columns is not None and col not in include_columns:
            continue
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        df_resampled[col] = np.interp(
            x=selected[:, 0],
            xp=time_num,
            fp=df[col].values
        )

    return df_resampled
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def downsample_lttb_with_config(
    df: pd.DataFrame,
    target_cadence: str | pd.Timedelta,
    config: DownsampleConfig
) -> pd.DataFrame:
    """Run LTTB downsampling driven by a ``DownsampleConfig``.

    The configuration supplies the target column, gap threshold, column
    selection, minimum segment size, edge handling and excluded columns.

    Args:
        df: Input DataFrame with DatetimeIndex.
        target_cadence: Target cadence.
        config: Downsampling configuration.

    Returns:
        Downsampled DataFrame.

    Raises:
        ValueError: If lttb_target_column is not specified in config.
    """
    cadence = parse_cadence(target_cadence)

    if config.lttb_target_column is None:
        raise ValueError("lttb_target_column must be specified for LTTB method")

    # The gap threshold is resolved by the config for this cadence.
    threshold = config.get_gap_threshold(cadence)

    # An empty include list means "no explicit selection".
    selected_columns = config.include_columns or None

    downsampled = downsample_lttb(
        df_in=df,
        target_column=config.lttb_target_column,
        target_cadence=cadence,
        include_columns=selected_columns,
        gap_threshold=threshold,
        min_points_per_segment=config.min_points_per_segment
    )

    # Edge handling is only applied to non-empty results.
    if len(downsampled) > 0:
        downsampled = apply_edge_handling(
            downsampled,
            config.edge_handling,
            config.edge_window
        )

    # Drop excluded columns that actually made it into the result.
    if config.exclude_columns:
        present = [
            name for name in config.exclude_columns
            if name in downsampled.columns
        ]
        downsampled = downsampled.drop(columns=present)

    return downsampled
|
downsampler/utils.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""Utility functions for downsampler."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
from scipy.interpolate import interp1d
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def estimate_cadence(df: pd.DataFrame) -> pd.Timedelta:
    """Estimate the sampling cadence of a time series DataFrame.

    The estimate is the median of the differences between consecutive index
    values, which keeps it robust to occasional gaps.

    Args:
        df: DataFrame with DatetimeIndex.

    Returns:
        Estimated cadence as a Timedelta.

    Raises:
        ValueError: If DataFrame has fewer than 2 rows.
    """
    n_rows = len(df)
    if n_rows < 2:
        raise ValueError("DataFrame must have at least 2 rows to estimate cadence")

    # The first difference is undefined and is removed by dropna().
    steps = df.index.to_series().diff()
    return steps.dropna().median()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_cadence(cadence: str | pd.Timedelta) -> pd.Timedelta:
|
|
30
|
+
"""Parse a cadence specification into a Timedelta.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
cadence: Either a Timedelta or an ISO 8601 duration string
|
|
34
|
+
(e.g., "PT1H", "P1D").
|
|
35
|
+
|
|
36
|
+
Returns:
|
|
37
|
+
The cadence as a Timedelta.
|
|
38
|
+
"""
|
|
39
|
+
if isinstance(cadence, pd.Timedelta):
|
|
40
|
+
return cadence
|
|
41
|
+
return pd.to_timedelta(cadence)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_numeric_columns(df: pd.DataFrame) -> list[str]:
    """Return the names of the numeric columns of a DataFrame.

    Args:
        df: Input DataFrame.

    Returns:
        Column names, in DataFrame order, whose dtype is numeric according
        to ``pd.api.types.is_numeric_dtype``.
    """
    is_numeric = pd.api.types.is_numeric_dtype
    numeric_names = []
    for name in df.columns:
        if is_numeric(df[name]):
            numeric_names.append(name)
    return numeric_names
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def filter_columns(
|
|
57
|
+
df: pd.DataFrame,
|
|
58
|
+
include: list[str] | None = None,
|
|
59
|
+
exclude: list[str] | None = None
|
|
60
|
+
) -> list[str]:
|
|
61
|
+
"""Filter columns based on include/exclude lists.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
df: Input DataFrame.
|
|
65
|
+
include: Columns to include. If empty or None, all columns are included.
|
|
66
|
+
exclude: Columns to exclude.
|
|
67
|
+
|
|
68
|
+
Returns:
|
|
69
|
+
Filtered list of column names.
|
|
70
|
+
"""
|
|
71
|
+
columns = list(df.columns)
|
|
72
|
+
|
|
73
|
+
if include:
|
|
74
|
+
columns = [c for c in columns if c in include]
|
|
75
|
+
|
|
76
|
+
if exclude:
|
|
77
|
+
columns = [c for c in columns if c not in exclude]
|
|
78
|
+
|
|
79
|
+
return columns
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def setup_interpolator(
    dataframe: pd.DataFrame,
    field: str,
    kind: str = 'linear'
) -> callable:
    """Build an interpolation function for one DataFrame column.

    Timestamps are converted to elapsed minutes since the first index value
    and passed to ``scipy.interpolate.interp1d``.

    Args:
        dataframe: Input DataFrame with DatetimeIndex.
        field: Column name to interpolate.
        kind: Type of interpolation ('linear', 'cubic', etc.).

    Returns:
        A function that takes a timestamp and returns the interpolated value.
        Scalar results are returned as ``float``; times outside the range of
        the data yield NaN.
    """
    one_minute = pd.to_timedelta(1, 'min')
    origin = dataframe.index[0]

    def _minutes_since_origin(t):
        # Same numeric time axis for building and evaluating the interpolator.
        return (t - origin) / one_minute

    interpolate = interp1d(
        x=_minutes_since_origin(dataframe.index),
        y=dataframe[field],
        kind=kind,
        bounds_error=False,
        fill_value=np.nan
    )

    def func(t):
        value = interpolate(_minutes_since_origin(t))
        # interp1d returns a 0-d array for scalar input; unwrap it to a float.
        return float(value) if np.ndim(value) == 0 else value

    return func
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def setup_interpolators(
    dataframe: pd.DataFrame,
    kind: str = 'linear'
) -> dict[str, callable]:
    """Build interpolation functions for every numeric column of a DataFrame.

    Args:
        dataframe: Input DataFrame with DatetimeIndex.
        kind: Type of interpolation ('linear', 'cubic', etc.).

    Returns:
        Dictionary mapping column names to interpolation functions.
    """
    return {
        name: setup_interpolator(dataframe, name, kind)
        for name in get_numeric_columns(dataframe)
    }
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def compute_output_points(
    start: pd.Timestamp,
    end: pd.Timestamp,
    target_cadence: pd.Timedelta
) -> int:
    """Compute how many output points cover a time range at a given cadence.

    Args:
        start: Start timestamp.
        end: End timestamp.
        target_cadence: Target cadence.

    Returns:
        Number of output points: one for the start point plus the number of
        cadence steps in the span, truncated to an integer.
    """
    span = end - start
    steps = span / target_cadence
    return int(1 + steps)
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: downsampler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
|
|
5
|
+
Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://gitlab.com/KNMI-OSS/spaceweather/downsampler
|
|
8
|
+
Project-URL: Repository, https://gitlab.com/KNMI-OSS/spaceweather/downsampler
|
|
9
|
+
Keywords: timeseries,downsampling,lttb,pandas,visualization
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy>=1.20
|
|
22
|
+
Requires-Dist: pandas>=1.3
|
|
23
|
+
Requires-Dist: lttb>=0.3
|
|
24
|
+
Requires-Dist: scipy>=1.7
|
|
25
|
+
Provides-Extra: viz
|
|
26
|
+
Requires-Dist: matplotlib>=3.5; extra == "viz"
|
|
27
|
+
Requires-Dist: altair>=5.0; extra == "viz"
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: downsampler[test,viz]; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# downsampler
|
|
36
|
+
|
|
37
|
+
A Python package for time series DataFrame downsampling with LTTB, multiple aggregation methods, gap handling, and fidelity testing.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **Multiple downsampling methods**: LTTB (visual fidelity), mean, median, min, max
|
|
42
|
+
- **Gap-aware processing**: Automatically detects and handles gaps in time series
|
|
43
|
+
- **Edge handling**: Flag, discard, or keep edge points
|
|
44
|
+
- **Multi-aggregate output**: Generate min/mean/max columns in a single call
|
|
45
|
+
- **Deferred fetching**: Lazy data loading with automatic edge buffering
|
|
46
|
+
- **Fidelity testing**: Compare methods and measure visual accuracy
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install downsampler
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Or with visualization support:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install downsampler[viz]
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
### Basic Downsampling
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
import pandas as pd
|
|
66
|
+
from downsampler import downsample
|
|
67
|
+
|
|
68
|
+
# Create sample data
|
|
69
|
+
df = pd.DataFrame(
|
|
70
|
+
{'temperature': range(1000)},
|
|
71
|
+
index=pd.date_range('2024-01-01', periods=1000, freq='1s')
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Downsample to 1-minute cadence (default: mean)
|
|
75
|
+
result = downsample(df, target_cadence='PT1M')
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Using Different Methods
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from downsampler import downsample, DownsampleConfig, AggregationMethod
|
|
82
|
+
|
|
83
|
+
# Mean (default)
|
|
84
|
+
result = downsample(df, '10min')
|
|
85
|
+
|
|
86
|
+
# Maximum
|
|
87
|
+
result = downsample(df, '10min', method='max')
|
|
88
|
+
|
|
89
|
+
# LTTB for visual fidelity
|
|
90
|
+
config = DownsampleConfig(
|
|
91
|
+
method=AggregationMethod.LTTB,
|
|
92
|
+
lttb_target_column='temperature'
|
|
93
|
+
)
|
|
94
|
+
result = downsample(df, '10min', config=config)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Multi-Aggregate Downsampling
|
|
98
|
+
|
|
99
|
+
Create min/mean/max columns for visualization with error bands:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from downsampler import downsample_multi_aggregate
|
|
103
|
+
|
|
104
|
+
result = downsample_multi_aggregate(
|
|
105
|
+
df,
|
|
106
|
+
target_cadence='1min',
|
|
107
|
+
variables=['temperature', 'pressure'],
|
|
108
|
+
aggregations=['min', 'mean', 'max']
|
|
109
|
+
)
|
|
110
|
+
# Result has columns: temperature_min, temperature_mean, temperature_max, etc.
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Handling Gaps
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from downsampler import DownsampleConfig, GapHandling
|
|
117
|
+
|
|
118
|
+
config = DownsampleConfig(
|
|
119
|
+
gap_handling=GapHandling.SEGMENT, # Process segments independently
|
|
120
|
+
gap_threshold='5min' # Gaps > 5 min trigger segmentation
|
|
121
|
+
)
|
|
122
|
+
result = downsample(df, '1min', config=config)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Deferred Data Fetching
|
|
126
|
+
|
|
127
|
+
For data that needs to be fetched from an external source:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from downsampler.deferred import deferred_downsample
|
|
131
|
+
|
|
132
|
+
def fetch_from_api(start, end):
|
|
133
|
+
# Your data fetching logic here
|
|
134
|
+
return pd.DataFrame(...)
|
|
135
|
+
|
|
136
|
+
result = deferred_downsample(
|
|
137
|
+
fetcher=fetch_from_api,
|
|
138
|
+
output_start=pd.Timestamp('2024-01-01'),
|
|
139
|
+
output_end=pd.Timestamp('2024-01-02'),
|
|
140
|
+
target_cadence='1h'
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Fidelity Comparison
|
|
145
|
+
|
|
146
|
+
Compare different methods to find the best one for your data:
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from downsampler.fidelity import FidelityComparison
|
|
150
|
+
|
|
151
|
+
comp = FidelityComparison(original_df, 'signal')
|
|
152
|
+
results = comp.compare_methods('10s')
|
|
153
|
+
|
|
154
|
+
for r in results:
|
|
155
|
+
print(f"{r.method.value}: RMSE={r.metrics.rmse:.4f}")
|
|
156
|
+
|
|
157
|
+
# Get recommendation
|
|
158
|
+
config = comp.recommend_settings('10s', priority='visual')
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Configuration Options
|
|
162
|
+
|
|
163
|
+
### DownsampleConfig
|
|
164
|
+
|
|
165
|
+
| Parameter | Type | Default | Description |
|
|
166
|
+
|-----------|------|---------|-------------|
|
|
167
|
+
| `method` | AggregationMethod | MEAN | Downsampling method |
|
|
168
|
+
| `lttb_target_column` | str | None | Column to optimize for LTTB |
|
|
169
|
+
| `include_columns` | list[str] | [] | Columns to include (empty = all) |
|
|
170
|
+
| `exclude_columns` | list[str] | [] | Columns to exclude |
|
|
171
|
+
| `gap_handling` | GapHandling | SEGMENT | How to handle gaps |
|
|
172
|
+
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
173
|
+
| `edge_handling` | EdgeHandling | FLAG | How to handle edges |
|
|
174
|
+
| `edge_window` | int | 2 | Points at each edge |
|
|
175
|
+
| `min_points_per_segment` | int | 3 | Min points for processing |
|
|
176
|
+
|
|
177
|
+
### Aggregation Methods
|
|
178
|
+
|
|
179
|
+
- `MEAN`: Arithmetic mean (best for general use)
|
|
180
|
+
- `MEDIAN`: Median (robust to outliers)
|
|
181
|
+
- `MIN`: Minimum value (preserves lows)
|
|
182
|
+
- `MAX`: Maximum value (preserves highs)
|
|
183
|
+
- `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
|
|
184
|
+
|
|
185
|
+
### Gap Handling
|
|
186
|
+
|
|
187
|
+
- `SEGMENT`: Split at gaps, process independently (recommended)
|
|
188
|
+
- `INTERPOLATE`: Fill gaps before processing
|
|
189
|
+
- `IGNORE`: Treat as continuous data
|
|
190
|
+
|
|
191
|
+
### Edge Handling
|
|
192
|
+
|
|
193
|
+
- `KEEP`: Keep edge points as-is
|
|
194
|
+
- `FLAG`: Add `_is_edge` column
|
|
195
|
+
- `DISCARD`: Remove edge points
|
|
196
|
+
|
|
197
|
+
## Examples
|
|
198
|
+
|
|
199
|
+
See the `examples/` directory for complete examples:
|
|
200
|
+
|
|
201
|
+
- `basic_downsampling.py`: Core downsampling features
|
|
202
|
+
- `multi_aggregate.py`: Creating min/mean/max columns
|
|
203
|
+
- `deferred_fetch.py`: Lazy data loading
|
|
204
|
+
- `fidelity_comparison.marimo.py`: Interactive comparison notebook
|
|
205
|
+
|
|
206
|
+
## API Reference
|
|
207
|
+
|
|
208
|
+
### Core Functions
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
downsample(df, target_cadence, config=None, **kwargs) -> DataFrame
|
|
212
|
+
downsample_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Gap Functions
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
find_gap_indices(df, timedelta_max_gap) -> Series
|
|
219
|
+
groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
|
|
220
|
+
split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
|
|
221
|
+
mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### Deferred Functions
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
deferred_downsample(fetcher, output_start, output_end, target_cadence, ...) -> DataFrame
|
|
228
|
+
batch_deferred_downsample(fetcher, ..., batch_size) -> DataFrame
|
|
229
|
+
LazyDownsampler(fetcher, cache_buffer) -> LazyDownsampler
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Fidelity Functions
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
compute_metrics(original, downsampled, column) -> FidelityMetrics
|
|
236
|
+
FidelityComparison(original_df, column) -> FidelityComparison
|
|
237
|
+
plot_comparison(original, downsampled, column, backend) -> Figure
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
MIT License - see LICENSE file for details.
|
|
243
|
+
|
|
244
|
+
## Contributing
|
|
245
|
+
|
|
246
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
downsampler/__init__.py,sha256=VbLTpckyt-beR7XNugIFmB4ki7A9UwoMQglLnsQBjDs,2104
|
|
2
|
+
downsampler/aggregators.py,sha256=QXraUXjX4s335Fz9AXq5dIRnIl1GOKVViO1u1IVYQAY,10989
|
|
3
|
+
downsampler/config.py,sha256=YU2Z4JEcb-xlHdanHef-lGSMwrjyaFDALLGtV1FwXxs,2613
|
|
4
|
+
downsampler/core.py,sha256=j9dDSO_D_IfVGwwGIVw3Zg4e8RThSR0UkKY8I5xzfDw,5727
|
|
5
|
+
downsampler/deferred.py,sha256=-uFRpHV5bGb06uXehhO0BPmjNt7uyvzPtTmEYMTlCRs,11859
|
|
6
|
+
downsampler/edges.py,sha256=gzXX9bScf6rx2fEUts3oNVmAYAiE2hmrXIbvqTxLEsI,5863
|
|
7
|
+
downsampler/gaps.py,sha256=-yxk5dNtBORevTCA5_LZz-fxGAdsUaoPabos7jagUHg,9969
|
|
8
|
+
downsampler/lttb.py,sha256=3fWf4pOC61GRsmF_z-clOnPJ8W-XS9auI73eKpwMVaI,6471
|
|
9
|
+
downsampler/utils.py,sha256=yu_AXEMyy8F3LP4a9QoWeravNGYR9m56viC9menlYAU,3914
|
|
10
|
+
downsampler/fidelity/__init__.py,sha256=Ta4wNREv06A53avBizTzPfORK7VzEgkyAjekcZNdsSk,658
|
|
11
|
+
downsampler/fidelity/comparison.py,sha256=EP-l-Dpnkp9MJVoNXqj6a7-Y5MlL8UrBD9eBWJhbnDY,11673
|
|
12
|
+
downsampler/fidelity/metrics.py,sha256=Ls1yWPuDaB5WoAHYNKSsA92hrodo6eCyOVUcFYEAbzM,6687
|
|
13
|
+
downsampler/fidelity/visualization.py,sha256=j6NcdZud6JM8CoMJjajSzGY3qZF1CfpDH8rNnc80-1g,10251
|
|
14
|
+
downsampler-0.1.0.dist-info/licenses/LICENSE,sha256=F0hoUNYeL_QW7JqUFG8CHNuxBnmU0dzLf_xB78xb9gI,1061
|
|
15
|
+
downsampler-0.1.0.dist-info/METADATA,sha256=secDw53t09IzwiagKz98QhjzLPgstivWgcFBYLwPGNM,7006
|
|
16
|
+
downsampler-0.1.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
|
|
17
|
+
downsampler-0.1.0.dist-info/top_level.txt,sha256=LjlUQA5drUOEpNbPWlA_dx5GT8Xp45GZp1Yhs_r_DSQ,12
|
|
18
|
+
downsampler-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KNMI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
downsampler
|