downsampler 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- downsampler-0.3.0/PKG-INFO +335 -0
- downsampler-0.3.0/README.md +309 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/pyproject.toml +20 -12
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/__init__.py +36 -15
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/aggregators.py +80 -78
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/config.py +26 -12
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/core.py +71 -15
- downsampler-0.3.0/src/downsampler/edges.py +89 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/fidelity/__init__.py +2 -9
- downsampler-0.3.0/src/downsampler/fidelity/comparison.py +150 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/fidelity/metrics.py +9 -84
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/gaps.py +135 -20
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/lttb.py +69 -39
- downsampler-0.3.0/src/downsampler/m4.py +396 -0
- downsampler-0.3.0/src/downsampler/ranged.py +457 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/utils.py +26 -2
- downsampler-0.3.0/src/downsampler.egg-info/PKG-INFO +335 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler.egg-info/SOURCES.txt +5 -4
- downsampler-0.3.0/src/downsampler.egg-info/requires.txt +5 -0
- downsampler-0.3.0/tests/test_aggregators.py +237 -0
- downsampler-0.3.0/tests/test_core.py +211 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/tests/test_edges.py +0 -56
- downsampler-0.3.0/tests/test_fidelity.py +160 -0
- downsampler-0.3.0/tests/test_gaps.py +380 -0
- downsampler-0.3.0/tests/test_lttb.py +343 -0
- downsampler-0.3.0/tests/test_m4.py +493 -0
- downsampler-0.3.0/tests/test_ranged.py +262 -0
- downsampler-0.1.0/PKG-INFO +0 -246
- downsampler-0.1.0/README.md +0 -212
- downsampler-0.1.0/src/downsampler/deferred.py +0 -357
- downsampler-0.1.0/src/downsampler/edges.py +0 -202
- downsampler-0.1.0/src/downsampler/fidelity/comparison.py +0 -343
- downsampler-0.1.0/src/downsampler/fidelity/visualization.py +0 -359
- downsampler-0.1.0/src/downsampler.egg-info/PKG-INFO +0 -246
- downsampler-0.1.0/src/downsampler.egg-info/requires.txt +0 -15
- downsampler-0.1.0/tests/test_aggregators.py +0 -83
- downsampler-0.1.0/tests/test_core.py +0 -115
- downsampler-0.1.0/tests/test_deferred.py +0 -173
- downsampler-0.1.0/tests/test_fidelity.py +0 -149
- downsampler-0.1.0/tests/test_gaps.py +0 -145
- downsampler-0.1.0/tests/test_lttb.py +0 -112
- {downsampler-0.1.0 → downsampler-0.3.0}/LICENSE +0 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/setup.cfg +0 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler.egg-info/dependency_links.txt +0 -0
- {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: downsampler
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
|
|
5
|
+
Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler
|
|
8
|
+
Project-URL: Repository, https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler
|
|
9
|
+
Keywords: timeseries,downsampling,lttb,pandas,visualization
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: numpy>=2.0
|
|
21
|
+
Requires-Dist: pandas>=1.3
|
|
22
|
+
Requires-Dist: lttbc>=0.3
|
|
23
|
+
Requires-Dist: scipy>=1.7
|
|
24
|
+
Requires-Dist: requests>=2.32.5
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# downsampler
|
|
28
|
+
|
|
29
|
+
[PyPI version](https://pypi.org/project/downsampler/)
|
|
30
|
+
[Python versions](https://pypi.org/project/downsampler/)
|
|
31
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
32
|
+
|
|
33
|
+
A Python package for time series DataFrame downsampling with LTTB, M4, multiple aggregation methods, gap handling, and fidelity testing.
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Multiple downsampling methods**:
|
|
38
|
+
- LTTB (visual fidelity)
|
|
39
|
+
- M4 (guaranteed extrema preservation)
|
|
40
|
+
- Traditional aggregations (mean, median, min, max)
|
|
41
|
+
- **Gap-aware processing**: Automatically detects and handles gaps in time series
|
|
42
|
+
- **Edge handling**: Flag, discard, or keep edge points
|
|
43
|
+
- **Multi-aggregate output**: Generate min/mean/max columns in a single call
|
|
44
|
+
- **Range-based downsampling**: Fetch data from external sources with automatic edge buffering
|
|
45
|
+
- **Multi-resolution pyramid**: Generate downsampled versions at multiple cadences in one call
|
|
46
|
+
- **Fidelity testing**: Compare methods and measure visual accuracy
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install downsampler
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
> **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
|
|
55
|
+
> Linux wheel was compiled against NumPy 1.x and fails to import under
|
|
56
|
+
> NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
|
|
57
|
+
> `pip install --no-binary lttbc downsampler`. Other Python versions and
|
|
58
|
+
> macOS have no prebuilt wheel and build from source automatically. With
|
|
59
|
+
> uv, this repo's `[tool.uv] no-binary-package` setting handles it.
|
|
60
|
+
|
|
61
|
+
## Quick Start
|
|
62
|
+
|
|
63
|
+
### Basic Downsampling
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import pandas as pd
|
|
67
|
+
from downsampler import downsample_dataframe
|
|
68
|
+
|
|
69
|
+
# Create sample data
|
|
70
|
+
df = pd.DataFrame(
|
|
71
|
+
{'temperature': range(1000)},
|
|
72
|
+
index=pd.date_range('2024-01-01', periods=1000, freq='1s')
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# Downsample to 1-minute cadence (default: mean)
|
|
76
|
+
result = downsample_dataframe(df, target_cadence='PT1M')
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Using Different Methods
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from downsampler import downsample_dataframe, DownsampleConfig, AggregationMethod
|
|
83
|
+
|
|
84
|
+
# Mean (default)
|
|
85
|
+
result = downsample_dataframe(df, '10min')
|
|
86
|
+
|
|
87
|
+
# Maximum
|
|
88
|
+
result = downsample_dataframe(df, '10min', method='max')
|
|
89
|
+
|
|
90
|
+
# LTTB for visual fidelity
|
|
91
|
+
config = DownsampleConfig(
|
|
92
|
+
method=AggregationMethod.LTTB,
|
|
93
|
+
lttb_target_column='temperature'
|
|
94
|
+
)
|
|
95
|
+
result = downsample_dataframe(df, '10min', config=config)
|
|
96
|
+
|
|
97
|
+
# M4 for guaranteed extrema preservation
|
|
98
|
+
result = downsample_dataframe(df, '10min', method='m4')
|
|
99
|
+
|
|
100
|
+
# M4 with collinearity filtering (reduces output size)
|
|
101
|
+
result = downsample_dataframe(df, '10min', method='m4', m4_collinearity_threshold=0.01)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Multi-Aggregate Downsampling
|
|
105
|
+
|
|
106
|
+
Create min/mean/max columns for visualization with error bands:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from downsampler import downsample_dataframe_multi_aggregate
|
|
110
|
+
|
|
111
|
+
result = downsample_dataframe_multi_aggregate(
|
|
112
|
+
df,
|
|
113
|
+
target_cadence='1min',
|
|
114
|
+
variables=['temperature', 'pressure'],
|
|
115
|
+
aggregations=['min', 'mean', 'max']
|
|
116
|
+
)
|
|
117
|
+
# Result has columns: temperature_min, temperature_mean, temperature_max, etc.
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Multi-Resolution Pyramid
|
|
121
|
+
|
|
122
|
+
Generate downsampled versions at multiple cadences for storage:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from downsampler import downsample_dataframe_resolutions
|
|
126
|
+
|
|
127
|
+
results = downsample_dataframe_resolutions(
|
|
128
|
+
df,
|
|
129
|
+
cadences=['1min', '5min', '15min', '1h'],
|
|
130
|
+
)
|
|
131
|
+
# Returns {Timedelta('0 days 00:01:00'): DataFrame, ...}
|
|
132
|
+
|
|
133
|
+
for cadence, result_df in results.items():
|
|
134
|
+
print(f"{cadence}: {len(result_df)} points")
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### M4 Downsampling (Extrema Preservation)
|
|
138
|
+
|
|
139
|
+
M4 guarantees exact preservation of minimum and maximum values, making it ideal for monitoring dashboards and alerting systems:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from downsampler import downsample_dataframe
|
|
143
|
+
|
|
144
|
+
# Basic M4 - preserves exact min/max
|
|
145
|
+
result = downsample_dataframe(df, '1min', method='m4')
|
|
146
|
+
|
|
147
|
+
# Verify extrema preservation
|
|
148
|
+
assert df['temperature'].min() == result['temperature'].min()
|
|
149
|
+
assert df['temperature'].max() == result['temperature'].max()
|
|
150
|
+
|
|
151
|
+
# M4 with deduplication (default, removes consecutive duplicates)
|
|
152
|
+
result = downsample_dataframe(df, '1min', method='m4', m4_deduplicate=True)
|
|
153
|
+
|
|
154
|
+
# M4 with collinearity filtering (reduces size on smooth data)
|
|
155
|
+
result = downsample_dataframe(df, '1min', method='m4', m4_collinearity_threshold=0.01)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**M4 Features:**
|
|
159
|
+
- Selects up to 4 points per bucket: first, last, min, max
|
|
160
|
+
- **Guaranteed** exact extrema preservation (no approximation)
|
|
161
|
+
- Variable output size (typically 2-4x reduction vs 10x for traditional methods)
|
|
162
|
+
- Deduplication: removes consecutive duplicates (20-50% reduction)
|
|
163
|
+
- Collinearity filtering: removes min/max points near first-last line (0-75% reduction)
|
|
164
|
+
- Superior peak detection compared to LTTB
|
|
165
|
+
|
|
166
|
+
**When to use M4:**
|
|
167
|
+
- Monitoring dashboards where missing a spike could be critical
|
|
168
|
+
- Alerting systems that need exact threshold crossings
|
|
169
|
+
- Pre-computing multiple cadences with controllable size/fidelity trade-offs
|
|
170
|
+
- Multi-variable sensor data where each variable's extrema matter
|
|
171
|
+
|
|
172
|
+
### Handling Gaps
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from downsampler import DownsampleConfig, downsample_dataframe
|
|
176
|
+
|
|
177
|
+
config = DownsampleConfig(
|
|
178
|
+
gap_threshold='5min' # Gaps > 5 min trigger segmentation
|
|
179
|
+
)
|
|
180
|
+
result = downsample_dataframe(df, '1min', config=config)
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Range-Based Downsampling
|
|
184
|
+
|
|
185
|
+
For data that needs to be fetched from an external source:
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
from downsampler import downsample_range
|
|
189
|
+
|
|
190
|
+
def fetch_from_api(start, end):
|
|
191
|
+
# Your data fetching logic here
|
|
192
|
+
return pd.DataFrame(...)
|
|
193
|
+
|
|
194
|
+
# Single fetch with automatic edge buffering
|
|
195
|
+
result = downsample_range(
|
|
196
|
+
fetcher=fetch_from_api,
|
|
197
|
+
output_start=pd.Timestamp('2024-01-01'),
|
|
198
|
+
output_end=pd.Timestamp('2024-01-02'),
|
|
199
|
+
target_cadence='1h'
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
# Batched mode for large ranges
|
|
203
|
+
result = downsample_range(
|
|
204
|
+
fetcher=fetch_from_api,
|
|
205
|
+
output_start=pd.Timestamp('2024-01-01'),
|
|
206
|
+
output_end=pd.Timestamp('2024-02-01'),
|
|
207
|
+
target_cadence='1h',
|
|
208
|
+
batch_size='P1D' # Process one day at a time
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Fidelity Comparison
|
|
213
|
+
|
|
214
|
+
Compare different methods to find the best one for your data:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from downsampler.fidelity import FidelityComparison, summary_table
|
|
218
|
+
|
|
219
|
+
comp = FidelityComparison(original_df, 'signal')
|
|
220
|
+
results = comp.compare('10s', store_downsampled=True)
|
|
221
|
+
|
|
222
|
+
print(summary_table(results))
|
|
223
|
+
# See examples/fidelity_comparison.py (marimo notebook) for interactive visualization
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Configuration Options
|
|
227
|
+
|
|
228
|
+
### DownsampleConfig
|
|
229
|
+
|
|
230
|
+
| Parameter | Type | Default | Description |
|
|
231
|
+
|-----------|------|---------|-------------|
|
|
232
|
+
| `method` | AggregationMethod | MEAN | Downsampling method |
|
|
233
|
+
| `lttb_target_column` | str | None | Column to optimize for LTTB |
|
|
234
|
+
| `m4_deduplicate` | bool | True | For M4: remove consecutive duplicates |
|
|
235
|
+
| `m4_collinearity_threshold` | float | None | For M4: filter collinear points (0.0-1.0) |
|
|
236
|
+
| `include_columns` | list[str] | [] | Columns to include (empty = all) |
|
|
237
|
+
| `exclude_columns` | list[str] | [] | Columns to exclude |
|
|
238
|
+
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
239
|
+
| `edge_handling` | EdgeHandling | KEEP | How to handle edges |
|
|
240
|
+
| `edge_window` | int | 2 | Points at each edge |
|
|
241
|
+
| `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
|
|
242
|
+
| `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
|
|
243
|
+
| `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
|
|
244
|
+
|
|
245
|
+
### Aggregation Methods
|
|
246
|
+
|
|
247
|
+
- `MEAN`: Arithmetic mean (best for general use)
|
|
248
|
+
- `MEDIAN`: Median (robust to outliers)
|
|
249
|
+
- `MIN`: Minimum value (preserves lows)
|
|
250
|
+
- `MAX`: Maximum value (preserves highs)
|
|
251
|
+
- `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
|
|
252
|
+
- `M4`: Min-Max-First-Last (guaranteed extrema preservation, best for monitoring/alerting)
|
|
253
|
+
|
|
254
|
+
### Edge Handling
|
|
255
|
+
|
|
256
|
+
- `KEEP`: Keep edge points as-is (default)
|
|
257
|
+
- `FLAG`: Add `_is_edge` column
|
|
258
|
+
- `DISCARD`: Remove edge points
|
|
259
|
+
|
|
260
|
+
## Examples
|
|
261
|
+
|
|
262
|
+
See the `examples/` directory for complete examples:
|
|
263
|
+
|
|
264
|
+
- `basic_downsampling.py`: Core downsampling features
|
|
265
|
+
- `multi_aggregate.py`: Creating min/mean/max columns
|
|
266
|
+
- `range_downsample.py`: Range-based downsampling with automatic edge buffering
|
|
267
|
+
- `fidelity_comparison.py`: Interactive fidelity comparison (marimo notebook)
|
|
268
|
+
|
|
269
|
+
### Running the fidelity comparison notebook
|
|
270
|
+
|
|
271
|
+
**Option 1 — Project install via uv** (best for development):
|
|
272
|
+
|
|
273
|
+
```bash
|
|
274
|
+
uv run --group dev marimo edit examples/fidelity_comparison.py
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
**Option 2 — Marimo sandbox** (self-contained, uses inline PEP 723 metadata):
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
marimo edit --sandbox examples/fidelity_comparison.py
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## API Reference
|
|
284
|
+
|
|
285
|
+
### DataFrame-Mode Functions
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
downsample_dataframe(df, target_cadence, config=None, **kwargs) -> DataFrame
|
|
289
|
+
downsample_dataframe_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
|
|
290
|
+
downsample_dataframe_resolutions(df, cadences, config=None, **kwargs) -> dict[Timedelta, DataFrame]
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
### Range-Mode Functions
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
downsample_range(fetcher, output_start, output_end, target_cadence, config=None, batch_size=None, ...) -> DataFrame
|
|
297
|
+
downsample_range_multi_aggregate(fetcher, output_start, output_end, target_cadence, variables, ...) -> DataFrame
|
|
298
|
+
downsample_range_resolutions(fetcher, output_start, output_end, cadences, config=None, ...) -> dict[Timedelta, DataFrame]
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
### Low-Level Functions
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
downsample_lttb(df, target_column, target_cadence, ...) -> DataFrame
|
|
305
|
+
downsample_m4(df, target_cadence, deduplicate=True, collinearity_threshold=None, ...) -> DataFrame
|
|
306
|
+
downsample_mean(df, target_cadence, ...) -> DataFrame
|
|
307
|
+
downsample_median(df, target_cadence, ...) -> DataFrame
|
|
308
|
+
downsample_min(df, target_cadence, ...) -> DataFrame
|
|
309
|
+
downsample_max(df, target_cadence, ...) -> DataFrame
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Gap Functions
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
find_gap_indices(df, timedelta_max_gap) -> Series
|
|
316
|
+
groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
|
|
317
|
+
split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
|
|
318
|
+
mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
### Fidelity Functions
|
|
322
|
+
|
|
323
|
+
```python
|
|
324
|
+
compute_metrics(original, downsampled, column) -> FidelityMetrics
|
|
325
|
+
FidelityComparison(original_df, column).compare(cadences, methods, ...) -> list[ComparisonResult]
|
|
326
|
+
summary_table(results) -> DataFrame
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
## License
|
|
330
|
+
|
|
331
|
+
MIT License - see LICENSE file for details.
|
|
332
|
+
|
|
333
|
+
## Contributing
|
|
334
|
+
|
|
335
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
# downsampler
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/downsampler/)
|
|
4
|
+
[Python versions](https://pypi.org/project/downsampler/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
A Python package for time series DataFrame downsampling with LTTB, M4, multiple aggregation methods, gap handling, and fidelity testing.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Multiple downsampling methods**:
|
|
12
|
+
- LTTB (visual fidelity)
|
|
13
|
+
- M4 (guaranteed extrema preservation)
|
|
14
|
+
- Traditional aggregations (mean, median, min, max)
|
|
15
|
+
- **Gap-aware processing**: Automatically detects and handles gaps in time series
|
|
16
|
+
- **Edge handling**: Flag, discard, or keep edge points
|
|
17
|
+
- **Multi-aggregate output**: Generate min/mean/max columns in a single call
|
|
18
|
+
- **Range-based downsampling**: Fetch data from external sources with automatic edge buffering
|
|
19
|
+
- **Multi-resolution pyramid**: Generate downsampled versions at multiple cadences in one call
|
|
20
|
+
- **Fidelity testing**: Compare methods and measure visual accuracy
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install downsampler
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
> **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
|
|
29
|
+
> Linux wheel was compiled against NumPy 1.x and fails to import under
|
|
30
|
+
> NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
|
|
31
|
+
> `pip install --no-binary lttbc downsampler`. Other Python versions and
|
|
32
|
+
> macOS have no prebuilt wheel and build from source automatically. With
|
|
33
|
+
> uv, this repo's `[tool.uv] no-binary-package` setting handles it.
|
|
34
|
+
|
|
35
|
+
## Quick Start
|
|
36
|
+
|
|
37
|
+
### Basic Downsampling
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
import pandas as pd
|
|
41
|
+
from downsampler import downsample_dataframe
|
|
42
|
+
|
|
43
|
+
# Create sample data
|
|
44
|
+
df = pd.DataFrame(
|
|
45
|
+
{'temperature': range(1000)},
|
|
46
|
+
index=pd.date_range('2024-01-01', periods=1000, freq='1s')
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Downsample to 1-minute cadence (default: mean)
|
|
50
|
+
result = downsample_dataframe(df, target_cadence='PT1M')
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Using Different Methods
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from downsampler import downsample_dataframe, DownsampleConfig, AggregationMethod
|
|
57
|
+
|
|
58
|
+
# Mean (default)
|
|
59
|
+
result = downsample_dataframe(df, '10min')
|
|
60
|
+
|
|
61
|
+
# Maximum
|
|
62
|
+
result = downsample_dataframe(df, '10min', method='max')
|
|
63
|
+
|
|
64
|
+
# LTTB for visual fidelity
|
|
65
|
+
config = DownsampleConfig(
|
|
66
|
+
method=AggregationMethod.LTTB,
|
|
67
|
+
lttb_target_column='temperature'
|
|
68
|
+
)
|
|
69
|
+
result = downsample_dataframe(df, '10min', config=config)
|
|
70
|
+
|
|
71
|
+
# M4 for guaranteed extrema preservation
|
|
72
|
+
result = downsample_dataframe(df, '10min', method='m4')
|
|
73
|
+
|
|
74
|
+
# M4 with collinearity filtering (reduces output size)
|
|
75
|
+
result = downsample_dataframe(df, '10min', method='m4', m4_collinearity_threshold=0.01)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Multi-Aggregate Downsampling
|
|
79
|
+
|
|
80
|
+
Create min/mean/max columns for visualization with error bands:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from downsampler import downsample_dataframe_multi_aggregate
|
|
84
|
+
|
|
85
|
+
result = downsample_dataframe_multi_aggregate(
|
|
86
|
+
df,
|
|
87
|
+
target_cadence='1min',
|
|
88
|
+
variables=['temperature', 'pressure'],
|
|
89
|
+
aggregations=['min', 'mean', 'max']
|
|
90
|
+
)
|
|
91
|
+
# Result has columns: temperature_min, temperature_mean, temperature_max, etc.
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Multi-Resolution Pyramid
|
|
95
|
+
|
|
96
|
+
Generate downsampled versions at multiple cadences for storage:
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from downsampler import downsample_dataframe_resolutions
|
|
100
|
+
|
|
101
|
+
results = downsample_dataframe_resolutions(
|
|
102
|
+
df,
|
|
103
|
+
cadences=['1min', '5min', '15min', '1h'],
|
|
104
|
+
)
|
|
105
|
+
# Returns {Timedelta('0 days 00:01:00'): DataFrame, ...}
|
|
106
|
+
|
|
107
|
+
for cadence, result_df in results.items():
|
|
108
|
+
print(f"{cadence}: {len(result_df)} points")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### M4 Downsampling (Extrema Preservation)
|
|
112
|
+
|
|
113
|
+
M4 guarantees exact preservation of minimum and maximum values, making it ideal for monitoring dashboards and alerting systems:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from downsampler import downsample_dataframe
|
|
117
|
+
|
|
118
|
+
# Basic M4 - preserves exact min/max
|
|
119
|
+
result = downsample_dataframe(df, '1min', method='m4')
|
|
120
|
+
|
|
121
|
+
# Verify extrema preservation
|
|
122
|
+
assert df['temperature'].min() == result['temperature'].min()
|
|
123
|
+
assert df['temperature'].max() == result['temperature'].max()
|
|
124
|
+
|
|
125
|
+
# M4 with deduplication (default, removes consecutive duplicates)
|
|
126
|
+
result = downsample_dataframe(df, '1min', method='m4', m4_deduplicate=True)
|
|
127
|
+
|
|
128
|
+
# M4 with collinearity filtering (reduces size on smooth data)
|
|
129
|
+
result = downsample_dataframe(df, '1min', method='m4', m4_collinearity_threshold=0.01)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
**M4 Features:**
|
|
133
|
+
- Selects up to 4 points per bucket: first, last, min, max
|
|
134
|
+
- **Guaranteed** exact extrema preservation (no approximation)
|
|
135
|
+
- Variable output size (typically 2-4x reduction vs 10x for traditional methods)
|
|
136
|
+
- Deduplication: removes consecutive duplicates (20-50% reduction)
|
|
137
|
+
- Collinearity filtering: removes min/max points near first-last line (0-75% reduction)
|
|
138
|
+
- Superior peak detection compared to LTTB
|
|
139
|
+
|
|
140
|
+
**When to use M4:**
|
|
141
|
+
- Monitoring dashboards where missing a spike could be critical
|
|
142
|
+
- Alerting systems that need exact threshold crossings
|
|
143
|
+
- Pre-computing multiple cadences with controllable size/fidelity trade-offs
|
|
144
|
+
- Multi-variable sensor data where each variable's extrema matter
|
|
145
|
+
|
|
146
|
+
### Handling Gaps
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from downsampler import DownsampleConfig, downsample_dataframe
|
|
150
|
+
|
|
151
|
+
config = DownsampleConfig(
|
|
152
|
+
gap_threshold='5min' # Gaps > 5 min trigger segmentation
|
|
153
|
+
)
|
|
154
|
+
result = downsample_dataframe(df, '1min', config=config)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Range-Based Downsampling
|
|
158
|
+
|
|
159
|
+
For data that needs to be fetched from an external source:
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from downsampler import downsample_range
|
|
163
|
+
|
|
164
|
+
def fetch_from_api(start, end):
|
|
165
|
+
# Your data fetching logic here
|
|
166
|
+
return pd.DataFrame(...)
|
|
167
|
+
|
|
168
|
+
# Single fetch with automatic edge buffering
|
|
169
|
+
result = downsample_range(
|
|
170
|
+
fetcher=fetch_from_api,
|
|
171
|
+
output_start=pd.Timestamp('2024-01-01'),
|
|
172
|
+
output_end=pd.Timestamp('2024-01-02'),
|
|
173
|
+
target_cadence='1h'
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# Batched mode for large ranges
|
|
177
|
+
result = downsample_range(
|
|
178
|
+
fetcher=fetch_from_api,
|
|
179
|
+
output_start=pd.Timestamp('2024-01-01'),
|
|
180
|
+
output_end=pd.Timestamp('2024-02-01'),
|
|
181
|
+
target_cadence='1h',
|
|
182
|
+
batch_size='P1D' # Process one day at a time
|
|
183
|
+
)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Fidelity Comparison
|
|
187
|
+
|
|
188
|
+
Compare different methods to find the best one for your data:
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
from downsampler.fidelity import FidelityComparison, summary_table
|
|
192
|
+
|
|
193
|
+
comp = FidelityComparison(original_df, 'signal')
|
|
194
|
+
results = comp.compare('10s', store_downsampled=True)
|
|
195
|
+
|
|
196
|
+
print(summary_table(results))
|
|
197
|
+
# See examples/fidelity_comparison.py (marimo notebook) for interactive visualization
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## Configuration Options
|
|
201
|
+
|
|
202
|
+
### DownsampleConfig
|
|
203
|
+
|
|
204
|
+
| Parameter | Type | Default | Description |
|
|
205
|
+
|-----------|------|---------|-------------|
|
|
206
|
+
| `method` | AggregationMethod | MEAN | Downsampling method |
|
|
207
|
+
| `lttb_target_column` | str | None | Column to optimize for LTTB |
|
|
208
|
+
| `m4_deduplicate` | bool | True | For M4: remove consecutive duplicates |
|
|
209
|
+
| `m4_collinearity_threshold` | float | None | For M4: filter collinear points (0.0-1.0) |
|
|
210
|
+
| `include_columns` | list[str] | [] | Columns to include (empty = all) |
|
|
211
|
+
| `exclude_columns` | list[str] | [] | Columns to exclude |
|
|
212
|
+
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
213
|
+
| `edge_handling` | EdgeHandling | KEEP | How to handle edges |
|
|
214
|
+
| `edge_window` | int | 2 | Points at each edge |
|
|
215
|
+
| `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
|
|
216
|
+
| `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
|
|
217
|
+
| `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
|
|
218
|
+
|
|
219
|
+
### Aggregation Methods
|
|
220
|
+
|
|
221
|
+
- `MEAN`: Arithmetic mean (best for general use)
|
|
222
|
+
- `MEDIAN`: Median (robust to outliers)
|
|
223
|
+
- `MIN`: Minimum value (preserves lows)
|
|
224
|
+
- `MAX`: Maximum value (preserves highs)
|
|
225
|
+
- `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
|
|
226
|
+
- `M4`: Min-Max-First-Last (guaranteed extrema preservation, best for monitoring/alerting)
|
|
227
|
+
|
|
228
|
+
### Edge Handling
|
|
229
|
+
|
|
230
|
+
- `KEEP`: Keep edge points as-is (default)
|
|
231
|
+
- `FLAG`: Add `_is_edge` column
|
|
232
|
+
- `DISCARD`: Remove edge points
|
|
233
|
+
|
|
234
|
+
## Examples
|
|
235
|
+
|
|
236
|
+
See the `examples/` directory for complete examples:
|
|
237
|
+
|
|
238
|
+
- `basic_downsampling.py`: Core downsampling features
|
|
239
|
+
- `multi_aggregate.py`: Creating min/mean/max columns
|
|
240
|
+
- `range_downsample.py`: Range-based downsampling with automatic edge buffering
|
|
241
|
+
- `fidelity_comparison.py`: Interactive fidelity comparison (marimo notebook)
|
|
242
|
+
|
|
243
|
+
### Running the fidelity comparison notebook
|
|
244
|
+
|
|
245
|
+
**Option 1 — Project install via uv** (best for development):
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
uv run --group dev marimo edit examples/fidelity_comparison.py
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
**Option 2 — Marimo sandbox** (self-contained, uses inline PEP 723 metadata):
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
marimo edit --sandbox examples/fidelity_comparison.py
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## API Reference
|
|
258
|
+
|
|
259
|
+
### DataFrame-Mode Functions
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
downsample_dataframe(df, target_cadence, config=None, **kwargs) -> DataFrame
|
|
263
|
+
downsample_dataframe_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
|
|
264
|
+
downsample_dataframe_resolutions(df, cadences, config=None, **kwargs) -> dict[Timedelta, DataFrame]
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### Range-Mode Functions
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
downsample_range(fetcher, output_start, output_end, target_cadence, config=None, batch_size=None, ...) -> DataFrame
|
|
271
|
+
downsample_range_multi_aggregate(fetcher, output_start, output_end, target_cadence, variables, ...) -> DataFrame
|
|
272
|
+
downsample_range_resolutions(fetcher, output_start, output_end, cadences, config=None, ...) -> dict[Timedelta, DataFrame]
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
### Low-Level Functions
|
|
276
|
+
|
|
277
|
+
```python
|
|
278
|
+
downsample_lttb(df, target_column, target_cadence, ...) -> DataFrame
|
|
279
|
+
downsample_m4(df, target_cadence, deduplicate=True, collinearity_threshold=None, ...) -> DataFrame
|
|
280
|
+
downsample_mean(df, target_cadence, ...) -> DataFrame
|
|
281
|
+
downsample_median(df, target_cadence, ...) -> DataFrame
|
|
282
|
+
downsample_min(df, target_cadence, ...) -> DataFrame
|
|
283
|
+
downsample_max(df, target_cadence, ...) -> DataFrame
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
### Gap Functions
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
find_gap_indices(df, timedelta_max_gap) -> Series
|
|
290
|
+
groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
|
|
291
|
+
split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
|
|
292
|
+
mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Fidelity Functions
|
|
296
|
+
|
|
297
|
+
```python
|
|
298
|
+
compute_metrics(original, downsampled, column) -> FidelityMetrics
|
|
299
|
+
FidelityComparison(original_df, column).compare(cadences, methods, ...) -> list[ComparisonResult]
|
|
300
|
+
summary_table(results) -> DataFrame
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
## License
|
|
304
|
+
|
|
305
|
+
MIT License - see LICENSE file for details.
|
|
306
|
+
|
|
307
|
+
## Contributing
|
|
308
|
+
|
|
309
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "downsampler"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.3.0"
|
|
8
8
|
description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
|
|
9
9
|
readme = "README.md"
|
|
10
|
-
license =
|
|
11
|
-
requires-python = ">=3.10"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
12
|
authors = [
|
|
13
13
|
{name = "Eelco Doornbos", email = "eelco.doornbos@knmi.nl"}
|
|
14
14
|
]
|
|
@@ -16,28 +16,36 @@ keywords = ["timeseries", "downsampling", "lttb", "pandas", "visualization"]
|
|
|
16
16
|
classifiers = [
|
|
17
17
|
"Development Status :: 3 - Alpha",
|
|
18
18
|
"Intended Audience :: Science/Research",
|
|
19
|
-
"License :: OSI Approved :: MIT License",
|
|
20
19
|
"Programming Language :: Python :: 3",
|
|
21
|
-
"Programming Language :: Python :: 3.10",
|
|
22
20
|
"Programming Language :: Python :: 3.11",
|
|
23
21
|
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
24
23
|
"Topic :: Scientific/Engineering",
|
|
25
24
|
]
|
|
26
25
|
dependencies = [
|
|
27
|
-
"numpy>=
|
|
26
|
+
"numpy>=2.0",
|
|
28
27
|
"pandas>=1.3",
|
|
29
|
-
"
|
|
28
|
+
"lttbc>=0.3",
|
|
30
29
|
"scipy>=1.7",
|
|
30
|
+
"requests>=2.32.5",
|
|
31
31
|
]
|
|
32
32
|
|
|
33
|
-
[project.optional-dependencies]
|
|
34
|
-
viz = ["matplotlib>=3.5", "altair>=5.0"]
|
|
33
|
+
[dependency-groups]
|
|
35
34
|
test = ["pytest>=7.0", "pytest-cov>=4.0"]
|
|
36
|
-
dev = ["
|
|
35
|
+
dev = [{include-group = "test"}, "marimo"]
|
|
37
36
|
|
|
38
37
|
[project.urls]
|
|
39
|
-
Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
|
|
40
|
-
Repository = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
|
|
38
|
+
Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
|
|
39
|
+
Repository = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
|
|
40
|
+
|
|
41
|
+
[tool.uv]
|
|
42
|
+
# lttbc's prebuilt cp311 Linux wheel was compiled against NumPy 1.x and
|
|
43
|
+
# crashes on import under NumPy 2 ("numpy.core.multiarray failed to
|
|
44
|
+
# import"). Building from source compiles it against the installed NumPy.
|
|
45
|
+
# Other Python versions ship no Linux wheel and always build from source.
|
|
46
|
+
# Same workaround as in spaceweather-data-pipelines; drop when lttbc
|
|
47
|
+
# publishes NumPy-2 wheels.
|
|
48
|
+
no-binary-package = ["lttbc"]
|
|
41
49
|
|
|
42
50
|
[tool.setuptools.packages.find]
|
|
43
51
|
where = ["src"]
|