downsampler 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. downsampler-0.2.0/PKG-INFO +334 -0
  2. downsampler-0.2.0/README.md +302 -0
  3. {downsampler-0.1.0 → downsampler-0.2.0}/pyproject.toml +10 -11
  4. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/__init__.py +36 -15
  5. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/aggregators.py +80 -78
  6. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/config.py +22 -10
  7. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/core.py +71 -15
  8. downsampler-0.2.0/src/downsampler/edges.py +89 -0
  9. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/fidelity/__init__.py +2 -9
  10. downsampler-0.2.0/src/downsampler/fidelity/comparison.py +150 -0
  11. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/fidelity/metrics.py +9 -84
  12. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/gaps.py +129 -19
  13. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/lttb.py +36 -24
  14. downsampler-0.2.0/src/downsampler/m4.py +396 -0
  15. downsampler-0.2.0/src/downsampler/ranged.py +457 -0
  16. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler/utils.py +26 -2
  17. downsampler-0.2.0/src/downsampler.egg-info/PKG-INFO +334 -0
  18. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler.egg-info/SOURCES.txt +5 -4
  19. downsampler-0.2.0/src/downsampler.egg-info/requires.txt +13 -0
  20. downsampler-0.2.0/tests/test_aggregators.py +237 -0
  21. downsampler-0.2.0/tests/test_core.py +211 -0
  22. {downsampler-0.1.0 → downsampler-0.2.0}/tests/test_edges.py +0 -56
  23. downsampler-0.2.0/tests/test_fidelity.py +160 -0
  24. downsampler-0.2.0/tests/test_gaps.py +380 -0
  25. {downsampler-0.1.0 → downsampler-0.2.0}/tests/test_lttb.py +66 -0
  26. downsampler-0.2.0/tests/test_m4.py +493 -0
  27. downsampler-0.2.0/tests/test_ranged.py +262 -0
  28. downsampler-0.1.0/PKG-INFO +0 -246
  29. downsampler-0.1.0/README.md +0 -212
  30. downsampler-0.1.0/src/downsampler/deferred.py +0 -357
  31. downsampler-0.1.0/src/downsampler/edges.py +0 -202
  32. downsampler-0.1.0/src/downsampler/fidelity/comparison.py +0 -343
  33. downsampler-0.1.0/src/downsampler/fidelity/visualization.py +0 -359
  34. downsampler-0.1.0/src/downsampler.egg-info/PKG-INFO +0 -246
  35. downsampler-0.1.0/src/downsampler.egg-info/requires.txt +0 -15
  36. downsampler-0.1.0/tests/test_aggregators.py +0 -83
  37. downsampler-0.1.0/tests/test_core.py +0 -115
  38. downsampler-0.1.0/tests/test_deferred.py +0 -173
  39. downsampler-0.1.0/tests/test_fidelity.py +0 -149
  40. downsampler-0.1.0/tests/test_gaps.py +0 -145
  41. {downsampler-0.1.0 → downsampler-0.2.0}/LICENSE +0 -0
  42. {downsampler-0.1.0 → downsampler-0.2.0}/setup.cfg +0 -0
  43. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler.egg-info/dependency_links.txt +0 -0
  44. {downsampler-0.1.0 → downsampler-0.2.0}/src/downsampler.egg-info/top_level.txt +0 -0
@@ -0,0 +1,334 @@
1
+ Metadata-Version: 2.4
2
+ Name: downsampler
3
+ Version: 0.2.0
4
+ Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
5
+ Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler
8
+ Project-URL: Repository, https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler
9
+ Keywords: timeseries,downsampling,lttb,pandas,visualization
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy>=2.0
21
+ Requires-Dist: pandas>=1.3
22
+ Requires-Dist: lttbc>=0.3
23
+ Requires-Dist: scipy>=1.7
24
+ Requires-Dist: requests>=2.32.5
25
+ Provides-Extra: test
26
+ Requires-Dist: pytest>=7.0; extra == "test"
27
+ Requires-Dist: pytest-cov>=4.0; extra == "test"
28
+ Provides-Extra: dev
29
+ Requires-Dist: downsampler[test]; extra == "dev"
30
+ Requires-Dist: marimo; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # downsampler
34
+
35
+ [![PyPI](https://img.shields.io/pypi/v/downsampler)](https://pypi.org/project/downsampler/)
36
+ [![Python](https://img.shields.io/pypi/pyversions/downsampler)](https://pypi.org/project/downsampler/)
37
+ [![License](https://img.shields.io/pypi/l/downsampler)](https://opensource.org/licenses/MIT)
38
+
39
+ A Python package for time series DataFrame downsampling with LTTB, M4, multiple aggregation methods, gap handling, and fidelity testing.
40
+
41
+ ## Features
42
+
43
+ - **Multiple downsampling methods**:
44
+ - LTTB (visual fidelity)
45
+ - M4 (guaranteed extrema preservation)
46
+ - Traditional aggregations (mean, median, min, max)
47
+ - **Gap-aware processing**: Automatically detects and handles gaps in time series
48
+ - **Edge handling**: Flag, discard, or keep edge points
49
+ - **Multi-aggregate output**: Generate min/mean/max columns in a single call
50
+ - **Range-based downsampling**: Fetch data from external sources with automatic edge buffering
51
+ - **Multi-resolution pyramid**: Generate downsampled versions at multiple cadences in one call
52
+ - **Fidelity testing**: Compare methods and measure visual accuracy
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install downsampler
58
+ ```
59
+
60
+ ## Quick Start
61
+
62
+ ### Basic Downsampling
63
+
64
+ ```python
65
+ import pandas as pd
66
+ from downsampler import downsample_dataframe
67
+
68
+ # Create sample data
69
+ df = pd.DataFrame(
70
+ {'temperature': range(1000)},
71
+ index=pd.date_range('2024-01-01', periods=1000, freq='1s')
72
+ )
73
+
74
+ # Downsample to 1-minute cadence (default: mean)
75
+ result = downsample_dataframe(df, target_cadence='PT1M')
76
+ ```
77
+
78
+ ### Using Different Methods
79
+
80
+ ```python
81
+ from downsampler import downsample_dataframe, DownsampleConfig, AggregationMethod
82
+
83
+ # Mean (default)
84
+ result = downsample_dataframe(df, '10min')
85
+
86
+ # Maximum
87
+ result = downsample_dataframe(df, '10min', method='max')
88
+
89
+ # LTTB for visual fidelity
90
+ config = DownsampleConfig(
91
+ method=AggregationMethod.LTTB,
92
+ lttb_target_column='temperature'
93
+ )
94
+ result = downsample_dataframe(df, '10min', config=config)
95
+
96
+ # M4 for guaranteed extrema preservation
97
+ result = downsample_dataframe(df, '10min', method='m4')
98
+
99
+ # M4 with collinearity filtering (reduces output size)
100
+ result = downsample_dataframe(df, '10min', method='m4', m4_collinearity_threshold=0.01)
101
+ ```
102
+
103
+ ### Multi-Aggregate Downsampling
104
+
105
+ Create min/mean/max columns for visualization with error bands:
106
+
107
+ ```python
108
+ from downsampler import downsample_dataframe_multi_aggregate
109
+
110
+ result = downsample_dataframe_multi_aggregate(
111
+ df,
112
+ target_cadence='1min',
113
+ variables=['temperature', 'pressure'],
114
+ aggregations=['min', 'mean', 'max']
115
+ )
116
+ # Result has columns: temperature_min, temperature_mean, temperature_max, etc.
117
+ ```
118
+
119
+ ### Multi-Resolution Pyramid
120
+
121
+ Generate downsampled versions at multiple cadences for storage:
122
+
123
+ ```python
124
+ from downsampler import downsample_dataframe_resolutions
125
+
126
+ results = downsample_dataframe_resolutions(
127
+ df,
128
+ cadences=['1min', '5min', '15min', '1h'],
129
+ )
130
+ # Returns {Timedelta('0 days 00:01:00'): DataFrame, ...}
131
+
132
+ for cadence, result_df in results.items():
133
+ print(f"{cadence}: {len(result_df)} points")
134
+ ```
135
+
136
+ ### M4 Downsampling (Extrema Preservation)
137
+
138
+ M4 guarantees exact preservation of minimum and maximum values, making it ideal for monitoring dashboards and alerting systems:
139
+
140
+ ```python
141
+ from downsampler import downsample_dataframe
142
+
143
+ # Basic M4 - preserves exact min/max
144
+ result = downsample_dataframe(df, '1min', method='m4')
145
+
146
+ # Verify extrema preservation
147
+ assert df['temperature'].min() == result['temperature'].min()
148
+ assert df['temperature'].max() == result['temperature'].max()
149
+
150
+ # M4 with deduplication (default, removes consecutive duplicates)
151
+ result = downsample_dataframe(df, '1min', method='m4', m4_deduplicate=True)
152
+
153
+ # M4 with collinearity filtering (reduces size on smooth data)
154
+ result = downsample_dataframe(df, '1min', method='m4', m4_collinearity_threshold=0.01)
155
+ ```
156
+
157
+ **M4 Features:**
158
+ - Selects up to 4 points per bucket: first, last, min, max
159
+ - **Guaranteed** exact extrema preservation (no approximation)
160
+ - Variable output size (typically 2-4x reduction vs 10x for traditional methods)
161
+ - Deduplication: removes consecutive duplicates (20-50% reduction)
162
+ - Collinearity filtering: removes min/max points that lie close to the line between a bucket's first and last points (0-75% reduction)
163
+ - Superior peak detection compared to LTTB
164
+
165
+ **When to use M4:**
166
+ - Monitoring dashboards where missing a spike could be critical
167
+ - Alerting systems that need exact threshold crossings
168
+ - Pre-computing multiple cadences with controllable size/fidelity trade-offs
169
+ - Multi-variable sensor data where each variable's extrema matter
170
+
171
+ ### Handling Gaps
172
+
173
+ ```python
174
+ from downsampler import DownsampleConfig
175
+
176
+ config = DownsampleConfig(
177
+ gap_threshold='5min' # Gaps > 5 min trigger segmentation
178
+ )
179
+ result = downsample_dataframe(df, '1min', config=config)
180
+ ```
181
+
182
+ ### Range-Based Downsampling
183
+
184
+ For data that needs to be fetched from an external source:
185
+
186
+ ```python
187
+ from downsampler import downsample_range
188
+
189
+ def fetch_from_api(start, end):
190
+ # Your data fetching logic here
191
+ return pd.DataFrame(...)
192
+
193
+ # Single fetch with automatic edge buffering
194
+ result = downsample_range(
195
+ fetcher=fetch_from_api,
196
+ output_start=pd.Timestamp('2024-01-01'),
197
+ output_end=pd.Timestamp('2024-01-02'),
198
+ target_cadence='1h'
199
+ )
200
+
201
+ # Batched mode for large ranges
202
+ result = downsample_range(
203
+ fetcher=fetch_from_api,
204
+ output_start=pd.Timestamp('2024-01-01'),
205
+ output_end=pd.Timestamp('2024-02-01'),
206
+ target_cadence='1h',
207
+ batch_size='P1D' # Process one day at a time
208
+ )
209
+ ```
210
+
211
+ ### Fidelity Comparison
212
+
213
+ Compare different methods to find the best one for your data:
214
+
215
+ ```python
216
+ from downsampler.fidelity import FidelityComparison, summary_table
217
+
218
+ comp = FidelityComparison(original_df, 'signal')
219
+ results = comp.compare('10s', store_downsampled=True)
220
+
221
+ print(summary_table(results))
222
+ # See examples/fidelity_comparison.py (marimo notebook) for interactive visualization
223
+ ```
224
+
225
+ ## Configuration Options
226
+
227
+ ### DownsampleConfig
228
+
229
+ | Parameter | Type | Default | Description |
230
+ |-----------|------|---------|-------------|
231
+ | `method` | AggregationMethod | MEAN | Downsampling method |
232
+ | `lttb_target_column` | str | None | Column to optimize for LTTB |
233
+ | `m4_deduplicate` | bool | True | For M4: remove consecutive duplicates |
234
+ | `m4_collinearity_threshold` | float | None | For M4: filter collinear points (0.0-1.0) |
235
+ | `include_columns` | list[str] | [] | Columns to include (empty = all) |
236
+ | `exclude_columns` | list[str] | [] | Columns to exclude |
237
+ | `gap_threshold` | str/Timedelta | "auto" | Minimum time between consecutive samples that counts as a gap |
238
+ | `edge_handling` | EdgeHandling | KEEP | How to handle edges |
239
+ | `edge_window` | int | 2 | Points at each edge |
240
+ | `min_points_per_segment` | int | 3 | Min points for processing |
241
+ | `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
242
+ | `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
243
+
244
+ ### Aggregation Methods
245
+
246
+ - `MEAN`: Arithmetic mean (best for general use)
247
+ - `MEDIAN`: Median (robust to outliers)
248
+ - `MIN`: Minimum value (preserves lows)
249
+ - `MAX`: Maximum value (preserves highs)
250
+ - `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
251
+ - `M4`: Min-Max-First-Last (guaranteed extrema preservation, best for monitoring/alerting)
252
+
253
+ ### Edge Handling
254
+
255
+ - `KEEP`: Keep edge points as-is (default)
256
+ - `FLAG`: Add `_is_edge` column
257
+ - `DISCARD`: Remove edge points
258
+
259
+ ## Examples
260
+
261
+ See the `examples/` directory for complete examples:
262
+
263
+ - `basic_downsampling.py`: Core downsampling features
264
+ - `multi_aggregate.py`: Creating min/mean/max columns
265
+ - `range_downsample.py`: Range-based downsampling with automatic edge buffering
266
+ - `fidelity_comparison.py`: Interactive fidelity comparison (marimo notebook)
267
+
268
+ ### Running the fidelity comparison notebook
269
+
270
+ **Option 1 — Project install via uv** (best for development):
271
+
272
+ ```bash
273
+ uv run --extra dev marimo edit examples/fidelity_comparison.py
274
+ ```
275
+
276
+ **Option 2 — Marimo sandbox** (self-contained, uses inline PEP 723 metadata):
277
+
278
+ ```bash
279
+ marimo edit --sandbox examples/fidelity_comparison.py
280
+ ```
281
+
282
+ ## API Reference
283
+
284
+ ### DataFrame-Mode Functions
285
+
286
+ ```python
287
+ downsample_dataframe(df, target_cadence, config=None, **kwargs) -> DataFrame
288
+ downsample_dataframe_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
289
+ downsample_dataframe_resolutions(df, cadences, config=None, **kwargs) -> dict[Timedelta, DataFrame]
290
+ ```
291
+
292
+ ### Range-Mode Functions
293
+
294
+ ```python
295
+ downsample_range(fetcher, output_start, output_end, target_cadence, config=None, batch_size=None, ...) -> DataFrame
296
+ downsample_range_multi_aggregate(fetcher, output_start, output_end, target_cadence, variables, ...) -> DataFrame
297
+ downsample_range_resolutions(fetcher, output_start, output_end, cadences, config=None, ...) -> dict[Timedelta, DataFrame]
298
+ ```
299
+
300
+ ### Low-Level Functions
301
+
302
+ ```python
303
+ downsample_lttb(df, target_column, target_cadence, ...) -> DataFrame
304
+ downsample_m4(df, target_cadence, deduplicate=True, collinearity_threshold=None, ...) -> DataFrame
305
+ downsample_mean(df, target_cadence, ...) -> DataFrame
306
+ downsample_median(df, target_cadence, ...) -> DataFrame
307
+ downsample_min(df, target_cadence, ...) -> DataFrame
308
+ downsample_max(df, target_cadence, ...) -> DataFrame
309
+ ```
310
+
311
+ ### Gap Functions
312
+
313
+ ```python
314
+ find_gap_indices(df, timedelta_max_gap) -> Series
315
+ groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
316
+ split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
317
+ mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
318
+ ```
319
+
320
+ ### Fidelity Functions
321
+
322
+ ```python
323
+ compute_metrics(original, downsampled, column) -> FidelityMetrics
324
+ FidelityComparison(original_df, column).compare(cadences, methods, ...) -> list[ComparisonResult]
325
+ summary_table(results) -> DataFrame
326
+ ```
327
+
328
+ ## License
329
+
330
+ MIT License - see LICENSE file for details.
331
+
332
+ ## Contributing
333
+
334
+ Contributions are welcome! Please feel free to submit issues and pull requests.
@@ -0,0 +1,302 @@
1
+ # downsampler
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/downsampler)](https://pypi.org/project/downsampler/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/downsampler)](https://pypi.org/project/downsampler/)
5
+ [![License](https://img.shields.io/pypi/l/downsampler)](https://opensource.org/licenses/MIT)
6
+
7
+ A Python package for time series DataFrame downsampling with LTTB, M4, multiple aggregation methods, gap handling, and fidelity testing.
8
+
9
+ ## Features
10
+
11
+ - **Multiple downsampling methods**:
12
+ - LTTB (visual fidelity)
13
+ - M4 (guaranteed extrema preservation)
14
+ - Traditional aggregations (mean, median, min, max)
15
+ - **Gap-aware processing**: Automatically detects and handles gaps in time series
16
+ - **Edge handling**: Flag, discard, or keep edge points
17
+ - **Multi-aggregate output**: Generate min/mean/max columns in a single call
18
+ - **Range-based downsampling**: Fetch data from external sources with automatic edge buffering
19
+ - **Multi-resolution pyramid**: Generate downsampled versions at multiple cadences in one call
20
+ - **Fidelity testing**: Compare methods and measure visual accuracy
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install downsampler
26
+ ```
27
+
28
+ ## Quick Start
29
+
30
+ ### Basic Downsampling
31
+
32
+ ```python
33
+ import pandas as pd
34
+ from downsampler import downsample_dataframe
35
+
36
+ # Create sample data
37
+ df = pd.DataFrame(
38
+ {'temperature': range(1000)},
39
+ index=pd.date_range('2024-01-01', periods=1000, freq='1s')
40
+ )
41
+
42
+ # Downsample to 1-minute cadence (default: mean)
43
+ result = downsample_dataframe(df, target_cadence='PT1M')
44
+ ```
45
+
46
+ ### Using Different Methods
47
+
48
+ ```python
49
+ from downsampler import downsample_dataframe, DownsampleConfig, AggregationMethod
50
+
51
+ # Mean (default)
52
+ result = downsample_dataframe(df, '10min')
53
+
54
+ # Maximum
55
+ result = downsample_dataframe(df, '10min', method='max')
56
+
57
+ # LTTB for visual fidelity
58
+ config = DownsampleConfig(
59
+ method=AggregationMethod.LTTB,
60
+ lttb_target_column='temperature'
61
+ )
62
+ result = downsample_dataframe(df, '10min', config=config)
63
+
64
+ # M4 for guaranteed extrema preservation
65
+ result = downsample_dataframe(df, '10min', method='m4')
66
+
67
+ # M4 with collinearity filtering (reduces output size)
68
+ result = downsample_dataframe(df, '10min', method='m4', m4_collinearity_threshold=0.01)
69
+ ```
70
+
71
+ ### Multi-Aggregate Downsampling
72
+
73
+ Create min/mean/max columns for visualization with error bands:
74
+
75
+ ```python
76
+ from downsampler import downsample_dataframe_multi_aggregate
77
+
78
+ result = downsample_dataframe_multi_aggregate(
79
+ df,
80
+ target_cadence='1min',
81
+ variables=['temperature', 'pressure'],
82
+ aggregations=['min', 'mean', 'max']
83
+ )
84
+ # Result has columns: temperature_min, temperature_mean, temperature_max, etc.
85
+ ```
86
+
87
+ ### Multi-Resolution Pyramid
88
+
89
+ Generate downsampled versions at multiple cadences for storage:
90
+
91
+ ```python
92
+ from downsampler import downsample_dataframe_resolutions
93
+
94
+ results = downsample_dataframe_resolutions(
95
+ df,
96
+ cadences=['1min', '5min', '15min', '1h'],
97
+ )
98
+ # Returns {Timedelta('0 days 00:01:00'): DataFrame, ...}
99
+
100
+ for cadence, result_df in results.items():
101
+ print(f"{cadence}: {len(result_df)} points")
102
+ ```
103
+
104
+ ### M4 Downsampling (Extrema Preservation)
105
+
106
+ M4 guarantees exact preservation of minimum and maximum values, making it ideal for monitoring dashboards and alerting systems:
107
+
108
+ ```python
109
+ from downsampler import downsample_dataframe
110
+
111
+ # Basic M4 - preserves exact min/max
112
+ result = downsample_dataframe(df, '1min', method='m4')
113
+
114
+ # Verify extrema preservation
115
+ assert df['temperature'].min() == result['temperature'].min()
116
+ assert df['temperature'].max() == result['temperature'].max()
117
+
118
+ # M4 with deduplication (default, removes consecutive duplicates)
119
+ result = downsample_dataframe(df, '1min', method='m4', m4_deduplicate=True)
120
+
121
+ # M4 with collinearity filtering (reduces size on smooth data)
122
+ result = downsample_dataframe(df, '1min', method='m4', m4_collinearity_threshold=0.01)
123
+ ```
124
+
125
+ **M4 Features:**
126
+ - Selects up to 4 points per bucket: first, last, min, max
127
+ - **Guaranteed** exact extrema preservation (no approximation)
128
+ - Variable output size (typically 2-4x reduction vs 10x for traditional methods)
129
+ - Deduplication: removes consecutive duplicates (20-50% reduction)
130
+ - Collinearity filtering: removes min/max points that lie close to the line between a bucket's first and last points (0-75% reduction)
131
+ - Superior peak detection compared to LTTB
132
+
133
+ **When to use M4:**
134
+ - Monitoring dashboards where missing a spike could be critical
135
+ - Alerting systems that need exact threshold crossings
136
+ - Pre-computing multiple cadences with controllable size/fidelity trade-offs
137
+ - Multi-variable sensor data where each variable's extrema matter
138
+
139
+ ### Handling Gaps
140
+
141
+ ```python
142
+ from downsampler import DownsampleConfig
143
+
144
+ config = DownsampleConfig(
145
+ gap_threshold='5min' # Gaps > 5 min trigger segmentation
146
+ )
147
+ result = downsample_dataframe(df, '1min', config=config)
148
+ ```
149
+
150
+ ### Range-Based Downsampling
151
+
152
+ For data that needs to be fetched from an external source:
153
+
154
+ ```python
155
+ from downsampler import downsample_range
156
+
157
+ def fetch_from_api(start, end):
158
+ # Your data fetching logic here
159
+ return pd.DataFrame(...)
160
+
161
+ # Single fetch with automatic edge buffering
162
+ result = downsample_range(
163
+ fetcher=fetch_from_api,
164
+ output_start=pd.Timestamp('2024-01-01'),
165
+ output_end=pd.Timestamp('2024-01-02'),
166
+ target_cadence='1h'
167
+ )
168
+
169
+ # Batched mode for large ranges
170
+ result = downsample_range(
171
+ fetcher=fetch_from_api,
172
+ output_start=pd.Timestamp('2024-01-01'),
173
+ output_end=pd.Timestamp('2024-02-01'),
174
+ target_cadence='1h',
175
+ batch_size='P1D' # Process one day at a time
176
+ )
177
+ ```
178
+
179
+ ### Fidelity Comparison
180
+
181
+ Compare different methods to find the best one for your data:
182
+
183
+ ```python
184
+ from downsampler.fidelity import FidelityComparison, summary_table
185
+
186
+ comp = FidelityComparison(original_df, 'signal')
187
+ results = comp.compare('10s', store_downsampled=True)
188
+
189
+ print(summary_table(results))
190
+ # See examples/fidelity_comparison.py (marimo notebook) for interactive visualization
191
+ ```
192
+
193
+ ## Configuration Options
194
+
195
+ ### DownsampleConfig
196
+
197
+ | Parameter | Type | Default | Description |
198
+ |-----------|------|---------|-------------|
199
+ | `method` | AggregationMethod | MEAN | Downsampling method |
200
+ | `lttb_target_column` | str | None | Column to optimize for LTTB |
201
+ | `m4_deduplicate` | bool | True | For M4: remove consecutive duplicates |
202
+ | `m4_collinearity_threshold` | float | None | For M4: filter collinear points (0.0-1.0) |
203
+ | `include_columns` | list[str] | [] | Columns to include (empty = all) |
204
+ | `exclude_columns` | list[str] | [] | Columns to exclude |
205
+ | `gap_threshold` | str/Timedelta | "auto" | Minimum time between consecutive samples that counts as a gap |
206
+ | `edge_handling` | EdgeHandling | KEEP | How to handle edges |
207
+ | `edge_window` | int | 2 | Points at each edge |
208
+ | `min_points_per_segment` | int | 3 | Min points for processing |
209
+ | `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
210
+ | `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
211
+
212
+ ### Aggregation Methods
213
+
214
+ - `MEAN`: Arithmetic mean (best for general use)
215
+ - `MEDIAN`: Median (robust to outliers)
216
+ - `MIN`: Minimum value (preserves lows)
217
+ - `MAX`: Maximum value (preserves highs)
218
+ - `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
219
+ - `M4`: Min-Max-First-Last (guaranteed extrema preservation, best for monitoring/alerting)
220
+
221
+ ### Edge Handling
222
+
223
+ - `KEEP`: Keep edge points as-is (default)
224
+ - `FLAG`: Add `_is_edge` column
225
+ - `DISCARD`: Remove edge points
226
+
227
+ ## Examples
228
+
229
+ See the `examples/` directory for complete examples:
230
+
231
+ - `basic_downsampling.py`: Core downsampling features
232
+ - `multi_aggregate.py`: Creating min/mean/max columns
233
+ - `range_downsample.py`: Range-based downsampling with automatic edge buffering
234
+ - `fidelity_comparison.py`: Interactive fidelity comparison (marimo notebook)
235
+
236
+ ### Running the fidelity comparison notebook
237
+
238
+ **Option 1 — Project install via uv** (best for development):
239
+
240
+ ```bash
241
+ uv run --extra dev marimo edit examples/fidelity_comparison.py
242
+ ```
243
+
244
+ **Option 2 — Marimo sandbox** (self-contained, uses inline PEP 723 metadata):
245
+
246
+ ```bash
247
+ marimo edit --sandbox examples/fidelity_comparison.py
248
+ ```
249
+
250
+ ## API Reference
251
+
252
+ ### DataFrame-Mode Functions
253
+
254
+ ```python
255
+ downsample_dataframe(df, target_cadence, config=None, **kwargs) -> DataFrame
256
+ downsample_dataframe_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
257
+ downsample_dataframe_resolutions(df, cadences, config=None, **kwargs) -> dict[Timedelta, DataFrame]
258
+ ```
259
+
260
+ ### Range-Mode Functions
261
+
262
+ ```python
263
+ downsample_range(fetcher, output_start, output_end, target_cadence, config=None, batch_size=None, ...) -> DataFrame
264
+ downsample_range_multi_aggregate(fetcher, output_start, output_end, target_cadence, variables, ...) -> DataFrame
265
+ downsample_range_resolutions(fetcher, output_start, output_end, cadences, config=None, ...) -> dict[Timedelta, DataFrame]
266
+ ```
267
+
268
+ ### Low-Level Functions
269
+
270
+ ```python
271
+ downsample_lttb(df, target_column, target_cadence, ...) -> DataFrame
272
+ downsample_m4(df, target_cadence, deduplicate=True, collinearity_threshold=None, ...) -> DataFrame
273
+ downsample_mean(df, target_cadence, ...) -> DataFrame
274
+ downsample_median(df, target_cadence, ...) -> DataFrame
275
+ downsample_min(df, target_cadence, ...) -> DataFrame
276
+ downsample_max(df, target_cadence, ...) -> DataFrame
277
+ ```
278
+
279
+ ### Gap Functions
280
+
281
+ ```python
282
+ find_gap_indices(df, timedelta_max_gap) -> Series
283
+ groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
284
+ split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
285
+ mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
286
+ ```
287
+
288
+ ### Fidelity Functions
289
+
290
+ ```python
291
+ compute_metrics(original, downsampled, column) -> FidelityMetrics
292
+ FidelityComparison(original_df, column).compare(cadences, methods, ...) -> list[ComparisonResult]
293
+ summary_table(results) -> DataFrame
294
+ ```
295
+
296
+ ## License
297
+
298
+ MIT License - see LICENSE file for details.
299
+
300
+ ## Contributing
301
+
302
+ Contributions are welcome! Please feel free to submit issues and pull requests.
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "downsampler"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
9
9
  readme = "README.md"
10
- license = {text = "MIT"}
11
- requires-python = ">=3.10"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
12
  authors = [
13
13
  {name = "Eelco Doornbos", email = "eelco.doornbos@knmi.nl"}
14
14
  ]
@@ -16,28 +16,27 @@ keywords = ["timeseries", "downsampling", "lttb", "pandas", "visualization"]
16
16
  classifiers = [
17
17
  "Development Status :: 3 - Alpha",
18
18
  "Intended Audience :: Science/Research",
19
- "License :: OSI Approved :: MIT License",
20
19
  "Programming Language :: Python :: 3",
21
- "Programming Language :: Python :: 3.10",
22
20
  "Programming Language :: Python :: 3.11",
23
21
  "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
24
23
  "Topic :: Scientific/Engineering",
25
24
  ]
26
25
  dependencies = [
27
- "numpy>=1.20",
26
+ "numpy>=2.0",
28
27
  "pandas>=1.3",
29
- "lttb>=0.3",
28
+ "lttbc>=0.3",
30
29
  "scipy>=1.7",
30
+ "requests>=2.32.5",
31
31
  ]
32
32
 
33
33
  [project.optional-dependencies]
34
- viz = ["matplotlib>=3.5", "altair>=5.0"]
35
34
  test = ["pytest>=7.0", "pytest-cov>=4.0"]
36
- dev = ["downsampler[viz,test]"]
35
+ dev = ["downsampler[test]", "marimo"]
37
36
 
38
37
  [project.urls]
39
- Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
40
- Repository = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
38
+ Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
39
+ Repository = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
41
40
 
42
41
  [tool.setuptools.packages.find]
43
42
  where = ["src"]