downsampler 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. downsampler-0.3.0/PKG-INFO +335 -0
  2. downsampler-0.3.0/README.md +309 -0
  3. {downsampler-0.1.0 → downsampler-0.3.0}/pyproject.toml +20 -12
  4. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/__init__.py +36 -15
  5. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/aggregators.py +80 -78
  6. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/config.py +26 -12
  7. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/core.py +71 -15
  8. downsampler-0.3.0/src/downsampler/edges.py +89 -0
  9. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/fidelity/__init__.py +2 -9
  10. downsampler-0.3.0/src/downsampler/fidelity/comparison.py +150 -0
  11. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/fidelity/metrics.py +9 -84
  12. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/gaps.py +135 -20
  13. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/lttb.py +69 -39
  14. downsampler-0.3.0/src/downsampler/m4.py +396 -0
  15. downsampler-0.3.0/src/downsampler/ranged.py +457 -0
  16. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler/utils.py +26 -2
  17. downsampler-0.3.0/src/downsampler.egg-info/PKG-INFO +335 -0
  18. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler.egg-info/SOURCES.txt +5 -4
  19. downsampler-0.3.0/src/downsampler.egg-info/requires.txt +5 -0
  20. downsampler-0.3.0/tests/test_aggregators.py +237 -0
  21. downsampler-0.3.0/tests/test_core.py +211 -0
  22. {downsampler-0.1.0 → downsampler-0.3.0}/tests/test_edges.py +0 -56
  23. downsampler-0.3.0/tests/test_fidelity.py +160 -0
  24. downsampler-0.3.0/tests/test_gaps.py +380 -0
  25. downsampler-0.3.0/tests/test_lttb.py +343 -0
  26. downsampler-0.3.0/tests/test_m4.py +493 -0
  27. downsampler-0.3.0/tests/test_ranged.py +262 -0
  28. downsampler-0.1.0/PKG-INFO +0 -246
  29. downsampler-0.1.0/README.md +0 -212
  30. downsampler-0.1.0/src/downsampler/deferred.py +0 -357
  31. downsampler-0.1.0/src/downsampler/edges.py +0 -202
  32. downsampler-0.1.0/src/downsampler/fidelity/comparison.py +0 -343
  33. downsampler-0.1.0/src/downsampler/fidelity/visualization.py +0 -359
  34. downsampler-0.1.0/src/downsampler.egg-info/PKG-INFO +0 -246
  35. downsampler-0.1.0/src/downsampler.egg-info/requires.txt +0 -15
  36. downsampler-0.1.0/tests/test_aggregators.py +0 -83
  37. downsampler-0.1.0/tests/test_core.py +0 -115
  38. downsampler-0.1.0/tests/test_deferred.py +0 -173
  39. downsampler-0.1.0/tests/test_fidelity.py +0 -149
  40. downsampler-0.1.0/tests/test_gaps.py +0 -145
  41. downsampler-0.1.0/tests/test_lttb.py +0 -112
  42. {downsampler-0.1.0 → downsampler-0.3.0}/LICENSE +0 -0
  43. {downsampler-0.1.0 → downsampler-0.3.0}/setup.cfg +0 -0
  44. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler.egg-info/dependency_links.txt +0 -0
  45. {downsampler-0.1.0 → downsampler-0.3.0}/src/downsampler.egg-info/top_level.txt +0 -0
@@ -0,0 +1,335 @@
1
+ Metadata-Version: 2.4
2
+ Name: downsampler
3
+ Version: 0.3.0
4
+ Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
5
+ Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler
8
+ Project-URL: Repository, https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler
9
+ Keywords: timeseries,downsampling,lttb,pandas,visualization
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: numpy>=2.0
21
+ Requires-Dist: pandas>=1.3
22
+ Requires-Dist: lttbc>=0.3
23
+ Requires-Dist: scipy>=1.7
24
+ Requires-Dist: requests>=2.32.5
25
+ Dynamic: license-file
26
+
27
+ # downsampler
28
+
29
+ [![PyPI](https://img.shields.io/pypi/v/downsampler)](https://pypi.org/project/downsampler/)
30
+ [![Python](https://img.shields.io/pypi/pyversions/downsampler)](https://pypi.org/project/downsampler/)
31
+ [![License](https://img.shields.io/pypi/l/downsampler)](https://opensource.org/licenses/MIT)
32
+
33
+ A Python package for time series DataFrame downsampling with LTTB, M4, multiple aggregation methods, gap handling, and fidelity testing.
34
+
35
+ ## Features
36
+
37
+ - **Multiple downsampling methods**:
38
+ - LTTB (visual fidelity)
39
+ - M4 (guaranteed extrema preservation)
40
+ - Traditional aggregations (mean, median, min, max)
41
+ - **Gap-aware processing**: Automatically detects and handles gaps in time series
42
+ - **Edge handling**: Flag, discard, or keep edge points
43
+ - **Multi-aggregate output**: Generate min/mean/max columns in a single call
44
+ - **Range-based downsampling**: Fetch data from external sources with automatic edge buffering
45
+ - **Multi-resolution pyramid**: Generate downsampled versions at multiple cadences in one call
46
+ - **Fidelity testing**: Compare methods and measure visual accuracy
47
+
48
+ ## Installation
49
+
50
+ ```bash
51
+ pip install downsampler
52
+ ```
53
+
54
+ > **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
55
+ > Linux wheel was compiled against NumPy 1.x and fails to import under
56
+ > NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
57
+ > `pip install --no-binary lttbc downsampler`. Other Python versions and
58
+ > macOS have no prebuilt wheel and build from source automatically. With
59
+ > uv, this repo's `[tool.uv] no-binary-package` setting handles it.
60
+
61
+ ## Quick Start
62
+
63
+ ### Basic Downsampling
64
+
65
+ ```python
66
+ import pandas as pd
67
+ from downsampler import downsample_dataframe
68
+
69
+ # Create sample data
70
+ df = pd.DataFrame(
71
+ {'temperature': range(1000)},
72
+ index=pd.date_range('2024-01-01', periods=1000, freq='1s')
73
+ )
74
+
75
+ # Downsample to 1-minute cadence (default: mean)
76
+ result = downsample_dataframe(df, target_cadence='PT1M')
77
+ ```
78
+
79
+ ### Using Different Methods
80
+
81
+ ```python
82
+ from downsampler import downsample_dataframe, DownsampleConfig, AggregationMethod
83
+
84
+ # Mean (default)
85
+ result = downsample_dataframe(df, '10min')
86
+
87
+ # Maximum
88
+ result = downsample_dataframe(df, '10min', method='max')
89
+
90
+ # LTTB for visual fidelity
91
+ config = DownsampleConfig(
92
+ method=AggregationMethod.LTTB,
93
+ lttb_target_column='temperature'
94
+ )
95
+ result = downsample_dataframe(df, '10min', config=config)
96
+
97
+ # M4 for guaranteed extrema preservation
98
+ result = downsample_dataframe(df, '10min', method='m4')
99
+
100
+ # M4 with collinearity filtering (reduces output size)
101
+ result = downsample_dataframe(df, '10min', method='m4', m4_collinearity_threshold=0.01)
102
+ ```
103
+
104
+ ### Multi-Aggregate Downsampling
105
+
106
+ Create min/mean/max columns for visualization with error bands:
107
+
108
+ ```python
109
+ from downsampler import downsample_dataframe_multi_aggregate
110
+
111
+ result = downsample_dataframe_multi_aggregate(
112
+ df,
113
+ target_cadence='1min',
114
+ variables=['temperature', 'pressure'],
115
+ aggregations=['min', 'mean', 'max']
116
+ )
117
+ # Result has columns: temperature_min, temperature_mean, temperature_max, etc.
118
+ ```
119
+
120
+ ### Multi-Resolution Pyramid
121
+
122
+ Generate downsampled versions at multiple cadences for storage:
123
+
124
+ ```python
125
+ from downsampler import downsample_dataframe_resolutions
126
+
127
+ results = downsample_dataframe_resolutions(
128
+ df,
129
+ cadences=['1min', '5min', '15min', '1h'],
130
+ )
131
+ # Returns {Timedelta('0 days 00:01:00'): DataFrame, ...}
132
+
133
+ for cadence, result_df in results.items():
134
+ print(f"{cadence}: {len(result_df)} points")
135
+ ```
136
+
137
+ ### M4 Downsampling (Extrema Preservation)
138
+
139
+ M4 guarantees exact preservation of minimum and maximum values, making it ideal for monitoring dashboards and alerting systems:
140
+
141
+ ```python
142
+ from downsampler import downsample_dataframe
143
+
144
+ # Basic M4 - preserves exact min/max
145
+ result = downsample_dataframe(df, '1min', method='m4')
146
+
147
+ # Verify extrema preservation
148
+ assert df['temperature'].min() == result['temperature'].min()
149
+ assert df['temperature'].max() == result['temperature'].max()
150
+
151
+ # M4 with deduplication (default, removes consecutive duplicates)
152
+ result = downsample_dataframe(df, '1min', method='m4', m4_deduplicate=True)
153
+
154
+ # M4 with collinearity filtering (reduces size on smooth data)
155
+ result = downsample_dataframe(df, '1min', method='m4', m4_collinearity_threshold=0.01)
156
+ ```
157
+
158
+ **M4 Features:**
159
+ - Selects up to 4 points per bucket: first, last, min, max
160
+ - **Guaranteed** exact extrema preservation (no approximation)
161
+ - Variable output size (typically 2-4x reduction vs 10x for traditional methods)
162
+ - Deduplication: removes consecutive duplicates (20-50% reduction)
163
+ - Collinearity filtering: removes min/max points near first-last line (0-75% reduction)
164
+ - Superior peak detection compared to LTTB
165
+
166
+ **When to use M4:**
167
+ - Monitoring dashboards where missing a spike could be critical
168
+ - Alerting systems that need exact threshold crossings
169
+ - Pre-computing multiple cadences with controllable size/fidelity trade-offs
170
+ - Multi-variable sensor data where each variable's extrema matter
171
+
172
+ ### Handling Gaps
173
+
174
+ ```python
175
+ from downsampler import DownsampleConfig
176
+
177
+ config = DownsampleConfig(
178
+ gap_threshold='5min' # Gaps > 5 min trigger segmentation
179
+ )
180
+ result = downsample_dataframe(df, '1min', config=config)
181
+ ```
182
+
183
+ ### Range-Based Downsampling
184
+
185
+ For data that needs to be fetched from an external source:
186
+
187
+ ```python
188
+ from downsampler import downsample_range
189
+
190
+ def fetch_from_api(start, end):
191
+ # Your data fetching logic here
192
+ return pd.DataFrame(...)
193
+
194
+ # Single fetch with automatic edge buffering
195
+ result = downsample_range(
196
+ fetcher=fetch_from_api,
197
+ output_start=pd.Timestamp('2024-01-01'),
198
+ output_end=pd.Timestamp('2024-01-02'),
199
+ target_cadence='1h'
200
+ )
201
+
202
+ # Batched mode for large ranges
203
+ result = downsample_range(
204
+ fetcher=fetch_from_api,
205
+ output_start=pd.Timestamp('2024-01-01'),
206
+ output_end=pd.Timestamp('2024-02-01'),
207
+ target_cadence='1h',
208
+ batch_size='P1D' # Process one day at a time
209
+ )
210
+ ```
211
+
212
+ ### Fidelity Comparison
213
+
214
+ Compare different methods to find the best one for your data:
215
+
216
+ ```python
217
+ from downsampler.fidelity import FidelityComparison, summary_table
218
+
219
+ comp = FidelityComparison(original_df, 'signal')
220
+ results = comp.compare('10s', store_downsampled=True)
221
+
222
+ print(summary_table(results))
223
+ # See examples/fidelity_comparison.py (marimo notebook) for interactive visualization
224
+ ```
225
+
226
+ ## Configuration Options
227
+
228
+ ### DownsampleConfig
229
+
230
+ | Parameter | Type | Default | Description |
231
+ |-----------|------|---------|-------------|
232
+ | `method` | AggregationMethod | MEAN | Downsampling method |
233
+ | `lttb_target_column` | str | None | Column to optimize for LTTB |
234
+ | `m4_deduplicate` | bool | True | For M4: remove consecutive duplicates |
235
+ | `m4_collinearity_threshold` | float | None | For M4: filter collinear points (0.0-1.0) |
236
+ | `include_columns` | list[str] | [] | Columns to include (empty = all) |
237
+ | `exclude_columns` | list[str] | [] | Columns to exclude |
238
+ | `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
239
+ | `edge_handling` | EdgeHandling | KEEP | How to handle edges |
240
+ | `edge_window` | int | 2 | Points at each edge |
241
+ | `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
242
+ | `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
243
+ | `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
244
+
245
+ ### Aggregation Methods
246
+
247
+ - `MEAN`: Arithmetic mean (best for general use)
248
+ - `MEDIAN`: Median (robust to outliers)
249
+ - `MIN`: Minimum value (preserves lows)
250
+ - `MAX`: Maximum value (preserves highs)
251
+ - `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
252
+ - `M4`: Min-Max-First-Last (guaranteed extrema preservation, best for monitoring/alerting)
253
+
254
+ ### Edge Handling
255
+
256
+ - `KEEP`: Keep edge points as-is (default)
257
+ - `FLAG`: Add `_is_edge` column
258
+ - `DISCARD`: Remove edge points
259
+
260
+ ## Examples
261
+
262
+ See the `examples/` directory for complete examples:
263
+
264
+ - `basic_downsampling.py`: Core downsampling features
265
+ - `multi_aggregate.py`: Creating min/mean/max columns
266
+ - `range_downsample.py`: Range-based downsampling with automatic edge buffering
267
+ - `fidelity_comparison.py`: Interactive fidelity comparison (marimo notebook)
268
+
269
+ ### Running the fidelity comparison notebook
270
+
271
+ **Option 1 — Project install via uv** (best for development):
272
+
273
+ ```bash
274
+ uv run --group dev marimo edit examples/fidelity_comparison.py
275
+ ```
276
+
277
+ **Option 2 — Marimo sandbox** (self-contained, uses inline PEP 723 metadata):
278
+
279
+ ```bash
280
+ marimo edit --sandbox examples/fidelity_comparison.py
281
+ ```
282
+
283
+ ## API Reference
284
+
285
+ ### DataFrame-Mode Functions
286
+
287
+ ```python
288
+ downsample_dataframe(df, target_cadence, config=None, **kwargs) -> DataFrame
289
+ downsample_dataframe_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
290
+ downsample_dataframe_resolutions(df, cadences, config=None, **kwargs) -> dict[Timedelta, DataFrame]
291
+ ```
292
+
293
+ ### Range-Mode Functions
294
+
295
+ ```python
296
+ downsample_range(fetcher, output_start, output_end, target_cadence, config=None, batch_size=None, ...) -> DataFrame
297
+ downsample_range_multi_aggregate(fetcher, output_start, output_end, target_cadence, variables, ...) -> DataFrame
298
+ downsample_range_resolutions(fetcher, output_start, output_end, cadences, config=None, ...) -> dict[Timedelta, DataFrame]
299
+ ```
300
+
301
+ ### Low-Level Functions
302
+
303
+ ```python
304
+ downsample_lttb(df, target_column, target_cadence, ...) -> DataFrame
305
+ downsample_m4(df, target_cadence, deduplicate=True, collinearity_threshold=None, ...) -> DataFrame
306
+ downsample_mean(df, target_cadence, ...) -> DataFrame
307
+ downsample_median(df, target_cadence, ...) -> DataFrame
308
+ downsample_min(df, target_cadence, ...) -> DataFrame
309
+ downsample_max(df, target_cadence, ...) -> DataFrame
310
+ ```
311
+
312
+ ### Gap Functions
313
+
314
+ ```python
315
+ find_gap_indices(df, timedelta_max_gap) -> Series
316
+ groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
317
+ split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
318
+ mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
319
+ ```
320
+
321
+ ### Fidelity Functions
322
+
323
+ ```python
324
+ compute_metrics(original, downsampled, column) -> FidelityMetrics
325
+ FidelityComparison(original_df, column).compare(cadences, methods, ...) -> list[ComparisonResult]
326
+ summary_table(results) -> DataFrame
327
+ ```
328
+
329
+ ## License
330
+
331
+ MIT License - see LICENSE file for details.
332
+
333
+ ## Contributing
334
+
335
+ Contributions are welcome! Please feel free to submit issues and pull requests.
@@ -0,0 +1,309 @@
1
+ # downsampler
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/downsampler)](https://pypi.org/project/downsampler/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/downsampler)](https://pypi.org/project/downsampler/)
5
+ [![License](https://img.shields.io/pypi/l/downsampler)](https://opensource.org/licenses/MIT)
6
+
7
+ A Python package for time series DataFrame downsampling with LTTB, M4, multiple aggregation methods, gap handling, and fidelity testing.
8
+
9
+ ## Features
10
+
11
+ - **Multiple downsampling methods**:
12
+ - LTTB (visual fidelity)
13
+ - M4 (guaranteed extrema preservation)
14
+ - Traditional aggregations (mean, median, min, max)
15
+ - **Gap-aware processing**: Automatically detects and handles gaps in time series
16
+ - **Edge handling**: Flag, discard, or keep edge points
17
+ - **Multi-aggregate output**: Generate min/mean/max columns in a single call
18
+ - **Range-based downsampling**: Fetch data from external sources with automatic edge buffering
19
+ - **Multi-resolution pyramid**: Generate downsampled versions at multiple cadences in one call
20
+ - **Fidelity testing**: Compare methods and measure visual accuracy
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install downsampler
26
+ ```
27
+
28
+ > **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
29
+ > Linux wheel was compiled against NumPy 1.x and fails to import under
30
+ > NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
31
+ > `pip install --no-binary lttbc downsampler`. Other Python versions and
32
+ > macOS have no prebuilt wheel and build from source automatically. With
33
+ > uv, this repo's `[tool.uv] no-binary-package` setting handles it.
34
+
35
+ ## Quick Start
36
+
37
+ ### Basic Downsampling
38
+
39
+ ```python
40
+ import pandas as pd
41
+ from downsampler import downsample_dataframe
42
+
43
+ # Create sample data
44
+ df = pd.DataFrame(
45
+ {'temperature': range(1000)},
46
+ index=pd.date_range('2024-01-01', periods=1000, freq='1s')
47
+ )
48
+
49
+ # Downsample to 1-minute cadence (default: mean)
50
+ result = downsample_dataframe(df, target_cadence='PT1M')
51
+ ```
52
+
53
+ ### Using Different Methods
54
+
55
+ ```python
56
+ from downsampler import downsample_dataframe, DownsampleConfig, AggregationMethod
57
+
58
+ # Mean (default)
59
+ result = downsample_dataframe(df, '10min')
60
+
61
+ # Maximum
62
+ result = downsample_dataframe(df, '10min', method='max')
63
+
64
+ # LTTB for visual fidelity
65
+ config = DownsampleConfig(
66
+ method=AggregationMethod.LTTB,
67
+ lttb_target_column='temperature'
68
+ )
69
+ result = downsample_dataframe(df, '10min', config=config)
70
+
71
+ # M4 for guaranteed extrema preservation
72
+ result = downsample_dataframe(df, '10min', method='m4')
73
+
74
+ # M4 with collinearity filtering (reduces output size)
75
+ result = downsample_dataframe(df, '10min', method='m4', m4_collinearity_threshold=0.01)
76
+ ```
77
+
78
+ ### Multi-Aggregate Downsampling
79
+
80
+ Create min/mean/max columns for visualization with error bands:
81
+
82
+ ```python
83
+ from downsampler import downsample_dataframe_multi_aggregate
84
+
85
+ result = downsample_dataframe_multi_aggregate(
86
+ df,
87
+ target_cadence='1min',
88
+ variables=['temperature', 'pressure'],
89
+ aggregations=['min', 'mean', 'max']
90
+ )
91
+ # Result has columns: temperature_min, temperature_mean, temperature_max, etc.
92
+ ```
93
+
94
+ ### Multi-Resolution Pyramid
95
+
96
+ Generate downsampled versions at multiple cadences for storage:
97
+
98
+ ```python
99
+ from downsampler import downsample_dataframe_resolutions
100
+
101
+ results = downsample_dataframe_resolutions(
102
+ df,
103
+ cadences=['1min', '5min', '15min', '1h'],
104
+ )
105
+ # Returns {Timedelta('0 days 00:01:00'): DataFrame, ...}
106
+
107
+ for cadence, result_df in results.items():
108
+ print(f"{cadence}: {len(result_df)} points")
109
+ ```
110
+
111
+ ### M4 Downsampling (Extrema Preservation)
112
+
113
+ M4 guarantees exact preservation of minimum and maximum values, making it ideal for monitoring dashboards and alerting systems:
114
+
115
+ ```python
116
+ from downsampler import downsample_dataframe
117
+
118
+ # Basic M4 - preserves exact min/max
119
+ result = downsample_dataframe(df, '1min', method='m4')
120
+
121
+ # Verify extrema preservation
122
+ assert df['temperature'].min() == result['temperature'].min()
123
+ assert df['temperature'].max() == result['temperature'].max()
124
+
125
+ # M4 with deduplication (default, removes consecutive duplicates)
126
+ result = downsample_dataframe(df, '1min', method='m4', m4_deduplicate=True)
127
+
128
+ # M4 with collinearity filtering (reduces size on smooth data)
129
+ result = downsample_dataframe(df, '1min', method='m4', m4_collinearity_threshold=0.01)
130
+ ```
131
+
132
+ **M4 Features:**
133
+ - Selects up to 4 points per bucket: first, last, min, max
134
+ - **Guaranteed** exact extrema preservation (no approximation)
135
+ - Variable output size (typically 2-4x reduction vs 10x for traditional methods)
136
+ - Deduplication: removes consecutive duplicates (20-50% reduction)
137
+ - Collinearity filtering: removes min/max points near first-last line (0-75% reduction)
138
+ - Superior peak detection compared to LTTB
139
+
140
+ **When to use M4:**
141
+ - Monitoring dashboards where missing a spike could be critical
142
+ - Alerting systems that need exact threshold crossings
143
+ - Pre-computing multiple cadences with controllable size/fidelity trade-offs
144
+ - Multi-variable sensor data where each variable's extrema matter
145
+
146
+ ### Handling Gaps
147
+
148
+ ```python
149
+ from downsampler import DownsampleConfig
150
+
151
+ config = DownsampleConfig(
152
+ gap_threshold='5min' # Gaps > 5 min trigger segmentation
153
+ )
154
+ result = downsample_dataframe(df, '1min', config=config)
155
+ ```
156
+
157
+ ### Range-Based Downsampling
158
+
159
+ For data that needs to be fetched from an external source:
160
+
161
+ ```python
162
+ from downsampler import downsample_range
163
+
164
+ def fetch_from_api(start, end):
165
+ # Your data fetching logic here
166
+ return pd.DataFrame(...)
167
+
168
+ # Single fetch with automatic edge buffering
169
+ result = downsample_range(
170
+ fetcher=fetch_from_api,
171
+ output_start=pd.Timestamp('2024-01-01'),
172
+ output_end=pd.Timestamp('2024-01-02'),
173
+ target_cadence='1h'
174
+ )
175
+
176
+ # Batched mode for large ranges
177
+ result = downsample_range(
178
+ fetcher=fetch_from_api,
179
+ output_start=pd.Timestamp('2024-01-01'),
180
+ output_end=pd.Timestamp('2024-02-01'),
181
+ target_cadence='1h',
182
+ batch_size='P1D' # Process one day at a time
183
+ )
184
+ ```
185
+
186
+ ### Fidelity Comparison
187
+
188
+ Compare different methods to find the best one for your data:
189
+
190
+ ```python
191
+ from downsampler.fidelity import FidelityComparison, summary_table
192
+
193
+ comp = FidelityComparison(original_df, 'signal')
194
+ results = comp.compare('10s', store_downsampled=True)
195
+
196
+ print(summary_table(results))
197
+ # See examples/fidelity_comparison.py (marimo notebook) for interactive visualization
198
+ ```
199
+
200
+ ## Configuration Options
201
+
202
+ ### DownsampleConfig
203
+
204
+ | Parameter | Type | Default | Description |
205
+ |-----------|------|---------|-------------|
206
+ | `method` | AggregationMethod | MEAN | Downsampling method |
207
+ | `lttb_target_column` | str | None | Column to optimize for LTTB |
208
+ | `m4_deduplicate` | bool | True | For M4: remove consecutive duplicates |
209
+ | `m4_collinearity_threshold` | float | None | For M4: filter collinear points (0.0-1.0) |
210
+ | `include_columns` | list[str] | [] | Columns to include (empty = all) |
211
+ | `exclude_columns` | list[str] | [] | Columns to exclude |
212
+ | `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
213
+ | `edge_handling` | EdgeHandling | KEEP | How to handle edges |
214
+ | `edge_window` | int | 2 | Points at each edge |
215
+ | `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
216
+ | `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
217
+ | `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
218
+
219
+ ### Aggregation Methods
220
+
221
+ - `MEAN`: Arithmetic mean (best for general use)
222
+ - `MEDIAN`: Median (robust to outliers)
223
+ - `MIN`: Minimum value (preserves lows)
224
+ - `MAX`: Maximum value (preserves highs)
225
+ - `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
226
+ - `M4`: Min-Max-First-Last (guaranteed extrema preservation, best for monitoring/alerting)
227
+
228
+ ### Edge Handling
229
+
230
+ - `KEEP`: Keep edge points as-is (default)
231
+ - `FLAG`: Add `_is_edge` column
232
+ - `DISCARD`: Remove edge points
233
+
234
+ ## Examples
235
+
236
+ See the `examples/` directory for complete examples:
237
+
238
+ - `basic_downsampling.py`: Core downsampling features
239
+ - `multi_aggregate.py`: Creating min/mean/max columns
240
+ - `range_downsample.py`: Range-based downsampling with automatic edge buffering
241
+ - `fidelity_comparison.py`: Interactive fidelity comparison (marimo notebook)
242
+
243
+ ### Running the fidelity comparison notebook
244
+
245
+ **Option 1 — Project install via uv** (best for development):
246
+
247
+ ```bash
248
+ uv run --group dev marimo edit examples/fidelity_comparison.py
249
+ ```
250
+
251
+ **Option 2 — Marimo sandbox** (self-contained, uses inline PEP 723 metadata):
252
+
253
+ ```bash
254
+ marimo edit --sandbox examples/fidelity_comparison.py
255
+ ```
256
+
257
+ ## API Reference
258
+
259
+ ### DataFrame-Mode Functions
260
+
261
+ ```python
262
+ downsample_dataframe(df, target_cadence, config=None, **kwargs) -> DataFrame
263
+ downsample_dataframe_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
264
+ downsample_dataframe_resolutions(df, cadences, config=None, **kwargs) -> dict[Timedelta, DataFrame]
265
+ ```
266
+
267
+ ### Range-Mode Functions
268
+
269
+ ```python
270
+ downsample_range(fetcher, output_start, output_end, target_cadence, config=None, batch_size=None, ...) -> DataFrame
271
+ downsample_range_multi_aggregate(fetcher, output_start, output_end, target_cadence, variables, ...) -> DataFrame
272
+ downsample_range_resolutions(fetcher, output_start, output_end, cadences, config=None, ...) -> dict[Timedelta, DataFrame]
273
+ ```
274
+
275
+ ### Low-Level Functions
276
+
277
+ ```python
278
+ downsample_lttb(df, target_column, target_cadence, ...) -> DataFrame
279
+ downsample_m4(df, target_cadence, deduplicate=True, collinearity_threshold=None, ...) -> DataFrame
280
+ downsample_mean(df, target_cadence, ...) -> DataFrame
281
+ downsample_median(df, target_cadence, ...) -> DataFrame
282
+ downsample_min(df, target_cadence, ...) -> DataFrame
283
+ downsample_max(df, target_cadence, ...) -> DataFrame
284
+ ```
285
+
286
+ ### Gap Functions
287
+
288
+ ```python
289
+ find_gap_indices(df, timedelta_max_gap) -> Series
290
+ groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
291
+ split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
292
+ mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
293
+ ```
294
+
295
+ ### Fidelity Functions
296
+
297
+ ```python
298
+ compute_metrics(original, downsampled, column) -> FidelityMetrics
299
+ FidelityComparison(original_df, column).compare(cadences, methods, ...) -> list[ComparisonResult]
300
+ summary_table(results) -> DataFrame
301
+ ```
302
+
303
+ ## License
304
+
305
+ MIT License - see LICENSE file for details.
306
+
307
+ ## Contributing
308
+
309
+ Contributions are welcome! Please feel free to submit issues and pull requests.
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "downsampler"
7
- version = "0.1.0"
7
+ version = "0.3.0"
8
8
  description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
9
9
  readme = "README.md"
10
- license = {text = "MIT"}
11
- requires-python = ">=3.10"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
12
  authors = [
13
13
  {name = "Eelco Doornbos", email = "eelco.doornbos@knmi.nl"}
14
14
  ]
@@ -16,28 +16,36 @@ keywords = ["timeseries", "downsampling", "lttb", "pandas", "visualization"]
16
16
  classifiers = [
17
17
  "Development Status :: 3 - Alpha",
18
18
  "Intended Audience :: Science/Research",
19
- "License :: OSI Approved :: MIT License",
20
19
  "Programming Language :: Python :: 3",
21
- "Programming Language :: Python :: 3.10",
22
20
  "Programming Language :: Python :: 3.11",
23
21
  "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
24
23
  "Topic :: Scientific/Engineering",
25
24
  ]
26
25
  dependencies = [
27
- "numpy>=1.20",
26
+ "numpy>=2.0",
28
27
  "pandas>=1.3",
29
- "lttb>=0.3",
28
+ "lttbc>=0.3",
30
29
  "scipy>=1.7",
30
+ "requests>=2.32.5",
31
31
  ]
32
32
 
33
- [project.optional-dependencies]
34
- viz = ["matplotlib>=3.5", "altair>=5.0"]
33
+ [dependency-groups]
35
34
  test = ["pytest>=7.0", "pytest-cov>=4.0"]
36
- dev = ["downsampler[viz,test]"]
35
+ dev = [{include-group = "test"}, "marimo"]
37
36
 
38
37
  [project.urls]
39
- Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
40
- Repository = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
38
+ Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
39
+ Repository = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
40
+
41
+ [tool.uv]
42
+ # lttbc's prebuilt cp311 Linux wheel was compiled against NumPy 1.x and
43
+ # crashes on import under NumPy 2 ("numpy.core.multiarray failed to
44
+ # import"). Building from source compiles it against the installed NumPy.
45
+ # Other Python versions ship no Linux wheel and always build from source.
46
+ # Same workaround as in spaceweather-data-pipelines; drop when lttbc
47
+ # publishes NumPy-2 wheels.
48
+ no-binary-package = ["lttbc"]
41
49
 
42
50
  [tool.setuptools.packages.find]
43
51
  where = ["src"]