downsampler 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- downsampler-0.1.0/LICENSE +21 -0
- downsampler-0.1.0/PKG-INFO +246 -0
- downsampler-0.1.0/README.md +212 -0
- downsampler-0.1.0/pyproject.toml +43 -0
- downsampler-0.1.0/setup.cfg +4 -0
- downsampler-0.1.0/src/downsampler/__init__.py +80 -0
- downsampler-0.1.0/src/downsampler/aggregators.py +338 -0
- downsampler-0.1.0/src/downsampler/config.py +72 -0
- downsampler-0.1.0/src/downsampler/core.py +166 -0
- downsampler-0.1.0/src/downsampler/deferred.py +357 -0
- downsampler-0.1.0/src/downsampler/edges.py +202 -0
- downsampler-0.1.0/src/downsampler/fidelity/__init__.py +23 -0
- downsampler-0.1.0/src/downsampler/fidelity/comparison.py +343 -0
- downsampler-0.1.0/src/downsampler/fidelity/metrics.py +212 -0
- downsampler-0.1.0/src/downsampler/fidelity/visualization.py +359 -0
- downsampler-0.1.0/src/downsampler/gaps.py +310 -0
- downsampler-0.1.0/src/downsampler/lttb.py +207 -0
- downsampler-0.1.0/src/downsampler/utils.py +150 -0
- downsampler-0.1.0/src/downsampler.egg-info/PKG-INFO +246 -0
- downsampler-0.1.0/src/downsampler.egg-info/SOURCES.txt +28 -0
- downsampler-0.1.0/src/downsampler.egg-info/dependency_links.txt +1 -0
- downsampler-0.1.0/src/downsampler.egg-info/requires.txt +15 -0
- downsampler-0.1.0/src/downsampler.egg-info/top_level.txt +1 -0
- downsampler-0.1.0/tests/test_aggregators.py +83 -0
- downsampler-0.1.0/tests/test_core.py +115 -0
- downsampler-0.1.0/tests/test_deferred.py +173 -0
- downsampler-0.1.0/tests/test_edges.py +117 -0
- downsampler-0.1.0/tests/test_fidelity.py +149 -0
- downsampler-0.1.0/tests/test_gaps.py +145 -0
- downsampler-0.1.0/tests/test_lttb.py +112 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 KNMI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: downsampler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
|
|
5
|
+
Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://gitlab.com/KNMI-OSS/spaceweather/downsampler
|
|
8
|
+
Project-URL: Repository, https://gitlab.com/KNMI-OSS/spaceweather/downsampler
|
|
9
|
+
Keywords: timeseries,downsampling,lttb,pandas,visualization
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy>=1.20
|
|
22
|
+
Requires-Dist: pandas>=1.3
|
|
23
|
+
Requires-Dist: lttb>=0.3
|
|
24
|
+
Requires-Dist: scipy>=1.7
|
|
25
|
+
Provides-Extra: viz
|
|
26
|
+
Requires-Dist: matplotlib>=3.5; extra == "viz"
|
|
27
|
+
Requires-Dist: altair>=5.0; extra == "viz"
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == "test"
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: downsampler[test,viz]; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# downsampler
|
|
36
|
+
|
|
37
|
+
A Python package for time series DataFrame downsampling with LTTB, multiple aggregation methods, gap handling, and fidelity testing.
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **Multiple downsampling methods**: LTTB (visual fidelity), mean, median, min, max
|
|
42
|
+
- **Gap-aware processing**: Automatically detects and handles gaps in time series
|
|
43
|
+
- **Edge handling**: Flag, discard, or keep edge points
|
|
44
|
+
- **Multi-aggregate output**: Generate min/mean/max columns in a single call
|
|
45
|
+
- **Deferred fetching**: Lazy data loading with automatic edge buffering
|
|
46
|
+
- **Fidelity testing**: Compare methods and measure visual accuracy
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install downsampler
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Or with visualization support:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install "downsampler[viz]"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
### Basic Downsampling
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
import pandas as pd
|
|
66
|
+
from downsampler import downsample
|
|
67
|
+
|
|
68
|
+
# Create sample data
|
|
69
|
+
df = pd.DataFrame(
|
|
70
|
+
{'temperature': range(1000)},
|
|
71
|
+
index=pd.date_range('2024-01-01', periods=1000, freq='1s')
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Downsample to 1-minute cadence (default: mean)
|
|
75
|
+
result = downsample(df, target_cadence='PT1M')
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Using Different Methods
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from downsampler import downsample, DownsampleConfig, AggregationMethod
|
|
82
|
+
|
|
83
|
+
# Mean (default)
|
|
84
|
+
result = downsample(df, '10min')
|
|
85
|
+
|
|
86
|
+
# Maximum
|
|
87
|
+
result = downsample(df, '10min', method='max')
|
|
88
|
+
|
|
89
|
+
# LTTB for visual fidelity
|
|
90
|
+
config = DownsampleConfig(
|
|
91
|
+
method=AggregationMethod.LTTB,
|
|
92
|
+
lttb_target_column='temperature'
|
|
93
|
+
)
|
|
94
|
+
result = downsample(df, '10min', config=config)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Multi-Aggregate Downsampling
|
|
98
|
+
|
|
99
|
+
Create min/mean/max columns for visualization with error bands:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from downsampler import downsample_multi_aggregate
|
|
103
|
+
|
|
104
|
+
result = downsample_multi_aggregate(
|
|
105
|
+
df,
|
|
106
|
+
target_cadence='1min',
|
|
107
|
+
variables=['temperature', 'pressure'],
|
|
108
|
+
aggregations=['min', 'mean', 'max']
|
|
109
|
+
)
|
|
110
|
+
# Result has columns: temperature_min, temperature_mean, temperature_max, etc.
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Handling Gaps
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from downsampler import DownsampleConfig, GapHandling
|
|
117
|
+
|
|
118
|
+
config = DownsampleConfig(
|
|
119
|
+
gap_handling=GapHandling.SEGMENT, # Process segments independently
|
|
120
|
+
gap_threshold='5min' # Gaps > 5 min trigger segmentation
|
|
121
|
+
)
|
|
122
|
+
result = downsample(df, '1min', config=config)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Deferred Data Fetching
|
|
126
|
+
|
|
127
|
+
For data that needs to be fetched from an external source:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from downsampler.deferred import deferred_downsample
|
|
131
|
+
|
|
132
|
+
def fetch_from_api(start, end):
|
|
133
|
+
# Your data fetching logic here
|
|
134
|
+
return pd.DataFrame(...)
|
|
135
|
+
|
|
136
|
+
result = deferred_downsample(
|
|
137
|
+
fetcher=fetch_from_api,
|
|
138
|
+
output_start=pd.Timestamp('2024-01-01'),
|
|
139
|
+
output_end=pd.Timestamp('2024-01-02'),
|
|
140
|
+
target_cadence='1h'
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Fidelity Comparison
|
|
145
|
+
|
|
146
|
+
Compare different methods to find the best one for your data:
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from downsampler.fidelity import FidelityComparison
|
|
150
|
+
|
|
151
|
+
comp = FidelityComparison(original_df, 'signal')
|
|
152
|
+
results = comp.compare_methods('10s')
|
|
153
|
+
|
|
154
|
+
for r in results:
|
|
155
|
+
print(f"{r.method.value}: RMSE={r.metrics.rmse:.4f}")
|
|
156
|
+
|
|
157
|
+
# Get recommendation
|
|
158
|
+
config = comp.recommend_settings('10s', priority='visual')
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## Configuration Options
|
|
162
|
+
|
|
163
|
+
### DownsampleConfig
|
|
164
|
+
|
|
165
|
+
| Parameter | Type | Default | Description |
|
|
166
|
+
|-----------|------|---------|-------------|
|
|
167
|
+
| `method` | AggregationMethod | MEAN | Downsampling method |
|
|
168
|
+
| `lttb_target_column` | str | None | Column to optimize for LTTB |
|
|
169
|
+
| `include_columns` | list[str] | [] | Columns to include (empty = all) |
|
|
170
|
+
| `exclude_columns` | list[str] | [] | Columns to exclude |
|
|
171
|
+
| `gap_handling` | GapHandling | SEGMENT | How to handle gaps |
|
|
172
|
+
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
173
|
+
| `edge_handling` | EdgeHandling | FLAG | How to handle edges |
|
|
174
|
+
| `edge_window` | int | 2 | Points at each edge |
|
|
175
|
+
| `min_points_per_segment` | int | 3 | Min points for processing |
|
|
176
|
+
|
|
177
|
+
### Aggregation Methods
|
|
178
|
+
|
|
179
|
+
- `MEAN`: Arithmetic mean (best for general use)
|
|
180
|
+
- `MEDIAN`: Median (robust to outliers)
|
|
181
|
+
- `MIN`: Minimum value (preserves lows)
|
|
182
|
+
- `MAX`: Maximum value (preserves highs)
|
|
183
|
+
- `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
|
|
184
|
+
|
|
185
|
+
### Gap Handling
|
|
186
|
+
|
|
187
|
+
- `SEGMENT`: Split at gaps, process independently (recommended)
|
|
188
|
+
- `INTERPOLATE`: Fill gaps before processing
|
|
189
|
+
- `IGNORE`: Treat as continuous data
|
|
190
|
+
|
|
191
|
+
### Edge Handling
|
|
192
|
+
|
|
193
|
+
- `KEEP`: Keep edge points as-is
|
|
194
|
+
- `FLAG`: Add `_is_edge` column
|
|
195
|
+
- `DISCARD`: Remove edge points
|
|
196
|
+
|
|
197
|
+
## Examples
|
|
198
|
+
|
|
199
|
+
See the `examples/` directory of the [source repository](https://gitlab.com/KNMI-OSS/spaceweather/downsampler) for complete examples:
|
|
200
|
+
|
|
201
|
+
- `basic_downsampling.py`: Core downsampling features
|
|
202
|
+
- `multi_aggregate.py`: Creating min/mean/max columns
|
|
203
|
+
- `deferred_fetch.py`: Lazy data loading
|
|
204
|
+
- `fidelity_comparison.marimo.py`: Interactive comparison notebook
|
|
205
|
+
|
|
206
|
+
## API Reference
|
|
207
|
+
|
|
208
|
+
### Core Functions
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
downsample(df, target_cadence, config=None, **kwargs) -> DataFrame
|
|
212
|
+
downsample_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Gap Functions
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
find_gap_indices(df, timedelta_max_gap) -> Series
|
|
219
|
+
groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
|
|
220
|
+
split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
|
|
221
|
+
mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### Deferred Functions
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
deferred_downsample(fetcher, output_start, output_end, target_cadence, ...) -> DataFrame
|
|
228
|
+
batch_deferred_downsample(fetcher, ..., batch_size) -> DataFrame
|
|
229
|
+
LazyDownsampler(fetcher, cache_buffer) -> LazyDownsampler
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Fidelity Functions
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
compute_metrics(original, downsampled, column) -> FidelityMetrics
|
|
236
|
+
FidelityComparison(original_df, column) -> FidelityComparison
|
|
237
|
+
plot_comparison(original, downsampled, column, backend) -> Figure
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## License
|
|
241
|
+
|
|
242
|
+
MIT License - see LICENSE file for details.
|
|
243
|
+
|
|
244
|
+
## Contributing
|
|
245
|
+
|
|
246
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
# downsampler
|
|
2
|
+
|
|
3
|
+
A Python package for time series DataFrame downsampling with LTTB, multiple aggregation methods, gap handling, and fidelity testing.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Multiple downsampling methods**: LTTB (visual fidelity), mean, median, min, max
|
|
8
|
+
- **Gap-aware processing**: Automatically detects and handles gaps in time series
|
|
9
|
+
- **Edge handling**: Flag, discard, or keep edge points
|
|
10
|
+
- **Multi-aggregate output**: Generate min/mean/max columns in a single call
|
|
11
|
+
- **Deferred fetching**: Lazy data loading with automatic edge buffering
|
|
12
|
+
- **Fidelity testing**: Compare methods and measure visual accuracy
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install downsampler
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
Or with visualization support:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install "downsampler[viz]"
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
### Basic Downsampling
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import pandas as pd
|
|
32
|
+
from downsampler import downsample
|
|
33
|
+
|
|
34
|
+
# Create sample data
|
|
35
|
+
df = pd.DataFrame(
|
|
36
|
+
{'temperature': range(1000)},
|
|
37
|
+
index=pd.date_range('2024-01-01', periods=1000, freq='1s')
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Downsample to 1-minute cadence (default: mean)
|
|
41
|
+
result = downsample(df, target_cadence='PT1M')
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Using Different Methods
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from downsampler import downsample, DownsampleConfig, AggregationMethod
|
|
48
|
+
|
|
49
|
+
# Mean (default)
|
|
50
|
+
result = downsample(df, '10min')
|
|
51
|
+
|
|
52
|
+
# Maximum
|
|
53
|
+
result = downsample(df, '10min', method='max')
|
|
54
|
+
|
|
55
|
+
# LTTB for visual fidelity
|
|
56
|
+
config = DownsampleConfig(
|
|
57
|
+
method=AggregationMethod.LTTB,
|
|
58
|
+
lttb_target_column='temperature'
|
|
59
|
+
)
|
|
60
|
+
result = downsample(df, '10min', config=config)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Multi-Aggregate Downsampling
|
|
64
|
+
|
|
65
|
+
Create min/mean/max columns for visualization with error bands:
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from downsampler import downsample_multi_aggregate
|
|
69
|
+
|
|
70
|
+
result = downsample_multi_aggregate(
|
|
71
|
+
df,
|
|
72
|
+
target_cadence='1min',
|
|
73
|
+
variables=['temperature', 'pressure'],
|
|
74
|
+
aggregations=['min', 'mean', 'max']
|
|
75
|
+
)
|
|
76
|
+
# Result has columns: temperature_min, temperature_mean, temperature_max, etc.
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Handling Gaps
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from downsampler import DownsampleConfig, GapHandling
|
|
83
|
+
|
|
84
|
+
config = DownsampleConfig(
|
|
85
|
+
gap_handling=GapHandling.SEGMENT, # Process segments independently
|
|
86
|
+
gap_threshold='5min' # Gaps > 5 min trigger segmentation
|
|
87
|
+
)
|
|
88
|
+
result = downsample(df, '1min', config=config)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Deferred Data Fetching
|
|
92
|
+
|
|
93
|
+
For data that needs to be fetched from an external source:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from downsampler.deferred import deferred_downsample
|
|
97
|
+
|
|
98
|
+
def fetch_from_api(start, end):
|
|
99
|
+
# Your data fetching logic here
|
|
100
|
+
return pd.DataFrame(...)
|
|
101
|
+
|
|
102
|
+
result = deferred_downsample(
|
|
103
|
+
fetcher=fetch_from_api,
|
|
104
|
+
output_start=pd.Timestamp('2024-01-01'),
|
|
105
|
+
output_end=pd.Timestamp('2024-01-02'),
|
|
106
|
+
target_cadence='1h'
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Fidelity Comparison
|
|
111
|
+
|
|
112
|
+
Compare different methods to find the best one for your data:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from downsampler.fidelity import FidelityComparison
|
|
116
|
+
|
|
117
|
+
comp = FidelityComparison(original_df, 'signal')
|
|
118
|
+
results = comp.compare_methods('10s')
|
|
119
|
+
|
|
120
|
+
for r in results:
|
|
121
|
+
print(f"{r.method.value}: RMSE={r.metrics.rmse:.4f}")
|
|
122
|
+
|
|
123
|
+
# Get recommendation
|
|
124
|
+
config = comp.recommend_settings('10s', priority='visual')
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Configuration Options
|
|
128
|
+
|
|
129
|
+
### DownsampleConfig
|
|
130
|
+
|
|
131
|
+
| Parameter | Type | Default | Description |
|
|
132
|
+
|-----------|------|---------|-------------|
|
|
133
|
+
| `method` | AggregationMethod | MEAN | Downsampling method |
|
|
134
|
+
| `lttb_target_column` | str | None | Column to optimize for LTTB |
|
|
135
|
+
| `include_columns` | list[str] | [] | Columns to include (empty = all) |
|
|
136
|
+
| `exclude_columns` | list[str] | [] | Columns to exclude |
|
|
137
|
+
| `gap_handling` | GapHandling | SEGMENT | How to handle gaps |
|
|
138
|
+
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
139
|
+
| `edge_handling` | EdgeHandling | FLAG | How to handle edges |
|
|
140
|
+
| `edge_window` | int | 2 | Points at each edge |
|
|
141
|
+
| `min_points_per_segment` | int | 3 | Min points for processing |
|
|
142
|
+
|
|
143
|
+
### Aggregation Methods
|
|
144
|
+
|
|
145
|
+
- `MEAN`: Arithmetic mean (best for general use)
|
|
146
|
+
- `MEDIAN`: Median (robust to outliers)
|
|
147
|
+
- `MIN`: Minimum value (preserves lows)
|
|
148
|
+
- `MAX`: Maximum value (preserves highs)
|
|
149
|
+
- `LTTB`: Largest Triangle Three Buckets (best visual fidelity)
|
|
150
|
+
|
|
151
|
+
### Gap Handling
|
|
152
|
+
|
|
153
|
+
- `SEGMENT`: Split at gaps, process independently (recommended)
|
|
154
|
+
- `INTERPOLATE`: Fill gaps before processing
|
|
155
|
+
- `IGNORE`: Treat as continuous data
|
|
156
|
+
|
|
157
|
+
### Edge Handling
|
|
158
|
+
|
|
159
|
+
- `KEEP`: Keep edge points as-is
|
|
160
|
+
- `FLAG`: Add `_is_edge` column
|
|
161
|
+
- `DISCARD`: Remove edge points
|
|
162
|
+
|
|
163
|
+
## Examples
|
|
164
|
+
|
|
165
|
+
See the `examples/` directory of the [source repository](https://gitlab.com/KNMI-OSS/spaceweather/downsampler) for complete examples:
|
|
166
|
+
|
|
167
|
+
- `basic_downsampling.py`: Core downsampling features
|
|
168
|
+
- `multi_aggregate.py`: Creating min/mean/max columns
|
|
169
|
+
- `deferred_fetch.py`: Lazy data loading
|
|
170
|
+
- `fidelity_comparison.marimo.py`: Interactive comparison notebook
|
|
171
|
+
|
|
172
|
+
## API Reference
|
|
173
|
+
|
|
174
|
+
### Core Functions
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
downsample(df, target_cadence, config=None, **kwargs) -> DataFrame
|
|
178
|
+
downsample_multi_aggregate(df, target_cadence, variables, aggregations, ...) -> DataFrame
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Gap Functions
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
find_gap_indices(df, timedelta_max_gap) -> Series
|
|
185
|
+
groupby_gaps(df, timedelta_max_gap) -> DataFrameGroupBy
|
|
186
|
+
split_at_gaps(df, timedelta_max_gap) -> list[DataFrame]
|
|
187
|
+
mark_gaps_in_dataframe(df, nominal_timedelta, ...) -> DataFrame
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Deferred Functions
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
deferred_downsample(fetcher, output_start, output_end, target_cadence, ...) -> DataFrame
|
|
194
|
+
batch_deferred_downsample(fetcher, ..., batch_size) -> DataFrame
|
|
195
|
+
LazyDownsampler(fetcher, cache_buffer) -> LazyDownsampler
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Fidelity Functions
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
compute_metrics(original, downsampled, column) -> FidelityMetrics
|
|
202
|
+
FidelityComparison(original_df, column) -> FidelityComparison
|
|
203
|
+
plot_comparison(original, downsampled, column, backend) -> Figure
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## License
|
|
207
|
+
|
|
208
|
+
MIT License - see LICENSE file for details.
|
|
209
|
+
|
|
210
|
+
## Contributing
|
|
211
|
+
|
|
212
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "downsampler"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Eelco Doornbos", email = "eelco.doornbos@knmi.nl"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["timeseries", "downsampling", "lttb", "pandas", "visualization"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Scientific/Engineering",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"numpy>=1.20",
|
|
28
|
+
"pandas>=1.3",
|
|
29
|
+
"lttb>=0.3",
|
|
30
|
+
"scipy>=1.7",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
viz = ["matplotlib>=3.5", "altair>=5.0"]
|
|
35
|
+
test = ["pytest>=7.0", "pytest-cov>=4.0"]
|
|
36
|
+
dev = ["downsampler[viz,test]"]
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
|
|
40
|
+
Repository = "https://gitlab.com/KNMI-OSS/spaceweather/downsampler"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.packages.find]
|
|
43
|
+
where = ["src"]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""downsampler - Timeseries DataFrame downsampling with LTTB, aggregation methods, and fidelity testing.
|
|
2
|
+
|
|
3
|
+
This package provides tools for downsampling time series data in pandas DataFrames,
|
|
4
|
+
with support for:
|
|
5
|
+
- LTTB (Largest Triangle Three Buckets) algorithm for visual fidelity
|
|
6
|
+
- Multiple aggregation methods (mean, median, min, max)
|
|
7
|
+
- Gap-aware processing
|
|
8
|
+
- Edge handling strategies
|
|
9
|
+
- Deferred/lazy data fetching
|
|
10
|
+
- Fidelity testing and comparison
|
|
11
|
+
|
|
12
|
+
Example:
|
|
13
|
+
>>> import pandas as pd
|
|
14
|
+
>>> from downsampler import downsample, DownsampleConfig, AggregationMethod
|
|
15
|
+
>>>
|
|
16
|
+
>>> # Create sample data
|
|
17
|
+
>>> df = pd.DataFrame(
|
|
18
|
+
... {'value': [1, 2, 3, 4, 5]},
|
|
19
|
+
... index=pd.date_range('2024-01-01', periods=5, freq='1min')
|
|
20
|
+
... )
|
|
21
|
+
>>>
|
|
22
|
+
>>> # Downsample using mean
|
|
23
|
+
>>> result = downsample(df, target_cadence='5min')
|
|
24
|
+
>>>
|
|
25
|
+
>>> # Downsample using LTTB
|
|
26
|
+
>>> config = DownsampleConfig(
|
|
27
|
+
... method=AggregationMethod.LTTB,
|
|
28
|
+
... lttb_target_column='value'
|
|
29
|
+
... )
|
|
30
|
+
>>> result = downsample(df, target_cadence='5min', config=config)
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from downsampler.config import (
|
|
34
|
+
AggregationMethod,
|
|
35
|
+
EdgeHandling,
|
|
36
|
+
GapHandling,
|
|
37
|
+
DownsampleConfig,
|
|
38
|
+
)
|
|
39
|
+
from downsampler.core import downsample, downsample_multi_aggregate
|
|
40
|
+
from downsampler.gaps import (
|
|
41
|
+
find_gap_indices,
|
|
42
|
+
groupby_gaps,
|
|
43
|
+
wrap_in_nans,
|
|
44
|
+
mark_gaps_in_dataframe,
|
|
45
|
+
)
|
|
46
|
+
from downsampler.lttb import downsample_lttb
|
|
47
|
+
from downsampler.aggregators import (
|
|
48
|
+
downsample_mean,
|
|
49
|
+
downsample_median,
|
|
50
|
+
downsample_min,
|
|
51
|
+
downsample_max,
|
|
52
|
+
)
|
|
53
|
+
from downsampler.deferred import deferred_downsample
|
|
54
|
+
|
|
55
|
+
__version__ = "0.1.0"
|
|
56
|
+
|
|
57
|
+
__all__ = [
|
|
58
|
+
# Config
|
|
59
|
+
"AggregationMethod",
|
|
60
|
+
"EdgeHandling",
|
|
61
|
+
"GapHandling",
|
|
62
|
+
"DownsampleConfig",
|
|
63
|
+
# Core
|
|
64
|
+
"downsample",
|
|
65
|
+
"downsample_multi_aggregate",
|
|
66
|
+
# Gaps
|
|
67
|
+
"find_gap_indices",
|
|
68
|
+
"groupby_gaps",
|
|
69
|
+
"wrap_in_nans",
|
|
70
|
+
"mark_gaps_in_dataframe",
|
|
71
|
+
# LTTB
|
|
72
|
+
"downsample_lttb",
|
|
73
|
+
# Aggregators
|
|
74
|
+
"downsample_mean",
|
|
75
|
+
"downsample_median",
|
|
76
|
+
"downsample_min",
|
|
77
|
+
"downsample_max",
|
|
78
|
+
# Deferred
|
|
79
|
+
"deferred_downsample",
|
|
80
|
+
]
|