downsampler 0.2.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {downsampler-0.2.0/src/downsampler.egg-info → downsampler-0.3.1}/PKG-INFO +9 -8
- {downsampler-0.2.0 → downsampler-0.3.1}/README.md +8 -1
- {downsampler-0.2.0 → downsampler-0.3.1}/pyproject.toml +12 -3
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/__init__.py +1 -1
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/config.py +4 -2
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/gaps.py +64 -31
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/lttb.py +51 -38
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/m4.py +1 -1
- {downsampler-0.2.0 → downsampler-0.3.1/src/downsampler.egg-info}/PKG-INFO +9 -8
- downsampler-0.3.1/src/downsampler.egg-info/requires.txt +5 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/tests/test_gaps.py +27 -13
- downsampler-0.3.1/tests/test_lttb.py +408 -0
- downsampler-0.2.0/src/downsampler.egg-info/requires.txt +0 -13
- downsampler-0.2.0/tests/test_lttb.py +0 -178
- {downsampler-0.2.0 → downsampler-0.3.1}/LICENSE +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/setup.cfg +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/aggregators.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/core.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/edges.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/fidelity/__init__.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/fidelity/comparison.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/fidelity/metrics.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/ranged.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler/utils.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler.egg-info/SOURCES.txt +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler.egg-info/dependency_links.txt +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/src/downsampler.egg-info/top_level.txt +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/tests/test_aggregators.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/tests/test_core.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/tests/test_edges.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/tests/test_fidelity.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/tests/test_m4.py +0 -0
- {downsampler-0.2.0 → downsampler-0.3.1}/tests/test_ranged.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: downsampler
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
|
|
5
5
|
Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -22,12 +22,6 @@ Requires-Dist: pandas>=1.3
|
|
|
22
22
|
Requires-Dist: lttbc>=0.3
|
|
23
23
|
Requires-Dist: scipy>=1.7
|
|
24
24
|
Requires-Dist: requests>=2.32.5
|
|
25
|
-
Provides-Extra: test
|
|
26
|
-
Requires-Dist: pytest>=7.0; extra == "test"
|
|
27
|
-
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
28
|
-
Provides-Extra: dev
|
|
29
|
-
Requires-Dist: downsampler[test]; extra == "dev"
|
|
30
|
-
Requires-Dist: marimo; extra == "dev"
|
|
31
25
|
Dynamic: license-file
|
|
32
26
|
|
|
33
27
|
# downsampler
|
|
@@ -57,6 +51,13 @@ A Python package for time series DataFrame downsampling with LTTB, M4, multiple
|
|
|
57
51
|
pip install downsampler
|
|
58
52
|
```
|
|
59
53
|
|
|
54
|
+
> **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
|
|
55
|
+
> Linux wheel was compiled against NumPy 1.x and fails to import under
|
|
56
|
+
> NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
|
|
57
|
+
> `pip install --no-binary lttbc downsampler`. Other Python versions and
|
|
58
|
+
> macOS have no prebuilt wheel and build from source automatically. With
|
|
59
|
+
> uv, this repo's `[tool.uv] no-binary-package` setting handles it.
|
|
60
|
+
|
|
60
61
|
## Quick Start
|
|
61
62
|
|
|
62
63
|
### Basic Downsampling
|
|
@@ -237,7 +238,7 @@ print(summary_table(results))
|
|
|
237
238
|
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
238
239
|
| `edge_handling` | EdgeHandling | KEEP | How to handle edges |
|
|
239
240
|
| `edge_window` | int | 2 | Points at each edge |
|
|
240
|
-
| `min_points_per_segment` | int |
|
|
241
|
+
| `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
|
|
241
242
|
| `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
|
|
242
243
|
| `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
|
|
243
244
|
|
|
@@ -25,6 +25,13 @@ A Python package for time series DataFrame downsampling with LTTB, M4, multiple
|
|
|
25
25
|
pip install downsampler
|
|
26
26
|
```
|
|
27
27
|
|
|
28
|
+
> **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
|
|
29
|
+
> Linux wheel was compiled against NumPy 1.x and fails to import under
|
|
30
|
+
> NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
|
|
31
|
+
> `pip install --no-binary lttbc downsampler`. Other Python versions and
|
|
32
|
+
> macOS have no prebuilt wheel and build from source automatically. With
|
|
33
|
+
> uv, this repo's `[tool.uv] no-binary-package` setting handles it.
|
|
34
|
+
|
|
28
35
|
## Quick Start
|
|
29
36
|
|
|
30
37
|
### Basic Downsampling
|
|
@@ -205,7 +212,7 @@ print(summary_table(results))
|
|
|
205
212
|
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
206
213
|
| `edge_handling` | EdgeHandling | KEEP | How to handle edges |
|
|
207
214
|
| `edge_window` | int | 2 | Points at each edge |
|
|
208
|
-
| `min_points_per_segment` | int |
|
|
215
|
+
| `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
|
|
209
216
|
| `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
|
|
210
217
|
| `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
|
|
211
218
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "downsampler"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -30,13 +30,22 @@ dependencies = [
|
|
|
30
30
|
"requests>=2.32.5",
|
|
31
31
|
]
|
|
32
32
|
|
|
33
|
-
[project.optional-dependencies]
|
|
33
|
+
[dependency-groups]
|
|
34
34
|
test = ["pytest>=7.0", "pytest-cov>=4.0"]
|
|
35
|
-
dev = ["downsampler[test]", "marimo"]
|
|
35
|
+
dev = [{include-group = "test"}, "marimo"]
|
|
36
36
|
|
|
37
37
|
[project.urls]
|
|
38
38
|
Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
|
|
39
39
|
Repository = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
|
|
40
40
|
|
|
41
|
+
[tool.uv]
|
|
42
|
+
# lttbc's prebuilt cp311 Linux wheel was compiled against NumPy 1.x and
|
|
43
|
+
# crashes on import under NumPy 2 ("numpy.core.multiarray failed to
|
|
44
|
+
# import"). Building from source compiles it against the installed NumPy.
|
|
45
|
+
# Other Python versions ship no Linux wheel and always build from source.
|
|
46
|
+
# Same workaround as in spaceweather-data-pipelines; drop when lttbc
|
|
47
|
+
# publishes NumPy-2 wheels.
|
|
48
|
+
no-binary-package = ["lttbc"]
|
|
49
|
+
|
|
41
50
|
[tool.setuptools.packages.find]
|
|
42
51
|
where = ["src"]
|
|
@@ -46,7 +46,9 @@ class DownsampleConfig:
|
|
|
46
46
|
"auto" means 2x the target cadence.
|
|
47
47
|
edge_handling: Strategy for handling edge points.
|
|
48
48
|
edge_window: Number of points at each edge to consider as edge points.
|
|
49
|
-
min_points_per_segment: Minimum points required in a segment for
|
|
49
|
+
min_points_per_segment: Minimum points required in a segment for
|
|
50
|
+
processing. Default 1: even single-point data islands between
|
|
51
|
+
gaps are kept, so intermittent data survives downsampling.
|
|
50
52
|
source_cadence: Expected cadence of the source data. Used by LTTB to
|
|
51
53
|
interpolate small gaps and by aggregators for completeness
|
|
52
54
|
calculation. If None, estimated from data.
|
|
@@ -64,7 +66,7 @@ class DownsampleConfig:
|
|
|
64
66
|
gap_threshold: Union[str, pd.Timedelta] = "auto"
|
|
65
67
|
edge_handling: EdgeHandling = EdgeHandling.KEEP
|
|
66
68
|
edge_window: int = 2
|
|
67
|
-
min_points_per_segment: int =
|
|
69
|
+
min_points_per_segment: int = 1
|
|
68
70
|
source_cadence: Union[str, pd.Timedelta, None] = None
|
|
69
71
|
min_completeness: float = 0.9
|
|
70
72
|
|
|
@@ -68,9 +68,14 @@ def groupby_gaps(
|
|
|
68
68
|
2
|
|
69
69
|
"""
|
|
70
70
|
df_work = df.copy()
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
71
|
+
if len(df_work) == 0:
|
|
72
|
+
# [0, *gap_indices] below would fabricate a spurious row on an
|
|
73
|
+
# empty frame; an empty gap_index column yields zero groups.
|
|
74
|
+
df_work['gap_index'] = np.array([], dtype=np.int64)
|
|
75
|
+
else:
|
|
76
|
+
deltas = df_work.index.diff()[1:]
|
|
77
|
+
gap_indices = (deltas >= timedelta_max_gap).cumsum()
|
|
78
|
+
df_work['gap_index'] = [0, *gap_indices]
|
|
74
79
|
dfs_out = df_work.groupby('gap_index')
|
|
75
80
|
return dfs_out
|
|
76
81
|
|
|
@@ -326,6 +331,12 @@ def interpolate_small_gaps(
|
|
|
326
331
|
in the original data are preserved — only the newly created rows get
|
|
327
332
|
interpolated values.
|
|
328
333
|
|
|
334
|
+
Each column is interpolated **from its own non-NaN samples**, and any new
|
|
335
|
+
timestamp outside a column's valid-sample range is left NaN (``Series.interpolate``
|
|
336
|
+
with ``limit_area="inside"``, i.e. no edge-clamp and no extrapolation). So a
|
|
337
|
+
column with a hole larger than ``gap_threshold`` keeps an honest gap at the
|
|
338
|
+
filled timestamps rather than borrowing another column's edge value.
|
|
339
|
+
|
|
329
340
|
Args:
|
|
330
341
|
df: DataFrame with DatetimeIndex.
|
|
331
342
|
gap_threshold: Gaps at or above this duration are "real" gaps and
|
|
@@ -344,41 +355,63 @@ def interpolate_small_gaps(
|
|
|
344
355
|
source_cadence = estimate_cadence(df)
|
|
345
356
|
|
|
346
357
|
jitter_threshold = source_cadence * 1.5
|
|
347
|
-
deltas = df.index.to_series().diff()
|
|
348
|
-
|
|
349
358
|
numeric_cols = get_numeric_columns(df)
|
|
350
|
-
# Convert original timestamps to float64 for np.interp
|
|
351
|
-
orig_timestamps = df.index.astype(np.int64).astype(np.float64)
|
|
352
359
|
|
|
353
|
-
|
|
360
|
+
# 1. Row skeleton: existing rows plus synthetic rows inside *index* gaps
|
|
361
|
+
# smaller than gap_threshold, at source_cadence, so LTTB sees a
|
|
362
|
+
# continuous grid across small gaps. Values are filled per-column below.
|
|
363
|
+
deltas = df.index.to_series().diff()
|
|
364
|
+
extra_times: list[pd.Timestamp] = []
|
|
354
365
|
for i in range(1, len(df)):
|
|
355
366
|
delta = deltas.iloc[i]
|
|
356
367
|
if delta > jitter_threshold and delta < gap_threshold:
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
freq=source_cadence,
|
|
368
|
+
extra_times.extend(
|
|
369
|
+
pd.date_range(
|
|
370
|
+
start=df.index[i - 1] + source_cadence,
|
|
371
|
+
end=df.index[i] - source_cadence * 0.5, # don't duplicate t_after
|
|
372
|
+
freq=source_cadence,
|
|
373
|
+
)
|
|
364
374
|
)
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
375
|
+
if extra_times:
|
|
376
|
+
out = df.reindex(df.index.append(pd.DatetimeIndex(extra_times)).sort_values())
|
|
377
|
+
else:
|
|
378
|
+
out = df.copy()
|
|
379
|
+
|
|
380
|
+
# 2. Per-column, fill each column's *own* small gaps from its *own* samples,
|
|
381
|
+
# leaving holes wider than gap_threshold NaN. Interpolation is inside-only
|
|
382
|
+
# (no extrapolation → no edge-clamp) and each column is independent (no
|
|
383
|
+
# cross-column bleed).
|
|
384
|
+
for col in numeric_cols:
|
|
385
|
+
out[col] = _interpolate_column_small_gaps(out[col], gap_threshold)
|
|
386
|
+
|
|
387
|
+
return out
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _interpolate_column_small_gaps(
|
|
391
|
+
s: pd.Series,
|
|
392
|
+
gap_threshold: pd.Timedelta,
|
|
393
|
+
) -> pd.Series:
|
|
394
|
+
"""Time-interpolate a single column's gaps narrower than gap_threshold.
|
|
395
|
+
|
|
396
|
+
Points outside the column's valid range stay NaN (no extrapolation), and
|
|
397
|
+
any run of NaNs whose bounding valid samples are gap_threshold or more
|
|
398
|
+
apart is re-blanked so wide holes remain honest gaps.
|
|
399
|
+
"""
|
|
400
|
+
valid = s.notna()
|
|
401
|
+
if valid.sum() < 2:
|
|
402
|
+
return s
|
|
403
|
+
|
|
404
|
+
filled = s.interpolate(method="time", limit_area="inside")
|
|
405
|
+
|
|
406
|
+
# Re-blank interpolated points that fall inside a wide (>= gap_threshold)
|
|
407
|
+
# hole between two consecutive valid samples.
|
|
408
|
+
idx = s.index
|
|
409
|
+
valid_pos = np.flatnonzero(valid.to_numpy())
|
|
410
|
+
for a, b in zip(valid_pos[:-1], valid_pos[1:]):
|
|
411
|
+
if b > a + 1 and (idx[b] - idx[a]) >= gap_threshold:
|
|
412
|
+
filled.iloc[a + 1:b] = np.nan
|
|
380
413
|
|
|
381
|
-
return
|
|
414
|
+
return filled
|
|
382
415
|
|
|
383
416
|
|
|
384
417
|
def concatenate_with_gap_markers(
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
"""LTTB (Largest Triangle Three Buckets) downsampling with gap handling."""
|
|
2
2
|
|
|
3
|
-
import logging
|
|
4
3
|
import pandas as pd
|
|
5
4
|
import numpy as np
|
|
6
5
|
import lttbc
|
|
@@ -17,7 +16,7 @@ def downsample_lttb(
|
|
|
17
16
|
target_cadence: str | pd.Timedelta,
|
|
18
17
|
include_columns: list[str] | None = None,
|
|
19
18
|
gap_threshold: pd.Timedelta | None = None,
|
|
20
|
-
min_points_per_segment: int =
|
|
19
|
+
min_points_per_segment: int = 1,
|
|
21
20
|
source_cadence: pd.Timedelta | None = None,
|
|
22
21
|
) -> pd.DataFrame:
|
|
23
22
|
"""Perform LTTB downsampling on a pandas DataFrame.
|
|
@@ -26,10 +25,16 @@ def downsample_lttb(
|
|
|
26
25
|
preserves visual characteristics of the data by selecting points that
|
|
27
26
|
maximize the area of triangles formed with adjacent buckets.
|
|
28
27
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
(
|
|
32
|
-
|
|
28
|
+
Rows where the target column is NaN are treated as absent data, so
|
|
29
|
+
fill-encoded gaps (NaN rows present at nominal cadence) behave exactly
|
|
30
|
+
like missing-row gaps. Small gaps (below gap_threshold) are filled by
|
|
31
|
+
linear interpolation before downsampling so LTTB receives continuous
|
|
32
|
+
segments. Large gaps (at/above gap_threshold) split the data into
|
|
33
|
+
segments processed independently, with NaN marker rows inserted between
|
|
34
|
+
them. Segments always keep their first and last points, so gap
|
|
35
|
+
boundaries in the output match the source data exactly. Segments too
|
|
36
|
+
short for triangle selection are passed through as their first/last
|
|
37
|
+
points rather than dropped.
|
|
33
38
|
|
|
34
39
|
Args:
|
|
35
40
|
df_in: Input DataFrame with DatetimeIndex.
|
|
@@ -60,8 +65,16 @@ def downsample_lttb(
|
|
|
60
65
|
if gap_threshold is None:
|
|
61
66
|
gap_threshold = 2 * target_cadence
|
|
62
67
|
|
|
68
|
+
# A row without a valid target sample carries no signal for LTTB. Drop
|
|
69
|
+
# such rows before gap detection so that fill-encoded gaps (NaN rows at
|
|
70
|
+
# nominal cadence, as produced by CDF/NetCDF fill values) segment the
|
|
71
|
+
# data exactly like missing-row gaps.
|
|
72
|
+
df_valid = df_in.dropna(subset=[target_column])
|
|
73
|
+
if df_valid.empty:
|
|
74
|
+
return pd.DataFrame(columns=df_in.columns)
|
|
75
|
+
|
|
63
76
|
# Interpolate small gaps so LTTB receives continuous input
|
|
64
|
-
df_interp = interpolate_small_gaps(
|
|
77
|
+
df_interp = interpolate_small_gaps(df_valid, gap_threshold, source_cadence)
|
|
65
78
|
|
|
66
79
|
# Split at large gaps and process each segment
|
|
67
80
|
segments = split_at_gaps(df_interp, gap_threshold)
|
|
@@ -94,21 +107,31 @@ def _lttb_single_segment(
|
|
|
94
107
|
) -> pd.DataFrame | None:
|
|
95
108
|
"""Apply LTTB to a single contiguous segment.
|
|
96
109
|
|
|
110
|
+
LTTB selects real source timetags for the *target* column; every other
|
|
111
|
+
requested column is then **selected** at those timetags rather than
|
|
112
|
+
interpolated onto them. ``lttbc.downsample`` returns exact input points,
|
|
113
|
+
so the selected ``time_num`` values map back to the segment rows
|
|
114
|
+
unambiguously, and each ride-along column keeps its real measured value
|
|
115
|
+
(or the small-gap-interpolated value the preprocessing stage placed there).
|
|
116
|
+
This makes it impossible to fabricate a value: earlier code interpolated
|
|
117
|
+
include columns with ``np.interp``, which clamps to a column's edge value
|
|
118
|
+
for any timetag outside that column's valid range — stamping a foreign
|
|
119
|
+
constant across every gap. Selection cannot do that.
|
|
120
|
+
|
|
97
121
|
Args:
|
|
98
122
|
df: Input DataFrame (no gaps).
|
|
99
123
|
target_column: Column to optimize for.
|
|
100
124
|
target_cadence: Target cadence.
|
|
101
|
-
include_columns: Additional columns to
|
|
125
|
+
include_columns: Additional columns to carry (selected, not
|
|
126
|
+
interpolated). If None, all numeric columns are carried.
|
|
102
127
|
|
|
103
128
|
Returns:
|
|
104
129
|
Downsampled DataFrame or None if cannot process.
|
|
105
130
|
"""
|
|
106
|
-
# Compute number of output points
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
logging.warning("Cannot perform LTTB downsampling on less than 3 points")
|
|
111
|
-
return None
|
|
131
|
+
# Compute number of output points. Segments spanning less than the
|
|
132
|
+
# target cadence still keep their first and last points (lttbc handles
|
|
133
|
+
# n_out <= 2 gracefully) so short data islands survive downsampling.
|
|
134
|
+
n_out = max(2, compute_output_points(df.index[0], df.index[-1], target_cadence))
|
|
112
135
|
|
|
113
136
|
# Set up the data - convert time to numeric for LTTB algorithm
|
|
114
137
|
df_work = df.copy()
|
|
@@ -116,48 +139,38 @@ def _lttb_single_segment(
|
|
|
116
139
|
timeunit = '1min'
|
|
117
140
|
df_work['time_num'] = (df_work.index - timeref) / pd.to_timedelta(timeunit)
|
|
118
141
|
|
|
119
|
-
# Prepare data for LTTB (time_num, target_column)
|
|
142
|
+
# Prepare data for LTTB (time_num, target_column). The caller already
|
|
143
|
+
# drops NaN-target rows; this guard protects against lttbc silently
|
|
144
|
+
# converting any remaining NaN to 0.0 (a fabricated value).
|
|
120
145
|
df_clean = df_work[['time_num', target_column]].dropna()
|
|
121
146
|
|
|
122
|
-
if
|
|
123
|
-
logging.warning("Insufficient non-NaN data points for LTTB")
|
|
147
|
+
if df_clean.empty:
|
|
124
148
|
return None
|
|
125
149
|
|
|
126
150
|
# Apply LTTB downsampling (lttbc uses separate x, y arrays)
|
|
127
|
-
x_down,
|
|
151
|
+
x_down, _ = lttbc.downsample(
|
|
128
152
|
df_clean['time_num'].values,
|
|
129
153
|
df_clean[target_column].values,
|
|
130
154
|
n_out
|
|
131
155
|
)
|
|
132
|
-
df_resampled = pd.DataFrame(
|
|
133
|
-
{'time_num': x_down, target_column: y_down}
|
|
134
|
-
)
|
|
135
156
|
|
|
136
|
-
#
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
)
|
|
157
|
+
# Row-select the segment rows LTTB chose. ``x_down`` are exact members of
|
|
158
|
+
# the input ``time_num`` (lttbc returns real input points), so ``isin``
|
|
159
|
+
# selects exactly those rows — carrying every column's real value and the
|
|
160
|
+
# original datetime index, no interpolation.
|
|
161
|
+
selected = df_work[df_work['time_num'].isin(x_down)]
|
|
141
162
|
|
|
142
|
-
|
|
163
|
+
keep = [target_column]
|
|
143
164
|
for col in df.columns:
|
|
144
|
-
if col in
|
|
165
|
+
if col in ('time', 'time_num', target_column):
|
|
145
166
|
continue
|
|
146
167
|
if include_columns is not None and col not in include_columns:
|
|
147
168
|
continue
|
|
148
169
|
if not pd.api.types.is_numeric_dtype(df[col]):
|
|
149
170
|
continue
|
|
171
|
+
keep.append(col)
|
|
150
172
|
|
|
151
|
-
|
|
152
|
-
x=df_resampled['time_num'].values,
|
|
153
|
-
xp=df_work['time_num'].values,
|
|
154
|
-
fp=df_work[col].values
|
|
155
|
-
)
|
|
156
|
-
|
|
157
|
-
# Clean up
|
|
158
|
-
df_resampled = df_resampled.drop(['time_num'], axis=1)
|
|
159
|
-
|
|
160
|
-
return df_resampled
|
|
173
|
+
return selected[keep].copy()
|
|
161
174
|
|
|
162
175
|
|
|
163
176
|
def downsample_lttb_with_config(
|
|
@@ -24,7 +24,7 @@ def downsample_m4(
|
|
|
24
24
|
target_cadence: str | pd.Timedelta,
|
|
25
25
|
include_columns: list[str] | None = None,
|
|
26
26
|
gap_threshold: pd.Timedelta | None = None,
|
|
27
|
-
min_points_per_segment: int =
|
|
27
|
+
min_points_per_segment: int = 1,
|
|
28
28
|
deduplicate: bool = True,
|
|
29
29
|
collinearity_threshold: float | None = None,
|
|
30
30
|
target_points_per_bucket: float = 3.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: downsampler
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
|
|
5
5
|
Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -22,12 +22,6 @@ Requires-Dist: pandas>=1.3
|
|
|
22
22
|
Requires-Dist: lttbc>=0.3
|
|
23
23
|
Requires-Dist: scipy>=1.7
|
|
24
24
|
Requires-Dist: requests>=2.32.5
|
|
25
|
-
Provides-Extra: test
|
|
26
|
-
Requires-Dist: pytest>=7.0; extra == "test"
|
|
27
|
-
Requires-Dist: pytest-cov>=4.0; extra == "test"
|
|
28
|
-
Provides-Extra: dev
|
|
29
|
-
Requires-Dist: downsampler[test]; extra == "dev"
|
|
30
|
-
Requires-Dist: marimo; extra == "dev"
|
|
31
25
|
Dynamic: license-file
|
|
32
26
|
|
|
33
27
|
# downsampler
|
|
@@ -57,6 +51,13 @@ A Python package for time series DataFrame downsampling with LTTB, M4, multiple
|
|
|
57
51
|
pip install downsampler
|
|
58
52
|
```
|
|
59
53
|
|
|
54
|
+
> **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
|
|
55
|
+
> Linux wheel was compiled against NumPy 1.x and fails to import under
|
|
56
|
+
> NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
|
|
57
|
+
> `pip install --no-binary lttbc downsampler`. Other Python versions and
|
|
58
|
+
> macOS have no prebuilt wheel and build from source automatically. With
|
|
59
|
+
> uv, this repo's `[tool.uv] no-binary-package` setting handles it.
|
|
60
|
+
|
|
60
61
|
## Quick Start
|
|
61
62
|
|
|
62
63
|
### Basic Downsampling
|
|
@@ -237,7 +238,7 @@ print(summary_table(results))
|
|
|
237
238
|
| `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
|
|
238
239
|
| `edge_handling` | EdgeHandling | KEEP | How to handle edges |
|
|
239
240
|
| `edge_window` | int | 2 | Points at each edge |
|
|
240
|
-
| `min_points_per_segment` | int |
|
|
241
|
+
| `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
|
|
241
242
|
| `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
|
|
242
243
|
| `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
|
|
243
244
|
|
|
@@ -239,22 +239,36 @@ class TestInterpolateSmallGaps:
|
|
|
239
239
|
# Small gap filled (2 new points), large gap left alone
|
|
240
240
|
assert len(result) == 8
|
|
241
241
|
|
|
242
|
-
def
|
|
243
|
-
"""
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
242
|
+
def test_small_inplace_gap_filled_wide_preserved(self):
|
|
243
|
+
"""Small in-place NaN holes are bridged per-column; wide holes stay NaN.
|
|
244
|
+
|
|
245
|
+
(New 0.3.1 contract: small-gap filling is a per-column preprocessing
|
|
246
|
+
step, so a hole narrower than gap_threshold in an otherwise-dense column
|
|
247
|
+
is interpolated from that column's own samples, while a hole at or above
|
|
248
|
+
the threshold is preserved as an honest gap.)
|
|
249
|
+
"""
|
|
250
|
+
times = pd.date_range('2024-01-01', periods=20, freq='1min')
|
|
251
|
+
value = np.arange(20, dtype=float)
|
|
252
|
+
value[3] = np.nan # 1-min hole -> small, should fill
|
|
253
|
+
value[8:15] = np.nan # 7-min hole -> < 10min, should fill
|
|
254
|
+
df = pd.DataFrame({'value': value}, index=times)
|
|
255
|
+
|
|
256
|
+
small = interpolate_small_gaps(
|
|
257
|
+
df, gap_threshold=pd.Timedelta('10min'),
|
|
258
|
+
source_cadence=pd.Timedelta('1min'),
|
|
248
259
|
)
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
260
|
+
# Both holes are narrower than 10 min -> bridged (linear: value == index).
|
|
261
|
+
assert small['value'].isna().sum() == 0
|
|
262
|
+
assert small.iloc[3]['value'] == pytest.approx(3.0)
|
|
263
|
+
assert small.iloc[10]['value'] == pytest.approx(10.0)
|
|
264
|
+
|
|
265
|
+
# With a tighter threshold the 7-min hole is now "wide" -> preserved.
|
|
266
|
+
wide = interpolate_small_gaps(
|
|
267
|
+
df, gap_threshold=pd.Timedelta('5min'),
|
|
253
268
|
source_cadence=pd.Timedelta('1min'),
|
|
254
269
|
)
|
|
255
|
-
|
|
256
|
-
#
|
|
257
|
-
assert np.isnan(result.iloc[1]['value'])
|
|
270
|
+
assert not np.isnan(wide.iloc[3]['value']) # 1-min hole still filled
|
|
271
|
+
assert wide['value'].iloc[8:14].isna().all() # 7-min hole preserved
|
|
258
272
|
|
|
259
273
|
def test_auto_cadence_estimation(self):
|
|
260
274
|
"""Test that source_cadence is auto-estimated when None."""
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
"""Tests for LTTB downsampling."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from downsampler.lttb import downsample_lttb, downsample_lttb_with_config
|
|
8
|
+
from downsampler.config import DownsampleConfig, AggregationMethod
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TestDownsampleLttb:
|
|
12
|
+
"""Tests for LTTB downsampling function."""
|
|
13
|
+
|
|
14
|
+
def test_basic_lttb(self, sine_df):
|
|
15
|
+
"""Test basic LTTB downsampling."""
|
|
16
|
+
result = downsample_lttb(
|
|
17
|
+
sine_df,
|
|
18
|
+
target_column='signal',
|
|
19
|
+
target_cadence='PT10S'
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
assert len(result) < len(sine_df)
|
|
23
|
+
assert 'signal' in result.columns
|
|
24
|
+
|
|
25
|
+
def test_preserves_extreme_values(self, sine_df):
|
|
26
|
+
"""Test that LTTB preserves extreme values reasonably well."""
|
|
27
|
+
result = downsample_lttb(
|
|
28
|
+
sine_df,
|
|
29
|
+
target_column='signal',
|
|
30
|
+
target_cadence='PT10S'
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Check that max/min are close to original
|
|
34
|
+
orig_max = sine_df['signal'].max()
|
|
35
|
+
orig_min = sine_df['signal'].min()
|
|
36
|
+
result_max = result['signal'].max()
|
|
37
|
+
result_min = result['signal'].min()
|
|
38
|
+
|
|
39
|
+
# Allow 10% tolerance
|
|
40
|
+
assert abs(result_max - orig_max) < 0.1 * abs(orig_max)
|
|
41
|
+
assert abs(result_min - orig_min) < 0.1 * abs(orig_min - orig_max)
|
|
42
|
+
|
|
43
|
+
def test_include_columns(self, sine_df):
|
|
44
|
+
"""Test including additional columns."""
|
|
45
|
+
result = downsample_lttb(
|
|
46
|
+
sine_df,
|
|
47
|
+
target_column='signal',
|
|
48
|
+
target_cadence='PT10S',
|
|
49
|
+
include_columns=['signal', 'noise']
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
assert 'signal' in result.columns
|
|
53
|
+
assert 'noise' in result.columns
|
|
54
|
+
|
|
55
|
+
def test_gap_handling(self, gappy_df):
|
|
56
|
+
"""Test LTTB with gappy data."""
|
|
57
|
+
# Add a target column
|
|
58
|
+
gappy_df['signal'] = np.sin(np.linspace(0, 4 * np.pi, len(gappy_df)))
|
|
59
|
+
|
|
60
|
+
result = downsample_lttb(
|
|
61
|
+
gappy_df,
|
|
62
|
+
target_column='signal',
|
|
63
|
+
target_cadence='PT5M',
|
|
64
|
+
gap_threshold=pd.Timedelta('30min')
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# Should produce output from both segments
|
|
68
|
+
assert len(result) > 0
|
|
69
|
+
|
|
70
|
+
def test_insufficient_points(self):
|
|
71
|
+
"""Test handling of insufficient points."""
|
|
72
|
+
small_df = pd.DataFrame(
|
|
73
|
+
{'value': [1, 2]},
|
|
74
|
+
index=pd.date_range('2024-01-01', periods=2, freq='1s')
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
result = downsample_lttb(
|
|
78
|
+
small_df,
|
|
79
|
+
target_column='value',
|
|
80
|
+
target_cadence='PT10S',
|
|
81
|
+
min_points_per_segment=3
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Should return empty or minimal result
|
|
85
|
+
assert len(result) == 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class TestLttbGapHandling:
|
|
89
|
+
"""Tests for LTTB gap handling behavior."""
|
|
90
|
+
|
|
91
|
+
def test_lttb_inserts_nan_markers_at_large_gaps(self):
|
|
92
|
+
"""Test that LTTB output contains NaN markers between segments."""
|
|
93
|
+
# Two segments with a large gap between them
|
|
94
|
+
times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
|
|
95
|
+
times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
|
|
96
|
+
t1 = np.linspace(0, 4 * np.pi, 100)
|
|
97
|
+
t2 = np.linspace(0, 4 * np.pi, 100)
|
|
98
|
+
df = pd.concat([
|
|
99
|
+
pd.DataFrame({'signal': np.sin(t1)}, index=times1),
|
|
100
|
+
pd.DataFrame({'signal': np.sin(t2)}, index=times2),
|
|
101
|
+
])
|
|
102
|
+
|
|
103
|
+
result = downsample_lttb(
|
|
104
|
+
df,
|
|
105
|
+
target_column='signal',
|
|
106
|
+
target_cadence='PT10S',
|
|
107
|
+
gap_threshold=pd.Timedelta('5min'),
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
# Should have NaN markers
|
|
111
|
+
assert result['signal'].isna().sum() > 0
|
|
112
|
+
|
|
113
|
+
def test_lttb_interpolates_small_gaps(self):
|
|
114
|
+
"""Test that small gaps are filled before LTTB processes."""
|
|
115
|
+
# Data with a small 5-second gap (below gap_threshold of 30s)
|
|
116
|
+
times = list(pd.date_range('2024-01-01 00:00', periods=50, freq='1s'))
|
|
117
|
+
# Remove 3 points to create a small gap
|
|
118
|
+
del times[25:28]
|
|
119
|
+
t = np.linspace(0, 4 * np.pi, len(times))
|
|
120
|
+
df = pd.DataFrame({'signal': np.sin(t)}, index=pd.DatetimeIndex(times))
|
|
121
|
+
|
|
122
|
+
result = downsample_lttb(
|
|
123
|
+
df,
|
|
124
|
+
target_column='signal',
|
|
125
|
+
target_cadence='PT5S',
|
|
126
|
+
gap_threshold=pd.Timedelta('30s'),
|
|
127
|
+
source_cadence=pd.Timedelta('1s'),
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
# Should NOT have NaN markers (gap was small and interpolated)
|
|
131
|
+
assert result['signal'].isna().sum() == 0
|
|
132
|
+
assert len(result) > 0
|
|
133
|
+
|
|
134
|
+
def test_lttb_source_cadence_via_config(self):
|
|
135
|
+
"""Test that source_cadence is passed through config."""
|
|
136
|
+
times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
|
|
137
|
+
times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
|
|
138
|
+
t = np.linspace(0, 4 * np.pi, 100)
|
|
139
|
+
df = pd.concat([
|
|
140
|
+
pd.DataFrame({'signal': np.sin(t)}, index=times1),
|
|
141
|
+
pd.DataFrame({'signal': np.sin(t)}, index=times2),
|
|
142
|
+
])
|
|
143
|
+
|
|
144
|
+
config = DownsampleConfig(
|
|
145
|
+
method=AggregationMethod.LTTB,
|
|
146
|
+
lttb_target_column='signal',
|
|
147
|
+
source_cadence='PT1S',
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
result = downsample_lttb_with_config(df, 'PT10S', config)
|
|
151
|
+
assert len(result) > 0
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class TestLttbNanEncodedGaps:
|
|
155
|
+
"""Fill-encoded gaps (NaN rows) must behave like missing-row gaps."""
|
|
156
|
+
|
|
157
|
+
@staticmethod
|
|
158
|
+
def _make_gappy(encoding: str) -> pd.DataFrame:
|
|
159
|
+
"""4 hours of 1-min data with a 45-min gap from 01:00 to 01:45."""
|
|
160
|
+
idx = pd.date_range('2024-01-01', periods=240, freq='1min')
|
|
161
|
+
values = 500 + 10 * np.sin(np.arange(240) / 20.0)
|
|
162
|
+
other = 100 + np.cos(np.arange(240) / 10.0)
|
|
163
|
+
df = pd.DataFrame({'signal': values, 'other': other}, index=idx)
|
|
164
|
+
gap_rows = idx[60:105]
|
|
165
|
+
if encoding == 'nan_rows':
|
|
166
|
+
df.loc[gap_rows, ['signal', 'other']] = np.nan
|
|
167
|
+
return df
|
|
168
|
+
return df.drop(gap_rows)
|
|
169
|
+
|
|
170
|
+
@pytest.mark.parametrize('encoding', ['nan_rows', 'missing_rows'])
|
|
171
|
+
def test_gap_boundaries_preserved_exactly(self, encoding):
|
|
172
|
+
"""Last point before and first point after a gap must survive."""
|
|
173
|
+
df = self._make_gappy(encoding)
|
|
174
|
+
|
|
175
|
+
result = downsample_lttb(
|
|
176
|
+
df, target_column='signal', target_cadence='PT15M',
|
|
177
|
+
source_cadence=pd.Timedelta('1min'),
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# LTTB keeps first/last of each segment: the valid samples
|
|
181
|
+
# adjacent to the gap must be in the output.
|
|
182
|
+
assert pd.Timestamp('2024-01-01 00:59') in result.index
|
|
183
|
+
assert pd.Timestamp('2024-01-01 01:45') in result.index
|
|
184
|
+
|
|
185
|
+
def test_nan_rows_equivalent_to_missing_rows(self):
|
|
186
|
+
"""Both gap encodings must produce identical output."""
|
|
187
|
+
result_nan = downsample_lttb(
|
|
188
|
+
self._make_gappy('nan_rows'),
|
|
189
|
+
target_column='signal', target_cadence='PT15M',
|
|
190
|
+
source_cadence=pd.Timedelta('1min'),
|
|
191
|
+
)
|
|
192
|
+
result_missing = downsample_lttb(
|
|
193
|
+
self._make_gappy('missing_rows'),
|
|
194
|
+
target_column='signal', target_cadence='PT15M',
|
|
195
|
+
source_cadence=pd.Timedelta('1min'),
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
pd.testing.assert_frame_equal(result_nan, result_missing)
|
|
199
|
+
|
|
200
|
+
def test_nan_gap_produces_marker(self):
|
|
201
|
+
"""A NaN-encoded gap must yield a NaN marker row in the output."""
|
|
202
|
+
result = downsample_lttb(
|
|
203
|
+
self._make_gappy('nan_rows'),
|
|
204
|
+
target_column='signal', target_cadence='PT15M',
|
|
205
|
+
source_cadence=pd.Timedelta('1min'),
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
markers = result[result['signal'].isna()]
|
|
209
|
+
assert len(markers) == 1
|
|
210
|
+
# Marker sits just after the last point of the pre-gap segment
|
|
211
|
+
assert pd.Timestamp('2024-01-01 00:59') < markers.index[0]
|
|
212
|
+
assert markers.index[0] < pd.Timestamp('2024-01-01 01:45')
|
|
213
|
+
|
|
214
|
+
def test_all_nan_target_returns_empty(self):
|
|
215
|
+
"""A frame whose target column is entirely NaN yields no output."""
|
|
216
|
+
idx = pd.date_range('2024-01-01', periods=100, freq='1min')
|
|
217
|
+
df = pd.DataFrame({'signal': np.nan, 'other': 1.0}, index=idx)
|
|
218
|
+
|
|
219
|
+
result = downsample_lttb(df, target_column='signal', target_cadence='PT15M')
|
|
220
|
+
|
|
221
|
+
assert len(result) == 0
|
|
222
|
+
assert list(result.columns) == ['signal', 'other']
|
|
223
|
+
|
|
224
|
+
def test_include_column_own_nan_holes(self):
|
|
225
|
+
"""Include columns interpolate from their own valid samples only."""
|
|
226
|
+
idx = pd.date_range('2024-01-01', periods=240, freq='1min')
|
|
227
|
+
df = pd.DataFrame({
|
|
228
|
+
'signal': 500 + 10 * np.sin(np.arange(240) / 20.0),
|
|
229
|
+
'other': 100.0,
|
|
230
|
+
}, index=idx)
|
|
231
|
+
# Hole in 'other' only — target column is complete
|
|
232
|
+
df.loc[idx[100:110], 'other'] = np.nan
|
|
233
|
+
|
|
234
|
+
result = downsample_lttb(
|
|
235
|
+
df, target_column='signal', target_cadence='PT15M',
|
|
236
|
+
source_cadence=pd.Timedelta('1min'),
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
# 'other' is constant 100 outside its hole; the small-gap preprocessing
|
|
240
|
+
# bridges it per-column rather than propagating NaN.
|
|
241
|
+
assert result['other'].isna().sum() == 0
|
|
242
|
+
assert (result['other'] == 100.0).all()
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
class TestLttbIncludeSelection:
|
|
246
|
+
"""Include columns are row-SELECTED at LTTB timetags, never interpolated.
|
|
247
|
+
|
|
248
|
+
Regression coverage for the np.interp edge-clamp bug: interpolating an
|
|
249
|
+
include column onto the target's timetags stamped the column's edge value
|
|
250
|
+
across every timetag outside its valid range (a constant foreign peak at
|
|
251
|
+
each gap). Selection cannot fabricate values.
|
|
252
|
+
"""
|
|
253
|
+
|
|
254
|
+
def test_high_window_does_not_bleed_into_quiet_period(self):
|
|
255
|
+
"""A column valid only in a late high window must not appear earlier."""
|
|
256
|
+
idx = pd.date_range('2024-01-01 00:00', periods=24 * 60, freq='1min')
|
|
257
|
+
signal = 350 + 10 * np.sin(np.arange(len(idx)) / 30.0) # dense all day
|
|
258
|
+
other = np.full(len(idx), np.nan)
|
|
259
|
+
hi = (idx >= '2024-01-01 20:00') & (idx <= '2024-01-01 21:00')
|
|
260
|
+
other[hi] = 602.9 # the ONLY place 'other' has data
|
|
261
|
+
df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
|
|
262
|
+
|
|
263
|
+
result = downsample_lttb(
|
|
264
|
+
df, target_column='signal', target_cadence='PT5M',
|
|
265
|
+
source_cadence=pd.Timedelta('1min'),
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
# No 'other' value may appear at a timetag outside its valid window,
|
|
269
|
+
# and certainly not the clamped constant 602.9 in the quiet morning.
|
|
270
|
+
nonnull = result['other'].dropna()
|
|
271
|
+
assert (nonnull.index >= pd.Timestamp('2024-01-01 20:00')).all()
|
|
272
|
+
assert (nonnull.index <= pd.Timestamp('2024-01-01 21:00')).all()
|
|
273
|
+
morning = result.loc[result.index < pd.Timestamp('2024-01-01 19:00'), 'other']
|
|
274
|
+
assert morning.isna().all()
|
|
275
|
+
|
|
276
|
+
def test_ridealong_wide_hole_stays_nan(self):
|
|
277
|
+
"""A hole wider than gap_threshold in a ride-along column stays NaN."""
|
|
278
|
+
idx = pd.date_range('2024-01-01', periods=240, freq='1min')
|
|
279
|
+
signal = 500 + 10 * np.sin(np.arange(240) / 20.0) # complete target
|
|
280
|
+
other = np.full(240, 100.0)
|
|
281
|
+
other[60:150] = np.nan # 90-min hole >> 30-min gap_threshold (PT15M)
|
|
282
|
+
df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
|
|
283
|
+
|
|
284
|
+
result = downsample_lttb(
|
|
285
|
+
df, target_column='signal', target_cadence='PT15M',
|
|
286
|
+
source_cadence=pd.Timedelta('1min'),
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
in_hole = result[(result.index > idx[60]) & (result.index < idx[149])]
|
|
290
|
+
assert not in_hole.empty # target is dense there, so timetags exist
|
|
291
|
+
assert in_hole['other'].isna().all()
|
|
292
|
+
|
|
293
|
+
def test_include_values_are_exact_source_selections(self):
|
|
294
|
+
"""Each carried include value equals the real source sample (not interp)."""
|
|
295
|
+
idx = pd.date_range('2024-01-01', periods=240, freq='1min')
|
|
296
|
+
rng = np.random.default_rng(0)
|
|
297
|
+
signal = 500 + 50 * np.sin(np.arange(240) / 15.0) + rng.normal(0, 3, 240)
|
|
298
|
+
other = 100 + rng.normal(0, 10, 240) # noisy, dense (no gaps)
|
|
299
|
+
df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
|
|
300
|
+
|
|
301
|
+
result = downsample_lttb(
|
|
302
|
+
df, target_column='signal', target_cadence='PT15M',
|
|
303
|
+
source_cadence=pd.Timedelta('1min'),
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
for t, v in result['other'].dropna().items():
|
|
307
|
+
assert v == pytest.approx(df.loc[t, 'other'])
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
class TestLttbShortSegments:
|
|
311
|
+
"""Short data islands must survive instead of being dropped."""
|
|
312
|
+
|
|
313
|
+
def test_short_segment_keeps_first_and_last(self):
|
|
314
|
+
"""A segment spanning less than the target cadence keeps its extent."""
|
|
315
|
+
# 10 minutes of data at 1-min cadence, target PT1H: old behavior
|
|
316
|
+
# dropped this entirely (n_out < 3).
|
|
317
|
+
idx = pd.date_range('2024-01-01', periods=10, freq='1min')
|
|
318
|
+
df = pd.DataFrame({'signal': np.linspace(1.0, 2.0, 10)}, index=idx)
|
|
319
|
+
|
|
320
|
+
result = downsample_lttb(df, target_column='signal', target_cadence='PT1H')
|
|
321
|
+
|
|
322
|
+
assert idx[0] in result.index
|
|
323
|
+
assert idx[-1] in result.index
|
|
324
|
+
|
|
325
|
+
def test_intermittent_islands_all_survive(self):
|
|
326
|
+
"""Several short islands between gaps each keep points and markers."""
|
|
327
|
+
frames = []
|
|
328
|
+
for hour in [0, 3, 6]: # gaps well above the 2x PT1H auto threshold
|
|
329
|
+
idx = pd.date_range(f'2024-01-01 {hour:02d}:00', periods=8, freq='1min')
|
|
330
|
+
frames.append(pd.DataFrame(
|
|
331
|
+
{'signal': np.full(8, float(hour + 1))}, index=idx
|
|
332
|
+
))
|
|
333
|
+
df = pd.concat(frames)
|
|
334
|
+
|
|
335
|
+
result = downsample_lttb(df, target_column='signal', target_cadence='PT1H')
|
|
336
|
+
|
|
337
|
+
# All three islands present with their values
|
|
338
|
+
for hour in [0, 3, 6]:
|
|
339
|
+
assert (result['signal'] == hour + 1).any()
|
|
340
|
+
# Two NaN markers between three segments
|
|
341
|
+
assert result['signal'].isna().sum() == 2
|
|
342
|
+
|
|
343
|
+
def test_single_isolated_point_survives(self):
|
|
344
|
+
"""A lone valid sample between gaps is kept, bracketed by markers."""
|
|
345
|
+
idx1 = pd.date_range('2024-01-01 00:00', periods=30, freq='1min')
|
|
346
|
+
idx2 = pd.date_range('2024-01-01 02:00', periods=30, freq='1min')
|
|
347
|
+
lone = pd.Timestamp('2024-01-01 01:00')
|
|
348
|
+
df = pd.concat([
|
|
349
|
+
pd.DataFrame({'signal': 1.0}, index=idx1),
|
|
350
|
+
pd.DataFrame({'signal': 99.0}, index=[lone]),
|
|
351
|
+
pd.DataFrame({'signal': 2.0}, index=idx2),
|
|
352
|
+
])
|
|
353
|
+
|
|
354
|
+
result = downsample_lttb(
|
|
355
|
+
df, target_column='signal', target_cadence='PT5M',
|
|
356
|
+
source_cadence=pd.Timedelta('1min'),
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
assert lone in result.index
|
|
360
|
+
assert result.loc[lone, 'signal'] == 99.0
|
|
361
|
+
# Markers on both sides of the lone point
|
|
362
|
+
pos = result.index.get_loc(lone)
|
|
363
|
+
assert np.isnan(result['signal'].iloc[pos - 1])
|
|
364
|
+
assert np.isnan(result['signal'].iloc[pos + 1])
|
|
365
|
+
|
|
366
|
+
def test_min_points_per_segment_still_filters(self):
|
|
367
|
+
"""An explicit min_points_per_segment above 1 still discards."""
|
|
368
|
+
idx1 = pd.date_range('2024-01-01 00:00', periods=30, freq='1min')
|
|
369
|
+
idx2 = pd.date_range('2024-01-01 02:00', periods=2, freq='1min')
|
|
370
|
+
df = pd.concat([
|
|
371
|
+
pd.DataFrame({'signal': 1.0}, index=idx1),
|
|
372
|
+
pd.DataFrame({'signal': 2.0}, index=idx2),
|
|
373
|
+
])
|
|
374
|
+
|
|
375
|
+
result = downsample_lttb(
|
|
376
|
+
df, target_column='signal', target_cadence='PT5M',
|
|
377
|
+
min_points_per_segment=3,
|
|
378
|
+
source_cadence=pd.Timedelta('1min'),
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
assert not (result['signal'] == 2.0).any()
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
class TestDownsampleLttbWithConfig:
|
|
385
|
+
"""Tests for LTTB downsampling with config."""
|
|
386
|
+
|
|
387
|
+
def test_with_config(self, sine_df):
|
|
388
|
+
"""Test LTTB with full configuration."""
|
|
389
|
+
config = DownsampleConfig(
|
|
390
|
+
method=AggregationMethod.LTTB,
|
|
391
|
+
lttb_target_column='signal',
|
|
392
|
+
min_points_per_segment=5
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
result = downsample_lttb_with_config(
|
|
396
|
+
sine_df,
|
|
397
|
+
'PT10S',
|
|
398
|
+
config
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
assert len(result) > 0
|
|
402
|
+
|
|
403
|
+
def test_missing_target_column_raises(self, sine_df):
|
|
404
|
+
"""Test that missing target column raises error."""
|
|
405
|
+
config = DownsampleConfig(method=AggregationMethod.LTTB)
|
|
406
|
+
|
|
407
|
+
with pytest.raises(ValueError, match="lttb_target_column"):
|
|
408
|
+
downsample_lttb_with_config(sine_df, 'PT10S', config)
|
|
@@ -1,178 +0,0 @@
|
|
|
1
|
-
"""Tests for LTTB downsampling."""
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
import pandas as pd
|
|
5
|
-
import numpy as np
|
|
6
|
-
|
|
7
|
-
from downsampler.lttb import downsample_lttb, downsample_lttb_with_config
|
|
8
|
-
from downsampler.config import DownsampleConfig, AggregationMethod
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class TestDownsampleLttb:
|
|
12
|
-
"""Tests for LTTB downsampling function."""
|
|
13
|
-
|
|
14
|
-
def test_basic_lttb(self, sine_df):
|
|
15
|
-
"""Test basic LTTB downsampling."""
|
|
16
|
-
result = downsample_lttb(
|
|
17
|
-
sine_df,
|
|
18
|
-
target_column='signal',
|
|
19
|
-
target_cadence='PT10S'
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
assert len(result) < len(sine_df)
|
|
23
|
-
assert 'signal' in result.columns
|
|
24
|
-
|
|
25
|
-
def test_preserves_extreme_values(self, sine_df):
|
|
26
|
-
"""Test that LTTB preserves extreme values reasonably well."""
|
|
27
|
-
result = downsample_lttb(
|
|
28
|
-
sine_df,
|
|
29
|
-
target_column='signal',
|
|
30
|
-
target_cadence='PT10S'
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
# Check that max/min are close to original
|
|
34
|
-
orig_max = sine_df['signal'].max()
|
|
35
|
-
orig_min = sine_df['signal'].min()
|
|
36
|
-
result_max = result['signal'].max()
|
|
37
|
-
result_min = result['signal'].min()
|
|
38
|
-
|
|
39
|
-
# Allow 10% tolerance
|
|
40
|
-
assert abs(result_max - orig_max) < 0.1 * abs(orig_max)
|
|
41
|
-
assert abs(result_min - orig_min) < 0.1 * abs(orig_min - orig_max)
|
|
42
|
-
|
|
43
|
-
def test_include_columns(self, sine_df):
|
|
44
|
-
"""Test including additional columns."""
|
|
45
|
-
result = downsample_lttb(
|
|
46
|
-
sine_df,
|
|
47
|
-
target_column='signal',
|
|
48
|
-
target_cadence='PT10S',
|
|
49
|
-
include_columns=['signal', 'noise']
|
|
50
|
-
)
|
|
51
|
-
|
|
52
|
-
assert 'signal' in result.columns
|
|
53
|
-
assert 'noise' in result.columns
|
|
54
|
-
|
|
55
|
-
def test_gap_handling(self, gappy_df):
|
|
56
|
-
"""Test LTTB with gappy data."""
|
|
57
|
-
# Add a target column
|
|
58
|
-
gappy_df['signal'] = np.sin(np.linspace(0, 4 * np.pi, len(gappy_df)))
|
|
59
|
-
|
|
60
|
-
result = downsample_lttb(
|
|
61
|
-
gappy_df,
|
|
62
|
-
target_column='signal',
|
|
63
|
-
target_cadence='PT5M',
|
|
64
|
-
gap_threshold=pd.Timedelta('30min')
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
# Should produce output from both segments
|
|
68
|
-
assert len(result) > 0
|
|
69
|
-
|
|
70
|
-
def test_insufficient_points(self):
|
|
71
|
-
"""Test handling of insufficient points."""
|
|
72
|
-
small_df = pd.DataFrame(
|
|
73
|
-
{'value': [1, 2]},
|
|
74
|
-
index=pd.date_range('2024-01-01', periods=2, freq='1s')
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
result = downsample_lttb(
|
|
78
|
-
small_df,
|
|
79
|
-
target_column='value',
|
|
80
|
-
target_cadence='PT10S',
|
|
81
|
-
min_points_per_segment=3
|
|
82
|
-
)
|
|
83
|
-
|
|
84
|
-
# Should return empty or minimal result
|
|
85
|
-
assert len(result) == 0
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
class TestLttbGapHandling:
|
|
89
|
-
"""Tests for LTTB gap handling behavior."""
|
|
90
|
-
|
|
91
|
-
def test_lttb_inserts_nan_markers_at_large_gaps(self):
|
|
92
|
-
"""Test that LTTB output contains NaN markers between segments."""
|
|
93
|
-
# Two segments with a large gap between them
|
|
94
|
-
times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
|
|
95
|
-
times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
|
|
96
|
-
t1 = np.linspace(0, 4 * np.pi, 100)
|
|
97
|
-
t2 = np.linspace(0, 4 * np.pi, 100)
|
|
98
|
-
df = pd.concat([
|
|
99
|
-
pd.DataFrame({'signal': np.sin(t1)}, index=times1),
|
|
100
|
-
pd.DataFrame({'signal': np.sin(t2)}, index=times2),
|
|
101
|
-
])
|
|
102
|
-
|
|
103
|
-
result = downsample_lttb(
|
|
104
|
-
df,
|
|
105
|
-
target_column='signal',
|
|
106
|
-
target_cadence='PT10S',
|
|
107
|
-
gap_threshold=pd.Timedelta('5min'),
|
|
108
|
-
)
|
|
109
|
-
|
|
110
|
-
# Should have NaN markers
|
|
111
|
-
assert result['signal'].isna().sum() > 0
|
|
112
|
-
|
|
113
|
-
def test_lttb_interpolates_small_gaps(self):
|
|
114
|
-
"""Test that small gaps are filled before LTTB processes."""
|
|
115
|
-
# Data with a small 5-second gap (below gap_threshold of 30s)
|
|
116
|
-
times = list(pd.date_range('2024-01-01 00:00', periods=50, freq='1s'))
|
|
117
|
-
# Remove 3 points to create a small gap
|
|
118
|
-
del times[25:28]
|
|
119
|
-
t = np.linspace(0, 4 * np.pi, len(times))
|
|
120
|
-
df = pd.DataFrame({'signal': np.sin(t)}, index=pd.DatetimeIndex(times))
|
|
121
|
-
|
|
122
|
-
result = downsample_lttb(
|
|
123
|
-
df,
|
|
124
|
-
target_column='signal',
|
|
125
|
-
target_cadence='PT5S',
|
|
126
|
-
gap_threshold=pd.Timedelta('30s'),
|
|
127
|
-
source_cadence=pd.Timedelta('1s'),
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
# Should NOT have NaN markers (gap was small and interpolated)
|
|
131
|
-
assert result['signal'].isna().sum() == 0
|
|
132
|
-
assert len(result) > 0
|
|
133
|
-
|
|
134
|
-
def test_lttb_source_cadence_via_config(self):
|
|
135
|
-
"""Test that source_cadence is passed through config."""
|
|
136
|
-
times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
|
|
137
|
-
times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
|
|
138
|
-
t = np.linspace(0, 4 * np.pi, 100)
|
|
139
|
-
df = pd.concat([
|
|
140
|
-
pd.DataFrame({'signal': np.sin(t)}, index=times1),
|
|
141
|
-
pd.DataFrame({'signal': np.sin(t)}, index=times2),
|
|
142
|
-
])
|
|
143
|
-
|
|
144
|
-
config = DownsampleConfig(
|
|
145
|
-
method=AggregationMethod.LTTB,
|
|
146
|
-
lttb_target_column='signal',
|
|
147
|
-
source_cadence='PT1S',
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
result = downsample_lttb_with_config(df, 'PT10S', config)
|
|
151
|
-
assert len(result) > 0
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
class TestDownsampleLttbWithConfig:
|
|
155
|
-
"""Tests for LTTB downsampling with config."""
|
|
156
|
-
|
|
157
|
-
def test_with_config(self, sine_df):
|
|
158
|
-
"""Test LTTB with full configuration."""
|
|
159
|
-
config = DownsampleConfig(
|
|
160
|
-
method=AggregationMethod.LTTB,
|
|
161
|
-
lttb_target_column='signal',
|
|
162
|
-
min_points_per_segment=5
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
result = downsample_lttb_with_config(
|
|
166
|
-
sine_df,
|
|
167
|
-
'PT10S',
|
|
168
|
-
config
|
|
169
|
-
)
|
|
170
|
-
|
|
171
|
-
assert len(result) > 0
|
|
172
|
-
|
|
173
|
-
def test_missing_target_column_raises(self, sine_df):
|
|
174
|
-
"""Test that missing target column raises error."""
|
|
175
|
-
config = DownsampleConfig(method=AggregationMethod.LTTB)
|
|
176
|
-
|
|
177
|
-
with pytest.raises(ValueError, match="lttb_target_column"):
|
|
178
|
-
downsample_lttb_with_config(sine_df, 'PT10S', config)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|