downsampler 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {downsampler-0.2.0/src/downsampler.egg-info → downsampler-0.3.0}/PKG-INFO +9 -8
  2. {downsampler-0.2.0 → downsampler-0.3.0}/README.md +8 -1
  3. {downsampler-0.2.0 → downsampler-0.3.0}/pyproject.toml +12 -3
  4. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/config.py +4 -2
  5. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/gaps.py +8 -3
  6. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/lttb.py +40 -22
  7. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/m4.py +1 -1
  8. {downsampler-0.2.0 → downsampler-0.3.0/src/downsampler.egg-info}/PKG-INFO +9 -8
  9. downsampler-0.3.0/src/downsampler.egg-info/requires.txt +5 -0
  10. downsampler-0.3.0/tests/test_lttb.py +343 -0
  11. downsampler-0.2.0/src/downsampler.egg-info/requires.txt +0 -13
  12. downsampler-0.2.0/tests/test_lttb.py +0 -178
  13. {downsampler-0.2.0 → downsampler-0.3.0}/LICENSE +0 -0
  14. {downsampler-0.2.0 → downsampler-0.3.0}/setup.cfg +0 -0
  15. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/__init__.py +0 -0
  16. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/aggregators.py +0 -0
  17. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/core.py +0 -0
  18. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/edges.py +0 -0
  19. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/fidelity/__init__.py +0 -0
  20. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/fidelity/comparison.py +0 -0
  21. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/fidelity/metrics.py +0 -0
  22. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/ranged.py +0 -0
  23. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler/utils.py +0 -0
  24. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler.egg-info/SOURCES.txt +0 -0
  25. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler.egg-info/dependency_links.txt +0 -0
  26. {downsampler-0.2.0 → downsampler-0.3.0}/src/downsampler.egg-info/top_level.txt +0 -0
  27. {downsampler-0.2.0 → downsampler-0.3.0}/tests/test_aggregators.py +0 -0
  28. {downsampler-0.2.0 → downsampler-0.3.0}/tests/test_core.py +0 -0
  29. {downsampler-0.2.0 → downsampler-0.3.0}/tests/test_edges.py +0 -0
  30. {downsampler-0.2.0 → downsampler-0.3.0}/tests/test_fidelity.py +0 -0
  31. {downsampler-0.2.0 → downsampler-0.3.0}/tests/test_gaps.py +0 -0
  32. {downsampler-0.2.0 → downsampler-0.3.0}/tests/test_m4.py +0 -0
  33. {downsampler-0.2.0 → downsampler-0.3.0}/tests/test_ranged.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: downsampler
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
5
5
  Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
6
6
  License-Expression: MIT
@@ -22,12 +22,6 @@ Requires-Dist: pandas>=1.3
22
22
  Requires-Dist: lttbc>=0.3
23
23
  Requires-Dist: scipy>=1.7
24
24
  Requires-Dist: requests>=2.32.5
25
- Provides-Extra: test
26
- Requires-Dist: pytest>=7.0; extra == "test"
27
- Requires-Dist: pytest-cov>=4.0; extra == "test"
28
- Provides-Extra: dev
29
- Requires-Dist: downsampler[test]; extra == "dev"
30
- Requires-Dist: marimo; extra == "dev"
31
25
  Dynamic: license-file
32
26
 
33
27
  # downsampler
@@ -57,6 +51,13 @@ A Python package for time series DataFrame downsampling with LTTB, M4, multiple
57
51
  pip install downsampler
58
52
  ```
59
53
 
54
+ > **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
55
+ > Linux wheel was compiled against NumPy 1.x and fails to import under
56
+ > NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
57
+ > `pip install --no-binary lttbc downsampler`. Other Python versions and
58
+ > macOS have no prebuilt wheel and build from source automatically. With
59
+ > uv, this repo's `[tool.uv] no-binary-package` setting handles it.
60
+
60
61
  ## Quick Start
61
62
 
62
63
  ### Basic Downsampling
@@ -237,7 +238,7 @@ print(summary_table(results))
237
238
  | `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
238
239
  | `edge_handling` | EdgeHandling | KEEP | How to handle edges |
239
240
  | `edge_window` | int | 2 | Points at each edge |
240
- | `min_points_per_segment` | int | 3 | Min points for processing |
241
+ | `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
241
242
  | `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
242
243
  | `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
243
244
 
@@ -25,6 +25,13 @@ A Python package for time series DataFrame downsampling with LTTB, M4, multiple
25
25
  pip install downsampler
26
26
  ```
27
27
 
28
+ > **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
29
+ > Linux wheel was compiled against NumPy 1.x and fails to import under
30
+ > NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
31
+ > `pip install --no-binary lttbc downsampler`. Other Python versions and
32
+ > macOS have no prebuilt wheel and build from source automatically. With
33
+ > uv, this repo's `[tool.uv] no-binary-package` setting handles it.
34
+
28
35
  ## Quick Start
29
36
 
30
37
  ### Basic Downsampling
@@ -205,7 +212,7 @@ print(summary_table(results))
205
212
  | `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
206
213
  | `edge_handling` | EdgeHandling | KEEP | How to handle edges |
207
214
  | `edge_window` | int | 2 | Points at each edge |
208
- | `min_points_per_segment` | int | 3 | Min points for processing |
215
+ | `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
209
216
  | `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
210
217
  | `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
211
218
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "downsampler"
7
- version = "0.2.0"
7
+ version = "0.3.0"
8
8
  description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -30,13 +30,22 @@ dependencies = [
30
30
  "requests>=2.32.5",
31
31
  ]
32
32
 
33
- [project.optional-dependencies]
33
+ [dependency-groups]
34
34
  test = ["pytest>=7.0", "pytest-cov>=4.0"]
35
- dev = ["downsampler[test]", "marimo"]
35
+ dev = [{include-group = "test"}, "marimo"]
36
36
 
37
37
  [project.urls]
38
38
  Homepage = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
39
39
  Repository = "https://gitlab.com/KNMI-OSS/spaceweather/libs/downsampler"
40
40
 
41
+ [tool.uv]
42
+ # lttbc's prebuilt cp311 Linux wheel was compiled against NumPy 1.x and
43
+ # crashes on import under NumPy 2 ("numpy.core.multiarray failed to
44
+ # import"). Building from source compiles it against the installed NumPy.
45
+ # Other Python versions ship no Linux wheel and always build from source.
46
+ # Same workaround as in spaceweather-data-pipelines; drop when lttbc
47
+ # publishes NumPy-2 wheels.
48
+ no-binary-package = ["lttbc"]
49
+
41
50
  [tool.setuptools.packages.find]
42
51
  where = ["src"]
@@ -46,7 +46,9 @@ class DownsampleConfig:
46
46
  "auto" means 2x the target cadence.
47
47
  edge_handling: Strategy for handling edge points.
48
48
  edge_window: Number of points at each edge to consider as edge points.
49
- min_points_per_segment: Minimum points required in a segment for processing.
49
+ min_points_per_segment: Minimum points required in a segment for
50
+ processing. Default 1: even single-point data islands between
51
+ gaps are kept, so intermittent data survives downsampling.
50
52
  source_cadence: Expected cadence of the source data. Used by LTTB to
51
53
  interpolate small gaps and by aggregators for completeness
52
54
  calculation. If None, estimated from data.
@@ -64,7 +66,7 @@ class DownsampleConfig:
64
66
  gap_threshold: Union[str, pd.Timedelta] = "auto"
65
67
  edge_handling: EdgeHandling = EdgeHandling.KEEP
66
68
  edge_window: int = 2
67
- min_points_per_segment: int = 3
69
+ min_points_per_segment: int = 1
68
70
  source_cadence: Union[str, pd.Timedelta, None] = None
69
71
  min_completeness: float = 0.9
70
72
 
@@ -68,9 +68,14 @@ def groupby_gaps(
68
68
  2
69
69
  """
70
70
  df_work = df.copy()
71
- deltas = df_work.index.diff()[1:]
72
- gap_indices = (deltas >= timedelta_max_gap).cumsum()
73
- df_work['gap_index'] = [0, *gap_indices]
71
+ if len(df_work) == 0:
72
+ # [0, *gap_indices] below would fabricate a spurious row on an
73
+ # empty frame; an empty gap_index column yields zero groups.
74
+ df_work['gap_index'] = np.array([], dtype=np.int64)
75
+ else:
76
+ deltas = df_work.index.diff()[1:]
77
+ gap_indices = (deltas >= timedelta_max_gap).cumsum()
78
+ df_work['gap_index'] = [0, *gap_indices]
74
79
  dfs_out = df_work.groupby('gap_index')
75
80
  return dfs_out
76
81
 
@@ -1,6 +1,5 @@
1
1
  """LTTB (Largest Triangle Three Buckets) downsampling with gap handling."""
2
2
 
3
- import logging
4
3
  import pandas as pd
5
4
  import numpy as np
6
5
  import lttbc
@@ -17,7 +16,7 @@ def downsample_lttb(
17
16
  target_cadence: str | pd.Timedelta,
18
17
  include_columns: list[str] | None = None,
19
18
  gap_threshold: pd.Timedelta | None = None,
20
- min_points_per_segment: int = 3,
19
+ min_points_per_segment: int = 1,
21
20
  source_cadence: pd.Timedelta | None = None,
22
21
  ) -> pd.DataFrame:
23
22
  """Perform LTTB downsampling on a pandas DataFrame.
@@ -26,10 +25,16 @@ def downsample_lttb(
26
25
  preserves visual characteristics of the data by selecting points that
27
26
  maximize the area of triangles formed with adjacent buckets.
28
27
 
29
- Small gaps (below gap_threshold) are filled by linear interpolation
30
- before downsampling so LTTB receives continuous segments. Large gaps
31
- (at/above gap_threshold) split the data into segments processed
32
- independently, with NaN marker rows inserted between them.
28
+ Rows where the target column is NaN are treated as absent data, so
29
+ fill-encoded gaps (NaN rows present at nominal cadence) behave exactly
30
+ like missing-row gaps. Small gaps (below gap_threshold) are filled by
31
+ linear interpolation before downsampling so LTTB receives continuous
32
+ segments. Large gaps (at/above gap_threshold) split the data into
33
+ segments processed independently, with NaN marker rows inserted between
34
+ them. Segments always keep their first and last points, so gap
35
+ boundaries in the output match the source data exactly. Segments too
36
+ short for triangle selection are passed through as their first/last
37
+ points rather than dropped.
33
38
 
34
39
  Args:
35
40
  df_in: Input DataFrame with DatetimeIndex.
@@ -60,8 +65,16 @@ def downsample_lttb(
60
65
  if gap_threshold is None:
61
66
  gap_threshold = 2 * target_cadence
62
67
 
68
+ # A row without a valid target sample carries no signal for LTTB. Drop
69
+ # such rows before gap detection so that fill-encoded gaps (NaN rows at
70
+ # nominal cadence, as produced by CDF/NetCDF fill values) segment the
71
+ # data exactly like missing-row gaps.
72
+ df_valid = df_in.dropna(subset=[target_column])
73
+ if df_valid.empty:
74
+ return pd.DataFrame(columns=df_in.columns)
75
+
63
76
  # Interpolate small gaps so LTTB receives continuous input
64
- df_interp = interpolate_small_gaps(df_in, gap_threshold, source_cadence)
77
+ df_interp = interpolate_small_gaps(df_valid, gap_threshold, source_cadence)
65
78
 
66
79
  # Split at large gaps and process each segment
67
80
  segments = split_at_gaps(df_interp, gap_threshold)
@@ -103,12 +116,10 @@ def _lttb_single_segment(
103
116
  Returns:
104
117
  Downsampled DataFrame, or None if the segment cannot be processed.
105
118
  """
106
- # Compute number of output points
107
- n_out = compute_output_points(df.index[0], df.index[-1], target_cadence)
108
-
109
- if n_out < 3:
110
- logging.warning("Cannot perform LTTB downsampling on less than 3 points")
111
- return None
119
+ # Compute number of output points. Segments spanning less than the
120
+ # target cadence still keep their first and last points (lttbc handles
121
+ # n_out <= 2 gracefully) so short data islands survive downsampling.
122
+ n_out = max(2, compute_output_points(df.index[0], df.index[-1], target_cadence))
112
123
 
113
124
  # Set up the data - convert time to numeric for LTTB algorithm
114
125
  df_work = df.copy()
@@ -116,11 +127,12 @@ def _lttb_single_segment(
116
127
  timeunit = '1min'
117
128
  df_work['time_num'] = (df_work.index - timeref) / pd.to_timedelta(timeunit)
118
129
 
119
- # Prepare data for LTTB (time_num, target_column)
130
+ # Prepare data for LTTB (time_num, target_column). The caller already
131
+ # drops NaN-target rows; this guard protects against lttbc silently
132
+ # converting any remaining NaN to 0.0 (a fabricated value).
120
133
  df_clean = df_work[['time_num', target_column]].dropna()
121
134
 
122
- if len(df_clean) < 3:
123
- logging.warning("Insufficient non-NaN data points for LTTB")
135
+ if df_clean.empty:
124
136
  return None
125
137
 
126
138
  # Apply LTTB downsampling (lttbc uses separate x, y arrays)
@@ -139,7 +151,9 @@ def _lttb_single_segment(
139
151
  pd.to_timedelta(df_resampled['time_num'], unit='min')
140
152
  )
141
153
 
142
- # Interpolate other columns to LTTB-selected time points
154
+ # Interpolate other columns to LTTB-selected time points. Each column
155
+ # interpolates from its own non-NaN samples — np.interp propagates NaN
156
+ # from any NaN in fp, which would blank values near unrelated holes.
143
157
  for col in df.columns:
144
158
  if col in ['time', 'time_num', target_column]:
145
159
  continue
@@ -148,11 +162,15 @@ def _lttb_single_segment(
148
162
  if not pd.api.types.is_numeric_dtype(df[col]):
149
163
  continue
150
164
 
151
- df_resampled[col] = np.interp(
152
- x=df_resampled['time_num'].values,
153
- xp=df_work['time_num'].values,
154
- fp=df_work[col].values
155
- )
165
+ col_valid = df_work[['time_num', col]].dropna()
166
+ if col_valid.empty:
167
+ df_resampled[col] = np.nan
168
+ else:
169
+ df_resampled[col] = np.interp(
170
+ x=df_resampled['time_num'].values,
171
+ xp=col_valid['time_num'].values,
172
+ fp=col_valid[col].values
173
+ )
156
174
 
157
175
  # Clean up
158
176
  df_resampled = df_resampled.drop(['time_num'], axis=1)
@@ -24,7 +24,7 @@ def downsample_m4(
24
24
  target_cadence: str | pd.Timedelta,
25
25
  include_columns: list[str] | None = None,
26
26
  gap_threshold: pd.Timedelta | None = None,
27
- min_points_per_segment: int = 3,
27
+ min_points_per_segment: int = 1,
28
28
  deduplicate: bool = True,
29
29
  collinearity_threshold: float | None = None,
30
30
  target_points_per_bucket: float = 3.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: downsampler
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
5
5
  Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
6
6
  License-Expression: MIT
@@ -22,12 +22,6 @@ Requires-Dist: pandas>=1.3
22
22
  Requires-Dist: lttbc>=0.3
23
23
  Requires-Dist: scipy>=1.7
24
24
  Requires-Dist: requests>=2.32.5
25
- Provides-Extra: test
26
- Requires-Dist: pytest>=7.0; extra == "test"
27
- Requires-Dist: pytest-cov>=4.0; extra == "test"
28
- Provides-Extra: dev
29
- Requires-Dist: downsampler[test]; extra == "dev"
30
- Requires-Dist: marimo; extra == "dev"
31
25
  Dynamic: license-file
32
26
 
33
27
  # downsampler
@@ -57,6 +51,13 @@ A Python package for time series DataFrame downsampling with LTTB, M4, multiple
57
51
  pip install downsampler
58
52
  ```
59
53
 
54
+ > **Note (Linux + Python 3.11):** the `lttbc` dependency's prebuilt cp311
55
+ > Linux wheel was compiled against NumPy 1.x and fails to import under
56
+ > NumPy 2 (`numpy.core.multiarray failed to import`). Force a source build:
57
+ > `pip install --no-binary lttbc downsampler`. Other Python versions and
58
+ > macOS have no prebuilt wheel and build from source automatically. With
59
+ > uv, this repo's `[tool.uv] no-binary-package` setting handles it.
60
+
60
61
  ## Quick Start
61
62
 
62
63
  ### Basic Downsampling
@@ -237,7 +238,7 @@ print(summary_table(results))
237
238
  | `gap_threshold` | str/Timedelta | "auto" | Min duration for gaps |
238
239
  | `edge_handling` | EdgeHandling | KEEP | How to handle edges |
239
240
  | `edge_window` | int | 2 | Points at each edge |
240
- | `min_points_per_segment` | int | 3 | Min points for processing |
241
+ | `min_points_per_segment` | int | 1 | Min points per segment; smaller segments are dropped |
241
242
  | `min_completeness` | float | 0.9 | Min fraction of expected points per bucket |
242
243
  | `source_cadence` | str/Timedelta | None | Source data cadence (estimated if None) |
243
244
 
@@ -0,0 +1,5 @@
1
+ numpy>=2.0
2
+ pandas>=1.3
3
+ lttbc>=0.3
4
+ scipy>=1.7
5
+ requests>=2.32.5
@@ -0,0 +1,343 @@
1
+ """Tests for LTTB downsampling."""
2
+
3
+ import pytest
4
+ import pandas as pd
5
+ import numpy as np
6
+
7
+ from downsampler.lttb import downsample_lttb, downsample_lttb_with_config
8
+ from downsampler.config import DownsampleConfig, AggregationMethod
9
+
10
+
11
+ class TestDownsampleLttb:
12
+ """Tests for LTTB downsampling function."""
13
+
14
+ def test_basic_lttb(self, sine_df):
15
+ """Test basic LTTB downsampling."""
16
+ result = downsample_lttb(
17
+ sine_df,
18
+ target_column='signal',
19
+ target_cadence='PT10S'
20
+ )
21
+
22
+ assert len(result) < len(sine_df)
23
+ assert 'signal' in result.columns
24
+
25
+ def test_preserves_extreme_values(self, sine_df):
26
+ """Test that LTTB preserves extreme values reasonably well."""
27
+ result = downsample_lttb(
28
+ sine_df,
29
+ target_column='signal',
30
+ target_cadence='PT10S'
31
+ )
32
+
33
+ # Check that max/min are close to original
34
+ orig_max = sine_df['signal'].max()
35
+ orig_min = sine_df['signal'].min()
36
+ result_max = result['signal'].max()
37
+ result_min = result['signal'].min()
38
+
39
+ # Allow 10% tolerance
40
+ assert abs(result_max - orig_max) < 0.1 * abs(orig_max)
41
+ assert abs(result_min - orig_min) < 0.1 * abs(orig_min - orig_max)
42
+
43
+ def test_include_columns(self, sine_df):
44
+ """Test including additional columns."""
45
+ result = downsample_lttb(
46
+ sine_df,
47
+ target_column='signal',
48
+ target_cadence='PT10S',
49
+ include_columns=['signal', 'noise']
50
+ )
51
+
52
+ assert 'signal' in result.columns
53
+ assert 'noise' in result.columns
54
+
55
+ def test_gap_handling(self, gappy_df):
56
+ """Test LTTB with gappy data."""
57
+ # Add a target column
58
+ gappy_df['signal'] = np.sin(np.linspace(0, 4 * np.pi, len(gappy_df)))
59
+
60
+ result = downsample_lttb(
61
+ gappy_df,
62
+ target_column='signal',
63
+ target_cadence='PT5M',
64
+ gap_threshold=pd.Timedelta('30min')
65
+ )
66
+
67
+ # Should produce output from both segments
68
+ assert len(result) > 0
69
+
70
+ def test_insufficient_points(self):
71
+ """Test handling of insufficient points."""
72
+ small_df = pd.DataFrame(
73
+ {'value': [1, 2]},
74
+ index=pd.date_range('2024-01-01', periods=2, freq='1s')
75
+ )
76
+
77
+ result = downsample_lttb(
78
+ small_df,
79
+ target_column='value',
80
+ target_cadence='PT10S',
81
+ min_points_per_segment=3
82
+ )
83
+
84
+ # The only segment has 2 points, below min_points_per_segment=3, so the result must be empty
85
+ assert len(result) == 0
86
+
87
+
88
+ class TestLttbGapHandling:
89
+ """Tests for LTTB gap handling behavior."""
90
+
91
+ def test_lttb_inserts_nan_markers_at_large_gaps(self):
92
+ """Test that LTTB output contains NaN markers between segments."""
93
+ # Two segments with a large gap between them
94
+ times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
95
+ times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
96
+ t1 = np.linspace(0, 4 * np.pi, 100)
97
+ t2 = np.linspace(0, 4 * np.pi, 100)
98
+ df = pd.concat([
99
+ pd.DataFrame({'signal': np.sin(t1)}, index=times1),
100
+ pd.DataFrame({'signal': np.sin(t2)}, index=times2),
101
+ ])
102
+
103
+ result = downsample_lttb(
104
+ df,
105
+ target_column='signal',
106
+ target_cadence='PT10S',
107
+ gap_threshold=pd.Timedelta('5min'),
108
+ )
109
+
110
+ # Should have NaN markers
111
+ assert result['signal'].isna().sum() > 0
112
+
113
+ def test_lttb_interpolates_small_gaps(self):
114
+ """Test that small gaps are filled before LTTB processes."""
115
+ # Data with a small 4-second gap (below gap_threshold of 30s)
116
+ times = list(pd.date_range('2024-01-01 00:00', periods=50, freq='1s'))
117
+ # Remove 3 points to create a small gap
118
+ del times[25:28]
119
+ t = np.linspace(0, 4 * np.pi, len(times))
120
+ df = pd.DataFrame({'signal': np.sin(t)}, index=pd.DatetimeIndex(times))
121
+
122
+ result = downsample_lttb(
123
+ df,
124
+ target_column='signal',
125
+ target_cadence='PT5S',
126
+ gap_threshold=pd.Timedelta('30s'),
127
+ source_cadence=pd.Timedelta('1s'),
128
+ )
129
+
130
+ # Should NOT have NaN markers (gap was small and interpolated)
131
+ assert result['signal'].isna().sum() == 0
132
+ assert len(result) > 0
133
+
134
+ def test_lttb_source_cadence_via_config(self):
135
+ """Test that source_cadence is passed through config."""
136
+ times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
137
+ times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
138
+ t = np.linspace(0, 4 * np.pi, 100)
139
+ df = pd.concat([
140
+ pd.DataFrame({'signal': np.sin(t)}, index=times1),
141
+ pd.DataFrame({'signal': np.sin(t)}, index=times2),
142
+ ])
143
+
144
+ config = DownsampleConfig(
145
+ method=AggregationMethod.LTTB,
146
+ lttb_target_column='signal',
147
+ source_cadence='PT1S',
148
+ )
149
+
150
+ result = downsample_lttb_with_config(df, 'PT10S', config)
151
+ assert len(result) > 0
152
+
153
+
154
+ class TestLttbNanEncodedGaps:
155
+ """Fill-encoded gaps (NaN rows) must behave like missing-row gaps."""
156
+
157
+ @staticmethod
158
+ def _make_gappy(encoding: str) -> pd.DataFrame:
159
+ """4 hours of 1-min data with a 45-min gap from 01:00 to 01:45."""
160
+ idx = pd.date_range('2024-01-01', periods=240, freq='1min')
161
+ values = 500 + 10 * np.sin(np.arange(240) / 20.0)
162
+ other = 100 + np.cos(np.arange(240) / 10.0)
163
+ df = pd.DataFrame({'signal': values, 'other': other}, index=idx)
164
+ gap_rows = idx[60:105]
165
+ if encoding == 'nan_rows':
166
+ df.loc[gap_rows, ['signal', 'other']] = np.nan
167
+ return df
168
+ return df.drop(gap_rows)
169
+
170
+ @pytest.mark.parametrize('encoding', ['nan_rows', 'missing_rows'])
171
+ def test_gap_boundaries_preserved_exactly(self, encoding):
172
+ """Last point before and first point after a gap must survive."""
173
+ df = self._make_gappy(encoding)
174
+
175
+ result = downsample_lttb(
176
+ df, target_column='signal', target_cadence='PT15M',
177
+ source_cadence=pd.Timedelta('1min'),
178
+ )
179
+
180
+ # LTTB keeps first/last of each segment: the valid samples
181
+ # adjacent to the gap must be in the output.
182
+ assert pd.Timestamp('2024-01-01 00:59') in result.index
183
+ assert pd.Timestamp('2024-01-01 01:45') in result.index
184
+
185
+ def test_nan_rows_equivalent_to_missing_rows(self):
186
+ """Both gap encodings must produce identical output."""
187
+ result_nan = downsample_lttb(
188
+ self._make_gappy('nan_rows'),
189
+ target_column='signal', target_cadence='PT15M',
190
+ source_cadence=pd.Timedelta('1min'),
191
+ )
192
+ result_missing = downsample_lttb(
193
+ self._make_gappy('missing_rows'),
194
+ target_column='signal', target_cadence='PT15M',
195
+ source_cadence=pd.Timedelta('1min'),
196
+ )
197
+
198
+ pd.testing.assert_frame_equal(result_nan, result_missing)
199
+
200
+ def test_nan_gap_produces_marker(self):
201
+ """A NaN-encoded gap must yield a NaN marker row in the output."""
202
+ result = downsample_lttb(
203
+ self._make_gappy('nan_rows'),
204
+ target_column='signal', target_cadence='PT15M',
205
+ source_cadence=pd.Timedelta('1min'),
206
+ )
207
+
208
+ markers = result[result['signal'].isna()]
209
+ assert len(markers) == 1
210
+ # Marker sits just after the last point of the pre-gap segment
211
+ assert pd.Timestamp('2024-01-01 00:59') < markers.index[0]
212
+ assert markers.index[0] < pd.Timestamp('2024-01-01 01:45')
213
+
214
+ def test_all_nan_target_returns_empty(self):
215
+ """A frame whose target column is entirely NaN yields no output."""
216
+ idx = pd.date_range('2024-01-01', periods=100, freq='1min')
217
+ df = pd.DataFrame({'signal': np.nan, 'other': 1.0}, index=idx)
218
+
219
+ result = downsample_lttb(df, target_column='signal', target_cadence='PT15M')
220
+
221
+ assert len(result) == 0
222
+ assert list(result.columns) == ['signal', 'other']
223
+
224
+ def test_include_column_own_nan_holes(self):
225
+ """Include columns interpolate from their own valid samples only."""
226
+ idx = pd.date_range('2024-01-01', periods=240, freq='1min')
227
+ df = pd.DataFrame({
228
+ 'signal': 500 + 10 * np.sin(np.arange(240) / 20.0),
229
+ 'other': 100.0,
230
+ }, index=idx)
231
+ # Hole in 'other' only — target column is complete
232
+ df.loc[idx[100:110], 'other'] = np.nan
233
+
234
+ result = downsample_lttb(
235
+ df, target_column='signal', target_cadence='PT15M',
236
+ source_cadence=pd.Timedelta('1min'),
237
+ )
238
+
239
+ # 'other' is constant 100 outside its hole; interpolation across
240
+ # the hole must bridge it rather than propagate NaN.
241
+ assert result['other'].isna().sum() == 0
242
+ assert (result['other'] == 100.0).all()
243
+
244
+
245
+ class TestLttbShortSegments:
246
+ """Short data islands must survive instead of being dropped."""
247
+
248
+ def test_short_segment_keeps_first_and_last(self):
249
+ """A segment spanning less than the target cadence keeps its extent."""
250
+ # 10 minutes of data at 1-min cadence, target PT1H: old behavior
251
+ # dropped this entirely (n_out < 3).
252
+ idx = pd.date_range('2024-01-01', periods=10, freq='1min')
253
+ df = pd.DataFrame({'signal': np.linspace(1.0, 2.0, 10)}, index=idx)
254
+
255
+ result = downsample_lttb(df, target_column='signal', target_cadence='PT1H')
256
+
257
+ assert idx[0] in result.index
258
+ assert idx[-1] in result.index
259
+
260
+ def test_intermittent_islands_all_survive(self):
261
+ """Several short islands between gaps each keep points and markers."""
262
+ frames = []
263
+ for hour in [0, 3, 6]: # gaps well above the 2x PT1H auto threshold
264
+ idx = pd.date_range(f'2024-01-01 {hour:02d}:00', periods=8, freq='1min')
265
+ frames.append(pd.DataFrame(
266
+ {'signal': np.full(8, float(hour + 1))}, index=idx
267
+ ))
268
+ df = pd.concat(frames)
269
+
270
+ result = downsample_lttb(df, target_column='signal', target_cadence='PT1H')
271
+
272
+ # All three islands present with their values
273
+ for hour in [0, 3, 6]:
274
+ assert (result['signal'] == hour + 1).any()
275
+ # Two NaN markers between three segments
276
+ assert result['signal'].isna().sum() == 2
277
+
278
+ def test_single_isolated_point_survives(self):
279
+ """A lone valid sample between gaps is kept, bracketed by markers."""
280
+ idx1 = pd.date_range('2024-01-01 00:00', periods=30, freq='1min')
281
+ idx2 = pd.date_range('2024-01-01 02:00', periods=30, freq='1min')
282
+ lone = pd.Timestamp('2024-01-01 01:00')
283
+ df = pd.concat([
284
+ pd.DataFrame({'signal': 1.0}, index=idx1),
285
+ pd.DataFrame({'signal': 99.0}, index=[lone]),
286
+ pd.DataFrame({'signal': 2.0}, index=idx2),
287
+ ])
288
+
289
+ result = downsample_lttb(
290
+ df, target_column='signal', target_cadence='PT5M',
291
+ source_cadence=pd.Timedelta('1min'),
292
+ )
293
+
294
+ assert lone in result.index
295
+ assert result.loc[lone, 'signal'] == 99.0
296
+ # Markers on both sides of the lone point
297
+ pos = result.index.get_loc(lone)
298
+ assert np.isnan(result['signal'].iloc[pos - 1])
299
+ assert np.isnan(result['signal'].iloc[pos + 1])
300
+
301
+ def test_min_points_per_segment_still_filters(self):
302
+ """An explicit min_points_per_segment above 1 still discards."""
303
+ idx1 = pd.date_range('2024-01-01 00:00', periods=30, freq='1min')
304
+ idx2 = pd.date_range('2024-01-01 02:00', periods=2, freq='1min')
305
+ df = pd.concat([
306
+ pd.DataFrame({'signal': 1.0}, index=idx1),
307
+ pd.DataFrame({'signal': 2.0}, index=idx2),
308
+ ])
309
+
310
+ result = downsample_lttb(
311
+ df, target_column='signal', target_cadence='PT5M',
312
+ min_points_per_segment=3,
313
+ source_cadence=pd.Timedelta('1min'),
314
+ )
315
+
316
+ assert not (result['signal'] == 2.0).any()
317
+
318
+
319
+ class TestDownsampleLttbWithConfig:
320
+ """Tests for LTTB downsampling with config."""
321
+
322
+ def test_with_config(self, sine_df):
323
+ """Test LTTB with full configuration."""
324
+ config = DownsampleConfig(
325
+ method=AggregationMethod.LTTB,
326
+ lttb_target_column='signal',
327
+ min_points_per_segment=5
328
+ )
329
+
330
+ result = downsample_lttb_with_config(
331
+ sine_df,
332
+ 'PT10S',
333
+ config
334
+ )
335
+
336
+ assert len(result) > 0
337
+
338
+ def test_missing_target_column_raises(self, sine_df):
339
+ """Test that missing target column raises error."""
340
+ config = DownsampleConfig(method=AggregationMethod.LTTB)
341
+
342
+ with pytest.raises(ValueError, match="lttb_target_column"):
343
+ downsample_lttb_with_config(sine_df, 'PT10S', config)
@@ -1,13 +0,0 @@
1
- numpy>=2.0
2
- pandas>=1.3
3
- lttbc>=0.3
4
- scipy>=1.7
5
- requests>=2.32.5
6
-
7
- [dev]
8
- downsampler[test]
9
- marimo
10
-
11
- [test]
12
- pytest>=7.0
13
- pytest-cov>=4.0
@@ -1,178 +0,0 @@
1
- """Tests for LTTB downsampling."""
2
-
3
- import pytest
4
- import pandas as pd
5
- import numpy as np
6
-
7
- from downsampler.lttb import downsample_lttb, downsample_lttb_with_config
8
- from downsampler.config import DownsampleConfig, AggregationMethod
9
-
10
-
11
- class TestDownsampleLttb:
12
- """Tests for LTTB downsampling function."""
13
-
14
- def test_basic_lttb(self, sine_df):
15
- """Test basic LTTB downsampling."""
16
- result = downsample_lttb(
17
- sine_df,
18
- target_column='signal',
19
- target_cadence='PT10S'
20
- )
21
-
22
- assert len(result) < len(sine_df)
23
- assert 'signal' in result.columns
24
-
25
- def test_preserves_extreme_values(self, sine_df):
26
- """Test that LTTB preserves extreme values reasonably well."""
27
- result = downsample_lttb(
28
- sine_df,
29
- target_column='signal',
30
- target_cadence='PT10S'
31
- )
32
-
33
- # Check that max/min are close to original
34
- orig_max = sine_df['signal'].max()
35
- orig_min = sine_df['signal'].min()
36
- result_max = result['signal'].max()
37
- result_min = result['signal'].min()
38
-
39
- # Allow 10% tolerance
40
- assert abs(result_max - orig_max) < 0.1 * abs(orig_max)
41
- assert abs(result_min - orig_min) < 0.1 * abs(orig_min - orig_max)
42
-
43
- def test_include_columns(self, sine_df):
44
- """Test including additional columns."""
45
- result = downsample_lttb(
46
- sine_df,
47
- target_column='signal',
48
- target_cadence='PT10S',
49
- include_columns=['signal', 'noise']
50
- )
51
-
52
- assert 'signal' in result.columns
53
- assert 'noise' in result.columns
54
-
55
- def test_gap_handling(self, gappy_df):
56
- """Test LTTB with gappy data."""
57
- # Add a target column
58
- gappy_df['signal'] = np.sin(np.linspace(0, 4 * np.pi, len(gappy_df)))
59
-
60
- result = downsample_lttb(
61
- gappy_df,
62
- target_column='signal',
63
- target_cadence='PT5M',
64
- gap_threshold=pd.Timedelta('30min')
65
- )
66
-
67
- # Should produce output from both segments
68
- assert len(result) > 0
69
-
70
- def test_insufficient_points(self):
71
- """Test handling of insufficient points."""
72
- small_df = pd.DataFrame(
73
- {'value': [1, 2]},
74
- index=pd.date_range('2024-01-01', periods=2, freq='1s')
75
- )
76
-
77
- result = downsample_lttb(
78
- small_df,
79
- target_column='value',
80
- target_cadence='PT10S',
81
- min_points_per_segment=3
82
- )
83
-
84
- # Should return empty or minimal result
85
- assert len(result) == 0
86
-
87
-
88
- class TestLttbGapHandling:
89
- """Tests for LTTB gap handling behavior."""
90
-
91
- def test_lttb_inserts_nan_markers_at_large_gaps(self):
92
- """Test that LTTB output contains NaN markers between segments."""
93
- # Two segments with a large gap between them
94
- times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
95
- times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
96
- t1 = np.linspace(0, 4 * np.pi, 100)
97
- t2 = np.linspace(0, 4 * np.pi, 100)
98
- df = pd.concat([
99
- pd.DataFrame({'signal': np.sin(t1)}, index=times1),
100
- pd.DataFrame({'signal': np.sin(t2)}, index=times2),
101
- ])
102
-
103
- result = downsample_lttb(
104
- df,
105
- target_column='signal',
106
- target_cadence='PT10S',
107
- gap_threshold=pd.Timedelta('5min'),
108
- )
109
-
110
- # Should have NaN markers
111
- assert result['signal'].isna().sum() > 0
112
-
113
- def test_lttb_interpolates_small_gaps(self):
114
- """Test that small gaps are filled before LTTB processes."""
115
- # Data with a small 5-second gap (below gap_threshold of 30s)
116
- times = list(pd.date_range('2024-01-01 00:00', periods=50, freq='1s'))
117
- # Remove 3 points to create a small gap
118
- del times[25:28]
119
- t = np.linspace(0, 4 * np.pi, len(times))
120
- df = pd.DataFrame({'signal': np.sin(t)}, index=pd.DatetimeIndex(times))
121
-
122
- result = downsample_lttb(
123
- df,
124
- target_column='signal',
125
- target_cadence='PT5S',
126
- gap_threshold=pd.Timedelta('30s'),
127
- source_cadence=pd.Timedelta('1s'),
128
- )
129
-
130
- # Should NOT have NaN markers (gap was small and interpolated)
131
- assert result['signal'].isna().sum() == 0
132
- assert len(result) > 0
133
-
134
- def test_lttb_source_cadence_via_config(self):
135
- """Test that source_cadence is passed through config."""
136
- times1 = pd.date_range('2024-01-01 00:00', periods=100, freq='1s')
137
- times2 = pd.date_range('2024-01-01 01:00', periods=100, freq='1s')
138
- t = np.linspace(0, 4 * np.pi, 100)
139
- df = pd.concat([
140
- pd.DataFrame({'signal': np.sin(t)}, index=times1),
141
- pd.DataFrame({'signal': np.sin(t)}, index=times2),
142
- ])
143
-
144
- config = DownsampleConfig(
145
- method=AggregationMethod.LTTB,
146
- lttb_target_column='signal',
147
- source_cadence='PT1S',
148
- )
149
-
150
- result = downsample_lttb_with_config(df, 'PT10S', config)
151
- assert len(result) > 0
152
-
153
-
154
- class TestDownsampleLttbWithConfig:
155
- """Tests for LTTB downsampling with config."""
156
-
157
- def test_with_config(self, sine_df):
158
- """Test LTTB with full configuration."""
159
- config = DownsampleConfig(
160
- method=AggregationMethod.LTTB,
161
- lttb_target_column='signal',
162
- min_points_per_segment=5
163
- )
164
-
165
- result = downsample_lttb_with_config(
166
- sine_df,
167
- 'PT10S',
168
- config
169
- )
170
-
171
- assert len(result) > 0
172
-
173
- def test_missing_target_column_raises(self, sine_df):
174
- """Test that missing target column raises error."""
175
- config = DownsampleConfig(method=AggregationMethod.LTTB)
176
-
177
- with pytest.raises(ValueError, match="lttb_target_column"):
178
- downsample_lttb_with_config(sine_df, 'PT10S', config)
File without changes
File without changes