downsampler 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {downsampler-0.3.0/src/downsampler.egg-info → downsampler-0.3.1}/PKG-INFO +1 -1
  2. {downsampler-0.3.0 → downsampler-0.3.1}/pyproject.toml +1 -1
  3. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/__init__.py +1 -1
  4. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/gaps.py +56 -28
  5. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/lttb.py +23 -28
  6. {downsampler-0.3.0 → downsampler-0.3.1/src/downsampler.egg-info}/PKG-INFO +1 -1
  7. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_gaps.py +27 -13
  8. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_lttb.py +67 -2
  9. {downsampler-0.3.0 → downsampler-0.3.1}/LICENSE +0 -0
  10. {downsampler-0.3.0 → downsampler-0.3.1}/README.md +0 -0
  11. {downsampler-0.3.0 → downsampler-0.3.1}/setup.cfg +0 -0
  12. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/aggregators.py +0 -0
  13. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/config.py +0 -0
  14. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/core.py +0 -0
  15. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/edges.py +0 -0
  16. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/fidelity/__init__.py +0 -0
  17. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/fidelity/comparison.py +0 -0
  18. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/fidelity/metrics.py +0 -0
  19. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/m4.py +0 -0
  20. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/ranged.py +0 -0
  21. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/utils.py +0 -0
  22. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/SOURCES.txt +0 -0
  23. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/dependency_links.txt +0 -0
  24. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/requires.txt +0 -0
  25. {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/top_level.txt +0 -0
  26. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_aggregators.py +0 -0
  27. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_core.py +0 -0
  28. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_edges.py +0 -0
  29. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_fidelity.py +0 -0
  30. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_m4.py +0 -0
  31. {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_ranged.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: downsampler
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
5
5
  Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "downsampler"
7
- version = "0.3.0"
7
+ version = "0.3.1"
8
8
  description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -65,7 +65,7 @@ from downsampler.ranged import (
65
65
  DataFetcher,
66
66
  )
67
67
 
68
- __version__ = "0.2.0"
68
+ __version__ = "0.3.1"
69
69
 
70
70
  __all__ = [
71
71
  # Config
@@ -331,6 +331,12 @@ def interpolate_small_gaps(
331
331
  in the original data are preserved — only the newly created rows get
332
332
  interpolated values.
333
333
 
334
+ Each column is interpolated **from its own non-NaN samples**, and any new
335
+ timestamp outside a column's valid-sample range is left NaN (``np.interp``
336
+ ``left``/``right`` = NaN, i.e. no edge-clamp and no extrapolation). So a
337
+ column with a hole larger than ``gap_threshold`` keeps an honest gap at the
338
+ filled timestamps rather than borrowing another column's edge value.
339
+
334
340
  Args:
335
341
  df: DataFrame with DatetimeIndex.
336
342
  gap_threshold: Gaps at or above this duration are "real" gaps and
@@ -349,41 +355,63 @@ def interpolate_small_gaps(
349
355
  source_cadence = estimate_cadence(df)
350
356
 
351
357
  jitter_threshold = source_cadence * 1.5
352
- deltas = df.index.to_series().diff()
353
-
354
358
  numeric_cols = get_numeric_columns(df)
355
- # Convert original timestamps to float64 for np.interp
356
- orig_timestamps = df.index.astype(np.int64).astype(np.float64)
357
359
 
358
- new_rows = []
360
+ # 1. Row skeleton: existing rows plus synthetic rows inside *index* gaps
361
+ # smaller than gap_threshold, at source_cadence, so LTTB sees a
362
+ # continuous grid across small gaps. Values are filled per-column below.
363
+ deltas = df.index.to_series().diff()
364
+ extra_times: list[pd.Timestamp] = []
359
365
  for i in range(1, len(df)):
360
366
  delta = deltas.iloc[i]
361
367
  if delta > jitter_threshold and delta < gap_threshold:
362
- t_before = df.index[i - 1]
363
- t_after = df.index[i]
364
- # Generate timestamps at source_cadence intervals within the gap
365
- new_times = pd.date_range(
366
- start=t_before + source_cadence,
367
- end=t_after - source_cadence * 0.5, # don't duplicate t_after
368
- freq=source_cadence,
368
+ extra_times.extend(
369
+ pd.date_range(
370
+ start=df.index[i - 1] + source_cadence,
371
+ end=df.index[i] - source_cadence * 0.5, # don't duplicate t_after
372
+ freq=source_cadence,
373
+ )
369
374
  )
370
- if len(new_times) == 0:
371
- continue
372
-
373
- new_ts_float = new_times.astype(np.int64).astype(np.float64)
374
- row_data = {col: np.interp(new_ts_float, orig_timestamps, df[col].values)
375
- for col in numeric_cols}
376
- chunk = pd.DataFrame(row_data, index=new_times)
377
- # Add non-numeric columns as NaN (they can't be interpolated)
378
- for col in df.columns:
379
- if col not in numeric_cols:
380
- chunk[col] = np.nan
381
- new_rows.append(chunk)
382
-
383
- if not new_rows:
384
- return df.copy()
375
+ if extra_times:
376
+ out = df.reindex(df.index.append(pd.DatetimeIndex(extra_times)).sort_values())
377
+ else:
378
+ out = df.copy()
379
+
380
+ # 2. Per-column, fill each column's *own* small gaps from its *own* samples,
381
+ # leaving holes wider than gap_threshold NaN. Interpolation is inside-only
382
+ # (no extrapolation → no edge-clamp) and each column is independent (no
383
+ # cross-column bleed).
384
+ for col in numeric_cols:
385
+ out[col] = _interpolate_column_small_gaps(out[col], gap_threshold)
386
+
387
+ return out
388
+
389
+
390
+ def _interpolate_column_small_gaps(
391
+ s: pd.Series,
392
+ gap_threshold: pd.Timedelta,
393
+ ) -> pd.Series:
394
+ """Time-interpolate a single column's gaps narrower than gap_threshold.
395
+
396
+ Points outside the column's valid range stay NaN (no extrapolation), and
397
+ any run of NaNs whose bounding valid samples are gap_threshold or more
398
+ apart is re-blanked so wide holes remain honest gaps.
399
+ """
400
+ valid = s.notna()
401
+ if valid.sum() < 2:
402
+ return s
403
+
404
+ filled = s.interpolate(method="time", limit_area="inside")
405
+
406
+ # Re-blank interpolated points that fall inside a wide (>= gap_threshold)
407
+ # hole between two consecutive valid samples.
408
+ idx = s.index
409
+ valid_pos = np.flatnonzero(valid.to_numpy())
410
+ for a, b in zip(valid_pos[:-1], valid_pos[1:]):
411
+ if b > a + 1 and (idx[b] - idx[a]) >= gap_threshold:
412
+ filled.iloc[a + 1:b] = np.nan
385
413
 
386
- return pd.concat([df] + new_rows).sort_index()
414
+ return filled
387
415
 
388
416
 
389
417
  def concatenate_with_gap_markers(
@@ -107,11 +107,23 @@ def _lttb_single_segment(
107
107
  ) -> pd.DataFrame | None:
108
108
  """Apply LTTB to a single contiguous segment.
109
109
 
110
+ LTTB selects real source timetags for the *target* column; every other
111
+ requested column is then **selected** at those timetags rather than
112
+ interpolated onto them. ``lttbc.downsample`` returns exact input points,
113
+ so the selected ``time_num`` values map back to the segment rows
114
+ unambiguously, and each ride-along column keeps its real measured value
115
+ (or the small-gap-interpolated value the preprocessing stage placed there).
116
+ This makes it impossible to fabricate a value: earlier code interpolated
117
+ include columns with ``np.interp``, which clamps to a column's edge value
118
+ for any timetag outside that column's valid range — stamping a foreign
119
+ constant across every gap. Selection cannot do that.
120
+
110
121
  Args:
111
122
  df: Input DataFrame (no gaps).
112
123
  target_column: Column to optimize for.
113
124
  target_cadence: Target cadence.
114
- include_columns: Additional columns to include.
125
+ include_columns: Additional columns to carry (selected, not
126
+ interpolated). If None, all numeric columns are carried.
115
127
 
116
128
  Returns:
117
129
  Downsampled DataFrame or None if cannot process.
@@ -136,46 +148,29 @@ def _lttb_single_segment(
136
148
  return None
137
149
 
138
150
  # Apply LTTB downsampling (lttbc uses separate x, y arrays)
139
- x_down, y_down = lttbc.downsample(
151
+ x_down, _ = lttbc.downsample(
140
152
  df_clean['time_num'].values,
141
153
  df_clean[target_column].values,
142
154
  n_out
143
155
  )
144
- df_resampled = pd.DataFrame(
145
- {'time_num': x_down, target_column: y_down}
146
- )
147
156
 
148
- # Reconstruct the datetime index
149
- df_resampled.index = (
150
- timeref +
151
- pd.to_timedelta(df_resampled['time_num'], unit='min')
152
- )
157
+ # Row-select the segment rows LTTB chose. ``x_down`` are exact members of
158
+ # the input ``time_num`` (lttbc returns real input points), so ``isin``
159
+ # selects exactly those rows — carrying every column's real value and the
160
+ # original datetime index, no interpolation.
161
+ selected = df_work[df_work['time_num'].isin(x_down)]
153
162
 
154
- # Interpolate other columns to LTTB-selected time points. Each column
155
- # interpolates from its own non-NaN samples — np.interp propagates NaN
156
- # from any NaN in fp, which would blank values near unrelated holes.
163
+ keep = [target_column]
157
164
  for col in df.columns:
158
- if col in ['time', 'time_num', target_column]:
165
+ if col in ('time', 'time_num', target_column):
159
166
  continue
160
167
  if include_columns is not None and col not in include_columns:
161
168
  continue
162
169
  if not pd.api.types.is_numeric_dtype(df[col]):
163
170
  continue
171
+ keep.append(col)
164
172
 
165
- col_valid = df_work[['time_num', col]].dropna()
166
- if col_valid.empty:
167
- df_resampled[col] = np.nan
168
- else:
169
- df_resampled[col] = np.interp(
170
- x=df_resampled['time_num'].values,
171
- xp=col_valid['time_num'].values,
172
- fp=col_valid[col].values
173
- )
174
-
175
- # Clean up
176
- df_resampled = df_resampled.drop(['time_num'], axis=1)
177
-
178
- return df_resampled
173
+ return selected[keep].copy()
179
174
 
180
175
 
181
176
  def downsample_lttb_with_config(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: downsampler
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
5
5
  Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
6
6
  License-Expression: MIT
@@ -239,22 +239,36 @@ class TestInterpolateSmallGaps:
239
239
  # Small gap filled (2 new points), large gap left alone
240
240
  assert len(result) == 8
241
241
 
242
- def test_preserves_original_nans(self):
243
- """Test that pre-existing NaN values are not overwritten."""
244
- times = pd.date_range('2024-01-01', periods=5, freq='1min')
245
- df = pd.DataFrame(
246
- {'value': [0.0, np.nan, 2.0, 3.0, 4.0]},
247
- index=times,
242
+ def test_small_inplace_gap_filled_wide_preserved(self):
243
+ """Small in-place NaN holes are bridged per-column; wide holes stay NaN.
244
+
245
+ (New 0.3.1 contract: small-gap filling is a per-column preprocessing
246
+ step, so a hole narrower than gap_threshold in an otherwise-dense column
247
+ is interpolated from that column's own samples, while a hole at or above
248
+ the threshold is preserved as an honest gap.)
249
+ """
250
+ times = pd.date_range('2024-01-01', periods=20, freq='1min')
251
+ value = np.arange(20, dtype=float)
252
+ value[3] = np.nan # 1-min hole -> small, should fill
253
+ value[8:15] = np.nan # 7-min hole -> < 10min, should fill
254
+ df = pd.DataFrame({'value': value}, index=times)
255
+
256
+ small = interpolate_small_gaps(
257
+ df, gap_threshold=pd.Timedelta('10min'),
258
+ source_cadence=pd.Timedelta('1min'),
248
259
  )
249
-
250
- result = interpolate_small_gaps(
251
- df,
252
- gap_threshold=pd.Timedelta('10min'),
260
+ # Both holes are narrower than 10 min -> bridged (linear: value == index).
261
+ assert small['value'].isna().sum() == 0
262
+ assert small.iloc[3]['value'] == pytest.approx(3.0)
263
+ assert small.iloc[10]['value'] == pytest.approx(10.0)
264
+
265
+ # With a tighter threshold the 7-min hole is now "wide" -> preserved.
266
+ wide = interpolate_small_gaps(
267
+ df, gap_threshold=pd.Timedelta('5min'),
253
268
  source_cadence=pd.Timedelta('1min'),
254
269
  )
255
-
256
- # Original NaN should still be NaN
257
- assert np.isnan(result.iloc[1]['value'])
270
+ assert not np.isnan(wide.iloc[3]['value']) # 1-min hole still filled
271
+ assert wide['value'].iloc[8:14].isna().all() # 7-min hole preserved
258
272
 
259
273
  def test_auto_cadence_estimation(self):
260
274
  """Test that source_cadence is auto-estimated when None."""
@@ -236,12 +236,77 @@ class TestLttbNanEncodedGaps:
236
236
  source_cadence=pd.Timedelta('1min'),
237
237
  )
238
238
 
239
- # 'other' is constant 100 outside its hole; interpolation across
240
- # the hole must bridge it rather than propagate NaN.
239
+ # 'other' is constant 100 outside its hole; the small-gap preprocessing
240
+ # bridges it per-column rather than propagating NaN.
241
241
  assert result['other'].isna().sum() == 0
242
242
  assert (result['other'] == 100.0).all()
243
243
 
244
244
 
245
+ class TestLttbIncludeSelection:
246
+ """Include columns are row-SELECTED at LTTB timetags, never interpolated.
247
+
248
+ Regression coverage for the np.interp edge-clamp bug: interpolating an
249
+ include column onto the target's timetags stamped the column's edge value
250
+ across every timetag outside its valid range (a constant foreign peak at
251
+ each gap). Selection cannot fabricate values.
252
+ """
253
+
254
+ def test_high_window_does_not_bleed_into_quiet_period(self):
255
+ """A column valid only in a late high window must not appear earlier."""
256
+ idx = pd.date_range('2024-01-01 00:00', periods=24 * 60, freq='1min')
257
+ signal = 350 + 10 * np.sin(np.arange(len(idx)) / 30.0) # dense all day
258
+ other = np.full(len(idx), np.nan)
259
+ hi = (idx >= '2024-01-01 20:00') & (idx <= '2024-01-01 21:00')
260
+ other[hi] = 602.9 # the ONLY place 'other' has data
261
+ df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
262
+
263
+ result = downsample_lttb(
264
+ df, target_column='signal', target_cadence='PT5M',
265
+ source_cadence=pd.Timedelta('1min'),
266
+ )
267
+
268
+ # No 'other' value may appear at a timetag outside its valid window,
269
+ # and certainly not the clamped constant 602.9 in the quiet morning.
270
+ nonnull = result['other'].dropna()
271
+ assert (nonnull.index >= pd.Timestamp('2024-01-01 20:00')).all()
272
+ assert (nonnull.index <= pd.Timestamp('2024-01-01 21:00')).all()
273
+ morning = result.loc[result.index < pd.Timestamp('2024-01-01 19:00'), 'other']
274
+ assert morning.isna().all()
275
+
276
+ def test_ridealong_wide_hole_stays_nan(self):
277
+ """A hole wider than gap_threshold in a ride-along column stays NaN."""
278
+ idx = pd.date_range('2024-01-01', periods=240, freq='1min')
279
+ signal = 500 + 10 * np.sin(np.arange(240) / 20.0) # complete target
280
+ other = np.full(240, 100.0)
281
+ other[60:150] = np.nan # 90-min hole >> 30-min gap_threshold (PT15M)
282
+ df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
283
+
284
+ result = downsample_lttb(
285
+ df, target_column='signal', target_cadence='PT15M',
286
+ source_cadence=pd.Timedelta('1min'),
287
+ )
288
+
289
+ in_hole = result[(result.index > idx[60]) & (result.index < idx[149])]
290
+ assert not in_hole.empty # target is dense there, so timetags exist
291
+ assert in_hole['other'].isna().all()
292
+
293
+ def test_include_values_are_exact_source_selections(self):
294
+ """Each carried include value equals the real source sample (not interp)."""
295
+ idx = pd.date_range('2024-01-01', periods=240, freq='1min')
296
+ rng = np.random.default_rng(0)
297
+ signal = 500 + 50 * np.sin(np.arange(240) / 15.0) + rng.normal(0, 3, 240)
298
+ other = 100 + rng.normal(0, 10, 240) # noisy, dense (no gaps)
299
+ df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
300
+
301
+ result = downsample_lttb(
302
+ df, target_column='signal', target_cadence='PT15M',
303
+ source_cadence=pd.Timedelta('1min'),
304
+ )
305
+
306
+ for t, v in result['other'].dropna().items():
307
+ assert v == pytest.approx(df.loc[t, 'other'])
308
+
309
+
245
310
  class TestLttbShortSegments:
246
311
  """Short data islands must survive instead of being dropped."""
247
312
 
File without changes
File without changes
File without changes