downsampler 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {downsampler-0.3.0/src/downsampler.egg-info → downsampler-0.3.1}/PKG-INFO +1 -1
- {downsampler-0.3.0 → downsampler-0.3.1}/pyproject.toml +1 -1
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/__init__.py +1 -1
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/gaps.py +56 -28
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/lttb.py +23 -28
- {downsampler-0.3.0 → downsampler-0.3.1/src/downsampler.egg-info}/PKG-INFO +1 -1
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_gaps.py +27 -13
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_lttb.py +67 -2
- {downsampler-0.3.0 → downsampler-0.3.1}/LICENSE +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/README.md +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/setup.cfg +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/aggregators.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/config.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/core.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/edges.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/fidelity/__init__.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/fidelity/comparison.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/fidelity/metrics.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/m4.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/ranged.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler/utils.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/SOURCES.txt +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/dependency_links.txt +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/requires.txt +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/src/downsampler.egg-info/top_level.txt +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_aggregators.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_core.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_edges.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_fidelity.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_m4.py +0 -0
- {downsampler-0.3.0 → downsampler-0.3.1}/tests/test_ranged.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: downsampler
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
|
|
5
5
|
Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "downsampler"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.3.1"
|
|
8
8
|
description = "Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -331,6 +331,12 @@ def interpolate_small_gaps(
|
|
|
331
331
|
in the original data are preserved — only the newly created rows get
|
|
332
332
|
interpolated values.
|
|
333
333
|
|
|
334
|
+
Each column is interpolated **from its own non-NaN samples**, and any new
|
|
335
|
+
timestamp outside a column's valid-sample range is left NaN (``np.interp``
|
|
336
|
+
``left``/``right`` = NaN, i.e. no edge-clamp and no extrapolation). So a
|
|
337
|
+
column with a hole larger than ``gap_threshold`` keeps an honest gap at the
|
|
338
|
+
filled timestamps rather than borrowing another column's edge value.
|
|
339
|
+
|
|
334
340
|
Args:
|
|
335
341
|
df: DataFrame with DatetimeIndex.
|
|
336
342
|
gap_threshold: Gaps at or above this duration are "real" gaps and
|
|
@@ -349,41 +355,63 @@ def interpolate_small_gaps(
|
|
|
349
355
|
source_cadence = estimate_cadence(df)
|
|
350
356
|
|
|
351
357
|
jitter_threshold = source_cadence * 1.5
|
|
352
|
-
deltas = df.index.to_series().diff()
|
|
353
|
-
|
|
354
358
|
numeric_cols = get_numeric_columns(df)
|
|
355
|
-
# Convert original timestamps to float64 for np.interp
|
|
356
|
-
orig_timestamps = df.index.astype(np.int64).astype(np.float64)
|
|
357
359
|
|
|
358
|
-
|
|
360
|
+
# 1. Row skeleton: existing rows plus synthetic rows inside *index* gaps
|
|
361
|
+
# smaller than gap_threshold, at source_cadence, so LTTB sees a
|
|
362
|
+
# continuous grid across small gaps. Values are filled per-column below.
|
|
363
|
+
deltas = df.index.to_series().diff()
|
|
364
|
+
extra_times: list[pd.Timestamp] = []
|
|
359
365
|
for i in range(1, len(df)):
|
|
360
366
|
delta = deltas.iloc[i]
|
|
361
367
|
if delta > jitter_threshold and delta < gap_threshold:
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
freq=source_cadence,
|
|
368
|
+
extra_times.extend(
|
|
369
|
+
pd.date_range(
|
|
370
|
+
start=df.index[i - 1] + source_cadence,
|
|
371
|
+
end=df.index[i] - source_cadence * 0.5, # don't duplicate t_after
|
|
372
|
+
freq=source_cadence,
|
|
373
|
+
)
|
|
369
374
|
)
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
375
|
+
if extra_times:
|
|
376
|
+
out = df.reindex(df.index.append(pd.DatetimeIndex(extra_times)).sort_values())
|
|
377
|
+
else:
|
|
378
|
+
out = df.copy()
|
|
379
|
+
|
|
380
|
+
# 2. Per-column, fill each column's *own* small gaps from its *own* samples,
|
|
381
|
+
# leaving holes wider than gap_threshold NaN. Interpolation is inside-only
|
|
382
|
+
# (no extrapolation → no edge-clamp) and each column is independent (no
|
|
383
|
+
# cross-column bleed).
|
|
384
|
+
for col in numeric_cols:
|
|
385
|
+
out[col] = _interpolate_column_small_gaps(out[col], gap_threshold)
|
|
386
|
+
|
|
387
|
+
return out
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _interpolate_column_small_gaps(
|
|
391
|
+
s: pd.Series,
|
|
392
|
+
gap_threshold: pd.Timedelta,
|
|
393
|
+
) -> pd.Series:
|
|
394
|
+
"""Time-interpolate a single column's gaps narrower than gap_threshold.
|
|
395
|
+
|
|
396
|
+
Points outside the column's valid range stay NaN (no extrapolation), and
|
|
397
|
+
any run of NaNs whose bounding valid samples are gap_threshold or more
|
|
398
|
+
apart is re-blanked so wide holes remain honest gaps.
|
|
399
|
+
"""
|
|
400
|
+
valid = s.notna()
|
|
401
|
+
if valid.sum() < 2:
|
|
402
|
+
return s
|
|
403
|
+
|
|
404
|
+
filled = s.interpolate(method="time", limit_area="inside")
|
|
405
|
+
|
|
406
|
+
# Re-blank interpolated points that fall inside a wide (>= gap_threshold)
|
|
407
|
+
# hole between two consecutive valid samples.
|
|
408
|
+
idx = s.index
|
|
409
|
+
valid_pos = np.flatnonzero(valid.to_numpy())
|
|
410
|
+
for a, b in zip(valid_pos[:-1], valid_pos[1:]):
|
|
411
|
+
if b > a + 1 and (idx[b] - idx[a]) >= gap_threshold:
|
|
412
|
+
filled.iloc[a + 1:b] = np.nan
|
|
385
413
|
|
|
386
|
-
return
|
|
414
|
+
return filled
|
|
387
415
|
|
|
388
416
|
|
|
389
417
|
def concatenate_with_gap_markers(
|
|
@@ -107,11 +107,23 @@ def _lttb_single_segment(
|
|
|
107
107
|
) -> pd.DataFrame | None:
|
|
108
108
|
"""Apply LTTB to a single contiguous segment.
|
|
109
109
|
|
|
110
|
+
LTTB selects real source timetags for the *target* column; every other
|
|
111
|
+
requested column is then **selected** at those timetags rather than
|
|
112
|
+
interpolated onto them. ``lttbc.downsample`` returns exact input points,
|
|
113
|
+
so the selected ``time_num`` values map back to the segment rows
|
|
114
|
+
unambiguously, and each ride-along column keeps its real measured value
|
|
115
|
+
(or the small-gap-interpolated value the preprocessing stage placed there).
|
|
116
|
+
This makes it impossible to fabricate a value: earlier code interpolated
|
|
117
|
+
include columns with ``np.interp``, which clamps to a column's edge value
|
|
118
|
+
for any timetag outside that column's valid range — stamping a foreign
|
|
119
|
+
constant across every gap. Selection cannot do that.
|
|
120
|
+
|
|
110
121
|
Args:
|
|
111
122
|
df: Input DataFrame (no gaps).
|
|
112
123
|
target_column: Column to optimize for.
|
|
113
124
|
target_cadence: Target cadence.
|
|
114
|
-
include_columns: Additional columns to
|
|
125
|
+
include_columns: Additional columns to carry (selected, not
|
|
126
|
+
interpolated). If None, all numeric columns are carried.
|
|
115
127
|
|
|
116
128
|
Returns:
|
|
117
129
|
Downsampled DataFrame or None if cannot process.
|
|
@@ -136,46 +148,29 @@ def _lttb_single_segment(
|
|
|
136
148
|
return None
|
|
137
149
|
|
|
138
150
|
# Apply LTTB downsampling (lttbc uses separate x, y arrays)
|
|
139
|
-
x_down, y_down = lttbc.downsample(
|
|
151
|
+
x_down, _ = lttbc.downsample(
|
|
140
152
|
df_clean['time_num'].values,
|
|
141
153
|
df_clean[target_column].values,
|
|
142
154
|
n_out
|
|
143
155
|
)
|
|
144
|
-
df_resampled = pd.DataFrame(
|
|
145
|
-
{'time_num': x_down, target_column: y_down}
|
|
146
|
-
)
|
|
147
156
|
|
|
148
|
-
#
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
)
|
|
157
|
+
# Row-select the segment rows LTTB chose. ``x_down`` are exact members of
|
|
158
|
+
# the input ``time_num`` (lttbc returns real input points), so ``isin``
|
|
159
|
+
# selects exactly those rows — carrying every column's real value and the
|
|
160
|
+
# original datetime index, no interpolation.
|
|
161
|
+
selected = df_work[df_work['time_num'].isin(x_down)]
|
|
153
162
|
|
|
154
|
-
|
|
155
|
-
# interpolates from its own non-NaN samples — np.interp propagates NaN
|
|
156
|
-
# from any NaN in fp, which would blank values near unrelated holes.
|
|
163
|
+
keep = [target_column]
|
|
157
164
|
for col in df.columns:
|
|
158
|
-
if col in
|
|
165
|
+
if col in ('time', 'time_num', target_column):
|
|
159
166
|
continue
|
|
160
167
|
if include_columns is not None and col not in include_columns:
|
|
161
168
|
continue
|
|
162
169
|
if not pd.api.types.is_numeric_dtype(df[col]):
|
|
163
170
|
continue
|
|
171
|
+
keep.append(col)
|
|
164
172
|
|
|
165
|
-
|
|
166
|
-
if col_valid.empty:
|
|
167
|
-
df_resampled[col] = np.nan
|
|
168
|
-
else:
|
|
169
|
-
df_resampled[col] = np.interp(
|
|
170
|
-
x=df_resampled['time_num'].values,
|
|
171
|
-
xp=col_valid['time_num'].values,
|
|
172
|
-
fp=col_valid[col].values
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
# Clean up
|
|
176
|
-
df_resampled = df_resampled.drop(['time_num'], axis=1)
|
|
177
|
-
|
|
178
|
-
return df_resampled
|
|
173
|
+
return selected[keep].copy()
|
|
179
174
|
|
|
180
175
|
|
|
181
176
|
def downsample_lttb_with_config(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: downsampler
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Timeseries DataFrame downsampling with LTTB, aggregation methods, gap handling, and fidelity testing
|
|
5
5
|
Author-email: Eelco Doornbos <eelco.doornbos@knmi.nl>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -239,22 +239,36 @@ class TestInterpolateSmallGaps:
|
|
|
239
239
|
# Small gap filled (2 new points), large gap left alone
|
|
240
240
|
assert len(result) == 8
|
|
241
241
|
|
|
242
|
-
def
|
|
243
|
-
"""
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
242
|
+
def test_small_inplace_gap_filled_wide_preserved(self):
|
|
243
|
+
"""Small in-place NaN holes are bridged per-column; wide holes stay NaN.
|
|
244
|
+
|
|
245
|
+
(New 0.3.1 contract: small-gap filling is a per-column preprocessing
|
|
246
|
+
step, so a hole narrower than gap_threshold in an otherwise-dense column
|
|
247
|
+
is interpolated from that column's own samples, while a hole at or above
|
|
248
|
+
the threshold is preserved as an honest gap.)
|
|
249
|
+
"""
|
|
250
|
+
times = pd.date_range('2024-01-01', periods=20, freq='1min')
|
|
251
|
+
value = np.arange(20, dtype=float)
|
|
252
|
+
value[3] = np.nan # 1-min hole -> small, should fill
|
|
253
|
+
value[8:15] = np.nan # 7-min hole -> < 10min, should fill
|
|
254
|
+
df = pd.DataFrame({'value': value}, index=times)
|
|
255
|
+
|
|
256
|
+
small = interpolate_small_gaps(
|
|
257
|
+
df, gap_threshold=pd.Timedelta('10min'),
|
|
258
|
+
source_cadence=pd.Timedelta('1min'),
|
|
248
259
|
)
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
260
|
+
# Both holes are narrower than 10 min -> bridged (linear: value == index).
|
|
261
|
+
assert small['value'].isna().sum() == 0
|
|
262
|
+
assert small.iloc[3]['value'] == pytest.approx(3.0)
|
|
263
|
+
assert small.iloc[10]['value'] == pytest.approx(10.0)
|
|
264
|
+
|
|
265
|
+
# With a tighter threshold the 7-min hole is now "wide" -> preserved.
|
|
266
|
+
wide = interpolate_small_gaps(
|
|
267
|
+
df, gap_threshold=pd.Timedelta('5min'),
|
|
253
268
|
source_cadence=pd.Timedelta('1min'),
|
|
254
269
|
)
|
|
255
|
-
|
|
256
|
-
#
|
|
257
|
-
assert np.isnan(result.iloc[1]['value'])
|
|
270
|
+
assert not np.isnan(wide.iloc[3]['value']) # 1-min hole still filled
|
|
271
|
+
assert wide['value'].iloc[8:14].isna().all() # 7-min hole preserved
|
|
258
272
|
|
|
259
273
|
def test_auto_cadence_estimation(self):
|
|
260
274
|
"""Test that source_cadence is auto-estimated when None."""
|
|
@@ -236,12 +236,77 @@ class TestLttbNanEncodedGaps:
|
|
|
236
236
|
source_cadence=pd.Timedelta('1min'),
|
|
237
237
|
)
|
|
238
238
|
|
|
239
|
-
# 'other' is constant 100 outside its hole;
|
|
240
|
-
#
|
|
239
|
+
# 'other' is constant 100 outside its hole; the small-gap preprocessing
|
|
240
|
+
# bridges it per-column rather than propagating NaN.
|
|
241
241
|
assert result['other'].isna().sum() == 0
|
|
242
242
|
assert (result['other'] == 100.0).all()
|
|
243
243
|
|
|
244
244
|
|
|
245
|
+
class TestLttbIncludeSelection:
|
|
246
|
+
"""Include columns are row-SELECTED at LTTB timetags, never interpolated.
|
|
247
|
+
|
|
248
|
+
Regression coverage for the np.interp edge-clamp bug: interpolating an
|
|
249
|
+
include column onto the target's timetags stamped the column's edge value
|
|
250
|
+
across every timetag outside its valid range (a constant foreign peak at
|
|
251
|
+
each gap). Selection cannot fabricate values.
|
|
252
|
+
"""
|
|
253
|
+
|
|
254
|
+
def test_high_window_does_not_bleed_into_quiet_period(self):
|
|
255
|
+
"""A column valid only in a late high window must not appear earlier."""
|
|
256
|
+
idx = pd.date_range('2024-01-01 00:00', periods=24 * 60, freq='1min')
|
|
257
|
+
signal = 350 + 10 * np.sin(np.arange(len(idx)) / 30.0) # dense all day
|
|
258
|
+
other = np.full(len(idx), np.nan)
|
|
259
|
+
hi = (idx >= '2024-01-01 20:00') & (idx <= '2024-01-01 21:00')
|
|
260
|
+
other[hi] = 602.9 # the ONLY place 'other' has data
|
|
261
|
+
df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
|
|
262
|
+
|
|
263
|
+
result = downsample_lttb(
|
|
264
|
+
df, target_column='signal', target_cadence='PT5M',
|
|
265
|
+
source_cadence=pd.Timedelta('1min'),
|
|
266
|
+
)
|
|
267
|
+
|
|
268
|
+
# No 'other' value may appear at a timetag outside its valid window,
|
|
269
|
+
# and certainly not the clamped constant 602.9 in the quiet morning.
|
|
270
|
+
nonnull = result['other'].dropna()
|
|
271
|
+
assert (nonnull.index >= pd.Timestamp('2024-01-01 20:00')).all()
|
|
272
|
+
assert (nonnull.index <= pd.Timestamp('2024-01-01 21:00')).all()
|
|
273
|
+
morning = result.loc[result.index < pd.Timestamp('2024-01-01 19:00'), 'other']
|
|
274
|
+
assert morning.isna().all()
|
|
275
|
+
|
|
276
|
+
def test_ridealong_wide_hole_stays_nan(self):
|
|
277
|
+
"""A hole wider than gap_threshold in a ride-along column stays NaN."""
|
|
278
|
+
idx = pd.date_range('2024-01-01', periods=240, freq='1min')
|
|
279
|
+
signal = 500 + 10 * np.sin(np.arange(240) / 20.0) # complete target
|
|
280
|
+
other = np.full(240, 100.0)
|
|
281
|
+
other[60:150] = np.nan # 90-min hole >> 30-min gap_threshold (PT15M)
|
|
282
|
+
df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
|
|
283
|
+
|
|
284
|
+
result = downsample_lttb(
|
|
285
|
+
df, target_column='signal', target_cadence='PT15M',
|
|
286
|
+
source_cadence=pd.Timedelta('1min'),
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
in_hole = result[(result.index > idx[60]) & (result.index < idx[149])]
|
|
290
|
+
assert not in_hole.empty # target is dense there, so timetags exist
|
|
291
|
+
assert in_hole['other'].isna().all()
|
|
292
|
+
|
|
293
|
+
def test_include_values_are_exact_source_selections(self):
|
|
294
|
+
"""Each carried include value equals the real source sample (not interp)."""
|
|
295
|
+
idx = pd.date_range('2024-01-01', periods=240, freq='1min')
|
|
296
|
+
rng = np.random.default_rng(0)
|
|
297
|
+
signal = 500 + 50 * np.sin(np.arange(240) / 15.0) + rng.normal(0, 3, 240)
|
|
298
|
+
other = 100 + rng.normal(0, 10, 240) # noisy, dense (no gaps)
|
|
299
|
+
df = pd.DataFrame({'signal': signal, 'other': other}, index=idx)
|
|
300
|
+
|
|
301
|
+
result = downsample_lttb(
|
|
302
|
+
df, target_column='signal', target_cadence='PT15M',
|
|
303
|
+
source_cadence=pd.Timedelta('1min'),
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
for t, v in result['other'].dropna().items():
|
|
307
|
+
assert v == pytest.approx(df.loc[t, 'other'])
|
|
308
|
+
|
|
309
|
+
|
|
245
310
|
class TestLttbShortSegments:
|
|
246
311
|
"""Short data islands must survive instead of being dropped."""
|
|
247
312
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|