downsampler 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,357 @@
1
+ """Deferred/lazy data fetching API for downsampling.
2
+
3
+ This module provides tools for downsampling when data needs to be fetched
4
+ from an external source (e.g., API, database). It handles edge buffering
5
+ automatically to ensure stable output at the boundaries.
6
+ """
7
+
8
+ import logging
9
+ from typing import Callable, Protocol
10
+ import pandas as pd
11
+
12
+ from downsampler.config import DownsampleConfig, EdgeHandling
13
+ from downsampler.core import downsample
14
+ from downsampler.edges import compute_edge_buffer, trim_edges_by_time, expand_time_range
15
+ from downsampler.utils import parse_cadence
16
+
17
+
18
class DataFetcher(Protocol):
    """Structural type for callables that load data for a time window.

    Any callable that accepts ``(start, end)`` timestamps and returns a
    DataFrame matches this protocol.
    """

    def __call__(self, start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
        """Load the data that falls inside the requested window.

        Args:
            start: Start of time range (inclusive).
            end: End of time range (exclusive).

        Returns:
            DataFrame with DatetimeIndex containing data in the time range.
        """
        ...
38
+
39
+
40
def deferred_downsample(
    fetcher: Callable[[pd.Timestamp, pd.Timestamp], pd.DataFrame],
    output_start: pd.Timestamp,
    output_end: pd.Timestamp,
    target_cadence: str | pd.Timedelta,
    config: DownsampleConfig | None = None,
    edge_buffer_multiplier: float = 2.0,
) -> pd.DataFrame:
    """Fetch a padded window of data, downsample it, and trim the padding.

    Intended for data that lives behind an external source (API, database,
    file). Extra data is requested on both sides of the output range so the
    points near the boundaries are computed from a full neighbourhood.

    Steps performed:
        1. Widen the output range by the computed edge buffer.
        2. Call ``fetcher`` once for the widened range.
        3. Downsample the fetched data with edge handling set to KEEP.
        4. Cut the result back to ``[output_start, output_end)``.

    Args:
        fetcher: Function that retrieves data for a given time range.
            Signature: fetcher(start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame
        output_start: Start of desired output time range.
        output_end: End of desired output time range.
        target_cadence: Target cadence as ISO duration string or Timedelta.
        config: Downsampling configuration. If None, uses default config.
        edge_buffer_multiplier: Multiplier for edge buffer calculation.
            Larger values fetch more surrounding data.

    Returns:
        Downsampled DataFrame covering the requested output range. An empty
        DataFrame is returned when the fetcher yields ``None`` or no rows.

    Example:
        >>> def fetch_from_api(start, end):
        ...     # Simulate API call
        ...     return pd.DataFrame(
        ...         {'value': range(100)},
        ...         index=pd.date_range(start, end, periods=100)
        ...     )
        >>>
        >>> result = deferred_downsample(
        ...     fetcher=fetch_from_api,
        ...     output_start=pd.Timestamp('2024-01-01 00:00'),
        ...     output_end=pd.Timestamp('2024-01-01 12:00'),
        ...     target_cadence='1H'
        ... )
    """
    active_config = DownsampleConfig() if config is None else config
    cadence = parse_cadence(target_cadence)

    # Work out how much extra data is needed and widen the request by it.
    edge_buffer = compute_edge_buffer(
        cadence, active_config.edge_window, edge_buffer_multiplier
    )
    fetch_start, fetch_end = expand_time_range(output_start, output_end, edge_buffer)

    logging.debug(
        f"Fetching data from {fetch_start} to {fetch_end} "
        f"(buffer: {edge_buffer}) for output range {output_start} to {output_end}"
    )

    fetched = fetcher(fetch_start, fetch_end)
    if fetched is None or len(fetched) == 0:
        logging.warning(f"Fetcher returned empty DataFrame for {fetch_start} to {fetch_end}")
        return pd.DataFrame()

    # Same settings as the caller's config, except edges are kept: the
    # padded region is removed by time-based trimming below instead.
    keep_edges_settings = dict(
        method=active_config.method,
        lttb_target_column=active_config.lttb_target_column,
        include_columns=active_config.include_columns,
        exclude_columns=active_config.exclude_columns,
        gap_handling=active_config.gap_handling,
        gap_threshold=active_config.gap_threshold,
        edge_handling=EdgeHandling.KEEP,
        edge_window=active_config.edge_window,
        min_points_per_segment=active_config.min_points_per_segment,
    )
    keep_edges_config = DownsampleConfig(**keep_edges_settings)

    downsampled = downsample(fetched, cadence, keep_edges_config)
    if len(downsampled) == 0:
        return downsampled

    # Drop the buffered margins so only the requested range remains.
    return trim_edges_by_time(downsampled, output_start, output_end)
141
+
142
+
143
def batch_deferred_downsample(
    fetcher: Callable[[pd.Timestamp, pd.Timestamp], pd.DataFrame],
    output_start: pd.Timestamp,
    output_end: pd.Timestamp,
    target_cadence: str | pd.Timedelta,
    batch_size: str | pd.Timedelta = 'P1D',
    config: DownsampleConfig | None = None,
    edge_buffer_multiplier: float = 2.0,
) -> pd.DataFrame:
    """Downsample large time ranges in batches.

    Useful when the data is too large to fetch and process in one go.
    The output range is walked in consecutive ``batch_size`` steps; each
    step is delegated to ``deferred_downsample``, which fetches extra data
    around the batch boundaries.

    Args:
        fetcher: Function that retrieves data for a given time range.
        output_start: Start of desired output time range.
        output_end: End of desired output time range.
        target_cadence: Target cadence as ISO duration string or Timedelta.
        batch_size: Size of each processing batch. Must be a positive
            duration.
        config: Downsampling configuration. If None, uses default config.
        edge_buffer_multiplier: Multiplier for edge buffer calculation.

    Returns:
        Downsampled DataFrame covering the requested output range, sorted by
        index with duplicate index entries removed (first occurrence kept).
        An empty DataFrame is returned when no batch produced any rows.

    Raises:
        ValueError: If ``batch_size`` does not move the batch window forward
            (zero or negative duration).

    Example:
        >>> result = batch_deferred_downsample(
        ...     fetcher=fetch_from_api,
        ...     output_start=pd.Timestamp('2024-01-01'),
        ...     output_end=pd.Timestamp('2024-02-01'),
        ...     target_cadence='1H',
        ...     batch_size='P1D'  # Process one day at a time
        ... )
    """
    if config is None:
        config = DownsampleConfig()

    target_cadence = parse_cadence(target_cadence)
    batch_size = parse_cadence(batch_size)

    # A zero or negative step would leave current_start unchanged on every
    # iteration, so the loop below would never terminate and would hammer
    # the fetcher with the same request. Reject it before fetching anything.
    if output_start + batch_size <= output_start:
        raise ValueError(
            f"batch_size must be a positive duration, got {batch_size}"
        )

    results: list[pd.DataFrame] = []

    # Process in batches
    current_start = output_start
    while current_start < output_end:
        # The final batch is clipped so it never runs past output_end.
        current_end = min(current_start + batch_size, output_end)

        logging.info(f"Processing batch: {current_start} to {current_end}")

        batch_result = deferred_downsample(
            fetcher=fetcher,
            output_start=current_start,
            output_end=current_end,
            target_cadence=target_cadence,
            config=config,
            edge_buffer_multiplier=edge_buffer_multiplier,
        )

        if len(batch_result) > 0:
            results.append(batch_result)

        current_start = current_end

    if not results:
        return pd.DataFrame()

    # Concatenate and sort
    result = pd.concat(results).sort_index()

    # Remove any duplicates that might occur at batch boundaries
    result = result[~result.index.duplicated(keep='first')]

    return result
218
+
219
+
220
class LazyDownsampler:
    """A lazy downsampler that caches fetched data.

    Useful when you need to downsample to multiple cadences or compare
    different methods on the same data without re-fetching.

    A single contiguous window is cached. A request that the cached window
    does not fully cover triggers a fresh fetch that replaces the cache.

    Example:
        >>> lazy = LazyDownsampler(fetch_from_api)
        >>> result_1h = lazy.downsample(
        ...     pd.Timestamp('2024-01-01'),
        ...     pd.Timestamp('2024-01-02'),
        ...     '1H'
        ... )
        >>> result_30min = lazy.downsample(
        ...     pd.Timestamp('2024-01-01'),
        ...     pd.Timestamp('2024-01-02'),
        ...     '30min'
        ... )  # Uses cached data if sufficient
    """

    def __init__(
        self,
        fetcher: Callable[[pd.Timestamp, pd.Timestamp], pd.DataFrame],
        cache_buffer: str | pd.Timedelta = 'PT1H'
    ):
        """Initialize the lazy downsampler.

        Args:
            fetcher: Function that retrieves data for a given time range.
            cache_buffer: Extra time to fetch beyond requested range for caching.
        """
        self.fetcher = fetcher
        self.cache_buffer = parse_cadence(cache_buffer)
        # Cached frame plus the window it was fetched for; all three are
        # None until the first fetch and again after clear_cache().
        self._cache: pd.DataFrame | None = None
        self._cache_start: pd.Timestamp | None = None
        self._cache_end: pd.Timestamp | None = None

    def _ensure_cache(self, start: pd.Timestamp, end: pd.Timestamp) -> pd.DataFrame:
        """Ensure data is cached for the requested range.

        Args:
            start: Start of required range.
            end: End of required range.

        Returns:
            The cached fetch result for a window that covers at least the
            requested range. This is whatever the fetcher returned, so it
            may be ``None`` or empty if the fetcher produced no data.
        """
        # Cache hit: the stored window fully contains the requested one.
        if (
            self._cache is not None
            and self._cache_start is not None
            and self._cache_end is not None
            and self._cache_start <= start
            and self._cache_end >= end
        ):
            return self._cache

        # Cache miss: fetch the requested window padded by cache_buffer on
        # both sides. The previous cache is replaced, not merged.
        fetch_start = start - self.cache_buffer
        fetch_end = end + self.cache_buffer

        logging.info(f"LazyDownsampler: fetching {fetch_start} to {fetch_end}")
        self._cache = self.fetcher(fetch_start, fetch_end)
        self._cache_start = fetch_start
        self._cache_end = fetch_end

        return self._cache

    def downsample(
        self,
        output_start: pd.Timestamp,
        output_end: pd.Timestamp,
        target_cadence: str | pd.Timedelta,
        config: DownsampleConfig | None = None,
        edge_buffer_multiplier: float = 2.0,
    ) -> pd.DataFrame:
        """Downsample using cached data.

        Args:
            output_start: Start of desired output time range.
            output_end: End of desired output time range.
            target_cadence: Target cadence.
            config: Downsampling configuration. If None, uses default config.
            edge_buffer_multiplier: Multiplier for edge buffer.

        Returns:
            Downsampled DataFrame trimmed to ``[output_start, output_end)``.
            An empty DataFrame is returned when the fetcher yields ``None``
            or no rows, or when no rows fall inside the buffered window.
        """
        if config is None:
            config = DownsampleConfig()

        target_cadence = parse_cadence(target_cadence)

        # Compute required fetch range with edge buffer
        edge_buffer = compute_edge_buffer(
            target_cadence,
            config.edge_window,
            edge_buffer_multiplier
        )
        fetch_start = output_start - edge_buffer
        fetch_end = output_end + edge_buffer

        # Ensure cache covers the range
        df = self._ensure_cache(fetch_start, fetch_end)

        # Mirror deferred_downsample: a fetcher with nothing to offer may
        # return None or an empty frame. Bail out before touching df.index,
        # which would fail on None and is meaningless on an empty frame.
        if df is None or len(df) == 0:
            logging.warning(f"Fetcher returned empty DataFrame for {fetch_start} to {fetch_end}")
            return pd.DataFrame()

        # The cache may be wider than needed; keep only the buffered window.
        df_slice = df[(df.index >= fetch_start) & (df.index < fetch_end)]

        if len(df_slice) == 0:
            return pd.DataFrame()

        # Same settings as the caller's config, but with edges kept: the
        # buffered margins are removed by the time-based trim below.
        config_no_edges = DownsampleConfig(
            method=config.method,
            lttb_target_column=config.lttb_target_column,
            include_columns=config.include_columns,
            exclude_columns=config.exclude_columns,
            gap_handling=config.gap_handling,
            gap_threshold=config.gap_threshold,
            edge_handling=EdgeHandling.KEEP,
            edge_window=config.edge_window,
            min_points_per_segment=config.min_points_per_segment,
        )

        # Downsample
        result = downsample(df_slice, target_cadence, config_no_edges)

        if len(result) == 0:
            return result

        # Trim to output range
        return trim_edges_by_time(result, output_start, output_end)

    def clear_cache(self) -> None:
        """Clear the cached data."""
        self._cache = None
        self._cache_start = None
        self._cache_end = None
downsampler/edges.py ADDED
@@ -0,0 +1,202 @@
1
+ """Edge handling strategies for downsampled time series data."""
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from typing import Tuple
6
+
7
+ from downsampler.config import EdgeHandling
8
+
9
+
10
def identify_edge_points(
    df: pd.DataFrame,
    edge_window: int = 2
) -> pd.Series:
    """Identify edge points in a DataFrame.

    Edge points are the first and last few points in a DataFrame that may
    have reduced accuracy in downsampling due to boundary effects.

    Args:
        df: DataFrame with DatetimeIndex.
        edge_window: Number of points at each edge to mark as edge points.
            Zero marks no points; must not be negative.

    Returns:
        Boolean Series, indexed like ``df``, where True indicates an edge
        point. When ``df`` has at most ``2 * edge_window`` rows every row
        is an edge point.

    Raises:
        ValueError: If ``edge_window`` is negative.

    Example:
        >>> df = pd.DataFrame(
        ...     {'value': range(10)},
        ...     index=pd.date_range('2024-01-01', periods=10, freq='1min')
        ... )
        >>> edges = identify_edge_points(df, edge_window=2)
        >>> edges.sum()
        4
    """
    if edge_window < 0:
        raise ValueError(f"edge_window must be non-negative, got {edge_window}")

    n = len(df)
    mask = np.zeros(n, dtype=bool)

    if n <= 2 * edge_window:
        # The two edge regions meet or overlap: all points are edge points.
        mask[:] = True
    else:
        mask[:edge_window] = True
        # Use an explicit offset from the front rather than a negative
        # slice: with edge_window == 0, "-0:" means "0:" and would wrongly
        # flag every row instead of none.
        mask[n - edge_window:] = True

    return pd.Series(mask, index=df.index)
46
+
47
+
48
def apply_edge_handling(
    df: pd.DataFrame,
    handling: EdgeHandling,
    edge_window: int = 2
) -> pd.DataFrame:
    """Apply the chosen edge strategy to a (typically downsampled) frame.

    Args:
        df: DataFrame with DatetimeIndex (typically after downsampling).
        handling: Edge handling strategy to apply.
        edge_window: Number of points at each edge to consider as edge points.

    Returns:
        Depending on ``handling``:
            - KEEP: the input DataFrame itself, unchanged.
            - FLAG: a copy with a boolean '_is_edge' column added.
            - DISCARD: a copy with the edge points removed.
        Any other value returns the input DataFrame unchanged.

    Example:
        >>> df = pd.DataFrame(
        ...     {'value': range(10)},
        ...     index=pd.date_range('2024-01-01', periods=10, freq='1min')
        ... )
        >>> flagged = apply_edge_handling(df, EdgeHandling.FLAG, edge_window=2)
        >>> '_is_edge' in flagged.columns
        True
    """
    # KEEP needs no edge detection at all.
    if handling == EdgeHandling.KEEP:
        return df

    edge_mask = identify_edge_points(df, edge_window)

    if handling == EdgeHandling.FLAG:
        flagged = df.copy()
        flagged['_is_edge'] = edge_mask
        return flagged

    if handling == EdgeHandling.DISCARD:
        interior_only = df[~edge_mask]
        return interior_only.copy()

    # Unrecognised strategy: hand the input back untouched.
    return df
89
+
90
+
91
def compute_edge_buffer(
    target_cadence: pd.Timedelta,
    edge_window: int = 2,
    multiplier: float = 2.0
) -> pd.Timedelta:
    """Return how much extra time to fetch on each side of an output range.

    The buffer is ``target_cadence * edge_window * multiplier``, so that the
    points distorted by boundary effects fall outside the range the caller
    actually wants.

    Args:
        target_cadence: Target cadence for downsampling.
        edge_window: Number of points at each edge to consider as edge points.
        multiplier: Safety multiplier for the buffer.

    Returns:
        Time buffer to add at each edge.

    Example:
        >>> buffer = compute_edge_buffer(pd.Timedelta('1h'), edge_window=2, multiplier=2.0)
        >>> buffer
        Timedelta('0 days 04:00:00')
    """
    # Span of the edge region itself, then scaled by the safety factor.
    edge_span = target_cadence * edge_window
    return edge_span * multiplier
116
+
117
+
118
def trim_edges_by_time(
    df: pd.DataFrame,
    start: pd.Timestamp,
    end: pd.Timestamp
) -> pd.DataFrame:
    """Keep only the rows whose index lies in ``[start, end)``.

    Typically used after downsampling to drop the rows that were only
    fetched as an edge buffer.

    Args:
        df: DataFrame with DatetimeIndex.
        start: Start of desired time range (inclusive).
        end: End of desired time range (exclusive).

    Returns:
        A copy of the rows of ``df`` inside the time range.

    Example:
        >>> df = pd.DataFrame(
        ...     {'value': range(10)},
        ...     index=pd.date_range('2024-01-01', periods=10, freq='1h')
        ... )
        >>> start = pd.Timestamp('2024-01-01 02:00')
        >>> end = pd.Timestamp('2024-01-01 08:00')
        >>> trimmed = trim_edges_by_time(df, start, end)
        >>> len(trimmed)
        6
    """
    stamps = df.index
    at_or_after_start = stamps >= start
    before_end = stamps < end
    in_range = at_or_after_start & before_end
    return df[in_range].copy()
148
+
149
+
150
def expand_time_range(
    start: pd.Timestamp,
    end: pd.Timestamp,
    buffer: pd.Timedelta
) -> Tuple[pd.Timestamp, pd.Timestamp]:
    """Widen a time range by the same buffer on each side.

    Used to derive the range to fetch from the range that is wanted.

    Args:
        start: Original start timestamp.
        end: Original end timestamp.
        buffer: Time buffer to add at each end.

    Returns:
        Tuple of (expanded_start, expanded_end).

    Example:
        >>> start = pd.Timestamp('2024-01-01 12:00')
        >>> end = pd.Timestamp('2024-01-01 18:00')
        >>> expanded_start, expanded_end = expand_time_range(start, end, pd.Timedelta('1h'))
        >>> expanded_start
        Timestamp('2024-01-01 11:00:00')
        >>> expanded_end
        Timestamp('2024-01-01 19:00:00')
    """
    expanded_start = start - buffer
    expanded_end = end + buffer
    return expanded_start, expanded_end
177
+
178
+
179
def merge_edge_flags(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """Merge DataFrames while preserving edge flags.

    The inputs are concatenated and sorted by index. Existing '_is_edge'
    values are carried over unchanged, so points that were edges of an
    individual segment stay flagged in the combined frame.

    Args:
        dfs: List of DataFrames, potentially with '_is_edge' columns.

    Returns:
        Concatenated DataFrame sorted by index. If any input carried an
        '_is_edge' column, the result has a boolean '_is_edge' column in
        which rows from inputs without the column are False. If no input
        carried one, no such column is added. An empty list yields an empty
        DataFrame.
    """
    if not dfs:
        return pd.DataFrame()

    merged = pd.concat(dfs).sort_index()

    if '_is_edge' in merged.columns:
        # Inputs lacking the column contribute NaN after concat, which turns
        # the column into a non-boolean one that cannot be used as a mask.
        # Treat "no flag recorded" as "not an edge" and restore bool dtype.
        flags = merged['_is_edge']
        merged['_is_edge'] = np.where(flags.isna(), False, flags).astype(bool)

    return merged
@@ -0,0 +1,23 @@
1
+ """Fidelity testing and comparison tools for downsampling.
2
+
3
+ This module provides tools for evaluating the visual and statistical fidelity
4
+ of downsampled time series data compared to the original.
5
+ """
6
+
7
+ from downsampler.fidelity.metrics import FidelityMetrics, compute_metrics
8
+ from downsampler.fidelity.comparison import FidelityComparison, ComparisonResult
9
+ from downsampler.fidelity.visualization import (
10
+ plot_comparison,
11
+ plot_method_comparison,
12
+ MarimoHelper,
13
+ )
14
+
15
+ __all__ = [
16
+ "FidelityMetrics",
17
+ "compute_metrics",
18
+ "FidelityComparison",
19
+ "ComparisonResult",
20
+ "plot_comparison",
21
+ "plot_method_comparison",
22
+ "MarimoHelper",
23
+ ]