downsampler 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,343 @@
1
+ """Comparison engine for evaluating downsampling methods and cadences."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+ import pandas as pd
6
+
7
+ from downsampler.config import DownsampleConfig, AggregationMethod
8
+ from downsampler.core import downsample
9
+ from downsampler.fidelity.metrics import FidelityMetrics, compute_metrics, compute_reduction_ratio
10
+ from downsampler.utils import parse_cadence
11
+
12
+
13
@dataclass
class ComparisonResult:
    """Outcome of one downsampling run with a given method and cadence.

    Attributes:
        method: Aggregation method that produced this result.
        cadence: Target cadence the data was downsampled to.
        metrics: Fidelity metrics computed for this result.
        reduction_ratio: Data reduction ratio (original/downsampled rows).
        config: Full configuration used for the downsampling run.
        downsampled: The downsampled DataFrame, or None when it was not stored.
    """
    method: AggregationMethod
    cadence: pd.Timedelta
    metrics: FidelityMetrics
    reduction_ratio: float
    config: DownsampleConfig
    downsampled: pd.DataFrame | None = None

    def to_dict(self) -> dict:
        """Flatten the result into a single dictionary.

        The method is reported by its enum value and the cadence as a
        string; the entries of ``metrics.to_dict()`` are merged in after the
        three summary keys. ``config`` and ``downsampled`` are not included.
        """
        summary = {
            'method': self.method.value,
            'cadence': str(self.cadence),
            'reduction_ratio': self.reduction_ratio,
        }
        summary.update(self.metrics.to_dict())
        return summary
40
+
41
+
42
class FidelityComparison:
    """Engine for comparing downsampling methods and cadences.

    Each comparison downsamples the original DataFrame, scores the output
    against the original with ``compute_metrics`` and wraps the outcome in a
    ``ComparisonResult``. Results of every ``compare_*`` call accumulate on
    the instance until ``clear_results`` is called.

    Example:
        >>> original = pd.DataFrame(
        ...     {'signal': np.sin(np.linspace(0, 10*np.pi, 10000))},
        ...     index=pd.date_range('2024-01-01', periods=10000, freq='1s')
        ... )
        >>> comp = FidelityComparison(original, 'signal')
        >>> results = comp.compare_methods('1min')
        >>> for r in results:
        ...     print(f"{r.method.value}: RMSE={r.metrics.rmse:.4f}")
    """

    def __init__(
        self,
        original_df: pd.DataFrame,
        column: str,
        peak_prominence: float | None = None
    ):
        """Initialize the comparison engine.

        Args:
            original_df: The original high-cadence DataFrame. It is stored by
                reference, not copied.
            column: The column to use for comparisons.
            peak_prominence: Minimum prominence for peak detection; passed
                through unchanged to ``compute_metrics``.
        """
        self.original_df = original_df
        self.column = column
        self.peak_prominence = peak_prominence
        self._results: list[ComparisonResult] = []

    def _evaluate(
        self,
        method: AggregationMethod,
        cadence: pd.Timedelta,
        lttb_target_column: str | None,
        store_downsampled: bool,
        failure_label: str,
    ) -> ComparisonResult | None:
        """Downsample once and score the output against the original.

        Args:
            method: Aggregation method to apply.
            cadence: Already-parsed target cadence.
            lttb_target_column: Column to optimize for LTTB; None means the
                comparison column. Only forwarded when ``method`` is LTTB.
            store_downsampled: Whether to keep the downsampled DataFrame on
                the result.
            failure_label: Text identifying this run in the warning printed
                when downsampling raises.

        Returns:
            The ComparisonResult, or None when downsampling raised or
            produced an empty DataFrame.
        """
        if lttb_target_column is None:
            lttb_target_column = self.column

        config = DownsampleConfig(
            method=method,
            lttb_target_column=lttb_target_column if method == AggregationMethod.LTTB else None,
        )

        # Best effort: one failing method/cadence must not abort the whole
        # comparison, but the failure is always reported.
        try:
            downsampled = downsample(self.original_df, cadence, config)
        except Exception as e:
            print(f"Warning: {failure_label} failed: {e}")
            return None

        if len(downsampled) == 0:
            return None

        metrics = compute_metrics(
            self.original_df,
            downsampled,
            self.column,
            self.peak_prominence
        )
        reduction_ratio = compute_reduction_ratio(self.original_df, downsampled)

        return ComparisonResult(
            method=method,
            cadence=cadence,
            metrics=metrics,
            reduction_ratio=reduction_ratio,
            config=config,
            downsampled=downsampled if store_downsampled else None,
        )

    def compare_methods(
        self,
        target_cadence: str | pd.Timedelta,
        methods: list[AggregationMethod] | None = None,
        lttb_target_column: str | None = None,
        store_downsampled: bool = False
    ) -> list[ComparisonResult]:
        """Compare different downsampling methods at a fixed cadence.

        Methods whose downsampling raises are reported with a printed
        warning and skipped; methods yielding an empty DataFrame are skipped
        silently. The returned results are also appended to the engine's
        stored results.

        Args:
            target_cadence: Target cadence for all comparisons.
            methods: List of methods to compare. If None, compares all methods.
            lttb_target_column: Column to optimize for LTTB. Defaults to
                the comparison column.
            store_downsampled: Whether to store downsampled DataFrames in results.

        Returns:
            List of ComparisonResult objects.
        """
        target_cadence = parse_cadence(target_cadence)

        if methods is None:
            methods = list(AggregationMethod)

        results = []
        for method in methods:
            result = self._evaluate(
                method,
                target_cadence,
                lttb_target_column,
                store_downsampled,
                failure_label=method.value,
            )
            if result is not None:
                results.append(result)

        self._results.extend(results)
        return results

    def compare_cadences(
        self,
        cadences: list[str | pd.Timedelta],
        method: AggregationMethod = AggregationMethod.MEAN,
        lttb_target_column: str | None = None,
        store_downsampled: bool = False
    ) -> list[ComparisonResult]:
        """Compare different cadences using a fixed method.

        Cadences whose downsampling raises are reported with a printed
        warning and skipped; cadences yielding an empty DataFrame are skipped
        silently. The returned results are also appended to the engine's
        stored results.

        Args:
            cadences: List of target cadences to compare.
            method: Downsampling method to use.
            lttb_target_column: Column to optimize for LTTB. Defaults to
                the comparison column.
            store_downsampled: Whether to store downsampled DataFrames.

        Returns:
            List of ComparisonResult objects.
        """
        results = []
        for cadence in cadences:
            cadence = parse_cadence(cadence)
            result = self._evaluate(
                method,
                cadence,
                lttb_target_column,
                store_downsampled,
                failure_label=f"cadence {cadence}",
            )
            if result is not None:
                results.append(result)

        self._results.extend(results)
        return results

    def compare_grid(
        self,
        cadences: list[str | pd.Timedelta],
        methods: list[AggregationMethod] | None = None,
        lttb_target_column: str | None = None,
        store_downsampled: bool = False
    ) -> list[ComparisonResult]:
        """Compare all combinations of cadences and methods.

        Combinations whose downsampling raises are reported with a printed
        warning and skipped; combinations yielding an empty DataFrame are
        skipped silently. The returned results are also appended to the
        engine's stored results.

        Args:
            cadences: List of target cadences.
            methods: List of methods. If None, uses all methods.
            lttb_target_column: Column to optimize for LTTB. Defaults to
                the comparison column.
            store_downsampled: Whether to store downsampled DataFrames.

        Returns:
            List of ComparisonResult objects, ordered cadence-major.
        """
        if methods is None:
            methods = list(AggregationMethod)

        results = []
        for cadence in cadences:
            # Parsing does not depend on the method, so do it once per cadence.
            cadence_parsed = parse_cadence(cadence)
            for method in methods:
                result = self._evaluate(
                    method,
                    cadence_parsed,
                    lttb_target_column,
                    store_downsampled,
                    failure_label=f"{method.value} at cadence {cadence_parsed}",
                )
                if result is not None:
                    results.append(result)

        self._results.extend(results)
        return results

    def summary_table(self, results: list[ComparisonResult] | None = None) -> pd.DataFrame:
        """Generate a summary table from comparison results.

        Args:
            results: List of results to summarize. If None, uses all
                results from this comparison engine.

        Returns:
            DataFrame with one row per result (built from
            ``ComparisonResult.to_dict``); an empty DataFrame when there are
            no results.
        """
        if results is None:
            results = self._results

        if not results:
            return pd.DataFrame()

        return pd.DataFrame([r.to_dict() for r in results])

    @staticmethod
    def _score(result: ComparisonResult, priority: str) -> float:
        """Return the ranking score of a result for a priority (lower is better).

        An unrecognized priority falls back to plain RMSE. A NaN score is
        mapped to +inf so that it ranks last.
        """
        m = result.metrics
        if priority == "visual":
            # Lower is better for RMSE and peak_error
            value = m.rmse + 0.5 * m.peak_error
        elif priority == "peaks":
            # Closer to 1.0 is better for peak_count_ratio
            value = abs(1.0 - m.peak_count_ratio) + 0.1 * m.peak_error
        elif priority == "correlation":
            # Higher is better for pearson_r
            value = -m.pearson_r
        elif priority == "speed":
            # Prefer simple methods
            method_penalty = {
                AggregationMethod.MEAN: 0,
                AggregationMethod.MIN: 0,
                AggregationMethod.MAX: 0,
                AggregationMethod.MEDIAN: 0.1,
                AggregationMethod.LTTB: 0.2,
            }
            value = m.rmse + method_penalty.get(result.method, 0)
        else:
            value = m.rmse

        # NaN never compares as smaller, so min() would keep a NaN-scored
        # result whenever it happens to come first.
        return float('inf') if pd.isna(value) else value

    def recommend_settings(
        self,
        target_cadence: str | pd.Timedelta,
        priority: str = "visual"
    ) -> DownsampleConfig:
        """Recommend downsampling settings based on comparison results.

        Stored results for the cadence are reused when present; otherwise
        ``compare_methods`` is run for it, which also stores its results.
        Results whose score is NaN are never preferred over results with a
        real score.

        Args:
            target_cadence: Target cadence for the recommendation.
            priority: Optimization priority:
                - "visual": Minimize visual error (RMSE + peak_error)
                - "peaks": Prioritize peak preservation
                - "correlation": Maximize correlation
                - "speed": Prefer simple aggregation methods
                Any other value ranks by RMSE alone.

        Returns:
            Recommended DownsampleConfig; a default DownsampleConfig when no
            result is available for the cadence.
        """
        target_cadence = parse_cadence(target_cadence)

        # Run comparison if we don't have results for this cadence
        relevant_results = [r for r in self._results if r.cadence == target_cadence]
        if not relevant_results:
            relevant_results = self.compare_methods(target_cadence)

        if not relevant_results:
            # Return default config if no results
            return DownsampleConfig()

        best = min(relevant_results, key=lambda r: self._score(r, priority))
        return best.config

    def clear_results(self):
        """Clear stored comparison results."""
        self._results = []
@@ -0,0 +1,212 @@
1
+ """Statistical metrics for evaluating downsampling fidelity."""
2
+
3
+ from dataclasses import dataclass
4
+ import numpy as np
5
+ import pandas as pd
6
+ from scipy import stats
7
+ from scipy.signal import find_peaks
8
+
9
+
10
@dataclass
class FidelityMetrics:
    """Fidelity scores of downsampled data measured against the original.

    Attributes:
        mae: Mean Absolute Error between original and interpolated downsampled.
        rmse: Root Mean Square Error.
        max_error: Maximum absolute error.
        pearson_r: Pearson correlation coefficient.
        peak_error: Mean absolute error at detected peaks.
        peak_count_ratio: Ratio of peaks preserved (downsampled / original).
        coverage: Fraction of original points that could be compared.
    """
    mae: float
    rmse: float
    max_error: float
    pearson_r: float
    peak_error: float
    peak_count_ratio: float
    coverage: float

    def to_dict(self) -> dict:
        """Return the metrics as a name -> value dictionary, in field order."""
        names = (
            'mae',
            'rmse',
            'max_error',
            'pearson_r',
            'peak_error',
            'peak_count_ratio',
            'coverage',
        )
        return {name: getattr(self, name) for name in names}

    def __str__(self) -> str:
        """Render the metrics as a multi-line, human-readable string."""
        header = f"FidelityMetrics(\n"
        body = (
            f" MAE: {self.mae:.6f}\n"
            f" RMSE: {self.rmse:.6f}\n"
            f" Max Error: {self.max_error:.6f}\n"
            f" Pearson r: {self.pearson_r:.4f}\n"
            f" Peak Error: {self.peak_error:.6f}\n"
            f" Peak Count Ratio: {self.peak_count_ratio:.2%}\n"
            f" Coverage: {self.coverage:.2%}\n"
        )
        footer = f")"
        return header + body + footer
56
+
57
+
58
def compute_metrics(
    original: pd.DataFrame,
    downsampled: pd.DataFrame,
    column: str,
    peak_prominence: float | None = None
) -> FidelityMetrics:
    """Compute fidelity metrics comparing original and downsampled data.

    The downsampled data is linearly interpolated back to the original
    timestamps for comparison. Only original points that fall inside the
    time range covered by the downsampled data are compared; this applies
    to the error metrics, the correlation and the peak error alike.
    NaN values in either series are dropped first, and both series are put
    in increasing index order if they are not already.

    Args:
        original: Original high-cadence DataFrame.
        downsampled: Downsampled DataFrame.
        column: Column name to compare.
        peak_prominence: Minimum prominence for peak detection.
            If None, auto-calculated as 10% of data range (0.1 when the
            range is zero).

    Returns:
        FidelityMetrics containing various comparison metrics. All-NaN
        metrics with zero coverage are returned when the original has no
        valid values, the downsampled data has fewer than two valid values,
        or no original point lies inside the downsampled time range.

    Example:
        >>> original = pd.DataFrame(
        ...     {'value': np.sin(np.linspace(0, 10*np.pi, 1000))},
        ...     index=pd.date_range('2024-01-01', periods=1000, freq='1s')
        ... )
        >>> downsampled = original.resample('10s').mean()
        >>> metrics = compute_metrics(original, downsampled, 'value')
        >>> metrics.pearson_r > 0.9
        True
    """
    orig_values = original[column].dropna()
    if len(orig_values) == 0:
        return _empty_metrics()

    ds_values = downsampled[column].dropna()
    if len(ds_values) < 2:
        return _empty_metrics()

    # np.interp does not check that its sample x-coordinates are increasing
    # and returns meaningless values otherwise; the first/last index entries
    # are also used as time-range bounds below.
    if not orig_values.index.is_monotonic_increasing:
        orig_values = orig_values.sort_index()
    if not ds_values.index.is_monotonic_increasing:
        ds_values = ds_values.sort_index()

    # Express both time axes as seconds since the first original timestamp.
    origin = orig_values.index[0]
    one_second = pd.Timedelta('1s')
    orig_times = np.asarray((orig_values.index - origin) / one_second)
    ds_times = np.asarray((ds_values.index - origin) / one_second)
    orig_array = orig_values.values
    ds_array = ds_values.values

    # Only compare within the range of the downsampled data; outside it
    # np.interp would just repeat the first/last downsampled value.
    in_range = np.asarray(
        (orig_values.index >= ds_values.index[0])
        & (orig_values.index <= ds_values.index[-1])
    )
    orig_in_range = orig_array[in_range]
    if len(orig_in_range) == 0:
        return _empty_metrics()

    interpolated = np.interp(orig_times[in_range], ds_times, ds_array)

    # Basic error metrics
    errors = orig_in_range - interpolated
    abs_errors = np.abs(errors)
    mae = float(np.mean(abs_errors))
    rmse = float(np.sqrt(np.mean(errors**2)))
    max_error = float(np.max(abs_errors))

    # Pearson correlation is undefined for constant input; fall back to
    # 1.0 when the two series agree and 0.0 otherwise.
    if len(orig_in_range) > 1 and np.std(orig_in_range) > 0 and np.std(interpolated) > 0:
        pearson_r, _ = stats.pearsonr(orig_in_range, interpolated)
        pearson_r = float(pearson_r)
    else:
        pearson_r = 1.0 if np.allclose(orig_in_range, interpolated) else 0.0

    # Peak analysis
    if peak_prominence is None:
        data_range = np.max(orig_array) - np.min(orig_array)
        peak_prominence = 0.1 * data_range if data_range > 0 else 0.1

    orig_peaks, _ = find_peaks(orig_array, prominence=peak_prominence)
    ds_peaks, _ = find_peaks(ds_array, prominence=peak_prominence)

    # Peak count ratio
    if len(orig_peaks) > 0:
        peak_count_ratio = len(ds_peaks) / len(orig_peaks)
    else:
        peak_count_ratio = 1.0 if len(ds_peaks) == 0 else float('inf')

    # Peak error at original peak locations, restricted to peaks inside the
    # downsampled time range for the same reason as the metrics above.
    comparable_peaks = orig_peaks[in_range[orig_peaks]]
    if len(comparable_peaks) > 0:
        peak_interpolated = np.interp(orig_times[comparable_peaks], ds_times, ds_array)
        peak_errors = orig_array[comparable_peaks] - peak_interpolated
        peak_error = float(np.mean(np.abs(peak_errors)))
    else:
        peak_error = 0.0

    # Coverage: share of valid original points that could be compared
    coverage = len(orig_in_range) / len(orig_values)

    return FidelityMetrics(
        mae=mae,
        rmse=rmse,
        max_error=max_error,
        pearson_r=pearson_r,
        peak_error=peak_error,
        peak_count_ratio=peak_count_ratio,
        coverage=coverage,
    )
164
+
165
+
166
def _empty_metrics() -> FidelityMetrics:
    """Build the placeholder metrics used when no comparison is possible.

    Every error, correlation and peak metric is NaN; coverage is 0.0.
    """
    nan_metrics = dict.fromkeys(
        ('mae', 'rmse', 'max_error', 'pearson_r', 'peak_error', 'peak_count_ratio'),
        np.nan,
    )
    return FidelityMetrics(coverage=0.0, **nan_metrics)
177
+
178
+
179
def compute_reduction_ratio(
    original: pd.DataFrame,
    downsampled: pd.DataFrame
) -> float:
    """Return how many original rows there are per downsampled row.

    Args:
        original: Original DataFrame.
        downsampled: Downsampled DataFrame.

    Returns:
        Ratio of original rows to downsampled rows, or infinity when the
        downsampled DataFrame has no rows.
    """
    downsampled_rows = len(downsampled)
    return len(original) / downsampled_rows if downsampled_rows else float('inf')
195
+
196
+
197
def compute_storage_savings(
    original: pd.DataFrame,
    downsampled: pd.DataFrame
) -> float:
    """Return the row-count reduction as a percentage of the original.

    Args:
        original: Original DataFrame.
        downsampled: Downsampled DataFrame.

    Returns:
        ``100 * (1 - downsampled rows / original rows)``, or 0.0 when the
        original DataFrame has no rows. The value is negative when the
        downsampled DataFrame has more rows than the original.
    """
    original_rows = len(original)
    if not original_rows:
        return 0.0

    kept_fraction = len(downsampled) / original_rows
    return 100 * (1 - kept_fraction)