downsampler 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- downsampler/__init__.py +80 -0
- downsampler/aggregators.py +338 -0
- downsampler/config.py +72 -0
- downsampler/core.py +166 -0
- downsampler/deferred.py +357 -0
- downsampler/edges.py +202 -0
- downsampler/fidelity/__init__.py +23 -0
- downsampler/fidelity/comparison.py +343 -0
- downsampler/fidelity/metrics.py +212 -0
- downsampler/fidelity/visualization.py +359 -0
- downsampler/gaps.py +310 -0
- downsampler/lttb.py +207 -0
- downsampler/utils.py +150 -0
- downsampler-0.1.0.dist-info/METADATA +246 -0
- downsampler-0.1.0.dist-info/RECORD +18 -0
- downsampler-0.1.0.dist-info/WHEEL +5 -0
- downsampler-0.1.0.dist-info/licenses/LICENSE +21 -0
- downsampler-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
"""Visualization helpers for fidelity comparison.
|
|
2
|
+
|
|
3
|
+
Provides functions for plotting original vs downsampled data and
|
|
4
|
+
comparing different methods.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Any
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from downsampler.fidelity.comparison import ComparisonResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def plot_comparison(
|
|
15
|
+
original: pd.DataFrame,
|
|
16
|
+
downsampled: pd.DataFrame,
|
|
17
|
+
column: str,
|
|
18
|
+
backend: str = "matplotlib",
|
|
19
|
+
title: str | None = None,
|
|
20
|
+
**kwargs
|
|
21
|
+
) -> Any:
|
|
22
|
+
"""Plot original and downsampled data for visual comparison.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
original: Original DataFrame.
|
|
26
|
+
downsampled: Downsampled DataFrame.
|
|
27
|
+
column: Column to plot.
|
|
28
|
+
backend: Plotting backend ('matplotlib' or 'altair').
|
|
29
|
+
title: Optional plot title.
|
|
30
|
+
**kwargs: Additional arguments passed to the plotting function.
|
|
31
|
+
|
|
32
|
+
Returns:
|
|
33
|
+
Plot object (matplotlib Figure or Altair Chart).
|
|
34
|
+
|
|
35
|
+
Example:
|
|
36
|
+
>>> fig = plot_comparison(original_df, downsampled_df, 'signal')
|
|
37
|
+
>>> fig.savefig('comparison.png')
|
|
38
|
+
"""
|
|
39
|
+
if backend == "matplotlib":
|
|
40
|
+
return _plot_matplotlib(original, downsampled, column, title, **kwargs)
|
|
41
|
+
elif backend == "altair":
|
|
42
|
+
return _plot_altair(original, downsampled, column, title, **kwargs)
|
|
43
|
+
else:
|
|
44
|
+
raise ValueError(f"Unknown backend: {backend}")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _plot_matplotlib(
|
|
48
|
+
original: pd.DataFrame,
|
|
49
|
+
downsampled: pd.DataFrame,
|
|
50
|
+
column: str,
|
|
51
|
+
title: str | None = None,
|
|
52
|
+
figsize: tuple = (12, 6),
|
|
53
|
+
**kwargs
|
|
54
|
+
) -> Any:
|
|
55
|
+
"""Create matplotlib comparison plot."""
|
|
56
|
+
try:
|
|
57
|
+
import matplotlib.pyplot as plt
|
|
58
|
+
except ImportError:
|
|
59
|
+
raise ImportError("matplotlib is required for this function. Install with: pip install matplotlib")
|
|
60
|
+
|
|
61
|
+
fig, ax = plt.subplots(figsize=figsize)
|
|
62
|
+
|
|
63
|
+
# Plot original
|
|
64
|
+
ax.plot(
|
|
65
|
+
original.index,
|
|
66
|
+
original[column],
|
|
67
|
+
label='Original',
|
|
68
|
+
alpha=0.7,
|
|
69
|
+
linewidth=0.5,
|
|
70
|
+
color='blue'
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Plot downsampled
|
|
74
|
+
ax.plot(
|
|
75
|
+
downsampled.index,
|
|
76
|
+
downsampled[column],
|
|
77
|
+
label='Downsampled',
|
|
78
|
+
alpha=0.9,
|
|
79
|
+
linewidth=1.5,
|
|
80
|
+
color='red',
|
|
81
|
+
marker='.',
|
|
82
|
+
markersize=3
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
ax.set_xlabel('Time')
|
|
86
|
+
ax.set_ylabel(column)
|
|
87
|
+
ax.legend()
|
|
88
|
+
|
|
89
|
+
if title:
|
|
90
|
+
ax.set_title(title)
|
|
91
|
+
else:
|
|
92
|
+
reduction = len(original) / len(downsampled) if len(downsampled) > 0 else float('inf')
|
|
93
|
+
ax.set_title(f'Downsampling Comparison ({reduction:.1f}x reduction)')
|
|
94
|
+
|
|
95
|
+
fig.tight_layout()
|
|
96
|
+
return fig
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _plot_altair(
|
|
100
|
+
original: pd.DataFrame,
|
|
101
|
+
downsampled: pd.DataFrame,
|
|
102
|
+
column: str,
|
|
103
|
+
title: str | None = None,
|
|
104
|
+
width: int = 800,
|
|
105
|
+
height: int = 400,
|
|
106
|
+
**kwargs
|
|
107
|
+
) -> Any:
|
|
108
|
+
"""Create Altair comparison plot."""
|
|
109
|
+
try:
|
|
110
|
+
import altair as alt
|
|
111
|
+
except ImportError:
|
|
112
|
+
raise ImportError("altair is required for this function. Install with: pip install altair")
|
|
113
|
+
|
|
114
|
+
# Prepare data
|
|
115
|
+
orig_data = original[[column]].reset_index()
|
|
116
|
+
orig_data.columns = ['time', column]
|
|
117
|
+
orig_data['source'] = 'Original'
|
|
118
|
+
|
|
119
|
+
ds_data = downsampled[[column]].reset_index()
|
|
120
|
+
ds_data.columns = ['time', column]
|
|
121
|
+
ds_data['source'] = 'Downsampled'
|
|
122
|
+
|
|
123
|
+
combined = pd.concat([orig_data, ds_data])
|
|
124
|
+
|
|
125
|
+
chart = alt.Chart(combined).mark_line().encode(
|
|
126
|
+
x='time:T',
|
|
127
|
+
y=f'{column}:Q',
|
|
128
|
+
color='source:N',
|
|
129
|
+
strokeWidth=alt.condition(
|
|
130
|
+
alt.datum.source == 'Downsampled',
|
|
131
|
+
alt.value(2),
|
|
132
|
+
alt.value(0.5)
|
|
133
|
+
),
|
|
134
|
+
opacity=alt.condition(
|
|
135
|
+
alt.datum.source == 'Downsampled',
|
|
136
|
+
alt.value(1),
|
|
137
|
+
alt.value(0.7)
|
|
138
|
+
)
|
|
139
|
+
).properties(
|
|
140
|
+
width=width,
|
|
141
|
+
height=height,
|
|
142
|
+
title=title or 'Downsampling Comparison'
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
return chart
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def plot_method_comparison(
    results: list[ComparisonResult],
    metric: str = "rmse",
    backend: str = "matplotlib",
    **kwargs
) -> Any:
    """Chart how several downsampling methods score on one metric.

    This is a thin dispatcher: it validates ``backend`` and forwards the
    remaining arguments to the matching backend-specific renderer.

    Args:
        results: List of ComparisonResult objects.
        metric: Name of the metric to compare ('rmse', 'mae', 'pearson_r', etc.).
        backend: Plotting backend, either 'matplotlib' or 'altair'.
        **kwargs: Extra keyword arguments forwarded unchanged to the
            backend renderer.

    Returns:
        Whatever the selected renderer returns.

    Raises:
        ValueError: If ``backend`` is not a supported backend name.
    """
    # Reject unsupported backends up front so the dispatch below is binary.
    if backend not in ("matplotlib", "altair"):
        raise ValueError(f"Unknown backend: {backend}")

    if backend == "matplotlib":
        render = _plot_method_comparison_matplotlib
    else:
        render = _plot_method_comparison_altair
    return render(results, metric, **kwargs)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _plot_method_comparison_matplotlib(
    results: list[ComparisonResult],
    metric: str = "rmse",
    figsize: tuple = (10, 6),
    **kwargs
) -> Any:
    """Create a matplotlib bar chart comparing methods on one metric.

    One bar is drawn per result, labelled with ``result.method.value`` and
    sized by ``getattr(result.metrics, metric)``. The best bar is coloured
    green: the smallest value for 'rmse', 'mae', 'max_error' and
    'peak_error', the largest value for any other metric. NaN values are
    never chosen as best, and nothing is highlighted when ``results`` is
    empty or every value is NaN.

    Args:
        results: List of ComparisonResult objects.
        metric: Attribute name to read from each result's ``metrics``.
        figsize: Figure size passed to ``plt.subplots``.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The matplotlib Figure that was created.

    Raises:
        ImportError: If matplotlib cannot be imported.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError as exc:
        raise ImportError("matplotlib is required") from exc

    # Extract data
    methods = [r.method.value for r in results]
    values = [getattr(r.metrics, metric) for r in results]

    fig, ax = plt.subplots(figsize=figsize)
    bars = ax.bar(methods, values, color='steelblue')

    # Highlight best value. argmin/argmax raise on empty input and would
    # select a NaN entry, so use the NaN-aware variants behind a guard.
    scores = np.asarray(values, dtype=float)
    if scores.size and not np.isnan(scores).all():
        lower_is_better = metric in ('rmse', 'mae', 'max_error', 'peak_error')
        pick_best = np.nanargmin if lower_is_better else np.nanargmax
        bars[int(pick_best(scores))].set_color('green')

    ax.set_xlabel('Method')
    ax.set_ylabel(metric.upper())
    ax.set_title(f'Method Comparison by {metric.upper()}')

    # Rotate the labels of this Axes explicitly rather than relying on
    # pyplot's notion of the "current" axes.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()

    return fig
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _plot_method_comparison_altair(
    results: list[ComparisonResult],
    metric: str = "rmse",
    width: int = 400,
    height: int = 300,
    **kwargs
) -> Any:
    """Create an Altair bar chart comparing methods on one metric.

    One bar is drawn per result, labelled with ``result.method.value`` and
    sized by ``getattr(result.metrics, metric)``. Bars are sorted by
    descending metric value and coloured per method without a legend.

    Args:
        results: List of ComparisonResult objects.
        metric: Attribute name to read from each result's ``metrics``.
        width: Chart width passed to ``properties``.
        height: Chart height passed to ``properties``.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The Altair Chart that was built.

    Raises:
        ImportError: If altair cannot be imported.
    """
    try:
        import altair as alt
    except ImportError:
        raise ImportError("altair is required")

    # One record per result: the method label and the requested metric value.
    records = []
    for result in results:
        records.append({
            'method': result.method.value,
            metric: getattr(result.metrics, metric),
        })
    data = pd.DataFrame(records)

    bars = alt.Chart(data).mark_bar()
    encoded = bars.encode(
        x=alt.X('method:N', sort='-y'),
        y=f'{metric}:Q',
        color=alt.Color('method:N', legend=None)
    )
    return encoded.properties(
        width=width,
        height=height,
        title=f'Method Comparison by {metric.upper()}'
    )
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
class MarimoHelper:
|
|
242
|
+
"""Helper class for creating interactive Marimo notebook comparisons.
|
|
243
|
+
|
|
244
|
+
Example usage in a Marimo notebook:
|
|
245
|
+
>>> from downsampler.fidelity import MarimoHelper
|
|
246
|
+
>>> helper = MarimoHelper()
|
|
247
|
+
>>> ui, output = helper.interactive_comparison(
|
|
248
|
+
... original_df, 'signal',
|
|
249
|
+
... cadences=['1min', '5min', '10min'],
|
|
250
|
+
... methods=['mean', 'lttb']
|
|
251
|
+
... )
|
|
252
|
+
>>> # Display ui and output in separate cells
|
|
253
|
+
"""
|
|
254
|
+
|
|
255
|
+
@staticmethod
|
|
256
|
+
def interactive_comparison(
|
|
257
|
+
original: pd.DataFrame,
|
|
258
|
+
column: str,
|
|
259
|
+
cadences: list[str],
|
|
260
|
+
methods: list[str] | None = None
|
|
261
|
+
) -> tuple[Any, Any]:
|
|
262
|
+
"""Create interactive comparison UI for Marimo notebooks.
|
|
263
|
+
|
|
264
|
+
Args:
|
|
265
|
+
original: Original DataFrame.
|
|
266
|
+
column: Column to compare.
|
|
267
|
+
cadences: List of cadence options.
|
|
268
|
+
methods: List of method options. If None, uses all methods.
|
|
269
|
+
|
|
270
|
+
Returns:
|
|
271
|
+
Tuple of (ui_element, output_function) for Marimo.
|
|
272
|
+
"""
|
|
273
|
+
try:
|
|
274
|
+
import marimo as mo
|
|
275
|
+
except ImportError:
|
|
276
|
+
raise ImportError(
|
|
277
|
+
"marimo is required for interactive comparisons. "
|
|
278
|
+
"Install with: pip install marimo"
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
from downsampler.config import AggregationMethod
|
|
282
|
+
from downsampler.core import downsample
|
|
283
|
+
from downsampler.fidelity.metrics import compute_metrics
|
|
284
|
+
|
|
285
|
+
if methods is None:
|
|
286
|
+
methods = [m.value for m in AggregationMethod]
|
|
287
|
+
|
|
288
|
+
# Create UI elements
|
|
289
|
+
cadence_select = mo.ui.dropdown(
|
|
290
|
+
options=cadences,
|
|
291
|
+
value=cadences[0],
|
|
292
|
+
label="Target Cadence"
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
method_select = mo.ui.dropdown(
|
|
296
|
+
options=methods,
|
|
297
|
+
value=methods[0],
|
|
298
|
+
label="Method"
|
|
299
|
+
)
|
|
300
|
+
|
|
301
|
+
ui = mo.vstack([cadence_select, method_select])
|
|
302
|
+
|
|
303
|
+
def compute_output():
|
|
304
|
+
cadence = cadence_select.value
|
|
305
|
+
method = AggregationMethod(method_select.value)
|
|
306
|
+
|
|
307
|
+
from downsampler.config import DownsampleConfig
|
|
308
|
+
|
|
309
|
+
config = DownsampleConfig(
|
|
310
|
+
method=method,
|
|
311
|
+
lttb_target_column=column if method == AggregationMethod.LTTB else None
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
downsampled = downsample(original, cadence, config)
|
|
315
|
+
metrics = compute_metrics(original, downsampled, column)
|
|
316
|
+
|
|
317
|
+
# Create plot
|
|
318
|
+
fig = plot_comparison(original, downsampled, column, backend="matplotlib")
|
|
319
|
+
|
|
320
|
+
return mo.vstack([
|
|
321
|
+
mo.md(f"### Results for {method.value} at {cadence}"),
|
|
322
|
+
mo.md(f"**Reduction:** {len(original)/len(downsampled):.1f}x"),
|
|
323
|
+
mo.md(f"**RMSE:** {metrics.rmse:.6f}"),
|
|
324
|
+
mo.md(f"**Correlation:** {metrics.pearson_r:.4f}"),
|
|
325
|
+
fig
|
|
326
|
+
])
|
|
327
|
+
|
|
328
|
+
return ui, compute_output
|
|
329
|
+
|
|
330
|
+
@staticmethod
|
|
331
|
+
def comparison_table(
|
|
332
|
+
original: pd.DataFrame,
|
|
333
|
+
column: str,
|
|
334
|
+
cadences: list[str],
|
|
335
|
+
methods: list[str] | None = None
|
|
336
|
+
) -> pd.DataFrame:
|
|
337
|
+
"""Generate a comparison table for multiple cadences and methods.
|
|
338
|
+
|
|
339
|
+
Args:
|
|
340
|
+
original: Original DataFrame.
|
|
341
|
+
column: Column to compare.
|
|
342
|
+
cadences: List of cadences to compare.
|
|
343
|
+
methods: List of methods. If None, uses all methods.
|
|
344
|
+
|
|
345
|
+
Returns:
|
|
346
|
+
DataFrame with comparison metrics.
|
|
347
|
+
"""
|
|
348
|
+
from downsampler.fidelity.comparison import FidelityComparison
|
|
349
|
+
from downsampler.config import AggregationMethod
|
|
350
|
+
|
|
351
|
+
if methods is None:
|
|
352
|
+
method_enums = list(AggregationMethod)
|
|
353
|
+
else:
|
|
354
|
+
method_enums = [AggregationMethod(m) for m in methods]
|
|
355
|
+
|
|
356
|
+
comp = FidelityComparison(original, column)
|
|
357
|
+
results = comp.compare_grid(cadences, method_enums)
|
|
358
|
+
|
|
359
|
+
return comp.summary_table(results)
|
downsampler/gaps.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
"""Gap detection and handling for time series data."""
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
from typing import Iterator
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def find_gap_indices(
    df: pd.DataFrame,
    timedelta_max_gap: pd.Timedelta
) -> pd.Series:
    """Find gaps in a DataFrame and return their locations and durations.

    A gap is any pair of consecutive index values whose spacing is equal to
    or longer than ``timedelta_max_gap``.

    Args:
        df: DataFrame with DatetimeIndex.
        timedelta_max_gap: Minimum duration to consider as a gap.

    Returns:
        Series with one entry per gap. Each label is the integer position
        in ``df`` of the first record *after* the gap, i.e. the gap spans
        ``df.index[pos - 1]`` to ``df.index[pos]``. Each value is the gap
        duration as a multiple of ``timedelta_max_gap``.

    Example:
        >>> df = pd.DataFrame(
        ...     {'value': [1, 2, 3]},
        ...     index=pd.to_datetime(['2024-01-01 00:00', '2024-01-01 00:01', '2024-01-01 00:10'])
        ... )
        >>> gaps = find_gap_indices(df, pd.Timedelta('5min'))
        >>> len(gaps)  # One gap found
        1
        >>> list(gaps.index)  # Position of the record that ends the gap
        [2]
    """
    # The first row has no predecessor (its diff is NaT), so drop it.
    # .iloc makes the slice explicitly positional on this integer-labelled Series.
    deltas = pd.Series(df.index).diff().iloc[1:]
    return deltas[deltas >= timedelta_max_gap] / timedelta_max_gap
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def groupby_gaps(
|
|
41
|
+
df: pd.DataFrame,
|
|
42
|
+
timedelta_max_gap: pd.Timedelta
|
|
43
|
+
) -> pd.api.typing.DataFrameGroupBy:
|
|
44
|
+
"""Split a DataFrame at gaps and return a groupby object.
|
|
45
|
+
|
|
46
|
+
Finds gaps in the DataFrame and returns a groupby object where each
|
|
47
|
+
group is a contiguous segment between gaps.
|
|
48
|
+
|
|
49
|
+
Args:
|
|
50
|
+
df: DataFrame with DatetimeIndex.
|
|
51
|
+
timedelta_max_gap: Minimum duration to consider as a gap.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
DataFrameGroupBy object where each group is a contiguous segment.
|
|
55
|
+
|
|
56
|
+
Note:
|
|
57
|
+
This function modifies the input DataFrame by adding a 'gap_index'
|
|
58
|
+
column. Use .copy() if you need to preserve the original.
|
|
59
|
+
|
|
60
|
+
Example:
|
|
61
|
+
>>> df = pd.DataFrame(
|
|
62
|
+
... {'value': [1, 2, 3, 4, 5]},
|
|
63
|
+
... index=pd.to_datetime([
|
|
64
|
+
... '2024-01-01 00:00', '2024-01-01 00:01', '2024-01-01 00:02',
|
|
65
|
+
... '2024-01-01 00:10', '2024-01-01 00:11'
|
|
66
|
+
... ])
|
|
67
|
+
... )
|
|
68
|
+
>>> groups = groupby_gaps(df.copy(), pd.Timedelta('5min'))
|
|
69
|
+
>>> len(list(groups)) # Two segments
|
|
70
|
+
2
|
|
71
|
+
"""
|
|
72
|
+
deltas = df.index.diff()[1:]
|
|
73
|
+
gap_indices = (deltas >= timedelta_max_gap).cumsum()
|
|
74
|
+
df['gap_index'] = [0, *gap_indices]
|
|
75
|
+
dfs_out = df.groupby('gap_index')
|
|
76
|
+
return dfs_out
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def split_at_gaps(
    df: pd.DataFrame,
    timedelta_max_gap: pd.Timedelta
) -> list[pd.DataFrame]:
    """Split a DataFrame at gaps into a list of DataFrames.

    Finds gaps in the DataFrame and returns a list of DataFrames,
    each representing a contiguous segment between gaps. The input
    DataFrame is not modified.

    Args:
        df: DataFrame with DatetimeIndex.
        timedelta_max_gap: Minimum duration to consider as a gap.

    Returns:
        List of DataFrames, one for each contiguous segment. An input with
        no rows returns an empty list.

    Example:
        >>> df = pd.DataFrame(
        ...     {'value': [1, 2, 3, 4, 5]},
        ...     index=pd.to_datetime([
        ...         '2024-01-01 00:00', '2024-01-01 00:01', '2024-01-01 00:02',
        ...         '2024-01-01 00:10', '2024-01-01 00:11'
        ...     ])
        ... )
        >>> segments = split_at_gaps(df, pd.Timedelta('5min'))
        >>> len(segments)
        2
        >>> len(segments[0])
        3
        >>> len(segments[1])
        2
    """
    # No rows means no segments; len() is used because DataFrame.empty is
    # also True for a frame that has rows but no columns.
    if len(df) == 0:
        return []

    # groupby_gaps adds a 'gap_index' column in place, so work on a copy
    # and strip the helper column from every segment.
    df_work = df.copy()
    groups = groupby_gaps(df_work, timedelta_max_gap)
    return [group.drop(columns=['gap_index']) for _, group in groups]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def iter_segments(
    df: pd.DataFrame,
    timedelta_max_gap: pd.Timedelta,
    min_points: int = 1
) -> Iterator[pd.DataFrame]:
    """Iterate over contiguous segments in a DataFrame.

    Yields DataFrames representing contiguous segments between gaps,
    optionally filtering out segments with too few points. The input
    DataFrame is not modified.

    Args:
        df: DataFrame with DatetimeIndex.
        timedelta_max_gap: Minimum duration to consider as a gap.
        min_points: Minimum number of points required in a segment.

    Yields:
        DataFrames for each contiguous segment with at least min_points.
        Nothing is yielded for an input with no rows.

    Example:
        >>> df = pd.DataFrame(
        ...     {'value': [1, 2, 3, 4, 5]},
        ...     index=pd.to_datetime([
        ...         '2024-01-01 00:00', '2024-01-01 00:01', '2024-01-01 00:02',
        ...         '2024-01-01 00:10', '2024-01-01 00:11'
        ...     ])
        ... )
        >>> for segment in iter_segments(df, pd.Timedelta('5min'), min_points=3):
        ...     print(len(segment))
        3
    """
    # No rows means no segments; len() is used because DataFrame.empty is
    # also True for a frame that has rows but no columns.
    if len(df) == 0:
        return

    # groupby_gaps adds a 'gap_index' column in place, so work on a copy
    # and strip the helper column from every segment.
    df_work = df.copy()
    groups = groupby_gaps(df_work, timedelta_max_gap)

    for _, group in groups:
        segment = group.drop(columns=['gap_index'])
        if len(segment) >= min_points:
            yield segment
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def wrap_in_nans(
    df_in: pd.DataFrame,
    offset: str = 'PT0.1S',
    where: str = 'both'
) -> pd.DataFrame:
    """Add NaN boundary rows before and/or after a DataFrame.

    Returns a copy of the DataFrame with rows of NaN values added at the
    start and/or end. This is useful for creating visual breaks in plots.
    The leading row is placed ``offset`` before the first index value and
    the trailing row ``offset`` after the last one.

    Args:
        df_in: Input DataFrame with DatetimeIndex.
        offset: Time offset for the NaN rows as ISO 8601 duration string.
        where: Where to add NaN rows: 'start', 'end', or 'both'. Any other
            value adds no rows.

    Returns:
        DataFrame with NaN boundary rows added. An input with no rows is
        returned as an unchanged copy, since there is no timestamp to
        anchor the boundary rows to.

    Example:
        >>> df = pd.DataFrame(
        ...     {'value': [1, 2, 3]},
        ...     index=pd.date_range('2024-01-01', periods=3, freq='1min')
        ... )
        >>> wrapped = wrap_in_nans(df, offset='PT1S')
        >>> len(wrapped)
        5
        >>> np.isnan(wrapped.iloc[0]['value'])
        True
    """
    df_out = df_in.copy()
    # Parse the offset first so an invalid string is reported even for empty input.
    offset_timedelta = pd.to_timedelta(offset)

    if len(df_out) == 0:
        return df_out

    data_nans = {col: np.nan for col in df_out.columns}

    # Add gap before start
    if where in ('start', 'both'):
        df_new_record_before = pd.DataFrame(
            data=data_nans,
            index=[df_out.index[0] - offset_timedelta]
        )
        df_out = pd.concat([df_new_record_before, df_out])

    # Add gap after end
    if where in ('end', 'both'):
        df_new_record_after = pd.DataFrame(
            data=data_nans,
            index=[df_out.index[-1] + offset_timedelta]
        )
        df_out = pd.concat([df_out, df_new_record_after])

    return df_out
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def mark_gaps_in_dataframe(
|
|
208
|
+
df: pd.DataFrame,
|
|
209
|
+
nominal_timedelta: pd.Timedelta = pd.to_timedelta(1, 'min'),
|
|
210
|
+
nominal_start_time: pd.Timestamp | None = None,
|
|
211
|
+
nominal_end_time: pd.Timestamp | None = None
|
|
212
|
+
) -> pd.DataFrame:
|
|
213
|
+
"""Insert NaN records at gaps to create visual breaks in plots.
|
|
214
|
+
|
|
215
|
+
Looks for gaps in the DataFrame and inserts NaN records to ensure
|
|
216
|
+
that plotting libraries will show breaks at gap locations.
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
df: DataFrame with DatetimeIndex.
|
|
220
|
+
nominal_timedelta: Expected cadence of the time series.
|
|
221
|
+
nominal_start_time: If provided, add NaN before first record if
|
|
222
|
+
it's after this time.
|
|
223
|
+
nominal_end_time: If provided, add NaN after last record if
|
|
224
|
+
it's before this time.
|
|
225
|
+
|
|
226
|
+
Returns:
|
|
227
|
+
DataFrame with NaN records inserted at gap locations.
|
|
228
|
+
|
|
229
|
+
Example:
|
|
230
|
+
>>> df = pd.DataFrame(
|
|
231
|
+
... {'value': [1, 2, 3]},
|
|
232
|
+
... index=pd.to_datetime([
|
|
233
|
+
... '2024-01-01 00:00', '2024-01-01 00:01', '2024-01-01 00:10'
|
|
234
|
+
... ])
|
|
235
|
+
... )
|
|
236
|
+
>>> marked = mark_gaps_in_dataframe(df, pd.Timedelta('1min'))
|
|
237
|
+
>>> len(marked) > len(df)
|
|
238
|
+
True
|
|
239
|
+
"""
|
|
240
|
+
deltas = pd.Series(df.index).diff()[1:]
|
|
241
|
+
gaps = deltas[deltas > nominal_timedelta] / nominal_timedelta
|
|
242
|
+
|
|
243
|
+
df_gapfilled = df.copy()
|
|
244
|
+
data_nans = {col: np.nan for col in df.columns}
|
|
245
|
+
|
|
246
|
+
for i, gap in gaps.items():
|
|
247
|
+
# Add a np.nan record after the start of each gap,
|
|
248
|
+
# to force breaks in plotted lines
|
|
249
|
+
time_gap_start = df.index[i - 1] + nominal_timedelta
|
|
250
|
+
df_new_record = pd.DataFrame(data=data_nans, index=[time_gap_start])
|
|
251
|
+
df_gapfilled = pd.concat([df_gapfilled, df_new_record]).sort_index()
|
|
252
|
+
|
|
253
|
+
# For gaps longer than 1 record, also add a np.nan record before the
|
|
254
|
+
# end of the gap
|
|
255
|
+
if gap > 2:
|
|
256
|
+
time_gap_end = df.index[i] - nominal_timedelta
|
|
257
|
+
df_new_record = pd.DataFrame(data=data_nans, index=[time_gap_end])
|
|
258
|
+
df_gapfilled = pd.concat([df_gapfilled, df_new_record]).sort_index()
|
|
259
|
+
|
|
260
|
+
# Add gap before start
|
|
261
|
+
if nominal_start_time is not None:
|
|
262
|
+
if df.index[0] > nominal_start_time:
|
|
263
|
+
df_new_record_before = pd.DataFrame(
|
|
264
|
+
data=data_nans,
|
|
265
|
+
index=[df.index[0] - nominal_timedelta]
|
|
266
|
+
)
|
|
267
|
+
df_gapfilled = pd.concat([df_new_record_before, df_gapfilled])
|
|
268
|
+
|
|
269
|
+
# Add gap after end
|
|
270
|
+
if nominal_end_time is not None:
|
|
271
|
+
if df.index[-1] < nominal_end_time:
|
|
272
|
+
df_new_record_after = pd.DataFrame(
|
|
273
|
+
data=data_nans,
|
|
274
|
+
index=[df.index[-1] + nominal_timedelta]
|
|
275
|
+
)
|
|
276
|
+
df_gapfilled = pd.concat([df_gapfilled, df_new_record_after])
|
|
277
|
+
|
|
278
|
+
return df_gapfilled
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def has_gaps(df: pd.DataFrame, threshold: pd.Timedelta) -> bool:
|
|
282
|
+
"""Check if a DataFrame has gaps larger than the threshold.
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
df: DataFrame with DatetimeIndex.
|
|
286
|
+
threshold: Minimum duration to consider as a gap.
|
|
287
|
+
|
|
288
|
+
Returns:
|
|
289
|
+
True if gaps are found, False otherwise.
|
|
290
|
+
"""
|
|
291
|
+
if len(df) < 2:
|
|
292
|
+
return False
|
|
293
|
+
deltas = df.index.diff()[1:]
|
|
294
|
+
return (deltas >= threshold).any()
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def count_gaps(df: pd.DataFrame, threshold: pd.Timedelta) -> int:
|
|
298
|
+
"""Count the number of gaps in a DataFrame.
|
|
299
|
+
|
|
300
|
+
Args:
|
|
301
|
+
df: DataFrame with DatetimeIndex.
|
|
302
|
+
threshold: Minimum duration to consider as a gap.
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
Number of gaps found.
|
|
306
|
+
"""
|
|
307
|
+
if len(df) < 2:
|
|
308
|
+
return 0
|
|
309
|
+
deltas = df.index.diff()[1:]
|
|
310
|
+
return (deltas >= threshold).sum()
|