tsdownsample 0.1.4.1rc0__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsdownsample/__init__.py +26 -0
- tsdownsample/_python/__init__.py +0 -0
- tsdownsample/_python/downsamplers.py +257 -0
- tsdownsample/_rust/__init__.py +1 -0
- tsdownsample/_rust/_tsdownsample_rs.cp39-win_amd64.pyd +0 -0
- tsdownsample/downsamplers.py +158 -0
- tsdownsample/downsampling_interface.py +432 -0
- tsdownsample-0.1.4.1rc0.dist-info/METADATA +168 -0
- tsdownsample-0.1.4.1rc0.dist-info/RECORD +11 -0
- tsdownsample-0.1.4.1rc0.dist-info/WHEEL +4 -0
- tsdownsample-0.1.4.1rc0.dist-info/licenses/LICENSE +21 -0
tsdownsample/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""tsdownsample: high performance downsampling of time series data for visualization."""
|
|
2
|
+
|
|
3
|
+
from .downsamplers import (
|
|
4
|
+
EveryNthDownsampler,
|
|
5
|
+
LTTBDownsampler,
|
|
6
|
+
M4Downsampler,
|
|
7
|
+
MinMaxDownsampler,
|
|
8
|
+
MinMaxLTTBDownsampler,
|
|
9
|
+
NaNM4Downsampler,
|
|
10
|
+
NaNMinMaxDownsampler,
|
|
11
|
+
NaNMinMaxLTTBDownsampler,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.4.1rc0"
|
|
15
|
+
__author__ = "Jeroen Van Der Donckt"
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"EveryNthDownsampler",
|
|
19
|
+
"MinMaxDownsampler",
|
|
20
|
+
"M4Downsampler",
|
|
21
|
+
"LTTBDownsampler",
|
|
22
|
+
"MinMaxLTTBDownsampler",
|
|
23
|
+
"NaNMinMaxDownsampler",
|
|
24
|
+
"NaNM4Downsampler",
|
|
25
|
+
"NaNMinMaxLTTBDownsampler",
|
|
26
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from ..downsampling_interface import AbstractDownsampler
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _get_bin_idxs(x: np.ndarray, nb_bins: int) -> np.ndarray:
    """Compute the index boundaries of bins that are equidistant in x-value.

    Parameters
    ----------
    x : np.ndarray
        The x values of the input data. They are passed to ``np.searchsorted``,
        which expects them in ascending order.
    nb_bins : int
        The number of bins.

    Returns
    -------
    np.ndarray
        ``nb_bins + 1`` index boundaries; bin ``i`` covers the half-open index
        range ``[result[i], result[i + 1])``.
    """
    # nb_bins + 1 edges, evenly spaced over the value range [x[0], x[-1]].
    edges = np.linspace(x[0], x[-1], nb_bins + 1)
    # side="right": every edge maps to the index just past the last x <= edge.
    boundaries = np.searchsorted(x, edges, side="right")
    # Pin the outer boundaries so the bins span the complete index range.
    boundaries[0], boundaries[-1] = 0, len(x)
    return np.array(boundaries)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LTTB_py(AbstractDownsampler):
    """NumPy reference implementation of the LTTB downsampling algorithm.

    The first and last data points are always selected; for every bucket in
    between, the point that forms the largest triangle with the previously
    selected point and the mean of the next bucket is selected.
    """

    @staticmethod
    def _argmax_area(prev_x, prev_y, avg_next_x, avg_next_y, x_bucket, y_bucket) -> int:
        """Vectorized triangular area argmax computation.

        Parameters
        ----------
        prev_x : float
            The x value of the previously selected point.
        prev_y : float
            The y value of the previously selected point.
        avg_next_x : float
            The x mean of the next bucket.
        avg_next_y : float
            The y mean of the next bucket.
        x_bucket : np.ndarray
            All x values in the current bucket.
        y_bucket : np.ndarray
            All y values in the current bucket.

        Returns
        -------
        int
            The position (relative to the bucket start) of the point with the
            largest triangular area.
        """
        # Twice the triangle area per candidate point; the constant factor does
        # not influence the argmax, so it is left out.
        doubled_areas = np.abs(
            x_bucket * (prev_y - avg_next_y)
            + y_bucket * (avg_next_x - prev_x)
            + (prev_x * avg_next_y - avg_next_x * prev_y)
        )
        return doubled_areas.argmax()

    def _downsample(
        self, x: Union[np.ndarray, None], y: np.ndarray, n_out: int, **kwargs
    ) -> np.ndarray:
        """Select ``n_out`` indices from the data with the LTTB algorithm.

        Parameters
        ----------
        x : np.ndarray or None
            The x values. When None, ``np.arange(len(y))`` is used.
        y : np.ndarray
            The y values.
        n_out : int
            The number of indices to select.
        **kwargs
            Not used by this implementation.

        Returns
        -------
        np.ndarray
            The ``n_out`` selected indices (dtype int64).
        """
        if x is None:
            # Is fine for this implementation as this is only used for testing
            x = np.arange(y.shape[0])

        n_samples = y.shape[0]
        # Bucket size. Leave room for start and end data points
        bucket_width = (n_samples - 2) / (n_out - 2)
        # NOTE: the integer cast has to happen after the array is created; using
        # the dtype argument of arange() would truncate the fractional step
        # before the bucket starts are generated.
        bucket_starts = np.arange(start=1, stop=n_samples, step=bucket_width)
        bucket_starts = bucket_starts.astype(np.int64)

        # Output array; the first and the last data point are always retained
        selected = np.empty(n_out, dtype="int64")
        selected[0] = 0
        selected[-1] = x.shape[0] - 1

        # Boolean data is converted to int8 so arithmetic can be applied to it
        if x.dtype == np.bool_:
            x = x.astype(np.int8)
        if y.dtype == np.bool_:
            y = y.astype(np.int8)

        prev_idx = 0
        for i in range(n_out - 3):
            lo, mid, hi = bucket_starts[i], bucket_starts[i + 1], bucket_starts[i + 2]
            prev_idx = lo + LTTB_py._argmax_area(
                prev_x=x[prev_idx],
                prev_y=y[prev_idx],
                avg_next_x=np.mean(x[mid:hi]),
                avg_next_y=y[mid:hi].mean(),
                x_bucket=x[lo:mid],
                y_bucket=y[lo:mid],
            )
            selected[i + 1] = prev_idx

        # ------------ EDGE CASE ------------
        # For the final bucket there is no next bucket to average, the very last
        # data point is used instead.
        lo, hi = bucket_starts[-2], bucket_starts[-1]
        selected[-2] = lo + LTTB_py._argmax_area(
            prev_x=x[prev_idx],
            prev_y=y[prev_idx],
            avg_next_x=x[-1],
            avg_next_y=y[-1],
            x_bucket=x[lo:hi],
            y_bucket=y[lo:hi],
        )
        return selected
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class MinMax_py(AbstractDownsampler):
    """Reference implementation of binned min-max aggregation that ignores NaNs.

    For every non-empty bin the indices of the minimum and maximum y value are
    selected with ``np.nanargmin`` / ``np.nanargmax``; duplicate indices are
    dropped from the result.
    """

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Assert that ``n_out`` is even (two indices are selected per bin)."""
        assert n_out % 2 == 0, "n_out must be a multiple of 2"

    def _downsample(
        self, x: Union[np.ndarray, None], y: np.ndarray, n_out: int, **kwargs
    ) -> np.ndarray:
        """Return the sorted, unique min/max indices of ``n_out // 2`` bins.

        Parameters
        ----------
        x : np.ndarray or None
            The x values. When None, ``np.arange(len(y))`` is used.
        y : np.ndarray
            The y values.
        n_out : int
            The number of indices to select (at most).
        **kwargs
            Not used by this implementation.
        """
        if x is None:
            # Is fine for this implementation as this is only used for testing
            x = np.arange(y.shape[0])

        # datetime64 / timedelta64 values are reinterpreted as int64
        is_datetime = np.issubdtype(x.dtype, np.datetime64)
        if is_datetime or np.issubdtype(x.dtype, np.timedelta64):
            x = x.view(np.int64)

        edges = _get_bin_idxs(x, n_out // 2)

        picked = []
        for start, stop in zip(edges[:-1], edges[1:]):
            window = y[start:stop]
            if len(window) == 0:
                continue  # empty bin, nothing to select
            # argmin & argmax of the bin, shifted to absolute positions
            picked.append(start + np.nanargmin(window))
            picked.append(start + np.nanargmax(window))
        return np.unique(picked)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class NaNMinMax_py(AbstractDownsampler):
    """Reference implementation of binned min-max aggregation using plain
    ``np.argmin`` / ``np.argmax`` (i.e. without the NaN-ignoring variants).

    Duplicate indices are retained in the result.
    """

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Assert that ``n_out`` is even (two indices are selected per bin)."""
        assert n_out % 2 == 0, "n_out must be a multiple of 2"

    def _downsample(
        self, x: Union[np.ndarray, None], y: np.ndarray, n_out: int, **kwargs
    ) -> np.ndarray:
        """Return the sorted min/max indices of ``n_out // 2`` bins.

        Parameters
        ----------
        x : np.ndarray or None
            The x values. When None, ``np.arange(len(y))`` is used.
        y : np.ndarray
            The y values.
        n_out : int
            The number of indices to select (at most).
        **kwargs
            Not used by this implementation.
        """
        if x is None:
            # Is fine for this implementation as this is only used for testing
            x = np.arange(y.shape[0])

        # datetime64 / timedelta64 values are reinterpreted as int64
        is_datetime = np.issubdtype(x.dtype, np.datetime64)
        if is_datetime or np.issubdtype(x.dtype, np.timedelta64):
            x = x.view(np.int64)

        edges = _get_bin_idxs(x, n_out // 2)

        picked = []
        for start, stop in zip(edges[:-1], edges[1:]):
            window = y[start:stop]
            if len(window) == 0:
                continue  # empty bin, nothing to select
            # argmin & argmax of the bin, shifted to absolute positions
            picked.append(start + np.argmin(window))
            picked.append(start + np.argmax(window))
        picked.sort()
        return np.array(picked)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class M4_py(AbstractDownsampler):
    """Aggregation method which selects the 4 M-s per bin, i.e. the first index,
    the y-argmin, the y-argmax and the last index (NaNs are ignored for the
    argmin / argmax via ``np.nanargmin`` / ``np.nanargmax``).

    .. note::
        When `n_out` is 4 * the canvas its pixel width it should create a
        pixel-perfect visualization w.r.t. the raw data.

    """

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Assert that ``n_out`` is a multiple of 4 (four indices per bin)."""
        assert n_out % 4 == 0, "n_out must be a multiple of 4"

    def _downsample(
        self, x: Union[np.ndarray, None], y: np.ndarray, n_out: int, **kwargs
    ) -> np.ndarray:
        """Return the sorted first/min/max/last indices of ``n_out // 4`` bins.

        Parameters
        ----------
        x : np.ndarray or None
            The x values. When None, ``np.arange(len(y))`` is used.
        y : np.ndarray
            The y values.
        n_out : int
            The number of indices to select (at most).
        **kwargs
            Not used by this implementation.
        """
        if x is None:
            # Is fine for this implementation as this is only used for testing
            x = np.arange(y.shape[0])

        # datetime64 / timedelta64 values are reinterpreted as int64
        is_datetime = np.issubdtype(x.dtype, np.datetime64)
        if is_datetime or np.issubdtype(x.dtype, np.timedelta64):
            x = x.view(np.int64)

        edges = _get_bin_idxs(x, n_out // 4)

        picked = []
        for start, stop in zip(edges[:-1], edges[1:]):
            window = y[start:stop]
            if len(window) == 0:
                continue  # empty bin, nothing to select

            # min(idx), argmin(bin), argmax(bin), max(idx)
            picked.append(start)
            picked.append(start + np.nanargmin(window))
            picked.append(start + np.nanargmax(window))
            picked.append(stop - 1)

        # NOTE: np.unique is deliberately not used so that all indices are retained
        picked.sort()
        return np.array(picked)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class NaNM4_py(AbstractDownsampler):
    """Reference implementation of M4 using the plain ``argmin`` / ``argmax``
    array methods (i.e. without the NaN-ignoring variants).

    Per bin the first index, the y-argmin, the y-argmax and the last index are
    selected.
    """

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Assert that ``n_out`` is a multiple of 4 (four indices per bin)."""
        assert n_out % 4 == 0, "n_out must be a multiple of 4"

    def _downsample(
        self, x: Union[np.ndarray, None], y: np.ndarray, n_out: int, **kwargs
    ) -> np.ndarray:
        """Return the sorted first/min/max/last indices of ``n_out // 4`` bins.

        Parameters
        ----------
        x : np.ndarray or None
            The x values. When None, ``np.arange(len(y))`` is used.
        y : np.ndarray
            The y values.
        n_out : int
            The number of indices to select (at most).
        **kwargs
            Not used by this implementation.
        """
        if x is None:
            # Is fine for this implementation as this is only used for testing
            x = np.arange(y.shape[0])

        # datetime64 / timedelta64 values are reinterpreted as int64
        is_datetime = np.issubdtype(x.dtype, np.datetime64)
        if is_datetime or np.issubdtype(x.dtype, np.timedelta64):
            x = x.view(np.int64)

        edges = _get_bin_idxs(x, n_out // 4)

        picked = []
        for start, stop in zip(edges[:-1], edges[1:]):
            window = y[start:stop]
            if len(window) == 0:
                continue  # empty bin, nothing to select

            # min(idx), argmin(bin), argmax(bin), max(idx)
            picked.append(start)
            picked.append(start + window.argmin())
            picked.append(start + window.argmax())
            picked.append(stop - 1)

        # NOTE: np.unique is deliberately not used so that all indices are retained
        picked.sort()
        return np.array(picked)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# In this folder the compiled rust code should be placed.
|
|
Binary file
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from typing import Union
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
# ------------------ Rust Downsamplers ------------------
|
|
7
|
+
from tsdownsample._rust import _tsdownsample_rs # type: ignore[attr-defined]
|
|
8
|
+
|
|
9
|
+
from .downsampling_interface import (
|
|
10
|
+
AbstractDownsampler,
|
|
11
|
+
AbstractRustDownsampler,
|
|
12
|
+
AbstractRustNaNDownsampler,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MinMaxDownsampler(AbstractRustDownsampler):
    """Downsampler that uses the MinMax algorithm. If the y data contains NaNs,
    these are ignored (i.e. the NaNs are not taken into account when selecting
    data points).

    For each bin, the indices of the minimum and maximum values are selected.
    """

    @property
    def rust_mod(self):
        """The ``minmax`` submodule of the compiled Rust extension."""
        return _tsdownsample_rs.minmax

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Run the base-class check and additionally require an even ``n_out``.

        Raises
        ------
        ValueError
            If the base-class check fails or ``n_out`` is not even.
        """
        AbstractRustDownsampler._check_valid_n_out(n_out)
        if n_out % 2:
            raise ValueError("n_out must be even")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class NaNMinMaxDownsampler(AbstractRustNaNDownsampler):
    """NaN-aware variant of the MinMax downsampler: if the y data contains NaNs,
    the indices of these NaNs are returned.

    For each bin, the indices of the minimum and maximum values are selected.
    """

    @property
    def rust_mod(self):
        """The ``minmax`` submodule of the compiled Rust extension."""
        return _tsdownsample_rs.minmax

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Run the base-class check and additionally require an even ``n_out``.

        Raises
        ------
        ValueError
            If the base-class check fails or ``n_out`` is not even.
        """
        AbstractRustDownsampler._check_valid_n_out(n_out)
        if n_out % 2:
            raise ValueError("n_out must be even")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class M4Downsampler(AbstractRustDownsampler):
    """Downsampler that uses the M4 algorithm. If the y data contains NaNs, these
    are ignored (i.e. the NaNs are not taken into account when selecting data
    points).

    For each bin, the indices of the first, last, minimum and maximum values are
    selected.
    """

    @property
    def rust_mod(self):
        """The ``m4`` submodule of the compiled Rust extension."""
        return _tsdownsample_rs.m4

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Run the base-class check and require ``n_out`` to be a multiple of 4.

        Raises
        ------
        ValueError
            If the base-class check fails or ``n_out`` is not a multiple of 4.
        """
        AbstractRustDownsampler._check_valid_n_out(n_out)
        if n_out % 4:
            raise ValueError("n_out must be a multiple of 4")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class NaNM4Downsampler(AbstractRustNaNDownsampler):
    """NaN-aware variant of the M4 downsampler: if the y data contains NaNs, the
    indices of these NaNs are returned.

    For each bin, the indices of the first, last, minimum and maximum values are
    selected.
    """

    @property
    def rust_mod(self):
        """The ``m4`` submodule of the compiled Rust extension."""
        return _tsdownsample_rs.m4

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Run the base-class check and require ``n_out`` to be a multiple of 4.

        Raises
        ------
        ValueError
            If the base-class check fails or ``n_out`` is not a multiple of 4.
        """
        AbstractRustDownsampler._check_valid_n_out(n_out)
        if n_out % 4:
            raise ValueError("n_out must be a multiple of 4")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class LTTBDownsampler(AbstractRustDownsampler):
    """Downsampler backed by the Rust implementation of the LTTB algorithm."""

    @property
    def rust_mod(self):
        """The ``lttb`` submodule of the compiled Rust extension."""
        return _tsdownsample_rs.lttb
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class MinMaxLTTBDownsampler(AbstractRustDownsampler):
    """Downsampler that uses the MinMaxLTTB algorithm. If the y data contains
    NaNs, these are ignored (i.e. the NaNs are not taken into account when
    selecting data points).

    MinMaxLTTB paper: https://arxiv.org/abs/2305.00332
    """

    @property
    def rust_mod(self):
        """The ``minmaxlttb`` submodule of the compiled Rust extension."""
        return _tsdownsample_rs.minmaxlttb

    def downsample(
        self, *args, n_out: int, minmax_ratio: int = 4, parallel: bool = False, **_
    ):
        """Downsample the data with MinMaxLTTB.

        Parameters
        ----------
        *args
            The positional ``[x], y`` data, forwarded unchanged to the parent.
        n_out : int
            The number of points to keep.
        minmax_ratio : int, optional
            Must be greater than 0; forwarded to the parent implementation under
            the keyword ``ratio``. By default 4.
        parallel : bool, optional
            Forwarded to the parent implementation. By default False.
        **_
            Any other keyword argument is accepted and discarded.

        Returns
        -------
        The result of the parent class its ``downsample`` method.
        """
        assert minmax_ratio > 0, "minmax_ratio must be greater than 0"
        forwarded = dict(n_out=n_out, parallel=parallel, ratio=minmax_ratio)
        return super().downsample(*args, **forwarded)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class NaNMinMaxLTTBDownsampler(AbstractRustNaNDownsampler):
    """NaN-aware variant of the MinMaxLTTB downsampler: if the y data contains
    NaNs, the indices of these NaNs are returned.

    MinMaxLTTB paper: https://arxiv.org/abs/2305.00332
    """

    @property
    def rust_mod(self):
        """The ``minmaxlttb`` submodule of the compiled Rust extension."""
        return _tsdownsample_rs.minmaxlttb

    def downsample(
        self, *args, n_out: int, minmax_ratio: int = 4, parallel: bool = False, **_
    ):
        """Downsample the data with the NaN-aware MinMaxLTTB.

        Parameters
        ----------
        *args
            The positional ``[x], y`` data, forwarded unchanged to the parent.
        n_out : int
            The number of points to keep.
        minmax_ratio : int, optional
            Must be greater than 0; forwarded to the parent implementation under
            the keyword ``ratio``. By default 4.
        parallel : bool, optional
            Forwarded to the parent implementation. By default False.
        **_
            Any other keyword argument is accepted and discarded.

        Returns
        -------
        The result of the parent class its ``downsample`` method.
        """
        assert minmax_ratio > 0, "minmax_ratio must be greater than 0"
        forwarded = dict(n_out=n_out, parallel=parallel, ratio=minmax_ratio)
        return super().downsample(*args, **forwarded)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# ------------------ EveryNth Downsampler ------------------
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class EveryNthDownsampler(AbstractDownsampler):
    """Downsampler that selects every nth data point.

    The contiguity check of the base class is disabled for this downsampler.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the base class with ``check_contiguous=False``."""
        super().__init__(check_contiguous=False, **kwargs)

    def _downsample(
        self, x: Union[np.ndarray, None], y: np.ndarray, n_out: int, **_
    ) -> np.ndarray:
        """Return evenly strided indices into ``y``.

        Parameters
        ----------
        x : np.ndarray or None
            Ignored by this algorithm; a warning is emitted when it is passed.
        y : np.ndarray
            The y values; only its length is used.
        n_out : int
            The number of indices to select.
        **_
            Any other keyword argument is accepted and discarded.

        Returns
        -------
        np.ndarray
            The selected indices (dtype ``np.uint``).
        """
        if x is not None:
            name = self.__class__.__name__
            warnings.warn(
                f"x is passed to downsample method of {name}, but is not taken "
                "into account by the current implementation of the EveryNth algorithm."
            )
        n_samples = len(y)
        # The stride is never smaller than 1, so no index is produced twice when
        # n_out exceeds the number of samples.
        stride = max(1, n_samples / n_out)
        # The stop value lies slightly below n_samples - presumably so that float
        # rounding in arange cannot yield an index equal to n_samples.
        positions = np.arange(start=0, stop=n_samples - 0.1, step=stride)
        return positions.astype(np.uint)
|
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
"""AbstractDownsampler interface-class, subclassed by concrete downsamplers."""
|
|
2
|
+
|
|
3
|
+
__author__ = "Jeroen Van Der Donckt"
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import warnings
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from copy import deepcopy
|
|
9
|
+
from types import ModuleType
|
|
10
|
+
from typing import Callable, List, Optional, Tuple, Union
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AbstractDownsampler(ABC):
    """Interface shared by all downsamplers.

    Subclasses implement `_downsample`; the public `downsample` method validates
    the arguments before delegating to it.
    """

    def __init__(
        self,
        check_contiguous: bool = True,
        x_dtype_regex_list: Optional[List[str]] = None,
        y_dtype_regex_list: Optional[List[str]] = None,
    ):
        """Store the validation settings.

        Parameters
        ----------
        check_contiguous : bool, optional
            Whether the input arrays must be C-contiguous, by default True.
        x_dtype_regex_list : List[str], optional
            Regexes of which at least one must match the x dtype name. When
            None (default), every x dtype is accepted.
        y_dtype_regex_list : List[str], optional
            Regexes of which at least one must match the y dtype name. When
            None (default), every y dtype is accepted.
        """
        self.check_contiguous = check_contiguous
        self.x_dtype_regex_list = x_dtype_regex_list
        self.y_dtype_regex_list = y_dtype_regex_list

    def _check_contiguous(self, arr: np.ndarray, y: bool = True):
        """Raise a ValueError when the check is enabled and `arr` is not
        C-contiguous; `y` only selects the array name used in the message.
        """
        # necessary for rust downsamplers as they don't support non-contiguous
        # arrays (.as_slice().unwrap() is called on the array in the lib.rs file,
        # which will panic if the array is not contiguous)
        if self.check_contiguous and not arr.flags["C_CONTIGUOUS"]:
            raise ValueError(f"{'y' if y else 'x'} array must be contiguous.")

    def _supports_dtype(self, arr: np.ndarray, y: bool = True):
        """Raise a ValueError when the dtype of `arr` matches none of the regexes
        configured for the y-data (``y=True``) or the x-data (``y=False``).
        """
        patterns = self.y_dtype_regex_list if y else self.x_dtype_regex_list
        if patterns is None:
            # base case: no restriction configured
            return

        dtype_name = str(arr.dtype)
        # re.match anchors at the start of the dtype name only
        if any(re.match(pattern, dtype_name) is not None for pattern in patterns):
            return
        raise ValueError(
            f"{arr.dtype} doesn't match with any regex in {patterns} "
            f"for the {'y' if y else 'x'}-data"
        )

    @staticmethod
    def _check_valid_downsample_args(
        *args,
    ) -> Tuple[Union[np.ndarray, None], np.ndarray]:
        """Split the positional arguments into x and y and validate their shape.

        One argument is interpreted as ``y``, two arguments as ``x, y``.
        Non-ndarray inputs are converted with ``np.array``.

        Returns
        -------
        Tuple[Union[np.ndarray, None], np.ndarray]
            The x data (None when it was not passed) and the y data.

        Raises
        ------
        ValueError
            When the number of arguments is not 1 or 2, when x or y is not 1D,
            or when x and y differ in length.
        """
        n_args = len(args)
        if n_args == 1:
            x, y = None, args[0]
        elif n_args == 2:
            x, y = args
        else:
            raise ValueError(
                "downsample() takes 1 or 2 positional arguments but "
                f"{len(args)} were given"
            )

        if not (x is None or isinstance(x, np.ndarray)):
            x = np.array(x)
        if not isinstance(y, np.ndarray):
            y = np.array(y)

        # y must be 1D array
        if y.ndim != 1:
            raise ValueError("y must be 1D array")
        if x is None:
            return x, y

        # x must be 1D array with same length as y
        if x.ndim != 1:
            raise ValueError("x must be 1D array")
        if len(x) != len(y):
            raise ValueError("x and y must have the same length")
        return x, y

    @staticmethod
    def _check_valid_n_out(n_out: int):
        """Raise a ValueError when `n_out` is not strictly positive."""
        if n_out <= 0:
            raise ValueError("n_out must be greater than 0")

    @abstractmethod
    def _downsample(
        self, x: Union[np.ndarray, None], y: np.ndarray, n_out: int, **kwargs
    ) -> np.ndarray:
        """Downsample the (validated) data in x and y.

        Returns
        -------
        np.ndarray
            The selected indices.
        """
        raise NotImplementedError

    def downsample(self, *args, n_out: int, **kwargs):  # x and y are optional
        """Downsample y (and x).

        Call signatures::
            downsample([x], y, n_out, **kwargs)


        Parameters
        ----------
        x, y : array-like
            The horizontal / vertical coordinates of the data points.
            *x* values are optional.
            These parameters should be 1D arrays.
            These arguments cannot be passed as keywords.
        n_out : int
            The number of points to keep.
        **kwargs
            Additional keyword arguments are passed to the downsampler.

        Returns
        -------
        np.ndarray
            The selected indices.
        """
        self._check_valid_n_out(n_out)
        x, y = self._check_valid_downsample_args(*args)

        # y is validated first, x only when it was passed
        named_arrays = [(y, True)]
        if x is not None:
            named_arrays.append((x, False))
        for arr, is_y in named_arrays:
            self._supports_dtype(arr, y=is_y)
            self._check_contiguous(arr, y=is_y)

        return self._downsample(x, y, n_out, **kwargs)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# ------------------- Rust Downsample Interface -------------------
|
|
139
|
+
# Name prefix of the downsample functions in the rust module
DOWNSAMPLE_F = "downsample"


# dtype names that are supported by the rust downsamplers (x and y)
_rust_dtypes = (
    ["float32", "float64"]
    + ["uint16", "uint32", "uint64"]
    + ["int16", "int32", "int64"]
    + ["datetime64", "timedelta64"]
)
# The y-data additionally supports the narrow dtypes. For the x-data, <= 8-bit
# dtypes are not supported as the range of the values is too small to require
# downsampling.
_y_rust_dtypes = [*_rust_dtypes, "float16", "int8", "uint8", "bool"]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class AbstractRustDownsampler(AbstractDownsampler, ABC):
|
|
161
|
+
"""RustDownsampler interface-class, subclassed by concrete downsamplers."""
|
|
162
|
+
|
|
163
|
+
def __init__(self):
|
|
164
|
+
super().__init__(True, _rust_dtypes, _y_rust_dtypes) # same for x and y
|
|
165
|
+
|
|
166
|
+
@property
|
|
167
|
+
def _downsample_func_prefix(self) -> str:
|
|
168
|
+
"""The prefix of the downsample functions in the rust module."""
|
|
169
|
+
return DOWNSAMPLE_F
|
|
170
|
+
|
|
171
|
+
@property
|
|
172
|
+
def rust_mod(self) -> ModuleType:
|
|
173
|
+
"""The compiled Rust module for the current downsampler."""
|
|
174
|
+
raise NotImplementedError
|
|
175
|
+
|
|
176
|
+
@property
|
|
177
|
+
def mod_single_core(self) -> ModuleType:
|
|
178
|
+
"""Get the single-core Rust module.
|
|
179
|
+
|
|
180
|
+
Returns
|
|
181
|
+
-------
|
|
182
|
+
ModuleType
|
|
183
|
+
If SIMD compiled module is available, that one is returned. Otherwise, the
|
|
184
|
+
scalar compiled module is returned.
|
|
185
|
+
"""
|
|
186
|
+
return self.rust_mod.sequential
|
|
187
|
+
|
|
188
|
+
@property
|
|
189
|
+
def mod_multi_core(self) -> Union[ModuleType, None]:
|
|
190
|
+
"""Get the multi-core Rust module.
|
|
191
|
+
|
|
192
|
+
Returns
|
|
193
|
+
-------
|
|
194
|
+
ModuleType or None
|
|
195
|
+
If SIMD parallel compiled module is available, that one is returned.
|
|
196
|
+
Otherwise, the scalar parallel compiled module is returned.
|
|
197
|
+
If no parallel compiled module is available, None is returned.
|
|
198
|
+
"""
|
|
199
|
+
if hasattr(self.rust_mod, "parallel"):
|
|
200
|
+
# use SIMD implementation if available
|
|
201
|
+
return self.rust_mod.parallel
|
|
202
|
+
return None # no parallel compiled module available
|
|
203
|
+
|
|
204
|
+
@staticmethod
|
|
205
|
+
def _view_x(x: np.ndarray) -> np.ndarray:
|
|
206
|
+
"""View the x-data as different dtype (if necessary)."""
|
|
207
|
+
if np.issubdtype(x.dtype, np.datetime64):
|
|
208
|
+
# datetime64 is viewed as int64
|
|
209
|
+
return x.view(dtype=np.int64)
|
|
210
|
+
elif np.issubdtype(x.dtype, np.timedelta64):
|
|
211
|
+
# timedelta64 is viewed as int64
|
|
212
|
+
return x.view(dtype=np.int64)
|
|
213
|
+
return x
|
|
214
|
+
|
|
215
|
+
@staticmethod
|
|
216
|
+
def _view_y(y: np.ndarray) -> np.ndarray:
|
|
217
|
+
"""View the y-data as different dtype (if necessary)."""
|
|
218
|
+
if y.dtype == "bool":
|
|
219
|
+
# bool is viewed as int8
|
|
220
|
+
return y.view(dtype=np.int8)
|
|
221
|
+
elif np.issubdtype(y.dtype, np.datetime64):
|
|
222
|
+
# datetime64 is viewed as int64
|
|
223
|
+
return y.view(dtype=np.int64)
|
|
224
|
+
elif np.issubdtype(y.dtype, np.timedelta64):
|
|
225
|
+
# timedelta64 is viewed as int64
|
|
226
|
+
return y.view(dtype=np.int64)
|
|
227
|
+
return y
|
|
228
|
+
|
|
229
|
+
def _switch_mod_with_y(
|
|
230
|
+
self, y_dtype: np.dtype, mod: ModuleType, downsample_func: Optional[str] = None
|
|
231
|
+
) -> Callable:
|
|
232
|
+
"""Select the appropriate function from the rust module for the y-data.
|
|
233
|
+
|
|
234
|
+
Assumes equal binning (when no data for x is passed -> only this function is
|
|
235
|
+
executed).
|
|
236
|
+
Equidistant binning is utilized when a `downsample_func` is passed from the
|
|
237
|
+
`_switch_mod_with_x_and_y` method (since the x-data is considered in the
|
|
238
|
+
downsampling).
|
|
239
|
+
|
|
240
|
+
Parameters
|
|
241
|
+
----------
|
|
242
|
+
y_dtype : np.dtype
|
|
243
|
+
The dtype of the y-data
|
|
244
|
+
mod : ModuleType
|
|
245
|
+
The module to select the appropriate function from
|
|
246
|
+
downsample_func : str, optional
|
|
247
|
+
The name of the function to use, by default DOWNSAMPLE_FUNC.
|
|
248
|
+
This argument is passed from the `_switch_mod_with_x_and_y` method when
|
|
249
|
+
the x-data is considered in the downsampling.
|
|
250
|
+
"""
|
|
251
|
+
if downsample_func is None:
|
|
252
|
+
downsample_func = self._downsample_func_prefix
|
|
253
|
+
# FLOATS
|
|
254
|
+
if np.issubdtype(y_dtype, np.floating):
|
|
255
|
+
if y_dtype == np.float16:
|
|
256
|
+
return getattr(mod, downsample_func + "_f16")
|
|
257
|
+
elif y_dtype == np.float32:
|
|
258
|
+
return getattr(mod, downsample_func + "_f32")
|
|
259
|
+
elif y_dtype == np.float64:
|
|
260
|
+
return getattr(mod, downsample_func + "_f64")
|
|
261
|
+
# UINTS
|
|
262
|
+
elif np.issubdtype(y_dtype, np.unsignedinteger):
|
|
263
|
+
if y_dtype == np.uint8:
|
|
264
|
+
return getattr(mod, downsample_func + "_u8")
|
|
265
|
+
elif y_dtype == np.uint16:
|
|
266
|
+
return getattr(mod, downsample_func + "_u16")
|
|
267
|
+
elif y_dtype == np.uint32:
|
|
268
|
+
return getattr(mod, downsample_func + "_u32")
|
|
269
|
+
elif y_dtype == np.uint64:
|
|
270
|
+
return getattr(mod, downsample_func + "_u64")
|
|
271
|
+
# INTS (need to be last because uint is subdtype of int)
|
|
272
|
+
elif np.issubdtype(y_dtype, np.integer):
|
|
273
|
+
if y_dtype == np.int8:
|
|
274
|
+
return getattr(mod, downsample_func + "_i8")
|
|
275
|
+
elif y_dtype == np.int16:
|
|
276
|
+
return getattr(mod, downsample_func + "_i16")
|
|
277
|
+
elif y_dtype == np.int32:
|
|
278
|
+
return getattr(mod, downsample_func + "_i32")
|
|
279
|
+
elif y_dtype == np.int64:
|
|
280
|
+
return getattr(mod, downsample_func + "_i64")
|
|
281
|
+
# DATETIME -> i64 (datetime64 is viewed as int64)
|
|
282
|
+
# TIMEDELTA -> i64 (timedelta64 is viewed as int64)
|
|
283
|
+
# BOOLS -> int8 (bool is viewed as int8)
|
|
284
|
+
raise ValueError(f"Unsupported data type (for y): {y_dtype}")
|
|
285
|
+
|
|
286
|
+
def _switch_mod_with_x_and_y(
|
|
287
|
+
self, # necessary to access the class its _switch_mod_with_y method
|
|
288
|
+
x_dtype: np.dtype,
|
|
289
|
+
y_dtype: np.dtype,
|
|
290
|
+
mod: ModuleType,
|
|
291
|
+
downsample_func: Optional[str] = None,
|
|
292
|
+
) -> Callable:
|
|
293
|
+
"""The x-data is considered in the downsampling
|
|
294
|
+
|
|
295
|
+
Assumes equal binning.
|
|
296
|
+
|
|
297
|
+
Parameters
|
|
298
|
+
----------
|
|
299
|
+
x_dtype : np.dtype
|
|
300
|
+
The dtype of the x-data
|
|
301
|
+
y_dtype : np.dtype
|
|
302
|
+
The dtype of the y-data
|
|
303
|
+
mod : ModuleType
|
|
304
|
+
The module to select the appropriate function from
|
|
305
|
+
downsample_func : str, optional
|
|
306
|
+
The name of the function to use, by default DOWNSAMPLE_FUNC.
|
|
307
|
+
"""
|
|
308
|
+
if downsample_func is None:
|
|
309
|
+
downsample_func = self._downsample_func_prefix
|
|
310
|
+
# FLOATS
|
|
311
|
+
if np.issubdtype(x_dtype, np.floating):
|
|
312
|
+
if x_dtype == np.float16:
|
|
313
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_f16")
|
|
314
|
+
elif x_dtype == np.float32:
|
|
315
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_f32")
|
|
316
|
+
elif x_dtype == np.float64:
|
|
317
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_f64")
|
|
318
|
+
# UINTS
|
|
319
|
+
elif np.issubdtype(x_dtype, np.unsignedinteger):
|
|
320
|
+
if x_dtype == np.uint16:
|
|
321
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_u16")
|
|
322
|
+
elif x_dtype == np.uint32:
|
|
323
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_u32")
|
|
324
|
+
elif x_dtype == np.uint64:
|
|
325
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_u64")
|
|
326
|
+
# INTS (need to be last because uint is subdtype of int)
|
|
327
|
+
elif np.issubdtype(x_dtype, np.integer):
|
|
328
|
+
if x_dtype == np.int16:
|
|
329
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_i16")
|
|
330
|
+
elif x_dtype == np.int32:
|
|
331
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_i32")
|
|
332
|
+
elif x_dtype == np.int64:
|
|
333
|
+
return self._switch_mod_with_y(y_dtype, mod, f"{downsample_func}_i64")
|
|
334
|
+
# DATETIME -> i64 (datetime64 is viewed as int64)
|
|
335
|
+
# TIMEDELTA -> i64 (timedelta64 is viewed as int64)
|
|
336
|
+
raise ValueError(f"Unsupported data type (for x): {x_dtype}")
|
|
337
|
+
|
|
338
|
+
def _downsample(
|
|
339
|
+
self,
|
|
340
|
+
x: Union[np.ndarray, None],
|
|
341
|
+
y: np.ndarray,
|
|
342
|
+
n_out: int,
|
|
343
|
+
parallel: bool = False,
|
|
344
|
+
**kwargs,
|
|
345
|
+
) -> np.ndarray:
|
|
346
|
+
"""Downsample the data in x and y."""
|
|
347
|
+
mod = self.mod_single_core
|
|
348
|
+
if parallel:
|
|
349
|
+
if self.mod_multi_core is None:
|
|
350
|
+
name = self.__class__.__name__
|
|
351
|
+
warnings.warn(
|
|
352
|
+
f"No parallel implementation available for {name}. "
|
|
353
|
+
"Falling back to single-core implementation."
|
|
354
|
+
)
|
|
355
|
+
else:
|
|
356
|
+
mod = self.mod_multi_core
|
|
357
|
+
## Viewing the y-data as different dtype (if necessary)
|
|
358
|
+
y = self._view_y(y)
|
|
359
|
+
## Viewing the x-data as different dtype (if necessary)
|
|
360
|
+
if x is None:
|
|
361
|
+
downsample_f = self._switch_mod_with_y(y.dtype, mod)
|
|
362
|
+
return downsample_f(y, n_out, **kwargs)
|
|
363
|
+
x = self._view_x(x)
|
|
364
|
+
## Getting the appropriate downsample function
|
|
365
|
+
downsample_f = self._switch_mod_with_x_and_y(x.dtype, y.dtype, mod)
|
|
366
|
+
return downsample_f(x, y, n_out, **kwargs)
|
|
367
|
+
|
|
368
|
+
def downsample(self, *args, n_out: int, parallel: bool = False, **kwargs):
    """Downsample the data passed as positional arguments.

    The data is given positionally: a single positional argument is treated
    as the y-data, while two positional arguments are treated as the x-data
    followed by the y-data.

    ``n_out`` and ``parallel`` are keyword-only; they are forwarded, together
    with any additional keyword arguments, to the parent implementation.
    """
    indices = super().downsample(*args, n_out=n_out, parallel=parallel, **kwargs)
    return indices
|
|
377
|
+
|
|
378
|
+
def __deepcopy__(self, memo):
    """Return a deep copy that shares the module attributes with ``self``.

    Attributes whose name starts with ``mod_`` or ends with ``_mod`` are
    assigned by reference (the compiled modules are not copied); every other
    attribute is deep-copied using the same ``memo``.
    """
    klass = self.__class__
    # Bypass __init__: the attributes are filled in one by one below.
    clone = klass.__new__(klass)
    # Register the clone before recursing so the shared memo already maps
    # this object to its copy.
    memo[id(self)] = clone
    for attr_name, attr_value in self.__dict__.items():
        is_module_attr = attr_name.startswith("mod_") or attr_name.endswith("_mod")
        setattr(
            clone,
            attr_name,
            attr_value if is_module_attr else deepcopy(attr_value, memo),
        )
    return clone
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
NAN_DOWNSAMPLE_F = "downsample_nan"


class AbstractRustNaNDownsampler(AbstractRustDownsampler, ABC):
    """Interface class for the NaN-variant rust downsamplers.

    Concrete downsamplers subclass this class.
    """

    @property
    def _downsample_func_prefix(self) -> str:
        """Prefix of the downsample function names in the rust module."""
        return NAN_DOWNSAMPLE_F

    def _switch_mod_with_y(
        self, y_dtype: np.dtype, mod: ModuleType, downsample_func: Optional[str] = None
    ) -> Callable:
        """Pick the function in the rust module that matches the y-data dtype.

        When no x-data is passed, only this method is executed and equal
        binning is assumed. When `_switch_mod_with_x_and_y` passes a
        `downsample_func`, equidistant binning is used (the x-data is then
        considered in the downsampling).

        Parameters
        ----------
        y_dtype : np.dtype
            The dtype of the y-data
        mod : ModuleType
            The module to select the appropriate function from
        downsample_func : str, optional
            The name of the function to use; when omitted,
            `self._downsample_func_prefix` is used.
            `_switch_mod_with_x_and_y` passes this argument when the x-data
            is considered in the downsampling.
        """
        func_name = (
            self._downsample_func_prefix if downsample_func is None else downsample_func
        )
        y_is_float = np.issubdtype(y_dtype, np.floating)
        if not y_is_float:
            # NaNs only exist for floats, so the "_nan" functions are float
            # only; any other dtype uses the regular function name, obtained
            # by stripping "_nan".
            func_name = func_name.replace("_nan", "")
        return super()._switch_mod_with_y(y_dtype, mod, func_name)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tsdownsample
|
|
3
|
+
Version: 0.1.4.1rc0
|
|
4
|
+
Classifier: Intended Audience :: Developers
|
|
5
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Operating System :: POSIX
|
|
14
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
15
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
16
|
+
Requires-Dist: numpy
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Summary: Time series downsampling in rust
|
|
19
|
+
Keywords: time series,downsampling,rust,data science,visualization
|
|
20
|
+
Author: Jeroen Van Der Donckt
|
|
21
|
+
License: MIT
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
24
|
+
Project-URL: Homepage, https://github.com/predict-idlab/tsdownsample
|
|
25
|
+
Project-URL: Repository, https://github.com/predict-idlab/tsdownsample
|
|
26
|
+
|
|
27
|
+
# tsdownsample
|
|
28
|
+
|
|
29
|
+
[](https://pypi.org/project/tsdownsample/)
|
|
30
|
+
[](https://img.shields.io/pypi/pyversions/tsdownsample)
|
|
31
|
+
[](https://pepy.tech/project/tsdownsample)
|
|
32
|
+
[](https://github.com/predict-idlab/tsdownsample/actions/workflows/codeql.yml)
|
|
33
|
+
[](https://github.com/predict-idlab/tsdownsample/actions/workflows/ci-downsample_rs.yml)
|
|
34
|
+
[](https://github.com/predict-idlab/tsdownsample/actions/workflows/ci-tsdownsample.yml)
|
|
35
|
+
[](https://discord.gg/k2d59GrxPX)
|
|
36
|
+
|
|
37
|
+
<!-- TODO: codecov -->
|
|
38
|
+
|
|
39
|
+
Extremely fast **time series downsampling 📈** for visualization, written in Rust.
|
|
40
|
+
|
|
41
|
+
## Features ✨
|
|
42
|
+
|
|
43
|
+
- **Fast**: written in rust with PyO3 bindings
|
|
44
|
+
- leverages optimized [argminmax](https://github.com/jvdd/argminmax) - which is SIMD accelerated with runtime feature detection
|
|
45
|
+
- scales linearly with the number of data points
|
|
46
|
+
<!-- TODO check if it scales sublinearly -->
|
|
47
|
+
- multithreaded with Rayon (in Rust)
|
|
48
|
+
<details>
|
|
49
|
+
<summary><i>Why we do not use Python multiprocessing</i></summary>
|
|
50
|
+
Citing the <a href="https://pyo3.rs/v0.17.3/parallelism.html">PyO3 docs on parallelism</a>:<br>
|
|
51
|
+
<blockquote>
|
|
52
|
+
CPython has the infamous Global Interpreter Lock, which prevents several threads from executing Python bytecode in parallel. This makes threading in Python a bad fit for CPU-bound tasks and often forces developers to accept the overhead of multiprocessing.
|
|
53
|
+
</blockquote>
|
|
54
|
+
In Rust - which is a compiled language - there is no GIL, so CPU-bound tasks can be parallelized (with <a href="https://github.com/rayon-rs/rayon">Rayon</a>) with little to no overhead.
|
|
55
|
+
</details>
|
|
56
|
+
- **Efficient**: memory efficient
|
|
57
|
+
- works on views of the data (no copies)
|
|
58
|
+
- no intermediate data structures are created
|
|
59
|
+
- **Flexible**: works on any type of data
|
|
60
|
+
- supported datatypes are
|
|
61
|
+
- for `x`: `f32`, `f64`, `i16`, `i32`, `i64`, `u16`, `u32`, `u64`, `datetime64`, `timedelta64`
|
|
62
|
+
- for `y`: `f16`, `f32`, `f64`, `i8`, `i16`, `i32`, `i64`, `u8`, `u16`, `u32`, `u64`, `datetime64`, `timedelta64`, `bool`
|
|
63
|
+
<details>
|
|
64
|
+
<summary><i>!! 🚀 <code>f16</code> <a href="https://github.com/jvdd/argminmax">argminmax</a> is 200-300x faster than numpy</i></summary>
|
|
65
|
+
In contrast with all other data types above, <code>f16</code> is *not* hardware supported (i.e., no instructions for f16) by most modern CPUs!! <br>
|
|
66
|
+
🐌 Programming languages facilitate support for this datatype by either (i) upcasting to <u>f32</u> or (ii) using a software implementation. <br>
|
|
67
|
+
💡 As for argminmax, only comparisons are needed - and thus no arithmetic operations - creating a <u>symmetrical ordinal mapping from <code>f16</code> to <code>i16</code></u> is sufficient. This mapping allows the use of the hardware-supported scalar and SIMD <code>i16</code> instructions - while not producing any memory overhead 🎉 <br>
|
|
68
|
+
<i>More details are described in <a href="https://github.com/jvdd/argminmax/pull/1">argminmax PR #1</a>.</i>
|
|
69
|
+
</details>
|
|
70
|
+
- **Easy to use**: simple & flexible API
|
|
71
|
+
|
|
72
|
+
## Install
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install tsdownsample
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Usage
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from tsdownsample import MinMaxLTTBDownsampler
|
|
82
|
+
import numpy as np
|
|
83
|
+
|
|
84
|
+
# Create a time series
|
|
85
|
+
y = np.random.randn(10_000_000)
|
|
86
|
+
x = np.arange(len(y))
|
|
87
|
+
|
|
88
|
+
# Downsample to 1000 points (assuming constant sampling rate)
|
|
89
|
+
s_ds = MinMaxLTTBDownsampler().downsample(y, n_out=1000)
|
|
90
|
+
|
|
91
|
+
# Select downsampled data
|
|
92
|
+
downsampled_y = y[s_ds]
|
|
93
|
+
|
|
94
|
+
# Downsample to 1000 points using the (possibly irregularly spaced) x-data
|
|
95
|
+
s_ds = MinMaxLTTBDownsampler().downsample(x, y, n_out=1000)
|
|
96
|
+
|
|
97
|
+
# Select downsampled data
|
|
98
|
+
downsampled_x = x[s_ds]
|
|
99
|
+
downsampled_y = y[s_ds]
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Downsampling algorithms & API
|
|
103
|
+
|
|
104
|
+
### Downsampling API 📑
|
|
105
|
+
|
|
106
|
+
Each downsampling algorithm is implemented as a class that implements a `downsample` method.
|
|
107
|
+
The signature of the `downsample` method:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
downsample([x], y, n_out, **kwargs) -> ndarray[uint64]
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
**Arguments**:
|
|
114
|
+
|
|
115
|
+
- `x` is optional
|
|
116
|
+
- `x` and `y` are both positional arguments
|
|
117
|
+
- `n_out` is a mandatory keyword argument that defines the number of output values<sup>*</sup>
|
|
118
|
+
- `**kwargs` are optional keyword arguments *(see [table below](#downsampling-algorithms-📈))*:
|
|
119
|
+
- `parallel`: whether to use multi-threading (default: `False`)
|
|
120
|
+
❗ The max number of threads can be configured with the `TSDOWNSAMPLE_MAX_THREADS` ENV var (e.g. `os.environ["TSDOWNSAMPLE_MAX_THREADS"] = "4"`)
|
|
121
|
+
- ...
|
|
122
|
+
|
|
123
|
+
**Returns**: a `ndarray[uint64]` of indices that can be used to index the original data.
|
|
124
|
+
|
|
125
|
+
<sup>\*</sup><i>When there are gaps in the time series, fewer than `n_out` indices may be returned.</i>
|
|
126
|
+
|
|
127
|
+
### Downsampling algorithms 📈
|
|
128
|
+
|
|
129
|
+
The following downsampling algorithms (classes) are implemented:
|
|
130
|
+
|
|
131
|
+
| Downsampler | Description | `**kwargs` |
|
|
132
|
+
| ---:| --- |--- |
|
|
133
|
+
| `MinMaxDownsampler` | selects the **min and max** value in each bin | `parallel` |
|
|
134
|
+
| `M4Downsampler` | selects the [**min, max, first and last**](https://dl.acm.org/doi/pdf/10.14778/2732951.2732953) value in each bin | `parallel` |
|
|
135
|
+
| `LTTBDownsampler` | performs the [**Largest Triangle Three Buckets**](https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf) algorithm | `parallel` |
|
|
136
|
+
| `MinMaxLTTBDownsampler` | (*new two-step algorithm 🎉*) first selects `n_out` * `minmax_ratio` **min and max** values, then further reduces these to `n_out` values using the **Largest Triangle Three Buckets** algorithm | `parallel`, `minmax_ratio`<sup>*</sup> |
|
|
137
|
+
|
|
138
|
+
<sup>*</sup><i>Default value for `minmax_ratio` is 4, which is empirically proven to be a good default. More details here: https://arxiv.org/abs/2305.00332</i>
|
|
139
|
+
|
|
140
|
+
### Handling NaNs
|
|
141
|
+
|
|
142
|
+
This library supports two `NaN`-policies:
|
|
143
|
+
|
|
144
|
+
1. Omit `NaN`s (`NaN`s are ignored during downsampling).
|
|
145
|
+
2. Return index of first `NaN` once there is at least one present in the bin of the considered data.
|
|
146
|
+
|
|
147
|
+
| Omit `NaN`s | Return `NaN`s |
|
|
148
|
+
| ----------------------: | :------------------------- |
|
|
149
|
+
| `MinMaxDownsampler` | `NaNMinMaxDownsampler` |
|
|
150
|
+
| `M4Downsampler` | `NaNM4Downsampler` |
|
|
151
|
+
| `MinMaxLTTBDownsampler` | `NaNMinMaxLTTBDownsampler` |
|
|
152
|
+
| `LTTBDownsampler` | |
|
|
153
|
+
|
|
154
|
+
> Note that NaNs are not supported for `x`-data.
|
|
155
|
+
|
|
156
|
+
## Limitations & assumptions 🚨
|
|
157
|
+
|
|
158
|
+
Assumes:
|
|
159
|
+
|
|
160
|
+
1. `x`-data is (non-strictly) monotonically increasing (i.e., sorted)
|
|
161
|
+
2. no `NaN`s in `x`-data
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
<p align="center">
|
|
166
|
+
👤 <i>Jeroen Van Der Donckt</i>
|
|
167
|
+
</p>
|
|
168
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
tsdownsample-0.1.4.1rc0.dist-info/METADATA,sha256=e0y2YyFZrhjUCV7jAlAtwdP7_a4YO76Xw0ceznp8HiU,8160
|
|
2
|
+
tsdownsample-0.1.4.1rc0.dist-info/WHEEL,sha256=Dg5iAOm8hb2flCZz-g3oSBJ_MQUtfLw_2uuDTAM9fQs,94
|
|
3
|
+
tsdownsample-0.1.4.1rc0.dist-info/licenses/LICENSE,sha256=NSbA8Qo_STXZvdmASn5697OCOsYtH2XwH5TNhODIKV8,1099
|
|
4
|
+
tsdownsample/downsamplers.py,sha256=EIHJFYIx_vmag2m1swE7Irx29jivoL8o0fhB6KgtC_g,5034
|
|
5
|
+
tsdownsample/downsampling_interface.py,sha256=vIZcALvdBHd0wm-EUon5d0sZYVeFqruQGmk-yZqS2zw,16701
|
|
6
|
+
tsdownsample/_python/downsamplers.py,sha256=WOBaHama0BUOdiCBTuwA5j2v-LtTet3wyR9nGJRGC5A,8807
|
|
7
|
+
tsdownsample/_python/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
tsdownsample/_rust/__init__.py,sha256=J-7Wu5IPVJQOxeypg3Ta9Mu8Sx2vtw7vehjKEFKFugQ,59
|
|
9
|
+
tsdownsample/__init__.py,sha256=Sy4yckabYwopsWppgqYeEEIxqC4XQTalH9Gq5L453HM,628
|
|
10
|
+
tsdownsample/_rust/_tsdownsample_rs.cp39-win_amd64.pyd,sha256=FY0r-dagUSDlKDXVcBfqg38Unswth8p90qfx0D__hWk,5656576
|
|
11
|
+
tsdownsample-0.1.4.1rc0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Jeroen Van Der Donckt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|