oups-2025.9.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0
oups/stateful_ops/aggstream/segmentby.py
@@ -0,0 +1,1018 @@
#!/usr/bin/env python3
"""
Created on Wed Dec 4 21:30:00 2021.

@author: pierrot

"""
from collections.abc import Callable
from functools import partial
from math import ceil
from math import fmod

from numba import njit
from numpy import arange
from numpy import argsort
from numpy import concatenate
from numpy import diff as ndiff
from numpy import dtype
from numpy import full
from numpy import insert as ninsert
from numpy import ndenumerate
from numpy import nonzero
from numpy import zeros
from numpy.typing import NDArray
from pandas import DataFrame
from pandas import IntervalIndex
from pandas import Series
from pandas import Timedelta
from pandas import concat
from pandas import date_range
from pandas.core.resample import TimeGrouper
from pandas.core.resample import _get_timestamp_range_edges as gtre


# Some constants.
DTYPE_INT64 = dtype("int64")
DTYPE_DATETIME64 = dtype("datetime64[ns]")
NULL_INT64_1D_ARRAY = zeros(0, DTYPE_INT64)
LEFT = "left"
RIGHT = "right"
# Keys for main buffer.
KEY_BIN = "bin"
KEY_SNAP = "snap"
# Keys for 'by_...' when a Callable.
KEY_LAST_BIN_LABEL = "last_bin_label"
KEY_LAST_BIN_END = "last_bin_end"
KEY_RESTART_KEY = "restart_key"
KEY_LAST_ON_VALUE = "last_on_value"
# Keys for 'bin_by' when a dict.
KEY_ON_COLS = "on_cols"
KEY_BIN_BY = "bin_by"
KEY_ORDERED_ON = "ordered_on"
KEY_SNAP_BY = "snap_by"
KEY_BIN_ON = "bin_on"


@njit(
    [
        "Tuple((int64[:], int64, boolean))(int64[:], int64[:], boolean)",
        "Tuple((int64[:], int64, boolean))(float64[:], float64[:], boolean)",
    ],
)
def _next_chunk_starts(
    data: NDArray,
    right_edges: NDArray,
    right: bool,
):
    """
    Return row indices for starts of next chunks.

    Parameters
    ----------
    data : NDArray
        One-dimensional array from which deriving next chunk starts, assuming
        data is sorted (monotonically increasing).
    right_edges : NDArray
        One-dimensional array of chunk right edges, sorted.
    right : bool
        If `True`, histogram is built considering right-closed bins.
        If `False`, histogram is built considering left-closed bins.

    Returns
    -------
    next_chunk_starts : ndarray
        One-dimensional array, containing row indices for the start of the
        next chunk, to bin 'data' as per 'right_edges'.
        If the last right edges lie beyond 'data', the 'next chunk starts'
        for the resulting empty bins are not returned.
        Size of 'next_chunk_starts' is hence smaller than or equal to
        ``len(right_edges)``.
    n_null_chunks : int
        Number of null chunks identified.
    data_traversed : boolean
        Specifies if 'data' has been completely traversed or not.

    """
    # Output variables.
    next_chunk_starts = zeros(len(right_edges), dtype=DTYPE_INT64)
    n_null_chunks = 0
    # 'prev_d_idx' tracks the previous chunk start, for counting null chunks.
    _d_idx = prev_d_idx = 0
    data_max_idx = len(data) - 1
    for (b_idx_loc,), bin_ in ndenumerate(right_edges):
        prev_bin = True
        if right:
            # Right-closed bins.
            for (_d_idx_loc,), val in ndenumerate(data[_d_idx:]):
                if val > bin_:
                    prev_bin = False
                    break
        else:
            # Left-closed bins.
            for (_d_idx_loc,), val in ndenumerate(data[_d_idx:]):
                if val >= bin_:
                    prev_bin = False
                    break
        _d_idx += _d_idx_loc
        if _d_idx == data_max_idx and prev_bin:
            # Array 'data' terminated and loop stayed in previous chunk.
            # Then, last loop has not been accounted for.
            # Hence a '+1' to account for it.
            next_chunk_starts[b_idx_loc] = _d_idx + 1
            # Previous code to return all bins, including the empty ones
            # defined by the last values in 'right_edges'.
            # next_chunk_starts[b_idx_loc:] = _d_idx + 1
            # n_null_chunks += len(next_chunk_starts[b_idx_loc:]) - 1
            # Do not return empty bins at end of data.
            return next_chunk_starts[: b_idx_loc + 1], n_null_chunks, True
        else:
            next_chunk_starts[b_idx_loc] = _d_idx
            if prev_d_idx == _d_idx:
                n_null_chunks += 1
            else:
                prev_d_idx = _d_idx
    # Array 'right_edges' is terminated before 'data' is ended.
    return next_chunk_starts, n_null_chunks, False

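The sketch below is an editorial illustration, not part of the wheel, showing how '_next_chunk_starts' maps sorted data onto right edges (left-closed bins, so a value equal to an edge opens the next chunk):

    # Editorial usage sketch (hypothetical data).
    from numpy import array

    data = array([0, 1, 3, 4, 7], dtype="int64")
    edges = array([2, 5], dtype="int64")
    starts, n_null, traversed = _next_chunk_starts(data, edges, False)
    # Expected: starts -> [2, 4] (rows 0-1 fall before edge 2, rows 2-3
    # before edge 5); n_null -> 0; traversed -> False, since value 7 lies
    # beyond the last right edge.
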
def by_scale(
    on: Series,
    by: TimeGrouper | Series | tuple[Series, Series],
    closed: str | None = None,
    buffer: dict | None = None,
) -> tuple[NDArray, Series, int, str, Series, bool]:
    """
    Segment an ordered DatetimeIndex or Series.

    Parameters
    ----------
    on : Series
        Ordered datetime Series over which the binning defined per 'by' is
        performed.
    by : TimeGrouper or Series or tuple of 2 Series
        Setup defining the binning, either a pandas TimeGrouper, or values
        contained in a Series.
        If a Series, values are used both as ends and labels of chunks.
        If a tuple of 2 Series, values in the first Series are labels of
        chunks, and values in the second Series are ends of chunks.
    closed : str, default None
        Optional string, specifying if intervals defined by 'by' are left or
        right closed. This parameter overrides 'by.closed' if 'by' is a pandas
        TimeGrouper.
    buffer : dict
        Dict keeping parameters which allow chaining calls to 'by_scale', with
        ``restart_key`` keeping track of the end of the one-but-last chunk
        from the previous iteration, derived from 'by'.

    Returns
    -------
    tuple[NDArray, Series, int, str, Series, bool]
        The first 3 items are used in 'cumsegagg' in all situations.
        - ``next_chunk_starts``, a one-dimensional array of `int` specifying
          the row indices of the next-bin starts, for each bin. Successive
          identical indices imply empty bins, except the first.
        - ``chunk_labels``, a pandas Series specifying for each bin its
          label. Labels are defined as per 'by'.
        - ``n_null_chunks``, an int, the number of null chunks identified in
          'on'.

        The 3 following items are used only if both bins and snapshots are
        generated in 'cumsegagg'.
        - ``chunk_closed``, a str, indicating if bins are left or right
          closed, as per 'by' pandas TimeGrouper or 'closed' parameter.
        - ``chunk_ends``, a pandas Series containing bin ends, as per 'by'
          pandas TimeGrouper.
        - ``unknown_last_chunk_end``, a boolean, always `False`, specifying
          that the last chunk end is known. This is because chunk ends are
          always fully specified as per 'by' pandas TimeGrouper or Series.

    Notes
    -----
    If running ``by_scale()`` with a buffer, the value set for key
    ``"restart_key"`` depends on whether the last value derived from 'by'
    (either a TimeGrouper or a Series) lies before the last value in 'on'.
    - If it lies before, then this last value derived from 'by' is the
      restart key.
    - If it lies after, then the one-but-last value derived from 'by' is the
      restart key.

    """
    if isinstance(by, TimeGrouper):
        # If 'buffer' is not empty, it necessarily contains 'KEY_RESTART_KEY'.
        first = buffer[KEY_RESTART_KEY] if buffer else on.iloc[0]
        # In case 'by' is for snapshotting and 'closed' is set, take care to
        # use the provided 'closed' (it overrides 'by.closed').
        if closed is None:
            closed = by.closed
        # TODO: replace with date_utils.floor_ts() and date_utils.ceil_ts()?
        start, end = gtre(
            first=first,
            last=on.iloc[-1],
            freq=by.freq,
            closed=closed,
            unit=first.unit,
            origin=by.origin,
            offset=by.offset,
        )
        edges = date_range(start, end, freq=by.freq)
        chunk_ends = edges[1:]
        chunk_labels = chunk_ends if by.label == RIGHT else edges[:-1]
    else:
        # Case 'by' is a Series.
        if closed is None:
            raise ValueError(f"'closed' has to be set to {LEFT} or {RIGHT}.")
        if isinstance(by, tuple):
            chunk_labels, chunk_ends = by
            if len(chunk_labels) != len(chunk_ends):
                raise ValueError(
                    "number of chunk labels has to be equal to number of chunk ends.",
                )
        else:
            chunk_labels = chunk_ends = by
        if buffer:
            # In case there has been no snapshot at the previous iteration,
            # 'buffer' will not contain 'KEY_RESTART_KEY', but will contain
            # 'KEY_LAST_ON_VALUE'.
            if KEY_RESTART_KEY in buffer and buffer[KEY_RESTART_KEY] != chunk_ends[0]:
                # In case of restart, if the first value in 'chunk_ends' is
                # not the one that was used last at the previous iteration,
                # first try to trim values in 'by' that are earlier than
                # 'restart_key'.
                n_chunk_ends_init = len(chunk_ends)
                chunk_ends = chunk_ends[chunk_ends >= buffer[KEY_RESTART_KEY]]
                if buffer[KEY_RESTART_KEY] != chunk_ends[0]:
                    raise ValueError(
                        f"'by' needs to contain value {buffer[KEY_RESTART_KEY]} to restart correctly.",
                    )
                n_first_chunks_to_remove = n_chunk_ends_init - len(chunk_ends)
                chunk_labels = chunk_labels[n_first_chunks_to_remove:]
            if KEY_LAST_ON_VALUE in buffer:
                # In the specific case 'on' has not been traversed completely
                # at the previous iteration, the chunk for the remainder of
                # the data has no label, and will not appear in the snapshot
                # results. But it will be calculated during the aggregation
                # phase ('cumsegagg()'), and kept in a temporary variable
                # ('chunk_res').
                # In this case, at the next iteration, with new chunk ends, a
                # specific check is managed here to ensure correctness of the
                # restart.
                # For this new iteration,
                # - a new bin necessarily has to be started. Otherwise,
                #   aggregation results for the last chunk at the previous
                #   iteration would overwrite those of the elapsed last bin.
                #   This last bin has been completed at the previous
                #   iteration. Its results do not have to be modified.
                # - this new first bin has to end after the last value in 'on'
                #   from the previous iteration. If it does not, then the
                #   remaining aggregated data from the previous iteration is
                #   not usable, as it aggregates over several chunks.
                # If there is a single chunk end, then it is that of the
                # previous iteration, nothing to check.
                last_on_value = buffer[KEY_LAST_ON_VALUE]
                if len(chunk_ends) > 1 and (
                    (closed == RIGHT and chunk_ends[1] < last_on_value)
                    or (closed == LEFT and chunk_ends[1] <= last_on_value)
                ):
                    raise ValueError(
                        "2nd chunk end in 'by' has to be larger than value "
                        f"{buffer[KEY_LAST_ON_VALUE]} to restart correctly.",
                    )
                if (closed == RIGHT and chunk_ends[0] < last_on_value) or (
                    closed == LEFT and chunk_ends[0] <= last_on_value
                ):
                    # At the previous iteration, if the last value in 'on' is
                    # later than the first chunk end, then this chunk should
                    # not be updated. It is 'done'.
                    # To prevent updating it, this chunk should be removed.
                    # Only the 1st chunk is removed, because it was just
                    # checked that the 2nd chunk complies with this condition.
                    chunk_ends = chunk_ends[1:]
                    chunk_labels = chunk_labels[1:]
                del buffer[KEY_LAST_ON_VALUE]
    if chunk_ends.empty:
        if isinstance(buffer, dict):
            buffer[KEY_LAST_ON_VALUE] = on.iloc[-1]
        return (NULL_INT64_1D_ARRAY, chunk_labels, 0, closed, chunk_ends, False)
    if chunk_ends.dtype == DTYPE_DATETIME64:
        next_chunk_starts, n_null_chunks, data_traversed = _next_chunk_starts(
            on.to_numpy(copy=False).view(DTYPE_INT64),
            chunk_ends.to_numpy(copy=False).view(DTYPE_INT64),
            closed == RIGHT,
        )
    else:
        next_chunk_starts, n_null_chunks, data_traversed = _next_chunk_starts(
            on.to_numpy(copy=False),
            chunk_ends.to_numpy(copy=False),
            closed == RIGHT,
        )
    n_chunks = len(next_chunk_starts)
    # Rationale for selecting the "restart key".
    # - For a correct restart at iteration N+1, the restart point needs to be
    #   that of the last bin at iteration N that has been "in-progress". The
    #   restart is said correct because it restarts on new data, where
    #   aggregation at iteration N stopped. There is no omission of new data,
    #   nor omission of possibly empty bins till new data.
    # - At iteration N,
    #   - if the last value derived from 'by' is after the last value in
    #     'on', then at the next iteration, N+1, new data can be used which
    #     still lies before this last value derived from 'by' at iteration N.
    #     To make sure this new data is correctly managed, we need to restart
    #     from the one-but-last value derived from 'by' at iteration N.
    #   - if the last value derived from 'by' is before the last value in
    #     'on', then at the next iteration, N+1, we are sure no new data will
    #     appear before it. This last value can be safely used as restart
    #     value.
    # TODO: when splitting 'by_scale()' into 'by_pgrouper()' and 'by_scale()',
    # for 'by_pgrouper()', using the last value in 'on' as 'restart_key'
    # complies with whatever the 'closed' parameter is (I think). This
    # simplifies below code.
    if data_traversed:
        chunk_labels = chunk_labels[:n_chunks]
        chunk_ends = chunk_ends[:n_chunks]
        if buffer is not None:
            if closed == LEFT and isinstance(by, TimeGrouper):
                # Use of an intricate way to get the last or last-but-one
                # element in 'chunk_ends', compatible with both Series and
                # DatetimeIndex.
                if n_chunks > 1:
                    # Get one-but-last element.
                    # Initialize this way if there are at least 2 elements.
                    buffer[KEY_RESTART_KEY] = chunk_ends[n_chunks - 2]
                else:
                    # If there is a single incomplete bin, take the first
                    # element in 'on'.
                    buffer[KEY_RESTART_KEY] = on.iloc[0]
            else:
                # Take last end,
                # - either if 'by' is a TimeGrouper, as it is enough for
                #   generating edges at the next iteration,
                # - or if 'by' is a Series, because a Series only needs to
                #   restart from this point then.
                buffer[KEY_RESTART_KEY] = chunk_ends[n_chunks - 1]
    elif buffer is not None:
        # Data is not traversed.
        # This can only happen if 'by' is not a TimeGrouper.
        # Keep last chunk end.
        buffer[KEY_RESTART_KEY] = chunk_ends[n_chunks - 1]
        buffer[KEY_LAST_ON_VALUE] = on.iloc[-1]
    return (
        next_chunk_starts,
        Series(chunk_labels),
        n_null_chunks,
        closed,
        chunk_ends,
        False,
    )

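An editorial usage sketch, not from the package (it assumes pandas >= 2.0, where 'Timestamp.unit' exists, which 'by_scale' relies on):

    # Editorial usage sketch (hypothetical data).
    from pandas import Grouper, Series, date_range

    on = Series(date_range("2024-01-01 08:00", periods=6, freq="4min"))
    buffer = {}
    ncs, labels, n_null, closed, ends, _ = by_scale(
        on,
        Grouper(freq="10min", closed="left", label="left"),
        buffer=buffer,
    )
    # Expected: 3 left-closed bins labeled 08:00, 08:10 and 08:20, with
    # ncs -> [3, 5, 6]; buffer["restart_key"] is set so a subsequent call
    # with new data restarts from the last in-progress bin.
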
def by_x_rows(
    on: DataFrame | Series,
    by: int | None = 4,
    closed: str | None = LEFT,
    buffer: dict | None = None,
) -> tuple[NDArray, Series, int, str, Series, bool]:
    """
    Segment by group of x rows.

    Dummy binning function for testing 'cumsegagg' with 'bin_by' set as a
    Callable.

    Parameters
    ----------
    on : DataFrame | Series
        Either a pandas Series or a DataFrame made of two columns, from which
        deriving
        - the number of rows in 'on',
        - bin labels for each bin (from the last column of 'on'),
        - bin ends for each bin (from the last column of 'on').
    by : int, default 4
        Number of rows in a bin.
    closed : str, default "left"
        Side on which segments are closed, either "left" or "right".
    buffer : dict, default None
        Dict keeping 2 parameters which allow chaining calls to 'by_x_rows':
        - 'restart_key', an int specifying the number of rows in the last
          (and possibly incomplete) bin from the previous call to
          'by_x_rows'.
        - 'last_bin_label', label of the last bin, that will be reused in
          the next iteration.

    Returns
    -------
    tuple[NDArray, Series, int, str, Series, bool]
        The first 3 items are used in 'cumsegagg' in all situations.
        - ``next_chunk_starts``, a one-dimensional numpy array of int,
          specifying for each bin the row index at which the next bin
          starts.
        - ``bin_labels``, a pandas Series specifying for each bin its label.
          Labels are the first value in the bin, taken from the last column
          of 'on' (which is supposed to be an ordered column).
        - ``n_null_bins``, an int, always ``0``.

        The 3 next items are used only if both bins and snapshots are
        generated in 'cumsegagg'.
        - ``bin_closed``, a str, ``"left"`` or ``"right"``, indicating that
          the bins are left or right closed.
        - ``bin_ends``, a pandas Series made of values from the last column
          of 'on' (which is either single-column or two-column) and
          indicating the "position" of the bin end, which is marked by the
          start of the next bin, excluded. The end of the last bin being
          unknown by definition (because it is excluded), the last value is
          not relevant. It is forced anyhow in 'segmentby()' to be last.
        - ``unknown_last_bin_end``, a boolean specifying if the last bin end
          is unknown. It is ``True`` if bins are left-closed, meaning that
          their end is excluded. Hence, the last bin is always "in-progress".
          It is ``False`` if they are right-closed and the last bin ends
          exactly on the last row.

    """
    len_on = len(on)
    if isinstance(on, DataFrame):
        # Keep only last column, supposed to be the 'ordered_on' column.
        on = on.iloc[:, -1]
    # Derive number of rows in first bins (cannot be 0) and number of bins.
    if buffer is not None and KEY_RESTART_KEY in buffer:
        # Case 'restart'.
        rows_in_prev_last_bin = buffer[KEY_RESTART_KEY]
        rows_in_continued_bin = min(len_on, by - rows_in_prev_last_bin) if rows_in_prev_last_bin != by else 0
    else:
        # Case 'start from scratch'.
        rows_in_prev_last_bin = 0
        rows_in_continued_bin = 0
    n_rows_for_new_bins = len_on - rows_in_continued_bin
    n_bins = ceil(n_rows_for_new_bins / by) + 1 if rows_in_continued_bin else ceil(n_rows_for_new_bins / by)
    # Define 'next_chunk_starts'.
    first_next_chunk_start = rows_in_continued_bin if rows_in_continued_bin else min(by, len_on)
    next_chunk_starts = arange(
        start=first_next_chunk_start,
        stop=(n_bins - 1) * by + first_next_chunk_start + 1,
        step=by,
    )
    # Make a copy and arrange for deriving 'chunk_starts', required for
    # defining bin labels. 'bin_labels' are derived from the last column
    # (which is 'ordered_on', and if not, 'bin_on'). Bin labels are the 1st
    # value in the bin.
    chunk_starts = next_chunk_starts.copy() - by
    # Correct start of 1st chunk.
    chunk_starts[0] = 0
    bin_labels = on.iloc[chunk_starts].reset_index(drop=True)
    if n_rows_for_new_bins:
        # Case 'there are new bins'.
        n_rows_in_last_bin = (
            n_rows_for_new_bins if n_rows_for_new_bins <= by else fmod(n_rows_for_new_bins, by) or by
        )
    else:
        # Case 'there are not'.
        n_rows_in_last_bin = rows_in_continued_bin + rows_in_prev_last_bin
    if closed == LEFT:
        # Case 'left', end is start of next bin, excluded.
        # 'bin_ends' has no end for the last bin, because it is unknown.
        # Temporarily adjust 'next_chunk_start' of last bin to last index.
        next_chunk_starts[-1] = len_on - 1
        bin_ends = on.iloc[next_chunk_starts].reset_index(drop=True)
        unknown_last_bin_end = True
        # Reset 'next_chunk_start' of last bin.
        next_chunk_starts[-1] = len_on
    if closed == RIGHT:
        # Case 'right', end is end of current bin, included.
        bin_ends = on.iloc[next_chunk_starts - 1].reset_index(drop=True)
        # Bin end is unknown if the last bin does not end exactly.
        unknown_last_bin_end = n_rows_in_last_bin != by
    # There is likely no empty bin.
    n_null_bins = 0
    if buffer is not None:
        if buffer:
            if rows_in_continued_bin:
                # Correct 1st label if not a new bin.
                bin_labels.iloc[0] = buffer[KEY_LAST_BIN_LABEL]
            else:
                # If a new bin has been created right at start, insert an
                # empty one with the label of the last bin at the previous
                # iteration.
                bin_labels = concat(
                    [Series([buffer[KEY_LAST_BIN_LABEL]]), bin_labels],
                ).reset_index(drop=True)
                first_bin_end = buffer[KEY_LAST_BIN_END] if closed == RIGHT else on.iloc[0]
                bin_ends = concat([Series([first_bin_end]), bin_ends]).reset_index(drop=True)
                next_chunk_starts = ninsert(next_chunk_starts, 0, 0)
                # In this case, first bin is empty.
                n_null_bins = 1
        # Update 'buffer' parameters for the next run.
        buffer[KEY_RESTART_KEY] = n_rows_in_last_bin
        buffer[KEY_LAST_BIN_LABEL] = bin_labels.iloc[-1]
        if closed == RIGHT:
            buffer[KEY_LAST_BIN_END] = bin_ends.iloc[-1]
    return (
        next_chunk_starts,
        bin_labels,
        n_null_bins,
        closed,
        bin_ends,
        unknown_last_bin_end,
    )

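An editorial usage sketch, not from the package:

    # Editorial usage sketch (hypothetical data).
    from pandas import Series

    on = Series(range(10))
    buffer = {}
    ncs, labels, n_null, closed, ends, unknown_end = by_x_rows(on, by=4, buffer=buffer)
    # Expected: ncs -> [4, 8, 10]; labels -> [0, 4, 8] (first value of each
    # bin); unknown_end -> True (left-closed bins); buffer["restart_key"] -> 2,
    # the size of the trailing in-progress bin, reused at the next call.
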
def mergesort(
    labels: tuple[NDArray, NDArray],
    keys: tuple[NDArray, NDArray],
    force_last_from_second: bool | None = False,
) -> tuple[NDArray, NDArray]:
    """
    Mergesort labels from keys.

    Parameters
    ----------
    labels : tuple[NDArray, NDArray]
        2 one-dimensional arrays of labels to be merged together, provided as
        a ``tuple``.
    keys : tuple[NDArray, NDArray]
        2 one-dimensional arrays of sorted keys according to which labels can
        be sorted one with respect to the other.
        ``keys[0]``, resp. ``keys[1]``, are keys for ``labels[0]``, resp.
        ``labels[1]``.
    force_last_from_second : bool, default False
        If True, the last label in the resulting sorted array is forced to be
        the last from the second label array.

    Returns
    -------
    tuple[NDArray, NDArray]
        The first array contains sorted labels from the 2 input arrays.
        The second array contains the insertion indices for labels (i.e. the
        indices in the resulting merged array) from the 2nd input array.

    Notes
    -----
    If a value is found in both input arrays, then the value of the 2nd input
    array comes after the value of the 1st input array, as can be checked with
    the insertion indices.

    """
    # TODO: transition this to numba.
    labels1, labels2 = labels
    keys1, keys2 = keys
    len_labels1 = len(labels1)
    len_labels2 = len(labels2)
    if len(keys1) != len_labels1:
        raise ValueError(
            "not possible to have arrays of different length for first labels and keys arrays.",
        )
    if len(keys2) != len_labels2:
        raise ValueError(
            "not possible to have arrays of different length for second labels and keys arrays.",
        )
    if force_last_from_second:
        len_tot = len_labels1 + len_labels2
        sort_indices = full(len_tot, len_tot - 1, dtype=DTYPE_INT64)
        sort_indices[:-1] = argsort(concatenate((keys1, keys2[:-1])), kind="mergesort")
    else:
        sort_indices = argsort(concatenate(keys), kind="mergesort")
    return concatenate(labels)[sort_indices], nonzero(len_labels1 <= sort_indices)[0]

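An editorial usage sketch, not from the package:

    # Editorial usage sketch (hypothetical data).
    from numpy import array

    merged, insert_idx = mergesort(
        labels=(array([10, 30]), array([20])),
        keys=(array([1, 3]), array([2])),
    )
    # Expected: merged -> [10, 20, 30]; insert_idx -> [1], the position taken
    # in the merged result by the single label of the 2nd input array.
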
def setup_segmentby(
    bin_by: TimeGrouper | Callable,
    bin_on: str | None = None,
    ordered_on: str | None = None,
    snap_by: TimeGrouper | Series | None = None,
) -> dict[str, Callable | str]:
    """
    Check and setup parameters to operate data segmentation.

    Parameters
    ----------
    bin_by : TimeGrouper | Callable
        A pandas TimeGrouper or a Callable to perform segmentation.
    bin_on : str | None
        Name of the column onto which performing the segmentation.
    ordered_on : str | None
        Name of the column containing ordered data, to use when snapshotting.
        With this column, snapshots (points of observation) can be positioned
        with respect to bin ends.
    snap_by : TimeGrouper | Series | None
        A pandas TimeGrouper or a pandas Series defining the snapshots (points
        of observation).

    Returns
    -------
    dict[str, Callable | str]
        A dict with keys
        - ``KEY_BIN_BY``, 'bin_by' forced as a Callable,
        - ``KEY_ON_COLS``, column name or list of column names to be used for
          segmentation,
        - ``KEY_ORDERED_ON``, consolidated value for the 'ordered_on' column,
        - ``KEY_BIN_ON``, consolidated value for the 'bin_on' column,
        - ``KEY_SNAP_BY``, 'snap_by' if it is a TimeGrouper, ``None``
          otherwise.

    """
    bin_by_closed = None
    if isinstance(bin_by, TimeGrouper):
        # 'bin_by' is a TimeGrouper.
        bin_by_closed = bin_by.closed
        if bin_by.key:
            if bin_on:
                if bin_by.key != bin_on:
                    raise ValueError(
                        "not possible to set 'bin_by.key' and 'bin_on' to different values.",
                    )
            else:
                bin_on = bin_by.key
        elif not bin_on:
            raise ValueError("not possible to set both 'bin_by.key' and 'bin_on' to `None`.")
        if ordered_on and ordered_on != bin_on:
            raise ValueError(
                "not possible to set 'bin_on' and 'ordered_on' to different values when "
                "'bin_by' is a TimeGrouper.",
            )
        elif not ordered_on:
            # Case 'ordered_on' has not been provided but 'bin_on' has been.
            # Then set 'ordered_on' to 'bin_on'. This is so because 'bin_by'
            # is a TimeGrouper.
            ordered_on = bin_on
        bin_by = partial(by_scale, by=bin_by)
    elif callable(bin_by):
        # 'bin_by' is a Callable.
        if bin_on is None and ordered_on is None:
            raise ValueError("not possible to set both 'bin_on' and 'ordered_on' to `None`.")
    else:
        # 'bin_by' is neither a TimeGrouper nor a Callable.
        # This is not possible.
        raise ValueError(
            "not possible to have 'bin_by' parameter different from a pandas TimeGrouper or a Callable.",
        )
    if snap_by is not None:
        if isinstance(snap_by, TimeGrouper):
            if snap_by.key:
                if ordered_on is None:
                    ordered_on = snap_by.key
                elif snap_by.key != ordered_on:
                    raise ValueError(
                        "not possible to set 'ordered_on' and 'snap_by.key' to different values.",
                    )
            if bin_by_closed and snap_by.closed != bin_by_closed:
                raise ValueError(
                    "not possible to set 'bin_by.closed' and 'snap_by.closed' to different values.",
                )
        elif not ordered_on:
            # Case 'snap_by' is not a TimeGrouper.
            raise ValueError(
                "not possible to leave 'ordered_on' to `None` in case of snapshotting.",
            )
    return {
        KEY_BIN_BY: bin_by,
        KEY_ON_COLS: (
            [bin_on, ordered_on]
            if ordered_on and bin_on and ordered_on != bin_on
            else bin_on if bin_on else ordered_on
        ),
        KEY_ORDERED_ON: ordered_on,
        KEY_BIN_ON: bin_on,
        KEY_SNAP_BY: snap_by if isinstance(snap_by, TimeGrouper) else None,
    }

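An editorial usage sketch, not from the package:

    # Editorial usage sketch (hypothetical column name "ts").
    from pandas import Grouper

    seg_setup = setup_segmentby(bin_by=Grouper(key="ts", freq="5min", closed="left"))
    # Expected: seg_setup["bin_by"] is a partial of 'by_scale', and both
    # seg_setup["ordered_on"] and seg_setup["bin_on"] resolve to "ts".
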
def setup_mainbuffer(buffer: dict, with_snapshot: bool | None = False) -> tuple[dict, dict]:
    """
    Return 'buffer_bin' and 'buffer_snap' from main buffer.

    Parameters
    ----------
    buffer : dict
        Main buffer, either containing only values for 'buffer_bin', or only
        two keys `"bin"` and `"snap"` providing a separate dict for each of
        the binning and snapshotting processes.
    with_snapshot : bool, default False
        Boolean ``True`` if the snapshotting process is requested.

    Returns
    -------
    tuple[dict, dict]
        The first dict is the binning buffer.
        The second dict is the snapshotting buffer.

    """
    if buffer is not None:
        if KEY_BIN not in buffer:
            buffer[KEY_BIN] = {}
            if with_snapshot:
                buffer[KEY_SNAP] = {}
        if with_snapshot:
            return buffer[KEY_BIN], buffer[KEY_SNAP]
        else:
            return buffer[KEY_BIN], None
    else:
        return None, None

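An editorial usage sketch, not from the package:

    # Editorial usage sketch.
    buffer = {}
    buffer_bin, buffer_snap = setup_mainbuffer(buffer, with_snapshot=True)
    # Expected: buffer -> {"bin": {}, "snap": {}}. Both sub-dicts are then
    # mutated in-place across iterations by the binning and snapshotting
    # processes.
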
def segmentby(
    data: DataFrame,
    bin_by: TimeGrouper | Callable | dict,
    bin_on: str | None = None,
    ordered_on: str | None = None,
    snap_by: TimeGrouper | IntervalIndex | None = None,
    buffer: dict | None = None,
) -> tuple[NDArray, NDArray, Series, int, Series, int]:
    """
    Identify starts of segments in data, either bins or optionally snapshots.

    Parameters
    ----------
    data : DataFrame
        A pandas DataFrame containing the columns over which conducting the
        segmentation of data:
        - ``bin_on`` column,
        - optionally ``ordered_on`` column (same as the optional
          ``snap_by.key`` column, from the 'snap_by' parameter, if using
          snapshots).

        If any of the ``ordered_on`` or ``snap_by.key`` parameters are used,
        the column they point to (the same if both parameters are provided)
        has to be ordered.
    bin_by : TimeGrouper | Callable | dict
        Callable or pandas TimeGrouper to perform binning.
        If a Callable, it is called with the following parameters:
        ``bin_by(on, buffer)``
        where:
        - ``on``,
          - either ``ordered_on`` is ``None``. ``on`` is then a pandas
            Series made from the ``data[bin_on]`` column.
          - or ``ordered_on`` is provided and is different from ``bin_on``.
            Then ``on`` is a two-column pandas DataFrame made of
            ``data[[bin_on, ordered_on]]``.
            Values from ``data[ordered_on]`` have to be used to define bin
            ends when 'snap_by' is set.
            Also, values from ``data[ordered_on]`` can be used advantageously
            as bin labels.

        - ``buffer``, a dict that has to be modified in-place by 'bin_by' to
          keep internal parameters which allow restart calls to 'bin_by'.

        If a dict, it contains the full setup for conducting the segmentation
        of 'data', as generated by 'setup_segmentby()':
        - 'on_cols', a str or list of str, to be forwarded to the 'bin_by'
          Callable,
        - 'bin_by', a Callable, either the one initially provided, or one
          derived from a pandas TimeGrouper,
        - 'ordered_on', a str, its definitive value,
        - 'snap_by', if a TimeGrouper.

        It then has to return a tuple made of 6 items. 3 items are used
        whether snapshotting is used or not.
        - ``next_chunk_starts``, a one-dimensional array of `int`, specifying
          the row index at which the next bin starts (included) as found in
          ``bin_on``.
          If the same index appears several times, it means that the
          corresponding bins are empty, except the first one. In this case,
          corresponding rows in the aggregation result will be filled with
          null values.
          The last value of this array always equals ``len(on)``.
        - ``bin_labels``, a pandas Series whose values are expected to be
          all bin labels, incl. those of empty bins, as they will appear in
          aggregation results. Labels can be of any type.
          In case of restarting the aggregation with new seed data, care
          should be taken so that the label of the first bin is the same as
          that of the last bin from the previous iteration if it has been the
          same bin. An exception is raised if not.
        - ``n_null_bins``, an `int` indicating the number of empty bins.

        The 3 next items are used only in case of snapshotting (``snap_by``
        different than ``None``).
        - ``bin_closed``, a str, either `'right'` or `'left'`, indicating
          if bins are left or right-closed (i.e. if ``chunk_ends`` is
          included or excluded in the bin).
        - ``bin_ends``, an optional pandas Series, specifying the ends of
          bins with values derived from the ``data[ordered_on]`` column. If
          snapshotting, then points of observation (defined by ``snap_by``)
          are positioned with respect to the bin ends. This data allows
          sorting snapshots with respect to bins in case they start/end at
          the same row index in data.
          ``bin_ends`` is not required if there is no snapshotting. If not
          used, set to None.
        - ``last_bin_end_unknown``, a boolean indicating if the end of the
          last bin is known or not. If bins are left-closed, then it is
          possible the end of the last bin is not known. In this case,
          de facto, this unknown bin end is supposed to be positioned after
          all snapshots.

    bin_on : str | None, default None
        Name of the column in `data` over which performing the binning
        operation.
        If 'bin_by' is a pandas `TimeGrouper`, its `key` parameter is used
        instead. If 'bin_on' is set, its consistency with the ``bin_by.key``
        parameter is then checked.
    ordered_on : str | None, default None
        Name of an existing ordered column in 'data'. When set, it is then
        forwarded to the 'bin_by' Callable.
        This parameter is compulsory if 'snap_by' is set. Values derived from
        'snap_by' (either a TimeGrouper or a Series) are compared to
        ``bin_ends``, themselves derived from ``data[ordered_on]``.
    snap_by : TimeGrouper | Series | None, default None
        Values positioning the points of observation, either derived from a
        pandas TimeGrouper, or contained in a pandas Series.
    buffer : dict | None, default None
        Dict of 2 dicts.
        - The first dict, with key `"bin"`, embeds values from the previous
          binning process, set by 'bin_by' when it is a Callable, or by the
          internal function ``by_scale`` if 'bin_by' is a TimeGrouper. These
          values are required when restarting the binning process with new
          seed data.
        - The second dict, with key `"snap"`, embeds values from the previous
          snapshotting process, set by 'by_scale'. Similarly, these values
          are required to allow restarting the snapshotting process with new
          seed data.

    Returns
    -------
    Tuple made of 6 items.
    - ``next_chunk_starts``, an ordered one-dimensional numpy array of int,
      specifying for each bin and snapshot the row index at which the next
      one starts.
    - ``bin_indices``, a one-dimensional array of int, specifying which
      value in ``next_chunk_starts`` relates to a bin (as opposed to a
      snapshot).
    - ``bin_labels``, a pandas Series specifying for each bin its label.
    - ``n_null_bins``, an int, indicating how many bins are empty.
    - ``snap_labels``, a pandas Series specifying for each snapshot its
      label.
    - ``n_max_null_snaps``, an int, an upper bound of the number of empty
      snapshots.

    Notes
    -----
    When implementing a `bin_by` Callable, the developer should take care
    that the ``next_chunk_starts``, ``chunk_labels`` and ``chunk_ends``
    returned by 'bin_by' are all of the same size, i.e. the total number of
    bins that are expected, including empty ones.

    Also, when implementing it for repetitive calls, care should be taken
    that `bin_by` keeps in the 'buffer' parameter all the data needed to:
    - create the correct number of bins that would be in-between the data
      processed at the previous aggregation iteration, and the new data.
      This has to show in the 'next_chunk_starts' array that is returned.
    - start with the same bin label as the previous iteration when using
      snapshots.

    Having the same bin label between both iterations when using snapshots
    ensures:
    - that the bin with previous aggregation results is overwritten (ok, not
      necessarily meaningful if agg results have not changed in case there
      has been no new data in this bin).
    - even if this bin is empty at restart, in the case of snapshotting, it
      is necessary when this bin ends that new empty snapshots before its end
      correctly forward past results, and that new empty snapshots after this
      end are correctly accounted for as empty chunks. For this reason, when
      using snapshots, a check ensures that the same bin label is used
      between two successive iterations.

    Still for repetitive calls of 'bin_by', care has to be taken that:
    - the last bin is not an empty one.
    - the last bin does cover the full size of data.

    If not, exceptions will be raised.

    When using snapshots, values defined by ``snap_by`` are considered the
    "points of isolated observation". At such a point, an observation of the
    "on-going" bin is made. In case of snapshot(s) positioned exactly on
    segment(s) ends, at the same row index in data, the observation point
    will always come before the bin end.

    """
    # TODO : split 'by_scale' into 'by_pgrouper' and 'by_scale'.
    # TODO : make some tests validating use of 'by_scale' as 'bin_by'
    # parameter (when user-provided 'bin_by' is a Series or a tuple of
    # Series).
    # TODO : consider transitioning 'bin_by' and 'snap_by' into a class.
    # Probably, below initialization is to be part of a template class, to
    # be run at child class instantiation.
    if not isinstance(bin_by, dict):
        bin_by = setup_segmentby(bin_by, bin_on, ordered_on, snap_by)
    if bin_by[KEY_SNAP_BY] is not None:
        # 'bin_by[KEY_SNAP_BY]' is not None if 'snap_by' is a TimeGrouper.
        # Otherwise, it can be a DatetimeIndex or a Series.
        snap_by = bin_by[KEY_SNAP_BY]
    buffer_bin, buffer_snap = setup_mainbuffer(buffer, snap_by is not None)
    ordered_on = bin_by[KEY_ORDERED_ON]
    if ordered_on:
        # Check 'ordered_on' is an ordered column.
        if not (
            (
                data[ordered_on].dtype == DTYPE_DATETIME64
                and (data[ordered_on].diff().iloc[1:] >= Timedelta(0)).all()
            )
            or (data[ordered_on].dtype != DTYPE_DATETIME64 and (data[ordered_on].diff().iloc[1:] >= 0).all())
        ):
            raise ValueError(
                f"column '{ordered_on}' is not ordered. It has to be for "
                "'cumsegagg' to operate faultlessly.",
            )
    on = data.loc[:, bin_by[KEY_ON_COLS]]
    # 'bin_by' binning.
    (
        next_chunk_starts,
        bin_labels,
        n_null_bins,
        bin_closed,
        bin_ends,
        unknown_last_bin_end,
    ) = bin_by[KEY_BIN_BY](on=on, buffer=buffer_bin)
    # Check consistency of 'bin_by' results.
    # TODO : consider transitioning 'bin_by' and 'snap_by' into a class.
    # Integrate below checks within a template class.
    # Some checks may probably be managed at class instantiation,
    # others at runtime.
    if bin_closed != LEFT and bin_closed != RIGHT:
        raise ValueError(f"'bin_closed' has to be set either to '{LEFT}' or to '{RIGHT}'.")
    if not isinstance(bin_labels, Series):
        # Because `iloc` is used afterwards, `bin_labels` has to be a pandas
        # Series.
        raise TypeError("'bin_labels' has to be a pandas Series.")
    n_bins = len(next_chunk_starts)
    if n_bins != len(bin_labels):
        raise ValueError("'next_chunk_starts' and 'chunk_labels' have to be of the same size.")
    if n_bins != len(bin_ends):
        raise ValueError("'next_chunk_starts' and 'chunk_ends' have to be of the same size.")
    if isinstance(buffer, dict) and next_chunk_starts[-1] != len(data):
        raise ValueError(
            "series of bins have to cover the full length of 'data'. "
            f"But last bin ends at row {next_chunk_starts[-1]} "
            f"excluded, while size of data is {len(data)}.",
        )
    if buffer is not None:
        # A buffer that is not 'None' means a restart is expected.
        if n_bins > 1 and next_chunk_starts[-2] == len(on):
            # In case a user-provided 'bin_by()' Callable is used, check if
            # there are empty trailing bins. If there are, and restarts are
            # expected (use of 'buffer'), then raise an error; this is not
            # allowed, as it would lead to wrong results in 'jcumsegagg()'.
            raise ValueError(
                "there is at least one empty trailing bin. "
                "This is not possible if planning to restart on new "
                "data in a next iteration.",
            )
        if KEY_LAST_BIN_LABEL in buffer and buffer[KEY_LAST_BIN_LABEL] != bin_labels.iloc[0]:
            # When using snapshots, and in case of multiple calls, check that
            # the label of the last bin (previous iteration) is the same as
            # the label of the first bin (current iteration).
            raise ValueError(
                f"not possible to have label '{buffer[KEY_LAST_BIN_LABEL]}' "
                "of last bin at previous iteration different from label "
                f"'{bin_labels.iloc[0]}' of first bin at current iteration.",
            )
    if snap_by is not None:
        # Define points of observation.
        (next_snap_starts, snap_labels, n_max_null_snaps, _, snap_ends, _) = by_scale(
            on=data.loc[:, ordered_on],
            by=snap_by,
            closed=bin_closed,
            buffer=buffer_snap,
        )
        # Consolidate 'next_snap_starts' into 'next_chunk_starts'.
        # If bins are left-closed, the end of the last bin can possibly be
        # unknown yet.
        # If a snapshot (observation point) is also set at the end of data
        # (a snapshot position is always known, because it is either
        # derived from a pandas TimeGrouper, or an iterable of ordered
        # values), then 'mergesort()' cannot sort them one to the other (end
        # of last bin with last snapshot).
        # In this case, we force the bin end to be after the last snapshot.
        # The logic is that we want both to know the last bin and last
        # snapshot while this last bin is in-progress.
        # Having the bin end before the snapshot would on the opposite
        # reset the data and the resulting snapshot would be a null one.
        next_chunk_starts, bin_indices = mergesort(
            labels=(next_snap_starts, next_chunk_starts),
            keys=(snap_ends, bin_ends),
            force_last_from_second=unknown_last_bin_end,
        )
        # Take indices of 'next_chunk_starts' corresponding to bins that are
        # followed right after by a snapshot.
        # ('append=len(next_chunk_starts)' in 'nonzero()' allows simulating a
        # configuration in which the last index in 'next_chunk_starts' is
        # that of a bin, hence detecting if a snapshot is after the actual
        # (real) last bin. Without it, a snapshot after the last bin would
        # not be detected and, if needed, accounted for.)
        indices_of_bins_followed_by_a_snap = bin_indices[
            nonzero(ndiff(bin_indices, append=len(next_chunk_starts)) - 1)[0]
        ]
        # Check if the 'next_chunk_starts' for these bins equal that of the
        # snapshot that follows. If yes, then those are potential additional
        # null snapshots.
        n_max_null_snaps += len(
            nonzero(
                (
                    next_chunk_starts[indices_of_bins_followed_by_a_snap]
                    - next_chunk_starts[indices_of_bins_followed_by_a_snap + 1]
                )
                == 0,
            )[0],
        )
    else:
        bin_indices = NULL_INT64_1D_ARRAY
        snap_labels = None
        n_max_null_snaps = 0
    # Keep track of last bin label for checking at the next iteration.
    # The check is managed at an upper level, in `cumsegagg`.
    if buffer is not None:
        buffer[KEY_LAST_BIN_LABEL] = bin_labels.iloc[-1]
    return (
        next_chunk_starts,
        bin_indices,
        bin_labels,
        n_null_bins,
        snap_labels,
        n_max_null_snaps,
    )