oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/stateful_ops/aggstream/segmentby.py
@@ -0,0 +1,1018 @@
+ #!/usr/bin/env python3
+ """
+ Created on Wed Dec 4 21:30:00 2021.
+
+ @author: pierrot
+
+ """
+ from collections.abc import Callable
+ from functools import partial
+ from math import ceil
+ from math import fmod
+
+ from numba import njit
+ from numpy import arange
+ from numpy import argsort
+ from numpy import concatenate
+ from numpy import diff as ndiff
+ from numpy import dtype
+ from numpy import full
+ from numpy import insert as ninsert
+ from numpy import ndenumerate
+ from numpy import nonzero
+ from numpy import zeros
+ from numpy.typing import NDArray
+ from pandas import DataFrame
+ from pandas import IntervalIndex
+ from pandas import Series
+ from pandas import Timedelta
+ from pandas import concat
+ from pandas import date_range
+ from pandas.core.resample import TimeGrouper
+ from pandas.core.resample import _get_timestamp_range_edges as gtre
+
+
+ # Some constants.
+ DTYPE_INT64 = dtype("int64")
+ DTYPE_DATETIME64 = dtype("datetime64[ns]")
+ NULL_INT64_1D_ARRAY = zeros(0, DTYPE_INT64)
+ LEFT = "left"
+ RIGHT = "right"
+ # Keys for main buffer.
+ KEY_BIN = "bin"
+ KEY_SNAP = "snap"
+ # Keys for 'by_...' when a Callable.
+ KEY_LAST_BIN_LABEL = "last_bin_label"
+ KEY_LAST_BIN_END = "last_bin_end"
+ KEY_RESTART_KEY = "restart_key"
+ KEY_LAST_ON_VALUE = "last_on_value"
+ # Keys for 'bin_by' when a dict.
+ KEY_ON_COLS = "on_cols"
+ KEY_BIN_BY = "bin_by"
+ KEY_ORDERED_ON = "ordered_on"
+ KEY_SNAP_BY = "snap_by"
+ KEY_BIN_ON = "bin_on"
+
+
+ @njit(
+     [
+         "Tuple((int64[:], int64, boolean))(int64[:], int64[:], boolean)",
+         "Tuple((int64[:], int64, boolean))(float64[:], float64[:], boolean)",
+     ],
+ )
+ def _next_chunk_starts(
+     data: NDArray,
+     right_edges: NDArray,
+     right: bool,
+ ):
+     """
+     Return row indices for starts of next chunks.
+
+     Parameters
+     ----------
+     data : NDArray
+         One-dimensional array from which to derive next chunk starts,
+         assuming data is sorted (monotonically increasing).
+     right_edges : NDArray
+         One-dimensional array of chunk right edges, sorted.
+     right : bool
+         If `True`, the histogram is built considering right-closed bins.
+         If `False`, it is built considering left-closed bins.
+
+     Returns
+     -------
+     next_chunk_starts : ndarray
+         One-dimensional array containing the row index at which the next
+         chunk starts, to bin 'data' as per 'right_edges'.
+         If the last right edges lie beyond 'data', the 'next chunk starts'
+         for the resulting empty bins are not returned.
+         Size of 'next_chunk_starts' is hence smaller than or equal to
+         ``len(right_edges)``.
+     n_null_chunks : int
+         Number of null chunks identified.
+     data_traversed : boolean
+         Specifies if 'data' has been completely traversed or not.
+
+     """
+     # Output variables.
+     next_chunk_starts = zeros(len(right_edges), dtype=DTYPE_INT64)
+     n_null_chunks = 0
+     # Indices tracking progress in 'data'; equal successive values flag a
+     # null chunk.
+     _d_idx = prev_d_idx = 0
+     data_max_idx = len(data) - 1
+     for (b_idx_loc,), bin_ in ndenumerate(right_edges):
+         prev_bin = True
+         if right:
+             # Right-closed bins.
+             for (_d_idx_loc,), val in ndenumerate(data[_d_idx:]):
+                 if val > bin_:
+                     prev_bin = False
+                     break
+         else:
+             # Left-closed bins.
+             for (_d_idx_loc,), val in ndenumerate(data[_d_idx:]):
+                 if val >= bin_:
+                     prev_bin = False
+                     break
+         _d_idx += _d_idx_loc
+         if _d_idx == data_max_idx and prev_bin:
+             # Array 'data' terminated and loop stayed in previous chunk.
+             # Then, last loop has not been accounted for.
+             # Hence a '+1' to account for it.
+             next_chunk_starts[b_idx_loc] = _d_idx + 1
+             # Previous code to return all bins, including the empty ones
+             # defined by the last values in 'right_edges'.
+             # next_chunk_starts[b_idx_loc:] = _d_idx + 1
+             # n_null_chunks += len(next_chunk_starts[b_idx_loc:]) - 1
+             # Do not return empty bins at end of data.
+             return next_chunk_starts[: b_idx_loc + 1], n_null_chunks, True
+         else:
+             next_chunk_starts[b_idx_loc] = _d_idx
+             if prev_d_idx == _d_idx:
+                 n_null_chunks += 1
+             else:
+                 prev_d_idx = _d_idx
+     # Array 'right_edges' is terminated, before 'data' is ended.
+     return next_chunk_starts, n_null_chunks, False
+
+
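For reference, up to the truncation of trailing empty bins, the jitted scan above computes the same indices as numpy's `searchsorted`. A minimal sketch (pure numpy, illustrative values only, not calling the jitted function):

    from numpy import array, searchsorted

    data = array([1, 2, 5, 5, 9])      # sorted values
    right_edges = array([2, 5, 7])     # sorted chunk right edges
    # Left-closed bins (right=False): next chunk starts where data >= edge.
    searchsorted(data, right_edges, side="left")    # -> [1, 2, 4]
    # Right-closed bins (right=True): next chunk starts where data > edge.
    searchsorted(data, right_edges, side="right")   # -> [2, 4, 4]
    # Successive equal indices (4, 4) flag an empty chunk, which is what
    # 'n_null_chunks' counts.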
+ def by_scale(
+     on: Series,
+     by: TimeGrouper | Series | tuple[Series],
+     closed: str | None = None,
+     buffer: dict | None = None,
+ ) -> tuple[NDArray, Series, int, str, Series, bool]:
+     """
+     Segment an ordered DatetimeIndex or Series.
+
+     Parameters
+     ----------
+     on : Series
+         Ordered datetime Series over which the binning defined by 'by' is
+         performed.
+     by : Grouper or Series or tuple of 2 Series
+         Binning setup, either a pandas TimeGrouper, or values contained in
+         a Series.
+         If a Series, values are used both as ends and labels of chunks.
+         If a tuple of 2 Series, values in the first Series are labels of
+         chunks, and values in the second Series are ends of chunks.
+     closed : str, default None
+         Optional string specifying if intervals defined by 'by' are left or
+         right closed. This parameter overrides 'by.closed' if 'by' is a
+         pandas TimeGrouper.
+     buffer : dict
+         Dict keeping parameters which allow chaining calls to 'by_scale',
+         with ``restart_key`` keeping track of the end of the one-but-last
+         chunk from the previous iteration, derived from 'by'.
+
+     Returns
+     -------
+     tuple[NDArray, Series, int, str, Series, bool]
+         The first 3 items are used in 'cumsegagg' in all situations.
+         - ``next_chunk_starts``, a one-dimensional array of `int` specifying
+           the row indices of the next-bin starts, for each bin. Successive
+           identical indices imply empty bins, except the first.
+         - ``chunk_labels``, a pandas Series specifying for each bin its
+           label. Labels are defined as per 'by' pandas TimeGrouper.
+         - ``n_null_chunks``, an int, the number of null chunks identified
+           in 'on'.
+
+         The 3 following items are used only if both bins and snapshots are
+         generated in 'cumsegagg'.
+         - ``chunk_closed``, a str, indicating if bins are left or right
+           closed, as per 'by' pandas TimeGrouper or 'closed' parameter.
+         - ``chunk_ends``, a pandas Series containing bin ends, as per 'by'
+           pandas TimeGrouper.
+         - ``unknown_last_chunk_end``, a boolean, always `False`, specifying
+           that the last chunk end is known. This is because chunk ends are
+           always fully specified as per 'by' pandas TimeGrouper or Series.
+
+     Notes
+     -----
+     If running ``by_scale()`` with a buffer, the value set for key
+     ``"restart_key"`` depends on whether the last value derived from 'by'
+     (either a TimeGrouper or a Series) lies before the last value in 'on'.
+     - If it lies before, then this last value derived from 'by' is the
+       restart key.
+     - If it lies after, then the one-but-last value derived from 'by' is
+       the restart key.
+
+     """
+     if isinstance(by, TimeGrouper):
+         # If 'buffer' is not empty, it necessarily contains 'KEY_RESTART_KEY'.
+         first = buffer[KEY_RESTART_KEY] if buffer else on.iloc[0]
+         # If 'closed' is not provided (e.g. when 'by' is for snapshotting,
+         # 'closed' is provided), fall back on 'by.closed'.
+         if closed is None:
+             closed = by.closed
+         # TODO: replace with date_utils.floor_ts() and date_utils.ceil_ts()?
+         start, end = gtre(
+             first=first,
+             last=on.iloc[-1],
+             freq=by.freq,
+             closed=closed,
+             unit=first.unit,
+             origin=by.origin,
+             offset=by.offset,
+         )
+         edges = date_range(start, end, freq=by.freq)
+         chunk_ends = edges[1:]
+         chunk_labels = chunk_ends if by.label == RIGHT else edges[:-1]
+     else:
+         # Case 'by' is a Series.
+         if closed is None:
+             raise ValueError(f"'closed' has to be set to {LEFT} or {RIGHT}.")
+         if isinstance(by, tuple):
+             chunk_labels, chunk_ends = by
+             if len(chunk_labels) != len(chunk_ends):
+                 raise ValueError(
+                     "number of chunk labels has to be equal to number of chunk ends.",
+                 )
+         else:
+             chunk_labels = chunk_ends = by
+         if buffer:
+             # In case there has been no snapshot at the previous iteration,
+             # 'buffer' will not contain 'KEY_RESTART_KEY', but will contain
+             # 'KEY_LAST_ON_VALUE'.
+             if KEY_RESTART_KEY in buffer and buffer[KEY_RESTART_KEY] != chunk_ends[0]:
+                 # In case of restart, if the first value in 'chunk_ends' is
+                 # not the one that was used at the previous iteration, first
+                 # try to trim values in 'by' that are earlier than
+                 # 'restart_key'.
+                 n_chunk_ends_init = len(chunk_ends)
+                 chunk_ends = chunk_ends[chunk_ends >= buffer[KEY_RESTART_KEY]]
+                 if buffer[KEY_RESTART_KEY] != chunk_ends[0]:
+                     raise ValueError(
+                         f"'by' needs to contain value {buffer[KEY_RESTART_KEY]} to restart correctly.",
+                     )
+                 n_first_chunks_to_remove = n_chunk_ends_init - len(chunk_ends)
+                 chunk_labels = chunk_labels[n_first_chunks_to_remove:]
+             if KEY_LAST_ON_VALUE in buffer:
+                 # In the specific case 'on' has not been traversed completely
+                 # at the previous iteration, the chunk for the remaining of
+                 # the data has no label, and will not appear in the snapshot
+                 # results. But it will be calculated during the aggregation
+                 # phase ('cumsegagg()'), and kept in a temporary variable
+                 # ('chunk_res').
+                 # In this case, at the next iteration, with new chunk ends,
+                 # a specific check is managed here to ensure correctness of
+                 # the restart.
+                 # For this new iteration,
+                 # - a new bin has necessarily to be started. Otherwise,
+                 #   aggregation results for the last chunk at the previous
+                 #   iteration will overwrite those of the elapsed last bin.
+                 #   This last bin has been completed at the previous
+                 #   iteration. Its results do not have to be modified.
+                 # - this new first bin has to end after the last value in
+                 #   'on' from the previous iteration. If it does not, then
+                 #   the remaining aggregated data from the previous
+                 #   iteration is not usable, as it aggregates over several
+                 #   chunks.
+                 # If there is a single chunk end, then it is that of the
+                 # previous iteration, nothing to check.
+                 last_on_value = buffer[KEY_LAST_ON_VALUE]
+                 if len(chunk_ends) > 1 and (
+                     (closed == RIGHT and chunk_ends[1] < last_on_value)
+                     or (closed == LEFT and chunk_ends[1] <= last_on_value)
+                 ):
+                     raise ValueError(
+                         "2nd chunk end in 'by' has to be larger than value "
+                         f"{buffer[KEY_LAST_ON_VALUE]} to restart correctly.",
+                     )
+                 if (closed == RIGHT and chunk_ends[0] < last_on_value) or (
+                     closed == LEFT and chunk_ends[0] <= last_on_value
+                 ):
+                     # At the previous iteration, if the last value in 'on'
+                     # is later than the first chunk end, then this chunk
+                     # should not be updated. It is 'done'.
+                     # To prevent updating it, this chunk should be removed.
+                     # Only the 1st chunk is removed, because it was just
+                     # checked that the 2nd chunk complies correctly with
+                     # this condition.
+                     chunk_ends = chunk_ends[1:]
+                     chunk_labels = chunk_labels[1:]
+                 del buffer[KEY_LAST_ON_VALUE]
+         if chunk_ends.empty:
+             if isinstance(buffer, dict):
+                 buffer[KEY_LAST_ON_VALUE] = on.iloc[-1]
+             return (NULL_INT64_1D_ARRAY, chunk_labels, 0, closed, chunk_ends, False)
+     if chunk_ends.dtype == DTYPE_DATETIME64:
+         next_chunk_starts, n_null_chunks, data_traversed = _next_chunk_starts(
+             on.to_numpy(copy=False).view(DTYPE_INT64),
+             chunk_ends.to_numpy(copy=False).view(DTYPE_INT64),
+             closed == RIGHT,
+         )
+     else:
+         next_chunk_starts, n_null_chunks, data_traversed = _next_chunk_starts(
+             on.to_numpy(copy=False),
+             chunk_ends.to_numpy(copy=False),
+             closed == RIGHT,
+         )
+     n_chunks = len(next_chunk_starts)
+     # Rationale for selecting the "restart key".
+     # - For a correct restart at iteration N+1, the restart point needs to
+     #   be that of the last bin at iteration N that has been "in-progress".
+     #   The restart is said correct because it restarts on new data, where
+     #   aggregation at iteration N stopped. There is no omission of new
+     #   data, nor omission of possibly empty bins till new data.
+     # - At iteration N,
+     #   - if the last value derived from 'by' is after the last value in
+     #     'on', then at the next iteration, N+1, new data can be used which
+     #     still lies before this last value derived from 'by' at iteration
+     #     N. To make sure this new data is correctly managed, we need to
+     #     restart from the one-but-last value derived from 'by' at
+     #     iteration N.
+     #   - if the last value derived from 'by' is before the last value in
+     #     'on', then at the next iteration, N+1, we are sure no new data
+     #     will appear before it. This last value can be safely used as
+     #     restart value.
+     # TODO: when splitting 'by_scale()' into 'by_pgrouper()' and
+     # 'by_scale()', for 'by_pgrouper()', then using for 'restart_key' the
+     # last value in 'on' complies with whatever the 'closed' parameter is
+     # (I think). This simplifies below code.
+     if data_traversed:
+         chunk_labels = chunk_labels[:n_chunks]
+         chunk_ends = chunk_ends[:n_chunks]
+         if buffer is not None:
+             if closed == LEFT and isinstance(by, TimeGrouper):
+                 # Use of intricate way to get last or last-but-one element
+                 # in 'chunk_ends', compatible with both Series and
+                 # DatetimeIndex.
+                 if n_chunks > 1:
+                     # Get one-but-last element.
+                     # Initialize this way if there are at least 2 elements.
+                     buffer[KEY_RESTART_KEY] = chunk_ends[n_chunks - 2]
+                 else:
+                     # If there is a single incomplete bin, take first
+                     # element in 'on'.
+                     buffer[KEY_RESTART_KEY] = on.iloc[0]
+             else:
+                 # Take last end,
+                 # - either if 'by' is a TimeGrouper, as it is enough for
+                 #   generating edges at next iteration,
+                 # - or if 'by' is a Series, because Series only needs to
+                 #   restart from this point then.
+                 buffer[KEY_RESTART_KEY] = chunk_ends[n_chunks - 1]
+     elif buffer is not None:
+         # Data is not traversed.
+         # This can only happen if 'by' is not a TimeGrouper.
+         # Keep last chunk end.
+         buffer[KEY_RESTART_KEY] = chunk_ends[n_chunks - 1]
+         buffer[KEY_LAST_ON_VALUE] = on.iloc[-1]
+     return (
+         next_chunk_starts,
+         Series(chunk_labels),
+         n_null_chunks,
+         closed,
+         chunk_ends,
+         False,
+     )
+
+
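A minimal usage sketch of 'by_scale' with a pandas TimeGrouper, assuming the function above is importable (e.g. from oups.stateful_ops.aggstream.segmentby); values are illustrative and exact edges depend on the Grouper's 'origin' default:

    from pandas import Grouper, Series, date_range

    on = Series(date_range("2024-01-01 00:10", periods=5, freq="20min"))
    buffer = {}
    next_chunk_starts, labels, n_null, closed, ends, _ = by_scale(
        on,
        by=Grouper(freq="1h", label="right", closed="left"),
        buffer=buffer,
    )
    # Hourly edges cover 00:00 to 02:00; rows 0-2 fall in the first bin,
    # rows 3-4 in the second, so next_chunk_starts is [3, 5].
    # 'buffer' now holds the restart key for a chained call on newer data.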
+ def by_x_rows(
+     on: DataFrame | Series,
+     by: int | None = 4,
+     closed: str | None = LEFT,
+     buffer: dict | None = None,
+ ) -> tuple[NDArray, Series, int, str, Series, bool]:
+     """
+     Segment by group of x rows.
+
+     Dummy binning function for testing 'cumsegagg' with 'bin_by' set as a
+     Callable.
+
+     Parameters
+     ----------
+     on : DataFrame | Series
+         Either a pandas Series or a DataFrame made of two columns, from
+         which are derived
+         - the number of rows in 'on',
+         - bin labels for each bin (from the last column of 'on'),
+         - bin ends for each bin (from the last column of 'on').
+     by : int, default 4
+         Number of rows in a bin.
+     closed : str, default "left"
+         How the segments are closed, either "left" or "right".
+     buffer : dict, default None
+         Dict keeping 2 parameters which allow chaining calls to
+         'by_x_rows':
+         - 'restart_key', an int specifying the number of rows in the last
+           (and possibly incomplete) bin from the previous call to
+           'by_x_rows'.
+         - 'last_bin_label', label of the last bin, that will be reused in
+           the next iteration.
+
+     Returns
+     -------
+     tuple[NDArray, Series, int, str, Series, bool]
+         The first 3 items are used in 'cumsegagg' in all situations.
+         - ``next_chunk_starts``, a one-dimensional numpy array of int,
+           specifying for each bin the row index at which the next bin
+           starts.
+         - ``bin_labels``, a pandas Series specifying for each bin its
+           label. Labels are the first value of each bin, taken from the
+           last column of 'on' (which is supposed to be an ordered column).
+         - ``n_null_bins``, an int, always ``0``.
+
+         The 3 next items are used only if both bins and snapshots are
+         generated in 'cumsegagg'.
+         - ``bin_closed``, a str, ``"left"`` or ``"right"``, indicating that
+           the bins are left or right closed.
+         - ``bin_ends``, a pandas Series made of values from the last column
+           of 'on' (which is either single-column or two-column) and
+           indicating the "position" of the bin end, which is marked by the
+           start of the next bin, excluded. The end of the last bin being
+           unknown by definition (because it is excluded), the last value is
+           not relevant. It is forced anyhow in 'segmentby()' to be last.
+         - ``unknown_last_bin_end``, a boolean specifying if the last bin
+           end is unknown. It is ``True`` if bins are left-closed, meaning
+           that their end is excluded. Hence, the last bin is always
+           "in-progress". It is ``False`` if they are right-closed.
+
+     """
+     len_on = len(on)
+     if isinstance(on, DataFrame):
+         # Keep only last column, supposed to be `ordered_on` column.
+         on = on.iloc[:, -1]
+     # Derive number of rows in first bins (cannot be 0) and number of bins.
+     if buffer is not None and KEY_RESTART_KEY in buffer:
+         # Case 'restart'.
+         rows_in_prev_last_bin = buffer[KEY_RESTART_KEY]
+         rows_in_continued_bin = min(len_on, by - rows_in_prev_last_bin) if rows_in_prev_last_bin != by else 0
+     else:
+         # Case 'start from scratch'.
+         rows_in_prev_last_bin = 0
+         rows_in_continued_bin = 0
+     n_rows_for_new_bins = len_on - rows_in_continued_bin
+     n_bins = ceil(n_rows_for_new_bins / by) + 1 if rows_in_continued_bin else ceil(n_rows_for_new_bins / by)
+     # Define 'next_chunk_starts'.
+     first_next_chunk_start = rows_in_continued_bin if rows_in_continued_bin else min(by, len_on)
+     next_chunk_starts = arange(
+         start=first_next_chunk_start,
+         stop=(n_bins - 1) * by + first_next_chunk_start + 1,
+         step=by,
+     )
+     # Make a copy and arrange for deriving 'chunk_starts', required for
+     # defining bin labels. 'bin_labels' are derived from the last column
+     # (it is then 'ordered_on' and if not, is 'bin_on'). Bin labels are the
+     # 1st value in each bin.
+     chunk_starts = next_chunk_starts.copy() - by
+     # Correct start of 1st chunk.
+     chunk_starts[0] = 0
+     bin_labels = on.iloc[chunk_starts].reset_index(drop=True)
+     if n_rows_for_new_bins:
+         # Case 'there are new bins'.
+         n_rows_in_last_bin = (
+             n_rows_for_new_bins if n_rows_for_new_bins <= by else fmod(n_rows_for_new_bins, by) or by
+         )
+     else:
+         # Case 'there are not'.
+         n_rows_in_last_bin = rows_in_continued_bin + rows_in_prev_last_bin
+     if closed == LEFT:
+         # Case 'left', end is start of next bin, excluded.
+         # 'bin_ends' has no end for last bin, because it is unknown.
+         # Temporarily adjust 'next_chunk_start' of last bin to last index.
+         next_chunk_starts[-1] = len_on - 1
+         bin_ends = on.iloc[next_chunk_starts].reset_index(drop=True)
+         unknown_last_bin_end = True
+         # Reset 'next_chunk_start' of last bin.
+         next_chunk_starts[-1] = len_on
+     elif closed == RIGHT:
+         # Case 'right', end is end of current bin, included.
+         bin_ends = on.iloc[next_chunk_starts - 1].reset_index(drop=True)
+         # Bin end is unknown if last bin does not end exactly.
+         unknown_last_bin_end = n_rows_in_last_bin != by
+     # There is likely no empty bin.
+     n_null_bins = 0
+     if buffer is not None:
+         if buffer:
+             if rows_in_continued_bin:
+                 # Correct 1st label if not a new bin.
+                 bin_labels.iloc[0] = buffer[KEY_LAST_BIN_LABEL]
+             else:
+                 # If a new bin has been created right at start,
+                 # insert an empty one with label of last bin at prev
+                 # iteration.
+                 bin_labels = concat(
+                     [Series([buffer[KEY_LAST_BIN_LABEL]]), bin_labels],
+                 ).reset_index(drop=True)
+                 first_bin_end = buffer[KEY_LAST_BIN_END] if closed == RIGHT else on.iloc[0]
+                 bin_ends = concat([Series([first_bin_end]), bin_ends]).reset_index(drop=True)
+                 next_chunk_starts = ninsert(next_chunk_starts, 0, 0)
+                 # In this case, first bin is empty.
+                 n_null_bins = 1
+         # Update 'buffer[xxx]' parameters for next run.
+         buffer[KEY_RESTART_KEY] = n_rows_in_last_bin
+         buffer[KEY_LAST_BIN_LABEL] = bin_labels.iloc[-1]
+         if closed == RIGHT:
+             buffer[KEY_LAST_BIN_END] = bin_ends.iloc[-1]
+     return (
+         next_chunk_starts,
+         bin_labels,
+         n_null_bins,
+         closed,
+         bin_ends,
+         unknown_last_bin_end,
+     )
+
+
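A chained-call sketch of 'by_x_rows' (illustrative values; the buffer dict carries the in-progress bin across calls):

    from pandas import Series

    buffer = {}
    # First batch: 10 rows, bins of 4 -> 3 bins of 4, 4 and 2 rows.
    ncs, labels, *_ = by_x_rows(Series(range(10)), by=4, buffer=buffer)
    # ncs -> [4, 8, 10]; labels -> [0, 4, 8]; the 2-row last bin stays open.
    # Second batch: 6 new rows; the open bin is completed first.
    ncs, labels, *_ = by_x_rows(Series(range(10, 16)), by=4, buffer=buffer)
    # ncs -> [2, 6]; labels -> [8, 12] (1st label reused from previous call).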
+ def mergesort(
+     labels: tuple[NDArray, NDArray],
+     keys: tuple[NDArray, NDArray],
+     force_last_from_second: bool | None = False,
+ ) -> tuple[NDArray, NDArray]:
+     """
+     Mergesort labels from keys.
+
+     Parameters
+     ----------
+     labels : tuple[NDArray, NDArray]
+         2 one-dimensional arrays of labels to be merged together, provided
+         as a ``tuple``.
+     keys : tuple[NDArray, NDArray]
+         2 one-dimensional arrays of sorted keys according to which labels
+         can be sorted one with respect to the other.
+         ``keys[0]``, resp. ``1``, are keys for ``labels[0]``, resp. ``1``.
+     force_last_from_second : bool, default False
+         If True, the last label in the resulting sorted array is forced to
+         be the last from the second label array.
+
+     Returns
+     -------
+     tuple[NDArray, NDArray]
+         The first array contains sorted labels from the 2 input arrays.
+         The second array contains the insertion indices for labels (i.e.
+         the indices in the resulting merged array) from the 2nd input
+         array.
+
+     Notes
+     -----
+     If a value is found in both input arrays, then the value of the 2nd
+     input array comes after the value of the 1st input array, as can be
+     checked with insertion indices.
+
+     """
+     # TODO: transition this to numba.
+     labels1, labels2 = labels
+     keys1, keys2 = keys
+     len_labels1 = len(labels1)
+     len_labels2 = len(labels2)
+     if len(keys1) != len_labels1:
+         raise ValueError(
+             "not possible to have arrays of different length for first labels and keys arrays.",
+         )
+     if len(keys2) != len_labels2:
+         raise ValueError(
+             "not possible to have arrays of different length for second labels and keys arrays.",
+         )
+     if force_last_from_second:
+         len_tot = len_labels1 + len_labels2
+         sort_indices = full(len_tot, len_tot - 1, dtype=DTYPE_INT64)
+         sort_indices[:-1] = argsort(concatenate((keys1, keys2[:-1])), kind="mergesort")
+     else:
+         sort_indices = argsort(concatenate(keys), kind="mergesort")
+     return concatenate(labels)[sort_indices], nonzero(len_labels1 <= sort_indices)[0]
+
+
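A small sketch of the 'mergesort' semantics (illustrative values):

    from numpy import array

    merged, insert_idx = mergesort(
        labels=(array(["b1", "b2"]), array(["s1", "s2"])),
        keys=(array([1, 3]), array([2, 3])),
    )
    # merged -> ['b1', 's1', 'b2', 's2']; insert_idx -> [1, 3]
    # On the key tie (3), 'b2' (1st array) lands before 's2' (2nd array),
    # as stated in the Notes section above.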
+ def setup_segmentby(
+     bin_by: TimeGrouper | Callable,
+     bin_on: str | None = None,
+     ordered_on: str | None = None,
+     snap_by: TimeGrouper | Series | None = None,
+ ) -> dict[str, Callable | str]:
+     """
+     Check and setup parameters to operate data segmentation.
+
+     Parameters
+     ----------
+     bin_by : TimeGrouper | Callable
+         A pandas TimeGrouper or a Callable to perform segmentation.
+     bin_on : str | None
+         Name of the column onto which the segmentation is performed.
+     ordered_on : str | None
+         Name of the column containing ordered data, to use when
+         snapshotting. With this column, snapshots (points of observation)
+         can be positioned with respect to bin ends.
+     snap_by : TimeGrouper | Series | None
+         A pandas TimeGrouper or a pandas Series defining the snapshots
+         (points of observation).
+
+     Returns
+     -------
+     dict[str, Callable | str]
+         A dict with keys
+         - ``BIN_BY``, 'bin_by' forced as a Callable,
+         - ``ON_COLS``, column name or list of column names to be used for
+           segmentation,
+         - ``ORDERED_ON``, consolidated value for 'ordered_on' column,
+         - ``BIN_ON``, consolidated value for 'bin_on' column,
+         - ``SNAP_BY``, 'snap_by' if a TimeGrouper, ``None`` otherwise.
+
+     """
+     bin_by_closed = None
+     if isinstance(bin_by, TimeGrouper):
+         # 'bin_by' is a TimeGrouper.
+         bin_by_closed = bin_by.closed
+         if bin_by.key:
+             if bin_on:
+                 if bin_by.key != bin_on:
+                     raise ValueError(
+                         "not possible to set 'bin_by.key' and 'bin_on' to different values.",
+                     )
+             else:
+                 bin_on = bin_by.key
+         elif not bin_on:
+             raise ValueError("not possible to set both 'bin_by.key' and 'bin_on' to `None`.")
+         if ordered_on and ordered_on != bin_on:
+             raise ValueError(
+                 "not possible to set 'bin_on' and 'ordered_on' to different values when "
+                 "'bin_by' is a TimeGrouper.",
+             )
+         elif not ordered_on:
+             # Case 'ordered_on' has not been provided but 'bin_on' has been.
+             # Then set 'ordered_on' to 'bin_on'. This is so because 'bin_by'
+             # is a TimeGrouper.
+             ordered_on = bin_on
+         bin_by = partial(by_scale, by=bin_by)
+     elif callable(bin_by):
+         # 'bin_by' is a Callable.
+         if bin_on is None and ordered_on is None:
+             raise ValueError("not possible to set both 'bin_on' and 'ordered_on' to `None`.")
+     else:
+         # 'bin_by' is neither a TimeGrouper, nor a Callable.
+         # This is not possible.
+         raise ValueError(
+             "not possible to have 'bin_by' parameter different than a pandas TimeGrouper or a Callable.",
+         )
+     if snap_by is not None:
+         if isinstance(snap_by, TimeGrouper):
+             if snap_by.key:
+                 if ordered_on is None:
+                     ordered_on = snap_by.key
+                 elif snap_by.key != ordered_on:
+                     raise ValueError(
+                         "not possible to set 'ordered_on' and 'snap_by.key' to different values.",
+                     )
+             if bin_by_closed and snap_by.closed != bin_by_closed:
+                 raise ValueError(
+                     "not possible to set 'bin_by.closed' and 'snap_by.closed' to different values.",
+                 )
+         elif not ordered_on:
+             # Case 'snap_by' is not a TimeGrouper.
+             raise ValueError(
+                 "not possible to leave 'ordered_on' to `None` in case of snapshotting.",
+             )
+     return {
+         KEY_BIN_BY: bin_by,
+         KEY_ON_COLS: (
+             [bin_on, ordered_on]
+             if ordered_on and bin_on and ordered_on != bin_on
+             else bin_on if bin_on else ordered_on
+         ),
+         KEY_ORDERED_ON: ordered_on,
+         KEY_BIN_ON: bin_on,
+         KEY_SNAP_BY: snap_by if isinstance(snap_by, TimeGrouper) else None,
+     }
+
+
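A sketch of the consolidated setup returned for a TimeGrouper (hypothetical column name "ts"):

    from pandas import Grouper

    conf = setup_segmentby(bin_by=Grouper(freq="1h", key="ts"))
    # conf["bin_by"]     -> partial(by_scale, by=<TimeGrouper>)
    # conf["on_cols"]    -> "ts" (bin_on and ordered_on consolidate to "ts")
    # conf["ordered_on"] -> "ts"; conf["bin_on"] -> "ts"
    # conf["snap_by"]    -> None (no snapshotting requested)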
+ def setup_mainbuffer(buffer: dict, with_snapshot: bool | None = False) -> tuple[dict, dict]:
+     """
+     Return 'buffer_bin' and 'buffer_snap' from main buffer.
+
+     Parameters
+     ----------
+     buffer : dict
+         Main buffer, either containing only values for 'buffer_bin', or
+         only two keys `"bin"` and `"snap"`, providing a separate dict for
+         each of the binning and snapshotting processes.
+     with_snapshot : bool, default False
+         ``True`` if the snapshotting process is requested.
+
+     Returns
+     -------
+     tuple[dict, dict]
+         The first dict is the binning buffer.
+         The second dict is the snapshotting buffer.
+
+     """
+     if buffer is not None:
+         if KEY_BIN not in buffer:
+             buffer[KEY_BIN] = {}
+             if with_snapshot:
+                 buffer[KEY_SNAP] = {}
+         if with_snapshot:
+             return buffer[KEY_BIN], buffer[KEY_SNAP]
+         else:
+             return buffer[KEY_BIN], None
+     else:
+         return None, None
+
+
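A sketch of how the main buffer is split (the sub-dicts are created in-place on first use):

    buffer = {}
    buffer_bin, buffer_snap = setup_mainbuffer(buffer, with_snapshot=True)
    # buffer -> {"bin": {}, "snap": {}}
    # Mutations of 'buffer_bin' / 'buffer_snap' by the binning and
    # snapshotting callables persist in 'buffer' across iterations.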
+ def segmentby(
+     data: DataFrame,
+     bin_by: TimeGrouper | Callable | dict,
+     bin_on: str | None = None,
+     ordered_on: str | None = None,
+     snap_by: TimeGrouper | Series | None = None,
+     buffer: dict | None = None,
+ ) -> tuple[NDArray, NDArray, Series, int, Series, int]:
+     """
+     Identify starts of segments in data, either bins or optionally snapshots.
+
+     Parameters
+     ----------
+     data : DataFrame
+         A pandas DataFrame containing the columns with which to conduct the
+         segmentation of data:
+         - ``bin_on`` column,
+         - optionally ``ordered_on`` column (same as the optional
+           ``snap_by.key`` column, from 'snap_by' parameter, if using
+           snapshots).
+
+         If any of ``ordered_on`` or ``snap_by.key`` parameters are used,
+         the column they point to (the same if both parameters are provided)
+         has to be ordered.
+     bin_by : TimeGrouper | Callable | dict
+         Callable or pandas TimeGrouper to perform binning.
+         If a Callable, it is called with the following parameters:
+         ``bin_by(on, buffer)``
+         where:
+         - ``on``,
+           - either ``ordered_on`` is ``None``. ``on`` is then a pandas
+             Series made from ``data[bin_on]`` column.
+           - or ``ordered_on`` is provided and is different from
+             ``bin_on``. Then ``on`` is a two-column pandas DataFrame made
+             of ``data[[bin_on, ordered_on]]``.
+             Values from ``data[ordered_on]`` have to be used to define bin
+             ends when 'snap_by' is set.
+             Also, values from ``data[ordered_on]`` can be used
+             advantageously as bin labels.
+         - ``buffer``, a dict that has to be modified in-place by 'bin_by'
+           to keep internal parameters which allow restart calls to
+           'bin_by'.
+
+         If a dict, it contains the full setup for conducting the
+         segmentation of 'data', as generated by 'setup_segmentby()'.
+         - 'on_cols', a str or list of str, to be forwarded to the 'bin_by'
+           Callable.
+         - 'bin_by', a Callable, either the one initially provided, or one
+           derived from a pandas TimeGrouper.
+         - 'ordered_on', a str, its definitive value.
+         - 'snap_by', if a TimeGrouper.
+
+         It then has to return a tuple made of 6 items. The first 3 items
+         are used whether snapshotting is used or not.
+         - ``next_chunk_starts``, a one-dimensional array of `int`,
+           specifying the row index at which the next bin starts (included)
+           as found in ``bin_on``.
+           If the same indices appear several times, it means that the
+           corresponding bins are empty, except the first one. In this
+           case, corresponding rows in the aggregation result will be
+           filled with null values.
+           The last value of this array always equals ``len(on)``.
+         - ``bin_labels``, a pandas Series whose values are expected to be
+           all bin labels, incl. those of empty bins, as they will appear
+           in aggregation results. Labels can be of any type.
+           In case of restarting the aggregation with new seed data, care
+           should be taken so that the label of the first bin is the same
+           as that of the last bin from the previous iteration if it has
+           been the same bin. An exception is raised if not.
+         - ``n_null_bins``, an `int` indicating the number of empty bins.
+
+         The 3 next items are used only in case of snapshotting
+         (``snap_by`` is not ``None``).
+         - ``bin_closed``, a str, either `'right'` or `'left'`, indicating
+           if bins are left or right-closed (i.e. if ``chunk_ends`` is
+           included or excluded in the bin).
+         - ``bin_ends``, an optional pandas Series, specifying the ends of
+           bins with values derived from ``data[ordered_on]`` column. If
+           snapshotting, then points of observation (defined by
+           ``snap_by``) are positioned with respect to the bin ends. This
+           data allows sorting snapshots with respect to bins in case they
+           start/end at the same row index in data.
+           ``bin_ends`` is not required if not snapshotting. If not used,
+           set to None.
+         - ``last_bin_end_unknown``, a boolean indicating if the end of the
+           last bin is known or not. If bins are left-closed, then it is
+           possible the end of the last bin is not known. In this case,
+           de-facto, this unknown bin end is supposed to be positioned
+           after all snapshots.
+
+     bin_on : str | None, default None
+         Name of the column in `data` over which the binning operation is
+         performed.
+         If 'bin_by' is a pandas `TimeGrouper`, its `key` parameter is used
+         instead.
+         If 'bin_on' is set, its consistency with the ``bin_by.key``
+         parameter is then checked.
+     ordered_on : str | None, default None
+         Name of an existing ordered column in 'data'. When set, it is then
+         forwarded to the 'bin_by' Callable.
+         This parameter is compulsory if 'snap_by' is set. Values derived
+         from 'snap_by' (either a TimeGrouper or a Series) are compared to
+         ``bin_ends``, themselves derived from ``data[ordered_on]``.
+     snap_by : TimeGrouper | Series | None, default None
+         Values positioning the points of observation, either derived from
+         a pandas TimeGrouper, or contained in a pandas Series.
+     buffer : dict | None, default None
+         Dict of 2 dicts.
+         - The first dict, with key `"bin"`, embeds values from the
+           previous binning process, set by 'bin_by' when it is a Callable,
+           or by the internal function ``by_scale`` if 'bin_by' is a
+           TimeGrouper. These values are required when restarting the
+           binning process with new seed data.
+         - The second dict, with key `"snap"`, embeds values from the
+           previous snapshotting process, set by 'by_scale'. Similarly,
+           these values are required to allow restarting the snapshotting
+           process with new seed data.
+
+     Returns
+     -------
+     Tuple made of 6 items.
+     - ``next_chunk_starts``, an ordered one-dimensional numpy array of
+       int, specifying for each bin and snapshot the row index at which the
+       next one starts.
+     - ``bin_indices``, a one-dimensional array of int, specifying which
+       value in ``next_chunk_starts`` relates to a bin (as opposed to a
+       snapshot).
+     - ``bin_labels``, a pandas Series specifying for each bin its label.
+     - ``n_null_bins``, an int, indicating how many bins are empty.
+     - ``snap_labels``, a pandas Series specifying for each snapshot its
+       label.
+     - ``n_max_null_snaps``, an int, an upper bound of the number of empty
+       snapshots.
+
+     Notes
+     -----
+     When implementing the `bin_by` Callable, the developer should take
+     care that ``next_chunk_starts``, ``chunk_labels`` and ``chunk_ends``
+     returned by 'bin_by' are all of the same size, i.e. the total number
+     of bins that are expected, including empty ones.
+
+     Also, when implementing it for repetitive calls, care should be taken
+     that `bin_by` keeps in the 'buffer' parameter all the data needed to:
+     - create the correct number of bins that would be in-between the data
+       processed at the previous aggregation iteration, and the new data.
+       This has to show in the 'next_chunk_starts' array that is returned.
+     - start with the same bin label as the previous iteration when using
+       snapshots.
+
+     Having the same bin label between both iterations when using snapshots
+     will ensure:
+     - that the bin with previous aggregation results is overwritten (ok,
+       not necessarily meaningful if agg results have not changed in case
+       there has been no new data in this bin).
+     - even if this bin is empty at restart, in the case of snapshotting,
+       it is necessary when this bin ends that new empty snapshots before
+       its end correctly forward past results, and that new empty snapshots
+       after this end are correctly accounted for as empty chunks. For this
+       reason, when using snapshots, a check ensures that the same bin
+       label is used between two successive iterations.
+
+     Still for repetitive calls of 'bin_by', care has to be taken that:
+     - the last bin is not an empty one.
+     - the last bin does cover the full size of data.
+
+     If not, exceptions will be raised.
+
+     When using snapshots, values defined by ``snap_by`` are considered the
+     "points of isolated observation". At such a point, an observation of
+     the "on-going" bin is made. In case of snapshot(s) positioned exactly
+     on segment(s) ends, at the same row index in data, the observation
+     point will always come before the bin end.
+
+     """
+     # TODO: split 'by_scale' into 'by_pgrouper' and 'by_scale'.
+     # TODO: make some tests validating use of 'by_scale' as 'bin_by'
+     # parameter (when user-provided 'bin_by' is a Series or a tuple of
+     # Series).
+     # TODO: consider transitioning 'bin_by' and 'snap_by' into a class.
+     # Probably, below initialization is to be part of a template class, to
+     # be run at child class instantiation.
+     if not isinstance(bin_by, dict):
+         bin_by = setup_segmentby(bin_by, bin_on, ordered_on, snap_by)
+     if bin_by[KEY_SNAP_BY] is not None:
+         # 'bin_by[KEY_SNAP_BY]' is not None if 'snap_by' is a TimeGrouper.
+         # Otherwise, it can be a DatetimeIndex or a Series.
+         snap_by = bin_by[KEY_SNAP_BY]
+     buffer_bin, buffer_snap = setup_mainbuffer(buffer, snap_by is not None)
+     ordered_on = bin_by[KEY_ORDERED_ON]
+     if ordered_on:
+         # Check 'ordered_on' is an ordered column.
+         if not (
+             (
+                 data[ordered_on].dtype == DTYPE_DATETIME64
+                 and (data[ordered_on].diff().iloc[1:] >= Timedelta(0)).all()
+             )
+             or (data[ordered_on].dtype != DTYPE_DATETIME64 and (data[ordered_on].diff().iloc[1:] >= 0).all())
+         ):
+             raise ValueError(
+                 f"column '{ordered_on}' is not ordered. It has to be for "
+                 "'cumsegagg' to operate faultlessly.",
+             )
+     on = data.loc[:, bin_by[KEY_ON_COLS]]
+     # 'bin_by' binning.
+     (
+         next_chunk_starts,
+         bin_labels,
+         n_null_bins,
+         bin_closed,
+         bin_ends,
+         unknown_last_bin_end,
+     ) = bin_by[KEY_BIN_BY](on=on, buffer=buffer_bin)
+     # Check consistency of 'bin_by' results.
+     # TODO: consider transitioning 'bin_by' and 'snap_by' into a class.
+     # Integrate below checks within a template class.
+     # Some checks may probably be managed at class instantiation.
+     # Others at runtime.
+     if bin_closed != LEFT and bin_closed != RIGHT:
+         raise ValueError(f"'bin_closed' has to be set either to '{LEFT}' or to '{RIGHT}'.")
+     if not isinstance(bin_labels, Series):
+         # Because `iloc` is used afterwards, `bin_labels` has to be a
+         # pandas Series.
+         raise TypeError("'bin_labels' has to be a pandas Series.")
+     n_bins = len(next_chunk_starts)
+     if n_bins != len(bin_labels):
+         raise ValueError("'next_chunk_starts' and 'chunk_labels' have to be of the same size.")
+     if n_bins != len(bin_ends):
+         raise ValueError("'next_chunk_starts' and 'chunk_ends' have to be of the same size.")
+     if isinstance(buffer, dict) and next_chunk_starts[-1] != len(data):
+         raise ValueError(
+             "series of bins have to cover the full length of 'data'. "
+             f"But last bin ends at row {next_chunk_starts[-1]} "
+             f"excluded, while size of data is {len(data)}.",
+         )
+     if buffer is not None:
+         # A buffer that is not 'None' means a restart is expected.
+         if n_bins > 1 and next_chunk_starts[-2] == len(on):
+             # In case a user-provided 'bin_by()' Callable is used, check if
+             # there are empty trailing bins. If there are, and restarts are
+             # expected (use of 'buffer'), then raise an error; this is not
+             # allowed, as it would lead to wrong results in 'jcumsegagg()'.
+             raise ValueError(
+                 "there is at least one empty trailing bin. "
+                 "This is not possible if planning to restart on new "
+                 "data in a next iteration.",
+             )
+         if KEY_LAST_BIN_LABEL in buffer and buffer[KEY_LAST_BIN_LABEL] != bin_labels.iloc[0]:
+             # When using snapshots, and in case of multiple calls, check
+             # that the label of the last bin (previous iteration) is the
+             # same as the label of the first bin (current iteration).
+             raise ValueError(
+                 f"not possible to have label '{buffer[KEY_LAST_BIN_LABEL]}' "
+                 "of last bin at previous iteration different than label "
+                 f"'{bin_labels.iloc[0]}' of first bin at current iteration.",
+             )
+     if snap_by is not None:
+         # Define points of observation.
+         (next_snap_starts, snap_labels, n_max_null_snaps, _, snap_ends, _) = by_scale(
+             on=data.loc[:, ordered_on],
+             by=snap_by,
+             closed=bin_closed,
+             buffer=buffer_snap,
+         )
+         # Consolidate 'next_snap_starts' into 'next_chunk_starts'.
+         # If bins are left-closed, the end of the last bin can possibly be
+         # unknown yet.
+         # If a snapshot (observation point) is also set at end of data
+         # (a snapshot position is always known, because it is either
+         # derived from a pandas TimeGrouper, or an iterable of ordered
+         # values), then 'mergesort()' cannot sort them one to the other
+         # (end of last bin with last snapshot).
+         # In this case, we force the bin end to be after the last snapshot.
+         # The logic is that we want both to know the last bin and last
+         # snapshot while this last bin is in-progress.
+         # Having the bin end before the snapshot would on the opposite
+         # reset the data and the resulting snapshot would be a null one.
+         next_chunk_starts, bin_indices = mergesort(
+             labels=(next_snap_starts, next_chunk_starts),
+             keys=(snap_ends, bin_ends),
+             force_last_from_second=unknown_last_bin_end,
+         )
+         # Take indices of 'next_chunk_starts' corresponding to bins that
+         # are followed right after by a snapshot.
+         # ('append=len(next_chunk_starts)' in 'ndiff()' allows to simulate
+         # a configuration in which the last index in 'next_chunk_starts'
+         # is that of a bin, hence to detect if a snapshot is after the
+         # actual (real) last bin. Without it, a snapshot after the last
+         # bin would not be detected and if needed, accounted for.)
+         indices_of_bins_followed_by_a_snap = bin_indices[
+             nonzero(ndiff(bin_indices, append=len(next_chunk_starts)) - 1)[0]
+         ]
+         # Check if the 'next_chunk_starts' for these bins equal that of the
+         # snapshot that follows. If yes, then those are potential
+         # additional null snapshots.
+         n_max_null_snaps += len(
+             nonzero(
+                 (
+                     next_chunk_starts[indices_of_bins_followed_by_a_snap]
+                     - next_chunk_starts[indices_of_bins_followed_by_a_snap + 1]
+                 )
+                 == 0,
+             )[0],
+         )
+     else:
+         bin_indices = NULL_INT64_1D_ARRAY
+         snap_labels = None
+         n_max_null_snaps = 0
+     # Keep track of last bin labels for checking at next iteration.
+     # Check is managed at upper level in `cumsegagg`.
+     if buffer is not None:
+         buffer[KEY_LAST_BIN_LABEL] = bin_labels.iloc[-1]
+     return (
+         next_chunk_starts,
+         bin_indices,
+         bin_labels,
+         n_null_bins,
+         snap_labels,
+         n_max_null_snaps,
+     )
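An end-to-end sketch combining hourly bins with 30-minute snapshots, assuming `segmentby` is importable from oups.stateful_ops.aggstream.segmentby (hypothetical column names and frequencies):

    from pandas import DataFrame, Grouper, date_range

    data = DataFrame(
        {"ts": date_range("2024-01-01", periods=6, freq="25min"), "qty": range(6)},
    )
    (next_chunk_starts, bin_indices, bin_labels, n_null_bins,
     snap_labels, n_max_null_snaps) = segmentby(
        data,
        bin_by=Grouper(freq="1h", key="ts", closed="left", label="left"),
        snap_by=Grouper(freq="30min", key="ts"),
    )
    # 'next_chunk_starts' interleaves bin and snapshot boundaries;
    # 'bin_indices' tells which entries are bins (the rest are snapshots).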