oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/stateful_ops/aggstream/cumsegagg.py
@@ -0,0 +1,580 @@
#!/usr/bin/env python3
"""
Created on Wed Dec 4 21:30:00 2021.

@author: pierrot

"""
from collections.abc import Callable

from numpy import array
from numpy import dtype
from numpy import full
from numpy import isin as nisin
from numpy import nan as nNaN
from numpy import zeros
from pandas import NA as pNA
from pandas import DataFrame
from pandas import DatetimeIndex
from pandas import Int64Dtype
from pandas import NaT as pNaT
from pandas import Series
from pandas.core.resample import TimeGrouper

from oups.stateful_ops.aggstream.jcumsegagg import AGG_FUNCS
from oups.stateful_ops.aggstream.jcumsegagg import jcsagg
from oups.stateful_ops.aggstream.segmentby import KEY_BIN_ON
from oups.stateful_ops.aggstream.segmentby import KEY_LAST_BIN_LABEL
from oups.stateful_ops.aggstream.segmentby import KEY_ORDERED_ON
from oups.stateful_ops.aggstream.segmentby import KEY_SNAP_BY
from oups.stateful_ops.aggstream.segmentby import segmentby
from oups.stateful_ops.aggstream.segmentby import setup_segmentby


# Some constants.
DTYPE_INT64 = dtype("int64")
DTYPE_FLOAT64 = dtype("float64")
DTYPE_DATETIME64 = dtype("datetime64[ns]")
DTYPE_NULLABLE_INT64 = Int64Dtype()
NULL_INT64_1D_ARRAY = zeros(0, DTYPE_INT64)
NULL_INT64_2D_ARRAY = NULL_INT64_1D_ARRAY.reshape(0, 0)
# Null values.
NULL_DICT = {DTYPE_INT64: pNA, DTYPE_FLOAT64: nNaN, DTYPE_DATETIME64: pNaT}
# Key for buffer.
KEY_LAST_CHUNK_RES = "last_chunk_res"

def setup_cumsegagg(
    agg: dict[str, tuple[str, str]],
    data_dtype: dict[str, dtype],
) -> dict[dtype, tuple[list[str], list[str], tuple, int]]:
    """
    Construct chaingroupby aggregation configuration.

    Parameters
    ----------
    agg : dict[str, tuple[str, str]]
        Dict specifying aggregation in the form
        ``'out_col_name' : ('in_col_name', 'function_name')``.
    data_dtype : dict[str, dtype]
        Dict specifying the dtype of each column name. Typically obtained
        with ``df.dtypes.to_dict()``.

    Returns
    -------
    dict[dtype,
         tuple[list[str],
               list[str],
               tuple[tuple[Callable, ndarray[int64], ndarray[int64]]],
               int,
              ],
        ]
        Dict 'cgb_agg_cfg', keyed by dtype, whose values gather:
        - 'cols_name_in_data', list[str]
            Column names in input data with this dtype.
        - 'cols_name_in_res', list[str]
            Expected column names in the aggregation result.
        - 'aggs', tuple[tuple[Callable, ndarray[int64], ndarray[int64]]]
            Tuple of tuples, one inner tuple per aggregation function.
            Each contains 3 items:
            - a Callable, the aggregation function,
            - a 1st 1d numpy array with indices of the columns in 'data' to
              which the aggregation function has to be applied,
            - a 2nd 1d numpy array with indices of the columns in 'res' to
              which aggregation results are recorded.
        - 'n_cols', int
            Total number of columns in 'res' (summed over all aggregation
            functions).

    """
    cgb_agg_cfg = {}
    # Step 1.
    for out_col, (in_col, func) in agg.items():
        if in_col not in data_dtype:
            raise ValueError(f"column '{in_col}' does not exist in input data.")
        else:
            dtype_ = data_dtype[in_col]
        try:
            tup = cgb_agg_cfg[dtype_]
        except KeyError:
            cgb_agg_cfg[dtype_] = [
                [],  # 'cols_name_in_data'
                [],  # 'cols_name_in_res'
                [],  # 'agg_func_idx' (temporary)
                [],  # 'cols_data' (temporary)
                [],  # 'cols_res' (temporary)
            ]
            tup = cgb_agg_cfg[dtype_]
        # 'in_col' / name / 1d list.
        cols_name_in_data = tup[0]
        if in_col in cols_name_in_data:
            in_col_idx = cols_name_in_data.index(in_col)
        else:
            in_col_idx = len(cols_name_in_data)
            cols_name_in_data.append(in_col)
        # 'out_col' / name / 1d list.
        cols_name_in_res = tup[1]
        out_col_idx = len(cols_name_in_res)
        cols_name_in_res.append(out_col)
        # Set list of agg functions (temporary buffer).
        agg_funcs = tup[2]
        try:
            if (agg_func := AGG_FUNCS[func]) in agg_funcs:
                func_idx = agg_funcs.index(agg_func)
            else:
                func_idx = len(agg_funcs)
                agg_funcs.append(AGG_FUNCS[func])
        except KeyError:
            raise ValueError(f"`{func}` aggregation function is unknown.")
        # 'cols_idx'
        cols_data = tup[3]
        cols_res = tup[4]
        if len(cols_data) <= func_idx:
            # Create list for this aggregation function.
            cols_data.append([in_col_idx])
            cols_res.append([out_col_idx])
        else:
            # Add this column index for this aggregation function.
            cols_data[func_idx].append(in_col_idx)
            cols_res[func_idx].append(out_col_idx)
    # Step 2.
    for conf in cgb_agg_cfg.values():
        # Remove 'agg_funcs' & 'cols_idx'.
        agg_funcs = conf.pop(2)
        cols_data = conf.pop(2)
        cols_res = conf.pop(2)
        n_cols = sum(map(len, cols_res))
        # Add back 'aggs', as tuple of tuple.
        conf.append(tuple(zip(agg_funcs, map(array, cols_data), map(array, cols_res), strict=False)))
        # 'n_cols'.
        conf.append(n_cols)
    return cgb_agg_cfg

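# Illustrative sketch, not part of cumsegagg.py: assuming AGG_FUNCS exposes
# 'max' and 'sum' entries, a pandas-style spec such as
#
#     agg = {"high": ("price", "max"), "vol": ("qty", "sum")}
#     data_dtype = {"price": dtype("float64"), "qty": dtype("int64")}
#
# would be regrouped by 'setup_cumsegagg()' per input dtype into
#
#     {dtype("float64"): [["price"], ["high"],
#                         ((AGG_FUNCS["max"], array([0]), array([0])),), 1],
#      dtype("int64"):   [["qty"], ["vol"],
#                         ((AGG_FUNCS["sum"], array([0]), array([0])),), 1]}
#
# i.e. per dtype: column names in 'data', column names in 'res', one
# (function, data column indices, result column indices) tuple per
# aggregation function, and the total number of result columns.
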
def setup_chunk_res(agg: dict[dtype, tuple]) -> DataFrame:
    """
    Initialize one-row DataFrame for storing the first 'chunk_res'.
    """
    chunk_res = {}
    for dtype_, (
        _,
        cols_name_in_res,
        _,
        n_cols,
    ) in agg.items():
        chunk_res_single_dtype = zeros(n_cols, dtype=dtype_)
        chunk_res.update(
            {name: chunk_res_single_dtype[i : i + 1] for i, name in enumerate(cols_name_in_res)},
        )
    return DataFrame(chunk_res, copy=False)

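# Illustrative sketch, not part of cumsegagg.py: with the hypothetical 'agg'
# config from the previous sketch, 'setup_chunk_res(agg)' would return a
# one-row frame
#
#        high  vol
#     0   0.0    0
#
# holding zero-initialized, per-dtype buffers. 'cumsegagg()' carries this
# frame between iterations (via buffer['last_chunk_res']) to hold the
# aggregation results of the last, in-progress bin.
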
def cumsegagg(
    data: DataFrame,
    agg: dict[str, tuple[str, str]] | dict[dtype, tuple[list[str], list[str], tuple, int]],
    bin_by: TimeGrouper | Callable | dict,
    bin_on: str | None = None,
    buffer: dict | None = None,
    ordered_on: str | None = None,
    snap_by: TimeGrouper | Series | DatetimeIndex | None = None,
    error_on_0: bool | None = True,
) -> DataFrame | tuple[DataFrame, DataFrame]:
    """
    Cumulative segmented aggregations, with optional snapshotting.

    In this function, "snapshotting" is understood as the action of making
    isolated observations. When using snapshots, values derived from the
    ``snap_by`` TimeGrouper (or contained in the ``snap_by`` Series) are
    considered the "points of isolated observation".
    At a given point, an observation of the "on-going" segment (aka bin) is
    made. Because segments are contiguous, any row of the dataset falls in a
    segment.

    Parameters
    ----------
    data : DataFrame
        A pandas DataFrame containing the columns over which binning (relying
        on the ``bin_on`` column), aggregation, and optionally snapshotting
        (relying on the column pointed to by 'ordered_on' and optionally by
        ``snap_by.key`` if it is a TimeGrouper) are performed.
        If using snapshots ('snap_by' parameter), then the column pointed to
        by ``snap_by.key`` has to be ordered.
    agg : dict
        Definition of the aggregation.
        If in the form ``dict[str, tuple[str, str]]`` (typically a form
        compatible with pandas aggregation), then it is transformed into the
        2nd form ``dict[dtype, tuple[list[str], list[str], tuple, int]]``.
        - in the form ``dict[str, tuple[str, str]]``
          - keys are ``str``, the requested output column name,
          - values are ``tuple`` with 1st component a ``str`` for the input
            column name, and 2nd component a ``str`` for the aggregation
            function name.

        - the 2nd form is that returned by the function ``setup_cumsegagg``.

    bin_by : TimeGrouper | Callable | dict
        Callable or pandas TimeGrouper to perform binning.
        If a Callable, please see signature requirements in the 'segmentby'
        docstring.
        If a dict, it contains the full setup for conducting the segmentation
        of 'data', as generated by 'setup_segmentby()'.
    bin_on : str | None, default None
        Name of the column in `data` over which the binning operation is
        performed.
        If 'bin_by' is a pandas `TimeGrouper`, its `key` parameter is used
        instead, and 'bin_on' is ignored.
        If not provided while the 'ordered_on' parameter is, then the
        'ordered_on' value is also used as the column name onto which binning
        is performed.
    buffer : dict | None, default None
        Buffer containing data for restarting the binning process with new
        seed data:
        - from the previous segmentation step,
        - from the previous aggregation step.
    ordered_on : str | None
        Name of an existing ordered column in 'data'. When set, it is
        forwarded to the 'bin_by' Callable.
        This parameter is compulsory if 'snap_by' is set. Values derived from
        'snap_by' (either a TimeGrouper or a Series) are compared to
        ``bin_ends``, themselves derived from ``data[ordered_on]``.
    snap_by : TimeGrouper | Series | DatetimeIndex | None, default None
        Values positioning the points of observation, either derived from a
        pandas TimeGrouper, or contained in a pandas Series.
        In case 'snap_by' is a Series, values serve as locations for points
        of observation.
        Additionally, the ``closed`` value defined by 'bin_on' specifies if
        points of observation are included or excluded. As one would expect:
        - if `left`, then values at points of observation are excluded,
        - if `right`, then values at points of observation are included.

    error_on_0 : bool, default True
        By default, check that there is no `0` value (either int or float) in
        aggregation results (bins and snapshots). ``cumsegagg()`` is
        experimental and a `0` value is likely to hint at a bug. If this
        error is raised, the result should be double-checked. Ultimately,
        please report the use case that raises this error, and what the
        expected behavior would be.

    Returns
    -------
    DataFrame | tuple[DataFrame, DataFrame]
        A pandas DataFrame with aggregation results. Its index is composed of
        the bin labels.
        If a tuple, then the first DataFrame is that for the bins, and the
        second that for the snapshots.

    Notes
    -----
    When using snapshots, values derived from ``snap_by`` are considered the
    "points of isolated observation". At such a point, an observation of the
    "on-going" bin is made. In case snapshot(s) are positioned exactly on
    segment end(s), at the same row index in data, the snapshot comes
    "before" the bin.

    When using 'cumsegagg' through the 'chainagg' function (i.e. for chained
    calls to 'cumsegagg') and if setting `bin_by` as a Callable, the
    developer should take care that `bin_by` keeps in the 'buffer' parameter
    all the data needed to:
    - create the correct number of bins that would lie in-between the data
      processed at the previous aggregation iteration and the new data. This
      has to show in the 'next_chunk_starts' array that is returned.
    - appropriately label the first bin:
      - either it is a new bin, different from the last one of the previous
        aggregation iteration. Then the label of the new bin has to be
        different from that of the last one of the previous iteration;
      - or it is the same bin that is continuing. Then the label has to be
        the same. This ensures that when recording the new aggregation
        result, the data from the previous iteration (last bin was
        in-progress, i.e. incomplete) is overwritten.

    Notes on design for allowing 'restart'
    --------------------------------------
    The current implementation may present limitations stemming from design
    choices that have not been challenged so far.
    To minimize memory footprint, the segmentation step is expected to
    provide start indices of the next bin (as opposed to providing the status
    of each row of the input data individually).
    Because, historically, the aggregation function is expected to use all
    data so as to provide the actual status of the last, in-progress bin, its
    end is de facto the end of the input data.
    Because of this, the internal flag 'preserve_res' is always set to
    ``False`` when reaching the end of input data.
    This is a limitation. It should be ``False`` only to mark the actual end
    of the bins. As a result, this internal flag 'preserve_res' cannot be
    output for reuse in a next restart step.
    An option to circumvent this is to use snapshots instead of bins as the
    medium to output aggregation results for the last, in-progress bin.
    This option has not been implemented.

    In the current implementation, the limitation presented above is
    circumvented by assuming that the last bin is never empty, that is to say
    that the 'chunk_res' parameter, which contains aggregation results for
    the last, in-progress bin, always has relevant results to preserve. This
    is true as long as the last, in-progress bin is not empty.
    Were this last bin empty, then 'chunk_res' would still contain
    aggregation results for the last non-empty bin it was used for. In this
    case, we would need to check whether the last row in input data matches
    the end of a bin or not, so as to preserve 'chunk_res' or not.
    Now, if we assume the last, in-progress bin is not empty, we can wait for
    the restart to check if the bin has ended before the start of input data,
    and then close this bin, which was the last, in-progress bin at the
    previous iteration.
    To bring more freedom to this implementation, a 'preserve_res' flag is
    expected from the segmentation phase. This flag is set ``False`` to allow
    restarting right on a new, next bin, if at the previous iteration the
    last bin was complete.
    In the current implementation, the limitation is thus that the last bin
    from the segmentation cannot be empty. All empty trailing bins have to be
    trimmed, otherwise an exception is raised.

    The following thoughts have been investigated in the current
    implementation.
    - **segmentation step ('segmentby()')**
      -1 From this step, 'next_chunk_starts' should not end with an empty
         bin, as mentioned above. A complementary thought is that when
         restarting after several empty bins, it *may* be that some new data
         actually falls in these bins, which were empty at the previous
         iteration.
         A check is then managed in 'segmentby()'. All empty bins at the end
         of 'next_chunk_starts' have to be trimmed or an exception will be
         raised.
      -2 If restarting, bins produced by the user-defined 'bin_by()' have to
         cover the full size of data, meaning the last item in
         'next_chunk_starts' is equal to the length of data.
         As mentioned above, this rationale is historical.
         Additionally, it *may* be that if no bin goes till the end of data,
         then we are not sure the next bin (at the next iteration) will not
         lie within these last values in data at the current iteration.
         A check is then performed and an exception is raised if this
         situation occurs.
         This requirement is not applied to 'snap_by' (in case a Series is
         used). Because it is applied to 'bin_by', 'chunk_res' will contain
         aggregation results over the last values in data; they are not lost.
         In the existing 'snap_by' (either by TimeGrouper or by Series),
         - either, if a TimeGrouper, the last snapshot ends after the end of
           data,
         - or, if a Series, at restart, if the 2nd snapshot ends before the
           last value in data at the previous iteration, then an exception
           is raised.

      -3 At the next iteration, the first bin has to be the continuation of
         the last one from the previous iteration. A check is made using the
         bin label.
         This is the case even if the bin is empty. Thus, if it is preceded /
         followed by empty snapshots, the content of these snapshots will be
         set appropriately. For empty snapshots that precede this bin end,
         past results are forwarded. For empty snapshots that follow this bin
         end, this results in empty snapshots.

    - **cumulative segmented aggregation ('cumsegagg()')**
      -1 The 'preserve_res' parameter is used to indicate if aggregation
         calculations start from scratch (first iteration) or reuse past
         aggregation results (following iterations).
         Aggregation results from the last, in-progress bin can then be
         forwarded.

    """
    # TODO: create a test case with restart, that has no snapshot in 1st
    # iteration (with 'by_scale' using a Series). Target is to check that even
    # without snapshot in 1st iteration, an empty 'snap_res' gets returned
    # nonetheless and that concatenation can be managed with subsequent
    # 'snap_res' from next iterations.
    # TODO: make possible to pass several 'bin_by' (with same 'snap_by'):
    # - in segmentby, has to produce an array providing for each 'bin_by' when
    #   the next bin start. Modify the existing 'mergesort' function for this.
    # - in jcumsegagg, change logic to have aggregation function without
    #   'preserve_res'. Instead, create companion function that can be called
    #   when storing results, achieving reconciliation of existing results
    #   with new results (if not a new bin), or restarting the aggregation
    #   if new results.
    len_data = len(data)
    if not len_data:
        # 'data' is empty. Simply return.
        return
    if not isinstance(next(iter(agg.values())), list):
        # Reshape aggregation definition.
        agg = setup_cumsegagg(agg, data.dtypes.to_dict())
    if buffer is None:
        # Single run agg.
        preserve_res = False
    else:
        # Agg iteration with possible restart.
        # Detection of 1st iteration is managed below with test if a new bin
        # is started.
        preserve_res = True
        prev_last_bin_label = buffer[KEY_LAST_BIN_LABEL] if KEY_LAST_BIN_LABEL in buffer else None
    if not isinstance(bin_by, dict):
        bin_by = setup_segmentby(bin_by, bin_on, ordered_on, snap_by)
    # Following 'setup_segmentby', parameters 'ordered_on', 'bin_on' have to
    # be retrieved from it.
    ordered_on = bin_by[KEY_ORDERED_ON]
    # 'bin_by' as a dict may contain 'snap_by' if it is a TimeGrouper.
    if bin_by[KEY_SNAP_BY] is not None:
        # 'bin_by[KEY_SNAP_BY]' is not None if 'snap_by' is a TimeGrouper.
        # Otherwise, it can be a DatetimeIndex or a Series.
        snap_by = bin_by[KEY_SNAP_BY]
    # In case of restart, 'n_max_null_bins' is a max because 1st null bin may
    # well be continuation of last in-progress bin, without result in current
    # iteration, but with results from previous iteration.
    (
        next_chunk_starts,
        bin_indices,
        bin_labels,
        n_max_null_bins,
        snap_labels,
        n_max_null_snaps,
    ) = segmentby(
        data=data,
        bin_by=bin_by,
        snap_by=snap_by,
        buffer=buffer,
    )
    if preserve_res and prev_last_bin_label != bin_labels.iloc[0]:
        # A new bin has been started. Do not preserve past results.
        # This behavior is only possible in case no snapshot is used.
        preserve_res = False
    # Initiate dict of result columns.
    # Setup 'chunk_res'.
    chunk_res_prev = (
        buffer[KEY_LAST_CHUNK_RES]
        if isinstance(buffer, dict) and KEY_LAST_CHUNK_RES in buffer
        else setup_chunk_res(agg)
    )
    chunk_res = {}
    # Setup 'bin_res'.
    n_bins = len(bin_labels)
    null_bin_indices = full(n_max_null_bins, -1, dtype=DTYPE_INT64)
    bin_res = {}
    # Setup 'snap_res', & preserve_res
    snap_res = {}
    if snap_by is None:
        snap_res_single_dtype = NULL_INT64_2D_ARRAY
        null_snap_indices = NULL_INT64_1D_ARRAY
    else:
        # Initialize 'null_snap_indices' to -1, to identify easily those which
        # are not set. They will be removed in a post-processing step.
        n_snaps = len(snap_labels)
        null_snap_indices = full(n_max_null_snaps, -1, dtype=DTYPE_INT64)
    # Loop.
    for dtype_, (
        cols_name_in_data,
        cols_name_in_res,
        aggs,
        n_cols,
    ) in agg.items():
        data_single_dtype = (
            data.loc[:, cols_name_in_data].to_numpy(copy=False)
            if len(cols_name_in_data) > 1
            else data.loc[:, cols_name_in_data].to_numpy(copy=False).reshape(-1, 1)
        )
        # Setup 'chunk_res_single_dtype'.
        chunk_res_single_dtype = chunk_res_prev.loc[:, cols_name_in_res].to_numpy(copy=False).reshape(n_cols)
        chunk_res.update(
            {name: chunk_res_single_dtype[i : i + 1] for i, name in enumerate(cols_name_in_res)},
        )
        # Setup 'bin_res_single_dtype'.
        bin_res_single_dtype = zeros((n_bins, n_cols), dtype=dtype_)
        bin_res.update(
            {name: bin_res_single_dtype[:, i] for i, name in enumerate(cols_name_in_res)},
        )
        # Setup 'snap_res_single_dtype'.
        if snap_by is not None:
            snap_res_single_dtype = zeros((n_snaps, n_cols), dtype=dtype_)
            snap_res.update(
                {name: snap_res_single_dtype[:, i] for i, name in enumerate(cols_name_in_res)},
            )
        if dtype_ == DTYPE_DATETIME64:
            data_single_dtype = data_single_dtype.view(DTYPE_INT64)
            bin_res_single_dtype = bin_res_single_dtype.view(DTYPE_INT64)
            chunk_res_single_dtype = chunk_res_single_dtype.view(DTYPE_INT64)
            if snap_by is not None:
                snap_res_single_dtype = snap_res_single_dtype.view(DTYPE_INT64)
        # 'data' is a numpy array, with columns in 'expected order',
        # as defined in 'cols_data' & 'cols_res' embedded in 'aggs'.
        # TODO: if extending 'jcsagg()' to process last chunk in data (even if
        # not a bin or a snap, so as to make possible that bins really only end
        # on end of bins, and that end of 'data' is not systematically a bin
        # end as well), then output from 'jcsagg()' 'preserve_res' parameter.
        # When inputting it for the next iteration, 'preserve_res' parameter
        # is then ``not first_bin_is_new and preserve_res``.
        # With this feature, empty trailing bins are then possible to manage.
        jcsagg(
            data_single_dtype,  # 2d
            aggs,
            next_chunk_starts,  # 1d
            bin_indices,  # 1d
            preserve_res,
            chunk_res_single_dtype,
            bin_res_single_dtype,  # 2d
            snap_res_single_dtype,  # 2d
            null_bin_indices,  # 1d
            null_snap_indices,  # 1d
        )
    # Record last aggregation results for a restart.
    if isinstance(buffer, dict):
        buffer[KEY_LAST_CHUNK_RES] = DataFrame(chunk_res, copy=False)
    # Assemble 'bin_res' as a pandas DataFrame.
    bin_res = DataFrame(bin_res, index=bin_labels, copy=False)
    bin_res.index.name = ordered_on if ordered_on else bin_by[KEY_BIN_ON]
    if DTYPE_INT64 in agg:
        # As of pandas 1.5.3, use "Int64" dtype to work with nullable 'int'.
        # (it is a pandas dtype, not a numpy one, which is why it is set only
        # in pandas results, and not numpy inputs to 'cumsegagg()').
        # Force 'int64' to pandas nullable 'Int64', even if there is no null
        # value in results at the moment. Indeed null values can appear in a
        # later aggregation step (use of 'restart' feature).
        bin_res[agg[DTYPE_INT64][1]] = bin_res[agg[DTYPE_INT64][1]].astype(
            DTYPE_NULLABLE_INT64,
        )
    # Set null values.
    if n_max_null_bins != 0:
        null_bin_labels = bin_labels.iloc[null_bin_indices[~nisin(null_bin_indices, -1)]]
        if not null_bin_labels.empty:
            for dtype_, (
                _,
                cols_name_in_res,
                _,
                _,
            ) in agg.items():
                bin_res.loc[null_bin_labels, cols_name_in_res] = NULL_DICT[dtype_]
    if snap_by is not None:
        snap_res = DataFrame(snap_res, index=snap_labels, copy=False)
        snap_res.index.name = ordered_on
        if DTYPE_INT64 in agg:
            # As of pandas 1.5.3, use "Int64" dtype to work with nullable 'int'.
            # (It is a pandas dtype, not a numpy one, which is why it is set
            # only in pandas results, and not numpy inputs to 'cumsegagg()').
            # Force 'int64' to pandas nullable 'Int64', even if there is no
            # null value in results at the moment. Indeed null values can
            # appear in a later aggregation step (use of 'restart' feature).
            snap_res[agg[DTYPE_INT64][1]] = snap_res[agg[DTYPE_INT64][1]].astype(
                DTYPE_NULLABLE_INT64,
            )
        # Set null values.
        if n_max_null_snaps != 0:
            # Remove -1 indices.
            # TODO: it is not necessary to re-create an array without the -1.
            # Only indices above 0 should be used.
            # Alternatively, output number of empty snaps from 'jcumsegagg()'?
            null_snap_labels = snap_labels[null_snap_indices[~nisin(null_snap_indices, -1)]]
            if not null_snap_labels.empty:
                for dtype_, (
                    _,
                    cols_name_in_res,
                    _,
                    _,
                ) in agg.items():
                    snap_res.loc[null_snap_labels, cols_name_in_res] = NULL_DICT[dtype_]
    if error_on_0:
        if snap_by is not None and snap_res.eq(0).any().any():
            raise ValueError(
                "at least one null value exists in 'snap_res' which is likely to hint a bug.",
            )
        if bin_res.eq(0).any().any():
            raise ValueError(
                "at least one null value exists in 'bin_res' which is likely to hint a bug.",
            )
    if snap_by is not None:
        return bin_res, snap_res
    else:
        return bin_res
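
As a minimal usage sketch (not taken from the package's documentation), cumsegagg() might be driven as below. The column names, the "max" function name (assumed to be present in AGG_FUNCS), and the hourly/half-hourly frequencies are illustrative assumptions, not values prescribed by the package.

from pandas import DataFrame, Grouper, date_range

from oups.stateful_ops.aggstream.cumsegagg import cumsegagg

# Toy ordered data: one row per event, hypothetical column names.
data = DataFrame(
    {
        "ts": date_range("2024-01-01", periods=6, freq="20min"),
        "price": [10.0, 11.0, 9.5, 12.0, 12.5, 11.5],
    },
)
buffer = {}  # Carries segmentation & aggregation state between calls.
bin_res, snap_res = cumsegagg(
    data=data,
    agg={"high": ("price", "max")},  # assumes "max" exists in AGG_FUNCS
    bin_by=Grouper(key="ts", freq="1h", closed="left", label="left"),
    ordered_on="ts",
    snap_by=Grouper(key="ts", freq="30min", closed="right", label="right"),
    buffer=buffer,
)
# A later call with the next chunk of ordered data and the same 'buffer'
# restarts the aggregation where it left off, per the restart notes in the
# docstring above.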