oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/stateful_ops/aggstream/aggstream.py
@@ -0,0 +1,1524 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Wed Nov 15 21:30:00 2023.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+ from collections import ChainMap
9
+ from collections import namedtuple
10
+ from collections.abc import Callable
11
+ from collections.abc import Iterable
12
+ from enum import Enum
13
+ from inspect import Parameter
14
+ from inspect import signature
15
+ from itertools import chain
16
+ from multiprocessing import cpu_count
17
+ from typing import Any
18
+
19
+ from joblib import Parallel
20
+ from joblib import delayed
21
+ from numpy import ones
22
+ from pandas import DataFrame
23
+ from pandas import DatetimeIndex
24
+ from pandas import Series
25
+ from pandas import Timestamp
26
+ from pandas import concat
27
+ from pandas.core.resample import TimeGrouper
28
+
29
+ from oups.defines import KEY_DUPLICATES_ON
30
+ from oups.defines import KEY_ORDERED_ON
31
+ from oups.defines import KEY_ROW_GROUP_TARGET_SIZE
32
+ from oups.stateful_ops.aggstream.cumsegagg import cumsegagg
33
+ from oups.stateful_ops.aggstream.cumsegagg import setup_cumsegagg
34
+ from oups.stateful_ops.aggstream.jcumsegagg import FIRST
35
+ from oups.stateful_ops.aggstream.jcumsegagg import LAST
36
+ from oups.stateful_ops.aggstream.jcumsegagg import MAX
37
+ from oups.stateful_ops.aggstream.jcumsegagg import MIN
38
+ from oups.stateful_ops.aggstream.jcumsegagg import SUM
39
+ from oups.stateful_ops.aggstream.segmentby import KEY_BIN_BY
40
+ from oups.stateful_ops.aggstream.segmentby import KEY_BIN_ON
41
+ from oups.stateful_ops.aggstream.segmentby import KEY_SNAP_BY
42
+ from oups.stateful_ops.aggstream.segmentby import setup_segmentby
43
+ from oups.stateful_ops.aggstream.utils import dataframe_filter
44
+ from oups.store import OrderedParquetDataset
45
+ from oups.store import Store
46
+ from oups.store import write
47
+
48
+
49
+ # Aggregation functions.
50
+ ACCEPTED_AGG_FUNC = {FIRST, LAST, MIN, MAX, SUM}
51
+ # List of keys.
52
+ KEY_AGGSTREAM = "aggstream"
53
+ KEY_PRE = "pre"
54
+ KEY_PRE_BUFFER = "pre_buffer"
55
+ KEY_SEGAGG_BUFFER = "segagg_buffer"
56
+ KEY_POST_BUFFER = "post_buffer"
57
+ KEY_BIN_RES_BUFFER = "bin_res_buffer"
58
+ KEY_BIN_ON_OUT = "bin_on_out"
59
+ KEY_SNAP_RES_BUFFER = "snap_res_buffer"
60
+ KEY_FILTERS = "filters"
61
+ KEY_RESTART_INDEX = "restart_index"
62
+ KEY_BIN_RES = "bin_res"
63
+ KEY_SNAP_RES = "snap_res"
64
+ KEY_WRITE_CONFIG = "write_config"
65
+ KEY_AGG_IN_MEMORY_SIZE = "agg_in_memory_size"
66
+ KEY_MAX_IN_MEMORY_SIZE_B = "max_in_memory_size_b"
67
+ KEY_MAX_IN_MEMORY_SIZE_MB = "max_in_memory_size"
68
+ KEY_AGG_RES_TYPE = "agg_res_type"
69
+ KEY_SEG_CONFIG = "seg_config"
70
+ # Filters
71
+ NO_FILTER_ID = "_"
72
+ # List of valid parameters for 'key_conf_in'
73
+ KEY_AGG = "agg"
74
+ KEY_POST = "post"
75
+ # 'bin_by' is a compulsory parameter, and a specific check is made for it.
76
+ # It is not added in 'KEY_CONF_IN_PARAMS'.
77
+ WRITE_PARAMS = {
78
+ name
79
+ for name, p in signature(write).parameters.items()
80
+ if p.kind in (Parameter.POSITIONAL_OR_KEYWORD, Parameter.KEYWORD_ONLY)
81
+ }
82
+ KEY_CONF_IN_PARAMS = {
83
+ KEY_BIN_ON,
84
+ KEY_SNAP_BY,
85
+ KEY_AGG,
86
+ KEY_POST,
87
+ KEY_MAX_IN_MEMORY_SIZE_B,
88
+ } | WRITE_PARAMS
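# WRITE_PARAMS above is built by introspecting the signature of 'oups.store.write',
# keeping every positional-or-keyword and keyword-only parameter name. A minimal,
# self-contained sketch of the same pattern, using a made-up 'toy_write' function:
from inspect import Parameter, signature

def toy_write(df, *, ordered_on=None, duplicates_on=None, **extra):
    """Hypothetical stand-in for 'oups.store.write'."""

accepted = {
    name
    for name, p in signature(toy_write).parameters.items()
    if p.kind in (Parameter.POSITIONAL_OR_KEYWORD, Parameter.KEYWORD_ONLY)
}
# accepted == {'df', 'ordered_on', 'duplicates_on'}; '**extra' (VAR_KEYWORD) is excluded.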
89
+ # Parallel jobs, at most using 75% of available cpus.
90
+ KEY_MAX_P_JOBS = max(int(cpu_count() * 3 / 4), 1)
91
+ # Max in memory size of result dataframes allowed before writing to disk.
92
+ # Provided in bytes.
93
+ MEGABYTES_TO_BYTES = 1048576
94
+ MAX_IN_MEMORY_SIZE_MB = 140
95
+ MAX_IN_MEMORY_SIZE_B = MAX_IN_MEMORY_SIZE_MB * MEGABYTES_TO_BYTES
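# For reference, with the constants above: 140 MB * 1_048_576 = 146_800_640 bytes.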
96
+
97
+
98
+ FilterApp = namedtuple("FilterApp", "keys n_jobs")
99
+ AggResType = Enum("AggResType", ["BINS", "SNAPS", "BOTH"])
100
+
101
+
102
+ def _is_aggstream_result(handle: OrderedParquetDataset) -> bool:
103
+ """
104
+ Check whether the input dataset was produced by aggstream.
105
+
106
+ Parameters
107
+ ----------
108
+ handle : OrderedParquetDataset
109
+ Dataset handle to check.
110
+
111
+ Returns
112
+ -------
113
+ bool
114
+ True if parquet file contains metadata as produced by
115
+ ``oups.aggstream``, which confirms this dataset has been produced with
116
+ this latter function.
117
+
118
+ """
119
+ return KEY_AGGSTREAM in handle.key_value_metadata
120
+
121
+
122
+ def _init_keys_config(
123
+ seed_ordered_on: str,
124
+ keys_config: dict,
125
+ keys_default: dict,
126
+ ):
127
+ """
128
+ Consolidate keys' configuration into ``keys_config`` and ``agg_pd``.
129
+
130
+ Parameters
131
+ ----------
132
+ seed_ordered_on : str
133
+ Name of the column with respect to which seed is in ascending order.
134
+ This parameter is used for seed segmentation. It is also used as
135
+ default name of the column with respect to which aggregation results are
136
+ in ascending order, if not provided in ``keys`` parameter.
137
+ keys_config : dict
138
+ Unconsolidated keys config.
139
+ keys_default : dict
140
+ Default values for missing parameters in ``keys_config``.
141
+
142
+ Other Parameters
143
+ ----------------
144
+ kwargs : dict
145
+ Other user parameters that will be set into ``keys_config``.
146
+
147
+ Returns
148
+ -------
149
+ The following AggStream's parameters are initialized with this function.
150
+
151
+ - ``keys_config``, dict of keys' config in the form:
152
+ ``{key: {'bin_on_out' : str, name in aggregation results for column
153
+ with bin ids.
154
+ 'seg_config' : dict specifying the segmentation config,
155
+ 'post' : Callable or None,
156
+ 'max_in_memory_size_b': int, max allowed result in memory size,
157
+ in bytes
158
+ 'write_config' : {'ordered_on' : str,
159
+ 'duplicates_on' : str or list,
160
+ 'max_row_group_size' : str | int | tuple
161
+ ...
162
+ },
163
+ 'agg_res_type' : AggResType, either 'BINS', 'SNAPS', or 'BOTH'.
164
+ },
165
+ }``
166
+ - ``self.agg_pd``, dict, specifying per key the aggregation
167
+ configuration.
168
+
169
+ """
170
+ consolidated_keys_config = {}
171
+ agg_pd = {}
172
+ for key, key_conf_in in keys_config.items():
173
+ # Parameters in 'key_conf_in' take precedence over those in
174
+ # 'keys_default'. Additionally, with this step, 'key_conf_in' is a
175
+ # deep copy, and when parameters are popped, it does not affect
176
+ # the initial 'key_conf_in'.
177
+ try:
178
+ bin_by = key_conf_in.pop(KEY_BIN_BY)
179
+ except KeyError:
180
+ raise ValueError(f"'{KEY_BIN_BY}' parameter is missing for key '{key}'.")
181
+ if KEY_MAX_IN_MEMORY_SIZE_MB in key_conf_in:
182
+ # Switch from MB to B.
183
+ key_conf_in[KEY_MAX_IN_MEMORY_SIZE_B] = int(
184
+ key_conf_in.pop(KEY_MAX_IN_MEMORY_SIZE_MB) * MEGABYTES_TO_BYTES,
185
+ )
186
+ key_conf_in = keys_default | key_conf_in
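# Illustration of the precedence rule above (Python 3.9+ dict union, right operand wins):
#   {"post": None, "agg": agg_a} | {"agg": agg_b}  ->  {"post": None, "agg": agg_b}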
187
+ # Check parameters in 'key_conf_in' are valid ones.
188
+ for param in key_conf_in:
189
+ if param not in KEY_CONF_IN_PARAMS:
190
+ raise ValueError(
191
+ f"'{param}' not a valid parameters in '{key}' aggregation config.",
192
+ )
193
+ bin_on = key_conf_in.pop(KEY_BIN_ON, None)
194
+ agg_pd[key] = key_conf_in.pop(KEY_AGG)
195
+
196
+ if isinstance(bin_on, tuple):
197
+ # 'bin_on_out' is name of column containing group keys in
198
+ # 'agg_res'. Setting of 'bin_on_out' is an 'AggStream'
199
+ # task, not a 'cumsegagg' one. This is because this
200
+ # parameter then clarifies how to set 'duplicates_on'
201
+ # parameter for 'oups.writer.write' which is also part of
202
+ # 'AggStream' perimeter.
203
+ bin_on, bin_on_out = bin_on
204
+ else:
205
+ bin_on_out = None
206
+ # Setup 'seg_conf', 'bin_on_out' & 'agg_pd'.
207
+ try:
208
+ seg_config = setup_segmentby(
209
+ bin_by=bin_by,
210
+ bin_on=bin_on,
211
+ ordered_on=seed_ordered_on,
212
+ snap_by=key_conf_in.pop(KEY_SNAP_BY),
213
+ )
214
+ except Exception:
215
+ raise ValueError(f"exception raised for key '{key}'")
216
+ if bin_on := seg_config[KEY_BIN_ON]:
217
+ if bin_on_out is None:
218
+ # It may be that 'bin_on' value has been modified in
219
+ # 'setup_segmentby'. If 'bin_on_out' has not been set
220
+ # previously, then set it to this possibly new value of
221
+ # 'bin_on'.
222
+ bin_on_out = bin_on
223
+ # 'agg' is in the form:
224
+ # {"output_col":("input_col", "agg_function_name")}
225
+ if bin_on_out in agg_pd[key]:
226
+ # Check that this name is not already that of an output
227
+ # column from aggregation.
228
+ raise ValueError(
229
+ f"not possible to have {bin_on_out} as column name in"
230
+ " aggregated results as it is also for column"
231
+ " containing group keys.",
232
+ )
233
+ # Initialize 'write_config', which are parameters remaining in
234
+ # 'key_conf_in' and some adjustments.
235
+ # Adding 'bin_on_out' to 'duplicates_on' except if
236
+ # 'duplicates_on' is set already. In this case, if 'bin_on_out'
237
+ # is not in 'duplicates_on', it is understood as a voluntary
238
+ # user choice. For all other cases, 'duplicates_on' has been
239
+ # set by user. Setting 'duplicates_on' is the true reason of
240
+ # having 'bin_on_out'. It allows the user to inform 'AggStream'
241
+ # that the binning column (with unique keys) is this one.
242
+ if KEY_DUPLICATES_ON not in key_conf_in or key_conf_in[KEY_DUPLICATES_ON] is None:
243
+ # Force 'bin_on_out', else reuse 'ordered_on' parameter
244
+ # specific to keys (aggregation results).
245
+ key_conf_in[KEY_DUPLICATES_ON] = bin_on_out if bin_on_out else key_conf_in[KEY_ORDERED_ON]
246
+ # key_conf_in[KEY_DUPLICATES_ON] = key_conf_in[KEY_ORDERED_ON]
247
+ if seg_config[KEY_SNAP_BY] is None:
248
+ # Snapshots not requested, aggregation results are necessarily
249
+ # bins.
250
+ agg_res_type = AggResType.BINS
251
+ elif isinstance(key, tuple):
252
+ # 2 keys are provided, aggregation results are necessarily both
253
+ # bins and snapshots.
254
+ agg_res_type = AggResType.BOTH
255
+ else:
256
+ # Otherwise, a single aggregation result is expected, and it is
257
+ # created from both bins and snapshots. Hence it is snaps like.
258
+ agg_res_type = AggResType.SNAPS
259
+ if agg_res_type is AggResType.BOTH:
260
+ if KEY_ROW_GROUP_TARGET_SIZE in key_conf_in:
261
+ if not isinstance(key_conf_in[KEY_ROW_GROUP_TARGET_SIZE], tuple):
262
+ key_conf_in[KEY_ROW_GROUP_TARGET_SIZE] = (
263
+ key_conf_in[KEY_ROW_GROUP_TARGET_SIZE],
264
+ key_conf_in[KEY_ROW_GROUP_TARGET_SIZE],
265
+ )
266
+ else:
267
+ key_conf_in[KEY_ROW_GROUP_TARGET_SIZE] = (None, None)
268
+ consolidated_keys_config[key] = {
269
+ KEY_SEG_CONFIG: seg_config,
270
+ KEY_BIN_ON_OUT: bin_on_out,
271
+ KEY_MAX_IN_MEMORY_SIZE_B: key_conf_in.pop(KEY_MAX_IN_MEMORY_SIZE_B),
272
+ KEY_POST: key_conf_in.pop(KEY_POST),
273
+ KEY_WRITE_CONFIG: key_conf_in,
274
+ KEY_AGG_RES_TYPE: agg_res_type,
275
+ }
276
+ return consolidated_keys_config, agg_pd
277
+
278
+
279
+ def _init_buffers(
280
+ store: Store,
281
+ keys: dict,
282
+ ):
283
+ """
284
+ Initialize pre, aggregation and post buffers from existing results.
285
+
286
+ Also set ``seed_index_restart``.
287
+
288
+ Parameters
289
+ ----------
290
+ store : Store
291
+ Store in which aggregation results may already exist, and from which
292
+ previous buffer data is retrieved.
293
+ keys : Any | dict
294
+ Single level dict as defined in ``__init__`` function.
295
+
296
+ Returns
297
+ -------
298
+ The following AggStream's parameters are initialized in this function.
299
+ - ``seed_index_restart``, int, float or Timestamp, the index
300
+ from which (included) the next aggregation iteration should be
301
+ restarted.
302
+ - ``pre_buffer``, dict, user-defined buffer to keep track of intermediate
303
+ variables between successive pre-processing of individual seed chunk.
304
+ - ``agg_buffers``, dict of aggregation buffer variables specific for each
305
+ key, in the form:
306
+ ``{key: {'agg_in_memory_size' : 0,
307
+ 'bin_res' : None,
308
+ 'snap_res' : None,
309
+ 'bin_res_buffer' : list,
310
+ 'snap_res_buffer' : list,
311
+ 'segagg_buffer' : dict, possibly empty,
312
+ 'post_buffer' : dict, possibly empty,
313
+ },
314
+ }``
315
+
316
+ """
317
+ pre_buffer = {}
318
+ agg_buffers = {}
319
+ seed_index_restart_set = set()
320
+ for key in keys:
321
+ # Default values for aggregation counters and buffers.
322
+ # 'agg_in_memory_size' : in-memory size, in bytes, of aggregation results.
323
+ # 'bin_res_buffer' and 'snap_res_buffer' are buffers to keep
324
+ # aggregation chunks before a concatenation to record. Because
325
+ # they are appended in-place for each key, they are created
326
+ # separately for each key.
327
+ # Because 'segagg_buffer' and 'post_buffer' are modified
328
+ # in-place for each key, they are created separately for
329
+ # each key.
330
+ agg_buffers[key] = _reset_agg_buffers()
331
+ # Process metadata if already existing aggregation results.
332
+ # If 'key' is a tuple of 'bin_key' and 'snap_key', keep 'bin_key' as
333
+ # the main key to check existing results in store.
334
+ main_key = key[0] if isinstance(key, tuple) else key
335
+ if main_key in store:
336
+ # Prior AggStream results already in store.
337
+ # Retrieve corresponding metadata to re-start aggregations.
338
+ prev_agg_res = store[main_key]
339
+ if not _is_aggstream_result(prev_agg_res):
340
+ raise ValueError(
341
+ f"provided '{main_key}' data is not an AggStream result.",
342
+ )
343
+ aggstream_md = prev_agg_res.key_value_metadata[KEY_AGGSTREAM]
344
+ # - 'last_seed_index' to trim accordingly head of seed data.
345
+ # - metadata related to pre-processing of individual seed chunk.
346
+ # - metadata related to binning process from past binnings
347
+ # on prior data. It is used in case 'bin_by' is a callable.
348
+ # If not used, it is an empty dict.
349
+ # - metadata related to post-processing of prior
350
+ # aggregation results, to be used by 'post'. If not used,
351
+ # it is an empty dict.
352
+ seed_index_restart_set.add(aggstream_md[KEY_RESTART_INDEX])
353
+ if KEY_PRE_BUFFER in aggstream_md:
354
+ pre_buffer = aggstream_md[KEY_PRE_BUFFER]
355
+ agg_buffers[key][KEY_SEGAGG_BUFFER] = (
356
+ aggstream_md[KEY_SEGAGG_BUFFER] if aggstream_md[KEY_SEGAGG_BUFFER] else {}
357
+ )
358
+ agg_buffers[key][KEY_POST_BUFFER] = (
359
+ aggstream_md[KEY_POST_BUFFER] if aggstream_md[KEY_POST_BUFFER] else {}
360
+ )
361
+ else:
362
+ agg_buffers[key][KEY_SEGAGG_BUFFER] = {}
363
+ agg_buffers[key][KEY_POST_BUFFER] = {}
364
+
365
+ if len(seed_index_restart_set) > 1:
366
+ raise ValueError(
367
+ "not possible to aggregate on multiple keys with existing "
368
+ "aggregation results not aggregated up to the same seed index.",
369
+ )
370
+ return (
371
+ None if not seed_index_restart_set else seed_index_restart_set.pop(),
372
+ pre_buffer,
373
+ agg_buffers,
374
+ )
375
+
376
+
377
+ def _reset_agg_buffers(agg_buffers: dict | None = None) -> dict | None:
378
+ """
379
+ Reset aggregation buffers and counters.
380
+
381
+ Either modify in-place, or return a new dict.
382
+
383
+ Parameters
384
+ ----------
385
+ agg_buffers : dict | None, default None
386
+ Buffer to keep track of aggregation sequence intermediate results.
387
+
388
+ - agg_in_memory_size : int, in-memory size in bytes of main aggregation results (snapshots
389
+ if snapshots are requested, or bins otherwise). It is reset here after
390
+ writing.
391
+ - bin_res : DataFrame, last aggregation results (bins), to reset to None
392
+ after writing.
393
+ - snap_res : DataFrame, last aggregation results (snapshots), to reset
394
+ to None after writing.
395
+ - bin_res_buffer : list[DataFrame], list of bins resulting from
396
+ aggregation (pandas DataFrame).
397
+ - snap_res_buffer : list[pandas.DataFrame], list of snapshots resulting
398
+ from aggregation (pandas dataframes), when snapshots are requested.
399
+ - post_buffer : dict, buffer to keep track of data that can be
400
+ processed during previous iterations. This pointer should not be
401
+ re-initialized in 'post' or data from previous iterations will be
402
+ lost. This dict has to contain data that can be serialized, as data
403
+ is then kept in parquet file metadata.
404
+ - segagg_buffer : dict, parameters from segmentation and aggregation
405
+ process, that are required when restarting the aggregation with new
406
+ seed data. (for recording in metadata of aggregation results)
407
+
408
+ Returns
409
+ -------
410
+ dict
411
+ A dict with initialized values for ``agg_buffers``.
412
+
413
+ """
414
+ init_values = {
415
+ KEY_AGG_IN_MEMORY_SIZE: 0,
416
+ KEY_BIN_RES: None,
417
+ KEY_SNAP_RES: None,
418
+ KEY_BIN_RES_BUFFER: [],
419
+ KEY_SNAP_RES_BUFFER: [],
420
+ }
421
+ if agg_buffers is None:
422
+ return init_values
423
+ else:
424
+ agg_buffers |= init_values
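# A small usage sketch of the two calling modes documented above (assumes the
# module-level KEY_* constants; the appended content is purely illustrative):
buf = _reset_agg_buffers()               # mode 1: returns a fresh dict
buf[KEY_BIN_RES_BUFFER].append("chunk")  # ...mutated during an aggregation iteration
_reset_agg_buffers(buf)                  # mode 2: resets the same dict in place
assert buf[KEY_BIN_RES_BUFFER] == [] and buf[KEY_AGG_IN_MEMORY_SIZE] == 0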
425
+
426
+
427
+ class SeedPreException(Exception):
428
+ """
429
+ Exception related to user-defined checks on seed chunk.
430
+ """
431
+
432
+ def __init__(self, message: str | None = None):
433
+ """
434
+ Exception message.
435
+ """
436
+ if message is None:
437
+ self.message = "failing user-defined checks."
438
+ else:
439
+ self.message = message
440
+
441
+
442
+ def _iter_data(
443
+ seed: Iterable[DataFrame],
444
+ ordered_on: str,
445
+ restart_index: float | Timestamp | None,
446
+ pre: Callable | None,
447
+ pre_buffer: dict,
448
+ filters: dict | None,
449
+ trim_start: bool,
450
+ discard_last: bool,
451
+ ):
452
+ """
453
+ Iterate provided seed, applying sequentially (optionally) filters.
454
+
455
+ Seed has to be monotonic increasing on 'ordered_on' column. If not, a
456
+ ``SeedPreException`` is raised.
457
+
458
+ Parameters
459
+ ----------
460
+ seed : Iterable[DataFrame]
461
+ Iterable of pandas DataFrame.
462
+ ordered_on : str
463
+ Name of column with respect to which seed data is in ascending
464
+ order.
465
+ restart_index : int, float, Timestamp or None
466
+ Index (excluded) in `ordered_on` column before which rows in seed
467
+ will be trimmed.
468
+ pre : Callable or None
469
+ User-defined Callable to run checks over each item of the seed
470
+ Iterable, accepting 2 parameters:
471
+
472
+ - An ``on`` parameter, a pandas dataframe, the current seed item
473
+ (before any filter is applied).
474
+ - A ``buffer`` parameter, a dict that can be used as a buffer
475
+ for storing temporary results from one chunk processing to
476
+ the next. Its initial value is that provided by `pre_buffer`.
477
+
478
+ In-place modifications of the seed dataframe have to be carried out here.
479
+ pre_buffer : dict
480
+ Buffer to keep track of intermediate data that can be required for
481
+ proceeding with pre of individual seed item.
482
+ filters : dict or None
483
+ Dict in the form
484
+ ``{"filter_id":[[("col", op, val), ...], ...]}``
485
+ To filter out data from seed.
486
+ Filter syntax: [[(column, op, val), ...],...]
487
+ where op is [==, =, >, >=, <, <=, !=, in, not in]
488
+ The innermost tuples are transposed into a set of filters applied
489
+ through an `AND` operation.
490
+ The outer list combines these sets of filters through an `OR`
491
+ operation.
492
+ A single list of tuples can also be used, meaning that no `OR`
493
+ operation between set of filters is to be conducted.
494
+ trim_start : bool
495
+ Flag to indicate if seed head has to be trimmed till value of
496
+ 'restart_index' (last seed index of previous aggregation sequence).
497
+ discard_last : bool
498
+ If ``True``, last row group in seed data (sharing the same value in
499
+ `ordered_on` column) is removed from the aggregation step.
500
+
501
+ Returns
502
+ -------
503
+ last_seed_index, pre_buffer, filter_id, filtered_chunk
504
+ - 'last_seed_index', int | float | Timestamp, the last seed
505
+ index value (likely of an incomplete group), of the current seed
506
+ chunk, before filters are applied.
507
+ - 'pre_buffer' : dict, buffer to keep track of intermediate data that
508
+ can be required for proceeding with preprocessing of individual seed
509
+ chunk.
510
+ - 'filter_id', str, indicating which set of filters has been
511
+ applied for the seed chunk provided.
512
+ - 'filtered_chunk', DataFrame, from the seed Iterable, with
513
+ optionally filters applied.
514
+
515
+ Notes
516
+ -----
517
+ Checks are applied after trimming seed head (if ``trim_start``
518
+ is True) and discarding last row group (if ``discard_last`` is True).
519
+
520
+ Reasons to discard last seed row (or row group) may be twofold:
521
+ - last row is temporary (yet to get some final values, for instance
522
+ if seed data is some kind of aggregation stream itself),
523
+ - last rows are part of a single row group (same index value in
524
+ 'ordered_on') not yet complete itself (new rows part of this row group
525
+ to be expected).
526
+
527
+ """
528
+ if restart_index is None:
529
+ # No aggregation result existing yet. Whatever 'trim_start' value, no
530
+ # trimming is possible.
531
+ trim_start = False
532
+ seed_remainder = None
533
+ for seed_chunk in seed:
534
+ # Check seed chunk is ordered on 'ordered_on'.
535
+ # This re-ordering is made because for 'trim_start' and
536
+ # 'discard_last', this ordering is required.
537
+ if not seed_chunk[ordered_on].is_monotonic_increasing:
538
+ # Currently uneasy about silently modifying seed data without knowing
539
+ # if it makes sense, so leaving this row commented.
540
+ # seed_chunk.sort_values(by=ordered_on, inplace=True)
541
+ # Instead, raise an exception.
542
+ raise SeedPreException("seed data is not in ascending order.")
543
+ # Step 1 / Seed pre-processing by user.
544
+ if pre:
545
+ # Apply user checks.
546
+ try:
547
+ pre(on=seed_chunk, buffer=pre_buffer)
548
+ except Exception as e:
549
+ # Stop iteration in case of failing pre.
550
+ # Aggregation has been run up to the last valid chunk.
551
+ raise SeedPreException(str(e))
552
+ # Step 2 / If a previous remainder, concatenate it to give current
553
+ # DataFrame its 'final' length.
554
+ if not (seed_remainder is None or seed_remainder.empty):
555
+ seed_chunk = concat([seed_remainder, seed_chunk], ignore_index=True)
556
+ # Step 3 / Prepare filter to trim seed head and tail if requested.
557
+ if trim_start:
558
+ if seed_chunk.loc[:, ordered_on].iloc[-1] < restart_index:
559
+ # This full chunk is to be discarded. Go to the next.
560
+ continue
561
+ else:
562
+ filter_array = seed_chunk[ordered_on] >= restart_index
563
+ # Once it has been applied once, no need to check for it
564
+ # again on subsequent chunks.
565
+ trim_start = False
566
+ else:
567
+ filter_array = ones(len(seed_chunk), dtype=bool)
568
+ # 'ordered_on' being necessarily in ascending order, last index
569
+ # value is its max value.
570
+ last_seed_index = seed_chunk.loc[:, ordered_on].iloc[-1]
571
+ if discard_last:
572
+ filter_main_chunk = seed_chunk.loc[:, ordered_on] < last_seed_index
573
+ seed_remainder = seed_chunk.loc[~filter_main_chunk]
574
+ filter_array &= filter_main_chunk
575
+ # Step 4 / Filter seed and yield.
576
+ for filt_id, filters_ in filters.items():
577
+ # Filter.
578
+ filter_array_loc = (
579
+ dataframe_filter(seed_chunk, filters_) & filter_array
580
+ if filt_id != NO_FILTER_ID
581
+ else filter_array.copy()
582
+ )
583
+ if not filter_array_loc.any():
584
+ # DataFrame will be empty after filtering.
585
+ # Proceed with next iteration.
586
+ continue
587
+ elif filter_array_loc.all():
588
+ # If filter only contains 1, simply return full seed chunk.
589
+ yield last_seed_index, pre_buffer, filt_id, seed_chunk
590
+ else:
591
+ # Otherwise, filter.
592
+ yield last_seed_index, pre_buffer, filt_id, seed_chunk.loc[filter_array_loc].reset_index(
593
+ drop=True,
594
+ )
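# Self-contained sketch of the 'discard_last' hand-off implemented above: the last
# 'ordered_on' group of a chunk is held back and re-prepended to the next chunk
# (toy column names, not the library's code).
from pandas import DataFrame, concat as pd_concat

chunk_1 = DataFrame({"ts": [1, 1, 2, 2], "qty": [10, 20, 30, 40]})
last_ts = chunk_1["ts"].iloc[-1]            # 2: last, possibly incomplete, group
keep = chunk_1["ts"] < last_ts
remainder = chunk_1.loc[~keep]              # rows with ts == 2 are withheld
chunk_2 = DataFrame({"ts": [2, 3], "qty": [50, 60]})
chunk_2 = pd_concat([remainder, chunk_2], ignore_index=True)  # completed at next iteration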
595
+
596
+
597
+ def _concat_agg_res(
598
+ agg_res_buffers: list[DataFrame],
599
+ agg_res: DataFrame,
600
+ append_last_res: bool,
601
+ index_name: str,
602
+ ):
603
+ """
604
+ Concat aggregation results with / without last row.
605
+
606
+ Parameters
607
+ ----------
608
+ agg_res_buffers : list[DataFrame]
609
+ List of aggregation results to concatenate.
610
+ agg_res : DataFrame
611
+ Last aggregation results (all rows from last iteration).
612
+ append_last_res : bool
613
+ If 'agg_res' should be appended to 'agg_res_buffer' and if 'bin_res'
614
+ should be appended to 'bin_res_buffers'.
615
+ index_name : str, default None
616
+ If a string, the index of the dataframe resulting from aggregation is
617
+ renamed with this value, which will be enforced in written results.
618
+
619
+ Returns
620
+ -------
621
+ DataFrame
622
+ List of aggregation results concatenated in a single DataFrame.
623
+
624
+ """
625
+ agg_res_list = [*agg_res_buffers, agg_res] if append_last_res else agg_res_buffers
626
+ # Make a copy when a single item, to not propagate the 'reset_index'
627
+ # to original 'agg_res'.
628
+ agg_res = concat(agg_res_list) if len(agg_res_list) > 1 else agg_res_list[0].copy(deep=False)
629
+ if index_name:
630
+ # In case 'by' is a callable, index may have no name, but user may have
631
+ # defined one with 'bin_on' parameter.
632
+ agg_res.index.name = index_name
633
+ # Keep group keys as a column before post-processing.
634
+ agg_res.reset_index(inplace=True)
635
+ return agg_res
636
+
637
+
638
+ def _post_n_write_agg_chunks(
639
+ agg_buffers: dict,
640
+ agg_res_type: Enum,
641
+ append_last_res: bool,
642
+ store: Store,
643
+ key: Any | tuple[Any, Any],
644
+ write_config: dict,
645
+ index_name: str | None = None,
646
+ post: Callable | None = None,
647
+ last_seed_index: float | Timestamp | None = None,
648
+ pre_buffer: dict | None = None,
649
+ ):
650
+ """
651
+ Write list of aggregation row groups with optional post.
652
+
653
+ Buffer variables 'bin_res_buffer' and 'snap_res_buffer' are then reset.
654
+
655
+ Parameters
656
+ ----------
657
+ agg_buffers : dict
658
+ Buffer to keep track of aggregation sequence intermediate results.
659
+
660
+ - agg_in_memory_size : int, size in bytes of aggregation results (bins
661
+ only, or bins and snapshots if snapshots are requested). It is reset
662
+ here after writing.
663
+ - bin_res : DataFrame, last aggregation results, to reset to None
664
+ after writing.
665
+ - snap_res : DataFrame, last aggregation results, to reset to None
666
+ after writing.
667
+ - bin_res_buffer : list[DataFrame], list of bins resulting from
668
+ aggregation (pandas DataFrame).
669
+ It contains 'bin_res' (last aggregation results), but without last
670
+ row. It is flushed here after writing
671
+ - snap_res_buffer : list[pandas.DataFrame], list of snapshots resulting
672
+ from aggregation (pandas dataframes), when snapshots are requested.
673
+ It contains 'snap_res' (last aggregation results), but without last
674
+ row. It is flushed here after writing
675
+ - post_buffer : dict, buffer to keep track of data that can be
676
+ processed during previous iterations. This pointer should not be
677
+ re-initialized in 'post' or data from previous iterations will be
678
+ lost. This dict has to contain data that can be serialized, as data
679
+ is then kept in parquet file metadata.
680
+ It is NOT reset after writing. It is however required to be
681
+ written in metadata.
682
+ - segagg_buffer : dict, parameters from segmentation and aggregation
683
+ process, that are required when restarting the aggregation with new
684
+ seed data. (for recording in metadata of aggregation results)
685
+ It is NOT reset after writing. It is however required to be
686
+ written in metadata.
687
+
688
+ agg_res_type : Enum
689
+ Either 'BINS', 'SNAPS', or 'BOTH'.
690
+ append_last_res : bool
691
+ If 'bin_res' should be appended to 'bin_res_buffer' and if 'snap_res'
692
+ should be appended to 'snap_res_buffer'.
693
+ store : Store
694
+ Store to which aggregation results are recorded.
695
+ key : Any | tuple[Any, Any]
696
+ Key for retrieving corresponding metadata.
697
+ If a tuple of 2 dataclass, the first is key for bins, the second is key
698
+ for snapshots.
699
+ write_config : dict
700
+ Settings forwarded to ``oups.writer.write`` when writing aggregation
701
+ results to store. Compulsory parameter defining at least `ordered_on`
702
+ and `duplicates_on` columns.
703
+ index_name : str, default None
704
+ If a string, the index of the dataframe resulting from aggregation is
705
+ renamed with this value, which will be enforced in written results.
706
+ post : Callable, default None
707
+ User-defined function accepting 3 parameters.
708
+
709
+ - ``buffer``, a dict to be used as data buffer, that can be necessary
710
+ for some user-defined post-processing requiring data assessed in
711
+ previous post-processing iteration.
712
+ - ``bin_res``, a pandas dataframe resulting from the aggregations
713
+ defined by ``agg`` parameter, with first row already corrected
714
+ with last row of previous streamed aggregation.
715
+ These are aggregation results for bins.
716
+ - ``snap_res`` (optional), a pandas dataframe resulting from the
717
+ aggregations defined by ``agg`` parameter that contains snapshots.
718
+
719
+ It has then to return a pandas dataframe that will be recorded.
720
+ This optional post-processing is intended for use of vectorized
721
+ functions (not mixing rows together, but operating on one or several
722
+ columns), or dataframe formatting before results are finally recorded.
723
+
724
+ last_seed_index : int | float | Timestamp | None, default None
725
+ Last index in seed data. Can be numeric type, timestamp... (for
726
+ recording in metadata of aggregation results)
727
+ Writing metadata is triggered ONLY if ``last_seed_index`` is provided.
728
+ pre_buffer : dict or None
729
+ Buffer to keep track of intermediate data that can be required for
730
+ proceeding with preprocessing of individual seed chunk.
731
+
732
+ """
733
+ post_buffer = agg_buffers[KEY_POST_BUFFER]
734
+ # When there is no result, 'bin_res' is None.
735
+ if isinstance((bin_res := agg_buffers[KEY_BIN_RES]), DataFrame):
736
+ # Keep track that there were results in the first place.
737
+ initial_agg_res = True
738
+ # Concat list of aggregation results.
739
+ bin_res = _concat_agg_res(
740
+ agg_buffers[KEY_BIN_RES_BUFFER],
741
+ bin_res,
742
+ append_last_res,
743
+ index_name,
744
+ )
745
+ # Same if needed with 'snap_res_buffer'.
746
+ if isinstance((snap_res := agg_buffers[KEY_SNAP_RES]), DataFrame):
747
+ snap_res = _concat_agg_res(
748
+ agg_buffers[KEY_SNAP_RES_BUFFER],
749
+ snap_res,
750
+ append_last_res,
751
+ index_name,
752
+ )
753
+ if post:
754
+ # Post processing if any.
755
+ # 'post_buffer' has to be modified in-place.
756
+ # It is possible 'main_res' is None, if 'post' needs a minimal
757
+ # number of rows before outputting results (warm-up).
758
+ main_res = (
759
+ post(buffer=post_buffer, bin_res=bin_res)
760
+ if agg_res_type is AggResType.BINS
761
+ else post(buffer=post_buffer, bin_res=bin_res, snap_res=snap_res)
762
+ )
763
+ if agg_res_type is AggResType.BOTH:
764
+ # First result, recorded with 'bin_key', is considered main
765
+ # result.
766
+ try:
767
+ main_res, snap_res = main_res
768
+ except ValueError:
769
+ raise ValueError(
770
+ f"not possible to have key '{key[0]}' for bins and "
771
+ f"key '{key[1]}' for snapshots but 'post()' function "
772
+ "only returning one result.",
773
+ )
774
+ # Set to None 'bin_res' and 'snap_res' to catch possible
775
+ # mistake in 'key' parameter (finally commented out).
776
+ # snap_res = None
777
+ # bin_res = None
778
+ elif agg_res_type is not AggResType.SNAPS:
779
+ # Case only 'bin_res' is recorded or both 'bin_res' and 'snap_res'.
780
+ # main_res, bin_res = bin_res, None
781
+ main_res = bin_res
782
+ else:
783
+ # Case only 'snap_res' is recorded, and not 'bin_res'.
784
+ # main_res, bin_res, snap_res = snap_res, None, None
785
+ main_res = snap_res
786
+ else:
787
+ initial_agg_res = False
788
+ main_res = None
789
+ main_key, snap_key = key if isinstance(key, tuple) else (key, None)
790
+ if last_seed_index:
791
+ # If 'last_seed_index', set oups metadata.
792
+ # It is possible there is no result yet to write for different reasons:
793
+ # - new seed data has been streamed and needs to be taken into account,
794
+ # but there is no result for this key, because all related seed data
795
+ # has been filtered out.
796
+ # - or maybe 'post' has a warm-up period and has not released results
797
+ # yet.
798
+ # But 'last_seed_index' has to be recorded, and so do possibly
799
+ # 'pre_buffer', 'segagg_buffer' and 'post_buffer'.
800
+ # Oups metadata only get written for 'main_key'.
801
+ # When 'key' is a tuple, 'main_key' is the 1st key.
802
+ write_config["key_value_metadata"] = {
803
+ KEY_AGGSTREAM: {
804
+ KEY_RESTART_INDEX: last_seed_index,
805
+ KEY_PRE_BUFFER: pre_buffer,
806
+ KEY_SEGAGG_BUFFER: agg_buffers[KEY_SEGAGG_BUFFER],
807
+ KEY_POST_BUFFER: post_buffer,
808
+ },
809
+ }
810
+ # When there is no result, 'main_res' is None.
811
+ # If no result, metadata is possibly to be written. This is indicated by
812
+ # 'last_seed_index', which informs about the last 'aggstream' local
813
+ # iteration.
814
+ if isinstance(main_res, DataFrame) or last_seed_index:
815
+ if agg_res_type is AggResType.BOTH:
816
+ store[main_key].write(
817
+ **(write_config | {KEY_ROW_GROUP_TARGET_SIZE: write_config[KEY_ROW_GROUP_TARGET_SIZE][0]}),
818
+ df=main_res,
819
+ )
820
+ store[snap_key].write(
821
+ **(
822
+ write_config
823
+ | {
824
+ KEY_ROW_GROUP_TARGET_SIZE: write_config[KEY_ROW_GROUP_TARGET_SIZE][1],
825
+ "key_value_metadata": None,
826
+ }
827
+ ),
828
+ df=snap_res,
829
+ )
830
+ else:
831
+ store[main_key].write(**write_config, df=main_res)
832
+ if initial_agg_res:
833
+ # If there have been results, they have been processed (either written
834
+ # directly or through 'post()'). Time to reset aggregation buffers and
835
+ # counters.
836
+ _reset_agg_buffers(agg_buffers)
837
+ return
838
+
839
+
840
+ def agg_iter(
841
+ seed_chunk: DataFrame,
842
+ store: Store,
843
+ key: Any,
844
+ keys_config: dict,
845
+ agg_config: dict,
846
+ agg_buffers: dict,
847
+ ):
848
+ """
849
+ Post-process and write iter. n-1, segment and aggregate iter. n.
850
+
851
+ Parameters
852
+ ----------
853
+ seed_chunk : DataFrame
854
+ Chunk of seed data.
855
+ store : Store
856
+ Store to which aggregation results are recorded.
857
+ key : Any | tuple[Any, Any]
858
+ Key for recording aggregation results.
859
+ keys_config
860
+ Settings related to 'key' for conducting post-processing, writing and
861
+ segmentation.
862
+ agg_config : dict
863
+ Settings related to 'key' for conducting aggregation.
864
+ agg_buffers : dict
865
+ Buffer to keep track of aggregation sequence intermediate results.
866
+
867
+ Returns
868
+ -------
869
+ key, updated_agg_buffers
870
+ - ``key``, key to which changed parameters are related.
871
+ - ``updated_agg_buffers``, dict with modified parameters.
872
+
873
+ """
874
+ # Post process and write.
875
+ if not ((bin_res := agg_buffers[KEY_BIN_RES]) is None or bin_res.empty):
876
+ # If previous results, check if this is write time.
877
+ bin_res_buffer = agg_buffers[KEY_BIN_RES_BUFFER]
878
+ # Add 'bin_res' to 'bin_res_buffer', ignoring last row.
879
+ # It is incomplete, so useless to write it to results while
880
+ # aggregation iterations are on-going.
881
+ bin_res_buffer.append(bin_res.iloc[:-1])
882
+ agg_in_memory_size = agg_buffers[KEY_AGG_IN_MEMORY_SIZE]
883
+ if (snap_res := agg_buffers[KEY_SNAP_RES]) is None:
884
+ agg_buffers[KEY_AGG_IN_MEMORY_SIZE] += bin_res.memory_usage().sum()
885
+ else:
886
+ # If we have bins & snapshots, do same with snapshots.
887
+ agg_buffers[KEY_SNAP_RES_BUFFER].append(snap_res.iloc[:-1])
888
+ agg_buffers[KEY_AGG_IN_MEMORY_SIZE] += (
889
+ bin_res.memory_usage().sum() + snap_res.memory_usage().sum()
890
+ )
891
+ # Length of 'bin_res_buffer' is number of times it has been
892
+ # appended. Be it from bins, or snapshots, length is same.
893
+ # Keep floor part.
894
+ agg_mean_in_memory_group_size = agg_in_memory_size // len(bin_res_buffer)
895
+ if agg_in_memory_size + agg_mean_in_memory_group_size > keys_config[KEY_MAX_IN_MEMORY_SIZE_B]:
896
+ # For next iteration, chances are that 'agg_in_memory_size' will be
897
+ # larger than threshold. Time to write results from previous
898
+ # iteration.
899
+ _post_n_write_agg_chunks(
900
+ agg_buffers=agg_buffers,
901
+ agg_res_type=keys_config[KEY_AGG_RES_TYPE],
902
+ append_last_res=False,
903
+ store=store,
904
+ key=key,
905
+ write_config=keys_config[KEY_WRITE_CONFIG],
906
+ index_name=keys_config[KEY_BIN_ON_OUT],
907
+ post=keys_config[KEY_POST],
908
+ )
909
+ # Segment and aggregate. Group keys becomes the index.
910
+ agg_res = cumsegagg(
911
+ data=seed_chunk,
912
+ agg=agg_config,
913
+ bin_by=keys_config[KEY_SEG_CONFIG],
914
+ buffer=agg_buffers[KEY_SEGAGG_BUFFER],
915
+ )
916
+ # 'agg_res' holds the 'main' aggregation results onto which everything
917
+ # is assessed. If only 'bins' are requested, it gathers bins.
918
+ # If 'bins' and 'snapshots' are requested, it gathers snapshots.
919
+ agg_buffers[KEY_BIN_RES], agg_buffers[KEY_SNAP_RES] = (
920
+ agg_res if isinstance(agg_res, tuple) else (agg_res, None)
921
+ )
922
+ return key, agg_buffers
923
+
924
+
925
+ class AggStream:
926
+ """
927
+ Persist configuration data to run aggregation in sequence.
928
+
929
+ Attributes
930
+ ----------
931
+ - ``self.seed_config``, a dict keeping track of seed-related parameters.
932
+ ``{'ordered_on' : string, specifying column name in seed data in
933
+ ascending order.
934
+ 'restart_index' : int, float or Timestamp, the index from which
935
+ (included) should be restarted the next
936
+ aggregation iteration.
937
+ 'pre' : Callable, to apply user-defined pre-processing on seed.
938
+ 'pre_buffer' : dict, to keep track of intermediate values for
939
+ proceeding with pre-processing of individual seed
940
+ items (by `pre` function).
941
+ 'filters' : dict, as per `filters` parameter.
942
+ }``
943
+ - ``self.store``, oups store, as per `store` parameter.
944
+ - ``self.agg_pd``, dict, as per `agg` parameter, in pandas format.
945
+ - ``self.agg_cs``, an attribute initialized once an aggregation
946
+ iteration has been run, and defining aggregation in `cumsegagg`
947
+ standard. It is initialized in ``self.agg`` function, the 1st time
948
+ an aggregation is run (seed data dtypes are required).
949
+ - ``self.filter_apps``, dict, mapping filter ids to list of keys, and
950
+ number of parallel jobs that can be run for this filter id.
951
+ Number of jobs is to be used as key in ``self.p_jobs`` attribute.
952
+ - ``self.keys_config``, dict of keys config in the form:
953
+ ``{key: {'dirpath': str, where to record agg res,
954
+ 'bin_on_out' : str, name in aggregation results for column
955
+ with bin ids.
956
+ 'seg_config' : dict specifying the segmentation config,
957
+ 'post' : Callable or None,
958
+ 'max_in_memory_size_b': int, max allowed result in memory size,
959
+ in bytes.
960
+ 'write_config' : {'ordered_on' : str,
961
+ 'duplicates_on' : str or list,
962
+ ...
963
+ },
964
+ },
965
+ }``
966
+ - ``self.agg_buffers``, dict to keep track of aggregation iteration
967
+ intermediate results.
968
+ ``{key: {'agg_in_memory_size' : int, size in bytes of current
969
+ aggregation results, for bins (if snapshots not
970
+ requested) or bins and snapshots.
971
+ 'bin_res' : None or DataFrame, last aggregation results,
972
+ for bins,
973
+ 'snap_res' : None or DataFrame, last aggregation results,
974
+ for snapshots,
975
+ 'bin_res_buffer' : list of DataFrame, buffer to keep
976
+ bin aggregagation results,
977
+ 'snap_res_buffer' : list of DataFrame, buffer to keep bin
978
+ snapshot aggregagation results (if snapshots are
979
+ requested),
980
+ 'segagg_buffer' : dict, possibly empty, keeping track of
981
+ segmentation and aggregation intermediate
982
+ variables,
983
+ 'post_buffer' : dict, possibly empty, keeping track of
984
+ 'post' function intermediate variables,
985
+ },
986
+ }``
987
+ - ``self.p_jobs``, dict, containing Parallel objects, as per joblib
988
+ setup. Keys are int, being the number of parallel jobs to run for this
989
+ filter id.
990
+
991
+ """
992
+
993
+ def __init__(
994
+ self,
995
+ ordered_on: str,
996
+ store: Store,
997
+ keys: Any | tuple[Any, Any] | dict,
998
+ pre: Callable | None = None,
999
+ filters: dict | None = None,
1000
+ agg: dict | None = None,
1001
+ bin_by: TimeGrouper | Callable[[Series, dict], tuple] | None = None,
1002
+ bin_on: str | tuple[str, str] | None = None,
1003
+ snap_by: TimeGrouper | Series | DatetimeIndex | None = None,
1004
+ post: Callable | None = None,
1005
+ max_in_memory_size: int | None = MAX_IN_MEMORY_SIZE_MB,
1006
+ parallel: bool | None = False,
1007
+ **kwargs,
1008
+ ):
1009
+ """
1010
+ Initialize aggregation stream on ordered data.
1011
+
1012
+ This object enables 'streamed aggregation', iteratively
1013
+ (out-of-core) with optional filtering of seed data, and optional
1014
+ post-processing of aggregation results (by use of vectorized functions
1015
+ or for dataframe formatting).
1016
+ Aggregation results are recorded into an 'oups store'.
1017
+
1018
+ Parameters
1019
+ ----------
1020
+ ordered_on : str
1021
+ Name of the column with respect to which seed dataset is in
1022
+ ascending order. While this parameter is compulsory for correct
1023
+ restart on seed data, seed data is not necessarily grouped by this
1024
+ column. ``bin_by`` and/or ``bin_on`` parameters can be used to
1025
+ define such a different parameter.
1026
+ This value is also used as default 'ordered_on' parameter for
1027
+ aggregation results, if not provided separately for each key.
1028
+ store : Store
1029
+ Store to which aggregation results are recorded.
1030
+ keys : Indexer | tuple[Indexer, Indexer] | dict
1031
+ Key(s) for recording aggregation results.
1032
+ In case snapshots are requested, and to request recording of both
1033
+ bins and snapshots, it should be a tuple of 2 indices, the first to
1034
+ record bins, the second to record snapshots.
1035
+ If a dict, several keys can be specified for operating multiple
1036
+ parallel aggregations on the same seed. In this case, the dict can
1037
+ be of two forms.
1038
+
1039
+ - In case seed data is not to be filtered, it should be in the
1040
+ form 1, defined as:
1041
+ ``{key: {'agg': agg,
1042
+ 'bin_by': bin_by,
1043
+ 'bin_on': bin_on,
1044
+ 'snap_by': snap_by,
1045
+ 'post': post,
1046
+ **kwargs}
1047
+ }``
1048
+ Any additional parameters (``**kwargs``) are forwarded to
1049
+ ``oups.writer.write`` when writing aggregation results to
1050
+ store, such as custom `max_row_group_size`, 'duplicates_on' or
1051
+ 'ordered_on' parameters (see note below for 'duplicates_on').
1052
+ Please, note:
1053
+
1054
+ - `bin_by` is a compulsory parameter.
1055
+ - If not specified, `bin_on` parameter in dict does not get
1056
+ default values.
1057
+ - If not specified in dict, `agg`, `snap_by`, `post` and
1058
+ other parameters related to writing of aggregation
1059
+ results... get values from `agg`, `snap_by`, `post`,
1060
+ `ordered_on` and `**kwargs` parameters defined when
1061
+ initializing `AggStream`.
1062
+ If using `snap_by` or `post` when initializing `AggStream`
1063
+ and not willing to apply it for one key, set it to ``None``
1064
+ in key specific config.
1065
+
1066
+ - In case seed is to be filtered, dicts written in form 1 are
1067
+ themselves values within an outer dict. Keys of this outer
1068
+ dict are strings used as filter ids. Each of these filter ids
1069
+ then has to be listed in the ``filters`` parameter.
1070
+ For keys deriving from unfiltered data, use the `NO_FILTER_ID`
1071
+ ``"_"``.
1072
+
1073
+ pre : Callable, default None
1074
+ User-defined Callable to proceed with pre-processing of each chunk
1075
+ of the seed Iterable, accepting 2 parameters:
1076
+
1077
+ - An ``on`` parameter, a pandas dataframe, the current seed item
1078
+ (before any filter is applied).
1079
+ - A ``buffer`` parameter, a dict that can be used as a buffer
1080
+ for storing temporary results from one chunk processing to
1081
+ the next. Its initial value is that provided by `pre_buffer`.
1082
+
1083
+ If running ``pre`` raises an exception (whichever type it is), a
1084
+ ``SeedPreException`` will subsequently be raised.
1085
+ Modification of seed chunk, if any, has to be realized in-place.
1086
+ No DataFrame returned by this function is expected.
1087
+ filters : dict | None, default None
1088
+ Dict in the form
1089
+ ``{"filter_id":[[("col", op, val), ...], ...]}``
1090
+ To filter out data from seed.
1091
+ Filter syntax: [[(column, op, val), ...],...]
1092
+ where op is [==, =, >, >=, <, <=, !=, in, not in]
1093
+ The innermost tuples are transposed into a set of filters applied
1094
+ through an `AND` operation.
1095
+ The outer list combines these sets of filters through an `OR`
1096
+ operation.
1097
+ A single list of tuples can also be used, meaning that no `OR`
1098
+ operation between set of filters is to be conducted.
1099
+ agg : dict | None, default None
1100
+ Dict in the form
1101
+ ``{"output_col":("input_col", "agg_function_name")}``
1102
+ where keys are names of output columns into which are recorded
1103
+ results of aggregations, and values describe the aggregations to
1104
+ operate. ``input_col`` has to exist in seed data.
1105
+ Examples of ``agg_function_name`` are `first`, `last`, `min`, `max`
1106
+ and `sum`.
1107
+ This parameter is compulsory, except if ``keys`` parameter is a
1108
+ `dict`.
1109
+ bin_by : TimeGrouper | Callable, default None
1110
+ Parameter defining the binning logic.
1111
+ If a `Callable`, it is given following parameters.
1112
+
1113
+ - An ``on`` parameter, a pandas dataframe made of column
1114
+ ``ordered_on``, and column ``bin_on`` if different than
1115
+ ``ordered_on``.
1116
+ - A ``buffer`` parameter, a dict that can be used as a buffer for
1117
+ storing temporary results from one chunk processing to
1118
+ the next.
1119
+
1120
+ This parameter is the ``bin_by`` parameter of
1121
+ ``oups.aggstream.segmentby.segmentby`` function. For more
1122
+ information, please, read its docstring.
1123
+ bin_on : str | tuple[str, str] | None, default None
1124
+ ``bin_on`` may either be a string or a tuple of 2 strings. When a
1125
+ string, it refers to an existing column in seed data onto which
1126
+ applying the binning defined by ``bin_by`` parameter. Its value is
1127
+ then carried over as name for the column containing the group keys.
1128
+ It is further used when writing results for defining
1129
+ ``duplicates_on`` parameter (see ``oups.writer.write``).
1130
+ When a tuple, the 1st string refers to an existing column in seed
1131
+ data, the 2nd the name to use for the column which values will be
1132
+ the group keys in aggregation results.
1133
+ Setting of ``bin_on`` should be adapted depending on how the
1134
+ ``bin_by`` parameter is defined. When ``bin_by`` is a Callable, then
1135
+ ``bin_on`` can have different values.
1136
+
1137
+ - ``None``, the default.
1138
+ - the name of an existing column onto which applying the binning.
1139
+ Its value is then carried over as name for the column
1140
+ containing the group keys.
1141
+
1142
+ snap_by : TimeGrouper | Series | DatetimeIndex | None, default None
1143
+ Values positioning points of observation, either derived from a
1144
+ pandas TimeGrouper, or contained in a pandas Series.
1145
+ In case 'snap_by' is a Series, values serve as locations for points
1146
+ of observation.
1147
+ Additionally, the ``closed`` value defined by 'bin_by' specifies if
1148
+ points of observations are included or excluded.
1149
+
1150
+ - `left`, then values at points of observation are excluded.
1151
+ - `right`, then values at points of observation are included.
1152
+
1153
+ post : Callable, default None
1154
+ User-defined function accepting up to 3 parameters.
1155
+
1156
+ - ``buffer``, a dict to be used as data buffer, that can be
1157
+ necessary for some user-defined post-processing requiring data
1158
+ assessed in previous post-processing iteration.
1159
+ - ``bin_res``, a pandas dataframe resulting from the aggregations
1160
+ defined by ``agg`` parameter, with first row already corrected
1161
+ with last row of previous streamed aggregation.
1162
+ These are aggregation results for bins.
1163
+ - ``snap_res`` (optional), a pandas dataframe resulting from the
1164
+ aggregations defined by ``agg`` parameter that contains
1165
+ snapshots.
1166
+
1167
+ It has then to return a pandas dataframe that will be recorded.
1168
+ This optional post-processing is intended for use of vectorized
1169
+ functions (not mixing rows together, but operating on one or
1170
+ several columns), or dataframe formatting before results are
1171
+ finally recorded.
1172
+ Please, read the note below regarding 'post' parameter.
1173
+ max_in_memory_size : int, default 'MAX_IN_MEMORY_SIZE_MB'
1174
+ Maximum allowed size in Megabytes of results stored in memory.
1175
+ parallel : bool, default False
1176
+ Conduct processing of keys in parallel, with one process per `key`.
1177
+ If a single `key`, only one process is possible.
1178
+
1179
+ Other Parameters
1180
+ ----------------
1181
+ kwargs : dict
1182
+ Settings forwarded to ``oups.writer.write`` when writing
1183
+ aggregation results to store. Can define for instance custom
1184
+ `max_row_group_size` or `duplicates_on` parameters (see notes below
1185
+ for `duplicates_on`).
1186
+
1187
+ Notes
1188
+ -----
1189
+ - Result is necessarily added to a dataset from an instantiated oups
1190
+ ``Store``. ``AggStream`` actually relies on the update feature
1191
+ from oups.
1192
+ - With the post-processing step, user can also take care of removing
1193
+ columns produced by the aggregation step, but not needed afterwards.
1194
+ Other formatting operations on the dataframe can also be achieved
1195
+ (renaming columns or index, and so on). Note that group keys
1196
+ are available through a column having same name as initial column
1197
+ from seed data, or defined by 'bin_on' parameter if 'bin_by' is a
1198
+ Callable.
1199
+ - When recording, both 'ordered_on' and 'duplicates_on' parameters are
1200
+ set when calling ``oups.writer.write``. If additional parameters are
1201
+ defined by the user, some checks are made.
1202
+
1203
+ - 'ordered_on' is forced to 'AggStream' ``ordered_on`` parameter.
1204
+ - If 'duplicates_on' is not set by the user or is `None`, then it
1205
+ is
1206
+
1207
+ - either set to the name of the output column for group keys
1208
+ defined by `bin_on` if `bin_on` is set. The rational is that
1209
+ this column identifies uniquely each bin, and so is a
1210
+ relevant column to identify duplicates.
1211
+ - if `bin_on` is not set, then it defaults to `ordered_on`
1212
+ column.
1213
+
1214
+ There might be cases when this logic is unsuited. For instance,
1215
+ perhaps values in 'ordered_on' column do provide a unique valid
1216
+ identifier for bins already (if there are unique values in
1217
+ 'ordered_on'). It may then be that the column containing group
1218
+ keys is removed during user post-processing.
1219
+ To allow such specific use case, the user can set
1220
+ ``duplicates_on`` as additional parameter to ``AggStream``. If
1221
+ the user omits a column name, it means that this is a voluntary
1222
+ choice from the user.
1223
+
1224
+ - If an exception is raised by ``pre`` function on seed data, then,
1225
+ last good results are still written to disk with correct metadata. If
1226
+ an exception is raised at some other point of the aggregation
1227
+ process, results are not written.
1228
+ - Use of 'post' parameter can be intricate. The user should be aware
1229
+ of 2 situations.
1230
+
1231
+ - Either 'post' is called not as 'final_write'. In this case, the
1232
+ last existing row is removed from bin and snapshot aggregation
1233
+ results. It will be added back at the next iteration though.
1234
+ This is to optimize the iteration mechanism.
1235
+ - Or 'post' is called as 'final_write'. In this case, the last
1236
+ existing row is kept in bin and snapshot aggregation results.
1237
+
1238
+ The user should make sure the 'post' function adapts to both
1239
+ situations.
1240
+
1241
+ """
1242
+ # Check 'kwargs' parameters are those expected for 'write' function.
1243
+ for param in kwargs:
1244
+ if param not in WRITE_PARAMS:
1245
+ raise ValueError(
1246
+ f"'{param}' is neither a valid parameter for `AggStream`"
1247
+ " initialization, nor for `oups.write` function.",
1248
+ )
1249
+ # Seed-related attributes.
1250
+ if filters is not None:
1251
+ # Check if only an "AND" part has been provided. If yes, enclose it
1252
+ # in an outer list.
1253
+ filters = {
1254
+ filt_id: [filters_] if isinstance(filters_[0], tuple) else filters_
1255
+ for filt_id, filters_ in filters.items()
1256
+ }
1257
+ # Set default values for keys' config.
1258
+ keys_default = {
1259
+ KEY_SNAP_BY: snap_by,
1260
+ KEY_AGG: agg,
1261
+ KEY_POST: post,
1262
+ KEY_ORDERED_ON: ordered_on,
1263
+ KEY_MAX_IN_MEMORY_SIZE_B: int(max_in_memory_size * MEGABYTES_TO_BYTES),
1264
+ } | kwargs
1265
+ if not isinstance(keys, dict):
1266
+ keys = {keys: keys_default | {KEY_BIN_BY: bin_by, KEY_BIN_ON: bin_on}}
1267
+ if isinstance(next(iter(keys)), str):
1268
+ # Case filter is used.
1269
+ # Check 'filters' parameter is used.
1270
+ if filters is None:
1271
+ raise ValueError(
1272
+ "not possible to use filter syntax for `keys` parameter "
1273
+ "without providing `filters` parameter as well.",
1274
+ )
1275
+ else:
1276
+ # Check same filters id are both in 'keys' and 'filters'
1277
+ # parameters.
1278
+ if NO_FILTER_ID in filters:
1279
+ if filters[NO_FILTER_ID] is not None:
1280
+ raise ValueError(
1281
+ f"not possible to use '{NO_FILTER_ID}' as key in "
1282
+ "`filters` parameter with a value different than "
1283
+ "`None`.",
1284
+ )
1285
+ elif NO_FILTER_ID in keys:
1286
+ # If not in 'filters' but in 'keys', add it to 'filters'.
1287
+ filters[NO_FILTER_ID] = None
1288
+ filt_filt_ids = set(filters)
1289
+ filt_filt_ids.discard(NO_FILTER_ID)
1290
+ keys_filt_ids = set(keys)
1291
+ keys_filt_ids.discard(NO_FILTER_ID)
1292
+ if filt_filt_ids != keys_filt_ids:
1293
+ raise ValueError(
1294
+ "not possible to have different lists of filter ids"
1295
+ " between `keys` and `filters` parameters.\n"
1296
+ f" List of filter ids in `keys` parameter is {keys_filt_ids}.\n"
1297
+ f" List of filter ids in `filters` parameter is {filt_filt_ids}.",
1298
+ )
1299
+ else:
1300
+ # Case no filter is used.
1301
+ keys = {NO_FILTER_ID: keys}
1302
+ filters = {NO_FILTER_ID: None}
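+ # Recap of the two accepted shapes for 'keys' (hypothetical names):
+ #   - with filters:    keys={"filt_id": {key_a: cfg_a}} together with
+ #                      filters={"filt_id": [...]}; filter ids must
+ #                      match exactly between the two mappings.
+ #   - without filters: keys={key_a: cfg_a}, wrapped just above under
+ #                      NO_FILTER_ID, with filters={NO_FILTER_ID: None}.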
1303
+ _filter_apps = {}
1304
+ _all_keys = []
1305
+ _p_jobs = {KEY_MAX_P_JOBS: Parallel(n_jobs=KEY_MAX_P_JOBS, prefer="threads")}
1306
+ for filt_id in keys:
1307
+ # Set number of jobs.
1308
+ n_keys = len(keys[filt_id])
1309
+ n_jobs = min(KEY_MAX_P_JOBS, n_keys) if parallel else 1
1310
+ _filter_apps[filt_id] = FilterApp(list(keys[filt_id]), n_jobs)
1311
+ if n_jobs not in _p_jobs:
1312
+ # Configure parallel jobs.
1313
+ _p_jobs[n_jobs] = Parallel(n_jobs=n_jobs, prefer="threads")
1314
+ _all_keys.extend(keys[filt_id])
1315
+ # Check for duplicate keys between different filter ids.
1316
+ seen = set()
1317
+ dupes = [key for key in _all_keys if key in seen or seen.add(key)]
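+ # Note on the idiom above: 'seen.add(key)' returns None, so the 'or'
+ # clause only evaluates to True for keys already in 'seen', i.e.
+ # 'dupes' collects keys appearing under more than one filter id, in
+ # order of appearance.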
1318
+ if dupes:
1319
+ raise ValueError(f"not possible to have key(s) {dupes} used for different filter ids.")
1320
+ self.p_jobs = _p_jobs
1321
+ self.filter_apps = _filter_apps
1322
+ # Once filters have been managed, simplify 'keys' as a single level
1323
+ # dict.
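+ # For instance, ChainMap({key_a: cfg_a}, {key_b: cfg_b}) behaves like
+ # a single {key_a: cfg_a, key_b: cfg_b} mapping for the lookups below
+ # (keys were checked above to be unique across filter ids).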
1324
+ keys = ChainMap(*keys.values())
1325
+ (
1326
+ self.keys_config,
1327
+ self.agg_pd,
1328
+ ) = _init_keys_config(ordered_on, keys, keys_default)
1329
+ (
1330
+ restart_index,
1331
+ pre_buffer,
1332
+ self.agg_buffers,
1333
+ ) = _init_buffers(store, keys)
1334
+ self.seed_config = {
1335
+ KEY_ORDERED_ON: ordered_on,
1336
+ KEY_PRE: pre,
1337
+ KEY_PRE_BUFFER: pre_buffer,
1338
+ KEY_FILTERS: filters,
1339
+ KEY_RESTART_INDEX: restart_index,
1340
+ }
1341
+ # Cumsegagg-like agg definition.
1342
+ # Cannot be set yet, because seed dtype is required.
1343
+ # Is a dict, specifying for each key, its expected aggregation.
1344
+ self.agg_cs = {}
1345
+ # Store attribute.
1346
+ self.store = store
1347
+
1348
+ def _init_agg_cs(self, seed: Iterable[DataFrame]):
1349
+ """
1350
+ Initialize ``self.agg_cs``.
1351
+
1352
+ Because the dtypes of the seed DataFrame are required, the first seed
1353
+ chunk is generated from the Iterable. The seed Iterable is then
1354
+ repacked with the first item already in memory.
1355
+
1356
+ Parameters
1357
+ ----------
1358
+ seed : Iterable[DataFrame]
1359
+ Seed data, from which pandas DataFrame dtypes are retrieved.
1360
+
1361
+ Returns
1362
+ -------
1363
+ seed
1364
+ Seed Iterable, repacked with the first chunk already materialized.
1365
+
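+ Notes
+ -----
+ The repacking performed here is equivalent to this minimal,
+ self-contained sketch::
+
+     from itertools import chain
+
+     it = iter(seed)
+     first = next(it)             # materialize the first chunk
+     seed = chain([first], it)    # re-expose it ahead of the remainder
+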
1366
+ """
1367
+ remainder = iter(seed)
1368
+ first = next(remainder)
1369
+ # Recompose seed with 1st item materialized.
1370
+ seed = chain([first], remainder)
1371
+ seed_dtypes = first.dtypes.to_dict()
1372
+ for key in self.keys_config:
1373
+ try:
1374
+ self.agg_cs[key] = setup_cumsegagg(self.agg_pd[key], seed_dtypes)
1375
+ except Exception:
1376
+ raise ValueError(f"exception raised for key '{key}'")
1377
+ return seed
1378
+
1379
+ def agg(
1380
+ self,
1381
+ seed: DataFrame | Iterable[DataFrame] = None,
1382
+ trim_start: bool | None = False,
1383
+ discard_last: bool | None = False,
1384
+ final_write: bool | None = True,
1385
+ ):
1386
+ """
1387
+ Aggregate sequentially on successive chunks (stream) of ordered data.
1388
+
1389
+ This function conducts 'streamed aggregation' iteratively (out-of-core),
1390
+ with optional post-processing of aggregation results (by use of
1391
+ vectorized functions or for dataframe formatting).
1392
+
1393
+ Parameters
1394
+ ----------
1395
+ seed : DataFrame | Iterable[DataFrame]
1396
+ Seed data over which streamed aggregations are conducted.
1397
+ trim_start : bool, default True
1398
+ If ``True``, and if aggregated results already exist, then the last
1399
+ index present in seed data (recorded in metadata of existing
1400
+ aggregated results) is retrieved, and all seed data before this
1401
+ index is trimmed (the index itself is excluded from the trim, so it
1402
+ will be in new aggregation results). This trimming makes sense if
1403
+ the previous aggregation iteration was run with ``discard_last`` set ``True``.
1404
+ discard_last : bool, default True
1405
+ If ``True``, the last row group in seed data (sharing the same value in the
1406
+ `ordered_on` column) is removed from the aggregation step. See
1407
+ below notes.
1408
+ final_write : bool, default True
1409
+ If ``True``, after the last iteration of aggregation, aggregation
1410
+ results are written to disk. With this parameter, restarting
1411
+ aggregation with a new AggStream instance is possible.
1412
+ Also if ``True``, when an exception is raised during seed check,
1413
+ the last aggregation results from the last valid seed chunk are
1414
+ written to disk.
1415
+
1416
+ Notes
1417
+ -----
1418
+ - If aggregation results already exist in the oups ``Store`` instance,
1419
+ and `trim_start` is `True`, the last index from the previous
1420
+ aggregation is retrieved, and prior seed data is trimmed.
1421
+ - By default, aggregation is processed up to the last index excluded,
1422
+ and subsequent aggregation will start from this last index included,
1423
+ assumed to be that of an incomplete row group.
1424
+ If `discard_last` is set `False`, then aggregation is processed up
1425
+ to the last data.
1426
+ - By default, with parameter ``discard_last`` set ``True``, the last
1427
+ row group (composed of rows sharing the same value in the
1428
+ `ordered_on` column) is discarded.
1429
+
1430
+ - It may be, for instance, that this row group is not complete yet
1431
+ and should therefore not be accounted for. More precisely, new
1432
+ rows with the same value in `ordered_on` may appear in seed data
1433
+ later on. Because seed data is trimmed to start from the last
1434
+ processed value of the `ordered_on` column (value included),
1435
+ these new rows would be excluded from the next aggregation,
1436
+ leading to an inaccurate aggregation result. Discarding the last
1437
+ row group thus makes it easy to identify the restart point when
1438
+ re-starting the aggregation in cases where there can be duplicates
1439
+ in the `ordered_on` column. A ``sum`` aggregation will then return
1440
+ the correct result, for instance, as no data is accounted for twice.
1441
+ - Or if composed of a single row, this last row in seed data is
1442
+ temporary (and may get its final values only at a later time,
1443
+ when it becomes the one-but-last row, as a new row is added).
1444
+
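+ A minimal, hypothetical usage sketch (how the AggStream instance
+ 'agg_stream' is built and how seed chunks are produced are
+ assumptions; only the keyword names below are those documented
+ above)::
+
+     def seed_chunks():
+         # Yield pandas DataFrames, each ordered on the 'ordered_on'
+         # column (placeholder body: replace with real chunks).
+         yield from []
+
+     agg_stream.agg(
+         seed=seed_chunks(),
+         trim_start=True,
+         discard_last=True,
+         final_write=True,
+     )
+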
1445
+ """
1446
+ # TODO: add 'snap_by' parameter to 'agg()' to allow using list of
1447
+ # timestamps. 'cumsegagg()' is already compatible.
1448
+ # TODO: add a writing step once aggregation on a seed chunk is done
1449
+ # (keeping track of '_last_seed_index': as soon as it changes from
1450
+ # one iteration to the next, trigger the intermediate writing step)
1451
+ # Aggregation results to keep are listed through an additional
1452
+ # 'group_res' parameter, in the form:
1453
+ # {key_to_write_grouped_res_to: [key1, key2, key3],
1454
+ # ...}
1455
+ # The motivation is to be able to gather results for different filters
1456
+ # and 'bin_by' values, post-process them in a single 'post' function,
1457
+ # and write results to a single file.
1458
+ # This particularly makes sense if there is a single 'snap_by' value,
1459
+ # as snapshot results will be easily merged.
1460
+ # TODO: change default settings:
1461
+ # discard_last = trim_start = final_write = False
1462
+ if isinstance(seed, DataFrame):
1463
+ # Make the seed an iterable.
1464
+ seed = [seed]
1465
+ # Seed can be an empty list or None.
1466
+ if seed:
1467
+ if not self.agg_cs:
1468
+ # If first time an aggregation is made with this object,
1469
+ # initialize 'agg_cs'.
1470
+ seed = self._init_agg_cs(seed)
1471
+ seed_check_exception = False
1472
+ try:
1473
+ # TODO: '_pre_buffer' is modified in-place. It should not be
1474
+ # necessary to return it. It is within 'self.seed_config'.
1475
+ for _last_seed_index, _pre_buffer, filter_id, filtered_chunk in _iter_data(
1476
+ seed=seed,
1477
+ **self.seed_config,
1478
+ trim_start=trim_start,
1479
+ discard_last=discard_last,
1480
+ ):
1481
+ # Retrieve Parallel joblib setup.
1482
+ agg_loop_res = self.p_jobs[self.filter_apps[filter_id].n_jobs](
1483
+ delayed(agg_iter)(
1484
+ seed_chunk=filtered_chunk,
1485
+ store=self.store,
1486
+ key=key,
1487
+ keys_config=self.keys_config[key],
1488
+ agg_config=self.agg_cs[key],
1489
+ agg_buffers=self.agg_buffers[key],
1490
+ )
1491
+ for key in self.filter_apps[filter_id].keys
1492
+ )
1493
+ # Update aggregation buffers from the list of (key, agg_res) tuples.
1494
+ for key, agg_res in agg_loop_res:
1495
+ self.agg_buffers[key].update(agg_res)
1496
+ # Set 'seed_index_restart' to the 'last_seed_index' with
1497
+ # which restarting the next aggregation iteration.
1498
+ self.seed_config[KEY_RESTART_INDEX] = _last_seed_index
1499
+ # Also keep track of last 'pre_buffer' value.
1500
+ self.seed_config[KEY_PRE_BUFFER] = _pre_buffer
1501
+ except SeedPreException as sce:
1502
+ seed_check_exception = True
1503
+ exception_message = str(sce)
1504
+ if final_write:
1505
+ # Post-process & write results from last iteration, this time
1506
+ # keeping last aggregation row, and recording metadata for a
1507
+ # future 'AggStream.agg' execution.
1508
+ self.p_jobs[KEY_MAX_P_JOBS](
1509
+ delayed(_post_n_write_agg_chunks)(
1510
+ store=self.store,
1511
+ key=key,
1512
+ agg_buffers=agg_res,
1513
+ agg_res_type=self.keys_config[key][KEY_AGG_RES_TYPE],
1514
+ append_last_res=True,
1515
+ write_config=self.keys_config[key][KEY_WRITE_CONFIG],
1516
+ index_name=self.keys_config[key][KEY_BIN_ON_OUT],
1517
+ post=self.keys_config[key][KEY_POST],
1518
+ last_seed_index=self.seed_config[KEY_RESTART_INDEX],
1519
+ pre_buffer=self.seed_config[KEY_PRE_BUFFER],
1520
+ )
1521
+ for key, agg_res in self.agg_buffers.items()
1522
+ )
1523
+ if seed and seed_check_exception:
1524
+ raise SeedPreException(exception_message)