oups 2025.9.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0
oups/stateful_ops/aggstream/aggstream.py
@@ -0,0 +1,1524 @@
#!/usr/bin/env python3
"""
Created on Wed Nov 15 21:30:00 2023.

@author: pierrot

"""
from collections import ChainMap
from collections import namedtuple
from collections.abc import Callable
from collections.abc import Iterable
from enum import Enum
from inspect import Parameter
from inspect import signature
from itertools import chain
from multiprocessing import cpu_count
from typing import Any

from joblib import Parallel
from joblib import delayed
from numpy import ones
from pandas import DataFrame
from pandas import DatetimeIndex
from pandas import Series
from pandas import Timestamp
from pandas import concat
from pandas.core.resample import TimeGrouper

from oups.defines import KEY_DUPLICATES_ON
from oups.defines import KEY_ORDERED_ON
from oups.defines import KEY_ROW_GROUP_TARGET_SIZE
from oups.stateful_ops.aggstream.cumsegagg import cumsegagg
from oups.stateful_ops.aggstream.cumsegagg import setup_cumsegagg
from oups.stateful_ops.aggstream.jcumsegagg import FIRST
from oups.stateful_ops.aggstream.jcumsegagg import LAST
from oups.stateful_ops.aggstream.jcumsegagg import MAX
from oups.stateful_ops.aggstream.jcumsegagg import MIN
from oups.stateful_ops.aggstream.jcumsegagg import SUM
from oups.stateful_ops.aggstream.segmentby import KEY_BIN_BY
from oups.stateful_ops.aggstream.segmentby import KEY_BIN_ON
from oups.stateful_ops.aggstream.segmentby import KEY_SNAP_BY
from oups.stateful_ops.aggstream.segmentby import setup_segmentby
from oups.stateful_ops.aggstream.utils import dataframe_filter
from oups.store import OrderedParquetDataset
from oups.store import Store
from oups.store import write


# Aggregation functions.
ACCEPTED_AGG_FUNC = {FIRST, LAST, MIN, MAX, SUM}
# List of keys.
KEY_AGGSTREAM = "aggstream"
KEY_PRE = "pre"
KEY_PRE_BUFFER = "pre_buffer"
KEY_SEGAGG_BUFFER = "segagg_buffer"
KEY_POST_BUFFER = "post_buffer"
KEY_BIN_RES_BUFFER = "bin_res_buffer"
KEY_BIN_ON_OUT = "bin_on_out"
KEY_SNAP_RES_BUFFER = "snap_res_buffer"
KEY_FILTERS = "filters"
KEY_RESTART_INDEX = "restart_index"
KEY_BIN_RES = "bin_res"
KEY_SNAP_RES = "snap_res"
KEY_WRITE_CONFIG = "write_config"
KEY_AGG_IN_MEMORY_SIZE = "agg_in_memory_size"
KEY_MAX_IN_MEMORY_SIZE_B = "max_in_memory_size_b"
KEY_MAX_IN_MEMORY_SIZE_MB = "max_in_memory_size"
KEY_AGG_RES_TYPE = "agg_res_type"
KEY_SEG_CONFIG = "seg_config"
# Filters
NO_FILTER_ID = "_"
# List of valid parameters for 'key_conf_in'
KEY_AGG = "agg"
KEY_POST = "post"
# 'bin_by' is a compulsory parameter, and a specific check is made for it.
# It is not added in 'KEY_CONF_IN_PARAMS'.
WRITE_PARAMS = {
    name
    for name, p in signature(write).parameters.items()
    if p.kind in (Parameter.POSITIONAL_OR_KEYWORD, Parameter.KEYWORD_ONLY)
}
KEY_CONF_IN_PARAMS = {
    KEY_BIN_ON,
    KEY_SNAP_BY,
    KEY_AGG,
    KEY_POST,
    KEY_MAX_IN_MEMORY_SIZE_B,
} | WRITE_PARAMS
# Parallel jobs, at most using 75% of available cpus.
KEY_MAX_P_JOBS = max(int(cpu_count() * 3 / 4), 1)
# Max in memory size of result dataframes allowed before writing to disk.
# Provided in bytes.
MEGABYTES_TO_BYTES = 1048576
MAX_IN_MEMORY_SIZE_MB = 140
MAX_IN_MEMORY_SIZE_B = MAX_IN_MEMORY_SIZE_MB * MEGABYTES_TO_BYTES


FilterApp = namedtuple("FilterApp", "keys n_jobs")
AggResType = Enum("AggResType", ["BINS", "SNAPS", "BOTH"])

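As an aside (not part of the package source above): the `WRITE_PARAMS` set is not hard-coded, it harvests the keyword parameter names of `oups.store.write` through `inspect.signature`, so every writer option automatically becomes a valid per-key config entry. A minimal sketch of that introspection pattern, using a stand-in function since the real `write` signature is not shown in this diff:

```python
from inspect import Parameter, signature


def fake_write(df, ordered_on=None, duplicates_on=None, row_group_target_size=None):
    """Stand-in for oups.store.write, for illustration only."""


# Keep positional-or-keyword and keyword-only parameter names, as done for WRITE_PARAMS.
accepted = {
    name
    for name, p in signature(fake_write).parameters.items()
    if p.kind in (Parameter.POSITIONAL_OR_KEYWORD, Parameter.KEYWORD_ONLY)
}
# 'accepted' now contains 'df', 'ordered_on', 'duplicates_on' and 'row_group_target_size'.
```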
def _is_aggstream_result(handle: OrderedParquetDataset) -> bool:
    """
    Check whether the input dataset was produced by aggstream.

    Parameters
    ----------
    handle : OrderedParquetDataset
        Dataset handle to check.

    Returns
    -------
    bool
        True if parquet file contains metadata as produced by
        ``oups.aggstream``, which confirms this dataset has been produced with
        this latter function.

    """
    return KEY_AGGSTREAM in handle.key_value_metadata


def _init_keys_config(
    seed_ordered_on: str,
    keys_config: dict,
    keys_default: dict,
):
    """
    Consolidate keys' configuration into ``keys_config`` and ``agg_pd``.

    Parameters
    ----------
    seed_ordered_on : str
        Name of the column with respect to which seed is in ascending order.
        This parameter is used for seed segmentation. It is also used as
        default name of the column with respect to which aggregation results
        are in ascending order, if not provided in ``keys`` parameter.
    keys_config : dict
        Unconsolidated keys config.
    keys_default : dict
        Default values for missing parameters in ``keys_config``.

    Other Parameters
    ----------------
    kwargs : dict
        Other user parameters that will be set into ``keys_config``.

    Returns
    -------
    The following AggStream's parameters are initialized with this function.

    - ``keys_config``, dict of keys' config in the form:
      ``{key: {'bin_on_out' : str, name in aggregation results for column
                with bin ids.
               'seg_config' : dict specifying the segmentation config,
               'post' : Callable or None,
               'max_in_memory_size_b': int, max allowed result in memory size,
                in bytes
               'write_config' : {'ordered_on' : str,
                                 'duplicates_on' : str or list,
                                 'max_row_group_size' : str | int | tuple
                                 ...
                                },
               'agg_res_type' : AggResType, either 'BINS', 'SNAPS', or 'BOTH'.
              },
       }``
    - ``self.agg_pd``, dict, specifying per key the aggregation
      configuration.

    """
    consolidated_keys_config = {}
    agg_pd = {}
    for key, key_conf_in in keys_config.items():
        # Parameters in 'key_conf_in' take precedence over those in
        # 'keys_default'. Additionally, with this step, 'key_conf_in' is a
        # deep copy, and when parameters are popped, it does not affect
        # the initial 'key_conf_in'.
        try:
            bin_by = key_conf_in.pop(KEY_BIN_BY)
        except KeyError:
            raise ValueError(f"'{KEY_BIN_BY}' parameter is missing for key '{key}'.")
        if KEY_MAX_IN_MEMORY_SIZE_MB in key_conf_in:
            # Switch from MB to B.
            key_conf_in[KEY_MAX_IN_MEMORY_SIZE_B] = int(
                key_conf_in.pop(KEY_MAX_IN_MEMORY_SIZE_MB) * MEGABYTES_TO_BYTES,
            )
        key_conf_in = keys_default | key_conf_in
        # Check parameters in 'key_conf_in' are valid ones.
        for param in key_conf_in:
            if param not in KEY_CONF_IN_PARAMS:
                raise ValueError(
                    f"'{param}' not a valid parameter in '{key}' aggregation config.",
                )
        bin_on = key_conf_in.pop(KEY_BIN_ON, None)
        agg_pd[key] = key_conf_in.pop(KEY_AGG)

        if isinstance(bin_on, tuple):
            # 'bin_on_out' is name of column containing group keys in
            # 'agg_res'. Setting of 'bin_on_out' is an 'AggStream'
            # task, not a 'cumsegagg' one. This is because this
            # parameter clarifies then how to set 'duplicates_on'
            # parameter for 'oups.writer.write' which is also part of
            # 'AggStream' perimeter.
            bin_on, bin_on_out = bin_on
        else:
            bin_on_out = None
        # Setup 'seg_conf', 'bin_on_out' & 'agg_pd'.
        try:
            seg_config = setup_segmentby(
                bin_by=bin_by,
                bin_on=bin_on,
                ordered_on=seed_ordered_on,
                snap_by=key_conf_in.pop(KEY_SNAP_BY),
            )
        except Exception:
            raise ValueError(f"exception raised for key '{key}'")
        if bin_on := seg_config[KEY_BIN_ON]:
            if bin_on_out is None:
                # It may be that 'bin_on' value has been modified in
                # 'setup_segmentby'. If 'bin_on_out' has not been set
                # previously, then set it to this possibly new value of
                # 'bin_on'.
                bin_on_out = bin_on
        # 'agg' is in the form:
        # {"output_col":("input_col", "agg_function_name")}
        if bin_on_out in agg_pd[key]:
            # Check that this name is not already that of an output
            # column from aggregation.
            raise ValueError(
                f"not possible to have {bin_on_out} as column name in"
                " aggregated results as it is also for column"
                " containing group keys.",
            )
        # Initialize 'write_config', which are parameters remaining in
        # 'key_conf_in' and some adjustments.
        # Adding 'bin_on_out' to 'duplicates_on' except if
        # 'duplicates_on' is set already. In this case, if 'bin_on_out'
        # is not in 'duplicates_on', it is understood as a voluntary
        # user choice. For all other cases, 'duplicates_on' has been
        # set by user. Setting 'duplicates_on' is the true reason of
        # having 'bin_on_out'. It allows the user to inform 'AggStream'
        # that the binning column (with unique keys) is this one.
        if KEY_DUPLICATES_ON not in key_conf_in or key_conf_in[KEY_DUPLICATES_ON] is None:
            # Force 'bin_on_out', else reuse 'ordered_on' parameter
            # specific to keys (aggregation results).
            key_conf_in[KEY_DUPLICATES_ON] = bin_on_out if bin_on_out else key_conf_in[KEY_ORDERED_ON]
            # key_conf_in[KEY_DUPLICATES_ON] = key_conf_in[KEY_ORDERED_ON]
        if seg_config[KEY_SNAP_BY] is None:
            # Snapshots not requested, aggregation results are necessarily
            # bins.
            agg_res_type = AggResType.BINS
        elif isinstance(key, tuple):
            # 2 keys are provided, aggregation results are necessarily both
            # bins and snapshots.
            agg_res_type = AggResType.BOTH
        else:
            # Otherwise, a single aggregation result is expected, and it is
            # created from both bins and snapshots. Hence it is snaps like.
            agg_res_type = AggResType.SNAPS
        if agg_res_type is AggResType.BOTH:
            if KEY_ROW_GROUP_TARGET_SIZE in key_conf_in:
                if not isinstance(key_conf_in[KEY_ROW_GROUP_TARGET_SIZE], tuple):
                    key_conf_in[KEY_ROW_GROUP_TARGET_SIZE] = (
                        key_conf_in[KEY_ROW_GROUP_TARGET_SIZE],
                        key_conf_in[KEY_ROW_GROUP_TARGET_SIZE],
                    )
            else:
                key_conf_in[KEY_ROW_GROUP_TARGET_SIZE] = (None, None)
        consolidated_keys_config[key] = {
            KEY_SEG_CONFIG: seg_config,
            KEY_BIN_ON_OUT: bin_on_out,
            KEY_MAX_IN_MEMORY_SIZE_B: key_conf_in.pop(KEY_MAX_IN_MEMORY_SIZE_B),
            KEY_POST: key_conf_in.pop(KEY_POST),
            KEY_WRITE_CONFIG: key_conf_in,
            KEY_AGG_RES_TYPE: agg_res_type,
        }
    return consolidated_keys_config, agg_pd

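For a concrete picture of what `_init_keys_config` consumes (not part of the package source), here is a hypothetical per-key input in the `{key: {'bin_by': ..., 'agg': ..., ...}}` layout described in the docstring above; the key name and the 'ts'/'price' columns are made up, and the remaining entries such as 'ordered_on' end up in the per-key write config:

```python
from pandas import Grouper

# Hypothetical single-key configuration: 5-minute bins on a 'ts' column,
# OHLC aggregation of a 'price' column.
keys_config_in = {
    "ohlcv_5min": {
        "bin_by": Grouper(key="ts", freq="5min", closed="left", label="left"),
        "agg": {
            "open": ("price", "first"),
            "high": ("price", "max"),
            "low": ("price", "min"),
            "close": ("price", "last"),
        },
        # Entries not popped above are kept as the key's 'write_config'.
        "ordered_on": "ts",
    },
}
```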
def _init_buffers(
    store: Store,
    keys: dict,
):
    """
    Initialize pre, aggregation and post buffers from existing results.

    Also set ``seed_index_restart``.

    Parameters
    ----------
    store : Store
        Store in which aggregation results may already exist, and from which
        previous buffer data is retrieved.
    keys : Any | dict
        Single level dict as defined in ``__init__`` function.

    Returns
    -------
    The following AggStream's parameters are initialized in this function.
    - ``seed_index_restart``, int, float or Timestamp, the index
      from which (included) should be restarted the next aggregation
      iteration.
    - ``pre_buffer``, dict, user-defined buffer to keep track of intermediate
      variables between successive pre-processing of individual seed chunk.
    - ``agg_buffers``, dict of aggregation buffer variables specific for each
      key, in the form:
      ``{key: {'agg_in_memory_size' : 0,
               'bin_res' : None,
               'snap_res' : None,
               'bin_res_buffer' : list,
               'snap_res_buffer' : list,
               'segagg_buffer' : dict, possibly empty,
               'post_buffer' : dict, possibly empty,
              },
       }``

    """
    pre_buffer = {}
    agg_buffers = {}
    seed_index_restart_set = set()
    for key in keys:
        # Default values for aggregation counters and buffers.
        # 'agg_in_memory_size' : in-memory size, in bytes, of aggregation
        # results.
        # 'agg_res_buffer' and 'bin_res_buffer' are buffers to keep
        # aggregation chunks before a concatenation to record. Because
        # they are appended in-place for each key, they are created
        # separately for each key.
        # Because 'segagg_buffer' and 'post_buffer' are modified
        # in-place for each key, they are created separately for
        # each key.
        agg_buffers[key] = _reset_agg_buffers()
        # Process metadata if already existing aggregation results.
        # If 'key' is a tuple of 'bin_key' and 'snap_key', keep 'bin_key' as
        # the main key to check existing results in store.
        main_key = key[0] if isinstance(key, tuple) else key
        if main_key in store:
            # Prior AggStream results already in store.
            # Retrieve corresponding metadata to re-start aggregations.
            prev_agg_res = store[main_key]
            if not _is_aggstream_result(prev_agg_res):
                raise ValueError(
                    f"provided '{main_key}' data is not an AggStream result.",
                )
            aggstream_md = prev_agg_res.key_value_metadata[KEY_AGGSTREAM]
            # - 'last_seed_index' to trim accordingly head of seed data.
            # - metadata related to pre-processing of individual seed chunk.
            # - metadata related to binning process from past binnings
            #   on prior data. It is used in case 'bin_by' is a callable.
            #   If not used, it is an empty dict.
            # - metadata related to post-processing of prior
            #   aggregation results, to be used by 'post'. If not used,
            #   it is an empty dict.
            seed_index_restart_set.add(aggstream_md[KEY_RESTART_INDEX])
            if KEY_PRE_BUFFER in aggstream_md:
                pre_buffer = aggstream_md[KEY_PRE_BUFFER]
            agg_buffers[key][KEY_SEGAGG_BUFFER] = (
                aggstream_md[KEY_SEGAGG_BUFFER] if aggstream_md[KEY_SEGAGG_BUFFER] else {}
            )
            agg_buffers[key][KEY_POST_BUFFER] = (
                aggstream_md[KEY_POST_BUFFER] if aggstream_md[KEY_POST_BUFFER] else {}
            )
        else:
            agg_buffers[key][KEY_SEGAGG_BUFFER] = {}
            agg_buffers[key][KEY_POST_BUFFER] = {}

    if len(seed_index_restart_set) > 1:
        raise ValueError(
            "not possible to aggregate on multiple keys with existing "
            "aggregation results not aggregated up to the same seed index.",
        )
    return (
        None if not seed_index_restart_set else seed_index_restart_set.pop(),
        pre_buffer,
        agg_buffers,
    )


def _reset_agg_buffers(agg_buffers: dict | None = None) -> dict | None:
    """
    Reset aggregation buffers and counters.

    Either modify in-place, or return a new dict.

    Parameters
    ----------
    agg_buffers : dict | None, default None
        Buffer to keep track of aggregation sequence intermediate results.

        - n_rows : int, number of rows in main aggregation results (snapshots
          if snapshots are requested, or bins otherwise). It is reset here
          after writing.
        - bin_res : DataFrame, last aggregation results (bins), to reset to
          None after writing.
        - snap_res : DataFrame, last aggregation results (snapshots), to reset
          to None after writing.
        - bin_res_buffer : list[DataFrame], list of bins resulting from
          aggregation (pandas DataFrame).
        - snap_res_buffer : list[pandas.DataFrame], list of snapshots resulting
          from aggregation (pandas dataframes), when snapshots are requested.
        - post_buffer : dict, buffer to keep track of data that can be
          processed during previous iterations. This pointer should not be
          re-initialized in 'post' or data from previous iterations will be
          lost. This dict has to contain data that can be serialized, as data
          is then kept in parquet file metadata.
        - segagg_buffer : dict, parameters from segmentation and aggregation
          process, that are required when restarting the aggregation with new
          seed data. (for recording in metadata of aggregation results)

    Returns
    -------
    dict
        A dict with initialized values for ``agg_buffers``.

    """
    init_values = {
        KEY_AGG_IN_MEMORY_SIZE: 0,
        KEY_BIN_RES: None,
        KEY_SNAP_RES: None,
        KEY_BIN_RES_BUFFER: [],
        KEY_SNAP_RES_BUFFER: [],
    }
    if agg_buffers is None:
        return init_values
    else:
        agg_buffers |= init_values

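`_reset_agg_buffers` is deliberately dual-use: with no argument it builds a fresh buffer dict, with an argument it resets the caller's dict in place through the `|=` merge so existing references stay valid. A generic sketch of that pattern (plain illustrative keys, not the oups buffer keys, and not part of the package source):

```python
def reset(buffers: dict | None = None) -> dict | None:
    """Return fresh defaults, or reset 'buffers' in place (returning None)."""
    defaults = {"size": 0, "last": None, "chunks": []}
    if buffers is None:
        return defaults
    # In-place merge: every holder of a reference to 'buffers' sees the reset.
    buffers |= defaults


state = reset()        # fresh dict of defaults
state["size"] = 42
reset(state)           # same object, counters back to their defaults
assert state["size"] == 0
```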
class SeedPreException(Exception):
    """
    Exception related to user-defined checks on seed chunk.
    """

    def __init__(self, message: str | None = None):
        """
        Exception message.
        """
        if message is None:
            self.message = "failing user-defined checks."
        else:
            self.message = message


def _iter_data(
    seed: Iterable[DataFrame],
    ordered_on: str,
    restart_index: float | Timestamp | None,
    pre: Callable | None,
    pre_buffer: dict,
    filters: dict | None,
    trim_start: bool,
    discard_last: bool,
):
    """
    Iterate provided seed, applying sequentially (optionally) filters.

    Seed has to be monotonically increasing on 'ordered_on' column. If it is
    not, a ``SeedPreException`` is raised.

    Parameters
    ----------
    seed : Iterable[DataFrame]
        Iterable of pandas Dataframe.
    ordered_on : str
        Name of column with respect to which seed data is in ascending
        order.
    restart_index : int, float, Timestamp or None
        Index (excluded) in `ordered_on` column before which rows in seed
        will be trimmed.
    pre : Callable or None
        User-defined Callable to proceed with checks over each item of the
        seed Iterable, accepting 2 parameters:

        - An ``on`` parameter, a pandas dataframe, the current seed item
          (before any filter is applied).
        - A ``buffer`` parameter, a dict that can be used as a buffer
          for storing temporary results from one chunk processing to
          the next. Its initial value is that provided by `pre_buffer`.

        In-place modifications of seed dataframe have to be carried out here.
    pre_buffer : dict
        Buffer to keep track of intermediate data that can be required for
        proceeding with pre of individual seed item.
    filters : dict or None
        Dict in the form
        ``{"filter_id":[[("col", op, val), ...], ...]}``
        To filter out data from seed.
        Filter syntax: [[(column, op, val), ...],...]
        where op is [==, =, >, >=, <, <=, !=, in, not in]
        The innermost tuples are transposed into a set of filters applied
        through an `AND` operation.
        The outer list combines these sets of filters through an `OR`
        operation.
        A single list of tuples can also be used, meaning that no `OR`
        operation between set of filters is to be conducted.
    trim_start : bool
        Flag to indicate if seed head has to be trimmed till value of
        'restart_index' (last seed index of previous aggregation sequence).
    discard_last : bool
        If ``True``, last row group in seed data (sharing the same value in
        `ordered_on` column) is removed from the aggregation step.

    Returns
    -------
    last_seed_index, pre_buffer, filter_id, filtered_chunk
    - 'last_seed_index', int | float | Timestamp, the last seed
      index value (likely of an incomplete group), of the current seed
      chunk, before filters are applied.
    - 'pre_buffer' : dict, buffer to keep track of intermediate data that
      can be required for proceeding with preprocessing of individual seed
      chunk.
    - 'filter_id', str, indicating which set of filters has been
      applied for the seed chunk provided.
    - 'filtered_chunk', DataFrame, from the seed Iterable, with
      optionally filters applied.

    Notes
    -----
    Checks are applied after having trimmed seed head (if ``trim_start``
    is True) and discarded last row group (if ``discard_last`` is True).

    Reasons to discard last seed row (or row group) may be twofold:
    - last row is temporary (yet to get some final values, for instance
      if seed data is some kind of aggregation stream itself),
    - last rows are part of a single row group (same index value in
      'ordered_on') not yet complete itself (new rows part of this row group
      to be expected).

    """
    if restart_index is None:
        # No aggregation result existing yet. Whatever 'trim_start' value, no
        # trimming is possible.
        trim_start = False
    seed_remainder = None
    for seed_chunk in seed:
        # Check seed chunk is ordered on 'ordered_on'.
        # This check is made because for 'trim_start' and 'discard_last',
        # this ordering is required.
        if not seed_chunk[ordered_on].is_monotonic_increasing:
            # Currently uneasy to silently modify seed data without knowing
            # if it makes sense, so leaving this row commented.
            # seed_chunk.sort_values(by=ordered_on, inplace=True)
            # Instead, raise an exception.
            raise SeedPreException("seed data is not in ascending order.")
        # Step 1 / Seed pre-processing by user.
        if pre:
            # Apply user checks.
            try:
                pre(on=seed_chunk, buffer=pre_buffer)
            except Exception as e:
                # Stop iteration in case of failing pre.
                # Aggregation has been run up to the last valid chunk.
                raise SeedPreException(str(e))
        # Step 2 / If a previous remainder, concatenate it to give current
        # DataFrame its 'final' length.
        if not (seed_remainder is None or seed_remainder.empty):
            seed_chunk = concat([seed_remainder, seed_chunk], ignore_index=True)
        # Step 3 / Prepare filter to trim seed head and tail if requested.
        if trim_start:
            if seed_chunk.loc[:, ordered_on].iloc[-1] < restart_index:
                # This full chunk is to be discarded. Go to the next.
                continue
            else:
                filter_array = seed_chunk[ordered_on] >= restart_index
                # Once it has been applied once, no need to check for it
                # again on subsequent chunks.
                trim_start = False
        else:
            filter_array = ones(len(seed_chunk), dtype=bool)
        # 'ordered_on' being necessarily in ascending order, last index
        # value is its max value.
        last_seed_index = seed_chunk.loc[:, ordered_on].iloc[-1]
        if discard_last:
            filter_main_chunk = seed_chunk.loc[:, ordered_on] < last_seed_index
            seed_remainder = seed_chunk.loc[~filter_main_chunk]
            filter_array &= filter_main_chunk
        # Step 4 / Filter seed and yield.
        for filt_id, filters_ in filters.items():
            # Filter.
            filter_array_loc = (
                dataframe_filter(seed_chunk, filters_) & filter_array
                if filt_id != NO_FILTER_ID
                else filter_array.copy()
            )
            if not filter_array_loc.any():
                # DataFrame will be empty after filtering.
                # Proceed with next iteration.
                continue
            elif filter_array_loc.all():
                # If filter only contains 1, simply return full seed chunk.
                yield last_seed_index, pre_buffer, filt_id, seed_chunk
            else:
                # Otherwise, filter.
                yield last_seed_index, pre_buffer, filt_id, seed_chunk.loc[filter_array_loc].reset_index(
                    drop=True,
                )

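To illustrate the filter syntax documented in `_iter_data` above (not part of the package source), a hypothetical `filters` dict with made-up column names 'region' and 'qty'; the entry under `NO_FILTER_ID` (`"_"`) is never passed to `dataframe_filter`, so its value is only a placeholder here:

```python
filters = {
    # Unfiltered stream: the value is ignored by _iter_data for this id.
    "_": None,
    # Keep rows where (region == "EU" AND qty >= 100) OR (qty >= 1000).
    "eu_or_large": [
        [("region", "==", "EU"), ("qty", ">=", 100)],
        [("qty", ">=", 1000)],
    ],
}
```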
def _concat_agg_res(
    agg_res_buffers: list[DataFrame],
    agg_res: DataFrame,
    append_last_res: bool,
    index_name: str,
):
    """
    Concat aggregation results with / without last row.

    Parameters
    ----------
    agg_res_buffers : list[DataFrame]
        List of aggregation results to concatenate.
    agg_res : DataFrame
        Last aggregation results (all rows from last iteration).
    append_last_res : bool
        If 'agg_res' should be appended to 'agg_res_buffer' and if 'bin_res'
        should be appended to 'bin_res_buffers'.
    index_name : str, default None
        If a string, index name of dataframe resulting from aggregation with
        this value, which will be enforced in written results.

    Returns
    -------
    DataFrame
        List of aggregation results concatenated in a single DataFrame.

    """
    agg_res_list = [*agg_res_buffers, agg_res] if append_last_res else agg_res_buffers
    # Make a copy when a single item, to not propagate the 'reset_index'
    # to original 'agg_res'.
    agg_res = concat(agg_res_list) if len(agg_res_list) > 1 else agg_res_list[0].copy(deep=False)
    if index_name:
        # In case 'by' is a callable, index may have no name, but user may have
        # defined one with 'bin_on' parameter.
        agg_res.index.name = index_name
    # Keep group keys as a column before post-processing.
    agg_res.reset_index(inplace=True)
    return agg_res


def _post_n_write_agg_chunks(
    agg_buffers: dict,
    agg_res_type: Enum,
    append_last_res: bool,
    store: Store,
    key: Any | tuple[Any, Any],
    write_config: dict,
    index_name: str | None = None,
    post: Callable | None = None,
    last_seed_index: float | Timestamp | None = None,
    pre_buffer: dict | None = None,
):
    """
    Write list of aggregation row groups with optional post.

    Buffer variables 'agg_res_buffer', 'bin_res_buffer' are then reset.

    Parameters
    ----------
    agg_buffers : dict
        Buffer to keep track of aggregation sequence intermediate results.

        - agg_in_memory_size : int, size in bytes of aggregation results (bins
          only, or bins and snapshots if snapshots are requested). It is reset
          here after writing.
        - bin_res : DataFrame, last aggregation results, to reset to None
          after writing.
        - snap_res : DataFrame, last aggregation results, to reset to None
          after writing.
        - bin_res_buffer : list[DataFrame], list of bins resulting from
          aggregation (pandas DataFrame).
          It contains 'bin_res' (last aggregation results), but without last
          row. It is flushed here after writing.
        - snap_res_buffer : list[pandas.DataFrame], list of snapshots resulting
          from aggregation (pandas dataframes), when snapshots are requested.
          It contains 'snap_res' (last aggregation results), but without last
          row. It is flushed here after writing.
        - post_buffer : dict, buffer to keep track of data that can be
          processed during previous iterations. This pointer should not be
          re-initialized in 'post' or data from previous iterations will be
          lost. This dict has to contain data that can be serialized, as data
          is then kept in parquet file metadata.
          It is NOT reset after writing. It is however required to be
          written in metadata.
        - segagg_buffer : dict, parameters from segmentation and aggregation
          process, that are required when restarting the aggregation with new
          seed data. (for recording in metadata of aggregation results)
          It is NOT reset after writing. It is however required to be
          written in metadata.

    agg_res_type : Enum
        Either 'BINS', 'SNAPS', or 'BOTH'.
    append_last_res : bool
        If 'agg_res' should be appended to 'agg_res_buffer' and if 'bin_res'
        should be appended to 'bin_res_buffers'.
    store : Store
        Store to which aggregation results are recorded.
    key : Any | tuple[Any, Any]
        Key for retrieving corresponding metadata.
        If a tuple of 2 dataclasses, the first is key for bins, the second is
        key for snapshots.
    write_config : dict
        Settings forwarded to ``oups.writer.write`` when writing aggregation
        results to store. Compulsory parameter defining at least `ordered_on`
        and `duplicates_on` columns.
    index_name : str, default None
        If a string, index name of dataframe resulting from aggregation with
        this value, which will be enforced in written results.
    post : Callable, default None
        User-defined function accepting 3 parameters.

        - ``buffer``, a dict to be used as data buffer, that can be necessary
          for some user-defined post-processing requiring data assessed in
          previous post-processing iteration.
        - ``bin_res``, a pandas dataframe resulting from the aggregations
          defined by ``agg`` parameter, with first row already corrected
          with last row of previous streamed aggregation.
          These are aggregation results for bins.
        - ``snap_res`` (optional), a pandas dataframe resulting from the
          aggregations defined by ``agg`` parameter that contains snapshots.

        It has then to return a pandas dataframe that will be recorded.
        This optional post-processing is intended for use of vectorized
        functions (not mixing rows together, but operating on one or several
        columns), or dataframe formatting before results are finally recorded.

    last_seed_index : int | float | Timestamp | None, default None
        Last index in seed data. Can be numeric type, timestamp... (for
        recording in metadata of aggregation results)
        Writing metadata is triggered ONLY if ``last_seed_index`` is provided.
    pre_buffer : dict or None
        Buffer to keep track of intermediate data that can be required for
        proceeding with preprocessing of individual seed chunk.

    """
    post_buffer = agg_buffers[KEY_POST_BUFFER]
    # When there is no result, 'agg_res' is None.
    if isinstance((bin_res := agg_buffers[KEY_BIN_RES]), DataFrame):
        # To keep track there has been res in the 1st place.
        initial_agg_res = True
        # Concat list of aggregation results.
        bin_res = _concat_agg_res(
            agg_buffers[KEY_BIN_RES_BUFFER],
            bin_res,
            append_last_res,
            index_name,
        )
        # Same if needed with 'snap_res_buffer'.
        if isinstance((snap_res := agg_buffers[KEY_SNAP_RES]), DataFrame):
            snap_res = _concat_agg_res(
                agg_buffers[KEY_SNAP_RES_BUFFER],
                snap_res,
                append_last_res,
                index_name,
            )
        if post:
            # Post processing if any.
            # 'post_buffer' has to be modified in-place.
            # It is possible 'main_res' is None, if 'post' needs a minimal
            # number of rows before outputting results (warm-up).
            main_res = (
                post(buffer=post_buffer, bin_res=bin_res)
                if agg_res_type is AggResType.BINS
                else post(buffer=post_buffer, bin_res=bin_res, snap_res=snap_res)
            )
            if agg_res_type is AggResType.BOTH:
                # First result, recorded with 'bin_key', is considered main
                # result.
                try:
                    main_res, snap_res = main_res
                except ValueError:
                    raise ValueError(
                        f"not possible to have key '{key[0]}' for bins and "
                        f"key '{key[1]}' for snapshots but 'post()' function "
                        "only returning one result.",
                    )
                # Set to None 'bin_res' and 'snap_res' to catch possible
                # mistake in 'key' parameter (finally commented out).
                # snap_res = None
                # bin_res = None
        elif agg_res_type is not AggResType.SNAPS:
            # Case only 'bin_res' is recorded or both 'bin_res' and 'snap_res'.
            # main_res, bin_res = bin_res, None
            main_res = bin_res
        else:
            # Case only 'snap_res' is recorded, and not 'bin_res'.
            # main_res, bin_res, snap_res = snap_res, None, None
            main_res = snap_res
    else:
        initial_agg_res = False
        main_res = None
    main_key, snap_key = key if isinstance(key, tuple) else (key, None)
    if last_seed_index:
        # If 'last_seed_index', set oups metadata.
        # It is possible there is no result yet to write for different reasons:
        # - new seed data has been streamed and needs to be taken into account,
        #   but there is no result for this key, because all related seed data
        #   has been filtered out.
        # - or maybe 'post' has a warm-up period and has not released results
        #   yet.
        # But 'last_seed_index' has to be recorded, and so do possibly
        # 'pre_buffer', 'segagg_buffer' and 'post_buffer'.
        # Oups metadata only gets written for 'main_key'.
        # When 'key' is a tuple, 'main_key' is the 1st key.
        write_config["key_value_metadata"] = {
            KEY_AGGSTREAM: {
                KEY_RESTART_INDEX: last_seed_index,
                KEY_PRE_BUFFER: pre_buffer,
                KEY_SEGAGG_BUFFER: agg_buffers[KEY_SEGAGG_BUFFER],
                KEY_POST_BUFFER: post_buffer,
            },
        }
    # When there is no result, 'main_res' is None.
    # If no result, metadata is possibly to be written. This is indicated by
    # 'last_seed_index', which informs about the last 'aggstream' local
    # iteration.
    if isinstance(main_res, DataFrame) or last_seed_index:
        if agg_res_type is AggResType.BOTH:
            store[main_key].write(
                **(write_config | {KEY_ROW_GROUP_TARGET_SIZE: write_config[KEY_ROW_GROUP_TARGET_SIZE][0]}),
                df=main_res,
            )
            store[snap_key].write(
                **(
                    write_config
                    | {
                        KEY_ROW_GROUP_TARGET_SIZE: write_config[KEY_ROW_GROUP_TARGET_SIZE][1],
                        "key_value_metadata": None,
                    }
                ),
                df=snap_res,
            )
        else:
            store[main_key].write(**write_config, df=main_res)
    if initial_agg_res:
        # If there have been results, they have been processed (either written
        # directly or through 'post()'). Time to reset aggregation buffers and
        # counters.
        _reset_agg_buffers(agg_buffers)
    return

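A minimal sketch of a `post` callable matching the signature `_post_n_write_agg_chunks` describes above, for a key producing a single result (not part of the package source); the 'high'/'low' columns are assumed outputs of the user's `agg` config, not something AggStream guarantees:

```python
from pandas import DataFrame


def my_post(buffer: dict, bin_res: DataFrame, snap_res: DataFrame | None = None) -> DataFrame:
    """Illustrative post-processing: derive one column, keep a small state."""
    res = snap_res if snap_res is not None else bin_res
    out = res.copy(deep=False)
    if {"high", "low"}.issubset(out.columns):
        out["range"] = out["high"] - out["low"]
    # 'buffer' is persisted in parquet metadata between iterations:
    # store only small, serializable values, and never rebind it.
    buffer["n_rows_last_iter"] = len(out)
    return out
```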
def agg_iter(
    seed_chunk: DataFrame,
    store: Store,
    key: Any,
    keys_config: dict,
    agg_config: dict,
    agg_buffers: dict,
):
    """
    Post-process and write iter. n-1, segment and aggregate iter. n.

    Parameters
    ----------
    seed_chunk : DataFrame
        Chunk of seed data.
    store : Store
        Store to which aggregation results are recorded.
    key : Any | tuple[Any, Any]
        Key for recording aggregation results.
    keys_config : dict
        Settings related to 'key' for conducting post-processing, writing and
        segmentation.
    agg_config : dict
        Settings related to 'key' for conducting aggregation.
    agg_buffers : dict
        Buffer to keep track of aggregation sequence intermediate results.

    Returns
    -------
    key, updated_agg_buffers
    - ``key``, key to which changed parameters are related.
    - ``updated_agg_buffers``, dict with modified parameters.

    """
    # Post process and write.
    if not ((bin_res := agg_buffers[KEY_BIN_RES]) is None or bin_res.empty):
        # If previous results, check if this is write time.
        bin_res_buffer = agg_buffers[KEY_BIN_RES_BUFFER]
        # Add 'agg_res' to 'agg_res_buffer' ignoring last row.
        # It is incomplete, so useless to write it to results while
        # aggregation iterations are on-going.
        bin_res_buffer.append(bin_res.iloc[:-1])
        agg_in_memory_size = agg_buffers[KEY_AGG_IN_MEMORY_SIZE]
        if (snap_res := agg_buffers[KEY_SNAP_RES]) is None:
            agg_buffers[KEY_AGG_IN_MEMORY_SIZE] += bin_res.memory_usage().sum()
        else:
            # If we have bins & snapshots, do same with snapshots.
            agg_buffers[KEY_SNAP_RES_BUFFER].append(snap_res.iloc[:-1])
            agg_buffers[KEY_AGG_IN_MEMORY_SIZE] += (
                bin_res.memory_usage().sum() + snap_res.memory_usage().sum()
            )
        # Length of 'bin_res_buffer' is number of times it has been
        # appended. Be it from bins, or snapshots, length is same.
        # Keep floor part.
        agg_mean_in_memory_group_size = agg_in_memory_size // len(bin_res_buffer)
        if agg_in_memory_size + agg_mean_in_memory_group_size > keys_config[KEY_MAX_IN_MEMORY_SIZE_B]:
            # For next iteration, chances are that 'agg_in_memory_size' will be
            # larger than threshold. Time to write results from previous
            # iteration.
            _post_n_write_agg_chunks(
                agg_buffers=agg_buffers,
                agg_res_type=keys_config[KEY_AGG_RES_TYPE],
                append_last_res=False,
                store=store,
                key=key,
                write_config=keys_config[KEY_WRITE_CONFIG],
                index_name=keys_config[KEY_BIN_ON_OUT],
                post=keys_config[KEY_POST],
            )
    # Segment and aggregate. Group keys become the index.
    agg_res = cumsegagg(
        data=seed_chunk,
        agg=agg_config,
        bin_by=keys_config[KEY_SEG_CONFIG],
        buffer=agg_buffers[KEY_SEGAGG_BUFFER],
    )
    # 'agg_res' is 'main' aggregation results onto which are assessed
    # 'everything'. If only 'bins' are requested, it gathers bins.
    # If 'bins' and 'snapshots' are requested, it gathers snapshots.
    agg_buffers[KEY_BIN_RES], agg_buffers[KEY_SNAP_RES] = (
        agg_res if isinstance(agg_res, tuple) else (agg_res, None)
    )
    return key, agg_buffers

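The write trigger in `agg_iter` flushes when the buffered results, grown by one more average-sized chunk, would exceed the per-key threshold. A small numeric sketch of that heuristic (not part of the package source), using the module default of 140 MB and made-up chunk sizes:

```python
MEGABYTES_TO_BYTES = 1048576
max_in_memory_size_b = 140 * MEGABYTES_TO_BYTES            # default per-key threshold

agg_in_memory_size = 133 * MEGABYTES_TO_BYTES              # ~133 MB buffered so far
n_buffered_chunks = 7
mean_chunk_size = agg_in_memory_size // n_buffered_chunks  # ~19 MB per chunk

# 133 MB + 19 MB = 152 MB > 140 MB: write now rather than overshoot next iteration.
write_now = agg_in_memory_size + mean_chunk_size > max_in_memory_size_b
assert write_now
```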
|
|
925
|
+
class AggStream:
|
|
926
|
+
"""
|
|
927
|
+
Persist configuration data to run aggregation in sequence.
|
|
928
|
+
|
|
929
|
+
Attributes
|
|
930
|
+
----------
|
|
931
|
+
- ``self.seed_config`, a dict keeping track of seed-related parameters.
|
|
932
|
+
``{'ordered_on' : string, specifying column name in seed data in
|
|
933
|
+
ascending order.
|
|
934
|
+
'restart_index' : int, float or Timestamp, the index from which
|
|
935
|
+
(included) should be restarted the next
|
|
936
|
+
aggregation iteration.
|
|
937
|
+
'pre' : Callable, to apply user-defined pre-processing on seed.
|
|
938
|
+
'pre_buffer' : dict, to keep track of intermediate values for
|
|
939
|
+
proceeding with pre-processing of individual seed
|
|
940
|
+
items (by `pre` function).
|
|
941
|
+
'filters' : dict, as per `filters` parameter.
|
|
942
|
+
}``
|
|
943
|
+
- ``self.store``, oups store, as per `store` parameter.
|
|
944
|
+
- ``self.agg_pd``, dict, as per `agg` parameter, in pandas format.
|
|
945
|
+
- ``self.agg_cs``, an attribute initialized once an aggregation
|
|
946
|
+
iteration has been run, and defining aggregation in `cumsegagg`
|
|
947
|
+
standard. It is initialized in ``self.agg`` function, the 1st time
|
|
948
|
+
an aggregation is run (seed data dtypes is required).
|
|
949
|
+
- ``self.filter_apps``, dict, mapping filter ids to list of keys, and
|
|
950
|
+
number of parallel jobs that can be run for this filter id.
|
|
951
|
+
Number of jobs is to be used as key in ``self.p_jobs`` attribute.
|
|
952
|
+
- ``self.keys_config``, dict of keys config in the form:
|
|
953
|
+
``{key: {'dirpath': str, where to record agg res,
|
|
954
|
+
'bin_on_out' : str, name in aggregation results for column
|
|
955
|
+
with bin ids.
|
|
956
|
+
'seg_config' : dict specifying the segmentation config,
|
|
957
|
+
'post' : Callable or None,
|
|
958
|
+
'max_in_memory_size_b': int, max allowed result in memory size,
|
|
959
|
+
in bytes.
|
|
960
|
+
'write_config' : {'ordered_on' : str,
|
|
961
|
+
'duplicates_on' : str or list,
|
|
962
|
+
...
|
|
963
|
+
},
|
|
964
|
+
},
|
|
965
|
+
}``
|
|
966
|
+
- ``self.agg_buffers``, dict to keep track of aggregation iteration
|
|
967
|
+
intermediate results.
|
|
968
|
+
``{key: {'agg_in_memory_size' : int, size in bytes of current
|
|
969
|
+
aggregation results, for bins (if snapshots not
|
|
970
|
+
requested) or bins and snapshots.
|
|
971
|
+
'bin_res' : None or DataFrame, last aggregation results,
|
|
972
|
+
for bins,
|
|
973
|
+
'snap_res' : None or DataFrame, last aggregation results,
|
|
974
|
+
for snapshots,
|
|
975
|
+
'bin_res_buffer' : list of DataFrame, buffer to keep
|
|
976
|
+
bin aggregagation results,
|
|
977
|
+
'snap_res_buffer' : list of DataFrame, buffer to keep bin
|
|
978
|
+
snapshot aggregagation results (if snapshots are
|
|
979
|
+
requested),
|
|
980
|
+
'segagg_buffer' : dict, possibly empty, keeping track of
|
|
981
|
+
segmentation and aggregation intermediate
|
|
982
|
+
variables,
|
|
983
|
+
'post_buffer' : dict, possibly empty, keeping track of
|
|
984
|
+
'post' function intermediate variables,
|
|
985
|
+
},
|
|
986
|
+
}``
|
|
987
|
+
- ``self.p_jobs``, dict, containing Parallel objects, as per joblib
|
|
988
|
+
setup. Keys are int, being the number of parallel jobs to run for this
|
|
989
|
+
filter id.
|
|
990
|
+
|
|
991
|
+
"""
|
|
992
|
+
|
|
993
|
+
def __init__(
|
|
994
|
+
self,
|
|
995
|
+
ordered_on: str,
|
|
996
|
+
store: Store,
|
|
997
|
+
keys: Any | tuple[Any, Any] | dict,
|
|
998
|
+
pre: Callable | None = None,
|
|
999
|
+
filters: dict | None = None,
|
|
1000
|
+
agg: dict | None = None,
|
|
1001
|
+
bin_by: TimeGrouper | Callable[[Series, dict], tuple] | None = None,
|
|
1002
|
+
bin_on: str | tuple[str, str] | None = None,
|
|
1003
|
+
snap_by: TimeGrouper | Series | DatetimeIndex | None = None,
|
|
1004
|
+
post: Callable | None = None,
|
|
1005
|
+
max_in_memory_size: int | None = MAX_IN_MEMORY_SIZE_MB,
|
|
1006
|
+
parallel: bool | None = False,
|
|
1007
|
+
**kwargs,
|
|
1008
|
+
):
|
|
1009
|
+
"""
|
|
1010
|
+
Initialize aggregation stream on ordered data.
|
|
1011
|
+
|
|
1012
|
+
This object enables 'streamed aggregation', iteratively
|
|
1013
|
+
(out-of-core) with optional filtering of seed data, and optional
|
|
1014
|
+
post-processing of aggregation results (by use of vectorized functions
|
|
1015
|
+
or for dataframe formatting).
|
|
1016
|
+
Aggregation results are recoreded into a 'oups store'.
|
|
1017
|
+
|
|
1018
|
+
Parameters
|
|
1019
|
+
----------
|
|
1020
|
+
ordered_on : str
|
|
1021
|
+
Name of the column with respect to which seed dataset is in
|
|
1022
|
+
ascending order. While this parameter is compulsory for correct
|
|
1023
|
+
restart on seed data, seed data is not necessarily grouped by this
|
|
1024
|
+
column. ``bin_by`` and/or ``bin_on`` parameters can be used to
|
|
1025
|
+
define such a different parameter.
|
|
1026
|
+
This value is also used as default 'ordered_on' parameter for
|
|
1027
|
+
aggregation results, if not provided separately for each key.
|
|
1028
|
+
store : Store
|
|
1029
|
+
Store to which recording aggregation results.
|
|
1030
|
+
keys : Indexer | tuple[Indexer, Indexer] | dict
|
|
1031
|
+
Key(s) for recording aggregation results.
|
|
1032
|
+
In case snapshots are requested, and to request recording of both
|
|
1033
|
+
bins and snapshots, it should be a tuple of 2 indices, the first to
|
|
1034
|
+
record bins, the second to record snapshots.
|
|
1035
|
+
If a dict, several keys can be specified for operating multiple
|
|
1036
|
+
parallel aggregations on the same seed. In this case, the dict can
|
|
1037
|
+
be of two forms.
|
|
1038
|
+
|
|
1039
|
+
- In case seed data is not to be filtered, it should be in the
|
|
1040
|
+
form 1, defined as:
|
|
1041
|
+
``{key: {'agg': agg,
|
|
1042
|
+
'bin_by': bin_by,
|
|
1043
|
+
'bin_on': bin_on,
|
|
1044
|
+
'snap_by': snap_by,
|
|
1045
|
+
'post': post,
|
|
1046
|
+
**kwargs}
|
|
1047
|
+
}``
|
|
1048
|
+
Any additional parameters, (``**kwargs``) are forwarded to
|
|
1049
|
+
``oups.writer.write`` when writing aggregation results to
|
|
1050
|
+
store, such as custom `max_row_group_size`, 'duplicates_on' or
|
|
1051
|
+
'ordered_on' parameters (see not below for 'duplicates_on').
|
|
1052
|
+
Please, note:
|
|
1053
|
+
|
|
1054
|
+
- `bin_by` is a compulsory parameter.
|
|
1055
|
+
- If not specified, `bin_on` parameter in dict does not get
|
|
1056
|
+
default values.
|
|
1057
|
+
- If not specified in dict, `agg`, `snap_by`, `post` and
|
|
1058
|
+
other parameters related to writing of aggregation
|
|
1059
|
+
results... get values from `agg`, `snap_by`, `post`,
|
|
1060
|
+
`ordered_on` and `**kwargs` parameters defined when
|
|
1061
|
+
initializing `AggStream`.
|
|
1062
|
+
If using `snap_by` or `post` when initializing `AggStream`
|
|
1063
|
+
and not willing to apply it for one key, set it to ``None``
|
|
1064
|
+
in key specific config.
|
|
1065
|
+
|
|
1066
|
+
- In case seed is to be filtered, dict written in form 1 are
|
|
1067
|
+
themselves values within an upper dict. Keys for this upper
|
|
1068
|
+
dict are string used as filter id. Each of these filter ids
|
|
1069
|
+
have then to be listed in ``filters`` parameter.
|
|
1070
|
+
For keys deriving from unfiltered data, use the `NO_FILTER_ID`
|
|
1071
|
+
``"_"``.
|
|
1072
|
+
|
|
1073
|
+
pre : Callable, default None
|
|
1074
|
+
Used-defined Callable to proceed with preèprocessing of each chunks
|
|
1075
|
+
of the seed Iterable, accepting 2 parameters:
|
|
1076
|
+
|
|
1077
|
+
- An ``on`` parameter, a pandas dataframe, the current seed item
|
|
1078
|
+
(before any filter is applied).
|
|
1079
|
+
- A ``buffer`` parameter, a dict that can be used as a buffer
|
|
1080
|
+
for storing temporary results from one chunk processing to
|
|
1081
|
+
the next. Its initial value is that provided by `pre_buffer`.
|
|
1082
|
+
|
|
1083
|
+
If running ``pre`` raises an exception (whichever type it is), a
|
|
1084
|
+
``SeedPreException`` will subsequently be raised.
|
|
1085
|
+
Modification of seed chunk, if any, has to be realized in-place.
|
|
1086
|
+
No DataFrame returned by this function is expected.
|
|
1087
|
+
filters : dict | None, default None
|
|
1088
|
+
Dict in the form
|
|
1089
|
+
``{"filter_id":[[("col", op, val), ...], ...]}``
|
|
1090
|
+
To filter out data from seed.
|
|
1091
|
+
Filter syntax: [[(column, op, val), ...],...]
|
|
1092
|
+
where op is [==, =, >, >=, <, <=, !=, in, not in]
|
|
1093
|
+
The innermost tuples are transposed into a set of filters applied
|
|
1094
|
+
through an `AND` operation.
|
|
1095
|
+
The outer list combines these sets of filters through an `OR`
|
|
1096
|
+
operation.
|
|
1097
|
+
A single list of tuples can also be used, meaning that no `OR`
|
|
1098
|
+
operation between set of filters is to be conducted.
|
|
1099
|
+
agg : dict | None, default None
|
|
1100
|
+
Dict in the form
|
|
1101
|
+
``{"output_col":("input_col", "agg_function_name")}``
|
|
1102
|
+
where keys are names of output columns into which are recorded
|
|
1103
|
+
results of aggregations, and values describe the aggregations to
|
|
1104
|
+
operate. ``input_col`` has to exist in seed data.
|
|
1105
|
+
Examples of ``agg_function_name`` are `first`, `last`, `min`, `max`
|
|
1106
|
+
and `sum`.
|
|
1107
|
+
This parameter is compulsory, except if ``key`` parameter is a
|
|
1108
|
+
`dict`.
|
|
1109
|
+
bin_by : TimeGrouper | Callable, default None
|
|
1110
|
+
Parameter defining the binning logic.
|
|
1111
|
+
If a `Callable`, it is given following parameters.
|
|
1112
|
+
|
|
1113
|
+
- An ``on`` parameter, a pandas dataframe made of column
|
|
1114
|
+
``ordered_on``, and column ``bin_on`` if different than
|
|
1115
|
+
``ordered_on``.
|
|
1116
|
+
- A ``buffer`` parameter, a dict that can be used as a buffer for
|
|
1117
|
+
storing temporary results from one chunk processing to
|
|
1118
|
+
the next.
|
|
1119
|
+
|
|
1120
|
+
TThis parameter is the ``bin_by`` parameter of
|
|
1121
|
+
``oups.aggstream.segmentby.segmentby`` function. For more
|
|
1122
|
+
information, please, read its docstring.
|
|
1123
|
+
bin_on : str | tuple[str, str] | None, default None
|
|
1124
|
+
``bin_on`` may either be a string or a tuple of 2 string. When a
|
|
1125
|
+
string, it refers to an existing column in seed data onto which
|
|
1126
|
+
applying the binning defined by ``bin_by`` parameter. Its value is
|
|
1127
|
+
then carried over as name for the column containing the group keys.
|
|
1128
|
+
It is further used when writing results for defining
|
|
1129
|
+
``duplicates_on`` parameter (see ``oups.writer.write``).
|
|
1130
|
+
When a tuple, the 1st string refers to an existing column in seed
|
|
1131
|
+
data, the 2nd the name to use for the column which values will be
|
|
1132
|
+
the group keys in aggregation results.
|
|
1133
|
+
Setting of ``bin_on`` should be adapted depending how is defined
|
|
1134
|
+
``bin_by`` parameter. When ``bin_by`` is a Callable, then
|
|
1135
|
+
``bin_on`` can have different values.
|
|
1136
|
+
|
|
1137
|
+
- ``None``, the default.
|
|
1138
|
+
- the name of an existing column onto which applying the binning.
|
|
1139
|
+
Its value is then carried over as name for the column
|
|
1140
|
+
containing the group keys.
|
|
1141
|
+
|
|
1142
|
+
snap_by : TimeGrouper | Series | DatetimeIndex | None, default None
|
|
1143
|
+
Values positioning points of observation, either derived from a
|
|
1144
|
+
pandas TimeGrouper, or contained in a pandas Series.
|
|
1145
|
+
In case 'snap_by' is a Series, values serve as locations for points
|
|
1146
|
+
of observation.
|
|
1147
|
+
Additionally, ``closed`` value defined by 'bin_on' specifies if
|
|
1148
|
+
points of observation are included or excluded.
|
|
1149
|
+
|
|
1150
|
+
- `left`, then values at points of observation are excluded.
|
|
1151
|
+
- `right`, then values at points of observation are included.
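For instance, a sketch with assumed column name and timestamps::

    from pandas import DatetimeIndex, Grouper

    # Regular observation points derived from a TimeGrouper...
    snap_by = Grouper(key="ts", freq="1min", closed="left")
    # ...or explicit observation points as a DatetimeIndex.
    snap_by = DatetimeIndex(["2024-01-01 10:00", "2024-01-01 10:01"])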
|
|
1152
|
+
|
|
1153
|
+
post : Callable, default None
|
|
1154
|
+
User-defined function accepting up to 3 parameters.
|
|
1155
|
+
|
|
1156
|
+
- ``buffer``, a dict to be used as data buffer, that can be
|
|
1157
|
+
necessary for some user-defined post-processing requiring data
|
|
1158
|
+
assessed in previous post-processing iteration.
|
|
1159
|
+
- ``bin_res``, a pandas dataframe resulting from the aggregations
|
|
1160
|
+
defined by ``agg`` parameter, with first row already corrected
|
|
1161
|
+
with last row of previous streamed aggregation.
|
|
1162
|
+
These are aggregation results for bins.
|
|
1163
|
+
- ``snap_res`` (optional), a pandas dataframe resulting from the
|
|
1164
|
+
aggregations defined by ``agg`` parameter that contains
|
|
1165
|
+
snapshots.
|
|
1166
|
+
|
|
1167
|
+
It then has to return a pandas dataframe that will be recorded.
|
|
1168
|
+
This optional post-processing is intended for use of vectorized
|
|
1169
|
+
functions (not mixing rows together, but operating on one or
|
|
1170
|
+
several columns), or dataframe formatting before results are
|
|
1171
|
+
finally recorded.
|
|
1172
|
+
Please read the note below regarding the 'post' parameter.
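A minimal sketch of a 'post' callable, with assumed column names and
without the optional ``snap_res`` parameter::

    from pandas import DataFrame

    def post(buffer: dict, bin_res: DataFrame) -> DataFrame:
        # Keep track of the last aggregated value from one iteration to
        # the next, then keep only the columns needed downstream.
        if not bin_res.empty:
            buffer["prev_last"] = bin_res["price_last"].iloc[-1]
        return bin_res[["ts", "price_last"]]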
|
|
1173
|
+
max_in_memory_size : int, default 'MAX_IN_MEMORY_SIZE_MB'
|
|
1174
|
+
Maximum allowed size in Megabytes of results stored in memory.
|
|
1175
|
+
parallel : bool, default False
|
|
1176
|
+
Conduct processing of keys in parallel, with up to one worker per `key`.
|
|
1177
|
+
With a single `key`, no parallelism is possible.
|
|
1178
|
+
|
|
1179
|
+
Other Parameters
|
|
1180
|
+
----------------
|
|
1181
|
+
kwargs : dict
|
|
1182
|
+
Settings forwarded to ``oups.writer.write`` when writing
|
|
1183
|
+
aggregation results to store. Can define for instance custom
|
|
1184
|
+
`max_row_group_size` or `duplicates_on` parameters (see notes below
|
|
1185
|
+
for `duplicates_on`).
|
|
1186
|
+
|
|
1187
|
+
Notes
|
|
1188
|
+
-----
|
|
1189
|
+
- Result is necessarily added to a dataset from an instantiated oups
|
|
1190
|
+
``Store``. ``AggStream`` actually relies on the update feature
|
|
1191
|
+
from oups.
|
|
1192
|
+
- With the post-processing step, the user can also take care of removing
|
|
1193
|
+
columns produced by the aggregation step, but not needed afterwards.
|
|
1194
|
+
Other formatting operations on the dataframe can also be achieved
|
|
1195
|
+
(renaming columns or index, and so on). Note that group keys
|
|
1196
|
+
are available through a column having the same name as the initial column
|
|
1197
|
+
from seed data, or defined by 'bin_on' parameter if 'bin_by' is a
|
|
1198
|
+
Callable.
|
|
1199
|
+
- When recording, both 'ordered_on' and 'duplicates_on' parameters are
|
|
1200
|
+
set when calling ``oups.writer.write``. If additional parameters are
|
|
1201
|
+
defined by the user, some checks are made.
|
|
1202
|
+
|
|
1203
|
+
- 'ordered_on' is forced to 'AggStream' ``ordered_on`` parameter.
|
|
1204
|
+
- If 'duplicates_on' is not set by the user or is `None`, then it
|
|
1205
|
+
is
|
|
1206
|
+
|
|
1207
|
+
- either set to the name of the output column for group keys
|
|
1208
|
+
defined by `bin_on` if `bin_on` is set. The rationale is that
|
|
1209
|
+
this column uniquely identifies each bin, and so is a
|
|
1210
|
+
relevant column to identify duplicates.
|
|
1211
|
+
- if `bin_on` is not set, then it defaults to `ordered_on`
|
|
1212
|
+
column.
|
|
1213
|
+
|
|
1214
|
+
There might be cases where this logic is unsuited. For instance,
|
|
1215
|
+
perhaps values in the 'ordered_on' column already provide a unique valid
|
|
1216
|
+
identifier for bins (if there are unique values in
|
|
1217
|
+
'ordered_on'). It may then be that the column containing group
|
|
1218
|
+
keys is removed during user post-processing.
|
|
1219
|
+
To allow such a specific use case, the user can set
|
|
1220
|
+
``duplicates_on`` as an additional parameter to ``AggStream``. If
|
|
1221
|
+
the user omits a column name, it means that this is a voluntary
|
|
1222
|
+
choice from the user.
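As an illustration, a rough sketch of such an override, assuming an
already instantiated ``store`` and indexer ``key`` (all names and
values are assumptions)::

    from pandas import Grouper

    # 'store' and 'key' are hypothetical, defined elsewhere.
    streamagg = AggStream(
        ordered_on="ts",
        store=store,
        keys=key,
        bin_by=Grouper(key="ts", freq="1h"),
        agg={"qty_sum": ("qty", "sum")},
        duplicates_on=["ts"],  # forwarded to the write step
    )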
|
|
1223
|
+
|
|
1224
|
+
- If an exception is raised by the ``pre`` function on seed data, then
|
|
1225
|
+
last good results are still written to disk with correct metadata. If
|
|
1226
|
+
an exception is raised at some other point of the aggregation
|
|
1227
|
+
process, results are not written.
|
|
1228
|
+
- Use of the 'post' parameter can be intricate. The user should be aware
|
|
1229
|
+
of 2 situations.
|
|
1230
|
+
|
|
1231
|
+
- Either 'post' is called outside of the 'final_write' step. In this case, the
|
|
1232
|
+
last existing row is removed from bin and snapshot aggregation
|
|
1233
|
+
results. It will be added back at the next iteration though.
|
|
1234
|
+
This is to optimize the iteration mechanism.
|
|
1235
|
+
- Or 'post' is called as part of the 'final_write' step. In this case, the last
|
|
1236
|
+
existing row is kept in bin and snapshot aggregation results.
|
|
1237
|
+
|
|
1238
|
+
The user should make sure the 'post' function adapts to both
|
|
1239
|
+
situations.
|
|
1240
|
+
|
|
1241
|
+
"""
|
|
1242
|
+
# Check 'kwargs' parameters are those expected for 'write' function.
|
|
1243
|
+
for param in kwargs:
|
|
1244
|
+
if param not in WRITE_PARAMS:
|
|
1245
|
+
raise ValueError(
|
|
1246
|
+
f"'{param}' is neither a valid parameter for `AggStream`"
|
|
1247
|
+
" initialization, nor for `oups.write` function.",
|
|
1248
|
+
)
|
|
1249
|
+
# Seed-related attributes.
|
|
1250
|
+
if filters is not None:
|
|
1251
|
+
# Check if only an "AND" part has been provided. If yes, enclose it
|
|
1252
|
+
# in an outer list.
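# For instance (illustrative filter id and condition),
# {"f1": [("x", ">=", 0)]} is normalized into {"f1": [[("x", ">=", 0)]]}.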
|
|
1253
|
+
filters = {
|
|
1254
|
+
filt_id: [filters_] if isinstance(filters_[0], tuple) else filters_
|
|
1255
|
+
for filt_id, filters_ in filters.items()
|
|
1256
|
+
}
|
|
1257
|
+
# Set default values for keys' config.
|
|
1258
|
+
keys_default = {
|
|
1259
|
+
KEY_SNAP_BY: snap_by,
|
|
1260
|
+
KEY_AGG: agg,
|
|
1261
|
+
KEY_POST: post,
|
|
1262
|
+
KEY_ORDERED_ON: ordered_on,
|
|
1263
|
+
KEY_MAX_IN_MEMORY_SIZE_B: int(max_in_memory_size * MEGABYTES_TO_BYTES),
|
|
1264
|
+
} | kwargs
|
|
1265
|
+
if not isinstance(keys, dict):
|
|
1266
|
+
keys = {keys: keys_default | {KEY_BIN_BY: bin_by, KEY_BIN_ON: bin_on}}
|
|
1267
|
+
if isinstance(next(iter(keys)), str):
|
|
1268
|
+
# Case filter is used.
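# Illustrative shape (filter ids and keys are hypothetical):
# keys = {"filter_a": {key_1: {...}, key_2: {...}},
#         "filter_b": {key_3: {...}}}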
|
|
1269
|
+
# Check 'filters' parameter is used.
|
|
1270
|
+
if filters is None:
|
|
1271
|
+
raise ValueError(
|
|
1272
|
+
"not possible to use filter syntax for `keys` parameter "
|
|
1273
|
+
"without providing `filters` parameter as well.",
|
|
1274
|
+
)
|
|
1275
|
+
else:
|
|
1276
|
+
# Check same filters id are both in 'keys' and 'filters'
|
|
1277
|
+
# parameters.
|
|
1278
|
+
if NO_FILTER_ID in filters:
|
|
1279
|
+
if filters[NO_FILTER_ID] is not None:
|
|
1280
|
+
raise ValueError(
|
|
1281
|
+
f"not possible to use '{NO_FILTER_ID}' as key in "
|
|
1282
|
+
"`filters` parameter with a value different than "
|
|
1283
|
+
"`None`.",
|
|
1284
|
+
)
|
|
1285
|
+
elif NO_FILTER_ID in keys:
|
|
1286
|
+
# If not in 'filters' but in 'keys', add it to 'filters'.
|
|
1287
|
+
filters[NO_FILTER_ID] = None
|
|
1288
|
+
filt_filt_ids = set(filters)
|
|
1289
|
+
filt_filt_ids.discard(NO_FILTER_ID)
|
|
1290
|
+
keys_filt_ids = set(keys)
|
|
1291
|
+
keys_filt_ids.discard(NO_FILTER_ID)
|
|
1292
|
+
if filt_filt_ids != keys_filt_ids:
|
|
1293
|
+
raise ValueError(
|
|
1294
|
+
"not possible to have different lists of filter ids"
|
|
1295
|
+
" between `keys` and `filters` parameters.\n"
|
|
1296
|
+
f" List of filter ids in `keys` parameter is {keys_filt_ids}.\n"
|
|
1297
|
+
f" List of filter ids in `filters` parameter is {filt_filt_ids}.",
|
|
1298
|
+
)
|
|
1299
|
+
else:
|
|
1300
|
+
# Case no filter is used.
|
|
1301
|
+
keys = {NO_FILTER_ID: keys}
|
|
1302
|
+
filters = {NO_FILTER_ID: None}
|
|
1303
|
+
_filter_apps = {}
|
|
1304
|
+
_all_keys = []
|
|
1305
|
+
_p_jobs = {KEY_MAX_P_JOBS: Parallel(n_jobs=KEY_MAX_P_JOBS, prefer="threads")}
|
|
1306
|
+
for filt_id in keys:
|
|
1307
|
+
# Set number of jobs.
|
|
1308
|
+
n_keys = len(keys[filt_id])
|
|
1309
|
+
n_jobs = min(KEY_MAX_P_JOBS, n_keys) if parallel else 1
|
|
1310
|
+
_filter_apps[filt_id] = FilterApp(list(keys[filt_id]), n_jobs)
|
|
1311
|
+
if n_jobs not in _p_jobs:
|
|
1312
|
+
# Configure parallel jobs.
|
|
1313
|
+
_p_jobs[n_jobs] = Parallel(n_jobs=n_jobs, prefer="threads")
|
|
1314
|
+
_all_keys.extend(keys[filt_id])
|
|
1315
|
+
# Check for duplicate keys between different filter ids.
|
|
1316
|
+
seen = set()
|
|
1317
|
+
dupes = [key for key in _all_keys if key in seen or seen.add(key)]
|
|
1318
|
+
if dupes:
|
|
1319
|
+
raise ValueError(f"not possible to have key(s) {dupes} used for different filter ids.")
|
|
1320
|
+
self.p_jobs = _p_jobs
|
|
1321
|
+
self.filter_apps = _filter_apps
|
|
1322
|
+
# Once filters have been managed, simplify 'keys' as a single level
|
|
1323
|
+
# dict.
|
|
1324
|
+
keys = ChainMap(*keys.values())
|
|
1325
|
+
(
|
|
1326
|
+
self.keys_config,
|
|
1327
|
+
self.agg_pd,
|
|
1328
|
+
) = _init_keys_config(ordered_on, keys, keys_default)
|
|
1329
|
+
(
|
|
1330
|
+
restart_index,
|
|
1331
|
+
pre_buffer,
|
|
1332
|
+
self.agg_buffers,
|
|
1333
|
+
) = _init_buffers(store, keys)
|
|
1334
|
+
self.seed_config = {
|
|
1335
|
+
KEY_ORDERED_ON: ordered_on,
|
|
1336
|
+
KEY_PRE: pre,
|
|
1337
|
+
KEY_PRE_BUFFER: pre_buffer,
|
|
1338
|
+
KEY_FILTERS: filters,
|
|
1339
|
+
KEY_RESTART_INDEX: restart_index,
|
|
1340
|
+
}
|
|
1341
|
+
# Cumsegagg-like agg definition.
|
|
1342
|
+
# Cannot be set yet, because seed dtype is required.
|
|
1343
|
+
# Is a dict, specifying for each key, its expected aggregation.
|
|
1344
|
+
self.agg_cs = {}
|
|
1345
|
+
# Store attribute.
|
|
1346
|
+
self.store = store
|
|
1347
|
+
|
|
1348
|
+
def _init_agg_cs(self, seed: Iterable[DataFrame]):
|
|
1349
|
+
"""
|
|
1350
|
+
Initialize ``self.agg_cs``.
|
|
1351
|
+
|
|
1352
|
+
Because dtypes of the seed DataFrame are required, the first seed chunk is
|
|
1353
|
+
generated from the Iterable. The seed Iterable is then repacked with its first
|
|
1354
|
+
item already in memory.
|
|
1355
|
+
|
|
1356
|
+
Parameters
|
|
1357
|
+
----------
|
|
1358
|
+
seed : Iterable[DataFrame]
|
|
1359
|
+
Seed data, from which getting pandas DataFrame dtypes.
|
|
1360
|
+
|
|
1361
|
+
Returns
|
|
1362
|
+
-------
|
|
1363
|
+
seed : Iterable[DataFrame]
|
|
1364
|
+
Seed iterable, repacked with its first chunk already materialized.
|
|
1365
|
+
|
|
1366
|
+
"""
|
|
1367
|
+
remainder = iter(seed)
|
|
1368
|
+
first = next(remainder)
|
|
1369
|
+
# Recompose seed with 1st item materialized.
|
|
1370
|
+
seed = chain([first], remainder)
|
|
1371
|
+
seed_dtypes = first.dtypes.to_dict()
|
|
1372
|
+
for key in self.keys_config:
|
|
1373
|
+
try:
|
|
1374
|
+
self.agg_cs[key] = setup_cumsegagg(self.agg_pd[key], seed_dtypes)
|
|
1375
|
+
except Exception as exc:
|
|
1376
|
+
raise ValueError(f"exception raised for key '{key}'") from exc
|
|
1377
|
+
return seed
|
|
1378
|
+
|
|
1379
|
+
def agg(
|
|
1380
|
+
self,
|
|
1381
|
+
seed: DataFrame | Iterable[DataFrame] | None = None,
|
|
1382
|
+
trim_start: bool | None = False,
|
|
1383
|
+
discard_last: bool | None = False,
|
|
1384
|
+
final_write: bool | None = True,
|
|
1385
|
+
):
|
|
1386
|
+
"""
|
|
1387
|
+
Aggregate sequentially on successive chunks (stream) of ordered data.
|
|
1388
|
+
|
|
1389
|
+
This function conducts 'streamed aggregation', iteratively (out-of-
|
|
1390
|
+
core) with optional post-processing of aggregation results (by use of
|
|
1391
|
+
vectorized functions or for dataframe formatting).
|
|
1392
|
+
|
|
1393
|
+
Parameters
|
|
1394
|
+
----------
|
|
1395
|
+
seed : DataFrame | Iterable[DataFrame] | None, default None
|
|
1396
|
+
Seed data over which conducting streamed aggregations.
|
|
1397
|
+
trim_start : bool, default False
|
|
1398
|
+
If ``True``, and if aggregated results already exist, then
|
|
1399
|
+
retrieves the last index present in seed data (recorded in metadata
|
|
1400
|
+
of existing aggregated results), and trims all seed data before this
|
|
1401
|
+
index (index excluded from trim, so it will be in new aggregation
|
|
1402
|
+
results). This trimming makes sense if previous aggregation
|
|
1403
|
+
iteration has been managed with ``discard_last`` set ``True``.
|
|
1404
|
+
discard_last : bool, default False
|
|
1405
|
+
If ``True``, last row group in seed data (sharing the same value in
|
|
1406
|
+
`ordered_on` column) is removed from the aggregation step. See
|
|
1407
|
+
below notes.
|
|
1408
|
+
final_write : bool, default True
|
|
1409
|
+
If ``True``, after last iteration of aggregation, aggregation
|
|
1410
|
+
results are written to disk. With this parameter, restarting
|
|
1411
|
+
aggregation with a new AggStream instance is possible.
|
|
1412
|
+
If an exception is raised during seed check, then last
|
|
1413
|
+
aggregation results from last valid seed chunk are also written to
|
|
1414
|
+
disk.
|
|
1415
|
+
|
|
1416
|
+
Notes
|
|
1417
|
+
-----
|
|
1418
|
+
- If aggregation results already exist in oups ``Store`` instance,
|
|
1419
|
+
and `trim_start` is `True`, last index from previous aggregation is
|
|
1420
|
+
retrieved, and prior seed data is trimmed.
|
|
1421
|
+
- With ``discard_last`` set ``True``, aggregation is processed up to the last index excluded,
|
|
1422
|
+
and subsequent aggregation will start from this last index included,
|
|
1423
|
+
assumed to be that of an incomplete row group.
|
|
1424
|
+
If `discard_last` is set `False`, then aggregation is processed up to
|
|
1425
|
+
the last data.
|
|
1426
|
+
- With parameter ``discard_last`` set ``True``, the last row
|
|
1427
|
+
group (composed from rows sharing the same value in `ordered_on`
|
|
1428
|
+
column) is discarded.
|
|
1429
|
+
|
|
1430
|
+
- It may be for instance that this row group is not complete yet
|
|
1431
|
+
and should therefore not be accounted for. More precisely, new
|
|
1432
|
+
rows with same value in `ordered_on` may appear in seed data
|
|
1433
|
+
later on. Because seed data is trimmed to start from last
|
|
1434
|
+
processed value from `ordered_on` column (value included), these
|
|
1435
|
+
new rows would be excluded from the next aggregation, leading to
|
|
1436
|
+
an inaccurate aggregation result. Doing so is a way to identify
|
|
1437
|
+
easily when re-starting the aggregation in cases where there can be
|
|
1438
|
+
duplicates in `ordered_on` column. A ``sum`` aggregation will
|
|
1439
|
+
then return the correct result for instance, as no data is
|
|
1440
|
+
accounted for twice.
|
|
1441
|
+
- Or if composed of a single row, this last row in seed data is
|
|
1442
|
+
temporary (and may get its final values only at a later time,
|
|
1443
|
+
when it becomes the one-but-last row, as a new row is added).
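As an illustration, a hypothetical streamed run over an iterator of
ordered seed chunks (``produce_chunks``, ``streamagg`` and all other
names are assumptions)::

    # 'produce_chunks' and 'streamagg' are hypothetical, defined elsewhere.
    chunks = (df for df in produce_chunks())  # any iterable of DataFrames
    streamagg.agg(
        seed=chunks,
        trim_start=True,    # resume after previously recorded results
        discard_last=True,  # leave out the possibly incomplete last row group
        final_write=True,   # write results and restart metadata at the end
    )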
|
|
1444
|
+
|
|
1445
|
+
"""
|
|
1446
|
+
# TODO: add 'snap_by' parameter to 'agg()' to allow using list of
|
|
1447
|
+
# timestamps. 'cumsegagg()' is already compatible.
|
|
1448
|
+
# TODO: add a writing step once aggregation on a seed chunk is done
|
|
1449
|
+
# (keeping track of '_last_seed_index': as soon as it changes from
|
|
1450
|
+
# one iteration to the next, trigger the intermediate writing step)
|
|
1451
|
+
# Aggregation results to keep are listed through an additional
|
|
1452
|
+
# 'group_res' parameter, in the form:
|
|
1453
|
+
# {key_to_write_grouped_res_to: [key1, key2, key3],
|
|
1454
|
+
# ...}
|
|
1455
|
+
# Motivation is to be able to gather results for different filters and
|
|
1456
|
+
# 'bin_by' value, and post-process them in a single 'post' function and
|
|
1457
|
+
# write results in a single file.
|
|
1458
|
+
# This particularly makes sense if there is a single 'snap_by' value, as
|
|
1459
|
+
# snapshots results will be easily merged.
|
|
1460
|
+
# TODO: change default settings:
|
|
1461
|
+
# discard_last = trim_start = final_write = False
|
|
1462
|
+
if isinstance(seed, DataFrame):
|
|
1463
|
+
# Make the seed an iterable.
|
|
1464
|
+
seed = [seed]
|
|
1465
|
+
# Seed can be an empty list or None.
|
|
1466
|
+
if seed:
|
|
1467
|
+
if not self.agg_cs:
|
|
1468
|
+
# If first time an aggregation is made with this object,
|
|
1469
|
+
# initialize 'agg_cs'.
|
|
1470
|
+
seed = self._init_agg_cs(seed)
|
|
1471
|
+
seed_check_exception = False
|
|
1472
|
+
try:
|
|
1473
|
+
# TODO: '_pre_buffer' is modified in-place. It should not be
|
|
1474
|
+
# needed to return it. It is within 'self.seed_config'.
|
|
1475
|
+
for _last_seed_index, _pre_buffer, filter_id, filtered_chunk in _iter_data(
|
|
1476
|
+
seed=seed,
|
|
1477
|
+
**self.seed_config,
|
|
1478
|
+
trim_start=trim_start,
|
|
1479
|
+
discard_last=discard_last,
|
|
1480
|
+
):
|
|
1481
|
+
# Retrieve Parallel joblib setup.
|
|
1482
|
+
agg_loop_res = self.p_jobs[self.filter_apps[filter_id].n_jobs](
|
|
1483
|
+
delayed(agg_iter)(
|
|
1484
|
+
seed_chunk=filtered_chunk,
|
|
1485
|
+
store=self.store,
|
|
1486
|
+
key=key,
|
|
1487
|
+
keys_config=self.keys_config[key],
|
|
1488
|
+
agg_config=self.agg_cs[key],
|
|
1489
|
+
agg_buffers=self.agg_buffers[key],
|
|
1490
|
+
)
|
|
1491
|
+
for key in self.filter_apps[filter_id].keys
|
|
1492
|
+
)
|
|
1493
|
+
# Update aggregation buffers from the list of (key, agg_res) tuples.
|
|
1494
|
+
for key, agg_res in agg_loop_res:
|
|
1495
|
+
self.agg_buffers[key].update(agg_res)
|
|
1496
|
+
# Set 'seed_index_restart' to the 'last_seed_index' with
|
|
1497
|
+
# which restarting the next aggregation iteration.
|
|
1498
|
+
self.seed_config[KEY_RESTART_INDEX] = _last_seed_index
|
|
1499
|
+
# Also keep track of last 'pre_buffer' value.
|
|
1500
|
+
self.seed_config[KEY_PRE_BUFFER] = _pre_buffer
|
|
1501
|
+
except SeedPreException as sce:
|
|
1502
|
+
seed_check_exception = True
|
|
1503
|
+
exception_message = str(sce)
|
|
1504
|
+
if final_write:
|
|
1505
|
+
# Post-process & write results from last iteration, this time
|
|
1506
|
+
# keeping last aggregation row, and recording metadata for a
|
|
1507
|
+
# future 'AggStream.agg' execution.
|
|
1508
|
+
self.p_jobs[KEY_MAX_P_JOBS](
|
|
1509
|
+
delayed(_post_n_write_agg_chunks)(
|
|
1510
|
+
store=self.store,
|
|
1511
|
+
key=key,
|
|
1512
|
+
agg_buffers=agg_res,
|
|
1513
|
+
agg_res_type=self.keys_config[key][KEY_AGG_RES_TYPE],
|
|
1514
|
+
append_last_res=True,
|
|
1515
|
+
write_config=self.keys_config[key][KEY_WRITE_CONFIG],
|
|
1516
|
+
index_name=self.keys_config[key][KEY_BIN_ON_OUT],
|
|
1517
|
+
post=self.keys_config[key][KEY_POST],
|
|
1518
|
+
last_seed_index=self.seed_config[KEY_RESTART_INDEX],
|
|
1519
|
+
pre_buffer=self.seed_config[KEY_PRE_BUFFER],
|
|
1520
|
+
)
|
|
1521
|
+
for key, agg_res in self.agg_buffers.items()
|
|
1522
|
+
)
|
|
1523
|
+
if seed and seed_check_exception:
|
|
1524
|
+
raise SeedPreException(exception_message)
|