oups 2025.9.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of oups has been flagged as potentially problematic.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0
@@ -0,0 +1,580 @@
+#!/usr/bin/env python3
+"""
+Created on Wed Dec 4 21:30:00 2021.
+
+@author: pierrot
+
+"""
+from collections.abc import Callable
+
+from numpy import array
+from numpy import dtype
+from numpy import full
+from numpy import isin as nisin
+from numpy import nan as nNaN
+from numpy import zeros
+from pandas import NA as pNA
+from pandas import DataFrame
+from pandas import DatetimeIndex
+from pandas import Int64Dtype
+from pandas import NaT as pNaT
+from pandas import Series
+from pandas.core.resample import TimeGrouper
+
+from oups.stateful_ops.aggstream.jcumsegagg import AGG_FUNCS
+from oups.stateful_ops.aggstream.jcumsegagg import jcsagg
+from oups.stateful_ops.aggstream.segmentby import KEY_BIN_ON
+from oups.stateful_ops.aggstream.segmentby import KEY_LAST_BIN_LABEL
+from oups.stateful_ops.aggstream.segmentby import KEY_ORDERED_ON
+from oups.stateful_ops.aggstream.segmentby import KEY_SNAP_BY
+from oups.stateful_ops.aggstream.segmentby import segmentby
+from oups.stateful_ops.aggstream.segmentby import setup_segmentby
+
+
+# Some constants.
+DTYPE_INT64 = dtype("int64")
+DTYPE_FLOAT64 = dtype("float64")
+DTYPE_DATETIME64 = dtype("datetime64[ns]")
+DTYPE_NULLABLE_INT64 = Int64Dtype()
+NULL_INT64_1D_ARRAY = zeros(0, DTYPE_INT64)
+NULL_INT64_2D_ARRAY = NULL_INT64_1D_ARRAY.reshape(0, 0)
+# Null values.
+NULL_DICT = {DTYPE_INT64: pNA, DTYPE_FLOAT64: nNaN, DTYPE_DATETIME64: pNaT}
+# Key for buffer.
+KEY_LAST_CHUNK_RES = "last_chunk_res"
+
+
+def setup_cumsegagg(
+    agg: dict[str, tuple[str, str]],
+    data_dtype: dict[str, dtype],
+) -> dict[dtype, tuple[list[str], list[str], tuple, int]]:
+    """
+    Construct chaingrouby aggregation configuration.
+
+    Parameters
+    ----------
+    agg : dict[str, tuple[str, str]]
+        Dict specifying aggregation in the form
+        ``'out_col_name' : ('in_col_name', 'function_name')``
+    data_dtype : dict[str, dtype]
+        Dict specifying per column name its dtype. Typically obtained with
+        ``df.dtypes.to_dict()``
+
+    Returns
+    -------
+    dict[dtype,
+         tuple[list[str],
+               list[str],
+               tuple[tuple[Callable, ndarray[int64], ndarray[int64]]],
+               int
+              ]
+        ]
+        Dict 'cgb_agg_cfg' in the form
+        ``{dtype: list[str], 'cols_name_in_data'
+                      column name in input data, with this dtype,
+                  list[str], 'cols_name_in_res'
+                      expected column names in aggregation result,
+                  tuple[tuple[Callable, ndarray[int64], ndarray[int64]]],
+                      'aggs'
+                      Tuple of Tuple. One inner Tuple per aggregation
+                      function. Each one contain 3 items,
+                      - a Callable, the aggregation function
+                      - a 1st 1d numpy array with indices of columns
+                        in 'data', to which has to be applied the
+                        aggregation function.
+                      - a 2nd 1d numpy array with indices of columns
+                        in 'res', to which are recoreded aggregation
+                        results
+                  int64, 'n_cols'
+                      Total number of columns in 'res' (summing for all
+                      aggregation function).
+          }``
+
+    """
+    cgb_agg_cfg = {}
+    # Step 1.
+    for out_col, (in_col, func) in agg.items():
+        if in_col not in data_dtype:
+            raise ValueError(f"column '{in_col}' does not exist in input data.")
+        else:
+            dtype_ = data_dtype[in_col]
+        try:
+            tup = cgb_agg_cfg[dtype_]
+        except KeyError:
+            cgb_agg_cfg[dtype_] = [
+                [],  # 'cols_name_in_data'
+                [],  # 'cols_name_in_res'
+                [],  # 'agg_func_idx' (temporary)
+                [],  # 'cols_data' (temporary)
+                [],  # 'cols_res' (temporary)
+            ]
+            tup = cgb_agg_cfg[dtype_]
+        # 'in_col' / name / 1d list.
+        cols_name_in_data = tup[0]
+        if in_col in cols_name_in_data:
+            in_col_idx = cols_name_in_data.index(in_col)
+        else:
+            in_col_idx = len(cols_name_in_data)
+            cols_name_in_data.append(in_col)
+        # 'out_col' / name / 1d list.
+        cols_name_in_res = tup[1]
+        out_col_idx = len(cols_name_in_res)
+        cols_name_in_res.append(out_col)
+        # Set list of agg functions (temporary buffer).
+        agg_funcs = tup[2]
+        try:
+            if (agg_func := AGG_FUNCS[func]) in agg_funcs:
+                func_idx = agg_funcs.index(agg_func)
+            else:
+                func_idx = len(agg_funcs)
+                agg_funcs.append(AGG_FUNCS[func])
+        except KeyError:
+            raise ValueError(f"`{func}` aggregation function is unknown.")
+        # 'cols_idx'
+        cols_data = tup[3]
+        cols_res = tup[4]
+        if len(cols_data) <= func_idx:
+            # Create list for this aggregation function.
+            cols_data.append([in_col_idx])
+            cols_res.append([out_col_idx])
+        else:
+            # Add this column index for this aggregation function.
+            cols_data[func_idx].append(in_col_idx)
+            cols_res[func_idx].append(out_col_idx)
+    # Step 2.
+    for conf in cgb_agg_cfg.values():
+        # Remove 'agg_funcs' & 'cols_idx'.
+        agg_funcs = conf.pop(2)
+        cols_data = conf.pop(2)
+        cols_res = conf.pop(2)
+        n_cols = sum(map(len, cols_res))
+        # Add back 'aggs', as tuple of tuple.
+        conf.append(tuple(zip(agg_funcs, map(array, cols_data), map(array, cols_res), strict=False)))
+        # 'n_cols'.
+        conf.append(n_cols)
+    return cgb_agg_cfg
+
+
+def setup_chunk_res(agg: dict[dtype, tuple]) -> DataFrame:
+    """
+    Initialize one-row DataFrame for storing the first 'chunk_res'.
+    """
+    chunk_res = {}
+    for dtype_, (
+        _,
+        cols_name_in_res,
+        _,
+        n_cols,
+    ) in agg.items():
+        chunk_res_single_dtype = zeros(n_cols, dtype=dtype_)
+        chunk_res.update(
+            {name: chunk_res_single_dtype[i : i + 1] for i, name in enumerate(cols_name_in_res)},
+        )
+    return DataFrame(chunk_res, copy=False)
+
+
+def cumsegagg(
+    data: DataFrame,
+    agg: dict[str, tuple[str, str]] | dict[dtype, tuple[list[str], list[str], tuple, int]],
+    bin_by: TimeGrouper | Callable | dict,
+    bin_on: str | None = None,
+    buffer: dict | None = None,
+    ordered_on: str | None = None,
+    snap_by: TimeGrouper | Series | DatetimeIndex | None = None,
+    error_on_0: bool | None = True,
+) -> DataFrame | tuple[DataFrame, DataFrame]:
+    """
+    Cumulative segmented aggregations, with optional snapshotting.
+
+    In this function, "snapshotting" is understood as the action of making
+    isolated observations. When using snapshots, values derived from
+    ``snap_by`` TimeGrouper (or contained in ``snap_by`` Series) are considered
+    the "points of isolated observation".
+    At a given point, an observation of the "on-going" segment (aka bin) is
+    made. Because segments are contiguous, any row of the dataset falls in a
+    segment.
+
+    Parameters
+    ----------
+    data: DataFrame
+        A pandas DataFrame containing the columns over which binning (relying
+        on ``bin_on`` column), performing aggregations and optionally
+        snapshotting (relying on column pointed by 'ordered_on' and optionally
+        ``snap_by.key`` if is a TimeGrouper).
+        If using snapshots ('snap_by' parameter), then the column pointed by
+        ``snap_by.key`` has to be ordered.
+    agg : dict
+        Definition of aggregation.
+        If in the form ``dict[str, tuple[str, str]]`` (typically a form
+        compatible with pandas aggregation), then it is transformed in the 2nd
+        form ``dict[dtype, tuple[list[str], list[str], tuple, int]]``.
+        - in the form ``dict[str, tuple[str, str]]``
+          - keys are ``str``, requested output column name
+          - values are ``tuple`` with 1st component a ``str`` for the input
+            column name, and 2nd component a ``str`` for aggregation function
+            name.
+
+        - the 2nd form is that returned by the function ``setup_cumsegagg``.
+
+    bin_by : TimeGrouper | Callable | dict
+        Callable or pandas TimeGrouper to perform binning.
+        If a Callable, please see signature requirements in 'segmentby'
+        docstring.
+        If a dict, it contains the full setup for conducting the segmentation
+        of 'data', as generated by 'setup_segmentby()'.
+    bin_on : str | None, default None
+        Name of the column in `data` over which performing the binning
+        operation.
+        If 'bin_by' is a pandas `TimeGrouper`, its `key` parameter is used instead,
+        and 'bin_on' is ignored.
+        If not provided, and 'ordered_on' parameter is, then 'ordered_on' value
+        is also used to specify the column name onto which performing binning.
+    buffer : dict | None, default None
+        Buffer containing data for restarting the binning process with new seed
+        data:
+        - from previous segmentation step,
+        - from previous aggregation step.
+    ordered_on : str | None
+        Name of an existing ordered column in 'data'. When setting it, it is
+        then forwarded to 'bin_by' Callable.
+        This parameter is compulsory if 'snap_by' is set. Values derived from
+        'snap_by' (either a TimeGrouper or a Series) are compared to ``bin_ends``,
+        themselves derived from ``data[ordered_on]``.
+    snap_by : TimeGrouper | Series | DatetimeIndex | None, default None
+        Values positioning the points of observation, either derived from a
+        pandas TimeGrouper, or contained in a pandas Series.
+        In case 'snap_by' is a Series, values serve as locations for points of
+        observation.
+        Additionally, ``closed`` value defined by 'bin_on' specifies if points
+        of observations are included or excluded. As "should be logical", if
+        - `left`, then values at points of observation are excluded.
+        - `right`, then values at points of observation are included.
+
+    error_on_0 : bool, default True
+        By default, check that there is no `0` value (either int or float) in
+        aggregation results (bins and snapshots). ``cumsegagg()`` is
+        experimental and a `0` value is likely to hint a bug. If raised, the
+        result should be double checked. Ultimately, please, report the use
+        case that is raising this error, and what would be the expected
+        behavior.
+
+    Returns
+    -------
+    DataFrame | tuple[DataFrame, DataFrame]
+        A pandas DataFrame with aggregation results. Its index is composed of
+        the bin labels.
+        If a tuple, then the first DataFrame is that for the bins, and the
+        second that for the snapshots.
+
+    Notes
+    -----
+    When using snapshots, values derived from ``snap_by`` are considered the
+    "points of isolated observation". At such a point, an observation of the
+    "on-going" bin is made. In case of snapshot(s) positioned exactly on
+    segment(s) ends, at the same row index in data, snapshot will come "before"
+    the bin.
+
+    When using 'cumsegagg' through 'chainagg' function (i.e. for chained calls
+    to 'cumsegagg') and if setting `bin_by` as a Callable, the developer should
+    take care that `bin_by` keeps in the 'buffer' parameter all the data needed
+    to:
+    - create the correct number of bins that would be in-between the data
+      processed at the previous aggregation iteration, and the new data. This
+      has to show in 'next_chunk_starts' array that is returned.
+    - appropriately label the first bin.
+      - either it is a new bin, different than the last one from previous
+        aggregation iteration. Then the label of the new bin has to be
+        different than that of the last one from previous iteration.
+      - or it is the same bin that is continuing. Then the label has be the
+        same. This ensures that when recording the new aggregation result,
+        the data from the previous iteration (last bin was in-progress, i.e.
+        incomplete) is overwritten.
+
+    Notes on design for allowing 'restart'
+    --------------------------------------
+    Current implementation may present limitations from inadequate design
+    choices, not challenged so far.
+    To minimize memory footprint, segmentation step is expected to provide
+    start indices of the next bin (as opposed to providing the status for each
+    row of the input data, individually).
+    Because historically, the aggregation function is expected to use all data
+    so as to provide the actual status of the last, in-progress bin, its end
+    is de-facto the end of the input data.
+    Because of this, the internal flag 'preserve_res' is always set to
+    ``False`` when reaching the end of input data.
+    This is a limitation. It should be ``False`` only to mark the actual end of
+    the bins. As a result, this internal flag 'preserve_res' cannot be output
+    for reuse in a next restart step.
+    An option to circumvent this is to use snapshots instead of bins as media
+    to output aggregation results for last, in-progress bin.
+    This option has not been implemented.
+
+    In current implementation, the limitation presented above is circumvented
+    by assuming that last bin is never empty, that is to say, 'chunk_res'
+    parameter which contains aggregation results for the last, in-progress bin,
+    always has relevant results to preserve. This is true, as long as the last,
+    in-progress bin is not empty.
+    Would this last bin be empty, then 'chunk_res' would still contain
+    aggregation results for the last not empty bin it was used. In this case,
+    we would need to make sure if the last row in input data matches the end of
+    a bin or not, to not preserve 'chunk_res' or preserve it.
+    Now, if we assume the last, in-progress bin is not empty, we can wait for
+    the restart to check if the bin has ended before the start of input data,
+    and then close this bin which was the last, in-progress bin at previous
+    iteration.
+    To bring more freedom to this implementation, a 'preserve_res' flag is
+    expected from the segmentation phase. This flag is set ``False`` to allow
+    restarting right on a new, next bin, if at the previous iteration, the last
+    bin was complete.
+    In current implementation, the limitation is thus that from the
+    segmentation, the last bin cannot be empty. All empty trailing bins have to
+    be trimmed, otherwise an exception is raised.
+
+    The following thoughts have been investigated in current implementation.
+    - **segmentation step ('segmentby()')**
+      -1 From this step, 'next_chunk_starts' should not end with an empty
+         bin, as mentioned above. A complementary thought is that when
+         restarting after several empty bins, it *may* be that some new data
+         was actually in these bins, empty at previous iteration.
+         A check is then managed in 'segmentby()'. All empty bins at end of
+         'next_chunk_starts' have to be trimmed or an exception will be
+         raised.
+      -2 If restarting, bins produced by the user-defined 'bin_by()' have to
+         cover the full size of data, meaning last item in
+         'next_chunk_starts' is equal to length of data.
+         As mentioned above, this rationale is from history.
+         Additionally, it *may* be that if no bin goes till the end of data,
+         then we are not sure the next bin (at next iteration) will not lie
+         within these last values in data at current iteration.
+         A check is then performed and an exception is raised if this
+         situation occurs.
+         This requirement is not applied to 'snap_by' (in case using a
+         Series). Because it is applied to 'bin_by', then 'chunk_res' will
+         contain aggregation results over the last values in data, it is not
+         lost.
+         In the existing 'snap_by' (either by TimeGrouper or by Series),
+         - either if a TimeGrouper, then last snapshot ends after end of data
+         - or if a Series, at restart, if 2nd snapshot ends before last
+           value in data at previous iteration, then an exception is
+           raised.
+
+      -3 At next iteration, the first bin has to be the continuation of the
+         last one from previous iteration. A check is made using bin label.
+         This is the case even if the bin is empty. Thus, if it is preceded /
+         followed by empty snapshots, content of these snapshots will be set
+         appropriately. For empty snapshots that precede this bin end, past
+         results are forwarded. For empty snapshots that follow this bin end,
+         this results in empty snapshots.
+
+    - **cumulative segmented aggregation ('cumsegagg()')**
+      -1 'preserve_res' parameter is used to indicate if aggregation
+         calculations start from scratch (first iteration) or reuse past
+         aggregation results (following iterations).
+         Aggregation results from last, in-progress bin can then be
+         forwarded.
+
+    """
+    # TODO: create a test case with restart, that has no snapshot in 1st
+    # iteration (with 'by_scale' using a Series). Target is to check that even
+    # without snapshot in 1st iteration, an empty 'snap_res' gets returned
+    # nonetheless and that concatenation can be managed with subsequent
+    # 'snap_res' from next iterations.
+    # TODO: make possible to pass several 'bin_by' (with same 'snap_by'):
+    # - in segmentby, has to produce an array providing for each 'bin_by' when
+    #   the next bin start. Modify the existing 'mergesort' function for this.
+    # - in jcumsegagg, change logic to have aggregation function without
+    #   'preserve_res'. Instead, create companion function that can be called
+    #   when storing results, achieving reconciliation of existing results
+    #   with new results (if not a new bin), or restarting the aggregation
+    #   if new results.
+    len_data = len(data)
+    if not len_data:
+        # 'data' is empty. Simply return.
+        return
+    if not isinstance(next(iter(agg.values())), list):
+        # Reshape aggregation definition.
+        agg = setup_cumsegagg(agg, data.dtypes.to_dict())
+    if buffer is None:
+        # Single run agg.
+        preserve_res = False
+    else:
+        # Agg iteration with possible restart.
+        # Detection of 1st iteration is managed below with test if a new bin
+        # is started.
+        preserve_res = True
+        prev_last_bin_label = buffer[KEY_LAST_BIN_LABEL] if KEY_LAST_BIN_LABEL in buffer else None
+    if not isinstance(bin_by, dict):
+        bin_by = setup_segmentby(bin_by, bin_on, ordered_on, snap_by)
+    # Following 'setup_segmentby', parameters 'ordered_on', 'bin_on' have to
+    # be retrieved from it.
+    ordered_on = bin_by[KEY_ORDERED_ON]
+    # 'bin_by' as a dict may contain 'snap_by' if it is a TimeGrouper.
+    if bin_by[KEY_SNAP_BY] is not None:
+        # 'bin_by[KEY_SNAP_BY]' is not none if 'snap_by' is a TimeGrouper.
+        # Otherwise, it can be a DatetimeIndex or a Series.
+        snap_by = bin_by[KEY_SNAP_BY]
+    # In case of restart, 'n_max_null_bins' is a max because 1st null bin may
+    # well be continuation of last in-progress bin, without result in current
+    # iteration, but with results from previous iteration.
+    (
+        next_chunk_starts,
+        bin_indices,
+        bin_labels,
+        n_max_null_bins,
+        snap_labels,
+        n_max_null_snaps,
+    ) = segmentby(
+        data=data,
+        bin_by=bin_by,
+        snap_by=snap_by,
+        buffer=buffer,
+    )
+    if preserve_res and prev_last_bin_label != bin_labels.iloc[0]:
+        # A new bin has been started. Do not preserve past results.
+        # This behavior is only possible in case no snapshot is used.
+        preserve_res = False
+    # Initiate dict of result columns.
+    # Setup 'chunk_res'.
+    chunk_res_prev = (
+        buffer[KEY_LAST_CHUNK_RES]
+        if isinstance(buffer, dict) and KEY_LAST_CHUNK_RES in buffer
+        else setup_chunk_res(agg)
+    )
+    chunk_res = {}
+    # Setup 'bin_res'.
+    n_bins = len(bin_labels)
+    null_bin_indices = full(n_max_null_bins, -1, dtype=DTYPE_INT64)
+    bin_res = {}
+    # Setup 'snap_res', & preserve_res
+    snap_res = {}
+    if snap_by is None:
+        snap_res_single_dtype = NULL_INT64_2D_ARRAY
+        null_snap_indices = NULL_INT64_1D_ARRAY
+    else:
+        # Initialize 'null_snap_indices' to -1, to identify easily those which
+        # are not set. they will be removed in a post-processing step.
+        n_snaps = len(snap_labels)
+        null_snap_indices = full(n_max_null_snaps, -1, dtype=DTYPE_INT64)
+    # Loop.
+    for dtype_, (
+        cols_name_in_data,
+        cols_name_in_res,
+        aggs,
+        n_cols,
+    ) in agg.items():
+        data_single_dtype = (
+            data.loc[:, cols_name_in_data].to_numpy(copy=False)
+            if len(cols_name_in_data) > 1
+            else data.loc[:, cols_name_in_data].to_numpy(copy=False).reshape(-1, 1)
+        )
+        # Setup 'chunk_res_single_dtype'.
+        chunk_res_single_dtype = chunk_res_prev.loc[:, cols_name_in_res].to_numpy(copy=False).reshape(n_cols)
+        chunk_res.update(
+            {name: chunk_res_single_dtype[i : i + 1] for i, name in enumerate(cols_name_in_res)},
+        )
+        # Setup 'bin_res_single_dtype'.
+        bin_res_single_dtype = zeros((n_bins, n_cols), dtype=dtype_)
+        bin_res.update(
+            {name: bin_res_single_dtype[:, i] for i, name in enumerate(cols_name_in_res)},
+        )
+        # Setup 'snap_res_single_dtype'.
+        if snap_by is not None:
+            snap_res_single_dtype = zeros((n_snaps, n_cols), dtype=dtype_)
+            snap_res.update(
+                {name: snap_res_single_dtype[:, i] for i, name in enumerate(cols_name_in_res)},
+            )
+        if dtype_ == DTYPE_DATETIME64:
+            data_single_dtype = data_single_dtype.view(DTYPE_INT64)
+            bin_res_single_dtype = bin_res_single_dtype.view(DTYPE_INT64)
+            chunk_res_single_dtype = chunk_res_single_dtype.view(DTYPE_INT64)
+            if snap_by is not None:
+                snap_res_single_dtype = snap_res_single_dtype.view(DTYPE_INT64)
+        # 'data' is a numpy array, with columns in 'expected order',
+        # as defined in 'cols_data' & 'cols_res' embedded in 'aggs'.
+        # TODO: if extending 'jcsagg()' to process last chunk in data (even if
+        # not a bin or a snap, so as to make possible that bins really only end
+        # on end of bins, and that end of 'data' is not systematically a bin
+        # end as well), then output from 'jcsagg()' 'preserve_res' parameter.
+        # When inputting it for the next iteration, 'preserve_res' parameter
+        # is then ``not first_bin_is_new and preserve_res``.
+        # With this feature, empty trailing bins are then possible to manage.
+        jcsagg(
+            data_single_dtype,  # 2d
+            aggs,
+            next_chunk_starts,  # 1d
+            bin_indices,  # 1d
+            preserve_res,
+            chunk_res_single_dtype,
+            bin_res_single_dtype,  # 2d
+            snap_res_single_dtype,  # 2d
+            null_bin_indices,  # 1d
+            null_snap_indices,  # 1d
+        )
+    # Record last aggregation results for a restart.
+    if isinstance(buffer, dict):
+        buffer[KEY_LAST_CHUNK_RES] = DataFrame(chunk_res, copy=False)
+    # Assemble 'bin_res' as a pandas DataFrame.
+    bin_res = DataFrame(bin_res, index=bin_labels, copy=False)
+    bin_res.index.name = ordered_on if ordered_on else bin_by[KEY_BIN_ON]
+    if DTYPE_INT64 in agg:
+        # As of pandas 1.5.3, use "Int64" dtype to work with nullable 'int'.
+        # (it is a pandas dtype, not a numpy one, which is why it is set only
+        # in pandas results, and not numpy inputs to 'cumsegagg()').
+        # Force 'int64' to pandas nullable 'Int64', even if there is no null
+        # value in results at the moment. Indeed null values can appear in a
+        # later aggregation step (use of 'restart' feature).
+        bin_res[agg[DTYPE_INT64][1]] = bin_res[agg[DTYPE_INT64][1]].astype(
+            DTYPE_NULLABLE_INT64,
+        )
+    # Set null values.
+    if n_max_null_bins != 0:
+        null_bin_labels = bin_labels.iloc[null_bin_indices[~nisin(null_bin_indices, -1)]]
+        if not null_bin_labels.empty:
+            for dtype_, (
+                _,
+                cols_name_in_res,
+                _,
+                _,
+            ) in agg.items():
+                bin_res.loc[null_bin_labels, cols_name_in_res] = NULL_DICT[dtype_]
+    if snap_by is not None:
+        snap_res = DataFrame(snap_res, index=snap_labels, copy=False)
+        snap_res.index.name = ordered_on
+        if DTYPE_INT64 in agg:
+            # As of pandas 1.5.3, use "Int64" dtype to work with nullable 'int'.
+            # It is a pandas dtype, not a numpy one, which is why it is set
+            # only in pandas results, and not numpy inputs to 'cumsegagg()').
+            # Force 'int64' to pandas nullable 'Int64', even if there is no
+            # null value in results at the moment. Indeed null values can
+            # appear in a later aggregation step (use of 'restart' feature).
+            snap_res[agg[DTYPE_INT64][1]] = snap_res[agg[DTYPE_INT64][1]].astype(
+                DTYPE_NULLABLE_INT64,
+            )
+        # Set null values.
+        if n_max_null_snaps != 0:
+            # Remove -1 indices.
+            # TODO: is not necessary to re-create an array without the -1.
+            # Only indices above 0 should be used.
+            # Alternatively, output number of empty snaps from 'jcumsegagg()'?
+            null_snap_labels = snap_labels[null_snap_indices[~nisin(null_snap_indices, -1)]]
+            if not null_snap_labels.empty:
+                for dtype_, (
+                    _,
+                    cols_name_in_res,
+                    _,
+                    _,
+                ) in agg.items():
+                    snap_res.loc[null_snap_labels, cols_name_in_res] = NULL_DICT[dtype_]
+    if error_on_0:
+        if snap_by is not None and snap_res.eq(0).any().any():
+            raise ValueError(
+                "at least one null value exists in 'snap_res' which is likely to hint a bug.",
+            )
+        if bin_res.eq(0).any().any():
+            raise ValueError(
+                "at least one null value exists in 'bin_res' which is likely to hint a bug.",
+            )
+    if snap_by is not None:
+        return bin_res, snap_res
+    else:
+        return bin_res