oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
@@ -0,0 +1,750 @@
+ #!/usr/bin/env python3
+ """
+ Created on Sat Jun 28 18:35:00 2025.
+
+ @author: pierrot
+
+ """
+
+ from numpy import concatenate
+ from numpy import nonzero
+ from numpy import searchsorted
+ from numpy import zeros
+ from numpy.lib.stride_tricks import sliding_window_view
+ from numpy.typing import DTypeLike
+ from numpy.typing import NDArray
+ from pandas import DataFrame
+
+ from oups.stateful_ops.asof_merger.get_config import KEY_COLS_DTYPES_SLICES_IN_RES_PER_COL_NAME
+ from oups.stateful_ops.asof_merger.get_config import KEY_COLS_PER_DTYPE
+ from oups.stateful_ops.asof_merger.get_config import KEY_COLS_REINDEX_IN_DF
+ from oups.stateful_ops.asof_merger.get_config import KEY_COLS_SLICES_IN_DF_PER_DTYPE
+ from oups.stateful_ops.asof_merger.get_config import KEY_COLS_SLICES_IN_RES_PER_JOIN_POS
+ from oups.stateful_ops.asof_merger.get_config import KEY_SEED_RES_ARRAYS
+ from oups.stateful_ops.asof_merger.get_config import _get_config
+ from oups.stateful_ops.asof_merger.get_config import _initialize_fill_values
+ from oups.stateful_ops.asof_merger.validate_params import _validate_fill_values_init
+ from oups.stateful_ops.asof_merger.validate_params import _validate_monotonic_increasing
+ from oups.stateful_ops.asof_merger.validate_params import _validate_n_prev
+ from oups.stateful_ops.asof_merger.validate_params import _validate_params
+
+
+ RIGHT = "right"
+ LEFT = "left"
+
+
+ def _resize_res_arrays_length(
+     seed_res_arrays: dict[DTypeLike, NDArray],
+     n_rows: int,
+     copy: bool,
+ ) -> dict[DTypeLike, NDArray]:
+     """
+     Provide result arrays sized to ``n_rows``.
+
+     Parameters
+     ----------
+     seed_res_arrays : dict[DTypeLike, NDArray]
+         Mapping of dtype to 2D result arrays. Seed arrays are resized in-place
+         only when their current number of rows is smaller than ``n_rows``.
+     n_rows : int
+         Target number of rows for all result arrays.
+     copy : bool
+         If True, return newly allocated arrays, decoupled from the seed
+         arrays. If False, return views on the (possibly resized) seed arrays.
+
+     Returns
+     -------
+     dict[DTypeLike, NDArray]
+         Mapping of dtype to 2D arrays with exactly ``n_rows`` rows.
+
+     Notes
+     -----
+     When ``copy`` is False, the returned arrays are views on the underlying
+     seed storage; when True, they are freshly allocated zero arrays. All have
+     shape ``(n_rows, n_cols)``.
+
+     """
+     if copy:
+         return {
+             res_dtype: zeros((n_rows, res_array.shape[1]), dtype=res_dtype)
+             for res_dtype, res_array in seed_res_arrays.items()
+         }
+     else:
+         if n_rows > next(iter(seed_res_arrays.values())).shape[0]:
+             for res_array in seed_res_arrays.values():
+                 res_array.resize((n_rows, res_array.shape[1]), refcheck=False)
+         return {res_dtype: res_array[:n_rows] for res_dtype, res_array in seed_res_arrays.items()}
+
+
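The no-copy branch above returns growing views on shared seed storage, which is what makes iterative calls cheap. A minimal sketch of that path, with a hypothetical dtype and shape:

    import numpy as np

    seed = {np.dtype("float64"): np.zeros((2, 3))}
    out = _resize_res_arrays_length(seed, n_rows=4, copy=False)
    # The seed array has grown in place to 4 rows, and the returned array
    # is a view on it: later writes through 'out' land in shared storage.
    assert out[np.dtype("float64")].shape == (4, 3)
    assert out[np.dtype("float64")].base is seed[np.dtype("float64")]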
+ def _get_df_row_idx(
+     group_main: NDArray,
+     df_on: NDArray,
+     allow_exact_matches: bool,
+ ) -> tuple[NDArray, bool]:
+     """
+     Get 'merge_asof' row indices with respect to 'main' for this dataframe.
+
+     Parameters
+     ----------
+     group_main : NDArray
+         Target values for alignment for this group. Must be sorted ascending.
+     df_on : NDArray
+         Column to use for merging 'asof'. Must be sorted ascending.
+     allow_exact_matches : bool
+         If False and an exact match is found, use the previous value in the
+         dataframe.
+
+     Returns
+     -------
+     tuple[NDArray, bool]
+         - Row indices (one per ``group_main``) referencing the asof-selected
+           current row in this dataframe. When the first index would be -1 (the
+           first ``group_main`` precedes the first ``df_on`` with the chosen
+           side), indices are incremented by 1 to be non-negative.
+         - A boolean flag ``use_guard_row`` indicating whether a guard row must
+           be included before dataframe values when creating sliding windows.
+
+     Notes
+     -----
+     - ``group_main`` is expected to be sorted ascending. Because of this
+       monotonicity, checking the first computed index is sufficient to know if
+       any index would be negative: if the first is non-negative, all are.
+     - When ``use_guard_row`` is True, all returned indices are globally
+       incremented by 1 so they align with window indices created over the
+       concatenation ``[guard_rows] + ar``. This ensures that index 0 selects
+       the guard-only window, index 1 selects the window ending at ``ar[0]``,
+       and so forth.
+     - When ``use_guard_row`` is True, callers should include the leading guard
+       row from their ``fill_values`` (i.e., use the full ``n_prev + 1`` rows).
+       When False, callers should drop the guard row (use only the last
+       ``n_prev`` rows) to keep window alignment consistent.
+
+     """
+     df_row_idx = (
+         searchsorted(
+             df_on,
+             group_main,
+             side=RIGHT if allow_exact_matches else LEFT,
+         )
+         - 1
+     )
+     if df_row_idx[0] < 0:
+         return df_row_idx + 1, True
+     else:
+         return df_row_idx, False
+
+
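A worked instance of the computation above, with illustrative values:

    import numpy as np

    df_on = np.array([10, 20, 30])
    group_main = np.array([5, 15, 25])
    # allow_exact_matches=True selects side='right'.
    idx = np.searchsorted(df_on, group_main, side="right") - 1  # [-1, 0, 1]
    # idx[0] is negative: group_main[0]=5 precedes df_on[0]=10, so every
    # index is shifted by +1 and a guard row is requested:
    # _get_df_row_idx(group_main, df_on, True) -> (array([0, 1, 2]), True)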
+ def _comb_merge_asof(
+     main: NDArray,
+     on: str,
+     df_groups: list[list[DataFrame]],
+     cols_slices_in_df_per_dtype: list[dict[DTypeLike, slice]],
+     combinations: NDArray,
+     allow_exact_matches: list[bool],
+     n_prev: list[int],
+     fill_values: list[list[dict[DTypeLike, NDArray]]],
+     res_arrays: dict[DTypeLike, NDArray],
+     cols_slices_in_res_per_join_pos: list[list[dict[DTypeLike, slice]]],
+ ) -> None:
+     """
+     Core combine and asof merge implementation for groups of dataframes.
+
+     This function performs the actual asof join logic by iterating through
+     groups, dataframes, and data types to populate the pre-allocated result
+     structure. It handles sliding window views for previous values and maps
+     results to the correct output positions. When previous values are
+     requested, each window is ordered earliest-to-latest as
+     [prev_n, ..., prev1, current].
+
+     Parameters
+     ----------
+     main : NDArray
+         Array of target values for alignment. Same length as ``combinations``.
+     on : str
+         Column name used for asof joining (key column).
+     df_groups : list[list[DataFrame]]
+         Nested list of dataframes:
+         - Outer list: different groups of dataframes.
+         - Inner lists: dataframes within each group.
+         All dataframes must contain the 'on' column.
+     cols_slices_in_df_per_dtype : list[dict[DTypeLike, slice]]
+         List with one dict per dataframe in a group. Each dict maps
+         dtype -> slice of column names (excluding 'on' column).
+         Structure: [{dtype1: slice(col1, col2), dtype2: slice(col3, col4)}, ...]
+     combinations : NDArray
+         Integer array of shape ``(n_output_rows, n_join_positions)`` where
+         each element is a group index for the corresponding join position.
+     allow_exact_matches : list[bool]
+         List of booleans, one per dataframe in a group. If False and an
+         exact match is found, uses the previous value instead.
+     n_prev : list[int]
+         List of integers, one per dataframe in a group. Number of previous
+         values to include for each dataframe (0 = current value only).
+     fill_values : list[list[dict[DTypeLike, NDArray]]]
+         Fill data for insufficient previous values. Structure matches
+         ``df_groups`` and inner lists contain one dict per dataframe in a group.
+         These arrays have ``n_prev + 1`` rows: the leading extra row is a guard
+         used when the first 'main' value precedes the first 'df_on' value.
+     res_arrays : dict[DTypeLike, NDArray]
+         Pre-allocated result arrays to populate in-place, one array per dtype
+         with shape (n_output_rows, n_columns_for_this_dtype).
+     cols_slices_in_res_per_join_pos : list[list[dict[DTypeLike, slice]]]
+         Nested list of dicts: one list per join position, one dict per
+         dataframe, one entry per dtype. For each join position, dataframe and
+         dtype, the slice in the corresponding result array.
+
+     Notes
+     -----
+     - This function modifies 'res_arrays' in-place.
+     - The algorithm uses three nested loops:
+
+       1. Loop over groups of dataframes
+          - Build `group_mask = (combinations == group_idx)`
+          - `output_row_indices_for_group = nonzero(group_mask.any(axis=1))[0]`
+          - `main_values_for_group = main[output_row_indices_for_group]`
+          - For each join position where this group appears, compute
+            `group_main_indices_per_join_pos[j]`, the local row indices within
+            `main_values_for_group` to materialize for that join position.
+
+       2. Loop over dataframes within the group
+          - Compute `df_row_idx, use_guard_row = _get_df_row_idx(
+            main_values_for_group, df[on], allow_exact_matches[df_idx])`
+          - `df_row_idx` gives, for each value in `main_values_for_group`, the
+            window end index in the extended df values (guard already accounted
+            for).
+          - Concatenate guard rows (from `fill_values`) with df values
+            depending on `use_guard_row`, then update `fill_values` in-place
+            with the last `n_prev + 1` rows for reuse.
+
+       3. Loop over dtypes within the dataframe
+          - If `n_prev > 0`, create sliding windows so each row selects a
+            window of shape `(n_cols, n_prev + 1)` ordered from earliest to
+            current.
+          - For each join position, select only the local subset of rows using
+            `group_main_indices_per_join_pos`, map to global output rows via
+            `output_row_indices_for_group`, then write into the correct result
+            array slice for this dtype.
+
+     """
+     # Loop 1: Over groups of dataframes.
+     for group_idx, group in enumerate(df_groups):
+         group_mask = combinations == group_idx
+         # Join positions where this group appears (column indices in
+         # combinations, excluding main).
+         join_positions_for_group = nonzero(group_mask.any(axis=0))[0].tolist()
+         # Output row indices that use this group at any join position.
+         output_row_indices_for_group = nonzero(group_mask.any(axis=1))[0]
+         # Subset of main values for output rows that use this group.
+         main_values_for_group = main[output_row_indices_for_group]
+         # For each join position, get indices within main_values_for_group.
+         group_main_indices_per_join_pos = [
+             nonzero((group_mask[:, join_pos])[output_row_indices_for_group])[0]
+             for join_pos in join_positions_for_group
+         ]
+         # Loop 2: Over dataframes in a group.
+         for df_idx, df in enumerate(group):
+             # Get asof row indices (same length as 'main_values_for_group').
+             # These will be filtered by 'group_main_indices_per_join_pos'
+             # depending on join position when assembling results.
+             df_row_idx, use_guard_row = _get_df_row_idx(
+                 main_values_for_group,
+                 df.loc[:, on].to_numpy(),
+                 allow_exact_matches[df_idx],
+             )
+             # Loop 3: Over data types in a dataframe.
+             n_prev_plus_one = n_prev[df_idx] + 1
+             for cols_dtype, cols_slice in cols_slices_in_df_per_dtype[df_idx].items():
+                 val_array = df.loc[:, cols_slice].to_numpy(copy=False)
+                 # Prepare per-dtype fill values.
+                 fill_values_for_dtype = fill_values[group_idx][df_idx][cols_dtype]
+                 # Concatenate fill values with original data.
+                 extended_val_array = (
+                     concatenate([fill_values_for_dtype, val_array], axis=0)
+                     if use_guard_row
+                     else concatenate([fill_values_for_dtype[1:], val_array], axis=0)
+                 )
+                 # Reset fill values in place with the last n_prev+1 rows from
+                 # the extended array.
+                 fill_values_for_dtype[:] = extended_val_array[-n_prev_plus_one:]
+                 if n_prev_plus_one > 1:
+                     # 'n_prev' values requested.
+                     # Create sliding window view with shape
+                     # (n_windows, n_cols, n_prev+1).
+                     extended_val_array = sliding_window_view(
+                         extended_val_array,
+                         n_prev_plus_one,
+                         axis=0,
+                     )
+                 # Map results to output positions for each join position.
+                 for join_pos_idx, join_pos in enumerate(join_positions_for_group):
+                     relevant_row_indices_in_group_main = group_main_indices_per_join_pos[join_pos_idx]
+                     relevant_row_indices_in_df = df_row_idx[relevant_row_indices_in_group_main]
+                     # 'reshape' is only needed for n_prev > 0, in which case
+                     # 'extended_val_array' is a windowed view whose rows have
+                     # shape (n_cols, n_prev+1).
+                     # For performance reasons, 'reshape' is applied after the
+                     # indexing operation with 'relevant_row_indices_in_df':
+                     # reshaping before indexing would create a large temporary
+                     # copy of all windows.
+                     selected_values = extended_val_array[relevant_row_indices_in_df].reshape(
+                         len(relevant_row_indices_in_group_main),
+                         -1,
+                     )
+                     col_slice_in_res = cols_slices_in_res_per_join_pos[join_pos][df_idx][cols_dtype]
+                     res_arrays[cols_dtype][
+                         output_row_indices_for_group[relevant_row_indices_in_group_main],
+                         col_slice_in_res,
+                     ] = selected_values
+
+
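The guard-row and windowing mechanics of loops 2 and 3 can be traced on a toy array; the values below are illustrative, with one column and n_prev = 1:

    import numpy as np
    from numpy.lib.stride_tricks import sliding_window_view

    fill_values = np.zeros((2, 1))  # n_prev + 1 = 2 guard/fill rows
    val_array = np.array([[10.0], [20.0], [30.0]])
    # use_guard_row=True: keep the full fill block in front of the data.
    extended = np.concatenate([fill_values, val_array], axis=0)  # (5, 1)
    windows = sliding_window_view(extended, 2, axis=0)  # (4, 1, 2)
    # Window 0 is the guard-only window; window 1 ends at val_array[0],
    # matching the +1 shift applied by _get_df_row_idx.
    print(windows[0].reshape(-1))  # [0. 0.]
    print(windows[1].reshape(-1))  # [ 0. 10.] -> [prev1, current]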
+ class AsofMerger:
+     """
+     A class for combine-and-asof-merge operations on groups of dataframes.
+
+     The class is designed to be used iteratively.
+
+     Attributes
+     ----------
+     on : str
+         Column name to use for joining. This column should contain ordered
+         values (typically timestamps or sequential numbers), and has to exist
+         in all dataframes.
+     n_dfs_per_group : int
+         Number of dataframes per group.
+     prefixes : list[list[str]]
+         Column name prefixes for each join position in 'combinations'.
+         Outer list length must match the number of join positions (width of
+         'combinations').
+         Inner list length must match the number of dataframes in a group.
+     allow_exact_matches : list[bool]
+         List of booleans of the same length as the inner lists in
+         `df_groups`. If False, and an exact match is found, the previous
+         value in the dataframe is used.
+     n_prev : list[int]
+         List of length ``len(df_groups[0])``, where each value indicates the
+         number of previous values to include for the corresponding dataframe.
+         This configuration applies identically to all groups.
+     n_prev_suffix_start : int
+         Start index for suffixing column names for dataframes with previous
+         values.
+     _fill_values : list[list[dict[DTypeLike, NDArray]]]
+         Fill data used when the data of the current iteration contains
+         insufficient previous values.
+         Structure mirrors `df_groups`. Each inner list corresponds to a group
+         and contains a dict per dataframe in the group, in the same order as
+         in the corresponding group. Each dict stores one numpy array per
+         dtype in the corresponding dataframe, the 'on' column being omitted.
+         Each array has ``n_prev + 1`` rows. The leading extra row is a guard
+         used when the first 'main' value precedes the first value in the 'on'
+         column.
+         For dataframes with `n_prev = 0`, at least 1 row is used for guard
+         operations during asof merge.
+         During execution, ``_fill_values`` is updated in-place with the last
+         ``n_prev + 1`` rows to be reused in the next call.
+         This attribute is internal and lazily initialized on the first call to
+         ``merge`` based on the configured ``n_prev`` and optional
+         ``fill_values_init`` provided at initialization. It is not required to
+         be bound as state for resumability. Binding ``_conf`` alone may be
+         sufficient for stateful usage; ``_fill_values`` will be re-initialized
+         automatically on resume if needed (zeros when no ``fill_values_init``
+         is provided). In particular, if input chunks already include at least
+         ``n_prev[i]`` rows preceding the earliest value of ``main`` for each
+         dataframe ``i`` (for example, by using ``Store.iter_intersections``
+         with its ``n_prev`` parameter), then persisting ``_fill_values`` is
+         unnecessary because previous windows can be rebuilt from the loaded
+         rows.
+     _conf : dict
+         Internal configuration cached on first merge. Contains arrays, slices,
+         and layout metadata returned by ``_get_config``. Intended to be bound
+         as object state with ``StatefulLoop.bind_object_state(...)``.
+         - cols_per_dtype: list[dict[DTypeLike, list[str]]]
+           List of dicts, one per dataframe in a group.
+           Each dict maps dtype -> list of column names (excluding 'on'
+           column).
+           Structure: [{dtype1: [col1, col2], dtype2: [col3, col4]}, ...]
+         - cols_reindex_in_df: list[Index]
+           List of pandas Index objects, one per dataframe in a group.
+           Each Index object contains the column names of the corresponding
+           dataframe, starting with the 'on' column, and re-ordered per dtype.
+           Reindexing with this Index object enables selecting columns of the
+           same dtype by using slices.
+         - cols_slices_in_df_per_dtype: list[dict[DTypeLike, slice]]
+           List of dicts, one per dataframe in a group.
+           Each dict maps dtype -> slice of column names (excluding 'on'
+           column).
+           Structure: [{dtype1: slice(col1, col2), dtype2: slice(col3, col4)}, ...]
+         - seed_res_arrays: dict[DTypeLike, NDArray]
+           Dictionary with one numpy array per dtype, used to store the result
+           of the asof merge for each dtype.
+         - cols_slices_in_res_per_join_pos: list[list[dict[DTypeLike, slice]]]
+           List of lists of dicts, one per join position, one per dataframe in
+           a group, one per dtype. For each join position, dataframe and dtype,
+           the slice in the corresponding result array where to paste the
+           result.
+         - cols_dtypes_slices_in_res_per_col_name: dict[str, tuple[DTypeLike, slice]]
+           Dictionary with one tuple per column name, mapping a column name in
+           the result to its dtype and its position in the result array (using
+           slices).
+
+     Methods
+     -------
+     merge(
+         main: NDArray,
+         df_groups: list[list[DataFrame]],
+         combinations: NDArray,
+         copy: bool,
+         check_sorted: bool,
+     ) -> DataFrame
+         Simultaneously perform as-of merge and combine operations on multiple
+         groups of dataframes against an ordered key.
+
+     """
+
+     def __init__(
+         self,
+         on: str,
+         *,
+         n_dfs_per_group: int,
+         prefixes: list[list[str]] | list[str] | None = None,
+         allow_exact_matches: list[bool] | None = None,
+         n_prev: list[int] | None = None,
+         n_prev_suffix_start: int = 0,
+         fill_values_init: list[list[DataFrame]] | None = None,
+     ) -> None:
+         """
+         Initialize the merger.
+
+         Parameters
+         ----------
+         on : str
+             Column name to use for joining. This column should contain ordered
+             values (typically timestamps or sequential numbers), and has to
+             exist in all dataframes.
+         n_dfs_per_group : int
+             Number of dataframes expected in each group. Keyword-only, like
+             all following parameters.
+         prefixes : list[list[str]] | list[str] | None
+             Column name prefixes per join position (nested), or for a single
+             join position as a flat list, or None.
+             - Nested form: outer length == number of join positions; inner
+               length == number of dataframes per group.
+             - Flat form: treated as a single join position.
+             - None: prefixes will be generated at first merge using empty
+               strings with shape [n_join_positions][n_df_per_group].
+         allow_exact_matches : list[bool] | None, default None
+             List of booleans of the same length as the inner lists in
+             `df_groups`. Per dataframe, if False, and an exact match is found,
+             the previous value in the dataframe is used.
+             If None, ``allow_exact_matches`` is False for every dataframe.
+         n_prev : list[int] | None, default None
+             List of length ``len(df_groups[0])``, where each value indicates
+             the number of previous values to include for the corresponding
+             dataframe.
+             If 'n_prev' is specified, all values must be >= 0.
+             This configuration applies identically to all groups.
+             If None, only the current asof value is included for each
+             dataframe.
+             If set, column names in the result dataframe are those of input
+             dataframes, with column names for previous values suffixed by the
+             position of the previous values, starting at
+             'n_prev_suffix_start'.
+         n_prev_suffix_start : int, default 0
+             Start index for suffixing column names for dataframes with
+             previous values.
+         fill_values_init : list[list[DataFrame]] | None, default None
+             Fill data used when insufficient previous values exist in data at
+             the first iteration.
+             Structure mirrors `df_groups`. Each group must contain the same
+             number of dataframes. Each inner list corresponds to a group in
+             the same order as `df_groups`, with one dataframe per dataframe in
+             the corresponding group, without the 'on' column.
+             If None, missing previous values are filled with ``0``.
+
+         Examples
+         --------
+         >>> # Define prefixes for each join position (2 positions, 2
+         >>> # dataframes per group)
+         >>> prefixes = [
+         ...     ['left_df1_', 'left_df2_'],  # Prefixes for first join position
+         ...     ['right_df1_', 'right_df2_'],  # Prefixes for second join position
+         ... ]
+         >>>
+         >>> # Basic merger initialization (current values only)
+         >>> merger = AsofMerger(
+         ...     on='timestamp',
+         ...     n_dfs_per_group=2,
+         ...     prefixes=prefixes,
+         ... )
+         >>>
+         >>> # Advanced merger with previous values
+         >>> n_prev = [3, 1]  # df0: +3 previous, df1: +1 previous
+         >>> merger = AsofMerger(
+         ...     on='timestamp',
+         ...     n_dfs_per_group=2,
+         ...     prefixes=prefixes,
+         ...     n_prev=n_prev,
+         ... )
+         >>>
+         >>> # Advanced merger with custom fill values
+         >>> n_prev = [3, 0]  # df0: +3 previous, df1: +0 previous (but guard row needed)
+         >>> fill_values_init = [
+         ...     [df0_fill_g0, df1_guard_g0],  # Fill data for df0 + guard for df1 in group 0
+         ...     [df0_fill_g1, df1_guard_g1],  # Fill data for df0 + guard for df1 in group 1
+         ...     [df0_fill_g2, df1_guard_g2],  # Fill data for df0 + guard for df1 in group 2
+         ... ]
+         >>> merger = AsofMerger(
+         ...     on='timestamp',
+         ...     n_dfs_per_group=2,
+         ...     prefixes=prefixes,
+         ...     n_prev=n_prev,
+         ...     fill_values_init=fill_values_init,
+         ... )
+
+         """
+         self.on = on
+         self.n_dfs_per_group = n_dfs_per_group
+         # Normalize prefixes to nested form and set join positions.
+         if prefixes is not None:
+             if isinstance(prefixes[0], str):
+                 prefixes = [prefixes]
+             if any((len(group) != n_dfs_per_group) for group in prefixes):
+                 raise ValueError("each group must be of length 'n_dfs_per_group' in 'prefixes'.")
+         self.prefixes = prefixes
+         self.allow_exact_matches = (
+             [False] * self.n_dfs_per_group if allow_exact_matches is None else allow_exact_matches
+         )
+         if len(self.allow_exact_matches) != self.n_dfs_per_group:
+             raise ValueError("'allow_exact_matches' length must match 'n_dfs_per_group'.")
+         if n_prev is None:
+             self.n_prev = [0] * self.n_dfs_per_group
+         else:
+             self.n_prev = n_prev
+             _validate_n_prev(self.n_prev, self.n_dfs_per_group)
+         self.n_prev_suffix_start = n_prev_suffix_start
+         self.fill_values_init = fill_values_init
+         # These parameters require the first group of dataframes to be passed.
+         # They will be initialized at first call of the 'merge' method.
+         self._conf = None
+         self._fill_values = None
+
+     def merge(
+         self,
+         main: NDArray,
+         *,
+         df_groups: list[list[DataFrame]] | list[DataFrame],
+         combinations: NDArray | None = None,
+         copy: bool = True,
+         check_sorted: bool = False,
+     ) -> DataFrame:
+         """
+         Perform an as-of join and combine on multiple groups of dataframes.
+
+         This method aligns rows from multiple dataframes based on the nearest
+         preceding value in the key column. It is similar to the pandas
+         'merge_asof' function but supports multiple dataframes and multiple
+         groups of dataframes simultaneously.
+
+         Parameters
+         ----------
+         main : NDArray
+             Target values for alignment. The asof join will find the nearest
+             preceding value in each dataframe's key column for each value in
+             main. A value can be referenced multiple times in 'main'.
+         df_groups : list[list[DataFrame]] | list[DataFrame]
+             Dataframe groups to join. Keyword-only, like all following
+             parameters.
+             - When ``combinations`` is provided: nested list structure where
+               the outer list contains groups (length ``n_groups``) and each
+               inner list contains dataframes for that group (length
+               ``n_dfs_per_group``). Each dataframe at position i across all
+               groups must have identical column structure (name and dtype).
+             - When ``combinations`` is None (single join position): requires a
+               flat list of DataFrames. Inputs are normalized internally to a
+               single-group nested list.
+             - All dataframes must contain the specified 'on' column.
+         combinations : NDArray | None
+             Integer array of shape ``(n_output_rows, n_join_positions)``
+             where:
+             - Column indices represent join positions (combinations of fixed
+               width),
+             - Values are group indices referring to positions in
+               ``df_groups``'s outer list,
+             - The same group can be referenced multiple times per row at
+               different join positions.
+             If None, a single join position is required and a default array of
+             zeros of shape ``(len(main), 1)`` is used (selecting group ``0``
+             at the sole join position).
+         copy : bool, default True
+             If True, return a copy of the result arrays.
+             If False, return views on the result arrays.
+             Returning views results in faster processing when `merge` is
+             called iteratively (result arrays won't be re-initialized across
+             iterations). However, this requires the caller to copy the results
+             before the next call to `merge`. In particular, when results are
+             accumulated across successive iterations without being copied,
+             keep `copy` set to True.
+         check_sorted : bool, default False
+             If True, validate that 'main' is increasing and that each
+             dataframe's 'on' column is increasing. This is an O(N) pass per
+             array/Series and is intended for debugging or defensive runs.
+
+         Returns
+         -------
+         DataFrame
+             Joined dataframe where each row corresponds to a row in
+             ``combinations``.
+             The first column contains the `main` values, named after `on`.
+             Then columns follow join position order, dataframe order, dtype
+             order, and previous values order (if any).
+
+         Examples
+         --------
+         >>> # Three groups of dataframes, each group has 2 dataframes
+         >>> df_groups = [
+         ...     [df1_g0, df2_g0],  # Group 0
+         ...     [df1_g1, df2_g1],  # Group 1
+         ...     [df1_g2, df2_g2],  # Group 2
+         ... ]
+         >>> # One target value per output row; a value may repeat.
+         >>> main_times = np.array([10, 10, 20, 30])
+         >>>
+         >>> # Create output rows, each with 2 join positions selecting
+         >>> # different groups:
+         >>> combinations = np.array([
+         ...     [0, 1],  # main[0] (10): pos0 -> group0, pos1 -> group1
+         ...     [0, 2],  # main[1] (10): pos0 -> group0, pos1 -> group2
+         ...     [1, 2],  # main[2] (20): pos0 -> group1, pos1 -> group2
+         ...     [2, 1],  # main[3] (30): pos0 -> group2, pos1 -> group1
+         ... ])
+         >>>
+         >>> # Initialize merger
+         >>> prefixes = [
+         ...     ['left_df1_', 'left_df2_'],  # Prefixes for first join position
+         ...     ['right_df1_', 'right_df2_'],  # Prefixes for second join position
+         ... ]
+         >>> merger = AsofMerger(on='timestamp', n_dfs_per_group=2, prefixes=prefixes)
+         >>>
+         >>> # Basic join (current values only)
+         >>> result = merger.merge(main_times, df_groups=df_groups, combinations=combinations)
+         >>> # Result has 4 rows, each with 5 columns (2 join positions × 2
+         >>> # dataframes × 1 value + 1 main)
+         >>>
+         >>> # Advanced join with previous values
+         >>> n_prev = [3, 1]  # df0: +3 previous, df1: +1 previous
+         >>> merger_prev = AsofMerger(
+         ...     on='timestamp', n_dfs_per_group=2, prefixes=prefixes, n_prev=n_prev,
+         ... )
+         >>> result = merger_prev.merge(main_times, df_groups=df_groups, combinations=combinations)
+         >>> # Result has 4 rows, each with 13 columns (2 join positions ×
+         >>> # (4+2) columns + 1 main)
+         >>>
+         >>> # Advanced join with custom fill values
+         >>> n_prev = [3, 0]  # df0: +3 previous, df1: +0 previous (but guard row needed)
+         >>> fill_values_init = [
+         ...     [df0_fill_g0, df1_guard_g0],  # Fill data for df0 + guard for df1 in group 0
+         ...     [df0_fill_g1, df1_guard_g1],  # Fill data for df0 + guard for df1 in group 1
+         ...     [df0_fill_g2, df1_guard_g2],  # Fill data for df0 + guard for df1 in group 2
+         ... ]
+         >>> merger_fill = AsofMerger(on='timestamp', n_dfs_per_group=2, prefixes=prefixes,
+         ...                          n_prev=n_prev, fill_values_init=fill_values_init)
+         >>> result = merger_fill.merge(main_times, df_groups=df_groups, combinations=combinations)
+         >>> # Result has 4 rows, each with 11 columns (2 join positions ×
+         >>> # (4+1) columns + 1 main)
+
+         Notes
+         -----
+         - All dataframes must have their 'on' column sorted in ascending
+           order.
+         - The asof join uses backward search (nearest preceding value).
+         - Columns in the output are re-ordered per dtype, for improved
+           performance.
+         - For dataframes with previous values, within each dtype the expanded
+           columns for a given source column are ordered from earliest previous
+           on the left to the current value on the right.
+         - ``main`` must be monotonically increasing. The algorithm relies on
+           monotonic ``main`` for efficient asof index computation and for the
+           correctness of the first-entry negative-index guard.
+         - When ``n_prev`` is all zeros and ``fill_values_init`` is None, a
+           single guard row of zeros per dtype is synthesized internally so
+           that early ``main`` values produce correct guard windows.
+         - When ``merge`` is called a first time, the ``fill_values_init``
+           provided at initialization is deleted. The internal state is managed
+           automatically across iterations. The ``_fill_values`` attribute is
+           internal and does not necessarily need to be bound for resumability;
+           binding ``_conf`` alone may be sufficient. On resume,
+           ``_fill_values`` is synthesized from zeros (or rebuilt from
+           ``fill_values_init`` when provided).
+         - Iterative usage guidance:
+           * The internal state (``fill_values``, and other configuration
+             parameters) is managed automatically.
+           * If you set ``copy=False`` across iterations, take a copy of each
+             returned result DataFrame before invoking the next iteration to
+             avoid overwriting via shared underlying buffers.
+
+         """
+         if combinations is None:
+             # Normalize inputs up-front when combinations is None (single join
+             # position).
+             if not isinstance(df_groups[0], DataFrame):
+                 raise ValueError(
+                     "when 'combinations' is None, 'df_groups' must be a flat "
+                     "list of DataFrames.",
+                 )
+             df_groups = [df_groups]
+             combinations = zeros((len(main), 1), dtype=int)
+         n_join_positions = combinations.shape[1]
+         # Optional monotonicity checks.
+         if check_sorted:
+             _validate_monotonic_increasing(main, df_groups, self.on)
+         if self._conf is None:
+             # First use of function, validate consistency of input data.
+             _validate_params(
+                 main=main,
+                 n_dfs_per_group=self.n_dfs_per_group,
+                 df_groups=df_groups,
+                 combinations=combinations,
+             )
+             self._conf = _get_config(
+                 cols_dtypes_per_df=[df.dtypes.to_dict() for df in df_groups[0]],
+                 filter_out=[self.on],
+                 n_join_positions=n_join_positions,
+                 prefixes=self.prefixes,
+                 n_prev=self.n_prev,
+                 n_prev_suffix_start=self.n_prev_suffix_start,
+             )
+         if self._fill_values is None:
+             if isinstance(self.fill_values_init, list):
+                 _validate_fill_values_init(
+                     on=self.on,
+                     df_groups=df_groups,
+                     n_prev=self.n_prev,
+                     fill_values_init=self.fill_values_init,
+                 )
+             self._fill_values = _initialize_fill_values(
+                 n_df_groups=len(df_groups),
+                 cols_per_dtype=self._conf[KEY_COLS_PER_DTYPE],
+                 n_prev=self.n_prev,
+                 fill_values_init=self.fill_values_init,
+             )
+             # 'fill_values_init' no longer needed after transformation, free
+             # memory.
+             del self.fill_values_init
+         # Reindex dfs to ensure columns are in the same order per dtype
+         # (makes use of slices safe in '_comb_merge_asof').
+         for group in df_groups:
+             for df_idx in range(len(group)):
+                 group[df_idx] = group[df_idx].reindex(
+                     columns=self._conf[KEY_COLS_REINDEX_IN_DF][df_idx],
+                     copy=False,
+                 )
+         # Resize result arrays to match the number of output rows.
+         res_arrays = _resize_res_arrays_length(
+             seed_res_arrays=self._conf[KEY_SEED_RES_ARRAYS],
+             n_rows=len(main),
+             copy=copy,
+         )
+         # Set 'res_arrays' in-place.
+         _comb_merge_asof(
+             main=main,
+             on=self.on,
+             df_groups=df_groups,
+             cols_slices_in_df_per_dtype=self._conf[KEY_COLS_SLICES_IN_DF_PER_DTYPE],
+             combinations=combinations,
+             allow_exact_matches=self.allow_exact_matches,
+             n_prev=self.n_prev,
+             fill_values=self._fill_values,
+             res_arrays=res_arrays,
+             cols_slices_in_res_per_join_pos=self._conf[KEY_COLS_SLICES_IN_RES_PER_JOIN_POS],
+         )
+         return DataFrame(
+             {self.on: main}
+             | {
+                 col_name: res_arrays[col_dtype][:, col_slice].reshape(-1)
+                 for col_name, (
+                     col_dtype,
+                     col_slice,
+                 ) in self._conf[KEY_COLS_DTYPES_SLICES_IN_RES_PER_COL_NAME].items()
+             },
+         )
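For orientation, a minimal end-to-end sketch of the single-join-position path. The data is illustrative, and the import assumes oups.stateful_ops.asof_merger re-exports AsofMerger (its __init__.py appears in the file list above):

    import numpy as np
    from pandas import DataFrame

    from oups.stateful_ops.asof_merger import AsofMerger  # assumed re-export

    df_a = DataFrame({"timestamp": [1, 3, 5], "price": [10.0, 11.0, 12.0]})
    df_b = DataFrame({"timestamp": [2, 4], "qty": [100.0, 200.0]})

    merger = AsofMerger(on="timestamp", n_dfs_per_group=2)
    main = np.array([3, 4, 5])
    # combinations=None: flat list of dataframes, single join position.
    res = merger.merge(main, df_groups=[df_a, df_b])
    # With allow_exact_matches defaulting to False, an exact timestamp
    # match falls back to the preceding row in each dataframe.
    print(res)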