oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/store/store/iter_intersections.py
@@ -0,0 +1,397 @@
+ #!/usr/bin/env python3
+ """
+ Created on Mon May 26 18:00:00 2025.
+
+ @author: pierrot
+
+ """
+ from collections import defaultdict
+ from collections.abc import Iterator
+
+ from numpy import arange
+ from numpy import full
+ from numpy import insert
+ from numpy import nan
+ from numpy import ones
+ from numpy import r_
+ from numpy import roll
+ from numpy import searchsorted
+ from pandas import DataFrame
+ from pandas import Int64Dtype
+ from pandas import Timestamp
+ from pandas import concat
+
+ from oups.defines import KEY_N_ROWS
+ from oups.defines import KEY_ORDERED_ON_MAXS
+ from oups.defines import KEY_ORDERED_ON_MINS
+ from oups.numpy_utils import isnotin_ordered
+ from oups.store.indexer import StoreKey
+ from oups.store.ordered_parquet_dataset import OrderedParquetDataset
+
+
+ KEY_LEFT = "left"
+
+
+ def _get_and_validate_ordered_on_column[K: StoreKey](datasets: dict[K, OrderedParquetDataset]) -> str:
+     """
+     Get and validate the 'ordered_on' column name across all datasets.
+
+     Parameters
+     ----------
+     datasets : dict[K, OrderedParquetDataset]
+         Dictionary mapping dataset keys to their corresponding datasets.
+
+     Returns
+     -------
+     str
+         The common 'ordered_on' column name.
+
+     Raises
+     ------
+     ValueError
+         If ordered_on column names differ between datasets.
+
+     """
+     # Get 'ordered_on' from first key.
+     iter_datasets = iter(datasets.items())
+     first_key, first_dataset = next(iter_datasets)
+     # Validate all keys have the same 'ordered_on' column.
+     for key, dataset in iter_datasets:
+         if dataset.ordered_on != first_dataset.ordered_on:
+             raise ValueError(
+                 f"inconsistent 'ordered_on' columns. '{first_key}' has "
+                 f"'{first_dataset.ordered_on}', but '{key}' has '{dataset.ordered_on}'.",
+             )
+     return first_dataset.ordered_on
+
+
+ def _get_intersections[K: StoreKey](
+     datasets: dict[K, OrderedParquetDataset],
+     start: float | Timestamp | None = None,
+     end_excl: float | Timestamp | None = None,
+ ) -> tuple[dict[K, int], dict[K, int], Iterator[tuple]]:
+     """
+     Create an iterator over intersection boundaries with row group indices.
+
+     This function analyzes row group statistics across all keys to determine
+     intersection boundaries and the corresponding row group indices for each
+     key. It returns starting row group indices, first ending indices, and
+     intersection boundaries.
+
+     Parameters
+     ----------
+     datasets : dict[K, OrderedParquetDataset]
+         Dictionary mapping dataset keys to their corresponding datasets.
+     start : Optional[Union[int, float, Timestamp]], default None
+         Start value for the 'ordered_on' column range.
+     end_excl : Optional[Union[int, float, Timestamp]], default None
+         End value (exclusive) for the 'ordered_on' column range.
+
+     Returns
+     -------
+     tuple[dict[K, int], dict[K, int], Iterator[tuple]]
+         Tuple containing:
+         - dictionary mapping each key to its starting row group index,
+         - dictionary mapping each key to its first ending row group index
+           (exclusive) in the trimmed range,
+         - iterator yielding (current_end_excl, rg_idx_ends_excl) tuples where:
+           * current_end_excl: end boundary (exclusive) of the current
+             intersection,
+           * rg_idx_ends_excl: dict mapping each key to its row group index
+             for this intersection.
+
+     Notes
+     -----
+     - A key with no values in the span of interest will not appear in the
+       returned dicts of row group indices (start, first end excl, and end
+       excl).
+     - The first row group of each key is loaded at the first iteration, even
+       if it is not needed immediately.
+     - For a given dataset, successive row groups sharing the same 'ordered_on'
+       value (the last value of one row group equals the first value of the
+       next) are 'collapsed' into a single row group: the indices returned to
+       'iter_intersections()' ensure that all such row groups are loaded
+       together. This may result in larger intersections being yielded in a
+       single iteration by 'iter_intersections()'.
+
+     """
+     if isinstance(start, Timestamp):
+         start = start.to_numpy()
+     if isinstance(end_excl, Timestamp):
+         end_excl = end_excl.to_numpy()
+     # Store "ordered_on_mins" in a dict, only keeping unique values and
+     # corresponding row group indices.
+     unique_ordered_on_mins = None
+     keys_ordered_on_ends_excl = {}
+     keys_rg_idx_starts = {}
+     keys_rg_idx_first_ends_excl = {}
+     keys_rg_idx_ends_excl = {}
+     for key, dataset in datasets.items():
+         row_group_stats = dataset.row_group_stats
+         ordered_on_mins = row_group_stats.loc[:, KEY_ORDERED_ON_MINS].to_numpy()
+         ordered_on_maxs = row_group_stats.loc[:, KEY_ORDERED_ON_MAXS].to_numpy()
+         n_rgs = len(row_group_stats)
+         # Main row groups are those not overlapping with next ones.
+         mask_main_rgs_for_mins = ones(n_rgs).astype(bool)
+         mask_main_rgs_for_mins[1:] = ordered_on_mins[1:] != ordered_on_maxs[:-1]
+         mask_main_rgs_for_maxs = roll(mask_main_rgs_for_mins, -1)
+         # Skip first row group in trimming.
+         trim_idx_first_end_excl = (
+             searchsorted(ordered_on_maxs[mask_main_rgs_for_maxs], start, side=KEY_LEFT) + 1 if start else 1
+         )
+         _unique_ordered_on_mins = ordered_on_mins[mask_main_rgs_for_mins]
+         trim_idx_last_end_excl = (
+             searchsorted(_unique_ordered_on_mins, end_excl, side=KEY_LEFT)
+             if end_excl
+             else len(_unique_ordered_on_mins)
+         )
+         # 'unique_rg_idx_ends_excl' is completed with its length as last value.
+         if trim_idx_first_end_excl < trim_idx_last_end_excl + 1:
+             keys_ordered_on_ends_excl[key] = _unique_ordered_on_mins[
+                 trim_idx_first_end_excl:trim_idx_last_end_excl
+             ]
+             # Collect 'ordered_on_mins' for each key, keeping unique values only.
+             if unique_ordered_on_mins is None:
+                 unique_ordered_on_mins = keys_ordered_on_ends_excl[key]
+             else:
+                 is_not_found, unfound_insert_idx = isnotin_ordered(
+                     sorted_array=unique_ordered_on_mins,
+                     query_elements=keys_ordered_on_ends_excl[key],
+                     return_insert_positions=True,
+                 )
+                 unique_ordered_on_mins = insert(
+                     unique_ordered_on_mins,
+                     unfound_insert_idx,
+                     keys_ordered_on_ends_excl[key][is_not_found],
+                 )
+             rg_idx = arange(n_rgs)
+             _unique_rg_idx_ends_excl = r_[rg_idx[mask_main_rgs_for_mins], n_rgs]
+             keys_rg_idx_starts[key] = _unique_rg_idx_ends_excl[trim_idx_first_end_excl - 1]
+             keys_rg_idx_ends_excl[key] = _unique_rg_idx_ends_excl[
+                 trim_idx_first_end_excl : trim_idx_last_end_excl + 1
+             ]
+             keys_rg_idx_first_ends_excl[key] = keys_rg_idx_ends_excl[key][0]
+     if unique_ordered_on_mins is None:
+         return {}, {}, iter([])
+     # Adding one for last value, which will be either 'end_excl' or None.
+     len_unique_ordered_on_mins = len(unique_ordered_on_mins) + 1
+     for key, rg_idx_ends_excl in keys_rg_idx_ends_excl.items():
+         _rg_idx_ends_excl = full(len_unique_ordered_on_mins, nan)
+         # Forcing last row group index, which cannot always be positioned
+         # in the case 'end_excl' is None, since it is then not in
+         # 'keys_ordered_on_ends_excl[key]'.
+         _rg_idx_ends_excl[-1] = rg_idx_ends_excl[-1]
+         confirmed_ordered_on_ends_excl_idx = searchsorted(
+             unique_ordered_on_mins,
+             keys_ordered_on_ends_excl[key],
+             side=KEY_LEFT,
+         )
+         _rg_idx_ends_excl[confirmed_ordered_on_ends_excl_idx] = rg_idx_ends_excl[
+             : len(confirmed_ordered_on_ends_excl_idx)
+         ]
+         keys_rg_idx_ends_excl[key] = _rg_idx_ends_excl
+     intersections = DataFrame(keys_rg_idx_ends_excl, dtype=Int64Dtype())
+     intersections.bfill(axis=0, inplace=True)
+     return (
+         keys_rg_idx_starts,
+         keys_rg_idx_first_ends_excl,
+         zip(
+             list(unique_ordered_on_mins) + [end_excl],
+             intersections.to_dict(orient="records"),
+             strict=False,
+         ),
+     )
+
+
+ def _initialize_first_load[K: StoreKey](
+     datasets: dict[K, OrderedParquetDataset],
+     ordered_on_col_name: str,
+     rg_idx_starts: dict[K, int],
+     prev_rg_idx_ends_excl: dict[K, int],
+     start: float | Timestamp | None,
+     n_prev: list[int],
+ ) -> tuple[dict[K, DataFrame], dict[K, int | None]]:
+     """
+     Prepare first in-memory dataframes and initial start indices, honoring n_prev.
+
+     Parameters
+     ----------
+     datasets : dict[K, OrderedParquetDataset]
+         Datasets to read from.
+     ordered_on_col_name : str
+         Column used for ordering.
+     rg_idx_starts : dict[K, int]
+         Row-group start indices for each key.
+     prev_rg_idx_ends_excl : dict[K, int]
+         First row-group end-excluded indices within the trimmed range.
+     start : Union[int, float, Timestamp, None]
+         Start boundary (inclusive).
+     n_prev : list[int]
+         Number of previous rows to prepend to the first slice if available.
+
+     Returns
+     -------
+     tuple[dict[K, DataFrame], dict[K, Optional[int]]]
+         in_memory_data and current_start_indices for the first iteration.
+
+     Notes
+     -----
+     - When additional rows are needed, only the minimal number of previous
+       row-groups is loaded, and only their tail rows necessary to reach
+       'n_prev' are kept.
+
+     """
+     # Base first load.
+     in_memory_data = {
+         key: datasets[key][rg_idx_start : prev_rg_idx_ends_excl[key]].to_pandas()
+         for key, rg_idx_start in rg_idx_starts.items()
+     }
+     # Fast-path when no 'start' anchor.
+     if start is None:
+         return in_memory_data, defaultdict(lambda: None)
+
+     # Else, compute initial start indices anchored at 'start'.
+     current_start_indices = {
+         key: df.loc[:, ordered_on_col_name].searchsorted(start, side=KEY_LEFT)
+         for key, df in in_memory_data.items()
+     }
+     # Fast-path when no 'n_prev' anchor.
+     if not any(n_prev):
+         return in_memory_data, current_start_indices
+
+     # Else, shift start indices backward by 'n_prev' if possible.
+     # Load minimal previous data if needed.
+     for key_idx, (key, df) in enumerate(in_memory_data.items()):
+         # Fast-path when no previous rows are needed.
+         if n_prev[key_idx] == 0:
+             continue
+         new_idx = current_start_indices[key] - n_prev[key_idx]
+         # Fast-path when no previous row groups are needed.
+         if new_idx >= 0:
+             current_start_indices[key] = new_idx
+             continue
+         # Else, need to fetch previous rows from prior row groups.
+         missing = -new_idx
+         rg_idx_start = rg_idx_starts[key]
+         # Fast-path when no previous row groups are available.
+         if rg_idx_start == 0:
+             current_start_indices[key] = 0
+             continue
+         n_rows_array = datasets[key].row_group_stats.loc[:, KEY_N_ROWS].to_numpy()
+         sum_rows = 0
+         for first_needed_rg in range(rg_idx_start - 1, -1, -1):
+             sum_rows += int(n_rows_array[first_needed_rg])
+             if sum_rows >= missing:
+                 break
+         # Load previous row groups.
+         prev_block = datasets[key][first_needed_rg:rg_idx_start].to_pandas()
+         # Add previous rows to in-memory data.
+         prev_tail = prev_block.iloc[-min(missing, len(prev_block)) :, :]
+         in_memory_data[key] = concat([prev_tail, df], ignore_index=True)
+         current_start_indices[key] = 0
+     return in_memory_data, current_start_indices
+
+
+ def iter_intersections[K: StoreKey](
+     datasets: dict[K, OrderedParquetDataset],
+     start: float | Timestamp | None = None,
+     n_prev: int | list[int] = 0,
+     end_excl: float | Timestamp | None = None,
+ ) -> Iterator[dict[K, DataFrame]]:
+     """
+     Iterate over synchronized row groups across multiple datasets in the store.
+
+     This function yields data from multiple datasets (keys) in synchronized
+     chunks, ensuring that all returned DataFrames share overlapping spans
+     in their 'ordered_on' column. This allows processing 'ordered_on'-aligned
+     data from multiple sources.
+
+     Parameters
+     ----------
+     datasets : dict[K, OrderedParquetDataset]
+         Dictionary mapping dataset keys to their corresponding datasets.
+     start : Optional[Union[int, float, Timestamp]], default None
+         Start value (inclusive) for the 'ordered_on' column range. If None,
+         starts from the earliest value across all specified keys.
+     n_prev : Union[int, list[int]], default 0
+         Number of previous rows (values before 'start') to prepend to the
+         first DataFrame yielded for each key. If a list, values are used for
+         each key in the same order as 'datasets'.
+     end_excl : Optional[Union[int, float, Timestamp]], default None
+         End value (exclusive) for the 'ordered_on' column range. If None,
+         continues until the latest value across all specified keys.
+
+     Yields
+     ------
+     dict[K, DataFrame]
+         Dictionary mapping each key to its corresponding DataFrame chunk.
+         All DataFrames in each yielded dictionary share a common span in their
+         'ordered_on' column using [start, end_excl) semantics (start inclusive,
+         end exclusive).
+
+     Notes
+     -----
+     - All datasets must have an 'ordered_on' column with the same name.
+     - The iteration is synchronized: each yield contains data from the same
+       span across all datasets.
+     - Uses [start, end_excl) interval semantics throughout.
+
+     Examples
+     --------
+     >>> store = Store(...)
+     >>> store[key1].write(data1, ordered_on='timestamp')
+     >>> store[key2].write(data2, ordered_on='timestamp')
+     >>> for data_dict in store.iter_intersections([key1, key2], start="2022-01-01"):
+     ...     df1 = data_dict[key1]  # DataFrame for key1
+     ...     df2 = data_dict[key2]  # DataFrame for key2
+     ...     # Process synchronized data
+
+     """
+     # Get and validate ordered_on column name.
+     ordered_on_col_name = _get_and_validate_ordered_on_column(datasets)
+     if isinstance(n_prev, int):
+         n_prev = [n_prev] * len(datasets)
+     if any(_n_prev < 0 for _n_prev in n_prev):
+         raise ValueError("'n_prev' values must be greater than or equal to 0.")
+     # Get row group indices to start iterations with, and intersections.
+     rg_idx_starts, prev_rg_idx_ends_excl, intersections = _get_intersections(
+         datasets,
+         start,
+         end_excl,
+     )
+     # Load initial row groups and initialize start indices, honoring n_prev.
+     in_memory_data, current_start_indices = _initialize_first_load(
+         datasets=datasets,
+         ordered_on_col_name=ordered_on_col_name,
+         rg_idx_starts=rg_idx_starts,
+         prev_rg_idx_ends_excl=prev_rg_idx_ends_excl,
+         start=start,
+         n_prev=n_prev,
+     )
+     current_end_indices = {}
+     for current_end_excl, rg_idx_ends_excl in intersections:
+         for key, rg_idx_end_excl in rg_idx_ends_excl.items():
+             if rg_idx_end_excl != prev_rg_idx_ends_excl[key]:
+                 in_memory_data[key] = datasets[key][prev_rg_idx_ends_excl[key] : rg_idx_end_excl].to_pandas()
+                 prev_rg_idx_ends_excl[key] = rg_idx_end_excl
+                 # Reset start index to 0 for new row group.
+                 current_start_indices[key] = 0
+             # Calculate end indices for current_end_excl.
+             current_end_indices[key] = (
+                 None
+                 if current_end_excl is None
+                 else in_memory_data[key]
+                 .loc[:, ordered_on_col_name]
+                 .searchsorted(
+                     current_end_excl,
+                     side=KEY_LEFT,
+                 )
+             )
+         # Yield synchronized views [current_start, current_end_excl)
+         yield {
+             key: df.iloc[current_start_indices[key] : current_end_indices[key], :].reset_index(
+                 drop=True,
+             )
+             for key, df in in_memory_data.items()
+         }
+         # Buffer end indices for next iteration as start indices.
+         current_start_indices = current_end_indices.copy()
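
As a quick orientation for reviewers, here is a minimal usage sketch of the module-level iter_intersections() added in this release. It is illustrative only and not part of the published files: the key and variable names (prices_key, trades_key, prices_opd, trades_opd) are hypothetical, the two OrderedParquetDataset instances are assumed to already exist and to share an 'ordered_on' column, and the import path is inferred from the oups/store/store/iter_intersections.py layout listed above.

from pandas import Timestamp

from oups.store.store.iter_intersections import iter_intersections

# Hypothetical keys and datasets; both OrderedParquetDataset instances are
# assumed to exist already and to be ordered on the same 'timestamp' column.
datasets = {prices_key: prices_opd, trades_key: trades_opd}
for chunk in iter_intersections(
    datasets,
    start=Timestamp("2025-01-01"),
    n_prev=[1, 0],  # prepend one row before 'start' for the first key only
    end_excl=Timestamp("2025-02-01"),
):
    # Each yielded dict maps the same keys to DataFrames covering the same
    # [start, end_excl) span of the shared 'ordered_on' column.
    prices_df = chunk[prices_key]
    trades_df = chunk[trades_key]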