oups 2025.9.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
@@ -0,0 +1,416 @@
+ #!/usr/bin/env python3
+ """
+ Created on Fri Dec 22 19:00:00 2022.
+
+ @author: pierrot
+
+ """
+ from collections.abc import Callable
+
+ from numba import boolean
+ from numba import float64
+ from numba import int64
+ from numba import literal_unroll
+ from numba import njit
+ from numpy import ndenumerate
+ from numpy import zeros
+ from numpy.typing import NDArray
+
+
+ @njit(
+     [int64[:](int64[:, :], int64[:], boolean), float64[:](float64[:, :], float64[:], boolean)],
+     cache=True,
+ )
+ def jfirst(ar: NDArray, initial: NDArray, use_init: bool):
+     """
+     Jitted first.
+
+     Parameters
+     ----------
+     ar : np.ndarray
+         2d numpy array from which the first value is taken.
+     initial : np.ndarray
+         1d numpy array containing 'initial' values to be considered as the
+         previous first.
+     use_init : bool
+         Whether 'initial' should be used.
+
+     Returns
+     -------
+     np.ndarray
+         'initial' if 'use_init' is true, else the first row of 'ar', or an
+         array of zeros if 'ar' is empty.
+
+     """
+     if use_init:
+         return initial
+     elif len(ar) > 0:
+         return ar[0]
+     else:
+         return zeros(ar.shape[1], dtype=ar.dtype)
+
+
+ @njit(
+     [int64[:](int64[:, :], int64[:], boolean), float64[:](float64[:, :], float64[:], boolean)],
+     cache=True,
+ )
+ def jlast(ar: NDArray, initial: NDArray, use_init: bool):
+     """
+     Jitted last.
+
+     Parameters
+     ----------
+     ar : np.ndarray
+         2d numpy array from which the last value is taken.
+     initial : np.ndarray
+         1d numpy array containing 'initial' values to be considered as the
+         previous last.
+         These values are used if 'ar' is an empty array.
+     use_init : bool
+         Whether 'initial' should be used.
+
+     Returns
+     -------
+     np.ndarray
+         Last row of 'ar' if 'ar' is not empty, else 'initial', or an array
+         of zeros if 'ar' is empty and 'use_init' is false.
+
+     """
+     if len(ar) > 0:
+         return ar[-1]
+     elif use_init:
+         return initial
+     else:
+         return zeros(ar.shape[1], dtype=ar.dtype)
+
+
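# --- Editor's usage sketch (illustrative, not part of the package diff) ---
# The jitted reducers in this hunk (presumably
# oups/stateful_ops/aggstream/jcumsegagg.py, the only 416-line file in the
# list above) share one calling convention: a 2d array 'ar', a 1d 'initial'
# row of the same dtype, and a 'use_init' flag saying whether 'initial'
# carries results from a previous chunk.
from numpy import array, int64
ar = array([[1, 4], [2, 5], [3, 6]], dtype=int64)
initial = array([0, 9], dtype=int64)
jfirst(ar, initial, False)  # -> array([1, 4]): first row of 'ar'
jfirst(ar, initial, True)   # -> array([0, 9]): 'initial' takes precedence
jlast(ar, initial, True)    # -> array([3, 6]): 'ar' is non-empty, its last row wins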
+ @njit(
+     [int64[:](int64[:, :], int64[:], boolean), float64[:](float64[:, :], float64[:], boolean)],
+     cache=True,
+     parallel=True,
+ )
+ def jmax(ar: NDArray, initial: NDArray, use_init: bool):
+     """
+     Jitted max.
+
+     Parameters
+     ----------
+     ar : np.ndarray
+         2d numpy array over which max values are retrieved for each column.
+     initial : np.ndarray
+         1d numpy array containing 'initial' values to be considered in the
+         max evaluation, for each column.
+     use_init : bool
+         Whether 'initial' should be used.
+
+     Returns
+     -------
+     np.ndarray
+         Max values per column in 'ar', including 'initial' if 'use_init' is
+         true.
+
+     """
+     len_ar = len(ar)
+     if len_ar > 0:
+         if use_init:
+             k = 0
+             res = initial
+         else:
+             k = 1
+             res = ar[0]
+         if len_ar > 1 or k == 0:
+             for row in ar[k:]:
+                 for i, val in ndenumerate(row):
+                     if val > res[i]:
+                         res[i] = val
+         return res
+     elif use_init:
+         return initial
+     else:
+         return zeros(ar.shape[1], dtype=ar.dtype)
+
+
+ @njit(
+     [int64[:](int64[:, :], int64[:], boolean), float64[:](float64[:, :], float64[:], boolean)],
+     cache=True,
+     parallel=True,
+ )
+ def jmin(ar: NDArray, initial: NDArray, use_init: bool):
+     """
+     Jitted min.
+
+     Parameters
+     ----------
+     ar : np.ndarray
+         2d numpy array over which min values are retrieved for each column.
+     initial : np.ndarray
+         1d numpy array containing 'initial' values to be considered in the
+         min evaluation, for each column.
+     use_init : bool
+         Whether 'initial' should be used.
+
+     Returns
+     -------
+     np.ndarray
+         Min values per column in 'ar', including 'initial' if 'use_init' is
+         true.
+
+     """
+     len_ar = len(ar)
+     if len_ar > 0:
+         if use_init:
+             k = 0
+             res = initial
+         else:
+             k = 1
+             res = ar[0]
+         if len_ar > 1 or k == 0:
+             for row in ar[k:]:
+                 for i, val in ndenumerate(row):
+                     if val < res[i]:
+                         res[i] = val
+         return res
+     elif use_init:
+         return initial
+     else:
+         return zeros(ar.shape[1], dtype=ar.dtype)
+
+
+ @njit(
+     [int64[:](int64[:, :], int64[:], boolean), float64[:](float64[:, :], float64[:], boolean)],
+     cache=True,
+     parallel=True,
+ )
+ def jsum(ar: NDArray, initial: NDArray, use_init: bool):
+     """
+     Jitted sum.
+
+     Parameters
+     ----------
+     ar : np.ndarray
+         2d numpy array over which the sum of values is assessed for each
+         column.
+     initial : np.ndarray
+         1d numpy array containing 'initial' values to be considered in the
+         sum evaluation, for each column.
+     use_init : bool
+         Whether 'initial' should be used.
+
+     Returns
+     -------
+     np.ndarray
+         Sum of values per column in 'ar', including 'initial' if 'use_init'
+         is true.
+
+     """
+     len_ar = len(ar)
+     if len_ar > 0:
+         if use_init:
+             k = 0
+             res = initial
+         else:
+             k = 1
+             res = ar[0]
+         if len_ar > 1 or k == 0:
+             for row in ar[k:]:
+                 res += row
+         return res
+     elif use_init:
+         return initial
+     else:
+         return zeros(ar.shape[1], dtype=ar.dtype)
+
+
+ # Aggregation function ids.
+ FIRST = "first"
+ LAST = "last"
+ MIN = "min"
+ MAX = "max"
+ SUM = "sum"
+ AGG_FUNCS = {FIRST: jfirst, LAST: jlast, MIN: jmin, MAX: jmax, SUM: jsum}
+
+
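# --- Editor's usage sketch (illustrative, not part of the package diff) ---
# 'AGG_FUNCS' maps string ids to the jitted reducers, so an aggregation can
# be selected by name. Note that when 'use_init' is true, jmax/jmin/jsum
# start from 'res = initial' and update it in place, so 'initial' doubles as
# an output buffer.
from numpy import array, int64
ar = array([[1, 4], [2, 5], [3, 6]], dtype=int64)
AGG_FUNCS[MAX](ar, array([0, 9], dtype=int64), True)   # -> array([3, 9])
AGG_FUNCS[SUM](ar, array([0, 0], dtype=int64), False)  # -> array([6, 15])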
+ @njit
+ def jcsagg(
+     data: NDArray,  # 2d
+     aggs: tuple[tuple[Callable, NDArray, NDArray]],
+     next_chunk_starts: NDArray,  # 1d
+     bin_indices: NDArray,  # 1d
+     preserve_res: bool,
+     chunk_res: NDArray,  # 1d
+     bin_res: NDArray,  # 2d
+     snap_res: NDArray,  # 2d
+     null_bin_indices: NDArray,  # 1d
+     null_snap_indices: NDArray,  # 1d
+ ):
+     """
+     Group assuming contiguity.
+
+     Parameters
+     ----------
+     data : np.ndarray
+         Array over which the aggregation functions are performed.
+     aggs : tuple[tuple[Callable, NDArray, NDArray]]
+         Tuple of tuples with 3 items:
+           - the aggregation function,
+           - for the related aggregation function, a 1d numpy array listing
+             the indices of the columns in 'data' to which the aggregation
+             function is applied,
+           - for the related aggregation function and corresponding columns
+             in 'data', the indices of the columns in 'bin_res' and/or
+             'snap_res' to which the results are recorded. These indices are
+             listed in a 1d numpy array, sorted in the same order as the
+             indices of the columns in 'data'.
+     next_chunk_starts : np.ndarray
+         Ordered one-dimensional array of ``int``, indicating the index of
+         the 1st row of the next chunk (or the last row index of the current
+         chunk, excluded).
+         May contain duplicates, indicating, depending on the chunk type, an
+         empty bin or an empty snapshot.
+     bin_indices : np.ndarray
+         Sorted, one-dimensional array of ``int``, of the same size as the
+         number of bins, indicating that the chunk at this index in
+         'next_chunk_starts' is a bin (and not a snapshot). Beware that it
+         must not contain duplicate values.
+         In case of no snapshotting ('snap_res' is an empty array),
+         'bin_indices' can be an empty array.
+     preserve_res : bool
+         Whether the 'chunk_res' parameter has to be accounted for in the
+         aggregation results of the 1st chunk. In other words, is the first
+         chunk the continuation of the aggregation calculation from the
+         previous iteration, or is the new iteration to be started from
+         scratch?
+     chunk_res : np.ndarray
+         Aggregation results of the last chunk from the previous iteration.
+         If 'preserve_res' is ``True``, then these results are reused in the
+         1st calculation of this new iteration.
+
+     Returns
+     -------
+     chunk_res : np.ndarray
+         Aggregation results of the last chunk of the current iteration, for
+         use in the next iteration.
+     bin_res : np.ndarray
+         Results from aggregation, with the same `dtype` as the 'data' array,
+         for bins.
+     snap_res : np.ndarray
+         Results from aggregation, with the same `dtype` as the 'data' array,
+         considering intermediate snapshots.
+     null_bin_indices : np.ndarray
+         One-dimensional array containing row indices in 'bin_res' that
+         correspond to "empty" bins, i.e. for which bin size has been set
+         to 0.
+     null_snap_indices : np.ndarray
+         One-dimensional array containing row indices in 'snap_res' that
+         correspond to "empty" snapshots, i.e. for which snapshot size has
+         been set to 0. The input array should be initialized with null
+         values, so that unused rows can be identified clearly.
+
+     Notes
+     -----
+     In case of a 'restart', for the implemented logic to work, it is crucial
+     that in the previous iteration, the last bin has not been an empty one.
+     In the current implementation, if the 'preserve_res' parameter is
+     ``True``, then 'chunk_res' contains valid results which are forwarded
+     into the current iteration.
+     But if the last bin from the previous iteration has been empty, then
+     'chunk_res' does not contain relevant results to be forwarded.
+
+     """
+     # 'pinnu' is 'prev_is_non_null_update'. It is renamed 'preserve_res'.
+     # With 'preserve_res' True, cumulate (pass through) previous results.
+     # TODO: check if the last index in 'next_chunk_starts' is the size of
+     # 'data'. If not, do a last iteration to cover the complete input data,
+     # and simply keep the result in 'chunk_res'. Possibly, activate this
+     # behavior only if a flag is set. In this case, 'preserve_res' should be
+     # output from 'jcumsegagg()' to keep track of the case where the last
+     # bin ends exactly on the last row of data.
+     # TODO: when creating 'null_bin_indices' and 'null_snap_indices', only
+     # trim the trailing '-1' if there are fewer null indices than their
+     # initial size.
+     bin_start = -1 if preserve_res else 0
+     chunk_start = 0
+     bin_res_idx = snap_res_idx = 0
+     null_bin_idx = null_snap_idx = 0
+     # A 'snapshot' is an 'update' or 'pass-through'.
+     # An end of 'bin' induces a reset.
+     if len(snap_res) != 0:
+         # Case 'snapshots expected'.
+         # Setup identification of bins (vs snapshots).
+         some_snaps = True
+         if len(bin_indices) > 0:
+             # Case 'there are bins'.
+             # If a 'snapshot' chunk shares the same last row as a 'bin'
+             # chunk, the 'snapshot' is expected to be listed prior to the
+             # 'bin' chunk. This is ensured in the way bin indices are sorted
+             # vs snapshot indices. Bin indices are identified thanks to
+             # 'bin_indices'.
+             iter_bin_indices = iter(bin_indices)
+             next_bin_idx = next(iter_bin_indices)
+             last_bin_idx = bin_indices[-1]
+         else:
+             # Case 'only snapshots'.
+             next_bin_idx = -1
+     else:
+         # Case 'no snapshot expected'.
+         some_snaps = False
+         is_update = False
+     for (idx,), next_chunk_start in ndenumerate(next_chunk_starts):
+         if some_snaps:
+             # Is the current 'next_chunk_start' idx that of a 'bin' or that
+             # of a 'snapshot'?
+             if idx == next_bin_idx:
+                 # Case 'bin'.
+                 is_update = False
+                 if next_bin_idx != last_bin_idx:
+                     next_bin_idx = next(iter_bin_indices)
+             else:
+                 # Case 'snapshot'.
+                 is_update = True
+         # A null chunk is identified if there has been no new data since the
+         # start of the 'bin', whatever the chunk is, a 'bin' or a 'snapshot'
+         # (update).
+         # An update without any row is not necessarily a null update.
+         # Values from a past update may need to be forwarded.
+         if bin_start == next_chunk_start:
+             # Null chunk since the start of the bin.
+             if is_update:
+                 null_snap_indices[null_snap_idx] = snap_res_idx
+                 null_snap_idx += 1
+                 snap_res_idx += 1
+             else:
+                 null_bin_indices[null_bin_idx] = bin_res_idx
+                 null_bin_idx += 1
+                 bin_res_idx += 1
+                 preserve_res = False
+         else:
+             # Chunk with some rows since the start of the bin.
+             chunk = data[chunk_start:next_chunk_start]
+             # Step 1: compute results for the current chunk.
+             # If there is no data in the current chunk, 'chunk_res' is
+             # naturally forwarded to the next iteration, no need to update
+             # it.
+             if len(chunk) != 0:
+                 # TODO: integrate in 'jcsagg()' a loop over the dtypes, with
+                 # all input arrays and 'chunk_res' for a dtype at the same
+                 # positions in the different input tuples. The motivation is
+                 # that chunks are the same size whatever the dtype of the
+                 # seed data. It would prevent restarting 'jcsagg' for
+                 # different dtypes.
+                 # TODO: is usage of a tuple and 'literal_unroll' really
+                 # necessary? 'aggs' is always a tuple of 3 components here.
+                 # Create a parameter in 'jcsagg' for each component, and
+                 # then use an index to iterate through the iterable inputs
+                 # with the index.
+                 # for agg in aggs:
+                 for agg in literal_unroll(aggs):
+                     agg_func, cols_data, cols_res = agg
+                     chunk_res[cols_res] = agg_func(
+                         chunk[:, cols_data],
+                         chunk_res[cols_res],
+                         preserve_res,
+                     )
+             # Step 2: record results.
+             if is_update:
+                 # Case of 'snapshot', record the result in 'snap_res'.
+                 snap_res[snap_res_idx, :] = chunk_res
+                 # Update local variables and counters.
+                 snap_res_idx += 1
+                 preserve_res = True
+             else:
+                 # Case of 'bin', record the results in 'bin_res'.
+                 bin_res[bin_res_idx, :] = chunk_res
+                 # Update local variables and counters to reflect the end of
+                 # the bin.
+                 bin_res_idx += 1
+                 bin_start = next_chunk_start
+                 preserve_res = False
+             chunk_start = next_chunk_start
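
Editor's note on 'jcsagg': the docstring above describes a segmentation contract in which 'next_chunk_starts' gives the exclusive end row of each successive chunk of 'data', 'bin_indices' flags which of those chunks close a bin (the rest being snapshots), and 'aggs' pairs each jitted reducer with the 'data' columns it reads and the result columns it writes. Below is a minimal pure-NumPy sketch of that contract, restricted to bins only and a single 'sum' aggregation; it illustrates the bookkeeping, not the jitted routine itself, and 'bins_sum_reference' is a hypothetical helper name.

import numpy as np

def bins_sum_reference(data, next_chunk_starts):
    # Sum each contiguous chunk data[start:stop], where the 'stop' values
    # come from 'next_chunk_starts'; a repeated 'stop' means an empty bin,
    # which is reported through 'null_bin_indices' and left as zeros.
    bin_res = np.zeros((len(next_chunk_starts), data.shape[1]), dtype=data.dtype)
    null_bin_indices = []
    start = 0
    for i, stop in enumerate(next_chunk_starts):
        if stop == start:
            null_bin_indices.append(i)
        else:
            bin_res[i] = data[start:stop].sum(axis=0)
        start = stop
    return bin_res, np.array(null_bin_indices, dtype=np.int64)

data = np.array([[1, 10], [2, 20], [3, 30]], dtype=np.int64)
# Three bins: rows 0-1, then an empty bin, then row 2.
bin_res, null_bins = bins_sum_reference(data, np.array([2, 2, 3]))
# bin_res   -> [[3, 30], [0, 0], [3, 30]]
# null_bins -> [1]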