oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py
@@ -0,0 +1,784 @@
+ #!/usr/bin/env python3
+ """
+ Created on Mon Mar 17 18:00:00 2025.
+
+ Ordered atomic regions for Parquet files and DataFrames.
+
+ This module defines the base functions for analyzing how a DataFrame can be
+ merged with existing Parquet files when both are ordered on the same column.
+ An ordered atomic region ('oar') represents the smallest unit for merging,
+ which is either:
+ - A single row group in a ParquetFile and its corresponding overlapping
+   DataFrame chunk (if any)
+ - A DataFrame chunk that doesn't overlap with any row group in the ParquetFile
+
+ @author: pierrot
+
+ """
+ from abc import ABC
+ from abc import abstractmethod
+ from functools import cached_property
+
+ from numpy import arange
+ from numpy import array
+ from numpy import bool_
+ from numpy import column_stack
+ from numpy import cumsum
+ from numpy import diff
+ from numpy import empty
+ from numpy import flatnonzero
+ from numpy import insert
+ from numpy import int8
+ from numpy import int_
+ from numpy import isin
+ from numpy import ones
+ from numpy import r_
+ from numpy import searchsorted
+ from numpy import vstack
+ from numpy import zeros
+ from numpy.typing import NDArray
+ from pandas import Series
+
+
+ LEFT = "left"
+ RIGHT = "right"
+
+
+ def get_region_indices_of_true_values(mask: NDArray[bool_]) -> NDArray[int_]:
+     """
+     Compute the start and end indices of each connected component in `mask`.
+
+     Taken from https://stackoverflow.com/questions/68514880/finding-contiguous-regions-in-a-1d-boolean-array.
+
+     Parameters
+     ----------
+     mask : NDArray[bool_]
+         A 1d numpy array of dtype `bool`.
+
+     Returns
+     -------
+     NDArray[int_]
+         A numpy array containing the start and end indices of each connected
+         component in `mask`.
+
+     """
+     return flatnonzero(diff(r_[int8(0), mask.astype(int8), int8(0)])).reshape(-1, 2)
+
+
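# Editor's illustration (not part of the package diff): a minimal sketch of the
# helper above, which turns a boolean mask into [start, end) index pairs of its
# runs of True values.
#
#     >>> from numpy import array
#     >>> get_region_indices_of_true_values(array([False, True, True, False, True]))
#     array([[1, 3],
#            [4, 5]])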
+ def set_true_in_regions(length: int, regions: NDArray[int_]) -> NDArray[bool_]:
+     """
+     Set regions in a boolean array to True based on start-end index pairs.
+
+     Regions have to be non-overlapping.
+
+     Parameters
+     ----------
+     length : int
+         Length of the output array.
+     regions : NDArray[int_]
+         2D array of shape (n, 2) where each row contains [start, end) indices.
+         Start indices are inclusive, end indices are exclusive.
+         Regions are assumed to be non-overlapping.
+
+     Returns
+     -------
+     NDArray[bool_]
+         Boolean array of length 'length' with True values in specified regions.
+
+     """
+     # Array of changes, with +1 at starts and -1 at ends of regions.
+     changes = zeros(length + 1, dtype=int8)
+     changes[regions[:, 0]] = 1
+     changes[regions[:, 1]] = -1
+     # The positive cumulative sum marks the positions lying inside regions.
+     return cumsum(changes[:-1]).astype(bool_)
+
+
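# Editor's illustration (not part of the package diff): the inverse of the
# previous helper, here as a roundtrip with its output. Note the regions must
# also not touch (an end equal to the next start would cancel out in
# 'changes'); regions produced by get_region_indices_of_true_values are
# maximal runs, so they never touch.
#
#     >>> set_true_in_regions(6, array([[1, 3], [4, 5]]))
#     array([False,  True,  True, False,  True, False])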
+ def get_region_start_end_delta(m_values: NDArray, indices: NDArray) -> NDArray:
+     """
+     Get difference between values at end and start of each region.
+
+     For regions where the start index is 0, the start value is considered 0.
+     For all other regions, the start value is m_values[start_index - 1].
+
+     Parameters
+     ----------
+     m_values : NDArray
+         Array of monotonic values, such as coming from a cumulative sum.
+     indices : NDArray
+         Array of shape (n, 2) where 'n' is the number of regions, and each row
+         contains start (included) and end (excluded) indices of a region.
+
+     Returns
+     -------
+     NDArray
+         Array of length 'n' containing the difference between values at end
+         and start of each region, with special handling for regions starting
+         at index 0.
+
+     """
+     if not indices.size:
+         return empty(0, dtype=int_)
+     if indices[0, 0] == 0:
+         start_values = m_values[indices[:, 0] - 1]
+         start_values[0] = 0
+         return m_values[indices[:, 1] - 1] - start_values
+     else:
+         return m_values[indices[:, 1] - 1] - m_values[indices[:, 0] - 1]
+
+
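# Editor's illustration (not part of the package diff): applied to a cumulative
# sum, this helper returns the per-region totals of the underlying counts.
#
#     >>> m = cumsum(array([1, 2, 3, 4]))   # -> [1, 3, 6, 10]
#     >>> get_region_start_end_delta(m, array([[0, 2], [2, 4]]))
#     array([3, 7])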
+ class OARMergeSplitStrategy(ABC):
+     """
+     Abstract base class for ordered atomic region merge and split strategies.
+
+     This class defines strategies for:
+     - evaluating the likelihood of row groups being on target size after merge,
+     - determining appropriate sizes for new row groups,
+     - consolidating merge plans for efficient write operations.
+
+     An OAR is considered 'on target size' if it meets the target size criteria
+     (either in terms of number of rows or time period). Otherwise it is
+     considered 'off target size'.
+
+     Attributes
+     ----------
+     oars_rg_idx_starts : NDArray[int_]
+         Start indices of row groups in each OAR.
+     oars_cmpt_idx_ends_excl : NDArray[int_]
+         Array of shape (n_oars, 2) with the end indices (excluded) of row
+         groups and DataFrame chunks in each OAR.
+     oars_has_row_group : NDArray[bool_]
+         Boolean array indicating if OAR contains a row group.
+     oars_df_n_rows : NDArray[int_]
+         Number of rows in each DataFrame chunk in each OAR.
+     oars_has_df_overlap : NDArray[bool_]
+         Boolean array indicating if OAR contains a DataFrame chunk.
+     n_oars : int
+         Number of ordered atomic regions.
+     oars_likely_on_target_size : NDArray[bool_]
+         Boolean array indicating if OAR is likely to be on target size.
+     oar_idx_mrs_starts_ends_excl : NDArray[int_]
+         Array of shape (e, 2) containing the OARs' start and end indices for
+         each merge region.
+     rg_idx_ends_excl_not_to_use_as_split_points : Union[NDArray, None]
+         Array containing indices of row groups which should not be used as
+         split points in 'merge_sequences'. This ensures these row groups will
+         all be loaded together, so that duplicate search can be made over all
+         relevant row groups.
+     n_rgs : int
+         Number of existing row groups.
+     n_df_rows : int
+         Number of rows in DataFrame.
+     rg_idx_mrs_starts_ends_excl : List[slice]
+         List of slices, each containing the start (included) and end
+         (excluded) indices of the row groups in a merge sequence.
+     merge_sequences : List[Tuple[int, NDArray]]
+         List of merge sequences, each a tuple of two items:
+         - the first item is the row group index starting the merge sequence,
+         - the second item is a numpy array of shape (n, 2) containing the
+           successive end (excluded) indices of row groups and DataFrame rows
+           in the merge sequence.
+
+     """
+
+     def __init__(
+         self,
+         rg_ordered_on_mins: NDArray,
+         rg_ordered_on_maxs: NDArray,
+         df_ordered_on: Series,
+         drop_duplicates: bool | None = False,
+     ):
+         """
+         Compute ordered atomic regions (OARs) from row groups and DataFrame.
+
+         An ordered atomic region is either:
+         - A row group and its overlapping DataFrame chunk (if any)
+         - A DataFrame chunk that doesn't overlap with any row group
+
+         Returned arrays provide the start and end (excluded) indices in row
+         groups and end (excluded) indices in DataFrame for each of these
+         ordered atomic regions. All these arrays have the same size and
+         describe how the ordered atomic regions are composed.
+
+         Parameters
+         ----------
+         rg_ordered_on_mins : NDArray[Timestamp]
+             Minimum values of 'ordered_on' in each row group.
+         rg_ordered_on_maxs : NDArray[Timestamp]
+             Maximum values of 'ordered_on' in each row group.
+         df_ordered_on : Series[Timestamp]
+             Values of 'ordered_on' column in DataFrame.
+         drop_duplicates : Optional[bool], default False
+             Flag controlling how overlapping boundaries are managed. Row
+             groups are considered first (existing) data, and the DataFrame
+             second data, coming after. When a row group leads a DataFrame
+             chunk, and the last value in the row group is a duplicate of a
+             value in the DataFrame chunk, then
+             - If True, the overlap starts at this index.
+             - If False, there is no overlap at this index.
+
+         Attributes
+         ----------
+         rg_idx_ends_excl_not_to_use_as_split_points : Union[NDArray, None]
+             Array containing indices of row groups which should not be used
+             as split points in 'merge_sequences'. This ensures these row
+             groups will all be loaded together, so that duplicate search can
+             be made over all relevant row groups.
+
+         Raises
+         ------
+         ValueError
+             If input arrays have inconsistent lengths or unsorted data.
+
+         Notes
+         -----
+         Start indices in DataFrame are not provided, as they can be inferred
+         from the end (excluded) indices in DataFrame of the previous ordered
+         atomic region (no part of the DataFrame is omitted for the write).
+
+         In case 'drop_duplicates' is False, and there are duplicate values
+         between row group max values and DataFrame 'ordered_on' values, then
+         DataFrame 'ordered_on' values are considered to be the last
+         occurrences of the duplicates in 'ordered_on'. Leading row groups
+         (with duplicate max values) will not be in the same ordered atomic
+         region as the DataFrame chunk starting at the duplicate 'ordered_on'
+         value. This is an optimization to prevent rewriting these leading
+         row groups.
+
+         Conversely, if 'drop_duplicates' is True, then the row group with
+         the last occurrence of a duplicate 'ordered_on' value will be
+         considered to be the one corresponding to the DataFrame chunk with
+         this value. But all row groups with this duplicate 'ordered_on'
+         value will be considered to have an overlap with the DataFrame chunk
+         with this value, i.e. 'self.oars_has_df_overlap' will be True for
+         these row groups.
+
+         """
+         # Validate 'ordered_on' in row groups and DataFrame.
+         n_rgs = len(rg_ordered_on_mins)
+         n_df_rows = len(df_ordered_on)
+         if n_rgs != len(rg_ordered_on_maxs):
+             raise ValueError("rg_ordered_on_mins and rg_ordered_on_maxs must have the same length.")
+         # Check that rg_maxs[i] does not exceed rg_mins[i+1] (no overlapping row groups).
+         if n_rgs > 1 and (rg_ordered_on_maxs[:-1] > rg_ordered_on_mins[1:]).any():
+             raise ValueError("row groups must not overlap.")
+         # Check that df_ordered_on is sorted.
+         if not df_ordered_on.is_monotonic_increasing:
+             raise ValueError("'df_ordered_on' must be sorted in ascending order.")
+         # Check use of OARSplitStrategy with no row groups.
+         if not n_rgs:
+             self.oars_rg_idx_starts = zeros(1, dtype=int_)
+             self.oars_cmpt_idx_ends_excl = array([[0, n_df_rows]], dtype=int_)
+             self.oars_has_row_group = zeros(1).astype(bool_)
+             self.oars_df_n_rows = array([n_df_rows], dtype=int_)
+             self.oars_has_df_overlap = ones(1).astype(bool_)
+             self.n_oars = 1
+             self.rg_idx_ends_excl_not_to_use_as_split_points = None
+             self.n_rgs = 0
+             self.n_df_rows = n_df_rows
+             return
+
+         if drop_duplicates:
+             # Determine overlap start/end indices in row groups.
+             df_idx_rgs_starts = searchsorted(df_ordered_on, rg_ordered_on_mins, side=LEFT)
+             df_idx_rgs_ends_excl = searchsorted(df_ordered_on, rg_ordered_on_maxs, side=RIGHT)
+         else:
+             df_idx_rgs_starts, df_idx_rgs_ends_excl = searchsorted(
+                 df_ordered_on,
+                 vstack((rg_ordered_on_mins, rg_ordered_on_maxs)),
+                 side=LEFT,
+             )
+         # Keep track of which row groups have an overlap with a DataFrame chunk.
+         rgs_has_df_overlap = df_idx_rgs_starts != df_idx_rgs_ends_excl
+         # 'rg_idx_ends_excl_not_to_use_as_split_points' keeps track of row group
+         # indices which should not be used as split points.
+         self.rg_idx_ends_excl_not_to_use_as_split_points = None
+         if any(
+             rgs_min_equ_max := (rg_ordered_on_mins[1:] == rg_ordered_on_maxs[:-1]) & rgs_has_df_overlap[1:],
+         ):
+             # In case rg_maxs[i] is a duplicate of rg_mins[i+1],
+             # then df_idx_rg_ends_excl for rg[i] should be set to
+             # df_idx_rg_starts of rg[i+1], so that the overlapping df chunk is
+             # not in several row groups.
+             # Restrict the correction to row groups that overlap with a
+             # DataFrame chunk.
+             rg_idx_maxs_to_correct = flatnonzero(rgs_min_equ_max)
+             df_idx_rgs_ends_excl[rg_idx_maxs_to_correct] = df_idx_rgs_starts[rg_idx_maxs_to_correct + 1]
+             self.rg_idx_ends_excl_not_to_use_as_split_points = rg_idx_maxs_to_correct + 1
+         # DataFrame orphans are regions in DataFrame that do not overlap with
+         # any row group. Find indices in row groups of DataFrame orphans.
+         rg_idx_df_orphans = flatnonzero(
+             r_[
+                 df_idx_rgs_starts[0],  # gap at start (0 to first start)
+                 df_idx_rgs_ends_excl[:-1] - df_idx_rgs_starts[1:],
+                 n_df_rows - df_idx_rgs_ends_excl[-1],  # gap at end
+             ],
+         )
+         n_df_orphans = len(rg_idx_df_orphans)
+         rg_idxs_template = arange(n_rgs + 1)
+         if n_df_orphans != 0:
+             # Case of non-overlapping regions in DataFrame.
+             # Resize 'rg_idxs', and duplicate values where there are
+             # non-overlapping regions in DataFrame.
+             # These now really become the OARs.
+             rg_idx_to_insert = rg_idxs_template[rg_idx_df_orphans]
+             rg_idxs_template = insert(
+                 rg_idxs_template,
+                 rg_idx_df_orphans,
+                 rg_idx_to_insert,
+             )
+             # Resize 'df_idx_rgs_ends_excl', and duplicate values where there
+             # are non-overlapping regions in DataFrame.
+             df_idx_to_insert = r_[df_idx_rgs_starts, n_df_rows][rg_idx_df_orphans]
+             df_idx_rgs_ends_excl = insert(
+                 df_idx_rgs_ends_excl,
+                 rg_idx_df_orphans,
+                 df_idx_to_insert,
+             )
+             rgs_has_df_overlap = insert(
+                 rgs_has_df_overlap,
+                 rg_idx_df_orphans,
+                 True,
+             )
+
+         self.oars_rg_idx_starts = rg_idxs_template[:-1] if len(rg_idxs_template) > 1 else zeros(1, dtype=int_)
+         self.oars_cmpt_idx_ends_excl = column_stack((rg_idxs_template[1:], df_idx_rgs_ends_excl))
+         self.oars_has_row_group = rg_idxs_template[:-1] != rg_idxs_template[1:]
+         self.oars_df_n_rows = diff(df_idx_rgs_ends_excl, prepend=0)
+         self.oars_has_df_overlap = rgs_has_df_overlap
+         self.n_oars = len(self.oars_rg_idx_starts)
+         self.n_rgs = n_rgs
+         self.n_df_rows = n_df_rows
+
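# Editor's illustration (not part of the package diff): how 'drop_duplicates'
# changes the overlap bounds through the searchsorted sides, with hypothetical
# values. With rg mins/maxs of [0, 10] / [5, 15] and df_ordered_on = [5, 7, 12]:
#
#     >>> from numpy import searchsorted
#     >>> searchsorted([5, 7, 12], [0, 10], side="left")   # starts, both cases
#     array([0, 2])
#     >>> searchsorted([5, 7, 12], [5, 15], side="right")  # ends, drop_duplicates=True
#     array([1, 3])
#     >>> searchsorted([5, 7, 12], [5, 15], side="left")   # ends, drop_duplicates=False
#     array([0, 3])
#
# With drop_duplicates=True, the duplicate value 5 falls inside the first row
# group's overlap; with False, the first row group has no overlap at all.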
+     @abstractmethod
+     def _specialized_init(self, **kwargs):
+         """
+         Initialize specialized attributes.
+
+         This method initializes attributes specific to a strategy's concrete
+         implementation. It is kept apart from the class constructor to allow
+         reuse by the 'from_oars_desc' class method.
+
+         Parameters
+         ----------
+         **kwargs : dict
+             Keyword arguments to initialize specialized attributes.
+
+         """
+         raise NotImplementedError("Subclasses must implement this method")
+
+     @classmethod
+     def from_oars_desc(
+         cls,
+         oars_rg_idx_starts: NDArray,
+         oars_cmpt_idx_ends_excl: NDArray,
+         oars_has_row_group: NDArray,
+         oars_has_df_overlap: NDArray,
+         rg_idx_ends_excl_not_to_use_as_split_points: NDArray | None,
+         **kwargs,
+     ) -> "OARMergeSplitStrategy":
+         """
+         Create a strategy instance with a given OARs description.
+
+         This is primarily for testing purposes, allowing tests to directly
+         set the 'OARMergeSplitStrategy' base attributes without having to
+         compute them from row groups and DataFrame.
+
+         Parameters
+         ----------
+         oars_rg_idx_starts : NDArray
+             Start indices of row groups in each OAR.
+         oars_cmpt_idx_ends_excl : NDArray
+             End indices (excluded) of row groups and DataFrame chunks in each
+             OAR.
+         oars_has_row_group : NDArray
+             Boolean array indicating if OAR contains a row group.
+         oars_has_df_overlap : NDArray
+             Boolean array indicating if OAR overlaps with a DataFrame chunk.
+         rg_idx_ends_excl_not_to_use_as_split_points : Union[NDArray, None]
+             Array of indices of row groups not to use as split points. These
+             are filtered out from 'merge_sequences'.
+         **kwargs
+             Additional arguments needed by specific strategy implementations.
+             For NRowsMergeSplitStrategy, this should include 'rgs_n_rows',
+             'row_group_target_size', and optionally 'drop_duplicates'.
+             For TimePeriodMergeSplitStrategy, this should include
+             'rg_ordered_on_mins', 'rg_ordered_on_maxs', 'df_ordered_on', and
+             'row_group_time_period'.
+
+         Returns
+         -------
+         OARMergeSplitStrategy
+             An instance of the strategy with the given OARs description.
+
+         """
+         instance = cls.__new__(cls)
+         instance.oars_rg_idx_starts = oars_rg_idx_starts
+         instance.oars_cmpt_idx_ends_excl = oars_cmpt_idx_ends_excl
+         instance.oars_has_row_group = oars_has_row_group
+         instance.oars_df_n_rows = diff(oars_cmpt_idx_ends_excl[:, 1], prepend=0)
+         instance.oars_has_df_overlap = oars_has_df_overlap
+         instance.rg_idx_ends_excl_not_to_use_as_split_points = rg_idx_ends_excl_not_to_use_as_split_points
+         instance.n_oars = len(oars_rg_idx_starts)
+         instance._specialized_init(**kwargs)
+         return instance
+
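# Editor's illustration (not part of the package diff): a hypothetical test
# setup, assuming 'NRowsMergeSplitStrategy' (from n_rows_strategy.py in this
# release) accepts the kwargs named in the docstring above; the array shapes
# here are illustrative only. Two OARs: one row group overlapped by a 2-row
# DataFrame chunk, then an orphan 3-row chunk.
#
#     strategy = NRowsMergeSplitStrategy.from_oars_desc(
#         oars_rg_idx_starts=array([0, 1]),
#         oars_cmpt_idx_ends_excl=array([[1, 2], [1, 5]]),
#         oars_has_row_group=array([True, False]),
#         oars_has_df_overlap=array([True, True]),
#         rg_idx_ends_excl_not_to_use_as_split_points=None,
#         rgs_n_rows=array([100]),
#         row_group_target_size=128,
#     )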
+     @abstractmethod
+     def oars_likely_on_target_size(self) -> NDArray:
+         """
+         Return boolean array indicating which OARs are likely on target size.
+
+         This can be the result of two conditions:
+         - either a single DataFrame chunk, or the merge of a DataFrame chunk
+           and a row group, has a (resulting) size that is on target
+           (not under-sized; over-sized is accepted),
+         - or, if there is only a row group, it is on target size on its own.
+
+         Returns
+         -------
+         NDArray
+             Boolean array of length the number of ordered atomic regions.
+
+         Notes
+         -----
+         The logic implements an asymmetric treatment of OARs with and without
+         DataFrame chunks, to prevent fragmentation and ensure proper
+         compliance with the split strategy, including off target existing
+         row groups.
+
+         1. For OARs containing a DataFrame chunk:
+            - Writing is always triggered (systematic).
+            - If oversized, considered on target to force rewrite of
+              neighboring, already existing, off target row groups.
+            - If undersized, considered off target to be properly accounted
+              for when comparing to 'max_n_off_target_rgs'.
+            This ensures that writing a new on target row group will trigger
+            the rewrite of adjacent off target row groups when
+            'max_n_off_target_rgs' is set.
+
+         2. For OARs containing only row groups:
+            - Writing is triggered only if:
+              * the OAR is within a set of contiguous off target OARs
+                (under or over sized) that neighbors an OAR with a DataFrame
+                chunk, and
+              * either the number of off target OARs exceeds
+                'max_n_off_target_rgs',
+              * or an OAR to be written (with DataFrame chunk) will induce
+                writing of a row group likely to be on target.
+            - Considered off target if either under or over sized, to ensure
+              proper accounting when comparing to 'max_n_off_target_rgs'.
+
+         This approach ensures:
+         - All off target row groups are captured for potential rewrite.
+         - Writing an on target new row group forces rewrite of all adjacent,
+           already existing, off target row groups (under or over sized).
+         - Fragmentation is prevented by consolidating off target regions when
+           such *full* rewrite is triggered.
+
+         """
+         raise NotImplementedError("Subclasses must implement this property")
+
+     @abstractmethod
+     def mrs_likely_exceeds_target_size(self, mrs_starts_ends_excl: NDArray) -> NDArray:
+         """
+         Return boolean array indicating which merge regions likely exceed target size.
+
+         Parameters
+         ----------
+         mrs_starts_ends_excl : NDArray
+             Array of shape (m, 2) containing the start (included) and end
+             (excluded) indices of the merge regions.
+
+         Returns
+         -------
+         NDArray
+             Boolean array of length equal to the number of merge regions,
+             where True indicates the merge region is likely to exceed target
+             size.
+
+         """
+         raise NotImplementedError("Subclasses must implement this method")
+
+     def _compute_merge_regions_start_ends_excl(
+         self,
+         max_n_off_target_rgs: int | None = None,
+     ) -> NDArray[int_]:
+         """
+         Aggregate ordered atomic regions into merge regions.
+
+         Sets of contiguous ordered atomic regions with DataFrame chunks are
+         possibly extended with neighboring regions that are off target size,
+         depending on two conditions:
+         - if the atomic merge region with DataFrame chunks is found to result
+           in a row group potentially on target size,
+         - if the total number of atomic merge regions off target size in a
+           given enlarged merge region is greater than `max_n_off_target_rgs`.
+
+         Parameters
+         ----------
+         max_n_off_target_rgs : Optional[int], default None
+             Maximum number of off-target-size row groups allowed in a
+             contiguous set of row groups. It cannot be set to 0. This
+             parameter helps limit fragmentation by bounding the number of
+             contiguous row groups off target size.
+             A ``None`` value induces no merging of off-target-size row groups
+             neighboring newly added row groups.
+
+         Attributes
+         ----------
+         oar_idx_mrs_starts_ends_excl : NDArray[int_]
+             A numpy array of shape (e, 2) containing the OARs' start and end
+             indices for each merge region.
+
+         Notes
+         -----
+         The reason for including off-target-size OARs contiguous to a newly
+         added OAR likely to be on target size is to prevent the addition of
+         new data from creating isolated sets of off-target-size row groups
+         followed by on-target-size row groups. This most notably applies
+         when new data is appended at the tail of the DataFrame.
+
+         This method relies on the abstract property
+         'oars_likely_on_target_size', which must be implemented by concrete
+         subclasses before calling this method.
+
+         """
+         simple_mrs_starts_ends_excl = get_region_indices_of_true_values(self.oars_has_df_overlap)
+         if max_n_off_target_rgs is None:
+             self.oar_idx_mrs_starts_ends_excl = simple_mrs_starts_ends_excl
+             return
+         elif max_n_off_target_rgs == 0:
+             raise ValueError("'max_n_off_target_rgs' cannot be 0.")
+
+         # If 'max_n_off_target_rgs' is not None, then merge regions need to
+         # be computed.
+         # Step 1: assess start indices (included) and end indices (excluded)
+         # of enlarged merge regions.
+         oars_off_target = ~self.oars_likely_on_target_size
+         potential_emrs_starts_ends_excl = get_region_indices_of_true_values(
+             self.oars_has_df_overlap | oars_off_target,
+         )
+         if self.n_df_rows:
+             # Filter out emrs without overlap with a DataFrame chunk.
+             # If there is no DataFrame overlap, then all enlarged merge
+             # regions are accepted. This allows for resize of row groups if
+             # desired.
+             # As of this point, potential enlarged merge regions are those
+             # which have an overlap with a simple merge region (has a
+             # DataFrame chunk).
+             potential_emrs_starts_ends_excl = potential_emrs_starts_ends_excl[
+                 get_region_start_end_delta(
+                     m_values=cumsum(self.oars_has_df_overlap),
+                     indices=potential_emrs_starts_ends_excl,
+                 ).astype(bool_)
+             ]
+         # Step 2: Filter out enlarged candidates based on multiple criteria.
+         # 2.a - Get number of off-target-size OARs per enlarged merge region.
+         # Those where 'max_n_off_target_rgs' is not reached will be filtered
+         # out.
+         n_off_target_oars_in_pemrs = get_region_start_end_delta(
+             m_values=cumsum(oars_off_target),
+             indices=potential_emrs_starts_ends_excl,
+         )
+         # Keep enlarged merge regions with too many off target atomic
+         # regions, or which likely exceed target size if merging row groups
+         # together.
+         confirmed_emrs_starts_ends_excl = potential_emrs_starts_ends_excl[
+             (n_off_target_oars_in_pemrs > max_n_off_target_rgs)
+             | self.mrs_likely_exceeds_target_size(
+                 mrs_starts_ends_excl=potential_emrs_starts_ends_excl,
+             )
+         ]
+         if not self.n_df_rows:
+             # If there is no DataFrame overlap, no need for subsequent steps.
+             self.oar_idx_mrs_starts_ends_excl = confirmed_emrs_starts_ends_excl
+             return
+
+         # Step 3: Retrieve indices of merge regions which have overlap with a
+         # DataFrame chunk but are not in retained enlarged merge regions.
+         oars_confirmed_emrs = set_true_in_regions(
+             length=self.n_oars,
+             regions=confirmed_emrs_starts_ends_excl,
+         )
+         # Create an array of length the number of simple merge regions, with
+         # value 1 if the simple merge region is within an enlarged merge
+         # region.
+         smrs_overlaps_with_confirmed_emrs = get_region_start_end_delta(
+             m_values=cumsum(oars_confirmed_emrs),
+             indices=simple_mrs_starts_ends_excl,
+         ).astype(bool_)
+         n_simple_mrs_in_enlarged_mrs = sum(smrs_overlaps_with_confirmed_emrs)
+         if n_simple_mrs_in_enlarged_mrs == 0:
+             # Case there are no simple merge regions within enlarged merge
+             # regions. This means there are no enlarged merge regions.
+             self.oar_idx_mrs_starts_ends_excl = simple_mrs_starts_ends_excl
+         elif n_simple_mrs_in_enlarged_mrs == len(simple_mrs_starts_ends_excl):
+             # Case all simple merge regions are encompassed in enlarged merge
+             # regions.
+             self.oar_idx_mrs_starts_ends_excl = confirmed_emrs_starts_ends_excl
+         else:
+             # Case in-between.
+             self.oar_idx_mrs_starts_ends_excl = vstack(
+                 (
+                     simple_mrs_starts_ends_excl[~smrs_overlaps_with_confirmed_emrs],
+                     confirmed_emrs_starts_ends_excl,
+                 ),
+             )
+             # Sort along 1st column.
+             # Sorting is required to ensure that DataFrame chunks are
+             # enumerated correctly (the end index, excluded, of a DataFrame
+             # chunk is the start index of the next DataFrame chunk).
+             self.oar_idx_mrs_starts_ends_excl = self.oar_idx_mrs_starts_ends_excl[
+                 self.oar_idx_mrs_starts_ends_excl[:, 0].argsort()
+             ]
+
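# Editor's illustration (not part of the package diff): the cumsum/delta trick
# used in the steps above. A candidate region "contains at least one OAR with
# a DataFrame chunk" exactly when the cumulative count of such OARs grows
# across the region:
#
#     >>> oars_has_df_overlap = array([False, True, False, False])
#     >>> get_region_start_end_delta(
#     ...     m_values=cumsum(oars_has_df_overlap),
#     ...     indices=array([[0, 3], [3, 4]]),
#     ... ).astype(bool_)
#     array([ True, False])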
+     @abstractmethod
+     def _specialized_compute_merge_sequences(
+         self,
+     ) -> list[tuple[int, NDArray]]:
+         """
+         Sequence merge regions (MRs) into optimally sized chunks for writing.
+
+         Returns
+         -------
+         List[Tuple[int, NDArray]]
+             Merge sequences, a list of tuples, where each tuple contains for
+             each merge sequence:
+             - First element: Start index of the first row group in the merge
+               sequence.
+             - Second element: Array of shape (m, 2) containing end indices
+               (excluded) for row groups and DataFrame chunks in the merge
+               sequence.
+
+         """
+         raise NotImplementedError("Subclasses must implement this method")
+
+     def compute_merge_sequences(
+         self,
+         max_n_off_target_rgs: int | None = None,
+     ) -> list[tuple[int, NDArray]]:
+         """
+         Compute merge sequences.
+
+         This method is a wrapper around the chain of methods:
+         - '_compute_merge_regions_start_ends_excl'
+         - '_specialized_compute_merge_sequences'
+         Additionally, row group indices listed in
+         'rg_idx_ends_excl_not_to_use_as_split_points' are filtered out from
+         the output returned by the child's
+         '_specialized_compute_merge_sequences'. This filtering ensures that,
+         in case 'drop_duplicates' is True, previously existing row groups
+         with a max 'ordered_on' value equal to the next row group's min
+         'ordered_on' value are merged. This guarantees that duplicate search
+         is made over all relevant previously existing row groups.
+
+         Parameters
+         ----------
+         max_n_off_target_rgs : Optional[int], default None
+             Maximum number of off target row groups to merge.
+
+         Returns
+         -------
+         List[Tuple[int, NDArray]]
+             List of tuples, where each tuple contains for each merge
+             sequence:
+             - First element: Start index of the first row group in the merge
+               sequence.
+             - Second element: Array of shape (m, 2) containing end indices
+               (excluded) for row groups and DataFrame chunks in the merge
+               sequence.
+
+         Notes
+         -----
+         The return value is also stored in 'self.merge_sequences'.
+
+         """
+         self._compute_merge_regions_start_ends_excl(max_n_off_target_rgs=max_n_off_target_rgs)
+         self.merge_sequences = (
+             self._specialized_compute_merge_sequences()
+             if self.rg_idx_ends_excl_not_to_use_as_split_points is None
+             else [
+                 (
+                     rg_idx_start,
+                     cmpt_ends_excl[
+                         isin(
+                             cmpt_ends_excl[:, 0],
+                             self.rg_idx_ends_excl_not_to_use_as_split_points,
+                             invert=True,
+                         )
+                     ],
+                 )
+                 for rg_idx_start, cmpt_ends_excl in self._specialized_compute_merge_sequences()
+             ]
+         )
+         return self.merge_sequences
+
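# Editor's illustration (not part of the package diff): the isin-based filter
# above drops split points that fall on protected row group boundaries. With
# hypothetical end indices for one merge sequence:
#
#     >>> cmpt_ends_excl = array([[2, 10], [3, 15], [5, 20]])
#     >>> cmpt_ends_excl[isin(cmpt_ends_excl[:, 0], array([3]), invert=True)]
#     array([[ 2, 10],
#            [ 5, 20]])
#
# The chunk that would have ended at row group 3 is absorbed into the next
# one, so the row groups around that boundary are loaded together for the
# duplicate search.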
+     @cached_property
+     def rg_idx_mrs_starts_ends_excl(self) -> list[slice]:
+         """
+         Get the start and end indices of row groups for each merge region.
+
+         Returns
+         -------
+         List[slice]
+             List of slices, where each slice contains the start (included)
+             and end (excluded) indices of row groups for each merge region.
+
+         """
+         if not hasattr(self, "oar_idx_mrs_starts_ends_excl"):
+             raise AttributeError(
+                 "not possible to return 'rg_idx_mrs_starts_ends_excl' value if "
+                 "'compute_merge_sequences()' has not been run beforehand.",
+             )
+         return [
+             slice(rg_idx_start, rg_idx_end_excl)
+             for oar_idx_start, oar_idx_end_excl in self.oar_idx_mrs_starts_ends_excl
+             if (
+                 (rg_idx_start := self.oars_rg_idx_starts[oar_idx_start])
+                 != (rg_idx_end_excl := self.oars_cmpt_idx_ends_excl[oar_idx_end_excl - 1, 0])
+             )
+         ]
+
+     @cached_property
+     def sort_rgs_after_write(self) -> bool:
+         """
+         Whether to sort row groups after writing.
+
+         Row groups may be written in the middle of existing row groups. They
+         must then be sorted so that order is maintained between row groups.
+
+         Returns
+         -------
+         bool
+             Whether to sort row groups after writing.
+
+         """
+         try:
+             return (
+                 (
+                     len(self.merge_sequences) > 1
+                     # 'merge_sequences[0][1][-1, 0]' is 'rg_idx_ends_excl'
+                     # of the last row group in the first merge sequence.
+                     or self.merge_sequences[0][1][-1, 0] < self.n_rgs
+                 )
+                 if self.merge_sequences
+                 else False
+             )
+         except AttributeError:
+             raise AttributeError(
+                 "not possible to return 'sort_rgs_after_write' value if "
+                 "'compute_merge_sequences()' has not been run beforehand.",
+             )
+
+     @abstractmethod
+     def compute_split_sequence(self, df_ordered_on: Series) -> list[int]:
+         """
+         Define the split sequence for a chunk depending on row group target size.
+
+         The result is to be used as the `row_group_offsets` parameter of the
+         `iter_dataframe` method.
+
+         Parameters
+         ----------
+         df_ordered_on : Series
+             Series by which the DataFrame to be written is ordered.
+
+         Returns
+         -------
+         List[int]
+             A list of indices with the explicit index values at which to
+             start new row groups.
+
+         """
+         raise NotImplementedError("Subclasses must implement this method")