oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py
@@ -0,0 +1,297 @@
+ #!/usr/bin/env python3
+ """
+ Created on Mon Mar 17 18:00:00 2025.
+
+ Concrete implementation of OARMergeSplitStrategy based on number of rows.
+
+ @author: pierrot
+
+ """
+ from functools import cached_property
+
+ from numpy import cumsum
+ from numpy import int_
+ from numpy import linspace
+ from numpy import maximum
+ from numpy import r_
+ from numpy import searchsorted
+ from numpy import unique
+ from numpy import zeros
+ from numpy.typing import NDArray
+ from pandas import Series
+
+ from oups.store.ordered_parquet_dataset.write.merge_split_strategies.base import OARMergeSplitStrategy
+ from oups.store.ordered_parquet_dataset.write.merge_split_strategies.base import get_region_start_end_delta
+
+
+ LEFT = "left"
+ ROW_GROUP_TARGET_SIZE_SCALE_FACTOR = 0.8  # Fraction of the target row group size.
+ # MIN_RG_NUMBER_TO_ENSURE_ON_TARGET_RGS = 1 / (1 - ROW_GROUP_TARGET_SIZE_SCALE_FACTOR)
+
+
+ class NRowsMergeSplitStrategy(OARMergeSplitStrategy):
+     """
+     OAR merge and split strategy based on a target number of rows per row group.
+
+     This strategy ensures that row groups are split when they exceed a target
+     size, while maintaining a minimum size to prevent overly small row groups.
+     It also handles off-target-size row groups through the
+     'max_n_off_target_rgs' parameter.
+
+     Attributes
+     ----------
+     row_group_target_size : int
+         Target number of rows above which a new row group should be created.
+     row_group_min_size : int
+         Minimum number of rows in an ordered atomic region, computed as
+         ``ROW_GROUP_TARGET_SIZE_SCALE_FACTOR * row_group_target_size``.
+     oars_max_n_rows : NDArray
+         Array of shape (e) containing the maximum number of rows in each
+         ordered atomic region, obtained by summing the number of rows in a
+         row group (if present) and the number of rows in its corresponding
+         DataFrame chunk (if present).
+     oars_min_n_rows : NDArray
+         Array of shape (e) containing the likely minimum number of rows in
+         each ordered atomic region. It is equal to ``oars_max_n_rows`` if
+         ``drop_duplicates`` is False.
+
+     Notes
+     -----
+     The maximum number of rows in an OAR is calculated as the sum of:
+     - The number of rows in the row group (if present)
+     - The number of rows in the DataFrame chunk (if present)
+     This represents the worst-case scenario where there are no duplicates.
+
+     """
+
+     def __init__(
+         self,
+         rg_ordered_on_mins: NDArray,
+         rg_ordered_on_maxs: NDArray,
+         df_ordered_on: Series,
+         rgs_n_rows: NDArray,
+         row_group_target_size: int,
+         drop_duplicates: bool | None = False,
+     ):
+         """
+         Initialize scheme with target size.
+
+         Parameters
+         ----------
+         rg_ordered_on_mins : NDArray
+             Array of shape (r) containing the minimum values of the ordered
+             row groups.
+         rg_ordered_on_maxs : NDArray
+             Array of shape (r) containing the maximum values of the ordered
+             row groups.
+         df_ordered_on : Series
+             Series of shape (d) containing the ordered DataFrame.
+         rgs_n_rows : NDArray
+             Array of shape (r) containing the number of rows in each row group
+             in existing ParquetFile.
+         row_group_target_size : int
+             Target number of rows above which a new row group should be created.
+         drop_duplicates : Optional[bool], default False
+             Whether to drop duplicates between row groups and DataFrame.
+
+         """
+         super().__init__(
+             rg_ordered_on_mins,
+             rg_ordered_on_maxs,
+             df_ordered_on,
+             drop_duplicates,
+         )
+         self._specialized_init(
+             rgs_n_rows=rgs_n_rows,
+             row_group_target_size=row_group_target_size,
+             drop_duplicates=drop_duplicates,
+         )
+
+     def _specialized_init(
+         self,
+         rgs_n_rows: NDArray,
+         row_group_target_size: int,
+         drop_duplicates: bool | None = False,
+     ):
+         """
+         Initialize scheme with target size.
+
+         Parameters
+         ----------
+         rgs_n_rows : NDArray
+             Array of shape (r) containing the number of rows in each row group
+             in existing ParquetFile.
+         row_group_target_size : int
+             Target number of rows above which a new row group should be created.
+         drop_duplicates : Optional[bool], default False
+             Whether to drop duplicates between row groups and DataFrame.
+
+         """
+         self.row_group_target_size = row_group_target_size
+         self.row_group_min_size = int(row_group_target_size * ROW_GROUP_TARGET_SIZE_SCALE_FACTOR)
+         # Max number of rows in each ordered atomic region. This is a max in case
+         # there are duplicates between row groups and DataFrame that will be
+         # dropped.
+         self.oars_max_n_rows = zeros(self.n_oars, dtype=int)
+         self.oars_max_n_rows[self.oars_has_row_group] = rgs_n_rows
+         if drop_duplicates:
+             # Assuming each DataFrame chunk and each row group have no
+             # duplicates within themselves, 'oars_min_n_rows' is set assuming
+             # that all rows in the smallest component are duplicates of rows
+             # in the largest component.
+             self.oars_min_n_rows = maximum(self.oars_max_n_rows, self.oars_df_n_rows)
+         else:
+             self.oars_min_n_rows = self.oars_max_n_rows
+         self.oars_max_n_rows += self.oars_df_n_rows
+
+     @cached_property
+     def oars_likely_on_target_size(self) -> NDArray:
+         """
+         Return boolean array indicating which OARs are likely to be on target size.
+
+         An OAR is considered likely to be on target size if:
+         - for OARs containing a DataFrame chunk, its maximum possible size is
+           above the minimum size. This is an estimate since the actual size
+           after deduplication could be smaller.
+         - for OARs containing only row groups, their maximum possible size is
+           between the minimum and target sizes.
+
+         Returns
+         -------
+         NDArray
+             Boolean array of length equal to the number of ordered atomic
+             regions, where True indicates the OAR is likely to be on target
+             size.
+
+         Notes
+         -----
+         See the parent class documentation for details on the asymmetric
+         treatment of OARs with and without DataFrame chunks.
+
+         """
+         return self.oars_has_df_overlap & (  # OAR containing a DataFrame chunk.
+             self.oars_max_n_rows >= self.row_group_min_size
+         ) | ~self.oars_has_df_overlap & (  # OAR containing only row groups.
+             (self.oars_max_n_rows >= self.row_group_min_size)
+             & (self.oars_max_n_rows <= self.row_group_target_size)
+         )
+
+     def mrs_likely_exceeds_target_size(self, mrs_starts_ends_excl: NDArray) -> NDArray:
+         """
+         Return boolean array indicating which merge regions likely exceed target size.
+
+         Parameters
+         ----------
+         mrs_starts_ends_excl : NDArray
+             Array of shape (m, 2) containing the start (included) and end
+             (excluded) indices of the merge regions.
+
+         Returns
+         -------
+         NDArray
+             Boolean array of length equal to the number of merge regions, where
+             True indicates the merge region is likely to exceed target size.
+
+         """
+         return (
+             get_region_start_end_delta(
+                 m_values=cumsum(self.oars_min_n_rows),
+                 indices=mrs_starts_ends_excl,
+             )
+             >= self.row_group_target_size
+         )
+
+     def _specialized_compute_merge_sequences(
+         self,
+     ) -> list[tuple[int, NDArray]]:
+         """
+         Sequence merge regions (MRs) into optimally sized chunks for writing.
+
+         For each merge region (MR) defined in 'oar_idx_mrs_starts_ends_excl',
+         this method:
+         1. Accumulates row counts using 'self.oars_min_n_rows'.
+         2. Determines split points where accumulated rows reach
+            'self.row_group_target_size'.
+         3. Creates consolidated chunks by filtering the original OARs indices
+            to ensure optimal row group loading.
+
+         This ensures each consolidated chunk approaches the target row size
+         while minimizing the number of row groups loaded into memory at once.
+
+         Returns
+         -------
+         List[Tuple[int, NDArray]]
+             Merge sequences, a list of tuples, where each tuple contains for
+             each merge sequence:
+             - First element: Start index of the first row group in the merge
+               sequence.
+             - Second element: Array of shape (m, 2) containing end indices
+               (excluded) for row groups and DataFrame chunks in the merge
+               sequence.
+
+         Notes
+         -----
+         The partitioning optimizes memory usage by loading only the minimum
+         number of row groups needed to create complete chunks of approximately
+         target size rows. The returned indices may be a subset of the original
+         OARs indices, filtered to ensure efficient memory usage during the
+         write process.
+
+         """
+         # Process each merge region to find optimal split points:
+         # 1. For each merge region, accumulate row counts.
+         # 2. Find indices where accumulated rows reach multiples of target size.
+         # 3. Include the last index of the region.
+         # 4. Return a list of tuples with:
+         #    - Starting row group index for each merge sequence.
+         #    - Array of component end indices at split points.
+         return [
+             (
+                 self.oars_rg_idx_starts[oar_idx_start],
+                 self.oars_cmpt_idx_ends_excl[oar_idx_start:oar_idx_end_excl][
+                     r_[
+                         unique(
+                             searchsorted(
+                                 (
+                                     cum_rows := cumsum(
+                                         self.oars_min_n_rows[oar_idx_start:oar_idx_end_excl],
+                                     )
+                                 ),
+                                 linspace(
+                                     self.row_group_target_size,
+                                     self.row_group_target_size
+                                     * (n_multiples := cum_rows[-1] // self.row_group_target_size),
+                                     n_multiples,
+                                     endpoint=True,
+                                     dtype=int_,
+                                 ),
+                                 side=LEFT,
+                             ),
+                         )[:-1],
+                         oar_idx_end_excl - oar_idx_start - 1,
+                     ]
+                 ],
+             )
+             for oar_idx_start, oar_idx_end_excl in self.oar_idx_mrs_starts_ends_excl
+         ]
+
+     def compute_split_sequence(self, df_ordered_on: Series) -> list[int]:
+         """
+         Define the split sequence for a chunk depending on the row group target size.
+
+         Result is to be used as the `row_group_offsets` parameter in the
+         `iter_dataframe` method.
+
+         Parameters
+         ----------
+         df_ordered_on : Series
+             Series by which the DataFrame to be written is ordered.
+
+         Returns
+         -------
+         List[int]
+             A list of indices with the explicit index values at which to start
+             new row groups.
+
+         """
+         return list(range(0, len(df_ordered_on), self.row_group_target_size))
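
The split-point arithmetic in `_specialized_compute_merge_sequences` above is dense; the following standalone sketch (not part of the package, using made-up row counts and the plain string "left" in place of the module's LEFT constant) walks through the same cumsum/linspace/searchsorted pattern on small numbers.

from numpy import cumsum, int_, linspace, r_, searchsorted, unique

oars_min_n_rows = [120, 300, 50, 700, 400, 90]        # rows per ordered atomic region
row_group_target_size = 500

cum_rows = cumsum(oars_min_n_rows)                     # [120, 420, 470, 1170, 1570, 1660]
n_multiples = cum_rows[-1] // row_group_target_size    # 3 full targets fit in 1660 rows
targets = linspace(
    row_group_target_size,
    row_group_target_size * n_multiples,
    n_multiples,
    endpoint=True,
    dtype=int_,
)                                                      # [500, 1000, 1500]
# Index of the first OAR at which each multiple of the target is reached,
# deduplicated, with the last OAR index appended so the tail is always flushed.
split_points = r_[
    unique(searchsorted(cum_rows, targets, side="left"))[:-1],
    len(oars_min_n_rows) - 1,
]
print(split_points)                                    # [3 5]: cut after OAR 3, then the final OAR

In the method itself, these positions index into `self.oars_cmpt_idx_ends_excl` to pick the component end indices at each split point.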
oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py
@@ -0,0 +1,319 @@
+ #!/usr/bin/env python3
+ """
+ Created on Mon Mar 17 18:00:00 2025.
+
+ Concrete implementation of OARMergeSplitStrategy based on time period.
+
+ @author: pierrot
+
+ """
+ from functools import cached_property
+
+ from numpy import bincount
+ from numpy import dtype
+ from numpy import flatnonzero
+ from numpy import ones
+ from numpy import searchsorted
+ from numpy import unique
+ from numpy import zeros
+ from numpy.typing import NDArray
+ from pandas import Series
+ from pandas import Timestamp
+ from pandas import date_range
+
+ from oups.date_utils import ceil_ts
+ from oups.date_utils import floor_ts
+ from oups.store.ordered_parquet_dataset.write.merge_split_strategies.base import OARMergeSplitStrategy
+
+
+ LEFT = "left"
+ RIGHT = "right"
+ DTYPE_DATETIME64 = dtype("datetime64[ns]")
+
+
+ class TimePeriodMergeSplitStrategy(OARMergeSplitStrategy):
+     """
+     OAR merge and split strategy based on a time period target per row group.
+
+     This strategy ensures that row groups are split based on time periods.
+     Each resulting row group will ideally contain data from a single time
+     period (e.g., a month, day, etc.).
+
+     Attributes
+     ----------
+     row_group_time_period : str
+         Time period for a row group to be on target size (e.g., 'MS' for
+         month start).
+     oars_mins_maxs : NDArray
+         Array of shape (e, 2) containing the start and end bounds of each
+         ordered atomic region.
+     period_bounds : DatetimeIndex
+         Period bounds over the total time span of the dataset, considering
+         both row groups and DataFrame.
+     oars_period_idx : NDArray
+         Array of shape (e, 2) containing the start and end indices of each
+         ordered atomic region in the period bounds.
+
+     Notes
+     -----
+     - A row group is considered to meet the target size if it contains data
+       from exactly one time period.
+     - A point in time is within a time period if it is greater than or equal
+       to the period start and strictly less than the period end.
+
+     """
+
+     def __init__(
+         self,
+         rg_ordered_on_mins: NDArray,
+         rg_ordered_on_maxs: NDArray,
+         df_ordered_on: Series,
+         row_group_time_period: str,
+         drop_duplicates: bool | None = False,
+     ):
+         """
+         Initialize scheme with time period.
+
+         Parameters
+         ----------
+         rg_ordered_on_mins : NDArray
+             Array of shape (r) containing the minimum values of the ordered
+             row groups.
+         rg_ordered_on_maxs : NDArray
+             Array of shape (r) containing the maximum values of the ordered
+             row groups.
+         df_ordered_on : Series
+             Series of shape (d) containing the ordered DataFrame.
+         row_group_time_period : str
+             Target period for each row group (pandas freqstr).
+         drop_duplicates : Optional[bool], default False
+             Whether to drop duplicates between row groups and DataFrame.
+
+         """
+         super().__init__(
+             rg_ordered_on_mins,
+             rg_ordered_on_maxs,
+             df_ordered_on,
+             drop_duplicates,
+         )
+         self._specialized_init(
+             rg_ordered_on_mins,
+             rg_ordered_on_maxs,
+             df_ordered_on,
+             row_group_time_period,
+         )
+
+     def _specialized_init(
+         self,
+         rg_ordered_on_mins: NDArray,
+         rg_ordered_on_maxs: NDArray,
+         df_ordered_on: Series,
+         row_group_time_period: str,
+     ):
+         """
+         Initialize scheme with target period.
+
+         Parameters
+         ----------
+         rg_ordered_on_mins : NDArray
+             Minimum value of 'ordered_on' in each row group.
+         rg_ordered_on_maxs : NDArray
+             Maximum value of 'ordered_on' in each row group.
+         df_ordered_on : Series
+             Values of 'ordered_on' column in DataFrame.
+         row_group_time_period : str
+             Expected time period for each row group (pandas freqstr).
+
+         """
+         if not df_ordered_on.empty and df_ordered_on.dtype != DTYPE_DATETIME64:
+             raise TypeError(
+                 "if 'row_group_target_size' is a pandas 'freqstr', dtype"
+                 f" of column {df_ordered_on.name} has to be 'datetime64[ns]'.",
+             )
+         self.row_group_time_period = row_group_time_period
+         df_ordered_on_np = df_ordered_on.to_numpy()
+         self.oars_mins_maxs = ones((self.n_oars, 2)).astype(DTYPE_DATETIME64)
+         # Row groups encompass DataFrame chunks in an OAR.
+         # Hence, start with DataFrame chunk starts and ends.
+         oar_idx_df_chunk = flatnonzero(self.oars_has_df_overlap)
+         df_idx_chunk_starts = zeros(len(oar_idx_df_chunk), dtype=int)
+         df_idx_chunk_starts[1:] = self.oars_cmpt_idx_ends_excl[oar_idx_df_chunk[:-1], 1]
+         self.oars_mins_maxs[oar_idx_df_chunk, 0] = df_ordered_on_np[df_idx_chunk_starts]
+         self.oars_mins_maxs[oar_idx_df_chunk, 1] = df_ordered_on_np[
+             self.oars_cmpt_idx_ends_excl[oar_idx_df_chunk, 1] - 1
+         ]
+         # Only then add row group starts and ends. They overwrite the values
+         # where DataFrame chunks are present.
+         oar_idx_row_groups = flatnonzero(self.oars_has_row_group)
+         self.oars_mins_maxs[oar_idx_row_groups, 0] = rg_ordered_on_mins
+         self.oars_mins_maxs[oar_idx_row_groups, 1] = rg_ordered_on_maxs
+         # Generate period bounds.
+         start_ts = floor_ts(Timestamp(self.oars_mins_maxs[0, 0]), row_group_time_period)
+         end_ts = ceil_ts(Timestamp(self.oars_mins_maxs[-1, 1]), row_group_time_period)
+         self.period_bounds = date_range(start=start_ts, end=end_ts, freq=row_group_time_period)
+         # Find period indices for each OAR.
+         self.oars_period_idx = searchsorted(
+             self.period_bounds,
+             self.oars_mins_maxs,
+             side=RIGHT,
+         )
+
+     @cached_property
+     def oars_likely_on_target_size(self) -> NDArray:
+         """
+         Return boolean array indicating which OARs are likely to be on target size.
+
+         An OAR meets target size if and only if:
+         - It contains exactly one row group OR one DataFrame chunk (not both)
+         - That component fits entirely within a single period bound
+
+         Returns
+         -------
+         NDArray
+             Boolean array of length equal to the number of ordered atomic
+             regions, where True indicates the OAR is likely to be on target
+             size.
+
+         Notes
+         -----
+         See the parent class documentation for details on the asymmetric
+         treatment of OARs with and without DataFrame chunks.
+
+         """
+         # Check if OAR fits in a single period.
+         single_period_oars = self.oars_period_idx[:, 0] == self.oars_period_idx[:, 1]
+         # Check if OAR is the only one in its period.
+         # period_counts = bincount(period_idx_oars.ravel())
+         # Each period index has to appear only twice (once for start, once for end).
+         # Since we already checked OARs don't span multiple periods (start == end),
+         # the check is then only made on the period start.
+         # oars_single_in_period = period_counts[period_idx_oars[:, 0]] == 2
+         return (  # Over-sized OAR containing a DataFrame chunk.
+             self.oars_has_df_overlap & ~single_period_oars
+         ) | (  # OAR with or without DataFrame chunk, single in its period and within a single period.
+             single_period_oars
+             & (bincount(self.oars_period_idx.ravel())[self.oars_period_idx[:, 0]] == 2)
+         )
+
+     def mrs_likely_exceeds_target_size(self, mrs_starts_ends_excl: NDArray) -> NDArray:
+         """
+         Return boolean array indicating which merge regions likely exceed target size.
+
+         Parameters
+         ----------
+         mrs_starts_ends_excl : NDArray
+             Array of shape (m, 2) containing the start (included) and end
+             (excluded) indices of the merge regions.
+
+         Returns
+         -------
+         NDArray
+             Boolean array of length equal to the number of merge regions, where
+             True indicates the merge region is likely to exceed target size.
+
+         """
+         return (
+             self.oars_period_idx[mrs_starts_ends_excl[:, 0], 0]
+             != self.oars_period_idx[mrs_starts_ends_excl[:, 1] - 1, 1]
+         )
+
+     def _specialized_compute_merge_sequences(
+         self,
+     ) -> list[tuple[int, NDArray]]:
+         """
+         Sequence merge regions (MRs) into optimally sized chunks for writing.
+
+         For each merge region (MR) defined in 'oar_idx_mrs_starts_ends_excl',
+         this method:
+         1. Determines split points where the 'ordered_on' value is greater
+            than or equal to the corresponding time period lower bound and
+            strictly lower than the time period upper bound.
+         2. Creates consolidated chunks by filtering the original OARs indices
+            to ensure optimal row group loading.
+
+         This ensures each consolidated chunk approaches the row group time
+         period while minimizing the number of row groups loaded into memory at
+         once.
+
+         Returns
+         -------
+         List[Tuple[int, NDArray]]
+             Merge sequences, a list of tuples, where each tuple contains for
+             each merge sequence:
+             - First element: Start index of the first row group in the merge
+               sequence.
+             - Second element: Array of shape (m, 2) containing end indices
+               (excluded) for row groups and DataFrame chunks in the merge
+               sequence.
+
+         Notes
+         -----
+         The partitioning optimizes memory usage by loading only the minimum
+         number of row groups needed to create complete chunks of approximately
+         the row group time period. The returned indices may be a subset of the
+         original OARs indices, filtered to ensure efficient memory usage
+         during the write process.
+
+         """
+         # Process each merge region to find period-based split points:
+         # 1. For each merge region, identify the starting row group index.
+         # 2. Find indices of OARs that are the last in each unique time period.
+         # 3. Extract component end indices at these period boundaries.
+         # 4. Return a list of tuples with:
+         #    - Starting row group index for each merge sequence.
+         #    - Array of component end indices at period boundaries.
+         return [
+             (
+                 self.oars_rg_idx_starts[oar_idx_start],
+                 self.oars_cmpt_idx_ends_excl[oar_idx_start:oar_idx_end_excl][
+                     (
+                         oar_idx_end_excl
+                         - oar_idx_start
+                         - 1
+                         - unique(
+                             self.oars_period_idx[oar_idx_start:oar_idx_end_excl, 0][::-1],
+                             return_index=True,
+                         )
+                     )[1]
+                 ],
+             )
+             for oar_idx_start, oar_idx_end_excl in self.oar_idx_mrs_starts_ends_excl
+         ]
+
+     def compute_split_sequence(self, df_ordered_on: Series) -> list[int]:
+         """
+         Define the split sequence for a chunk depending on the row group time period.
+
+         Result is to be used as the `row_group_offsets` parameter in the
+         `iter_dataframe` method.
+
+         Parameters
+         ----------
+         df_ordered_on : Series
+             Series by which the DataFrame to be written is ordered.
+
+         Returns
+         -------
+         List[int]
+             A list of indices with the explicit index values at which to start
+             new row groups.
+
+         """
+         # Generate period bounds for the chunk.
+         # start_ts = floor_ts(Timestamp(df_ordered_on.iloc[0]), self.row_group_time_period)
+         # end_ts = ceil_ts(Timestamp(df_ordered_on.iloc[-1]), self.row_group_time_period)
+         # period_bounds = date_range(
+         #     start=start_ts,
+         #     end=end_ts,
+         #     freq=self.row_group_time_period,
+         # )[:-1]
+         # Find where each period boundary falls in 'df_ordered_on'.
+         return unique(
+             searchsorted(
+                 df_ordered_on,
+                 date_range(
+                     start=floor_ts(Timestamp(df_ordered_on.iloc[0]), self.row_group_time_period),
+                     end=ceil_ts(Timestamp(df_ordered_on.iloc[-1]), self.row_group_time_period),
+                     freq=self.row_group_time_period,
+                 )[:-1],
+                 side=LEFT,
+             ),
+         ).tolist()
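
As a rough illustration of the period-based split above, the sketch below reproduces the searchsorted-over-period-bounds idea with plain pandas and numpy. It is not part of the package: it uses a fixed daily period and `Timestamp.floor`/`Timestamp.ceil` as stand-ins for oups' `floor_ts`/`ceil_ts` helpers (assumed here to round down/up to the period boundary); anchored frequencies such as 'MS' would need the package's own helpers, since pandas' floor/ceil only accept fixed frequencies.

from numpy import searchsorted, unique
from pandas import Series, Timestamp, date_range

# Ordered timestamps spanning three days, standing in for 'df_ordered_on'.
df_ordered_on = Series(date_range("2025-03-01 06:00", periods=10, freq="7h"))

period = "D"  # one row group per day in this sketch
start = Timestamp(df_ordered_on.iloc[0]).floor(period)
end = Timestamp(df_ordered_on.iloc[-1]).ceil(period)
bounds = date_range(start=start, end=end, freq=period)[:-1]  # period lower bounds

# Offset of the first row belonging to each period; duplicates collapse when a
# period holds no rows, which is what unique() does in the method above.
row_group_offsets = unique(searchsorted(df_ordered_on, bounds, side="left")).tolist()
print(row_group_offsets)  # [0, 3, 6]: a new row group starts at each day boundary

Each offset marks where the writer would start a new row group when splitting the DataFrame chunk.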