oups-2025.9.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of oups might be problematic.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0

oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py

@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+"""
+Created on Mon Mar 17 18:00:00 2025.
+
+Concrete implementation of OARMergeSplitStrategy based on number of rows.
+
+@author: pierrot
+
+"""
+from functools import cached_property
+
+from numpy import cumsum
+from numpy import int_
+from numpy import linspace
+from numpy import maximum
+from numpy import r_
+from numpy import searchsorted
+from numpy import unique
+from numpy import zeros
+from numpy.typing import NDArray
+from pandas import Series
+
+from oups.store.ordered_parquet_dataset.write.merge_split_strategies.base import OARMergeSplitStrategy
+from oups.store.ordered_parquet_dataset.write.merge_split_strategies.base import get_region_start_end_delta
+
+
+LEFT = "left"
+ROW_GROUP_TARGET_SIZE_SCALE_FACTOR = 0.8  # % of target row group size.
+# MIN_RG_NUMBER_TO_ENSURE_ON_TARGET_RGS = 1 / (1 - ROW_GROUP_TARGET_SIZE_SCALE_FACTOR)
+
+
+class NRowsMergeSplitStrategy(OARMergeSplitStrategy):
+    """
+    OAR merge and split strategy based on a target number of rows per row group.
+
+    This strategy ensures that row groups are split when they exceed a target
+    size, while maintaining a minimum size to prevent overly small row groups. It
+    also handles off-target size row groups through the 'max_n_off_target_rgs'
+    parameter.
+
+    Attributes
+    ----------
+    row_group_target_size : int
+        Target number of rows above which a new row group should be created.
+    row_group_min_size : int
+        Minimum number of rows in an ordered atomic region, computed as
+        ``ROW_GROUP_TARGET_SIZE_SCALE_FACTOR * row_group_target_size``.
+    oars_max_n_rows : NDArray
+        Array of shape (e) containing the maximum number of rows in each ordered
+        atomic region, obtained by summing the number of rows in a row group
+        (if present) and the number of rows in its corresponding DataFrame chunk
+        (if present).
+    oars_min_n_rows : NDArray
+        Array of shape (e) containing the likely minimum number of rows in each
+        ordered atomic region. It is equal to ``oars_max_n_rows`` if
+        ``drop_duplicates`` is False.
+
+    Notes
+    -----
+    The maximum number of rows in an OAR is calculated as the sum of:
+    - The number of rows in the row group (if present)
+    - The number of rows in the DataFrame chunk (if present)
+    This represents the worst-case scenario where there are no duplicates.
+
+    """
+
+    def __init__(
+        self,
+        rg_ordered_on_mins: NDArray,
+        rg_ordered_on_maxs: NDArray,
+        df_ordered_on: Series,
+        rgs_n_rows: NDArray,
+        row_group_target_size: int,
+        drop_duplicates: bool | None = False,
+    ):
+        """
+        Initialize scheme with target size.
+
+        Parameters
+        ----------
+        rg_ordered_on_mins : NDArray
+            Array of shape (r) containing the minimum values of the ordered
+            row groups.
+        rg_ordered_on_maxs : NDArray
+            Array of shape (r) containing the maximum values of the ordered
+            row groups.
+        df_ordered_on : Series
+            Series of shape (d) containing the ordered DataFrame.
+        rgs_n_rows : NDArray
+            Array of shape (r) containing the number of rows in each row group
+            in existing ParquetFile.
+        row_group_target_size : int
+            Target number of rows above which a new row group should be created.
+        drop_duplicates : Optional[bool], default False
+            Whether to drop duplicates between row groups and DataFrame.
+
+        """
+        super().__init__(
+            rg_ordered_on_mins,
+            rg_ordered_on_maxs,
+            df_ordered_on,
+            drop_duplicates,
+        )
+        self._specialized_init(
+            rgs_n_rows=rgs_n_rows,
+            row_group_target_size=row_group_target_size,
+            drop_duplicates=drop_duplicates,
+        )
+
+    def _specialized_init(
+        self,
+        rgs_n_rows: NDArray,
+        row_group_target_size: int,
+        drop_duplicates: bool | None = False,
+    ):
+        """
+        Initialize scheme with target size.
+
+        Parameters
+        ----------
+        rgs_n_rows : NDArray
+            Array of shape (r) containing the number of rows in each row group
+            in existing ParquetFile.
+        row_group_target_size : int
+            Target number of rows above which a new row group should be created.
+        drop_duplicates : Optional[bool], default False
+            Whether to drop duplicates between row groups and DataFrame.
+
+        """
+        self.row_group_target_size = row_group_target_size
+        self.row_group_min_size = int(row_group_target_size * ROW_GROUP_TARGET_SIZE_SCALE_FACTOR)
+        # Max number of rows in each ordered atomic region. This is a max in case
+        # there are duplicates between row groups and DataFrame that will be
+        # dropped.
+        self.oars_max_n_rows = zeros(self.n_oars, dtype=int)
+        self.oars_max_n_rows[self.oars_has_row_group] = rgs_n_rows
+        if drop_duplicates:
+            # Assuming each DataFrame chunk and each row group have no
+            # duplicates within themselves, 'oars_min_n_rows' is set assuming
+            # that all rows in the smallest component are duplicates of rows
+            # in the largest component.
+            self.oars_min_n_rows = maximum(self.oars_max_n_rows, self.oars_df_n_rows)
+        else:
+            self.oars_min_n_rows = self.oars_max_n_rows
+        self.oars_max_n_rows += self.oars_df_n_rows
+
+    @cached_property
+    def oars_likely_on_target_size(self) -> NDArray:
+        """
+        Return boolean array indicating which OARs are likely to be on target size.
+
+        An OAR is considered likely to be on target size if:
+        - for OARs containing a DataFrame chunk, its maximum possible size is
+          above the minimum size. This is an estimate since the actual size
+          after deduplication could be smaller.
+        - for OARs containing only row groups, their maximum possible size is
+          between the minimum and target sizes.
+
+        Returns
+        -------
+        NDArray
+            Boolean array of length equal to the number of ordered atomic
+            regions, where True indicates the OAR is likely to be on target
+            size.
+
+        Notes
+        -----
+        See the parent class documentation for details on the asymmetric
+        treatment of OARs with and without DataFrame chunks.
+
+        """
+        return self.oars_has_df_overlap & (  # OAR containing a DataFrame chunk.
+            self.oars_max_n_rows >= self.row_group_min_size
+        ) | ~self.oars_has_df_overlap & (  # OAR containing only row groups.
+            (self.oars_max_n_rows >= self.row_group_min_size)
+            & (self.oars_max_n_rows <= self.row_group_target_size)
+        )
+
+    def mrs_likely_exceeds_target_size(self, mrs_starts_ends_excl: NDArray) -> NDArray:
+        """
+        Return boolean array indicating which merge regions likely exceed target size.
+
+        Parameters
+        ----------
+        mrs_starts_ends_excl : NDArray
+            Array of shape (m, 2) containing the start (included) and end
+            (excluded) indices of the merge regions.
+
+        Returns
+        -------
+        NDArray
+            Boolean array of length equal to the number of merge regions, where
+            True indicates the merge region is likely to exceed target size.
+
+        """
+        return (
+            get_region_start_end_delta(
+                m_values=cumsum(self.oars_min_n_rows),
+                indices=mrs_starts_ends_excl,
+            )
+            >= self.row_group_target_size
+        )
+
+    def _specialized_compute_merge_sequences(
+        self,
+    ) -> list[tuple[int, NDArray]]:
+        """
+        Sequence merge regions (MRs) into optimally sized chunks for writing.
+
+        For each merge region (MR) defined in 'oar_idx_mrs_starts_ends_excl',
+        this method:
+        1. Accumulates row counts using 'self.oars_min_n_rows'.
+        2. Determines split points where accumulated rows reach
+           'self.row_group_target_size'.
+        3. Creates consolidated chunks by filtering the original OARs indices to
+           ensure optimal row group loading.
+
+        This ensures each consolidated chunk approaches the target row size
+        while minimizing the number of row groups loaded into memory at once.
+
+        Returns
+        -------
+        List[Tuple[int, NDArray]]
+            Merge sequences, a list of tuples, where each tuple contains for
+            each merge sequence:
+            - First element: Start index of the first row group in the merge
+              sequence.
+            - Second element: Array of shape (m, 2) containing end indices
+              (excluded) for row groups and DataFrame chunks in the merge
+              sequence.
+
+        Notes
+        -----
+        The partitioning optimizes memory usage by loading only the minimum
+        number of row groups needed to create complete chunks of approximately
+        target size rows. The returned indices may be a subset of the original
+        OARs indices, filtered to ensure efficient memory usage during the
+        write process.
+
+        """
+        # Process each merge region to find optimal split points:
+        # 1. For each merge region, accumulate row counts.
+        # 2. Find indices where accumulated rows reach multiples of target size.
+        # 3. Include the last index of the region.
+        # 4. Return a list of tuples with:
+        #    - Starting row group index for each merge sequence.
+        #    - Array of component end indices at split points.
+        return [
+            (
+                self.oars_rg_idx_starts[oar_idx_start],
+                self.oars_cmpt_idx_ends_excl[oar_idx_start:oar_idx_end_excl][
+                    r_[
+                        unique(
+                            searchsorted(
+                                (
+                                    cum_rows := cumsum(
+                                        self.oars_min_n_rows[oar_idx_start:oar_idx_end_excl],
+                                    )
+                                ),
+                                linspace(
+                                    self.row_group_target_size,
+                                    self.row_group_target_size
+                                    * (n_multiples := cum_rows[-1] // self.row_group_target_size),
+                                    n_multiples,
+                                    endpoint=True,
+                                    dtype=int_,
+                                ),
+                                side=LEFT,
+                            ),
+                        )[:-1],
+                        oar_idx_end_excl - oar_idx_start - 1,
+                    ]
+                ],
+            )
+            for oar_idx_start, oar_idx_end_excl in self.oar_idx_mrs_starts_ends_excl
+        ]
+
+    def compute_split_sequence(self, df_ordered_on: Series) -> list[int]:
+        """
+        Define the split sequence for a chunk depending on the row group target size.
+
+        Result is to be used as `row_group_offsets` parameter in
+        `iter_dataframe` method.
+
+        Parameters
+        ----------
+        df_ordered_on : Series
+            Series by which the DataFrame to be written is ordered.
+
+        Returns
+        -------
+        List[int]
+            A list of indices with the explicit index values to start new row
+            groups.
+
+        """
+        return list(range(0, len(df_ordered_on), self.row_group_target_size))

oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py

@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""
+Created on Mon Mar 17 18:00:00 2025.
+
+Concrete implementation of OARMergeSplitStrategy based on time period.
+
+@author: pierrot
+
+"""
+from functools import cached_property
+
+from numpy import bincount
+from numpy import dtype
+from numpy import flatnonzero
+from numpy import ones
+from numpy import searchsorted
+from numpy import unique
+from numpy import zeros
+from numpy.typing import NDArray
+from pandas import Series
+from pandas import Timestamp
+from pandas import date_range
+
+from oups.date_utils import ceil_ts
+from oups.date_utils import floor_ts
+from oups.store.ordered_parquet_dataset.write.merge_split_strategies.base import OARMergeSplitStrategy
+
+
+LEFT = "left"
+RIGHT = "right"
+DTYPE_DATETIME64 = dtype("datetime64[ns]")
+
+
+class TimePeriodMergeSplitStrategy(OARMergeSplitStrategy):
+    """
+    OAR merge and split strategy based on a time period target per row group.
+
+    This strategy ensures that row groups are split based on time periods. Each
+    resulting row group will ideally contain data from a single time period
+    (e.g., a month, day, etc.).
+
+    Attributes
+    ----------
+    row_group_time_period : str
+        Time period for a row group to be on target size (e.g., 'MS' for month
+        start).
+    oars_mins_maxs : NDArray
+        Array of shape (e, 2) containing the start and end bounds of each
+        ordered atomic region.
+    period_bounds : DatetimeIndex
+        Period bounds over the total time span of the dataset, considering both
+        row groups and DataFrame.
+    oars_period_idx : NDArray
+        Array of shape (e, 2) containing the start and end indices of each
+        ordered atomic region in the period bounds.
+
+    Notes
+    -----
+    - A row group is considered to meet the target size if it contains data from
+      exactly one time period.
+    - A point in time is within a time period if it is greater than or equal to
+      the period start and strictly less than the period end.
+
+    """
+
+    def __init__(
+        self,
+        rg_ordered_on_mins: NDArray,
+        rg_ordered_on_maxs: NDArray,
+        df_ordered_on: Series,
+        row_group_time_period: str,
+        drop_duplicates: bool | None = False,
+    ):
+        """
+        Initialize scheme with target time period.
+
+        Parameters
+        ----------
+        rg_ordered_on_mins : NDArray
+            Array of shape (r) containing the minimum values of the ordered
+            row groups.
+        rg_ordered_on_maxs : NDArray
+            Array of shape (r) containing the maximum values of the ordered
+            row groups.
+        df_ordered_on : Series
+            Series of shape (d) containing the ordered DataFrame.
+        row_group_time_period : str
+            Target period for each row group (pandas freqstr).
+        drop_duplicates : Optional[bool], default False
+            Whether to drop duplicates between row groups and DataFrame.
+
+        """
+        super().__init__(
+            rg_ordered_on_mins,
+            rg_ordered_on_maxs,
+            df_ordered_on,
+            drop_duplicates,
+        )
+        self._specialized_init(
+            rg_ordered_on_mins,
+            rg_ordered_on_maxs,
+            df_ordered_on,
+            row_group_time_period,
+        )
+
+    def _specialized_init(
+        self,
+        rg_ordered_on_mins: NDArray,
+        rg_ordered_on_maxs: NDArray,
+        df_ordered_on: Series,
+        row_group_time_period: str,
+    ):
+        """
+        Initialize scheme with target period.
+
+        Parameters
+        ----------
+        rg_ordered_on_mins : NDArray
+            Minimum value of 'ordered_on' in each row group.
+        rg_ordered_on_maxs : NDArray
+            Maximum value of 'ordered_on' in each row group.
+        df_ordered_on : Series
+            Values of 'ordered_on' column in DataFrame.
+        row_group_time_period : str
+            Expected time period for each row group (pandas freqstr).
+
+        """
+        if not df_ordered_on.empty and df_ordered_on.dtype != DTYPE_DATETIME64:
+            raise TypeError(
+                "if 'row_group_target_size' is a pandas 'freqstr', dtype"
+                f" of column {df_ordered_on.name} has to be 'datetime64[ns]'.",
+            )
+        self.row_group_time_period = row_group_time_period
+        df_ordered_on_np = df_ordered_on.to_numpy()
+        self.oars_mins_maxs = ones((self.n_oars, 2)).astype(DTYPE_DATETIME64)
+        # Row groups encompass DataFrame chunks in an OAR.
+        # Hence, start with DataFrame chunk starts and ends.
+        oar_idx_df_chunk = flatnonzero(self.oars_has_df_overlap)
+        df_idx_chunk_starts = zeros(len(oar_idx_df_chunk), dtype=int)
+        df_idx_chunk_starts[1:] = self.oars_cmpt_idx_ends_excl[oar_idx_df_chunk[:-1], 1]
+        self.oars_mins_maxs[oar_idx_df_chunk, 0] = df_ordered_on_np[df_idx_chunk_starts]
+        self.oars_mins_maxs[oar_idx_df_chunk, 1] = df_ordered_on_np[
+            self.oars_cmpt_idx_ends_excl[oar_idx_df_chunk, 1] - 1
+        ]
+        # Only then add row group starts and ends. They will overwrite where
+        # DataFrame chunks are present.
+        oar_idx_row_groups = flatnonzero(self.oars_has_row_group)
+        self.oars_mins_maxs[oar_idx_row_groups, 0] = rg_ordered_on_mins
+        self.oars_mins_maxs[oar_idx_row_groups, 1] = rg_ordered_on_maxs
+        # Generate period bounds.
+        start_ts = floor_ts(Timestamp(self.oars_mins_maxs[0, 0]), row_group_time_period)
+        end_ts = ceil_ts(Timestamp(self.oars_mins_maxs[-1, 1]), row_group_time_period)
+        self.period_bounds = date_range(start=start_ts, end=end_ts, freq=row_group_time_period)
+        # Find period indices for each OAR.
+        self.oars_period_idx = searchsorted(
+            self.period_bounds,
+            self.oars_mins_maxs,
+            side=RIGHT,
+        )
+
+    @cached_property
+    def oars_likely_on_target_size(self) -> NDArray:
+        """
+        Return boolean array indicating which OARs are likely to be on target size.
+
+        An OAR meets target size if and only if:
+        - It contains exactly one row group OR one DataFrame chunk (not both)
+        - That component fits entirely within a single period bound
+
+        Returns
+        -------
+        NDArray
+            Boolean array of length equal to the number of ordered atomic
+            regions, where True indicates the OAR is likely to be on target
+            size.
+
+        Notes
+        -----
+        See the parent class documentation for details on the asymmetric
+        treatment of OARs with and without DataFrame chunks.
+
+        """
+        # Check if OAR fits in a single period.
+        single_period_oars = self.oars_period_idx[:, 0] == self.oars_period_idx[:, 1]
+        # Check if OAR is the only one in its period.
+        # period_counts = bincount(period_idx_oars.ravel())
+        # Each period index has to appear only twice (once for start, once for end).
+        # Since we already checked OARs don't span multiple periods (start == end),
+        # the check is then only made on the period start.
+        # oars_single_in_period = period_counts[period_idx_oars[:, 0]] == 2
+        return (  # Over-sized OAR containing a DataFrame chunk.
+            self.oars_has_df_overlap & ~single_period_oars
+        ) | (  # OAR with or without DataFrame chunk, alone in its period and within a single period.
+            single_period_oars & (bincount(self.oars_period_idx.ravel())[self.oars_period_idx[:, 0]] == 2)
+        )
+
+    def mrs_likely_exceeds_target_size(self, mrs_starts_ends_excl: NDArray) -> NDArray:
+        """
+        Return boolean array indicating which merge regions likely exceed target size.
+
+        Parameters
+        ----------
+        mrs_starts_ends_excl : NDArray
+            Array of shape (m, 2) containing the start (included) and end
+            (excluded) indices of the merge regions.
+
+        Returns
+        -------
+        NDArray
+            Boolean array of length equal to the number of merge regions, where
+            True indicates the merge region is likely to exceed target size.
+
+        """
+        return (
+            self.oars_period_idx[mrs_starts_ends_excl[:, 0], 0]
+            != self.oars_period_idx[mrs_starts_ends_excl[:, 1] - 1, 1]
+        )
+
+    def _specialized_compute_merge_sequences(
+        self,
+    ) -> list[tuple[int, NDArray]]:
+        """
+        Sequence merge regions (MRs) into optimally sized chunks for writing.
+
+        For each merge region (MR) defined in 'oar_idx_mrs_starts_ends_excl',
+        this method:
+        1. Determines split points where the 'ordered_on' value is equal to or
+           larger than the corresponding time period lower bound and strictly
+           lower than the time period upper bound.
+        2. Creates consolidated chunks by filtering the original OARs indices to
+           ensure optimal row group loading.
+
+        This ensures each consolidated chunk approaches the row group time period
+        while minimizing the number of row groups loaded into memory at once.
+
+        Returns
+        -------
+        List[Tuple[int, NDArray]]
+            Merge sequences, a list of tuples, where each tuple contains for
+            each merge sequence:
+            - First element: Start index of the first row group in the merge
+              sequence.
+            - Second element: Array of shape (m, 2) containing end indices
+              (excluded) for row groups and DataFrame chunks in the merge
+              sequence.
+
+        Notes
+        -----
+        The partitioning optimizes memory usage by loading only the minimum
+        number of row groups needed to create complete chunks of approximately
+        the row group time period. The returned indices may be a subset of the
+        original OARs indices, filtered to ensure efficient memory usage
+        during the write process.
+
+        """
+        # Process each merge region to find period-based split points:
+        # 1. For each merge region, identify the starting row group index.
+        # 2. Find indices of OARs that are the last in each unique time period.
+        # 3. Extract component end indices at these period boundaries.
+        # 4. Return a list of tuples with:
+        #    - Starting row group index for each merge sequence.
+        #    - Array of component end indices at period boundaries.
+        return [
+            (
+                self.oars_rg_idx_starts[oar_idx_start],
+                self.oars_cmpt_idx_ends_excl[oar_idx_start:oar_idx_end_excl][
+                    (
+                        oar_idx_end_excl
+                        - oar_idx_start
+                        - 1
+                        - unique(
+                            self.oars_period_idx[oar_idx_start:oar_idx_end_excl, 0][::-1],
+                            return_index=True,
+                        )
+                    )[1]
+                ],
+            )
+            for oar_idx_start, oar_idx_end_excl in self.oar_idx_mrs_starts_ends_excl
+        ]
+
+    def compute_split_sequence(self, df_ordered_on: Series) -> list[int]:
+        """
+        Define the split sequence for a chunk depending on the row group time period.
+
+        Result is to be used as `row_group_offsets` parameter in
+        `iter_dataframe` method.
+
+        Parameters
+        ----------
+        df_ordered_on : Series
+            Series by which the DataFrame to be written is ordered.
+
+        Returns
+        -------
+        List[int]
+            A list of indices with the explicit index values to start new row
+            groups.
+
+        """
+        # Generate period bounds for the chunk.
+        # start_ts = floor_ts(Timestamp(df_ordered_on.iloc[0]), self.row_group_time_period)
+        # end_ts = ceil_ts(Timestamp(df_ordered_on.iloc[-1]), self.row_group_time_period)
+        # period_bounds = date_range(
+        #     start=start_ts,
+        #     end=end_ts,
+        #     freq=self.row_group_time_period,
+        # )[:-1]
+        # Find where each period boundary falls in 'df_ordered_on'.
+        return unique(
+            searchsorted(
+                df_ordered_on,
+                date_range(
+                    start=floor_ts(Timestamp(df_ordered_on.iloc[0]), self.row_group_time_period),
+                    end=ceil_ts(Timestamp(df_ordered_on.iloc[-1]), self.row_group_time_period),
+                    freq=self.row_group_time_period,
+                )[:-1],
+                side=LEFT,
+            ),
+        ).tolist()