oups-2025.9.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0
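
For orientation, a minimal usage sketch based on the public API visible in the base.py diff below. The directory path and column name are illustrative, the dataset is assumed to already exist on disk, and the import goes through the base module shown in this diff (the package may also re-export the class elsewhere):

    from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.base import (
        OrderedParquetDataset,
    )

    # Opening the dataset acquires 'my_dataset1.lock' exclusively.
    opd = OrderedParquetDataset("parent_directory/my_dataset1", ordered_on="ts")
    print(opd.is_newly_initialized, len(opd), opd.max_file_id)

    # Row-group selection returns a read-only view sharing the same lock.
    first_two = opd[0:2]

    # Read all row groups back as a single pandas DataFrame.
    df = opd.to_pandas()

    # Dropping the last reference releases the lock.
    del first_two, opd
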
oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py
@@ -0,0 +1,863 @@
#!/usr/bin/env python3
"""
Created on Tue Jun 10 22:30:00 2025.

@author: pierrot

Ordered parquet dataset file structure.

parent_directory/
├── my_dataset1/                # Dataset directory
│   ├── file_0000.parquet
│   └── file_0001.parquet
├── my_dataset1_opdmd           # Metadata file
└── my_dataset1.lock            # Exclusive lock file

A lock is acquired at object creation and held for the object's entire lifetime.
This provides simple, race-condition-free exclusive access suitable for
scenarios with limited concurrent processes.

"""
from collections.abc import Iterable
from functools import cached_property
from itertools import chain
from os import remove
from os import rename
from pathlib import Path
from typing import TYPE_CHECKING

from flufl.lock import Lock
from flufl.lock import TimeOutError
from numpy import iinfo
from numpy import isin
from numpy import uint16
from numpy import uint32
from pandas import DataFrame
from pandas import Series
from pandas import concat

from oups.defines import KEY_FILE_IDS
from oups.defines import KEY_N_ROWS
from oups.defines import KEY_ORDERED_ON
from oups.defines import KEY_ORDERED_ON_MAXS
from oups.defines import KEY_ORDERED_ON_MINS
from oups.store.filepath_utils import remove_dir
from oups.store.ordered_parquet_dataset.metadata_filename import get_md_filepath
from oups.store.ordered_parquet_dataset.parquet_adapter import ParquetAdapter
from oups.store.ordered_parquet_dataset.write import write


if TYPE_CHECKING:
    from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.read_only import (
        ReadOnlyOrderedParquetDataset,
    )

LOCK_EXTENSION = ".lock"
PARQUET_FILE_PREFIX = "file_"
PARQUET_FILE_EXTENSION = ".parquet"
# Do not change this order, it is expected by OrderedParquetDataset.write_row_group_files()
RGS_STATS_COLUMNS = [KEY_FILE_IDS, KEY_N_ROWS, KEY_ORDERED_ON_MINS, KEY_ORDERED_ON_MAXS]
RGS_STATS_BASE_DTYPES = {
    KEY_N_ROWS: uint32,
    KEY_FILE_IDS: uint16,
}


parquet_adapter = ParquetAdapter(use_arro3=False)


def get_parquet_filepaths(
    dirpath: Path,
    file_id: int | Series,
    file_id_n_digits: int,
) -> str | list[str]:
    """
    Get standardized parquet file path(s).

    Parameters
    ----------
    dirpath : Path
        The directory path to use in the filename.
    file_id : int or Series[int]
        The file ID to use in the filename. If a Series, a list of file paths
        is returned.
    file_id_n_digits : int
        Number of digits to use for 'file_id' in filename.

    Returns
    -------
    Union[str, list[str]]
        The formatted file path(s).

    """
    return (
        (
            str(dirpath / PARQUET_FILE_PREFIX)
            + file_id.astype("string").str.zfill(file_id_n_digits)
            + PARQUET_FILE_EXTENSION
        ).to_list()
        if isinstance(file_id, Series)
        else dirpath / f"{PARQUET_FILE_PREFIX}{file_id:0{file_id_n_digits}}{PARQUET_FILE_EXTENSION}"
    )
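
# Illustrative behaviour of 'get_parquet_filepaths()' (sketch, values assumed):
# with 'file_id_n_digits=5',
#   get_parquet_filepaths(Path("/data/my_dataset1"), 3, 5)
# yields the path '/data/my_dataset1/file_00003.parquet', while passing a
# pandas Series of file ids returns the corresponding list of path strings.
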

def validate_ordered_on_match(base_ordered_on: str, new_ordered_on: str):
    """
    Check if 'new_ordered_on' is equal to 'base_ordered_on'.

    Raise ValueError if 'new_ordered_on' is not equal to 'base_ordered_on'.

    """
    if base_ordered_on != new_ordered_on:
        raise ValueError(
            f"'ordered_on' parameter value '{new_ordered_on}' does not match "
            f"'{base_ordered_on}' in record dataset.",
        )


class OrderedParquetDataset:
    """
    Base class for Ordered Parquet Dataset with shared functionality.

    This class contains all shared attributes, properties, and methods between
    the full OrderedParquetDataset and its read-only version.

    Attributes
    ----------
    _file_id_n_digits : int
        Number of digits to use for 'file_id' in filename. It is kept as an
        attribute to avoid recomputing it at each call to
        'get_parquet_filepaths()'.
    _lock : Lock
        Exclusive lock held for the object's entire lifetime.
    _lock._ref_count : int
        Reference count for the lock. It needs to be attached to the '_lock'
        attribute.
    _max_allowed_file_id : int
        Maximum allowed file id. Kept as a hidden attribute to avoid
        recomputing it at each call in 'write_row_group_files()'.
    _max_n_rows : int
        Maximum allowed number of rows in a row group. Kept as a hidden
        attribute to avoid recomputing it at each call in
        'write_row_group_files()'.
    dirpath : Path
        Directory path where to load data from.
    is_newly_initialized : bool
        True if this dataset instance was just created and has no existing
        metadata file. False if the dataset was loaded from existing files.
    key_value_metadata : dict[str, str]
        Key-value metadata, from user and including 'ordered_on' column name.
    max_file_id : int
        Maximum file id in current directory.
    ordered_on : str
        Column name to order row groups by. Can be set either at opd
        instantiation or in 'kwargs' of 'write()' method. Once set, it cannot
        be changed.
    row_group_stats : DataFrame
        Row groups statistics,
        - "ordered_on_min", min value in 'ordered_on' column for this group,
        - "ordered_on_max", max value in 'ordered_on' column for this group,
        - "n_rows": number of rows per row group,
        - "file_id": an int indicating the file id for this group.

    Methods
    -------
    remove_from_disk()
        Remove all dataset files from disk and update in-memory state.
    to_pandas()
        Return data as a pandas dataframe.
    write()
        Write data to disk, merging with existing data.
    __del__()
        Release lock when object is garbage collected.
        Uses reference counting to ensure lock is only released when all
        instances are gone.
    __getitem__(self, item: Union[int, slice]) -> 'ReadOnlyOrderedParquetDataset'
        Select among the row-groups using integer/slicing.
    __len__()
        Return number of row groups in the dataset.
    _align_file_ids()
        Align file ids to row group position in the dataset.
    _release_lock()
        Release lock with reference counting.
    _remove_row_group_files()
        Remove row group files from disk. Row group indexes are also removed
        from row_group_stats.
    _sort_row_groups()
        Sort row groups according to their min value in 'ordered_on' column.
    _write_metadata_file()
        Write metadata to disk.
    _write_row_group_files()
        Write row groups as files to disk. One row group per file.

    Notes
    -----
    - There is one row group per file.
    - Dataset metadata are written in a separate file in parquet format, located
      at the same level as the dataset directory (not within the directory).
      This way, if given the directory path, another parquet reader can read
      the dataset without being confused by this metadata file.
    - File ids (in file names) have the same number of digits. This is to ensure
      that files can be read in the correct order by other parquet readers.
    - When creating an OrderedParquetDataset object, a lock is acquired and held
      for the object's entire lifetime. The purpose is to provide
      race-condition-free exclusive access suitable for scenarios with limited
      concurrent processes.
      The lock is acquired with a timeout and a lifetime. The timeout is the
      maximum time to wait for lock acquisition in seconds. The lifetime is the
      expected maximum lifetime of the lock, as a timedelta or integer number of
      seconds, relative to when the lock is acquired.
      Reading and writing operations refresh the lock to the lifetime it was
      initially given.

    """

    def __init__(
        self,
        dirpath: str | Path,
        ordered_on: str | None = None,
        lock_timeout: int | None = None,
        lock_lifetime: int | None = 15,
    ):
        """
        Initialize OrderedParquetDataset.

        A lock is acquired at object creation and held for the object's entire
        lifetime. This provides simple, race-condition-free exclusive access
        suitable for scenarios with limited concurrent processes.

        Parameters
        ----------
        dirpath : Union[str, Path]
            Directory path from where to load data.
        ordered_on : Optional[str], default None
            Column name to order row groups by. If not initialized, it can also
            be provided in 'kwargs' of 'write()' method.
        lock_timeout : Optional[int], default None
            Approximately how long the lock acquisition attempt should be made.
            None (the default) means keep trying forever.
        lock_lifetime : Optional[int], default 15
            The expected maximum lifetime of the lock, as a timedelta or integer
            number of seconds, relative to now. Defaults to 15 seconds.

        """
        self._dirpath = Path(dirpath).resolve()
        # Acquire exclusive lock for the entire object lifetime
        lock_file = self._dirpath.parent / f"{self._dirpath.name}{LOCK_EXTENSION}"
        lock_file.parent.mkdir(parents=True, exist_ok=True)
        self._lock = Lock(str(lock_file), lifetime=lock_lifetime)
        # Initiate '_lock._ref_count'.
        self._lock._ref_count = 0
        try:
            self._lock.lock(timeout=lock_timeout)
        except TimeOutError:
            raise TimeoutError(
                f"failed to acquire lock for dataset '{self._dirpath}' within "
                f"{lock_timeout} seconds. Another process may be using this dataset.",
            )
        # Increment reference counting to the lock object
        self._lock._ref_count += 1
        try:
            # remaining initialization code.
            try:
                self._row_group_stats, self._key_value_metadata = parquet_adapter.read_parquet(
                    str(get_md_filepath(self._dirpath)),
                    return_key_value_metadata=True,
                )
                if ordered_on:
                    validate_ordered_on_match(
                        base_ordered_on=self._key_value_metadata[KEY_ORDERED_ON],
                        new_ordered_on=ordered_on,
                    )
                self._is_newly_initialized = False
            except FileNotFoundError:
                # Using an empty Dataframe so that it can be written in the case
                # user is only using '_write_metadata_file()' without adding row
                # groups.
                self._row_group_stats = DataFrame(columns=RGS_STATS_COLUMNS).astype(
                    RGS_STATS_BASE_DTYPES,
                )
                self._key_value_metadata = {KEY_ORDERED_ON: ordered_on}
                self._is_newly_initialized = True
            # While opd is in memory, 'ordered_on' is kept as a private attribute,
            # with the idea that it is an immutable dataset property, while the
            # content of 'self._key_value_metadata' is mutable.
            self._ordered_on = self._key_value_metadata.pop(KEY_ORDERED_ON)
        except Exception:
            # If initialization code did not go well, release the lock.
            self._release_lock()
            raise
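
    # Illustrative lock behaviour (sketch): if a second process opens the same
    # dataset directory, its constructor blocks in
    # 'self._lock.lock(timeout=lock_timeout)' until this instance releases the
    # lock (via '__del__' / '_release_lock()') or, if 'lock_timeout' was given,
    # raises TimeoutError once the wait exceeds it.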

    def __del__(self):
        """
        Release lock when object is garbage collected.

        Uses reference counting to ensure lock is only released when all instances are
        gone.

        """
        self._release_lock()

    def __getitem__(self, item: int | slice) -> "ReadOnlyOrderedParquetDataset":
        """
        Select among the row-groups using integer/slicing.

        Parameters
        ----------
        item : int or slice
            Integer or slice to select row groups.

        Returns
        -------
        ReadOnlyOrderedParquetDataset
            A new read-only dataset with the selected row groups.

        """
        # To preserve DataFrame format when selecting single row
        row_group_stats_subset = (
            self.row_group_stats.iloc[item : item + 1]
            if isinstance(item, int)
            else self.row_group_stats.iloc[item]
        )
        # Create new instance
        opd_subset = object.__new__(OrderedParquetDataset)
        opd_subset.__dict__ = self.__dict__ | {
            "_row_group_stats": row_group_stats_subset,
        }
        # Increment reference count since new instance shares the lock
        # Lock reference counting note:
        # Two objects will reference the same lock after this method returns:
        # - 'opd_subset' (ephemeral OrderedParquetDataset created here)
        # - the ReadOnlyOrderedParquetDataset created in '_from_instance'
        # We increment the lock ref-count here for 'opd_subset'.
        # '_from_instance' will increment it again for the read-only view.
        # Each object's '__del__' will decrement once, so the net count remains
        # correct and the lock is eventually released when the last reference
        # goes away.
        self._lock._ref_count += 1
        # Lazy import to avoid circular dependency
        from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.read_only import (
            ReadOnlyOrderedParquetDataset,
        )

        return ReadOnlyOrderedParquetDataset._from_instance(opd_subset)
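
    # Illustrative indexing (sketch): 'opd[0]' wraps only the first row group
    # and 'opd[1:3]' the second and third; both return read-only views that
    # share this instance's row group files and its lock (hence the ref-count
    # increments above).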

    def __len__(self):
        """
        Return number of row groups in the dataset.
        """
        return len(self.row_group_stats)

    @cached_property
    def _max_allowed_file_id(self):
        """
        Return maximum allowed file id.
        """
        return iinfo(self.row_group_stats[KEY_FILE_IDS].dtype).max

    @cached_property
    def _file_id_n_digits(self):
        """
        Return number of digits imposed to format file ids in file names.
        """
        return len(str(self._max_allowed_file_id))

    @cached_property
    def _max_n_rows(self):
        """
        Return maximum allowed number of rows in a row group.
        """
        return iinfo(self.row_group_stats[KEY_N_ROWS].dtype).max

    @property
    def dirpath(self):
        """
        Return directory path.
        """
        return self._dirpath

    @property
    def is_newly_initialized(self):
        """
        Return True if this dataset has no existing metadata file.
        """
        return self._is_newly_initialized

    @property
    def key_value_metadata(self):
        """
        Return key-value metadata.
        """
        return self._key_value_metadata

    @property
    def ordered_on(self):
        """
        Return column name to order row groups by.
        """
        return self._ordered_on

    @property
    def row_group_stats(self):
        """
        Return row group statistics.
        """
        return self._row_group_stats

    @property
    def max_file_id(self):
        """
        Return maximum file id in current directory.

        If there is no row group in the directory, return -1.
        Note: Base dataset uses metadata for efficiency and reliability.

        """
        # Get max 'file_id' from 'self.row_group_stats'.
        return -1 if self.row_group_stats.empty else int(self.row_group_stats[KEY_FILE_IDS].max())

    def remove_from_disk(self, preserve_metadata: bool = False, release_lock: bool = True) -> None:
        """
        Remove all dataset files from disk and update in-memory state.

        Parameters
        ----------
        preserve_metadata : bool, default False
            If True, keep user metadata accessible but clear row_group_stats.
            If False, reset both row_group_stats and key_value_metadata.
        release_lock : bool, default True
            If True, release the lock after removal. Set to False if you plan
            to continue using this OPD instance after removal.

        Notes
        -----
        After calling this method with ``release_lock=True``, the OPD instance
        should not be used for file operations, though metadata access remains
        available if ``preserve_metadata=True``.

        """
        # Update in-memory state first to 'inform' all references to this
        # OrderedParquetDataset object.
        self._row_group_stats = DataFrame(columns=RGS_STATS_COLUMNS).astype(RGS_STATS_BASE_DTYPES)
        if not preserve_metadata:
            self._key_value_metadata = {}
        if not self._is_newly_initialized:
            # Then remove the opdmd file.
            get_md_filepath(self.dirpath).unlink()
            # Finally, remove dataset directory and all its contents.
            remove_dir(self.dirpath)
            # Mark as newly initialized since files are gone
            self._is_newly_initialized = True
        if release_lock:
            self._release_lock()

    def to_pandas(self) -> DataFrame:
        """
        Return data as a pandas dataframe.

        Returns
        -------
        DataFrame
            Dataframe.

        """
        # Refreshing the lock to the lifetime it has been provided.
        self._lock.refresh(unconditionally=True)
        return parquet_adapter.read_parquet(
            get_parquet_filepaths(
                self.dirpath,
                self.row_group_stats[KEY_FILE_IDS],
                self._file_id_n_digits,
            ),
            return_key_value_metadata=False,
        )

    def write(self, **kwargs):
        """
        Write data to disk.

        This method relies on the 'oups.store.write.write()' function.

        Parameters
        ----------
        **kwargs : dict
            Keywords in 'kwargs' are forwarded to `oups.store.write.write()`.

        """
        if self.ordered_on is None:
            if KEY_ORDERED_ON in kwargs:
                self._ordered_on = kwargs.pop(KEY_ORDERED_ON)
            else:
                raise ValueError("'ordered_on' parameter is required.")
        elif KEY_ORDERED_ON in kwargs:
            validate_ordered_on_match(
                base_ordered_on=self.ordered_on,
                new_ordered_on=kwargs.pop(KEY_ORDERED_ON),
            )
        write(self, ordered_on=self.ordered_on, **kwargs)
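
    # Illustrative 'ordered_on' handling (sketch): on a dataset created with
    # ordered_on="ts", 'opd.write(ordered_on="other")' raises ValueError
    # through 'validate_ordered_on_match()', while omitting 'ordered_on' in
    # 'kwargs' simply reuses the stored column name.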

    def _align_file_ids(self):
        """
        Align file ids to row group position in the dataset and rename files.

        This method ensures that file ids match their row group positions while:
        1. Minimizing the number of renames.
        2. Avoiding conflicts where target filenames are already taken.
        3. Using temporary filenames when necessary to handle circular
           dependencies.

        """
        # Build mapping of current file ids to desired new ids.
        mask_ids_to_rename = self.row_group_stats.loc[:, KEY_FILE_IDS] != self.row_group_stats.index
        current_ids_to_rename = self.row_group_stats.loc[mask_ids_to_rename, KEY_FILE_IDS]
        if len(current_ids_to_rename) == 0:
            return
        # Initialize 'temp_id' to be used when no direct rename is possible.
        temp_id = self.max_file_id + 1
        new_ids = current_ids_to_rename.index.astype(RGS_STATS_BASE_DTYPES[KEY_FILE_IDS])
        current_to_new = dict(zip(current_ids_to_rename, new_ids, strict=False))
        # Set of ids already being used by files in directory.
        # Before renaming, we will check the 'new_id' is not already taken.
        # Collision rationale:
        # - 'new_ids' are exactly the indices of rows being renamed (those where
        #   'file_id != index').
        # - Rows already correct ('file_id == index') are excluded from the
        #   rename set, and their indices are therefore not in 'new_ids'.
        # - Hence, no rename will ever target the id of a file that is already
        #   correct. We only need to avoid conflicts among the ids within the
        #   rename set itself, which is what this 'ids_already_in_use' covers.
        # - Cycles among the rename set are handled by the temporary id logic
        #   below.
        ids_already_in_use = set(current_ids_to_rename)
        # Process renames
        while current_to_new:
            # Find a current_id whose new_id is not taken by another current_id.
            for current_id, new_id in list(current_to_new.items()):
                if new_id not in ids_already_in_use:
                    # Safe to rename directly
                    rename(
                        get_parquet_filepaths(self.dirpath, current_id, self._file_id_n_digits),
                        get_parquet_filepaths(self.dirpath, new_id, self._file_id_n_digits),
                    )
                    del current_to_new[current_id]
                    ids_already_in_use.discard(current_id)
                else:
                    # No direct renames possible, need to use temporary id.
                    current_to_new[current_id] = temp_id
                    # Add at bottom of dict the correct mapping.
                    current_to_new[temp_id] = new_id
                    temp_id += 1
                    # Restart the loop.
                    break
        # Set new ids.
        self._row_group_stats.loc[mask_ids_to_rename, KEY_FILE_IDS] = new_ids
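
    # Illustrative rename cycle (sketch): with two row groups whose file ids
    # are [1, 0] at positions [0, 1], neither direct rename is possible, so the
    # loop first reroutes 1 -> temp id 2 (max_file_id + 1), then performs the
    # renames 1 -> 2, 0 -> 1 and finally 2 -> 0, completing the swap without
    # clobbering any file.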

    def _release_lock(self):
        """
        Release lock with reference counting.
        """
        if self._lock._ref_count > 0:
            self._lock._ref_count -= 1
            if self._lock._ref_count == 0:
                self._lock.unlock(unconditionally=True)

    def _remove_row_group_files(
        self,
        file_ids: list[int],
        sort_row_groups: bool | None = True,
        key_value_metadata: dict[str, str] | None = None,
    ):
        """
        Remove row group files from disk.

        Row group indexes are also removed from 'self.row_group_stats'.

        Parameters
        ----------
        file_ids : list[int]
            File ids to remove.
        sort_row_groups : Optional[bool], default True
            If `True`, sort row groups after removing files.
        key_value_metadata : Optional[dict[str, str]], default None
            User-defined key-value metadata to write in metadata file.

        Notes
        -----
        After file removal, and optional row group sorting, the
        '_align_file_ids()' and '_write_metadata_file()' methods are called,
        for the following reason.
        It is anticipated that 'file_ids' may be generated from row group
        indexes. If 'file_ids' are derived from row group indexes in a loop
        that calls '_remove_row_group_files()', and the row group indexes were
        computed before the loop started, then these indexes may no longer be
        valid at the next iteration.
        To mitigate this issue, '_align_file_ids()' and '_write_metadata_file()'
        are called, thereby aligning row group stats in memory and on disk
        ('_opdmd' file) with the existing row group files on disk.

        """
        if not file_ids:
            return
        # Remove files from disk.
        for file_id in file_ids:
            remove(get_parquet_filepaths(self.dirpath, file_id, self._file_id_n_digits))
        # Remove corresponding file ids from 'self.row_group_stats'.
        mask_rows_to_keep = isin(
            self.row_group_stats.loc[:, KEY_FILE_IDS].to_numpy(),
            file_ids,
            invert=True,
        )
        self._row_group_stats = self.row_group_stats.loc[mask_rows_to_keep, :].reset_index(
            drop=True,
        )
        if sort_row_groups:
            self._sort_row_groups()
        self._align_file_ids()
        self._write_metadata_file(key_value_metadata=key_value_metadata)

    def _sort_row_groups(self):
        """
        Sort row groups according to their min value in 'ordered_on' column.
        """
        self._row_group_stats.sort_values(by=KEY_ORDERED_ON_MINS, inplace=True, ignore_index=True)

    def _write_metadata_file(self, key_value_metadata: dict[str, str] | None = None):
        """
        Write metadata to disk.

        Metadata consist of 2 different types of data,
        - ``self.key_value_metadata``, a dict whose (key, value) pairs can be
          set by user, and which also contains the ``self.ordered_on`` parameter.
          It is retrieved from the ``OUPS_METADATA_KEY`` key.
        - ``self.row_group_stats``, a DataFrame which contains row groups
          statistics.

        Parameters
        ----------
        key_value_metadata : dict[str, str], optional
            User-defined key-value metadata to write, or update in dataset.

        Notes
        -----
        The update strategy for oups-specific metadata depends on whether a key
        found in ``OUPS_METADATA`` is also found in already existing metadata,
        as well as on its value.
        - If not found in existing, it is added.
        - If found in existing, it is updated.
        - If its value is `None`, it is not added, and if found in existing,
          it is removed from existing.

        Albeit a parquet file, the opdmd file is not compressed.

        """
        existing_md = self._key_value_metadata
        if key_value_metadata:
            for key, value in key_value_metadata.items():
                if key in existing_md:
                    if value is None:
                        # Case 'remove'.
                        del existing_md[key]
                    else:
                        # Case 'update'.
                        existing_md[key] = value
                elif value:
                    # Case 'add'.
                    existing_md[key] = value
        if self._is_newly_initialized:
            self.dirpath.parent.mkdir(parents=True, exist_ok=True)
        parquet_adapter.write_parquet(
            path=get_md_filepath(self.dirpath),
            df=self.row_group_stats,
            key_value_metadata=existing_md | {KEY_ORDERED_ON: self.ordered_on},
        )
        self._is_newly_initialized = False
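
    # Illustrative update strategy (sketch): with existing metadata {"a": "1"},
    # calling '_write_metadata_file(key_value_metadata={"a": "2", "b": "3", "c": None})'
    # leaves {"a": "2", "b": "3"} on disk: "a" is updated, "b" is added, and
    # "c" is ignored (it would have been removed had it already existed).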

    def _write_row_group_files(
        self,
        dfs: Iterable[DataFrame],
        write_metadata_file: bool = True,
        key_value_metadata: dict[str, str] = None,
        **kwargs,
    ):
        """
        Write row groups as files to disk. One row group per file.

        Parameters
        ----------
        dfs : Iterable[DataFrame]
            Dataframes to write.
        write_metadata_file : bool, optional
            If `True`, write opd metadata file to disk.
        key_value_metadata : dict[str, str], optional
            User-defined key-value metadata to write, if 'write_metadata_file'
            is `True`.
        **kwargs : dict
            Additional parameters to pass to 'ParquetAdapter.write_parquet()'.

        """
        iter_dfs = iter(dfs)
        try:
            first_df = next(iter_dfs)
        except StopIteration:
            return
        if self.ordered_on not in first_df.columns:
            raise ValueError(
                f"'ordered_on' column '{self.ordered_on}' is not in dataframe columns.",
            )
        if len(self.row_group_stats) == 0:
            self.dirpath.mkdir(parents=True, exist_ok=True)
        buffer, dtype_limit_exceeded, last_written_df = self._write_row_group_files_loop(
            chain([first_df], iter_dfs),
            **kwargs,
        )
        self._row_group_stats = concat(
            [
                None if self.row_group_stats.empty else self.row_group_stats,
                DataFrame(data=buffer, columns=RGS_STATS_COLUMNS).astype(RGS_STATS_BASE_DTYPES),
            ],
            ignore_index=True,
            copy=False,
        )
        if write_metadata_file or dtype_limit_exceeded:
            self._write_metadata_file(key_value_metadata=key_value_metadata)
        if dtype_limit_exceeded:
            self._handle_dtype_limit_exceeded(self.max_file_id + len(buffer), last_written_df)

    def _write_row_group_files_loop(self, dfs: Iterable[DataFrame], **kwargs):
        """
        Write row groups as files to disk and collect row group statistics.

        Helper method for '_write_row_group_files()' method.

        Parameters
        ----------
        dfs : Iterable[DataFrame]
            Dataframes to write.

        **kwargs : dict
            Additional parameters to pass to 'ParquetAdapter.write_parquet()'.

        Returns
        -------
        buffer : list
            List of row group statistics.
        dtype_limit_exceeded : bool
            If `True`, dtype limit has been exceeded.
        df : DataFrame
            Last dataframe written.

        """
        buffer = []
        dtype_limit_exceeded = False
        for file_id, df in enumerate(dfs, start=self.max_file_id + 1):
            if file_id > self._max_allowed_file_id or len(df) > self._max_n_rows:
                dtype_limit_exceeded = True
                break
            if ((file_id - self.max_file_id - 1) % 10) == 0:
                # Refreshing the lock to the lifetime it has been provided every
                # 10 files.
                self._lock.refresh(unconditionally=True)
            buffer.append(
                (
                    file_id,  # file_ids
                    len(df),  # n_rows
                    df.loc[:, self.ordered_on].iloc[0],  # ordered_on_mins
                    df.loc[:, self.ordered_on].iloc[-1],  # ordered_on_maxs
                ),
            )
            parquet_adapter.write_parquet(
                path=get_parquet_filepaths(self.dirpath, file_id, self._file_id_n_digits),
                df=df,
                **kwargs,
            )
        return buffer, dtype_limit_exceeded, df

    def _handle_dtype_limit_exceeded(self, file_id: int, df: DataFrame):
        """
        Handle cases where dtype limits are exceeded.

        Helper method for '_write_row_group_files()' method.

        Parameters
        ----------
        file_id : int
            File id when a dtype limit has been exceeded.
        df : DataFrame
            Dataframe written when a dtype limit has been exceeded.

        Raises
        ------
        ValueError
            If dtype limit has been exceeded.

        """
        if file_id > self._max_allowed_file_id:
            raise ValueError(
                f"file id '{file_id}' exceeds max value "
                f"{self._max_allowed_file_id}. Metadata has been written "
                "before the exception has been raised.",
            )
        else:
            raise ValueError(
                f"number of rows {len(df)} exceeds max value "
                f"{self._max_n_rows}. Metadata has been written before the "
                "exception has been raised.",
            )


def create_custom_opd(
    tmp_path: str | Path,
    df: DataFrame,
    row_group_offsets: list[int],
    ordered_on: str,
):
    """
    Create a custom opd for testing.

    Parameters
    ----------
    tmp_path : Union[str, Path]
        Temporary directory where to locate the opd files.
    df : DataFrame
        Data to write to opd files.
    row_group_offsets : list[int]
        Start index of row groups in 'df'.
    ordered_on : str
        Column name to order row groups by.

    Returns
    -------
    OrderedParquetDataset
        The created opd object.

    """
    tmp_path = Path(tmp_path).resolve()
    _max_allowed_file_id = iinfo(RGS_STATS_BASE_DTYPES[KEY_FILE_IDS]).max
    _file_id_n_digits = len(str(_max_allowed_file_id))
    n_rows = []
    ordered_on_mins = []
    ordered_on_maxs = []
    row_group_ends_excluded = row_group_offsets[1:] + [len(df)]
    tmp_path.mkdir(parents=True, exist_ok=True)
    for file_id, (row_group_start, row_group_end_excluded) in enumerate(
        zip(row_group_offsets, row_group_ends_excluded, strict=False),
    ):
        df_rg = df.iloc[row_group_start:row_group_end_excluded]
        n_rows.append(len(df_rg))
        ordered_on_mins.append(df_rg.loc[:, ordered_on].iloc[0])
        ordered_on_maxs.append(df_rg.loc[:, ordered_on].iloc[-1])
        parquet_adapter.write_parquet(
            path=get_parquet_filepaths(tmp_path, file_id, _file_id_n_digits),
            df=df_rg,
        )
    row_group_stats = DataFrame(
        data=zip(range(len(row_group_offsets)), n_rows, ordered_on_mins, ordered_on_maxs, strict=False),
        columns=RGS_STATS_COLUMNS,
    ).astype(RGS_STATS_BASE_DTYPES)
    parquet_adapter.write_parquet(
        path=get_md_filepath(tmp_path),
        df=row_group_stats,
        key_value_metadata={KEY_ORDERED_ON: ordered_on},
    )
    return OrderedParquetDataset(tmp_path)
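
# Illustrative use of the test helper above (sketch, variable names assumed):
#   df = DataFrame({"ts": range(6), "val": range(6)})
#   opd = create_custom_opd(tmp_path, df, row_group_offsets=[0, 2, 4], ordered_on="ts")
#   assert len(opd) == 3
# creates three row group files of two rows each plus the '_opdmd' metadata
# file, and returns the dataset with its lock held.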