oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py
@@ -0,0 +1,863 @@
+ #!/usr/bin/env python3
+ """
+ Created on Tue Jun 10 22:30:00 2025.
+
+ @author: pierrot
+
+ Ordered parquet dataset file structure.
+
+ parent_directory/
+ ├── my_dataset1/            # Dataset directory
+ │   ├── file_0000.parquet
+ │   └── file_0001.parquet
+ ├── my_dataset1_opdmd       # Metadata file
+ └── my_dataset1.lock        # Exclusive lock file
+
+ A lock is acquired at object creation and held for the object's entire lifetime.
+ This provides simple, race-condition-free exclusive access suitable for
+ scenarios with limited concurrent processes.
+
+ """
+ from collections.abc import Iterable
+ from functools import cached_property
+ from itertools import chain
+ from os import remove
+ from os import rename
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ from flufl.lock import Lock
+ from flufl.lock import TimeOutError
+ from numpy import iinfo
+ from numpy import isin
+ from numpy import uint16
+ from numpy import uint32
+ from pandas import DataFrame
+ from pandas import Series
+ from pandas import concat
+
+ from oups.defines import KEY_FILE_IDS
+ from oups.defines import KEY_N_ROWS
+ from oups.defines import KEY_ORDERED_ON
+ from oups.defines import KEY_ORDERED_ON_MAXS
+ from oups.defines import KEY_ORDERED_ON_MINS
+ from oups.store.filepath_utils import remove_dir
+ from oups.store.ordered_parquet_dataset.metadata_filename import get_md_filepath
+ from oups.store.ordered_parquet_dataset.parquet_adapter import ParquetAdapter
+ from oups.store.ordered_parquet_dataset.write import write
+
+
+ if TYPE_CHECKING:
+     from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.read_only import (
+         ReadOnlyOrderedParquetDataset,
+     )
+
+ LOCK_EXTENSION = ".lock"
+ PARQUET_FILE_PREFIX = "file_"
+ PARQUET_FILE_EXTENSION = ".parquet"
+ # Do not change this order, it is expected by OrderedParquetDataset.write_row_group_files()
+ RGS_STATS_COLUMNS = [KEY_FILE_IDS, KEY_N_ROWS, KEY_ORDERED_ON_MINS, KEY_ORDERED_ON_MAXS]
+ RGS_STATS_BASE_DTYPES = {
+     KEY_N_ROWS: uint32,
+     KEY_FILE_IDS: uint16,
+ }
+
+
+ parquet_adapter = ParquetAdapter(use_arro3=False)
+
+
+ def get_parquet_filepaths(
+     dirpath: Path,
+     file_id: int | Series,
+     file_id_n_digits: int,
+ ) -> str | list[str]:
+     """
+     Get standardized parquet file path(s).
+
+     Parameters
+     ----------
+     dirpath : Path
+         The directory path to use in the filename.
+     file_id : int or Series[int]
+         The file ID to use in the filename. If a Series, a list of file paths
+         is returned.
+     file_id_n_digits : int, optional
+         Number of digits to use for 'file_id' in filename.
+
+     Returns
+     -------
+     Union[str, list[str]]
+         The formatted file path(s).
+
+     """
+     return (
+         (
+             str(dirpath / PARQUET_FILE_PREFIX)
+             + file_id.astype("string").str.zfill(file_id_n_digits)
+             + PARQUET_FILE_EXTENSION
+         ).to_list()
+         if isinstance(file_id, Series)
+         else dirpath / f"{PARQUET_FILE_PREFIX}{file_id:0{file_id_n_digits}}{PARQUET_FILE_EXTENSION}"
+     )
+
+
+ def validate_ordered_on_match(base_ordered_on: str, new_ordered_on: str):
+     """
+     Check if 'new_ordered_on' is equal to 'base_ordered_on'.
+
+     Raise ValueError if 'new_ordered_on' is not equal to 'base_ordered_on'.
+
+     """
+     if base_ordered_on != new_ordered_on:
+         raise ValueError(
+             f"'ordered_on' parameter value '{new_ordered_on}' does not match "
+             f"'{base_ordered_on}' in record dataset.",
+         )
+
+
+ class OrderedParquetDataset:
+     """
+     Base class for Ordered Parquet Dataset with shared functionality.
+
+     This class contains all shared attributes, properties, and methods between
+     the full OrderedParquetDataset and its read-only version.
+
+     Attributes
+     ----------
+     _file_id_n_digits : int
+         Number of digits to use for 'file_id' in filename. It is kept as an
+         attribute to avoid recomputing it at each call to
+         'get_parquet_filepaths()'.
+     _lock : Lock
+         Exclusive lock held for the object's entire lifetime.
+     _lock._ref_count : int
+         Reference count for the lock. It needs to be attached to '_lock'
+         attribute.
+     _max_allowed_file_id : int
+         Maximum allowed file id. Kept as hidden attribute to avoid
+         recomputing it at each call in 'write_row_group_files()'.
+     _max_n_rows : int
+         Maximum allowed number of rows in a row group. Kept as hidden
+         attribute to avoid recomputing it at each call in
+         'write_row_group_files()'.
+     dirpath : Path
+         Directory path where to load data from.
+     is_newly_initialized : bool
+         True if this dataset instance was just created and has no existing
+         metadata file. False if the dataset was loaded from existing files.
+     key_value_metadata : dict[str, str]
+         Key-value metadata, from user and including 'ordered_on' column name.
+     max_file_id : int
+         Maximum file id in current directory.
+     ordered_on : str
+         Column name to order row groups by. Can be set either at opd
+         instantiation or in 'kwargs' of 'write()' method. Once set, it cannot
+         be changed.
+     row_group_stats : DataFrame
+         Row groups statistics,
+         - "ordered_on_min", min value in 'ordered_on' column for this group,
+         - "ordered_on_max", max value in 'ordered_on' column for this group,
+         - "n_rows": number of rows per row group,
+         - "file_id": an int indicating the file id for this group.
+
+     Methods
+     -------
+     remove_from_disk()
+         Remove all dataset files from disk and update in-memory state.
+     to_pandas()
+         Return data as a pandas dataframe.
+     write()
+         Write data to disk, merging with existing data.
+     __del__()
+         Release lock when object is garbage collected.
+         Uses reference counting to ensure lock is only released when all
+         instances are gone.
+     __getitem__(self, item: Union[int, slice]) -> 'ReadOnlyOrderedParquetDataset'
+         Select among the row-groups using integer/slicing.
+     __len__()
+         Return number of row groups in the dataset.
+     _align_file_ids()
+         Align file ids to row group position in the dataset.
+     _release_lock()
+         Release lock with reference counting.
+     _remove_row_group_files()
+         Remove row group files from disk. Row group indexes are also removed
+         from row_group_stats.
+     _sort_row_groups()
+         Sort row groups according to their min value in 'ordered_on' column.
+     _write_metadata_file()
+         Write metadata to disk.
+     _write_row_group_files()
+         Write row groups as files to disk. One row group per file.
+
+     Notes
+     -----
+     - There is one row group per file.
+     - Dataset metadata are written in a separate file in parquet format, located
+       at the same level as the dataset directory (not within the directory).
+       This way, if provided the directory path, another parquet reader can read
+       the dataset without being confused by this metadata file.
+     - File ids (in file names) have the same number of digits. This is to ensure
+       that files can be read in the correct order by other parquet readers.
+     - When creating an OrderedParquetDataset object, a lock is acquired and held
+       for the object's entire lifetime. The purpose is to provide
+       race-condition-free exclusive access suitable for scenarios with limited
+       concurrent processes.
+       The lock is acquired with a timeout and a lifetime. The timeout is the
+       maximum time to wait for lock acquisition in seconds. The lifetime is the
+       expected maximum lifetime of the lock, as a timedelta or integer number of
+       seconds, relative to when the lock is acquired.
+       Reading and writing operations refresh the lock to the lifetime it has
+       been initially provided.
+
+     """
+
+     def __init__(
+         self,
+         dirpath: str | Path,
+         ordered_on: str | None = None,
+         lock_timeout: int | None = None,
+         lock_lifetime: int | None = 15,
+     ):
+         """
+         Initialize OrderedParquetDataset.
+
+         A lock is acquired at object creation and held for the object's entire
+         lifetime. This provides simple, race-condition-free exclusive access
+         suitable for scenarios with limited concurrent processes.
+
+         Parameters
+         ----------
+         dirpath : Union[str, Path]
+             Directory path from where to load data.
+         ordered_on : Optional[str], default None
+             Column name to order row groups by. If not initialized, it can also
+             be provided in 'kwargs' of 'write()' method.
+         lock_timeout : Optional[int], default None
+             Approximately how long the lock acquisition attempt should be made.
+             None (the default) means keep trying forever.
+         lock_lifetime : Optional[int], default 15
+             The expected maximum lifetime of the lock, as a timedelta or integer
+             number of seconds, relative to now. Defaults to 15 seconds.
+
+         """
+         self._dirpath = Path(dirpath).resolve()
+         # Acquire exclusive lock for the entire object lifetime
+         lock_file = self._dirpath.parent / f"{self._dirpath.name}{LOCK_EXTENSION}"
+         lock_file.parent.mkdir(parents=True, exist_ok=True)
+         self._lock = Lock(str(lock_file), lifetime=lock_lifetime)
+         # Initiate '_lock._ref_count'.
+         self._lock._ref_count = 0
+         try:
+             self._lock.lock(timeout=lock_timeout)
+         except TimeOutError:
+             raise TimeoutError(
+                 f"failed to acquire lock for dataset '{self._dirpath}' within "
+                 f"{lock_timeout} seconds. Another process may be using this dataset.",
+             )
+         # Increment reference counting to the lock object
+         self._lock._ref_count += 1
+         try:
+             # remaining initialization code.
+             try:
+                 self._row_group_stats, self._key_value_metadata = parquet_adapter.read_parquet(
+                     str(get_md_filepath(self._dirpath)),
+                     return_key_value_metadata=True,
+                 )
+                 if ordered_on:
+                     validate_ordered_on_match(
+                         base_ordered_on=self._key_value_metadata[KEY_ORDERED_ON],
+                         new_ordered_on=ordered_on,
+                     )
+                 self._is_newly_initialized = False
+             except FileNotFoundError:
+                 # Using an empty Dataframe so that it can be written in the case
+                 # user is only using '_write_metadata_file()' without adding row
+                 # groups.
+                 self._row_group_stats = DataFrame(columns=RGS_STATS_COLUMNS).astype(
+                     RGS_STATS_BASE_DTYPES,
+                 )
+                 self._key_value_metadata = {KEY_ORDERED_ON: ordered_on}
+                 self._is_newly_initialized = True
+             # While opd is in memory, 'ordered_on' is kept as a private attribute,
+             # with the idea that it is an immutable dataset property, while the
+             # content of 'self._key_value_metadata' is mutable.
+             self._ordered_on = self._key_value_metadata.pop(KEY_ORDERED_ON)
+         except Exception:
+             # If initialization code did not go well, release the lock.
+             self._release_lock()
+             raise
+
+     def __del__(self):
+         """
+         Release lock when object is garbage collected.
+
+         Uses reference counting to ensure lock is only released when all instances are
+         gone.
+
+         """
+         self._release_lock()
+
+     def __getitem__(self, item: int | slice) -> "ReadOnlyOrderedParquetDataset":
+         """
+         Select among the row-groups using integer/slicing.
+
+         Parameters
+         ----------
+         item : int or slice
+             Integer or slice to select row groups.
+
+         Returns
+         -------
+         ReadOnlyOrderedParquetDataset
+             A new read-only dataset with the selected row groups.
+
+         """
+         # To preserve DataFrame format when selecting single row
+         row_group_stats_subset = (
+             self.row_group_stats.iloc[item : item + 1]
+             if isinstance(item, int)
+             else self.row_group_stats.iloc[item]
+         )
+         # Create new instance
+         opd_subset = object.__new__(OrderedParquetDataset)
+         opd_subset.__dict__ = self.__dict__ | {
+             "_row_group_stats": row_group_stats_subset,
+         }
+         # Increment reference count since new instance shares the lock
+         # Lock reference counting note:
+         # Two objects will reference the same lock after this method returns:
+         # - 'opd_subset' (ephemeral OrderedParquetDataset created here)
+         # - the ReadOnlyOrderedParquetDataset created in '_from_instance'
+         # We increment the lock ref-count here for 'opd_subset'.
+         # '_from_instance' will increment it again for the read-only view.
+         # Each object's '__del__' will decrement once, so the net count remains
+         # correct and the lock is eventually released when the last reference
+         # goes away.
+         self._lock._ref_count += 1
+         # Lazy import to avoid circular dependency
+         from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.read_only import (
+             ReadOnlyOrderedParquetDataset,
+         )
+
+         return ReadOnlyOrderedParquetDataset._from_instance(opd_subset)
+
+     def __len__(self):
+         """
+         Return number of row groups in the dataset.
+         """
+         return len(self.row_group_stats)
+
+     @cached_property
+     def _max_allowed_file_id(self):
+         """
+         Return maximum allowed file id.
+         """
+         return iinfo(self.row_group_stats[KEY_FILE_IDS].dtype).max
+
+     @cached_property
+     def _file_id_n_digits(self):
+         """
+         Return number of digits imposed to format file ids in file names.
+         """
+         return len(str(self._max_allowed_file_id))
+
+     @cached_property
+     def _max_n_rows(self):
+         """
+         Return maximum allowed number of rows in a row group.
+         """
+         return iinfo(self.row_group_stats[KEY_N_ROWS].dtype).max
+
+     @property
+     def dirpath(self):
+         """
+         Return directory path.
+         """
+         return self._dirpath
+
+     @property
+     def is_newly_initialized(self):
+         """
+         Return True if this dataset has no existing metadata file.
+         """
+         return self._is_newly_initialized
+
+     @property
+     def key_value_metadata(self):
+         """
+         Return key-value metadata.
+         """
+         return self._key_value_metadata
+
+     @property
+     def ordered_on(self):
+         """
+         Return column name to order row groups by.
+         """
+         return self._ordered_on
+
+     @property
+     def row_group_stats(self):
+         """
+         Return row group statistics.
+         """
+         return self._row_group_stats
+
+     @property
+     def max_file_id(self):
+         """
+         Return maximum file id in current directory.
+
+         If no row group is in the directory, return -1.
+         Note: Base dataset uses metadata for efficiency and reliability.
+
+         """
+         # Get max 'file_id' from 'self.row_group_stats'.
+         return -1 if self.row_group_stats.empty else int(self.row_group_stats[KEY_FILE_IDS].max())
+
+     def remove_from_disk(self, preserve_metadata: bool = False, release_lock: bool = True) -> None:
+         """
+         Remove all dataset files from disk and update in-memory state.
+
+         Parameters
+         ----------
+         preserve_metadata : bool, default False
+             If True, keep user metadata accessible but clear row_group_stats.
+             If False, reset both row_group_stats and key_value_metadata.
+         release_lock : bool, default True
+             If True, release the lock after removal. Set to False if you plan
+             to continue using this OPD instance after removal.
+
+         Notes
+         -----
+         After calling this method with ``release_lock=True``, the OPD instance
+         should not be used for file operations, though metadata access remains
+         available if ``preserve_metadata=True``.
+
+         """
+         # Update in-memory state first to 'inform' all references to this
+         # OrderedParquetDataset object.
+         self._row_group_stats = DataFrame(columns=RGS_STATS_COLUMNS).astype(RGS_STATS_BASE_DTYPES)
+         if not preserve_metadata:
+             self._key_value_metadata = {}
+         if not self._is_newly_initialized:
+             # Remove opdmd file in 2nd.
+             get_md_filepath(self.dirpath).unlink()
+             # Finally, remove dataset directory and all its contents.
+             remove_dir(self.dirpath)
+         # Mark as newly initialized since files are gone
+         self._is_newly_initialized = True
+         if release_lock:
+             self._release_lock()
+
+     def to_pandas(self) -> DataFrame:
+         """
+         Return data as a pandas dataframe.
+
+         Returns
+         -------
+         DataFrame
+             Dataframe.
+
+         """
+         # Refreshing the lock to the lifetime it has been provided.
+         self._lock.refresh(unconditionally=True)
+         return parquet_adapter.read_parquet(
+             get_parquet_filepaths(
+                 self.dirpath,
+                 self.row_group_stats[KEY_FILE_IDS],
+                 self._file_id_n_digits,
+             ),
+             return_key_value_metadata=False,
+         )
+
+     def write(self, **kwargs):
+         """
+         Write data to disk.
+
+         This method relies on 'oups.store.write.write()' function.
+
+         Parameters
+         ----------
+         **kwargs : dict
+             Keywords in 'kwargs' are forwarded to `oups.store.write.write()`.
+
+         """
+         if self.ordered_on is None:
+             if KEY_ORDERED_ON in kwargs:
+                 self._ordered_on = kwargs.pop(KEY_ORDERED_ON)
+             else:
+                 raise ValueError("'ordered_on' parameter is required.")
+         elif KEY_ORDERED_ON in kwargs:
+             validate_ordered_on_match(
+                 base_ordered_on=self.ordered_on,
+                 new_ordered_on=kwargs.pop(KEY_ORDERED_ON),
+             )
+         write(self, ordered_on=self.ordered_on, **kwargs)
+
+     def _align_file_ids(self):
+         """
+         Align file ids to row group position in the dataset and rename files.
+
+         This method ensures that file ids match their row group positions while:
+         1. Minimizing the number of renames.
+         2. Avoiding conflicts where target filenames are already taken.
+         3. Using temporary filenames when necessary to handle circular
+            dependencies.
+
+         """
+         # Build mapping of current file ids to desired new ids.
+         mask_ids_to_rename = self.row_group_stats.loc[:, KEY_FILE_IDS] != self.row_group_stats.index
+         current_ids_to_rename = self.row_group_stats.loc[mask_ids_to_rename, KEY_FILE_IDS]
+         if len(current_ids_to_rename) == 0:
+             return
+         # Initialize 'temp_id' to be used when no direct rename is possible.
+         temp_id = self.max_file_id + 1
+         new_ids = current_ids_to_rename.index.astype(RGS_STATS_BASE_DTYPES[KEY_FILE_IDS])
+         current_to_new = dict(zip(current_ids_to_rename, new_ids, strict=False))
+         # Set of ids already being used by files in directory.
+         # Before renaming, we will check the 'new_id' is not already taken.
+         # Collision rationale:
+         # - 'new_ids' are exactly the indices of rows being renamed (those where
+         #   'file_id != index').
+         # - Rows already correct ('file_id == index') are excluded from the
+         #   rename set, and their indices are therefore not in 'new_ids'.
+         # - Hence, no rename will ever target the id of a file that is already
+         #   correct. We only need to avoid conflicts among the ids within the
+         #   rename set itself, which is what this 'ids_already_in_use' covers.
+         # - Cycles among the rename set are handled by the temporary id logic
+         #   below.
+         ids_already_in_use = set(current_ids_to_rename)
+         # Process renames
+         while current_to_new:
+             # Find a current_id whose new_id is not taken by another current_id.
+             for current_id, new_id in list(current_to_new.items()):
+                 if new_id not in ids_already_in_use:
+                     # Safe to rename directly
+                     rename(
+                         get_parquet_filepaths(self.dirpath, current_id, self._file_id_n_digits),
+                         get_parquet_filepaths(self.dirpath, new_id, self._file_id_n_digits),
+                     )
+                     del current_to_new[current_id]
+                     ids_already_in_use.discard(current_id)
+                 else:
+                     # No direct renames possible, need to use temporary id.
+                     current_to_new[current_id] = temp_id
+                     # Add at bottom of dict the correct mapping.
+                     current_to_new[temp_id] = new_id
+                     temp_id += 1
+                     # Restart the loop.
+                     break
+         # Set new ids.
+         self._row_group_stats.loc[mask_ids_to_rename, KEY_FILE_IDS] = new_ids
+
+     def _release_lock(self):
+         """
+         Release lock with reference counting.
+         """
+         if self._lock._ref_count > 0:
+             self._lock._ref_count -= 1
+             if self._lock._ref_count == 0:
+                 self._lock.unlock(unconditionally=True)
+
+     def _remove_row_group_files(
+         self,
+         file_ids: list[int],
+         sort_row_groups: bool | None = True,
+         key_value_metadata: dict[str, str] | None = None,
+     ):
+         """
+         Remove row group files from disk.
+
+         Row group indexes are also removed from 'self.row_group_stats'.
+
+         Parameters
+         ----------
+         file_ids : list[int]
+             File ids to remove.
+         sort_row_groups : Optional[bool], default True
+             If `True`, sort row groups after removing files.
+         key_value_metadata : Optional[dict[str, str]], default None
+             User-defined key-value metadata to write in metadata file.
+
+         Notes
+         -----
+         After file removal, and optional row group sorting, '_align_file_ids()'
+         and '_write_metadata_file()' methods are called, as a result of the
+         following reasoning.
+         It is anticipated that 'file_ids' may be generated from row group
+         indexes. If the definition of 'file_ids' from row group indexes occurs
+         in a loop where '_remove_row_group_files()' is called, and row group
+         indexes are defined before execution of the loop, then row group indexes
+         may no longer be valid at the next iteration.
+         To mitigate this issue, '_align_file_ids()' and '_write_metadata_file()'
+         methods are called, thereby aligning row group stats in memory and on
+         disk ('_opdmd' file) with the existing row group files on disk.
+
+         """
+         if not file_ids:
+             return
+         # Remove files from disk.
+         for file_id in file_ids:
+             remove(get_parquet_filepaths(self.dirpath, file_id, self._file_id_n_digits))
+         # Remove corresponding file ids from 'self.row_group_stats'.
+         mask_rows_to_keep = isin(
+             self.row_group_stats.loc[:, KEY_FILE_IDS].to_numpy(),
+             file_ids,
+             invert=True,
+         )
+         self._row_group_stats = self.row_group_stats.loc[mask_rows_to_keep, :].reset_index(
+             drop=True,
+         )
+         if sort_row_groups:
+             self._sort_row_groups()
+         self._align_file_ids()
+         self._write_metadata_file(key_value_metadata=key_value_metadata)
+
+     def _sort_row_groups(self):
+         """
+         Sort row groups according to their min value in 'ordered_on' column.
+         """
+         self._row_group_stats.sort_values(by=KEY_ORDERED_ON_MINS, inplace=True, ignore_index=True)
+
+     def _write_metadata_file(self, key_value_metadata: dict[str, str] | None = None):
+         """
+         Write metadata to disk.
+
+         Metadata comprise 2 different types of data,
+         - ``self.key_value_metadata``, a dict whose (key, value) pairs can be
+           set by the user, and which also contains ``self.ordered_on`` parameter.
+           It is retrieved from ``OUPS_METADATA_KEY`` key.
+         - ``self.row_group_stats``, a DataFrame which contains row groups
+           statistics.
+
+         Parameters
+         ----------
+         key_value_metadata : dict[str, str], optional
+             User-defined key-value metadata to write, or update in dataset.
+
+         Notes
+         -----
+         Update strategy of oups-specific metadata depends on whether a key found
+         in ``OUPS_METADATA`` metadata is also found in already existing metadata,
+         as well as on its value.
+         - If not found in existing, it is added.
+         - If found in existing, it is updated.
+         - If its value is `None`, it is not added, and if found in existing,
+           it is removed from existing.
+
+         Albeit a parquet file, the opdmd file is not compressed.
+
+         """
+         existing_md = self._key_value_metadata
+         if key_value_metadata:
+             for key, value in key_value_metadata.items():
+                 if key in existing_md:
+                     if value is None:
+                         # Case 'remove'.
+                         del existing_md[key]
+                     else:
+                         # Case 'update'.
+                         existing_md[key] = value
+                 elif value:
+                     # Case 'add'.
+                     existing_md[key] = value
+         if self._is_newly_initialized:
+             self.dirpath.parent.mkdir(parents=True, exist_ok=True)
+         parquet_adapter.write_parquet(
+             path=get_md_filepath(self.dirpath),
+             df=self.row_group_stats,
+             key_value_metadata=existing_md | {KEY_ORDERED_ON: self.ordered_on},
+         )
+         self._is_newly_initialized = False
+
+     def _write_row_group_files(
+         self,
+         dfs: Iterable[DataFrame],
+         write_metadata_file: bool = True,
+         key_value_metadata: dict[str, str] = None,
+         **kwargs,
+     ):
+         """
+         Write row groups as files to disk. One row group per file.
+
+         Parameters
+         ----------
+         dfs : Iterable[DataFrame]
+             Dataframes to write.
+         write_metadata_file : bool, optional
+             If `True`, write opd metadata file to disk.
+         key_value_metadata : dict[str, str], optional
+             User-defined key-value metadata to write, if 'write_metadata_file'
+             is `True`.
+         **kwargs : dict
+             Additional parameters to pass to 'ParquetAdapter.write_parquet()'.
+
+         """
+         iter_dfs = iter(dfs)
+         try:
+             first_df = next(iter_dfs)
+         except StopIteration:
+             return
+         if self.ordered_on not in first_df.columns:
+             raise ValueError(
+                 f"'ordered_on' column '{self.ordered_on}' is not in dataframe columns.",
+             )
+         if len(self.row_group_stats) == 0:
+             self.dirpath.mkdir(parents=True, exist_ok=True)
+         buffer, dtype_limit_exceeded, last_written_df = self._write_row_group_files_loop(
+             chain([first_df], iter_dfs),
+             **kwargs,
+         )
+         self._row_group_stats = concat(
+             [
+                 None if self.row_group_stats.empty else self.row_group_stats,
+                 DataFrame(data=buffer, columns=RGS_STATS_COLUMNS).astype(RGS_STATS_BASE_DTYPES),
+             ],
+             ignore_index=True,
+             copy=False,
+         )
+         if write_metadata_file or dtype_limit_exceeded:
+             self._write_metadata_file(key_value_metadata=key_value_metadata)
+         if dtype_limit_exceeded:
+             self._handle_dtype_limit_exceeded(self.max_file_id + len(buffer), last_written_df)
+
+     def _write_row_group_files_loop(self, dfs: Iterable[DataFrame], **kwargs):
+         """
+         Write row groups as files to disk and collect row group statistics.
+
+         Helper method for '_write_row_group_files()' method.
+
+         Parameters
+         ----------
+         dfs : Iterable[DataFrame]
+             Dataframes to write.
+
+         **kwargs : dict
+             Additional parameters to pass to 'ParquetAdapter.write_parquet()'.
+
+         Returns
+         -------
+         buffer : list
+             List of row group statistics.
+         dtype_limit_exceeded : bool
+             If `True`, dtype limit has been exceeded.
+         df : DataFrame
+             Last dataframe written.
+
+         """
+         buffer = []
+         dtype_limit_exceeded = False
+         for file_id, df in enumerate(dfs, start=self.max_file_id + 1):
+             if file_id > self._max_allowed_file_id or len(df) > self._max_n_rows:
+                 dtype_limit_exceeded = True
+                 break
+             if ((file_id - self.max_file_id - 1) % 10) == 0:
+                 # Refreshing the lock to the lifetime it has been provided every
+                 # 10 files.
+                 self._lock.refresh(unconditionally=True)
+             buffer.append(
+                 (
+                     file_id,  # file_ids
+                     len(df),  # n_rows
+                     df.loc[:, self.ordered_on].iloc[0],  # ordered_on_mins
+                     df.loc[:, self.ordered_on].iloc[-1],  # ordered_on_maxs
+                 ),
+             )
+             parquet_adapter.write_parquet(
+                 path=get_parquet_filepaths(self.dirpath, file_id, self._file_id_n_digits),
+                 df=df,
+                 **kwargs,
+             )
+         return buffer, dtype_limit_exceeded, df
+
+     def _handle_dtype_limit_exceeded(self, file_id: int, df: DataFrame):
+         """
+         Handle cases where dtype limits are exceeded.
+
+         Helper method for '_write_row_group_files()' method.
+
+         Parameters
+         ----------
+         file_id : int
+             File id when a dtype limit has been exceeded.
+         df : DataFrame
+             Dataframe written when a dtype limit has been exceeded.
+
+         Raises
+         ------
+         ValueError
+             If dtype limit has been exceeded.
+
+         """
+         if file_id > self._max_allowed_file_id:
+             raise ValueError(
+                 f"file id '{file_id}' exceeds max value "
+                 f"{self._max_allowed_file_id}. Metadata has been written "
+                 "before the exception has been raised.",
+             )
+         else:
+             raise ValueError(
+                 f"number of rows {len(df)} exceeds max value "
+                 f"{self._max_n_rows}. Metadata has been written before the "
+                 "exception has been raised.",
+             )
+
+
+ def create_custom_opd(
+     tmp_path: str | Path,
+     df: DataFrame,
+     row_group_offsets: list[int],
+     ordered_on: str,
+ ):
+     """
+     Create a custom opd for testing.
+
+     Parameters
+     ----------
+     tmp_path : Union[str, Path]
+         Temporary directory where to locate the opd files.
+     df : DataFrame
+         Data to write to opd files.
+     row_group_offsets : list[int]
+         Start index of row groups in 'df'.
+     ordered_on : str
+         Column name to order row groups by.
+
+
+     Returns
+     -------
+     OrderedParquetDataset
+         The created opd object.
+
+     """
+     tmp_path = Path(tmp_path).resolve()
+     _max_allowed_file_id = iinfo(RGS_STATS_BASE_DTYPES[KEY_FILE_IDS]).max
+     _file_id_n_digits = len(str(_max_allowed_file_id))
+     n_rows = []
+     ordered_on_mins = []
+     ordered_on_maxs = []
+     row_group_ends_excluded = row_group_offsets[1:] + [len(df)]
+     tmp_path.mkdir(parents=True, exist_ok=True)
+     for file_id, (row_group_start, row_group_end_excluded) in enumerate(
+         zip(row_group_offsets, row_group_ends_excluded, strict=False),
+     ):
+         df_rg = df.iloc[row_group_start:row_group_end_excluded]
+         n_rows.append(len(df_rg))
+         ordered_on_mins.append(df_rg.loc[:, ordered_on].iloc[0])
+         ordered_on_maxs.append(df_rg.loc[:, ordered_on].iloc[-1])
+         parquet_adapter.write_parquet(
+             path=get_parquet_filepaths(tmp_path, file_id, _file_id_n_digits),
+             df=df_rg,
+         )
+     row_group_stats = DataFrame(
+         data=zip(range(len(row_group_offsets)), n_rows, ordered_on_mins, ordered_on_maxs, strict=False),
+         columns=RGS_STATS_COLUMNS,
+     ).astype(RGS_STATS_BASE_DTYPES)
+     parquet_adapter.write_parquet(
+         path=get_md_filepath(tmp_path),
+         df=row_group_stats,
+         key_value_metadata={KEY_ORDERED_ON: ordered_on},
+     )
+     return OrderedParquetDataset(tmp_path)
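
For orientation, the class added above can be exercised roughly as follows. This is an illustrative sketch only and is not part of the released wheel: the constructor arguments, to_pandas(), slicing, and lock behaviour are taken from the docstrings in base.py, while the df keyword passed to write() and the example paths are assumptions, since the signature of oups.store.ordered_parquet_dataset.write.write() is not shown in this diff.

    # Hypothetical usage sketch (not shipped in the package).
    from pandas import DataFrame, date_range

    from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.base import (
        OrderedParquetDataset,
    )

    df = DataFrame(
        {
            "ts": date_range("2025-01-01", periods=6, freq="1h"),
            "value": range(6),
        },
    )

    # Acquires 'data/my_dataset.lock' and holds it for the object's lifetime.
    opd = OrderedParquetDataset("data/my_dataset", ordered_on="ts", lock_timeout=5)
    opd.write(df=df)            # 'df' keyword is an assumption, see note above.
    print(len(opd))             # Number of row groups.
    print(opd.row_group_stats)  # file_ids, n_rows, ordered_on_mins/maxs.
    first_rg = opd[0]           # Read-only view on the first row group.
    print(first_rg.to_pandas())
    del opd                     # Lock released once the last reference is gone.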