oups-2025.9.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oups might be problematic.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/store/ordered_parquet_dataset/write/write.py
@@ -0,0 +1,270 @@
+ #!/usr/bin/env python3
+ """
+ Created on Wed Dec 6 22:30:00 2021.
+
+ @author: pierrot
+
+ """
+ from __future__ import annotations
+
+ from importlib import import_module
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ from pandas import DataFrame
+ from pandas import Series
+
+ from oups.defines import KEY_FILE_IDS
+ from oups.defines import KEY_N_ROWS
+ from oups.defines import KEY_ORDERED_ON_MAXS
+ from oups.defines import KEY_ORDERED_ON_MINS
+ from oups.store.ordered_parquet_dataset.write.iter_merge_split_data import iter_merge_split_data
+ from oups.store.ordered_parquet_dataset.write.merge_split_strategies import NRowsMergeSplitStrategy
+ from oups.store.ordered_parquet_dataset.write.merge_split_strategies import TimePeriodMergeSplitStrategy
+
+
+ if TYPE_CHECKING:
+     from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.base import OrderedParquetDataset
+
+
+ ROW_GROUP_INT_TARGET_SIZE = 6_345_000
+
+
+ def _validate_duplicate_on_param(
+     duplicates_on: str | list[str] | list[tuple[str]],
+     ordered_on: str,
+ ) -> tuple[bool, str | list[str] | None]:
+     """
+     Validate and normalize duplicate parameters.
+
+     Parameters
+     ----------
+     duplicates_on : str | list[str] | list[tuple[str]]
+         Column(s) to check for duplicates. If an empty list, all columns are
+         used.
+     ordered_on : str
+         Column name by which data is ordered.
+
+     Returns
+     -------
+     tuple[bool, str | list[str] | None]
+         Boolean flag indicating whether duplicates are to be dropped, and the
+         subset to pass to ``DataFrame.drop_duplicates`` (including
+         'ordered_on'). If duplicates are dropped, ``None`` indicates that all
+         columns are considered.
+
+     """
+     if duplicates_on is None:
+         return (False, None)
+     else:
+         if isinstance(duplicates_on, list):
+             if duplicates_on == []:
+                 return (True, None)
+             elif ordered_on not in duplicates_on:
+                 duplicates_on.append(ordered_on)
+             return (True, duplicates_on)
+         else:
+             # 'duplicates_on' is a single column name.
+             if duplicates_on != ordered_on:
+                 return (True, [duplicates_on, ordered_on])
+             return (True, ordered_on)
+
+
+ def write(
+     dirpath: str | Path | OrderedParquetDataset,
+     *,
+     ordered_on: str | tuple[str],
+     df: DataFrame | None = None,
+     row_group_target_size: int | str | None = ROW_GROUP_INT_TARGET_SIZE,
+     duplicates_on: str | list[str] | list[tuple[str]] | None = None,
+     max_n_off_target_rgs: int | None = None,
+     key_value_metadata: dict[str, str] | None = None,
+     **kwargs,
+ ):
+     """
+     Write data to disk at the location specified by 'dirpath'.
+
+     Parameters
+     ----------
+     dirpath : str | Path | OrderedParquetDataset
+         If a string or a Path, the directory where the pandas dataframe is
+         written. If an OrderedParquetDataset, the dataset to which the pandas
+         dataframe is written.
+     ordered_on : str | tuple[str]
+         Name of the column with respect to which the dataset is in ascending
+         order. If the column index is a multi-index, the name is a tuple.
+         It indicates 'where' to insert new data into existing data, i.e. how
+         to complete or correct past records (it does not allow removing prior
+         data).
+     df : DataFrame | None, default None
+         Data to write. If None, a resize of the Ordered Parquet Dataset may
+         still be performed.
+     row_group_target_size : int | str | None
+         Target size of row groups. If not set, defaults to ``6_345_000``, which
+         for a dataframe with 6 columns of ``float64`` or ``int64`` results in a
+         memory footprint (RAM) of about 290MB.
+         It can also be a pandas `freqstr`, to gather data by timestamp over a
+         defined period.
+     duplicates_on : str | list[str] | list[tuple[str]] | None
+         Column names according to which 'row duplicates' can be identified
+         (i.e. rows sharing the same values on these specific columns) so as to
+         drop them. Duplicates are only identified in new data and in existing
+         recorded row groups that overlap with new data.
+         If duplicates are dropped, only the last one is kept.
+         To identify row duplicates using all columns, an empty list ``[]`` can
+         be used instead of listing all column names.
+         If not set, defaults to ``None``, meaning no row is dropped.
+     max_n_off_target_rgs : int | None
+         Maximum expected number of 'off target' row groups.
+         If 'row_group_target_size' is an ``int``, then a 'complete' row group
+         is one whose size is 'close to' ``row_group_target_size`` (>=80%).
+         If 'row_group_target_size' is a pandas `freqstr`, and if there are
+         several row groups in the last period defined by the `freqstr`, then
+         these row groups are considered incomplete.
+         To evaluate the number of 'incomplete' row groups, only those at the
+         end of an existing dataset are accounted for. 'Incomplete' row groups
+         in the middle of 'complete' row groups are not accounted for (they can
+         be created by insertion of new data in the middle of existing data).
+         If not set, defaults to ``None``.
+
+         - A ``None`` value induces no coalescing of row groups. If there is no
+           drop of duplicates, new data is systematically appended.
+         - A value of ``0`` or ``1`` means that new data should systematically
+           be merged with the last existing row group to 'complete' it (if it is
+           not 'complete' already).
+
+     key_value_metadata : dict[str, str], optional
+         Key-value metadata to write, or update in the dataset. Please see
+         fastparquet for the updating logic in case a `None` value is used.
+     **kwargs :
+         Additional parameters to pass to
+         'OrderedParquetDataset.write_row_group_files()'.
+
+     Notes
+     -----
+     - When writing a dataframe with this function,
+
+       - the index of the dataframe is not written to disk,
+       - the parquet file scheme is 'hive' (one row group per parquet file).
+
+     - Coalescing of off-target-size row groups is triggered if the actual
+       number of off-target row groups is larger than ``max_n_off_target_rgs``.
+       This assessment is only made if ``max_n_off_target_rgs`` is set.
+       Otherwise, new data is simply appended, without prior check.
+     - When ``duplicates_on`` is set, the 'ordered_on' column is added to the
+       ``duplicates_on`` list if not already part of it. The purpose is to
+       enable a first approximate search for duplicates, so as to load only the
+       data of interest.
+     - For simple data appending, i.e. without the need to drop duplicates, it
+       is advised to keep ``duplicates_on`` set to ``None``, as this parameter
+       otherwise triggers unnecessary evaluations.
+     - Off-target-size row groups are row groups:
+
+       - either not reaching the maximum number of rows, if
+         'row_group_target_size' is an ``int``,
+       - or several row groups lying in the same time period, if
+         'row_group_target_size' is a pandas 'freqstr'.
+
+     - When incorporating new data within recorded data, existing
+       off-target-size row groups are only resized if they intersect with new
+       data. Otherwise, new data is simply added, without merging with existing
+       off-target-size row groups.
+
+     """
+     drop_duplicates, subset = _validate_duplicate_on_param(
+         duplicates_on=duplicates_on,
+         ordered_on=ordered_on,
+     )
+     df_ordered_on = Series([]) if df is None else df.loc[:, ordered_on]
+     # Check that df_ordered_on is sorted.
+     if not df_ordered_on.is_monotonic_increasing:
+         raise ValueError("'df_ordered_on' must be sorted in ascending order.")
+     ordered_parquet_dataset = (
+         # Case 'dirpath' is a path to a directory.
+         import_module("oups.store.ordered_parquet_dataset").OrderedParquetDataset(
+             dirpath,
+             ordered_on=ordered_on,
+         )
+         if isinstance(dirpath, (str, Path))
+         else
+         # Case 'dirpath' is already an OrderedParquetDataset.
+         dirpath
+     )
+     if df is not None or len(ordered_parquet_dataset):
+         if isinstance(row_group_target_size, int):
+             if drop_duplicates and df is not None:
+                 # Duplicates are dropped a first time in the DataFrame, so that
+                 # the calculation of the merge and split strategy is made with
+                 # the most correct approximate number of rows in the DataFrame.
+                 df.drop_duplicates(subset=subset, keep="last", ignore_index=True, inplace=True)
+             merge_split_strategy = NRowsMergeSplitStrategy(
+                 rg_ordered_on_mins=ordered_parquet_dataset.row_group_stats.loc[
+                     :,
+                     KEY_ORDERED_ON_MINS,
+                 ].to_numpy(),
+                 rg_ordered_on_maxs=ordered_parquet_dataset.row_group_stats.loc[
+                     :,
+                     KEY_ORDERED_ON_MAXS,
+                 ].to_numpy(),
+                 df_ordered_on=df_ordered_on,
+                 drop_duplicates=drop_duplicates,
+                 rgs_n_rows=ordered_parquet_dataset.row_group_stats.loc[:, KEY_N_ROWS],
+                 row_group_target_size=row_group_target_size,
+             )
+         else:
+             merge_split_strategy = TimePeriodMergeSplitStrategy(
+                 rg_ordered_on_mins=ordered_parquet_dataset.row_group_stats.loc[
+                     :,
+                     KEY_ORDERED_ON_MINS,
+                 ].to_numpy(),
+                 rg_ordered_on_maxs=ordered_parquet_dataset.row_group_stats.loc[
+                     :,
+                     KEY_ORDERED_ON_MAXS,
+                 ].to_numpy(),
+                 df_ordered_on=df_ordered_on,
+                 drop_duplicates=drop_duplicates,
+                 row_group_time_period=row_group_target_size,
+             )
+         ordered_parquet_dataset._write_row_group_files(
+             dfs=iter_merge_split_data(
+                 opd=ordered_parquet_dataset,
+                 ordered_on=ordered_on,
+                 df=df,
+                 merge_sequences=merge_split_strategy.compute_merge_sequences(
+                     max_n_off_target_rgs=max_n_off_target_rgs,
+                 ),
+                 split_sequence=merge_split_strategy.compute_split_sequence,
+                 drop_duplicates=drop_duplicates,
+                 subset=subset,
+             ),
+             write_metadata_file=False,
+             **kwargs,
+         )
+         # Remove row groups of data that is overlapping.
+         file_id_col_idx = ordered_parquet_dataset.row_group_stats.columns.get_loc(KEY_FILE_IDS)
+         rg_file_ids_to_remove = [
+             file_id
+             for rg_idx_mr_start_end_excl in merge_split_strategy.rg_idx_mrs_starts_ends_excl
+             for file_id in ordered_parquet_dataset.row_group_stats.iloc[
+                 rg_idx_mr_start_end_excl,
+                 file_id_col_idx,
+             ].to_list()
+         ]
+         if rg_file_ids_to_remove:
+             ordered_parquet_dataset._remove_row_group_files(
+                 file_ids=rg_file_ids_to_remove,
+                 sort_row_groups=merge_split_strategy.sort_rgs_after_write,
+                 key_value_metadata=key_value_metadata,
+             )
+             # 'remove_row_group_files()' embeds calls to 'sort_row_groups()' and
+             # 'align_file_ids()' methods.
+             return
+         # Rename partition files.
+         elif merge_split_strategy.sort_rgs_after_write:
+             ordered_parquet_dataset._sort_row_groups()
+             ordered_parquet_dataset._align_file_ids()
+     # Write metadata.
+     ordered_parquet_dataset._write_metadata_file(key_value_metadata=key_value_metadata)
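
For orientation, here is a minimal, hypothetical usage sketch of the write() function added above. The target directory, column names, and values are illustrative only; the keyword arguments are those of the signature shown in this diff, and write() is imported from the module path of the file itself.

from pandas import DataFrame, date_range

from oups.store.ordered_parquet_dataset.write.write import write

# Hypothetical data, ordered on a 'ts' timestamp column.
df = DataFrame(
    {
        "ts": date_range("2025-01-01", periods=6, freq="1h"),
        "value": [1.0, 2.0, 2.0, 3.0, 4.0, 5.0],
    },
)
write(
    "/tmp/oups_demo_dataset",      # hypothetical target directory
    ordered_on="ts",               # column the dataset is sorted on
    df=df,
    row_group_target_size="1D",    # pandas freqstr: gather rows per calendar day
    duplicates_on=["ts"],          # drop rows duplicated on 'ts', keeping the last
    max_n_off_target_rgs=2,        # coalesce once off-target row groups exceed 2
)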

oups/store/store/__init__.py
@@ -0,0 +1,11 @@
+ #!/usr/bin/env python3
+ """
+ Store for oups library.
+ """
+
+ from .store import Store
+
+
+ __all__ = [
+     "Store",
+ ]
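
Since this __init__ re-exports Store, user code can presumably use the shorter subpackage import; a trivial check of that assumption:

# Both imports should resolve to the same class, given the re-export above.
from oups.store.store import Store
from oups.store.store.store import Store as StoreDirect

assert Store is StoreDirect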

oups/store/store/dataset_cache.py
@@ -0,0 +1,50 @@
+ #!/usr/bin/env python3
+ """
+ Created on Mon Jun 16 20:00:00 2025.
+
+ @author: pierrot
+
+ OrderedParquetDataset caching utilities for Store operations.
+
+ """
+ from __future__ import annotations
+
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING
+
+ from oups.store.indexer import StoreKey
+
+
+ if TYPE_CHECKING:
+     from oups.store.store import Store
+
+
+ @contextmanager
+ def cached_datasets[K: StoreKey](
+     store: Store[K],
+     keys: list[K],
+ ):
+     """
+     Context manager for caching OrderedParquetDataset objects.
+
+     Parameters
+     ----------
+     store : Store[K]
+         Store instance to get datasets from
+     keys : list[K]
+         List of dataset keys to cache
+
+     Yields
+     ------
+     dict[K, OrderedParquetDataset]
+         Dictionary mapping keys to cached dataset objects
+
+     """
+     cache = {}
+     try:
+         cache = {key: store[key] for key in keys}
+         yield cache
+     finally:
+         # Explicitly trigger OrderedParquetDataset deletion to release the lock.
+         for key in keys:
+             del cache[key]
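
A hedged sketch of how cached_datasets might be used: the Store construction and the concrete keys are not part of this diff, so they are left as function parameters rather than invented here; len() on a dataset is assumed valid since write.py above calls len(ordered_parquet_dataset).

from __future__ import annotations

from oups.store.store.dataset_cache import cached_datasets


def total_rows(store, keys) -> int:
    """Sum row counts over several datasets while holding them in the cache."""
    # On exit from the 'with' block, the context manager deletes the cached
    # datasets, releasing their locks (see the 'finally' clause above).
    with cached_datasets(store, keys) as datasets:
        # Inside the block, each key maps to a single cached
        # OrderedParquetDataset object, so repeated accesses do not reopen it.
        return sum(len(datasets[key]) for key in keys)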