oups-2025.9.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of oups might be problematic.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0
@@ -0,0 +1,270 @@
#!/usr/bin/env python3
"""
Created on Wed Dec 6 22:30:00 2021.

@author: pierrot

"""
from __future__ import annotations

from importlib import import_module
from pathlib import Path
from typing import TYPE_CHECKING

from pandas import DataFrame
from pandas import Series

from oups.defines import KEY_FILE_IDS
from oups.defines import KEY_N_ROWS
from oups.defines import KEY_ORDERED_ON_MAXS
from oups.defines import KEY_ORDERED_ON_MINS
from oups.store.ordered_parquet_dataset.write.iter_merge_split_data import iter_merge_split_data
from oups.store.ordered_parquet_dataset.write.merge_split_strategies import NRowsMergeSplitStrategy
from oups.store.ordered_parquet_dataset.write.merge_split_strategies import TimePeriodMergeSplitStrategy


if TYPE_CHECKING:
    from oups.store.ordered_parquet_dataset.ordered_parquet_dataset.base import OrderedParquetDataset


ROW_GROUP_INT_TARGET_SIZE = 6_345_000


def _validate_duplicate_on_param(
    duplicates_on: str | list[str] | list[tuple[str]],
    ordered_on: str,
) -> tuple[bool, str | list[str] | None]:
    """
    Validate and normalize duplicate parameters.

    Parameters
    ----------
    duplicates_on : str | list[str] | list[tuple[str]]
        Column(s) to check for duplicates. If an empty list, all columns are used.
    ordered_on : str
        Column name by which data is ordered.

    Returns
    -------
    tuple[bool, str | list[str] | None]
        Boolean flag indicating if duplicates are to be dropped, and the subset
        to pass to ``DataFrame.drop_duplicates`` (including 'ordered_on'). If
        duplicates are dropped, ``None`` indicates to consider all columns.

    """
    if duplicates_on is None:
        return (False, None)
    else:
        if isinstance(duplicates_on, list):
            if duplicates_on == []:
                return (True, None)
            elif ordered_on not in duplicates_on:
                duplicates_on.append(ordered_on)
            return (True, duplicates_on)
        else:
            # 'duplicates_on' is a single column name.
            if duplicates_on != ordered_on:
                return (True, [duplicates_on, ordered_on])
            return (True, ordered_on)


def write(
    dirpath: str | Path | OrderedParquetDataset,
    *,
    ordered_on: str | tuple[str],
    df: DataFrame | None = None,
    row_group_target_size: int | str | None = ROW_GROUP_INT_TARGET_SIZE,
    duplicates_on: str | list[str] | list[tuple[str]] | None = None,
    max_n_off_target_rgs: int | None = None,
    key_value_metadata: dict[str, str] | None = None,
    **kwargs,
):
    """
    Write data to disk at location specified by path.

    Parameters
    ----------
    dirpath : str | Path | OrderedParquetDataset
        If a string or a Path, the directory to which the pandas dataframe is
        written.
        If an OrderedParquetDataset, the dataset to which the pandas dataframe
        is written.
    ordered_on : str | tuple[str]
        Name of the column with respect to which the dataset is in ascending
        order. If the columns are a multi-index, the column name is a tuple.
        It indicates 'where' to insert new data into existing data, i.e. for
        completing or correcting past records (but it does not allow removing
        prior data).
    df : DataFrame | None, default None
        Data to write. If ``None``, a resize of the Ordered Parquet Dataset may
        still be performed.
    row_group_target_size : int | str | None
        Target size of row groups. If not set, defaults to ``6_345_000``, which
        for a dataframe with 6 columns of ``float64`` or ``int64`` results in a
        memory footprint (RAM) of about 290MB.
        It can also be a pandas `freqstr`, to gather data by timestamp over a
        defined period.
    duplicates_on : str | list[str] | list[tuple[str]] | None
        Column names according to which 'row duplicates' can be identified
        (i.e. rows sharing the same values on these specific columns) so as to
        drop them. Duplicates are only identified in new data and in existing
        recorded row groups that overlap with new data.
        If duplicates are dropped, only the last one is kept.
        To identify row duplicates using all columns, an empty list ``[]`` can
        be used instead of listing all column names.
        If not set, defaults to ``None``, meaning no row is dropped.
    max_n_off_target_rgs : int | None
        Max expected number of 'off target' row groups.
        If 'row_group_target_size' is an ``int``, then a 'complete' row group
        is one whose size is 'close to' ``row_group_target_size`` (>= 80%).
        If 'row_group_target_size' is a pandas `freqstr`, and if there are
        several row groups in the last period defined by the `freqstr`, then
        these row groups are considered incomplete.
        To evaluate the number of 'incomplete' row groups, only those at the
        end of an existing dataset are accounted for. 'Incomplete' row groups
        in the middle of 'complete' row groups are not accounted for (they can
        be created by insertion of new data in the middle of existing data).
        If not set, defaults to ``None``.

        - A ``None`` value induces no coalescing of row groups. If duplicates
          are not dropped, new data is systematically appended.
        - A value of ``0`` or ``1`` means that new data should systematically
          be merged with the last existing row group to 'complete' it (if it
          is not 'complete' already).

    key_value_metadata : dict[str, str], optional
        Key-value metadata to write or update in the dataset. Please see
        fastparquet for the update logic when a `None` value is used.
    **kwargs :
        Additional parameters to pass to
        'OrderedParquetDataset.write_row_group_files()'.

    Notes
    -----
    - When writing a dataframe with this function,

      - the index of the dataframe is not written to disk.
      - the parquet file scheme is 'hive' (one row group per parquet file).

    - Coalescing of off-target-size row groups is triggered if the actual
      number of off-target row groups is larger than ``max_n_off_target_rgs``.
      This assessment is only made if ``max_n_off_target_rgs`` is set.
      Otherwise, new data is simply appended, without prior check.
    - When ``duplicates_on`` is set, the 'ordered_on' column is added to the
      ``duplicates_on`` list if not already part of it. The purpose is to
      enable a first approximate search for duplicates, so as to load only the
      data of interest.
    - For simple data appending, i.e. without the need to drop duplicates, it
      is advised to keep the ``duplicates_on`` parameter set to ``None``, as
      setting it triggers unnecessary evaluations.
    - Off-target-size row groups are row groups:

      - either not reaching the maximum number of rows, if
        'row_group_target_size' is an ``int``,
      - or several row groups lying in the same time period, if
        'row_group_target_size' is a pandas 'freqstr'.

    - When incorporating new data within recorded data, existing
      off-target-size row groups will only be resized if they intersect with
      the new data. Otherwise, new data is only added, without merging with
      existing off-target-size row groups.

    """
    drop_duplicates, subset = _validate_duplicate_on_param(
        duplicates_on=duplicates_on,
        ordered_on=ordered_on,
    )
    df_ordered_on = Series([]) if df is None else df.loc[:, ordered_on]
    # Check that df_ordered_on is sorted.
    if not df_ordered_on.is_monotonic_increasing:
        raise ValueError("'df_ordered_on' must be sorted in ascending order.")
    ordered_parquet_dataset = (
        # Case 'dirpath' is a path to a directory.
        import_module("oups.store.ordered_parquet_dataset").OrderedParquetDataset(
            dirpath,
            ordered_on=ordered_on,
        )
        if isinstance(dirpath, (str, Path))
        else
        # Case 'dirpath' is already an OrderedParquetDataset.
        dirpath
    )
    if df is not None or len(ordered_parquet_dataset):
        if isinstance(row_group_target_size, int):
            if drop_duplicates and df is not None:
                # Duplicates are dropped a first time in the DataFrame, so that the
                # calculation of merge and split strategy is made with the most
                # correct approximate number of rows in DataFrame.
                df.drop_duplicates(subset=subset, keep="last", ignore_index=True, inplace=True)
            merge_split_strategy = NRowsMergeSplitStrategy(
                rg_ordered_on_mins=ordered_parquet_dataset.row_group_stats.loc[
                    :,
                    KEY_ORDERED_ON_MINS,
                ].to_numpy(),  # rg_ordered_on_mins,
                rg_ordered_on_maxs=ordered_parquet_dataset.row_group_stats.loc[
                    :,
                    KEY_ORDERED_ON_MAXS,
                ].to_numpy(),  # rg_ordered_on_maxs,
                df_ordered_on=df_ordered_on,
                drop_duplicates=drop_duplicates,
                rgs_n_rows=ordered_parquet_dataset.row_group_stats.loc[:, KEY_N_ROWS],
                row_group_target_size=row_group_target_size,
            )
        else:
            merge_split_strategy = TimePeriodMergeSplitStrategy(
                rg_ordered_on_mins=ordered_parquet_dataset.row_group_stats.loc[
                    :,
                    KEY_ORDERED_ON_MINS,
                ].to_numpy(),  # rg_ordered_on_mins,
                rg_ordered_on_maxs=ordered_parquet_dataset.row_group_stats.loc[
                    :,
                    KEY_ORDERED_ON_MAXS,
                ].to_numpy(),  # rg_ordered_on_maxs,
                df_ordered_on=df_ordered_on,
                drop_duplicates=drop_duplicates,
                row_group_time_period=row_group_target_size,
            )
        ordered_parquet_dataset._write_row_group_files(
            dfs=iter_merge_split_data(
                opd=ordered_parquet_dataset,
                ordered_on=ordered_on,
                df=df,
                merge_sequences=merge_split_strategy.compute_merge_sequences(
                    max_n_off_target_rgs=max_n_off_target_rgs,
                ),
                split_sequence=merge_split_strategy.compute_split_sequence,
                drop_duplicates=drop_duplicates,
                subset=subset,
            ),
            write_metadata_file=False,
            **kwargs,
        )
        # Remove row groups of data that is overlapping.
        file_id_col_idx = ordered_parquet_dataset.row_group_stats.columns.get_loc(KEY_FILE_IDS)
        rg_file_ids_to_remove = [
            file_id
            for rg_idx_mr_start_end_excl in merge_split_strategy.rg_idx_mrs_starts_ends_excl
            for file_id in ordered_parquet_dataset.row_group_stats.iloc[
                rg_idx_mr_start_end_excl,
                file_id_col_idx,
            ].to_list()
        ]
        if rg_file_ids_to_remove:
            ordered_parquet_dataset._remove_row_group_files(
                file_ids=rg_file_ids_to_remove,
                sort_row_groups=merge_split_strategy.sort_rgs_after_write,
                key_value_metadata=key_value_metadata,
            )
            # 'remove_row_group_files()' embeds calls to 'sort_row_groups()' and
            # 'align_file_ids()' methods.
            return
        # Rename partition files.
        elif merge_split_strategy.sort_rgs_after_write:
            ordered_parquet_dataset._sort_row_groups()
            ordered_parquet_dataset._align_file_ids()
    # Write metadata.
    ordered_parquet_dataset._write_metadata_file(key_value_metadata=key_value_metadata)
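write() above is the top-level entry point for appending ordered data to an OrderedParquetDataset. The usage sketch below is not part of the wheel contents; it is a minimal illustration based on the docstring, and the dataset directory name, column names, and sample values are assumptions made for the example.

# Illustrative only: directory, column names, and values are hypothetical.
from pandas import DataFrame, date_range

from oups.store.ordered_parquet_dataset.write.write import write

df = DataFrame(
    {
        "timestamp": date_range("2025-01-01", periods=6, freq="1h"),  # ascending, as required
        "value": [1.0, 2.0, 2.0, 3.0, 4.0, 5.0],
    },
)
write(
    "my_dataset",                 # directory created/extended on disk
    ordered_on="timestamp",       # column the dataset is ordered on
    df=df,
    row_group_target_size="1D",   # pandas freqstr: one row group per day
    duplicates_on="timestamp",    # drop rows sharing the same timestamp, keeping the last
    max_n_off_target_rgs=1,       # coalesce trailing off-target row groups
)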
@@ -0,0 +1,50 @@
#!/usr/bin/env python3
"""
Created on Mon Jun 16 20:00:00 2025.

@author: pierrot

OrderedParquetDataset caching utilities for Store operations.

"""
from __future__ import annotations

from contextlib import contextmanager
from typing import TYPE_CHECKING

from oups.store.indexer import StoreKey


if TYPE_CHECKING:
    from oups.store.store import Store


@contextmanager
def cached_datasets[K: StoreKey](
    store: Store[K],
    keys: list[K],
):
    """
    Context manager for caching OrderedParquetDataset objects.

    Parameters
    ----------
    store : Store[K]
        Store instance to get datasets from
    keys : list[K]
        List of dataset keys to cache

    Yields
    ------
    dict[K, OrderedParquetDataset]
        Dictionary mapping keys to cached dataset objects

    """
    cache = {}
    try:
        cache = {key: store[key] for key in keys}
        yield cache
    finally:
        # Explicitly trigger OrderedParquetDataset deletion to release the lock.
        for key in keys:
            del cache[key]
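cached_datasets() keeps the OrderedParquetDataset handles of a Store alive for the duration of a block and releases them on exit. The sketch below is not part of the wheel contents; the surrounding function and its 'store' and 'keys' arguments are hypothetical and assumed to come from an existing oups Store.

# Illustrative only: 'store' and 'keys' are assumed to come from an existing oups Store.
from oups.store.store.dataset_cache import cached_datasets


def process_keys(store, keys):
    # Each dataset handle is opened once and reused inside the block; on exit,
    # handles are removed from the cache so that any underlying locks are released.
    with cached_datasets(store, keys) as datasets:
        for key in keys:
            opd = datasets[key]
            print(key, len(opd))  # e.g. inspect the cached dataset handle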