oups-2025.9.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0
oups/__init__.py
ADDED
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""
+Created on Wed Dec 1 18:35:00 2021.
+
+@author: pierrot
+
+"""
+import sys
+
+# Import version dynamically from Poetry
+from importlib.metadata import PackageNotFoundError
+from importlib.metadata import version
+
+
+# Conditional import for stateful_ops to avoid numba issues during documentation builds
+if "sphinx" in sys.modules:
+    # During documentation builds, skip heavy stateful_ops imports
+    AggStream = None
+    by_x_rows = None
+else:
+    from .stateful_ops import AggStream
+    from .stateful_ops import by_x_rows
+
+from .stateful_loop import StatefulLoop
+from .stateful_ops import AsofMerger
+from .store import OrderedParquetDataset
+from .store import Store
+from .store import check_cmidx
+from .store import conform_cmidx
+from .store import is_toplevel
+from .store import sublevel
+from .store import toplevel
+from .store import write
+
+
+try:
+    __version__ = version("oups")
+except PackageNotFoundError:
+    # Package is not installed, likely in development
+    __version__ = "development"
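For orientation, a minimal sketch of how this re-exported surface is meant to be consumed, assuming oups is installed; the names are exactly those imported above:

import oups

print(oups.__version__)                   # "development" when run from an uninstalled checkout
dataset_cls = oups.OrderedParquetDataset  # re-exported from oups.store
loop_cls = oups.StatefulLoop              # re-exported from oups.stateful_loop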
oups/date_utils.py
ADDED
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""
+Created on Sat May 24 18:35:00 2025.
+
+@author: pierrot
+
+"""
+from pandas import Timedelta
+from pandas import Timestamp
+from pandas import date_range
+
+
+def floor_ts(ts: Timestamp, freq: str) -> Timestamp:
+    """
+    Floor a timestamp even if using non-fixed frequency.
+
+    Parameters
+    ----------
+    ts : Timestamp
+        Timestamp to floor.
+    freq : str
+        Frequency string.
+        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
+
+
+    Returns
+    -------
+    Timestamp
+        Floored timestamp.
+
+    """
+    try:
+        return ts.floor(freq)
+    except ValueError:
+        return date_range(end=ts.normalize(), periods=1, freq=freq)[0]
+
+
+def ceil_ts(ts: Timestamp, freq: str) -> Timestamp:
+    """
+    Ceil a timestamp even if using non-fixed frequency.
+
+    Parameters
+    ----------
+    ts : Timestamp
+        Timestamp to ceil.
+    freq : str
+        Frequency string.
+        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
+
+    Returns
+    -------
+    Timestamp
+        Ceiled timestamp.
+
+    """
+    try:
+        # Can't use pandas 'ceil' because it does not work on 'D' (daily)
+        # frequency.
+        # return ts.ceil(freq)
+        return (ts + Timedelta(1, unit=freq)).floor(freq)
+    except ValueError:
+        return date_range(start=floor_ts(ts, freq), periods=2, freq=freq)[1]
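As a quick illustration of the fallback paths, a sketch assuming pandas is installed: with a non-fixed frequency such as 'MS' (month start), Timestamp.floor raises ValueError and the date_range fallback runs; likewise Timedelta(1, unit="MS") fails inside ceil_ts, triggering its own fallback.

from pandas import Timestamp

from oups.date_utils import ceil_ts
from oups.date_utils import floor_ts

ts = Timestamp("2025-05-24 18:35")
floor_ts(ts, "MS")  # Timestamp('2025-05-01 00:00:00'), via the date_range fallback
ceil_ts(ts, "MS")   # Timestamp('2025-06-01 00:00:00'), via the date_range fallback
floor_ts(ts, "h")   # Timestamp('2025-05-24 18:00:00'), fast path through Timestamp.floor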
oups/defines.py
ADDED
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+"""
+Created on Wed Dec 1 18:35:00 2021.
+
+@author: pierrot
+
+"""
+# Central key in the oups world: ID of the column name according to which
+# dataframes are ordered.
+KEY_ORDERED_ON = "ordered_on"
+# Other shared keys.
+KEY_FILE_IDS = "file_ids"
+KEY_N_ROWS = "n_rows"
+KEY_ORDERED_ON_MINS = "ordered_on_mins"
+KEY_ORDERED_ON_MAXS = "ordered_on_maxs"
+# A specific key for a function parameter with a three-fold type:
+# - None, meaning no duplicates check,
+# - an empty list, meaning identify duplicates on all columns of the dataframe,
+# - a non-empty list or a string, the column(s) used to identify row duplicates.
+KEY_DUPLICATES_ON = "duplicates_on"
+# In a fastparquet `ParquetFile`, oups-specific metadata is stored as value for
+# key `KEY_OUPS_METADATA`.
+KEY_OUPS_METADATA = "oups"
+# Parameters for 'write()' function.
+KEY_ROW_GROUP_TARGET_SIZE = "row_group_target_size"
+KEY_MAX_N_OFF_TARGET_RGS = "max_n_off_target_rgs"
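To make the three-fold 'duplicates_on' contract concrete, a hypothetical consumer (for illustration only, not part of the package) could interpret it like this:

from pandas import DataFrame


def drop_dups(df: DataFrame, duplicates_on) -> DataFrame:
    # Hypothetical helper mirroring the comment above.
    if duplicates_on is None:
        # None: no duplicates check.
        return df
    if duplicates_on == []:
        # Empty list: identify duplicates on all columns.
        return df.drop_duplicates()
    # Non-empty list or string: identify duplicates on the given column(s).
    return df.drop_duplicates(subset=duplicates_on)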
oups/numpy_utils.py
ADDED
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+"""
+Created on Mon Jun 02 18:35:00 2025.
+
+@author: pierrot
+
+"""
+
+from numpy import arange
+from numpy import flip
+from numpy import isnan
+from numpy import maximum
+from numpy import ones
+from numpy import searchsorted
+from numpy import where
+from numpy.typing import NDArray
+
+
+def isnotin_ordered(
+    sorted_array: NDArray,
+    query_elements: NDArray,
+    return_insert_positions: bool = False,
+) -> NDArray | tuple[NDArray, NDArray]:
+    """
+    Check if query elements are not present in a sorted array.
+
+    Parameters
+    ----------
+    sorted_array : NDArray
+        Sorted array in which to search for elements.
+        Must be sorted in ascending order.
+    query_elements : NDArray
+        Array of elements to search for.
+        Must be sorted in ascending order if containing elements which
+        are larger than the largest element in 'sorted_array'.
+    return_insert_positions : bool, optional
+        If True, also return the insert positions where unfound elements
+        could be inserted into 'sorted_array', maintaining sort order.
+        Default is False.
+
+    Returns
+    -------
+    Union[NDArray, tuple[NDArray, NDArray]]
+        If 'return_insert_positions' is False:
+            Array of booleans with same length as 'query_elements', where
+            True indicates the element is not found in 'sorted_array'.
+        If 'return_insert_positions' is True:
+            Tuple containing:
+            - Array of booleans indicating which query elements are not found,
+            - Array of insert positions for unfound elements (where they could
+              be inserted into 'sorted_array', maintaining sort order)
+
+    Examples
+    --------
+    >>> sorted_arr = np.array([1, 3, 5, 7, 9])
+    >>> queries = np.array([2, 3, 6, 7])
+    >>> isnotin_ordered(sorted_arr, queries)
+    array([ True, False,  True, False])
+
+    >>> not_found, insert_positions = isnotin_ordered(sorted_arr, queries, return_insert_positions=True)
+    >>> not_found
+    array([ True, False,  True, False])
+    >>> insert_positions
+    array([1, 3])  # positions where 2 and 6 could be inserted
+
+    """
+    insert_idx = searchsorted(sorted_array, query_elements, side="left")
+    # Check if elements are found: they exist if insert position is valid
+    # and the element at that position matches the query
+    found_max_idx = searchsorted(insert_idx, len(sorted_array))
+    is_not_found = ones(len(query_elements), dtype=bool)
+    is_not_found[:found_max_idx] = sorted_array[insert_idx[:found_max_idx]] != query_elements[:found_max_idx]
+    return (is_not_found, insert_idx[is_not_found]) if return_insert_positions else is_not_found
+
+
+def ffill1d(arr: NDArray) -> None:
+    """
+    Forward fill 1D array in-place.
+
+    Parameters
+    ----------
+    arr : NDArray
+        Array to fill.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    If first value is nan, it is kept as nan.
+
+    """
+    mask = isnan(arr)
+    idx = where(~mask, arange(mask.size), 0)
+    maximum.accumulate(idx, out=idx)
+    arr[mask] = arr[idx[mask]]
+
+
+def bfill1d(arr: NDArray) -> None:
+    """
+    Backward fill 1D array in-place.
+
+    Parameters
+    ----------
+    arr : NDArray
+        Array to fill.
+
+    Returns
+    -------
+    None
+
+    """
+    ffill1d(flip(arr, 0))
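A short demonstration of the two helpers above; a sketch, assuming numpy and oups are installed:

import numpy as np

from oups.numpy_utils import ffill1d
from oups.numpy_utils import isnotin_ordered

# Membership test against a sorted array via binary search.
sorted_arr = np.array([1, 3, 5, 7, 9])
queries = np.array([2, 3, 6, 7])
isnotin_ordered(sorted_arr, queries)  # array([ True, False,  True, False])

# In-place forward fill: each NaN takes the last valid value to its left.
values = np.array([np.nan, 1.0, np.nan, np.nan, 4.0])
ffill1d(values)
# values is now [nan, 1., 1., 1., 4.]; a leading NaN stays NaN.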
oups/stateful_loop/loop_persistence_io.py
ADDED
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""
+Created on Wed Jun 1 18:35:00 2025.
+
+Loop persistence I/O helpers for the stateful loop.
+
+"""
+from pathlib import Path
+from typing import Any
+
+from cloudpickle import dump
+from cloudpickle import load
+
+
+KEY_ID = "id"
+KEY_VERSION = "version"
+KEY_STATES = "states"
+PERSISTENCE_ID = "stateful_loop"
+PERSISTENCE_VERSION = 1
+
+
+class LoopPersistenceIO:
+    """
+    Helper for reading and writing the loop persistence file with schema validation.
+
+    Schema: {"id": "stateful_loop", "version": 1, "states": {...}}.
+
+    """
+
+    @staticmethod
+    def load(path: Path) -> dict[str, dict[str, Any]]:
+        """
+        Load and validate states from ``path``.
+        """
+        with open(path, "rb") as f:
+            payload = load(f)
+        if not isinstance(payload, dict):
+            raise ValueError("invalid persistence file format.")
+        if payload[KEY_ID] != PERSISTENCE_ID:
+            raise ValueError("invalid persistence file id.")
+        if payload[KEY_VERSION] != PERSISTENCE_VERSION:
+            raise ValueError("unsupported persistence file version.")
+        states = payload[KEY_STATES]
+        if not isinstance(states, dict):
+            raise ValueError("invalid 'states' content in persistence file.")
+        return states
+
+    @staticmethod
+    def save(path: Path, states: dict[str, dict[str, Any]]) -> None:
+        """
+        Save ``states`` to ``path`` with schema and version.
+        """
+        payload = {KEY_ID: PERSISTENCE_ID, KEY_VERSION: PERSISTENCE_VERSION, KEY_STATES: states}
+        with open(path, "wb") as f:
+            dump(payload, f)
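A round-trip sketch of the persistence helper, assuming cloudpickle and oups are installed; the file location is hypothetical:

from pathlib import Path

from oups.stateful_loop.loop_persistence_io import LoopPersistenceIO

path = Path("loop_state.bin")               # hypothetical path
states = {"my_loop": {"last_position": 42}}
LoopPersistenceIO.save(path, states)        # wraps states in the {"id", "version", "states"} envelope
assert LoopPersistenceIO.load(path) == states  # validates id and version before returning states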