oups 2025.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oups might be problematic. Click here for more details.

Files changed (43)
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
oups/__init__.py ADDED
@@ -0,0 +1,40 @@
1
#!/usr/bin/env python3
"""
Created on Wed Dec 1 18:35:00 2021.

@author: pierrot

"""
import sys

# Version is resolved dynamically from the installed package metadata.
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version


# Skip the numba-heavy 'stateful_ops' imports during documentation builds:
# when sphinx is driving the import, expose placeholders instead.
if "sphinx" not in sys.modules:
    from .stateful_ops import AggStream
    from .stateful_ops import by_x_rows
else:
    AggStream = None
    by_x_rows = None

# NOTE(review): AsofMerger is imported from 'stateful_ops' unconditionally,
# even under sphinx — confirm this import does not pull in numba as well.
from .stateful_loop import StatefulLoop
from .stateful_ops import AsofMerger
from .store import OrderedParquetDataset
from .store import Store
from .store import check_cmidx
from .store import conform_cmidx
from .store import is_toplevel
from .store import sublevel
from .store import toplevel
from .store import write


try:
    __version__ = version("oups")
except PackageNotFoundError:
    # Package is not installed, likely in development.
    __version__ = "development"
oups/date_utils.py ADDED
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Sat May 24 18:35:00 2025.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+ from pandas import Timedelta
9
+ from pandas import Timestamp
10
+ from pandas import date_range
11
+
12
+
13
def floor_ts(ts: Timestamp, freq: str) -> Timestamp:
    """
    Floor a timestamp even if using non-fixed frequency.

    Parameters
    ----------
    ts : Timestamp
        Timestamp to floor.
    freq : str
        Frequency string.
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

    Returns
    -------
    Timestamp
        Floored timestamp.

    """
    try:
        floored = ts.floor(freq)
    except ValueError:
        # Non-fixed frequency (e.g. 'MS'): 'Timestamp.floor' refuses it, so
        # derive the floor as the single point of a 'date_range' ending at
        # the normalized timestamp.
        floored = date_range(end=ts.normalize(), periods=1, freq=freq)[0]
    return floored
36
+
37
+
38
def ceil_ts(ts: Timestamp, freq: str) -> Timestamp:
    """
    Ceil a timestamp even if using non-fixed frequency.

    Parameters
    ----------
    ts : Timestamp
        Timestamp to ceil.
    freq : str
        Frequency string.
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases

    Returns
    -------
    Timestamp
        Ceiled timestamp.

    """
    try:
        # 'Timestamp.ceil' is not used because it does not work on 'D'aily
        # frequency; shift by one period then floor instead. Note this yields
        # the *next* period start when 'ts' is already on a boundary.
        ceiled = (ts + Timedelta(1, unit=freq)).floor(freq)
    except ValueError:
        # Non-fixed frequency: take the second point of a range starting at
        # the floored timestamp.
        ceiled = date_range(start=floor_ts(ts, freq), periods=2, freq=freq)[1]
    return ceiled
oups/defines.py ADDED
@@ -0,0 +1,26 @@
1
#!/usr/bin/env python3
"""
Created on Wed Dec 1 18:35:00 2021.

@author: pierrot

"""
# Central key in oups world, used as ID of the column name according to which
# dataframes are ordered.
KEY_ORDERED_ON = "ordered_on"
# Other shared keys.
KEY_FILE_IDS = "file_ids"
KEY_N_ROWS = "n_rows"
KEY_ORDERED_ON_MINS = "ordered_on_mins"
KEY_ORDERED_ON_MAXS = "ordered_on_maxs"
# A specific key for a function parameter with a three-fold type:
# - None, meaning no duplicates check,
# - an empty list, meaning identify duplicates on all columns of the dataframe,
# - a non-empty list of strings, the columns on which to identify row duplicates.
KEY_DUPLICATES_ON = "duplicates_on"
# In a fastparquet `ParquetFile`, oups-specific metadata is stored as value for
# this key. (NOTE(review): the comment originally referenced a
# `KEY_METADATA_KEY` name that does not exist here — presumably it means this
# constant; confirm.)
KEY_OUPS_METADATA = "oups"
# Parameters for 'write()' function.
KEY_ROW_GROUP_TARGET_SIZE = "row_group_target_size"
KEY_MAX_N_OFF_TARGET_RGS = "max_n_off_target_rgs"
oups/numpy_utils.py ADDED
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Mon Jun 02 18:35:00 2025.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+
9
+ from numpy import arange
10
+ from numpy import flip
11
+ from numpy import isnan
12
+ from numpy import maximum
13
+ from numpy import ones
14
+ from numpy import searchsorted
15
+ from numpy import where
16
+ from numpy.typing import NDArray
17
+
18
+
19
+ def isnotin_ordered(
20
+ sorted_array: NDArray,
21
+ query_elements: NDArray,
22
+ return_insert_positions: bool = False,
23
+ ) -> NDArray | tuple[NDArray, NDArray]:
24
+ """
25
+ Check if query elements are not present in a sorted array.
26
+
27
+ Parameters
28
+ ----------
29
+ sorted_array : NDArray
30
+ Sorted array in which to search for elements.
31
+ Must be sorted in ascending order.
32
+ query_elements : NDArray
33
+ Array of elements to search for.
34
+ Must be sorted in ascending order if containing elements which are
35
+ are larger than the largest element in 'sorted_array'.
36
+ return_insert_positions : bool, optional
37
+ If True, also return the insert positions where unfound elements
38
+ could be inserted into 'sorted_array', maintaining sort order.
39
+ Default is False.
40
+
41
+ Returns
42
+ -------
43
+ Union[NDArray, tuple[NDArray, NDArray]]
44
+ If 'return_insert_positions' is False:
45
+ Array of booleans with same length as 'query_elements', where
46
+ True indicates the element is not found in 'sorted_array'.
47
+ If 'return_insert_positions' is True:
48
+ Tuple containing:
49
+ - Array of booleans indicating which query elements are not found,
50
+ - Array of insert positions for unfound elements (where they could
51
+ be inserted into 'sorted_array', maintaining sort order)
52
+
53
+ Examples
54
+ --------
55
+ >>> sorted_arr = np.array([1, 3, 5, 7, 9])
56
+ >>> queries = np.array([2, 3, 6, 7])
57
+ >>> isnotin_ordered(sorted_arr, queries)
58
+ array([True, False, True, False])
59
+
60
+ >>> not_found, insert_positions = isnotin_ordered(sorted_arr, queries, return_insert_positions=True)
61
+ >>> not_found
62
+ array([True, False, True, False])
63
+ >>> insert_positions
64
+ array([1, 3]) # positions where 2 and 6 could be inserted
65
+
66
+ """
67
+ insert_idx = searchsorted(sorted_array, query_elements, side="left")
68
+ # Check if elements are found: they exist if insert position is valid
69
+ # and the element at that position matches the query
70
+ found_max_idx = searchsorted(insert_idx, len(sorted_array))
71
+ is_not_found = ones(len(query_elements), dtype=bool)
72
+ is_not_found[:found_max_idx] = sorted_array[insert_idx[:found_max_idx]] != query_elements[:found_max_idx]
73
+ return (is_not_found, insert_idx[is_not_found]) if return_insert_positions else is_not_found
74
+
75
+
76
def ffill1d(arr: NDArray) -> None:
    """
    Forward fill 1D array in-place.

    Parameters
    ----------
    arr : NDArray
        Array to fill. NaN entries are overwritten with the closest
        preceding non-NaN value.

    Returns
    -------
    None
        The array is modified in-place.

    Notes
    -----
    If first value is nan, it is kept as nan (there is no preceding value
    to propagate).

    """
    # Fix: the return annotation previously claimed 'NDArray' while the
    # function returns None, as its docstring states.
    mask = isnan(arr)
    # For each position, index of the most recent non-NaN entry (0 when no
    # non-NaN entry has been seen yet).
    idx = where(~mask, arange(mask.size), 0)
    maximum.accumulate(idx, out=idx)
    arr[mask] = arr[idx[mask]]


def bfill1d(arr: NDArray) -> None:
    """
    Backward fill 1D array in-place.

    Parameters
    ----------
    arr : NDArray
        Array to fill. NaN entries are overwritten with the closest
        following non-NaN value.

    Returns
    -------
    None
        The array is modified in-place.

    Notes
    -----
    If last value is nan, it is kept as nan.

    """
    # 'flip' returns a reversed *view*, so forward-filling the view
    # backward-fills the original array in-place.
    ffill1d(flip(arr, 0))
@@ -0,0 +1,14 @@
1
#!/usr/bin/env python3
"""
OUPS Stateful Loop Module.

This module provides the core stateful loop functionality.

"""

from .stateful_loop import StatefulLoop


# Public API of the subpackage.
__all__ = [
    "StatefulLoop",
]
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Wed Jun 1 18:35:00 2025.
4
+
5
+ Loop persistence I/O helpers for the stateful loop.
6
+
7
+ """
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from cloudpickle import dump
12
+ from cloudpickle import load
13
+
14
+
15
# Keys of the persistence payload dict.
KEY_ID = "id"
KEY_VERSION = "version"
KEY_STATES = "states"
# Expected identifier and schema version of a valid persistence file.
PERSISTENCE_ID = "stateful_loop"
PERSISTENCE_VERSION = 1
20
+
21
+
22
class LoopPersistenceIO:
    """
    Helper for reading and writing the loop persistence file with schema validation.

    Schema: {"id": "stateful_loop", "version": 1, "states": {...}}.

    """

    @staticmethod
    def load(path: Path) -> dict[str, dict[str, Any]]:
        """
        Load and validate states from ``path``.

        Parameters
        ----------
        path : Path
            Location of the persistence file.

        Returns
        -------
        dict[str, dict[str, Any]]
            Mapping of state names to state payloads.

        Raises
        ------
        ValueError
            If the file content does not match the expected schema, id or
            version.

        """
        # Inside this method, 'load' resolves to the module-level cloudpickle
        # 'load' (class scope is not part of function name lookup), not to
        # this static method.
        with open(path, "rb") as f:
            payload = load(f)
        if not isinstance(payload, dict):
            raise ValueError("invalid persistence file format.")
        # Use '.get()' so a payload missing a schema key surfaces as the
        # documented ValueError rather than an opaque KeyError.
        if payload.get(KEY_ID) != PERSISTENCE_ID:
            raise ValueError("invalid persistence file id.")
        if payload.get(KEY_VERSION) != PERSISTENCE_VERSION:
            raise ValueError("unsupported persistence file version.")
        states = payload.get(KEY_STATES)
        if not isinstance(states, dict):
            raise ValueError("invalid 'states' content in persistence file.")
        return states

    @staticmethod
    def save(path: Path, states: dict[str, dict[str, Any]]) -> None:
        """
        Save ``states`` to ``path`` with schema and version.

        Parameters
        ----------
        path : Path
            Destination of the persistence file. Overwritten if it exists.
        states : dict[str, dict[str, Any]]
            Mapping of state names to state payloads.

        """
        payload = {KEY_ID: PERSISTENCE_ID, KEY_VERSION: PERSISTENCE_VERSION, KEY_STATES: states}
        with open(path, "wb") as f:
            dump(payload, f)