oups 2025.9.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of oups might be problematic. Click here for more details.
- oups/__init__.py +40 -0
- oups/date_utils.py +62 -0
- oups/defines.py +26 -0
- oups/numpy_utils.py +114 -0
- oups/stateful_loop/__init__.py +14 -0
- oups/stateful_loop/loop_persistence_io.py +55 -0
- oups/stateful_loop/stateful_loop.py +654 -0
- oups/stateful_loop/validate_loop_usage.py +338 -0
- oups/stateful_ops/__init__.py +22 -0
- oups/stateful_ops/aggstream/__init__.py +12 -0
- oups/stateful_ops/aggstream/aggstream.py +1524 -0
- oups/stateful_ops/aggstream/cumsegagg.py +580 -0
- oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
- oups/stateful_ops/aggstream/segmentby.py +1018 -0
- oups/stateful_ops/aggstream/utils.py +71 -0
- oups/stateful_ops/asof_merger/__init__.py +11 -0
- oups/stateful_ops/asof_merger/asof_merger.py +750 -0
- oups/stateful_ops/asof_merger/get_config.py +401 -0
- oups/stateful_ops/asof_merger/validate_params.py +285 -0
- oups/store/__init__.py +15 -0
- oups/store/filepath_utils.py +68 -0
- oups/store/indexer.py +457 -0
- oups/store/ordered_parquet_dataset/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
- oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
- oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
- oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
- oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
- oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
- oups/store/ordered_parquet_dataset/write/write.py +270 -0
- oups/store/store/__init__.py +11 -0
- oups/store/store/dataset_cache.py +50 -0
- oups/store/store/iter_intersections.py +397 -0
- oups/store/store/store.py +345 -0
- oups-2025.9.5.dist-info/LICENSE +201 -0
- oups-2025.9.5.dist-info/METADATA +44 -0
- oups-2025.9.5.dist-info/RECORD +43 -0
- oups-2025.9.5.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,654 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Created on Wed Jun 1 18:35:00 2025.
|
|
4
|
+
|
|
5
|
+
@author: pierrot
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
from collections.abc import Hashable
|
|
11
|
+
from collections.abc import Iterable
|
|
12
|
+
from collections.abc import Iterator
|
|
13
|
+
from functools import partial
|
|
14
|
+
from inspect import signature
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from types import TracebackType
|
|
17
|
+
from typing import Any
|
|
18
|
+
from typing import TypeVar
|
|
19
|
+
|
|
20
|
+
from pandas import DataFrame
|
|
21
|
+
from pandas import concat
|
|
22
|
+
|
|
23
|
+
from oups.stateful_loop.loop_persistence_io import LoopPersistenceIO
|
|
24
|
+
from oups.stateful_loop.validate_loop_usage import validate_loop_usage
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
T = TypeVar("T")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Skip(Exception):
    """
    Signal that the remainder of the current iteration has to be skipped.

    Raising ``Skip`` within a stateful loop asks the loop to move on to the
    next data item without running the code located downstream of the raise
    point. Accumulating operations typically raise it as long as their memory
    limit has not been reached.

    """
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class IterationContext:
|
|
43
|
+
"""
|
|
44
|
+
Per-item context manager that swallows ``Skip`` and yields the item.
|
|
45
|
+
|
|
46
|
+
The ``__enter__`` method returns the current item. If a ``Skip``
|
|
47
|
+
exception is raised inside the ``with`` block, it is swallowed to
|
|
48
|
+
proceed to the next iteration without running downstream code.
|
|
49
|
+
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
def __init__(self, current: Any):
|
|
53
|
+
"""
|
|
54
|
+
Initialize the IterationContext.
|
|
55
|
+
|
|
56
|
+
Parameters
|
|
57
|
+
----------
|
|
58
|
+
current : Any
|
|
59
|
+
The current item to be yielded.
|
|
60
|
+
|
|
61
|
+
"""
|
|
62
|
+
self._current = current
|
|
63
|
+
|
|
64
|
+
def __enter__(self):
|
|
65
|
+
"""
|
|
66
|
+
Return the current item.
|
|
67
|
+
"""
|
|
68
|
+
return self._current
|
|
69
|
+
|
|
70
|
+
def __exit__(
|
|
71
|
+
self,
|
|
72
|
+
exc_type: type[BaseException] | None,
|
|
73
|
+
_exc: BaseException | None,
|
|
74
|
+
_tb: TracebackType | None,
|
|
75
|
+
) -> bool:
|
|
76
|
+
"""
|
|
77
|
+
Exit the IterationContext.
|
|
78
|
+
|
|
79
|
+
Swallow ``Skip`` exceptions to proceed to the next iteration without
|
|
80
|
+
running downstream code.
|
|
81
|
+
|
|
82
|
+
"""
|
|
83
|
+
if exc_type is Skip:
|
|
84
|
+
return True
|
|
85
|
+
return False
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _raise_invalid_state_keys(
|
|
89
|
+
invalid_keys: Iterable[str],
|
|
90
|
+
) -> None:
|
|
91
|
+
"""
|
|
92
|
+
Raise a ValueError if the invalid keys are not empty.
|
|
93
|
+
|
|
94
|
+
Parameters
|
|
95
|
+
----------
|
|
96
|
+
invalid_keys : Iterable[str]
|
|
97
|
+
Keys that are not present in the stateful function/object.
|
|
98
|
+
|
|
99
|
+
Raises
|
|
100
|
+
------
|
|
101
|
+
ValueError
|
|
102
|
+
If the invalid keys are not empty.
|
|
103
|
+
|
|
104
|
+
"""
|
|
105
|
+
if invalid_keys:
|
|
106
|
+
raise ValueError(
|
|
107
|
+
"state contains keys not present in stateful function/object: " + ", ".join(sorted(invalid_keys)),
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class StatefulLoop:
|
|
112
|
+
"""
|
|
113
|
+
Main orchestrator for stateful loop execution.
|
|
114
|
+
|
|
115
|
+
The StatefulLoop class provides the core functionality wrapping a
|
|
116
|
+
lightweight data-processing ``for`` loop, including iteration control, state
|
|
117
|
+
management, and DataFrame buffering with memory-triggered concatenation.
|
|
118
|
+
|
|
119
|
+
If the provided ``filepath`` already exists at instantiation time,
|
|
120
|
+
its content is loaded and used to initialize the internal state store. In
|
|
121
|
+
that case, any initial values given later via ``bind_function_state`` or
|
|
122
|
+
``bind_object_state`` are ignored for the corresponding bindings, because
|
|
123
|
+
previously recorded state takes precedence. This enables resuming a
|
|
124
|
+
stateful loop by re-running the same function that declares the loop,
|
|
125
|
+
bindings, and iteration.
|
|
126
|
+
|
|
127
|
+
Targeted usage is:
|
|
128
|
+
|
|
129
|
+
- define a function that receives a data iterable/generator as a parameter.
|
|
130
|
+
- instantiate ``StatefulLoop`` inside the function, providing a stable
|
|
131
|
+
loop persistence ``filepath``.
|
|
132
|
+
- bind stateful functions/objects with ``bind_function_state`` and
|
|
133
|
+
``bind_object_state``.
|
|
134
|
+
- iterate using ``for item_ctx in loop.iterate(source):``.
|
|
135
|
+
|
|
136
|
+
On subsequent calls of the same function, the stored state is loaded from
|
|
137
|
+
the loop persistence ``filepath`` at construction time, so stateful
|
|
138
|
+
functions resume from their last recorded state and ignore newly provided
|
|
139
|
+
initial values. State is persisted when the loop completes (after the last
|
|
140
|
+
item).
|
|
141
|
+
|
|
142
|
+
Attributes
|
|
143
|
+
----------
|
|
144
|
+
default_memory_limit_mb : float
|
|
145
|
+
Default memory limit in megabytes used by accumulation when
|
|
146
|
+
no per-call override is provided.
|
|
147
|
+
default_memory_limit_bytes : int
|
|
148
|
+
Default memory limit in bytes used by accumulation when
|
|
149
|
+
no per-call override is provided.
|
|
150
|
+
is_last_iteration : bool
|
|
151
|
+
Flag indicating if this is the last iteration of the stateful loop.
|
|
152
|
+
iteration_count : int
|
|
153
|
+
Current iteration count (0-based).
|
|
154
|
+
Value is '-1' till the loop starts.
|
|
155
|
+
filepath : Path
|
|
156
|
+
Path of the loop persistence file: serialized states for stateful
|
|
157
|
+
functions/objects and a run-flag used by loop validation and buffering
|
|
158
|
+
behavior.
|
|
159
|
+
_persistence_loaded : bool
|
|
160
|
+
Whether a persistence file existed and was loaded at construction time.
|
|
161
|
+
Used to decide default behavior of the buffer placement validation.
|
|
162
|
+
_data_buffer : dict[int, defaultdict[Hashable, list[DataFrame]]]
|
|
163
|
+
Nested buffer for buffering DataFrames.
|
|
164
|
+
First level keys are buffer IDs (call position within iteration),
|
|
165
|
+
second level keys are user-provided keys, values are lists of
|
|
166
|
+
DataFrames.
|
|
167
|
+
_iteration_buffer_current : int
|
|
168
|
+
Tracks the iteration index for which the current buffer position
|
|
169
|
+
counter is valid. Used to reset the counter at each new iteration.
|
|
170
|
+
_iteration_buffer_count : int
|
|
171
|
+
Tracks the 0-based call position of ``buffer()`` within the current
|
|
172
|
+
iteration.
|
|
173
|
+
_memory_usage_bytes : dict[int, int]
|
|
174
|
+
Memory usage tracking per buffer ID in bytes.
|
|
175
|
+
_state_key_counts : dict[str, int]
|
|
176
|
+
Counter per base state reference used to generate stable unique keys
|
|
177
|
+
for stateful functions and objects (e.g., ``func:name#1``,
|
|
178
|
+
``obj:name#1``).
|
|
179
|
+
_state_store : dict[str, dict[str, Any]]
|
|
180
|
+
In-memory state storage (persisted on disk when stateful loop finishes).
|
|
181
|
+
For stateful functions: stores parameter name -> value mappings.
|
|
182
|
+
For stateful objects: stores last persisted attribute values.
|
|
183
|
+
_object_bindings : dict[str, tuple[Any, list[str]]]
|
|
184
|
+
Registry of object bindings keyed by namespaced state reference
|
|
185
|
+
(e.g., ``obj:Counter#1``) to a tuple of the bound object and the list
|
|
186
|
+
of attribute names to snapshot on save.
|
|
187
|
+
|
|
188
|
+
Methods
|
|
189
|
+
-------
|
|
190
|
+
iterate(iterable: Iterable[Any], *, check_loop_usage: Optional[bool] = None)
|
|
191
|
+
-> Iterator[IterationContext]
|
|
192
|
+
Wrap an iterable to control loop flow in stateful loop context. Optionally
|
|
193
|
+
runs a strict AST validation that enforces legal buffer placement.
|
|
194
|
+
buffer(
|
|
195
|
+
data: dict[Hashable, DataFrame],
|
|
196
|
+
memory_limit_mb: Optional[float] = None,
|
|
197
|
+
concat_func: Callable[[list[DataFrame]], DataFrame] = pandas.concat,
|
|
198
|
+
) -> Optional[dict[Hashable, DataFrame]]
|
|
199
|
+
Buffer DataFrames in memory and track memory usage.
|
|
200
|
+
bind_function_state(func: Callable[..., Any], *, state: dict[str, Any],
|
|
201
|
+
name: Optional[str] = None) -> Callable[..., Any]
|
|
202
|
+
Wrap a function to bind specified parameters as state across iterations.
|
|
203
|
+
bind_object_state(obj: T, *, state: list[str], name: Optional[str] = None) -> T
|
|
204
|
+
Register a stateful object for state binding.
|
|
205
|
+
|
|
206
|
+
"""
|
|
207
|
+
|
|
208
|
+
def __init__(
|
|
209
|
+
self,
|
|
210
|
+
filepath: Path,
|
|
211
|
+
*,
|
|
212
|
+
default_memory_limit_mb: float = 300.0,
|
|
213
|
+
):
|
|
214
|
+
"""
|
|
215
|
+
Initialize the StatefulLoop.
|
|
216
|
+
|
|
217
|
+
Parameters
|
|
218
|
+
----------
|
|
219
|
+
filepath : Path
|
|
220
|
+
File path for storing the loop persistence. If this file
|
|
221
|
+
already exists when the instance is created, its content is loaded
|
|
222
|
+
to initialize the internal state store, allowing stateful loops to
|
|
223
|
+
resume from a prior run. This also flags that a successful prior run
|
|
224
|
+
occurred, allowing the loop validation to be skipped on subsequent
|
|
225
|
+
runs if 'check_loop_usage' is ``None``.
|
|
226
|
+
default_memory_limit_mb : float, default 300.0
|
|
227
|
+
Default memory limit in megabytes used by delayed concatenation when
|
|
228
|
+
no per-instance/per-site override is provided.
|
|
229
|
+
|
|
230
|
+
"""
|
|
231
|
+
self._filepath = filepath
|
|
232
|
+
self._default_memory_limit_bytes = int(default_memory_limit_mb * 1024 * 1024)
|
|
233
|
+
# Simple iteration context attributes
|
|
234
|
+
self.is_last_iteration = False
|
|
235
|
+
# 'iteration_count' will be set to 0 at first iteration.
|
|
236
|
+
self.iteration_count: int = -1
|
|
237
|
+
# In-memory state storage (persisted when the stateful loop finishes).
|
|
238
|
+
# If a loop persistence file exists, load it to resume previous states;
|
|
239
|
+
# otherwise start empty.
|
|
240
|
+
self._state_store: dict[str, dict[str, Any]] = {}
|
|
241
|
+
self._persistence_loaded = Path(self._filepath).exists()
|
|
242
|
+
if self._persistence_loaded:
|
|
243
|
+
self._state_store = LoopPersistenceIO.load(self._filepath)
|
|
244
|
+
# Track counts for state references to ensure stable, unique keys
|
|
245
|
+
self._state_key_counts: dict[str, int] = defaultdict(int)
|
|
246
|
+
# Registry for object bindings (strong refs during loop lifetime)
|
|
247
|
+
self._object_bindings: dict[str, tuple[Any, list[str]]] = {}
|
|
248
|
+
# Track buffer call order within each iteration
|
|
249
|
+
self._iteration_buffer_current = -1
|
|
250
|
+
self._iteration_buffer_count = 0
|
|
251
|
+
# Data buffer for buffering: buffer_id -> user_key -> list[DataFrame]
|
|
252
|
+
self._data_buffer: dict[int, defaultdict[Hashable, list[DataFrame]]] = {}
|
|
253
|
+
# Track memory usage per buffer_id in bytes
|
|
254
|
+
self._memory_usage_bytes: dict[int, int] = {}
|
|
255
|
+
|
|
256
|
+
def __repr__(self):
|
|
257
|
+
"""
|
|
258
|
+
Return string representation of the StatefulLoop.
|
|
259
|
+
"""
|
|
260
|
+
return (
|
|
261
|
+
f"StatefulLoop(filepath={self.filepath}, "
|
|
262
|
+
f"default_memory_limit_mb={self.default_memory_limit_mb})"
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
@property
|
|
266
|
+
def filepath(self):
|
|
267
|
+
"""
|
|
268
|
+
Return loop persistence file path.
|
|
269
|
+
"""
|
|
270
|
+
return self._filepath
|
|
271
|
+
|
|
272
|
+
@property
|
|
273
|
+
def default_memory_limit_mb(self):
|
|
274
|
+
"""
|
|
275
|
+
Return default memory limit in megabytes.
|
|
276
|
+
"""
|
|
277
|
+
return self._default_memory_limit_bytes / (1024 * 1024)
|
|
278
|
+
|
|
279
|
+
@property
|
|
280
|
+
def default_memory_limit_bytes(self):
|
|
281
|
+
"""
|
|
282
|
+
Return default memory limit in bytes.
|
|
283
|
+
"""
|
|
284
|
+
return self._default_memory_limit_bytes
|
|
285
|
+
|
|
286
|
+
def bind_function_state(
|
|
287
|
+
self,
|
|
288
|
+
func: Callable[..., Any],
|
|
289
|
+
*,
|
|
290
|
+
state: dict[str, Any],
|
|
291
|
+
name: str | None = None,
|
|
292
|
+
) -> Callable[..., Any]:
|
|
293
|
+
"""
|
|
294
|
+
Create a partial callable that binds specified parameters as state.
|
|
295
|
+
|
|
296
|
+
The binding is by reference. For it to work, the parameters bound as
|
|
297
|
+
state must be mutable (e.g., ``dict`` or ``list``) and updated in place
|
|
298
|
+
by the stateful function.
|
|
299
|
+
|
|
300
|
+
Parameters
|
|
301
|
+
----------
|
|
302
|
+
func : callable
|
|
303
|
+
Function to wrap. The partial callable publishes a reduced
|
|
304
|
+
signature that hides state-managed parameters. There is no runtime
|
|
305
|
+
guard: if callers pass those parameters, they will override the
|
|
306
|
+
bound values for that call.
|
|
307
|
+
state : dict[str, Any]
|
|
308
|
+
Mapping of state parameter names to initial values used only if no
|
|
309
|
+
stored state exists yet for this binding. Values should be mutable
|
|
310
|
+
(e.g., ``dict`` or ``list``) and updated in place by the stateful
|
|
311
|
+
function.
|
|
312
|
+
name : Optional[str]
|
|
313
|
+
Optional base name used to generate a stable, unique state key
|
|
314
|
+
(e.g., ``name#1``). Declare stateful functions in a consistent
|
|
315
|
+
order to keep keys stable across runs. Defaults to
|
|
316
|
+
the function's ``__name__``.
|
|
317
|
+
|
|
318
|
+
Returns
|
|
319
|
+
-------
|
|
320
|
+
callable
|
|
321
|
+
A partial callable compatible with ``func`` with state references
|
|
322
|
+
pre-bound and a reduced public signature.
|
|
323
|
+
|
|
324
|
+
"""
|
|
325
|
+
# Initialize or reuse stored state references.
|
|
326
|
+
base_ref = name or getattr(func, "__name__", None)
|
|
327
|
+
if base_ref is None:
|
|
328
|
+
raise ValueError("function has no name.")
|
|
329
|
+
sig = signature(func)
|
|
330
|
+
_raise_invalid_state_keys(set(state) - set(sig.parameters))
|
|
331
|
+
_, stored_state = self._get_or_init_state(
|
|
332
|
+
"func",
|
|
333
|
+
base_ref,
|
|
334
|
+
initial_state=state,
|
|
335
|
+
)
|
|
336
|
+
partial_func = partial(func, **stored_state)
|
|
337
|
+
# Publish reduced signature so callers see only non-state parameters.
|
|
338
|
+
public_params = [p for p in sig.parameters.values() if p.name not in state]
|
|
339
|
+
partial_func.__signature__ = sig.replace(parameters=public_params)
|
|
340
|
+
return partial_func
|
|
341
|
+
|
|
342
|
+
# --- Stateful object support ---
|
|
343
|
+
def bind_object_state(
|
|
344
|
+
self,
|
|
345
|
+
obj: T,
|
|
346
|
+
*,
|
|
347
|
+
state: list[str],
|
|
348
|
+
name: str | None = None,
|
|
349
|
+
) -> T:
|
|
350
|
+
"""
|
|
351
|
+
Register a stateful object for state binding.
|
|
352
|
+
|
|
353
|
+
At bind time, if a stored persisted state exists for this object binding,
|
|
354
|
+
the listed attributes are restored on ``obj``. Otherwise, the current
|
|
355
|
+
values of those attributes are stored into the internal state store.
|
|
356
|
+
The loop keeps a registry of bound objects and attribute names, and on
|
|
357
|
+
persistence it records the latest attribute values via
|
|
358
|
+
``getattr``. Both in-place mutation and reassignment are supported.
|
|
359
|
+
|
|
360
|
+
Parameters
|
|
361
|
+
----------
|
|
362
|
+
obj : T
|
|
363
|
+
The object to bind.
|
|
364
|
+
state : list[str]
|
|
365
|
+
List of attribute names to bind as state. Attributes must exist on
|
|
366
|
+
``obj`` at bind time and their values must be serializable by the
|
|
367
|
+
configured loop persistence I/O.
|
|
368
|
+
name : Optional[str], default None
|
|
369
|
+
Base name used to build a stable state reference for this object.
|
|
370
|
+
If None, the object's name or its class name is used.
|
|
371
|
+
|
|
372
|
+
Returns
|
|
373
|
+
-------
|
|
374
|
+
T
|
|
375
|
+
The same object instance provided in ``obj``.
|
|
376
|
+
|
|
377
|
+
Examples
|
|
378
|
+
--------
|
|
379
|
+
Pre-initialized attribute (mutation):
|
|
380
|
+
|
|
381
|
+
>>> class PreInitCounter:
|
|
382
|
+
... def __init__(self):
|
|
383
|
+
... self.state1 = {"count": 0}
|
|
384
|
+
... def process(self, x):
|
|
385
|
+
... self.state1["count"] += 1
|
|
386
|
+
... return x
|
|
387
|
+
>>> obj = PreInitCounter()
|
|
388
|
+
>>> loop.bind_object_state(obj, state=["state1"])
|
|
389
|
+
|
|
390
|
+
Lazy initialization with reassignment:
|
|
391
|
+
|
|
392
|
+
>>> class ReassigningCounter:
|
|
393
|
+
... def __init__(self, start=0):
|
|
394
|
+
... self.state1 = None
|
|
395
|
+
... self._start = start
|
|
396
|
+
... def process(self, x):
|
|
397
|
+
... if self.state1 is None:
|
|
398
|
+
... self.state1 = {"count": self._start}
|
|
399
|
+
... self.state1["count"] += 1
|
|
400
|
+
... return x
|
|
401
|
+
>>> obj2 = ReassigningCounter()
|
|
402
|
+
>>> loop.bind_object_state(obj2, state=["state1"])
|
|
403
|
+
|
|
404
|
+
"""
|
|
405
|
+
base_ref = name or getattr(obj, "__name__", None) or obj.__class__.__name__
|
|
406
|
+
if base_ref is None:
|
|
407
|
+
raise ValueError("object has no name.")
|
|
408
|
+
_raise_invalid_state_keys({attr for attr in state if not hasattr(obj, attr)})
|
|
409
|
+
state_ref, stored_state = self._get_or_init_state(
|
|
410
|
+
"obj",
|
|
411
|
+
base_ref,
|
|
412
|
+
initial_state={attr: getattr(obj, attr) for attr in state},
|
|
413
|
+
)
|
|
414
|
+
for attr in state:
|
|
415
|
+
setattr(obj, attr, stored_state[attr])
|
|
416
|
+
# Register binding for persistence-on-save using strong reference.
|
|
417
|
+
self._object_bindings[state_ref] = (obj, list(state))
|
|
418
|
+
return obj
|
|
419
|
+
|
|
420
|
+
def buffer(
|
|
421
|
+
self,
|
|
422
|
+
data: dict[Hashable, DataFrame],
|
|
423
|
+
memory_limit_mb: float | None = None,
|
|
424
|
+
concat_func: Callable[[list[DataFrame]], DataFrame] = concat,
|
|
425
|
+
) -> dict[Hashable, DataFrame] | None:
|
|
426
|
+
"""
|
|
427
|
+
Buffer DataFrames in memory and track memory usage.
|
|
428
|
+
|
|
429
|
+
This method automatically creates unique buffer spaces for each
|
|
430
|
+
``buffer()`` call within an iteration, preventing data from different
|
|
431
|
+
``buffer()`` calls from interfering with each other, even when using the
|
|
432
|
+
same user-provided keys.
|
|
433
|
+
|
|
434
|
+
``buffer()`` cannot be placed within a nested loop. The unique
|
|
435
|
+
identifier is based on call order within each iteration.
|
|
436
|
+
|
|
437
|
+
Placement rules
|
|
438
|
+
---------------
|
|
439
|
+
Calls to ``buffer()`` are intended to be used directly as top-level
|
|
440
|
+
statements inside the first ``with item_ctx as ...:`` block inside the
|
|
441
|
+
body of ``for item_ctx in loop.iterate(...):``. A strict AST validation
|
|
442
|
+
can enforce these rules when ``iterate(..., check_loop_usage=...)``
|
|
443
|
+
enables it (see ``iterate`` docstring for details).
|
|
444
|
+
|
|
445
|
+
Parameters
|
|
446
|
+
----------
|
|
447
|
+
data : dict[Hashable, DataFrame]
|
|
448
|
+
Dictionary mapping keys to DataFrames to be buffered.
|
|
449
|
+
Keys can be reused across different ``buffer()`` calls without
|
|
450
|
+
conflict.
|
|
451
|
+
memory_limit_mb : Optional[float], default None
|
|
452
|
+
Memory limit in megabytes. If None, uses default memory limit.
|
|
453
|
+
When exceeded, triggers concatenation and returns results.
|
|
454
|
+
concat_func : Callable[[list[DataFrame]], DataFrame], default ``pandas.concat``
|
|
455
|
+
Function to concatenate a non-empty list of DataFrames when the
|
|
456
|
+
memory limit is reached.
|
|
457
|
+
|
|
458
|
+
Returns
|
|
459
|
+
-------
|
|
460
|
+
Optional[dict[Hashable, DataFrame]]
|
|
461
|
+
Returns concatenated DataFrames when memory limit is exceeded or on
|
|
462
|
+
last iteration.
|
|
463
|
+
|
|
464
|
+
Raises
|
|
465
|
+
------
|
|
466
|
+
Skip
|
|
467
|
+
Raised when memory limit is not exceeded and not on last iteration
|
|
468
|
+
to signal the caller to skip downstream processing and continue to
|
|
469
|
+
the next iteration.
|
|
470
|
+
|
|
471
|
+
"""
|
|
472
|
+
# Generate unique iteration-based identifier.
|
|
473
|
+
buffer_id = self._get_buffer_id()
|
|
474
|
+
# Ensure buffer_id exists in buffer and memory tracker.
|
|
475
|
+
if buffer_id not in self._data_buffer:
|
|
476
|
+
self._data_buffer[buffer_id] = defaultdict(list)
|
|
477
|
+
self._memory_usage_bytes[buffer_id] = 0
|
|
478
|
+
# Append data to buffer and track memory usage.
|
|
479
|
+
for user_key, df in data.items():
|
|
480
|
+
self._data_buffer[buffer_id][user_key].append(df)
|
|
481
|
+
# Increment memory usage for this buffer_id
|
|
482
|
+
self._memory_usage_bytes[buffer_id] += df.memory_usage(deep=True).sum()
|
|
483
|
+
# Check if we need to concat current buffer.
|
|
484
|
+
memory_limit_bytes = int(
|
|
485
|
+
(
|
|
486
|
+
memory_limit_mb * 1024 * 1024
|
|
487
|
+
if memory_limit_mb is not None
|
|
488
|
+
else self._default_memory_limit_bytes
|
|
489
|
+
),
|
|
490
|
+
)
|
|
491
|
+
if self.is_last_iteration or self._memory_usage_bytes[buffer_id] >= memory_limit_bytes:
|
|
492
|
+
concat_res = {}
|
|
493
|
+
for user_key, df_list in self._data_buffer[buffer_id].items():
|
|
494
|
+
if df_list:
|
|
495
|
+
# Concatenate all DataFrames for this user_key.
|
|
496
|
+
concat_res[user_key] = concat_func(df_list)
|
|
497
|
+
# Free memory on the way to prevent buffering data
|
|
498
|
+
# twice along the concatenation chain.
|
|
499
|
+
self._data_buffer[buffer_id][user_key].clear()
|
|
500
|
+
self._memory_usage_bytes[buffer_id] = 0
|
|
501
|
+
return concat_res
|
|
502
|
+
else:
|
|
503
|
+
raise Skip
|
|
504
|
+
|
|
505
|
+
def iterate(
|
|
506
|
+
self,
|
|
507
|
+
iterable: Iterable[Any],
|
|
508
|
+
*,
|
|
509
|
+
check_loop_usage: bool | None = None,
|
|
510
|
+
) -> Iterator[IterationContext]:
|
|
511
|
+
"""
|
|
512
|
+
Wrap an iterable to control loop flow using context-managed steps.
|
|
513
|
+
|
|
514
|
+
This method provides the fundamental pattern for stateful loops: a
|
|
515
|
+
for-loop that processes data iteratively while managing iteration
|
|
516
|
+
context flags. It uses lookahead to detect the last element without
|
|
517
|
+
emitting a sentinel value.
|
|
518
|
+
|
|
519
|
+
``Skip`` exceptions raised inside the ``with item_ctx`` block (e.g., by
|
|
520
|
+
``buffer()`` while still under the memory limit) are swallowed by the
|
|
521
|
+
``IterationContext``, skipping downstream code and continuing to the
|
|
522
|
+
next iteration.
|
|
523
|
+
|
|
524
|
+
Parameters
|
|
525
|
+
----------
|
|
526
|
+
iterable : Iterable[Any]
|
|
527
|
+
The iterable to wrap and process
|
|
528
|
+
check_loop_usage : Optional[bool], default None
|
|
529
|
+
If True, always run the strict validation that enforces:
|
|
530
|
+
- the first statement in the loop body is ``with item_ctx as ...:``
|
|
531
|
+
- any ``loop.buffer(...)`` calls are direct statements at the top
|
|
532
|
+
level inside that ``with`` body (not in conditionals/loops/nested
|
|
533
|
+
blocks).
|
|
534
|
+
File read failures will raise immediately.
|
|
535
|
+
If False, never run the validation.
|
|
536
|
+
If None, run the validation only if the loop persistence file was
|
|
537
|
+
not loaded at construction time (i.e., first run for this stateful
|
|
538
|
+
loop). If a loop persistence file existed and was loaded, skip the
|
|
539
|
+
validation.
|
|
540
|
+
|
|
541
|
+
Yields
|
|
542
|
+
------
|
|
543
|
+
IterationContext
|
|
544
|
+
A context manager that yields the current item and swallows
|
|
545
|
+
``Skip``.
|
|
546
|
+
|
|
547
|
+
Examples
|
|
548
|
+
--------
|
|
549
|
+
>>> from pathlib import Path
|
|
550
|
+
>>> loop = StatefulLoop(Path("state.pkl"))
|
|
551
|
+
>>> out = []
|
|
552
|
+
>>> for item_ctx in loop.iterate([10, 20]):
|
|
553
|
+
... with item_ctx as item:
|
|
554
|
+
... out.append(item)
|
|
555
|
+
>>> out
|
|
556
|
+
[10, 20]
|
|
557
|
+
|
|
558
|
+
"""
|
|
559
|
+
# Optional strict validation to fail fast on illegal buffer placement.
|
|
560
|
+
if check_loop_usage is True or (check_loop_usage is None and not self._persistence_loaded):
|
|
561
|
+
validate_loop_usage(self)
|
|
562
|
+
it = iter(iterable)
|
|
563
|
+
try:
|
|
564
|
+
try:
|
|
565
|
+
next_item = next(it)
|
|
566
|
+
except StopIteration:
|
|
567
|
+
# Empty iterable, exit.
|
|
568
|
+
# No state to persist since no iterations occurred.
|
|
569
|
+
return
|
|
570
|
+
|
|
571
|
+
while True:
|
|
572
|
+
self.iteration_count += 1
|
|
573
|
+
current = next_item
|
|
574
|
+
try:
|
|
575
|
+
next_item = next(it)
|
|
576
|
+
except StopIteration:
|
|
577
|
+
self.is_last_iteration = True
|
|
578
|
+
yield IterationContext(current)
|
|
579
|
+
break
|
|
580
|
+
|
|
581
|
+
yield IterationContext(current)
|
|
582
|
+
|
|
583
|
+
finally:
|
|
584
|
+
if self.is_last_iteration:
|
|
585
|
+
# Persist object-bound attributes just before saving state.
|
|
586
|
+
if self._object_bindings:
|
|
587
|
+
for state_ref, (obj, attrs) in self._object_bindings.items():
|
|
588
|
+
self._state_store[state_ref] = {attr: getattr(obj, attr) for attr in attrs}
|
|
589
|
+
# After yielding last element, persist state and stop.
|
|
590
|
+
# The file is created even if there is no state to persist
|
|
591
|
+
# (no bindings). The file is still used as a flag to indicate
|
|
592
|
+
# the stateful loop has been run once (see 'check_loop_usage').
|
|
593
|
+
LoopPersistenceIO.save(self.filepath, self._state_store)
|
|
594
|
+
# Clear strong references so objects can be GC'ed after save.
|
|
595
|
+
self._object_bindings.clear()
|
|
596
|
+
|
|
597
|
+
# --- Iteration-based buffer ID generation ---
|
|
598
|
+
def _get_buffer_id(self) -> int:
|
|
599
|
+
"""
|
|
600
|
+
Generate buffer id depending on its call order in the iteration.
|
|
601
|
+
|
|
602
|
+
This approach is valid if all ``buffer()`` calls are at same level
|
|
603
|
+
in the code, within the stateful loop.
|
|
604
|
+
|
|
605
|
+
Returns
|
|
606
|
+
-------
|
|
607
|
+
int
|
|
608
|
+
The call position (0, 1, 2, ...) of this buffer within the
|
|
609
|
+
iteration. Same position across different iterations gets the same
|
|
610
|
+
ID.
|
|
611
|
+
|
|
612
|
+
"""
|
|
613
|
+
# Reset counter when we enter a new iteration
|
|
614
|
+
if self.iteration_count != self._iteration_buffer_current:
|
|
615
|
+
self._iteration_buffer_current = self.iteration_count
|
|
616
|
+
self._iteration_buffer_count = 0
|
|
617
|
+
else:
|
|
618
|
+
self._iteration_buffer_count += 1
|
|
619
|
+
return self._iteration_buffer_count
|
|
620
|
+
|
|
621
|
+
def _get_or_init_state(
|
|
622
|
+
self,
|
|
623
|
+
namespace: str,
|
|
624
|
+
base_ref: str,
|
|
625
|
+
*,
|
|
626
|
+
initial_state: dict[str, Any],
|
|
627
|
+
) -> tuple[str, dict[str, Any]]:
|
|
628
|
+
"""
|
|
629
|
+
Validate states, initialize if needed, and return state ref and values.
|
|
630
|
+
|
|
631
|
+
Parameters
|
|
632
|
+
----------
|
|
633
|
+
namespace : str
|
|
634
|
+
Either ``"func"`` or ``"obj"``. Used to namespace state keys.
|
|
635
|
+
base_ref : str
|
|
636
|
+
Base name for the state reference; a unique counter is appended and
|
|
637
|
+
prefixed by the namespace (e.g., ``func:base#1``, ``obj:base#1``).
|
|
638
|
+
initial_state : dict[str, Any]
|
|
639
|
+
Initial mapping to use when creating a new state entry.
|
|
640
|
+
|
|
641
|
+
Returns
|
|
642
|
+
-------
|
|
643
|
+
tuple[str, dict[str, Any]]
|
|
644
|
+
The namespaced state reference and the stored state mapping.
|
|
645
|
+
|
|
646
|
+
"""
|
|
647
|
+
# Generate unique state reference and initialize if needed
|
|
648
|
+
namespaced_ref = f"{namespace}:{base_ref}"
|
|
649
|
+
self._state_key_counts[namespaced_ref] += 1
|
|
650
|
+
state_ref = f"{namespaced_ref}#{self._state_key_counts[namespaced_ref]}"
|
|
651
|
+
if state_ref not in self._state_store:
|
|
652
|
+
# Initialize new state entry with initial state.
|
|
653
|
+
self._state_store[state_ref] = initial_state
|
|
654
|
+
return state_ref, self._state_store[state_ref]
|