batchcorder 0.1.2__cp310-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,523 @@
1
+ """Batchcorder: Replayable cached Arrow record-batch streams."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from importlib.metadata import version
6
+ from typing import TYPE_CHECKING
7
+
8
+
9
+ if TYPE_CHECKING:
10
+ from typing import Any
11
+
12
+ from ._batchcorder import (
13
+ CastingStreamCache as _PyCastingStreamCache,
14
+ )
15
+ from ._batchcorder import (
16
+ StreamCache as _PyStreamCache,
17
+ )
18
+ from ._batchcorder import (
19
+ StreamCacheReader as _PyStreamCacheReader,
20
+ )
21
+
22
+
23
# Names exported by ``from batchcorder import *``: the three Python-level
# wrapper classes defined in this module.
__all__ = [
    "CastingStreamCache",
    "StreamCache",
    "StreamCacheReader",
]

# Looked up from the installed distribution's metadata when the package is
# imported, via ``importlib.metadata.version``.
__version__: str = version("batchcorder")
30
+
31
+
32
class StreamCache:
    """
    Replayable cache over an Arrow record-batch stream.

    Each ``RecordBatch`` pulled from the wrapped source is retained, either in
    an in-memory Vec or in an on-disk IPC file, so that any number of
    independent :class:`StreamCacheReader` handles can replay the complete
    stream from any position. The upstream source is ingested lazily, on
    demand, and is consumed exactly once.

    The cache operates in one of two storage modes:

    - **Memory-only** (leave out ``disk_path`` / ``disk_capacity``): batches
      stay in RAM as reference-counted pointers. Reads are zero-copy and no
      IPC serialisation takes place.
    - **Disk** (pass both ``disk_path`` and ``disk_capacity``): batches are
      serialised into an append-only Arrow IPC file. A configurable hot layer
      (``memory_capacity``) holds recently ingested batches in RAM to cut
      down on disk reads.

    Parameters
    ----------
    reader : object
        Any object implementing ``__arrow_c_stream__`` (e.g.
        :class:`pyarrow.Table`, :class:`pyarrow.RecordBatchReader`).
    memory_capacity : int, optional
        Byte budget of the hot layer in disk mode. Defaults to total physical
        RAM. Ignored in memory-only mode.
    disk_path : str, optional
        Directory that holds the on-disk IPC file. Created on first use.
        Must be given together with ``disk_capacity``.
    disk_capacity : int, optional
        Byte budget for on-disk storage.
        Must be given together with ``disk_path``.

    Examples
    --------
    Memory-only:

    >>> import pyarrow as pa
    >>> from batchcorder import StreamCache
    >>> table = pa.table({"id": [1, 2, 3], "val": [0.5, 1.0, 1.5]})
    >>> ds = StreamCache(table)
    >>> pa.RecordBatchReader.from_stream(ds).read_all().equals(table)
    True

    Disk mode:

    >>> import tempfile
    >>> tmp = tempfile.mkdtemp()
    >>> ds = StreamCache(table, memory_capacity=16 << 20, disk_path=tmp, disk_capacity=64 << 20)
    >>> pa.RecordBatchReader.from_stream(ds).read_all().equals(table)
    True
    >>> ds.upstream_exhausted
    True

    """

    def __init__(
        self,
        reader: Any,
        memory_capacity: int | None = None,
        disk_path: str | None = None,
        disk_capacity: int | None = None,
    ):
        """Build the native cache; the class docstring documents the parameters."""
        # Forwarded positionally, in the order the native constructor expects.
        native_args = (reader, memory_capacity, disk_path, disk_capacity)
        self._impl = _PyStreamCache(*native_args)

    @property
    def schema(self) -> Any:
        """
        Arrow schema of the cached stream.

        Returns
        -------
        pyarrow.Schema

        Examples
        --------
        >>> import tempfile, pyarrow as pa
        >>> from batchcorder import StreamCache
        >>> table = pa.table({"id": [1, 2], "val": [0.5, 1.0]})
        >>> tmp = tempfile.mkdtemp()
        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
        >>> [f.name for f in ds.schema]
        ['id', 'val']

        """
        native = self._impl
        return native.schema

    @property
    def ingested_count(self) -> int:
        """
        How many batches have been pulled from the upstream source so far.

        The value grows lazily as readers consume batches.

        Returns
        -------
        int

        Examples
        --------
        >>> import tempfile, pyarrow as pa
        >>> from batchcorder import StreamCache
        >>> table = pa.table({"x": [1, 2, 3]})
        >>> tmp = tempfile.mkdtemp()
        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
        >>> ds.ingested_count
        0
        >>> ds.ingest_all()
        1
        >>> ds.ingested_count
        1

        """
        native = self._impl
        return native.ingested_count

    @property
    def upstream_exhausted(self) -> bool:
        """
        Whether the upstream source has been fully consumed.

        Returns
        -------
        bool
            ``True`` once nothing is left to ingest from upstream.

        Examples
        --------
        >>> import tempfile, pyarrow as pa
        >>> from batchcorder import StreamCache
        >>> table = pa.table({"x": [1, 2, 3]})
        >>> tmp = tempfile.mkdtemp()
        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
        >>> ds.upstream_exhausted
        False
        >>> ds.ingest_all()
        1
        >>> ds.upstream_exhausted
        True

        """
        native = self._impl
        return native.upstream_exhausted

    def reader(self, from_start: bool = True) -> StreamCacheReader:
        """
        Hand out a new :class:`StreamCacheReader` handle.

        Parameters
        ----------
        from_start : bool, optional
            With ``True`` (the default) the reader begins at batch 0 and
            replays the whole stream. With ``False`` it begins at the current
            ingestion frontier and yields only batches ingested after this
            call.

        Returns
        -------
        StreamCacheReader

        Examples
        --------
        >>> import tempfile, pyarrow as pa
        >>> from batchcorder import StreamCache
        >>> table = pa.table({"x": [1, 2, 3]})
        >>> tmp = tempfile.mkdtemp()
        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
        >>> r1 = ds.reader()
        >>> r2 = ds.reader()
        >>> r1.closed, r2.closed
        (False, False)

        """
        native_reader = self._impl.reader(from_start)
        return StreamCacheReader(native_reader)

    def __iter__(self) -> StreamCacheReader:
        """
        Iterate over every batch, starting from the beginning.

        A fresh :class:`StreamCacheReader` positioned at batch 0 is created
        and returned as the iterator.

        Returns
        -------
        StreamCacheReader

        """
        fresh_reader = self.reader(True)
        return fresh_reader

    def __arrow_c_stream__(self, requested_schema: Any = None) -> Any:
        """
        Export an Arrow stream via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.

        Not meant to be called directly; this dunder method is what enables
        zero-copy data transfer to other Python libraries that understand
        Arrow memory.

        Every call creates a fresh reader starting at batch 0, which lets the
        cache be consumed directly by PyArrow, DuckDB, DataFusion, and any
        other Arrow-compatible library.

        Parameters
        ----------
        requested_schema : object, optional
            Schema capsule to cast the stream to, or ``None``.

        """
        stream_capsule = self._impl.__arrow_c_stream__(requested_schema)
        return stream_capsule

    def __arrow_c_schema__(self) -> Any:
        """
        Export the Arrow schema via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.

        Not meant to be called directly; this dunder method is what enables
        zero-copy data transfer to other Python libraries that understand
        Arrow memory.

        Arrow consumers use it to inspect the data type of this
        :class:`StreamCache`; they can then ask the producer (through
        ``__arrow_c_stream__``) to cast the exported data to a supported
        data type.

        """
        schema_capsule = self._impl.__arrow_c_schema__()
        return schema_capsule

    def cast(self, target_schema: Any) -> CastingStreamCache:
        """
        Produce a view of the cache whose batches carry ``target_schema``.

        The result is a :class:`CastingStreamCache` — a **replayable** wrapper
        that applies the schema cast on every read. In contrast to
        :meth:`pyarrow.RecordBatchReader.cast`, it can be consumed multiple
        times, which makes it suitable for DuckDB self-joins and ASOF joins.

        Parameters
        ----------
        target_schema : object
            Any Arrow schema-compatible object (e.g. :class:`pyarrow.Schema`).

        Returns
        -------
        CastingStreamCache

        """
        native_view = self._impl.cast(target_schema)
        return CastingStreamCache(native_view)

    def ingest_all(self) -> int:
        """
        Eagerly pull every remaining upstream batch into the cache.

        Once this returns, ``upstream_exhausted`` is ``True`` and the upstream
        reference has been released; later reads are served entirely from
        cache. Repeated calls are safe and idempotent.

        Returns
        -------
        int
            Total number of batches ingested (including any ingested previously).

        Examples
        --------
        >>> import tempfile, pyarrow as pa
        >>> from batchcorder import StreamCache
        >>> table = pa.table({"x": [1, 2, 3]})
        >>> tmp = tempfile.mkdtemp()
        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
        >>> ds.ingest_all()
        1
        >>> ds.upstream_exhausted
        True

        """
        total_batches = self._impl.ingest_all()
        return total_batches

    def close(self) -> None:
        """
        Close the cache and destroy its underlying storage.

        The hybrid cache is cleared and the disk storage is destroyed,
        removing any unused files that were eagerly created.

        Returns
        -------
        None

        Examples
        --------
        >>> import tempfile, pyarrow as pa
        >>> from batchcorder import StreamCache
        >>> table = pa.table({"x": [1, 2, 3]})
        >>> tmp = tempfile.mkdtemp()
        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
        >>> ds.close()

        """
        # Whatever the native ``close`` returns is passed straight through.
        outcome = self._impl.close()
        return outcome
324
+
325
+
326
class StreamCacheReader:
    """
    Single-use iterator handle over a :class:`StreamCache`.

    Each handle tracks its own read position. Handles created from the same
    cache share its underlying storage, and the upstream source is ingested
    lazily as needed.

    After the handle has been consumed — through ``__arrow_c_stream__`` or by
    iterating it to exhaustion — it is marked closed and raises an error on
    further use.

    Notes
    -----
    Get a handle from :meth:`StreamCache.reader` instead of constructing one
    directly.

    """

    def __init__(self, impl: _PyStreamCacheReader):
        """Wrap a native reader; obtain instances via :meth:`StreamCache.reader`."""
        self._impl = impl

    @property
    def schema(self) -> Any:
        """
        Arrow schema of the batches this reader produces.

        Returns
        -------
        pyarrow.Schema

        Raises
        ------
        ValueError
            If the reader has already been consumed.

        """
        native = self._impl
        return native.schema

    @property
    def closed(self) -> bool:
        """
        Whether this reader has been consumed.

        Returns
        -------
        bool
            ``True`` if the reader has been consumed.

        """
        native = self._impl
        return native.closed

    def __arrow_c_stream__(self, requested_schema: Any = None) -> Any:
        """
        Export an Arrow stream via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.

        Not meant to be called directly; this dunder method is what enables
        zero-copy data transfer to other Python libraries that understand
        Arrow memory.

        The reader is consumed by the export; subsequent calls raise an error.

        Parameters
        ----------
        requested_schema : object, optional
            Schema capsule to cast the stream to, or ``None``.

        Raises
        ------
        ValueError
            If the reader has already been consumed.

        """
        stream_capsule = self._impl.__arrow_c_stream__(requested_schema)
        return stream_capsule

    def __arrow_c_schema__(self) -> Any:
        """
        Export the Arrow schema via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.

        Not meant to be called directly; this dunder method is what enables
        zero-copy data transfer to other Python libraries that understand
        Arrow memory.

        Arrow consumers use it to inspect the data type of this
        :class:`StreamCacheReader`; they can then ask the producer (through
        ``__arrow_c_stream__``) to cast the exported data to a supported
        data type.

        Raises
        ------
        ValueError
            If the reader has already been consumed.

        """
        schema_capsule = self._impl.__arrow_c_schema__()
        return schema_capsule

    def __iter__(self) -> StreamCacheReader:
        """Return this reader itself as the iterator."""
        return self

    def cast(self, target_schema: Any) -> Any:
        """
        Cast the reader so that its batches carry ``target_schema``.

        Mirrors :meth:`pyarrow.RecordBatchReader.cast`: the result is a
        :class:`pyarrow.RecordBatchReader` that applies the cast while batches
        are being read. This reader is consumed by the call.

        Parameters
        ----------
        target_schema : object
            Any Arrow schema-compatible object (e.g. :class:`pyarrow.Schema`).

        Returns
        -------
        pyarrow.RecordBatchReader

        Raises
        ------
        ValueError
            If the reader has already been consumed.

        """
        casted_reader = self._impl.cast(target_schema)
        return casted_reader

    def __next__(self) -> Any:
        """Fetch the next batch from the reader."""
        # Go through ``iter()`` on the native reader first, then advance the
        # iterator it hands back by one step.
        native_iterator = iter(self._impl)
        return next(native_iterator)
451
+
452
+
453
class CastingStreamCache:
    """
    Replayable, schema-casting view over a :class:`StreamCache`.

    Instances come from :meth:`StreamCache.cast`. Every ``__arrow_c_stream__``
    call creates a fresh reader on the underlying cache and casts each batch
    to :attr:`schema`, which makes the object **replayable** — DuckDB
    self-joins, ASOF joins, and other multi-scan consumers work correctly
    on it.

    Notes
    -----
    Use :meth:`StreamCache.cast` to obtain one instead of constructing it
    directly.

    """

    def __init__(self, impl: _PyCastingStreamCache):
        """Wrap a native casting view; obtain instances via :meth:`StreamCache.cast`."""
        self._impl = impl

    @property
    def schema(self) -> Any:
        """
        Arrow schema of the batches after casting.

        Returns
        -------
        pyarrow.Schema

        """
        native = self._impl
        return native.schema

    def __arrow_c_stream__(self, requested_schema: Any = None) -> Any:
        """
        Export an Arrow stream via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.

        A fresh reader is created on the underlying cache and the cast is
        applied to it. Calling this repeatedly is safe — every call yields an
        independent stream.

        Parameters
        ----------
        requested_schema : object, optional
            Schema capsule to further cast the stream to, or ``None`` (uses
            :attr:`schema`).

        """
        stream_capsule = self._impl.__arrow_c_stream__(requested_schema)
        return stream_capsule

    def __arrow_c_schema__(self) -> Any:
        """
        Export the Arrow schema via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.

        The exported schema is the target schema, so consumers can inspect
        the post-cast type.

        """
        schema_capsule = self._impl.__arrow_c_schema__()
        return schema_capsule

    def cast(self, target_schema: Any) -> CastingStreamCache:
        """
        Cast once more, yielding a new :class:`CastingStreamCache`.

        Parameters
        ----------
        target_schema : object
            Any Arrow schema-compatible object.

        Returns
        -------
        CastingStreamCache

        """
        native_view = self._impl.cast(target_schema)
        return CastingStreamCache(native_view)
Binary file
@@ -0,0 +1,55 @@
1
+ # This file is automatically generated by pyo3_stub_gen
2
+
3
+ import builtins
4
+ import typing
5
+
6
+ import pyarrow as pa
7
+
8
# Classes declared by this generated stub.
# NOTE(review): presumably this is the stub for the compiled ``_batchcorder``
# extension module — confirm against the wheel's file list.
__all__ = [
    "CastingStreamCache",
    "StreamCache",
    "StreamCacheReader",
]
13
+
14
# Type declarations only; marked final, so type checkers reject subclassing.
@typing.final
class CastingStreamCache:
    # Read-only schema attribute.
    @property
    def schema(self) -> pa.Schema: ...
    # Arrow PyCapsule stream export; ``requested_schema`` is optional.
    def __arrow_c_stream__(self, requested_schema: typing.Any = None) -> typing.Any: ...
    # Arrow PyCapsule schema export.
    def __arrow_c_schema__(self) -> typing.Any: ...
    # Casting again yields another CastingStreamCache.
    def cast(self, target_schema: typing.Any) -> CastingStreamCache: ...
21
+
22
# Type declarations only; marked final, so type checkers reject subclassing.
@typing.final
class StreamCache:
    # Read-only attributes.
    @property
    def schema(self) -> pa.Schema: ...
    @property
    def ingested_count(self) -> builtins.int: ...
    @property
    def upstream_exhausted(self) -> builtins.bool: ...
    # Construction is declared through ``__new__``; every argument except
    # ``reader`` is optional and defaults to ``None``.
    def __new__(
        cls,
        reader: typing.Any,
        memory_capacity: builtins.int | None = None,
        disk_path: builtins.str | None = None,
        disk_capacity: builtins.int | None = None,
    ) -> StreamCache: ...
    # ``from_start`` has a default that the stub leaves unspecified (``...``).
    def reader(self, from_start: builtins.bool = ...) -> StreamCacheReader: ...
    # Iterating the cache yields a StreamCacheReader.
    def __iter__(self) -> StreamCacheReader: ...
    # Arrow PyCapsule stream / schema export.
    def __arrow_c_stream__(self, requested_schema: typing.Any = None) -> typing.Any: ...
    def __arrow_c_schema__(self) -> typing.Any: ...
    # Casting yields a CastingStreamCache rather than another StreamCache.
    def cast(self, target_schema: typing.Any) -> CastingStreamCache: ...
    # Returns an integer count.
    def ingest_all(self) -> builtins.int: ...
    def close(self) -> None: ...
44
+
45
# Type declarations only; marked final, so type checkers reject subclassing.
@typing.final
class StreamCacheReader:
    # Read-only attributes.
    @property
    def schema(self) -> pa.Schema: ...
    @property
    def closed(self) -> builtins.bool: ...
    # Arrow PyCapsule stream / schema export.
    def __arrow_c_stream__(self, requested_schema: typing.Any = None) -> typing.Any: ...
    def __arrow_c_schema__(self) -> typing.Any: ...
    # Iterator protocol: ``__iter__`` is typed as returning a
    # StreamCacheReader and ``__next__`` as returning one record batch.
    def __iter__(self) -> StreamCacheReader: ...
    # The cast result is left untyped (``typing.Any``) by the generator.
    def cast(self, target_schema: typing.Any) -> typing.Any: ...
    def __next__(self) -> pa.RecordBatch: ...
batchcorder/py.typed ADDED
File without changes