PyPI - batchcorder - Versions diffs - 0.1.2__cp310-abi3-win_amd64.whl - Mend

batchcorder 0.1.2__cp310-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

batchcorder/__init__.py +523 -0
batchcorder/_batchcorder.pyd +0 -0
batchcorder/_batchcorder.pyi +55 -0
batchcorder/py.typed +0 -0
batchcorder-0.1.2.dist-info/METADATA +146 -0
batchcorder-0.1.2.dist-info/RECORD +9 -0
batchcorder-0.1.2.dist-info/WHEEL +4 -0
batchcorder-0.1.2.dist-info/licenses/LICENSE +201 -0
batchcorder-0.1.2.dist-info/sboms/batchcorder.cyclonedx.json +4096 -0

batchcorder/__init__.py ADDED Viewed

@@ -0,0 +1,523 @@
+"""Batchcorder: Replayable cached Arrow record-batch streams."""
+from __future__ import annotations
+from importlib.metadata import version
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from typing import Any
+from ._batchcorder import (
+    CastingStreamCache as _PyCastingStreamCache,
+)
+from ._batchcorder import (
+    StreamCache as _PyStreamCache,
+)
+from ._batchcorder import (
+    StreamCacheReader as _PyStreamCacheReader,
+)
+# Public names of the package.  They refer to the pure-Python wrapper classes
+# defined in this module, not to the native classes imported above under the
+# private ``_Py*`` aliases.
+__all__ = [
+    "CastingStreamCache",
+    "StreamCache",
+    "StreamCacheReader",
+]
+# Version string looked up from the installed ``batchcorder`` distribution
+# metadata at import time.
+__version__: str = version("batchcorder")
+class StreamCache:
+    """
+    A cached Arrow stream backed by an in-memory Vec or an on-disk IPC file.
+    Wraps any Arrow stream source and stores each ``RecordBatch`` so multiple
+    independent :class:`StreamCacheReader` handles can replay the full stream
+    from any position.  The upstream source is ingested lazily on demand and
+    consumed exactly once.
+    Two storage modes are supported:
+    - **Memory-only** (omit ``disk_path`` / ``disk_capacity``): batches are
+      kept as reference-counted pointers in RAM.  Reads are zero-copy; no IPC
+      serialisation happens.
+    - **Disk** (provide both ``disk_path`` and ``disk_capacity``): batches are
+      serialised to an append-only Arrow IPC file.  A configurable hot layer
+      (``memory_capacity``) keeps recently ingested batches in RAM to reduce
+      disk reads.
+    Parameters
+    ----------
+    reader : object
+        Any object implementing ``__arrow_c_stream__`` (e.g.
+        :class:`pyarrow.Table`, :class:`pyarrow.RecordBatchReader`).
+    memory_capacity : int, optional
+        Hot-layer budget in bytes for disk mode.  Defaults to total physical
+        RAM.  Ignored in memory-only mode.
+    disk_path : str, optional
+        Directory for the on-disk IPC file.  Created on first use.
+        Must be provided together with ``disk_capacity``.
+    disk_capacity : int, optional
+        On-disk storage budget in bytes.
+        Must be provided together with ``disk_path``.
+    Examples
+    --------
+    Memory-only:
+    >>> import pyarrow as pa
+    >>> from batchcorder import StreamCache
+    >>> table = pa.table({"id": [1, 2, 3], "val": [0.5, 1.0, 1.5]})
+    >>> ds = StreamCache(table)
+    >>> pa.RecordBatchReader.from_stream(ds).read_all().equals(table)
+    True
+    Disk mode:
+    >>> import tempfile
+    >>> tmp = tempfile.mkdtemp()
+    >>> ds = StreamCache(table, memory_capacity=16 << 20, disk_path=tmp, disk_capacity=64 << 20)
+    >>> pa.RecordBatchReader.from_stream(ds).read_all().equals(table)
+    True
+    >>> ds.upstream_exhausted
+    True
+    """
+    def __init__(
+        self,
+        reader: Any,
+        memory_capacity: int | None = None,
+        disk_path: str | None = None,
+        disk_capacity: int | None = None,
+    ) -> None:
+        """See class docstring for parameter documentation."""
+        # All state lives in the native extension object; the four arguments
+        # are forwarded positionally in declaration order.
+        self._impl = _PyStreamCache(reader, memory_capacity, disk_path, disk_capacity)
+    @property
+    def schema(self) -> Any:
+        """
+        Arrow schema of this dataset.
+        Returns
+        -------
+        pyarrow.Schema
+        Examples
+        --------
+        >>> import tempfile, pyarrow as pa
+        >>> from batchcorder import StreamCache
+        >>> table = pa.table({"id": [1, 2], "val": [0.5, 1.0]})
+        >>> tmp = tempfile.mkdtemp()
+        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
+        >>> [f.name for f in ds.schema]
+        ['id', 'val']
+        """
+        return self._impl.schema
+    @property
+    def ingested_count(self) -> int:
+        """
+        Number of batches pulled from the upstream source so far.
+        Increments lazily as readers consume batches.
+        Returns
+        -------
+        int
+        Examples
+        --------
+        >>> import tempfile, pyarrow as pa
+        >>> from batchcorder import StreamCache
+        >>> table = pa.table({"x": [1, 2, 3]})
+        >>> tmp = tempfile.mkdtemp()
+        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
+        >>> ds.ingested_count
+        0
+        >>> ds.ingest_all()
+        1
+        >>> ds.ingested_count
+        1
+        """
+        return self._impl.ingested_count
+    @property
+    def upstream_exhausted(self) -> bool:
+        """
+        ``True`` once the upstream source has been fully consumed.
+        Returns
+        -------
+        bool
+        Examples
+        --------
+        >>> import tempfile, pyarrow as pa
+        >>> from batchcorder import StreamCache
+        >>> table = pa.table({"x": [1, 2, 3]})
+        >>> tmp = tempfile.mkdtemp()
+        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
+        >>> ds.upstream_exhausted
+        False
+        >>> ds.ingest_all()
+        1
+        >>> ds.upstream_exhausted
+        True
+        """
+        return self._impl.upstream_exhausted
+    def reader(self, from_start: bool = True) -> StreamCacheReader:
+        """
+        Return a new :class:`StreamCacheReader` handle.
+        Parameters
+        ----------
+        from_start : bool, optional
+            If ``True`` (default), the reader starts at batch 0 and replays the
+            full stream.  If ``False``, it starts at the current ingestion
+            frontier and yields only batches ingested after this call.
+        Returns
+        -------
+        StreamCacheReader
+        Examples
+        --------
+        >>> import tempfile, pyarrow as pa
+        >>> from batchcorder import StreamCache
+        >>> table = pa.table({"x": [1, 2, 3]})
+        >>> tmp = tempfile.mkdtemp()
+        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
+        >>> r1 = ds.reader()
+        >>> r2 = ds.reader()
+        >>> r1.closed, r2.closed
+        (False, False)
+        """
+        # Wrap the native reader handle in the pure-Python facade so callers
+        # never see the private extension type.
+        return StreamCacheReader(self._impl.reader(from_start))
+    def __iter__(self) -> StreamCacheReader:
+        """
+        Iterate over all batches from the start.
+        Creates a fresh :class:`StreamCacheReader` starting at batch 0 and
+        returns it as the iterator.
+        Returns
+        -------
+        StreamCacheReader
+        """
+        # Same as ``self.reader(from_start=True)``: every ``iter()`` call gets
+        # its own reader positioned at the first batch.
+        return self.reader(True)
+    def __arrow_c_stream__(self, requested_schema: Any = None) -> Any:
+        """
+        Enable Arrow stream export via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+        Creates a fresh reader starting at batch 0.  Allows the dataset to be
+        consumed directly by PyArrow, DuckDB, DataFusion, and any other
+        Arrow-compatible library.
+        Parameters
+        ----------
+        requested_schema : object, optional
+            Schema capsule to cast the stream to, or ``None``.
+        """
+        return self._impl.__arrow_c_stream__(requested_schema)
+    def __arrow_c_schema__(self) -> Any:
+        """
+        Enable Arrow schema export via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+        This allows Arrow consumers to inspect the data type of this
+        :class:`StreamCache`.  Then the consumer can ask the producer (in
+        ``__arrow_c_stream__``) to cast the exported data to a supported data type.
+        """
+        return self._impl.__arrow_c_schema__()
+    def cast(self, target_schema: Any) -> CastingStreamCache:
+        """
+        Cast the dataset to produce batches with the given schema.
+        Returns a :class:`CastingStreamCache` — a **replayable** wrapper that
+        applies the schema cast on every read.  Unlike
+        :meth:`pyarrow.RecordBatchReader.cast`, the result can be consumed
+        multiple times, making it suitable for DuckDB self-joins and ASOF joins.
+        Parameters
+        ----------
+        target_schema : object
+            Any Arrow schema-compatible object (e.g. :class:`pyarrow.Schema`).
+        Returns
+        -------
+        CastingStreamCache
+        """
+        # The native ``cast`` returns the extension-level view; wrap it so the
+        # caller receives the pure-Python :class:`CastingStreamCache`.
+        return CastingStreamCache(self._impl.cast(target_schema))
+    def ingest_all(self) -> int:
+        """
+        Eagerly ingest all batches from the upstream source into the cache.
+        After this call ``upstream_exhausted`` is ``True`` and the upstream
+        reference is released.  Subsequent reads are served entirely from cache.
+        Calling this method more than once is safe and idempotent.
+        Returns
+        -------
+        int
+            Total number of batches ingested (including any ingested previously).
+        Examples
+        --------
+        >>> import tempfile, pyarrow as pa
+        >>> from batchcorder import StreamCache
+        >>> table = pa.table({"x": [1, 2, 3]})
+        >>> tmp = tempfile.mkdtemp()
+        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
+        >>> ds.ingest_all()
+        1
+        >>> ds.upstream_exhausted
+        True
+        """
+        return self._impl.ingest_all()
+    def close(self) -> None:
+        """
+        Close the dataset and destroy the underlying storage.
+        This method clears the hybrid cache and destroys the disk storage,
+        removing any unused files that were eagerly created.
+        Returns
+        -------
+        None
+        Examples
+        --------
+        >>> import tempfile, pyarrow as pa
+        >>> from batchcorder import StreamCache
+        >>> table = pa.table({"x": [1, 2, 3]})
+        >>> tmp = tempfile.mkdtemp()
+        >>> ds = StreamCache(table, 16 << 20, tmp, 64 << 20)
+        >>> ds.close()
+        """
+        # Forwards the native call's result, which the stub declares as None.
+        return self._impl.close()
+class StreamCacheReader:
+    """
+    A single-use iterator handle for a :class:`StreamCache`.
+    Maintains an independent read position.  Multiple handles backed by the
+    same dataset share the underlying cache; the upstream source is ingested
+    lazily as needed.
+    Once consumed via ``__arrow_c_stream__`` or by exhausting iteration the
+    reader is marked closed and raises an error on further use.
+    Notes
+    -----
+    Obtain a handle from :meth:`StreamCache.reader` rather than constructing
+    one directly.
+    """
+    def __init__(self, impl: _PyStreamCacheReader) -> None:
+        """Obtain via :meth:`StreamCache.reader`."""
+        # ``impl`` is the native reader handle; every method below delegates
+        # to it.
+        self._impl = impl
+    @property
+    def schema(self) -> Any:
+        """
+        Arrow schema of batches produced by this reader.
+        Returns
+        -------
+        pyarrow.Schema
+        Raises
+        ------
+        ValueError
+            If the reader has already been consumed.
+        """
+        return self._impl.schema
+    @property
+    def closed(self) -> bool:
+        """
+        ``True`` if this reader has been consumed.
+        Returns
+        -------
+        bool
+        """
+        return self._impl.closed
+    def __arrow_c_stream__(self, requested_schema: Any = None) -> Any:
+        """
+        Enable Arrow stream export via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+        Consumes the reader; subsequent calls raise an error.
+        Parameters
+        ----------
+        requested_schema : object, optional
+            Schema capsule to cast the stream to, or ``None``.
+        Raises
+        ------
+        ValueError
+            If the reader has already been consumed.
+        """
+        return self._impl.__arrow_c_stream__(requested_schema)
+    def __arrow_c_schema__(self) -> Any:
+        """
+        Enable Arrow schema export via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+        This allows Arrow consumers to inspect the data type of this
+        :class:`StreamCacheReader`.  Then the consumer can ask the producer (in
+        ``__arrow_c_stream__``) to cast the exported data to a supported data type.
+        Raises
+        ------
+        ValueError
+            If the reader has already been consumed.
+        """
+        return self._impl.__arrow_c_schema__()
+    def __iter__(self) -> StreamCacheReader:
+        """Return self as the iterator."""
+        return self
+    def cast(self, target_schema: Any) -> Any:
+        """
+        Cast the reader to produce batches with the given schema.
+        Mirrors :meth:`pyarrow.RecordBatchReader.cast`.  Returns a
+        :class:`pyarrow.RecordBatchReader` that applies the cast as batches are
+        read.  Consumes this reader.
+        Parameters
+        ----------
+        target_schema : object
+            Any Arrow schema-compatible object (e.g. :class:`pyarrow.Schema`).
+        Returns
+        -------
+        pyarrow.RecordBatchReader
+        Raises
+        ------
+        ValueError
+            If the reader has already been consumed.
+        """
+        # Unlike StreamCache.cast, the native result is returned as-is and is
+        # not wrapped in a batchcorder class.
+        return self._impl.cast(target_schema)
+    def __next__(self) -> Any:
+        """Get the next batch from the reader."""
+        # NOTE(review): ``iter()`` is applied to the native reader on every
+        # call before ``next()``.  Presumably the native ``__iter__`` returns
+        # the same object, making this equivalent to ``next(self._impl)`` —
+        # confirm against the extension before simplifying.
+        return next(iter(self._impl))
+class CastingStreamCache:
+    """
+    A replayable cast view of a :class:`StreamCache`.
+    Created by :meth:`StreamCache.cast`.  Each call to ``__arrow_c_stream__``
+    produces a fresh reader from the underlying cache with each batch cast to
+    :attr:`schema`, so this object is **replayable** — DuckDB self-joins, ASOF
+    joins, and other multi-scan consumers work correctly on it.
+    Notes
+    -----
+    Obtain via :meth:`StreamCache.cast` rather than constructing directly.
+    """
+    def __init__(self, impl: _PyCastingStreamCache) -> None:
+        """Obtain via :meth:`StreamCache.cast`."""
+        # ``impl`` is the native cast view; every method below delegates to it.
+        self._impl = impl
+    @property
+    def schema(self) -> Any:
+        """
+        Arrow schema produced by this dataset after casting.
+        Returns
+        -------
+        pyarrow.Schema
+        """
+        return self._impl.schema
+    def __arrow_c_stream__(self, requested_schema: Any = None) -> Any:
+        """
+        Enable Arrow stream export via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.
+        Creates a fresh reader from the underlying cache and applies the cast.
+        Safe to call multiple times — each call produces an independent stream.
+        Parameters
+        ----------
+        requested_schema : object, optional
+            Schema capsule to further cast the stream to, or ``None`` (uses
+            :attr:`schema`).
+        """
+        return self._impl.__arrow_c_stream__(requested_schema)
+    def __arrow_c_schema__(self) -> Any:
+        """
+        Enable Arrow schema export via the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_.
+        Returns the target schema so consumers can inspect the post-cast type.
+        """
+        return self._impl.__arrow_c_schema__()
+    def cast(self, target_schema: Any) -> CastingStreamCache:
+        """
+        Cast to a further target schema, returning a new :class:`CastingStreamCache`.
+        Parameters
+        ----------
+        target_schema : object
+            Any Arrow schema-compatible object.
+        Returns
+        -------
+        CastingStreamCache
+        """
+        # Re-wrap the native result so chained casts keep returning the
+        # pure-Python class rather than the extension type.
+        return CastingStreamCache(self._impl.cast(target_schema))

batchcorder/_batchcorder.pyd ADDED Viewed

Binary file

batchcorder/_batchcorder.pyi ADDED Viewed

@@ -0,0 +1,55 @@
+# This file is automatically generated by pyo3_stub_gen
+import builtins
+import typing
+import pyarrow as pa
+# Classes exposed by the native extension module described by this stub.
+__all__ = [
+    "CastingStreamCache",
+    "StreamCache",
+    "StreamCacheReader",
+]
+# Native cast view; marked final, so it cannot be subclassed.
+@typing.final
+class CastingStreamCache:
+    @property
+    def schema(self) -> pa.Schema: ...
+    # Arrow PyCapsule export hooks; capsule objects are typed as ``Any``.
+    def __arrow_c_stream__(self, requested_schema: typing.Any = None) -> typing.Any: ...
+    def __arrow_c_schema__(self) -> typing.Any: ...
+    # Casting again yields another native CastingStreamCache.
+    def cast(self, target_schema: typing.Any) -> CastingStreamCache: ...
+# Native stream cache; marked final, so it cannot be subclassed.
+@typing.final
+class StreamCache:
+    @property
+    def schema(self) -> pa.Schema: ...
+    @property
+    def ingested_count(self) -> builtins.int: ...
+    @property
+    def upstream_exhausted(self) -> builtins.bool: ...
+    # Construction is declared through ``__new__`` rather than ``__init__``.
+    def __new__(
+        cls,
+        reader: typing.Any,
+        memory_capacity: builtins.int | None = None,
+        disk_path: builtins.str | None = None,
+        disk_capacity: builtins.int | None = None,
+    ) -> StreamCache: ...
+    # The default for ``from_start`` is elided (``...``) in this stub.
+    def reader(self, from_start: builtins.bool = ...) -> StreamCacheReader: ...
+    def __iter__(self) -> StreamCacheReader: ...
+    # Arrow PyCapsule export hooks; capsule objects are typed as ``Any``.
+    def __arrow_c_stream__(self, requested_schema: typing.Any = None) -> typing.Any: ...
+    def __arrow_c_schema__(self) -> typing.Any: ...
+    def cast(self, target_schema: typing.Any) -> CastingStreamCache: ...
+    def ingest_all(self) -> builtins.int: ...
+    def close(self) -> None: ...
+# Native reader handle; marked final, so it cannot be subclassed.
+@typing.final
+class StreamCacheReader:
+    @property
+    def schema(self) -> pa.Schema: ...
+    @property
+    def closed(self) -> builtins.bool: ...
+    # Arrow PyCapsule export hooks; capsule objects are typed as ``Any``.
+    def __arrow_c_stream__(self, requested_schema: typing.Any = None) -> typing.Any: ...
+    def __arrow_c_schema__(self) -> typing.Any: ...
+    # Iterator protocol: iteration yields ``pa.RecordBatch`` objects.
+    def __iter__(self) -> StreamCacheReader: ...
+    # Return type is left as ``Any`` by the stub generator.
+    def cast(self, target_schema: typing.Any) -> typing.Any: ...
+    def __next__(self) -> pa.RecordBatch: ...

batchcorder/py.typed ADDED Viewed

File without changes