PyPI - batchcorder - Versions diffs - 0.1.2__tar.gz → 0.1.3__tar.gz - Mend

batchcorder 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{batchcorder-0.1.2 → batchcorder-0.1.3}/.gitignore RENAMED Viewed

@@ -90,3 +90,8 @@ docs/source/_autoapi/
 site
 drafts
+# cargo-llvm-cov output
+coverage.lcov
+.claude

{batchcorder-0.1.2 → batchcorder-0.1.3}/.pre-commit-config.yaml RENAMED Viewed

@@ -12,6 +12,9 @@ repos:
         args: ["--maxkb=500"]
       - id: name-tests-test
         args: ["--pytest-test-first"]
+        # strategies.py (shared Hypothesis strategies) and helpers.py (shared
+        # test constants) are support modules, not test files.
+        exclude: ^tests/(strategies|helpers)\.py$
   - repo: https://github.com/abravalheri/validate-pyproject
     rev: v0.25

{batchcorder-0.1.2 → batchcorder-0.1.3}/Cargo.lock RENAMED Viewed

@@ -131,7 +131,7 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 [[package]]
 name = "batchcorder"
-version = "0.1.2"
+version = "0.1.3"
 dependencies = [
  "arrow-array",
  "arrow-ipc",
@@ -140,6 +140,7 @@ dependencies = [
  "pyo3",
  "pyo3-stub-gen",
  "sysinfo",
+ "xxhash-rust",
 ]
 [[package]]
@@ -1319,6 +1320,12 @@ version = "0.51.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+[[package]]
+name = "xxhash-rust"
+version = "0.8.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
 [[package]]
 name = "zerocopy"
 version = "0.8.42"

{batchcorder-0.1.2 → batchcorder-0.1.3}/Cargo.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "batchcorder"
-version = "0.1.2"
+version = "0.1.3"
 edition = "2024"
 readme = "README.md"
@@ -30,6 +30,9 @@ arrow-ipc = "58"
 # System information for detecting total physical RAM (used as default hot-cache budget)
 sysinfo = { version = "0.33", default-features = false, features = ["system"] }
+# Fast non-cryptographic hash for cache entry integrity checks
+xxhash-rust = { version = "0.8", features = ["xxh3"] }
 [lints.rust]
 # Py_GIL_DISABLED is a cfg set by pyo3's build script to indicate free-threaded Python.
 unexpected_cfgs = { level = "warn", check-cfg = ['cfg(Py_GIL_DISABLED)'] }

{batchcorder-0.1.2 → batchcorder-0.1.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: batchcorder
-Version: 0.1.2
+Version: 0.1.3
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -99,6 +99,12 @@ for batch in ds:
 reader = ds.reader()
 result = pa.RecordBatchReader.from_stream(reader).read_all()
+# Bounded-memory: evict batches once all readers have passed them
+ds = StreamCache(
+    table.to_reader(max_chunksize=1),
+    max_readers=2,  # at most 2 reads; batches evicted when both advance
+)
 # Pre-ingest everything upfront
 ds.ingest_all()
 ```
@@ -129,6 +135,12 @@ duckdb.table("ds")  # DuckDB
 - **Replay from any position**: `ds.reader(from_start=True)` (default) replays
   from batch 0; `ds.reader(from_start=False)` starts from the current ingestion
   frontier (next batch not yet ingested).
+- **Bounded-memory streaming**: set `max_readers=N` to evict batches once all
+  `N` readers exist and have advanced past them — eviction does not begin
+  until all `N` readers have been created.  `max_readers` is a hard cap on
+  total readers ever created (dropping a reader does not free a slot).  Once
+  eviction has started, `reader(from_start=True)` raises `ValueError`.
+  When unset, all batches are retained indefinitely.
 ## Development

{batchcorder-0.1.2 → batchcorder-0.1.3}/README.md RENAMED Viewed

@@ -75,6 +75,12 @@ for batch in ds:
 reader = ds.reader()
 result = pa.RecordBatchReader.from_stream(reader).read_all()
+# Bounded-memory: evict batches once all readers have passed them
+ds = StreamCache(
+    table.to_reader(max_chunksize=1),
+    max_readers=2,  # at most 2 reads; batches evicted when both advance
+)
 # Pre-ingest everything upfront
 ds.ingest_all()
 ```
@@ -105,6 +111,12 @@ duckdb.table("ds")  # DuckDB
 - **Replay from any position**: `ds.reader(from_start=True)` (default) replays
   from batch 0; `ds.reader(from_start=False)` starts from the current ingestion
   frontier (next batch not yet ingested).
+- **Bounded-memory streaming**: set `max_readers=N` to evict batches once all
+  `N` readers exist and have advanced past them — eviction does not begin
+  until all `N` readers have been created.  `max_readers` is a hard cap on
+  total readers ever created (dropping a reader does not free a slot).  Once
+  eviction has started, `reader(from_start=True)` raises `ValueError`.
+  When unset, all batches are retained indefinitely.
 ## Development

{batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/cache-config.qmd RENAMED Viewed

@@ -64,6 +64,44 @@ Set `memory_capacity` to cover the data your hottest readers are actively
 consuming.  Batches that exceed the hot budget are served directly from disk
 with no in-memory copy.
+## Bounded-memory streaming with `max_readers`
+When you know how many times the stream will be read, set `max_readers` to
+evict batches once all readers have advanced past them.  This keeps memory
+usage proportional to the window between the slowest and fastest reader,
+rather than the full stream.
+```{python}
+#| eval: false
+ds = StreamCache(
+    source,
+    max_readers=2,   # at most 2 reads of the stream
+)
+r1 = ds.reader()
+r2 = ds.reader()
+# As both readers advance, batches behind the slowest are freed.
+result1 = pa.RecordBatchReader.from_stream(r1).read_all()
+result2 = pa.RecordBatchReader.from_stream(r2).read_all()
+```
+**Important:** `max_readers` is a hard cap on the total number of readers
+ever created — dropping a reader does not free a slot for a new one.  Once
+all `max_readers` readers have been created, no further readers can be
+obtained.  Eviction does not begin until all `max_readers` readers have
+actually been created: with fewer readers, every batch is retained so that
+future readers can still replay from the start.  Additionally, once batches
+have been evicted, `reader(from_start=True)` raises `ValueError` because
+batch 0 is no longer available.
+For disk-backed caches, eviction frees memory (hot layer and index entries)
+but not bytes already written to the append-only cache file — disk space is
+reclaimed only when the cache is closed or dropped.
+When `max_readers` is omitted (default), all batches are retained
+indefinitely and unlimited readers are allowed.
 ## Training-loop workloads
 For ML training loops where you replay the full stream each epoch, call

{batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/eviction.qmd RENAMED Viewed

@@ -69,6 +69,51 @@ ds.ingest_all()   # upstream fully consumed; cache is complete
 readers = [ds.reader() for _ in range(8)]
 ```
+## Bounded concurrent reads with `max_readers`
+If you know the total number of readers upfront, set `max_readers` to
+enable automatic eviction.  Batches are freed from cache once all readers
+have advanced past them:
+```{python}
+#| eval: false
+ds = StreamCache(source, max_readers=4)
+results = [None] * 4
+# Create readers in the main thread to guarantee all slots are claimed.
+readers = [ds.reader() for _ in range(4)]
+def read(i):
+    results[i] = pa.RecordBatchReader.from_stream(readers[i]).read_all()
+threads = [threading.Thread(target=read, args=(i,)) for i in range(4)]
+for t in threads:
+    t.start()
+for t in threads:
+    t.join()
+```
+This keeps memory proportional to the gap between the fastest and slowest
+reader.  Without `max_readers`, all batches are retained for the lifetime of
+the cache.
+**Important:** `max_readers` is a hard cap on the total number of readers
+ever created, not the number of concurrent readers.  Dropping a reader does
+not free a slot for a new one.  Once all `max_readers` readers have been
+created, no further readers can be obtained.  Eviction does not begin until
+all `max_readers` readers have actually been created — with fewer readers,
+every batch is retained so future readers can still replay from the start.
+Also note that once eviction has begun, `reader(from_start=True)` raises
+`ValueError` because batch 0 is no longer available.  Anything that creates
+a reader counts against the cap: `ds.reader()`, `iter(ds)`,
+`ds.__arrow_c_stream__()` (so each engine scan of `ds`), and each read of a
+`cast()` result all consume one slot.
+For disk-backed caches, eviction frees memory (hot layer and index entries)
+but not bytes already written to the append-only cache file — disk space is
+reclaimed only when the cache is closed or dropped.
 ## Reading from `__arrow_c_stream__`
 `StreamCache` itself implements `__arrow_c_stream__`, which creates a fresh

{batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/reference/api-overview.qmd RENAMED Viewed

@@ -15,7 +15,7 @@ hybrid memory + disk cache.
 | Method / property        | Description |
 |--------------------------|-------------|
-| `__init__(...)`          | Create a StreamCache from any `__arrow_c_stream__` source. |
+| `__init__(...)`          | Create a StreamCache from any `__arrow_c_stream__` source. Accepts optional `max_readers` to enable bounded-memory eviction. |
 | `reader(from_start=True)`| Return a new independent reader handle. |
 | `ingest_all()`           | Eagerly consume the upstream source into the cache. |
 | `schema`                 | Arrow schema of the stream. |

{batchcorder-0.1.2 → batchcorder-0.1.3}/pyproject.toml RENAMED Viewed

@@ -40,6 +40,7 @@ dev = [
   "maturin>=1.7,<2.0",
   "pyarrow>=18",
   "pytest>=8",
+  "hypothesis>=6",
   "duckdb>=1.5",
   "ruff>=0.15.6",
   "pre-commit>=3.5.0",
@@ -69,6 +70,14 @@ skip = "uv.lock,docs/requirements.txt"
 extend-include = ["*.md"]
 [tool.ruff.lint]
+# Preview is required for the pydoclint (DOC) rules.  Scoped two ways: it is
+# lint-only (the formatter stays on stable style), and explicit-preview-rules
+# limits it to the exact preview codes listed in extend-select — prefix
+# selectors (RUF, B, UP, ...) keep selecting stable rules only, so a ruff
+# upgrade cannot silently enable new preview rules (ruff is pinned at the
+# matching rev in .pre-commit-config.yaml).
+preview = true
+explicit-preview-rules = true
 ignore = [
   "D203", # conflicts with D211 (no-blank-line-before-class); D211 wins
   "D212", # conflicts with D213 (multi-line-summary-second-line); D213 wins
@@ -83,14 +92,25 @@ extend-select = [
   "SIM",
   "PT",
   "D",
+  # pydoclint, selected by exact code (preview rules; see explicit-preview-rules)
+  "DOC201", # return value not documented in docstring
+  "DOC202", # docstring documents a return the body does not have
+  "DOC402", # yielded value not documented in docstring
+  "DOC403", # docstring documents yields the body does not have
+  "DOC501", # explicitly raised exception missing from docstring
+  "DOC502", # docstring documents an exception the body does not raise
   "RUF",
   "PYI",
   "UP",
 ]
 [tool.ruff.lint.per-file-ignores]
-"tests/**" = ["D"]
-"conftest.py" = ["D"]
+# D/DOC: docstring section structure is not enforced in tests.
+"tests/**" = ["D", "DOC"]
+"conftest.py" = ["D", "DOC"]
+# DOC502: the wrappers in __init__.py document exceptions raised inside the
+# Rust delegate, which pydoclint cannot see across the FFI boundary.
+"python/batchcorder/__init__.py" = ["DOC502"]
 [tool.ruff.lint.isort]
 known-first-party = ["batchcorder"]

{batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/__init__.py RENAMED Viewed

@@ -9,6 +9,8 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from typing import Any
+    import pyarrow as pa
 from ._batchcorder import (
     CastingStreamCache as _PyCastingStreamCache,
 )
@@ -62,6 +64,25 @@ class StreamCache:
     disk_capacity : int, optional
         On-disk storage budget in bytes.
         Must be provided together with ``disk_path``.
+    write_policy : str, optional
+        When batches are flushed to disk (disk mode only; ignored in
+        memory-only mode).  ``"on_insertion"`` (default) writes every batch to
+        disk immediately.  ``"on_eviction"`` keeps batches in the hot layer and
+        only writes them to disk when evicted, so a hot layer large enough to
+        hold the whole stream never touches disk.
+    max_readers : int, optional
+        Hard cap on the total number of readers ever created from this cache.
+        When set, batches are evicted once all readers have advanced past them,
+        enabling bounded-memory streaming.  Eviction only begins once all
+        ``max_readers`` readers have actually been created — with fewer
+        readers created, every batch is retained so future readers can still replay
+        from the start.  Dropping a reader does **not** free a slot — once
+        ``max_readers`` readers have been created, no more can be obtained.
+        ``reader(from_start=True)`` raises ``ValueError`` if batch 0 has
+        already been evicted.  For disk-backed caches, eviction frees memory
+        (hot layer and index) but not bytes in the append-only cache file —
+        disk space is reclaimed only when the cache is closed or dropped.
+        When ``None`` (default), all batches are retained indefinitely.
     Examples
     --------
@@ -86,18 +107,24 @@ class StreamCache:
     """
+    _impl: _PyStreamCache
     def __init__(
         self,
         reader: Any,
         memory_capacity: int | None = None,
         disk_path: str | None = None,
         disk_capacity: int | None = None,
-    ):
+        write_policy: str = "on_insertion",
+        max_readers: int | None = None,
+    ) -> None:
         """See class docstring for parameter documentation."""
-        self._impl = _PyStreamCache(reader, memory_capacity, disk_path, disk_capacity)
+        self._impl = _PyStreamCache(
+            reader, memory_capacity, disk_path, disk_capacity, write_policy, max_readers
+        )
     @property
-    def schema(self) -> Any:
+    def schema(self) -> pa.Schema:
         """
         Arrow schema of this dataset.
@@ -232,6 +259,11 @@ class StreamCache:
         requested_schema : object, optional
             Schema capsule to cast the stream to, or ``None``.
+        Returns
+        -------
+        PyCapsule
+            An Arrow C stream capsule wrapping a fresh reader.
         """
         return self._impl.__arrow_c_stream__(requested_schema)
@@ -246,6 +278,11 @@ class StreamCache:
         :class:`StreamCache`.  Then the consumer can ask the producer (in
         ``__arrow_c_stream__``) to cast the exported data to a supported data type.
+        Returns
+        -------
+        PyCapsule
+            An Arrow C schema capsule for the stream's schema.
         """
         return self._impl.__arrow_c_schema__()
@@ -341,12 +378,14 @@ class StreamCacheReader:
     """
-    def __init__(self, impl: _PyStreamCacheReader):
+    _impl: _PyStreamCacheReader
+    def __init__(self, impl: _PyStreamCacheReader) -> None:
         """Obtain via :meth:`StreamCache.reader`."""
         self._impl = impl
     @property
-    def schema(self) -> Any:
+    def schema(self) -> pa.Schema:
         """
         Arrow schema of batches produced by this reader.
@@ -388,6 +427,11 @@ class StreamCacheReader:
         requested_schema : object, optional
             Schema capsule to cast the stream to, or ``None``.
+        Returns
+        -------
+        PyCapsule
+            An Arrow C stream capsule wrapping this reader.
         Raises
         ------
         ValueError
@@ -407,6 +451,11 @@ class StreamCacheReader:
         :class:`StreamCacheReader`.  Then the consumer can ask the producer (in
         ``__arrow_c_stream__``) to cast the exported data to a supported data type.
+        Returns
+        -------
+        PyCapsule
+            An Arrow C schema capsule for the reader's schema.
         Raises
         ------
         ValueError
@@ -416,10 +465,17 @@ class StreamCacheReader:
         return self._impl.__arrow_c_schema__()
     def __iter__(self) -> StreamCacheReader:
-        """Return self as the iterator."""
+        """
+        Return self as the iterator.
+        Returns
+        -------
+        StreamCacheReader
+        """
         return self
-    def cast(self, target_schema: Any) -> Any:
+    def cast(self, target_schema: Any) -> pa.RecordBatchReader:
         """
         Cast the reader to produce batches with the given schema.
@@ -445,8 +501,15 @@ class StreamCacheReader:
         """
         return self._impl.cast(target_schema)
-    def __next__(self) -> Any:
-        """Get the next batch from the reader."""
+    def __next__(self) -> pa.RecordBatch:
+        """
+        Get the next batch from the reader.
+        Returns
+        -------
+        pyarrow.RecordBatch
+        """
         return next(iter(self._impl))
@@ -465,12 +528,14 @@ class CastingStreamCache:
     """
-    def __init__(self, impl: _PyCastingStreamCache):
+    _impl: _PyCastingStreamCache
+    def __init__(self, impl: _PyCastingStreamCache) -> None:
         """Obtain via :meth:`StreamCache.cast`."""
         self._impl = impl
     @property
-    def schema(self) -> Any:
+    def schema(self) -> pa.Schema:
         """
         Arrow schema produced by this dataset after casting.
@@ -494,6 +559,11 @@ class CastingStreamCache:
             Schema capsule to further cast the stream to, or ``None`` (uses
             :attr:`schema`).
+        Returns
+        -------
+        PyCapsule
+            An Arrow C stream capsule wrapping a fresh casting reader.
         """
         return self._impl.__arrow_c_stream__(requested_schema)
@@ -503,6 +573,11 @@ class CastingStreamCache:
         Returns the target schema so consumers can inspect the post-cast type.
+        Returns
+        -------
+        PyCapsule
+            An Arrow C schema capsule for the post-cast schema.
         """
         return self._impl.__arrow_c_schema__()

{batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/_batchcorder.pyi RENAMED Viewed

@@ -33,6 +33,8 @@ class StreamCache:
         memory_capacity: builtins.int | None = None,
         disk_path: builtins.str | None = None,
         disk_capacity: builtins.int | None = None,
+        write_policy: builtins.str = ...,
+        max_readers: builtins.int | None = None,
     ) -> StreamCache: ...
     def reader(self, from_start: builtins.bool = ...) -> StreamCacheReader: ...
     def __iter__(self) -> StreamCacheReader: ...

batchcorder 0.1.2__tar.gz → 0.1.3__tar.gz

batchcorder 0.1.2__tar.gz → 0.1.3__tar.gz