batchcorder 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {batchcorder-0.1.2 → batchcorder-0.1.3}/.gitignore +5 -0
  2. {batchcorder-0.1.2 → batchcorder-0.1.3}/.pre-commit-config.yaml +3 -0
  3. {batchcorder-0.1.2 → batchcorder-0.1.3}/Cargo.lock +8 -1
  4. {batchcorder-0.1.2 → batchcorder-0.1.3}/Cargo.toml +4 -1
  5. {batchcorder-0.1.2 → batchcorder-0.1.3}/PKG-INFO +13 -1
  6. {batchcorder-0.1.2 → batchcorder-0.1.3}/README.md +12 -0
  7. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/cache-config.qmd +38 -0
  8. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/eviction.qmd +45 -0
  9. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/reference/api-overview.qmd +1 -1
  10. {batchcorder-0.1.2 → batchcorder-0.1.3}/pyproject.toml +22 -2
  11. {batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/__init__.py +86 -11
  12. {batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/_batchcorder.pyi +2 -0
  13. batchcorder-0.1.3/src/cached_dataset.rs +1946 -0
  14. batchcorder-0.1.3/tests/helpers.py +11 -0
  15. batchcorder-0.1.3/tests/strategies.py +244 -0
  16. {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_as_record_batch_reader.py +19 -15
  17. {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_cast.py +27 -15
  18. batchcorder-0.1.3/tests/test_construction_properties.py +138 -0
  19. batchcorder-0.1.3/tests/test_errors.py +358 -0
  20. batchcorder-0.1.3/tests/test_max_readers.py +229 -0
  21. batchcorder-0.1.3/tests/test_max_readers_properties.py +171 -0
  22. {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_performance.py +14 -3
  23. {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_stream_cache.py +126 -24
  24. batchcorder-0.1.3/tests/test_write_policy.py +172 -0
  25. {batchcorder-0.1.2 → batchcorder-0.1.3}/uv.lock +24 -0
  26. batchcorder-0.1.2/src/cached_dataset.rs +0 -902
  27. batchcorder-0.1.2/tests/test_errors.py +0 -148
  28. {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/release-drafter.yml +0 -0
  29. {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/build-wheels.yaml +0 -0
  30. {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-lint.yaml +0 -0
  31. {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-pre-release.yaml +0 -0
  32. {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-release.yaml +0 -0
  33. {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-test.yaml +0 -0
  34. {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/release-drafter.yml +0 -0
  35. {batchcorder-0.1.2 → batchcorder-0.1.3}/.readthedocs.yaml +0 -0
  36. {batchcorder-0.1.2 → batchcorder-0.1.3}/.yamllint.yaml +0 -0
  37. {batchcorder-0.1.2 → batchcorder-0.1.3}/CLAUDE.md +0 -0
  38. {batchcorder-0.1.2 → batchcorder-0.1.3}/CONTRIBUTING.md +0 -0
  39. {batchcorder-0.1.2 → batchcorder-0.1.3}/LICENSE +0 -0
  40. {batchcorder-0.1.2 → batchcorder-0.1.3}/Makefile +0 -0
  41. {batchcorder-0.1.2 → batchcorder-0.1.3}/conftest.py +0 -0
  42. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/.gitignore +0 -0
  43. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/_quarto.yml +0 -0
  44. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/duckdb.qmd +0 -0
  45. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/tutorials/getting-started.qmd +0 -0
  46. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/requirements.txt +0 -0
  47. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/source/api.rst +0 -0
  48. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/source/conf.py +0 -0
  49. {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/source/index.rst +0 -0
  50. {batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/py.typed +0 -0
  51. {batchcorder-0.1.2 → batchcorder-0.1.3}/rust-toolchain.toml +0 -0
  52. {batchcorder-0.1.2 → batchcorder-0.1.3}/scripts/build-docs.sh +0 -0
  53. {batchcorder-0.1.2 → batchcorder-0.1.3}/scripts/build-release.sh +0 -0
  54. {batchcorder-0.1.2 → batchcorder-0.1.3}/src/bin/stub_gen.rs +0 -0
  55. {batchcorder-0.1.2 → batchcorder-0.1.3}/src/lib.rs +0 -0
  56. {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_docstrings.py +0 -0
  57. {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_duckdb.py +0 -0
@@ -90,3 +90,8 @@ docs/source/_autoapi/
90
90
  site
91
91
 
92
92
  drafts
93
+
94
+ # cargo-llvm-cov output
95
+ coverage.lcov
96
+
97
+ .claude
@@ -12,6 +12,9 @@ repos:
12
12
  args: ["--maxkb=500"]
13
13
  - id: name-tests-test
14
14
  args: ["--pytest-test-first"]
15
+ # strategies.py (shared Hypothesis strategies) and helpers.py (shared
16
+ # test constants) are support modules, not test files.
17
+ exclude: ^tests/(strategies|helpers)\.py$
15
18
 
16
19
  - repo: https://github.com/abravalheri/validate-pyproject
17
20
  rev: v0.25
@@ -131,7 +131,7 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
131
131
 
132
132
  [[package]]
133
133
  name = "batchcorder"
134
- version = "0.1.2"
134
+ version = "0.1.3"
135
135
  dependencies = [
136
136
  "arrow-array",
137
137
  "arrow-ipc",
@@ -140,6 +140,7 @@ dependencies = [
140
140
  "pyo3",
141
141
  "pyo3-stub-gen",
142
142
  "sysinfo",
143
+ "xxhash-rust",
143
144
  ]
144
145
 
145
146
  [[package]]
@@ -1319,6 +1320,12 @@ version = "0.51.0"
1319
1320
  source = "registry+https://github.com/rust-lang/crates.io-index"
1320
1321
  checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
1321
1322
 
1323
+ [[package]]
1324
+ name = "xxhash-rust"
1325
+ version = "0.8.15"
1326
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1327
+ checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
1328
+
1322
1329
  [[package]]
1323
1330
  name = "zerocopy"
1324
1331
  version = "0.8.42"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "batchcorder"
3
- version = "0.1.2"
3
+ version = "0.1.3"
4
4
  edition = "2024"
5
5
  readme = "README.md"
6
6
 
@@ -30,6 +30,9 @@ arrow-ipc = "58"
30
30
  # System information for detecting total physical RAM (used as default hot-cache budget)
31
31
  sysinfo = { version = "0.33", default-features = false, features = ["system"] }
32
32
 
33
+ # Fast non-cryptographic hash for cache entry integrity checks
34
+ xxhash-rust = { version = "0.8", features = ["xxh3"] }
35
+
33
36
  [lints.rust]
34
37
  # Py_GIL_DISABLED is a cfg set by pyo3's build script to indicate free-threaded Python.
35
38
  unexpected_cfgs = { level = "warn", check-cfg = ['cfg(Py_GIL_DISABLED)'] }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: batchcorder
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -99,6 +99,12 @@ for batch in ds:
99
99
  reader = ds.reader()
100
100
  result = pa.RecordBatchReader.from_stream(reader).read_all()
101
101
 
102
+ # Bounded-memory: evict batches once all readers have passed them
103
+ ds = StreamCache(
104
+ table.to_reader(max_chunksize=1),
105
+ max_readers=2, # at most 2 reads; batches evicted when both advance
106
+ )
107
+
102
108
  # Pre-ingest everything upfront
103
109
  ds.ingest_all()
104
110
  ```
@@ -129,6 +135,12 @@ duckdb.table("ds") # DuckDB
129
135
  - **Replay from any position**: `ds.reader(from_start=True)` (default) replays
130
136
  from batch 0; `ds.reader(from_start=False)` starts from the current ingestion
131
137
  frontier (next batch not yet ingested).
138
+ - **Bounded-memory streaming**: set `max_readers=N` to evict batches once all
139
+ `N` readers exist and have advanced past them — eviction does not begin
140
+ until all `N` readers have been created. `max_readers` is a hard cap on
141
+ total readers ever created (dropping a reader does not free a slot). Once
142
+ eviction has started, `reader(from_start=True)` raises `ValueError`.
143
+ When unset, all batches are retained indefinitely.
132
144
 
133
145
  ## Development
134
146
 
@@ -75,6 +75,12 @@ for batch in ds:
75
75
  reader = ds.reader()
76
76
  result = pa.RecordBatchReader.from_stream(reader).read_all()
77
77
 
78
+ # Bounded-memory: evict batches once all readers have passed them
79
+ ds = StreamCache(
80
+ table.to_reader(max_chunksize=1),
81
+ max_readers=2, # at most 2 reads; batches evicted when both advance
82
+ )
83
+
78
84
  # Pre-ingest everything upfront
79
85
  ds.ingest_all()
80
86
  ```
@@ -105,6 +111,12 @@ duckdb.table("ds") # DuckDB
105
111
  - **Replay from any position**: `ds.reader(from_start=True)` (default) replays
106
112
  from batch 0; `ds.reader(from_start=False)` starts from the current ingestion
107
113
  frontier (next batch not yet ingested).
114
+ - **Bounded-memory streaming**: set `max_readers=N` to evict batches once all
115
+ `N` readers exist and have advanced past them — eviction does not begin
116
+ until all `N` readers have been created. `max_readers` is a hard cap on
117
+ total readers ever created (dropping a reader does not free a slot). Once
118
+ eviction has started, `reader(from_start=True)` raises `ValueError`.
119
+ When unset, all batches are retained indefinitely.
108
120
 
109
121
  ## Development
110
122
 
@@ -64,6 +64,44 @@ Set `memory_capacity` to cover the data your hottest readers are actively
64
64
  consuming. Batches that exceed the hot budget are served directly from disk
65
65
  with no in-memory copy.
66
66
 
67
+ ## Bounded-memory streaming with `max_readers`
68
+
69
+ When you know how many times the stream will be read, set `max_readers` to
70
+ evict batches once all readers have advanced past them. This keeps memory
71
+ usage proportional to the window between the slowest and fastest reader,
72
+ rather than the full stream.
73
+
74
+ ```{python}
75
+ #| eval: false
76
+ ds = StreamCache(
77
+ source,
78
+ max_readers=2, # at most 2 reads of the stream
79
+ )
80
+
81
+ r1 = ds.reader()
82
+ r2 = ds.reader()
83
+
84
+ # As both readers advance, batches behind the slowest are freed.
85
+ result1 = pa.RecordBatchReader.from_stream(r1).read_all()
86
+ result2 = pa.RecordBatchReader.from_stream(r2).read_all()
87
+ ```
88
+
89
+ **Important:** `max_readers` is a hard cap on the total number of readers
90
+ ever created — dropping a reader does not free a slot for a new one. Once
91
+ all `max_readers` readers have been created, no further readers can be
92
+ obtained. Eviction does not begin until all `max_readers` readers have
93
+ actually been created: with fewer readers, every batch is retained so that
94
+ future readers can still replay from the start. Additionally, once batches
95
+ have been evicted, `reader(from_start=True)` raises `ValueError` because
96
+ batch 0 is no longer available.
97
+
98
+ For disk-backed caches, eviction frees memory (hot layer and index entries)
99
+ but not bytes already written to the append-only cache file — disk space is
100
+ reclaimed only when the cache is closed or dropped.
101
+
102
+ When `max_readers` is omitted (default), all batches are retained
103
+ indefinitely and unlimited readers are allowed.
104
+
67
105
  ## Training-loop workloads
68
106
 
69
107
  For ML training loops where you replay the full stream each epoch, call
@@ -69,6 +69,51 @@ ds.ingest_all() # upstream fully consumed; cache is complete
69
69
  readers = [ds.reader() for _ in range(8)]
70
70
  ```
71
71
 
72
+ ## Bounded concurrent reads with `max_readers`
73
+
74
+ If you know the total number of readers upfront, set `max_readers` to
75
+ enable automatic eviction. Batches are freed from cache once all readers
76
+ have advanced past them:
77
+
78
+ ```{python}
79
+ #| eval: false
80
+ ds = StreamCache(source, max_readers=4)
81
+ results = [None] * 4
82
+
83
+ # Create readers in the main thread to guarantee all slots are claimed.
84
+ readers = [ds.reader() for _ in range(4)]
85
+
86
+ def read(i):
87
+ results[i] = pa.RecordBatchReader.from_stream(readers[i]).read_all()
88
+
89
+ threads = [threading.Thread(target=read, args=(i,)) for i in range(4)]
90
+ for t in threads:
91
+ t.start()
92
+ for t in threads:
93
+ t.join()
94
+ ```
95
+
96
+ This keeps memory proportional to the gap between the fastest and slowest
97
+ reader. Without `max_readers`, all batches are retained for the lifetime of
98
+ the cache.
99
+
100
+ **Important:** `max_readers` is a hard cap on the total number of readers
101
+ ever created, not the number of concurrent readers. Dropping a reader does
102
+ not free a slot for a new one. Once all `max_readers` readers have been
103
+ created, no further readers can be obtained. Eviction does not begin until
104
+ all `max_readers` readers have actually been created — with fewer readers,
105
+ every batch is retained so future readers can still replay from the start.
106
+
107
+ Also note that once eviction has begun, `reader(from_start=True)` raises
108
+ `ValueError` because batch 0 is no longer available. Anything that creates
109
+ a reader counts against the cap: `ds.reader()`, `iter(ds)`,
110
+ `ds.__arrow_c_stream__()` (so each engine scan of `ds`), and each read of a
111
+ `cast()` result all consume one slot.
112
+
113
+ For disk-backed caches, eviction frees memory (hot layer and index entries)
114
+ but not bytes already written to the append-only cache file — disk space is
115
+ reclaimed only when the cache is closed or dropped.
116
+
72
117
  ## Reading from `__arrow_c_stream__`
73
118
 
74
119
  `StreamCache` itself implements `__arrow_c_stream__`, which creates a fresh
@@ -15,7 +15,7 @@ hybrid memory + disk cache.
15
15
 
16
16
  | Method / property | Description |
17
17
  |--------------------------|-------------|
18
- | `__init__(...)` | Create a StreamCache from any `__arrow_c_stream__` source. |
18
+ | `__init__(...)` | Create a StreamCache from any `__arrow_c_stream__` source. Accepts optional `max_readers` to enable bounded-memory eviction. |
19
19
  | `reader(from_start=True)`| Return a new independent reader handle. |
20
20
  | `ingest_all()` | Eagerly consume the upstream source into the cache. |
21
21
  | `schema` | Arrow schema of the stream. |
@@ -40,6 +40,7 @@ dev = [
40
40
  "maturin>=1.7,<2.0",
41
41
  "pyarrow>=18",
42
42
  "pytest>=8",
43
+ "hypothesis>=6",
43
44
  "duckdb>=1.5",
44
45
  "ruff>=0.15.6",
45
46
  "pre-commit>=3.5.0",
@@ -69,6 +70,14 @@ skip = "uv.lock,docs/requirements.txt"
69
70
  extend-include = ["*.md"]
70
71
 
71
72
  [tool.ruff.lint]
73
+ # Preview is required for the pydoclint (DOC) rules. Scoped two ways: it is
74
+ # lint-only (the formatter stays on stable style), and explicit-preview-rules
75
+ # limits it to the exact preview codes listed in extend-select — prefix
76
+ # selectors (RUF, B, UP, ...) keep selecting stable rules only, so a ruff
77
+ # upgrade cannot silently enable new preview rules (ruff is pinned at the
78
+ # matching rev in .pre-commit-config.yaml).
79
+ preview = true
80
+ explicit-preview-rules = true
72
81
  ignore = [
73
82
  "D203", # conflicts with D211 (no-blank-line-before-class); D211 wins
74
83
  "D212", # conflicts with D213 (multi-line-summary-second-line); D213 wins
@@ -83,14 +92,25 @@ extend-select = [
83
92
  "SIM",
84
93
  "PT",
85
94
  "D",
95
+ # pydoclint, selected by exact code (preview rules; see explicit-preview-rules)
96
+ "DOC201", # return value not documented in docstring
97
+ "DOC202", # docstring documents a return the body does not have
98
+ "DOC402", # yielded value not documented in docstring
99
+ "DOC403", # docstring documents yields the body does not have
100
+ "DOC501", # explicitly raised exception missing from docstring
101
+ "DOC502", # docstring documents an exception the body does not raise
86
102
  "RUF",
87
103
  "PYI",
88
104
  "UP",
89
105
  ]
90
106
 
91
107
  [tool.ruff.lint.per-file-ignores]
92
- "tests/**" = ["D"]
93
- "conftest.py" = ["D"]
108
+ # D/DOC: docstring section structure is not enforced in tests.
109
+ "tests/**" = ["D", "DOC"]
110
+ "conftest.py" = ["D", "DOC"]
111
+ # DOC502: the wrappers in __init__.py document exceptions raised inside the
112
+ # Rust delegate, which pydoclint cannot see across the FFI boundary.
113
+ "python/batchcorder/__init__.py" = ["DOC502"]
94
114
 
95
115
  [tool.ruff.lint.isort]
96
116
  known-first-party = ["batchcorder"]
@@ -9,6 +9,8 @@ from typing import TYPE_CHECKING
9
9
  if TYPE_CHECKING:
10
10
  from typing import Any
11
11
 
12
+ import pyarrow as pa
13
+
12
14
  from ._batchcorder import (
13
15
  CastingStreamCache as _PyCastingStreamCache,
14
16
  )
@@ -62,6 +64,25 @@ class StreamCache:
62
64
  disk_capacity : int, optional
63
65
  On-disk storage budget in bytes.
64
66
  Must be provided together with ``disk_path``.
67
+ write_policy : str, optional
68
+ When batches are flushed to disk (disk mode only; ignored in
69
+ memory-only mode). ``"on_insertion"`` (default) writes every batch to
70
+ disk immediately. ``"on_eviction"`` keeps batches in the hot layer and
71
+ only writes them to disk when evicted, so a hot layer large enough to
72
+ hold the whole stream never touches disk.
73
+ max_readers : int, optional
74
+ Hard cap on the total number of readers ever created from this cache.
75
+ When set, batches are evicted once all readers have advanced past them,
76
+ enabling bounded-memory streaming. Eviction only begins once all
77
+ ``max_readers`` readers have actually been created — with fewer
78
+ readers every batch is retained so future readers can still replay
79
+ from the start. Dropping a reader does **not** free a slot — once
80
+ ``max_readers`` readers have been created, no more can be obtained.
81
+ ``reader(from_start=True)`` raises ``ValueError`` if batch 0 has
82
+ already been evicted. For disk-backed caches, eviction frees memory
83
+ (hot layer and index) but not bytes in the append-only cache file —
84
+ disk space is reclaimed only when the cache is closed or dropped.
85
+ When ``None`` (default), all batches are retained indefinitely.
65
86
 
66
87
  Examples
67
88
  --------
@@ -86,18 +107,24 @@ class StreamCache:
86
107
 
87
108
  """
88
109
 
110
+ _impl: _PyStreamCache
111
+
89
112
  def __init__(
90
113
  self,
91
114
  reader: Any,
92
115
  memory_capacity: int | None = None,
93
116
  disk_path: str | None = None,
94
117
  disk_capacity: int | None = None,
95
- ):
118
+ write_policy: str = "on_insertion",
119
+ max_readers: int | None = None,
120
+ ) -> None:
96
121
  """See class docstring for parameter documentation."""
97
- self._impl = _PyStreamCache(reader, memory_capacity, disk_path, disk_capacity)
122
+ self._impl = _PyStreamCache(
123
+ reader, memory_capacity, disk_path, disk_capacity, write_policy, max_readers
124
+ )
98
125
 
99
126
  @property
100
- def schema(self) -> Any:
127
+ def schema(self) -> pa.Schema:
101
128
  """
102
129
  Arrow schema of this dataset.
103
130
 
@@ -232,6 +259,11 @@ class StreamCache:
232
259
  requested_schema : object, optional
233
260
  Schema capsule to cast the stream to, or ``None``.
234
261
 
262
+ Returns
263
+ -------
264
+ PyCapsule
265
+ An Arrow C stream capsule wrapping a fresh reader.
266
+
235
267
  """
236
268
  return self._impl.__arrow_c_stream__(requested_schema)
237
269
 
@@ -246,6 +278,11 @@ class StreamCache:
246
278
  :class:`StreamCache`. Then the consumer can ask the producer (in
247
279
  ``__arrow_c_stream__``) to cast the exported data to a supported data type.
248
280
 
281
+ Returns
282
+ -------
283
+ PyCapsule
284
+ An Arrow C schema capsule for the stream's schema.
285
+
249
286
  """
250
287
  return self._impl.__arrow_c_schema__()
251
288
 
@@ -341,12 +378,14 @@ class StreamCacheReader:
341
378
 
342
379
  """
343
380
 
344
- def __init__(self, impl: _PyStreamCacheReader):
381
+ _impl: _PyStreamCacheReader
382
+
383
+ def __init__(self, impl: _PyStreamCacheReader) -> None:
345
384
  """Obtain via :meth:`StreamCache.reader`."""
346
385
  self._impl = impl
347
386
 
348
387
  @property
349
- def schema(self) -> Any:
388
+ def schema(self) -> pa.Schema:
350
389
  """
351
390
  Arrow schema of batches produced by this reader.
352
391
 
@@ -388,6 +427,11 @@ class StreamCacheReader:
388
427
  requested_schema : object, optional
389
428
  Schema capsule to cast the stream to, or ``None``.
390
429
 
430
+ Returns
431
+ -------
432
+ PyCapsule
433
+ An Arrow C stream capsule wrapping this reader.
434
+
391
435
  Raises
392
436
  ------
393
437
  ValueError
@@ -407,6 +451,11 @@ class StreamCacheReader:
407
451
  :class:`StreamCacheReader`. Then the consumer can ask the producer (in
408
452
  ``__arrow_c_stream__``) to cast the exported data to a supported data type.
409
453
 
454
+ Returns
455
+ -------
456
+ PyCapsule
457
+ An Arrow C schema capsule for the reader's schema.
458
+
410
459
  Raises
411
460
  ------
412
461
  ValueError
@@ -416,10 +465,17 @@ class StreamCacheReader:
416
465
  return self._impl.__arrow_c_schema__()
417
466
 
418
467
  def __iter__(self) -> StreamCacheReader:
419
- """Return self as the iterator."""
468
+ """
469
+ Return self as the iterator.
470
+
471
+ Returns
472
+ -------
473
+ StreamCacheReader
474
+
475
+ """
420
476
  return self
421
477
 
422
- def cast(self, target_schema: Any) -> Any:
478
+ def cast(self, target_schema: Any) -> pa.RecordBatchReader:
423
479
  """
424
480
  Cast the reader to produce batches with the given schema.
425
481
 
@@ -445,8 +501,15 @@ class StreamCacheReader:
445
501
  """
446
502
  return self._impl.cast(target_schema)
447
503
 
448
- def __next__(self) -> Any:
449
- """Get the next batch from the reader."""
504
+ def __next__(self) -> pa.RecordBatch:
505
+ """
506
+ Get the next batch from the reader.
507
+
508
+ Returns
509
+ -------
510
+ pyarrow.RecordBatch
511
+
512
+ """
450
513
  return next(iter(self._impl))
451
514
 
452
515
 
@@ -465,12 +528,14 @@ class CastingStreamCache:
465
528
 
466
529
  """
467
530
 
468
- def __init__(self, impl: _PyCastingStreamCache):
531
+ _impl: _PyCastingStreamCache
532
+
533
+ def __init__(self, impl: _PyCastingStreamCache) -> None:
469
534
  """Obtain via :meth:`StreamCache.cast`."""
470
535
  self._impl = impl
471
536
 
472
537
  @property
473
- def schema(self) -> Any:
538
+ def schema(self) -> pa.Schema:
474
539
  """
475
540
  Arrow schema produced by this dataset after casting.
476
541
 
@@ -494,6 +559,11 @@ class CastingStreamCache:
494
559
  Schema capsule to further cast the stream to, or ``None`` (uses
495
560
  :attr:`schema`).
496
561
 
562
+ Returns
563
+ -------
564
+ PyCapsule
565
+ An Arrow C stream capsule wrapping a fresh casting reader.
566
+
497
567
  """
498
568
  return self._impl.__arrow_c_stream__(requested_schema)
499
569
 
@@ -503,6 +573,11 @@ class CastingStreamCache:
503
573
 
504
574
  Returns the target schema so consumers can inspect the post-cast type.
505
575
 
576
+ Returns
577
+ -------
578
+ PyCapsule
579
+ An Arrow C schema capsule for the post-cast schema.
580
+
506
581
  """
507
582
  return self._impl.__arrow_c_schema__()
508
583
 
@@ -33,6 +33,8 @@ class StreamCache:
33
33
  memory_capacity: builtins.int | None = None,
34
34
  disk_path: builtins.str | None = None,
35
35
  disk_capacity: builtins.int | None = None,
36
+ write_policy: builtins.str = ...,
37
+ max_readers: builtins.int | None = None,
36
38
  ) -> StreamCache: ...
37
39
  def reader(self, from_start: builtins.bool = ...) -> StreamCacheReader: ...
38
40
  def __iter__(self) -> StreamCacheReader: ...