batchcorder 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.gitignore +5 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.pre-commit-config.yaml +3 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/Cargo.lock +8 -1
- {batchcorder-0.1.2 → batchcorder-0.1.3}/Cargo.toml +4 -1
- {batchcorder-0.1.2 → batchcorder-0.1.3}/PKG-INFO +13 -1
- {batchcorder-0.1.2 → batchcorder-0.1.3}/README.md +12 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/cache-config.qmd +38 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/eviction.qmd +45 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/reference/api-overview.qmd +1 -1
- {batchcorder-0.1.2 → batchcorder-0.1.3}/pyproject.toml +22 -2
- {batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/__init__.py +86 -11
- {batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/_batchcorder.pyi +2 -0
- batchcorder-0.1.3/src/cached_dataset.rs +1946 -0
- batchcorder-0.1.3/tests/helpers.py +11 -0
- batchcorder-0.1.3/tests/strategies.py +244 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_as_record_batch_reader.py +19 -15
- {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_cast.py +27 -15
- batchcorder-0.1.3/tests/test_construction_properties.py +138 -0
- batchcorder-0.1.3/tests/test_errors.py +358 -0
- batchcorder-0.1.3/tests/test_max_readers.py +229 -0
- batchcorder-0.1.3/tests/test_max_readers_properties.py +171 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_performance.py +14 -3
- {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_stream_cache.py +126 -24
- batchcorder-0.1.3/tests/test_write_policy.py +172 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/uv.lock +24 -0
- batchcorder-0.1.2/src/cached_dataset.rs +0 -902
- batchcorder-0.1.2/tests/test_errors.py +0 -148
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/release-drafter.yml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/build-wheels.yaml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-lint.yaml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-pre-release.yaml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-release.yaml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/ci-test.yaml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.github/workflows/release-drafter.yml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.readthedocs.yaml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/.yamllint.yaml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/CLAUDE.md +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/CONTRIBUTING.md +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/LICENSE +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/Makefile +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/conftest.py +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/.gitignore +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/_quarto.yml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/how-to/duckdb.qmd +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/quarto/tutorials/getting-started.qmd +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/requirements.txt +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/source/api.rst +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/source/conf.py +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/docs/source/index.rst +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/python/batchcorder/py.typed +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/rust-toolchain.toml +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/scripts/build-docs.sh +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/scripts/build-release.sh +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/src/bin/stub_gen.rs +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/src/lib.rs +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_docstrings.py +0 -0
- {batchcorder-0.1.2 → batchcorder-0.1.3}/tests/test_duckdb.py +0 -0
|
@@ -12,6 +12,9 @@ repos:
|
|
|
12
12
|
args: ["--maxkb=500"]
|
|
13
13
|
- id: name-tests-test
|
|
14
14
|
args: ["--pytest-test-first"]
|
|
15
|
+
# strategies.py (shared Hypothesis strategies) and helpers.py (shared
|
|
16
|
+
# test constants) are support modules, not test files.
|
|
17
|
+
exclude: ^tests/(strategies|helpers)\.py$
|
|
15
18
|
|
|
16
19
|
- repo: https://github.com/abravalheri/validate-pyproject
|
|
17
20
|
rev: v0.25
|
|
@@ -131,7 +131,7 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
|
|
131
131
|
|
|
132
132
|
[[package]]
|
|
133
133
|
name = "batchcorder"
|
|
134
|
-
version = "0.1.2"
|
|
134
|
+
version = "0.1.3"
|
|
135
135
|
dependencies = [
|
|
136
136
|
"arrow-array",
|
|
137
137
|
"arrow-ipc",
|
|
@@ -140,6 +140,7 @@ dependencies = [
|
|
|
140
140
|
"pyo3",
|
|
141
141
|
"pyo3-stub-gen",
|
|
142
142
|
"sysinfo",
|
|
143
|
+
"xxhash-rust",
|
|
143
144
|
]
|
|
144
145
|
|
|
145
146
|
[[package]]
|
|
@@ -1319,6 +1320,12 @@ version = "0.51.0"
|
|
|
1319
1320
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1320
1321
|
checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
|
|
1321
1322
|
|
|
1323
|
+
[[package]]
|
|
1324
|
+
name = "xxhash-rust"
|
|
1325
|
+
version = "0.8.15"
|
|
1326
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1327
|
+
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
|
|
1328
|
+
|
|
1322
1329
|
[[package]]
|
|
1323
1330
|
name = "zerocopy"
|
|
1324
1331
|
version = "0.8.42"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "batchcorder"
|
|
3
|
-
version = "0.1.2"
|
|
3
|
+
version = "0.1.3"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
|
|
@@ -30,6 +30,9 @@ arrow-ipc = "58"
|
|
|
30
30
|
# System information for detecting total physical RAM (used as default hot-cache budget)
|
|
31
31
|
sysinfo = { version = "0.33", default-features = false, features = ["system"] }
|
|
32
32
|
|
|
33
|
+
# Fast non-cryptographic hash for cache entry integrity checks
|
|
34
|
+
xxhash-rust = { version = "0.8", features = ["xxh3"] }
|
|
35
|
+
|
|
33
36
|
[lints.rust]
|
|
34
37
|
# Py_GIL_DISABLED is a cfg set by pyo3's build script to indicate free-threaded Python.
|
|
35
38
|
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(Py_GIL_DISABLED)'] }
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: batchcorder
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Classifier: Programming Language :: Rust
|
|
5
5
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
6
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
@@ -99,6 +99,12 @@ for batch in ds:
|
|
|
99
99
|
reader = ds.reader()
|
|
100
100
|
result = pa.RecordBatchReader.from_stream(reader).read_all()
|
|
101
101
|
|
|
102
|
+
# Bounded-memory: evict batches once all readers have passed them
|
|
103
|
+
ds = StreamCache(
|
|
104
|
+
table.to_reader(max_chunksize=1),
|
|
105
|
+
max_readers=2, # at most 2 reads; batches evicted when both advance
|
|
106
|
+
)
|
|
107
|
+
|
|
102
108
|
# Pre-ingest everything upfront
|
|
103
109
|
ds.ingest_all()
|
|
104
110
|
```
|
|
@@ -129,6 +135,12 @@ duckdb.table("ds") # DuckDB
|
|
|
129
135
|
- **Replay from any position**: `ds.reader(from_start=True)` (default) replays
|
|
130
136
|
from batch 0; `ds.reader(from_start=False)` starts from the current ingestion
|
|
131
137
|
frontier (next batch not yet ingested).
|
|
138
|
+
- **Bounded-memory streaming**: set `max_readers=N` to evict batches once all
|
|
139
|
+
`N` readers exist and have advanced past them — eviction does not begin
|
|
140
|
+
until all `N` readers have been created. `max_readers` is a hard cap on
|
|
141
|
+
total readers ever created (dropping a reader does not free a slot). Once
|
|
142
|
+
eviction has started, `reader(from_start=True)` raises `ValueError`.
|
|
143
|
+
When unset, all batches are retained indefinitely.
|
|
132
144
|
|
|
133
145
|
## Development
|
|
134
146
|
|
|
@@ -75,6 +75,12 @@ for batch in ds:
|
|
|
75
75
|
reader = ds.reader()
|
|
76
76
|
result = pa.RecordBatchReader.from_stream(reader).read_all()
|
|
77
77
|
|
|
78
|
+
# Bounded-memory: evict batches once all readers have passed them
|
|
79
|
+
ds = StreamCache(
|
|
80
|
+
table.to_reader(max_chunksize=1),
|
|
81
|
+
max_readers=2, # at most 2 reads; batches evicted when both advance
|
|
82
|
+
)
|
|
83
|
+
|
|
78
84
|
# Pre-ingest everything upfront
|
|
79
85
|
ds.ingest_all()
|
|
80
86
|
```
|
|
@@ -105,6 +111,12 @@ duckdb.table("ds") # DuckDB
|
|
|
105
111
|
- **Replay from any position**: `ds.reader(from_start=True)` (default) replays
|
|
106
112
|
from batch 0; `ds.reader(from_start=False)` starts from the current ingestion
|
|
107
113
|
frontier (next batch not yet ingested).
|
|
114
|
+
- **Bounded-memory streaming**: set `max_readers=N` to evict batches once all
|
|
115
|
+
`N` readers exist and have advanced past them — eviction does not begin
|
|
116
|
+
until all `N` readers have been created. `max_readers` is a hard cap on
|
|
117
|
+
total readers ever created (dropping a reader does not free a slot). Once
|
|
118
|
+
eviction has started, `reader(from_start=True)` raises `ValueError`.
|
|
119
|
+
When unset, all batches are retained indefinitely.
|
|
108
120
|
|
|
109
121
|
## Development
|
|
110
122
|
|
|
@@ -64,6 +64,44 @@ Set `memory_capacity` to cover the data your hottest readers are actively
|
|
|
64
64
|
consuming. Batches that exceed the hot budget are served directly from disk
|
|
65
65
|
with no in-memory copy.
|
|
66
66
|
|
|
67
|
+
## Bounded-memory streaming with `max_readers`
|
|
68
|
+
|
|
69
|
+
When you know how many times the stream will be read, set `max_readers` to
|
|
70
|
+
evict batches once all readers have advanced past them. This keeps memory
|
|
71
|
+
usage proportional to the window between the slowest and fastest reader,
|
|
72
|
+
rather than the full stream.
|
|
73
|
+
|
|
74
|
+
```{python}
|
|
75
|
+
#| eval: false
|
|
76
|
+
ds = StreamCache(
|
|
77
|
+
source,
|
|
78
|
+
max_readers=2, # at most 2 reads of the stream
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
r1 = ds.reader()
|
|
82
|
+
r2 = ds.reader()
|
|
83
|
+
|
|
84
|
+
# As both readers advance, batches behind the slowest are freed.
|
|
85
|
+
result1 = pa.RecordBatchReader.from_stream(r1).read_all()
|
|
86
|
+
result2 = pa.RecordBatchReader.from_stream(r2).read_all()
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
**Important:** `max_readers` is a hard cap on the total number of readers
|
|
90
|
+
ever created — dropping a reader does not free a slot for a new one. Once
|
|
91
|
+
all `max_readers` readers have been created, no further readers can be
|
|
92
|
+
obtained. Eviction does not begin until all `max_readers` readers have
|
|
93
|
+
actually been created: with fewer readers, every batch is retained so that
|
|
94
|
+
future readers can still replay from the start. Additionally, once batches
|
|
95
|
+
have been evicted, `reader(from_start=True)` raises `ValueError` because
|
|
96
|
+
batch 0 is no longer available.
|
|
97
|
+
|
|
98
|
+
For disk-backed caches, eviction frees memory (hot layer and index entries)
|
|
99
|
+
but not bytes already written to the append-only cache file — disk space is
|
|
100
|
+
reclaimed only when the cache is closed or dropped.
|
|
101
|
+
|
|
102
|
+
When `max_readers` is omitted (default), all batches are retained
|
|
103
|
+
indefinitely and unlimited readers are allowed.
|
|
104
|
+
|
|
67
105
|
## Training-loop workloads
|
|
68
106
|
|
|
69
107
|
For ML training loops where you replay the full stream each epoch, call
|
|
@@ -69,6 +69,51 @@ ds.ingest_all() # upstream fully consumed; cache is complete
|
|
|
69
69
|
readers = [ds.reader() for _ in range(8)]
|
|
70
70
|
```
|
|
71
71
|
|
|
72
|
+
## Bounded concurrent reads with `max_readers`
|
|
73
|
+
|
|
74
|
+
If you know the total number of readers that will ever be created, set `max_readers` to
|
|
75
|
+
enable automatic eviction. Batches are freed from cache once all readers
|
|
76
|
+
have advanced past them:
|
|
77
|
+
|
|
78
|
+
```{python}
|
|
79
|
+
#| eval: false
|
|
80
|
+
ds = StreamCache(source, max_readers=4)
|
|
81
|
+
results = [None] * 4
|
|
82
|
+
|
|
83
|
+
# Create readers in the main thread to guarantee all slots are claimed.
|
|
84
|
+
readers = [ds.reader() for _ in range(4)]
|
|
85
|
+
|
|
86
|
+
def read(i):
|
|
87
|
+
results[i] = pa.RecordBatchReader.from_stream(readers[i]).read_all()
|
|
88
|
+
|
|
89
|
+
threads = [threading.Thread(target=read, args=(i,)) for i in range(4)]
|
|
90
|
+
for t in threads:
|
|
91
|
+
t.start()
|
|
92
|
+
for t in threads:
|
|
93
|
+
t.join()
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
This keeps memory proportional to the gap between the fastest and slowest
|
|
97
|
+
reader. Without `max_readers`, all batches are retained for the lifetime of
|
|
98
|
+
the cache.
|
|
99
|
+
|
|
100
|
+
**Important:** `max_readers` is a hard cap on the total number of readers
|
|
101
|
+
ever created, not the number of concurrent readers. Dropping a reader does
|
|
102
|
+
not free a slot for a new one. Once all `max_readers` readers have been
|
|
103
|
+
created, no further readers can be obtained. Eviction does not begin until
|
|
104
|
+
all `max_readers` readers have actually been created — with fewer readers,
|
|
105
|
+
every batch is retained so future readers can still replay from the start.
|
|
106
|
+
|
|
107
|
+
Also note that once eviction has begun, `reader(from_start=True)` raises
|
|
108
|
+
`ValueError` because batch 0 is no longer available. Anything that creates
|
|
109
|
+
a reader counts against the cap: `ds.reader()`, `iter(ds)`,
|
|
110
|
+
`ds.__arrow_c_stream__()` (so each engine scan of `ds`), and each read of a
|
|
111
|
+
`cast()` result all consume one slot.
|
|
112
|
+
|
|
113
|
+
For disk-backed caches, eviction frees memory (hot layer and index entries)
|
|
114
|
+
but not bytes already written to the append-only cache file — disk space is
|
|
115
|
+
reclaimed only when the cache is closed or dropped.
|
|
116
|
+
|
|
72
117
|
## Reading from `__arrow_c_stream__`
|
|
73
118
|
|
|
74
119
|
`StreamCache` itself implements `__arrow_c_stream__`, which creates a fresh
|
|
@@ -15,7 +15,7 @@ hybrid memory + disk cache.
|
|
|
15
15
|
|
|
16
16
|
| Method / property | Description |
|
|
17
17
|
|--------------------------|-------------|
|
|
18
|
-
| `__init__(...)` | Create a StreamCache from any `__arrow_c_stream__` source. |
|
|
18
|
+
| `__init__(...)` | Create a StreamCache from any `__arrow_c_stream__` source. Accepts optional `max_readers` to enable bounded-memory eviction. |
|
|
19
19
|
| `reader(from_start=True)`| Return a new independent reader handle. |
|
|
20
20
|
| `ingest_all()` | Eagerly consume the upstream source into the cache. |
|
|
21
21
|
| `schema` | Arrow schema of the stream. |
|
|
@@ -40,6 +40,7 @@ dev = [
|
|
|
40
40
|
"maturin>=1.7,<2.0",
|
|
41
41
|
"pyarrow>=18",
|
|
42
42
|
"pytest>=8",
|
|
43
|
+
"hypothesis>=6",
|
|
43
44
|
"duckdb>=1.5",
|
|
44
45
|
"ruff>=0.15.6",
|
|
45
46
|
"pre-commit>=3.5.0",
|
|
@@ -69,6 +70,14 @@ skip = "uv.lock,docs/requirements.txt"
|
|
|
69
70
|
extend-include = ["*.md"]
|
|
70
71
|
|
|
71
72
|
[tool.ruff.lint]
|
|
73
|
+
# Preview is required for the pydoclint (DOC) rules. Scoped two ways: it is
|
|
74
|
+
# lint-only (the formatter stays on stable style), and explicit-preview-rules
|
|
75
|
+
# limits it to the exact preview codes listed in extend-select — prefix
|
|
76
|
+
# selectors (RUF, B, UP, ...) keep selecting stable rules only, so a ruff
|
|
77
|
+
# upgrade cannot silently enable new preview rules (ruff is pinned at the
|
|
78
|
+
# matching rev in .pre-commit-config.yaml).
|
|
79
|
+
preview = true
|
|
80
|
+
explicit-preview-rules = true
|
|
72
81
|
ignore = [
|
|
73
82
|
"D203", # conflicts with D211 (no-blank-line-before-class); D211 wins
|
|
74
83
|
"D212", # conflicts with D213 (multi-line-summary-second-line); D213 wins
|
|
@@ -83,14 +92,25 @@ extend-select = [
|
|
|
83
92
|
"SIM",
|
|
84
93
|
"PT",
|
|
85
94
|
"D",
|
|
95
|
+
# pydoclint, selected by exact code (preview rules; see explicit-preview-rules)
|
|
96
|
+
"DOC201", # return value not documented in docstring
|
|
97
|
+
"DOC202", # docstring documents a return the body does not have
|
|
98
|
+
"DOC402", # yielded value not documented in docstring
|
|
99
|
+
"DOC403", # docstring documents yields the body does not have
|
|
100
|
+
"DOC501", # explicitly raised exception missing from docstring
|
|
101
|
+
"DOC502", # docstring documents an exception the body does not raise
|
|
86
102
|
"RUF",
|
|
87
103
|
"PYI",
|
|
88
104
|
"UP",
|
|
89
105
|
]
|
|
90
106
|
|
|
91
107
|
[tool.ruff.lint.per-file-ignores]
|
|
92
|
-
|
|
93
|
-
"
|
|
108
|
+
# D/DOC: docstring section structure is not enforced in tests.
|
|
109
|
+
"tests/**" = ["D", "DOC"]
|
|
110
|
+
"conftest.py" = ["D", "DOC"]
|
|
111
|
+
# DOC502: the wrappers in __init__.py document exceptions raised inside the
|
|
112
|
+
# Rust delegate, which pydoclint cannot see across the FFI boundary.
|
|
113
|
+
"python/batchcorder/__init__.py" = ["DOC502"]
|
|
94
114
|
|
|
95
115
|
[tool.ruff.lint.isort]
|
|
96
116
|
known-first-party = ["batchcorder"]
|
|
@@ -9,6 +9,8 @@ from typing import TYPE_CHECKING
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
10
|
from typing import Any
|
|
11
11
|
|
|
12
|
+
import pyarrow as pa
|
|
13
|
+
|
|
12
14
|
from ._batchcorder import (
|
|
13
15
|
CastingStreamCache as _PyCastingStreamCache,
|
|
14
16
|
)
|
|
@@ -62,6 +64,25 @@ class StreamCache:
|
|
|
62
64
|
disk_capacity : int, optional
|
|
63
65
|
On-disk storage budget in bytes.
|
|
64
66
|
Must be provided together with ``disk_path``.
|
|
67
|
+
write_policy : str, optional
|
|
68
|
+
When batches are flushed to disk (disk mode only; ignored in
|
|
69
|
+
memory-only mode). ``"on_insertion"`` (default) writes every batch to
|
|
70
|
+
disk immediately. ``"on_eviction"`` keeps batches in the hot layer and
|
|
71
|
+
only writes them to disk when evicted, so a hot layer large enough to
|
|
72
|
+
hold the whole stream never touches disk.
|
|
73
|
+
max_readers : int, optional
|
|
74
|
+
Hard cap on the total number of readers ever created from this cache.
|
|
75
|
+
When set, batches are evicted once all readers have advanced past them,
|
|
76
|
+
enabling bounded-memory streaming. Eviction only begins once all
|
|
77
|
+
``max_readers`` readers have actually been created — with fewer
|
|
78
|
+
readers every batch is retained so future readers can still replay
|
|
79
|
+
from the start. Dropping a reader does **not** free a slot — once
|
|
80
|
+
``max_readers`` readers have been created, no more can be obtained.
|
|
81
|
+
``reader(from_start=True)`` raises ``ValueError`` if batch 0 has
|
|
82
|
+
already been evicted. For disk-backed caches, eviction frees memory
|
|
83
|
+
(hot layer and index) but not bytes in the append-only cache file —
|
|
84
|
+
disk space is reclaimed only when the cache is closed or dropped.
|
|
85
|
+
When ``None`` (default), all batches are retained indefinitely.
|
|
65
86
|
|
|
66
87
|
Examples
|
|
67
88
|
--------
|
|
@@ -86,18 +107,24 @@ class StreamCache:
|
|
|
86
107
|
|
|
87
108
|
"""
|
|
88
109
|
|
|
110
|
+
_impl: _PyStreamCache
|
|
111
|
+
|
|
89
112
|
def __init__(
|
|
90
113
|
self,
|
|
91
114
|
reader: Any,
|
|
92
115
|
memory_capacity: int | None = None,
|
|
93
116
|
disk_path: str | None = None,
|
|
94
117
|
disk_capacity: int | None = None,
|
|
95
|
-
|
|
118
|
+
write_policy: str = "on_insertion",
|
|
119
|
+
max_readers: int | None = None,
|
|
120
|
+
) -> None:
|
|
96
121
|
"""See class docstring for parameter documentation."""
|
|
97
|
-
self._impl = _PyStreamCache(
|
|
122
|
+
self._impl = _PyStreamCache(
|
|
123
|
+
reader, memory_capacity, disk_path, disk_capacity, write_policy, max_readers
|
|
124
|
+
)
|
|
98
125
|
|
|
99
126
|
@property
|
|
100
|
-
def schema(self) ->
|
|
127
|
+
def schema(self) -> pa.Schema:
|
|
101
128
|
"""
|
|
102
129
|
Arrow schema of this dataset.
|
|
103
130
|
|
|
@@ -232,6 +259,11 @@ class StreamCache:
|
|
|
232
259
|
requested_schema : object, optional
|
|
233
260
|
Schema capsule to cast the stream to, or ``None``.
|
|
234
261
|
|
|
262
|
+
Returns
|
|
263
|
+
-------
|
|
264
|
+
PyCapsule
|
|
265
|
+
An Arrow C stream capsule wrapping a fresh reader.
|
|
266
|
+
|
|
235
267
|
"""
|
|
236
268
|
return self._impl.__arrow_c_stream__(requested_schema)
|
|
237
269
|
|
|
@@ -246,6 +278,11 @@ class StreamCache:
|
|
|
246
278
|
:class:`StreamCache`. Then the consumer can ask the producer (in
|
|
247
279
|
``__arrow_c_stream__``) to cast the exported data to a supported data type.
|
|
248
280
|
|
|
281
|
+
Returns
|
|
282
|
+
-------
|
|
283
|
+
PyCapsule
|
|
284
|
+
An Arrow C schema capsule for the stream's schema.
|
|
285
|
+
|
|
249
286
|
"""
|
|
250
287
|
return self._impl.__arrow_c_schema__()
|
|
251
288
|
|
|
@@ -341,12 +378,14 @@ class StreamCacheReader:
|
|
|
341
378
|
|
|
342
379
|
"""
|
|
343
380
|
|
|
344
|
-
|
|
381
|
+
_impl: _PyStreamCacheReader
|
|
382
|
+
|
|
383
|
+
def __init__(self, impl: _PyStreamCacheReader) -> None:
|
|
345
384
|
"""Obtain via :meth:`StreamCache.reader`."""
|
|
346
385
|
self._impl = impl
|
|
347
386
|
|
|
348
387
|
@property
|
|
349
|
-
def schema(self) ->
|
|
388
|
+
def schema(self) -> pa.Schema:
|
|
350
389
|
"""
|
|
351
390
|
Arrow schema of batches produced by this reader.
|
|
352
391
|
|
|
@@ -388,6 +427,11 @@ class StreamCacheReader:
|
|
|
388
427
|
requested_schema : object, optional
|
|
389
428
|
Schema capsule to cast the stream to, or ``None``.
|
|
390
429
|
|
|
430
|
+
Returns
|
|
431
|
+
-------
|
|
432
|
+
PyCapsule
|
|
433
|
+
An Arrow C stream capsule wrapping this reader.
|
|
434
|
+
|
|
391
435
|
Raises
|
|
392
436
|
------
|
|
393
437
|
ValueError
|
|
@@ -407,6 +451,11 @@ class StreamCacheReader:
|
|
|
407
451
|
:class:`StreamCacheReader`. Then the consumer can ask the producer (in
|
|
408
452
|
``__arrow_c_stream__``) to cast the exported data to a supported data type.
|
|
409
453
|
|
|
454
|
+
Returns
|
|
455
|
+
-------
|
|
456
|
+
PyCapsule
|
|
457
|
+
An Arrow C schema capsule for the reader's schema.
|
|
458
|
+
|
|
410
459
|
Raises
|
|
411
460
|
------
|
|
412
461
|
ValueError
|
|
@@ -416,10 +465,17 @@ class StreamCacheReader:
|
|
|
416
465
|
return self._impl.__arrow_c_schema__()
|
|
417
466
|
|
|
418
467
|
def __iter__(self) -> StreamCacheReader:
|
|
419
|
-
"""
|
|
468
|
+
"""
|
|
469
|
+
Return self as the iterator.
|
|
470
|
+
|
|
471
|
+
Returns
|
|
472
|
+
-------
|
|
473
|
+
StreamCacheReader
|
|
474
|
+
|
|
475
|
+
"""
|
|
420
476
|
return self
|
|
421
477
|
|
|
422
|
-
def cast(self, target_schema: Any) ->
|
|
478
|
+
def cast(self, target_schema: Any) -> pa.RecordBatchReader:
|
|
423
479
|
"""
|
|
424
480
|
Cast the reader to produce batches with the given schema.
|
|
425
481
|
|
|
@@ -445,8 +501,15 @@ class StreamCacheReader:
|
|
|
445
501
|
"""
|
|
446
502
|
return self._impl.cast(target_schema)
|
|
447
503
|
|
|
448
|
-
def __next__(self) ->
|
|
449
|
-
"""
|
|
504
|
+
def __next__(self) -> pa.RecordBatch:
|
|
505
|
+
"""
|
|
506
|
+
Get the next batch from the reader.
|
|
507
|
+
|
|
508
|
+
Returns
|
|
509
|
+
-------
|
|
510
|
+
pyarrow.RecordBatch
|
|
511
|
+
|
|
512
|
+
"""
|
|
450
513
|
return next(iter(self._impl))
|
|
451
514
|
|
|
452
515
|
|
|
@@ -465,12 +528,14 @@ class CastingStreamCache:
|
|
|
465
528
|
|
|
466
529
|
"""
|
|
467
530
|
|
|
468
|
-
|
|
531
|
+
_impl: _PyCastingStreamCache
|
|
532
|
+
|
|
533
|
+
def __init__(self, impl: _PyCastingStreamCache) -> None:
|
|
469
534
|
"""Obtain via :meth:`StreamCache.cast`."""
|
|
470
535
|
self._impl = impl
|
|
471
536
|
|
|
472
537
|
@property
|
|
473
|
-
def schema(self) ->
|
|
538
|
+
def schema(self) -> pa.Schema:
|
|
474
539
|
"""
|
|
475
540
|
Arrow schema produced by this dataset after casting.
|
|
476
541
|
|
|
@@ -494,6 +559,11 @@ class CastingStreamCache:
|
|
|
494
559
|
Schema capsule to further cast the stream to, or ``None`` (uses
|
|
495
560
|
:attr:`schema`).
|
|
496
561
|
|
|
562
|
+
Returns
|
|
563
|
+
-------
|
|
564
|
+
PyCapsule
|
|
565
|
+
An Arrow C stream capsule wrapping a fresh casting reader.
|
|
566
|
+
|
|
497
567
|
"""
|
|
498
568
|
return self._impl.__arrow_c_stream__(requested_schema)
|
|
499
569
|
|
|
@@ -503,6 +573,11 @@ class CastingStreamCache:
|
|
|
503
573
|
|
|
504
574
|
Returns the target schema so consumers can inspect the post-cast type.
|
|
505
575
|
|
|
576
|
+
Returns
|
|
577
|
+
-------
|
|
578
|
+
PyCapsule
|
|
579
|
+
An Arrow C schema capsule for the post-cast schema.
|
|
580
|
+
|
|
506
581
|
"""
|
|
507
582
|
return self._impl.__arrow_c_schema__()
|
|
508
583
|
|
|
@@ -33,6 +33,8 @@ class StreamCache:
|
|
|
33
33
|
memory_capacity: builtins.int | None = None,
|
|
34
34
|
disk_path: builtins.str | None = None,
|
|
35
35
|
disk_capacity: builtins.int | None = None,
|
|
36
|
+
write_policy: builtins.str = ...,
|
|
37
|
+
max_readers: builtins.int | None = None,
|
|
36
38
|
) -> StreamCache: ...
|
|
37
39
|
def reader(self, from_start: builtins.bool = ...) -> StreamCacheReader: ...
|
|
38
40
|
def __iter__(self) -> StreamCacheReader: ...
|