yoro-cache 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yoro_cache-0.1.1/LICENSE +21 -0
- yoro_cache-0.1.1/PKG-INFO +213 -0
- yoro_cache-0.1.1/README.md +188 -0
- yoro_cache-0.1.1/bench/__init__.py +33 -0
- yoro_cache-0.1.1/bench/budget.py +81 -0
- yoro_cache-0.1.1/bench/checkpoint.py +67 -0
- yoro_cache-0.1.1/bench/convergence.py +81 -0
- yoro_cache-0.1.1/bench/datasets.py +432 -0
- yoro_cache-0.1.1/bench/eventlog.py +137 -0
- yoro_cache-0.1.1/bench/ladder.py +183 -0
- yoro_cache-0.1.1/bench/metrics.py +96 -0
- yoro_cache-0.1.1/bench/model_client.py +134 -0
- yoro_cache-0.1.1/bench/run_phase0.py +557 -0
- yoro_cache-0.1.1/bench/spike_replay.py +123 -0
- yoro_cache-0.1.1/bench/vast.py +79 -0
- yoro_cache-0.1.1/bench/wandb_log.py +49 -0
- yoro_cache-0.1.1/pyproject.toml +41 -0
- yoro_cache-0.1.1/setup.cfg +4 -0
- yoro_cache-0.1.1/tests/test_bench.py +496 -0
- yoro_cache-0.1.1/tests/test_extensions.py +76 -0
- yoro_cache-0.1.1/tests/test_proxy.py +193 -0
- yoro_cache-0.1.1/tests/test_yoro.py +171 -0
- yoro_cache-0.1.1/yoro/__init__.py +52 -0
- yoro_cache-0.1.1/yoro/behaviors.py +108 -0
- yoro_cache-0.1.1/yoro/cache.py +140 -0
- yoro_cache-0.1.1/yoro/cli.py +68 -0
- yoro_cache-0.1.1/yoro/core.py +150 -0
- yoro_cache-0.1.1/yoro/embeddings.py +98 -0
- yoro_cache-0.1.1/yoro/invalidation.py +50 -0
- yoro_cache-0.1.1/yoro/keyer.py +102 -0
- yoro_cache-0.1.1/yoro/matcher.py +46 -0
- yoro_cache-0.1.1/yoro/opencode_behaviors.py +223 -0
- yoro_cache-0.1.1/yoro/proxy.py +574 -0
- yoro_cache-0.1.1/yoro/structured.py +84 -0
- yoro_cache-0.1.1/yoro/tree.py +60 -0
- yoro_cache-0.1.1/yoro_cache.egg-info/PKG-INFO +213 -0
- yoro_cache-0.1.1/yoro_cache.egg-info/SOURCES.txt +39 -0
- yoro_cache-0.1.1/yoro_cache.egg-info/dependency_links.txt +1 -0
- yoro_cache-0.1.1/yoro_cache.egg-info/entry_points.txt +2 -0
- yoro_cache-0.1.1/yoro_cache.egg-info/requires.txt +8 -0
- yoro_cache-0.1.1/yoro_cache.egg-info/top_level.txt +2 -0
yoro_cache-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Venkata Sai Chaitanya Pinapaka
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: yoro-cache
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: YORO — You Only Reason Once. A drop-in LLM caching proxy that invalidates on change and replays cached reasoning instead of serving stale answers.
|
|
5
|
+
Author: Chaitanya Pinapaka
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://yorocache.com
|
|
8
|
+
Project-URL: Repository, https://github.com/ChaitanyaPinapaka/yoro-cache
|
|
9
|
+
Keywords: llm,cache,semantic-cache,agents,inference,proxy
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: numpy>=1.26
|
|
19
|
+
Requires-Dist: requests>=2.30
|
|
20
|
+
Provides-Extra: embed
|
|
21
|
+
Requires-Dist: sentence-transformers>=3.0; extra == "embed"
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# YORO — You Only Reason Once
|
|
27
|
+
|
|
28
|
+

|
|
29
|
+

|
|
30
|
+
|
|
31
|
+
YORO is an OpenAI-compatible caching proxy for LLM applications. Unlike a plain
|
|
32
|
+
semantic cache, it tracks what each cached answer depends on and invalidates
|
|
33
|
+
entries when those dependencies change, so it never serves an answer whose
|
|
34
|
+
premises moved. The engine can also re-apply the cached reasoning to new inputs
|
|
35
|
+
(*replay*) instead of re-deriving from scratch — available in the library today,
|
|
36
|
+
wired into the proxy in the next release.
|
|
37
|
+
|
|
38
|
+
Website: [yorocache.com](https://yorocache.com)
|
|
39
|
+
|
|
40
|
+
## Why
|
|
41
|
+
|
|
42
|
+
Semantic caches (GPTCache and similar) serve a cached answer whenever a new request
|
|
43
|
+
is embedding-similar to a previous one. This saves tokens, but it has a failure mode
|
|
44
|
+
that standard cache metrics do not surface: when the world changes, the cache keeps
|
|
45
|
+
serving the old answer. In our measurements, a drift rate of just 5% (5% of recurring
|
|
46
|
+
tasks whose true answer has changed) already makes over half of a naive cache's hits
|
|
47
|
+
wrong, because popular items drift too and every later hit serves the dead answer.
|
|
48
|
+
|
|
49
|
+
Adding invalidation alone is not sufficient. In agent workloads, the *method* behind
|
|
50
|
+
an answer often lives in earlier interactions rather than in the current request. An
|
|
51
|
+
invalidating cache correctly drops the stale entry, then re-derives without the
|
|
52
|
+
method, caches the wrong result, and serves it — a failure mode we call
|
|
53
|
+
*re-poisoning*. YORO addresses both failure modes: dependency fingerprints handle
|
|
54
|
+
detection, and replay of the stored reasoning handles re-derivation.
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install "yoro-cache[embed]"
|
|
60
|
+
# before the first PyPI release:
|
|
61
|
+
pip install "yoro-cache[embed] @ git+https://github.com/ChaitanyaPinapaka/yoro-cache"
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Requires Python 3.10+. The `[embed]` extra installs `sentence-transformers` for
|
|
65
|
+
semantic matching; without it the library still works with the hash embedder or an
|
|
66
|
+
external embedding endpoint.
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
|
|
70
|
+
Run the proxy in front of any OpenAI-compatible endpoint (vLLM, llama.cpp server,
|
|
71
|
+
OpenRouter, ...), then point your client at it. The worked example below is the
|
|
72
|
+
setup this README was tested on — a local 35B reasoning model on an M-series Mac:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# 1. serve a local model via llama.cpp (brew install llama.cpp)
|
|
76
|
+
llama-server -hf deepreinforce-ai/Ornith-1.0-35B-GGUF --port 8000
|
|
77
|
+
|
|
78
|
+
# 2. put YORO in front of it
|
|
79
|
+
YORO_UPSTREAM=http://127.0.0.1:8000/v1 yoro serve # listens on :8400
|
|
80
|
+
|
|
81
|
+
# 3. point any OpenAI-compatible client at the proxy
|
|
82
|
+
export OPENAI_BASE_URL=http://127.0.0.1:8400/v1
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
On this setup, a repeated ask serves from cache in ~12 ms against ~3.3 s upstream,
|
|
86
|
+
with the cached reasoning trace preserved in the response.
|
|
87
|
+
|
|
88
|
+
To use YORO under [OpenCode](https://opencode.ai), register the proxy as a custom
|
|
89
|
+
provider in `opencode.json`:
|
|
90
|
+
|
|
91
|
+
```json
|
|
92
|
+
{ "provider": { "yoro": {
|
|
93
|
+
"npm": "@ai-sdk/openai-compatible",
|
|
94
|
+
"options": { "baseURL": "http://127.0.0.1:8400/v1" },
|
|
95
|
+
"models": { "ornith-35b": {} } } } }
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
The safe policy caches OpenCode's plain question turns and passes its tool-bearing
|
|
99
|
+
(agentic) turns through untouched.
|
|
100
|
+
|
|
101
|
+
To scope a cache entry to workspace state, pass dependency fingerprints. An entry
|
|
102
|
+
only serves while its fingerprints match what was stored; when they change, the
|
|
103
|
+
entry stops serving and the request is re-reasoned upstream:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from openai import OpenAI
|
|
107
|
+
|
|
108
|
+
client = OpenAI(base_url="http://127.0.0.1:8400/v1", api_key="unused-locally")
|
|
109
|
+
r = client.chat.completions.create(
|
|
110
|
+
model="your-model",
|
|
111
|
+
messages=[{"role": "user", "content": "Recompute the rollup for March"}],
|
|
112
|
+
extra_headers={"X-YORO-Deps": "rollup.csv:9f3ab2"},
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Every response reports the cache decision, and `yoro stats` (or
|
|
117
|
+
`GET /yoro/stats`) shows running totals.
|
|
118
|
+
|
|
119
|
+
| Header | Direction | Meaning |
|
|
120
|
+
|---|---|---|
|
|
121
|
+
| `X-YORO-Deps` | request | `name:fingerprint,...` — entry serves only while these match |
|
|
122
|
+
| `X-YORO-Cache: 0` / `1` | request | force caching off / on for this call |
|
|
123
|
+
| `X-YORO-Cache` | response | `HIT`, `MISS`, or `SKIP:<reason>` |
|
|
124
|
+
| `X-YORO-Sim` | response | similarity of the matched entry (on hits) |
|
|
125
|
+
|
|
126
|
+
### Configuration
|
|
127
|
+
|
|
128
|
+
| Variable | Default | |
|
|
129
|
+
|---|---|---|
|
|
130
|
+
| `YORO_UPSTREAM` | `http://127.0.0.1:8000/v1` | upstream OpenAI-compatible endpoint |
|
|
131
|
+
| `YORO_PORT` | `8400` | proxy listen port |
|
|
132
|
+
| `YORO_POLICY` | `safe` | `safe` refuses to cache tool-bearing or sampled turns; `aggressive` caches them |
|
|
133
|
+
| `YORO_TAU_HIT` / `YORO_TAU_MISS` | `0.95` / `0.6` | reuse-acceptance / novelty thresholds |
|
|
134
|
+
| `YORO_EMBED` | `all-MiniLM-L6-v2` | sentence-transformers model for matching |
|
|
135
|
+
| `YORO_CACHE_PATH` | `~/.yoro/proxy_cache.json` | persistent cache location |
|
|
136
|
+
|
|
137
|
+
The default policy is deliberately conservative: requests that carry tools, contain
|
|
138
|
+
tool history, or use `temperature > 0.2` pass through uncached, because a stale hit
|
|
139
|
+
in an agentic flow can corrupt real work. Caching such turns is an explicit opt-in.
|
|
140
|
+
|
|
141
|
+
## How it works
|
|
142
|
+
|
|
143
|
+
Each request is embedded and matched against the case store, then routed to the
|
|
144
|
+
cheapest tier that is safe:
|
|
145
|
+
|
|
146
|
+
1. **Serve** — the matched entry is fresh and similarity is high: return the cached
|
|
147
|
+
answer with no model call.
|
|
148
|
+
2. **Replay** — same entry, but its dependencies changed: inject the stored
|
|
149
|
+
reasoning trace and apply it to the new inputs. Short output; no re-exploration.
|
|
150
|
+
(Library + benchmark today; proxy integration lands in the next release.)
|
|
151
|
+
3. **Reason** — novel or borderline request: full reasoning upstream; the trace,
|
|
152
|
+
answer, and dependency fingerprints are cached.
|
|
153
|
+
|
|
154
|
+
A novelty gate escalates look-alike-but-different requests to re-reasoning instead
|
|
155
|
+
of force-fitting them into a near-match — trading some hit rate for correctness.
|
|
156
|
+
|
|
157
|
+
## Evaluation
|
|
158
|
+
|
|
159
|
+
The claims above are measured on gpt-oss-120B (H100, vLLM) and reproduced on
|
|
160
|
+
Qwen2.5-32B-Instruct-AWQ (4-bit, one consumer RTX 5090), across controlled sweeps of
|
|
161
|
+
drift rate, near-miss rate, and invalidation-signal fidelity — 25 sweep levels,
|
|
162
|
+
1,027 runs, 616,200 scored queries, 72.7M tokens in total. Selected results at
|
|
163
|
+
drift 0.4 on the method-in-history workload:
|
|
164
|
+
|
|
165
|
+
| | GPTCache-style | YORO serve-only | YORO replay | YORO replay (low effort) | no cache |
|
|
166
|
+
|---|---|---|---|---|---|
|
|
167
|
+
| Accuracy | 0.16 | 0.16 | **0.96** | 0.92 | 0.07 |
|
|
168
|
+
| Output tokens vs no-cache | 4% | 42% | 21% | 10% | 100% |
|
|
169
|
+
|
|
170
|
+
- On self-contained workloads, a no-invalidation cache reaches staleness 0.90
|
|
171
|
+
(share of hits serving a wrong answer) as drift rises; YORO holds ~0.00 at the
|
|
172
|
+
same matched thresholds, with accuracy 1.00.
|
|
173
|
+
- Wrong serves split into two mechanistically different failure modes: *outdated*
|
|
174
|
+
(served an answer that was once correct) and *re-poisoned* (served an answer that
|
|
175
|
+
was never correct). The no-invalidation cache fails mostly outdated; an
|
|
176
|
+
invalidating cache without replay fails ~99% re-poisoned; replay reduces both to
|
|
177
|
+
near zero. Accuracy alone cannot distinguish these; the taxonomy metrics
|
|
178
|
+
(`outdated_rate`, `repoisoned_rate`) can.
|
|
179
|
+
- Weakening the invalidation signal degrades YORO gracefully — staleness tracks the
|
|
180
|
+
share of missed signals and converges to naive-cache behavior at zero signal.
|
|
181
|
+
|
|
182
|
+
The full benchmark harness (sweep driver, workload generators, taxonomy metrics,
|
|
183
|
+
and the result curves behind these numbers) lands in this repository in an upcoming
|
|
184
|
+
release.
|
|
185
|
+
|
|
186
|
+
## Scope and limitations
|
|
187
|
+
|
|
188
|
+
- The replay result is measured in the *method-in-history* regime, where re-asks
|
|
189
|
+
reference a procedure established earlier — the normal case for long-running
|
|
190
|
+
agents. If every request restates its full context, a plain cache with
|
|
191
|
+
invalidation performs equally well on correctness.
|
|
192
|
+
- Replay is validated on multi-step arithmetic procedures; non-numeric procedures
|
|
193
|
+
(extraction rules, rubrics, tool plans) have not yet been evaluated.
|
|
194
|
+
- Replay quality depends on the invalidation signal. Without dependency
|
|
195
|
+
fingerprints, YORO falls back to conservative matching and behaves like a
|
|
196
|
+
gated semantic cache.
|
|
197
|
+
- Related work: Buffer of Thoughts, Metacognitive Reuse, and Analogical Prompting
|
|
198
|
+
reuse reasoning templates. YORO's contribution is making reuse safe and
|
|
199
|
+
accounted for: invalidation, the failure-mode taxonomy, and separate input/output
|
|
200
|
+
token accounting.
|
|
201
|
+
|
|
202
|
+
## Repository layout
|
|
203
|
+
|
|
204
|
+
```
|
|
205
|
+
yoro/ library and proxy: cache, matcher, invalidation, replay, CLI
|
|
206
|
+
bench/ the benchmark harness: rungs, sweeps, taxonomy metrics, result curves, runbook
|
|
207
|
+
tests/ library, proxy, and benchmark tests; no GPU required
|
|
208
|
+
site/ yorocache.com (static)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
## License
|
|
212
|
+
|
|
213
|
+
MIT. Built and measured by [Chaitanya Pinapaka](https://github.com/ChaitanyaPinapaka).
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# YORO — You Only Reason Once
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+
|
|
6
|
+
YORO is an OpenAI-compatible caching proxy for LLM applications. Unlike a plain
|
|
7
|
+
semantic cache, it tracks what each cached answer depends on and invalidates
|
|
8
|
+
entries when those dependencies change, so it never serves an answer whose
|
|
9
|
+
premises moved. The engine can also re-apply the cached reasoning to new inputs
|
|
10
|
+
(*replay*) instead of re-deriving from scratch — available in the library today,
|
|
11
|
+
wired into the proxy in the next release.
|
|
12
|
+
|
|
13
|
+
Website: [yorocache.com](https://yorocache.com)
|
|
14
|
+
|
|
15
|
+
## Why
|
|
16
|
+
|
|
17
|
+
Semantic caches (GPTCache and similar) serve a cached answer whenever a new request
|
|
18
|
+
is embedding-similar to a previous one. This saves tokens, but it has a failure mode
|
|
19
|
+
that standard cache metrics do not surface: when the world changes, the cache keeps
|
|
20
|
+
serving the old answer. In our measurements, a drift rate of just 5% (5% of recurring
|
|
21
|
+
tasks whose true answer has changed) already makes over half of a naive cache's hits
|
|
22
|
+
wrong, because popular items drift too and every later hit serves the dead answer.
|
|
23
|
+
|
|
24
|
+
Adding invalidation alone is not sufficient. In agent workloads, the *method* behind
|
|
25
|
+
an answer often lives in earlier interactions rather than in the current request. An
|
|
26
|
+
invalidating cache correctly drops the stale entry, then re-derives without the
|
|
27
|
+
method, caches the wrong result, and serves it — a failure mode we call
|
|
28
|
+
*re-poisoning*. YORO addresses both failure modes: dependency fingerprints handle
|
|
29
|
+
detection, and replay of the stored reasoning handles re-derivation.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install "yoro-cache[embed]"
|
|
35
|
+
# before the first PyPI release:
|
|
36
|
+
pip install "yoro-cache[embed] @ git+https://github.com/ChaitanyaPinapaka/yoro-cache"
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Requires Python 3.10+. The `[embed]` extra installs `sentence-transformers` for
|
|
40
|
+
semantic matching; without it the library still works with the hash embedder or an
|
|
41
|
+
external embedding endpoint.
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
Run the proxy in front of any OpenAI-compatible endpoint (vLLM, llama.cpp server,
|
|
46
|
+
OpenRouter, ...), then point your client at it. The worked example below is the
|
|
47
|
+
setup this README was tested on — a local 35B reasoning model on an M-series Mac:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
# 1. serve a local model via llama.cpp (brew install llama.cpp)
|
|
51
|
+
llama-server -hf deepreinforce-ai/Ornith-1.0-35B-GGUF --port 8000
|
|
52
|
+
|
|
53
|
+
# 2. put YORO in front of it
|
|
54
|
+
YORO_UPSTREAM=http://127.0.0.1:8000/v1 yoro serve # listens on :8400
|
|
55
|
+
|
|
56
|
+
# 3. point any OpenAI-compatible client at the proxy
|
|
57
|
+
export OPENAI_BASE_URL=http://127.0.0.1:8400/v1
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
On this setup, a repeated ask serves from cache in ~12 ms against ~3.3 s upstream,
|
|
61
|
+
with the cached reasoning trace preserved in the response.
|
|
62
|
+
|
|
63
|
+
To use YORO under [OpenCode](https://opencode.ai), register the proxy as a custom
|
|
64
|
+
provider in `opencode.json`:
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{ "provider": { "yoro": {
|
|
68
|
+
"npm": "@ai-sdk/openai-compatible",
|
|
69
|
+
"options": { "baseURL": "http://127.0.0.1:8400/v1" },
|
|
70
|
+
"models": { "ornith-35b": {} } } } }
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
The safe policy caches OpenCode's plain question turns and passes its tool-bearing
|
|
74
|
+
(agentic) turns through untouched.
|
|
75
|
+
|
|
76
|
+
To scope a cache entry to workspace state, pass dependency fingerprints. An entry
|
|
77
|
+
only serves while its fingerprints match what was stored; when they change, the
|
|
78
|
+
entry stops serving and the request is re-reasoned upstream:
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from openai import OpenAI
|
|
82
|
+
|
|
83
|
+
client = OpenAI(base_url="http://127.0.0.1:8400/v1", api_key="unused-locally")
|
|
84
|
+
r = client.chat.completions.create(
|
|
85
|
+
model="your-model",
|
|
86
|
+
messages=[{"role": "user", "content": "Recompute the rollup for March"}],
|
|
87
|
+
extra_headers={"X-YORO-Deps": "rollup.csv:9f3ab2"},
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Every response reports the cache decision, and `yoro stats` (or
|
|
92
|
+
`GET /yoro/stats`) shows running totals.
|
|
93
|
+
|
|
94
|
+
| Header | Direction | Meaning |
|
|
95
|
+
|---|---|---|
|
|
96
|
+
| `X-YORO-Deps` | request | `name:fingerprint,...` — entry serves only while these match |
|
|
97
|
+
| `X-YORO-Cache: 0` / `1` | request | force caching off / on for this call |
|
|
98
|
+
| `X-YORO-Cache` | response | `HIT`, `MISS`, or `SKIP:<reason>` |
|
|
99
|
+
| `X-YORO-Sim` | response | similarity of the matched entry (on hits) |
|
|
100
|
+
|
|
101
|
+
### Configuration
|
|
102
|
+
|
|
103
|
+
| Variable | Default | |
|
|
104
|
+
|---|---|---|
|
|
105
|
+
| `YORO_UPSTREAM` | `http://127.0.0.1:8000/v1` | upstream OpenAI-compatible endpoint |
|
|
106
|
+
| `YORO_PORT` | `8400` | proxy listen port |
|
|
107
|
+
| `YORO_POLICY` | `safe` | `safe` refuses to cache tool-bearing or sampled turns; `aggressive` caches them |
|
|
108
|
+
| `YORO_TAU_HIT` / `YORO_TAU_MISS` | `0.95` / `0.6` | reuse-acceptance / novelty thresholds |
|
|
109
|
+
| `YORO_EMBED` | `all-MiniLM-L6-v2` | sentence-transformers model for matching |
|
|
110
|
+
| `YORO_CACHE_PATH` | `~/.yoro/proxy_cache.json` | persistent cache location |
|
|
111
|
+
|
|
112
|
+
The default policy is deliberately conservative: requests that carry tools, contain
|
|
113
|
+
tool history, or use `temperature > 0.2` pass through uncached, because a stale hit
|
|
114
|
+
in an agentic flow can corrupt real work. Caching such turns is an explicit opt-in.
|
|
115
|
+
|
|
116
|
+
## How it works
|
|
117
|
+
|
|
118
|
+
Each request is embedded and matched against the case store, then routed to the
|
|
119
|
+
cheapest tier that is safe:
|
|
120
|
+
|
|
121
|
+
1. **Serve** — the matched entry is fresh and similarity is high: return the cached
|
|
122
|
+
answer with no model call.
|
|
123
|
+
2. **Replay** — same entry, but its dependencies changed: inject the stored
|
|
124
|
+
reasoning trace and apply it to the new inputs. Short output; no re-exploration.
|
|
125
|
+
(Library + benchmark today; proxy integration lands in the next release.)
|
|
126
|
+
3. **Reason** — novel or borderline request: full reasoning upstream; the trace,
|
|
127
|
+
answer, and dependency fingerprints are cached.
|
|
128
|
+
|
|
129
|
+
A novelty gate escalates look-alike-but-different requests to re-reasoning instead
|
|
130
|
+
of force-fitting them into a near-match — trading some hit rate for correctness.
|
|
131
|
+
|
|
132
|
+
## Evaluation
|
|
133
|
+
|
|
134
|
+
The claims above are measured on gpt-oss-120B (H100, vLLM) and reproduced on
|
|
135
|
+
Qwen2.5-32B-Instruct-AWQ (4-bit, one consumer RTX 5090), across controlled sweeps of
|
|
136
|
+
drift rate, near-miss rate, and invalidation-signal fidelity — 25 sweep levels,
|
|
137
|
+
1,027 runs, 616,200 scored queries, 72.7M tokens in total. Selected results at
|
|
138
|
+
drift 0.4 on the method-in-history workload:
|
|
139
|
+
|
|
140
|
+
| | GPTCache-style | YORO serve-only | YORO replay | YORO replay (low effort) | no cache |
|
|
141
|
+
|---|---|---|---|---|---|
|
|
142
|
+
| Accuracy | 0.16 | 0.16 | **0.96** | 0.92 | 0.07 |
|
|
143
|
+
| Output tokens vs no-cache | 4% | 42% | 21% | 10% | 100% |
|
|
144
|
+
|
|
145
|
+
- On self-contained workloads, a no-invalidation cache reaches staleness 0.90
|
|
146
|
+
(share of hits serving a wrong answer) as drift rises; YORO holds ~0.00 at the
|
|
147
|
+
same matched thresholds, with accuracy 1.00.
|
|
148
|
+
- Wrong serves split into two mechanistically different failure modes: *outdated*
|
|
149
|
+
(served an answer that was once correct) and *re-poisoned* (served an answer that
|
|
150
|
+
was never correct). The no-invalidation cache fails mostly outdated; an
|
|
151
|
+
invalidating cache without replay fails ~99% re-poisoned; replay reduces both to
|
|
152
|
+
near zero. Accuracy alone cannot distinguish these; the taxonomy metrics
|
|
153
|
+
(`outdated_rate`, `repoisoned_rate`) can.
|
|
154
|
+
- Weakening the invalidation signal degrades YORO gracefully — staleness tracks the
|
|
155
|
+
share of missed signals and converges to naive-cache behavior at zero signal.
|
|
156
|
+
|
|
157
|
+
The full benchmark harness (sweep driver, workload generators, taxonomy metrics,
|
|
158
|
+
and the result curves behind these numbers) lands in this repository in an upcoming
|
|
159
|
+
release.
|
|
160
|
+
|
|
161
|
+
## Scope and limitations
|
|
162
|
+
|
|
163
|
+
- The replay result is measured in the *method-in-history* regime, where re-asks
|
|
164
|
+
reference a procedure established earlier — the normal case for long-running
|
|
165
|
+
agents. If every request restates its full context, a plain cache with
|
|
166
|
+
invalidation performs equally well on correctness.
|
|
167
|
+
- Replay is validated on multi-step arithmetic procedures; non-numeric procedures
|
|
168
|
+
(extraction rules, rubrics, tool plans) have not yet been evaluated.
|
|
169
|
+
- Replay quality depends on the invalidation signal. Without dependency
|
|
170
|
+
fingerprints, YORO falls back to conservative matching and behaves like a
|
|
171
|
+
gated semantic cache.
|
|
172
|
+
- Related work: Buffer of Thoughts, Metacognitive Reuse, and Analogical Prompting
|
|
173
|
+
reuse reasoning templates. YORO's contribution is making reuse safe and
|
|
174
|
+
accounted for: invalidation, the failure-mode taxonomy, and separate input/output
|
|
175
|
+
token accounting.
|
|
176
|
+
|
|
177
|
+
## Repository layout
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
yoro/ library and proxy: cache, matcher, invalidation, replay, CLI
|
|
181
|
+
bench/ the benchmark harness: rungs, sweeps, taxonomy metrics, result curves, runbook
|
|
182
|
+
tests/ library, proxy, and benchmark tests; no GPU required
|
|
183
|
+
site/ yorocache.com (static)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## License
|
|
187
|
+
|
|
188
|
+
MIT. Built and measured by [Chaitanya Pinapaka](https://github.com/ChaitanyaPinapaka).
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""YORO benchmark harness.
|
|
2
|
+
|
|
3
|
+
A baseline ladder over labelled prompt streams, runnable locally with a mock model
|
|
4
|
+
(`--smoke`, no GPU) or against any OpenAI-compatible endpoint, with per-level
|
|
5
|
+
checkpoint/resume, optional cloud sinks (S3 / CloudWatch / W&B), and a hard
|
|
6
|
+
auto-shutdown budget cap for rented GPUs.
|
|
7
|
+
|
|
8
|
+
Modules:
|
|
9
|
+
budget - BudgetGuard: spend tracking + auto-shutdown before the ceiling.
|
|
10
|
+
metrics - per-prompt Outcome, run summary, cross-seed aggregation + significance.
|
|
11
|
+
ladder - the five rungs (no-cache / exact / gptcache-semantic / behaviors / YORO).
|
|
12
|
+
wandb_log- thin W&B logger shim (falls back to stdout if wandb is absent).
|
|
13
|
+
"""
|
|
14
|
+
from .budget import BudgetGuard
|
|
15
|
+
from .metrics import Outcome, summarize, aggregate_seeds, paired_t
|
|
16
|
+
from .ladder import (Strategy, NoCache, ExactCache, SemanticCache, BehaviorsOnly,
|
|
17
|
+
YOROStrategy, build_ladder)
|
|
18
|
+
from .wandb_log import WandbLogger
|
|
19
|
+
from .eventlog import EventLog, S3FileSink, CloudWatchSink
|
|
20
|
+
from .checkpoint import Checkpoint
|
|
21
|
+
from .convergence import Convergence, ci_halfwidth
|
|
22
|
+
from .vast import VastCredit, stop_self
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"BudgetGuard",
|
|
26
|
+
"Outcome", "summarize", "aggregate_seeds", "paired_t",
|
|
27
|
+
"Strategy", "NoCache", "ExactCache", "SemanticCache", "BehaviorsOnly", "YOROStrategy", "build_ladder",
|
|
28
|
+
"WandbLogger",
|
|
29
|
+
"EventLog", "S3FileSink", "CloudWatchSink",
|
|
30
|
+
"Checkpoint",
|
|
31
|
+
"Convergence", "ci_halfwidth",
|
|
32
|
+
"VastCredit", "stop_self",
|
|
33
|
+
]
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""BudgetGuard — the safety feature that makes a multi-day, unattended, $500 rented-GPU
|
|
2
|
+
run safe to leave alone.
|
|
3
|
+
|
|
4
|
+
It tracks spend = instance $/hr x elapsed + any per-token API cost, and once spend
|
|
5
|
+
crosses a soft fraction of the hard ceiling it fires a provider-specific shutdown hook
|
|
6
|
+
(e.g. `vastai destroy instance <id>`) exactly once. Pair it with frequent checkpointing
|
|
7
|
+
so the auto-shutdown (or a spot preemption) never loses results.
|
|
8
|
+
|
|
9
|
+
The clock is injectable so the logic is unit-testable without waiting hours.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from typing import Callable, Optional
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class BudgetGuard:
    """Track spend against a hard ceiling and trip a one-shot shutdown at a soft cap.

    Spend is ``hourly_usd`` multiplied by the hours elapsed since ``started_at``
    (both read through ``clock``) plus the per-token cost accrued with
    :meth:`add_token_cost`. Once spend reaches ``ceiling_usd * shutdown_frac``,
    :meth:`check` marks the guard as stopped and calls ``on_shutdown`` once.
    """

    ceiling_usd: float                                   # HARD cap (e.g. 500)
    hourly_usd: float                                    # instance price, e.g. Vast.ai spot $/hr
    shutdown_frac: float = 0.9                           # auto-shutdown at 90% of the ceiling
    clock: Callable[[], float] = time.time               # injectable for tests
    on_shutdown: Optional[Callable[[], None]] = None     # provider terminate hook (fired once)
    token_cost_usd: float = 0.0                          # accrued per-token cost (0 when self-hosting)
    started_at: Optional[float] = None                   # filled from clock() when left as None
    _stopped: bool = field(default=False, repr=False)    # latched by check() / stop()

    def __post_init__(self):
        # Start the meter at construction time unless the caller supplied a start.
        if self.started_at is None:
            self.started_at = self.clock()

    def add_token_cost(self, usd: float) -> None:
        """Accrue per-token API cost; negative amounts are ignored (clamped to 0)."""
        self.token_cost_usd += max(0.0, usd)

    def spent(self) -> float:
        """Return total spend in USD: elapsed hours x hourly price + token cost."""
        # Clamp so a clock that reads earlier than started_at never yields negative spend.
        hours = max(0.0, (self.clock() - self.started_at) / 3600.0)
        return hours * self.hourly_usd + self.token_cost_usd

    def remaining(self) -> float:
        """Return the USD left under the hard ceiling, never below zero."""
        return max(0.0, self.ceiling_usd - self.spent())

    def soft_cap(self) -> float:
        """Return the spend level (USD) at which the guard trips."""
        return self.ceiling_usd * self.shutdown_frac

    def should_stop(self) -> bool:
        """Return True when current spend has reached the soft cap (no side effects)."""
        return self.spent() >= self.soft_cap()

    def check(self) -> bool:
        """Call this periodically (e.g. every checkpoint). Returns True once spend crosses
        the soft cap, firing the shutdown hook exactly once. Idempotent thereafter.

        A failing hook never propagates: the guard still reports stopped, but the
        failure is printed so a terminate call that did not go through is visible.
        """
        if self._stopped:
            return True
        if not self.should_stop():
            return False
        # Latch before calling the hook so it can never fire a second time.
        self._stopped = True
        if self.on_shutdown is not None:
            try:
                self.on_shutdown()
            except Exception as e:
                # Best effort: keep the run's own stop logic going, but do not hide
                # the fact that the provider shutdown may not have happened.
                print(f"[budget shutdown hook err {str(e)[:70]}]")
        return True

    def stop(self) -> None:
        """Force-stop from an EXTERNAL signal (e.g. low real Vast credit), so a sweep that
        shares this guard also halts — not just the current level.

        Only latches the stopped flag; the shutdown hook is not called here.
        """
        self._stopped = True

    @property
    def stopped(self) -> bool:
        """True once the guard has been tripped by check() or stop()."""
        return self._stopped

    def status(self) -> dict:
        """Return a snapshot of the budget state, with money values rounded to cents."""
        return {
            "spent_usd": round(self.spent(), 2),
            "remaining_usd": round(self.remaining(), 2),
            "ceiling_usd": self.ceiling_usd,
            "soft_cap_usd": round(self.soft_cap(), 2),
            "hourly_usd": self.hourly_usd,
            "stopped": self._stopped,
        }
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Checkpoint + resume — so a Vast.ai spot preemption or the budget auto-shutdown never
|
|
2
|
+
loses work. A checkpoint is a JSON snapshot of run state (cursor into the prompt stream,
|
|
3
|
+
per-rung outcomes so far, cache contents, accumulated spend). It's written ATOMICALLY
|
|
4
|
+
(temp file + os.replace, so a kill mid-write can't corrupt it) and mirrored to S3, so a
|
|
5
|
+
fresh instance can pull the latest and continue from the cursor.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
ck = Checkpoint("runs/phase0/ckpt.json", s3=("my-bucket", "yoro/phase0/ckpt.json"))
|
|
9
|
+
state = ck.load() or {"cursor": 0, "outcomes": {}}
|
|
10
|
+
... # run from state["cursor"], periodically:
|
|
11
|
+
ck.save(state)
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import tempfile
|
|
18
|
+
from typing import Optional, Tuple
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _s3():
    """Return a boto3 S3 client, or None if one cannot be created.

    boto3 is imported lazily so the module works without it; any failure
    while importing or building the client yields None.
    """
    try:
        import boto3
        client = boto3.client("s3")
    except Exception:
        client = None
    return client
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Checkpoint:
    """JSON run-state snapshot stored at a local path, optionally mirrored to S3."""

    def __init__(self, path: str, s3: Optional[Tuple[str, str]] = None):
        """Remember the local path and, when ``s3`` is given, try to build an S3 client.

        ``s3`` is a ``(bucket, key)`` pair or None. The client stays None when no
        pair is given or when ``_s3()`` cannot create one.
        """
        self.path = path
        self.s3 = s3  # (bucket, key) or None
        self._c = None if not s3 else _s3()

    def save(self, state: dict) -> None:
        """Write ``state`` as JSON to ``self.path`` via a temp file, then mirror to S3.

        Values json cannot encode natively are stringified (``default=str``).
        An S3 upload failure is printed and otherwise ignored.
        """
        target_dir = os.path.dirname(os.path.abspath(self.path)) or "."
        os.makedirs(target_dir, exist_ok=True)
        # Stage in the destination directory so the final rename stays on one filesystem.
        handle, scratch = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
        try:
            with os.fdopen(handle, "w") as out:
                json.dump(state, out, default=str)
            os.replace(scratch, self.path)  # atomic on POSIX
        finally:
            # Only still present when the dump or the rename failed.
            if os.path.exists(scratch):
                os.remove(scratch)
        if not self._c:
            return
        try:
            self._c.upload_file(self.path, self.s3[0], self.s3[1])
        except Exception as e:
            print(f"[ckpt s3 err {str(e)[:70]}]")

    def load(self) -> Optional[dict]:
        """Return the saved state, preferring the local file and falling back to S3.

        Returns None when the local file is missing or unreadable and there is
        no S3 client, or when the S3 download or parse fails.
        """
        if os.path.exists(self.path):
            try:
                with open(self.path) as src:
                    return json.load(src)
            except Exception:
                pass  # fall through to S3
        if not self._c:
            return None
        try:
            # boto3 won't mkdir the target dir, so create it before downloading.
            parent = os.path.dirname(os.path.abspath(self.path)) or "."
            os.makedirs(parent, exist_ok=True)
            self._c.download_file(self.s3[0], self.s3[1], self.path)
            with open(self.path) as src:
                return json.load(src)
        except Exception:
            return None
|