yoro-cache 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. yoro_cache-0.1.1/LICENSE +21 -0
  2. yoro_cache-0.1.1/PKG-INFO +213 -0
  3. yoro_cache-0.1.1/README.md +188 -0
  4. yoro_cache-0.1.1/bench/__init__.py +33 -0
  5. yoro_cache-0.1.1/bench/budget.py +81 -0
  6. yoro_cache-0.1.1/bench/checkpoint.py +67 -0
  7. yoro_cache-0.1.1/bench/convergence.py +81 -0
  8. yoro_cache-0.1.1/bench/datasets.py +432 -0
  9. yoro_cache-0.1.1/bench/eventlog.py +137 -0
  10. yoro_cache-0.1.1/bench/ladder.py +183 -0
  11. yoro_cache-0.1.1/bench/metrics.py +96 -0
  12. yoro_cache-0.1.1/bench/model_client.py +134 -0
  13. yoro_cache-0.1.1/bench/run_phase0.py +557 -0
  14. yoro_cache-0.1.1/bench/spike_replay.py +123 -0
  15. yoro_cache-0.1.1/bench/vast.py +79 -0
  16. yoro_cache-0.1.1/bench/wandb_log.py +49 -0
  17. yoro_cache-0.1.1/pyproject.toml +41 -0
  18. yoro_cache-0.1.1/setup.cfg +4 -0
  19. yoro_cache-0.1.1/tests/test_bench.py +496 -0
  20. yoro_cache-0.1.1/tests/test_extensions.py +76 -0
  21. yoro_cache-0.1.1/tests/test_proxy.py +193 -0
  22. yoro_cache-0.1.1/tests/test_yoro.py +171 -0
  23. yoro_cache-0.1.1/yoro/__init__.py +52 -0
  24. yoro_cache-0.1.1/yoro/behaviors.py +108 -0
  25. yoro_cache-0.1.1/yoro/cache.py +140 -0
  26. yoro_cache-0.1.1/yoro/cli.py +68 -0
  27. yoro_cache-0.1.1/yoro/core.py +150 -0
  28. yoro_cache-0.1.1/yoro/embeddings.py +98 -0
  29. yoro_cache-0.1.1/yoro/invalidation.py +50 -0
  30. yoro_cache-0.1.1/yoro/keyer.py +102 -0
  31. yoro_cache-0.1.1/yoro/matcher.py +46 -0
  32. yoro_cache-0.1.1/yoro/opencode_behaviors.py +223 -0
  33. yoro_cache-0.1.1/yoro/proxy.py +574 -0
  34. yoro_cache-0.1.1/yoro/structured.py +84 -0
  35. yoro_cache-0.1.1/yoro/tree.py +60 -0
  36. yoro_cache-0.1.1/yoro_cache.egg-info/PKG-INFO +213 -0
  37. yoro_cache-0.1.1/yoro_cache.egg-info/SOURCES.txt +39 -0
  38. yoro_cache-0.1.1/yoro_cache.egg-info/dependency_links.txt +1 -0
  39. yoro_cache-0.1.1/yoro_cache.egg-info/entry_points.txt +2 -0
  40. yoro_cache-0.1.1/yoro_cache.egg-info/requires.txt +8 -0
  41. yoro_cache-0.1.1/yoro_cache.egg-info/top_level.txt +2 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Venkata Sai Chaitanya Pinapaka
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,213 @@
1
+ Metadata-Version: 2.4
2
+ Name: yoro-cache
3
+ Version: 0.1.1
4
+ Summary: YORO — You Only Reason Once. A drop-in LLM caching proxy that invalidates on change and replays cached reasoning instead of serving stale answers.
5
+ Author: Chaitanya Pinapaka
6
+ License: MIT
7
+ Project-URL: Homepage, https://yorocache.com
8
+ Project-URL: Repository, https://github.com/ChaitanyaPinapaka/yoro-cache
9
+ Keywords: llm,cache,semantic-cache,agents,inference,proxy
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: numpy>=1.26
19
+ Requires-Dist: requests>=2.30
20
+ Provides-Extra: embed
21
+ Requires-Dist: sentence-transformers>=3.0; extra == "embed"
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=8; extra == "dev"
24
+ Dynamic: license-file
25
+
26
+ # YORO — You Only Reason Once
27
+
28
+ ![tests](https://github.com/ChaitanyaPinapaka/yoro-cache/actions/workflows/tests.yml/badge.svg)
29
+ ![license](https://img.shields.io/badge/license-MIT-blue)
30
+
31
+ YORO is an OpenAI-compatible caching proxy for LLM applications. Unlike a plain
32
+ semantic cache, it tracks what each cached answer depends on and invalidates
33
+ entries when those dependencies change, so it never serves an answer whose
34
+ premises moved. The engine can also re-apply the cached reasoning to new inputs
35
+ (*replay*) instead of re-deriving from scratch — available in the library today,
36
+ wired into the proxy in the next release.
37
+
38
+ Website: [yorocache.com](https://yorocache.com)
39
+
40
+ ## Why
41
+
42
+ Semantic caches (GPTCache and similar) serve a cached answer whenever a new request
43
+ is embedding-similar to a previous one. This saves tokens, but it has a failure mode
44
+ that standard cache metrics do not surface: when the world changes, the cache keeps
45
+ serving the old answer. In our measurements, a drift rate of just 5% (5% of recurring
46
+ tasks whose true answer has changed) already makes over half of a naive cache's hits
47
+ wrong, because popular items drift too and every later hit serves the dead answer.
48
+
49
+ Adding invalidation alone is not sufficient. In agent workloads, the *method* behind
50
+ an answer often lives in earlier interactions rather than in the current request. An
51
+ invalidating cache correctly drops the stale entry, then re-derives without the
52
+ method, caches the wrong result, and serves it — a failure mode we call
53
+ *re-poisoning*. YORO addresses both failure modes: dependency fingerprints handle
54
+ detection, and replay of the stored reasoning handles re-derivation.
55
+
56
+ ## Install
57
+
58
+ ```bash
59
+ pip install "yoro-cache[embed]"
60
+ # or install the latest development version straight from GitHub:
61
+ pip install "yoro-cache[embed] @ git+https://github.com/ChaitanyaPinapaka/yoro-cache"
62
+ ```
63
+
64
+ Requires Python 3.10+. The `[embed]` extra installs `sentence-transformers` for
65
+ semantic matching; without it the library still works with the hash embedder or an
66
+ external embedding endpoint.
67
+
68
+ ## Usage
69
+
70
+ Run the proxy in front of any OpenAI-compatible endpoint (vLLM, llama.cpp server,
71
+ OpenRouter, ...), then point your client at it. The worked example below is the
72
+ setup this README was tested on — a local 35B reasoning model on an M-series Mac:
73
+
74
+ ```bash
75
+ # 1. serve a local model via llama.cpp (brew install llama.cpp)
76
+ llama-server -hf deepreinforce-ai/Ornith-1.0-35B-GGUF --port 8000
77
+
78
+ # 2. put YORO in front of it
79
+ YORO_UPSTREAM=http://127.0.0.1:8000/v1 yoro serve # listens on :8400
80
+
81
+ # 3. point any OpenAI-compatible client at the proxy
82
+ export OPENAI_BASE_URL=http://127.0.0.1:8400/v1
83
+ ```
84
+
85
+ On this setup, a repeated ask serves from cache in ~12 ms against ~3.3 s upstream,
86
+ with the cached reasoning trace preserved in the response.
87
+
88
+ To use YORO under [OpenCode](https://opencode.ai), register the proxy as a custom
89
+ provider in `opencode.json`:
90
+
91
+ ```json
92
+ { "provider": { "yoro": {
93
+ "npm": "@ai-sdk/openai-compatible",
94
+ "options": { "baseURL": "http://127.0.0.1:8400/v1" },
95
+ "models": { "ornith-35b": {} } } } }
96
+ ```
97
+
98
+ The safe policy caches OpenCode's plain question turns and passes its tool-bearing
99
+ (agentic) turns through untouched.
100
+
101
+ To scope a cache entry to workspace state, pass dependency fingerprints. An entry
102
+ only serves while its fingerprints match what was stored; when they change, the
103
+ entry stops serving and the request is re-reasoned upstream:
104
+
105
+ ```python
106
+ from openai import OpenAI
107
+
108
+ client = OpenAI(base_url="http://127.0.0.1:8400/v1", api_key="unused-locally")
109
+ r = client.chat.completions.create(
110
+ model="your-model",
111
+ messages=[{"role": "user", "content": "Recompute the rollup for March"}],
112
+ extra_headers={"X-YORO-Deps": "rollup.csv:9f3ab2"},
113
+ )
114
+ ```
115
+
116
+ Every response reports the cache decision, and `yoro stats` (or
117
+ `GET /yoro/stats`) shows running totals.
118
+
119
+ | Header | Direction | Meaning |
120
+ |---|---|---|
121
+ | `X-YORO-Deps` | request | `name:fingerprint,...` — entry serves only while these match |
122
+ | `X-YORO-Cache: 0` / `1` | request | force caching off / on for this call |
123
+ | `X-YORO-Cache` | response | `HIT`, `MISS`, or `SKIP:<reason>` |
124
+ | `X-YORO-Sim` | response | similarity of the matched entry (on hits) |
125
+
126
+ ### Configuration
127
+
128
+ | Variable | Default | Description |
129
+ |---|---|---|
130
+ | `YORO_UPSTREAM` | `http://127.0.0.1:8000/v1` | upstream OpenAI-compatible endpoint |
131
+ | `YORO_PORT` | `8400` | proxy listen port |
132
+ | `YORO_POLICY` | `safe` | `safe` refuses to cache tool-bearing or sampled turns; `aggressive` caches them |
133
+ | `YORO_TAU_HIT` / `YORO_TAU_MISS` | `0.95` / `0.6` | reuse-acceptance / novelty thresholds |
134
+ | `YORO_EMBED` | `all-MiniLM-L6-v2` | sentence-transformers model for matching |
135
+ | `YORO_CACHE_PATH` | `~/.yoro/proxy_cache.json` | persistent cache location |
136
+
137
+ The default policy is deliberately conservative: requests that carry tools, contain
138
+ tool history, or use `temperature > 0.2` pass through uncached, because a stale hit
139
+ in an agentic flow can corrupt real work. Caching such turns is an explicit opt-in.
140
+
141
+ ## How it works
142
+
143
+ Each request is embedded and matched against the case store, then routed to the
144
+ cheapest tier that is safe:
145
+
146
+ 1. **Serve** — the matched entry is fresh and similarity is high: return the cached
147
+ answer with no model call.
148
+ 2. **Replay** — same entry, but its dependencies changed: inject the stored
149
+ reasoning trace and apply it to the new inputs. Short output; no re-exploration.
150
+ (Library + benchmark today; proxy integration lands in the next release.)
151
+ 3. **Reason** — novel or borderline request: full reasoning upstream; the trace,
152
+ answer, and dependency fingerprints are cached.
153
+
154
+ A novelty gate escalates look-alike-but-different requests to re-reasoning instead
155
+ of force-fitting them into a near-match — trading some hit rate for correctness.
156
+
157
+ ## Evaluation
158
+
159
+ The claims above are measured on gpt-oss-120B (H100, vLLM) and reproduced on
160
+ Qwen2.5-32B-Instruct-AWQ (4-bit, one consumer RTX 5090), across controlled sweeps of
161
+ drift rate, near-miss rate, and invalidation-signal fidelity — 25 sweep levels,
162
+ 1,027 runs, 616,200 scored queries, 72.7M tokens in total. Selected results at
163
+ drift 0.4 on the method-in-history workload:
164
+
165
+ | | GPTCache-style | YORO serve-only | YORO replay | YORO replay (low effort) | no cache |
166
+ |---|---|---|---|---|---|
167
+ | Accuracy | 0.16 | 0.16 | **0.96** | 0.92 | 0.07 |
168
+ | Output tokens vs no-cache | 4% | 42% | 21% | 10% | 100% |
169
+
170
+ - On self-contained workloads, a no-invalidation cache reaches staleness 0.90
171
+ (share of hits serving a wrong answer) as drift rises; YORO holds ~0.00 at the
172
+ same matched thresholds, with accuracy 1.00.
173
+ - Wrong serves split into two mechanistically different failure modes: *outdated*
174
+ (served an answer that was once correct) and *re-poisoned* (served an answer that
175
+ was never correct). The no-invalidation cache fails mostly outdated; an
176
+ invalidating cache without replay fails ~99% re-poisoned; replay reduces both to
177
+ near zero. Accuracy alone cannot distinguish these; the taxonomy metrics
178
+ (`outdated_rate`, `repoisoned_rate`) can.
179
+ - Weakening the invalidation signal degrades YORO gracefully — staleness tracks the
180
+ share of missed signals and converges to naive-cache behavior at zero signal.
181
+
182
+ The full benchmark harness (sweep driver, workload generators, taxonomy metrics,
183
+ and the result curves behind these numbers) lands in this repository in an upcoming
184
+ release.
185
+
186
+ ## Scope and limitations
187
+
188
+ - The replay result is measured in the *method-in-history* regime, where re-asks
189
+ reference a procedure established earlier — the normal case for long-running
190
+ agents. If every request restates its full context, a plain cache with
191
+ invalidation performs equally well on correctness.
192
+ - Replay is validated on multi-step arithmetic procedures; non-numeric procedures
193
+ (extraction rules, rubrics, tool plans) have not yet been evaluated.
194
+ - Replay quality depends on the invalidation signal. Without dependency
195
+ fingerprints, YORO falls back to conservative matching and behaves like a
196
+ gated semantic cache.
197
+ - Related work: Buffer of Thoughts, Metacognitive Reuse, and Analogical Prompting
198
+ reuse reasoning templates. YORO's contribution is making reuse safe and
199
+ accounted for: invalidation, the failure-mode taxonomy, and separate input/output
200
+ token accounting.
201
+
202
+ ## Repository layout
203
+
204
+ ```
205
+ yoro/ library and proxy: cache, matcher, invalidation, replay, CLI
206
+ bench/ the benchmark harness: rungs, sweeps, taxonomy metrics, result curves, runbook
207
+ tests/ library, proxy, and benchmark tests; no GPU required
208
+ site/ yorocache.com (static)
209
+ ```
210
+
211
+ ## License
212
+
213
+ MIT. Built and measured by [Chaitanya Pinapaka](https://github.com/ChaitanyaPinapaka).
@@ -0,0 +1,188 @@
1
+ # YORO — You Only Reason Once
2
+
3
+ ![tests](https://github.com/ChaitanyaPinapaka/yoro-cache/actions/workflows/tests.yml/badge.svg)
4
+ ![license](https://img.shields.io/badge/license-MIT-blue)
5
+
6
+ YORO is an OpenAI-compatible caching proxy for LLM applications. Unlike a plain
7
+ semantic cache, it tracks what each cached answer depends on and invalidates
8
+ entries when those dependencies change, so it never serves an answer whose
9
+ premises moved. The engine can also re-apply the cached reasoning to new inputs
10
+ (*replay*) instead of re-deriving from scratch — available in the library today,
11
+ wired into the proxy in the next release.
12
+
13
+ Website: [yorocache.com](https://yorocache.com)
14
+
15
+ ## Why
16
+
17
+ Semantic caches (GPTCache and similar) serve a cached answer whenever a new request
18
+ is embedding-similar to a previous one. This saves tokens, but it has a failure mode
19
+ that standard cache metrics do not surface: when the world changes, the cache keeps
20
+ serving the old answer. In our measurements, a drift rate of just 5% (5% of recurring
21
+ tasks whose true answer has changed) already makes over half of a naive cache's hits
22
+ wrong, because popular items drift too and every later hit serves the dead answer.
23
+
24
+ Adding invalidation alone is not sufficient. In agent workloads, the *method* behind
25
+ an answer often lives in earlier interactions rather than in the current request. An
26
+ invalidating cache correctly drops the stale entry, then re-derives without the
27
+ method, caches the wrong result, and serves it — a failure mode we call
28
+ *re-poisoning*. YORO addresses both failure modes: dependency fingerprints handle
29
+ detection, and replay of the stored reasoning handles re-derivation.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install "yoro-cache[embed]"
35
+ # or install the latest development version straight from GitHub:
36
+ pip install "yoro-cache[embed] @ git+https://github.com/ChaitanyaPinapaka/yoro-cache"
37
+ ```
38
+
39
+ Requires Python 3.10+. The `[embed]` extra installs `sentence-transformers` for
40
+ semantic matching; without it the library still works with the hash embedder or an
41
+ external embedding endpoint.
42
+
43
+ ## Usage
44
+
45
+ Run the proxy in front of any OpenAI-compatible endpoint (vLLM, llama.cpp server,
46
+ OpenRouter, ...), then point your client at it. The worked example below is the
47
+ setup this README was tested on — a local 35B reasoning model on an M-series Mac:
48
+
49
+ ```bash
50
+ # 1. serve a local model via llama.cpp (brew install llama.cpp)
51
+ llama-server -hf deepreinforce-ai/Ornith-1.0-35B-GGUF --port 8000
52
+
53
+ # 2. put YORO in front of it
54
+ YORO_UPSTREAM=http://127.0.0.1:8000/v1 yoro serve # listens on :8400
55
+
56
+ # 3. point any OpenAI-compatible client at the proxy
57
+ export OPENAI_BASE_URL=http://127.0.0.1:8400/v1
58
+ ```
59
+
60
+ On this setup, a repeated ask serves from cache in ~12 ms against ~3.3 s upstream,
61
+ with the cached reasoning trace preserved in the response.
62
+
63
+ To use YORO under [OpenCode](https://opencode.ai), register the proxy as a custom
64
+ provider in `opencode.json`:
65
+
66
+ ```json
67
+ { "provider": { "yoro": {
68
+ "npm": "@ai-sdk/openai-compatible",
69
+ "options": { "baseURL": "http://127.0.0.1:8400/v1" },
70
+ "models": { "ornith-35b": {} } } } }
71
+ ```
72
+
73
+ The safe policy caches OpenCode's plain question turns and passes its tool-bearing
74
+ (agentic) turns through untouched.
75
+
76
+ To scope a cache entry to workspace state, pass dependency fingerprints. An entry
77
+ only serves while its fingerprints match what was stored; when they change, the
78
+ entry stops serving and the request is re-reasoned upstream:
79
+
80
+ ```python
81
+ from openai import OpenAI
82
+
83
+ client = OpenAI(base_url="http://127.0.0.1:8400/v1", api_key="unused-locally")
84
+ r = client.chat.completions.create(
85
+ model="your-model",
86
+ messages=[{"role": "user", "content": "Recompute the rollup for March"}],
87
+ extra_headers={"X-YORO-Deps": "rollup.csv:9f3ab2"},
88
+ )
89
+ ```
90
+
91
+ Every response reports the cache decision, and `yoro stats` (or
92
+ `GET /yoro/stats`) shows running totals.
93
+
94
+ | Header | Direction | Meaning |
95
+ |---|---|---|
96
+ | `X-YORO-Deps` | request | `name:fingerprint,...` — entry serves only while these match |
97
+ | `X-YORO-Cache: 0` / `1` | request | force caching off / on for this call |
98
+ | `X-YORO-Cache` | response | `HIT`, `MISS`, or `SKIP:<reason>` |
99
+ | `X-YORO-Sim` | response | similarity of the matched entry (on hits) |
100
+
101
+ ### Configuration
102
+
103
+ | Variable | Default | Description |
104
+ |---|---|---|
105
+ | `YORO_UPSTREAM` | `http://127.0.0.1:8000/v1` | upstream OpenAI-compatible endpoint |
106
+ | `YORO_PORT` | `8400` | proxy listen port |
107
+ | `YORO_POLICY` | `safe` | `safe` refuses to cache tool-bearing or sampled turns; `aggressive` caches them |
108
+ | `YORO_TAU_HIT` / `YORO_TAU_MISS` | `0.95` / `0.6` | reuse-acceptance / novelty thresholds |
109
+ | `YORO_EMBED` | `all-MiniLM-L6-v2` | sentence-transformers model for matching |
110
+ | `YORO_CACHE_PATH` | `~/.yoro/proxy_cache.json` | persistent cache location |
111
+
112
+ The default policy is deliberately conservative: requests that carry tools, contain
113
+ tool history, or use `temperature > 0.2` pass through uncached, because a stale hit
114
+ in an agentic flow can corrupt real work. Caching such turns is an explicit opt-in.
115
+
116
+ ## How it works
117
+
118
+ Each request is embedded and matched against the case store, then routed to the
119
+ cheapest tier that is safe:
120
+
121
+ 1. **Serve** — the matched entry is fresh and similarity is high: return the cached
122
+ answer with no model call.
123
+ 2. **Replay** — same entry, but its dependencies changed: inject the stored
124
+ reasoning trace and apply it to the new inputs. Short output; no re-exploration.
125
+ (Library + benchmark today; proxy integration lands in the next release.)
126
+ 3. **Reason** — novel or borderline request: full reasoning upstream; the trace,
127
+ answer, and dependency fingerprints are cached.
128
+
129
+ A novelty gate escalates look-alike-but-different requests to re-reasoning instead
130
+ of force-fitting them into a near-match — trading some hit rate for correctness.
131
+
132
+ ## Evaluation
133
+
134
+ The claims above are measured on gpt-oss-120B (H100, vLLM) and reproduced on
135
+ Qwen2.5-32B-Instruct-AWQ (4-bit, one consumer RTX 5090), across controlled sweeps of
136
+ drift rate, near-miss rate, and invalidation-signal fidelity — 25 sweep levels,
137
+ 1,027 runs, 616,200 scored queries, 72.7M tokens in total. Selected results at
138
+ drift 0.4 on the method-in-history workload:
139
+
140
+ | | GPTCache-style | YORO serve-only | YORO replay | YORO replay (low effort) | no cache |
141
+ |---|---|---|---|---|---|
142
+ | Accuracy | 0.16 | 0.16 | **0.96** | 0.92 | 0.07 |
143
+ | Output tokens vs no-cache | 4% | 42% | 21% | 10% | 100% |
144
+
145
+ - On self-contained workloads, a no-invalidation cache reaches staleness 0.90
146
+ (share of hits serving a wrong answer) as drift rises; YORO holds ~0.00 at the
147
+ same matched thresholds, with accuracy 1.00.
148
+ - Wrong serves split into two mechanistically different failure modes: *outdated*
149
+ (served an answer that was once correct) and *re-poisoned* (served an answer that
150
+ was never correct). The no-invalidation cache fails mostly outdated; an
151
+ invalidating cache without replay fails ~99% re-poisoned; replay reduces both to
152
+ near zero. Accuracy alone cannot distinguish these; the taxonomy metrics
153
+ (`outdated_rate`, `repoisoned_rate`) can.
154
+ - Weakening the invalidation signal degrades YORO gracefully — staleness tracks the
155
+ share of missed signals and converges to naive-cache behavior at zero signal.
156
+
157
+ The full benchmark harness (sweep driver, workload generators, taxonomy metrics,
158
+ and the result curves behind these numbers) lands in this repository in an upcoming
159
+ release.
160
+
161
+ ## Scope and limitations
162
+
163
+ - The replay result is measured in the *method-in-history* regime, where re-asks
164
+ reference a procedure established earlier — the normal case for long-running
165
+ agents. If every request restates its full context, a plain cache with
166
+ invalidation performs equally well on correctness.
167
+ - Replay is validated on multi-step arithmetic procedures; non-numeric procedures
168
+ (extraction rules, rubrics, tool plans) have not yet been evaluated.
169
+ - Replay quality depends on the invalidation signal. Without dependency
170
+ fingerprints, YORO falls back to conservative matching and behaves like a
171
+ gated semantic cache.
172
+ - Related work: Buffer of Thoughts, Metacognitive Reuse, and Analogical Prompting
173
+ reuse reasoning templates. YORO's contribution is making reuse safe and
174
+ accounted for: invalidation, the failure-mode taxonomy, and separate input/output
175
+ token accounting.
176
+
177
+ ## Repository layout
178
+
179
+ ```
180
+ yoro/ library and proxy: cache, matcher, invalidation, replay, CLI
181
+ bench/ the benchmark harness: rungs, sweeps, taxonomy metrics, result curves, runbook
182
+ tests/ library, proxy, and benchmark tests; no GPU required
183
+ site/ yorocache.com (static)
184
+ ```
185
+
186
+ ## License
187
+
188
+ MIT. Built and measured by [Chaitanya Pinapaka](https://github.com/ChaitanyaPinapaka).
@@ -0,0 +1,33 @@
1
+ """YORO benchmark harness.
2
+
3
+ A baseline ladder over labelled prompt streams, runnable locally with a mock model
4
+ (`--smoke`, no GPU) or against any OpenAI-compatible endpoint, with per-level
5
+ checkpoint/resume, optional cloud sinks (S3 / CloudWatch / W&B), and a hard
6
+ auto-shutdown budget cap for rented GPUs.
7
+
8
+ Modules:
9
+ budget - BudgetGuard: spend tracking + auto-shutdown before the ceiling.
10
+ metrics - per-prompt Outcome, run summary, cross-seed aggregation + significance.
11
+ ladder - the five rungs (no-cache / exact / gptcache-semantic / behaviors / YORO).
12
+ wandb_log- thin W&B logger shim (falls back to stdout if wandb is absent).
13
+ """
14
+ from .budget import BudgetGuard
15
+ from .metrics import Outcome, summarize, aggregate_seeds, paired_t
16
+ from .ladder import (Strategy, NoCache, ExactCache, SemanticCache, BehaviorsOnly,
17
+ YOROStrategy, build_ladder)
18
+ from .wandb_log import WandbLogger
19
+ from .eventlog import EventLog, S3FileSink, CloudWatchSink
20
+ from .checkpoint import Checkpoint
21
+ from .convergence import Convergence, ci_halfwidth
22
+ from .vast import VastCredit, stop_self
23
+
24
+ __all__ = [
25
+ "BudgetGuard",
26
+ "Outcome", "summarize", "aggregate_seeds", "paired_t",
27
+ "Strategy", "NoCache", "ExactCache", "SemanticCache", "BehaviorsOnly", "YOROStrategy", "build_ladder",
28
+ "WandbLogger",
29
+ "EventLog", "S3FileSink", "CloudWatchSink",
30
+ "Checkpoint",
31
+ "Convergence", "ci_halfwidth",
32
+ "VastCredit", "stop_self",
33
+ ]
@@ -0,0 +1,81 @@
1
+ """BudgetGuard — the safety feature that makes a multi-day, unattended, $500 rented-GPU
2
+ run safe to leave alone.
3
+
4
+ It tracks spend = instance $/hr x elapsed + any per-token API cost, and once spend
5
+ crosses a soft fraction of the hard ceiling it fires a provider-specific shutdown hook
6
+ (e.g. `vastai destroy instance <id>`) exactly once. Pair it with frequent checkpointing
7
+ so the auto-shutdown (or a spot preemption) never loses results.
8
+
9
+ The clock is injectable so the logic is unit-testable without waiting hours.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import time
14
+ from dataclasses import dataclass, field
15
+ from typing import Callable, Optional
16
+
17
+
18
+ @dataclass
19
+ class BudgetGuard:
20
+ ceiling_usd: float # HARD cap (e.g. 500)
21
+ hourly_usd: float # instance price, e.g. Vast.ai spot $/hr
22
+ shutdown_frac: float = 0.9 # auto-shutdown at 90% of the ceiling
23
+ clock: Callable[[], float] = time.time # injectable for tests
24
+ on_shutdown: Optional[Callable[[], None]] = None # provider terminate hook (fired once)
25
+ token_cost_usd: float = 0.0 # accrued per-token cost (0 when self-hosting)
26
+ started_at: Optional[float] = None
27
+ _stopped: bool = field(default=False, repr=False)
28
+
29
+ def __post_init__(self):
30
+ if self.started_at is None:
31
+ self.started_at = self.clock()
32
+
33
+ def add_token_cost(self, usd: float) -> None:
34
+ self.token_cost_usd += max(0.0, usd)
35
+
36
+ def spent(self) -> float:
37
+ hours = max(0.0, (self.clock() - self.started_at) / 3600.0)
38
+ return hours * self.hourly_usd + self.token_cost_usd
39
+
40
+ def remaining(self) -> float:
41
+ return max(0.0, self.ceiling_usd - self.spent())
42
+
43
+ def soft_cap(self) -> float:
44
+ return self.ceiling_usd * self.shutdown_frac
45
+
46
+ def should_stop(self) -> bool:
47
+ return self.spent() >= self.soft_cap()
48
+
49
+ def check(self) -> bool:
50
+ """Call this periodically (e.g. every checkpoint). Returns True once spend crosses
51
+ the soft cap, firing the shutdown hook exactly once. Idempotent thereafter."""
52
+ if self._stopped:
53
+ return True
54
+ if self.should_stop():
55
+ self._stopped = True
56
+ if self.on_shutdown is not None:
57
+ try:
58
+ self.on_shutdown()
59
+ except Exception:
60
+ pass
61
+ return True
62
+ return False
63
+
64
+ def stop(self) -> None:
65
+ """Force-stop from an EXTERNAL signal (e.g. low real Vast credit), so a sweep that
66
+ shares this guard also halts — not just the current level."""
67
+ self._stopped = True
68
+
69
+ @property
70
+ def stopped(self) -> bool:
71
+ return self._stopped
72
+
73
+ def status(self) -> dict:
74
+ return {
75
+ "spent_usd": round(self.spent(), 2),
76
+ "remaining_usd": round(self.remaining(), 2),
77
+ "ceiling_usd": self.ceiling_usd,
78
+ "soft_cap_usd": round(self.soft_cap(), 2),
79
+ "hourly_usd": self.hourly_usd,
80
+ "stopped": self._stopped,
81
+ }
@@ -0,0 +1,67 @@
1
+ """Checkpoint + resume — so a Vast.ai spot preemption or the budget auto-shutdown never
2
+ loses work. A checkpoint is a JSON snapshot of run state (cursor into the prompt stream,
3
+ per-rung outcomes so far, cache contents, accumulated spend). It's written ATOMICALLY
4
+ (temp file + os.replace, so a kill mid-write can't corrupt it) and mirrored to S3, so a
5
+ fresh instance can pull the latest and continue from the cursor.
6
+
7
+ Usage:
8
+ ck = Checkpoint("runs/phase0/ckpt.json", s3=("my-bucket", "yoro/phase0/ckpt.json"))
9
+ state = ck.load() or {"cursor": 0, "outcomes": {}}
10
+ ... # run from state["cursor"], periodically:
11
+ ck.save(state)
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import os
17
+ import tempfile
18
+ from typing import Optional, Tuple
19
+
20
+
21
+ def _s3():
22
+ try:
23
+ import boto3
24
+ return boto3.client("s3")
25
+ except Exception:
26
+ return None
27
+
28
+
29
+ class Checkpoint:
30
+ def __init__(self, path: str, s3: Optional[Tuple[str, str]] = None):
31
+ self.path = path
32
+ self.s3 = s3 # (bucket, key) or None
33
+ self._c = _s3() if s3 else None
34
+
35
+ def save(self, state: dict) -> None:
36
+ d = os.path.dirname(os.path.abspath(self.path)) or "."
37
+ os.makedirs(d, exist_ok=True)
38
+ fd, tmp = tempfile.mkstemp(dir=d, suffix=".tmp")
39
+ try:
40
+ with os.fdopen(fd, "w") as f:
41
+ json.dump(state, f, default=str)
42
+ os.replace(tmp, self.path) # atomic on POSIX
43
+ finally:
44
+ if os.path.exists(tmp):
45
+ os.remove(tmp)
46
+ if self._c:
47
+ try:
48
+ self._c.upload_file(self.path, self.s3[0], self.s3[1])
49
+ except Exception as e:
50
+ print(f"[ckpt s3 err {str(e)[:70]}]")
51
+
52
+ def load(self) -> Optional[dict]:
53
+ if os.path.exists(self.path):
54
+ try:
55
+ with open(self.path) as f:
56
+ return json.load(f)
57
+ except Exception:
58
+ pass # fall through to S3
59
+ if self._c:
60
+ try:
61
+ os.makedirs(os.path.dirname(os.path.abspath(self.path)) or ".", exist_ok=True)
62
+ self._c.download_file(self.s3[0], self.s3[1], self.path) # boto3 won't mkdir the target dir
63
+ with open(self.path) as f:
64
+ return json.load(f)
65
+ except Exception:
66
+ return None
67
+ return None