yoro-cache 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bench/__init__.py +33 -0
- bench/budget.py +81 -0
- bench/checkpoint.py +67 -0
- bench/convergence.py +81 -0
- bench/datasets.py +432 -0
- bench/eventlog.py +137 -0
- bench/ladder.py +183 -0
- bench/metrics.py +96 -0
- bench/model_client.py +134 -0
- bench/run_phase0.py +557 -0
- bench/spike_replay.py +123 -0
- bench/vast.py +79 -0
- bench/wandb_log.py +49 -0
- yoro/__init__.py +52 -0
- yoro/behaviors.py +108 -0
- yoro/cache.py +140 -0
- yoro/cli.py +68 -0
- yoro/core.py +150 -0
- yoro/embeddings.py +98 -0
- yoro/invalidation.py +50 -0
- yoro/keyer.py +102 -0
- yoro/matcher.py +46 -0
- yoro/opencode_behaviors.py +223 -0
- yoro/proxy.py +574 -0
- yoro/structured.py +84 -0
- yoro/tree.py +60 -0
- yoro_cache-0.1.1.dist-info/METADATA +213 -0
- yoro_cache-0.1.1.dist-info/RECORD +32 -0
- yoro_cache-0.1.1.dist-info/WHEEL +5 -0
- yoro_cache-0.1.1.dist-info/entry_points.txt +2 -0
- yoro_cache-0.1.1.dist-info/licenses/LICENSE +21 -0
- yoro_cache-0.1.1.dist-info/top_level.txt +2 -0
bench/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""YORO benchmark harness.
|
|
2
|
+
|
|
3
|
+
A baseline ladder over labelled prompt streams, runnable locally with a mock model
|
|
4
|
+
(`--smoke`, no GPU) or against any OpenAI-compatible endpoint, with per-level
|
|
5
|
+
checkpoint/resume, optional cloud sinks (S3 / CloudWatch / W&B), and a hard
|
|
6
|
+
auto-shutdown budget cap for rented GPUs.
|
|
7
|
+
|
|
8
|
+
Modules:
|
|
9
|
+
budget - BudgetGuard: spend tracking + auto-shutdown before the ceiling.
|
|
10
|
+
metrics - per-prompt Outcome, run summary, cross-seed aggregation + significance.
|
|
11
|
+
ladder - the five rungs (no-cache / exact / gptcache-semantic / behaviors / YORO).
|
|
12
|
+
wandb_log- thin W&B logger shim (falls back to stdout if wandb is absent).
|
|
13
|
+
"""
|
|
14
|
+
from .budget import BudgetGuard
|
|
15
|
+
from .metrics import Outcome, summarize, aggregate_seeds, paired_t
|
|
16
|
+
from .ladder import (Strategy, NoCache, ExactCache, SemanticCache, BehaviorsOnly,
|
|
17
|
+
YOROStrategy, build_ladder)
|
|
18
|
+
from .wandb_log import WandbLogger
|
|
19
|
+
from .eventlog import EventLog, S3FileSink, CloudWatchSink
|
|
20
|
+
from .checkpoint import Checkpoint
|
|
21
|
+
from .convergence import Convergence, ci_halfwidth
|
|
22
|
+
from .vast import VastCredit, stop_self
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"BudgetGuard",
|
|
26
|
+
"Outcome", "summarize", "aggregate_seeds", "paired_t",
|
|
27
|
+
"Strategy", "NoCache", "ExactCache", "SemanticCache", "BehaviorsOnly", "YOROStrategy", "build_ladder",
|
|
28
|
+
"WandbLogger",
|
|
29
|
+
"EventLog", "S3FileSink", "CloudWatchSink",
|
|
30
|
+
"Checkpoint",
|
|
31
|
+
"Convergence", "ci_halfwidth",
|
|
32
|
+
"VastCredit", "stop_self",
|
|
33
|
+
]
|
bench/budget.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""BudgetGuard — the safety feature that makes a multi-day, unattended, $500 rented-GPU
|
|
2
|
+
run safe to leave alone.
|
|
3
|
+
|
|
4
|
+
It tracks spend = instance $/hr x elapsed + any per-token API cost, and once spend
|
|
5
|
+
crosses a soft fraction of the hard ceiling it fires a provider-specific shutdown hook
|
|
6
|
+
(e.g. `vastai destroy instance <id>`) exactly once. Pair it with frequent checkpointing
|
|
7
|
+
so the auto-shutdown (or a spot preemption) never loses results.
|
|
8
|
+
|
|
9
|
+
The clock is injectable so the logic is unit-testable without waiting hours.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from typing import Callable, Optional
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class BudgetGuard:
    """Track spend against a hard ceiling and fire a shutdown hook once at the soft cap.

    Spend is ``hourly_usd`` times the hours elapsed since ``started_at`` (measured
    with ``clock``) plus any accrued ``token_cost_usd``. The soft cap is
    ``ceiling_usd * shutdown_frac``.
    """

    ceiling_usd: float                                # HARD cap (e.g. 500)
    hourly_usd: float                                 # instance price, e.g. Vast.ai spot $/hr
    shutdown_frac: float = 0.9                        # auto-shutdown at 90% of the ceiling
    clock: Callable[[], float] = time.time            # injectable for tests
    on_shutdown: Optional[Callable[[], None]] = None  # provider terminate hook (fired once)
    token_cost_usd: float = 0.0                       # accrued per-token cost (0 when self-hosting)
    started_at: Optional[float] = None                # defaults to clock() at construction
    _stopped: bool = field(default=False, repr=False)

    def __post_init__(self):
        # Start the meter now unless the caller supplied an explicit start time.
        if self.started_at is None:
            self.started_at = self.clock()

    def add_token_cost(self, usd: float) -> None:
        """Accrue per-token API cost; negative amounts are ignored."""
        self.token_cost_usd += max(0.0, usd)

    def spent(self) -> float:
        """Return total spend so far: elapsed hours x hourly price + token cost."""
        # Clamp at zero so a clock that reads earlier than started_at never yields
        # negative instance spend.
        hours = max(0.0, (self.clock() - self.started_at) / 3600.0)
        return hours * self.hourly_usd + self.token_cost_usd

    def remaining(self) -> float:
        """Return the budget left under the hard ceiling, never below zero."""
        return max(0.0, self.ceiling_usd - self.spent())

    def soft_cap(self) -> float:
        """Return the spend level at which the guard stops the run."""
        return self.ceiling_usd * self.shutdown_frac

    def should_stop(self) -> bool:
        """Return True when current spend has reached the soft cap (no side effects)."""
        return self.spent() >= self.soft_cap()

    def check(self) -> bool:
        """Call this periodically (e.g. every checkpoint). Returns True once spend crosses
        the soft cap, firing the shutdown hook exactly once. Idempotent thereafter.

        A hook that raises does not propagate: the failure is printed and the guard
        still reports stopped, so the caller can wind the run down itself.
        """
        if self._stopped:
            return True
        if not self.should_stop():
            return False
        # Mark stopped BEFORE calling the hook so a failing hook is never retried.
        self._stopped = True
        if self.on_shutdown is not None:
            try:
                self.on_shutdown()
            except Exception as e:
                # Best-effort: a broken hook must not crash the run, but swallowing it
                # silently would hide that the requested shutdown did not happen.
                print(f"[budget shutdown hook err {str(e)[:70]}]")
        return True

    def stop(self) -> None:
        """Force-stop from an EXTERNAL signal (e.g. low real Vast credit), so a sweep that
        shares this guard also halts — not just the current level.

        Only sets the stopped flag; the shutdown hook is not invoked here.
        """
        self._stopped = True

    @property
    def stopped(self) -> bool:
        """True once the guard has been stopped, by check() or by stop()."""
        return self._stopped

    def status(self) -> dict:
        """Return a snapshot of the budget; computed dollar amounts are rounded to cents."""
        return {
            "spent_usd": round(self.spent(), 2),
            "remaining_usd": round(self.remaining(), 2),
            "ceiling_usd": self.ceiling_usd,
            "soft_cap_usd": round(self.soft_cap(), 2),
            "hourly_usd": self.hourly_usd,
            "stopped": self._stopped,
        }
|
bench/checkpoint.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Checkpoint + resume — so a Vast.ai spot preemption or the budget auto-shutdown never
|
|
2
|
+
loses work. A checkpoint is a JSON snapshot of run state (cursor into the prompt stream,
|
|
3
|
+
per-rung outcomes so far, cache contents, accumulated spend). It's written ATOMICALLY
|
|
4
|
+
(temp file + os.replace, so a kill mid-write can't corrupt it) and mirrored to S3, so a
|
|
5
|
+
fresh instance can pull the latest and continue from the cursor.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
ck = Checkpoint("runs/phase0/ckpt.json", s3=("my-bucket", "yoro/phase0/ckpt.json"))
|
|
9
|
+
state = ck.load() or {"cursor": 0, "outcomes": {}}
|
|
10
|
+
... # run from state["cursor"], periodically:
|
|
11
|
+
ck.save(state)
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import tempfile
|
|
18
|
+
from typing import Optional, Tuple
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _s3():
|
|
22
|
+
try:
|
|
23
|
+
import boto3
|
|
24
|
+
return boto3.client("s3")
|
|
25
|
+
except Exception:
|
|
26
|
+
return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Checkpoint:
    """JSON checkpoint on local disk, optionally mirrored to an S3 object.

    ``path`` is the local checkpoint file. ``s3`` is an optional ``(bucket, key)``
    pair; when given and a client can be created, every save is uploaded there and
    load falls back to downloading it.
    """

    def __init__(self, path: str, s3: Optional[Tuple[str, str]] = None):
        self.path = path
        self.s3 = s3                       # (bucket, key) or None
        self._c = _s3() if s3 else None    # S3 client; None when S3 is off or unavailable

    def save(self, state: dict) -> None:
        """Write ``state`` as JSON to ``path`` atomically, then mirror it to S3.

        The data goes to a temp file in the target directory, is flushed and
        fsynced, and only then renamed over ``path``. Values json cannot encode
        are stringified (``default=str``). An S3 upload failure is printed, not raised.
        """
        d = os.path.dirname(os.path.abspath(self.path)) or "."
        os.makedirs(d, exist_ok=True)
        # Same directory as the target so os.replace stays a same-filesystem rename.
        fd, tmp = tempfile.mkstemp(dir=d, suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(state, f, default=str)
                # Push the bytes to disk BEFORE the rename; otherwise a crash shortly
                # after os.replace can leave a renamed-but-truncated checkpoint.
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp, self.path)     # atomic on POSIX
        finally:
            # Only still present when the write or the rename failed.
            if os.path.exists(tmp):
                os.remove(tmp)
        if self._c:
            try:
                self._c.upload_file(self.path, self.s3[0], self.s3[1])
            except Exception as e:
                print(f"[ckpt s3 err {str(e)[:70]}]")

    def load(self) -> Optional[dict]:
        """Return the latest checkpoint state, or None when none can be read.

        The local file is tried first. If it is missing or unreadable and an S3
        client is configured, the S3 object is downloaded to ``path`` and parsed.
        Failures are printed rather than raised.
        """
        if os.path.exists(self.path):
            try:
                with open(self.path, encoding="utf-8") as f:
                    return json.load(f)
            except Exception as e:
                # Unreadable/corrupt local copy: report it, then fall through to S3.
                print(f"[ckpt local load err {str(e)[:70]}]")
        if self._c:
            try:
                # boto3 won't mkdir the target dir
                os.makedirs(os.path.dirname(os.path.abspath(self.path)) or ".", exist_ok=True)
                self._c.download_file(self.s3[0], self.s3[1], self.path)
                with open(self.path, encoding="utf-8") as f:
                    return json.load(f)
            except Exception as e:
                print(f"[ckpt s3 load err {str(e)[:70]}]")
                return None
        return None
|
bench/convergence.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Convergence-based early stop — conclude when the result is statistically SETTLED, not
|
|
2
|
+
when the budget runs out. $500 is the ceiling; convergence is the target.
|
|
3
|
+
|
|
4
|
+
After each seed we hold the per-seed value of every metric for every rung. We track the
|
|
5
|
+
95% CI half-width of the PRIMARY quantities — YORO's hit-rate (accuracy is logged, not gated), and the
|
|
6
|
+
YORO-vs-GPTCache deltas on staleness + brittleness (the headline claims). Once every
|
|
7
|
+
primary CI is tighter than `ci_target` and at least `min_seeds` are in, we stop — usually
|
|
8
|
+
well under `max_seeds`, saving GPU hours. Final p-values still come from scipy on the
|
|
9
|
+
cluster; this is just the stopping rule.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import math
|
|
14
|
+
|
|
15
|
+
# t_{.975, n-1} for small n (two-sided 95%); falls back to 1.96 for large n.
|
|
16
|
+
_T95 = {2: 12.71, 3: 4.30, 4: 3.18, 5: 2.78, 6: 2.57, 7: 2.45, 8: 2.36, 9: 2.31,
|
|
17
|
+
10: 2.26, 12: 2.20, 15: 2.14, 20: 2.09, 25: 2.06, 30: 2.05}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _t95(n: int) -> float:
|
|
21
|
+
if n < 2:
|
|
22
|
+
return float("inf")
|
|
23
|
+
for k in sorted(_T95):
|
|
24
|
+
if n <= k:
|
|
25
|
+
return _T95[k]
|
|
26
|
+
return 1.96
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def ci_halfwidth(vals: list) -> float:
    """Return the 95% confidence-interval half-width of the mean of ``vals``.

    Computed as t * s / sqrt(n), with s the sample standard deviation (n - 1
    denominator) and t taken from ``_t95``. Fewer than two values give ``inf``.
    """
    count = len(vals)
    if count < 2:
        return float("inf")
    mean = sum(vals) / count
    variance = sum((x - mean) ** 2 for x in vals) / (count - 1)
    spread = math.sqrt(variance)
    return _t95(count) * spread / math.sqrt(count)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Convergence:
    """Stopping rule over per-seed results: stop on tight primary CIs or at ``max_seeds``."""

    def __init__(self, min_seeds: int = 8, max_seeds: int = 30, ci_target: float = 0.02):
        self.min_seeds, self.max_seeds, self.ci_target = min_seeds, max_seeds, ci_target

    def primaries(self, per_seed: list) -> dict:
        """Return the per-seed series whose CIs gate the early stop.

        That is YORO's hit rate, plus the GPTCache-minus-YORO deltas on staleness
        and brittleness when both rungs are present. Which rungs exist is read from
        the first seed only. Model accuracy is deliberately NOT gated on here — it
        is exposed through ``reported`` instead.
        """
        if not per_seed:
            return {}
        rungs = per_seed[0].keys()
        has_yoro = "yoro" in rungs
        series = {}
        if has_yoro:
            series["yoro.hit_rate"] = [seed["yoro"]["hit_rate"] for seed in per_seed]
        if has_yoro and "gptcache-semantic" in rungs:

            def gap(metric):
                # Baseline minus YORO, one value per seed.
                return [seed["gptcache-semantic"][metric] - seed["yoro"][metric]
                        for seed in per_seed]

            series["delta.staleness"] = gap("staleness")
            series["delta.brittleness"] = gap("brittleness")
        return series

    def reported(self, per_seed: list) -> dict:
        """Return the series whose CIs are LOGGED for transparency but never gated on."""
        if per_seed and "yoro" in per_seed[0]:
            return {"yoro.accuracy": [seed["yoro"]["accuracy"] for seed in per_seed]}
        return {}

    def check(self, per_seed: list) -> tuple:
        """Return ``(stop, info)``; stop is True on convergence or once ``max_seeds`` is hit.

        Below ``min_seeds`` the answer is always False. Otherwise the run has
        converged when there is at least one primary series and every primary CI
        half-width is at most ``ci_target``.
        """
        n = len(per_seed)
        if n >= self.max_seeds:
            return True, {"reason": "max_seeds", "n": n}
        if n < self.min_seeds:
            return False, {"reason": "min_seeds_not_reached", "n": n, "need": self.min_seeds}

        gated = {}
        for name, values in self.primaries(per_seed).items():
            gated[name] = ci_halfwidth(values)
        logged = {}  # reported only, never part of the stop decision
        for name, values in self.reported(per_seed).items():
            logged[name] = ci_halfwidth(values)

        converged = bool(gated) and all(w <= self.ci_target for w in gated.values())
        rounded = {name: round(w, 4) for name, w in {**gated, **logged}.items()}
        info = {
            "reason": "converged" if converged else "not_yet",
            "n": n,
            "ci_target": self.ci_target,
            "gated_on": list(gated.keys()),
            "ci_halfwidths": rounded,
        }
        return converged, info
|