yoro-cache 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bench/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ """YORO benchmark harness.
2
+
3
+ A baseline ladder over labelled prompt streams, runnable locally with a mock model
4
+ (`--smoke`, no GPU) or against any OpenAI-compatible endpoint, with per-level
5
+ checkpoint/resume, optional cloud sinks (S3 / CloudWatch / W&B), and a hard
6
+ auto-shutdown budget cap for rented GPUs.
7
+
8
+ Modules:
9
+ budget - BudgetGuard: spend tracking + auto-shutdown before the ceiling.
10
+ metrics - per-prompt Outcome, run summary, cross-seed aggregation + significance.
11
+ ladder - the five rungs (no-cache / exact / gptcache-semantic / behaviors / YORO).
12
+ wandb_log- thin W&B logger shim (falls back to stdout if wandb is absent).
13
+ """
14
+ from .budget import BudgetGuard
15
+ from .metrics import Outcome, summarize, aggregate_seeds, paired_t
16
+ from .ladder import (Strategy, NoCache, ExactCache, SemanticCache, BehaviorsOnly,
17
+ YOROStrategy, build_ladder)
18
+ from .wandb_log import WandbLogger
19
+ from .eventlog import EventLog, S3FileSink, CloudWatchSink
20
+ from .checkpoint import Checkpoint
21
+ from .convergence import Convergence, ci_halfwidth
22
+ from .vast import VastCredit, stop_self
23
+
24
+ __all__ = [
25
+ "BudgetGuard",
26
+ "Outcome", "summarize", "aggregate_seeds", "paired_t",
27
+ "Strategy", "NoCache", "ExactCache", "SemanticCache", "BehaviorsOnly", "YOROStrategy", "build_ladder",
28
+ "WandbLogger",
29
+ "EventLog", "S3FileSink", "CloudWatchSink",
30
+ "Checkpoint",
31
+ "Convergence", "ci_halfwidth",
32
+ "VastCredit", "stop_self",
33
+ ]
bench/budget.py ADDED
@@ -0,0 +1,81 @@
1
+ """BudgetGuard — the safety feature that makes a multi-day, unattended, $500 rented-GPU
2
+ run safe to leave alone.
3
+
4
+ It tracks spend = instance $/hr x elapsed + any per-token API cost, and once spend
5
+ crosses a soft fraction of the hard ceiling it fires a provider-specific shutdown hook
6
+ (e.g. `vastai destroy instance <id>`) exactly once. Pair it with frequent checkpointing
7
+ so the auto-shutdown (or a spot preemption) never loses results.
8
+
9
+ The clock is injectable so the logic is unit-testable without waiting hours.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import time
14
+ from dataclasses import dataclass, field
15
+ from typing import Callable, Optional
16
+
17
+
18
@dataclass
class BudgetGuard:
    """Track spend against a hard ceiling and fire a shutdown hook once at the soft cap.

    Spend is ``hourly_usd`` times the hours elapsed since ``started_at`` (as
    measured by ``clock``) plus any cost accrued via :meth:`add_token_cost`.
    The soft cap is ``ceiling_usd * shutdown_frac``. Call :meth:`check`
    periodically; the first call at or past the soft cap latches the guard as
    stopped and invokes ``on_shutdown`` (if set) exactly once.
    """

    ceiling_usd: float  # HARD cap (e.g. 500)
    hourly_usd: float  # instance price, e.g. Vast.ai spot $/hr
    shutdown_frac: float = 0.9  # auto-shutdown at 90% of the ceiling
    clock: Callable[[], float] = time.time  # injectable for tests
    on_shutdown: Optional[Callable[[], None]] = None  # provider terminate hook (fired once)
    token_cost_usd: float = 0.0  # accrued per-token cost (0 when self-hosting)
    started_at: Optional[float] = None  # defaults to clock() at construction
    _stopped: bool = field(default=False, repr=False)

    def __post_init__(self):
        # Start the meter now unless the caller supplied an explicit start time.
        if self.started_at is None:
            self.started_at = self.clock()

    def add_token_cost(self, usd: float) -> None:
        """Accrue per-token API cost; negative amounts are ignored."""
        self.token_cost_usd += max(0.0, usd)

    def spent(self) -> float:
        """Return instance cost so far plus accrued token cost, in USD."""
        # Clamp at zero so a clock reading earlier than started_at never yields negative spend.
        hours = max(0.0, (self.clock() - self.started_at) / 3600.0)
        return hours * self.hourly_usd + self.token_cost_usd

    def remaining(self) -> float:
        """Return USD left under the hard ceiling, never below zero."""
        return max(0.0, self.ceiling_usd - self.spent())

    def soft_cap(self) -> float:
        """Return the spend level at which the guard stops the run."""
        return self.ceiling_usd * self.shutdown_frac

    def should_stop(self) -> bool:
        """Return True when current spend has reached the soft cap."""
        return self.spent() >= self.soft_cap()

    def check(self) -> bool:
        """Call this periodically (e.g. every checkpoint).

        Returns True once spend crosses the soft cap, firing the shutdown hook
        exactly once. Idempotent thereafter. A hook that raises is reported on
        stdout rather than propagated, and is not retried.
        """
        if self._stopped:
            return True
        if not self.should_stop():
            return False
        # Latch BEFORE calling the hook so it can never fire twice, even if it raises.
        self._stopped = True
        if self.on_shutdown is not None:
            try:
                self.on_shutdown()
            except Exception as e:
                # Best-effort: the run must still wind down, but a failed terminate
                # means the instance may keep billing, so make the failure visible.
                print(f"[budget shutdown hook err {str(e)[:70]}]")
        return True

    def stop(self) -> None:
        """Force-stop from an EXTERNAL signal (e.g. low real Vast credit), so a sweep that
        shares this guard also halts — not just the current level. Does not call the hook."""
        self._stopped = True

    @property
    def stopped(self) -> bool:
        """True once the guard has been latched by :meth:`check` or :meth:`stop`."""
        return self._stopped

    def status(self) -> dict:
        """Return a snapshot of spend, limits and stop state (USD amounts rounded to cents)."""
        return {
            "spent_usd": round(self.spent(), 2),
            "remaining_usd": round(self.remaining(), 2),
            "ceiling_usd": self.ceiling_usd,
            "soft_cap_usd": round(self.soft_cap(), 2),
            "hourly_usd": self.hourly_usd,
            "stopped": self._stopped,
        }
bench/checkpoint.py ADDED
@@ -0,0 +1,67 @@
1
+ """Checkpoint + resume — so a Vast.ai spot preemption or the budget auto-shutdown never
2
+ loses work. A checkpoint is a JSON snapshot of run state (cursor into the prompt stream,
3
+ per-rung outcomes so far, cache contents, accumulated spend). It's written ATOMICALLY
4
+ (temp file + os.replace, so a kill mid-write can't corrupt it) and mirrored to S3, so a
5
+ fresh instance can pull the latest and continue from the cursor.
6
+
7
+ Usage:
8
+ ck = Checkpoint("runs/phase0/ckpt.json", s3=("my-bucket", "yoro/phase0/ckpt.json"))
9
+ state = ck.load() or {"cursor": 0, "outcomes": {}}
10
+ ... # run from state["cursor"], periodically:
11
+ ck.save(state)
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import os
17
+ import tempfile
18
+ from typing import Optional, Tuple
19
+
20
+
21
def _s3():
    """Return a boto3 S3 client, or None when one cannot be created.

    boto3 is imported lazily so the module loads without it; any failure while
    importing boto3 or constructing the client yields None instead of raising.
    """
    try:
        import boto3
    except Exception:
        return None
    try:
        client = boto3.client("s3")
    except Exception:
        return None
    return client
27
+
28
+
29
class Checkpoint:
    """JSON run-state checkpoint written atomically to disk, optionally mirrored to S3.

    Args:
        path: Local file the checkpoint is written to and read from.
        s3: Optional ``(bucket, key)`` pair. When given, every save is also
            uploaded there and ``load`` falls back to downloading it.
    """

    def __init__(self, path: str, s3: Optional[Tuple[str, str]] = None):
        self.path = path
        self.s3 = s3  # (bucket, key) or None
        self._c = _s3() if s3 else None
        if s3 and self._c is None:
            # A mirror was requested but no client could be built; say so instead of
            # silently running with local-only checkpoints.
            print("[ckpt s3 unavailable: checkpoints will NOT be mirrored]")

    def save(self, state: dict) -> None:
        """Write ``state`` as JSON to ``path`` atomically, then mirror it to S3 if configured.

        Values json cannot encode natively are stringified (``default=str``).
        S3 upload failures are printed, not raised; local write errors propagate.
        """
        d = os.path.dirname(os.path.abspath(self.path)) or "."
        os.makedirs(d, exist_ok=True)
        # The temp file lives in the target directory so os.replace stays on one filesystem.
        fd, tmp = tempfile.mkstemp(dir=d, suffix=".tmp")
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(state, f, default=str)
                # Push the bytes to disk before the rename, so a host-level kill
                # cannot leave a truncated file under the live checkpoint name.
                f.flush()
                os.fsync(f.fileno())
            os.replace(tmp, self.path)  # atomic on POSIX
        finally:
            # Only still present if the write or the replace failed.
            if os.path.exists(tmp):
                os.remove(tmp)
        if self._c:
            try:
                self._c.upload_file(self.path, self.s3[0], self.s3[1])
            except Exception as e:
                print(f"[ckpt s3 err {str(e)[:70]}]")

    def load(self) -> Optional[dict]:
        """Return the saved state, or None when no usable checkpoint exists.

        The local file is tried first. If it is missing or unreadable and an S3
        client is configured, the mirror is downloaded to ``path`` and parsed.
        """
        if os.path.exists(self.path):
            try:
                with open(self.path, encoding="utf-8") as f:
                    return json.load(f)
            except Exception as e:
                # Unreadable local copy: report it, then fall through to S3.
                print(f"[ckpt local err {str(e)[:70]}]")
        if self._c:
            try:
                os.makedirs(os.path.dirname(os.path.abspath(self.path)) or ".", exist_ok=True)
                self._c.download_file(self.s3[0], self.s3[1], self.path)  # boto3 won't mkdir the target dir
                with open(self.path, encoding="utf-8") as f:
                    return json.load(f)
            except Exception:
                return None
        return None
bench/convergence.py ADDED
@@ -0,0 +1,81 @@
1
+ """Convergence-based early stop — conclude when the result is statistically SETTLED, not
2
+ when the budget runs out. $500 is the ceiling; convergence is the target.
3
+
4
+ After each seed we hold the per-seed value of every metric for every rung. We track the
5
+ 95% CI half-width of the PRIMARY quantities — YORO's hit-rate and the
6
+ YORO-vs-GPTCache deltas on staleness + brittleness (the headline claims); YORO's accuracy CI is logged but not gated on. Once every
7
+ primary CI is tighter than `ci_target` and at least `min_seeds` are in, we stop — usually
8
+ well under `max_seeds`, saving GPU hours. Final p-values still come from scipy on the
9
+ cluster; this is just the stopping rule.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import math
14
+
15
# Two-sided 95% Student-t critical values t_{.975, n-1}, keyed by SAMPLE SIZE n.
# Sizes above the largest key fall back to the normal value 1.96.
_T95 = {2: 12.71, 3: 4.30, 4: 3.18, 5: 2.78, 6: 2.57, 7: 2.45, 8: 2.36, 9: 2.31,
        10: 2.26, 12: 2.20, 15: 2.14, 20: 2.09, 25: 2.06, 30: 2.05}


def _t95(n: int) -> float:
    """Return the two-sided 95% t critical value to use for a sample of size ``n``.

    Returns ``inf`` for ``n < 2`` (no variance estimate) and 1.96 for ``n``
    above the largest tabulated size. For a size between two table entries the
    value of the largest tabulated size not exceeding ``n`` is used.
    """
    if n < 2:
        return float("inf")
    if n > max(_T95):
        return 1.96
    # The critical value shrinks as n grows, so rounding n DOWN to a tabulated
    # size can only widen the interval. Rounding up would understate it and
    # could make a stopping rule fire too early.
    return _T95[max(k for k in _T95 if k <= n)]


def ci_halfwidth(vals: list) -> float:
    """Return the 95% confidence-interval half-width of the mean of ``vals``.

    Computed as ``t * sd / sqrt(n)`` with the sample standard deviation
    (n - 1 denominator). Returns ``inf`` when fewer than two values are given.
    """
    n = len(vals)
    if n < 2:
        return float("inf")
    m = sum(vals) / n
    sd = math.sqrt(sum((v - m) ** 2 for v in vals) / (n - 1))
    return _t95(n) * sd / math.sqrt(n)
36
+
37
+
38
class Convergence:
    """Seed-level stopping rule: stop when every gated CI is tight enough, or at max_seeds.

    ``per_seed`` arguments are lists with one entry per finished seed, each a
    mapping of rung name -> metric name -> value.
    """

    def __init__(self, min_seeds: int = 8, max_seeds: int = 30, ci_target: float = 0.02):
        self.min_seeds = min_seeds
        self.max_seeds = max_seeds
        self.ci_target = ci_target

    def primaries(self, per_seed: list) -> dict:
        """Return the per-seed series whose CIs gate the early stop.

        That is YORO's hit rate and, when the gptcache-semantic rung is present,
        the gptcache-minus-YORO gaps in staleness and brittleness. Which rungs
        exist is decided from the first seed only. Model accuracy is deliberately
        left out (see ``reported``): it is logged but never gated on.
        """
        if not per_seed:
            return {}
        rung_names = per_seed[0].keys()
        has_yoro = "yoro" in rung_names

        def gap(metric):
            # Baseline minus YORO for one metric, one value per seed.
            return [seed["gptcache-semantic"][metric] - seed["yoro"][metric]
                    for seed in per_seed]

        series = {}
        if has_yoro:
            series["yoro.hit_rate"] = [seed["yoro"]["hit_rate"] for seed in per_seed]
        if has_yoro and "gptcache-semantic" in rung_names:
            series["delta.staleness"] = gap("staleness")
            series["delta.brittleness"] = gap("brittleness")
        return series

    def reported(self, per_seed: list) -> dict:
        """Return the series whose CIs are LOGGED for transparency but not gated on."""
        if per_seed and "yoro" in per_seed[0]:
            return {"yoro.accuracy": [seed["yoro"]["accuracy"] for seed in per_seed]}
        return {}

    def check(self, per_seed: list) -> tuple:
        """Return ``(stop, info)``; stop is True on convergence or once max_seeds is reached."""
        n = len(per_seed)
        if n >= self.max_seeds:
            return True, {"reason": "max_seeds", "n": n}
        if n < self.min_seeds:
            return False, {"reason": "min_seeds_not_reached", "n": n, "need": self.min_seeds}

        gated = {}
        for name, series in self.primaries(per_seed).items():
            gated[name] = ci_halfwidth(series)
        logged = {}  # reported in the info dict, never part of the stop decision
        for name, series in self.reported(per_seed).items():
            logged[name] = ci_halfwidth(series)

        # Converged only if there is at least one gated metric and all are within target.
        converged = bool(gated)
        if converged:
            for width in gated.values():
                if not width <= self.ci_target:
                    converged = False
                    break

        combined = dict(gated)
        combined.update(logged)
        info = {
            "reason": "converged" if converged else "not_yet",
            "n": n,
            "ci_target": self.ci_target,
            "gated_on": list(gated.keys()),
            "ci_halfwidths": {name: round(width, 4) for name, width in combined.items()},
        }
        return converged, info