tracefork 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracefork/__init__.py +6 -0
- tracefork/blame.py +296 -0
- tracefork/cli.py +367 -0
- tracefork/constants.py +24 -0
- tracefork/faults.py +129 -0
- tracefork/fork.py +173 -0
- tracefork/nondet.py +96 -0
- tracefork/py.typed +0 -0
- tracefork/recorder.py +140 -0
- tracefork/replay.py +119 -0
- tracefork/report.py +131 -0
- tracefork/server.py +73 -0
- tracefork/store.py +123 -0
- tracefork/synthetic.py +104 -0
- tracefork/tape.py +135 -0
- tracefork/transport.py +137 -0
- tracefork/validate.py +177 -0
- tracefork/web/report.html +209 -0
- tracefork/wire.py +76 -0
- tracefork-0.1.0.dist-info/METADATA +235 -0
- tracefork-0.1.0.dist-info/RECORD +32 -0
- tracefork-0.1.0.dist-info/WHEEL +4 -0
- tracefork-0.1.0.dist-info/entry_points.txt +2 -0
- tracefork-0.1.0.dist-info/licenses/LICENSE +21 -0
- tracefork_spike/__init__.py +7 -0
- tracefork_spike/__main__.py +3 -0
- tracefork_spike/agent.py +91 -0
- tracefork_spike/fake_llm.py +106 -0
- tracefork_spike/nondet.py +97 -0
- tracefork_spike/spike.py +125 -0
- tracefork_spike/tape.py +79 -0
- tracefork_spike/transport.py +68 -0
tracefork/__init__.py
ADDED
tracefork/blame.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Blame engine: rank each exchange by causal flip-rate with Wilson CIs.
|
|
2
|
+
|
|
3
|
+
The causal question is "if step *i* had gone differently, how often would the
|
|
4
|
+
run's *outcome* change?" Answering it honestly requires re-running the agent,
|
|
5
|
+
not just rewriting the tape: perturbing step *i*'s response changes every
|
|
6
|
+
request the agent makes afterward. So for each candidate step we:
|
|
7
|
+
|
|
8
|
+
1. fork the recorded run at *i* with a perturbed response — the prefix is
|
|
9
|
+
replayed from the parent tape for $0 and the agent is re-run from there
|
|
10
|
+
(`ForkEngine.fork`);
|
|
11
|
+
2. grade the resulting outcome with an `Oracle`;
|
|
12
|
+
3. count it as a *flip* when the graded outcome differs from the parent run's.
|
|
13
|
+
|
|
14
|
+
`flip_rate = flips / k` over `k` trials, with a Wilson score 95% interval so a
|
|
15
|
+
small *k* doesn't masquerade as certainty. `BudgetGovernor` estimates the
|
|
16
|
+
fork count and dollar cost before any spend.
|
|
17
|
+
|
|
18
|
+
The engine is agent- and domain-agnostic: the caller supplies `agent_fn` (the
|
|
19
|
+
same agent that produced the tape) and a `perturb_factory(step) -> (response,
|
|
20
|
+
tail_transport)`. In tests and the offline validation suite, `tail_transport`
|
|
21
|
+
is a scripted fake (zero cost); for a live run it is `None`, so the
|
|
22
|
+
counterfactual tail hits the real API under the budget cap.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import json
|
|
28
|
+
import math
|
|
29
|
+
from collections.abc import Callable
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from typing import Protocol, cast
|
|
32
|
+
|
|
33
|
+
import httpx
|
|
34
|
+
|
|
35
|
+
from .constants import PRICING_TABLE, SONNET
|
|
36
|
+
from .fork import BranchSpec, ForkEngine
|
|
37
|
+
from .tape import Tape
|
|
38
|
+
|
|
39
|
+
# ── Wilson score CI ────────────────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def wilson_ci(successes: int, n: int, z: float = 1.96) -> tuple[float, float]:
    """Wilson score confidence interval for a binomial proportion.

    Args:
        successes: Number of observed successes.
        n: Number of trials.
        z: Normal quantile for the desired confidence level; the default of
            1.96 yields a 95% interval.

    Returns:
        ``(lower, upper)``, clamped to ``[0.0, 1.0]``. With zero trials the
        uninformative interval ``(0.0, 1.0)`` is returned.
    """
    if n == 0:
        return (0.0, 1.0)

    z_sq = z**2
    p_hat = successes / n
    scale = 1 + z_sq / n
    midpoint = (p_hat + z_sq / (2 * n)) / scale
    half_width = (z * math.sqrt(p_hat * (1 - p_hat) / n + z_sq / (4 * n**2))) / scale

    lower = max(0.0, midpoint - half_width)
    upper = min(1.0, midpoint + half_width)
    return (lower, upper)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# ── Oracle protocol ─────────────────────────────────────────────────────────
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class Oracle(Protocol):
    """Structural interface for objects that grade a run's final output text."""

    def grade(self, output: str) -> bool | None:
        """Return a verdict for *output*; ``None`` means no verdict was reached."""
        ...
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class StringMatchOracle:
    """Regex-based oracle.

    ``grade`` yields ``True`` when the success pattern is found, ``False``
    when only the failure pattern is found, and ``None`` when neither matches.
    """

    def __init__(self, *, success_re: str, failure_re: str) -> None:
        import re

        # Compile once up front so every grade() call reuses the patterns.
        self._success, self._failure = re.compile(success_re), re.compile(failure_re)

    def grade(self, output: str) -> bool | None:
        """Classify *output*; the success pattern is checked first and wins ties."""
        if self._success.search(output) is not None:
            return True
        return False if self._failure.search(output) is not None else None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Result types ────────────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class FlipRateResult:
    """Flip-rate measurement for a single forked exchange."""

    # Index of the exchange that was perturbed.
    step_index: int
    # flips / trials (0.0 when no trials were requested).
    flip_rate: float
    # Lower and upper bounds of the Wilson score interval around flip_rate.
    ci_lo: float
    ci_hi: float
    # Number of trials whose graded outcome differed from the parent run's.
    flips: int
    # Number of trials requested for this step.
    trials: int
    # Human-readable verdict derived from flip_rate.
    interpretation: str = ""
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class BlameReport:
    """Outcome of a blame run across every exchange of a tape."""

    # One entry per forked exchange.
    results: list[FlipRateResult]
    # Trials requested per exchange.
    k: int
    # Number of forks attempted.
    total_forks: int
    # Graded outcome of the recorded (parent) run; None when ungraded.
    parent_outcome: bool | None = None
    # Pre-flight dollar estimate for the run.
    est_cost_usd: float = 0.0

    def top(self) -> FlipRateResult | None:
        """Return the result with the highest flip-rate, or ``None`` if there are none."""
        if self.results:
            # max() keeps the earliest entry when several share the top rate.
            return max(self.results, key=lambda res: res.flip_rate)
        return None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass
class BlameEstimate:
    """Pre-flight sizing of a blame run."""

    # Number of exchanges that would be forked.
    n_candidates: int
    # Total forks: candidates multiplied by trials per candidate.
    n_forks: int
    # Estimated spend in US dollars.
    est_usd: float
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ── BudgetGovernor ──────────────────────────────────────────────────────────
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class BudgetExceededError(RuntimeError):
    """Signals that a blame run's projected cost is above the caller's budget."""
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _detect_model(tape: Tape) -> str:
|
|
120
|
+
"""Best-effort model id from the first recorded request (defaults to Sonnet)."""
|
|
121
|
+
for req, _ in tape.exchanges:
|
|
122
|
+
try:
|
|
123
|
+
m = json.loads(req).get("model")
|
|
124
|
+
except Exception:
|
|
125
|
+
m = None
|
|
126
|
+
if m:
|
|
127
|
+
return m
|
|
128
|
+
return SONNET
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _avg_tokens(tape: Tape) -> tuple[float, float]:
|
|
132
|
+
"""Average (input, output) tokens per exchange — from recorded ``usage`` when
|
|
133
|
+
present, else a ~4-bytes-per-token estimate from the raw bytes."""
|
|
134
|
+
if not tape.exchanges:
|
|
135
|
+
return (0.0, 0.0)
|
|
136
|
+
ins: list[float] = []
|
|
137
|
+
outs: list[float] = []
|
|
138
|
+
for req, resp in tape.exchanges:
|
|
139
|
+
usage: dict = {}
|
|
140
|
+
try:
|
|
141
|
+
d = json.loads(resp)
|
|
142
|
+
if isinstance(d, dict):
|
|
143
|
+
usage = d.get("usage") or {}
|
|
144
|
+
except Exception:
|
|
145
|
+
usage = {}
|
|
146
|
+
ins.append(usage.get("input_tokens") or max(1, len(req) // 4))
|
|
147
|
+
outs.append(usage.get("output_tokens") or max(1, len(resp) // 4))
|
|
148
|
+
n = len(tape.exchanges)
|
|
149
|
+
return (sum(ins) / n, sum(outs) / n)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class BudgetGovernor:
    """Pre-flight cost estimation for blame runs."""

    @staticmethod
    def estimate(
        tape: Tape,
        *,
        k: int,
        model: str | None = None,
        cost_per_fork_usd: float | None = None,
    ) -> BlameEstimate:
        """Estimate how many forks a blame run needs and what they would cost.

        Only the counterfactual *tail* is billed: the replayed prefix and the
        mutated step itself are free. Forking step ``i`` of an ``n``-exchange
        tape therefore bills ``n-1-i`` calls, for ``k * sum_i (n-1-i)`` calls
        in total. Each call is priced with the per-token rates from
        ``constants.PRICING_TABLE`` applied to the tape's average token
        counts. Supplying ``cost_per_fork_usd`` replaces that model with a
        flat per-fork price.

        Args:
            tape: The recorded run to be analysed.
            k: Trials per candidate step.
            model: Pricing key; detected from the tape when omitted, and
                priced as Sonnet when it is not in the pricing table.
            cost_per_fork_usd: Optional flat price per fork.

        Returns:
            A ``BlameEstimate`` with the candidate count, fork count and
            estimated dollar cost.
        """
        steps = len(tape.exchanges)
        forks = steps * k

        if cost_per_fork_usd is None:
            # sum_{i=0}^{steps-1} (steps - 1 - i) == steps * (steps - 1) / 2
            tail_calls = (steps * (steps - 1) // 2) * k
            rates = PRICING_TABLE.get(model or _detect_model(tape), PRICING_TABLE[SONNET])
            in_rate, out_rate = rates
            avg_in, avg_out = _avg_tokens(tape)
            usd = tail_calls * (avg_in * in_rate + avg_out * out_rate)
        else:
            usd = forks * cost_per_fork_usd

        return BlameEstimate(n_candidates=steps, n_forks=forks, est_usd=usd)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# ── outcome extraction ────────────────────────────────────────────────────────
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _outcome_text(resp_bytes: bytes) -> str:
|
|
188
|
+
"""Extract the assistant's text from a recorded response (JSON or SSE)."""
|
|
189
|
+
try:
|
|
190
|
+
d = json.loads(resp_bytes)
|
|
191
|
+
except Exception:
|
|
192
|
+
return resp_bytes.decode(errors="replace")
|
|
193
|
+
if isinstance(d, dict):
|
|
194
|
+
for block in d.get("content", []):
|
|
195
|
+
if block.get("type") == "text":
|
|
196
|
+
return block.get("text", "")
|
|
197
|
+
return ""
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _interpret(flip_rate: float) -> str:
|
|
201
|
+
if flip_rate >= 0.7:
|
|
202
|
+
return "decisive — this step caused it"
|
|
203
|
+
if flip_rate >= 0.3:
|
|
204
|
+
return "suggestive"
|
|
205
|
+
return "diffuse — not the cause"
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ── BlameEngine ─────────────────────────────────────────────────────────────
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class BlameEngine:
    """Ranks exchanges by causal flip-rate."""

    @staticmethod
    def rank(
        tape: Tape,
        agent_fn,  # Callable[[anthropic.Anthropic], Any] — the SAME agent
        oracle: Oracle,
        *,
        perturb_factory: Callable[[int], tuple[bytes, object]],
        k: int = 10,
        budget_usd: float = 5.0,
        api_key: str = "sk-ant-blame",
    ) -> BlameReport:
        """Measure, per exchange, how often perturbing it changes the graded outcome.

        Every exchange of ``tape`` is forked ``k`` times. For each trial,
        ``perturb_factory(step)`` supplies ``(mutated_response_bytes,
        tail_transport)``; the transport serves the counterfactual tail (a
        scripted fake offline, or ``None`` to use the real API). The final
        exchange of the fork is graded by ``oracle`` and counts as a flip
        when the verdict is not ``None`` and differs from the parent's.

        Args:
            tape: The recorded parent run.
            agent_fn: The agent that produced ``tape``; re-run for each fork.
            oracle: Grades final output text.
            perturb_factory: Produces the perturbation for a given step.
            k: Trials per exchange.
            budget_usd: Upper bound on the estimated cost.
            api_key: Key forwarded to ``ForkEngine.fork``.

        Returns:
            A ``BlameReport`` whose results are sorted by descending flip-rate.

        Raises:
            BudgetExceededError: If the pre-flight estimate exceeds
                ``budget_usd``.
        """
        estimate = BudgetGovernor.estimate(tape, k=k)
        if estimate.est_usd > budget_usd:
            raise BudgetExceededError(
                f"estimated blame cost ${estimate.est_usd:.2f} exceeds budget "
                f"${budget_usd:.2f} ({estimate.n_forks} forks at k={k}); raise the budget "
                f"or lower k"
            )

        # Baseline verdict: the oracle's grade of the last recorded response.
        baseline: bool | None = (
            oracle.grade(_outcome_text(tape.exchanges[-1][1])) if tape.exchanges else None
        )

        ranked: list[FlipRateResult] = []
        forks_run = 0

        for step in range(len(tape.exchanges)):
            flip_count = 0
            for _ in range(k):
                response, transport_obj = perturb_factory(step)
                transport = cast("httpx.BaseTransport | None", transport_obj)
                branch_spec = BranchSpec(divergence_step=step, mutated_response=response)

                # A fork is counted whether or not it completes.
                forks_run += 1
                try:
                    branch = ForkEngine.fork(
                        tape,
                        branch_spec,
                        agent_fn,
                        post_fork_transport=transport,
                        api_key=api_key,
                    )
                except Exception:
                    # A divergent fork (e.g. agent not deterministic up to the
                    # step) counts as cost spent but no observed flip.
                    continue

                tail = branch.delta_tape.exchanges
                verdict = oracle.grade(_outcome_text(tail[-1][1])) if tail else None
                if verdict is not None and verdict != baseline:
                    flip_count += 1

            # Failed and ungraded trials stay in the denominator.
            rate = flip_count / k if k > 0 else 0.0
            lower, upper = wilson_ci(flip_count, k)
            ranked.append(
                FlipRateResult(
                    step_index=step,
                    flip_rate=rate,
                    ci_lo=lower,
                    ci_hi=upper,
                    flips=flip_count,
                    trials=k,
                    interpretation=_interpret(rate),
                )
            )

        ranked.sort(key=lambda item: item.flip_rate, reverse=True)
        return BlameReport(
            results=ranked,
            k=k,
            total_forks=forks_run,
            parent_outcome=baseline,
            est_cost_usd=estimate.est_usd,
        )
|
tracefork/cli.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""tracefork CLI — entry point for all commands.
|
|
2
|
+
|
|
3
|
+
tracefork <command> [args]
|
|
4
|
+
|
|
5
|
+
Commands: replay, verify, fork, report, serve, blame, validate.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import typer
|
|
13
|
+
|
|
14
|
+
# Root Typer application; each function decorated with @app.command() in this
# module is registered on it as a subcommand.
app = typer.Typer(name="tracefork", help="Time-travel debugger for AI agents.")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@app.command()
def replay(
    tape_path: Path = typer.Argument(..., help="Path to a .tape.sqlite file"),  # noqa: B008
    agent: str = typer.Option(..., "--agent", "-a", help="Import path of agent fn (pkg.mod:fn)"),
) -> None:
    """Replay a tape and print the verification receipt."""
    # Exit status: 0 when the replay is bit-exact, 1 otherwise.
    import importlib

    from tracefork.replay import ReplayVerifier
    from tracefork.tape import Tape

    # Reject a malformed agent spec with a usage error up front, instead of
    # letting tuple unpacking fail with a raw ValueError traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            "expected an import path of the form pkg.mod:fn", param_hint="--agent"
        )

    tape = Tape.load(str(tape_path))
    agent_fn = getattr(importlib.import_module(module_path), fn_name)

    result = ReplayVerifier(tape, agent_fn).verify()
    _print_receipt(tape_path, result)
    raise typer.Exit(0 if result.bit_exact else 1)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@app.command()
def verify(
    tape_path: Path = typer.Argument(None, help="Single tape to verify"),  # noqa: B008
    agent: str = typer.Option(None, "--agent", "-a", help="Import path of agent fn"),
    corpus: bool = typer.Option(
        False, "--corpus", help="Verify all tapes in experiments/validation_tapes/"
    ),
) -> None:
    """Verify bit-exact replay. Exit 1 on drift."""
    import importlib

    from tracefork.replay import ReplayVerifier
    from tracefork.tape import Tape

    if corpus:
        # Corpus mode currently only enumerates the tapes: each is reported
        # as skipped because no agent is associated with an individual tape.
        corpus_dir = Path("experiments/validation_tapes")
        tapes = list(corpus_dir.glob("*.tape.sqlite"))
        if not tapes:
            typer.echo("No tapes found in experiments/validation_tapes/")
            raise typer.Exit(1)
        for tp in sorted(tapes):
            typer.echo(f" {tp.name}: skipped (agent not specified per-tape)")
        typer.echo(f"Corpus: {len(tapes)} tapes scanned")
        raise typer.Exit(0)

    if tape_path is None or agent is None:
        typer.echo("Provide --agent and a tape path, or use --corpus")
        raise typer.Exit(1)

    # Reject a malformed agent spec with a usage error up front, instead of
    # letting tuple unpacking fail with a raw ValueError traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            "expected an import path of the form pkg.mod:fn", param_hint="--agent"
        )

    tape = Tape.load(str(tape_path))
    agent_fn = getattr(importlib.import_module(module_path), fn_name)
    result = ReplayVerifier(tape, agent_fn).verify()
    _print_receipt(tape_path, result)
    raise typer.Exit(0 if result.bit_exact else 1)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@app.command()
def fork(
    run_id: str = typer.Argument(..., help="Parent run_id to fork from"),
    step: int = typer.Option(..., "--step", "-s", help="Exchange index to diverge at"),
    response_file: Path = typer.Option(  # noqa: B008
        ..., "--response", "-r", help="Path to .bytes file containing mutated response"
    ),
    agent: str = typer.Option(..., "--agent", "-a", help="Import path of post-fork agent fn"),
    store: Path = typer.Option(Path("store.db"), "--store", help="Path to store.db"),  # noqa: B008
    desc: str = typer.Option("", "--desc", "-d", help="Human description of mutation"),
) -> None:
    """Fork a run at a step with a mutated response, record the new branch."""
    import importlib

    from tracefork.fork import BranchSpec, ForkEngine
    from tracefork.store import TapeStore

    # Reject a malformed agent spec with a usage error before touching the
    # store, instead of failing later with a raw ValueError traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            "expected an import path of the form pkg.mod:fn", param_hint="--agent"
        )

    db = TapeStore(str(store))
    parent_tape = db.load_tape(run_id)

    # The divergence step must name an exchange that exists on the parent
    # tape; a negative or too-large index is a usage error.
    n_exchanges = len(parent_tape.exchanges)
    if not 0 <= step < n_exchanges:
        raise typer.BadParameter(
            f"step {step} is out of range for a run with {n_exchanges} exchanges",
            param_hint="--step",
        )

    mutated_response = response_file.read_bytes()
    agent_fn = getattr(importlib.import_module(module_path), fn_name)

    spec = BranchSpec(
        divergence_step=step,
        mutated_response=mutated_response,
        mutation_desc=desc,
    )

    branch = ForkEngine.fork(parent_tape, spec, agent_fn)

    branch_id = db.save_branch(
        parent_run_id=run_id,
        divergence_step=step,
        delta_tape=branch.delta_tape,
        mutation_desc=desc,
    )

    typer.echo("\n Fork created")
    typer.echo(f" branch_id {branch_id}")
    typer.echo(f" parent_run_id {run_id}")
    typer.echo(f" divergence_step {step}")
    typer.echo(f" delta_exchanges {len(branch.delta_tape.exchanges)}")
    typer.echo(f" description {desc or '(none)'}\n")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@app.command()
def report(
    run_id: str = typer.Argument(None, help="run_id to report on (from store)"),
    tape_path: Path = typer.Option(  # noqa: B008
        None, "--tape", "-t", help="Path to a .tape.sqlite file"
    ),
    output: Path = typer.Option(  # noqa: B008
        Path("report.html"), "--output", "-o", help="Output HTML file"
    ),
    store: Path = typer.Option(  # noqa: B008
        Path("store.db"), "--store", help="Path to store.db"
    ),
) -> None:
    """Generate a self-contained HTML report from a tape."""
    from tracefork.report import generate_report
    from tracefork.tape import Tape

    # Without a tape source there is nothing to report on.
    if not tape_path and not run_id:
        typer.echo("Provide a run_id or --tape path")
        raise typer.Exit(1)

    if tape_path:
        # An explicit tape file takes precedence over a run_id.
        tape = Tape.load(str(tape_path))
    else:
        from tracefork.store import TapeStore

        tape = TapeStore(str(store)).load_tape(run_id)

    generate_report(tape, output)
    typer.echo(f"Report written to {output}")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@app.command()
def serve(
    store: Path = typer.Option(  # noqa: B008
        Path("store.db"), "--store", help="Path to store.db"
    ),
    port: int = typer.Option(7777, "--port", "-p", help="Port to listen on"),
) -> None:
    """Start the tracefork web UI server on 127.0.0.1 (port 7777 unless --port is given)."""
    import uvicorn

    from tracefork.server import app as fastapi_app
    from tracefork.server import init_store

    # The store must be initialised before the server starts handling requests.
    init_store(str(store))
    typer.echo(f" tracefork serve → http://127.0.0.1:{port}")
    # Bound to the loopback interface only, with a single worker.
    uvicorn.run(fastapi_app, host="127.0.0.1", port=port, workers=1, log_level="warning")
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@app.command()
def blame(
    run_id: str = typer.Argument(..., help="run_id to analyze"),
    agent: str = typer.Option(
        ...,
        "--agent",
        "-a",
        help="Import path of the agent fn (pkg.mod:fn) that produced this run; "
        "it is re-run for each fork and must be deterministic up to the fork point",
    ),
    k: int = typer.Option(10, "--k", help="Forks per candidate step"),
    budget: float = typer.Option(5.0, "--budget", help="USD spend cap"),
    perturbation: str = typer.Option(
        "[tracefork] this step did not complete as recorded",
        "--perturbation",
        help="Text injected as the counterfactual response",
    ),
    success_re: str = typer.Option("SUCCESS", "--success-re", help="Regex for success outcome"),
    failure_re: str = typer.Option("FAIL", "--failure-re", help="Regex for failure outcome"),
    store: Path = typer.Option(  # noqa: B008
        Path("store.db"), "--store", help="Path to store.db"
    ),
) -> None:
    """Run causal blame analysis on a recorded run.

    For each exchange, the agent is re-run with that step's response perturbed
    and the counterfactual tail recorded against the real API (budget-capped).
    The offline, $0 proof that blame correctly fingers known faults is
    `tracefork validate`.
    """
    # run_id is interpolated into the output file name below, so restrict it
    # to characters that cannot form a path.
    if not run_id or not all(c.isalnum() or c in "-_" for c in run_id):
        raise typer.BadParameter("run_id must be alphanumeric (with '-' or '_')")

    # Reject a malformed agent spec with a usage error up front, instead of
    # letting tuple unpacking fail with a raw ValueError traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            "expected an import path of the form pkg.mod:fn", param_hint="--agent"
        )

    import importlib
    import json
    import os

    from tracefork.blame import BlameEngine, BudgetGovernor, StringMatchOracle
    from tracefork.store import TapeStore
    from tracefork.wire import make_text_response

    db = TapeStore(str(store))
    tape = db.load_tape(run_id)

    agent_fn = getattr(importlib.import_module(module_path), fn_name)

    oracle = StringMatchOracle(success_re=success_re, failure_re=failure_re)
    est = BudgetGovernor.estimate(tape, k=k)

    typer.echo(f"\n Blame estimate: {est.n_forks} forks, ~${est.est_usd:.2f}")
    if est.est_usd > budget:
        typer.echo(f" Estimated cost ${est.est_usd:.2f} exceeds budget ${budget:.2f}.")
        typer.echo(" Use --budget to increase or --k to reduce trials.")
        raise typer.Exit(1)

    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        # The tail is served by the real API here, and failed forks are
        # counted as "no flip" rather than raised, so a missing key would
        # otherwise produce a misleading all-zero report with no explanation.
        typer.echo(
            " warning: ANTHROPIC_API_KEY is not set; forks that need the real API "
            "will fail and be counted as no-flip.",
            err=True,
        )

    mutated = make_text_response(perturbation)

    def perturb_factory(step_idx: int):
        # tail_transport=None → the counterfactual tail hits the real API.
        return mutated, None

    # Named blame_report so the `report` command of this module is not shadowed.
    blame_report = BlameEngine.rank(
        tape,
        agent_fn,
        oracle,
        perturb_factory=perturb_factory,
        k=k,
        budget_usd=budget,
        api_key=api_key,
    )

    typer.echo(f"\n run-{run_id} · blame analysis · k={k} · {blame_report.total_forks} forks\n")
    typer.echo(f" {'rank':<5} {'step':<8} {'flip-rate':<12} {'95% CI':<22} interpretation")
    typer.echo(f" {'─' * 70}")
    for rank, r in enumerate(blame_report.results, 1):
        ci_str = f"[{r.ci_lo:.2f}, {r.ci_hi:.2f}]"
        typer.echo(
            f" {rank:<5} step-{r.step_index:<3} {r.flip_rate:<12.2f} "
            f"{ci_str:<22} {r.interpretation}"
        )
    typer.echo("")

    report_path = Path(f"blame_{run_id}.json")
    report_path.write_text(
        json.dumps(
            {
                "run_id": run_id,
                "k": k,
                "results": [
                    {
                        "step_index": r.step_index,
                        "flip_rate": r.flip_rate,
                        "ci_lo": r.ci_lo,
                        "ci_hi": r.ci_hi,
                        "interpretation": r.interpretation,
                    }
                    for r in blame_report.results
                ],
            },
            indent=2,
        ),
        encoding="utf-8",
    )
    typer.echo(f" Report saved to {report_path}")
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@app.command()
def validate(
    k: int = typer.Option(3, "--k", help="Forks per candidate step per run"),
    n_runs: int = typer.Option(5, "--n-runs", help="Runs per fault class"),
    output: Path = typer.Option(  # noqa: B008
        Path("validation_report.json"), "--output", "-o"
    ),
    check: bool = typer.Option(False, "--check", help="Diff vs committed report (regression gate)"),
) -> None:
    """Run fault-injection validation suite; produce validation_report.json."""
    import json as _json

    from tracefork.validate import run_all_fault_classes

    typer.echo(f"\n tracefork validate — k={k}, n_runs={n_runs} per class")
    typer.echo(f" {'─' * 50}")

    results = run_all_fault_classes(k=k, n_runs=n_runs)

    # Headline metrics: mean top-1 precision across fault classes and the
    # worst flip-rate observed on the negative controls.
    precision_by_class = {name: entry["top1_precision"] for name, entry in results.items()}
    overall_precision = sum(precision_by_class.values()) / len(results)
    max_ctrl = max(entry["negative_control_max_flip"] for entry in results.values())

    report_data = {
        "top1_precision_by_class": precision_by_class,
        "overall_top1_precision": overall_precision,
        "negative_control_max_flip": max_ctrl,
        "n_runs_per_class": n_runs,
        "k": k,
        "reproduce_cmd": f"tracefork validate --k {k} --n-runs {n_runs}",
    }

    for fault_class, precision in precision_by_class.items():
        status = "WARN" if precision < 0.7 else "PASS"
        typer.echo(f" [{status}] {fault_class:<35} top-1: {precision:.2f}")

    typer.echo(f"\n overall top-1 precision: {overall_precision:.2f}")
    typer.echo(f" negative control max flip: {max_ctrl:.2f} (threshold 0.30)")

    # The report is written before any gate can abort the command.
    output.write_text(_json.dumps(report_data, indent=2))
    typer.echo(f"\n Report saved to {output}\n")

    # Gate 1: the negative controls must stay below the threshold.
    control_threshold = 0.30
    if max_ctrl >= control_threshold:
        typer.echo(
            f" [FAIL] negative control max flip {max_ctrl:.2f} ≥ {control_threshold:.2f} "
            "— blame is firing on no-op perturbations; the precision number is not trustworthy."
        )
        raise typer.Exit(1)

    if not check:
        return

    # Gate 2 (--check): compare against the committed baseline report.
    committed = Path("experiments/validation_report_committed.json")
    if not committed.exists():
        typer.echo(" No committed report found — run without --check to create one.")
        raise typer.Exit(1)

    old = _json.loads(committed.read_text())
    old_by_class = old.get("top1_precision_by_class", {})

    # A class regresses when its precision drops by more than 0.15; classes
    # missing from the baseline are compared against 0.0.
    regressions = []
    for fc, new_prec in precision_by_class.items():
        old_prec = old_by_class.get(fc, 0.0)
        if new_prec < old_prec - 0.15:
            regressions.append(f"{fc}: {old_prec:.2f} → {new_prec:.2f}")

    old_ctrl = old.get("negative_control_max_flip", 0.0)
    if max_ctrl > old_ctrl + 0.15:
        regressions.append(f"negative_control_max_flip: {old_ctrl:.2f} → {max_ctrl:.2f}")

    if regressions:
        typer.echo(" REGRESSION detected:")
        for r_str in regressions:
            typer.echo(f" {r_str}")
        raise typer.Exit(1)
    typer.echo(" No regressions vs committed report.")
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _print_receipt(tape_path: Path, result) -> None:
    """Print a human-readable replay receipt for *result*.

    The receipt lists the tape name, matched/total exchange counts, the
    fingerprint comparison and the overall PASS/FAIL verdict. When the result
    carries a divergence, its classified cause and step index are appended.
    """
    from tracefork.replay import DriftDoctor

    verdict = "PASS" if result.bit_exact else "FAIL"
    fingerprint = "match" if result.fingerprints_match else "MISMATCH"

    lines = [
        "\n tracefork — replay receipt",
        f" {'─' * 40}",
        f" tape {tape_path.name}",
        f" exchanges {result.matched}/{result.total} matched",
        f" fingerprint {fingerprint}",
        f" result {verdict}",
    ]
    if result.divergence:
        cause = DriftDoctor.classify(result.divergence)
        lines.append(f" drift cause {cause.value}")
        lines.append(f" at exchange #{result.divergence.step_index}")
    lines.append("")

    for line in lines:
        typer.echo(line)
|
tracefork/constants.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Centralised constants — model IDs, pricing, determinism boundary."""
|
|
2
|
+
|
|
3
|
+
BOUNDARY_V1 = "single-process-asyncio-v1"

# Model IDs (consult claude-api skill before editing)
SONNET = "claude-sonnet-4-6"
HAIKU = "claude-haiku-4-5-20251001"
OPUS = "claude-opus-4-8"

# Prices in USD *per token*, derived from list prices quoted per 1M tokens.
# Update PRICING_VERSION whenever a figure changes.
# Source: the `claude-api` skill (current Anthropic list pricing).
PRICING_VERSION = "2026-06b"
_TOKENS_PER_MTOK = 1_000_000
SONNET_INPUT_PER_TOKEN = 3.00 / _TOKENS_PER_MTOK
SONNET_OUTPUT_PER_TOKEN = 15.00 / _TOKENS_PER_MTOK
HAIKU_INPUT_PER_TOKEN = 1.00 / _TOKENS_PER_MTOK
HAIKU_OUTPUT_PER_TOKEN = 5.00 / _TOKENS_PER_MTOK
OPUS_INPUT_PER_TOKEN = 5.00 / _TOKENS_PER_MTOK
OPUS_OUTPUT_PER_TOKEN = 25.00 / _TOKENS_PER_MTOK

# model id -> (input USD per token, output USD per token)
PRICING_TABLE: dict[str, tuple[float, float]] = {
    SONNET: (SONNET_INPUT_PER_TOKEN, SONNET_OUTPUT_PER_TOKEN),
    HAIKU: (HAIKU_INPUT_PER_TOKEN, HAIKU_OUTPUT_PER_TOKEN),
    OPUS: (OPUS_INPUT_PER_TOKEN, OPUS_OUTPUT_PER_TOKEN),
}
|