tracefork 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tracefork/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """tracefork — time-travel debugger for AI agents."""
2
+
3
+ from .recorder import AsyncRecorder, Recorder
4
+ from .tape import Tape
5
+
6
+ __all__ = ["Recorder", "AsyncRecorder", "Tape"]
tracefork/blame.py ADDED
@@ -0,0 +1,296 @@
1
+ """Blame engine: rank each exchange by causal flip-rate with Wilson CIs.
2
+
3
+ The causal question is "if step *i* had gone differently, how often would the
4
+ run's *outcome* change?" Answering it honestly requires re-running the agent,
5
+ not just rewriting the tape: perturbing step *i*'s response changes every
6
+ request the agent makes afterward. So for each candidate step we:
7
+
8
+ 1. fork the recorded run at *i* with a perturbed response — the prefix is
9
+ replayed from the parent tape for $0 and the agent is re-run from there
10
+ (`ForkEngine.fork`);
11
+ 2. grade the resulting outcome with an `Oracle`;
12
+ 3. count it as a *flip* when the graded outcome differs from the parent run's.
13
+
14
+ `flip_rate = flips / k` over `k` trials, with a Wilson score 95% interval so a
15
+ small *k* doesn't masquerade as certainty. `BudgetGovernor` estimates the
16
+ fork count and dollar cost before any spend.
17
+
18
+ The engine is agent- and domain-agnostic: the caller supplies `agent_fn` (the
19
+ same agent that produced the tape) and a `perturb_factory(step) -> (response,
20
+ tail_transport)`. In tests and the offline validation suite, `tail_transport`
21
+ is a scripted fake (zero cost); for a live run it is `None`, so the
22
+ counterfactual tail hits the real API under the budget cap.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import math
29
+ from collections.abc import Callable
30
+ from dataclasses import dataclass
31
+ from typing import Protocol, cast
32
+
33
+ import httpx
34
+
35
+ from .constants import PRICING_TABLE, SONNET
36
+ from .fork import BranchSpec, ForkEngine
37
+ from .tape import Tape
38
+
39
+ # ── Wilson score CI ────────────────────────────────────────────────────────
40
+
41
+
42
def wilson_ci(successes: int, n: int, z: float = 1.96) -> tuple[float, float]:
    """Return the Wilson score interval ``(lo, hi)`` for ``successes / n``.

    ``z`` is the normal quantile; the default 1.96 gives a 95% interval.
    With no trials (``n == 0``) nothing is known, so the full ``(0.0, 1.0)``
    range is returned. Both bounds are clamped to ``[0, 1]``.
    """
    if n == 0:
        return (0.0, 1.0)

    z_sq = z**2
    proportion = successes / n
    denom = 1 + z_sq / n
    midpoint = (proportion + z_sq / (2 * n)) / denom
    half_width = (
        z * math.sqrt(proportion * (1 - proportion) / n + z_sq / (4 * n**2))
    ) / denom

    lower = max(0.0, midpoint - half_width)
    upper = min(1.0, midpoint + half_width)
    return (lower, upper)
51
+
52
+
53
+ # ── Oracle protocol ─────────────────────────────────────────────────────────
54
+
55
+
56
class Oracle(Protocol):
    """Structural interface for anything that can grade a run's final output."""

    def grade(self, output: str) -> bool | None:
        """Grade ``output``; the return type allows ``True``, ``False`` or ``None``."""
        ...
58
+
59
+
60
class StringMatchOracle:
    """Regex-based oracle: success pattern wins, then failure, else ambiguous.

    ``grade`` returns ``True`` when the success pattern is found, ``False``
    when only the failure pattern is found, and ``None`` when neither matches.
    """

    def __init__(self, *, success_re: str, failure_re: str) -> None:
        import re

        # Compile once up front; ``grade`` may be called many times.
        self._success = re.compile(success_re)
        self._failure = re.compile(failure_re)

    def grade(self, output: str) -> bool | None:
        # Order matters: a success match takes precedence over a failure match.
        for pattern, verdict in ((self._success, True), (self._failure, False)):
            if pattern.search(output):
                return verdict
        return None
75
+
76
+
77
+ # ── Result types ────────────────────────────────────────────────────────────
78
+
79
+
80
@dataclass
class FlipRateResult:
    """Blame score for a single exchange of the tape."""

    # Index of the exchange that was perturbed.
    step_index: int
    # flips / trials (0.0 when there were no trials).
    flip_rate: float
    # Lower and upper bounds of the Wilson interval around flip_rate.
    ci_lo: float
    ci_hi: float
    # Number of trials whose graded outcome differed from the parent run's.
    flips: int
    # Number of trials attempted for this step.
    trials: int
    # Human-readable label derived from flip_rate.
    interpretation: str = ""
89
+
90
+
91
@dataclass
class BlameReport:
    """Aggregate output of a blame run."""

    # Per-step scores.
    results: list[FlipRateResult]
    # Trials requested per step.
    k: int
    # Number of forks attempted.
    total_forks: int
    # Oracle grade of the parent run's final response (None if ungraded).
    parent_outcome: bool | None = None
    # Pre-flight cost estimate for the run, in USD.
    est_cost_usd: float = 0.0

    def top(self) -> FlipRateResult | None:
        """Return the result with the highest flip rate, or ``None`` if empty.

        On ties the earliest entry in ``results`` wins.
        """
        best = None
        for candidate in self.results:
            if best is None or candidate.flip_rate > best.flip_rate:
                best = candidate
        return best
103
+
104
+
105
@dataclass
class BlameEstimate:
    """Pre-flight sizing of a blame run."""

    # Number of exchanges that will be perturbed.
    n_candidates: int
    # Total forks to run (candidates times trials per candidate).
    n_forks: int
    # Estimated spend in USD.
    est_usd: float
110
+
111
+
112
+ # ── BudgetGovernor ──────────────────────────────────────────────────────────
113
+
114
+
115
class BudgetExceededError(RuntimeError):
    """Signals that the estimated cost of a blame run is above the caller's budget."""
117
+
118
+
119
+ def _detect_model(tape: Tape) -> str:
120
+ """Best-effort model id from the first recorded request (defaults to Sonnet)."""
121
+ for req, _ in tape.exchanges:
122
+ try:
123
+ m = json.loads(req).get("model")
124
+ except Exception:
125
+ m = None
126
+ if m:
127
+ return m
128
+ return SONNET
129
+
130
+
131
+ def _avg_tokens(tape: Tape) -> tuple[float, float]:
132
+ """Average (input, output) tokens per exchange — from recorded ``usage`` when
133
+ present, else a ~4-bytes-per-token estimate from the raw bytes."""
134
+ if not tape.exchanges:
135
+ return (0.0, 0.0)
136
+ ins: list[float] = []
137
+ outs: list[float] = []
138
+ for req, resp in tape.exchanges:
139
+ usage: dict = {}
140
+ try:
141
+ d = json.loads(resp)
142
+ if isinstance(d, dict):
143
+ usage = d.get("usage") or {}
144
+ except Exception:
145
+ usage = {}
146
+ ins.append(usage.get("input_tokens") or max(1, len(req) // 4))
147
+ outs.append(usage.get("output_tokens") or max(1, len(resp) // 4))
148
+ n = len(tape.exchanges)
149
+ return (sum(ins) / n, sum(outs) / n)
150
+
151
+
152
class BudgetGovernor:
    """Pre-flight cost estimator for blame runs."""

    @staticmethod
    def estimate(
        tape: Tape,
        *,
        k: int,
        model: str | None = None,
        cost_per_fork_usd: float | None = None,
    ) -> BlameEstimate:
        """Estimate how many forks a blame run needs and what they will cost.

        Every exchange is a candidate and is forked ``k`` times. The cost model
        bills only the counterfactual tail: forking step ``i`` of ``n`` is
        charged for ``n-1-i`` calls, so the billed total is
        ``k * n * (n - 1) / 2`` calls. Each call is priced from
        ``PRICING_TABLE`` (Sonnet rates for an unknown model) against the
        tape's average token usage.

        Args:
            tape: The recorded parent run.
            k: Trials per candidate step.
            model: Model id to price against; detected from the tape when omitted.
            cost_per_fork_usd: If given, skip token pricing and charge this
                flat amount for every fork.
        """
        steps = len(tape.exchanges)
        forks = steps * k

        if cost_per_fork_usd is None:
            # Closed form of sum(steps - 1 - i for i in range(steps)).
            tail_calls = (steps * (steps - 1) // 2) * k
            rates = PRICING_TABLE.get(model or _detect_model(tape), PRICING_TABLE[SONNET])
            in_rate, out_rate = rates
            avg_in, avg_out = _avg_tokens(tape)
            usd = tail_calls * (avg_in * in_rate + avg_out * out_rate)
        else:
            usd = forks * cost_per_fork_usd

        return BlameEstimate(n_candidates=steps, n_forks=forks, est_usd=usd)
182
+
183
+
184
+ # ── outcome extraction ────────────────────────────────────────────────────────
185
+
186
+
187
+ def _outcome_text(resp_bytes: bytes) -> str:
188
+ """Extract the assistant's text from a recorded response (JSON or SSE)."""
189
+ try:
190
+ d = json.loads(resp_bytes)
191
+ except Exception:
192
+ return resp_bytes.decode(errors="replace")
193
+ if isinstance(d, dict):
194
+ for block in d.get("content", []):
195
+ if block.get("type") == "text":
196
+ return block.get("text", "")
197
+ return ""
198
+
199
+
200
+ def _interpret(flip_rate: float) -> str:
201
+ if flip_rate >= 0.7:
202
+ return "decisive — this step caused it"
203
+ if flip_rate >= 0.3:
204
+ return "suggestive"
205
+ return "diffuse — not the cause"
206
+
207
+
208
+ # ── BlameEngine ─────────────────────────────────────────────────────────────
209
+
210
+
211
class BlameEngine:
    """Scores every recorded exchange by how often perturbing it flips the outcome."""

    @staticmethod
    def rank(
        tape: Tape,
        agent_fn,  # the SAME agent callable that produced the tape
        oracle: Oracle,
        *,
        perturb_factory: Callable[[int], tuple[bytes, object]],
        k: int = 10,
        budget_usd: float = 5.0,
        api_key: str = "sk-ant-blame",
    ) -> BlameReport:
        """Fork each exchange ``k`` times and rank steps by outcome flip rate.

        Args:
            tape: The recorded parent run.
            agent_fn: Agent callable handed to ``ForkEngine.fork`` for every trial.
            oracle: Grades the final response of the parent and of each fork.
            perturb_factory: Called once per trial with the step index; returns
                ``(mutated_response_bytes, tail_transport)``. The transport is
                passed to the fork as ``post_fork_transport``.
            k: Trials per step.
            budget_usd: Upper bound on the pre-flight cost estimate.
            api_key: Forwarded unchanged to ``ForkEngine.fork``.

        Returns:
            A ``BlameReport`` whose results are sorted by descending flip rate.

        Raises:
            BudgetExceededError: If the estimated cost exceeds ``budget_usd``.
        """
        estimate = BudgetGovernor.estimate(tape, k=k)
        if estimate.est_usd > budget_usd:
            raise BudgetExceededError(
                f"estimated blame cost ${estimate.est_usd:.2f} exceeds budget "
                f"${budget_usd:.2f} ({estimate.n_forks} forks at k={k}); raise the budget "
                f"or lower k"
            )

        # Baseline: the grade of the parent run's last recorded response.
        parent_outcome: bool | None = None
        if tape.exchanges:
            parent_outcome = oracle.grade(_outcome_text(tape.exchanges[-1][1]))

        scored: list[FlipRateResult] = []
        forks_attempted = 0

        for step in range(len(tape.exchanges)):
            flip_count = 0
            for _ in range(k):
                response, transport_obj = perturb_factory(step)
                transport = cast("httpx.BaseTransport | None", transport_obj)
                spec = BranchSpec(divergence_step=step, mutated_response=response)
                try:
                    branch = ForkEngine.fork(
                        tape,
                        spec,
                        agent_fn,
                        post_fork_transport=transport,
                        api_key=api_key,
                    )
                except Exception:
                    # A fork that fails still counts as an attempted fork,
                    # but it contributes no flip.
                    forked = False
                else:
                    forked = True
                forks_attempted += 1
                if not forked:
                    continue

                tail = branch.delta_tape.exchanges
                graded = oracle.grade(_outcome_text(tail[-1][1])) if tail else None
                # An ungraded fork is never a flip; a graded one flips when it
                # disagrees with the parent's grade.
                if graded is not None and graded != parent_outcome:
                    flip_count += 1

            rate = flip_count / k if k > 0 else 0.0
            lower, upper = wilson_ci(flip_count, k)
            scored.append(
                FlipRateResult(
                    step_index=step,
                    flip_rate=rate,
                    ci_lo=lower,
                    ci_hi=upper,
                    flips=flip_count,
                    trials=k,
                    interpretation=_interpret(rate),
                )
            )

        ranked = sorted(scored, key=lambda item: item.flip_rate, reverse=True)
        return BlameReport(
            results=ranked,
            k=k,
            total_forks=forks_attempted,
            parent_outcome=parent_outcome,
            est_cost_usd=estimate.est_usd,
        )
tracefork/cli.py ADDED
@@ -0,0 +1,367 @@
1
+ """tracefork CLI — entry point for all commands.
2
+
3
+ tracefork <command> [args]
4
+
5
+ Commands: replay, verify, fork, report, serve, blame, validate.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ import typer
13
+
14
+ app = typer.Typer(name="tracefork", help="Time-travel debugger for AI agents.")
15
+
16
+
17
@app.command()
def replay(
    tape_path: Path = typer.Argument(..., help="Path to a .tape.sqlite file"),  # noqa: B008
    agent: str = typer.Option(..., "--agent", "-a", help="Import path of agent fn (pkg.mod:fn)"),
) -> None:
    """Replay a tape and print the verification receipt."""
    import importlib

    from tracefork.replay import ReplayVerifier
    from tracefork.tape import Tape

    # Reject a malformed spec (e.g. "pkg.mod.fn" without ':') with a usage
    # error instead of an unpacking traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            f"expected 'pkg.mod:fn', got {agent!r}", param_hint="--agent"
        )

    tape = Tape.load(str(tape_path))

    mod = importlib.import_module(module_path)
    agent_fn = getattr(mod, fn_name)

    result = ReplayVerifier(tape, agent_fn).verify()
    _print_receipt(tape_path, result)
    # Exit status mirrors the verdict: 0 when bit-exact, 1 otherwise.
    raise typer.Exit(0 if result.bit_exact else 1)
37
+
38
+
39
@app.command()
def verify(
    tape_path: Path = typer.Argument(None, help="Single tape to verify"),  # noqa: B008
    agent: str = typer.Option(None, "--agent", "-a", help="Import path of agent fn"),
    corpus: bool = typer.Option(
        False, "--corpus", help="Verify all tapes in experiments/validation_tapes/"
    ),
) -> None:
    """Verify bit-exact replay. Exit 1 on drift."""
    import importlib

    from tracefork.replay import ReplayVerifier
    from tracefork.tape import Tape

    if corpus:
        corpus_dir = Path("experiments/validation_tapes")
        tapes = list(corpus_dir.glob("*.tape.sqlite"))
        if not tapes:
            typer.echo("No tapes found in experiments/validation_tapes/")
            raise typer.Exit(1)
        # NOTE: corpus mode only enumerates tapes; no agent is associated with
        # a tape here, so every tape is reported as skipped and nothing is
        # actually replayed.
        for tp in sorted(tapes):
            typer.echo(f" {tp.name}: skipped (agent not specified per-tape)")
        typer.echo(f"Corpus: {len(tapes)} tapes scanned")
        raise typer.Exit(0)

    if tape_path is None or agent is None:
        typer.echo("Provide --agent and a tape path, or use --corpus")
        raise typer.Exit(1)

    # Reject a malformed spec (e.g. "pkg.mod.fn" without ':') with a usage
    # error instead of an unpacking traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            f"expected 'pkg.mod:fn', got {agent!r}", param_hint="--agent"
        )

    tape = Tape.load(str(tape_path))
    mod = importlib.import_module(module_path)
    agent_fn = getattr(mod, fn_name)
    result = ReplayVerifier(tape, agent_fn).verify()
    _print_receipt(tape_path, result)
    # Exit status mirrors the verdict: 0 when bit-exact, 1 otherwise.
    raise typer.Exit(0 if result.bit_exact else 1)
75
+
76
+
77
@app.command()
def fork(
    run_id: str = typer.Argument(..., help="Parent run_id to fork from"),
    step: int = typer.Option(..., "--step", "-s", help="Exchange index to diverge at"),
    response_file: Path = typer.Option(  # noqa: B008
        ..., "--response", "-r", help="Path to .bytes file containing mutated response"
    ),
    agent: str = typer.Option(..., "--agent", "-a", help="Import path of post-fork agent fn"),
    store: Path = typer.Option(Path("store.db"), "--store", help="Path to store.db"),  # noqa: B008
    desc: str = typer.Option("", "--desc", "-d", help="Human description of mutation"),
) -> None:
    """Fork a run at a step with a mutated response, record the new branch."""
    import importlib

    from tracefork.fork import BranchSpec, ForkEngine
    from tracefork.store import TapeStore

    # Reject a malformed spec (e.g. "pkg.mod.fn" without ':') with a usage
    # error before opening the store, instead of an unpacking traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            f"expected 'pkg.mod:fn', got {agent!r}", param_hint="--agent"
        )

    db = TapeStore(str(store))
    parent_tape = db.load_tape(run_id)

    mutated_response = response_file.read_bytes()

    mod = importlib.import_module(module_path)
    agent_fn = getattr(mod, fn_name)

    spec = BranchSpec(
        divergence_step=step,
        mutated_response=mutated_response,
        mutation_desc=desc,
    )

    branch = ForkEngine.fork(parent_tape, spec, agent_fn)

    # Persist the branch's delta tape and get back its id.
    branch_id = db.save_branch(
        parent_run_id=run_id,
        divergence_step=step,
        delta_tape=branch.delta_tape,
        mutation_desc=desc,
    )

    typer.echo("\n Fork created")
    typer.echo(f" branch_id {branch_id}")
    typer.echo(f" parent_run_id {run_id}")
    typer.echo(f" divergence_step {step}")
    typer.echo(f" delta_exchanges {len(branch.delta_tape.exchanges)}")
    typer.echo(f" description {desc or '(none)'}\n")
124
+
125
+
126
@app.command()
def report(
    run_id: str = typer.Argument(None, help="run_id to report on (from store)"),
    tape_path: Path = typer.Option(  # noqa: B008
        None, "--tape", "-t", help="Path to a .tape.sqlite file"
    ),
    output: Path = typer.Option(  # noqa: B008
        Path("report.html"), "--output", "-o", help="Output HTML file"
    ),
    store: Path = typer.Option(  # noqa: B008
        Path("store.db"), "--store", help="Path to store.db"
    ),
) -> None:
    """Generate a self-contained HTML report from a tape."""
    from tracefork.report import generate_report
    from tracefork.tape import Tape

    # Need at least one source for the tape.
    if not tape_path and not run_id:
        typer.echo("Provide a run_id or --tape path")
        raise typer.Exit(1)

    if tape_path:
        # An explicit tape file takes precedence over a run_id.
        source_tape = Tape.load(str(tape_path))
    else:
        from tracefork.store import TapeStore

        source_tape = TapeStore(str(store)).load_tape(run_id)

    generate_report(source_tape, output)
    typer.echo(f"Report written to {output}")
156
+
157
+
158
@app.command()
def serve(
    store: Path = typer.Option(  # noqa: B008
        Path("store.db"), "--store", help="Path to store.db"
    ),
    port: int = typer.Option(7777, "--port", "-p", help="Port to listen on"),
) -> None:
    """Start the tracefork web UI server on 127.0.0.1 (default port 7777)."""
    import uvicorn

    from tracefork.server import app as fastapi_app
    from tracefork.server import init_store

    # The store must be initialised before the app starts serving requests.
    init_store(str(store))
    typer.echo(f" tracefork serve → http://127.0.0.1:{port}")
    # Loopback only, single worker.
    uvicorn.run(fastapi_app, host="127.0.0.1", port=port, workers=1, log_level="warning")
174
+
175
+
176
@app.command()
def blame(
    run_id: str = typer.Argument(..., help="run_id to analyze"),
    agent: str = typer.Option(
        ...,
        "--agent",
        "-a",
        help="Import path of the agent fn (pkg.mod:fn) that produced this run; "
        "it is re-run for each fork and must be deterministic up to the fork point",
    ),
    k: int = typer.Option(10, "--k", help="Forks per candidate step"),
    budget: float = typer.Option(5.0, "--budget", help="USD spend cap"),
    perturbation: str = typer.Option(
        "[tracefork] this step did not complete as recorded",
        "--perturbation",
        help="Text injected as the counterfactual response",
    ),
    success_re: str = typer.Option("SUCCESS", "--success-re", help="Regex for success outcome"),
    failure_re: str = typer.Option("FAIL", "--failure-re", help="Regex for failure outcome"),
    store: Path = typer.Option(  # noqa: B008
        Path("store.db"), "--store", help="Path to store.db"
    ),
) -> None:
    """Run causal blame analysis on a recorded run.

    For each exchange, the agent is re-run with that step's response perturbed
    and the counterfactual tail recorded against the real API (budget-capped).
    The offline, $0 proof that blame correctly fingers known faults is
    `tracefork validate`.
    """
    # run_id ends up in the output filename below, so keep it to a safe alphabet.
    if not run_id or not all(c.isalnum() or c in "-_" for c in run_id):
        raise typer.BadParameter("run_id must be alphanumeric (with '-' or '_')")

    # Reject a malformed spec (e.g. "pkg.mod.fn" without ':') with a usage
    # error instead of an unpacking traceback.
    module_path, sep, fn_name = agent.rpartition(":")
    if not (sep and module_path and fn_name):
        raise typer.BadParameter(
            f"expected 'pkg.mod:fn', got {agent!r}", param_hint="--agent"
        )

    import importlib
    import json
    import os

    from tracefork.blame import BlameEngine, BudgetGovernor, StringMatchOracle
    from tracefork.store import TapeStore
    from tracefork.wire import make_text_response

    db = TapeStore(str(store))
    tape = db.load_tape(run_id)

    agent_fn = getattr(importlib.import_module(module_path), fn_name)

    oracle = StringMatchOracle(success_re=success_re, failure_re=failure_re)
    est = BudgetGovernor.estimate(tape, k=k)

    # Show the estimate and stop before any fork runs if it is over budget.
    typer.echo(f"\n Blame estimate: {est.n_forks} forks, ~${est.est_usd:.2f}")
    if est.est_usd > budget:
        typer.echo(f" Estimated cost ${est.est_usd:.2f} exceeds budget ${budget:.2f}.")
        typer.echo(" Use --budget to increase or --k to reduce trials.")
        raise typer.Exit(1)

    # The same perturbed response is injected at every step and trial.
    mutated = make_text_response(perturbation)

    def perturb_factory(step_idx: int):
        # tail_transport=None → the counterfactual tail hits the real API.
        return mutated, None

    report = BlameEngine.rank(
        tape,
        agent_fn,
        oracle,
        perturb_factory=perturb_factory,
        k=k,
        budget_usd=budget,
        api_key=os.environ.get("ANTHROPIC_API_KEY", ""),
    )

    typer.echo(f"\n run-{run_id} · blame analysis · k={k} · {report.total_forks} forks\n")
    typer.echo(f" {'rank':<5} {'step':<8} {'flip-rate':<12} {'95% CI':<22} interpretation")
    typer.echo(f" {'─' * 70}")
    for rank, r in enumerate(report.results, 1):
        ci_str = f"[{r.ci_lo:.2f}, {r.ci_hi:.2f}]"
        typer.echo(
            f" {rank:<5} step-{r.step_index:<3} {r.flip_rate:<12.2f} "
            f"{ci_str:<22} {r.interpretation}"
        )
    typer.echo("")

    # Machine-readable copy of the table, written to the current directory.
    report_path = Path(f"blame_{run_id}.json")
    report_path.write_text(
        json.dumps(
            {
                "run_id": run_id,
                "k": k,
                "results": [
                    {
                        "step_index": r.step_index,
                        "flip_rate": r.flip_rate,
                        "ci_lo": r.ci_lo,
                        "ci_hi": r.ci_hi,
                        "interpretation": r.interpretation,
                    }
                    for r in report.results
                ],
            },
            indent=2,
        )
    )
    typer.echo(f" Report saved to {report_path}")
280
+
281
+
282
@app.command()
def validate(
    k: int = typer.Option(3, "--k", help="Forks per candidate step per run"),
    n_runs: int = typer.Option(5, "--n-runs", help="Runs per fault class"),
    output: Path = typer.Option(  # noqa: B008
        Path("validation_report.json"), "--output", "-o"
    ),
    check: bool = typer.Option(False, "--check", help="Diff vs committed report (regression gate)"),
) -> None:
    """Run fault-injection validation suite; produce validation_report.json."""
    import json as _json

    from tracefork.validate import run_all_fault_classes

    typer.echo(f"\n tracefork validate — k={k}, n_runs={n_runs} per class")
    typer.echo(f" {'─' * 50}")

    results = run_all_fault_classes(k=k, n_runs=n_runs)

    # With no fault classes the averages below are undefined (division by
    # zero / max() of an empty sequence); fail cleanly instead of crashing.
    if not results:
        typer.echo(" No fault classes were evaluated — nothing to report.")
        raise typer.Exit(1)

    overall_precision = sum(r["top1_precision"] for r in results.values()) / len(results)
    max_ctrl = max(r["negative_control_max_flip"] for r in results.values())

    report_data = {
        "top1_precision_by_class": {fc: v["top1_precision"] for fc, v in results.items()},
        "overall_top1_precision": overall_precision,
        "negative_control_max_flip": max_ctrl,
        "n_runs_per_class": n_runs,
        "k": k,
        "reproduce_cmd": f"tracefork validate --k {k} --n-runs {n_runs}",
    }

    for fault_class, data in results.items():
        status = "PASS" if data["top1_precision"] >= 0.7 else "WARN"
        typer.echo(f" [{status}] {fault_class:<35} top-1: {data['top1_precision']:.2f}")

    typer.echo(f"\n overall top-1 precision: {overall_precision:.2f}")
    typer.echo(f" negative control max flip: {max_ctrl:.2f} (threshold 0.30)")

    # The report is written before the gates below, so it exists even when
    # the command exits non-zero.
    output.write_text(_json.dumps(report_data, indent=2))
    typer.echo(f"\n Report saved to {output}\n")

    control_threshold = 0.30
    if max_ctrl >= control_threshold:
        typer.echo(
            f" [FAIL] negative control max flip {max_ctrl:.2f} ≥ {control_threshold:.2f} "
            "— blame is firing on no-op perturbations; the precision number is not trustworthy."
        )
        raise typer.Exit(1)

    if check:
        committed = Path("experiments/validation_report_committed.json")
        if not committed.exists():
            typer.echo(" No committed report found — run without --check to create one.")
            raise typer.Exit(1)
        old = _json.loads(committed.read_text())
        regressions = []
        # A class regresses when its precision drops by more than 0.15; a
        # class missing from the committed report is compared against 0.0.
        for fc, new_prec in report_data["top1_precision_by_class"].items():
            old_prec = old.get("top1_precision_by_class", {}).get(fc, 0.0)
            if new_prec < old_prec - 0.15:
                regressions.append(f"{fc}: {old_prec:.2f} → {new_prec:.2f}")
        old_ctrl = old.get("negative_control_max_flip", 0.0)
        if max_ctrl > old_ctrl + 0.15:
            regressions.append(f"negative_control_max_flip: {old_ctrl:.2f} → {max_ctrl:.2f}")
        if regressions:
            typer.echo(" REGRESSION detected:")
            for r_str in regressions:
                typer.echo(f" {r_str}")
            raise typer.Exit(1)
        typer.echo(" No regressions vs committed report.")
351
+
352
+
353
def _print_receipt(tape_path: Path, result) -> None:
    """Print the replay receipt for ``result`` to the terminal.

    Reads ``bit_exact``, ``matched``, ``total``, ``fingerprints_match`` and
    ``divergence`` from ``result``. When a divergence is present its cause
    (as classified by ``DriftDoctor``) and step index are printed too.
    """
    from tracefork.replay import DriftDoctor

    verdict = "PASS" if result.bit_exact else "FAIL"
    fingerprint = "match" if result.fingerprints_match else "MISMATCH"

    typer.echo("\n tracefork — replay receipt")
    typer.echo(f" {'─' * 40}")
    typer.echo(f" tape {tape_path.name}")
    typer.echo(f" exchanges {result.matched}/{result.total} matched")
    typer.echo(f" fingerprint {fingerprint}")
    typer.echo(f" result {verdict}")

    divergence = result.divergence
    if divergence:
        cause = DriftDoctor.classify(divergence)
        typer.echo(f" drift cause {cause.value}")
        typer.echo(f" at exchange #{divergence.step_index}")
    typer.echo("")
tracefork/constants.py ADDED
@@ -0,0 +1,24 @@
1
+ """Centralised constants — model IDs, pricing, determinism boundary."""
2
+
3
# Identifier for the determinism boundary named in the module docstring.
BOUNDARY_V1 = "single-process-asyncio-v1"

# Model IDs (consult claude-api skill before editing)
SONNET = "claude-sonnet-4-6"
HAIKU = "claude-haiku-4-5-20251001"
OPUS = "claude-opus-4-8"

# Per-token prices in USD. Each value is the list price quoted per one million
# tokens, divided down to a single token. Bump PRICING_VERSION whenever a
# price changes. Source: the `claude-api` skill (current Anthropic list pricing).
PRICING_VERSION = "2026-06b"
_TOKENS_PER_MILLION = 1_000_000

SONNET_INPUT_PER_TOKEN = 3.00 / _TOKENS_PER_MILLION
SONNET_OUTPUT_PER_TOKEN = 15.00 / _TOKENS_PER_MILLION
HAIKU_INPUT_PER_TOKEN = 1.00 / _TOKENS_PER_MILLION
HAIKU_OUTPUT_PER_TOKEN = 5.00 / _TOKENS_PER_MILLION
OPUS_INPUT_PER_TOKEN = 5.00 / _TOKENS_PER_MILLION
OPUS_OUTPUT_PER_TOKEN = 25.00 / _TOKENS_PER_MILLION

# model id -> (input USD per token, output USD per token)
PRICING_TABLE: dict[str, tuple[float, float]] = {
    SONNET: (SONNET_INPUT_PER_TOKEN, SONNET_OUTPUT_PER_TOKEN),
    HAIKU: (HAIKU_INPUT_PER_TOKEN, HAIKU_OUTPUT_PER_TOKEN),
    OPUS: (OPUS_INPUT_PER_TOKEN, OPUS_OUTPUT_PER_TOKEN),
}