tracefork 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracefork/__init__.py +6 -0
- tracefork/blame.py +296 -0
- tracefork/cli.py +367 -0
- tracefork/constants.py +24 -0
- tracefork/faults.py +129 -0
- tracefork/fork.py +173 -0
- tracefork/nondet.py +96 -0
- tracefork/py.typed +0 -0
- tracefork/recorder.py +140 -0
- tracefork/replay.py +119 -0
- tracefork/report.py +131 -0
- tracefork/server.py +73 -0
- tracefork/store.py +123 -0
- tracefork/synthetic.py +104 -0
- tracefork/tape.py +135 -0
- tracefork/transport.py +137 -0
- tracefork/validate.py +177 -0
- tracefork/web/report.html +209 -0
- tracefork/wire.py +76 -0
- tracefork-0.1.0.dist-info/METADATA +235 -0
- tracefork-0.1.0.dist-info/RECORD +32 -0
- tracefork-0.1.0.dist-info/WHEEL +4 -0
- tracefork-0.1.0.dist-info/entry_points.txt +2 -0
- tracefork-0.1.0.dist-info/licenses/LICENSE +21 -0
- tracefork_spike/__init__.py +7 -0
- tracefork_spike/__main__.py +3 -0
- tracefork_spike/agent.py +91 -0
- tracefork_spike/fake_llm.py +106 -0
- tracefork_spike/nondet.py +97 -0
- tracefork_spike/spike.py +125 -0
- tracefork_spike/tape.py +79 -0
- tracefork_spike/transport.py +68 -0
tracefork/transport.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Recording/replay httpx transports — sync and async, streaming SSE capable.
|
|
2
|
+
|
|
3
|
+
Record mode: forward to the inner transport, buffer the full response body
|
|
4
|
+
(works for both streaming SSE and non-streaming JSON — httpx buffers both
|
|
5
|
+
identically via .read()/.aread()), append to the tape, return the response.
|
|
6
|
+
|
|
7
|
+
Replay mode: for each request, sha256-assert its *body* matches the tape record,
|
|
8
|
+
then serve the recorded bytes back. A replay transport has no inner transport;
|
|
9
|
+
any unrecorded request is a hard error. The matched surface is the request body;
|
|
10
|
+
request headers (e.g. ``anthropic-beta`` / ``anthropic-version``) are out of scope
|
|
11
|
+
for the bit-exactness claim — see the README's determinism-boundary note.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
|
|
18
|
+
from .nondet import DivergenceError
|
|
19
|
+
from .tape import Tape, sha256_hex
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TraceforkTransport(httpx.BaseTransport):
    """Sync recording/replay transport.

    In ``record`` mode each request is forwarded to ``inner``; the response
    body is read in full, appended to ``tape`` together with the request
    body, and handed back as a fresh buffered response. In ``replay`` mode
    requests are matched positionally: request ``i`` must have the same
    sha256 body hash as recorded exchange ``i``, otherwise
    ``DivergenceError`` is raised.

    Attributes:
        mode: ``"record"`` or ``"replay"``.
        tape: Tape that is appended to (record) or served from (replay).
        inner: Transport that record mode forwards to; not used by replay.
        matched: Count of replayed requests that matched the tape so far.
    """

    def __init__(
        self,
        mode: str,
        tape: Tape,
        inner: httpx.BaseTransport | None = None,
    ) -> None:
        """Validate the configuration and reset the replay cursor.

        Raises:
            ValueError: if ``mode`` is not ``"record"``/``"replay"``, or if
                record mode is requested without an inner transport.
        """
        # An explicit check rather than ``assert``: assertions are stripped
        # under ``python -O``, which would let a mistyped mode silently take
        # the replay path.
        if mode not in ("record", "replay"):
            raise ValueError(f"mode must be 'record' or 'replay', got {mode!r}")
        if mode == "record" and inner is None:
            raise ValueError("record mode requires an inner transport")
        self.mode = mode
        self.tape = tape
        self.inner = inner
        self._i = 0  # index of the next tape exchange to replay
        self.matched = 0

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Record or replay a single exchange.

        Raises:
            DivergenceError: in replay mode, when the tape is exhausted or
                the request body hash differs from the recorded one.
        """
        body = request.content

        if self.mode == "record":
            inner_resp = self.inner.handle_request(request)  # type: ignore[union-attr]
            resp_body = inner_resp.read()
            self.tape.append_exchange(body, resp_body)
            # Only the status code and content-type of the inner response
            # are carried over to the buffered response returned here.
            return httpx.Response(
                inner_resp.status_code,
                headers={
                    "content-type": inner_resp.headers.get("content-type", "application/json")
                },
                content=resp_body,
                request=request,
            )

        # replay
        index = self._i
        total = len(self.tape.exchanges)
        if index >= total:
            raise DivergenceError(
                f"replay made unrecorded request #{index} "
                f"(tape has {total} exchanges)"
            )
        rec_req, rec_resp = self.tape.exchange(index)
        # Hash each side once; the digests are reused in the error message.
        recorded_digest = sha256_hex(rec_req)
        replay_digest = sha256_hex(body)
        if recorded_digest != replay_digest:
            raise DivergenceError(
                f"request #{index} diverged from tape "
                f"(recorded {recorded_digest[:12]}, replay {replay_digest[:12]})"
            )
        self._i += 1
        self.matched += 1
        # Replayed responses are always built as 200 / application/json.
        return httpx.Response(
            200,
            headers={"content-type": "application/json"},
            content=rec_resp,
            request=request,
        )

    def fully_consumed(self) -> bool:
        """Return True when in replay mode and every tape exchange was served."""
        return self.mode == "replay" and self._i == len(self.tape.exchanges)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class AsyncTraceforkTransport(httpx.AsyncBaseTransport):
    """Async recording/replay transport — identical logic to sync variant.

    In ``record`` mode each request is awaited on ``inner``; the response
    body is read in full, appended to ``tape`` together with the request
    body, and handed back as a fresh buffered response. In ``replay`` mode
    requests are matched positionally by sha256 of the request body and a
    mismatch raises ``DivergenceError``.

    Attributes:
        mode: ``"record"`` or ``"replay"``.
        tape: Tape that is appended to (record) or served from (replay).
        inner: Async transport that record mode forwards to; not used by replay.
        matched: Count of replayed requests that matched the tape so far.
    """

    def __init__(
        self,
        mode: str,
        tape: Tape,
        inner: httpx.AsyncBaseTransport | None = None,
    ) -> None:
        """Validate the configuration and reset the replay cursor.

        Raises:
            ValueError: if ``mode`` is not ``"record"``/``"replay"``, or if
                record mode is requested without an inner transport.
        """
        # An explicit check rather than ``assert``: assertions are stripped
        # under ``python -O``, which would let a mistyped mode silently take
        # the replay path.
        if mode not in ("record", "replay"):
            raise ValueError(f"mode must be 'record' or 'replay', got {mode!r}")
        if mode == "record" and inner is None:
            raise ValueError("record mode requires an inner transport")
        self.mode = mode
        self.tape = tape
        self.inner = inner
        self._i = 0  # index of the next tape exchange to replay
        self.matched = 0

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        """Record or replay a single exchange.

        Raises:
            DivergenceError: in replay mode, when the tape is exhausted or
                the request body hash differs from the recorded one.
        """
        body = request.content

        if self.mode == "record":
            inner_resp = await self.inner.handle_async_request(request)  # type: ignore[union-attr]
            resp_body = await inner_resp.aread()
            self.tape.append_exchange(body, resp_body)
            # Only the status code and content-type of the inner response
            # are carried over to the buffered response returned here.
            return httpx.Response(
                inner_resp.status_code,
                headers={
                    "content-type": inner_resp.headers.get("content-type", "application/json")
                },
                content=resp_body,
                request=request,
            )

        # replay
        index = self._i
        total = len(self.tape.exchanges)
        if index >= total:
            raise DivergenceError(
                f"replay made unrecorded request #{index} "
                f"(tape has {total} exchanges)"
            )
        rec_req, rec_resp = self.tape.exchange(index)
        # Hash each side once; the digests are reused in the error message.
        recorded_digest = sha256_hex(rec_req)
        replay_digest = sha256_hex(body)
        if recorded_digest != replay_digest:
            raise DivergenceError(
                f"request #{index} diverged from tape "
                f"(recorded {recorded_digest[:12]}, replay {replay_digest[:12]})"
            )
        self._i += 1
        self.matched += 1
        # Replayed responses are always built as 200 / application/json.
        return httpx.Response(
            200,
            headers={"content-type": "application/json"},
            content=rec_resp,
            request=request,
        )

    def fully_consumed(self) -> bool:
        """Return True when in replay mode and every tape exchange was served."""
        return self.mode == "replay" and self._i == len(self.tape.exchanges)
|
tracefork/validate.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Self-validation: run the blame engine on fault-injected runs with known
|
|
2
|
+
ground truth and measure how often it fingers the right step.
|
|
3
|
+
|
|
4
|
+
Fully offline and $0. Each run:
|
|
5
|
+
1. record a clean two-step tape with a synthetic agent;
|
|
6
|
+
2. inject a known fault into step 0 (the "root cause");
|
|
7
|
+
3. run the blame engine — forking re-runs the synthetic agent, which echoes
|
|
8
|
+
each response into its next request, so the fault marker reaches the
|
|
9
|
+
fault-aware tail and flips the outcome;
|
|
10
|
+
4. score a hit when blame ranks the fault step #1 (top-1 precision).
|
|
11
|
+
|
|
12
|
+
A negative control runs blame with a no-op perturbation and asserts the
|
|
13
|
+
flip-rate stays near zero — otherwise a high "precision" would be meaningless.
|
|
14
|
+
|
|
15
|
+
The synthetic agent is the same callable during recording and every fork, so
|
|
16
|
+
the fork prefix replays bit-for-bit (the determinism contract blame relies on).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
|
|
24
|
+
import anthropic
|
|
25
|
+
import httpx
|
|
26
|
+
|
|
27
|
+
from .blame import BlameEngine, StringMatchOracle
|
|
28
|
+
from .faults import FAULT_MARKER_BYTES, FaultClass, FaultInjector
|
|
29
|
+
from .synthetic import FaultAwareFakeLLM, ScriptedFakeLLM
|
|
30
|
+
from .tape import Tape
|
|
31
|
+
from .transport import TraceforkTransport
|
|
32
|
+
from .wire import make_text_response, make_tool_use_response
|
|
33
|
+
|
|
34
|
+
# Canned wire-format responses used by the clean recording and by the
# perturbation tails. The SUCCESS / FAIL words are what the validation
# oracle's regexes (r"SUCCESS" / r"FAIL") match against.
SUCCESS_RESP = make_text_response("SUCCESS — confirmed")
FAIL_RESP = make_text_response("FAIL — cancelled")
# First scripted reply of the clean recording: a tool_use response.
TOOL_RESP = make_tool_use_response("check_availability", {"seats": 3, "destination": "Tokyo"})
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _serialize_response(msg) -> str:
|
|
40
|
+
"""Flatten an Anthropic message's content to a deterministic string, so the
|
|
41
|
+
agent can echo it (markers and all) into its next request."""
|
|
42
|
+
parts: list[str] = []
|
|
43
|
+
for block in msg.content:
|
|
44
|
+
t = getattr(block, "type", None)
|
|
45
|
+
if t == "text":
|
|
46
|
+
parts.append(block.text)
|
|
47
|
+
elif t == "tool_use":
|
|
48
|
+
parts.append(f"{block.name} {json.dumps(block.input, sort_keys=True)}")
|
|
49
|
+
return " | ".join(parts) or "(empty)"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def synthetic_agent(client: anthropic.Anthropic) -> str:
    """Run the two-turn booking conversation and return the final reply.

    Turn 1 asks for a flight; turn 2 sends "confirm" after replaying the
    serialized turn-1 response as the assistant message, so anything
    injected into turn 1's response travels into the turn-2 request.
    """
    call_options = {"model": "claude-sonnet-4-6", "max_tokens": 100}

    first_reply = client.messages.create(
        messages=[{"role": "user", "content": "book a flight to Tokyo"}],
        **call_options,
    )
    conversation = [
        {"role": "user", "content": "book a flight to Tokyo"},
        {"role": "assistant", "content": _serialize_response(first_reply)},
        {"role": "user", "content": "confirm"},
    ]
    final_reply = client.messages.create(messages=conversation, **call_options)
    return _serialize_response(final_reply)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _record_clean_tape() -> Tape:
    """Record a fresh tape by running the synthetic agent in record mode.

    The agent talks to a scripted fake LLM (a tool_use reply, then the
    SUCCESS reply) through a recording transport, so no network is involved.

    Returns:
        The tape holding the exchanges captured during the run.
    """
    fake = ScriptedFakeLLM([TOOL_RESP, SUCCESS_RESP])
    tape = Tape(agent_name="synthetic_booking_agent")
    transport = TraceforkTransport("record", tape, fake)
    # This helper runs once per validation run; use the client as a context
    # manager so each one is closed instead of being left to the GC.
    with httpx.Client(transport=transport) as http_client:
        client = anthropic.Anthropic(
            api_key="sk-ant-fake",
            http_client=http_client,
            max_retries=0,
        )
        synthetic_agent(client)
    return tape
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class ValidationReport:
    """Outcome of validating the blame engine against one fault class."""

    # Fault class that was injected in every run.
    fault_class: FaultClass
    # Number of record / inject / rank runs performed.
    n_runs: int
    # Runs in which the top-ranked step was the faulted step.
    top1_correct: int
    # top1_correct / n_runs (0.0 when n_runs is not positive).
    top1_precision: float
    # Largest flip rate seen in any negative-control ranking.
    negative_control_max_flip: float = 0.0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class ValidationRunner:
    """Offline fault-injection validation for one fault class.

    Every run records a clean tape, injects the configured fault at step 0,
    asks the blame engine to rank the steps, and then repeats the ranking
    with a no-op perturbation as a negative control.
    """

    def __init__(self, fault_class: FaultClass, *, k: int = 3, n_runs: int = 5) -> None:
        self._fault_class = fault_class
        self._k = k
        self._n_runs = n_runs

    def run(self) -> ValidationReport:
        """Execute all runs and summarise top-1 precision and control flips."""
        oracle = StringMatchOracle(success_re=r"SUCCESS", failure_re=r"FAIL")
        fault_step = 0
        hits = 0
        worst_control_flip = 0.0

        # Negative control: no real perturbation anywhere → expect no flips.
        def null_perturb_factory(step_idx: int):
            return SUCCESS_RESP, ScriptedFakeLLM([SUCCESS_RESP] * 10)

        for _ in range(self._n_runs):
            tape = _record_clean_tape()
            mutated_resp = FaultInjector.inject(tape, fault_step, self._fault_class)

            # Scope note: positive-vs-inert control. Only the faulted step is
            # paired with a tail that can flip the outcome; every other step
            # gets an inert tail. This shows the engine ranks a genuinely
            # outcome-flipping step first (test_blame.py injects the flip at
            # the *final* step to show it isn't hardwired to step 0), not
            # that it separates several competing causes on a long tape.
            # See README → Validation scope.
            def perturb_factory(step_idx: int, _mutated=mutated_resp, _fault=fault_step):
                if step_idx != _fault:
                    # Benign perturbation that should not flip the outcome.
                    return SUCCESS_RESP, ScriptedFakeLLM([SUCCESS_RESP] * 10)
                # Faulted step: the tail answers FAIL once it sees the marker.
                flipping_tail = FaultAwareFakeLLM(
                    normal_responses=[SUCCESS_RESP] * 10,
                    fault_responses=[FAIL_RESP] * 10,
                    fault_marker=FAULT_MARKER_BYTES,
                )
                return _mutated, flipping_tail

            ranking = BlameEngine.rank(
                tape,
                synthetic_agent,
                oracle,
                perturb_factory=perturb_factory,
                k=self._k,
                budget_usd=100.0,
            )
            best = ranking.top()
            if best is not None and best.step_index == fault_step:
                hits += 1

            control = BlameEngine.rank(
                tape,
                synthetic_agent,
                oracle,
                perturb_factory=null_perturb_factory,
                k=self._k,
                budget_usd=100.0,
            )
            for outcome in control.results:
                if outcome.flip_rate > worst_control_flip:
                    worst_control_flip = outcome.flip_rate

        if self._n_runs > 0:
            precision = hits / self._n_runs
        else:
            precision = 0.0

        return ValidationReport(
            fault_class=self._fault_class,
            n_runs=self._n_runs,
            top1_correct=hits,
            top1_precision=precision,
            negative_control_max_flip=worst_control_flip,
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def run_all_fault_classes(k: int = 3, n_runs: int = 5) -> dict:
    """Validate every member of ``FaultClass`` and collect the results.

    Returns a dict keyed by each fault class's ``value``; every entry holds
    the report's top-1 precision, top-1 hit count, run count, and the
    maximum negative-control flip rate.
    """
    exported_fields = (
        "top1_precision",
        "top1_correct",
        "n_runs",
        "negative_control_max_flip",
    )
    summary = {}
    for fault_class in FaultClass:
        outcome = ValidationRunner(fault_class, k=k, n_runs=n_runs).run()
        summary[fault_class.value] = {
            field: getattr(outcome, field) for field in exported_fields
        }
    return summary
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>tracefork — time-travel debugger</title>
|
|
7
|
+
<style>
|
|
8
|
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
|
9
|
+
:root {
|
|
10
|
+
--bg: #0d1117; --surface: #161b22; --border: #30363d;
|
|
11
|
+
--text: #c9d1d9; --muted: #6e7681; --green: #3fb950;
|
|
12
|
+
--blue: #58a6ff; --orange: #f0883e; --purple: #d2a8ff;
|
|
13
|
+
--red: #f85149; --yellow: #e3b341;
|
|
14
|
+
}
|
|
15
|
+
body { background: var(--bg); color: var(--text); font-family: ui-monospace, monospace; font-size: 13px; }
|
|
16
|
+
header { padding: 12px 20px; border-bottom: 1px solid var(--border); display: flex; align-items: center; gap: 12px; }
|
|
17
|
+
header h1 { font-size: 15px; color: var(--green); }
|
|
18
|
+
header .run-meta { color: var(--muted); font-size: 11px; }
|
|
19
|
+
.layout { display: grid; grid-template-columns: 280px 1fr 320px; height: calc(100vh - 45px); }
|
|
20
|
+
.panel { border-right: 1px solid var(--border); overflow-y: auto; }
|
|
21
|
+
.panel:last-child { border-right: none; }
|
|
22
|
+
.panel-header { padding: 10px 14px; background: var(--surface); border-bottom: 1px solid var(--border);
|
|
23
|
+
font-size: 11px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.05em; }
|
|
24
|
+
|
|
25
|
+
/* Panel 1 — Timeline */
|
|
26
|
+
.exchange-item { padding: 10px 14px; border-bottom: 1px solid var(--border); cursor: pointer; }
|
|
27
|
+
.exchange-item:hover, .exchange-item.active { background: var(--surface); }
|
|
28
|
+
.exchange-item .step { font-size: 10px; color: var(--muted); }
|
|
29
|
+
.exchange-item .role { font-size: 12px; margin: 3px 0; }
|
|
30
|
+
.role-user { color: var(--blue); }
|
|
31
|
+
.role-assistant { color: var(--purple); }
|
|
32
|
+
.role-tool { color: var(--orange); }
|
|
33
|
+
.exchange-item .preview { font-size: 11px; color: var(--muted); white-space: nowrap; overflow: hidden; text-overflow: ellipsis; }
|
|
34
|
+
.blame-badge { float: right; font-size: 10px; padding: 2px 6px; border-radius: 10px; background: #1a3a1a; color: var(--green); }
|
|
35
|
+
.blame-badge.high { background: #3a1a1a; color: var(--red); }
|
|
36
|
+
|
|
37
|
+
/* Panel 2 — Detail */
|
|
38
|
+
.detail-empty { padding: 40px; color: var(--muted); text-align: center; }
|
|
39
|
+
.detail-section { padding: 14px; border-bottom: 1px solid var(--border); }
|
|
40
|
+
.detail-section h3 { font-size: 11px; color: var(--muted); margin-bottom: 8px; text-transform: uppercase; }
|
|
41
|
+
pre { background: var(--surface); padding: 10px; border-radius: 4px; overflow-x: auto; font-size: 11px; line-height: 1.6; }
|
|
42
|
+
.key { color: var(--blue); }
|
|
43
|
+
.string { color: var(--green); }
|
|
44
|
+
.number { color: var(--yellow); }
|
|
45
|
+
|
|
46
|
+
/* Panel 3 — Blame */
|
|
47
|
+
.blame-empty { padding: 40px; color: var(--muted); text-align: center; }
|
|
48
|
+
.blame-row { padding: 8px 14px; border-bottom: 1px solid var(--border); display: flex; gap: 8px; align-items: center; }
|
|
49
|
+
.blame-rank { width: 20px; color: var(--muted); font-size: 11px; }
|
|
50
|
+
.blame-info { flex: 1; }
|
|
51
|
+
.blame-step { font-size: 12px; }
|
|
52
|
+
.blame-ci { font-size: 10px; color: var(--muted); }
|
|
53
|
+
.blame-bar-wrap { width: 80px; }
|
|
54
|
+
.blame-bar { height: 6px; background: var(--green); border-radius: 3px; }
|
|
55
|
+
.blame-bar.decisive { background: var(--red); }
|
|
56
|
+
.blame-rate { font-size: 11px; text-align: right; min-width: 50px; }
|
|
57
|
+
.blame-rate.decisive { color: var(--red); font-weight: bold; }
|
|
58
|
+
.blame-header-row { padding: 6px 14px; background: var(--surface); border-bottom: 1px solid var(--border);
|
|
59
|
+
display: flex; gap: 8px; font-size: 10px; color: var(--muted); }
|
|
60
|
+
|
|
61
|
+
.loading { padding: 40px; text-align: center; color: var(--muted); }
|
|
62
|
+
.error { padding: 20px; color: var(--red); }
|
|
63
|
+
.tag { display: inline-block; padding: 1px 6px; border-radius: 3px; font-size: 10px; margin-left: 6px; }
|
|
64
|
+
.tag-live { background: #1a3a1a; color: var(--green); }
|
|
65
|
+
.tag-static { background: #1a2a3a; color: var(--blue); }
|
|
66
|
+
</style>
|
|
67
|
+
</head>
|
|
68
|
+
<body>
|
|
69
|
+
<header>
|
|
70
|
+
<h1>tracefork</h1>
|
|
71
|
+
<span class="run-meta" id="run-meta">loading…</span>
|
|
72
|
+
<span class="tag" id="mode-tag"></span>
|
|
73
|
+
</header>
|
|
74
|
+
<div class="layout">
|
|
75
|
+
<div class="panel" id="timeline-panel">
|
|
76
|
+
<div class="panel-header">Timeline</div>
|
|
77
|
+
<div id="timeline-content"><div class="loading">loading…</div></div>
|
|
78
|
+
</div>
|
|
79
|
+
<div class="panel" id="detail-panel">
|
|
80
|
+
<div class="panel-header">Exchange Detail</div>
|
|
81
|
+
<div id="detail-content"><div class="detail-empty">← select an exchange</div></div>
|
|
82
|
+
</div>
|
|
83
|
+
<div class="panel" id="blame-panel">
|
|
84
|
+
<div class="panel-header">Blame</div>
|
|
85
|
+
<div id="blame-content"><div class="blame-empty">run tracefork blame to populate</div></div>
|
|
86
|
+
</div>
|
|
87
|
+
</div>
|
|
88
|
+
|
|
89
|
+
<script>
|
|
90
|
+
// ── data source ───────────────────────────────────────────────────────────
|
|
91
|
+
let DATA = null;
|
|
92
|
+
|
|
93
|
+
// Resolve the report data.
// Static mode: data was injected as window.__TRACEFORK_DATA__ at
// report-generation time. Live mode: the run named by the ?run_id= query
// parameter is fetched from window.__TRACEFORK_SERVER_URL__ (an empty base
// means same-origin). Throws when no source is configured, when run_id is
// missing in live mode, or when the server responds with a non-OK status.
async function loadData() {
  const modeTag = document.getElementById('mode-tag');
  const showMode = (mode) => {
    modeTag.textContent = mode;
    modeTag.className = `tag tag-${mode}`;
  };

  if (window.__TRACEFORK_DATA__) {
    showMode('static');
    return window.__TRACEFORK_DATA__;
  }

  if (window.__TRACEFORK_SERVER_URL__ !== undefined) {
    showMode('live');
    const url = window.__TRACEFORK_SERVER_URL__;
    const runId = new URLSearchParams(location.search).get('run_id');
    // Without this guard a missing parameter would request /api/run/null.
    if (!runId) throw new Error('missing run_id query parameter');
    // Encode the id so characters such as "/" or "?" cannot alter the request path.
    const resp = await fetch(`${url}/api/run/${encodeURIComponent(runId)}`);
    if (!resp.ok) throw new Error(`server ${resp.status}: run not found`);
    return resp.json();
  }

  throw new Error('No data source configured');
}
|
|
112
|
+
|
|
113
|
+
// ── render ─────────────────────────────────────────────────────────────────
|
|
114
|
+
// Fill the timeline panel with one clickable row per exchange. A row shows
// the exchange index, role and preview, plus a flip-rate badge when
// data.blame has an entry for that index (styled "high" at >= 70%).
function renderTimeline(data) {
  const blame = data.blame || {};
  const rows = [];

  data.exchanges.forEach((ex, i) => {
    const entry = blame[i];
    const flipRate = entry ? entry.flip_rate : null;

    let badgeHtml = '';
    if (flipRate !== null) {
      const highClass = flipRate >= 0.7 ? ' high' : '';
      badgeHtml = `<span class="blame-badge${highClass}">${Math.round(flipRate*100)}%</span>`;
    }

    let roleClass = 'role-tool';
    if (ex.role === 'user') {
      roleClass = 'role-user';
    } else if (ex.role === 'assistant') {
      roleClass = 'role-assistant';
    }

    rows.push([
      `<div class="exchange-item" data-i="${i}" onclick="selectExchange(${i})">`,
      badgeHtml,
      `<div class="step">exchange ${i}</div>`,
      `<div class="role ${roleClass}">${escape(ex.role || 'unknown')}</div>`,
      `<div class="preview">${escape(ex.preview || '')}</div>`,
      '</div>',
    ].join('\n'));
  });

  document.getElementById('timeline-content').innerHTML = rows.join('');
}
|
|
132
|
+
|
|
133
|
+
// Fill the blame panel with the steps from data.blame, ordered by flip rate
// (highest first). Leaves the panel untouched when there is no blame data.
// Steps at or above a 70% flip rate get the "decisive" styling.
function renderBlame(data) {
  const blame = data.blame;
  if (!blame) return;
  if (Object.keys(blame).length === 0) return;

  const ranked = Object.entries(blame);
  ranked.sort((a, b) => b[1].flip_rate - a[1].flip_rate);

  const headerHtml = [
    '<div class="blame-header-row">',
    '<span style="width:20px">rank</span>',
    '<span style="flex:1">step</span>',
    '<span style="width:80px">bar</span>',
    '<span style="min-width:50px;text-align:right">flip</span>',
    '</div>',
  ].join('\n');

  const rowChunks = ranked.map(([step, info], rank) => {
    const rate = info.flip_rate;
    const decisiveClass = rate >= 0.7 ? ' decisive' : '';
    const pct = Math.round(rate * 100);
    return [
      '<div class="blame-row">',
      `<span class="blame-rank">${rank+1}</span>`,
      '<div class="blame-info">',
      `<div class="blame-step">step-${step}</div>`,
      `<div class="blame-ci">95% CI [${info.ci_lo.toFixed(2)}, ${info.ci_hi.toFixed(2)}]</div>`,
      '</div>',
      `<div class="blame-bar-wrap"><div class="blame-bar${decisiveClass}" style="width:${pct}%"></div></div>`,
      `<div class="blame-rate${decisiveClass}">${pct}%</div>`,
      '</div>',
    ].join('\n');
  });

  document.getElementById('blame-content').innerHTML = headerHtml + rowChunks.join('');
}
|
|
159
|
+
|
|
160
|
+
let _selected = -1;
|
|
161
|
+
// Mark exchange i as the active timeline row and show its request and
// response preview as highlighted JSON in the detail panel.
function selectExchange(i) {
  const rowFor = (index) => document.querySelector(`[data-i="${index}"]`);

  if (_selected >= 0) {
    rowFor(_selected)?.classList.remove('active');
  }
  _selected = i;
  rowFor(i)?.classList.add('active');

  const ex = DATA.exchanges[i];
  const pretty = (value) => syntaxHighlight(JSON.stringify(value || {}, null, 2));
  const reqJson = pretty(ex.request);
  const respJson = pretty(ex.response_preview);

  document.getElementById('detail-content').innerHTML = [
    '',
    '<div class="detail-section">',
    `<h3>Request (exchange ${i})</h3>`,
    `<pre>${reqJson}</pre>`,
    '</div>',
    '<div class="detail-section">',
    '<h3>Response</h3>',
    `<pre>${respJson}</pre>`,
    '</div>',
  ].join('\n');
}
|
|
180
|
+
|
|
181
|
+
// Turn a JSON string into HTML with <span> wrappers for colouring.
// Keys get class "key", other strings "string", and everything else matched
// (numbers, true/false/null) gets "number".
function syntaxHighlight(json) {
  // Escape &, <, > to entities first: the result is assigned to innerHTML,
  // so recorded request/response text must not be parsed as markup.
  const escaped = json
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  return escaped.replace(
    /("(\\u[\da-fA-F]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g,
    m => {
      let cls = 'number';
      // A quoted token followed by ":" is an object key.
      if (/^"/.test(m)) cls = /:$/.test(m) ? 'key' : 'string';
      return `<span class="${cls}">${m}</span>`;
    });
}
|
|
191
|
+
|
|
192
|
+
// Escape a value for interpolation into HTML. &, <, > and " are replaced by
// entities so data-derived text (roles, previews) cannot inject markup.
// Non-string input is coerced with String() instead of throwing.
function escape(s) {
  return String(s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
|
|
193
|
+
|
|
194
|
+
// ── boot ───────────────────────────────────────────────────────────────────
|
|
195
|
+
// Boot: load the data, fill the header and both data panels, and select the
// first exchange. Any failure is shown in the timeline panel.
(async () => {
  try {
    DATA = await loadData();
    document.getElementById('run-meta').textContent =
      `${DATA.agent_name || 'unknown agent'} · ${DATA.exchanges.length} exchanges · ${DATA.created_at || ''}`;
    renderTimeline(DATA);
    renderBlame(DATA);
    if (DATA.exchanges.length > 0) selectExchange(0);
  } catch (e) {
    // Build the error node with textContent rather than innerHTML so the
    // message (which can echo response text) is never parsed as markup.
    const errorBox = document.createElement('div');
    errorBox.className = 'error';
    errorBox.textContent = e.message;
    document.getElementById('timeline-content').replaceChildren(errorBox);
  }
})();
|
|
207
|
+
</script>
|
|
208
|
+
</body>
|
|
209
|
+
</html>
|
tracefork/wire.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Anthropic wire-format response builders.
|
|
2
|
+
|
|
3
|
+
Real Anthropic Messages-API JSON, used in three places:
|
|
4
|
+
- the offline test fakes (`tests/fakes.py` re-exports these),
|
|
5
|
+
- the blame engine's perturbation responses,
|
|
6
|
+
- the fault-injection validation suite.
|
|
7
|
+
|
|
8
|
+
Keeping them in the package (not in tests/) means production code never
|
|
9
|
+
imports from the test tree.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
|
|
16
|
+
from .tape import sha256_hex
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def make_text_response(
    text: str,
    *,
    model: str = "claude-sonnet-4-6",
    input_tokens: int = 100,
    output_tokens: int = 20,
) -> bytes:
    """Build the wire-format JSON bytes of a final assistant text message.

    The message id is derived from a hash of ``text + model``, so equal
    inputs always produce equal bytes.
    """
    digest = sha256_hex((text + model).encode())
    message = {
        "id": "msg_" + digest[:20],
        "type": "message",
        "role": "assistant",
        "model": model,
        "content": [{"type": "text", "text": text}],
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "usage": {"input_tokens": input_tokens, "output_tokens": output_tokens},
    }
    return json.dumps(message).encode()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def make_tool_use_response(
    tool_name: str,
    tool_input: dict,
    *,
    model: str = "claude-sonnet-4-6",
    preamble: str = "",
    input_tokens: int = 100,
    output_tokens: int = 30,
) -> bytes:
    """Build the wire-format JSON bytes of an assistant tool_use message.

    A non-empty ``preamble`` is emitted as a text block ahead of the
    tool_use block. The tool-use id is derived from a hash of the tool name
    plus its JSON-encoded input; the message id from the tool name plus the
    model.
    """
    tool_digest = sha256_hex((tool_name + json.dumps(tool_input)).encode())
    tool_block = {
        "type": "tool_use",
        "id": "toolu_" + tool_digest[:18],
        "name": tool_name,
        "input": tool_input,
    }
    if preamble:
        blocks: list[dict] = [{"type": "text", "text": preamble}, tool_block]
    else:
        blocks = [tool_block]

    message_digest = sha256_hex((tool_name + model).encode())
    message = {
        "id": "msg_" + message_digest[:20],
        "type": "message",
        "role": "assistant",
        "model": model,
        "content": blocks,
        "stop_reason": "tool_use",
        "stop_sequence": None,
        "usage": {"input_tokens": input_tokens, "output_tokens": output_tokens},
    }
    return json.dumps(message).encode()
|