tracefork 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracefork/__init__.py +6 -0
- tracefork/blame.py +296 -0
- tracefork/cli.py +367 -0
- tracefork/constants.py +24 -0
- tracefork/faults.py +129 -0
- tracefork/fork.py +173 -0
- tracefork/nondet.py +96 -0
- tracefork/py.typed +0 -0
- tracefork/recorder.py +140 -0
- tracefork/replay.py +119 -0
- tracefork/report.py +131 -0
- tracefork/server.py +73 -0
- tracefork/store.py +123 -0
- tracefork/synthetic.py +104 -0
- tracefork/tape.py +135 -0
- tracefork/transport.py +137 -0
- tracefork/validate.py +177 -0
- tracefork/web/report.html +209 -0
- tracefork/wire.py +76 -0
- tracefork-0.1.0.dist-info/METADATA +235 -0
- tracefork-0.1.0.dist-info/RECORD +32 -0
- tracefork-0.1.0.dist-info/WHEEL +4 -0
- tracefork-0.1.0.dist-info/entry_points.txt +2 -0
- tracefork-0.1.0.dist-info/licenses/LICENSE +21 -0
- tracefork_spike/__init__.py +7 -0
- tracefork_spike/__main__.py +3 -0
- tracefork_spike/agent.py +91 -0
- tracefork_spike/fake_llm.py +106 -0
- tracefork_spike/nondet.py +97 -0
- tracefork_spike/spike.py +125 -0
- tracefork_spike/tape.py +79 -0
- tracefork_spike/transport.py +68 -0
tracefork/faults.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Fault injection: five fault classes that mutate a recorded tape exchange.
|
|
2
|
+
|
|
3
|
+
Every injector returns a **valid** Anthropic wire-format message (so the SDK
|
|
4
|
+
parses it when it is replayed at a fork's divergence point) and embeds the
|
|
5
|
+
string ``FAULT_MARKER`` *inside* a content field — a text block or a tool-use
|
|
6
|
+
input. A synthetic agent echoes that field into its next request, where
|
|
7
|
+
`FaultAwareFakeLLM` detects the marker and returns a failure. That chain is
|
|
8
|
+
what lets the blame engine be validated entirely offline against ground truth.
|
|
9
|
+
|
|
10
|
+
The marker must stay inside the JSON: appending it after the closing brace
|
|
11
|
+
would make the response unparseable and the fault would vanish into an
|
|
12
|
+
exception instead of propagating.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import enum
|
|
18
|
+
import json
|
|
19
|
+
|
|
20
|
+
# Sentinel string that every injector embeds inside a content field of the
# mutated response.
FAULT_MARKER = "FAULT_MARKER"
# The same sentinel as bytes (str.encode() default encoding, i.e. UTF-8).
FAULT_MARKER_BYTES = FAULT_MARKER.encode()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class FaultClass(enum.Enum):
    """The five fault classes that `FaultInjector.inject` dispatches on."""

    # Each member's value is the snake_case spelling of its own name.
    CORRUPTED_TOOL_OUTPUT = "corrupted_tool_output"
    MISLEADING_RETRIEVAL = "misleading_retrieval"
    WRONG_SYSTEM_PROMPT = "wrong_system_prompt"
    DROPPED_MESSAGE = "dropped_message"
    POISONED_ARGUMENT = "poisoned_argument"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _text_message(text: str) -> bytes:
    """Serialise a one-block assistant text message carrying `text`.

    The message uses fixed placeholder metadata (id, model, usage) and ends
    with ``stop_reason == "end_turn"``. The result is the JSON document
    encoded to bytes.
    """
    text_block = {"type": "text", "text": text}
    message = {
        "id": "msg_fault",
        "type": "message",
        "role": "assistant",
        "model": "claude-sonnet-4-6",
        "content": [text_block],
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "usage": {"input_tokens": 10, "output_tokens": 10},
    }
    serialised = json.dumps(message)
    return serialised.encode()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class FaultInjector:
    """Mutates a response bytes object to inject a known fault.

    Every injector returns JSON-encoded bytes with ``FAULT_MARKER`` embedded
    in a content field: either inside a tool-use ``input`` or inside the text
    of a replacement text message.
    """

    @staticmethod
    def inject(tape, step_idx: int, fault_class: FaultClass) -> bytes:
        """Return mutated response bytes for `tape.exchanges[step_idx][1]`.

        Args:
            tape: object whose ``exchanges[step_idx][1]`` is the recorded
                response body to mutate.
            step_idx: index of the exchange to mutate.
            fault_class: selects which injector is applied.

        Raises:
            KeyError: if `fault_class` has no registered injector.
        """
        original_resp = tape.exchanges[step_idx][1]
        method = {
            FaultClass.CORRUPTED_TOOL_OUTPUT: FaultInjector.corrupt_tool_output_default,
            FaultClass.MISLEADING_RETRIEVAL: FaultInjector.misleading_retrieval,
            FaultClass.WRONG_SYSTEM_PROMPT: FaultInjector.wrong_system_prompt,
            FaultClass.DROPPED_MESSAGE: FaultInjector.dropped_message,
            FaultClass.POISONED_ARGUMENT: FaultInjector.poisoned_argument,
        }[fault_class]
        return method(original_resp)

    # ── tool-use faults (keep the response a valid tool_use) ──────────────────

    @staticmethod
    def _tool_use_inputs(resp_bytes: bytes) -> tuple[dict | None, list[dict]]:
        """Parse `resp_bytes` and collect the input dicts of its tool_use blocks.

        Returns ``(message, inputs)``. ``inputs`` holds the *live* ``input``
        dict of every tool_use block, so mutating an entry mutates
        ``message``. A tool_use block without an ``input`` key gets an empty
        one. Anything that is not the expected shape (unparseable bytes, a
        non-object document, a non-list ``content``, non-dict blocks or
        inputs) is skipped rather than raising, so callers can take their
        marked-text fallback.
        """
        try:
            message = json.loads(resp_bytes)
        except (TypeError, ValueError, RecursionError):
            return None, []
        if not isinstance(message, dict):
            return None, []
        content = message.get("content", [])
        if not isinstance(content, list):
            return message, []
        inputs: list[dict] = []
        for block in content:
            if not isinstance(block, dict) or block.get("type") != "tool_use":
                continue
            inp = block.setdefault("input", {})
            if isinstance(inp, dict):
                inputs.append(inp)
        return message, inputs

    @staticmethod
    def corrupt_tool_output(resp_bytes: bytes, *, field: str, new_value) -> bytes:
        """Flip a field in a tool-use input and tag the input with the marker.

        Every tool_use input that already contains `field` has it replaced by
        `new_value` and gains a ``_tracefork_fault`` key holding the marker.

        Falls back to a marked text message when no tool_use input contains
        `field` (including when the response has no tool_use block or cannot
        be parsed), so the fault always carries the marker inside valid JSON.
        """
        message, inputs = FaultInjector._tool_use_inputs(resp_bytes)
        touched = False
        for inp in inputs:
            if field in inp:
                inp[field] = new_value
                inp["_tracefork_fault"] = FAULT_MARKER
                touched = True
        if not touched:
            return _text_message(f"corrupted output {FAULT_MARKER}")
        return json.dumps(message).encode()

    @staticmethod
    def corrupt_tool_output_default(resp_bytes: bytes) -> bytes:
        """Apply `corrupt_tool_output` with ``field="seats"`` and ``new_value=0``."""
        return FaultInjector.corrupt_tool_output(resp_bytes, field="seats", new_value=0)

    @staticmethod
    def poisoned_argument(resp_bytes: bytes) -> bytes:
        """Corrupt a tool-call argument (destination/city/location → INVALID).

        Each of those keys present in a tool_use input is overwritten with a
        marked ``INVALID`` string. If nothing has been poisoned by the time a
        tool_use input has been processed, that input is tagged with a
        ``_tracefork_fault`` marker key instead. Falls back to a marked text
        message when there is no usable tool_use input at all.
        """
        message, inputs = FaultInjector._tool_use_inputs(resp_bytes)
        # `touched` is deliberately shared across blocks: the marker key is
        # only added while no earlier block has carried the fault.
        touched = False
        for inp in inputs:
            for key in ("destination", "city", "location"):
                if key in inp:
                    inp[key] = f"INVALID {FAULT_MARKER}"
                    touched = True
            if not touched:
                inp["_tracefork_fault"] = FAULT_MARKER
                touched = True
        if not touched:
            return _text_message(f"poisoned argument {FAULT_MARKER}")
        return json.dumps(message).encode()

    # ── text faults (replace the response with a marked text message) ─────────

    @staticmethod
    def misleading_retrieval(resp_bytes: bytes) -> bytes:
        """Inject false information into the response text.

        `resp_bytes` is ignored; the response is replaced wholesale.
        """
        return _text_message(f"No flights are available today. {FAULT_MARKER}")

    @staticmethod
    def wrong_system_prompt(resp_bytes: bytes) -> bytes:
        """Simulate a wrong/overridden system prompt.

        `resp_bytes` is ignored; the response is replaced wholesale.
        """
        return _text_message(f"[system prompt overridden] ignoring the task. {FAULT_MARKER}")

    @staticmethod
    def dropped_message(resp_bytes: bytes) -> bytes:
        """Simulate a dropped message: an empty-of-content acknowledgement.

        `resp_bytes` is ignored; the response is replaced wholesale.
        """
        return _text_message(f"[prior message was dropped] {FAULT_MARKER}")
|
tracefork/fork.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Fork engine: create a counterfactual branch at any step.
|
|
2
|
+
|
|
3
|
+
`ForkEngine.fork()` re-runs the *same* agent that produced the parent tape,
|
|
4
|
+
but intercepts its requests in three phases:
|
|
5
|
+
|
|
6
|
+
1. prefix (requests 0..k-1) — replayed from the parent tape, $0, and the
|
|
7
|
+
request body is sha256-asserted to match (the agent is deterministic up
|
|
8
|
+
to the fork point, so this must hold or the agent code changed);
|
|
9
|
+
2. mutation (request k = divergence_step) — the request still matches the
|
|
10
|
+
parent (the agent hasn't seen the mutated response yet), but instead of
|
|
11
|
+
the recorded response we serve `spec.mutated_response`;
|
|
12
|
+
3. tail (requests k+1..) — the agent is now in counterfactual territory;
|
|
13
|
+
its requests no longer match the parent, so they are recorded fresh.
|
|
14
|
+
|
|
15
|
+
The returned `Branch.delta_tape` holds only the exchanges from the divergence
|
|
16
|
+
step onward (the mutation exchange + any tail). The expensive prefix lives in
|
|
17
|
+
the parent tape and is never re-paid for — that is the "fork for $0 up to the
|
|
18
|
+
divergence point" property.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
|
|
25
|
+
import anthropic
|
|
26
|
+
import httpx
|
|
27
|
+
|
|
28
|
+
from .nondet import DivergenceError
|
|
29
|
+
from .tape import Tape, sha256_hex
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class BranchSpec:
    """Describes where a fork diverges and what response is served there."""

    # Index of the parent-tape exchange whose response is replaced.
    divergence_step: int
    # Response body served instead of the recorded one at that step.
    mutated_response: bytes
    # Free-form description; `ForkEngine.fork` copies it onto the Branch.
    mutation_desc: str = ""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class Branch:
    """Result of `ForkEngine.fork`: the parent tape plus the recorded delta."""

    # The tape that was forked.
    parent_tape: Tape
    # Step at which the mutated response was served.
    divergence_step: int
    # Exchanges from the divergence step onward (mutation exchange + tail).
    delta_tape: Tape
    # Description carried over from the BranchSpec.
    mutation_desc: str = ""
    prefix_replayed: int = 0  # parent exchanges replayed for $0 (the savings)
    tail_recorded: int = 0  # counterfactual exchanges recorded fresh
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ForkTransport(httpx.BaseTransport):
    """Transport that serves a fork in three phases, keyed on request order.

    Requests before the divergence step are answered from the parent tape,
    the request at the divergence step is answered with the mutated response,
    and every later request is forwarded to `inner` and recorded. Only the
    last phase touches `inner`; the first two are served from in-memory bytes.
    """

    def __init__(
        self,
        parent_tape: Tape,
        divergence_step: int,
        mutated_response: bytes,
        delta_tape: Tape,
        inner: httpx.BaseTransport,
    ) -> None:
        self.parent = parent_tape
        self.k = divergence_step
        self.mutated = mutated_response
        self.delta = delta_tape
        self.inner = inner
        self._i = 0  # index of the next request to arrive
        self.prefix_replayed = 0
        self.tail_recorded = 0

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Answer `request` according to its position relative to the fork point.

        Raises:
            DivergenceError: if a request at or before the divergence step
                does not hash to the same value as the recorded request.
        """
        body = request.content
        index = self._i
        self._i = index + 1

        if index > self.k:
            # Past the fork point: nothing to compare against, go upstream.
            return self._record_tail(request, body)

        recorded_request, recorded_response = self.parent.exchange(index)
        recorded_digest = sha256_hex(recorded_request)
        replay_digest = sha256_hex(body)
        diverged = recorded_digest != replay_digest

        if index < self.k:
            # Prefix: the agent must have rebuilt the recorded request exactly.
            if diverged:
                raise DivergenceError(
                    f"fork prefix request #{index} diverged from parent tape "
                    f"(recorded {recorded_digest[:12]}, replay {replay_digest[:12]}); "
                    f"the agent is not deterministic up to divergence_step {self.k}"
                )
            self.prefix_replayed += 1
            return _json_response(recorded_response, request)

        # Divergence point: same request as recorded, mutated response.
        if diverged:
            raise DivergenceError(
                f"fork request at divergence_step {index} diverged from parent tape "
                f"(recorded {recorded_digest[:12]}, replay {replay_digest[:12]})"
            )
        self.delta.append_exchange(body, self.mutated)
        return _json_response(self.mutated, request)

    def _record_tail(self, request: httpx.Request, body: bytes) -> httpx.Response:
        """Forward a post-divergence request to `inner` and record the exchange."""
        upstream = self.inner.handle_request(request)
        payload = upstream.read()
        self.delta.append_exchange(body, payload)
        self.tail_recorded += 1
        content_type = upstream.headers.get("content-type", "application/json")
        return httpx.Response(
            upstream.status_code,
            headers={"content-type": content_type},
            content=payload,
            request=request,
        )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _json_response(body: bytes, request: httpx.Request) -> httpx.Response:
    """Wrap `body` in a 200 response with a JSON content type, bound to `request`."""
    json_headers = {"content-type": "application/json"}
    response = httpx.Response(200, headers=json_headers, content=body, request=request)
    return response
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class ForkEngine:
    """Creates counterfactual branches from a recorded tape."""

    @staticmethod
    def fork(
        parent_tape: Tape,
        spec: BranchSpec,
        agent_fn,  # Callable[[anthropic.Anthropic], Any] — the SAME agent
        *,
        post_fork_transport: httpx.BaseTransport | None = None,
        api_key: str = "sk-ant-fork",
    ) -> Branch:
        """Fork `parent_tape` at `spec.divergence_step`.

        `agent_fn` must be the same agent that produced the parent tape: it is
        re-run from the start, its prefix served from the tape for free, the
        response at the divergence step swapped for `spec.mutated_response`,
        and the counterfactual tail recorded via `post_fork_transport` (or a
        default `httpx.HTTPTransport` if None).

        The HTTP client built for the run is closed before returning, as is
        the default transport when this method created it. A caller-supplied
        `post_fork_transport` is left open; its lifetime belongs to the caller.

        Returns a `Branch` whose `delta_tape` holds only the exchanges from the
        divergence step onward.

        Raises:
            ValueError: if `spec.divergence_step` is outside the parent tape.
            Exception: whatever `agent_fn` raises propagates unchanged (after
                cleanup).
        """
        step = spec.divergence_step
        n = len(parent_tape.exchanges)
        if step < 0 or step >= n:
            raise ValueError(f"divergence_step {step} out of range [0, {n})")

        delta_tape = Tape(
            boundary=parent_tape.boundary,
            agent_name=parent_tape.agent_name,
        )
        # Track ownership so only a transport created here gets closed here.
        owns_inner = post_fork_transport is None
        inner = httpx.HTTPTransport() if owns_inner else post_fork_transport
        fork_transport = ForkTransport(parent_tape, step, spec.mutated_response, delta_tape, inner)

        http_client = httpx.Client(transport=fork_transport)
        try:
            client = anthropic.Anthropic(
                api_key=api_key,
                http_client=http_client,
                max_retries=0,
            )
            agent_fn(client)
        finally:
            # ForkTransport does not override close(), so closing the client
            # does not reach `inner`; it is closed separately when owned.
            http_client.close()
            if owns_inner:
                inner.close()

        return Branch(
            parent_tape=parent_tape,
            divergence_step=step,
            delta_tape=delta_tape,
            mutation_desc=spec.mutation_desc,
            prefix_replayed=fork_transport.prefix_replayed,
            tail_recorded=fork_transport.tail_recorded,
        )
|
tracefork/nondet.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Virtualised nondeterminism sources.
|
|
2
|
+
|
|
3
|
+
Bit-exact replay requires capturing every nondeterminism draw at record time
|
|
4
|
+
and serving it back identically at replay. `RecordingNondet` draws real values
|
|
5
|
+
and logs them; `ReplayNondet` serves them back in order; `DriftingNondet` is
|
|
6
|
+
the negative control (fresh real values → forced divergence).
|
|
7
|
+
|
|
8
|
+
The SDK masks transport exceptions as `APIConnectionError`; `find_divergence`
|
|
9
|
+
unwraps `__cause__`/`__context__` to recover a `DivergenceError`.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import datetime
|
|
15
|
+
import uuid
|
|
16
|
+
from typing import Protocol
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DivergenceError(RuntimeError):
|
|
20
|
+
"""Raised when a replay diverges from the recorded tape."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def find_divergence(exc: BaseException | None) -> DivergenceError | None:
|
|
24
|
+
"""Walk an exception's cause/context chain for a DivergenceError.
|
|
25
|
+
|
|
26
|
+
The Anthropic SDK wraps any exception raised inside its httpx transport in
|
|
27
|
+
`APIConnectionError`. This recovers the original `DivergenceError`."""
|
|
28
|
+
seen: set[int] = set()
|
|
29
|
+
cur = exc
|
|
30
|
+
while cur is not None and id(cur) not in seen:
|
|
31
|
+
seen.add(id(cur))
|
|
32
|
+
if isinstance(cur, DivergenceError):
|
|
33
|
+
return cur
|
|
34
|
+
cur = cur.__cause__ or cur.__context__
|
|
35
|
+
return None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class NondetSource(Protocol):
    """Structural interface shared by the nondeterminism sources in this module."""

    # Returns a timestamp as a string (RecordingNondet produces it with isoformat()).
    def now_iso(self) -> str: ...
    # Returns a UUID rendered as a hex string.
    def new_uuid_hex(self) -> str: ...
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class RecordingNondet:
|
|
44
|
+
"""Draws genuinely real values and logs each draw."""
|
|
45
|
+
|
|
46
|
+
def __init__(self) -> None:
|
|
47
|
+
# Capture the real datetime.now and uuid.uuid4 at init time, before
|
|
48
|
+
# Recorder.__enter__ patches datetime.datetime with a subclass.
|
|
49
|
+
self._real_now = datetime.datetime.now
|
|
50
|
+
self._real_uuid4 = uuid.uuid4
|
|
51
|
+
self.draws: list[tuple[str, str]] = []
|
|
52
|
+
|
|
53
|
+
def now_iso(self) -> str:
|
|
54
|
+
v = self._real_now(datetime.UTC).isoformat()
|
|
55
|
+
self.draws.append(("clock", v))
|
|
56
|
+
return v
|
|
57
|
+
|
|
58
|
+
def new_uuid_hex(self) -> str:
|
|
59
|
+
v = self._real_uuid4().hex
|
|
60
|
+
self.draws.append(("uuid", v))
|
|
61
|
+
return v
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class ReplayNondet:
    """Hands recorded draws back in their original order.

    A request for a draw whose kind differs from the next recorded one, or a
    request made after the recorded draws have run out, raises
    `DivergenceError`.
    """

    def __init__(self, draws: list[tuple[str, str]]) -> None:
        # Private copy: later changes to the caller's list do not affect replay.
        self._draws = list(draws)
        self._i = 0

    def _next(self, kind: str) -> str:
        """Return the next recorded value, checking that its kind is `kind`."""
        position = self._i
        available = len(self._draws)
        if position >= available:
            raise DivergenceError(
                f"replay asked for a {kind!r} draw but the tape is exhausted "
                f"(consumed {position}/{available})"
            )
        recorded_kind, recorded_value = self._draws[position]
        if recorded_kind != kind:
            raise DivergenceError(
                f"draw #{position}: replay asked for {kind!r}, tape has {recorded_kind!r}"
            )
        # Advance only after the kind check, so a mismatch consumes nothing.
        self._i = position + 1
        return recorded_value

    def now_iso(self) -> str:
        """Return the next recorded "clock" draw."""
        return self._next("clock")

    def new_uuid_hex(self) -> str:
        """Return the next recorded "uuid" draw."""
        return self._next("uuid")

    def fully_consumed(self) -> bool:
        """Report whether every recorded draw has been served."""
        return len(self._draws) == self._i
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class DriftingNondet(RecordingNondet):
    """Negative control: draws fresh real values during replay, forcing divergence.

    Adds nothing to `RecordingNondet`; the subclass only gives the
    negative-control role its own name.
    """
|
tracefork/py.typed
ADDED
|
File without changes
|
tracefork/recorder.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Recorder and AsyncRecorder — one-line wrappers that record any Anthropic client.
|
|
2
|
+
|
|
3
|
+
`Recorder` wraps a sync `anthropic.Anthropic` client; `AsyncRecorder` wraps an
|
|
4
|
+
`anthropic.AsyncAnthropic` client. Both are context managers. Inside the `with`
|
|
5
|
+
block, `uuid.uuid4` is patched globally so agent-generated IDs are recorded.
|
|
6
|
+
`datetime.datetime.now` is NOT patched here — it is a C classmethod on an
|
|
7
|
+
immutable type (Python 3.12+) and replacing `datetime.datetime` with a subclass
|
|
8
|
+
breaks pydantic's lazy schema builder inside the Anthropic SDK. Agents that need
|
|
9
|
+
deterministic timestamps should call `nondet.now_iso()` via `NondetSource`.
|
|
10
|
+
|
|
11
|
+
Usage (sync):
|
|
12
|
+
with Recorder(client, agent_name="my-agent") as rec:
|
|
13
|
+
result = my_agent(rec.client)
|
|
14
|
+
tape = rec.tape
|
|
15
|
+
|
|
16
|
+
Usage (async):
|
|
17
|
+
async with AsyncRecorder(async_client) as rec:
|
|
18
|
+
result = await my_agent(rec.client)
|
|
19
|
+
tape = rec.tape
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import uuid as _uuid_module
|
|
25
|
+
from collections.abc import Callable
|
|
26
|
+
|
|
27
|
+
import anthropic
|
|
28
|
+
import httpx
|
|
29
|
+
|
|
30
|
+
from .nondet import RecordingNondet
|
|
31
|
+
from .tape import Tape
|
|
32
|
+
from .transport import AsyncTraceforkTransport, TraceforkTransport
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Recorder:
    """Sync context manager that records an Anthropic client's I/O.

    Entering builds a copy of the wrapped client whose HTTP traffic flows
    through a recording `TraceforkTransport`, and patches `uuid.uuid4`
    module-wide so generated IDs are logged as draws on the tape. Exiting
    restores the `uuid.uuid4` that was in place on entry.
    """

    def __init__(self, client: anthropic.Anthropic, agent_name: str = "") -> None:
        self._orig_client = client
        self._agent_name = agent_name
        self._nondet: RecordingNondet | None = None
        self._tape: Tape | None = None
        self._wrapped_client: anthropic.Anthropic | None = None
        # Set while the uuid.uuid4 patch is active; None otherwise.
        self._orig_uuid4: Callable[[], _uuid_module.UUID] | None = None

    @property
    def client(self) -> anthropic.Anthropic:
        """The recording client; only available once the context is entered."""
        if self._wrapped_client is None:
            raise RuntimeError("Use Recorder as a context manager (with Recorder(client) as rec:)")
        return self._wrapped_client

    @property
    def tape(self) -> Tape:
        """The tape being recorded; only available once the context is entered."""
        if self._tape is None:
            raise RuntimeError("Use Recorder as a context manager")
        return self._tape

    def __enter__(self) -> Recorder:
        # RecordingNondet captures the real datetime.now and uuid.uuid4 in __init__
        # before we patch uuid.uuid4 below. Order matters.
        self._nondet = RecordingNondet()
        self._tape = Tape(agent_name=self._agent_name)
        # Share the draws list so recording nondet populates the tape's draws directly
        self._tape.draws = self._nondet.draws

        # Extract the original httpx transport to use as the recording inner transport.
        # This preserves ScriptedFakeLLM in tests and HTTPTransport in production.
        orig_inner = self._orig_client._client._transport
        transport = TraceforkTransport("record", self._tape, orig_inner)
        # `.copy()` preserves the original client's base_url, auth_token, default
        # headers/query and timeout — only the transport and retries are swapped, so
        # a proxied or custom-base_url client still records faithfully.
        self._wrapped_client = self._orig_client.copy(
            http_client=httpx.Client(transport=transport),
            max_retries=0,
        )

        # Patch uuid.uuid4 (regular module-level function — directly assignable).
        # The Anthropic SDK may also call uuid.uuid4() internally; all draws are recorded.
        nondet = self._nondet
        self._orig_uuid4 = _uuid_module.uuid4

        def _patched_uuid4() -> _uuid_module.UUID:
            return _uuid_module.UUID(nondet.new_uuid_hex())

        _uuid_module.uuid4 = _patched_uuid4
        return self

    def __exit__(self, *args: object) -> None:
        """Undo the `uuid.uuid4` patch installed by `__enter__`.

        Does nothing when no patch is active (never entered, or already
        exited), so a stray call cannot overwrite `uuid.uuid4` with None.
        """
        original = self._orig_uuid4
        if original is None:
            return
        self._orig_uuid4 = None
        _uuid_module.uuid4 = original
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class AsyncRecorder:
    """Async context manager that records an AsyncAnthropic client's I/O.

    Entering builds a copy of the wrapped client whose HTTP traffic flows
    through a recording `AsyncTraceforkTransport`, and patches `uuid.uuid4`
    module-wide so generated IDs are logged as draws on the tape. Exiting
    restores the `uuid.uuid4` that was in place on entry.
    """

    def __init__(self, client: anthropic.AsyncAnthropic, agent_name: str = "") -> None:
        self._orig_client = client
        self._agent_name = agent_name
        self._nondet: RecordingNondet | None = None
        self._tape: Tape | None = None
        self._wrapped_client: anthropic.AsyncAnthropic | None = None
        # Set while the uuid.uuid4 patch is active; None otherwise.
        self._orig_uuid4: Callable[[], _uuid_module.UUID] | None = None

    @property
    def client(self) -> anthropic.AsyncAnthropic:
        """The recording client; only available once the context is entered."""
        if self._wrapped_client is None:
            raise RuntimeError("Use AsyncRecorder as an async context manager")
        return self._wrapped_client

    @property
    def tape(self) -> Tape:
        """The tape being recorded; only available once the context is entered."""
        if self._tape is None:
            raise RuntimeError("Use AsyncRecorder as an async context manager")
        return self._tape

    async def __aenter__(self) -> AsyncRecorder:
        # RecordingNondet binds the real uuid.uuid4 in __init__, so it must be
        # created before the patch at the end of this method.
        self._nondet = RecordingNondet()
        self._tape = Tape(agent_name=self._agent_name)
        # Share one list so draws logged by the nondet land on the tape directly.
        self._tape.draws = self._nondet.draws

        orig_inner = self._orig_client._client._transport
        transport = AsyncTraceforkTransport("record", self._tape, orig_inner)
        # `.copy()` preserves base_url, auth_token, default headers/query and timeout
        # — only the transport and retries are swapped.
        self._wrapped_client = self._orig_client.copy(
            http_client=httpx.AsyncClient(transport=transport),
            max_retries=0,
        )

        nondet = self._nondet
        self._orig_uuid4 = _uuid_module.uuid4

        def _patched_uuid4() -> _uuid_module.UUID:
            return _uuid_module.UUID(nondet.new_uuid_hex())

        _uuid_module.uuid4 = _patched_uuid4
        return self

    async def __aexit__(self, *args: object) -> None:
        """Undo the `uuid.uuid4` patch installed by `__aenter__`.

        Does nothing when no patch is active (never entered, or already
        exited), so a stray call cannot overwrite `uuid.uuid4` with None.
        """
        original = self._orig_uuid4
        if original is None:
            return
        self._orig_uuid4 = None
        _uuid_module.uuid4 = original
|
tracefork/replay.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Verified replay: run an agent on a recorded tape and assert bit-exactness.
|
|
2
|
+
|
|
3
|
+
`ReplayVerifier` loads a tape, runs the caller's agent function with a
|
|
4
|
+
`TraceforkTransport("replay", tape)`, and returns a `VerificationResult`
|
|
5
|
+
describing whether the replay was bit-exact. `DriftDoctor` classifies why
|
|
6
|
+
a divergence happened when it wasn't.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import enum
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
import anthropic
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
from .nondet import DivergenceError, find_divergence
|
|
18
|
+
from .tape import Tape
|
|
19
|
+
from .transport import TraceforkTransport
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DriftCause(enum.Enum):
    """Categories that `DriftDoctor.classify` assigns to a divergence."""

    # Chosen when the hint mentions "unrecorded", "exhausted" or "draw".
    UNRECORDED_NONDET = "unrecorded_nondet"
    # Fallback when no keyword matches: request bytes diverged.
    CODE_CHANGE = "code_change"
    # Chosen when the hint mentions "extra" or "boundary".
    BOUNDARY_VIOLATION = "boundary_violation"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class DivergenceReport:
    """What `ReplayVerifier.verify` captured when a replay diverged."""

    # The replay transport's request counter (`_i`) at the time the error surfaced.
    step_index: int
    cause_hint: str  # raw message from DivergenceError
    # The DivergenceError itself, unwrapped from any SDK wrapper exception.
    error: DivergenceError
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class VerificationResult:
    """Outcome of one `ReplayVerifier.verify` run."""

    # True when no divergence occurred and the transport reports full consumption.
    bit_exact: bool
    # The replay transport's `matched` counter.
    matched: int
    # Number of exchanges on the recorded tape.
    total: int
    # As computed by `verify`: true only for a bit-exact replay.
    fingerprints_match: bool
    # `tape.digest()` of the recorded tape.
    recorded_fingerprint: str
    # Equal to the recorded fingerprint for a bit-exact replay, "" otherwise.
    replayed_fingerprint: str
    # Populated only when the replay diverged.
    divergence: DivergenceReport | None = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ReplayVerifier:
    """Runs an agent against a recorded tape and reports whether it matched."""

    def __init__(
        self,
        tape: Tape,
        agent_fn,  # Callable[[anthropic.Anthropic], Any]
        *,
        api_key: str = "sk-ant-replay",
    ) -> None:
        self._tape = tape
        self._agent_fn = agent_fn
        self._api_key = api_key

    def verify(self) -> VerificationResult:
        """Replay the tape through the agent and summarise the outcome.

        A `DivergenceError`, raised directly or wrapped in another exception's
        cause/context chain, is captured into the result. Any other exception
        from the agent propagates to the caller.
        """
        replay_transport = TraceforkTransport("replay", self._tape)
        sdk_client = anthropic.Anthropic(
            api_key=self._api_key,
            http_client=httpx.Client(transport=replay_transport),
            max_retries=0,
        )

        report: DivergenceReport | None = None
        try:
            self._agent_fn(sdk_client)
        except Exception as raised:
            # find_divergence returns the exception itself when it already is
            # a DivergenceError, so one handler covers both cases.
            found = find_divergence(raised)
            if found is None:
                raise
            report = DivergenceReport(
                step_index=replay_transport._i,
                cause_hint=str(found),
                error=found,
            )

        recorded_fp = self._tape.digest()

        # No separate digest is computed for the replay: a clean, fully
        # consumed run reuses the recorded fingerprint; anything else gets "".
        bit_exact = report is None and replay_transport.fully_consumed()
        replayed_fp = recorded_fp if bit_exact else ""
        fingerprints_match = bit_exact and recorded_fp == replayed_fp

        return VerificationResult(
            bit_exact=bit_exact,
            matched=replay_transport.matched,
            total=len(self._tape.exchanges),
            fingerprints_match=fingerprints_match,
            recorded_fingerprint=recorded_fp,
            replayed_fingerprint=replayed_fp,
            divergence=report,
        )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class DriftDoctor:
    """Maps a divergence report to a `DriftCause` by keyword-matching its hint."""

    @staticmethod
    def classify(report: DivergenceReport) -> DriftCause:
        """Return the first cause whose keywords occur in the lower-cased hint.

        Rules are tried in order. When none matches, the divergence is
        attributed to a code change (request bytes diverged).
        """
        hint = report.cause_hint.lower()
        rules = (
            (DriftCause.UNRECORDED_NONDET, ("unrecorded", "exhausted", "draw")),
            (DriftCause.BOUNDARY_VIOLATION, ("extra", "boundary")),
        )
        for cause, keywords in rules:
            if any(keyword in hint for keyword in keywords):
                return cause
        return DriftCause.CODE_CHANGE
|