tracefork 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracefork/__init__.py +6 -0
- tracefork/blame.py +296 -0
- tracefork/cli.py +367 -0
- tracefork/constants.py +24 -0
- tracefork/faults.py +129 -0
- tracefork/fork.py +173 -0
- tracefork/nondet.py +96 -0
- tracefork/py.typed +0 -0
- tracefork/recorder.py +140 -0
- tracefork/replay.py +119 -0
- tracefork/report.py +131 -0
- tracefork/server.py +73 -0
- tracefork/store.py +123 -0
- tracefork/synthetic.py +104 -0
- tracefork/tape.py +135 -0
- tracefork/transport.py +137 -0
- tracefork/validate.py +177 -0
- tracefork/web/report.html +209 -0
- tracefork/wire.py +76 -0
- tracefork-0.1.0.dist-info/METADATA +235 -0
- tracefork-0.1.0.dist-info/RECORD +32 -0
- tracefork-0.1.0.dist-info/WHEEL +4 -0
- tracefork-0.1.0.dist-info/entry_points.txt +2 -0
- tracefork-0.1.0.dist-info/licenses/LICENSE +21 -0
- tracefork_spike/__init__.py +7 -0
- tracefork_spike/__main__.py +3 -0
- tracefork_spike/agent.py +91 -0
- tracefork_spike/fake_llm.py +106 -0
- tracefork_spike/nondet.py +97 -0
- tracefork_spike/spike.py +125 -0
- tracefork_spike/tape.py +79 -0
- tracefork_spike/transport.py +68 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tracefork
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Time-travel debugger for AI agents: bit-exact record/replay, fork any step, causal blame with confidence intervals.
|
|
5
|
+
Project-URL: Homepage, https://github.com/pratik916/tracefork
|
|
6
|
+
Project-URL: Repository, https://github.com/pratik916/tracefork
|
|
7
|
+
Project-URL: Issues, https://github.com/pratik916/tracefork/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/pratik916/tracefork/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Pratik Soni <godofcode.pratik@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai-agents,anthropic,causal-inference,debugging,determinism,llm,observability,record-replay,time-travel-debugger
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Software Development :: Debuggers
|
|
22
|
+
Classifier: Topic :: Software Development :: Testing
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.12
|
|
25
|
+
Requires-Dist: anthropic>=0.40
|
|
26
|
+
Requires-Dist: fastapi>=0.115
|
|
27
|
+
Requires-Dist: typer>=0.12
|
|
28
|
+
Requires-Dist: uvicorn>=0.30
|
|
29
|
+
Requires-Dist: zstandard>=0.22
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: mypy>=1.11; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest-cov>=5; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
35
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# tracefork
|
|
39
|
+
|
|
40
|
+
[CI](https://github.com/pratik916/tracefork/actions/workflows/ci.yml)
|
|
41
|
+
[License: MIT](LICENSE)
|
|
42
|
+
[Python 3.12+](https://www.python.org/downloads/)
|
|
43
|
+
[Linted with Ruff](https://github.com/astral-sh/ruff)
|
|
44
|
+
|
|
45
|
+
**A time-travel debugger for AI agents that doesn't just replay a failed run — it
|
|
46
|
+
proves the replay is bit-for-bit real, lets you fork any step, and measures *which*
|
|
47
|
+
step caused the failure, with confidence intervals.**
|
|
48
|
+
|
|
49
|
+

|
|
50
|
+
|
|
51
|
+
*The three-panel report: a run's timeline (left) with causal-blame badges, the
|
|
52
|
+
request/response for the selected exchange (center), and the blame ranking with 95%
|
|
53
|
+
confidence intervals (right). Generated offline, for $0, by
|
|
54
|
+
[`examples/demo_report.py`](examples/demo_report.py).*
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## The idea
|
|
59
|
+
|
|
60
|
+
Every agent-observability tool shows you a trace and asks you to eyeball it. tracefork
|
|
61
|
+
treats an agent run like a recording you can rewind, branch, and reason about causally:
|
|
62
|
+
|
|
63
|
+
- **Record** every model call into a content-addressed **tape** at the HTTP seam of the
|
|
64
|
+
Anthropic SDK, capturing the sources of nondeterminism (clock, ids) the agent reads.
|
|
65
|
+
- **Replay** the tape **bit-exact for $0** — every replayed request's *body* is
|
|
66
|
+
sha256-checked against the tape, so it's *proven* identical, not asserted. (The matched
|
|
67
|
+
surface is the request body; request headers such as `anthropic-beta` are out of scope —
|
|
68
|
+
see [Determinism boundary](#determinism-boundary-v1-honest-scope).) No network, no key.
|
|
69
|
+
- **Fork** any step: swap in a different model response and let the *same* agent run
|
|
70
|
+
forward from there. The unchanged prefix replays for free; only the new tail costs
|
|
71
|
+
anything.
|
|
72
|
+
- **Blame**: resample those forks across every step and rank each by its **flip-rate** —
|
|
73
|
+
how often perturbing it changes the run's outcome — with **Wilson score** confidence
|
|
74
|
+
intervals so a small sample can't masquerade as certainty.
|
|
75
|
+
- **Validate the instrument itself**: inject faults with *known* root causes and confirm
|
|
76
|
+
the blame engine fingers the right step. The engine is genuinely causal — it ranks
|
|
77
|
+
whichever step actually flips the outcome #1, not a fixed slot — and across five
|
|
78
|
+
fault classes it hits **1.00 top-1 precision** offline against a flat negative
|
|
79
|
+
control (which is now *enforced*, not just printed). See
|
|
80
|
+
[Validation scope](#validation-scope) for exactly what that number does and doesn't claim.
|
|
81
|
+
|
|
82
|
+
That last pillar is the point: a debugger that claims to find root causes has to be
|
|
83
|
+
held to ground truth. `tracefork validate` is that proof, and it runs in under a second
|
|
84
|
+
with no API key.
|
|
85
|
+
|
|
86
|
+
## Quickstart (offline, $0, no API key)
|
|
87
|
+
|
|
88
|
+
Python **3.12** via [uv](https://docs.astral.sh/uv/). Everything below is offline and
|
|
89
|
+
makes no network calls.
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
uv sync --extra dev
|
|
93
|
+
|
|
94
|
+
# 1. The full offline test suite (65 tests).
|
|
95
|
+
uv run pytest -q
|
|
96
|
+
|
|
97
|
+
# 2. The instrument validates itself against injected, known-root-cause faults.
|
|
98
|
+
uv run tracefork validate
|
|
99
|
+
|
|
100
|
+
# 3. Generate the demo report shown above, then open it in any browser.
|
|
101
|
+
uv run python examples/demo_report.py
|
|
102
|
+
open examples/demo_report.html # macOS; or just open the file
|
|
103
|
+
|
|
104
|
+
# 4. The original Spike 0 receipt: record → persist → replay → prove bit-exact.
|
|
105
|
+
uv run python -m tracefork_spike
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
`tracefork validate` prints:
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
[PASS] corrupted_tool_output top-1: 1.00
|
|
112
|
+
[PASS] misleading_retrieval top-1: 1.00
|
|
113
|
+
[PASS] wrong_system_prompt top-1: 1.00
|
|
114
|
+
[PASS] dropped_message top-1: 1.00
|
|
115
|
+
[PASS] poisoned_argument top-1: 1.00
|
|
116
|
+
|
|
117
|
+
overall top-1 precision: 1.00
|
|
118
|
+
negative control max flip: 0.00 (threshold 0.30)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## The CLI
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
uv run tracefork --help
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
| Command | What it does |
|
|
128
|
+
|---|---|
|
|
129
|
+
| `replay <tape> --agent pkg.mod:fn` | Replay a tape and print the bit-exact verification receipt. |
|
|
130
|
+
| `verify <tape> --agent pkg.mod:fn` | Verify replay; exit non-zero on drift (CI gate). |
|
|
131
|
+
| `fork <run_id> --step N --response f --agent pkg.mod:fn` | Fork a run at step N with a mutated response; record the counterfactual branch. |
|
|
132
|
+
| `blame <run_id> --agent pkg.mod:fn [--k 10] [--budget 5.0]` | Rank every step by causal flip-rate with 95% CIs (re-runs the agent; budget-capped). |
|
|
133
|
+
| `report <run_id> \| --tape <tape> -o out.html` | Render the self-contained three-panel HTML report. |
|
|
134
|
+
| `serve [--store store.db] [--port 7777]` | Serve the live web UI (same-origin, 127.0.0.1). |
|
|
135
|
+
| `validate [--k 3] [--n-runs 5] [--check]` | Run the fault-injection suite; `--check` gates against the committed report. |
|
|
136
|
+
|
|
137
|
+
Replay, verify, fork, and the offline demos need no key. `blame` against a *real* run
|
|
138
|
+
re-runs the agent's counterfactual tails against the live API, which is why it's
|
|
139
|
+
budget-capped — the offline, $0 proof that blame works is `tracefork validate`.
|
|
140
|
+
|
|
141
|
+
## How it works
|
|
142
|
+
|
|
143
|
+
The spine is a **record/replay seam at the Anthropic SDK's httpx boundary** plus a
|
|
144
|
+
**nondeterminism-virtualization seam** the agent reads time and ids through. Bit-exactness
|
|
145
|
+
is the contract between them.
|
|
146
|
+
|
|
147
|
+
- **`transport.py`** — `TraceforkTransport` (sync) / `AsyncTraceforkTransport` (async).
|
|
148
|
+
Record mode tees request+response bytes into the tape (buffering streaming SSE and
|
|
149
|
+
plain JSON identically via `.read()`/`.aread()`); replay mode serves recorded bytes and
|
|
150
|
+
sha256-asserts every request body matches the tape. A replay transport has **no inner
|
|
151
|
+
transport**, so an unrecorded request is a hard error, never a silent network call.
|
|
152
|
+
- **`tape.py`** — content-addressed (sha256) blobs + an ordered event log, persistable to
|
|
153
|
+
SQLite, with a hash-chain `digest()` fingerprint.
|
|
154
|
+
- **`nondet.py`** — `NondetSource` is the only way the agent gets time/ids;
|
|
155
|
+
`RecordingNondet` logs real draws, `ReplayNondet` serves them back, `DriftingNondet` is
|
|
156
|
+
the negative control. `find_divergence()` unwraps the `DivergenceError` the SDK buries
|
|
157
|
+
inside an `APIConnectionError` so a real divergence isn't mistaken for a network blip.
|
|
158
|
+
- **`fork.py`** — `ForkTransport` runs three phases: **prefix-replay** (served from the
|
|
159
|
+
parent tape for $0, request asserted to match — the agent must be deterministic up to
|
|
160
|
+
the fork point), **mutation-injection** (same request, swapped response), and
|
|
161
|
+
**tail-record** (the counterfactual continuation recorded fresh). A `Branch` carries
|
|
162
|
+
`prefix_replayed`/`tail_recorded` counters that quantify the savings.
|
|
163
|
+
- **`blame.py`** — forks each step `k` times, re-runs the agent, grades the outcome via an
|
|
164
|
+
`Oracle`, and counts flips vs. the parent outcome. `wilson_ci()` gives the interval;
|
|
165
|
+
`BudgetGovernor` estimates fork count and dollar cost before any spend.
|
|
166
|
+
- **`faults.py` / `validate.py`** — five fault classes, each producing *valid* Anthropic
|
|
167
|
+
JSON with a marker embedded inside a content field. A synthetic agent echoes each
|
|
168
|
+
response into its next request, so an injected fault propagates through a fork to a
|
|
169
|
+
fault-aware tail and flips the outcome — letting the blame engine be scored against
|
|
170
|
+
ground truth entirely offline.
|
|
171
|
+
- **`report.py` / `server.py` / `web/report.html`** — a single, dependency-free HTML file
|
|
172
|
+
(vanilla JS, no npm) rendered statically by `report` or served live by `serve`.
|
|
173
|
+
|
|
174
|
+
## Determinism boundary (v1, honest scope)
|
|
175
|
+
|
|
176
|
+
Bit-exact replay holds within a declared boundary: **single-process, clock + id
|
|
177
|
+
nondeterminism, captured through `NondetSource`**. An agent that reads `datetime.now()` /
|
|
178
|
+
`uuid` / `random` directly, or runs its loop across threads/subprocesses, steps outside
|
|
179
|
+
that boundary — and the verifier will *detect* the resulting drift rather than paper over
|
|
180
|
+
it. Forking and blame assume the agent rebuilds its prefix deterministically (the same
|
|
181
|
+
property replay proves). See [`SPIKE0.md`](SPIKE0.md) for how the boundary was de-risked.
|
|
182
|
+
|
|
183
|
+
## Validation scope
|
|
184
|
+
|
|
185
|
+
What `tracefork validate` proves, stated precisely: the blame engine is **genuinely
|
|
186
|
+
causal** — inject an outcome-flipping fault at *any* step and the engine ranks that step
|
|
187
|
+
first (verified by also injecting at a non-root step), so the 1.00 is not a tautology or a
|
|
188
|
+
fixed-slot artifact. The five "fault classes" carry two real injection mechanisms (a
|
|
189
|
+
corrupted tool argument and a replaced text message) via a marker that survives the SDK's
|
|
190
|
+
JSON round-trip, and the negative control — a no-op perturbation that must not flip the
|
|
191
|
+
outcome — is enforced with a hard threshold (the run fails if it ever exceeds 0.30).
|
|
192
|
+
|
|
193
|
+
What it does **not** yet claim: discrimination among *several competing* plausible causes
|
|
194
|
+
on a long run. The fixture is a short tape where one step gets a flip-capable perturbation
|
|
195
|
+
and the rest get an inert one — a clean positive-vs-control, but an easy one. A longer tape
|
|
196
|
+
with a decoy step that changes the transcript without changing the outcome is the next
|
|
197
|
+
iteration; until then, read 1.00 as "the instrument reliably finds the planted cause," not
|
|
198
|
+
"it resolves ambiguous multi-cause blame."
|
|
199
|
+
|
|
200
|
+
## Layout
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
src/tracefork/ transport, tape, nondet, recorder, fork, store,
|
|
204
|
+
blame, faults, validate, report, server, wire, synthetic, cli
|
|
205
|
+
src/tracefork_spike/ the original bit-exact record/replay spike
|
|
206
|
+
web/report.html the single-file three-panel UI
|
|
207
|
+
examples/ runnable demo that produces the report above
|
|
208
|
+
tests/ 65 offline tests ($0, no key)
|
|
209
|
+
experiments/ committed reference report for `validate --check`
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Testing
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
uv run pytest -q # all 65 offline tests
|
|
216
|
+
uv run pytest tests/test_faults.py -q # the self-validation chain
|
|
217
|
+
uv run tracefork validate --check # regression-gate vs committed report
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## Contributing
|
|
221
|
+
|
|
222
|
+
Contributions are welcome — see [`CONTRIBUTING.md`](CONTRIBUTING.md) for dev setup,
|
|
223
|
+
the invariants a PR must respect, and commit/PR conventions. The whole dev loop
|
|
224
|
+
(tests, `validate`, lint, type-check) is offline and $0, so you can run the full gate
|
|
225
|
+
with no API key. Please also read the [Code of Conduct](CODE_OF_CONDUCT.md).
|
|
226
|
+
|
|
227
|
+
## Security
|
|
228
|
+
|
|
229
|
+
See [`SECURITY.md`](SECURITY.md) for how to report a vulnerability. In short: tapes
|
|
230
|
+
are JSON + base64 (never pickle, so loading one can't execute code), and `tracefork
|
|
231
|
+
serve` binds to 127.0.0.1 only.
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
MIT — see [`LICENSE`](LICENSE).
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
tracefork/__init__.py,sha256=U-ttX1vJOy7j4PYvbTRwhu_4UXfhJmWwS6X2q0Qo158,175
|
|
2
|
+
tracefork/blame.py,sha256=Rdnl4pfnm7CUftQtea1kfrpIlVTCU_Ge914ndJEhmH8,10802
|
|
3
|
+
tracefork/cli.py,sha256=tO1w6tCocsUUYsx9MHFpx2zTLJgbOo06Gko2ibDGxoU,13430
|
|
4
|
+
tracefork/constants.py,sha256=xBqcMokay6vOOU27v8iqUq8AQlTF-uB33LX9FM77F0g,936
|
|
5
|
+
tracefork/faults.py,sha256=iM59in35CCEkFJ6_a35ev-g__51wwbBfA-iy5eI9cNk,5291
|
|
6
|
+
tracefork/fork.py,sha256=CRR1LHHAPq7-fAPM38ts-71X0_frR3U5XuwtU3bbHN4,6334
|
|
7
|
+
tracefork/nondet.py,sha256=odpYOSxTwV0srleC7xU0ErTBtE8BxnBD9f-P7N2JzzM,3083
|
|
8
|
+
tracefork/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
tracefork/recorder.py,sha256=0NnA91Arq72u_dqUkohs0P0CCu6WHqEbGzV6KPEPbSg,5627
|
|
10
|
+
tracefork/replay.py,sha256=6BgPBq6wgMYg0cPY2l98BQn-9SxZAzLT0-88j8W_LDc,3625
|
|
11
|
+
tracefork/report.py,sha256=kzm1PDe7ajr7_M33oRaEZ0hZo4h6b91L5aZ5t_22Ivc,4628
|
|
12
|
+
tracefork/server.py,sha256=jntUa-KX9qglsq1FS8kd21PS2946Woi2NZOQdtPk67o,2349
|
|
13
|
+
tracefork/store.py,sha256=0qiwuoU2VeB9sa-aa_Dji7ksylrskvc434Nr12rRUbE,4427
|
|
14
|
+
tracefork/synthetic.py,sha256=mikKoulr_pIpM7j6T4uBimPenyQonU-dHGXYrYwcXt0,3851
|
|
15
|
+
tracefork/tape.py,sha256=eb-rm6zFGSOOn2ftjWh7dRuaW3w5zzljghEEkczgUOw,5082
|
|
16
|
+
tracefork/transport.py,sha256=bM6AbjtQvYBLbQkCUNhlaadukfpWRTqWMY6n0CF0qPQ,4960
|
|
17
|
+
tracefork/validate.py,sha256=upgFnRfxmgfysB2GmDf3GQRiLaUmb7Cjvl-pdhfu5MA,6821
|
|
18
|
+
tracefork/wire.py,sha256=Bb0u_VSuYuWBZmDNOtHpCIblKU6XbF8dbbaQ8ZV9184,2199
|
|
19
|
+
tracefork_spike/__init__.py,sha256=c_g12WwUaAn_mxKW2eQLG0WWx4zIwfrGIzKaN1mcNhA,246
|
|
20
|
+
tracefork_spike/__main__.py,sha256=HnbKfCaV-iv-zjLiy1gciGBrWYt6muRXf0pTH2u_K9U,50
|
|
21
|
+
tracefork_spike/agent.py,sha256=HsqBK39HtNjzYSrtUds6EpLyJMNUm2bbuym4oZB3ITU,3223
|
|
22
|
+
tracefork_spike/fake_llm.py,sha256=V55y9zrPLLqsQWNSPfJz7hRbN33Zg3lb0oazfk38UAo,4094
|
|
23
|
+
tracefork_spike/nondet.py,sha256=UPgMasUy8yRkdsFpqM3o-__JV_xc13LEZ8jEWqRwCJc,3298
|
|
24
|
+
tracefork_spike/spike.py,sha256=rg20GHMKHJRBRe6HcEftY8NEau80ACUH7GRQ9WgdF5g,5214
|
|
25
|
+
tracefork_spike/tape.py,sha256=AUlZvkP1hQXPUWa4chJaZLFdnOGGLIHtu45pKJeZYb8,3230
|
|
26
|
+
tracefork_spike/transport.py,sha256=NCVURT18wUELqf9Z_z9iZxItH-D8YMk3FrjNPp9NfYM,2643
|
|
27
|
+
tracefork/web/report.html,sha256=6NWNTrUy38zP_eZmqdvBhH7KrkU1etmv4QQHeMFaX3k,9857
|
|
28
|
+
tracefork-0.1.0.dist-info/METADATA,sha256=mPQQKaDYHze_LLdt7g-Nvr_yKUN-Yd4dYaoxgVYP6AU,12277
|
|
29
|
+
tracefork-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
30
|
+
tracefork-0.1.0.dist-info/entry_points.txt,sha256=w8wDABmxwUHKYLeZ9q6VplfnMftA0Jws2hLZk0wNgCo,48
|
|
31
|
+
tracefork-0.1.0.dist-info/licenses/LICENSE,sha256=Nze8S2cJmJD6Qe6HpaIXVJKAvh7bhrUfrksikSkS9V8,1068
|
|
32
|
+
tracefork-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pratik Soni
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
tracefork_spike/agent.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""A tiny tool-using agent built on the real Anthropic SDK.
|
|
2
|
+
|
|
3
|
+
This is the "agent under recording". It is deliberately nondeterministic: the
|
|
4
|
+
`book_flight` tool stamps a wall-clock `booked_at` and a fresh `confirmation_id` on
|
|
5
|
+
every run. Those values flow into the *next* request body, so an honest replay can
|
|
6
|
+
only be byte-exact if that nondeterminism was captured and virtualized. The agent
|
|
7
|
+
reads time/ids exclusively through the injected `NondetSource`.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
import anthropic
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
from .nondet import NondetSource, find_divergence
|
|
18
|
+
|
|
19
|
+
# Model name sent with every request; it becomes part of each request body.
MODEL = "claude-opus-4-8"

# Tool catalogue advertised to the model: a single `book_flight` tool whose
# input must carry both a destination and a seat count.
TOOLS = [
    {
        "name": "book_flight",
        "description": "Book a flight to a destination.",
        "input_schema": {
            "type": "object",
            "properties": {"destination": {"type": "string"}, "seats": {"type": "integer"}},
            "required": ["destination", "seats"],
        },
    },
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def make_client(transport: httpx.BaseTransport) -> anthropic.Anthropic:
    """Build an Anthropic SDK client whose HTTP traffic goes through ``transport``.

    The client uses a fixed placeholder API key and has SDK retries turned off
    (``max_retries=0``).
    NOTE(review): retries are presumably disabled so that one agent step maps
    to exactly one transport call -- confirm against the transport code.
    """
    http_client = httpx.Client(transport=transport)
    return anthropic.Anthropic(
        max_retries=0,
        http_client=http_client,
        api_key="sk-ant-offline-fake",
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _execute_tool(name: str, tool_input: dict, nondet: NondetSource) -> dict:
    """Run the named tool locally and return its result payload.

    Only ``book_flight`` is supported. Its result echoes the requested
    ``destination`` and ``seats`` and adds a confirmation id and a booking
    timestamp, both obtained from ``nondet`` rather than from the system.

    Raises:
        ValueError: if ``name`` is not a known tool.
        KeyError: if ``tool_input`` lacks ``destination`` or ``seats``.
    """
    if name != "book_flight":
        raise ValueError(f"unknown tool {name!r}")

    # Draw order is part of the contract: the id is drawn first, then the clock.
    confirmation_id = nondet.new_id("CONF")
    booked_at = nondet.now_iso()
    return {
        "confirmation_id": confirmation_id,
        "booked_at": booked_at,
        "destination": tool_input["destination"],
        "seats": tool_input["seats"],
    }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def run_agent(client: anthropic.Anthropic, nondet: NondetSource) -> dict:
    """Drive the tool-use loop until the model stops asking for tools.

    Starts from a fixed user prompt, calls ``client.messages.create`` once per
    turn, executes every ``tool_use`` block through ``_execute_tool`` and feeds
    the JSON-encoded results back as the next user message.

    Returns:
        ``{"final_text": str, "turns": list}`` where ``final_text`` is the
        first text block of the last response (or ``""`` if it has none) and
        ``turns`` holds the ``id`` and ``stop_reason`` of every response.

    Raises:
        DivergenceError: when an ``APIConnectionError`` from the SDK carries
            one in its exception chain; other connection errors propagate
            unchanged.
    """
    conversation: list[dict] = [{"role": "user", "content": "Book me a flight to Tokyo."}]
    trajectory: list[dict] = []

    while True:
        try:
            reply = client.messages.create(
                model=MODEL, max_tokens=1024, tools=TOOLS, messages=conversation
            )
        except anthropic.APIConnectionError as err:
            # The SDK reports transport-layer failures as connection errors;
            # surface a buried replay divergence as the real cause instead.
            divergence = find_divergence(err)
            if divergence is None:
                raise
            raise divergence from None

        trajectory.append({"id": reply.id, "stop_reason": reply.stop_reason})
        conversation.append({"role": "assistant", "content": reply.content})

        if reply.stop_reason == "tool_use":
            tool_results = [
                {
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": json.dumps(
                        _execute_tool(block.name, dict(block.input), nondet)
                    ),
                }
                for block in reply.content
                if block.type == "tool_use"
            ]
            conversation.append({"role": "user", "content": tool_results})
            continue

        # Terminal turn: report the first text block, if there is one.
        for block in reply.content:
            if block.type == "text":
                return {"final_text": block.text, "turns": trajectory}
        return {"final_text": "", "turns": trajectory}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""A fake Anthropic endpoint as an httpx transport.
|
|
2
|
+
|
|
3
|
+
This stands in for the real /v1/messages API so the spike needs no key and no
|
|
4
|
+
network ($0, offline, CI-safe). It emits real Anthropic *wire-format* JSON so the
|
|
5
|
+
genuine `anthropic` SDK parses it into real `Message` objects — i.e. the spike
|
|
6
|
+
exercises the actual SDK + transport seam, not a hand-rolled client.
|
|
7
|
+
|
|
8
|
+
When a real key is available, this inner transport is simply swapped for the SDK's
|
|
9
|
+
real network transport; the recording/replay machinery around it is unchanged. That
|
|
10
|
+
is the whole point: the seam is provider-real.
|
|
11
|
+
|
|
12
|
+
The fake is a two-turn agent script: first request -> a `tool_use` for `book_flight`;
|
|
13
|
+
second request (which now carries a `tool_result`) -> a final `end_turn` answer that
|
|
14
|
+
echoes the confirmation id the agent's tool produced.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
|
|
21
|
+
import httpx
|
|
22
|
+
|
|
23
|
+
from .tape import sha256_hex
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _has_tool_result(payload: dict) -> bool:
    """Return True if any message in ``payload`` carries a ``tool_result`` block.

    Only messages whose ``content`` is a list are inspected; non-dict entries
    in such a list are ignored. A payload without ``messages`` yields False.
    """
    return any(
        isinstance(block, dict) and block.get("type") == "tool_result"
        for message in payload.get("messages", [])
        if isinstance(content := message.get("content"), list)
        for block in content
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class FakeAnthropicTransport(httpx.BaseTransport):
    """Scripted stand-in for the messages endpoint, driven only by the request.

    The response ids are derived from the hash of the request bytes. A request
    with no ``tool_result`` gets a ``tool_use`` reply for ``book_flight``; a
    request that carries one gets a final ``end_turn`` reply quoting the
    confirmation id found in that tool result.
    """

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Answer ``request`` with a JSON message body and HTTP status 200."""
        body = json.loads(request.content)
        model = body.get("model", "claude-opus-4-8")
        # NOTE(review): assumes sha256_hex is a pure function of its input, so
        # hashing once gives the same ids as hashing at each use -- confirm.
        digest = sha256_hex(request.content)

        if _has_tool_result(body):
            confirmation = _last_tool_result_confirmation(body)
            blocks = [
                {
                    "type": "text",
                    "text": (
                        f"Done — your flight to Tokyo is booked. Confirmation {confirmation}."
                    ),
                }
            ]
            stop_reason = "end_turn"
            usage = {"input_tokens": 96, "output_tokens": 18}
        else:
            blocks = [
                {"type": "text", "text": "Booking your flight now."},
                {
                    "type": "tool_use",
                    "id": "toolu_" + digest[:18],
                    "name": "book_flight",
                    "input": {"destination": "Tokyo", "seats": 1},
                },
            ]
            stop_reason = "tool_use"
            usage = {"input_tokens": 48, "output_tokens": 22}

        # Key order is kept fixed because the serialized bytes are the response.
        reply = {
            "id": "msg_" + digest[:20],
            "type": "message",
            "role": "assistant",
            "model": model,
            "content": blocks,
            "stop_reason": stop_reason,
            "stop_sequence": None,
            "usage": usage,
        }
        return httpx.Response(
            200,
            headers={"content-type": "application/json"},
            content=json.dumps(reply).encode(),
            request=request,
        )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _last_tool_result_confirmation(payload: dict) -> str:
    """Return the confirmation id carried by the latest tool result.

    Messages are scanned newest-first; within a message, blocks are scanned in
    order, and the first ``tool_result`` block found decides the answer. Its
    ``content`` is parsed as JSON (non-string content is serialized first) and
    the ``confirmation_id`` field is returned.

    Returns:
        The ``confirmation_id`` value, or ``"UNKNOWN"`` when there is no tool
        result, its content is not valid JSON, the JSON is not an object, or
        the object has no ``confirmation_id``.
    """
    for message in reversed(payload.get("messages", [])):
        content = message.get("content")
        if not isinstance(content, list):
            continue
        for block in content:
            if not (isinstance(block, dict) and block.get("type") == "tool_result"):
                continue
            raw = block.get("content")
            try:
                parsed = json.loads(raw if isinstance(raw, str) else json.dumps(raw))
            except (ValueError, TypeError):
                return "UNKNOWN"
            # Valid JSON need not be an object (e.g. a list of content blocks
            # or a bare number); only an object can hold a confirmation id.
            if not isinstance(parsed, dict):
                return "UNKNOWN"
            return parsed.get("confirmation_id", "UNKNOWN")
    return "UNKNOWN"
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Virtualized nondeterminism sources.
|
|
2
|
+
|
|
3
|
+
The headline tracefork claim is bit-exact replay. That is only possible if every
|
|
4
|
+
source of nondeterminism the agent consumes is captured at record time and served
|
|
5
|
+
back identically at replay time. This module is the seam: the toy agent reads the
|
|
6
|
+
clock and generates IDs *only* through a `NondetSource`, never through `time` /
|
|
7
|
+
`uuid` directly. Swap `RecordingNondet` for `ReplayNondet` and the same agent code
|
|
8
|
+
produces a byte-identical trajectory.
|
|
9
|
+
|
|
10
|
+
`DriftingNondet` is the negative control: it draws fresh real values during replay,
|
|
11
|
+
which must make the replay diverge — proving the verifier actually detects drift
|
|
12
|
+
rather than always passing.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import datetime
|
|
18
|
+
import uuid
|
|
19
|
+
from typing import Protocol
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DivergenceError(RuntimeError):
    """Raised when a replay diverges from the recorded tape."""


def find_divergence(exc: BaseException | None) -> DivergenceError | None:
    """Search an exception's cause/context graph for a DivergenceError.

    The Anthropic SDK wraps any exception raised inside its httpx transport in
    an `APIConnectionError`, so a divergence raised from the replay transport
    arrives as `APIConnectionError.__cause__` (possibly nested). This recovers
    the original.

    Both `__cause__` and `__context__` of every exception are followed, cause
    first, so a divergence that is only reachable through the implicit context
    of an exception that also has an explicit cause is still found.

    Args:
        exc: The exception to inspect; ``None`` is accepted.

    Returns:
        The first DivergenceError encountered, or ``None`` if there is none.
    """
    seen: set[int] = set()  # ids already visited; guards against reference cycles
    pending: list[BaseException | None] = [exc]
    while pending:
        cur = pending.pop()
        if cur is None or id(cur) in seen:
            continue
        seen.add(id(cur))
        if isinstance(cur, DivergenceError):
            return cur
        # The stack is LIFO: push the context first so the cause is explored first.
        pending.append(cur.__context__)
        pending.append(cur.__cause__)
    return None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class NondetSource(Protocol):
    """Structural interface through which the agent obtains clock values and ids."""

    def now_iso(self) -> str:
        """Return the current time as a string."""
        ...

    def new_id(self, prefix: str) -> str:
        """Return a new identifier string for the given ``prefix``."""
        ...
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class RecordingNondet:
|
|
48
|
+
"""Draws genuinely real values (wall clock, random UUIDs) and logs each draw."""
|
|
49
|
+
|
|
50
|
+
def __init__(self) -> None:
|
|
51
|
+
self.draws: list[tuple[str, str]] = []
|
|
52
|
+
|
|
53
|
+
def now_iso(self) -> str:
|
|
54
|
+
v = datetime.datetime.now(datetime.UTC).isoformat()
|
|
55
|
+
self.draws.append(("clock", v))
|
|
56
|
+
return v
|
|
57
|
+
|
|
58
|
+
def new_id(self, prefix: str) -> str:
|
|
59
|
+
v = f"{prefix}_{uuid.uuid4().hex[:16]}"
|
|
60
|
+
self.draws.append(("id", v))
|
|
61
|
+
return v
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class ReplayNondet:
    """Hands recorded draws back in their original order.

    A draw is refused with a DivergenceError when the tape has run out or when
    the next recorded draw is of a different kind than the one requested.
    """

    def __init__(self, draws: list[tuple[str, str]]) -> None:
        # Copy so later changes to the caller's list cannot affect the replay.
        self._draws = list(draws)
        self._i = 0

    def _next(self, kind: str) -> str:
        """Return the next recorded value, which must be of ``kind``.

        Raises:
            DivergenceError: if every draw is already consumed, or the next
                recorded draw has a different kind. The cursor does not advance
                in either case.
        """
        position = self._i
        total = len(self._draws)
        if position >= total:
            raise DivergenceError(
                f"replay asked for a {kind!r} draw but the tape is exhausted "
                f"(consumed {position}/{total})"
            )
        recorded_kind, recorded_value = self._draws[position]
        if recorded_kind != kind:
            raise DivergenceError(
                f"draw #{position}: replay asked for {kind!r}, tape has {recorded_kind!r}"
            )
        self._i = position + 1
        return recorded_value

    def now_iso(self) -> str:
        """Return the next recorded ``"clock"`` draw."""
        return self._next("clock")

    def new_id(self, prefix: str) -> str:
        """Return the next recorded ``"id"`` draw; ``prefix`` is not consulted."""
        return self._next("id")

    def fully_consumed(self) -> bool:
        """Return True once every recorded draw has been served."""
        return self._i == len(self._draws)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class DriftingNondet(RecordingNondet):
    """Negative-control source for replay runs.

    Adds nothing to `RecordingNondet`: each draw is a fresh real value rather
    than a recorded one, so request bytes rebuilt during a replay diverge from
    the tape.
    """
|