tracefork 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tracefork/__init__.py +6 -0
- tracefork/blame.py +296 -0
- tracefork/cli.py +367 -0
- tracefork/constants.py +24 -0
- tracefork/faults.py +129 -0
- tracefork/fork.py +173 -0
- tracefork/nondet.py +96 -0
- tracefork/py.typed +0 -0
- tracefork/recorder.py +140 -0
- tracefork/replay.py +119 -0
- tracefork/report.py +131 -0
- tracefork/server.py +73 -0
- tracefork/store.py +123 -0
- tracefork/synthetic.py +104 -0
- tracefork/tape.py +135 -0
- tracefork/transport.py +137 -0
- tracefork/validate.py +177 -0
- tracefork/web/report.html +209 -0
- tracefork/wire.py +76 -0
- tracefork-0.1.0.dist-info/METADATA +235 -0
- tracefork-0.1.0.dist-info/RECORD +32 -0
- tracefork-0.1.0.dist-info/WHEEL +4 -0
- tracefork-0.1.0.dist-info/entry_points.txt +2 -0
- tracefork-0.1.0.dist-info/licenses/LICENSE +21 -0
- tracefork_spike/__init__.py +7 -0
- tracefork_spike/__main__.py +3 -0
- tracefork_spike/agent.py +91 -0
- tracefork_spike/fake_llm.py +106 -0
- tracefork_spike/nondet.py +97 -0
- tracefork_spike/spike.py +125 -0
- tracefork_spike/tape.py +79 -0
- tracefork_spike/transport.py +68 -0
tracefork_spike/spike.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Spike 0 orchestration: record -> persist -> load -> replay -> verify.
|
|
2
|
+
|
|
3
|
+
Answers exactly one question: can we record a tool-using Anthropic-SDK agent run and
|
|
4
|
+
replay it bit-exact, with proof, for $0 and no network — within a declared
|
|
5
|
+
determinism boundary? The boundary here: a single-process, synchronous agent whose
|
|
6
|
+
only nondeterminism sources are clock and id generation, both routed through the
|
|
7
|
+
NondetSource seam.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import tempfile
|
|
14
|
+
|
|
15
|
+
from .agent import make_client, run_agent
|
|
16
|
+
from .fake_llm import FakeAnthropicTransport
|
|
17
|
+
from .nondet import DivergenceError, DriftingNondet, RecordingNondet, ReplayNondet
|
|
18
|
+
from .tape import Tape
|
|
19
|
+
from .transport import TraceforkTransport
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def record_replay_verify(tape_path: str | None = None) -> dict:
    """Execute record -> save -> load -> replay -> verify and return the outcome.

    Args:
        tape_path: Where to persist the tape. When ``None`` a temporary
            ``.tape.sqlite`` file is created and removed again before returning.

    Returns:
        A dict with the exchange/draw counts, both fingerprints, the recorded
        final text, the individual ``checks``, the negative-control message
        (``drift_at``) and the overall ``passed`` flag.

    Raises:
        AssertionError: if the tape digest changes across the save/load round trip.
    """
    owns_tape_file = tape_path is None
    if owns_tape_file:
        handle, tape_path = tempfile.mkstemp(suffix=".tape.sqlite")
        os.close(handle)

    try:
        # 1. RECORD — run the agent against the fake endpoint while capturing the
        #    exchanges in a fresh tape and the draws in the recording source.
        recorded_tape = Tape()
        recording_source = RecordingNondet()
        recording_transport = TraceforkTransport(
            "record", recorded_tape, inner=FakeAnthropicTransport()
        )
        recorded_result = run_agent(make_client(recording_transport), recording_source)
        recorded_tape.draws = recording_source.draws
        record_fingerprint = recorded_tape.digest()

        # 2. PERSIST + RELOAD — the digest must survive the trip through disk.
        recorded_tape.save(tape_path)
        loaded = Tape.load(tape_path)
        assert loaded.digest() == record_fingerprint, "tape changed across save/load"

        # 3. REPLAY — transport built without an inner transport; draws come
        #    from the loaded tape.
        replay_source = ReplayNondet(loaded.draws)
        replay_transport = TraceforkTransport("replay", loaded)
        replayed_result = run_agent(make_client(replay_transport), replay_source)
        replay_fingerprint = loaded.digest()

        # 4. VERIFY — compare outputs and fingerprints, and confirm the replay
        #    walked the whole tape (every exchange and every draw).
        checks = {}
        checks["output_identical"] = replayed_result == recorded_result
        checks["fingerprint_match"] = replay_fingerprint == record_fingerprint
        checks["all_request_hashes_matched"] = replay_transport.matched == len(loaded.exchanges)
        checks["all_exchanges_consumed"] = replay_transport.fully_consumed()
        checks["all_draws_consumed"] = replay_source.fully_consumed()

        # 5. NEGATIVE CONTROL — a replay fed by DriftingNondet is expected to
        #    raise DivergenceError; not raising counts as a failed check.
        drift_at: str | None = None
        try:
            run_agent(make_client(TraceforkTransport("replay", loaded)), DriftingNondet())
        except DivergenceError as divergence:
            drift_detected = True
            drift_at = str(divergence)
        else:
            drift_detected = False
        checks["negative_control_detected_drift"] = drift_detected

        summary = {
            "exchanges": len(loaded.exchanges),
            "draws": len(loaded.draws),
            "request_hashes_matched": replay_transport.matched,
            "record_fingerprint": record_fingerprint,
            "replay_fingerprint": replay_fingerprint,
            "final_text": recorded_result["final_text"],
            "checks": checks,
            "drift_at": drift_at,
        }
        summary["passed"] = all(checks.values())
        return summary
    finally:
        # Only delete the file when this function created it.
        if owns_tape_file and os.path.exists(tape_path):
            os.remove(tape_path)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _fmt(result: dict) -> str:
|
|
87
|
+
c = result["checks"]
|
|
88
|
+
ok = "PASS" if result["passed"] else "FAIL"
|
|
89
|
+
lines = [
|
|
90
|
+
"",
|
|
91
|
+
" tracefork — Spike 0: bit-exact record/replay",
|
|
92
|
+
" " + "-" * 52,
|
|
93
|
+
f" recorded exchanges ........ {result['exchanges']}",
|
|
94
|
+
f" nondeterminism draws ...... {result['draws']} (clock + id, virtualized)",
|
|
95
|
+
f" request hashes matched .... {result['request_hashes_matched']}/{result['exchanges']}",
|
|
96
|
+
f" tape fingerprint .......... {result['record_fingerprint'][:24]}…",
|
|
97
|
+
f" replay fingerprint ........ {result['replay_fingerprint'][:24]}…",
|
|
98
|
+
" network calls / spend ..... 0 / $0.00",
|
|
99
|
+
f" agent final answer ........ {result['final_text']!r}",
|
|
100
|
+
"",
|
|
101
|
+
f" [{'x' if c['output_identical'] else ' '}] "
|
|
102
|
+
"replayed trajectory byte-identical to recorded",
|
|
103
|
+
f" [{'x' if c['fingerprint_match'] else ' '}] "
|
|
104
|
+
"tape fingerprint matches after save/load round-trip",
|
|
105
|
+
f" [{'x' if c['all_request_hashes_matched'] else ' '}] "
|
|
106
|
+
"every replayed request hash matched the tape",
|
|
107
|
+
f" [{'x' if c['all_draws_consumed'] else ' '}] "
|
|
108
|
+
"every recorded nondeterminism draw consumed",
|
|
109
|
+
f" [{'x' if c['negative_control_detected_drift'] else ' '}] "
|
|
110
|
+
"negative control: drift was DETECTED, not silently passed",
|
|
111
|
+
"",
|
|
112
|
+
f" RESULT: {ok}",
|
|
113
|
+
"",
|
|
114
|
+
]
|
|
115
|
+
return "\n".join(lines)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def main() -> int:
    """Run the spike, print its receipt, and return a process exit status.

    Returns:
        ``0`` when the result's ``passed`` flag is truthy, ``1`` otherwise.
    """
    outcome = record_replay_verify()
    receipt = _fmt(outcome)
    print(receipt)
    if outcome["passed"]:
        return 0
    return 1
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
if __name__ == "__main__":
    # Executed as a script: use main()'s return value as the exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
tracefork_spike/tape.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Content-addressed, persistable tape.
|
|
2
|
+
|
|
3
|
+
A tape is the recorded artifact of one agent run: the ordered HTTP exchanges
|
|
4
|
+
(request body + response body) plus the ordered nondeterminism draws. Response and
|
|
5
|
+
request bodies are stored content-addressed (keyed by sha256), so identical bytes are
|
|
6
|
+
stored once; an ordered event log preserves sequence. `digest()` is a hash chain over
|
|
7
|
+
the whole tape — the single fingerprint reported in the receipt.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
import sqlite3
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def sha256_hex(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a hexadecimal string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class Tape:
    """Ordered record of one run: HTTP exchanges plus nondeterminism draws.

    ``exchanges`` holds ``(request_body, response_body)`` byte pairs in append
    order; ``draws`` holds ``(kind, value)`` string pairs. ``save``/``load``
    persist both through a SQLite file in which bodies are keyed by their
    SHA-256 hash.
    """

    exchanges: list[tuple[bytes, bytes]] = field(default_factory=list)
    draws: list[tuple[str, str]] = field(default_factory=list)

    def append_exchange(self, request_body: bytes, response_body: bytes) -> None:
        """Append one ``(request_body, response_body)`` pair to the tape."""
        self.exchanges.append((request_body, response_body))

    def exchange(self, i: int) -> tuple[bytes, bytes]:
        """Return the ``i``-th recorded exchange (plain list indexing)."""
        return self.exchanges[i]

    def digest(self) -> str:
        """sha256 hash chain over draws then exchanges — the tape fingerprint."""
        h = hashlib.sha256()
        for kind, value in self.draws:
            h.update(b"D:" + kind.encode() + b":" + value.encode() + b"\n")
        for req, resp in self.exchanges:
            # Exchanges contribute the hashes of their bodies, not the raw bytes.
            h.update(b"X:" + sha256_hex(req).encode() + b":" + sha256_hex(resp).encode() + b"\n")
        return h.hexdigest()

    # --- persistence: content-addressed blobs + ordered event log ---------------

    def save(self, path: str) -> None:
        """Write the tape to the SQLite file at *path*, replacing any tape there.

        The drop/create/insert sequence runs inside a single transaction. If
        any statement fails, the transaction is rolled back and whatever the
        file held before is left untouched, instead of being replaced by an
        empty or partial tape.

        Raises:
            sqlite3.Error: if the database cannot be written.
        """
        con = sqlite3.connect(path)
        try:
            # ``with con`` commits on success and rolls back on any exception.
            with con:
                # Open the transaction explicitly so the DDL below is part of it.
                con.execute("BEGIN")
                con.execute("DROP TABLE IF EXISTS blobs")
                con.execute("DROP TABLE IF EXISTS events")
                con.execute("CREATE TABLE blobs (hash TEXT PRIMARY KEY, data BLOB NOT NULL)")
                con.execute(
                    "CREATE TABLE events (seq INTEGER PRIMARY KEY AUTOINCREMENT, "
                    "kind TEXT NOT NULL, a TEXT NOT NULL, b TEXT NOT NULL)"
                )
                for kind, value in self.draws:
                    con.execute(
                        "INSERT INTO events (kind, a, b) VALUES ('draw', ?, ?)", (kind, value)
                    )
                for req, resp in self.exchanges:
                    rh, sh = sha256_hex(req), sha256_hex(resp)
                    # OR IGNORE: a body already stored under the same hash is kept once.
                    con.execute("INSERT OR IGNORE INTO blobs (hash, data) VALUES (?, ?)", (rh, req))
                    con.execute("INSERT OR IGNORE INTO blobs (hash, data) VALUES (?, ?)", (sh, resp))
                    con.execute(
                        "INSERT INTO events (kind, a, b) VALUES ('exchange', ?, ?)", (rh, sh)
                    )
        finally:
            con.close()

    @classmethod
    def load(cls, path: str) -> Tape:
        """Rebuild a tape from the SQLite file at *path*.

        Events are read in ``seq`` order. Rows whose kind is neither ``draw``
        nor ``exchange`` are skipped.

        Raises:
            KeyError: if an exchange event names a hash missing from ``blobs``.
            sqlite3.Error: if the file lacks the expected tables.
        """
        con = sqlite3.connect(path)
        try:
            blobs = dict(con.execute("SELECT hash, data FROM blobs").fetchall())
            tape = cls()
            for kind, a, b in con.execute("SELECT kind, a, b FROM events ORDER BY seq").fetchall():
                if kind == "draw":
                    tape.draws.append((a, b))
                elif kind == "exchange":
                    # For exchange rows, a/b are the request/response blob hashes.
                    tape.exchanges.append((bytes(blobs[a]), bytes(blobs[b])))
            return tape
        finally:
            con.close()
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""The recording/replay httpx transport — the model-I/O capture seam.
|
|
2
|
+
|
|
3
|
+
Record mode: forward each request to an inner transport, capture the request body
|
|
4
|
+
and the full response body into the tape, and return the response unchanged.
|
|
5
|
+
|
|
6
|
+
Replay mode: ignore the network entirely. For each request, pop the next recorded
|
|
7
|
+
exchange, assert the request body is byte-identical to what was recorded (this is the
|
|
8
|
+
divergence detector), and serve the recorded response bytes back. A replay transport
|
|
9
|
+
has no inner transport, so any unexpected/extra request is a hard error rather than a
|
|
10
|
+
silent network call.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
from .nondet import DivergenceError
|
|
18
|
+
from .tape import Tape, sha256_hex
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TraceforkTransport(httpx.BaseTransport):
    """httpx transport that records exchanges to a tape or replays them from it.

    In ``"record"`` mode each request is forwarded to ``inner`` and the request
    and response bodies are appended to the tape. In ``"replay"`` mode no inner
    transport is consulted: requests are matched in order against the tape and
    answered with the recorded response body.
    """

    def __init__(self, mode: str, tape: Tape, inner: httpx.BaseTransport | None = None) -> None:
        """Create the transport.

        Args:
            mode: ``"record"`` or ``"replay"``.
            tape: Tape to append to (record) or to serve from (replay).
            inner: Transport that actually handles requests; required in
                record mode.

        Raises:
            ValueError: if ``mode`` is not one of the two supported values, or
                if record mode is requested without an inner transport.
        """
        # Raise instead of assert: asserts vanish under ``python -O``, and an
        # unknown mode would then silently behave like replay.
        if mode not in ("record", "replay"):
            raise ValueError(f"mode must be 'record' or 'replay', got {mode!r}")
        if mode == "record" and inner is None:
            raise ValueError("record mode requires an inner transport")
        self.mode = mode
        self.tape = tape
        self.inner = inner
        self._i = 0  # index of the next tape exchange to serve in replay mode
        self.matched = 0  # number of replay request-hashes that matched the tape

    def handle_request(self, request: httpx.Request) -> httpx.Response:
        """Record or replay a single request.

        Raises:
            DivergenceError: in replay mode, when the tape has no exchange left
                or the request body's hash differs from the recorded one.
        """
        body = request.content

        if self.mode == "record":
            inner_resp = self.inner.handle_request(request)  # type: ignore[union-attr]
            resp_body = inner_resp.read()
            self.tape.append_exchange(body, resp_body)
            # The caller gets a rebuilt response: the inner status code and body,
            # with the headers replaced by a JSON content-type.
            return httpx.Response(
                inner_resp.status_code,
                headers={"content-type": "application/json"},
                content=resp_body,
                request=request,
            )

        # replay
        recorded_count = len(self.tape.exchanges)
        if self._i >= recorded_count:
            raise DivergenceError(
                f"replay made an unrecorded request #{self._i} "
                f"(tape has {recorded_count} exchanges)"
            )
        rec_req, rec_resp = self.tape.exchange(self._i)
        # Hash each body once and reuse the values in the error message.
        recorded_hash = sha256_hex(rec_req)
        replay_hash = sha256_hex(body)
        if recorded_hash != replay_hash:
            raise DivergenceError(
                f"request #{self._i} diverged from the tape "
                f"(recorded {recorded_hash[:12]}, replay {replay_hash[:12]})"
            )
        self._i += 1
        self.matched += 1
        # NOTE(review): replay always answers 200 because the tape stores only
        # bodies; a non-200 status seen while recording is not reproduced.
        return httpx.Response(
            200,
            headers={"content-type": "application/json"},
            content=rec_resp,
            request=request,
        )

    def fully_consumed(self) -> bool:
        """Return True when in replay mode and every tape exchange was served."""
        return self.mode == "replay" and self._i == len(self.tape.exchanges)
|