@possumtech/rummy 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +40 -15
- package/.xai.key +1 -0
- package/PLUGINS.md +169 -53
- package/README.md +38 -32
- package/SPEC.md +366 -179
- package/bin/digest.js +1097 -0
- package/biome/no-fallbacks.grit +2 -2
- package/gemini.key +1 -0
- package/lang/en.json +10 -1
- package/migrations/001_initial_schema.sql +9 -2
- package/package.json +19 -8
- package/service.js +1 -0
- package/src/agent/AgentLoop.js +76 -26
- package/src/agent/ContextAssembler.js +2 -0
- package/src/agent/Entries.js +238 -60
- package/src/agent/ProjectAgent.js +44 -0
- package/src/agent/TurnExecutor.js +99 -30
- package/src/agent/XmlParser.js +206 -111
- package/src/agent/errors.js +35 -0
- package/src/agent/known_queries.sql +1 -1
- package/src/agent/known_store.sql +3 -42
- package/src/agent/materializeContext.js +30 -1
- package/src/agent/runs.sql +8 -18
- package/src/agent/tokens.js +0 -1
- package/src/agent/turns.sql +1 -0
- package/src/hooks/Hooks.js +26 -0
- package/src/hooks/RummyContext.js +12 -1
- package/src/lib/hedberg/README.md +60 -0
- package/src/lib/hedberg/hedberg.js +60 -0
- package/src/lib/hedberg/marker.js +158 -0
- package/src/{plugins → lib}/hedberg/matcher.js +1 -2
- package/src/llm/LlmProvider.js +41 -3
- package/src/llm/openaiStream.js +17 -0
- package/src/plugins/ask_user/ask_user.js +12 -2
- package/src/plugins/ask_user/ask_userDoc.md +1 -5
- package/src/plugins/budget/README.md +29 -24
- package/src/plugins/budget/budget.js +166 -110
- package/src/plugins/cli/README.md +3 -4
- package/src/plugins/cli/cli.js +31 -5
- package/src/plugins/cloudflare/cloudflare.js +136 -0
- package/src/plugins/cp/cp.js +41 -4
- package/src/plugins/cp/cpDoc.md +5 -6
- package/src/plugins/engine/engine.sql +1 -1
- package/src/plugins/env/README.md +5 -4
- package/src/plugins/env/env.js +7 -4
- package/src/plugins/env/envDoc.md +7 -8
- package/src/plugins/error/error.js +56 -15
- package/src/plugins/file/README.md +12 -3
- package/src/plugins/file/file.js +2 -2
- package/src/plugins/get/get.js +59 -36
- package/src/plugins/get/getDoc.md +10 -34
- package/src/plugins/google/google.js +115 -0
- package/src/plugins/hedberg/hedberg.js +13 -56
- package/src/plugins/helpers.js +66 -12
- package/src/plugins/index.js +1 -2
- package/src/plugins/instructions/README.md +44 -47
- package/src/plugins/instructions/instructions-system.md +44 -0
- package/src/plugins/instructions/instructions-user.md +53 -0
- package/src/plugins/instructions/instructions.js +58 -189
- package/src/plugins/known/README.md +6 -7
- package/src/plugins/known/known.js +24 -30
- package/src/plugins/log/log.js +41 -32
- package/src/plugins/mv/mv.js +40 -1
- package/src/plugins/mv/mvDoc.md +1 -8
- package/src/plugins/ollama/ollama.js +4 -3
- package/src/plugins/openai/openai.js +4 -3
- package/src/plugins/openrouter/openrouter.js +14 -4
- package/src/plugins/persona/README.md +11 -13
- package/src/plugins/persona/default.md +29 -0
- package/src/plugins/persona/persona.js +10 -66
- package/src/plugins/policy/policy.js +23 -22
- package/src/plugins/prompt/README.md +37 -27
- package/src/plugins/prompt/prompt.js +13 -19
- package/src/plugins/rm/rm.js +18 -0
- package/src/plugins/rm/rmDoc.md +5 -6
- package/src/plugins/rpc/rpc.js +3 -3
- package/src/plugins/set/set.js +205 -323
- package/src/plugins/set/setDoc.md +47 -17
- package/src/plugins/sh/README.md +6 -5
- package/src/plugins/sh/sh.js +8 -5
- package/src/plugins/sh/shDoc.md +7 -8
- package/src/plugins/skill/README.md +37 -14
- package/src/plugins/skill/skill.js +200 -101
- package/src/plugins/skill/skillDoc.js +3 -0
- package/src/plugins/skill/skillDoc.md +9 -0
- package/src/plugins/stream/README.md +7 -6
- package/src/plugins/stream/finalize.js +100 -0
- package/src/plugins/stream/stream.js +13 -45
- package/src/plugins/telemetry/telemetry.js +27 -4
- package/src/plugins/think/think.js +2 -3
- package/src/plugins/think/thinkDoc.md +2 -4
- package/src/plugins/unknown/README.md +1 -1
- package/src/plugins/unknown/unknown.js +17 -19
- package/src/plugins/update/update.js +4 -51
- package/src/plugins/update/updateDoc.md +21 -6
- package/src/plugins/xai/xai.js +68 -102
- package/src/plugins/yolo/yolo.js +102 -75
- package/src/sql/functions/hedmatch.js +1 -1
- package/src/sql/functions/hedreplace.js +1 -1
- package/src/sql/functions/hedsearch.js +1 -1
- package/src/sql/functions/slugify.js +16 -2
- package/BENCH_ENVIRONMENT.md +0 -230
- package/CLIENT_INTERFACE.md +0 -396
- package/last_run.txt +0 -5617
- package/scriptify/ask_run.js +0 -77
- package/scriptify/cache_probe.js +0 -66
- package/scriptify/cache_probe_grok.js +0 -74
- package/src/agent/budget.js +0 -33
- package/src/agent/config.js +0 -38
- package/src/plugins/hedberg/README.md +0 -71
- package/src/plugins/hedberg/docs.md +0 -0
- package/src/plugins/hedberg/edits.js +0 -55
- package/src/plugins/hedberg/normalize.js +0 -17
- package/src/plugins/hedberg/sed.js +0 -49
- package/src/plugins/instructions/instructions.md +0 -34
- package/src/plugins/instructions/instructions_104.md +0 -8
- package/src/plugins/instructions/instructions_105.md +0 -39
- package/src/plugins/instructions/instructions_106.md +0 -22
- package/src/plugins/instructions/instructions_107.md +0 -17
- package/src/plugins/instructions/instructions_108.md +0 -0
- package/src/plugins/known/knownDoc.js +0 -3
- package/src/plugins/known/knownDoc.md +0 -8
- package/src/plugins/unknown/unknownDoc.js +0 -3
- package/src/plugins/unknown/unknownDoc.md +0 -11
- package/turns/cli_1777462658211/turn_001.txt +0 -772
- package/turns/cli_1777462658211/turn_002.txt +0 -606
- package/turns/cli_1777462658211/turn_003.txt +0 -667
- package/turns/cli_1777462658211/turn_004.txt +0 -297
- package/turns/cli_1777462658211/turn_005.txt +0 -301
- package/turns/cli_1777462658211/turn_006.txt +0 -262
- package/turns/cli_1777465095132/turn_001.txt +0 -715
- package/turns/cli_1777465095132/turn_002.txt +0 -236
- package/turns/cli_1777465095132/turn_003.txt +0 -287
- package/turns/cli_1777465095132/turn_004.txt +0 -694
- package/turns/cli_1777465095132/turn_005.txt +0 -422
- package/turns/cli_1777465095132/turn_006.txt +0 -365
- package/turns/cli_1777465095132/turn_007.txt +0 -885
- package/turns/cli_1777465095132/turn_008.txt +0 -1277
- package/turns/cli_1777465095132/turn_009.txt +0 -736
- /package/src/{plugins → lib}/hedberg/patterns.js +0 -0
package/bin/digest.js
ADDED
|
@@ -0,0 +1,1097 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Universal run-digest tool. Reads any rummy*.db (e2e, demo, bench, dev)
|
|
3
|
+
* and emits per-run forensic artifacts. First-order tool, not bench-
|
|
4
|
+
* specific — use it on anything with a rummy*.db inside.
|
|
5
|
+
*
|
|
6
|
+
* <out>/digest.md Run-shape header + waterfall: per-turn line
|
|
7
|
+
* with status, update body, indented emission
|
|
8
|
+
* list, and a reasoning excerpt.
|
|
9
|
+
* <out>/digest.json Same data, machine-queryable.
|
|
10
|
+
* <out>/reasoning.md Per-turn reasoning_content (full).
|
|
11
|
+
* <out>/packets.md Per-turn assembled wire packets.
|
|
12
|
+
* <out>/digest_skipped Written when no rummy*.db is present.
|
|
13
|
+
*
|
|
14
|
+
* <sweep>/index.csv Greppable per-task summary.
|
|
15
|
+
* <sweep>/errors.md Cross-task aggregated error report.
|
|
16
|
+
* <sweep>/errors.json Same, machine-queryable.
|
|
17
|
+
*
|
|
18
|
+
* Read-only derivative; never source-of-truth. Safe to re-run.
|
|
19
|
+
*
|
|
20
|
+
* Usage:
|
|
21
|
+
* node bin/digest.js <sweep-dir> sweep + index + errors
|
|
22
|
+
* node bin/digest.js <task-dir> single task + errors
|
|
23
|
+
* node bin/digest.js <path-to-rummy.db> bare DB → sibling .digest/ dir
|
|
24
|
+
* node bin/digest.js latest tbench sweep
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import {
|
|
28
|
+
closeSync,
|
|
29
|
+
existsSync,
|
|
30
|
+
mkdirSync,
|
|
31
|
+
openSync,
|
|
32
|
+
readdirSync,
|
|
33
|
+
readFileSync,
|
|
34
|
+
statSync,
|
|
35
|
+
writeFileSync,
|
|
36
|
+
} from "node:fs";
|
|
37
|
+
import { dirname, join, relative } from "node:path";
|
|
38
|
+
import { DatabaseSync } from "node:sqlite";
|
|
39
|
+
import { fileURLToPath } from "node:url";
|
|
40
|
+
|
|
41
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
// Default sweep root for the no-arg invocation: the tbench results dir
// historically holds the latest of everything. Resolved relative to the
// project root (one level up from bin/).
const RESULTS_DIR = join(__dirname, "..", "test", "tbench", "results");
// Bare-DB invocation output. The full per-run pile (subdirs per alias
// for multi-run DBs) lives in /tmp/rummy_digest/ — clobbered each run.
// Sweep / task-dir invocations keep writing alongside source dirs.
const PILE_DIR = "/tmp/rummy_digest";
// Single readable digest that mirrors the most recent run from the DB.
// Flat files only — no nested folders. Clobbered on each invocation.
const PUBLIC_DIR = join(__dirname, "..", "test", "digest");

// Turn cap used by classifyMarkers to detect "ran out of turns" runs;
// overridable via env to match the agent's own configured ceiling.
const MAX_LOOP_TURNS = Number(process.env.RUMMY_MAX_LOOP_TURNS) || 99;
// A turn with at least this much reasoning but no update and no
// emissions is flagged as a reasoning runaway (see classifyMarkers).
const REASONING_RUNAWAY_CHARS = 8000;
|
|
56
|
+
|
|
57
|
+
// Locate the agent's sqlite DB inside a task dir's agent/ folder. Tbench
|
|
58
|
+
// writes `rummy.db`; programbench writes `rummy_programbench.db` (so the
|
|
59
|
+
// host-side audit DB is segregated from any project-internal `rummy.db`
|
|
60
|
+
// the agent might create). Returns absolute path or null. Empty stubs
|
|
61
|
+
// (zero-length leftovers from aborted runs) are ignored.
|
|
62
|
+
function findAgentDb(taskDir) {
|
|
63
|
+
const agentDir = join(taskDir, "agent");
|
|
64
|
+
if (!existsSync(agentDir)) return null;
|
|
65
|
+
let names;
|
|
66
|
+
try {
|
|
67
|
+
names = readdirSync(agentDir);
|
|
68
|
+
} catch {
|
|
69
|
+
return null;
|
|
70
|
+
}
|
|
71
|
+
const candidates = names
|
|
72
|
+
.filter((n) => /^rummy.*\.db$/.test(n))
|
|
73
|
+
.map((n) => join(agentDir, n))
|
|
74
|
+
.filter((p) => {
|
|
75
|
+
try {
|
|
76
|
+
return statSync(p).size > 0;
|
|
77
|
+
} catch {
|
|
78
|
+
return false;
|
|
79
|
+
}
|
|
80
|
+
});
|
|
81
|
+
if (candidates.length === 0) return null;
|
|
82
|
+
const canonical = candidates.find((p) => p.endsWith("/rummy.db"));
|
|
83
|
+
if (canonical) return canonical;
|
|
84
|
+
return candidates.toSorted((a, b) => statSync(b).size - statSync(a).size)[0];
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
// A directory counts as a task dir iff it holds a usable agent DB.
function isTaskDir(dir) {
  return Boolean(findAgentDb(dir));
}
|
|
90
|
+
|
|
91
|
+
// Recursively collect task dirs under a sweep root, descending at most
// five levels. A matched task dir is collected and not descended into.
// Unreadable directories and vanished entries are skipped silently.
function findTaskDirs(sweepDir) {
  const found = [];
  const visit = (dir, depth) => {
    if (depth > 4) return;
    let entries;
    try {
      entries = readdirSync(dir);
    } catch {
      return;
    }
    for (const entry of entries) {
      const child = join(dir, entry);
      let info;
      try {
        info = statSync(child);
      } catch {
        continue;
      }
      if (!info.isDirectory()) continue;
      if (isTaskDir(child)) {
        found.push(child);
      } else {
        visit(child, depth + 1);
      }
    }
  };
  visit(sweepDir, 0);
  return found;
}
|
|
120
|
+
|
|
121
|
+
// Read the verifier's binary reward for a task. Returns 0 or 1, or null
// when the file is absent or holds anything other than "0"/"1".
function readReward(taskDir) {
  const rewardPath = join(taskDir, "verifier", "reward.txt");
  if (!existsSync(rewardPath)) return null;
  const raw = readFileSync(rewardPath, "utf8").trim();
  return raw === "0" || raw === "1" ? Number(raw) : null;
}
|
|
128
|
+
|
|
129
|
+
// Extract the JSON run summary the agent prints as a
// `__RUMMY_RUN_SUMMARY__ {...}` line in agent/rummy.txt. Returns the
// parsed object, or null when the file, the marker, or the parse fails.
function readRunSummary(taskDir) {
  const tracePath = join(taskDir, "agent", "rummy.txt");
  if (!existsSync(tracePath)) return null;
  const match = readFileSync(tracePath, "utf8").match(
    /__RUMMY_RUN_SUMMARY__\s+(\{.*\})\s*$/m,
  );
  if (!match) return null;
  try {
    return JSON.parse(match[1]);
  } catch {
    return null;
  }
}
|
|
141
|
+
|
|
142
|
+
// Normalize an entry's attributes column to an object: accepts JSON
// text, an already-parsed object, or null/undefined. Malformed JSON
// degrades to {} rather than throwing.
function parseAttrs(s) {
  if (s == null) return {};
  if (typeof s !== "object") {
    try {
      return JSON.parse(s);
    } catch {
      return {};
    }
  }
  return s;
}
|
|
151
|
+
|
|
152
|
+
// Matches the turn number in a `log://turn_N/...` entry path.
const TURN_FROM_PATH = /^log:\/\/turn_(\d+)\//;

// Extract the turn number from a turn-scoped log path, or null when the
// path is not of that shape.
function turnFromPath(path) {
  const match = TURN_FROM_PATH.exec(path);
  if (!match) return null;
  return Number(match[1]);
}
|
|
158
|
+
|
|
159
|
+
// Extract the action segment from `log://turn_N/<action>/...`, or null
// when the path does not match that shape.
function actionFromPath(path) {
  const match = /^log:\/\/turn_\d+\/([^/]+)\//.exec(path);
  return match?.[1] ?? null;
}
|
|
163
|
+
|
|
164
|
+
// Decode the URL-encoded slug after `log://turn_N/<action>/`. Falls
// back to the whole path when it doesn't match, and to the raw slug
// when the percent-encoding is malformed.
function pathSlug(path) {
  const match = /^log:\/\/turn_\d+\/[^/]+\/(.+)$/.exec(path);
  if (match == null) return path;
  const [, slug] = match;
  try {
    return decodeURIComponent(slug);
  } catch {
    return slug;
  }
}
|
|
174
|
+
|
|
175
|
+
// One-line excerpt: collapse all whitespace runs to single spaces, trim,
// and clip to n chars (default 80) with a trailing ellipsis when cut.
// Nullish/empty input yields "".
function summarize(text, n = 80) {
  if (!text) return "";
  const flat = text.replace(/\s+/g, " ").trim();
  return flat.length > n ? `${flat.slice(0, n)}…` : flat;
}
|
|
181
|
+
|
|
182
|
+
// Read all runs from a DB plus their per-run data. Tbench task DBs have
// exactly one run; e2e TestDb DBs have many (one per test invocation).
// The caller drives a per-run digest pass either way.
//
// Opens read-only; returns [{run, turns, logEntries, packetEntries,
// prompt}] ordered by run id.
function readDb(rummyDb) {
  const db = new DatabaseSync(rummyDb, { readOnly: true });
  // try/finally so the handle is released even when a prepare/query
  // throws (e.g. schema drift in an old DB); previously it leaked.
  try {
    const runs = db.prepare("SELECT * FROM runs ORDER BY id").all();
    const turnsStmt = db.prepare(
      `SELECT sequence, total_tokens, prompt_tokens, completion_tokens,
              cached_tokens, reasoning_tokens, reasoning_content
       FROM turns
       WHERE run_id = ?
       ORDER BY sequence`,
    );
    const logStmt = db.prepare(
      `SELECT e.path, e.body, e.attributes, e.scheme,
              rv.state, rv.outcome, rv.visibility, rv.turn
       FROM entries e
       JOIN run_views rv ON rv.entry_id = e.id
       WHERE rv.run_id = ?
         AND e.path LIKE 'log://turn_%'
       ORDER BY e.id`,
    );
    const promptStmt = db.prepare(
      `SELECT e.body
       FROM entries e
       JOIN run_views rv ON rv.entry_id = e.id
       WHERE rv.run_id = ? AND e.path = 'prompt://1'
       LIMIT 1`,
    );
    // Per-turn assembled packet bytes. system://N + user://N are what we
    // sent to the LLM; assistant://N is the parsed content; model://N is
    // the raw response wrapper (includes reasoning_content, finish_reason,
    // usage). reasoning://N is the bare reasoning channel when the model
    // surfaced one.
    const packetStmt = db.prepare(
      `SELECT e.path, e.body
       FROM entries e
       JOIN run_views rv ON rv.entry_id = e.id
       WHERE rv.run_id = ?
         AND (e.path GLOB 'system://*' OR e.path GLOB 'user://*'
              OR e.path GLOB 'assistant://*' OR e.path GLOB 'model://*'
              OR e.path GLOB 'reasoning://*')
       ORDER BY e.id`,
    );

    return runs.map((run) => ({
      run,
      turns: turnsStmt.all(run.id),
      logEntries: logStmt.all(run.id),
      packetEntries: packetStmt.all(run.id),
      prompt: promptStmt.get(run.id)?.body ?? null,
    }));
  } finally {
    db.close();
  }
}
|
|
239
|
+
|
|
240
|
+
// Build per-turn rows: one entry per turn with its update + emissions + errors.
// `turns` supplies the token accounting; `logEntries` supplies the
// per-turn actions, routed by path into update / errors / emissions.
function buildTurns(turns, logEntries) {
  const rows = new Map();
  for (const t of turns) {
    rows.set(t.sequence, {
      turn: t.sequence,
      totalTokens: t.total_tokens,
      promptTokens: t.prompt_tokens,
      completionTokens: t.completion_tokens,
      cachedTokens: t.cached_tokens,
      reasoningTokens: t.reasoning_tokens,
      reasoningChars: (t.reasoning_content || "").length,
      reasoning: t.reasoning_content || "",
      update: null, // {status, body, state, outcome}
      emissions: [], // {action, slug, attrs, body, state, outcome}
      errors: [], // {body, attrs}
    });
  }
  for (const entry of logEntries) {
    const turn = turnFromPath(entry.path);
    if (turn == null) continue;
    // Ensure a row exists even for "ghost" turns where the LLM call
    // failed before turn-row creation (rare but possible).
    if (!rows.has(turn)) {
      rows.set(turn, {
        turn,
        totalTokens: null,
        promptTokens: null,
        completionTokens: null,
        cachedTokens: null,
        reasoningTokens: null,
        reasoningChars: 0,
        reasoning: "",
        update: null,
        emissions: [],
        errors: [],
      });
    }
    const row = rows.get(turn);
    const action = actionFromPath(entry.path);
    const attrs = parseAttrs(entry.attributes);
    switch (action) {
      case "update":
        row.update = {
          status: attrs.status ?? null,
          body: entry.body,
          state: entry.state,
          outcome: entry.outcome,
        };
        break;
      case "error":
        row.errors.push({
          body: entry.body,
          attrs,
          slug: pathSlug(entry.path),
          state: entry.state,
          outcome: entry.outcome,
        });
        break;
      default:
        row.emissions.push({
          action,
          slug: pathSlug(entry.path),
          targetPath: attrs.path ?? null,
          visibility: attrs.visibility ?? null,
          query: attrs.query ?? null,
          command: attrs.command ?? null,
          body: entry.body,
          state: entry.state,
          outcome: entry.outcome,
        });
    }
  }
  return [...rows.values()].toSorted((a, b) => a.turn - b.turn);
}
|
|
312
|
+
|
|
313
|
+
// Derive coarse forensic markers for a run: the verifier verdict vs the
// agent's claimed status, termination modes (turn cap, context overflow,
// dispatch failure), and per-turn pathologies (abandoned strikes, a
// reasoning runaway with no output, parser warnings, missing summary).
function classifyMarkers(reward, runSummary, turnRows) {
  const markers = [];
  const status = runSummary?.status ?? null;
  if (reward === 1) markers.push("passed");
  if (reward === 0 && status === 200) {
    markers.push("claim_success_verifier_fail");
  }
  if (status === 499 && turnRows.length >= MAX_LOOP_TURNS - 1) {
    markers.push("max_loop_turns");
  }
  if (status === 413) markers.push("context_overflow");
  if (status === 500) markers.push("dispatch_500");

  let sawAbandon = false;
  let sawParserWarn = false;
  let lastRunawayTurn = null;
  for (const row of turnRows) {
    // Lots of reasoning but zero output on the turn = stuck model.
    const silentRunaway =
      row.reasoningChars >= REASONING_RUNAWAY_CHARS &&
      row.emissions.length === 0 &&
      !row.update;
    if (silentRunaway) lastRunawayTurn = row.turn;
    for (const err of row.errors) {
      const text = err.body || "";
      if (text.startsWith("Abandoned after")) sawAbandon = true;
      if (text.startsWith("Unclosed") || text.includes("Tool call limit")) {
        sawParserWarn = true;
      }
    }
  }
  if (sawAbandon) markers.push("strike_abandon");
  if (lastRunawayTurn != null) {
    markers.push(`reasoning_runaway_t${lastRunawayTurn}`);
  }
  if (sawParserWarn) markers.push("parser_warning");
  if (!runSummary) markers.push("exfil_fail");
  return markers;
}
|
|
348
|
+
|
|
349
|
+
// Render an emission as a single waterfall line.
function renderEmission(em) {
  const parts = [` ← ${em.action} ${em.targetPath ?? em.slug}`];
  if (em.visibility) parts.push(` visibility=${em.visibility}`);
  if (em.query) parts.push(` "${summarize(em.query, 60)}"`);
  if (em.command) parts.push(` "${summarize(em.command, 60)}"`);
  if (em.state === "failed") parts.push(" ✗");
  if (em.outcome) parts.push(` [${em.outcome}]`);
  return parts.join("");
}
|
|
359
|
+
|
|
360
|
+
// Render one error entry as a single waterfall line.
function renderError(err) {
  const excerpt = summarize(err.body, 100);
  return ` ✗ error: ${excerpt}`;
}
|
|
363
|
+
|
|
364
|
+
// "What happened" header — five lines a forensic reader can scan in
|
|
365
|
+
// seconds before deciding whether to drop into the waterfall.
|
|
366
|
+
function renderRunShape(turnRows) {
|
|
367
|
+
let lastUpdate = null;
|
|
368
|
+
let lastEmissionTurn = null;
|
|
369
|
+
let setActions = 0;
|
|
370
|
+
let getActions = 0;
|
|
371
|
+
let searchActions = 0;
|
|
372
|
+
for (const row of turnRows) {
|
|
373
|
+
if (row.update) lastUpdate = { turn: row.turn, ...row.update };
|
|
374
|
+
if (row.emissions.length > 0) lastEmissionTurn = row.turn;
|
|
375
|
+
for (const em of row.emissions) {
|
|
376
|
+
if (em.action === "set") setActions++;
|
|
377
|
+
else if (em.action === "get") getActions++;
|
|
378
|
+
else if (em.action === "search") searchActions++;
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
const lastUpdateLine = lastUpdate
|
|
382
|
+
? `T${lastUpdate.turn} status=${lastUpdate.status ?? "—"} "${summarize(lastUpdate.body, 60)}"`
|
|
383
|
+
: "(none)";
|
|
384
|
+
return [
|
|
385
|
+
`Last update: ${lastUpdateLine}`,
|
|
386
|
+
`Last emission: ${lastEmissionTurn != null ? `T${lastEmissionTurn}` : "(none)"}`,
|
|
387
|
+
`Action mix: set=${setActions} get=${getActions} search=${searchActions}`,
|
|
388
|
+
];
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
// Top-level per-run digest document: header stats line, markers, run
// shape, prompt excerpt, the per-turn waterfall, and drill-down
// pointers. Returns the full markdown as one string.
function renderWaterfall(
  taskName,
  prompt,
  runSummary,
  reward,
  turnRows,
  markers,
) {
  const out = [];
  const push = (...ls) => out.push(...ls);
  push(`# ${taskName}`, "");
  const status = runSummary?.status ?? "?";
  const totalTurns = runSummary?.turns ?? turnRows.length;
  const cost =
    runSummary?.cost != null ? `$${runSummary.cost.toFixed(4)}` : "?";
  const tokens = runSummary?.tokens
    ? `prompt=${runSummary.tokens.prompt} completion=${runSummary.tokens.completion} cached=${runSummary.tokens.cached}`
    : "?";
  const rewardStr = reward == null ? "—" : reward === 1 ? "PASS" : "FAIL";
  push(
    `status=${status} reward=${rewardStr} turns=${totalTurns} cost=${cost} tokens=${tokens}`,
  );
  if (markers.length > 0) {
    push("", `markers: ${markers.join(", ")}`);
  }
  push("", "## Run shape", "");
  push(...renderRunShape(turnRows));
  push("");
  if (prompt) {
    push("## Prompt", summarize(prompt, 240), "");
  }
  push("## Waterfall");
  for (const row of turnRows) {
    const upStatus = row.update?.status ?? "—";
    const upBody = row.update ? summarize(row.update.body, 80) : "(no update)";
    const upFail =
      row.update?.state === "failed" ? ` ✗ ${row.update.outcome ?? ""}` : "";
    push(`T${row.turn}: ${upStatus} "${upBody}"${upFail}`);
    if (row.reasoning) {
      push(` ↳ reasoning: ${summarize(row.reasoning, 140)}`);
    }
    push(...row.emissions.map(renderEmission));
    push(...row.errors.map(renderError));
  }
  push(
    "",
    "## Drill-down",
    "- agent/rummy.txt (full trace, when present)",
    "- agent/rummy.db (sqlite — entries, run_views, turns)",
    "- reasoning.md (per-turn reasoning_content)",
    "- packets.md (per-turn assembled wire packets)",
  );
  return out.join("\n");
}
|
|
447
|
+
|
|
448
|
+
// Full per-turn reasoning_content dump, one markdown section per turn;
// turns without reasoning get an explicit placeholder.
function renderReasoning(taskName, turnRows) {
  const sections = [`# Reasoning: ${taskName}`];
  for (const row of turnRows) {
    sections.push("", `## Turn ${row.turn}`, "");
    sections.push(row.reasoning ? row.reasoning : "(no reasoning_content)");
  }
  return sections.join("\n");
}
|
|
464
|
+
|
|
465
|
+
// Group packet entries (system://N / user://N / assistant://N / model://N
|
|
466
|
+
// / reasoning://N) by their turn number suffix.
|
|
467
|
+
function groupPacketsByTurn(packetEntries) {
|
|
468
|
+
const byTurn = new Map();
|
|
469
|
+
for (const e of packetEntries) {
|
|
470
|
+
const m = e.path.match(/^([a-z]+):\/\/(\d+)$/);
|
|
471
|
+
if (!m) continue;
|
|
472
|
+
const role = m[1];
|
|
473
|
+
const turn = Number(m[2]);
|
|
474
|
+
if (!byTurn.has(turn)) byTurn.set(turn, {});
|
|
475
|
+
byTurn.get(turn)[role] = e.body;
|
|
476
|
+
}
|
|
477
|
+
return [...byTurn.entries()]
|
|
478
|
+
.toSorted(([a], [b]) => a - b)
|
|
479
|
+
.map(([turn, parts]) => ({ turn, ...parts }));
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
// Per-turn packet dump: exactly what was sent (system + user) and
// received (assistant + model wrapper + reasoning) for each turn. The
// shape mirrors the wire payload so a forensic reader can see how
// errors, log entries, and state actually presented to the model.
function renderPackets(taskName, turnPackets) {
  const out = [
    `# Packets: ${taskName}`,
    "",
    "Per-turn assembled packets. `system` + `user` are the outgoing message;",
    "`assistant` is the parsed completion; `model` is the raw response",
    "wrapper (usage, finish_reason); `reasoning` is the bare CoT",
    "channel when the provider surfaces one.",
  ];
  const roles = ["system", "user", "assistant", "reasoning", "model"];
  for (const packet of turnPackets) {
    out.push("", `## Turn ${packet.turn}`);
    for (const role of roles) {
      const body = packet[role];
      if (body == null) continue;
      out.push("", `### ${role}://${packet.turn}`, "", "```", body, "```");
    }
  }
  return out.join("\n");
}
|
|
513
|
+
|
|
514
|
+
// Machine-queryable mirror of the markdown digest: the same per-run
// facts flattened to plain JSON-safe structures (updates, emissions,
// and errors pared down to their stable fields).
function digestJson({
  taskName,
  taskDir,
  prompt,
  runSummary,
  reward,
  turnRows,
  markers,
}) {
  // Keep only the documented fields of an update record.
  const shapeUpdate = (update) =>
    update
      ? {
          status: update.status,
          body: update.body,
          state: update.state,
          outcome: update.outcome,
        }
      : null;
  // Emissions drop their bodies; the markdown digest carries excerpts.
  const shapeEmission = ({ action, targetPath, visibility, state, outcome }) => ({
    action,
    targetPath,
    visibility,
    state,
    outcome,
  });
  return {
    task: taskName,
    dir: taskDir,
    reward,
    status: runSummary?.status ?? null,
    turns: runSummary?.turns ?? turnRows.length,
    tokens: runSummary?.tokens ?? null,
    cost: runSummary?.cost ?? null,
    wallSeconds: runSummary?.wallSeconds ?? null,
    markers,
    prompt: prompt ?? null,
    turnRows: turnRows.map((row) => ({
      turn: row.turn,
      totalTokens: row.totalTokens,
      reasoningChars: row.reasoningChars,
      update: shapeUpdate(row.update),
      emissions: row.emissions.map(shapeEmission),
      errors: row.errors.map(({ body }) => ({ body })),
    })),
  };
}
|
|
557
|
+
|
|
558
|
+
// Build rich error records from a run's turn rows + log entries. Each
// record carries enough forensic context to read the failure without
// drilling into the DB: the originating action's path/body (the thing
// the model tried), the verdict status/outcome, soft-vs-strike, and a
// signature for cross-task aggregation.
function collectErrors(turnRows, logEntries, runIdent) {
  // Path → entry index for resolving each error's originating action.
  const entriesByPath = new Map(logEntries.map((e) => [e.path, e]));
  const records = [];
  for (const row of turnRows) {
    for (const err of row.errors) {
      const sourcePath = err.attrs?.sourcePath ?? null;
      const sourceEntry = sourcePath ? entriesByPath.get(sourcePath) : null;
      const semanticOutcome = err.attrs?.outcome ?? err.outcome ?? null;
      let sourceAction = null;
      if (sourceEntry) {
        const sourceAttrs = parseAttrs(sourceEntry.attributes);
        sourceAction = {
          path: sourcePath,
          action: actionFromPath(sourcePath),
          targetPath: sourceAttrs.path ?? null,
          body: sourceEntry.body ?? "",
          state: sourceEntry.state ?? null,
          outcome: sourceEntry.outcome ?? null,
        };
      }
      records.push({
        run: runIdent,
        turn: row.turn,
        status: err.attrs?.status ?? null,
        outcome: semanticOutcome,
        rvOutcome: err.outcome ?? null,
        state: err.state ?? null,
        // "resolved" errors were retried/healed — soft, not strikes.
        soft: err.state === "resolved",
        sourcePath,
        sourceAction,
        body: err.body ?? "",
        bodySig: errorSignature({
          outcome: semanticOutcome,
          sourcePath,
          body: err.body,
        }),
      });
    }
  }
  return records;
}
|
|
603
|
+
|
|
604
|
+
// Group key for cross-task aggregation. Same outcome + same source-path
// shape (turn-stripped) + same body prefix collapses repeats. The 80-char
// body prefix accommodates "<<<<<<< SEARCH\n<context-line>" patterns
// without bleeding into the divergent tail.
const SIG_BODY_CHARS = 80;
function errorSignature({ outcome, sourcePath, body }) {
  const outPart = outcome ?? "—";
  const srcPart = sourcePath ? sourcePath.replace(/turn_\d+/, "turn_*") : "—";
  const flat = (body ?? "").replace(/\s+/g, " ").trim();
  const headPart =
    flat.length <= SIG_BODY_CHARS ? flat : `${flat.slice(0, SIG_BODY_CHARS)}…`;
  return `${outPart} :: ${srcPart} :: ${headPart}`;
}
|
|
617
|
+
|
|
618
|
+
function aggregateErrors(allErrors) {
|
|
619
|
+
const total = allErrors.length;
|
|
620
|
+
let strikes = 0;
|
|
621
|
+
let soft = 0;
|
|
622
|
+
const byOutcome = new Map();
|
|
623
|
+
const byTask = new Map();
|
|
624
|
+
const bySig = new Map();
|
|
625
|
+
for (const er of allErrors) {
|
|
626
|
+
if (er.soft) soft++;
|
|
627
|
+
else strikes++;
|
|
628
|
+
const oc = er.outcome ?? "—";
|
|
629
|
+
byOutcome.set(oc, (byOutcome.get(oc) ?? 0) + 1);
|
|
630
|
+
const taskKey = er.run.task;
|
|
631
|
+
byTask.set(taskKey, (byTask.get(taskKey) ?? 0) + 1);
|
|
632
|
+
if (!bySig.has(er.bodySig)) {
|
|
633
|
+
bySig.set(er.bodySig, {
|
|
634
|
+
sig: er.bodySig,
|
|
635
|
+
count: 0,
|
|
636
|
+
outcome: er.outcome,
|
|
637
|
+
sourcePathPattern: er.sourcePath
|
|
638
|
+
? er.sourcePath.replace(/turn_\d+/, "turn_*")
|
|
639
|
+
: null,
|
|
640
|
+
turns: new Set(),
|
|
641
|
+
tasks: new Set(),
|
|
642
|
+
exemplar: null,
|
|
643
|
+
});
|
|
644
|
+
}
|
|
645
|
+
const g = bySig.get(er.bodySig);
|
|
646
|
+
g.count++;
|
|
647
|
+
g.turns.add(er.turn);
|
|
648
|
+
g.tasks.add(taskKey);
|
|
649
|
+
if (g.exemplar == null) g.exemplar = er;
|
|
650
|
+
}
|
|
651
|
+
const topSignatures = [...bySig.values()]
|
|
652
|
+
.toSorted((a, b) => b.count - a.count)
|
|
653
|
+
.map((g) => ({
|
|
654
|
+
sig: g.sig,
|
|
655
|
+
count: g.count,
|
|
656
|
+
outcome: g.outcome,
|
|
657
|
+
sourcePathPattern: g.sourcePathPattern,
|
|
658
|
+
turns: [...g.turns].toSorted((a, b) => a - b),
|
|
659
|
+
tasks: [...g.tasks].toSorted(),
|
|
660
|
+
exemplar: g.exemplar,
|
|
661
|
+
}));
|
|
662
|
+
return {
|
|
663
|
+
total,
|
|
664
|
+
strikes,
|
|
665
|
+
soft,
|
|
666
|
+
byOutcome: Object.fromEntries(
|
|
667
|
+
[...byOutcome.entries()].toSorted((a, b) => b[1] - a[1]),
|
|
668
|
+
),
|
|
669
|
+
byTask: Object.fromEntries(
|
|
670
|
+
[...byTask.entries()].toSorted((a, b) => b[1] - a[1]),
|
|
671
|
+
),
|
|
672
|
+
topSignatures,
|
|
673
|
+
};
|
|
674
|
+
}
|
|
675
|
+
|
|
676
|
+
/**
 * Prefix every line of `text` with `indent`. Nullish or empty input
 * yields the empty string.
 */
function indentBlock(text, indent = " ") {
  if (!text) return "";
  const prefixed = [];
  for (const line of text.split("\n")) {
    prefixed.push(indent + line);
  }
  return prefixed.join("\n");
}
|
|
683
|
+
|
|
684
|
+
// Compress runs of consecutive turn numbers into "first-last" form so a
|
|
685
|
+
// 25-occurrence error spanning turns 96-120 reads as `96-120` instead of
|
|
686
|
+
// hogging a line. Mixed sparse + run sequences come out as
|
|
687
|
+
// `1, 4, 96-120`.
|
|
688
|
+
function compressTurns(turns) {
|
|
689
|
+
if (turns.length === 0) return "";
|
|
690
|
+
const sorted = [...turns].toSorted((a, b) => a - b);
|
|
691
|
+
const out = [];
|
|
692
|
+
let runStart = sorted[0];
|
|
693
|
+
let prev = sorted[0];
|
|
694
|
+
for (let i = 1; i <= sorted.length; i++) {
|
|
695
|
+
const cur = sorted[i];
|
|
696
|
+
if (cur === prev + 1) {
|
|
697
|
+
prev = cur;
|
|
698
|
+
continue;
|
|
699
|
+
}
|
|
700
|
+
out.push(runStart === prev ? `${runStart}` : `${runStart}-${prev}`);
|
|
701
|
+
runStart = cur;
|
|
702
|
+
prev = cur;
|
|
703
|
+
}
|
|
704
|
+
return out.join(", ");
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
/**
 * Render the human-readable errors report as Markdown.
 *
 * Layout: header → totals line → counts by outcome → counts by task →
 * "Top signatures" (one section per aggregated signature, with an
 * exemplar body) → "Chronological" (every error, grouped by task/alias,
 * in input order).
 *
 * @param scopeName - label for the report title.
 * @param allErrors - flat error records (shape produced by collectErrors).
 * @param summary - output of aggregateErrors(allErrors); only read when
 *   allErrors is non-empty.
 * @returns the full Markdown document as a single string (no trailing \n).
 */
function renderErrorsMarkdown(scopeName, allErrors, summary) {
  const lines = [];
  lines.push(`# Errors: ${scopeName}`);
  lines.push("");
  // Empty scope: short-circuit before touching `summary`.
  if (allErrors.length === 0) {
    lines.push("No errors recorded.");
    return lines.join("\n");
  }
  lines.push(
    `${summary.total} errors across ${Object.keys(summary.byTask).length} task(s) — ${summary.strikes} strike, ${summary.soft} soft.`,
  );
  lines.push("");
  lines.push("## Counts by outcome");
  lines.push("");
  // summary.byOutcome/byTask are already ordered by descending count.
  for (const [oc, n] of Object.entries(summary.byOutcome)) {
    lines.push(`- \`${oc}\` × ${n}`);
  }
  lines.push("");
  lines.push("## Counts by task");
  lines.push("");
  for (const [task, n] of Object.entries(summary.byTask)) {
    lines.push(`- \`${task}\` × ${n}`);
  }
  lines.push("");
  lines.push("## Top signatures");
  lines.push("");
  for (const g of summary.topSignatures) {
    lines.push(
      `### ×${g.count} — \`${g.outcome ?? "—"}\`${g.sourcePathPattern ? ` @ \`${g.sourcePathPattern}\`` : ""}`,
    );
    lines.push("");
    lines.push(`turns: ${compressTurns(g.turns)}`);
    // Single-task signatures skip the redundant tasks line.
    if (g.tasks.length > 1) {
      lines.push(`tasks: ${g.tasks.map((t) => `\`${t}\``).join(", ")}`);
    }
    lines.push("");
    lines.push("error body:");
    lines.push("");
    lines.push("```");
    lines.push(g.exemplar.body);
    lines.push("```");
    // The action that produced the error, when one was captured.
    if (g.exemplar.sourceAction) {
      const sa = g.exemplar.sourceAction;
      lines.push("");
      lines.push(
        `source action (\`${sa.action}\`${sa.targetPath ? ` → \`${sa.targetPath}\`` : ""}):`,
      );
      lines.push("");
      lines.push("```");
      lines.push(sa.body);
      lines.push("```");
    }
    lines.push("");
  }
  lines.push("## Chronological");
  lines.push("");
  // Re-bucket by "task" or "task/alias"; Map preserves first-seen order.
  const byTask = new Map();
  for (const er of allErrors) {
    const k = er.run.task + (er.run.alias ? `/${er.run.alias}` : "");
    if (!byTask.has(k)) byTask.set(k, []);
    byTask.get(k).push(er);
  }
  for (const [taskKey, errs] of byTask) {
    lines.push(`### ${taskKey}`);
    lines.push("");
    for (const er of errs) {
      const stateTag = er.soft ? "soft" : "strike";
      const oc = er.outcome ?? "—";
      const src = er.sourcePath ? ` @ \`${er.sourcePath}\`` : "";
      lines.push(`- T${er.turn} \`${oc}\`/${stateTag}${src}`);
      lines.push("");
      // Blockquote-style indent keeps multi-line bodies inside the list item.
      lines.push(indentBlock(er.body, " > "));
      lines.push("");
      if (er.sourceAction) {
        const sa = er.sourceAction;
        lines.push(
          ` source: \`${sa.action}\`${sa.targetPath ? ` → \`${sa.targetPath}\`` : ""}`,
        );
        lines.push("");
        lines.push(indentBlock(sa.body, " > "));
        lines.push("");
      }
    }
  }
  return lines.join("\n");
}
|
|
793
|
+
|
|
794
|
+
/**
 * Materialize the error report under `outDir`: `errors.md` (rendered
 * Markdown) and `errors.json` ({ scope, summary, errors }, pretty-printed).
 * Both files end with a trailing newline.
 */
function writeErrorsArtifacts(outDir, scopeName, allErrors) {
  const summary = aggregateErrors(allErrors);
  const markdown = renderErrorsMarkdown(scopeName, allErrors, summary);
  writeFileSync(join(outDir, "errors.md"), `${markdown}\n`);
  const payload = { scope: scopeName, summary, errors: allErrors };
  writeFileSync(
    join(outDir, "errors.json"),
    `${JSON.stringify(payload, null, 2)}\n`,
  );
}
|
|
805
|
+
|
|
806
|
+
// Synthesize a run-summary from the DB for runs that lack the harbor-side
// rummy.txt `__RUMMY_RUN_SUMMARY__` line (e2e tests, primarily). Token
// counts come from the turns table; status from the runs table.
function synthSummary(run, turnsRows) {
  // Sum each token column across every turn row; missing/falsy fields
  // count as zero.
  const tokens = { prompt: 0, completion: 0, cached: 0, reasoning: 0 };
  for (const row of turnsRows) {
    tokens.prompt += row.prompt_tokens || 0;
    tokens.completion += row.completion_tokens || 0;
    tokens.cached += row.cached_tokens || 0;
    tokens.reasoning += row.reasoning_tokens || 0;
  }
  return {
    status: run.status ?? null,
    turns: turnsRows.length,
    tokens,
    // Cost and wall time are unknowable from the DB alone.
    cost: 0,
    wallSeconds: null,
  };
}
|
|
828
|
+
|
|
829
|
+
/**
 * Digest one task directory: read its agent DB, then write per-run
 * artifacts (digest.md, reasoning.md, packets.md, digest.json).
 *
 * @param taskDir - directory expected to contain agent/rummy*.db.
 * @param taskNameOverride - optional display name; otherwise derived from
 *   the directory basename (stripping a trailing `.digest` suffix and a
 *   trailing `__<alnum>` suffix).
 * @returns an array of digest objects, one per run found in the DB; a
 *   single placeholder entry (markers: ["exfil_fail"]) when no DB exists.
 */
function processTask(taskDir, taskNameOverride = null) {
  const taskName =
    taskNameOverride ??
    taskDir
      .replace(/\/+$/, "")
      .split("/")
      .pop()
      .replace(/\.digest$/, "")
      .replace(/__[A-Za-z0-9]+$/, "");
  const rummyDb = findAgentDb(taskDir);
  if (rummyDb == null) {
    // Touch a marker file so a later pass can tell this dir was seen but
    // had no DB to digest.
    closeSync(openSync(join(taskDir, "digest_skipped"), "w"));
    return [
      {
        task: taskName,
        dir: taskDir,
        reward: readReward(taskDir),
        status: null,
        turns: 0,
        tokens: null,
        cost: null,
        wallSeconds: null,
        markers: ["exfil_fail"],
        prompt: null,
        turnRows: [],
        errors: [],
      },
    ];
  }

  const reward = readReward(taskDir);
  const harborSummary = readRunSummary(taskDir);
  const perRun = readDb(rummyDb);

  // Tbench DBs hold one run per task container; e2e TestDb DBs hold many
  // (one per `it()` block). Single-run task dirs keep the legacy layout
  // (digest.md alongside agent/); multi-run task dirs nest per-run output
  // at <task>/<alias>/.
  const multiRun = perRun.length > 1;
  const out = [];
  for (const { run, turns, logEntries, packetEntries, prompt } of perRun) {
    const turnRows = buildTurns(turns, logEntries);
    const turnPackets = groupPacketsByTurn(packetEntries);
    // The harbor summary only describes a single run; with multiple runs
    // in the DB we must synthesize each summary from the DB rows instead.
    const runSummary =
      !multiRun && harborSummary ? harborSummary : synthSummary(run, turns);
    const markers = classifyMarkers(reward, runSummary, turnRows);
    const runName = multiRun ? `${taskName}/${run.alias}` : taskName;
    const outDir = multiRun ? join(taskDir, run.alias) : taskDir;
    mkdirSync(outDir, { recursive: true });

    const waterfall = renderWaterfall(
      runName,
      prompt,
      runSummary,
      reward,
      turnRows,
      markers,
    );
    writeFileSync(join(outDir, "digest.md"), `${waterfall}\n`);
    writeFileSync(
      join(outDir, "reasoning.md"),
      `${renderReasoning(runName, turnRows)}\n`,
    );
    writeFileSync(
      join(outDir, "packets.md"),
      `${renderPackets(runName, turnPackets)}\n`,
    );
    const digest = digestJson({
      taskName: runName,
      taskDir: outDir,
      prompt,
      runSummary,
      reward,
      turnRows,
      markers,
    });
    // Errors are attached after digestJson so they land in digest.json
    // alongside the rest of the digest payload.
    digest.errors = collectErrors(turnRows, logEntries, {
      task: taskName,
      alias: multiRun ? run.alias : null,
    });
    writeFileSync(
      join(outDir, "digest.json"),
      `${JSON.stringify(digest, null, 2)}\n`,
    );
    out.push(digest);
  }
  return out;
}
|
|
917
|
+
|
|
918
|
+
/**
 * Escape a single CSV field per RFC 4180.
 *
 * Nullish values become the empty string. Fields containing a comma,
 * double quote, newline, or carriage return are wrapped in double quotes
 * with embedded quotes doubled; everything else passes through as its
 * string form.
 *
 * @param s - any value; stringified with String().
 * @returns the escaped field text.
 */
function csvEscape(s) {
  if (s == null) return "";
  const str = String(s);
  // \r included: a bare carriage return in an unquoted field corrupts
  // CRLF-terminated CSV just like \n does (RFC 4180 §2 requires quoting).
  if (/[,"\n\r]/.test(str)) return `"${str.replace(/"/g, '""')}"`;
  return str;
}
|
|
924
|
+
|
|
925
|
+
/**
 * Write `<sweepDir>/index.csv`: one row per digest, sorted by task name.
 * Columns mirror the per-run summary fields; markers collapse into a
 * single ";"-joined cell. Missing token counts render as empty cells.
 */
function writeIndex(sweepDir, digests) {
  const columns = [
    "task",
    "reward",
    "status",
    "turns",
    "prompt_tokens",
    "completion_tokens",
    "cached_tokens",
    "cost",
    "wall_seconds",
    "markers",
  ];
  const toRow = (d) =>
    [
      d.task,
      d.reward,
      d.status,
      d.turns,
      d.tokens?.prompt ?? "",
      d.tokens?.completion ?? "",
      d.tokens?.cached ?? "",
      d.cost,
      d.wallSeconds,
      d.markers.join(";"),
    ]
      .map((cell) => csvEscape(cell))
      .join(",");
  const ordered = digests.toSorted((a, b) =>
    (a.task ?? "").localeCompare(b.task ?? ""),
  );
  const body = ordered.map(toRow).join("\n");
  writeFileSync(
    join(sweepDir, "index.csv"),
    `${columns.join(",")}\n${body}\n`,
  );
}
|
|
956
|
+
|
|
957
|
+
// CLI: first positional arg is a `.db` file, a task dir, or a sweep dir.
// Second positional arg (bare-DB mode only) selects which run becomes
// the "featured" run promoted to test/digest/ — exact or substring
// match against run.alias. Default: the most-recent (highest-id) run.
// With no arg, default to the latest sweep under test/tbench/results.
const target = process.argv[2];
const runSelector = process.argv[3] ?? null;
let entry;
if (target) {
  // Absolute paths pass through; relative paths resolve against cwd.
  entry = target.startsWith("/") ? target : join(process.cwd(), target);
} else {
  // No argument: pick the lexicographically-last sweep directory
  // (sweep names sort chronologically by construction — TODO confirm).
  const sweeps = readdirSync(RESULTS_DIR)
    .filter((d) => statSync(join(RESULTS_DIR, d)).isDirectory())
    .sort();
  if (sweeps.length === 0) {
    console.error("no sweep dir found");
    process.exit(2);
  }
  entry = join(RESULTS_DIR, sweeps[sweeps.length - 1]);
}

// Bail early with exit code 2 when the chosen entry point doesn't exist.
if (!existsSync(entry)) {
  console.error(`not found: ${entry}`);
  process.exit(2);
}
|
|
982
|
+
|
|
983
|
+
// Bare-DB invocation: `node bin/digest.js /path/to/rummy.db`. The full
// pile (per-alias subdirs for multi-run DBs) lands in /tmp/rummy_digest/.
// We synthesize a task dir there so processTask's path discipline
// (agent/rummy.db, reward.txt-optional, rummy.txt-optional) keeps
// holding. After processTask runs, PUBLIC_DIR gets a flat copy of the
// most recent run's artifacts (see end of file).
let bareDbName = null;
if (statSync(entry).isFile() && /\.db$/.test(entry)) {
  const { rmSync, linkSync, copyFileSync: cp } = await import("node:fs");
  // DB basename minus the .db extension becomes the synthetic task name.
  bareDbName = entry.split("/").pop().replace(/\.db$/, "");
  // Rebuild the pile dir from scratch so stale runs don't leak through.
  rmSync(PILE_DIR, { recursive: true, force: true });
  mkdirSync(join(PILE_DIR, "agent"), { recursive: true });
  const linkedDb = join(PILE_DIR, "agent", "rummy.db");
  // Hard-link is cheap and keeps reads off the live DB if any; fall
  // back to copy on cross-device or filesystem-restricted setups.
  try {
    linkSync(entry, linkedDb);
  } catch {
    cp(entry, linkedDb);
  }
  // From here on, the synthesized dir is the processing entry point.
  entry = PILE_DIR;
}
|
|
1005
|
+
|
|
1006
|
+
// Main dispatch: a single task dir (including the synthesized bare-DB
// dir) gets per-run digests + error artifacts; anything else is treated
// as a sweep dir and each discovered task dir is processed in turn.
if (isTaskDir(entry)) {
  const digests = processTask(entry, bareDbName);
  const allErrors = digests.flatMap((d) => d.errors ?? []);
  writeErrorsArtifacts(entry, bareDbName ?? entry.split("/").pop(), allErrors);
  for (const d of digests) {
    console.log(`wrote digest for ${d.task}: ${d.markers.join(", ")}`);
  }
  console.log(
    `wrote errors.md + errors.json (${allErrors.length} errors) → ${entry}/`,
  );

  // Bare-DB mode: copy the most-recent run's flat artifacts into
  // PUBLIC_DIR (test/digest/) so a human can `cat test/digest/digest.md`
  // without spelunking. Pile of all runs stays in PILE_DIR. For single-
  // run DBs this is just "the digest." For multi-run DBs (e2e TestDb,
  // LME) this is the highest-id run; pick a different one out of the
  // pile if you want it.
  if (bareDbName) {
    const { rmSync, copyFileSync: cp } = await import("node:fs");
    rmSync(PUBLIC_DIR, { recursive: true, force: true });
    mkdirSync(PUBLIC_DIR, { recursive: true });
    // Default featured run: last element (presumably highest-id; see
    // comment above).
    let featured = digests[digests.length - 1];
    if (runSelector) {
      // Substring match against the run's display name; must resolve to
      // exactly one run or we refuse with exit code 2.
      const matches = digests.filter((d) =>
        d.task.includes(runSelector),
      );
      if (matches.length === 0) {
        console.error(
          `run selector "${runSelector}" matched no run in this DB. ` +
            `Available: ${digests.map((d) => d.task).join(", ")}`,
        );
        process.exit(2);
      }
      if (matches.length > 1) {
        console.error(
          `run selector "${runSelector}" matched ${matches.length} runs: ` +
            matches.map((d) => d.task).join(", "),
        );
        process.exit(2);
      }
      featured = matches[0];
    }
    // Per-run artifacts come from the featured run's own out dir …
    for (const f of ["digest.md", "reasoning.md", "packets.md", "digest.json"]) {
      const src = join(featured.dir, f);
      if (existsSync(src)) cp(src, join(PUBLIC_DIR, f));
    }
    // … while the error artifacts live at the task-dir level.
    for (const f of ["errors.md", "errors.json"]) {
      const src = join(entry, f);
      if (existsSync(src)) cp(src, join(PUBLIC_DIR, f));
    }
    console.log(
      `featured run (${featured.task}) → ${PUBLIC_DIR}/digest.md`,
    );
  }
} else {
  // Sweep mode: digest every task dir found under the entry point.
  const taskDirs = findTaskDirs(entry);
  if (taskDirs.length === 0) {
    console.error(`no task dirs (with agent/rummy*.db) under ${entry}`);
    process.exit(2);
  }
  const digests = [];
  for (const td of taskDirs) {
    try {
      digests.push(...processTask(td));
    } catch (err) {
      // A failing task dir gets a placeholder digest (markers:
      // ["digest_failed"]) so the sweep index still lists every task.
      console.error(`! ${relative(entry, td)}: ${err.message}`);
      digests.push({
        task: td
          .split("/")
          .pop()
          .replace(/__[A-Za-z0-9]+$/, ""),
        dir: td,
        reward: null,
        status: null,
        turns: 0,
        tokens: null,
        cost: null,
        wallSeconds: null,
        markers: ["digest_failed"],
        prompt: null,
        turnRows: [],
        errors: [],
      });
    }
  }
  writeIndex(entry, digests);
  const allErrors = digests.flatMap((d) => d.errors ?? []);
  writeErrorsArtifacts(entry, entry.split("/").pop(), allErrors);
  console.log(
    `wrote ${digests.length} digests + index.csv + errors.md (${allErrors.length} errors) → ${entry}/`,
  );
}
|