@possumtech/rummy 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. package/.env.example +40 -15
  2. package/.xai.key +1 -0
  3. package/PLUGINS.md +169 -53
  4. package/README.md +38 -32
  5. package/SPEC.md +366 -179
  6. package/bin/digest.js +1097 -0
  7. package/biome/no-fallbacks.grit +2 -2
  8. package/gemini.key +1 -0
  9. package/lang/en.json +10 -1
  10. package/migrations/001_initial_schema.sql +9 -2
  11. package/package.json +19 -8
  12. package/service.js +1 -0
  13. package/src/agent/AgentLoop.js +76 -26
  14. package/src/agent/ContextAssembler.js +2 -0
  15. package/src/agent/Entries.js +238 -60
  16. package/src/agent/ProjectAgent.js +44 -0
  17. package/src/agent/TurnExecutor.js +99 -30
  18. package/src/agent/XmlParser.js +206 -111
  19. package/src/agent/errors.js +35 -0
  20. package/src/agent/known_queries.sql +1 -1
  21. package/src/agent/known_store.sql +3 -42
  22. package/src/agent/materializeContext.js +30 -1
  23. package/src/agent/runs.sql +8 -18
  24. package/src/agent/tokens.js +0 -1
  25. package/src/agent/turns.sql +1 -0
  26. package/src/hooks/Hooks.js +26 -0
  27. package/src/hooks/RummyContext.js +12 -1
  28. package/src/lib/hedberg/README.md +60 -0
  29. package/src/lib/hedberg/hedberg.js +60 -0
  30. package/src/lib/hedberg/marker.js +158 -0
  31. package/src/{plugins → lib}/hedberg/matcher.js +1 -2
  32. package/src/llm/LlmProvider.js +41 -3
  33. package/src/llm/openaiStream.js +17 -0
  34. package/src/plugins/ask_user/ask_user.js +12 -2
  35. package/src/plugins/ask_user/ask_userDoc.md +1 -5
  36. package/src/plugins/budget/README.md +29 -24
  37. package/src/plugins/budget/budget.js +166 -110
  38. package/src/plugins/cli/README.md +3 -4
  39. package/src/plugins/cli/cli.js +31 -5
  40. package/src/plugins/cloudflare/cloudflare.js +136 -0
  41. package/src/plugins/cp/cp.js +41 -4
  42. package/src/plugins/cp/cpDoc.md +5 -6
  43. package/src/plugins/engine/engine.sql +1 -1
  44. package/src/plugins/env/README.md +5 -4
  45. package/src/plugins/env/env.js +7 -4
  46. package/src/plugins/env/envDoc.md +7 -8
  47. package/src/plugins/error/error.js +56 -15
  48. package/src/plugins/file/README.md +12 -3
  49. package/src/plugins/file/file.js +2 -2
  50. package/src/plugins/get/get.js +59 -36
  51. package/src/plugins/get/getDoc.md +10 -34
  52. package/src/plugins/google/google.js +115 -0
  53. package/src/plugins/hedberg/hedberg.js +13 -56
  54. package/src/plugins/helpers.js +66 -12
  55. package/src/plugins/index.js +1 -2
  56. package/src/plugins/instructions/README.md +44 -47
  57. package/src/plugins/instructions/instructions-system.md +44 -0
  58. package/src/plugins/instructions/instructions-user.md +53 -0
  59. package/src/plugins/instructions/instructions.js +58 -189
  60. package/src/plugins/known/README.md +6 -7
  61. package/src/plugins/known/known.js +24 -30
  62. package/src/plugins/log/log.js +41 -32
  63. package/src/plugins/mv/mv.js +40 -1
  64. package/src/plugins/mv/mvDoc.md +1 -8
  65. package/src/plugins/ollama/ollama.js +4 -3
  66. package/src/plugins/openai/openai.js +4 -3
  67. package/src/plugins/openrouter/openrouter.js +14 -4
  68. package/src/plugins/persona/README.md +11 -13
  69. package/src/plugins/persona/default.md +29 -0
  70. package/src/plugins/persona/persona.js +10 -66
  71. package/src/plugins/policy/policy.js +23 -22
  72. package/src/plugins/prompt/README.md +37 -27
  73. package/src/plugins/prompt/prompt.js +13 -19
  74. package/src/plugins/rm/rm.js +18 -0
  75. package/src/plugins/rm/rmDoc.md +5 -6
  76. package/src/plugins/rpc/rpc.js +3 -3
  77. package/src/plugins/set/set.js +205 -323
  78. package/src/plugins/set/setDoc.md +47 -17
  79. package/src/plugins/sh/README.md +6 -5
  80. package/src/plugins/sh/sh.js +8 -5
  81. package/src/plugins/sh/shDoc.md +7 -8
  82. package/src/plugins/skill/README.md +37 -14
  83. package/src/plugins/skill/skill.js +200 -101
  84. package/src/plugins/skill/skillDoc.js +3 -0
  85. package/src/plugins/skill/skillDoc.md +9 -0
  86. package/src/plugins/stream/README.md +7 -6
  87. package/src/plugins/stream/finalize.js +100 -0
  88. package/src/plugins/stream/stream.js +13 -45
  89. package/src/plugins/telemetry/telemetry.js +27 -4
  90. package/src/plugins/think/think.js +2 -3
  91. package/src/plugins/think/thinkDoc.md +2 -4
  92. package/src/plugins/unknown/README.md +1 -1
  93. package/src/plugins/unknown/unknown.js +17 -19
  94. package/src/plugins/update/update.js +4 -51
  95. package/src/plugins/update/updateDoc.md +21 -6
  96. package/src/plugins/xai/xai.js +68 -102
  97. package/src/plugins/yolo/yolo.js +102 -75
  98. package/src/sql/functions/hedmatch.js +1 -1
  99. package/src/sql/functions/hedreplace.js +1 -1
  100. package/src/sql/functions/hedsearch.js +1 -1
  101. package/src/sql/functions/slugify.js +16 -2
  102. package/BENCH_ENVIRONMENT.md +0 -230
  103. package/CLIENT_INTERFACE.md +0 -396
  104. package/last_run.txt +0 -5617
  105. package/scriptify/ask_run.js +0 -77
  106. package/scriptify/cache_probe.js +0 -66
  107. package/scriptify/cache_probe_grok.js +0 -74
  108. package/src/agent/budget.js +0 -33
  109. package/src/agent/config.js +0 -38
  110. package/src/plugins/hedberg/README.md +0 -71
  111. package/src/plugins/hedberg/docs.md +0 -0
  112. package/src/plugins/hedberg/edits.js +0 -55
  113. package/src/plugins/hedberg/normalize.js +0 -17
  114. package/src/plugins/hedberg/sed.js +0 -49
  115. package/src/plugins/instructions/instructions.md +0 -34
  116. package/src/plugins/instructions/instructions_104.md +0 -8
  117. package/src/plugins/instructions/instructions_105.md +0 -39
  118. package/src/plugins/instructions/instructions_106.md +0 -22
  119. package/src/plugins/instructions/instructions_107.md +0 -17
  120. package/src/plugins/instructions/instructions_108.md +0 -0
  121. package/src/plugins/known/knownDoc.js +0 -3
  122. package/src/plugins/known/knownDoc.md +0 -8
  123. package/src/plugins/unknown/unknownDoc.js +0 -3
  124. package/src/plugins/unknown/unknownDoc.md +0 -11
  125. package/turns/cli_1777462658211/turn_001.txt +0 -772
  126. package/turns/cli_1777462658211/turn_002.txt +0 -606
  127. package/turns/cli_1777462658211/turn_003.txt +0 -667
  128. package/turns/cli_1777462658211/turn_004.txt +0 -297
  129. package/turns/cli_1777462658211/turn_005.txt +0 -301
  130. package/turns/cli_1777462658211/turn_006.txt +0 -262
  131. package/turns/cli_1777465095132/turn_001.txt +0 -715
  132. package/turns/cli_1777465095132/turn_002.txt +0 -236
  133. package/turns/cli_1777465095132/turn_003.txt +0 -287
  134. package/turns/cli_1777465095132/turn_004.txt +0 -694
  135. package/turns/cli_1777465095132/turn_005.txt +0 -422
  136. package/turns/cli_1777465095132/turn_006.txt +0 -365
  137. package/turns/cli_1777465095132/turn_007.txt +0 -885
  138. package/turns/cli_1777465095132/turn_008.txt +0 -1277
  139. package/turns/cli_1777465095132/turn_009.txt +0 -736
  140. /package/src/{plugins → lib}/hedberg/patterns.js +0 -0
package/bin/digest.js ADDED
@@ -0,0 +1,1097 @@
1
+ /**
2
+ * Universal run-digest tool. Reads any rummy*.db (e2e, demo, bench, dev)
3
+ * and emits per-run forensic artifacts. First-order tool, not bench-
4
+ * specific — use it on anything with a rummy*.db inside.
5
+ *
6
+ * <out>/digest.md Run-shape header + waterfall: per-turn line
7
+ * with status, update body, indented emission
8
+ * list, and a reasoning excerpt.
9
+ * <out>/digest.json Same data, machine-queryable.
10
+ * <out>/reasoning.md Per-turn reasoning_content (full).
11
+ * <out>/packets.md Per-turn assembled wire packets.
12
+ * <out>/digest_skipped Written when no rummy*.db is present.
13
+ *
14
+ * <sweep>/index.csv Greppable per-task summary.
15
+ * <sweep>/errors.md Cross-task aggregated error report.
16
+ * <sweep>/errors.json Same, machine-queryable.
17
+ *
18
+ * Read-only derivative; never source-of-truth. Safe to re-run.
19
+ *
20
+ * Usage:
21
+ * node bin/digest.js <sweep-dir> sweep + index + errors
22
+ * node bin/digest.js <task-dir> single task + errors
23
+ * node bin/digest.js <path-to-rummy.db> bare DB → sibling .digest/ dir
24
+ * node bin/digest.js (no argument) → latest tbench sweep
25
+ */
26
+
27
+ import {
28
+ closeSync,
29
+ existsSync,
30
+ mkdirSync,
31
+ openSync,
32
+ readdirSync,
33
+ readFileSync,
34
+ statSync,
35
+ writeFileSync,
36
+ } from "node:fs";
37
+ import { dirname, join, relative } from "node:path";
38
+ import { DatabaseSync } from "node:sqlite";
39
+ import { fileURLToPath } from "node:url";
40
+
41
+ const __dirname = dirname(fileURLToPath(import.meta.url));
42
+ // Default sweep root for the no-arg invocation: the tbench results dir
43
+ // historically holds the latest of everything. Resolved relative to the
44
+ // project root (one level up from bin/).
45
+ const RESULTS_DIR = join(__dirname, "..", "test", "tbench", "results");
46
+ // Bare-DB invocation output. The full per-run pile (subdirs per alias
47
+ // for multi-run DBs) lives in /tmp/rummy_digest/ — clobbered each run.
48
+ // Sweep / task-dir invocations keep writing alongside source dirs.
49
+ const PILE_DIR = "/tmp/rummy_digest";
50
+ // Single readable digest that mirrors the most recent run from the DB.
51
+ // Flat files only — no nested folders. Clobbered on each invocation.
52
+ const PUBLIC_DIR = join(__dirname, "..", "test", "digest");
53
+
54
+ const MAX_LOOP_TURNS = Number(process.env.RUMMY_MAX_LOOP_TURNS) || 99;
55
+ const REASONING_RUNAWAY_CHARS = 8000;
56
+
57
+ // Locate the agent's sqlite DB inside a task dir's agent/ folder. Tbench
58
+ // writes `rummy.db`; programbench writes `rummy_programbench.db` (so the
59
+ // host-side audit DB is segregated from any project-internal `rummy.db`
60
+ // the agent might create). Returns absolute path or null. Empty stubs
61
+ // (zero-length leftovers from aborted runs) are ignored.
62
+ function findAgentDb(taskDir) {
63
+ const agentDir = join(taskDir, "agent");
64
+ if (!existsSync(agentDir)) return null;
65
+ let names;
66
+ try {
67
+ names = readdirSync(agentDir);
68
+ } catch {
69
+ return null;
70
+ }
71
+ const candidates = names
72
+ .filter((n) => /^rummy.*\.db$/.test(n))
73
+ .map((n) => join(agentDir, n))
74
+ .filter((p) => {
75
+ try {
76
+ return statSync(p).size > 0;
77
+ } catch {
78
+ return false;
79
+ }
80
+ });
81
+ if (candidates.length === 0) return null;
82
+ const canonical = candidates.find((p) => p.endsWith("/rummy.db"));
83
+ if (canonical) return canonical;
84
+ return candidates.toSorted((a, b) => statSync(b).size - statSync(a).size)[0];
85
+ }
86
+
87
/**
 * Report whether `dir` is a task directory, i.e. whether findAgentDb
 * resolves a DB for it.
 *
 * @param {string} dir
 * @returns {boolean}
 */
function isTaskDir(dir) {
  const agentDb = findAgentDb(dir);
  return agentDb !== null && agentDb !== undefined;
}
90
+
91
/**
 * Collect every task directory below `sweepDir`.
 *
 * Walks depth-first in readdir order. A directory recognised by isTaskDir
 * is recorded and not descended into; other directories are searched
 * further, down to four levels below the sweep root. Entries that cannot
 * be listed or stat'ed are skipped silently.
 *
 * @param {string} sweepDir
 * @returns {string[]} Task directory paths in discovery order.
 */
function findTaskDirs(sweepDir) {
  const MAX_DEPTH = 4;
  const found = [];

  const listNames = (dir) => {
    try {
      return readdirSync(dir);
    } catch {
      return null;
    }
  };

  const isDirectory = (path) => {
    try {
      return statSync(path).isDirectory();
    } catch {
      return false;
    }
  };

  const visit = (dir, depth) => {
    if (depth > MAX_DEPTH) return;
    const names = listNames(dir);
    if (names === null) return;
    for (const name of names) {
      const child = join(dir, name);
      if (!isDirectory(child)) continue;
      if (isTaskDir(child)) {
        found.push(child);
      } else {
        visit(child, depth + 1);
      }
    }
  };

  visit(sweepDir, 0);
  return found;
}
120
+
121
/**
 * Read the verifier verdict from `<taskDir>/verifier/reward.txt`.
 *
 * @param {string} taskDir
 * @returns {0|1|null} 0 or 1 when the trimmed file content is exactly "0"
 *   or "1"; null when the file is missing or holds anything else.
 */
function readReward(taskDir) {
  const rewardFile = join(taskDir, "verifier", "reward.txt");
  if (!existsSync(rewardFile)) return null;
  switch (readFileSync(rewardFile, "utf8").trim()) {
    case "0":
      return 0;
    case "1":
      return 1;
    default:
      return null;
  }
}
128
+
129
/**
 * Extract the run summary JSON from `<taskDir>/agent/rummy.txt`.
 *
 * Looks for a `__RUMMY_RUN_SUMMARY__` marker followed by a `{...}` payload
 * that ends its line, and parses that payload.
 *
 * @param {string} taskDir
 * @returns {*} Parsed summary, or null when the file, the marker, or valid
 *   JSON is missing.
 */
function readRunSummary(taskDir) {
  const tracePath = join(taskDir, "agent", "rummy.txt");
  if (!existsSync(tracePath)) return null;
  const trace = readFileSync(tracePath, "utf8");
  const found = /__RUMMY_RUN_SUMMARY__\s+(\{.*\})\s*$/m.exec(trace);
  if (found === null) return null;
  const [, payload] = found;
  try {
    return JSON.parse(payload);
  } catch {
    return null;
  }
}
141
+
142
/**
 * Normalise an entry's `attributes` value into an object.
 *
 * Objects pass through untouched; strings are parsed as JSON. Anything
 * that does not yield an object (null/undefined input, invalid JSON, or
 * JSON that encodes `null` or a scalar) becomes `{}`, so callers can read
 * properties off the result without guarding against null.
 *
 * @param {*} s - Raw attributes: a JSON string, an object, or null.
 * @returns {object} The attributes object, or `{}`.
 */
function parseAttrs(s) {
  if (s == null) return {};
  if (typeof s === "object") return s;
  let parsed;
  try {
    parsed = JSON.parse(s);
  } catch {
    return {};
  }
  // JSON.parse("null") / JSON.parse("42") succeed but are not attribute
  // bags; a null here would make `attrs.status` throw in the caller.
  return parsed !== null && typeof parsed === "object" ? parsed : {};
}
151
+
152
// Captures N from the leading `log://turn_N/` of a log entry path.
const TURN_FROM_PATH = /^log:\/\/turn_(\d+)\//;

/**
 * Turn number encoded in a log entry path.
 *
 * @param {string} path
 * @returns {number|null} The turn, or null when the path does not start
 *   with `log://turn_N/`.
 */
function turnFromPath(path) {
  const found = TURN_FROM_PATH.exec(path);
  if (found === null) return null;
  const [, digits] = found;
  return Number(digits);
}
158
+
159
/**
 * Action segment of a `log://turn_N/<action>/...` path.
 *
 * @param {string} path
 * @returns {string|null} The action name, or null when the path does not
 *   have an action segment followed by a further `/`.
 */
function actionFromPath(path) {
  const [, action = null] = path.match(/^log:\/\/turn_\d+\/([^/]+)\//) ?? [];
  return action;
}
163
+
164
/**
 * URL-decoded slug that follows `log://turn_N/<action>/` in a path.
 *
 * @param {string} path
 * @returns {string} The decoded slug; the raw slug when it is not valid
 *   percent-encoding; the whole path when it does not have that shape.
 */
function pathSlug(path) {
  const found = path.match(/^log:\/\/turn_\d+\/[^/]+\/(.+)$/);
  if (found === null) return path;
  const encoded = found[1];
  try {
    return decodeURIComponent(encoded);
  } catch {
    return encoded;
  }
}
174
+
175
/**
 * One-line excerpt of `text`: whitespace runs collapsed to single spaces,
 * ends trimmed, and cut to `n` characters plus an ellipsis when longer.
 *
 * @param {string} text
 * @param {number} [n=80] - Maximum length before truncation.
 * @returns {string} The excerpt; "" for falsy input.
 */
function summarize(text, n = 80) {
  if (!text) return "";
  const collapsed = text.trim().replace(/\s+/g, " ");
  return collapsed.length > n ? `${collapsed.slice(0, n)}…` : collapsed;
}
181
+
182
+ // Read all runs from a DB plus their per-run data. Tbench task DBs have
183
+ // exactly one run; e2e TestDb DBs have many (one per test invocation).
184
+ // The caller drives a per-run digest pass either way.
185
+ function readDb(rummyDb) {
186
+ const db = new DatabaseSync(rummyDb, { readOnly: true });
187
+
188
+ const runs = db.prepare("SELECT * FROM runs ORDER BY id").all();
189
+ const turnsStmt = db.prepare(
190
+ `SELECT sequence, total_tokens, prompt_tokens, completion_tokens,
191
+ cached_tokens, reasoning_tokens, reasoning_content
192
+ FROM turns
193
+ WHERE run_id = ?
194
+ ORDER BY sequence`,
195
+ );
196
+ const logStmt = db.prepare(
197
+ `SELECT e.path, e.body, e.attributes, e.scheme,
198
+ rv.state, rv.outcome, rv.visibility, rv.turn
199
+ FROM entries e
200
+ JOIN run_views rv ON rv.entry_id = e.id
201
+ WHERE rv.run_id = ?
202
+ AND e.path LIKE 'log://turn_%'
203
+ ORDER BY e.id`,
204
+ );
205
+ const promptStmt = db.prepare(
206
+ `SELECT e.body
207
+ FROM entries e
208
+ JOIN run_views rv ON rv.entry_id = e.id
209
+ WHERE rv.run_id = ? AND e.path = 'prompt://1'
210
+ LIMIT 1`,
211
+ );
212
+ // Per-turn assembled packet bytes. system://N + user://N are what we
213
+ // sent to the LLM; assistant://N is the parsed content; model://N is
214
+ // the raw response wrapper (includes reasoning_content, finish_reason,
215
+ // usage). reasoning://N is the bare reasoning channel when the model
216
+ // surfaced one.
217
+ const packetStmt = db.prepare(
218
+ `SELECT e.path, e.body
219
+ FROM entries e
220
+ JOIN run_views rv ON rv.entry_id = e.id
221
+ WHERE rv.run_id = ?
222
+ AND (e.path GLOB 'system://*' OR e.path GLOB 'user://*'
223
+ OR e.path GLOB 'assistant://*' OR e.path GLOB 'model://*'
224
+ OR e.path GLOB 'reasoning://*')
225
+ ORDER BY e.id`,
226
+ );
227
+
228
+ const perRun = runs.map((run) => ({
229
+ run,
230
+ turns: turnsStmt.all(run.id),
231
+ logEntries: logStmt.all(run.id),
232
+ packetEntries: packetStmt.all(run.id),
233
+ prompt: promptStmt.get(run.id)?.body ?? null,
234
+ }));
235
+
236
+ db.close();
237
+ return perRun;
238
+ }
239
+
240
/**
 * Build per-turn rows: one entry per turn with its update + emissions +
 * errors, sorted by turn number.
 *
 * @param {object[]} turns - Rows of the `turns` table for one run.
 * @param {object[]} logEntries - `log://turn_N/...` entries for the run.
 * @returns {object[]} Rows shaped `{turn, totalTokens, promptTokens,
 *   completionTokens, cachedTokens, reasoningTokens, reasoningChars,
 *   reasoning, update, emissions, errors}`.
 */
function buildTurns(turns, logEntries) {
  // Row for a "ghost" turn: log entries exist but the LLM call failed
  // before a turn row was created (rare but possible).
  const ghostRow = (turn) => ({
    turn,
    totalTokens: null,
    promptTokens: null,
    completionTokens: null,
    cachedTokens: null,
    reasoningTokens: null,
    reasoningChars: 0,
    reasoning: "",
    update: null,
    emissions: [],
    errors: [],
  });

  const rows = new Map();
  for (const t of turns) {
    const reasoning = t.reasoning_content || "";
    rows.set(t.sequence, {
      turn: t.sequence,
      totalTokens: t.total_tokens,
      promptTokens: t.prompt_tokens,
      completionTokens: t.completion_tokens,
      cachedTokens: t.cached_tokens,
      reasoningTokens: t.reasoning_tokens,
      reasoningChars: reasoning.length,
      reasoning,
      update: null, // {status, body, state, outcome}
      emissions: [], // {action, slug, targetPath, visibility, query, command, body, state, outcome}
      errors: [], // {body, attrs, slug, state, outcome}
    });
  }

  for (const entry of logEntries) {
    const turn = turnFromPath(entry.path);
    if (turn == null) continue;

    let row = rows.get(turn);
    if (row === undefined) {
      row = ghostRow(turn);
      rows.set(turn, row);
    }

    const action = actionFromPath(entry.path);
    const attrs = parseAttrs(entry.attributes);
    const { body, state, outcome } = entry;

    switch (action) {
      case "update":
        // A later update entry for the same turn replaces the earlier one.
        row.update = { status: attrs.status ?? null, body, state, outcome };
        break;
      case "error":
        row.errors.push({
          body,
          attrs,
          slug: pathSlug(entry.path),
          state,
          outcome,
        });
        break;
      default:
        row.emissions.push({
          action,
          slug: pathSlug(entry.path),
          targetPath: attrs.path ?? null,
          visibility: attrs.visibility ?? null,
          query: attrs.query ?? null,
          command: attrs.command ?? null,
          body,
          state,
          outcome,
        });
    }
  }

  return Array.from(rows.values()).sort((a, b) => a.turn - b.turn);
}
312
+
313
/**
 * Derive the list of diagnostic markers for one run.
 *
 * Markers come from the verifier reward, the run summary status, and a
 * scan of the turn rows (abandon strikes, reasoning runaway, parser
 * warnings). A missing run summary is marked `exfil_fail`.
 *
 * @param {0|1|null} reward
 * @param {object|null} runSummary
 * @param {object[]} turnRows - Rows as produced by buildTurns.
 * @returns {string[]} Markers in a fixed order.
 */
function classifyMarkers(reward, runSummary, turnRows) {
  const status = runSummary?.status ?? null;
  const markers = [];

  if (reward === 1) markers.push("passed");
  if (reward === 0 && status === 200) {
    markers.push("claim_success_verifier_fail");
  }
  if (status === 499 && turnRows.length >= MAX_LOOP_TURNS - 1) {
    markers.push("max_loop_turns");
  }
  if (status === 413) markers.push("context_overflow");
  if (status === 500) markers.push("dispatch_500");

  // A turn is "stuck" when it produced a long reasoning trace but neither
  // an emission nor an update.
  const isStuck = (row) =>
    row.reasoningChars >= REASONING_RUNAWAY_CHARS &&
    row.emissions.length === 0 &&
    !row.update;
  const errorBodies = turnRows.flatMap((row) =>
    row.errors.map((err) => err.body || ""),
  );

  // Only the last stuck turn is reported.
  const lastStuck = turnRows.findLast(isStuck);
  const strikeAbandon = errorBodies.some((body) =>
    body.startsWith("Abandoned after"),
  );
  const parserWarn = errorBodies.some(
    (body) => body.startsWith("Unclosed") || body.includes("Tool call limit"),
  );

  if (strikeAbandon) markers.push("strike_abandon");
  if (lastStuck?.turn != null) {
    markers.push(`reasoning_runaway_t${lastStuck.turn}`);
  }
  if (parserWarn) markers.push("parser_warning");
  if (!runSummary) markers.push("exfil_fail");
  return markers;
}
348
+
349
/**
 * Render an emission as a single waterfall line: action, target (the
 * explicit target path, else the slug), then optional visibility, quoted
 * query/command excerpts, a failure mark, and the bracketed outcome.
 *
 * @param {object} em - Emission record from a turn row.
 * @returns {string}
 */
function renderEmission(em) {
  const quotedExcerpt = (value) =>
    value ? ` "${summarize(value, 60)}"` : "";
  const segments = [
    ` ← ${em.action} ${em.targetPath ?? em.slug}`,
    em.visibility ? ` visibility=${em.visibility}` : "",
    quotedExcerpt(em.query),
    quotedExcerpt(em.command),
    em.state === "failed" ? " ✗" : "",
    em.outcome ? ` [${em.outcome}]` : "",
  ];
  return segments.join("");
}
359
+
360
/**
 * Render an error record as a single waterfall line holding a 100-character
 * excerpt of its body.
 *
 * @param {{body: string}} err
 * @returns {string}
 */
function renderError(err) {
  const excerpt = summarize(err.body, 100);
  return ` ✗ error: ${excerpt}`;
}
363
+
364
/**
 * "What happened" header — three lines a forensic reader can scan in
 * seconds before deciding whether to drop into the waterfall: the last
 * update, the last turn that emitted anything, and the set/get/search mix.
 *
 * @param {object[]} turnRows - Rows as produced by buildTurns.
 * @returns {string[]} The header lines.
 */
function renderRunShape(turnRows) {
  const actionCounts = new Map([
    ["set", 0],
    ["get", 0],
    ["search", 0],
  ]);
  for (const row of turnRows) {
    for (const { action } of row.emissions) {
      if (actionCounts.has(action)) {
        actionCounts.set(action, actionCounts.get(action) + 1);
      }
    }
  }

  const updatedRow = turnRows.findLast((row) => row.update);
  const emittingRow = turnRows.findLast((row) => row.emissions.length > 0);
  const lastUpdate = updatedRow
    ? { turn: updatedRow.turn, ...updatedRow.update }
    : null;
  const lastEmissionTurn = emittingRow?.turn;

  let lastUpdateLine = "(none)";
  if (lastUpdate) {
    lastUpdateLine = `T${lastUpdate.turn} status=${lastUpdate.status ?? "—"} "${summarize(lastUpdate.body, 60)}"`;
  }
  const lastEmissionLine =
    lastEmissionTurn != null ? `T${lastEmissionTurn}` : "(none)";

  return [
    `Last update: ${lastUpdateLine}`,
    `Last emission: ${lastEmissionLine}`,
    `Action mix: set=${actionCounts.get("set")} get=${actionCounts.get("get")} search=${actionCounts.get("search")}`,
  ];
}
390
+
391
/**
 * Render the digest markdown for one run: title, one-line status summary,
 * markers, run-shape header, prompt excerpt, the per-turn waterfall, and a
 * drill-down list of related artifacts.
 *
 * @param {string} taskName
 * @param {string|null} prompt
 * @param {object|null} runSummary - Supplies status, turns, cost, tokens.
 * @param {0|1|null} reward
 * @param {object[]} turnRows - Rows as produced by buildTurns.
 * @param {string[]} markers
 * @returns {string} Markdown text joined with newlines.
 */
function renderWaterfall(
  taskName,
  prompt,
  runSummary,
  reward,
  turnRows,
  markers,
) {
  const status = runSummary?.status ?? "?";
  const totalTurns = runSummary?.turns ?? turnRows.length;
  const cost =
    runSummary?.cost != null ? `$${runSummary.cost.toFixed(4)}` : "?";
  const usage = runSummary?.tokens;
  const tokens = usage
    ? `prompt=${usage.prompt} completion=${usage.completion} cached=${usage.cached}`
    : "?";
  let rewardStr = "—";
  if (reward != null) rewardStr = reward === 1 ? "PASS" : "FAIL";

  const out = [
    `# ${taskName}`,
    "",
    `status=${status} reward=${rewardStr} turns=${totalTurns} cost=${cost} tokens=${tokens}`,
  ];
  if (markers.length > 0) {
    out.push("", `markers: ${markers.join(", ")}`);
  }
  out.push("", "## Run shape", "");
  for (const shapeLine of renderRunShape(turnRows)) out.push(shapeLine);
  out.push("");
  if (prompt) {
    out.push("## Prompt", summarize(prompt, 240), "");
  }

  out.push("## Waterfall");
  for (const row of turnRows) {
    const { update } = row;
    const upStatus = update?.status ?? "—";
    const upBody = update ? summarize(update.body, 80) : "(no update)";
    const upFail =
      update?.state === "failed" ? ` ✗ ${update.outcome ?? ""}` : "";
    out.push(`T${row.turn}: ${upStatus} "${upBody}"${upFail}`);
    if (row.reasoning) {
      out.push(` ↳ reasoning: ${summarize(row.reasoning, 140)}`);
    }
    for (const em of row.emissions) out.push(renderEmission(em));
    for (const err of row.errors) out.push(renderError(err));
  }

  out.push(
    "",
    "## Drill-down",
    "- agent/rummy.txt (full trace, when present)",
    "- agent/rummy.db (sqlite — entries, run_views, turns)",
    "- reasoning.md (per-turn reasoning_content)",
    "- packets.md (per-turn assembled wire packets)",
  );
  return out.join("\n");
}
447
+
448
/**
 * Render reasoning.md: a title followed by one `## Turn N` section per row
 * holding the full reasoning text, or a placeholder when there is none.
 *
 * @param {string} taskName
 * @param {object[]} turnRows - Rows as produced by buildTurns.
 * @returns {string} Markdown text joined with newlines.
 */
function renderReasoning(taskName, turnRows) {
  const sections = turnRows.flatMap((row) => [
    "",
    `## Turn ${row.turn}`,
    "",
    row.reasoning ? row.reasoning : "(no reasoning_content)",
  ]);
  return [`# Reasoning: ${taskName}`, ...sections].join("\n");
}
464
+
465
+ // Group packet entries (system://N / user://N / assistant://N / model://N
466
+ // / reasoning://N) by their turn number suffix.
467
+ function groupPacketsByTurn(packetEntries) {
468
+ const byTurn = new Map();
469
+ for (const e of packetEntries) {
470
+ const m = e.path.match(/^([a-z]+):\/\/(\d+)$/);
471
+ if (!m) continue;
472
+ const role = m[1];
473
+ const turn = Number(m[2]);
474
+ if (!byTurn.has(turn)) byTurn.set(turn, {});
475
+ byTurn.get(turn)[role] = e.body;
476
+ }
477
+ return [...byTurn.entries()]
478
+ .toSorted(([a], [b]) => a - b)
479
+ .map(([turn, parts]) => ({ turn, ...parts }));
480
+ }
481
+
482
/**
 * Per-turn packet dump: what was sent (system + user) and received
 * (assistant + reasoning + model wrapper) for each turn, each body inside
 * its own fenced block. Roles a turn does not have are skipped.
 *
 * @param {string} taskName
 * @param {object[]} turnPackets - Records as produced by groupPacketsByTurn.
 * @returns {string} Markdown text joined with newlines.
 */
function renderPackets(taskName, turnPackets) {
  const ROLE_ORDER = ["system", "user", "assistant", "reasoning", "model"];
  const FENCE = "```";

  const preamble = [
    `# Packets: ${taskName}`,
    "",
    "Per-turn assembled packets. `system` + `user` are the outgoing message;",
    "`assistant` is the parsed completion; `model` is the raw response",
    "wrapper (usage, finish_reason); `reasoning` is the bare CoT",
    "channel when the provider surfaces one.",
  ];

  const sections = turnPackets.flatMap((packet) => {
    const roleBlocks = ROLE_ORDER.filter((role) => packet[role] != null).flatMap(
      (role) => [
        "",
        `### ${role}://${packet.turn}`,
        "",
        FENCE,
        packet[role],
        FENCE,
      ],
    );
    return ["", `## Turn ${packet.turn}`, ...roleBlocks];
  });

  return [...preamble, ...sections].join("\n");
}
513
+
514
/**
 * Build the machine-queryable digest object for one run.
 *
 * Run-level fields come from `runSummary` (null when absent); each turn row
 * is reduced to its turn number, token total, reasoning length, update,
 * emissions (without bodies), and error bodies.
 *
 * @param {object} args
 * @param {string} args.taskName
 * @param {string} args.taskDir
 * @param {string|null} [args.prompt]
 * @param {object|null} args.runSummary
 * @param {0|1|null} args.reward
 * @param {object[]} args.turnRows - Rows as produced by buildTurns.
 * @param {string[]} args.markers
 * @returns {object} Plain JSON-serialisable digest.
 */
function digestJson({
  taskName,
  taskDir,
  prompt,
  runSummary,
  reward,
  turnRows,
  markers,
}) {
  const slimUpdate = (update) => {
    if (!update) return null;
    const { status, body, state, outcome } = update;
    return { status, body, state, outcome };
  };
  const slimEmission = ({ action, targetPath, visibility, state, outcome }) => ({
    action,
    targetPath,
    visibility,
    state,
    outcome,
  });
  const slimRow = (row) => ({
    turn: row.turn,
    totalTokens: row.totalTokens,
    reasoningChars: row.reasoningChars,
    update: slimUpdate(row.update),
    emissions: row.emissions.map(slimEmission),
    errors: row.errors.map(({ body }) => ({ body })),
  });

  return {
    task: taskName,
    dir: taskDir,
    reward,
    status: runSummary?.status ?? null,
    turns: runSummary?.turns ?? turnRows.length,
    tokens: runSummary?.tokens ?? null,
    cost: runSummary?.cost ?? null,
    wallSeconds: runSummary?.wallSeconds ?? null,
    markers,
    prompt: prompt ?? null,
    turnRows: turnRows.map((row) => slimRow(row)),
  };
}
557
+
558
/**
 * Build rich error records from a run's turn rows + log entries. Each
 * record carries the error's status/outcome, whether it was soft (state
 * "resolved") or a strike, the originating action looked up through the
 * error's `sourcePath` attribute, and a signature for cross-task
 * aggregation.
 *
 * @param {object[]} turnRows - Rows as produced by buildTurns.
 * @param {object[]} logEntries - Log entries, indexed here by path.
 * @param {object} runIdent - Stored on each record as `run`.
 * @returns {object[]} One record per error, in turn-row order.
 */
function collectErrors(turnRows, logEntries, runIdent) {
  // When several entries share a path, the last one wins.
  const entryByPath = new Map(logEntries.map((entry) => [entry.path, entry]));

  const toRecord = (row, err) => {
    const sourcePath = err.attrs?.sourcePath ?? null;
    const sourceEntry = sourcePath ? entryByPath.get(sourcePath) : null;
    const sourceAttrs = sourceEntry ? parseAttrs(sourceEntry.attributes) : {};
    // Prefer the outcome recorded in the error's attributes over the one on
    // the error row itself.
    const semanticOutcome = err.attrs?.outcome ?? err.outcome ?? null;

    let sourceAction = null;
    if (sourceEntry) {
      sourceAction = {
        path: sourcePath,
        action: actionFromPath(sourcePath),
        targetPath: sourceAttrs.path ?? null,
        body: sourceEntry.body ?? "",
        state: sourceEntry.state ?? null,
        outcome: sourceEntry.outcome ?? null,
      };
    }

    return {
      run: runIdent,
      turn: row.turn,
      status: err.attrs?.status ?? null,
      outcome: semanticOutcome,
      rvOutcome: err.outcome ?? null,
      state: err.state ?? null,
      soft: err.state === "resolved",
      sourcePath,
      sourceAction,
      body: err.body ?? "",
      bodySig: errorSignature({
        outcome: semanticOutcome,
        sourcePath,
        body: err.body,
      }),
    };
  };

  return turnRows.flatMap((row) => row.errors.map((err) => toRecord(row, err)));
}
603
+
604
// Length of the body prefix kept in an error signature. 80 characters
// accommodates "<<<<<<< SEARCH\n<context-line>" patterns without bleeding
// into the divergent tail.
const SIG_BODY_CHARS = 80;

/**
 * Group key for cross-task aggregation. Same outcome + same source-path
 * shape (first `turn_N` replaced by `turn_*`) + same whitespace-flattened
 * body prefix collapses repeats.
 *
 * @param {{outcome: *, sourcePath: (string|null), body: (string|null)}} parts
 * @returns {string} `<outcome> :: <source pattern> :: <body prefix>`, with
 *   "—" standing in for a missing outcome or source path.
 */
function errorSignature({ outcome, sourcePath, body }) {
  const outcomePart = outcome ?? "—";
  const sourcePart = sourcePath
    ? sourcePath.replace(/turn_\d+/, "turn_*")
    : "—";
  const flatBody = (body ?? "").replace(/\s+/g, " ").trim();
  let bodyPart = flatBody;
  if (flatBody.length > SIG_BODY_CHARS) {
    bodyPart = `${flatBody.slice(0, SIG_BODY_CHARS)}…`;
  }
  return [outcomePart, sourcePart, bodyPart].join(" :: ");
}
617
+
618
+ function aggregateErrors(allErrors) {
619
+ const total = allErrors.length;
620
+ let strikes = 0;
621
+ let soft = 0;
622
+ const byOutcome = new Map();
623
+ const byTask = new Map();
624
+ const bySig = new Map();
625
+ for (const er of allErrors) {
626
+ if (er.soft) soft++;
627
+ else strikes++;
628
+ const oc = er.outcome ?? "—";
629
+ byOutcome.set(oc, (byOutcome.get(oc) ?? 0) + 1);
630
+ const taskKey = er.run.task;
631
+ byTask.set(taskKey, (byTask.get(taskKey) ?? 0) + 1);
632
+ if (!bySig.has(er.bodySig)) {
633
+ bySig.set(er.bodySig, {
634
+ sig: er.bodySig,
635
+ count: 0,
636
+ outcome: er.outcome,
637
+ sourcePathPattern: er.sourcePath
638
+ ? er.sourcePath.replace(/turn_\d+/, "turn_*")
639
+ : null,
640
+ turns: new Set(),
641
+ tasks: new Set(),
642
+ exemplar: null,
643
+ });
644
+ }
645
+ const g = bySig.get(er.bodySig);
646
+ g.count++;
647
+ g.turns.add(er.turn);
648
+ g.tasks.add(taskKey);
649
+ if (g.exemplar == null) g.exemplar = er;
650
+ }
651
+ const topSignatures = [...bySig.values()]
652
+ .toSorted((a, b) => b.count - a.count)
653
+ .map((g) => ({
654
+ sig: g.sig,
655
+ count: g.count,
656
+ outcome: g.outcome,
657
+ sourcePathPattern: g.sourcePathPattern,
658
+ turns: [...g.turns].toSorted((a, b) => a - b),
659
+ tasks: [...g.tasks].toSorted(),
660
+ exemplar: g.exemplar,
661
+ }));
662
+ return {
663
+ total,
664
+ strikes,
665
+ soft,
666
+ byOutcome: Object.fromEntries(
667
+ [...byOutcome.entries()].toSorted((a, b) => b[1] - a[1]),
668
+ ),
669
+ byTask: Object.fromEntries(
670
+ [...byTask.entries()].toSorted((a, b) => b[1] - a[1]),
671
+ ),
672
+ topSignatures,
673
+ };
674
+ }
675
+
676
/**
 * Prefix every "\n"-separated line of `text` with `indent`.
 *
 * @param {string} text
 * @param {string} [indent=" "]
 * @returns {string} The indented text; "" for falsy input.
 */
function indentBlock(text, indent = " ") {
  if (!text) return "";
  const indented = [];
  for (const line of text.split("\n")) {
    indented.push(`${indent}${line}`);
  }
  return indented.join("\n");
}
683
+
684
+ // Compress runs of consecutive turn numbers into "first-last" form so a
685
+ // 25-occurrence error spanning turns 96-120 reads as `96-120` instead of
686
+ // hogging a line. Mixed sparse + run sequences come out as
687
+ // `1, 4, 96-120`.
688
+ function compressTurns(turns) {
689
+ if (turns.length === 0) return "";
690
+ const sorted = [...turns].toSorted((a, b) => a - b);
691
+ const out = [];
692
+ let runStart = sorted[0];
693
+ let prev = sorted[0];
694
+ for (let i = 1; i <= sorted.length; i++) {
695
+ const cur = sorted[i];
696
+ if (cur === prev + 1) {
697
+ prev = cur;
698
+ continue;
699
+ }
700
+ out.push(runStart === prev ? `${runStart}` : `${runStart}-${prev}`);
701
+ runStart = cur;
702
+ prev = cur;
703
+ }
704
+ return out.join(", ");
705
+ }
706
+
707
// Build the errors.md report for one scope. Sections, in order: a headline
// count, counts by outcome, counts by task, one block per top signature
// (with its exemplar body and, when present, the source action), and a
// chronological listing grouped by task (plus `/alias` when the run has
// one). With no errors the report is just the title and a one-line notice.
function renderErrorsMarkdown(scopeName, allErrors, summary) {
	const out = [`# Errors: ${scopeName}`, ""];
	if (allErrors.length === 0) {
		out.push("No errors recorded.");
		return out.join("\n");
	}

	out.push(
		`${summary.total} errors across ${Object.keys(summary.byTask).length} task(s) — ${summary.strikes} strike, ${summary.soft} soft.`,
		"",
		"## Counts by outcome",
		"",
	);
	for (const [outcome, count] of Object.entries(summary.byOutcome)) {
		out.push(`- \`${outcome}\` × ${count}`);
	}

	out.push("", "## Counts by task", "");
	for (const [taskName, count] of Object.entries(summary.byTask)) {
		out.push(`- \`${taskName}\` × ${count}`);
	}

	out.push("", "## Top signatures", "");
	for (const group of summary.topSignatures) {
		const location = group.sourcePathPattern
			? ` @ \`${group.sourcePathPattern}\``
			: "";
		out.push(
			`### ×${group.count} — \`${group.outcome ?? "—"}\`${location}`,
			"",
			`turns: ${compressTurns(group.turns)}`,
		);
		// The task list is only shown when the signature spans several tasks.
		if (group.tasks.length > 1) {
			const quoted = group.tasks.map((t) => `\`${t}\``);
			out.push(`tasks: ${quoted.join(", ")}`);
		}
		out.push("", "error body:", "", "```", group.exemplar.body, "```");
		const action = group.exemplar.sourceAction;
		if (action) {
			const arrow = action.targetPath ? ` → \`${action.targetPath}\`` : "";
			out.push(
				"",
				`source action (\`${action.action}\`${arrow}):`,
				"",
				"```",
				action.body,
				"```",
			);
		}
		out.push("");
	}

	out.push("## Chronological", "");
	// Group by task key while preserving first-seen order.
	const grouped = new Map();
	for (const err of allErrors) {
		const suffix = err.run.alias ? `/${err.run.alias}` : "";
		const key = `${err.run.task}${suffix}`;
		const bucket = grouped.get(key);
		if (bucket) {
			bucket.push(err);
		} else {
			grouped.set(key, [err]);
		}
	}
	for (const [taskKey, taskErrors] of grouped) {
		out.push(`### ${taskKey}`, "");
		for (const err of taskErrors) {
			const severity = err.soft ? "soft" : "strike";
			const outcome = err.outcome ?? "—";
			const location = err.sourcePath ? ` @ \`${err.sourcePath}\`` : "";
			out.push(
				`- T${err.turn} \`${outcome}\`/${severity}${location}`,
				"",
				indentBlock(err.body, " > "),
				"",
			);
			const action = err.sourceAction;
			if (action) {
				const arrow = action.targetPath ? ` → \`${action.targetPath}\`` : "";
				out.push(
					` source: \`${action.action}\`${arrow}`,
					"",
					indentBlock(action.body, " > "),
					"",
				);
			}
		}
	}
	return out.join("\n");
}
793
+
794
// Write the two error artifacts for a scope into `outDir`: errors.md (the
// rendered report) and errors.json (scope name, aggregate summary, and the
// raw error list). Both files end with a trailing newline.
function writeErrorsArtifacts(outDir, scopeName, allErrors) {
	const summary = aggregateErrors(allErrors);

	const markdown = renderErrorsMarkdown(scopeName, allErrors, summary);
	writeFileSync(join(outDir, "errors.md"), `${markdown}\n`);

	const payload = { scope: scopeName, summary, errors: allErrors };
	const json = JSON.stringify(payload, null, 2);
	writeFileSync(join(outDir, "errors.json"), `${json}\n`);
}
805
+
806
+ // Synthesize a run-summary from the DB for runs that lack the harbor-side
807
+ // rummy.txt `__RUMMY_RUN_SUMMARY__` line (e2e tests, primarily). Token
808
+ // counts come from the turns table; status from the runs table.
809
// Build a run summary purely from DB rows: token totals are summed over
// `turnsRows` (missing or falsy counts add 0), `status` is taken from the
// run row (null when absent), and `turns` is the row count. `cost` is
// always 0 and `wallSeconds` always null in a synthesized summary.
function synthSummary(run, turnsRows) {
	const columns = [
		["prompt", "prompt_tokens"],
		["completion", "completion_tokens"],
		["cached", "cached_tokens"],
		["reasoning", "reasoning_tokens"],
	];
	const tokens = { prompt: 0, completion: 0, cached: 0, reasoning: 0 };
	for (const row of turnsRows) {
		for (const [key, column] of columns) {
			tokens[key] += row[column] || 0;
		}
	}
	return {
		status: run.status ?? null,
		turns: turnsRows.length,
		tokens,
		cost: 0,
		wallSeconds: null,
	};
}
828
+
829
// Digest one task directory and return one digest record per run found in
// its agent DB.
//
// The task name is `taskNameOverride` when given; otherwise it is the last
// path segment of `taskDir` with a trailing `.digest` and a trailing
// `__<alnum>` suffix removed.
//
// When no agent DB is found, an empty `digest_skipped` marker file is
// created in `taskDir` and a single placeholder record tagged `exfil_fail`
// is returned.
//
// A DB with one run writes its artifacts directly into `taskDir`; a DB with
// several runs writes each run's artifacts into `<taskDir>/<run.alias>/`
// and names the run `<task>/<alias>`.
function processTask(taskDir, taskNameOverride = null) {
	let taskName = taskNameOverride;
	if (taskName == null) {
		const leaf = taskDir.replace(/\/+$/, "").split("/").pop();
		taskName = leaf.replace(/\.digest$/, "").replace(/__[A-Za-z0-9]+$/, "");
	}

	const rummyDb = findAgentDb(taskDir);
	if (rummyDb == null) {
		closeSync(openSync(join(taskDir, "digest_skipped"), "w"));
		const skipped = {
			task: taskName,
			dir: taskDir,
			reward: readReward(taskDir),
			status: null,
			turns: 0,
			tokens: null,
			cost: null,
			wallSeconds: null,
			markers: ["exfil_fail"],
			prompt: null,
			turnRows: [],
			errors: [],
		};
		return [skipped];
	}

	const reward = readReward(taskDir);
	const harborSummary = readRunSummary(taskDir);
	const perRun = readDb(rummyDb);
	const multiRun = perRun.length > 1;

	const digests = [];
	for (const record of perRun) {
		const { run, turns, logEntries, packetEntries, prompt } = record;
		const turnRows = buildTurns(turns, logEntries);
		const turnPackets = groupPacketsByTurn(packetEntries);

		// The harbor-side summary is only used for a single-run task dir;
		// otherwise the summary is synthesized from the DB rows.
		let runSummary;
		if (!multiRun && harborSummary) {
			runSummary = harborSummary;
		} else {
			runSummary = synthSummary(run, turns);
		}
		const markers = classifyMarkers(reward, runSummary, turnRows);

		let runName = taskName;
		let outDir = taskDir;
		if (multiRun) {
			runName = `${taskName}/${run.alias}`;
			outDir = join(taskDir, run.alias);
		}
		mkdirSync(outDir, { recursive: true });

		const waterfall = renderWaterfall(
			runName,
			prompt,
			runSummary,
			reward,
			turnRows,
			markers,
		);
		writeFileSync(join(outDir, "digest.md"), `${waterfall}\n`);

		const reasoning = renderReasoning(runName, turnRows);
		writeFileSync(join(outDir, "reasoning.md"), `${reasoning}\n`);

		const packets = renderPackets(runName, turnPackets);
		writeFileSync(join(outDir, "packets.md"), `${packets}\n`);

		const digest = digestJson({
			taskName: runName,
			taskDir: outDir,
			prompt,
			runSummary,
			reward,
			turnRows,
			markers,
		});
		const runRef = { task: taskName, alias: multiRun ? run.alias : null };
		digest.errors = collectErrors(turnRows, logEntries, runRef);

		const serialized = JSON.stringify(digest, null, 2);
		writeFileSync(join(outDir, "digest.json"), `${serialized}\n`);
		digests.push(digest);
	}
	return digests;
}
917
+
918
// Format one value as a CSV field. null/undefined become an empty field;
// everything else is stringified. A field containing a comma, a double
// quote, or a line break (LF or CR) is wrapped in double quotes, with
// embedded quotes doubled. CR is included because a bare carriage return
// inside an unquoted field is read as a row break by CSV consumers.
function csvEscape(s) {
	if (s == null) return "";
	const str = String(s);
	if (/[,"\r\n]/.test(str)) return `"${str.replace(/"/g, '""')}"`;
	return str;
}
924
+
925
// Write `<sweepDir>/index.csv`: a header line followed by one row per
// digest, ordered by task name (a missing task sorts as ""). Token columns
// are blank when a digest has no token data; markers are joined with ";".
function writeIndex(sweepDir, digests) {
	const columns = [
		"task",
		"reward",
		"status",
		"turns",
		"prompt_tokens",
		"completion_tokens",
		"cached_tokens",
		"cost",
		"wall_seconds",
		"markers",
	];
	const toRow = (d) => {
		const cells = [
			d.task,
			d.reward,
			d.status,
			d.turns,
			d.tokens?.prompt ?? "",
			d.tokens?.completion ?? "",
			d.tokens?.cached ?? "",
			d.cost,
			d.wallSeconds,
			d.markers.join(";"),
		];
		return cells.map((cell) => csvEscape(cell)).join(",");
	};
	const byTaskName = (a, b) => (a.task ?? "").localeCompare(b.task ?? "");

	const header = columns.join(",");
	const rows = digests.toSorted(byTaskName).map(toRow);
	writeFileSync(join(sweepDir, "index.csv"), `${header}\n${rows.join("\n")}\n`);
}
956
+
957
+ // CLI: first positional arg is a `.db` file, a task dir, or a sweep dir.
958
+ // Second positional arg (bare-DB mode only) selects which run becomes
959
+ // the "featured" run promoted to test/digest/ — exact or substring
960
+ // match against run.alias. Default: the most-recent (highest-id) run.
961
+ // With no arg, default to the latest sweep under test/tbench/results.
962
// Resolve the path to digest. An explicit first argument is used as-is
// when it starts with "/", otherwise it is joined onto the current working
// directory. With no argument, the lexicographically last subdirectory of
// RESULTS_DIR is used. Exits with status 2 when nothing usable is found.
const target = process.argv[2];
const runSelector = process.argv[3] ?? null;
let entry;
if (!target) {
	const isSweepDir = (name) => statSync(join(RESULTS_DIR, name)).isDirectory();
	const sweeps = readdirSync(RESULTS_DIR).filter(isSweepDir).sort();
	if (sweeps.length === 0) {
		console.error("no sweep dir found");
		process.exit(2);
	}
	entry = join(RESULTS_DIR, sweeps.at(-1));
} else if (target.startsWith("/")) {
	entry = target;
} else {
	entry = join(process.cwd(), target);
}

if (!existsSync(entry)) {
	console.error(`not found: ${entry}`);
	process.exit(2);
}
982
+
983
+ // Bare-DB invocation: `node bin/digest.js /path/to/rummy.db`. The full
984
+ // pile (per-alias subdirs for multi-run DBs) lands in /tmp/rummy_digest/.
985
+ // We synthesize a task dir there so processTask's path discipline
986
+ // (agent/rummy.db, reward.txt-optional, rummy.txt-optional) keeps
987
+ // holding. After processTask runs, PUBLIC_DIR gets a flat copy of the
988
+ // most recent run's artifacts (see end of file).
989
// When `entry` is a regular file ending in `.db`, stage it as a synthetic
// task dir: PILE_DIR is wiped, the DB is placed at PILE_DIR/agent/rummy.db,
// and `entry` is redirected to PILE_DIR. `bareDbName` keeps the DB's file
// name without the `.db` extension; it stays null for directory inputs.
let bareDbName = null;
if (statSync(entry).isFile() && /\.db$/.test(entry)) {
	const fs = await import("node:fs");
	const dbFileName = entry.split("/").pop();
	bareDbName = dbFileName.replace(/\.db$/, "");

	fs.rmSync(PILE_DIR, { recursive: true, force: true });
	const agentDir = join(PILE_DIR, "agent");
	mkdirSync(agentDir, { recursive: true });
	const linkedDb = join(agentDir, "rummy.db");

	// Try a hard link first (no data is copied; the link is a second name
	// for the same file). If linking fails for any reason, e.g. across
	// devices or on a restricted filesystem, fall back to a plain copy.
	try {
		fs.linkSync(entry, linkedDb);
	} catch {
		fs.copyFileSync(entry, linkedDb);
	}
	entry = PILE_DIR;
}
1005
+
1006
// Dispatch on what `entry` is. A task dir (including the synthesized
// bare-DB pile) is digested on its own; anything else is treated as a
// sweep dir whose task dirs are digested one by one and indexed.
if (isTaskDir(entry)) {
	const digests = processTask(entry, bareDbName);
	const allErrors = digests.flatMap((d) => d.errors ?? []);
	writeErrorsArtifacts(entry, bareDbName ?? entry.split("/").pop(), allErrors);
	for (const d of digests) {
		console.log(`wrote digest for ${d.task}: ${d.markers.join(", ")}`);
	}
	console.log(
		`wrote errors.md + errors.json (${allErrors.length} errors) → ${entry}/`,
	);

	// Bare-DB mode: copy one "featured" run's flat artifacts into PUBLIC_DIR
	// so they can be read without digging through the pile. The featured
	// run is the last digest returned by processTask unless a run selector
	// was given on the command line. The full pile stays where it is.
	if (bareDbName) {
		const { rmSync, copyFileSync: cp } = await import("node:fs");
		rmSync(PUBLIC_DIR, { recursive: true, force: true });
		mkdirSync(PUBLIC_DIR, { recursive: true });

		// A DB with no runs yields no digests; fail with a message instead
		// of a TypeError when dereferencing the featured run below.
		if (digests.length === 0) {
			console.error(`no runs found in ${bareDbName}.db; nothing to feature`);
			process.exit(2);
		}

		let featured = digests.at(-1);
		if (runSelector) {
			// An exact match (on the whole run name or on its trailing
			// `/alias` segment) wins over substring matching, so an alias
			// that is a prefix of another (`run_1` vs `run_10`) stays
			// selectable. Substring matching is the fallback.
			const exact = digests.filter(
				(d) => d.task === runSelector || d.task.endsWith(`/${runSelector}`),
			);
			const matches =
				exact.length === 1
					? exact
					: digests.filter((d) => d.task.includes(runSelector));
			if (matches.length === 0) {
				console.error(
					`run selector "${runSelector}" matched no run in this DB. ` +
						`Available: ${digests.map((d) => d.task).join(", ")}`,
				);
				process.exit(2);
			}
			if (matches.length > 1) {
				console.error(
					`run selector "${runSelector}" matched ${matches.length} runs: ` +
						matches.map((d) => d.task).join(", "),
				);
				process.exit(2);
			}
			featured = matches[0];
		}

		// Per-run artifacts come from the featured run's directory; the
		// errors artifacts are scope-wide and come from `entry`.
		for (const f of ["digest.md", "reasoning.md", "packets.md", "digest.json"]) {
			const src = join(featured.dir, f);
			if (existsSync(src)) cp(src, join(PUBLIC_DIR, f));
		}
		for (const f of ["errors.md", "errors.json"]) {
			const src = join(entry, f);
			if (existsSync(src)) cp(src, join(PUBLIC_DIR, f));
		}
		console.log(`featured run (${featured.task}) → ${PUBLIC_DIR}/digest.md`);
	}
} else {
	const taskDirs = findTaskDirs(entry);
	if (taskDirs.length === 0) {
		console.error(`no task dirs (with agent/rummy*.db) under ${entry}`);
		process.exit(2);
	}
	const digests = [];
	for (const td of taskDirs) {
		try {
			digests.push(...processTask(td));
		} catch (err) {
			// One failing task must not abort the sweep: report it and
			// record a placeholder row tagged `digest_failed`.
			console.error(`! ${relative(entry, td)}: ${err.message}`);
			digests.push({
				task: td
					.split("/")
					.pop()
					.replace(/__[A-Za-z0-9]+$/, ""),
				dir: td,
				reward: null,
				status: null,
				turns: 0,
				tokens: null,
				cost: null,
				wallSeconds: null,
				markers: ["digest_failed"],
				prompt: null,
				turnRows: [],
				errors: [],
			});
		}
	}
	writeIndex(entry, digests);
	const allErrors = digests.flatMap((d) => d.errors ?? []);
	writeErrorsArtifacts(entry, entry.split("/").pop(), allErrors);
	console.log(
		`wrote ${digests.length} digests + index.csv + errors.md (${allErrors.length} errors) → ${entry}/`,
	);
}