akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/dist/src/cli.js +100 -16
  2. package/dist/src/commands/config-cli.js +42 -0
  3. package/dist/src/commands/history.js +78 -7
  4. package/dist/src/commands/registry-search.js +69 -6
  5. package/dist/src/commands/search.js +30 -3
  6. package/dist/src/commands/show.js +29 -0
  7. package/dist/src/commands/source-add.js +5 -1
  8. package/dist/src/commands/source-manage.js +7 -1
  9. package/dist/src/core/config.js +28 -0
  10. package/dist/src/indexer/db-search.js +1 -0
  11. package/dist/src/indexer/indexer.js +16 -2
  12. package/dist/src/indexer/matchers.js +1 -1
  13. package/dist/src/indexer/search-source.js +4 -2
  14. package/dist/src/integrations/agent/profiles.js +1 -1
  15. package/dist/src/integrations/agent/spawn.js +67 -16
  16. package/dist/src/integrations/github.js +9 -3
  17. package/dist/src/llm/embedders/remote.js +37 -3
  18. package/dist/src/output/cli-hints.js +15 -2
  19. package/dist/src/output/renderers.js +3 -1
  20. package/dist/src/output/shapes.js +8 -1
  21. package/dist/src/output/text.js +156 -3
  22. package/dist/src/registry/build-index.js +5 -4
  23. package/dist/src/registry/providers/static-index.js +3 -1
  24. package/dist/src/setup/setup.js +9 -0
  25. package/dist/src/wiki/wiki.js +54 -6
  26. package/dist/src/workflows/runs.js +37 -3
  27. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
  28. package/dist/tests/bench/attribution.test.js +24 -23
  29. package/dist/tests/bench/cleanup.js +31 -0
  30. package/dist/tests/bench/cli.js +366 -31
  31. package/dist/tests/bench/cli.test.js +282 -14
  32. package/dist/tests/bench/corpus.js +3 -0
  33. package/dist/tests/bench/corpus.test.js +10 -10
  34. package/dist/tests/bench/doctor.js +525 -0
  35. package/dist/tests/bench/driver.js +77 -22
  36. package/dist/tests/bench/driver.test.js +142 -1
  37. package/dist/tests/bench/environment.js +233 -0
  38. package/dist/tests/bench/environment.test.js +199 -0
  39. package/dist/tests/bench/evolve.js +67 -0
  40. package/dist/tests/bench/evolve.test.js +12 -4
  41. package/dist/tests/bench/failure-modes.test.js +52 -3
  42. package/dist/tests/bench/feedback-integrity.test.js +3 -2
  43. package/dist/tests/bench/leakage.test.js +105 -2
  44. package/dist/tests/bench/learning-curve.test.js +3 -2
  45. package/dist/tests/bench/metrics.js +102 -26
  46. package/dist/tests/bench/metrics.test.js +10 -4
  47. package/dist/tests/bench/opencode-config.js +194 -0
  48. package/dist/tests/bench/opencode-config.test.js +370 -0
  49. package/dist/tests/bench/report.js +73 -9
  50. package/dist/tests/bench/report.test.js +59 -10
  51. package/dist/tests/bench/run-config.js +355 -0
  52. package/dist/tests/bench/run-config.test.js +298 -0
  53. package/dist/tests/bench/run-curate-test.js +32 -0
  54. package/dist/tests/bench/run-failing-tasks.js +56 -0
  55. package/dist/tests/bench/run-full-bench.js +51 -0
  56. package/dist/tests/bench/run-items36-targeted.js +69 -0
  57. package/dist/tests/bench/run-nano-quick.js +42 -0
  58. package/dist/tests/bench/run-waveg-targeted.js +62 -0
  59. package/dist/tests/bench/runner.js +257 -94
  60. package/dist/tests/bench/tmp.js +90 -0
  61. package/dist/tests/bench/trajectory.js +2 -2
  62. package/dist/tests/bench/verifier.js +6 -1
  63. package/dist/tests/bench/workflow-spec.js +11 -24
  64. package/dist/tests/bench/workflow-spec.test.js +1 -1
  65. package/dist/tests/bench/workflow-trace.js +34 -0
  66. package/dist/tests/cli-errors.test.js +1 -0
  67. package/dist/tests/commands/history.test.js +195 -0
  68. package/dist/tests/config.test.js +25 -0
  69. package/dist/tests/e2e.test.js +23 -2
  70. package/dist/tests/fixtures/stashes/load.js +1 -1
  71. package/dist/tests/fixtures/stashes/load.test.js +11 -2
  72. package/dist/tests/indexer.test.js +12 -1
  73. package/dist/tests/output-baseline.test.js +2 -1
  74. package/dist/tests/output-shapes-unit.test.js +3 -1
  75. package/dist/tests/registry-build-index.test.js +17 -1
  76. package/dist/tests/registry-providers/static-index.test.js +34 -0
  77. package/dist/tests/registry-search.test.js +200 -0
  78. package/dist/tests/remember-frontmatter.test.js +11 -13
  79. package/dist/tests/source-qa-fixes.test.js +18 -0
  80. package/dist/tests/source-registry.test.js +3 -3
  81. package/dist/tests/source-source.test.js +61 -1
  82. package/dist/tests/workflow-qa-fixes.test.js +18 -0
  83. package/package.json +1 -1
@@ -0,0 +1,370 @@
1
+ /**
2
+ * Tests for the bench opencode-config module.
3
+ *
4
+ * Covers all cases described in the design spec:
5
+ * - loads canonical fixture without error
6
+ * - rejects literal apiKey (not env-ref)
7
+ * - accepts {env:VAR} apiKey form
8
+ * - rejects sk-XXXX credential heuristic anywhere in tree
9
+ * - rejects top-level plugin / mcp / permission keys
10
+ * - rejects unknown schemaVersion
11
+ * - isUsageError: true when file missing
12
+ * - selectProviderForModel picks correct provider
13
+ * - selectProviderForModel throws on unknown provider prefix
14
+ * - materializeOpencodeConfig writes $schema, model, plugin, permission + the selected provider (no mcp), mode 0o600
15
+ */
16
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
17
+ import fs from "node:fs";
18
+ import path from "node:path";
19
+ import { BenchConfigError, loadOpencodeProviders, materializeOpencodeConfig, selectProviderForModel, } from "./opencode-config";
20
+ import { benchMkdtemp } from "./tmp";
21
+ /** Absolute path to the committed fixture. */
22
+ const FIXTURE_PATH = path.resolve(__dirname, "..", "fixtures", "bench", "opencode-providers.json");
23
+ /**
+ * Write a temp JSON file and return its path.
+ *
+ * `content` is serialized with JSON.stringify and written to `name` inside
+ * `dir`; the returned value is the joined path of the two.
+ */
24
+ function writeTmp(dir, name, content) {
25
+ const p = path.join(dir, name);
26
+ fs.writeFileSync(p, JSON.stringify(content));
27
+ return p;
28
+ }
29
+ describe("loadOpencodeProviders", () => {
30
+ let tmp;
31
+ beforeAll(() => {
32
+ tmp = benchMkdtemp("bench-opencode-config-test-");
33
+ });
34
+ afterAll(() => {
35
+ fs.rmSync(tmp, { recursive: true, force: true });
36
+ });
37
+ // ── Canonical fixture ─────────────────────────────────────────────────────
38
+ test("loads the canonical committed fixture without error", () => {
39
+ expect(() => loadOpencodeProviders(FIXTURE_PATH)).not.toThrow();
40
+ const loaded = loadOpencodeProviders(FIXTURE_PATH);
41
+ expect(loaded.source).toBe(FIXTURE_PATH);
42
+ expect(loaded.providers).toBeDefined();
43
+ expect(typeof loaded.providers).toBe("object");
44
+ expect(loaded.defaultModel).toBe("local/qwen/qwen3.5-9b");
45
+ expect("local" in loaded.providers).toBe(true);
46
+ });
47
+ // ── File not found ────────────────────────────────────────────────────────
48
+ test("throws BenchConfigError with isUsageError: true when file does not exist", () => {
49
+ const missing = path.join(tmp, "does-not-exist.json");
50
+ let err;
51
+ try {
52
+ loadOpencodeProviders(missing);
53
+ }
54
+ catch (e) {
55
+ err = e;
56
+ }
57
+ expect(err).toBeInstanceOf(BenchConfigError);
58
+ const bce = err;
59
+ expect(bce.code).toBe("BENCH_CONFIG");
60
+ expect(bce.isUsageError).toBe(true);
61
+ expect(bce.message).toContain("not found");
62
+ });
63
+ // ── JSON parse failure ────────────────────────────────────────────────────
64
+ test("throws BenchConfigError with isUsageError: false on malformed JSON", () => {
65
+ const p = path.join(tmp, "bad.json");
66
+ fs.writeFileSync(p, "{ this is not json }");
67
+ let err;
68
+ try {
69
+ loadOpencodeProviders(p);
70
+ }
71
+ catch (e) {
72
+ err = e;
73
+ }
74
+ expect(err).toBeInstanceOf(BenchConfigError);
75
+ expect(err.isUsageError).toBe(false);
76
+ expect(err.message).toContain("JSON parse error");
77
+ });
78
+ // ── schemaVersion ─────────────────────────────────────────────────────────
79
+ test("rejects unknown schemaVersion", () => {
80
+ const p = writeTmp(tmp, "bad-version.json", {
81
+ schemaVersion: 2,
82
+ providers: {},
83
+ });
84
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
85
+ let err;
86
+ try {
87
+ loadOpencodeProviders(p);
88
+ }
89
+ catch (e) {
90
+ if (e instanceof BenchConfigError)
91
+ err = e;
92
+ }
93
+ expect(err?.isUsageError).toBe(false);
94
+ expect(err?.message).toContain("schemaVersion");
95
+ });
96
+ test("rejects schemaVersion: 0", () => {
97
+ const p = writeTmp(tmp, "version-0.json", { schemaVersion: 0, providers: {} });
98
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
99
+ });
100
+ // ── Forbidden top-level keys ──────────────────────────────────────────────
101
+ test("rejects top-level 'plugin' key", () => {
102
+ const p = writeTmp(tmp, "has-plugin.json", {
103
+ schemaVersion: 1,
104
+ providers: {},
105
+ plugin: [],
106
+ });
107
+ let err;
108
+ try {
109
+ loadOpencodeProviders(p);
110
+ }
111
+ catch (e) {
112
+ if (e instanceof BenchConfigError)
113
+ err = e;
114
+ }
115
+ expect(err).toBeDefined();
116
+ expect(err?.isUsageError).toBe(false);
117
+ expect(err?.message).toContain("plugin");
118
+ });
119
+ test("rejects top-level 'mcp' key", () => {
120
+ const p = writeTmp(tmp, "has-mcp.json", {
121
+ schemaVersion: 1,
122
+ providers: {},
123
+ mcp: {},
124
+ });
125
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
126
+ });
127
+ test("rejects top-level 'permission' key", () => {
128
+ const p = writeTmp(tmp, "has-permission.json", {
129
+ schemaVersion: 1,
130
+ providers: {},
131
+ permission: {},
132
+ });
133
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
134
+ });
135
+ test("rejects top-level 'disabled_providers' key", () => {
136
+ const p = writeTmp(tmp, "has-disabled.json", {
137
+ schemaVersion: 1,
138
+ providers: {},
139
+ disabled_providers: [],
140
+ });
141
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
142
+ });
143
+ test("rejects top-level 'small_model' key", () => {
144
+ const p = writeTmp(tmp, "has-small-model.json", {
145
+ schemaVersion: 1,
146
+ providers: {},
147
+ small_model: "anthropic/claude-haiku-4-5",
148
+ });
149
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
150
+ });
151
+ test("rejects top-level 'snapshot' key", () => {
152
+ const p = writeTmp(tmp, "has-snapshot.json", {
153
+ schemaVersion: 1,
154
+ providers: {},
155
+ snapshot: true,
156
+ });
157
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
158
+ });
159
+ // ── apiKey validation ─────────────────────────────────────────────────────
160
+ test("rejects literal apiKey string (not an env-ref)", () => {
161
+ const p = writeTmp(tmp, "literal-apikey.json", {
162
+ schemaVersion: 1,
163
+ providers: {
164
+ myProvider: {
165
+ apiKey: "not-an-env-ref",
166
+ },
167
+ },
168
+ });
169
+ let err;
170
+ try {
171
+ loadOpencodeProviders(p);
172
+ }
173
+ catch (e) {
174
+ if (e instanceof BenchConfigError)
175
+ err = e;
176
+ }
177
+ expect(err).toBeDefined();
178
+ expect(err?.isUsageError).toBe(false);
179
+ expect(err?.message).toContain("apiKey");
180
+ expect(err?.message).toContain("env-ref");
181
+ });
182
+ test("accepts {env:VAR} form for apiKey", () => {
183
+ const p = writeTmp(tmp, "env-ref-apikey.json", {
184
+ schemaVersion: 1,
185
+ providers: {
186
+ myProvider: {
187
+ npm: "@ai-sdk/openai-compatible",
188
+ apiKey: "{env:MY_API_KEY}",
189
+ options: { baseURL: "http://localhost:1234/v1" },
190
+ },
191
+ },
192
+ });
193
+ expect(() => loadOpencodeProviders(p)).not.toThrow();
194
+ const loaded = loadOpencodeProviders(p);
195
+ expect("myProvider" in loaded.providers).toBe(true);
196
+ });
197
+ test("accepts {env:UNDERSCORE_KEY_123} env-ref form", () => {
198
+ const p = writeTmp(tmp, "env-ref-underscore.json", {
199
+ schemaVersion: 1,
200
+ providers: {
201
+ p: { apiKey: "{env:MY_KEY_123}" },
202
+ },
203
+ });
204
+ expect(() => loadOpencodeProviders(p)).not.toThrow();
205
+ });
206
+ test("rejects apiKey starting with lowercase (not a valid env-ref)", () => {
207
+ const p = writeTmp(tmp, "bad-env-ref.json", {
208
+ schemaVersion: 1,
209
+ providers: {
210
+ p: { apiKey: "{env:my_lowercase_key}" },
211
+ },
212
+ });
213
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
214
+ });
215
+ // ── Credential heuristic ──────────────────────────────────────────────────
216
+ test("rejects sk-XXXX credential anywhere in the providers tree", () => {
217
+ const p = writeTmp(tmp, "has-sk-key.json", {
218
+ schemaVersion: 1,
219
+ providers: {
220
+ openai: {
221
+ npm: "@ai-sdk/openai",
222
+ secret: "sk-abcdefghijklmnopqrstuvwxyz0123456789",
223
+ },
224
+ },
225
+ });
226
+ let err;
227
+ try {
228
+ loadOpencodeProviders(p);
229
+ }
230
+ catch (e) {
231
+ if (e instanceof BenchConfigError)
232
+ err = e;
233
+ }
234
+ expect(err).toBeDefined();
235
+ expect(err?.isUsageError).toBe(false);
236
+ expect(err?.message).toContain("credential heuristic");
237
+ });
238
+ test("rejects sk-XXXX credential in a nested object", () => {
239
+ const p = writeTmp(tmp, "nested-sk-key.json", {
240
+ schemaVersion: 1,
241
+ providers: {
242
+ p: {
243
+ options: {
244
+ headers: {
245
+ Authorization: "sk-proj-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
246
+ },
247
+ },
248
+ },
249
+ },
250
+ });
251
+ expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
252
+ });
253
+ // ── Valid minimal file ────────────────────────────────────────────────────
254
+ test("accepts a valid minimal file with no defaultModel", () => {
255
+ const p = writeTmp(tmp, "minimal.json", {
256
+ schemaVersion: 1,
257
+ providers: {
258
+ local: {
259
+ npm: "@ai-sdk/openai-compatible",
260
+ options: { baseURL: "http://localhost:1234/v1" },
261
+ },
262
+ },
263
+ });
264
+ const loaded = loadOpencodeProviders(p);
265
+ expect(loaded.defaultModel).toBeUndefined();
266
+ expect("local" in loaded.providers).toBe(true);
267
+ });
268
+ });
269
+ describe("selectProviderForModel", () => {
270
+ const loaded = {
271
+ source: "/fake/path.json",
272
+ providers: {
273
+ don: { npm: "@ai-sdk/openai-compatible", name: "Don LM Studio" },
274
+ ollama: { npm: "@ai-sdk/openai-compatible", name: "Ollama" },
275
+ },
276
+ defaultModel: "don/mlx-community/qwen3.6-35b-a3b",
277
+ };
278
+ test("splits on first slash and returns the correct provider entry", () => {
279
+ const result = selectProviderForModel(loaded, "don/mlx-community/qwen3.6-35b-a3b");
280
+ expect(result.providerKey).toBe("don");
281
+ expect(result.entry).toBe(loaded.providers.don);
282
+ });
283
+ test("handles a model with no slash (entire string is the provider key)", () => {
284
+ const result = selectProviderForModel(loaded, "ollama");
285
+ expect(result.providerKey).toBe("ollama");
286
+ expect(result.entry).toBe(loaded.providers.ollama);
287
+ });
288
+ test("throws BenchConfigError when provider key is not in loaded.providers", () => {
289
+ let err;
290
+ try {
291
+ selectProviderForModel(loaded, "unknown/some-model");
292
+ }
293
+ catch (e) {
294
+ if (e instanceof BenchConfigError)
295
+ err = e;
296
+ }
297
+ expect(err).toBeDefined();
298
+ expect(err?.code).toBe("BENCH_CONFIG");
299
+ expect(err?.isUsageError).toBe(false);
300
+ expect(err?.message).toContain("unknown");
301
+ expect(err?.message).toContain("provider key");
302
+ });
303
+ test("error message lists available provider keys", () => {
304
+ let err;
305
+ try {
306
+ selectProviderForModel(loaded, "missing/model");
307
+ }
308
+ catch (e) {
309
+ if (e instanceof BenchConfigError)
310
+ err = e;
311
+ }
312
+ expect(err?.message).toContain("don");
313
+ expect(err?.message).toContain("ollama");
314
+ });
315
+ });
316
+ describe("materializeOpencodeConfig", () => {
317
+ let tmp;
318
+ beforeAll(() => {
319
+ tmp = benchMkdtemp("bench-materialize-test-");
320
+ });
321
+ afterAll(() => {
322
+ fs.rmSync(tmp, { recursive: true, force: true });
323
+ });
324
+ test("writes opencode.json with required bench isolation invariants and provider", () => {
325
+ const configDir = path.join(tmp, "run-config");
326
+ fs.mkdirSync(configDir, { recursive: true });
327
+ const entry = { npm: "@ai-sdk/openai-compatible", name: "Test Provider" };
328
+ materializeOpencodeConfig(configDir, { providerKey: "test", entry }, "test/my-model");
329
+ const outPath = path.join(configDir, "opencode.json");
330
+ expect(fs.existsSync(outPath)).toBe(true);
331
+ const contents = JSON.parse(fs.readFileSync(outPath, "utf8"));
332
+ expect(contents.model).toBe("test/my-model");
333
+ expect(contents.$schema).toBe("https://opencode.ai/config.json");
334
+ // Bench isolation invariants: plugin:[] prevents operator plugin interference;
335
+ // permission block ensures opencode run (non-interactive) allows bash/file tools.
336
+ expect(contents.plugin).toEqual([]);
337
+ expect(contents.permission?.bash).toBe("allow");
338
+ // Provider block is written correctly.
339
+ const provider = contents.provider;
340
+ expect(Object.keys(provider)).toEqual(["test"]);
341
+ expect(provider.test).toEqual(entry);
342
+ });
343
+ test("does not write mcp into the config", () => {
344
+ const configDir = path.join(tmp, "run-config-2");
345
+ fs.mkdirSync(configDir, { recursive: true });
346
+ materializeOpencodeConfig(configDir, { providerKey: "p", entry: {} }, "p/model");
347
+ const contents = JSON.parse(fs.readFileSync(path.join(configDir, "opencode.json"), "utf8"));
348
+ expect(contents.mcp).toBeUndefined();
349
+ });
350
+ test("writes the file with mode 0o600 (not world-readable)", () => {
351
+ const configDir = path.join(tmp, "run-config-3");
352
+ fs.mkdirSync(configDir, { recursive: true });
353
+ materializeOpencodeConfig(configDir, { providerKey: "p", entry: {} }, "p/model");
354
+ const stat = fs.statSync(path.join(configDir, "opencode.json"));
355
+ // Mode 0o600 means only owner can read/write (no group or other bits).
356
+ // On Linux/macOS the lower 9 bits are 0o600 = 0b110000000 in binary.
357
+ const mode = stat.mode & 0o777;
358
+ expect(mode).toBe(0o600);
359
+ });
360
+ test("can be called twice (overwrites an existing opencode.json)", () => {
361
+ const configDir = path.join(tmp, "run-config-4");
362
+ fs.mkdirSync(configDir, { recursive: true });
363
+ materializeOpencodeConfig(configDir, { providerKey: "a", entry: { name: "first" } }, "a/m1");
364
+ materializeOpencodeConfig(configDir, { providerKey: "b", entry: { name: "second" } }, "b/m2");
365
+ const contents = JSON.parse(fs.readFileSync(path.join(configDir, "opencode.json"), "utf8"));
366
+ const provider = contents.provider;
367
+ expect("b" in provider).toBe(true);
368
+ expect("a" in provider).toBe(false);
369
+ });
370
+ });
@@ -179,6 +179,12 @@ function buildUtilityJson(input) {
179
179
  if (input.allRuns) {
180
180
  envelope.runs = input.allRuns.map(serializeRunForReport);
181
181
  }
182
+ // Baseline pass-rate map — additive top-level key. Emitted only when the
183
+ // caller supplied a baseline through `loadBenchRunConfig`; legacy reports
184
+ // stay byte-identical without it.
185
+ if (input.baselineByTaskId) {
186
+ envelope.baseline_by_task_id = { ...input.baselineByTaskId };
187
+ }
182
188
  // Per-asset attribution is an additive top-level key (§6.5). Emit it only
183
189
  // when the runner populated it so older code paths (e.g. the empty-corpus
184
190
  // skeleton) don't gain the key spuriously.
@@ -229,6 +235,8 @@ function serialiseAkmOverheadPerRun(row) {
229
235
  search_count: row.searchCount,
230
236
  show_count: row.showCount,
231
237
  feedback_count: row.feedbackCount,
238
+ positive_feedback_count: row.positiveFeedbackCount,
239
+ negative_feedback_count: row.negativeFeedbackCount,
232
240
  total_tool_calls: row.totalToolCalls,
233
241
  assets_loaded_count: row.assetsLoadedCount,
234
242
  irrelevant_assets_loaded_count: row.irrelevantAssetsLoadedCount,
@@ -255,6 +263,12 @@ function serialiseAkmOverheadAggregate(agg) {
255
263
  total_tool_calls: agg.totalToolCalls,
256
264
  tool_calls_per_success: agg.toolCallsPerSuccess,
257
265
  cost_per_success: agg.costPerSuccess,
266
+ search_engagement_rate: agg.searchEngagementRate,
267
+ show_engagement_rate: agg.showEngagementRate,
268
+ feedback_engagement_rate: agg.feedbackEngagementRate,
269
+ search_to_show_ratio: agg.searchToShowRatio,
270
+ mean_positive_feedback_count: agg.meanPositiveFeedbackCount,
271
+ mean_negative_feedback_count: agg.meanNegativeFeedbackCount,
258
272
  };
259
273
  }
260
274
  /**
@@ -331,6 +345,7 @@ function serialiseCorpus(c) {
331
345
  return {
332
346
  pass_rate: c.passRate,
333
347
  tokens_per_pass: c.tokensPerPass,
348
+ tokens_per_run: c.tokensPerRun,
334
349
  wallclock_ms: c.wallclockMs,
335
350
  };
336
351
  }
@@ -338,6 +353,7 @@ function serialiseDelta(d) {
338
353
  return {
339
354
  pass_rate: d.passRate,
340
355
  tokens_per_pass: d.tokensPerPass,
356
+ tokens_per_run: d.tokensPerRun,
341
357
  wallclock_ms: d.wallclockMs,
342
358
  };
343
359
  }
@@ -426,6 +442,7 @@ function serialisePerTaskMetrics(m) {
426
442
  pass_rate: m.passRate,
427
443
  pass_at_1: m.passAt1,
428
444
  tokens_per_pass: m.tokensPerPass,
445
+ tokens_per_run: m.tokensPerRun,
429
446
  wallclock_ms: m.wallclockMs,
430
447
  pass_rate_stdev: m.passRateStdev,
431
448
  budget_exceeded_count: m.budgetExceededCount,
@@ -511,23 +528,43 @@ function buildUtilityMarkdown(input) {
511
528
  lines.push("");
512
529
  lines.push(`- correct_asset_loaded: ${formatPercent(input.trajectoryAkm.correctAssetLoaded)}`);
513
530
  lines.push(`- feedback_recorded: ${formatPercent(input.trajectoryAkm.feedbackRecorded)}`);
531
+ // Per-run trajectory detail: when allRuns is present emit a compact table
532
+ // so operators can distinguish null (harness error — no events captured)
533
+ // from false (agent ran, behaviour not observed) from true (confirmed).
534
+ // Symbols: "—" = null, "✗" = false, "✓" = true.
535
+ const akmRuns = (input.allRuns ?? []).filter((r) => r.arm === "akm");
536
+ if (akmRuns.length > 0) {
537
+ lines.push("");
538
+ lines.push("| task | seed | correct_asset_loaded | feedback_recorded |");
539
+ lines.push("|------|------|----------------------|-------------------|");
540
+ for (const r of akmRuns) {
541
+ lines.push(`| ${r.taskId} | ${r.seed} | ${formatTrajBool(r.trajectory.correctAssetLoaded)} | ${formatTrajBool(r.trajectory.feedbackRecorded)} |`);
542
+ }
543
+ }
514
544
  lines.push("");
515
545
  lines.push("## Per-task pass rates");
516
546
  lines.push("");
517
547
  // #261: synthetic column is rendered only when the synthetic arm ran.
518
548
  // The default header/row stays identical to the pre-#261 output.
519
- if (input.aggregateSynth) {
520
- lines.push("| task | noakm | synthetic | akm | delta |");
521
- lines.push("|------|-------|-----------|-----|-------|");
549
+ // Baseline column is rendered only when `baselineByTaskId` was supplied
550
+ // by the caller; legacy reports without it produce byte-identical output.
551
+ const includeSynthCol = input.aggregateSynth !== undefined;
552
+ const baselineMap = input.baselineByTaskId;
553
+ const includeBaselineCol = baselineMap !== undefined;
554
+ const baseColHeader = includeBaselineCol ? " baseline | vs base |" : "";
555
+ const baseColSep = includeBaselineCol ? "----------|---------|" : "";
556
+ if (includeSynthCol) {
557
+ lines.push(`| task | noakm | synthetic | akm | delta |${baseColHeader}`);
558
+ lines.push(`|------|-------|-----------|-----|-------|${baseColSep}`);
522
559
  }
523
560
  else {
524
- lines.push("| task | noakm | akm | delta |");
525
- lines.push("|------|-------|-----|-------|");
561
+ lines.push(`| task | noakm | akm | delta |${baseColHeader}`);
562
+ lines.push(`|------|-------|-----|-------|${baseColSep}`);
526
563
  }
527
564
  // Sort tasks alphabetically for byte-stable markdown output.
528
565
  const sorted = [...input.tasks].sort((a, b) => a.id.localeCompare(b.id));
529
566
  for (const t of sorted) {
530
- lines.push(taskRow(t, input.aggregateSynth !== undefined));
567
+ lines.push(taskRow(t, includeSynthCol, baselineMap));
531
568
  }
532
569
  // Corpus-coverage section (#262). Renders only when at least one task was
533
570
  // tagged with a `memory_ability`; without tags the section adds no signal
@@ -650,15 +687,29 @@ function deltaRow(d) {
650
687
  const tpp = d.tokensPerPass === null ? "n/a" : signed(d.tokensPerPass.toFixed(0));
651
688
  return `| **delta** | ${signed(d.passRate.toFixed(2))} | ${tpp} | ${signed(d.wallclockMs.toFixed(0))} |`;
652
689
  }
653
- function taskRow(t, includeSynthetic = false) {
690
+ function taskRow(t, includeSynthetic = false, baselineByTaskId) {
691
+ // Baseline cells are rendered only when a baseline map is provided.
692
+ // Tasks without an entry in that map get a pair of "n/a" cells so
693
+ // columns stay aligned.
694
+ let baselineCells = "";
695
+ if (baselineByTaskId) {
696
+ const base = baselineByTaskId[t.id];
697
+ if (base === undefined) {
698
+ baselineCells = " n/a | n/a |";
699
+ }
700
+ else {
701
+ const delta = t.akm.passRate - base;
702
+ baselineCells = ` ${base.toFixed(2)} | ${signed(delta.toFixed(2))} |`;
703
+ }
704
+ }
654
705
  if (includeSynthetic) {
655
706
  // #261: render the synthetic-arm pass-rate when present; "n/a" when the
656
707
  // arm did not run for this task. A missing arm is NOT a zero-pass arm —
657
708
  // a 0.00 cell would be misleading because the model never tried.
658
709
  const synth = t.synthetic ? t.synthetic.passRate.toFixed(2) : "n/a";
659
- return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${synth} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |`;
710
+ return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${synth} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |${baselineCells}`;
660
711
  }
661
- return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |`;
712
+ return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |${baselineCells}`;
662
713
  }
663
714
  function signed(text) {
664
715
  if (text.startsWith("-"))
@@ -672,6 +723,19 @@ function formatPercent(value) {
672
723
  return "n/a";
673
724
  return `${(value * 100).toFixed(1)}%`;
674
725
  }
726
/**
 * Render a `boolean | null` trajectory field for markdown tables.
 *
 * Three-state semantics:
 * - `null` (or a missing / `undefined` value) → `"—"` — no trajectory data
 *   (harness error; events.jsonl not captured).
 * - `false` → `"✗"` — agent ran but the behaviour was not observed.
 * - `true` → `"✓"` — behaviour confirmed.
 *
 * @param {boolean | null | undefined} value - Trajectory flag to render.
 * @returns {string} One of `"—"`, `"✗"` or `"✓"`.
 */
export function formatTrajBool(value) {
    // `== null` deliberately matches both null and undefined: a record that
    // lacks the field carries no data and must not be reported as an
    // observed negative ("✗").
    if (value == null) {
        return "—";
    }
    return value ? "✓" : "✗";
}
675
739
  // ── Compare rendering (§8) ─────────────────────────────────────────────────
676
740
  /**
677
741
  * Render a CompareResult as a deterministic markdown diff.
@@ -3,7 +3,7 @@
3
3
  */
4
4
  import { describe, expect, test } from "bun:test";
5
5
  import fs from "node:fs";
6
- import { renderJsonReport, renderMarkdownSummary, renderUtilityReport, resolveGitBranch, resolveGitCommit, serializeRunForReport, } from "./report";
6
+ import { formatTrajBool, renderJsonReport, renderMarkdownSummary, renderUtilityReport, resolveGitBranch, resolveGitCommit, serializeRunForReport, } from "./report";
7
7
  import { benchMkdtemp } from "./tmp";
8
8
  const sample = {
9
9
  timestamp: "2026-04-27T12:00:00Z",
@@ -66,6 +66,7 @@ function pt(passRate, tokens, wall, count = 5) {
66
66
  passRate,
67
67
  passAt1: passes > 0 ? 1 : 0,
68
68
  tokensPerPass: tokens,
69
+ tokensPerRun: tokens,
69
70
  wallclockMs: wall,
70
71
  passRateStdev: 0,
71
72
  budgetExceededCount: 0,
@@ -80,9 +81,9 @@ const utilSample = {
80
81
  commit: "deadbee",
81
82
  model: "anthropic/claude-opus-4-7",
82
83
  corpus: { domains: 3, tasks: 2, slice: "all", seedsPerArm: 5 },
83
- aggregateNoakm: { passRate: 0.4, tokensPerPass: 18000, wallclockMs: 41000 },
84
- aggregateAkm: { passRate: 0.7, tokensPerPass: 14000, wallclockMs: 36000 },
85
- aggregateDelta: { passRate: 0.3, tokensPerPass: -4000, wallclockMs: -5000 },
84
+ aggregateNoakm: { passRate: 0.4, tokensPerPass: 18000, tokensPerRun: null, wallclockMs: 41000 },
85
+ aggregateAkm: { passRate: 0.7, tokensPerPass: 14000, tokensPerRun: null, wallclockMs: 36000 },
86
+ aggregateDelta: { passRate: 0.3, tokensPerPass: -4000, tokensPerRun: null, wallclockMs: -5000 },
86
87
  trajectoryAkm: { correctAssetLoaded: 0.78, feedbackRecorded: 0.65 },
87
88
  failureModes: { byLabel: {}, byTask: {} },
88
89
  tasks: [
@@ -90,13 +91,13 @@ const utilSample = {
90
91
  id: "domain-a/task-1",
91
92
  noakm: pt(0.4, 20000, 40000),
92
93
  akm: pt(0.8, 13000, 35000),
93
- delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
94
+ delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
94
95
  },
95
96
  {
96
97
  id: "domain-b/task-2",
97
98
  noakm: pt(0.4, null, 42000),
98
99
  akm: pt(0.6, 15000, 37000),
99
- delta: { passRate: 0.2, tokensPerPass: null, wallclockMs: -5000 },
100
+ delta: { passRate: 0.2, tokensPerPass: null, tokensPerRun: null, wallclockMs: -5000 },
100
101
  },
101
102
  ],
102
103
  warnings: [],
@@ -253,6 +254,54 @@ describe("serializeRunForReport", () => {
253
254
  expect(row.failure_mode).toBe("wrong_asset");
254
255
  });
255
256
  });
257
+ // ── formatTrajBool (M3) ───────────────────────────────────────────────────
258
+ describe("formatTrajBool", () => {
259
+ test("null → '—' (harness error, no trajectory data)", () => {
260
+ expect(formatTrajBool(null)).toBe("—");
261
+ });
262
+ test("false → '✗' (agent ran, behaviour not observed)", () => {
263
+ expect(formatTrajBool(false)).toBe("✗");
264
+ });
265
+ test("true → '✓' (behaviour confirmed)", () => {
266
+ expect(formatTrajBool(true)).toBe("✓");
267
+ });
268
+ });
269
+ describe("renderUtilityReport per-run trajectory table (M3)", () => {
270
+ test("markdown includes per-run table when allRuns has akm runs", () => {
271
+ const allRuns = [
272
+ makeRun({
273
+ taskId: "domain-a/task-1",
274
+ arm: "akm",
275
+ seed: 0,
276
+ trajectory: { correctAssetLoaded: true, feedbackRecorded: false },
277
+ }),
278
+ makeRun({
279
+ taskId: "domain-a/task-1",
280
+ arm: "akm",
281
+ seed: 1,
282
+ trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
283
+ }),
284
+ // noakm run should be excluded from the table
285
+ makeRun({
286
+ taskId: "domain-a/task-1",
287
+ arm: "noakm",
288
+ seed: 0,
289
+ trajectory: { correctAssetLoaded: false, feedbackRecorded: false },
290
+ }),
291
+ ];
292
+ const report = { ...utilSample, allRuns };
293
+ const { markdown } = renderUtilityReport(report);
294
+ expect(markdown).toContain("| task | seed | correct_asset_loaded | feedback_recorded |");
295
+ expect(markdown).toContain("domain-a/task-1 | 0 | ✓ | ✗");
296
+ expect(markdown).toContain("domain-a/task-1 | 1 | — | —");
297
+ // noakm run must NOT appear in the akm-only trajectory table
298
+ // (the table is gated on arm === "akm")
299
+ });
300
+ test("markdown has no per-run trajectory table when allRuns is absent", () => {
301
+ const { markdown } = renderUtilityReport(utilSample);
302
+ expect(markdown).not.toContain("| task | seed | correct_asset_loaded | feedback_recorded |");
303
+ });
304
+ });
256
305
  describe("renderUtilityReport runs[] persistence (#249)", () => {
257
306
  test("emits one row per (task, arm, seed) when allRuns is supplied", () => {
258
307
  const allRuns = [
@@ -374,13 +423,13 @@ describe("renderUtilityReport negative-transfer (#260)", () => {
374
423
  id: "domain-a/task-1",
375
424
  noakm: pt(0.4, 20000, 40000),
376
425
  akm: pt(0.8, 13000, 35000),
377
- delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
426
+ delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
378
427
  },
379
428
  {
380
429
  id: "domain-b/task-2",
381
430
  noakm: pt(0.6, 20000, 40000),
382
431
  akm: pt(0.2, 25000, 38000),
383
- delta: { passRate: -0.4, tokensPerPass: 5000, wallclockMs: -2000 },
432
+ delta: { passRate: -0.4, tokensPerPass: 5000, tokensPerRun: null, wallclockMs: -2000 },
384
433
  },
385
434
  ],
386
435
  };
@@ -441,13 +490,13 @@ describe("renderUtilityReport negative-transfer (#260)", () => {
441
490
  id: "domain-a/task-1",
442
491
  noakm: pt(0.4, 20000, 40000),
443
492
  akm: pt(0.8, 13000, 35000),
444
- delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
493
+ delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
445
494
  },
446
495
  {
447
496
  id: "domain-b/task-2",
448
497
  noakm: pt(0.6, 20000, 40000),
449
498
  akm: pt(0.2, 25000, 38000),
450
- delta: { passRate: -0.4, tokensPerPass: 5000, wallclockMs: -2000 },
499
+ delta: { passRate: -0.4, tokensPerPass: 5000, tokensPerRun: null, wallclockMs: -2000 },
451
500
  },
452
501
  ],
453
502
  akmRuns,