npm - akm-cli - Versions diffs - 0.7.0-rc1 → 0.7.0 - Mend

akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

package/dist/src/cli.js +100 -16
package/dist/src/commands/config-cli.js +42 -0
package/dist/src/commands/history.js +78 -7
package/dist/src/commands/registry-search.js +69 -6
package/dist/src/commands/search.js +30 -3
package/dist/src/commands/show.js +29 -0
package/dist/src/commands/source-add.js +5 -1
package/dist/src/commands/source-manage.js +7 -1
package/dist/src/core/config.js +28 -0
package/dist/src/indexer/db-search.js +1 -0
package/dist/src/indexer/indexer.js +16 -2
package/dist/src/indexer/matchers.js +1 -1
package/dist/src/indexer/search-source.js +4 -2
package/dist/src/integrations/agent/profiles.js +1 -1
package/dist/src/integrations/agent/spawn.js +67 -16
package/dist/src/integrations/github.js +9 -3
package/dist/src/llm/embedders/remote.js +37 -3
package/dist/src/output/cli-hints.js +15 -2
package/dist/src/output/renderers.js +3 -1
package/dist/src/output/shapes.js +8 -1
package/dist/src/output/text.js +156 -3
package/dist/src/registry/build-index.js +5 -4
package/dist/src/registry/providers/static-index.js +3 -1
package/dist/src/setup/setup.js +9 -0
package/dist/src/wiki/wiki.js +54 -6
package/dist/src/workflows/runs.js +37 -3
package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
package/dist/tests/bench/attribution.test.js +24 -23
package/dist/tests/bench/cleanup.js +31 -0
package/dist/tests/bench/cli.js +366 -31
package/dist/tests/bench/cli.test.js +282 -14
package/dist/tests/bench/corpus.js +3 -0
package/dist/tests/bench/corpus.test.js +10 -10
package/dist/tests/bench/doctor.js +525 -0
package/dist/tests/bench/driver.js +77 -22
package/dist/tests/bench/driver.test.js +142 -1
package/dist/tests/bench/environment.js +233 -0
package/dist/tests/bench/environment.test.js +199 -0
package/dist/tests/bench/evolve.js +67 -0
package/dist/tests/bench/evolve.test.js +12 -4
package/dist/tests/bench/failure-modes.test.js +52 -3
package/dist/tests/bench/feedback-integrity.test.js +3 -2
package/dist/tests/bench/leakage.test.js +105 -2
package/dist/tests/bench/learning-curve.test.js +3 -2
package/dist/tests/bench/metrics.js +102 -26
package/dist/tests/bench/metrics.test.js +10 -4
package/dist/tests/bench/opencode-config.js +194 -0
package/dist/tests/bench/opencode-config.test.js +370 -0
package/dist/tests/bench/report.js +73 -9
package/dist/tests/bench/report.test.js +59 -10
package/dist/tests/bench/run-config.js +355 -0
package/dist/tests/bench/run-config.test.js +298 -0
package/dist/tests/bench/run-curate-test.js +32 -0
package/dist/tests/bench/run-failing-tasks.js +56 -0
package/dist/tests/bench/run-full-bench.js +51 -0
package/dist/tests/bench/run-items36-targeted.js +69 -0
package/dist/tests/bench/run-nano-quick.js +42 -0
package/dist/tests/bench/run-waveg-targeted.js +62 -0
package/dist/tests/bench/runner.js +257 -94
package/dist/tests/bench/tmp.js +90 -0
package/dist/tests/bench/trajectory.js +2 -2
package/dist/tests/bench/verifier.js +6 -1
package/dist/tests/bench/workflow-spec.js +11 -24
package/dist/tests/bench/workflow-spec.test.js +1 -1
package/dist/tests/bench/workflow-trace.js +34 -0
package/dist/tests/cli-errors.test.js +1 -0
package/dist/tests/commands/history.test.js +195 -0
package/dist/tests/config.test.js +25 -0
package/dist/tests/e2e.test.js +23 -2
package/dist/tests/fixtures/stashes/load.js +1 -1
package/dist/tests/fixtures/stashes/load.test.js +11 -2
package/dist/tests/indexer.test.js +12 -1
package/dist/tests/output-baseline.test.js +2 -1
package/dist/tests/output-shapes-unit.test.js +3 -1
package/dist/tests/registry-build-index.test.js +17 -1
package/dist/tests/registry-providers/static-index.test.js +34 -0
package/dist/tests/registry-search.test.js +200 -0
package/dist/tests/remember-frontmatter.test.js +11 -13
package/dist/tests/source-qa-fixes.test.js +18 -0
package/dist/tests/source-registry.test.js +3 -3
package/dist/tests/source-source.test.js +61 -1
package/dist/tests/workflow-qa-fixes.test.js +18 -0
package/package.json +1 -1

package/dist/tests/bench/opencode-config.test.js ADDED Viewed

@@ -0,0 +1,370 @@
+/**
+ * Tests for the bench opencode-config module.
+ *
+ * Covers all cases described in the design spec:
+ *   - loads canonical fixture without error
+ *   - rejects literal apiKey (not env-ref)
+ *   - accepts {env:VAR} apiKey form
+ *   - rejects sk-XXXX credential heuristic anywhere in tree
+ *   - rejects top-level plugin / mcp / permission keys
+ *   - rejects unknown schemaVersion
+ *   - isUsageError: true when file missing
+ *   - selectProviderForModel picks correct provider
+ *   - selectProviderForModel throws on unknown provider prefix
+ *   - materializeOpencodeConfig writes $schema, model, plugin: [], permission and the selected provider (no mcp), mode 0o600
+ */
+import { afterAll, beforeAll, describe, expect, test } from "bun:test";
+import fs from "node:fs";
+import path from "node:path";
+import { BenchConfigError, loadOpencodeProviders, materializeOpencodeConfig, selectProviderForModel, } from "./opencode-config";
+import { benchMkdtemp } from "./tmp";
+/** Absolute path to the committed fixture. */
+const FIXTURE_PATH = path.resolve(__dirname, "..", "fixtures", "bench", "opencode-providers.json");
+/** Write a temp JSON file and return its path. */
+function writeTmp(dir, name, content) {
+    const p = path.join(dir, name);
+    fs.writeFileSync(p, JSON.stringify(content));
+    return p;
+}
+describe("loadOpencodeProviders", () => {
+    let tmp;
+    beforeAll(() => {
+        tmp = benchMkdtemp("bench-opencode-config-test-");
+    });
+    afterAll(() => {
+        fs.rmSync(tmp, { recursive: true, force: true });
+    });
+    // ── Canonical fixture ─────────────────────────────────────────────────────
+    test("loads the canonical committed fixture without error", () => {
+        expect(() => loadOpencodeProviders(FIXTURE_PATH)).not.toThrow();
+        const loaded = loadOpencodeProviders(FIXTURE_PATH);
+        expect(loaded.source).toBe(FIXTURE_PATH);
+        expect(loaded.providers).toBeDefined();
+        expect(typeof loaded.providers).toBe("object");
+        expect(loaded.defaultModel).toBe("local/qwen/qwen3.5-9b");
+        expect("local" in loaded.providers).toBe(true);
+    });
+    // ── File not found ────────────────────────────────────────────────────────
+    test("throws BenchConfigError with isUsageError: true when file does not exist", () => {
+        const missing = path.join(tmp, "does-not-exist.json");
+        let err;
+        try {
+            loadOpencodeProviders(missing);
+        }
+        catch (e) {
+            err = e;
+        }
+        expect(err).toBeInstanceOf(BenchConfigError);
+        const bce = err;
+        expect(bce.code).toBe("BENCH_CONFIG");
+        expect(bce.isUsageError).toBe(true);
+        expect(bce.message).toContain("not found");
+    });
+    // ── JSON parse failure ────────────────────────────────────────────────────
+    test("throws BenchConfigError with isUsageError: false on malformed JSON", () => {
+        const p = path.join(tmp, "bad.json");
+        fs.writeFileSync(p, "{ this is not json }");
+        let err;
+        try {
+            loadOpencodeProviders(p);
+        }
+        catch (e) {
+            err = e;
+        }
+        expect(err).toBeInstanceOf(BenchConfigError);
+        expect(err.isUsageError).toBe(false);
+        expect(err.message).toContain("JSON parse error");
+    });
+    // ── schemaVersion ─────────────────────────────────────────────────────────
+    test("rejects unknown schemaVersion", () => {
+        const p = writeTmp(tmp, "bad-version.json", {
+            schemaVersion: 2,
+            providers: {},
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+        let err;
+        try {
+            loadOpencodeProviders(p);
+        }
+        catch (e) {
+            if (e instanceof BenchConfigError)
+                err = e;
+        }
+        expect(err?.isUsageError).toBe(false);
+        expect(err?.message).toContain("schemaVersion");
+    });
+    test("rejects schemaVersion: 0", () => {
+        const p = writeTmp(tmp, "version-0.json", { schemaVersion: 0, providers: {} });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    // ── Forbidden top-level keys ──────────────────────────────────────────────
+    test("rejects top-level 'plugin' key", () => {
+        const p = writeTmp(tmp, "has-plugin.json", {
+            schemaVersion: 1,
+            providers: {},
+            plugin: [],
+        });
+        let err;
+        try {
+            loadOpencodeProviders(p);
+        }
+        catch (e) {
+            if (e instanceof BenchConfigError)
+                err = e;
+        }
+        expect(err).toBeDefined();
+        expect(err?.isUsageError).toBe(false);
+        expect(err?.message).toContain("plugin");
+    });
+    test("rejects top-level 'mcp' key", () => {
+        const p = writeTmp(tmp, "has-mcp.json", {
+            schemaVersion: 1,
+            providers: {},
+            mcp: {},
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    test("rejects top-level 'permission' key", () => {
+        const p = writeTmp(tmp, "has-permission.json", {
+            schemaVersion: 1,
+            providers: {},
+            permission: {},
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    test("rejects top-level 'disabled_providers' key", () => {
+        const p = writeTmp(tmp, "has-disabled.json", {
+            schemaVersion: 1,
+            providers: {},
+            disabled_providers: [],
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    test("rejects top-level 'small_model' key", () => {
+        const p = writeTmp(tmp, "has-small-model.json", {
+            schemaVersion: 1,
+            providers: {},
+            small_model: "anthropic/claude-haiku-4-5",
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    test("rejects top-level 'snapshot' key", () => {
+        const p = writeTmp(tmp, "has-snapshot.json", {
+            schemaVersion: 1,
+            providers: {},
+            snapshot: true,
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    // ── apiKey validation ─────────────────────────────────────────────────────
+    test("rejects literal apiKey string (not an env-ref)", () => {
+        const p = writeTmp(tmp, "literal-apikey.json", {
+            schemaVersion: 1,
+            providers: {
+                myProvider: {
+                    apiKey: "not-an-env-ref",
+                },
+            },
+        });
+        let err;
+        try {
+            loadOpencodeProviders(p);
+        }
+        catch (e) {
+            if (e instanceof BenchConfigError)
+                err = e;
+        }
+        expect(err).toBeDefined();
+        expect(err?.isUsageError).toBe(false);
+        expect(err?.message).toContain("apiKey");
+        expect(err?.message).toContain("env-ref");
+    });
+    test("accepts {env:VAR} form for apiKey", () => {
+        const p = writeTmp(tmp, "env-ref-apikey.json", {
+            schemaVersion: 1,
+            providers: {
+                myProvider: {
+                    npm: "@ai-sdk/openai-compatible",
+                    apiKey: "{env:MY_API_KEY}",
+                    options: { baseURL: "http://localhost:1234/v1" },
+                },
+            },
+        });
+        expect(() => loadOpencodeProviders(p)).not.toThrow();
+        const loaded = loadOpencodeProviders(p);
+        expect("myProvider" in loaded.providers).toBe(true);
+    });
+    test("accepts {env:UNDERSCORE_KEY_123} env-ref form", () => {
+        const p = writeTmp(tmp, "env-ref-underscore.json", {
+            schemaVersion: 1,
+            providers: {
+                p: { apiKey: "{env:MY_KEY_123}" },
+            },
+        });
+        expect(() => loadOpencodeProviders(p)).not.toThrow();
+    });
+    test("rejects apiKey starting with lowercase (not a valid env-ref)", () => {
+        const p = writeTmp(tmp, "bad-env-ref.json", {
+            schemaVersion: 1,
+            providers: {
+                p: { apiKey: "{env:my_lowercase_key}" },
+            },
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    // ── Credential heuristic ──────────────────────────────────────────────────
+    test("rejects sk-XXXX credential anywhere in the providers tree", () => {
+        const p = writeTmp(tmp, "has-sk-key.json", {
+            schemaVersion: 1,
+            providers: {
+                openai: {
+                    npm: "@ai-sdk/openai",
+                    secret: "sk-abcdefghijklmnopqrstuvwxyz0123456789",
+                },
+            },
+        });
+        let err;
+        try {
+            loadOpencodeProviders(p);
+        }
+        catch (e) {
+            if (e instanceof BenchConfigError)
+                err = e;
+        }
+        expect(err).toBeDefined();
+        expect(err?.isUsageError).toBe(false);
+        expect(err?.message).toContain("credential heuristic");
+    });
+    test("rejects sk-XXXX credential in a nested object", () => {
+        const p = writeTmp(tmp, "nested-sk-key.json", {
+            schemaVersion: 1,
+            providers: {
+                p: {
+                    options: {
+                        headers: {
+                            Authorization: "sk-proj-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+                        },
+                    },
+                },
+            },
+        });
+        expect(() => loadOpencodeProviders(p)).toThrow(BenchConfigError);
+    });
+    // ── Valid minimal file ────────────────────────────────────────────────────
+    test("accepts a valid minimal file with no defaultModel", () => {
+        const p = writeTmp(tmp, "minimal.json", {
+            schemaVersion: 1,
+            providers: {
+                local: {
+                    npm: "@ai-sdk/openai-compatible",
+                    options: { baseURL: "http://localhost:1234/v1" },
+                },
+            },
+        });
+        const loaded = loadOpencodeProviders(p);
+        expect(loaded.defaultModel).toBeUndefined();
+        expect("local" in loaded.providers).toBe(true);
+    });
+});
+describe("selectProviderForModel", () => {
+    const loaded = {
+        source: "/fake/path.json",
+        providers: {
+            don: { npm: "@ai-sdk/openai-compatible", name: "Don LM Studio" },
+            ollama: { npm: "@ai-sdk/openai-compatible", name: "Ollama" },
+        },
+        defaultModel: "don/mlx-community/qwen3.6-35b-a3b",
+    };
+    test("splits on first slash and returns the correct provider entry", () => {
+        const result = selectProviderForModel(loaded, "don/mlx-community/qwen3.6-35b-a3b");
+        expect(result.providerKey).toBe("don");
+        expect(result.entry).toBe(loaded.providers.don);
+    });
+    test("handles a model with no slash (entire string is the provider key)", () => {
+        const result = selectProviderForModel(loaded, "ollama");
+        expect(result.providerKey).toBe("ollama");
+        expect(result.entry).toBe(loaded.providers.ollama);
+    });
+    test("throws BenchConfigError when provider key is not in loaded.providers", () => {
+        let err;
+        try {
+            selectProviderForModel(loaded, "unknown/some-model");
+        }
+        catch (e) {
+            if (e instanceof BenchConfigError)
+                err = e;
+        }
+        expect(err).toBeDefined();
+        expect(err?.code).toBe("BENCH_CONFIG");
+        expect(err?.isUsageError).toBe(false);
+        expect(err?.message).toContain("unknown");
+        expect(err?.message).toContain("provider key");
+    });
+    test("error message lists available provider keys", () => {
+        let err;
+        try {
+            selectProviderForModel(loaded, "missing/model");
+        }
+        catch (e) {
+            if (e instanceof BenchConfigError)
+                err = e;
+        }
+        expect(err?.message).toContain("don");
+        expect(err?.message).toContain("ollama");
+    });
+});
+describe("materializeOpencodeConfig", () => {
+    let tmp;
+    beforeAll(() => {
+        tmp = benchMkdtemp("bench-materialize-test-");
+    });
+    afterAll(() => {
+        fs.rmSync(tmp, { recursive: true, force: true });
+    });
+    test("writes opencode.json with required bench isolation invariants and provider", () => {
+        const configDir = path.join(tmp, "run-config");
+        fs.mkdirSync(configDir, { recursive: true });
+        const entry = { npm: "@ai-sdk/openai-compatible", name: "Test Provider" };
+        materializeOpencodeConfig(configDir, { providerKey: "test", entry }, "test/my-model");
+        const outPath = path.join(configDir, "opencode.json");
+        expect(fs.existsSync(outPath)).toBe(true);
+        const contents = JSON.parse(fs.readFileSync(outPath, "utf8"));
+        expect(contents.model).toBe("test/my-model");
+        expect(contents.$schema).toBe("https://opencode.ai/config.json");
+        // Bench isolation invariants: plugin:[] prevents operator plugin interference;
+        // permission block ensures opencode run (non-interactive) allows bash/file tools.
+        expect(contents.plugin).toEqual([]);
+        expect(contents.permission?.bash).toBe("allow");
+        // Provider block is written correctly.
+        const provider = contents.provider;
+        expect(Object.keys(provider)).toEqual(["test"]);
+        expect(provider.test).toEqual(entry);
+    });
+    test("does not write mcp into the config", () => {
+        const configDir = path.join(tmp, "run-config-2");
+        fs.mkdirSync(configDir, { recursive: true });
+        materializeOpencodeConfig(configDir, { providerKey: "p", entry: {} }, "p/model");
+        const contents = JSON.parse(fs.readFileSync(path.join(configDir, "opencode.json"), "utf8"));
+        expect(contents.mcp).toBeUndefined();
+    });
+    test("writes the file with mode 0o600 (not world-readable)", () => {
+        const configDir = path.join(tmp, "run-config-3");
+        fs.mkdirSync(configDir, { recursive: true });
+        materializeOpencodeConfig(configDir, { providerKey: "p", entry: {} }, "p/model");
+        const stat = fs.statSync(path.join(configDir, "opencode.json"));
+        // Mode 0o600 means only owner can read/write (no group or other bits).
+        // On Linux/macOS the lower 9 bits are 0o600 = 0b110000000 in binary.
+        const mode = stat.mode & 0o777;
+        expect(mode).toBe(0o600);
+    });
+    test("can be called twice (overwrites an existing opencode.json)", () => {
+        const configDir = path.join(tmp, "run-config-4");
+        fs.mkdirSync(configDir, { recursive: true });
+        materializeOpencodeConfig(configDir, { providerKey: "a", entry: { name: "first" } }, "a/m1");
+        materializeOpencodeConfig(configDir, { providerKey: "b", entry: { name: "second" } }, "b/m2");
+        const contents = JSON.parse(fs.readFileSync(path.join(configDir, "opencode.json"), "utf8"));
+        const provider = contents.provider;
+        expect("b" in provider).toBe(true);
+        expect("a" in provider).toBe(false);
+    });
+});

package/dist/tests/bench/report.js CHANGED Viewed

@@ -179,6 +179,12 @@ function buildUtilityJson(input) {
     if (input.allRuns) {
         envelope.runs = input.allRuns.map(serializeRunForReport);
     }
+    // Baseline pass-rate map — additive top-level key. Emitted only when the
+    // caller supplied a baseline through `loadBenchRunConfig`; legacy reports
+    // stay byte-identical without it.
+    if (input.baselineByTaskId) {
+        envelope.baseline_by_task_id = { ...input.baselineByTaskId };
+    }
     // Per-asset attribution is an additive top-level key (§6.5). Emit it only
     // when the runner populated it so older code paths (e.g. the empty-corpus
     // skeleton) don't gain the key spuriously.
@@ -229,6 +235,8 @@ function serialiseAkmOverheadPerRun(row) {
         search_count: row.searchCount,
         show_count: row.showCount,
         feedback_count: row.feedbackCount,
+        positive_feedback_count: row.positiveFeedbackCount,
+        negative_feedback_count: row.negativeFeedbackCount,
         total_tool_calls: row.totalToolCalls,
         assets_loaded_count: row.assetsLoadedCount,
         irrelevant_assets_loaded_count: row.irrelevantAssetsLoadedCount,
@@ -255,6 +263,12 @@ function serialiseAkmOverheadAggregate(agg) {
         total_tool_calls: agg.totalToolCalls,
         tool_calls_per_success: agg.toolCallsPerSuccess,
         cost_per_success: agg.costPerSuccess,
+        search_engagement_rate: agg.searchEngagementRate,
+        show_engagement_rate: agg.showEngagementRate,
+        feedback_engagement_rate: agg.feedbackEngagementRate,
+        search_to_show_ratio: agg.searchToShowRatio,
+        mean_positive_feedback_count: agg.meanPositiveFeedbackCount,
+        mean_negative_feedback_count: agg.meanNegativeFeedbackCount,
     };
 }
 /**
@@ -331,6 +345,7 @@ function serialiseCorpus(c) {
     return {
         pass_rate: c.passRate,
         tokens_per_pass: c.tokensPerPass,
+        tokens_per_run: c.tokensPerRun,
         wallclock_ms: c.wallclockMs,
     };
 }
@@ -338,6 +353,7 @@ function serialiseDelta(d) {
     return {
         pass_rate: d.passRate,
         tokens_per_pass: d.tokensPerPass,
+        tokens_per_run: d.tokensPerRun,
         wallclock_ms: d.wallclockMs,
     };
 }
@@ -426,6 +442,7 @@ function serialisePerTaskMetrics(m) {
         pass_rate: m.passRate,
         pass_at_1: m.passAt1,
         tokens_per_pass: m.tokensPerPass,
+        tokens_per_run: m.tokensPerRun,
         wallclock_ms: m.wallclockMs,
         pass_rate_stdev: m.passRateStdev,
         budget_exceeded_count: m.budgetExceededCount,
@@ -511,23 +528,43 @@ function buildUtilityMarkdown(input) {
     lines.push("");
     lines.push(`- correct_asset_loaded: ${formatPercent(input.trajectoryAkm.correctAssetLoaded)}`);
     lines.push(`- feedback_recorded: ${formatPercent(input.trajectoryAkm.feedbackRecorded)}`);
+    // Per-run trajectory detail: when allRuns is present emit a compact table
+    // so operators can distinguish null (harness error — no events captured)
+    // from false (agent ran, behaviour not observed) from true (confirmed).
+    // Symbols: "—" = null, "✗" = false, "✓" = true.
+    const akmRuns = (input.allRuns ?? []).filter((r) => r.arm === "akm");
+    if (akmRuns.length > 0) {
+        lines.push("");
+        lines.push("| task | seed | correct_asset_loaded | feedback_recorded |");
+        lines.push("|------|------|----------------------|-------------------|");
+        for (const r of akmRuns) {
+            lines.push(`| ${r.taskId} | ${r.seed} | ${formatTrajBool(r.trajectory.correctAssetLoaded)} | ${formatTrajBool(r.trajectory.feedbackRecorded)} |`);
+        }
+    }
     lines.push("");
     lines.push("## Per-task pass rates");
     lines.push("");
     // #261: synthetic column is rendered only when the synthetic arm ran.
     // The default header/row stays identical to the pre-#261 output.
-    if (input.aggregateSynth) {
-        lines.push("| task | noakm | synthetic | akm | delta |");
-        lines.push("|------|-------|-----------|-----|-------|");
+    // Baseline column is rendered only when `baselineByTaskId` was supplied
+    // by the caller; legacy reports without it produce byte-identical output.
+    const includeSynthCol = input.aggregateSynth !== undefined;
+    const baselineMap = input.baselineByTaskId;
+    const includeBaselineCol = baselineMap !== undefined;
+    const baseColHeader = includeBaselineCol ? " baseline | vs base |" : "";
+    const baseColSep = includeBaselineCol ? "----------|---------|" : "";
+    if (includeSynthCol) {
+        lines.push(`| task | noakm | synthetic | akm | delta |${baseColHeader}`);
+        lines.push(`|------|-------|-----------|-----|-------|${baseColSep}`);
     }
     else {
-        lines.push("| task | noakm | akm | delta |");
-        lines.push("|------|-------|-----|-------|");
+        lines.push(`| task | noakm | akm | delta |${baseColHeader}`);
+        lines.push(`|------|-------|-----|-------|${baseColSep}`);
     }
     // Sort tasks alphabetically for byte-stable markdown output.
     const sorted = [...input.tasks].sort((a, b) => a.id.localeCompare(b.id));
     for (const t of sorted) {
-        lines.push(taskRow(t, input.aggregateSynth !== undefined));
+        lines.push(taskRow(t, includeSynthCol, baselineMap));
     }
     // Corpus-coverage section (#262). Renders only when at least one task was
     // tagged with a `memory_ability`; without tags the section adds no signal
@@ -650,15 +687,29 @@ function deltaRow(d) {
     const tpp = d.tokensPerPass === null ? "n/a" : signed(d.tokensPerPass.toFixed(0));
     return `| **delta** | ${signed(d.passRate.toFixed(2))} | ${tpp} | ${signed(d.wallclockMs.toFixed(0))} |`;
 }
-function taskRow(t, includeSynthetic = false) {
+function taskRow(t, includeSynthetic = false, baselineByTaskId) {
+    // Baseline cells are rendered only when a baseline map is provided.
+    // Tasks without an entry in that map get a pair of "n/a" cells so the
+    // row stays aligned with the baseline header columns.
+    let baselineCells = "";
+    if (baselineByTaskId) {
+        const base = baselineByTaskId[t.id];
+        if (base === undefined) {
+            baselineCells = " n/a | n/a |";
+        }
+        else {
+            const delta = t.akm.passRate - base;
+            baselineCells = ` ${base.toFixed(2)} | ${signed(delta.toFixed(2))} |`;
+        }
+    }
     if (includeSynthetic) {
         // #261: render the synthetic-arm pass-rate when present; "n/a" when the
         // arm did not run for this task. A missing arm is NOT a zero-pass arm —
         // a 0.00 cell would be misleading because the model never tried.
         const synth = t.synthetic ? t.synthetic.passRate.toFixed(2) : "n/a";
-        return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${synth} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |`;
+        return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${synth} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |${baselineCells}`;
     }
-    return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |`;
+    return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |${baselineCells}`;
 }
 function signed(text) {
     if (text.startsWith("-"))
@@ -672,6 +723,19 @@ function formatPercent(value) {
         return "n/a";
     return `${(value * 100).toFixed(1)}%`;
 }
+/**
+ * Render a `boolean | null` trajectory field for markdown tables.
+ *
+ * Three-state semantics (only a strict `null` maps to "—"; any other falsy value, including `undefined`, renders as "✗"):
+ * - `null`  → `"—"` — no trajectory data (harness error; events.jsonl not captured).
+ * - `false` → `"✗"` — agent ran but the behaviour was not observed.
+ * - `true`  → `"✓"` — behaviour confirmed.
+ */
+export function formatTrajBool(value) {
+    if (value === null)
+        return "—";
+    return value ? "✓" : "✗";
+}
 // ── Compare rendering (§8) ─────────────────────────────────────────────────
 /**
  * Render a CompareResult as a deterministic markdown diff.

package/dist/tests/bench/report.test.js CHANGED Viewed

@@ -3,7 +3,7 @@
  */
 import { describe, expect, test } from "bun:test";
 import fs from "node:fs";
-import { renderJsonReport, renderMarkdownSummary, renderUtilityReport, resolveGitBranch, resolveGitCommit, serializeRunForReport, } from "./report";
+import { formatTrajBool, renderJsonReport, renderMarkdownSummary, renderUtilityReport, resolveGitBranch, resolveGitCommit, serializeRunForReport, } from "./report";
 import { benchMkdtemp } from "./tmp";
 const sample = {
     timestamp: "2026-04-27T12:00:00Z",
@@ -66,6 +66,7 @@ function pt(passRate, tokens, wall, count = 5) {
         passRate,
         passAt1: passes > 0 ? 1 : 0,
         tokensPerPass: tokens,
+        tokensPerRun: tokens,
         wallclockMs: wall,
         passRateStdev: 0,
         budgetExceededCount: 0,
@@ -80,9 +81,9 @@ const utilSample = {
     commit: "deadbee",
     model: "anthropic/claude-opus-4-7",
     corpus: { domains: 3, tasks: 2, slice: "all", seedsPerArm: 5 },
-    aggregateNoakm: { passRate: 0.4, tokensPerPass: 18000, wallclockMs: 41000 },
-    aggregateAkm: { passRate: 0.7, tokensPerPass: 14000, wallclockMs: 36000 },
-    aggregateDelta: { passRate: 0.3, tokensPerPass: -4000, wallclockMs: -5000 },
+    aggregateNoakm: { passRate: 0.4, tokensPerPass: 18000, tokensPerRun: null, wallclockMs: 41000 },
+    aggregateAkm: { passRate: 0.7, tokensPerPass: 14000, tokensPerRun: null, wallclockMs: 36000 },
+    aggregateDelta: { passRate: 0.3, tokensPerPass: -4000, tokensPerRun: null, wallclockMs: -5000 },
     trajectoryAkm: { correctAssetLoaded: 0.78, feedbackRecorded: 0.65 },
     failureModes: { byLabel: {}, byTask: {} },
     tasks: [
@@ -90,13 +91,13 @@ const utilSample = {
             id: "domain-a/task-1",
             noakm: pt(0.4, 20000, 40000),
             akm: pt(0.8, 13000, 35000),
-            delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
+            delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
         },
         {
             id: "domain-b/task-2",
             noakm: pt(0.4, null, 42000),
             akm: pt(0.6, 15000, 37000),
-            delta: { passRate: 0.2, tokensPerPass: null, wallclockMs: -5000 },
+            delta: { passRate: 0.2, tokensPerPass: null, tokensPerRun: null, wallclockMs: -5000 },
         },
     ],
     warnings: [],
@@ -253,6 +254,54 @@ describe("serializeRunForReport", () => {
         expect(row.failure_mode).toBe("wrong_asset");
     });
 });
+// ── formatTrajBool (M3) ───────────────────────────────────────────────────
+describe("formatTrajBool", () => {
+    test("null → '—' (harness error, no trajectory data)", () => {
+        expect(formatTrajBool(null)).toBe("—");
+    });
+    test("false → '✗' (agent ran, behaviour not observed)", () => {
+        expect(formatTrajBool(false)).toBe("✗");
+    });
+    test("true → '✓' (behaviour confirmed)", () => {
+        expect(formatTrajBool(true)).toBe("✓");
+    });
+});
+describe("renderUtilityReport per-run trajectory table (M3)", () => {
+    test("markdown includes per-run table when allRuns has akm runs", () => {
+        const allRuns = [
+            makeRun({
+                taskId: "domain-a/task-1",
+                arm: "akm",
+                seed: 0,
+                trajectory: { correctAssetLoaded: true, feedbackRecorded: false },
+            }),
+            makeRun({
+                taskId: "domain-a/task-1",
+                arm: "akm",
+                seed: 1,
+                trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
+            }),
+            // noakm run should be excluded from the table
+            makeRun({
+                taskId: "domain-a/task-1",
+                arm: "noakm",
+                seed: 0,
+                trajectory: { correctAssetLoaded: false, feedbackRecorded: false },
+            }),
+        ];
+        const report = { ...utilSample, allRuns };
+        const { markdown } = renderUtilityReport(report);
+        expect(markdown).toContain("| task | seed | correct_asset_loaded | feedback_recorded |");
+        expect(markdown).toContain("domain-a/task-1 | 0 | ✓ | ✗");
+        expect(markdown).toContain("domain-a/task-1 | 1 | — | —");
+        // noakm run must NOT appear in the akm-only trajectory table
+        // (the table is gated on arm === "akm")
+    });
+    test("markdown has no per-run trajectory table when allRuns is absent", () => {
+        const { markdown } = renderUtilityReport(utilSample);
+        expect(markdown).not.toContain("| task | seed | correct_asset_loaded | feedback_recorded |");
+    });
+});
 describe("renderUtilityReport runs[] persistence (#249)", () => {
     test("emits one row per (task, arm, seed) when allRuns is supplied", () => {
         const allRuns = [
@@ -374,13 +423,13 @@ describe("renderUtilityReport negative-transfer (#260)", () => {
                     id: "domain-a/task-1",
                     noakm: pt(0.4, 20000, 40000),
                     akm: pt(0.8, 13000, 35000),
-                    delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
+                    delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
                 },
                 {
                     id: "domain-b/task-2",
                     noakm: pt(0.6, 20000, 40000),
                     akm: pt(0.2, 25000, 38000),
-                    delta: { passRate: -0.4, tokensPerPass: 5000, wallclockMs: -2000 },
+                    delta: { passRate: -0.4, tokensPerPass: 5000, tokensPerRun: null, wallclockMs: -2000 },
                 },
             ],
         };
@@ -441,13 +490,13 @@ describe("renderUtilityReport negative-transfer (#260)", () => {
                     id: "domain-a/task-1",
                     noakm: pt(0.4, 20000, 40000),
                     akm: pt(0.8, 13000, 35000),
-                    delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
+                    delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
                 },
                 {
                     id: "domain-b/task-2",
                     noakm: pt(0.6, 20000, 40000),
                     akm: pt(0.2, 25000, 38000),
-                    delta: { passRate: -0.4, tokensPerPass: 5000, wallclockMs: -2000 },
+                    delta: { passRate: -0.4, tokensPerPass: 5000, tokensPerRun: null, wallclockMs: -2000 },
                 },
             ],
             akmRuns,