npm - akm-cli - Versions diffs - 0.7.0-rc1 → 0.7.0 - Mend

akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

package/dist/src/cli.js +100 -16
package/dist/src/commands/config-cli.js +42 -0
package/dist/src/commands/history.js +78 -7
package/dist/src/commands/registry-search.js +69 -6
package/dist/src/commands/search.js +30 -3
package/dist/src/commands/show.js +29 -0
package/dist/src/commands/source-add.js +5 -1
package/dist/src/commands/source-manage.js +7 -1
package/dist/src/core/config.js +28 -0
package/dist/src/indexer/db-search.js +1 -0
package/dist/src/indexer/indexer.js +16 -2
package/dist/src/indexer/matchers.js +1 -1
package/dist/src/indexer/search-source.js +4 -2
package/dist/src/integrations/agent/profiles.js +1 -1
package/dist/src/integrations/agent/spawn.js +67 -16
package/dist/src/integrations/github.js +9 -3
package/dist/src/llm/embedders/remote.js +37 -3
package/dist/src/output/cli-hints.js +15 -2
package/dist/src/output/renderers.js +3 -1
package/dist/src/output/shapes.js +8 -1
package/dist/src/output/text.js +156 -3
package/dist/src/registry/build-index.js +5 -4
package/dist/src/registry/providers/static-index.js +3 -1
package/dist/src/setup/setup.js +9 -0
package/dist/src/wiki/wiki.js +54 -6
package/dist/src/workflows/runs.js +37 -3
package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
package/dist/tests/bench/attribution.test.js +24 -23
package/dist/tests/bench/cleanup.js +31 -0
package/dist/tests/bench/cli.js +366 -31
package/dist/tests/bench/cli.test.js +282 -14
package/dist/tests/bench/corpus.js +3 -0
package/dist/tests/bench/corpus.test.js +10 -10
package/dist/tests/bench/doctor.js +525 -0
package/dist/tests/bench/driver.js +77 -22
package/dist/tests/bench/driver.test.js +142 -1
package/dist/tests/bench/environment.js +233 -0
package/dist/tests/bench/environment.test.js +199 -0
package/dist/tests/bench/evolve.js +67 -0
package/dist/tests/bench/evolve.test.js +12 -4
package/dist/tests/bench/failure-modes.test.js +52 -3
package/dist/tests/bench/feedback-integrity.test.js +3 -2
package/dist/tests/bench/leakage.test.js +105 -2
package/dist/tests/bench/learning-curve.test.js +3 -2
package/dist/tests/bench/metrics.js +102 -26
package/dist/tests/bench/metrics.test.js +10 -4
package/dist/tests/bench/opencode-config.js +194 -0
package/dist/tests/bench/opencode-config.test.js +370 -0
package/dist/tests/bench/report.js +73 -9
package/dist/tests/bench/report.test.js +59 -10
package/dist/tests/bench/run-config.js +355 -0
package/dist/tests/bench/run-config.test.js +298 -0
package/dist/tests/bench/run-curate-test.js +32 -0
package/dist/tests/bench/run-failing-tasks.js +56 -0
package/dist/tests/bench/run-full-bench.js +51 -0
package/dist/tests/bench/run-items36-targeted.js +69 -0
package/dist/tests/bench/run-nano-quick.js +42 -0
package/dist/tests/bench/run-waveg-targeted.js +62 -0
package/dist/tests/bench/runner.js +257 -94
package/dist/tests/bench/tmp.js +90 -0
package/dist/tests/bench/trajectory.js +2 -2
package/dist/tests/bench/verifier.js +6 -1
package/dist/tests/bench/workflow-spec.js +11 -24
package/dist/tests/bench/workflow-spec.test.js +1 -1
package/dist/tests/bench/workflow-trace.js +34 -0
package/dist/tests/cli-errors.test.js +1 -0
package/dist/tests/commands/history.test.js +195 -0
package/dist/tests/config.test.js +25 -0
package/dist/tests/e2e.test.js +23 -2
package/dist/tests/fixtures/stashes/load.js +1 -1
package/dist/tests/fixtures/stashes/load.test.js +11 -2
package/dist/tests/indexer.test.js +12 -1
package/dist/tests/output-baseline.test.js +2 -1
package/dist/tests/output-shapes-unit.test.js +3 -1
package/dist/tests/registry-build-index.test.js +17 -1
package/dist/tests/registry-providers/static-index.test.js +34 -0
package/dist/tests/registry-search.test.js +200 -0
package/dist/tests/remember-frontmatter.test.js +11 -13
package/dist/tests/source-qa-fixes.test.js +18 -0
package/dist/tests/source-registry.test.js +3 -3
package/dist/tests/source-source.test.js +61 -1
package/dist/tests/workflow-qa-fixes.test.js +18 -0
package/package.json +1 -1

package/dist/tests/bench/evolve.js CHANGED Viewed

@@ -38,6 +38,7 @@ import { registerCleanup } from "./cleanup";
 import { computeLessonMetrics } from "./evolve-metrics";
 import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
 import { runUtility } from "./runner";
+import { benchMkdtemp } from "./tmp";
 /**
  * Drive the three-phase Track B runner.
  *
@@ -79,6 +80,8 @@ export async function runEvolve(options) {
     const preStashes = new Map();
     const evolveDirByFixture = new Map();
     const preDirByFixture = new Map();
+    /** Per-fixture XDG_CACHE_HOME dirs allocated for evolve-stash indexing. */
+    const evolveCacheDirByFixture = new Map();
     // SIGINT trap (#267): every per-fixture stash registers its cleanup with
     // the shared registry so an external Ctrl-C reaps the tmp dirs even when
     // the top-level try/finally never runs. We deregister in the matching
@@ -91,6 +94,12 @@ export async function runEvolve(options) {
                 const evolved = loadFixtureStash(name, { skipIndex: false });
                 evolveStashes.set(name, evolved);
                 evolveDirByFixture.set(name, evolved.stashDir);
+                // Allocate a per-fixture cache dir for the evolve-stash re-index.
+                // `loadFixtureStash` indexed into its own isolated XDG_CACHE_HOME, which
+                // subsequent `akmCli` calls (feedback, distill, reflect) cannot see.
+                // We allocate a fresh bench cache dir and pass it through
+                // `indexEvolveStash` + `envForRef` so the FTS5 DB is in a known place.
+                evolveCacheDirByFixture.set(name, benchMkdtemp(`akm-evolve-cache-${name}-`));
                 stashDeregistrations.push(registerCleanup(() => {
                     try {
                         evolved.cleanup();
@@ -132,6 +141,7 @@ export async function runEvolve(options) {
             refToFixture.set(t.goldRef, t.stash);
     }
     const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
+    const fallbackEvolveCacheDir = [...evolveCacheDirByFixture.values()][0];
     function envForRef(ref) {
         const baseEnv = { ...process.env };
         if (!materialiseStash) {
@@ -142,12 +152,40 @@ export async function runEvolve(options) {
         }
         const fixture = ref ? refToFixture.get(ref) : undefined;
         const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
+        const cacheDir = (fixture && evolveCacheDirByFixture.get(fixture)) ?? fallbackEvolveCacheDir;
         if (dir)
             baseEnv.AKM_STASH_DIR = dir;
         else
             delete baseEnv.AKM_STASH_DIR;
+        if (cacheDir)
+            baseEnv.XDG_CACHE_HOME = cacheDir;
         return baseEnv;
     }
+    // ── Phase 1 pre-flight: index each evolve stash in its dedicated cache. ───
+    // `loadFixtureStash` already ran `akm index` but used an isolated
+    // XDG_CACHE_HOME that subsequent `akmCli` calls (feedback, distill, reflect)
+    // cannot see. Re-running `akm index` here via `akmCli` with the same
+    // AKM_STASH_DIR + XDG_CACHE_HOME that `envForRef` will produce ensures the
+    // FTS5 database is populated where Phase 1 feedback will look.
+    // Non-zero exit adds a warning but does not abort — Phase 1 can still run
+    // with degraded feedback if the index step fails.
+    if (materialiseStash) {
+        const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
+        for (const [fixtureName, stashDir] of evolveDirByFixture) {
+            const cacheDir = evolveCacheDirByFixture.get(fixtureName);
+            if (!cacheDir)
+                continue;
+            try {
+                const result = await indexEvolveStash(stashDir, cacheDir, akmCli, phase1Cwd);
+                if (!result.ok) {
+                    warnings.push(`evolve: pre-flight akm index failed for stash ${stashDir}: ${result.stderr.trim()}`);
+                }
+            }
+            catch (err) {
+                warnings.push(`evolve: pre-flight akm index threw for stash ${stashDir}: ${err.message}`);
+            }
+        }
+    }
     let preReport;
     let postReport;
     let syntheticReport;
@@ -172,6 +210,7 @@ export async function runEvolve(options) {
             ...(options.timestamp ? { timestamp: options.timestamp } : {}),
             ...(options.branch ? { branch: options.branch } : {}),
             ...(options.commit ? { commit: options.commit } : {}),
+            ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
         });
         // Issue feedback events per (task, seed) outcome on the akm arm.
         const feedbackByRef = new Map();
@@ -271,6 +310,9 @@ export async function runEvolve(options) {
                 const dir = evolveDirByFixture.get(fixtureName);
                 if (dir)
                     proposalEnv.AKM_STASH_DIR = dir;
+                const cacheDir = evolveCacheDirByFixture.get(fixtureName);
+                if (cacheDir)
+                    proposalEnv.XDG_CACHE_HOME = cacheDir;
             }
             else if (!materialiseStash) {
                 delete proposalEnv.AKM_STASH_DIR;
@@ -332,6 +374,7 @@ export async function runEvolve(options) {
             ...(options.timestamp ? { timestamp: options.timestamp } : {}),
             ...(options.branch ? { branch: options.branch } : {}),
             ...(options.commit ? { commit: options.commit } : {}),
+            ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
         });
         postReport = await runUtility({
             tasks: evalTasks,
@@ -350,6 +393,7 @@ export async function runEvolve(options) {
             ...(options.branch ? { branch: options.branch } : {}),
             ...(options.commit ? { commit: options.commit } : {}),
             ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
+            ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
         });
         // synthetic: no stash. We pass a spawn wrapper that strips
         // AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
@@ -371,6 +415,7 @@ export async function runEvolve(options) {
             ...(options.branch ? { branch: options.branch } : {}),
             ...(options.commit ? { commit: options.commit } : {}),
             ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
+            ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
         });
     }
     finally {
@@ -568,6 +613,28 @@ function parseProposalShow(stdout) {
     }
     return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
 }
+/**
+ * Run `akm index` on the evolve stash to populate the FTS5 database in the
+ * cache directory that Phase 1 `akmCli` calls will use.
+ *
+ * `loadFixtureStash` already indexed the stash into an isolated XDG_CACHE_HOME
+ * that is invisible to subsequent `akmCli` calls. Calling this helper with the
+ * same `stashDir` + `cacheDir` that `envForRef` will forward ensures `akm
+ * feedback` (and later `akm distill` / `akm reflect`) can look up refs in the
+ * FTS5 index.
+ *
+ * Resolves to `{ ok, stderr }`: `ok` is true only on exit code 0; `stderr` is
+ * copied from the `akmCli` result either way. Exported for tests.
+ */
+export async function indexEvolveStash(stashDir, cacheDir, akmCli, cwd) {
+    const env = {
+        ...process.env, // inherit the current environment; the two keys below override it
+        AKM_STASH_DIR: stashDir,
+        XDG_CACHE_HOME: cacheDir,
+    };
+    const result = await akmCli(["index"], cwd, env); // injected runner, invoked as (args, cwd, env)
+    return { ok: result.exitCode === 0, stderr: result.stderr };
+}
 /** Exposed for tests so the synthetic-arm prompt construction can be asserted. */
 export function buildSyntheticPrompt(taskId) {
     return [

package/dist/tests/bench/evolve.test.js CHANGED Viewed

@@ -534,6 +534,7 @@ describe("computeLongitudinalMetrics", () => {
             passRate: akmPassRate,
             passAt1: 0,
             tokensPerPass: null,
+            tokensPerRun: null,
             wallclockMs: 0,
             passRateStdev: 0,
             budgetExceededCount: 0,
@@ -548,9 +549,9 @@ describe("computeLongitudinalMetrics", () => {
             commit: "c",
             model: "m",
             corpus: { domains: 1, tasks: 1, slice: "eval", seedsPerArm },
-            aggregateNoakm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
-            aggregateAkm: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 },
-            aggregateDelta: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 },
+            aggregateNoakm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
+            aggregateAkm: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
+            aggregateDelta: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
             trajectoryAkm: { correctAssetLoaded: null, feedbackRecorded: 0 },
             failureModes: opts.failureMode
                 ? {
@@ -558,7 +559,14 @@ describe("computeLongitudinalMetrics", () => {
                     byTask: { [taskId]: { [opts.failureMode]: 1 } },
                 }
                 : { byLabel: {}, byTask: {} },
-            tasks: [{ id: taskId, noakm, akm, delta: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 } }],
+            tasks: [
+                {
+                    id: taskId,
+                    noakm,
+                    akm,
+                    delta: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
+                },
+            ],
             warnings: [],
         };
     }

package/dist/tests/bench/failure-modes.test.js CHANGED Viewed

@@ -130,6 +130,55 @@ describe("classifyFailureMode — seven labels", () => {
         const out = classifyFailureMode(fakeTask({ goldRef: undefined }), fakeRun({ verifierStdout: trace }));
         expect(out).toBe("unrelated_bug");
     });
+    test("no_events: task has no goldRef and no search in trace", () => {
+        // When there is no goldRef and no search evidence, trajectory.correctAssetLoaded
+        // is always null (metric undefined). We cannot tell whether the agent searched
+        // or whether events data was absent. Surfaces as `no_events`.
+        const out = classifyFailureMode(fakeTask({ goldRef: undefined }), fakeRun({ verifierStdout: "" }));
+        expect(out).toBe("no_events");
+    });
+});
+describe("classifyFailureMode — trajectory-aware classification (REC-07 / REC-13)", () => {
+    test("loaded_ignored: correctAssetLoaded=true + fail → loaded_ignored (short-circuit)", () => {
+        // The agent loaded the correct asset (confirmed by trajectory data) but still
+        // produced wrong output. This is the dominant failure pattern in the
+        // 2026-05-03 baseline: 24/25 `search_no_gold` labels were wrong because the
+        // classifier didn't consult trajectory.correctAssetLoaded.
+        const out = classifyFailureMode(fakeTask(), fakeRun({ // NOTE(review): fakeTask() presumably supplies a default goldRef — confirm in the helper
+            trajectory: { correctAssetLoaded: true, feedbackRecorded: null },
+            verifierStdout: "verifier: field values wrong",
+        }));
+        expect(out).toBe("loaded_ignored");
+    });
+    test("loaded_ignored: correctAssetLoaded=true overrides stdout-scan — fires even with no search in trace", () => {
+        // A positive trajectory signal is authoritative. Even if verifierStdout
+        // shows no `akm search`, the trajectory says the gold was loaded →
+        // loaded_ignored, not no_search.
+        const out = classifyFailureMode(fakeTask(), fakeRun({
+            trajectory: { correctAssetLoaded: true, feedbackRecorded: null },
+            verifierStdout: "", // empty trace: nothing for a stdout scan to find
+        }));
+        expect(out).toBe("loaded_ignored");
+    });
+    test("search_no_gold: correctAssetLoaded=false + search ran + gold absent → search_no_gold", () => {
+        // When trajectory says gold was NOT loaded and search ran but gold ref absent
+        // from results, this is a genuine search failure.
+        const trace = ["$ akm search homelab", "1. skill:foo", "2. skill:bar"].join("\n"); // a search ran; only skill:foo and skill:bar are listed
+        const out = classifyFailureMode(fakeTask(), fakeRun({
+            trajectory: { correctAssetLoaded: false, feedbackRecorded: null },
+            verifierStdout: trace,
+        }));
+        expect(out).toBe("search_no_gold");
+    });
+    test("no_search: correctAssetLoaded=false + no search in trace → no_search", () => {
+        // When trajectory says gold was NOT loaded and there is no search evidence,
+        // the agent genuinely didn't search.
+        const out = classifyFailureMode(fakeTask(), fakeRun({
+            trajectory: { correctAssetLoaded: false, feedbackRecorded: null },
+            verifierStdout: "verifier: missing output", // trace has no `$ akm search` line
+        }));
+        expect(out).toBe("no_search");
+    });
 });
 describe("classifyFailureMode — tie-breaking and priority", () => {
     test("no_search beats search_no_gold when both could apply (no search call)", () => {
@@ -258,9 +307,9 @@ describe("renderFailureModeBreakdown", () => {
             commit: "y",
             model: "m",
             corpus: { domains: 1, tasks: 1, slice: "all", seedsPerArm: 5 },
-            aggregateNoakm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
-            aggregateAkm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
-            aggregateDelta: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
+            aggregateNoakm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
+            aggregateAkm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
+            aggregateDelta: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
             trajectoryAkm: { correctAssetLoaded: null, feedbackRecorded: 0 },
             failureModes: { byLabel, byTask: {} },
             tasks: [],

package/dist/tests/bench/feedback-integrity.test.js CHANGED Viewed

@@ -288,11 +288,12 @@ function emptyUtilityReport() {
         commit: "deadbee",
         model: "m",
         corpus: { domains: 0, tasks: 0, slice: "all", seedsPerArm: 1 },
-        aggregateNoakm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
-        aggregateAkm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
+        aggregateNoakm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
+        aggregateAkm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
         aggregateDelta: {
             passRate: 0,
             tokensPerPass: 0,
+            tokensPerRun: null,
             wallclockMs: 0,
         },
         trajectoryAkm: {

package/dist/tests/bench/leakage.test.js CHANGED Viewed

@@ -1,6 +1,14 @@
 /**
  * Leakage smoke test for the seeded bench corpus (spec §7.4).
  *
+ * Gated behind `AKM_BENCH_FIXTURE_TESTS=1`. This is a corpus-content
+ * validator (it inspects the seeded fixture stashes and verifier files,
+ * not the bench framework code itself), so it ships skipped by default —
+ * matching the `AKM_SEMANTIC_TESTS` / `AKM_DOCKER_TESTS` pattern. Run it
+ * locally when you change a fixture stash or a verifier:
+ *
+ *   AKM_BENCH_FIXTURE_TESTS=1 bun test tests/bench/leakage.test.ts
+ *
  * For every task that declares a `gold_ref` of the form `skill:<name>`,
  * locate the SKILL.md inside the named fixture stash and assert that the
  * verifier's *structural assertions* do not appear verbatim in the gold-ref
@@ -26,7 +34,8 @@
 import { describe, expect, test } from "bun:test";
 import fs from "node:fs";
 import path from "node:path";
-import { getTasksRoot, listTasks } from "./corpus";
+import { effectiveSlice, getTasksRoot, listTasks } from "./corpus";
+const FIXTURE_TESTS = !!process.env.AKM_BENCH_FIXTURE_TESTS;
 const STASHES_ROOT = path.resolve(getTasksRoot(), "..", "..", "stashes");
 /** Resolve `skill:<name>` against the named stash; returns SKILL.md path or `undefined`. */
 function resolveGoldRefPath(stashName, goldRef) {
@@ -93,7 +102,97 @@ function readVerifierFiles(task) {
     }
     return combined;
 }
-describe("gold-ref leakage check", () => {
+/**
+ * Return the verifier assertion fragments for a task, applying an additional
+ * filter suitable for cross-task comparisons. Short two-word domain phrases
+ * (e.g. `akm feedback`, `akm search`) naturally recur across tasks that share
+ * a domain — they are NOT meaningful leakage signals. A fragment is considered
+ * meaningful only when it either:
+ *   • contains at least two space characters (normally three or more tokens), or
+ *   • contains a structural character (`=`, `[`, `(`) that marks it as a
+ *     complex expression unlikely to appear by coincidence.
+ *
+ * This is more precise than a raw length threshold because it captures the
+ * difference between `akm feedback` (12 chars, 2 tokens, no structure) and
+ * `.model == "anthropic/claude-opus-4-7"` (37 chars, structural `==`).
+ */
+function crossTaskFragments(task) {
+    const isMeaningful = (f) => {
+        const spaceCount = (f.match(/ /g) ?? []).length; // counts literal U+0020 only; tabs and newlines are not counted
+        return spaceCount >= 2 || /[=[(]/.test(f); // character class matches `=`, `[` or `(`
+    };
+    const raw = [];
+    if (task.verifier === "regex" && task.expectedMatch) {
+        raw.push(...regexLiterals(task.expectedMatch)); // regex verifier: fragments are derived from `expectedMatch`
+    }
+    else {
+        const verifierText = readVerifierFiles(task); // also taken by a regex task whose `expectedMatch` is falsy
+        raw.push(...pytestStructuralFragments(verifierText));
+        raw.push(...shellAssertionFragments(verifierText));
+    }
+    return raw.filter(isMeaningful);
+}
+describe.skipIf(!FIXTURE_TESTS)("cross-task eval/train verifier leakage check", () => { // gate is `!!process.env.AKM_BENCH_FIXTURE_TESTS`: any non-empty value, even "0", enables the suite
+    const allTasks = listTasks();
+    // Group tasks by stash name.
+    const byStash = new Map();
+    for (const task of allTasks) {
+        const group = byStash.get(task.stash) ?? [];
+        group.push(task);
+        byStash.set(task.stash, group);
+    }
+    // Only stashes that have BOTH train and eval tasks are interesting.
+    const mixedStashes = [...byStash.entries()].filter(([, tasks]) => {
+        const hasTrain = tasks.some((t) => effectiveSlice(t) === "train");
+        const hasEval = tasks.some((t) => effectiveSlice(t) === "eval");
+        return hasTrain && hasEval;
+    });
+    test("at least one stash has both train and eval tasks", () => {
+        expect(mixedStashes.length).toBeGreaterThan(0);
+    });
+    for (const [stashName, tasks] of mixedStashes) {
+        const trainTasks = tasks.filter((t) => effectiveSlice(t) === "train");
+        const evalTasks = tasks.filter((t) => effectiveSlice(t) === "eval");
+        // Train → Eval: train verifier fragments must not appear in eval verifier text.
+        // Skip pairs that are intentional train/eval variants of the same task family
+        // (e.g. inkwell/add-healthcheck-train vs inkwell/add-healthcheck) — they share
+        // field-access patterns by design, just with different expected values.
+        const isVariantPair = (trainId, evalId) => {
+            const trainBase = trainId.replace(/-train$/, ""); // strip only a trailing "-train"
+            return trainBase === evalId || evalId.startsWith(`${trainBase}-`); // same id, or the eval id extends the base with a "-suffix"
+        };
+        for (const trainTask of trainTasks) {
+            const trainFragments = crossTaskFragments(trainTask);
+            if (trainFragments.length === 0)
+                continue;
+            for (const evalTask of evalTasks) {
+                if (isVariantPair(trainTask.id, evalTask.id))
+                    continue;
+                const evalVerifierText = readVerifierFiles(evalTask); // read while tests are being registered; captured by the closure below
+                test(`stash:${stashName} — train:${trainTask.id} fragments not in eval:${evalTask.id} verifier`, () => {
+                    const leaked = trainFragments.filter((frag) => evalVerifierText.includes(frag));
+                    expect(leaked, `fragments leaked from train verifier to eval verifier: ${JSON.stringify(leaked)}`).toEqual([]);
+                });
+            }
+        }
+        // Eval → Train: eval verifier fragments must not appear in train verifier text.
+        for (const evalTask of evalTasks) {
+            const evalFragments = crossTaskFragments(evalTask);
+            if (evalFragments.length === 0)
+                continue;
+            for (const trainTask of trainTasks) {
+                if (isVariantPair(trainTask.id, evalTask.id)) // argument order stays (trainId, evalId) in this direction too
+                    continue;
+                const trainVerifierText = readVerifierFiles(trainTask);
+                test(`stash:${stashName} — eval:${evalTask.id} fragments not in train:${trainTask.id} verifier`, () => {
+                    const leaked = evalFragments.filter((frag) => trainVerifierText.includes(frag));
+                    expect(leaked, `fragments leaked from eval verifier to train verifier: ${JSON.stringify(leaked)}`).toEqual([]);
+                });
+            }
+        }
+    }
+});
+describe.skipIf(!FIXTURE_TESTS)("gold-ref leakage check", () => {
     const tasks = listTasks().filter((t) => t.goldRef);
     test("at least one task ships with a gold_ref", () => {
         expect(tasks.length).toBeGreaterThan(0);
@@ -106,6 +205,10 @@ describe("gold-ref leakage check", () => {
             // skipping here previously masked typos and stash-name drift; we now
             // fail loudly so the corpus author is forced to fix the reference.
             if (!goldPath) {
+                // Non-skill refs (workflow:, command:, etc.) are not leakage-checked —
+                // only skill: refs map to a SKILL.md that could leak answers.
+                if (!/^skill:/.test(goldRef))
+                    return;
                 throw new Error(`${task.id}: gold_ref "${goldRef}" against stash "${task.stash}" did not resolve to a SKILL.md under tests/fixtures/stashes/. Fix the gold_ref, fix the stash name, or remove the gold_ref.`);
             }
             const goldContent = fs.readFileSync(goldPath, "utf8");

package/dist/tests/bench/learning-curve.test.js CHANGED Viewed

@@ -16,11 +16,12 @@ function emptyUtilityReport() {
         commit: "deadbee",
         model: "m",
         corpus: { domains: 0, tasks: 0, slice: "all", seedsPerArm: 1 },
-        aggregateNoakm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
-        aggregateAkm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
+        aggregateNoakm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
+        aggregateAkm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
         aggregateDelta: {
             passRate: 0,
             tokensPerPass: 0,
+            tokensPerRun: null,
             wallclockMs: 0,
         },
         trajectoryAkm: {