akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/dist/src/cli.js +100 -16
  2. package/dist/src/commands/config-cli.js +42 -0
  3. package/dist/src/commands/history.js +78 -7
  4. package/dist/src/commands/registry-search.js +69 -6
  5. package/dist/src/commands/search.js +30 -3
  6. package/dist/src/commands/show.js +29 -0
  7. package/dist/src/commands/source-add.js +5 -1
  8. package/dist/src/commands/source-manage.js +7 -1
  9. package/dist/src/core/config.js +28 -0
  10. package/dist/src/indexer/db-search.js +1 -0
  11. package/dist/src/indexer/indexer.js +16 -2
  12. package/dist/src/indexer/matchers.js +1 -1
  13. package/dist/src/indexer/search-source.js +4 -2
  14. package/dist/src/integrations/agent/profiles.js +1 -1
  15. package/dist/src/integrations/agent/spawn.js +67 -16
  16. package/dist/src/integrations/github.js +9 -3
  17. package/dist/src/llm/embedders/remote.js +37 -3
  18. package/dist/src/output/cli-hints.js +15 -2
  19. package/dist/src/output/renderers.js +3 -1
  20. package/dist/src/output/shapes.js +8 -1
  21. package/dist/src/output/text.js +156 -3
  22. package/dist/src/registry/build-index.js +5 -4
  23. package/dist/src/registry/providers/static-index.js +3 -1
  24. package/dist/src/setup/setup.js +9 -0
  25. package/dist/src/wiki/wiki.js +54 -6
  26. package/dist/src/workflows/runs.js +37 -3
  27. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
  28. package/dist/tests/bench/attribution.test.js +24 -23
  29. package/dist/tests/bench/cleanup.js +31 -0
  30. package/dist/tests/bench/cli.js +366 -31
  31. package/dist/tests/bench/cli.test.js +282 -14
  32. package/dist/tests/bench/corpus.js +3 -0
  33. package/dist/tests/bench/corpus.test.js +10 -10
  34. package/dist/tests/bench/doctor.js +525 -0
  35. package/dist/tests/bench/driver.js +77 -22
  36. package/dist/tests/bench/driver.test.js +142 -1
  37. package/dist/tests/bench/environment.js +233 -0
  38. package/dist/tests/bench/environment.test.js +199 -0
  39. package/dist/tests/bench/evolve.js +67 -0
  40. package/dist/tests/bench/evolve.test.js +12 -4
  41. package/dist/tests/bench/failure-modes.test.js +52 -3
  42. package/dist/tests/bench/feedback-integrity.test.js +3 -2
  43. package/dist/tests/bench/leakage.test.js +105 -2
  44. package/dist/tests/bench/learning-curve.test.js +3 -2
  45. package/dist/tests/bench/metrics.js +102 -26
  46. package/dist/tests/bench/metrics.test.js +10 -4
  47. package/dist/tests/bench/opencode-config.js +194 -0
  48. package/dist/tests/bench/opencode-config.test.js +370 -0
  49. package/dist/tests/bench/report.js +73 -9
  50. package/dist/tests/bench/report.test.js +59 -10
  51. package/dist/tests/bench/run-config.js +355 -0
  52. package/dist/tests/bench/run-config.test.js +298 -0
  53. package/dist/tests/bench/run-curate-test.js +32 -0
  54. package/dist/tests/bench/run-failing-tasks.js +56 -0
  55. package/dist/tests/bench/run-full-bench.js +51 -0
  56. package/dist/tests/bench/run-items36-targeted.js +69 -0
  57. package/dist/tests/bench/run-nano-quick.js +42 -0
  58. package/dist/tests/bench/run-waveg-targeted.js +62 -0
  59. package/dist/tests/bench/runner.js +257 -94
  60. package/dist/tests/bench/tmp.js +90 -0
  61. package/dist/tests/bench/trajectory.js +2 -2
  62. package/dist/tests/bench/verifier.js +6 -1
  63. package/dist/tests/bench/workflow-spec.js +11 -24
  64. package/dist/tests/bench/workflow-spec.test.js +1 -1
  65. package/dist/tests/bench/workflow-trace.js +34 -0
  66. package/dist/tests/cli-errors.test.js +1 -0
  67. package/dist/tests/commands/history.test.js +195 -0
  68. package/dist/tests/config.test.js +25 -0
  69. package/dist/tests/e2e.test.js +23 -2
  70. package/dist/tests/fixtures/stashes/load.js +1 -1
  71. package/dist/tests/fixtures/stashes/load.test.js +11 -2
  72. package/dist/tests/indexer.test.js +12 -1
  73. package/dist/tests/output-baseline.test.js +2 -1
  74. package/dist/tests/output-shapes-unit.test.js +3 -1
  75. package/dist/tests/registry-build-index.test.js +17 -1
  76. package/dist/tests/registry-providers/static-index.test.js +34 -0
  77. package/dist/tests/registry-search.test.js +200 -0
  78. package/dist/tests/remember-frontmatter.test.js +11 -13
  79. package/dist/tests/source-qa-fixes.test.js +18 -0
  80. package/dist/tests/source-registry.test.js +3 -3
  81. package/dist/tests/source-source.test.js +61 -1
  82. package/dist/tests/workflow-qa-fixes.test.js +18 -0
  83. package/package.json +1 -1
@@ -38,6 +38,7 @@ import { registerCleanup } from "./cleanup";
38
38
  import { computeLessonMetrics } from "./evolve-metrics";
39
39
  import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
40
40
  import { runUtility } from "./runner";
41
+ import { benchMkdtemp } from "./tmp";
41
42
  /**
42
43
  * Drive the three-phase Track B runner.
43
44
  *
@@ -79,6 +80,8 @@ export async function runEvolve(options) {
79
80
  const preStashes = new Map();
80
81
  const evolveDirByFixture = new Map();
81
82
  const preDirByFixture = new Map();
83
+ /** Per-fixture XDG_CACHE_HOME dirs allocated for evolve-stash indexing. */
84
+ const evolveCacheDirByFixture = new Map();
82
85
  // SIGINT trap (#267): every per-fixture stash registers its cleanup with
83
86
  // the shared registry so an external Ctrl-C reaps the tmp dirs even when
84
87
  // the top-level try/finally never runs. We deregister in the matching
@@ -91,6 +94,12 @@ export async function runEvolve(options) {
91
94
  const evolved = loadFixtureStash(name, { skipIndex: false });
92
95
  evolveStashes.set(name, evolved);
93
96
  evolveDirByFixture.set(name, evolved.stashDir);
97
+ // Allocate a per-fixture cache dir for the evolve-stash re-index.
98
+ // `loadFixtureStash` used its own isolated XDG_CACHE_HOME; subsequent
99
+ // `akmCli` calls (feedback, distill, reflect) must look in the same
100
+ // cache. We allocate a fresh bench cache dir and pass it through
101
+ // `indexEvolveStash` + `envForRef` so the FTS5 DB is in a known place.
102
+ evolveCacheDirByFixture.set(name, benchMkdtemp(`akm-evolve-cache-${name}-`));
94
103
  stashDeregistrations.push(registerCleanup(() => {
95
104
  try {
96
105
  evolved.cleanup();
@@ -132,6 +141,7 @@ export async function runEvolve(options) {
132
141
  refToFixture.set(t.goldRef, t.stash);
133
142
  }
134
143
  const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
144
+ const fallbackEvolveCacheDir = [...evolveCacheDirByFixture.values()][0];
135
145
  function envForRef(ref) {
136
146
  const baseEnv = { ...process.env };
137
147
  if (!materialiseStash) {
@@ -142,12 +152,40 @@ export async function runEvolve(options) {
142
152
  }
143
153
  const fixture = ref ? refToFixture.get(ref) : undefined;
144
154
  const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
155
+ const cacheDir = (fixture && evolveCacheDirByFixture.get(fixture)) ?? fallbackEvolveCacheDir;
145
156
  if (dir)
146
157
  baseEnv.AKM_STASH_DIR = dir;
147
158
  else
148
159
  delete baseEnv.AKM_STASH_DIR;
160
+ if (cacheDir)
161
+ baseEnv.XDG_CACHE_HOME = cacheDir;
149
162
  return baseEnv;
150
163
  }
164
+ // ── Phase 1 pre-flight: index each evolve stash in its dedicated cache. ───
165
+ // `loadFixtureStash` already ran `akm index` but used an isolated
166
+ // XDG_CACHE_HOME that subsequent `akmCli` calls (feedback, distill, reflect)
167
+ // cannot see. Re-running `akm index` here via `akmCli` with the same
168
+ // AKM_STASH_DIR + XDG_CACHE_HOME that `envForRef` will produce ensures the
169
+ // FTS5 database is populated where Phase 1 feedback will look.
170
+ // Non-zero exit adds a warning but does not abort — Phase 1 can still run
171
+ // with degraded feedback if the index step fails.
172
+ if (materialiseStash) {
173
+ const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
174
+ for (const [fixtureName, stashDir] of evolveDirByFixture) {
175
+ const cacheDir = evolveCacheDirByFixture.get(fixtureName);
176
+ if (!cacheDir)
177
+ continue;
178
+ try {
179
+ const result = await indexEvolveStash(stashDir, cacheDir, akmCli, phase1Cwd);
180
+ if (!result.ok) {
181
+ warnings.push(`evolve: pre-flight akm index failed for stash ${stashDir}: ${result.stderr.trim()}`);
182
+ }
183
+ }
184
+ catch (err) {
185
+ warnings.push(`evolve: pre-flight akm index threw for stash ${stashDir}: ${err.message}`);
186
+ }
187
+ }
188
+ }
151
189
  let preReport;
152
190
  let postReport;
153
191
  let syntheticReport;
@@ -172,6 +210,7 @@ export async function runEvolve(options) {
172
210
  ...(options.timestamp ? { timestamp: options.timestamp } : {}),
173
211
  ...(options.branch ? { branch: options.branch } : {}),
174
212
  ...(options.commit ? { commit: options.commit } : {}),
213
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
175
214
  });
176
215
  // Issue feedback events per (task, seed) outcome on the akm arm.
177
216
  const feedbackByRef = new Map();
@@ -271,6 +310,9 @@ export async function runEvolve(options) {
271
310
  const dir = evolveDirByFixture.get(fixtureName);
272
311
  if (dir)
273
312
  proposalEnv.AKM_STASH_DIR = dir;
313
+ const cacheDir = evolveCacheDirByFixture.get(fixtureName);
314
+ if (cacheDir)
315
+ proposalEnv.XDG_CACHE_HOME = cacheDir;
274
316
  }
275
317
  else if (!materialiseStash) {
276
318
  delete proposalEnv.AKM_STASH_DIR;
@@ -332,6 +374,7 @@ export async function runEvolve(options) {
332
374
  ...(options.timestamp ? { timestamp: options.timestamp } : {}),
333
375
  ...(options.branch ? { branch: options.branch } : {}),
334
376
  ...(options.commit ? { commit: options.commit } : {}),
377
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
335
378
  });
336
379
  postReport = await runUtility({
337
380
  tasks: evalTasks,
@@ -350,6 +393,7 @@ export async function runEvolve(options) {
350
393
  ...(options.branch ? { branch: options.branch } : {}),
351
394
  ...(options.commit ? { commit: options.commit } : {}),
352
395
  ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
396
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
353
397
  });
354
398
  // synthetic: no stash. We pass a spawn wrapper that strips
355
399
  // AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
@@ -371,6 +415,7 @@ export async function runEvolve(options) {
371
415
  ...(options.branch ? { branch: options.branch } : {}),
372
416
  ...(options.commit ? { commit: options.commit } : {}),
373
417
  ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
418
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
374
419
  });
375
420
  }
376
421
  finally {
@@ -568,6 +613,28 @@ function parseProposalShow(stdout) {
568
613
  }
569
614
  return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
570
615
  }
616
+ /**
617
+ * Run `akm index` on the evolve stash to populate the FTS5 database in the
618
+ * cache directory that Phase 1 `akmCli` calls will use.
619
+ *
620
+ * `loadFixtureStash` already indexed the stash into an isolated XDG_CACHE_HOME
621
+ * that is invisible to subsequent `akmCli` calls. Calling this helper with the
622
+ * same `stashDir` + `cacheDir` that `envForRef` will forward ensures `akm
623
+ * feedback` (and later `akm distill` / `akm reflect`) can look up refs in the
624
+ * FTS5 index.
625
+ *
626
+ * Returns `{ ok: true }` on exit code 0, `{ ok: false, stderr }` otherwise.
627
+ * Exported for tests.
628
+ */
629
+ export async function indexEvolveStash(stashDir, cacheDir, akmCli, cwd) {
630
+ const env = {
631
+ ...process.env,
632
+ AKM_STASH_DIR: stashDir,
633
+ XDG_CACHE_HOME: cacheDir,
634
+ };
635
+ const result = await akmCli(["index"], cwd, env);
636
+ return { ok: result.exitCode === 0, stderr: result.stderr };
637
+ }
571
638
  /** Exposed for tests so the synthetic-arm prompt construction can be asserted. */
572
639
  export function buildSyntheticPrompt(taskId) {
573
640
  return [
@@ -534,6 +534,7 @@ describe("computeLongitudinalMetrics", () => {
534
534
  passRate: akmPassRate,
535
535
  passAt1: 0,
536
536
  tokensPerPass: null,
537
+ tokensPerRun: null,
537
538
  wallclockMs: 0,
538
539
  passRateStdev: 0,
539
540
  budgetExceededCount: 0,
@@ -548,9 +549,9 @@ describe("computeLongitudinalMetrics", () => {
548
549
  commit: "c",
549
550
  model: "m",
550
551
  corpus: { domains: 1, tasks: 1, slice: "eval", seedsPerArm },
551
- aggregateNoakm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
552
- aggregateAkm: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 },
553
- aggregateDelta: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 },
552
+ aggregateNoakm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
553
+ aggregateAkm: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
554
+ aggregateDelta: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
554
555
  trajectoryAkm: { correctAssetLoaded: null, feedbackRecorded: 0 },
555
556
  failureModes: opts.failureMode
556
557
  ? {
@@ -558,7 +559,14 @@ describe("computeLongitudinalMetrics", () => {
558
559
  byTask: { [taskId]: { [opts.failureMode]: 1 } },
559
560
  }
560
561
  : { byLabel: {}, byTask: {} },
561
- tasks: [{ id: taskId, noakm, akm, delta: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 } }],
562
+ tasks: [
563
+ {
564
+ id: taskId,
565
+ noakm,
566
+ akm,
567
+ delta: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
568
+ },
569
+ ],
562
570
  warnings: [],
563
571
  };
564
572
  }
@@ -130,6 +130,55 @@ describe("classifyFailureMode — seven labels", () => {
130
130
  const out = classifyFailureMode(fakeTask({ goldRef: undefined }), fakeRun({ verifierStdout: trace }));
131
131
  expect(out).toBe("unrelated_bug");
132
132
  });
133
+ test("no_events: task has no goldRef and no search in trace", () => {
134
+ // When there is no goldRef and no search evidence, trajectory.correctAssetLoaded
135
+ // is always null (metric undefined). We cannot tell whether the agent searched
136
+ // or whether events data was absent. Surfaces as `no_events`.
137
+ const out = classifyFailureMode(fakeTask({ goldRef: undefined }), fakeRun({ verifierStdout: "" }));
138
+ expect(out).toBe("no_events");
139
+ });
140
+ });
141
+ describe("classifyFailureMode — trajectory-aware classification (REC-07 / REC-13)", () => {
142
+ test("loaded_ignored: correctAssetLoaded=true + fail → loaded_ignored (short-circuit)", () => {
143
+ // The agent loaded the correct asset (confirmed by trajectory data) but still
144
+ // produced wrong output. This is the dominant failure pattern in the
145
+ // 2026-05-03 baseline: 24/25 `search_no_gold` labels were wrong because the
146
+ // classifier didn't consult trajectory.correctAssetLoaded.
147
+ const out = classifyFailureMode(fakeTask(), fakeRun({
148
+ trajectory: { correctAssetLoaded: true, feedbackRecorded: null },
149
+ verifierStdout: "verifier: field values wrong",
150
+ }));
151
+ expect(out).toBe("loaded_ignored");
152
+ });
153
+ test("loaded_ignored: correctAssetLoaded=true overrides stdout-scan — fires even with no search in trace", () => {
154
+ // Trajectory data is authoritative. Even if verifierStdout shows no `akm
155
+ // search`, the trajectory says the gold was loaded → loaded_ignored, not
156
+ // no_search.
157
+ const out = classifyFailureMode(fakeTask(), fakeRun({
158
+ trajectory: { correctAssetLoaded: true, feedbackRecorded: null },
159
+ verifierStdout: "",
160
+ }));
161
+ expect(out).toBe("loaded_ignored");
162
+ });
163
+ test("search_no_gold: correctAssetLoaded=false + search ran + gold absent → search_no_gold", () => {
164
+ // When trajectory says gold was NOT loaded and search ran but gold ref absent
165
+ // from results, this is a genuine search failure.
166
+ const trace = ["$ akm search homelab", "1. skill:foo", "2. skill:bar"].join("\n");
167
+ const out = classifyFailureMode(fakeTask(), fakeRun({
168
+ trajectory: { correctAssetLoaded: false, feedbackRecorded: null },
169
+ verifierStdout: trace,
170
+ }));
171
+ expect(out).toBe("search_no_gold");
172
+ });
173
+ test("no_search: correctAssetLoaded=false + no search in trace → no_search", () => {
174
+ // When trajectory says gold was NOT loaded and there is no search evidence,
175
+ // the agent genuinely didn't search.
176
+ const out = classifyFailureMode(fakeTask(), fakeRun({
177
+ trajectory: { correctAssetLoaded: false, feedbackRecorded: null },
178
+ verifierStdout: "verifier: missing output",
179
+ }));
180
+ expect(out).toBe("no_search");
181
+ });
133
182
  });
134
183
  describe("classifyFailureMode — tie-breaking and priority", () => {
135
184
  test("no_search beats search_no_gold when both could apply (no search call)", () => {
@@ -258,9 +307,9 @@ describe("renderFailureModeBreakdown", () => {
258
307
  commit: "y",
259
308
  model: "m",
260
309
  corpus: { domains: 1, tasks: 1, slice: "all", seedsPerArm: 5 },
261
- aggregateNoakm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
262
- aggregateAkm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
263
- aggregateDelta: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
310
+ aggregateNoakm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
311
+ aggregateAkm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
312
+ aggregateDelta: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
264
313
  trajectoryAkm: { correctAssetLoaded: null, feedbackRecorded: 0 },
265
314
  failureModes: { byLabel, byTask: {} },
266
315
  tasks: [],
@@ -288,11 +288,12 @@ function emptyUtilityReport() {
288
288
  commit: "deadbee",
289
289
  model: "m",
290
290
  corpus: { domains: 0, tasks: 0, slice: "all", seedsPerArm: 1 },
291
- aggregateNoakm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
292
- aggregateAkm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
291
+ aggregateNoakm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
292
+ aggregateAkm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
293
293
  aggregateDelta: {
294
294
  passRate: 0,
295
295
  tokensPerPass: 0,
296
+ tokensPerRun: null,
296
297
  wallclockMs: 0,
297
298
  },
298
299
  trajectoryAkm: {
@@ -1,6 +1,14 @@
1
1
  /**
2
2
  * Leakage smoke test for the seeded bench corpus (spec §7.4).
3
3
  *
4
+ * Gated behind `AKM_BENCH_FIXTURE_TESTS=1`. This is a corpus-content
5
+ * validator (it inspects the seeded fixture stashes and verifier files,
6
+ * not the bench framework code itself), so it ships skipped by default —
7
+ * matching the `AKM_SEMANTIC_TESTS` / `AKM_DOCKER_TESTS` pattern. Run it
8
+ * locally when you change a fixture stash or a verifier:
9
+ *
10
+ * AKM_BENCH_FIXTURE_TESTS=1 bun test tests/bench/leakage.test.ts
11
+ *
4
12
  * For every task that declares a `gold_ref` of the form `skill:<name>`,
5
13
  * locate the SKILL.md inside the named fixture stash and assert that the
6
14
  * verifier's *structural assertions* do not appear verbatim in the gold-ref
@@ -26,7 +34,8 @@
26
34
  import { describe, expect, test } from "bun:test";
27
35
  import fs from "node:fs";
28
36
  import path from "node:path";
29
- import { getTasksRoot, listTasks } from "./corpus";
37
+ import { effectiveSlice, getTasksRoot, listTasks } from "./corpus";
38
+ const FIXTURE_TESTS = !!process.env.AKM_BENCH_FIXTURE_TESTS;
30
39
  const STASHES_ROOT = path.resolve(getTasksRoot(), "..", "..", "stashes");
31
40
  /** Resolve `skill:<name>` against the named stash; returns SKILL.md path or `undefined`. */
32
41
  function resolveGoldRefPath(stashName, goldRef) {
@@ -93,7 +102,97 @@ function readVerifierFiles(task) {
93
102
  }
94
103
  return combined;
95
104
  }
96
- describe("gold-ref leakage check", () => {
105
+ /**
106
+ * Return the verifier assertion fragments for a task, applying an additional
107
+ * filter suitable for cross-task comparisons. Short two-word domain phrases
108
+ * (e.g. `akm feedback`, `akm search`) naturally recur across tasks that share
109
+ * a domain — they are NOT meaningful leakage signals. A fragment is considered
110
+ * meaningful only when it either:
111
+ * • contains at least two spaces (three or more tokens), or
112
+ * • contains a structural character (`=`, `[`, `(`) that marks it as a
113
+ * complex expression unlikely to appear by coincidence.
114
+ *
115
+ * This is more precise than a raw length threshold because it captures the
116
+ * difference between `akm feedback` (12 chars, 2 tokens, no structure) and
117
+ * `.model == "anthropic/claude-opus-4-7"` (37 chars, structural `==`).
118
+ */
119
+ function crossTaskFragments(task) {
120
+ const isMeaningful = (f) => {
121
+ const spaceCount = (f.match(/ /g) ?? []).length;
122
+ return spaceCount >= 2 || /[=[(]/.test(f);
123
+ };
124
+ const raw = [];
125
+ if (task.verifier === "regex" && task.expectedMatch) {
126
+ raw.push(...regexLiterals(task.expectedMatch));
127
+ }
128
+ else {
129
+ const verifierText = readVerifierFiles(task);
130
+ raw.push(...pytestStructuralFragments(verifierText));
131
+ raw.push(...shellAssertionFragments(verifierText));
132
+ }
133
+ return raw.filter(isMeaningful);
134
+ }
135
+ describe.skipIf(!FIXTURE_TESTS)("cross-task eval/train verifier leakage check", () => {
136
+ const allTasks = listTasks();
137
+ // Group tasks by stash name.
138
+ const byStash = new Map();
139
+ for (const task of allTasks) {
140
+ const group = byStash.get(task.stash) ?? [];
141
+ group.push(task);
142
+ byStash.set(task.stash, group);
143
+ }
144
+ // Only stashes that have BOTH train and eval tasks are interesting.
145
+ const mixedStashes = [...byStash.entries()].filter(([, tasks]) => {
146
+ const hasTrain = tasks.some((t) => effectiveSlice(t) === "train");
147
+ const hasEval = tasks.some((t) => effectiveSlice(t) === "eval");
148
+ return hasTrain && hasEval;
149
+ });
150
+ test("at least one stash has both train and eval tasks", () => {
151
+ expect(mixedStashes.length).toBeGreaterThan(0);
152
+ });
153
+ for (const [stashName, tasks] of mixedStashes) {
154
+ const trainTasks = tasks.filter((t) => effectiveSlice(t) === "train");
155
+ const evalTasks = tasks.filter((t) => effectiveSlice(t) === "eval");
156
+ // Train → Eval: train verifier fragments must not appear in eval verifier text.
157
+ // Skip pairs that are intentional train/eval variants of the same task family
158
+ // (e.g. inkwell/add-healthcheck-train vs inkwell/add-healthcheck) — they share
159
+ // field-access patterns by design, just with different expected values.
160
+ const isVariantPair = (trainId, evalId) => {
161
+ const trainBase = trainId.replace(/-train$/, "");
162
+ return trainBase === evalId || evalId.startsWith(`${trainBase}-`);
163
+ };
164
+ for (const trainTask of trainTasks) {
165
+ const trainFragments = crossTaskFragments(trainTask);
166
+ if (trainFragments.length === 0)
167
+ continue;
168
+ for (const evalTask of evalTasks) {
169
+ if (isVariantPair(trainTask.id, evalTask.id))
170
+ continue;
171
+ const evalVerifierText = readVerifierFiles(evalTask);
172
+ test(`stash:${stashName} — train:${trainTask.id} fragments not in eval:${evalTask.id} verifier`, () => {
173
+ const leaked = trainFragments.filter((frag) => evalVerifierText.includes(frag));
174
+ expect(leaked, `fragments leaked from train verifier to eval verifier: ${JSON.stringify(leaked)}`).toEqual([]);
175
+ });
176
+ }
177
+ }
178
+ // Eval → Train: eval verifier fragments must not appear in train verifier text.
179
+ for (const evalTask of evalTasks) {
180
+ const evalFragments = crossTaskFragments(evalTask);
181
+ if (evalFragments.length === 0)
182
+ continue;
183
+ for (const trainTask of trainTasks) {
184
+ if (isVariantPair(trainTask.id, evalTask.id))
185
+ continue;
186
+ const trainVerifierText = readVerifierFiles(trainTask);
187
+ test(`stash:${stashName} — eval:${evalTask.id} fragments not in train:${trainTask.id} verifier`, () => {
188
+ const leaked = evalFragments.filter((frag) => trainVerifierText.includes(frag));
189
+ expect(leaked, `fragments leaked from eval verifier to train verifier: ${JSON.stringify(leaked)}`).toEqual([]);
190
+ });
191
+ }
192
+ }
193
+ }
194
+ });
195
+ describe.skipIf(!FIXTURE_TESTS)("gold-ref leakage check", () => {
97
196
  const tasks = listTasks().filter((t) => t.goldRef);
98
197
  test("at least one task ships with a gold_ref", () => {
99
198
  expect(tasks.length).toBeGreaterThan(0);
@@ -106,6 +205,10 @@ describe("gold-ref leakage check", () => {
106
205
  // skipping here previously masked typos and stash-name drift; we now
107
206
  // fail loudly so the corpus author is forced to fix the reference.
108
207
  if (!goldPath) {
208
+ // Non-skill refs (workflow:, command:, etc.) are not leakage-checked —
209
+ // only skill: refs map to a SKILL.md that could leak answers.
210
+ if (!/^skill:/.test(goldRef))
211
+ return;
109
212
  throw new Error(`${task.id}: gold_ref "${goldRef}" against stash "${task.stash}" did not resolve to a SKILL.md under tests/fixtures/stashes/. Fix the gold_ref, fix the stash name, or remove the gold_ref.`);
110
213
  }
111
214
  const goldContent = fs.readFileSync(goldPath, "utf8");
@@ -16,11 +16,12 @@ function emptyUtilityReport() {
16
16
  commit: "deadbee",
17
17
  model: "m",
18
18
  corpus: { domains: 0, tasks: 0, slice: "all", seedsPerArm: 1 },
19
- aggregateNoakm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
20
- aggregateAkm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
19
+ aggregateNoakm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
20
+ aggregateAkm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
21
21
  aggregateDelta: {
22
22
  passRate: 0,
23
23
  tokensPerPass: 0,
24
+ tokensPerRun: null,
24
25
  wallclockMs: 0,
25
26
  },
26
27
  trajectoryAkm: {