akm-cli 0.7.0-rc1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/cli.js +100 -16
- package/dist/src/commands/config-cli.js +42 -0
- package/dist/src/commands/history.js +78 -7
- package/dist/src/commands/registry-search.js +69 -6
- package/dist/src/commands/search.js +30 -3
- package/dist/src/commands/show.js +29 -0
- package/dist/src/commands/source-add.js +5 -1
- package/dist/src/commands/source-manage.js +7 -1
- package/dist/src/core/config.js +28 -0
- package/dist/src/indexer/db-search.js +1 -0
- package/dist/src/indexer/indexer.js +16 -2
- package/dist/src/indexer/matchers.js +1 -1
- package/dist/src/indexer/search-source.js +4 -2
- package/dist/src/integrations/agent/profiles.js +1 -1
- package/dist/src/integrations/agent/spawn.js +67 -16
- package/dist/src/integrations/github.js +9 -3
- package/dist/src/llm/embedders/remote.js +37 -3
- package/dist/src/output/cli-hints.js +15 -2
- package/dist/src/output/renderers.js +3 -1
- package/dist/src/output/shapes.js +8 -1
- package/dist/src/output/text.js +156 -3
- package/dist/src/registry/build-index.js +5 -4
- package/dist/src/registry/providers/static-index.js +3 -1
- package/dist/src/setup/setup.js +9 -0
- package/dist/src/wiki/wiki.js +54 -6
- package/dist/src/workflows/runs.js +37 -3
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
- package/dist/tests/bench/attribution.test.js +24 -23
- package/dist/tests/bench/cleanup.js +31 -0
- package/dist/tests/bench/cli.js +366 -31
- package/dist/tests/bench/cli.test.js +282 -14
- package/dist/tests/bench/corpus.js +3 -0
- package/dist/tests/bench/corpus.test.js +10 -10
- package/dist/tests/bench/doctor.js +525 -0
- package/dist/tests/bench/driver.js +77 -22
- package/dist/tests/bench/driver.test.js +142 -1
- package/dist/tests/bench/environment.js +233 -0
- package/dist/tests/bench/environment.test.js +199 -0
- package/dist/tests/bench/evolve.js +67 -0
- package/dist/tests/bench/evolve.test.js +12 -4
- package/dist/tests/bench/failure-modes.test.js +52 -3
- package/dist/tests/bench/feedback-integrity.test.js +3 -2
- package/dist/tests/bench/leakage.test.js +105 -2
- package/dist/tests/bench/learning-curve.test.js +3 -2
- package/dist/tests/bench/metrics.js +102 -26
- package/dist/tests/bench/metrics.test.js +10 -4
- package/dist/tests/bench/opencode-config.js +194 -0
- package/dist/tests/bench/opencode-config.test.js +370 -0
- package/dist/tests/bench/report.js +73 -9
- package/dist/tests/bench/report.test.js +59 -10
- package/dist/tests/bench/run-config.js +355 -0
- package/dist/tests/bench/run-config.test.js +298 -0
- package/dist/tests/bench/run-curate-test.js +32 -0
- package/dist/tests/bench/run-failing-tasks.js +56 -0
- package/dist/tests/bench/run-full-bench.js +51 -0
- package/dist/tests/bench/run-items36-targeted.js +69 -0
- package/dist/tests/bench/run-nano-quick.js +42 -0
- package/dist/tests/bench/run-waveg-targeted.js +62 -0
- package/dist/tests/bench/runner.js +257 -94
- package/dist/tests/bench/tmp.js +90 -0
- package/dist/tests/bench/trajectory.js +2 -2
- package/dist/tests/bench/verifier.js +6 -1
- package/dist/tests/bench/workflow-spec.js +11 -24
- package/dist/tests/bench/workflow-spec.test.js +1 -1
- package/dist/tests/bench/workflow-trace.js +34 -0
- package/dist/tests/cli-errors.test.js +1 -0
- package/dist/tests/commands/history.test.js +195 -0
- package/dist/tests/config.test.js +25 -0
- package/dist/tests/e2e.test.js +23 -2
- package/dist/tests/fixtures/stashes/load.js +1 -1
- package/dist/tests/fixtures/stashes/load.test.js +11 -2
- package/dist/tests/indexer.test.js +12 -1
- package/dist/tests/output-baseline.test.js +2 -1
- package/dist/tests/output-shapes-unit.test.js +3 -1
- package/dist/tests/registry-build-index.test.js +17 -1
- package/dist/tests/registry-providers/static-index.test.js +34 -0
- package/dist/tests/registry-search.test.js +200 -0
- package/dist/tests/remember-frontmatter.test.js +11 -13
- package/dist/tests/source-qa-fixes.test.js +18 -0
- package/dist/tests/source-registry.test.js +3 -3
- package/dist/tests/source-source.test.js +61 -1
- package/dist/tests/workflow-qa-fixes.test.js +18 -0
- package/package.json +1 -1
|
@@ -38,6 +38,7 @@ import { registerCleanup } from "./cleanup";
|
|
|
38
38
|
import { computeLessonMetrics } from "./evolve-metrics";
|
|
39
39
|
import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
|
|
40
40
|
import { runUtility } from "./runner";
|
|
41
|
+
import { benchMkdtemp } from "./tmp";
|
|
41
42
|
/**
|
|
42
43
|
* Drive the three-phase Track B runner.
|
|
43
44
|
*
|
|
@@ -79,6 +80,8 @@ export async function runEvolve(options) {
|
|
|
79
80
|
const preStashes = new Map();
|
|
80
81
|
const evolveDirByFixture = new Map();
|
|
81
82
|
const preDirByFixture = new Map();
|
|
83
|
+
/** Per-fixture XDG_CACHE_HOME dirs allocated for evolve-stash indexing. */
|
|
84
|
+
const evolveCacheDirByFixture = new Map();
|
|
82
85
|
// SIGINT trap (#267): every per-fixture stash registers its cleanup with
|
|
83
86
|
// the shared registry so an external Ctrl-C reaps the tmp dirs even when
|
|
84
87
|
// the top-level try/finally never runs. We deregister in the matching
|
|
@@ -91,6 +94,12 @@ export async function runEvolve(options) {
|
|
|
91
94
|
const evolved = loadFixtureStash(name, { skipIndex: false });
|
|
92
95
|
evolveStashes.set(name, evolved);
|
|
93
96
|
evolveDirByFixture.set(name, evolved.stashDir);
|
|
97
|
+
// Allocate a per-fixture cache dir for the evolve-stash re-index.
|
|
98
|
+
// `loadFixtureStash` used its own isolated XDG_CACHE_HOME; subsequent
|
|
99
|
+
// `akmCli` calls (feedback, distill, reflect) must look in the same
|
|
100
|
+
// cache. We allocate a fresh bench cache dir and pass it through
|
|
101
|
+
// `indexEvolveStash` + `envForRef` so the FTS5 DB is in a known place.
|
|
102
|
+
evolveCacheDirByFixture.set(name, benchMkdtemp(`akm-evolve-cache-${name}-`));
|
|
94
103
|
stashDeregistrations.push(registerCleanup(() => {
|
|
95
104
|
try {
|
|
96
105
|
evolved.cleanup();
|
|
@@ -132,6 +141,7 @@ export async function runEvolve(options) {
|
|
|
132
141
|
refToFixture.set(t.goldRef, t.stash);
|
|
133
142
|
}
|
|
134
143
|
const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
|
|
144
|
+
const fallbackEvolveCacheDir = [...evolveCacheDirByFixture.values()][0];
|
|
135
145
|
function envForRef(ref) {
|
|
136
146
|
const baseEnv = { ...process.env };
|
|
137
147
|
if (!materialiseStash) {
|
|
@@ -142,12 +152,40 @@ export async function runEvolve(options) {
|
|
|
142
152
|
}
|
|
143
153
|
const fixture = ref ? refToFixture.get(ref) : undefined;
|
|
144
154
|
const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
|
|
155
|
+
const cacheDir = (fixture && evolveCacheDirByFixture.get(fixture)) ?? fallbackEvolveCacheDir;
|
|
145
156
|
if (dir)
|
|
146
157
|
baseEnv.AKM_STASH_DIR = dir;
|
|
147
158
|
else
|
|
148
159
|
delete baseEnv.AKM_STASH_DIR;
|
|
160
|
+
if (cacheDir)
|
|
161
|
+
baseEnv.XDG_CACHE_HOME = cacheDir;
|
|
149
162
|
return baseEnv;
|
|
150
163
|
}
|
|
164
|
+
// ── Phase 1 pre-flight: index each evolve stash in its dedicated cache. ───
|
|
165
|
+
// `loadFixtureStash` already ran `akm index` but used an isolated
|
|
166
|
+
// XDG_CACHE_HOME that subsequent `akmCli` calls (feedback, distill, reflect)
|
|
167
|
+
// cannot see. Re-running `akm index` here via `akmCli` with the same
|
|
168
|
+
// AKM_STASH_DIR + XDG_CACHE_HOME that `envForRef` will produce ensures the
|
|
169
|
+
// FTS5 database is populated where Phase 1 feedback will look.
|
|
170
|
+
// Non-zero exit adds a warning but does not abort — Phase 1 can still run
|
|
171
|
+
// with degraded feedback if the index step fails.
|
|
172
|
+
if (materialiseStash) {
|
|
173
|
+
const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
|
|
174
|
+
for (const [fixtureName, stashDir] of evolveDirByFixture) {
|
|
175
|
+
const cacheDir = evolveCacheDirByFixture.get(fixtureName);
|
|
176
|
+
if (!cacheDir)
|
|
177
|
+
continue;
|
|
178
|
+
try {
|
|
179
|
+
const result = await indexEvolveStash(stashDir, cacheDir, akmCli, phase1Cwd);
|
|
180
|
+
if (!result.ok) {
|
|
181
|
+
warnings.push(`evolve: pre-flight akm index failed for stash ${stashDir}: ${result.stderr.trim()}`);
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
catch (err) {
|
|
185
|
+
warnings.push(`evolve: pre-flight akm index threw for stash ${stashDir}: ${err.message}`);
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
151
189
|
let preReport;
|
|
152
190
|
let postReport;
|
|
153
191
|
let syntheticReport;
|
|
@@ -172,6 +210,7 @@ export async function runEvolve(options) {
|
|
|
172
210
|
...(options.timestamp ? { timestamp: options.timestamp } : {}),
|
|
173
211
|
...(options.branch ? { branch: options.branch } : {}),
|
|
174
212
|
...(options.commit ? { commit: options.commit } : {}),
|
|
213
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
175
214
|
});
|
|
176
215
|
// Issue feedback events per (task, seed) outcome on the akm arm.
|
|
177
216
|
const feedbackByRef = new Map();
|
|
@@ -271,6 +310,9 @@ export async function runEvolve(options) {
|
|
|
271
310
|
const dir = evolveDirByFixture.get(fixtureName);
|
|
272
311
|
if (dir)
|
|
273
312
|
proposalEnv.AKM_STASH_DIR = dir;
|
|
313
|
+
const cacheDir = evolveCacheDirByFixture.get(fixtureName);
|
|
314
|
+
if (cacheDir)
|
|
315
|
+
proposalEnv.XDG_CACHE_HOME = cacheDir;
|
|
274
316
|
}
|
|
275
317
|
else if (!materialiseStash) {
|
|
276
318
|
delete proposalEnv.AKM_STASH_DIR;
|
|
@@ -332,6 +374,7 @@ export async function runEvolve(options) {
|
|
|
332
374
|
...(options.timestamp ? { timestamp: options.timestamp } : {}),
|
|
333
375
|
...(options.branch ? { branch: options.branch } : {}),
|
|
334
376
|
...(options.commit ? { commit: options.commit } : {}),
|
|
377
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
335
378
|
});
|
|
336
379
|
postReport = await runUtility({
|
|
337
380
|
tasks: evalTasks,
|
|
@@ -350,6 +393,7 @@ export async function runEvolve(options) {
|
|
|
350
393
|
...(options.branch ? { branch: options.branch } : {}),
|
|
351
394
|
...(options.commit ? { commit: options.commit } : {}),
|
|
352
395
|
...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
|
|
396
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
353
397
|
});
|
|
354
398
|
// synthetic: no stash. We pass a spawn wrapper that strips
|
|
355
399
|
// AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
|
|
@@ -371,6 +415,7 @@ export async function runEvolve(options) {
|
|
|
371
415
|
...(options.branch ? { branch: options.branch } : {}),
|
|
372
416
|
...(options.commit ? { commit: options.commit } : {}),
|
|
373
417
|
...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
|
|
418
|
+
...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
|
|
374
419
|
});
|
|
375
420
|
}
|
|
376
421
|
finally {
|
|
@@ -568,6 +613,28 @@ function parseProposalShow(stdout) {
|
|
|
568
613
|
}
|
|
569
614
|
return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
|
|
570
615
|
}
|
|
616
|
+
/**
 * Run `akm index` against an evolve stash so the index is written into the
 * cache directory that later `akmCli` calls will be pointed at.
 *
 * The child environment is the current `process.env` with `AKM_STASH_DIR`
 * and `XDG_CACHE_HOME` overridden, i.e. the same two variables the caller's
 * `envForRef` sets for follow-up commands.
 *
 * @param {string} stashDir - Value exported as `AKM_STASH_DIR`.
 * @param {string} cacheDir - Value exported as `XDG_CACHE_HOME`.
 * @param {Function} akmCli - Injected runner, invoked as
 *   `akmCli(["index"], cwd, env)`; its result must expose `exitCode` and may
 *   expose `stderr`.
 * @param {string} cwd - Working directory forwarded to `akmCli`.
 * @returns {Promise<{ ok: boolean, stderr: string }>} `ok` is true only when
 *   the exit code is exactly 0. `stderr` is always a string (empty when the
 *   runner reported none) so callers can `.trim()` it without guarding.
 *
 * Exported for tests.
 */
export async function indexEvolveStash(stashDir, cacheDir, akmCli, cwd) {
    const env = {
        ...process.env,
        AKM_STASH_DIR: stashDir,
        XDG_CACHE_HOME: cacheDir,
    };
    const result = await akmCli(["index"], cwd, env);
    // A runner that omits `stderr` must not turn a plain index failure into a
    // TypeError at the call site, which formats the warning via `stderr.trim()`.
    const stderr = typeof result.stderr === "string" ? result.stderr : String(result.stderr ?? "");
    return { ok: result.exitCode === 0, stderr };
}
|
|
571
638
|
/** Exposed for tests so the synthetic-arm prompt construction can be asserted. */
|
|
572
639
|
export function buildSyntheticPrompt(taskId) {
|
|
573
640
|
return [
|
|
@@ -534,6 +534,7 @@ describe("computeLongitudinalMetrics", () => {
|
|
|
534
534
|
passRate: akmPassRate,
|
|
535
535
|
passAt1: 0,
|
|
536
536
|
tokensPerPass: null,
|
|
537
|
+
tokensPerRun: null,
|
|
537
538
|
wallclockMs: 0,
|
|
538
539
|
passRateStdev: 0,
|
|
539
540
|
budgetExceededCount: 0,
|
|
@@ -548,9 +549,9 @@ describe("computeLongitudinalMetrics", () => {
|
|
|
548
549
|
commit: "c",
|
|
549
550
|
model: "m",
|
|
550
551
|
corpus: { domains: 1, tasks: 1, slice: "eval", seedsPerArm },
|
|
551
|
-
aggregateNoakm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
|
|
552
|
-
aggregateAkm: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 },
|
|
553
|
-
aggregateDelta: { passRate: akmPassRate, tokensPerPass: null, wallclockMs: 0 },
|
|
552
|
+
aggregateNoakm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
|
|
553
|
+
aggregateAkm: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
|
|
554
|
+
aggregateDelta: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
|
|
554
555
|
trajectoryAkm: { correctAssetLoaded: null, feedbackRecorded: 0 },
|
|
555
556
|
failureModes: opts.failureMode
|
|
556
557
|
? {
|
|
@@ -558,7 +559,14 @@ describe("computeLongitudinalMetrics", () => {
|
|
|
558
559
|
byTask: { [taskId]: { [opts.failureMode]: 1 } },
|
|
559
560
|
}
|
|
560
561
|
: { byLabel: {}, byTask: {} },
|
|
561
|
-
tasks: [
|
|
562
|
+
tasks: [
|
|
563
|
+
{
|
|
564
|
+
id: taskId,
|
|
565
|
+
noakm,
|
|
566
|
+
akm,
|
|
567
|
+
delta: { passRate: akmPassRate, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
|
|
568
|
+
},
|
|
569
|
+
],
|
|
562
570
|
warnings: [],
|
|
563
571
|
};
|
|
564
572
|
}
|
|
@@ -130,6 +130,55 @@ describe("classifyFailureMode — seven labels", () => {
|
|
|
130
130
|
const out = classifyFailureMode(fakeTask({ goldRef: undefined }), fakeRun({ verifierStdout: trace }));
|
|
131
131
|
expect(out).toBe("unrelated_bug");
|
|
132
132
|
});
|
|
133
|
+
test("no_events: task has no goldRef and no search in trace", () => {
|
|
134
|
+
// When there is no goldRef and no search evidence, trajectory.correctAssetLoaded
|
|
135
|
+
// is always null (metric undefined). We cannot tell whether the agent searched
|
|
136
|
+
// or whether events data was absent. Surfaces as `no_events`.
|
|
137
|
+
const out = classifyFailureMode(fakeTask({ goldRef: undefined }), fakeRun({ verifierStdout: "" }));
|
|
138
|
+
expect(out).toBe("no_events");
|
|
139
|
+
});
|
|
140
|
+
});
|
|
141
|
+
describe("classifyFailureMode — trajectory-aware classification (REC-07 / REC-13)", () => {
    // Every case classifies a default `fakeTask()` against a run that differs
    // only in `trajectory.correctAssetLoaded` and `verifierStdout`.
    const classifyWithTrajectory = (correctAssetLoaded, verifierStdout) => classifyFailureMode(fakeTask(), fakeRun({
        trajectory: { correctAssetLoaded, feedbackRecorded: null },
        verifierStdout,
    }));
    test("loaded_ignored: correctAssetLoaded=true + fail → loaded_ignored (short-circuit)", () => {
        // Trajectory data confirms the correct asset was loaded, yet the run
        // still failed. Per the original note, 24/25 `search_no_gold` labels in
        // the 2026-05-03 baseline were wrong because the classifier did not
        // consult trajectory.correctAssetLoaded.
        const label = classifyWithTrajectory(true, "verifier: field values wrong");
        expect(label).toBe("loaded_ignored");
    });
    test("loaded_ignored: correctAssetLoaded=true overrides stdout-scan — fires even with no search in trace", () => {
        // Trajectory data is authoritative: an empty verifierStdout (no
        // `akm search` evidence) must still yield loaded_ignored, not no_search.
        const label = classifyWithTrajectory(true, "");
        expect(label).toBe("loaded_ignored");
    });
    test("search_no_gold: correctAssetLoaded=false + search ran + gold absent → search_no_gold", () => {
        // Gold was not loaded, a search ran, and the listed results do not
        // contain the gold ref: a genuine search failure.
        const searchTrace = ["$ akm search homelab", "1. skill:foo", "2. skill:bar"].join("\n");
        const label = classifyWithTrajectory(false, searchTrace);
        expect(label).toBe("search_no_gold");
    });
    test("no_search: correctAssetLoaded=false + no search in trace → no_search", () => {
        // Gold was not loaded and the trace shows no search at all.
        const label = classifyWithTrajectory(false, "verifier: missing output");
        expect(label).toBe("no_search");
    });
});
|
|
134
183
|
describe("classifyFailureMode — tie-breaking and priority", () => {
|
|
135
184
|
test("no_search beats search_no_gold when both could apply (no search call)", () => {
|
|
@@ -258,9 +307,9 @@ describe("renderFailureModeBreakdown", () => {
|
|
|
258
307
|
commit: "y",
|
|
259
308
|
model: "m",
|
|
260
309
|
corpus: { domains: 1, tasks: 1, slice: "all", seedsPerArm: 5 },
|
|
261
|
-
aggregateNoakm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
|
|
262
|
-
aggregateAkm: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
|
|
263
|
-
aggregateDelta: { passRate: 0, tokensPerPass: null, wallclockMs: 0 },
|
|
310
|
+
aggregateNoakm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
|
|
311
|
+
aggregateAkm: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
|
|
312
|
+
aggregateDelta: { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 0 },
|
|
264
313
|
trajectoryAkm: { correctAssetLoaded: null, feedbackRecorded: 0 },
|
|
265
314
|
failureModes: { byLabel, byTask: {} },
|
|
266
315
|
tasks: [],
|
|
@@ -288,11 +288,12 @@ function emptyUtilityReport() {
|
|
|
288
288
|
commit: "deadbee",
|
|
289
289
|
model: "m",
|
|
290
290
|
corpus: { domains: 0, tasks: 0, slice: "all", seedsPerArm: 1 },
|
|
291
|
-
aggregateNoakm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
|
|
292
|
-
aggregateAkm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
|
|
291
|
+
aggregateNoakm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
|
|
292
|
+
aggregateAkm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
|
|
293
293
|
aggregateDelta: {
|
|
294
294
|
passRate: 0,
|
|
295
295
|
tokensPerPass: 0,
|
|
296
|
+
tokensPerRun: null,
|
|
296
297
|
wallclockMs: 0,
|
|
297
298
|
},
|
|
298
299
|
trajectoryAkm: {
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Leakage smoke test for the seeded bench corpus (spec §7.4).
|
|
3
3
|
*
|
|
4
|
+
* Gated behind `AKM_BENCH_FIXTURE_TESTS=1`. This is a corpus-content
|
|
5
|
+
* validator (it inspects the seeded fixture stashes and verifier files,
|
|
6
|
+
* not the bench framework code itself), so it ships skipped by default —
|
|
7
|
+
* matching the `AKM_SEMANTIC_TESTS` / `AKM_DOCKER_TESTS` pattern. Run it
|
|
8
|
+
* locally when you change a fixture stash or a verifier:
|
|
9
|
+
*
|
|
10
|
+
* AKM_BENCH_FIXTURE_TESTS=1 bun test tests/bench/leakage.test.ts
|
|
11
|
+
*
|
|
4
12
|
* For every task that declares a `gold_ref` of the form `skill:<name>`,
|
|
5
13
|
* locate the SKILL.md inside the named fixture stash and assert that the
|
|
6
14
|
* verifier's *structural assertions* do not appear verbatim in the gold-ref
|
|
@@ -26,7 +34,8 @@
|
|
|
26
34
|
import { describe, expect, test } from "bun:test";
|
|
27
35
|
import fs from "node:fs";
|
|
28
36
|
import path from "node:path";
|
|
29
|
-
import { getTasksRoot, listTasks } from "./corpus";
|
|
37
|
+
import { effectiveSlice, getTasksRoot, listTasks } from "./corpus";
|
|
38
|
+
const FIXTURE_TESTS = !!process.env.AKM_BENCH_FIXTURE_TESTS;
|
|
30
39
|
const STASHES_ROOT = path.resolve(getTasksRoot(), "..", "..", "stashes");
|
|
31
40
|
/** Resolve `skill:<name>` against the named stash; returns SKILL.md path or `undefined`. */
|
|
32
41
|
function resolveGoldRefPath(stashName, goldRef) {
|
|
@@ -93,7 +102,97 @@ function readVerifierFiles(task) {
|
|
|
93
102
|
}
|
|
94
103
|
return combined;
|
|
95
104
|
}
|
|
96
|
-
|
|
105
|
+
/**
 * Collect a task's verifier assertion fragments and keep only those that are
 * useful for comparing one task against another.
 *
 * Fragment source:
 *   • `regex` verifiers with an `expectedMatch` → `regexLiterals(expectedMatch)`;
 *   • everything else → pytest structural fragments followed by shell
 *     assertion fragments, both extracted from the task's verifier files.
 *
 * A fragment survives the filter when it has at least two spaces (three or
 * more tokens) or contains one of the structural characters `=`, `[`, `(`.
 * Short two-word phrases such as `akm feedback` are dropped because they
 * recur naturally across tasks in the same domain, whereas something like
 * `.model == "anthropic/claude-opus-4-7"` is kept.
 */
function crossTaskFragments(task) {
    let candidates;
    if (task.verifier === "regex" && task.expectedMatch) {
        candidates = [...regexLiterals(task.expectedMatch)];
    }
    else {
        const verifierSource = readVerifierFiles(task);
        candidates = [
            ...pytestStructuralFragments(verifierSource),
            ...shellAssertionFragments(verifierSource),
        ];
    }
    return candidates.filter((fragment) => {
        // Number of literal space characters = pieces after splitting, minus one.
        const spaces = fragment.split(" ").length - 1;
        if (spaces >= 2) {
            return true;
        }
        return /[=[(]/.test(fragment);
    });
}
|
|
135
|
+
describe.skipIf(!FIXTURE_TESTS)("cross-task eval/train verifier leakage check", () => {
    const allTasks = listTasks();
    // Group tasks by stash name.
    const byStash = new Map();
    for (const task of allTasks) {
        const group = byStash.get(task.stash) ?? [];
        group.push(task);
        byStash.set(task.stash, group);
    }
    // Only stashes that have BOTH train and eval tasks are interesting.
    const mixedStashes = [...byStash.entries()].filter(([, tasks]) => {
        const hasTrain = tasks.some((t) => effectiveSlice(t) === "train");
        const hasEval = tasks.some((t) => effectiveSlice(t) === "eval");
        return hasTrain && hasEval;
    });
    // Verifier text is needed once per (train, eval) pair in both directions;
    // read each task's files a single time and reuse the result. Keyed by the
    // task object itself, so no assumption is made about id uniqueness.
    const verifierTextByTask = new Map();
    const verifierTextFor = (task) => {
        if (!verifierTextByTask.has(task)) {
            verifierTextByTask.set(task, readVerifierFiles(task));
        }
        return verifierTextByTask.get(task);
    };
    // Pairs that are intentional train/eval variants of the same task family
    // (e.g. inkwell/add-healthcheck-train vs inkwell/add-healthcheck) share
    // field-access patterns by design, just with different expected values,
    // so they are exempt from the comparison.
    const isVariantPair = (trainId, evalId) => {
        const trainBase = trainId.replace(/-train$/, "");
        return trainBase === evalId || evalId.startsWith(`${trainBase}-`);
    };
    test("at least one stash has both train and eval tasks", () => {
        expect(mixedStashes.length).toBeGreaterThan(0);
    });
    for (const [stashName, tasks] of mixedStashes) {
        const trainTasks = tasks.filter((t) => effectiveSlice(t) === "train");
        const evalTasks = tasks.filter((t) => effectiveSlice(t) === "eval");
        // Train → Eval: train verifier fragments must not appear in eval verifier text.
        for (const trainTask of trainTasks) {
            const trainFragments = crossTaskFragments(trainTask);
            if (trainFragments.length === 0)
                continue;
            for (const evalTask of evalTasks) {
                if (isVariantPair(trainTask.id, evalTask.id))
                    continue;
                const evalVerifierText = verifierTextFor(evalTask);
                test(`stash:${stashName} — train:${trainTask.id} fragments not in eval:${evalTask.id} verifier`, () => {
                    const leaked = trainFragments.filter((frag) => evalVerifierText.includes(frag));
                    expect(leaked, `fragments leaked from train verifier to eval verifier: ${JSON.stringify(leaked)}`).toEqual([]);
                });
            }
        }
        // Eval → Train: eval verifier fragments must not appear in train verifier text.
        for (const evalTask of evalTasks) {
            const evalFragments = crossTaskFragments(evalTask);
            if (evalFragments.length === 0)
                continue;
            for (const trainTask of trainTasks) {
                if (isVariantPair(trainTask.id, evalTask.id))
                    continue;
                const trainVerifierText = verifierTextFor(trainTask);
                test(`stash:${stashName} — eval:${evalTask.id} fragments not in train:${trainTask.id} verifier`, () => {
                    const leaked = evalFragments.filter((frag) => trainVerifierText.includes(frag));
                    expect(leaked, `fragments leaked from eval verifier to train verifier: ${JSON.stringify(leaked)}`).toEqual([]);
                });
            }
        }
    }
});
|
|
195
|
+
describe.skipIf(!FIXTURE_TESTS)("gold-ref leakage check", () => {
|
|
97
196
|
const tasks = listTasks().filter((t) => t.goldRef);
|
|
98
197
|
test("at least one task ships with a gold_ref", () => {
|
|
99
198
|
expect(tasks.length).toBeGreaterThan(0);
|
|
@@ -106,6 +205,10 @@ describe("gold-ref leakage check", () => {
|
|
|
106
205
|
// skipping here previously masked typos and stash-name drift; we now
|
|
107
206
|
// fail loudly so the corpus author is forced to fix the reference.
|
|
108
207
|
if (!goldPath) {
|
|
208
|
+
// Non-skill refs (workflow:, command:, etc.) are not leakage-checked —
|
|
209
|
+
// only skill: refs map to a SKILL.md that could leak answers.
|
|
210
|
+
if (!/^skill:/.test(goldRef))
|
|
211
|
+
return;
|
|
109
212
|
throw new Error(`${task.id}: gold_ref "${goldRef}" against stash "${task.stash}" did not resolve to a SKILL.md under tests/fixtures/stashes/. Fix the gold_ref, fix the stash name, or remove the gold_ref.`);
|
|
110
213
|
}
|
|
111
214
|
const goldContent = fs.readFileSync(goldPath, "utf8");
|
|
@@ -16,11 +16,12 @@ function emptyUtilityReport() {
|
|
|
16
16
|
commit: "deadbee",
|
|
17
17
|
model: "m",
|
|
18
18
|
corpus: { domains: 0, tasks: 0, slice: "all", seedsPerArm: 1 },
|
|
19
|
-
aggregateNoakm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
|
|
20
|
-
aggregateAkm: { passRate: 0, tokensPerPass: 0, wallclockMs: 0 },
|
|
19
|
+
aggregateNoakm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
|
|
20
|
+
aggregateAkm: { passRate: 0, tokensPerPass: 0, tokensPerRun: null, wallclockMs: 0 },
|
|
21
21
|
aggregateDelta: {
|
|
22
22
|
passRate: 0,
|
|
23
23
|
tokensPerPass: 0,
|
|
24
|
+
tokensPerRun: null,
|
|
24
25
|
wallclockMs: 0,
|
|
25
26
|
},
|
|
26
27
|
trajectoryAkm: {
|