akm-cli 0.7.0-rc1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/cli.js +100 -16
- package/dist/src/commands/config-cli.js +42 -0
- package/dist/src/commands/history.js +78 -7
- package/dist/src/commands/registry-search.js +69 -6
- package/dist/src/commands/search.js +30 -3
- package/dist/src/commands/show.js +29 -0
- package/dist/src/commands/source-add.js +5 -1
- package/dist/src/commands/source-manage.js +7 -1
- package/dist/src/core/config.js +28 -0
- package/dist/src/indexer/db-search.js +1 -0
- package/dist/src/indexer/indexer.js +16 -2
- package/dist/src/indexer/matchers.js +1 -1
- package/dist/src/indexer/search-source.js +4 -2
- package/dist/src/integrations/agent/profiles.js +1 -1
- package/dist/src/integrations/agent/spawn.js +67 -16
- package/dist/src/integrations/github.js +9 -3
- package/dist/src/llm/embedders/remote.js +37 -3
- package/dist/src/output/cli-hints.js +15 -2
- package/dist/src/output/renderers.js +3 -1
- package/dist/src/output/shapes.js +8 -1
- package/dist/src/output/text.js +156 -3
- package/dist/src/registry/build-index.js +5 -4
- package/dist/src/registry/providers/static-index.js +3 -1
- package/dist/src/setup/setup.js +9 -0
- package/dist/src/wiki/wiki.js +54 -6
- package/dist/src/workflows/runs.js +37 -3
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
- package/dist/tests/bench/attribution.test.js +24 -23
- package/dist/tests/bench/cleanup.js +31 -0
- package/dist/tests/bench/cli.js +366 -31
- package/dist/tests/bench/cli.test.js +282 -14
- package/dist/tests/bench/corpus.js +3 -0
- package/dist/tests/bench/corpus.test.js +10 -10
- package/dist/tests/bench/doctor.js +525 -0
- package/dist/tests/bench/driver.js +77 -22
- package/dist/tests/bench/driver.test.js +142 -1
- package/dist/tests/bench/environment.js +233 -0
- package/dist/tests/bench/environment.test.js +199 -0
- package/dist/tests/bench/evolve.js +67 -0
- package/dist/tests/bench/evolve.test.js +12 -4
- package/dist/tests/bench/failure-modes.test.js +52 -3
- package/dist/tests/bench/feedback-integrity.test.js +3 -2
- package/dist/tests/bench/leakage.test.js +105 -2
- package/dist/tests/bench/learning-curve.test.js +3 -2
- package/dist/tests/bench/metrics.js +102 -26
- package/dist/tests/bench/metrics.test.js +10 -4
- package/dist/tests/bench/opencode-config.js +194 -0
- package/dist/tests/bench/opencode-config.test.js +370 -0
- package/dist/tests/bench/report.js +73 -9
- package/dist/tests/bench/report.test.js +59 -10
- package/dist/tests/bench/run-config.js +355 -0
- package/dist/tests/bench/run-config.test.js +298 -0
- package/dist/tests/bench/run-curate-test.js +32 -0
- package/dist/tests/bench/run-failing-tasks.js +56 -0
- package/dist/tests/bench/run-full-bench.js +51 -0
- package/dist/tests/bench/run-items36-targeted.js +69 -0
- package/dist/tests/bench/run-nano-quick.js +42 -0
- package/dist/tests/bench/run-waveg-targeted.js +62 -0
- package/dist/tests/bench/runner.js +257 -94
- package/dist/tests/bench/tmp.js +90 -0
- package/dist/tests/bench/trajectory.js +2 -2
- package/dist/tests/bench/verifier.js +6 -1
- package/dist/tests/bench/workflow-spec.js +11 -24
- package/dist/tests/bench/workflow-spec.test.js +1 -1
- package/dist/tests/bench/workflow-trace.js +34 -0
- package/dist/tests/cli-errors.test.js +1 -0
- package/dist/tests/commands/history.test.js +195 -0
- package/dist/tests/config.test.js +25 -0
- package/dist/tests/e2e.test.js +23 -2
- package/dist/tests/fixtures/stashes/load.js +1 -1
- package/dist/tests/fixtures/stashes/load.test.js +11 -2
- package/dist/tests/indexer.test.js +12 -1
- package/dist/tests/output-baseline.test.js +2 -1
- package/dist/tests/output-shapes-unit.test.js +3 -1
- package/dist/tests/registry-build-index.test.js +17 -1
- package/dist/tests/registry-providers/static-index.test.js +34 -0
- package/dist/tests/registry-search.test.js +200 -0
- package/dist/tests/remember-frontmatter.test.js +11 -13
- package/dist/tests/source-qa-fixes.test.js +18 -0
- package/dist/tests/source-registry.test.js +3 -3
- package/dist/tests/source-source.test.js +61 -1
- package/dist/tests/workflow-qa-fixes.test.js +18 -0
- package/package.json +1 -1
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the bench opencode-config module.
|
|
3
|
+
*
|
|
4
|
+
* Covers all cases described in the design spec:
|
|
5
|
+
* - loads canonical fixture without error
|
|
6
|
+
* - rejects literal apiKey (not env-ref)
|
|
7
|
+
* - accepts {env:VAR} apiKey form
|
|
8
|
+
* - rejects sk-XXXX credential heuristic anywhere in tree
|
|
9
|
+
* - rejects top-level plugin / mcp / permission keys
|
|
10
|
+
* - rejects unknown schemaVersion
|
|
11
|
+
* - isUsageError: true when file missing
|
|
12
|
+
* - selectProviderForModel picks correct provider
|
|
13
|
+
* - selectProviderForModel throws on unknown provider prefix
|
|
14
|
+
 *   - materializeOpencodeConfig writes $schema, model, provider and the bench isolation keys (plugin, permission), never mcp, with mode 0o600
|
|
15
|
+
*/
|
|
16
|
+
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
17
|
+
import fs from "node:fs";
|
|
18
|
+
import path from "node:path";
|
|
19
|
+
import { BenchConfigError, loadOpencodeProviders, materializeOpencodeConfig, selectProviderForModel, } from "./opencode-config";
|
|
20
|
+
import { benchMkdtemp } from "./tmp";
|
|
21
|
+
/** Absolute path to the committed fixture. */
|
|
22
|
+
const FIXTURE_PATH = path.resolve(__dirname, "..", "fixtures", "bench", "opencode-providers.json");
|
|
23
|
+
/**
 * Serialise `content` as JSON into `<dir>/<name>` and hand back the full path.
 *
 * @param {string} dir - Directory that receives the file.
 * @param {string} name - File name to create inside `dir`.
 * @param {unknown} content - Value passed through `JSON.stringify`.
 * @returns {string} The joined path of the file that was written.
 */
function writeTmp(dir, name, content) {
    const target = path.join(dir, name);
    const serialized = JSON.stringify(content);
    fs.writeFileSync(target, serialized);
    return target;
}
|
|
29
|
+
describe("loadOpencodeProviders", () => {
    /** Scratch directory shared by every test in this suite. */
    let tmp;
    beforeAll(() => {
        tmp = benchMkdtemp("bench-opencode-config-test-");
    });
    afterAll(() => {
        fs.rmSync(tmp, { recursive: true, force: true });
    });
    /** Run the loader and return whatever it throws (undefined when it returns normally). */
    const catchAnything = (file) => {
        try {
            loadOpencodeProviders(file);
        }
        catch (thrown) {
            return thrown;
        }
        return undefined;
    };
    /** Same as catchAnything, but anything that is not a BenchConfigError counts as "no error". */
    const catchBenchError = (file) => {
        const thrown = catchAnything(file);
        return thrown instanceof BenchConfigError ? thrown : undefined;
    };
    // ── Canonical fixture ─────────────────────────────────────────────────────
    test("loads the canonical committed fixture without error", () => {
        expect(() => loadOpencodeProviders(FIXTURE_PATH)).not.toThrow();
        const result = loadOpencodeProviders(FIXTURE_PATH);
        expect(result.source).toBe(FIXTURE_PATH);
        expect(result.providers).toBeDefined();
        expect(typeof result.providers).toBe("object");
        expect(result.defaultModel).toBe("local/qwen/qwen3.5-9b");
        expect("local" in result.providers).toBe(true);
    });
    // ── File not found ────────────────────────────────────────────────────────
    test("throws BenchConfigError with isUsageError: true when file does not exist", () => {
        const failure = catchAnything(path.join(tmp, "does-not-exist.json"));
        expect(failure).toBeInstanceOf(BenchConfigError);
        expect(failure.code).toBe("BENCH_CONFIG");
        expect(failure.isUsageError).toBe(true);
        expect(failure.message).toContain("not found");
    });
    // ── JSON parse failure ────────────────────────────────────────────────────
    test("throws BenchConfigError with isUsageError: false on malformed JSON", () => {
        const file = path.join(tmp, "bad.json");
        fs.writeFileSync(file, "{ this is not json }");
        const failure = catchAnything(file);
        expect(failure).toBeInstanceOf(BenchConfigError);
        expect(failure.isUsageError).toBe(false);
        expect(failure.message).toContain("JSON parse error");
    });
    // ── schemaVersion ─────────────────────────────────────────────────────────
    test("rejects unknown schemaVersion", () => {
        const file = writeTmp(tmp, "bad-version.json", { schemaVersion: 2, providers: {} });
        expect(() => loadOpencodeProviders(file)).toThrow(BenchConfigError);
        const failure = catchBenchError(file);
        expect(failure?.isUsageError).toBe(false);
        expect(failure?.message).toContain("schemaVersion");
    });
    test("rejects schemaVersion: 0", () => {
        const file = writeTmp(tmp, "version-0.json", { schemaVersion: 0, providers: {} });
        expect(() => loadOpencodeProviders(file)).toThrow(BenchConfigError);
    });
    // ── Forbidden top-level keys ──────────────────────────────────────────────
    test("rejects top-level 'plugin' key", () => {
        const file = writeTmp(tmp, "has-plugin.json", { schemaVersion: 1, providers: {}, plugin: [] });
        const failure = catchBenchError(file);
        expect(failure).toBeDefined();
        expect(failure?.isUsageError).toBe(false);
        expect(failure?.message).toContain("plugin");
    });
    // The remaining forbidden keys only need the "throws BenchConfigError" check,
    // so they are driven from a table: [key, fixture file name, value for the key].
    const otherForbiddenKeys = [
        ["mcp", "has-mcp.json", {}],
        ["permission", "has-permission.json", {}],
        ["disabled_providers", "has-disabled.json", []],
        ["small_model", "has-small-model.json", "anthropic/claude-haiku-4-5"],
        ["snapshot", "has-snapshot.json", true],
    ];
    for (const [key, fileName, value] of otherForbiddenKeys) {
        test(`rejects top-level '${key}' key`, () => {
            const file = writeTmp(tmp, fileName, { schemaVersion: 1, providers: {}, [key]: value });
            expect(() => loadOpencodeProviders(file)).toThrow(BenchConfigError);
        });
    }
    // ── apiKey validation ─────────────────────────────────────────────────────
    test("rejects literal apiKey string (not an env-ref)", () => {
        const providers = { myProvider: { apiKey: "not-an-env-ref" } };
        const file = writeTmp(tmp, "literal-apikey.json", { schemaVersion: 1, providers });
        const failure = catchBenchError(file);
        expect(failure).toBeDefined();
        expect(failure?.isUsageError).toBe(false);
        expect(failure?.message).toContain("apiKey");
        expect(failure?.message).toContain("env-ref");
    });
    test("accepts {env:VAR} form for apiKey", () => {
        const providers = {
            myProvider: {
                npm: "@ai-sdk/openai-compatible",
                apiKey: "{env:MY_API_KEY}",
                options: { baseURL: "http://localhost:1234/v1" },
            },
        };
        const file = writeTmp(tmp, "env-ref-apikey.json", { schemaVersion: 1, providers });
        expect(() => loadOpencodeProviders(file)).not.toThrow();
        const result = loadOpencodeProviders(file);
        expect("myProvider" in result.providers).toBe(true);
    });
    test("accepts {env:UNDERSCORE_KEY_123} env-ref form", () => {
        const providers = { p: { apiKey: "{env:MY_KEY_123}" } };
        const file = writeTmp(tmp, "env-ref-underscore.json", { schemaVersion: 1, providers });
        expect(() => loadOpencodeProviders(file)).not.toThrow();
    });
    test("rejects apiKey starting with lowercase (not a valid env-ref)", () => {
        const providers = { p: { apiKey: "{env:my_lowercase_key}" } };
        const file = writeTmp(tmp, "bad-env-ref.json", { schemaVersion: 1, providers });
        expect(() => loadOpencodeProviders(file)).toThrow(BenchConfigError);
    });
    // ── Credential heuristic ──────────────────────────────────────────────────
    test("rejects sk-XXXX credential anywhere in the providers tree", () => {
        const providers = {
            openai: {
                npm: "@ai-sdk/openai",
                secret: "sk-abcdefghijklmnopqrstuvwxyz0123456789",
            },
        };
        const file = writeTmp(tmp, "has-sk-key.json", { schemaVersion: 1, providers });
        const failure = catchBenchError(file);
        expect(failure).toBeDefined();
        expect(failure?.isUsageError).toBe(false);
        expect(failure?.message).toContain("credential heuristic");
    });
    test("rejects sk-XXXX credential in a nested object", () => {
        const headers = { Authorization: "sk-proj-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" };
        const providers = { p: { options: { headers } } };
        const file = writeTmp(tmp, "nested-sk-key.json", { schemaVersion: 1, providers });
        expect(() => loadOpencodeProviders(file)).toThrow(BenchConfigError);
    });
    // ── Valid minimal file ────────────────────────────────────────────────────
    test("accepts a valid minimal file with no defaultModel", () => {
        const providers = {
            local: {
                npm: "@ai-sdk/openai-compatible",
                options: { baseURL: "http://localhost:1234/v1" },
            },
        };
        const file = writeTmp(tmp, "minimal.json", { schemaVersion: 1, providers });
        const result = loadOpencodeProviders(file);
        expect(result.defaultModel).toBeUndefined();
        expect("local" in result.providers).toBe(true);
    });
});
|
|
269
|
+
describe("selectProviderForModel", () => {
    // Hand-built stand-in for a loader result; the entries are kept in named
    // constants so identity assertions can refer to them directly.
    const donEntry = { npm: "@ai-sdk/openai-compatible", name: "Don LM Studio" };
    const ollamaEntry = { npm: "@ai-sdk/openai-compatible", name: "Ollama" };
    const loaded = {
        source: "/fake/path.json",
        providers: { don: donEntry, ollama: ollamaEntry },
        defaultModel: "don/mlx-community/qwen3.6-35b-a3b",
    };
    /** Return the BenchConfigError thrown for `model`, or undefined if none (or another error type) was thrown. */
    const benchErrorFor = (model) => {
        try {
            selectProviderForModel(loaded, model);
        }
        catch (thrown) {
            if (thrown instanceof BenchConfigError)
                return thrown;
        }
        return undefined;
    };
    test("splits on first slash and returns the correct provider entry", () => {
        const { providerKey, entry } = selectProviderForModel(loaded, "don/mlx-community/qwen3.6-35b-a3b");
        expect(providerKey).toBe("don");
        expect(entry).toBe(donEntry);
    });
    test("handles a model with no slash (entire string is the provider key)", () => {
        const { providerKey, entry } = selectProviderForModel(loaded, "ollama");
        expect(providerKey).toBe("ollama");
        expect(entry).toBe(ollamaEntry);
    });
    test("throws BenchConfigError when provider key is not in loaded.providers", () => {
        const failure = benchErrorFor("unknown/some-model");
        expect(failure).toBeDefined();
        expect(failure?.code).toBe("BENCH_CONFIG");
        expect(failure?.isUsageError).toBe(false);
        expect(failure?.message).toContain("unknown");
        expect(failure?.message).toContain("provider key");
    });
    test("error message lists available provider keys", () => {
        const failure = benchErrorFor("missing/model");
        expect(failure?.message).toContain("don");
        expect(failure?.message).toContain("ollama");
    });
});
|
|
316
|
+
describe("materializeOpencodeConfig", () => {
    /** Scratch directory shared by every test in this suite. */
    let tmp;
    beforeAll(() => {
        tmp = benchMkdtemp("bench-materialize-test-");
    });
    afterAll(() => {
        fs.rmSync(tmp, { recursive: true, force: true });
    });
    /** Create `<tmp>/<name>` (each test uses its own directory) and return its path. */
    const makeConfigDir = (name) => {
        const dir = path.join(tmp, name);
        fs.mkdirSync(dir, { recursive: true });
        return dir;
    };
    /** Parse the opencode.json that materializeOpencodeConfig wrote into `dir`. */
    const readConfig = (dir) => JSON.parse(fs.readFileSync(path.join(dir, "opencode.json"), "utf8"));
    test("writes opencode.json with required bench isolation invariants and provider", () => {
        const configDir = makeConfigDir("run-config");
        const entry = { npm: "@ai-sdk/openai-compatible", name: "Test Provider" };
        materializeOpencodeConfig(configDir, { providerKey: "test", entry }, "test/my-model");
        const outPath = path.join(configDir, "opencode.json");
        expect(fs.existsSync(outPath)).toBe(true);
        const contents = JSON.parse(fs.readFileSync(outPath, "utf8"));
        expect(contents.model).toBe("test/my-model");
        expect(contents.$schema).toBe("https://opencode.ai/config.json");
        // Bench isolation invariants: plugin:[] prevents operator plugin interference;
        // permission block ensures opencode run (non-interactive) allows bash/file tools.
        expect(contents.plugin).toEqual([]);
        expect(contents.permission?.bash).toBe("allow");
        // Provider block is written correctly: exactly one key, carrying the entry verbatim.
        const provider = contents.provider;
        expect(Object.keys(provider)).toEqual(["test"]);
        expect(provider.test).toEqual(entry);
    });
    test("does not write mcp into the config", () => {
        const configDir = makeConfigDir("run-config-2");
        materializeOpencodeConfig(configDir, { providerKey: "p", entry: {} }, "p/model");
        expect(readConfig(configDir).mcp).toBeUndefined();
    });
    // The assertion below inspects POSIX permission bits. Windows does not
    // implement owner/group/other modes, so stat.mode cannot report 0o600 there;
    // skip the check on win32 instead of failing spuriously.
    test.skipIf(process.platform === "win32")("writes the file with mode 0o600 (not world-readable)", () => {
        const configDir = makeConfigDir("run-config-3");
        materializeOpencodeConfig(configDir, { providerKey: "p", entry: {} }, "p/model");
        const stat = fs.statSync(path.join(configDir, "opencode.json"));
        // Mode 0o600 means only the owner can read/write (no group or other bits).
        // Masking with 0o777 keeps the nine permission bits; 0o600 is 0b110000000.
        const mode = stat.mode & 0o777;
        expect(mode).toBe(0o600);
    });
    test("can be called twice (overwrites an existing opencode.json)", () => {
        const configDir = makeConfigDir("run-config-4");
        materializeOpencodeConfig(configDir, { providerKey: "a", entry: { name: "first" } }, "a/m1");
        materializeOpencodeConfig(configDir, { providerKey: "b", entry: { name: "second" } }, "b/m2");
        // Only the provider from the second call may survive.
        const provider = readConfig(configDir).provider;
        expect("b" in provider).toBe(true);
        expect("a" in provider).toBe(false);
    });
});
|
|
@@ -179,6 +179,12 @@ function buildUtilityJson(input) {
|
|
|
179
179
|
if (input.allRuns) {
|
|
180
180
|
envelope.runs = input.allRuns.map(serializeRunForReport);
|
|
181
181
|
}
|
|
182
|
+
// Baseline pass-rate map — additive top-level key. Emitted only when the
|
|
183
|
+
// caller supplied a baseline through `loadBenchRunConfig`; legacy reports
|
|
184
|
+
// stay byte-identical without it.
|
|
185
|
+
if (input.baselineByTaskId) {
|
|
186
|
+
envelope.baseline_by_task_id = { ...input.baselineByTaskId };
|
|
187
|
+
}
|
|
182
188
|
// Per-asset attribution is an additive top-level key (§6.5). Emit it only
|
|
183
189
|
// when the runner populated it so older code paths (e.g. the empty-corpus
|
|
184
190
|
// skeleton) don't gain the key spuriously.
|
|
@@ -229,6 +235,8 @@ function serialiseAkmOverheadPerRun(row) {
|
|
|
229
235
|
search_count: row.searchCount,
|
|
230
236
|
show_count: row.showCount,
|
|
231
237
|
feedback_count: row.feedbackCount,
|
|
238
|
+
positive_feedback_count: row.positiveFeedbackCount,
|
|
239
|
+
negative_feedback_count: row.negativeFeedbackCount,
|
|
232
240
|
total_tool_calls: row.totalToolCalls,
|
|
233
241
|
assets_loaded_count: row.assetsLoadedCount,
|
|
234
242
|
irrelevant_assets_loaded_count: row.irrelevantAssetsLoadedCount,
|
|
@@ -255,6 +263,12 @@ function serialiseAkmOverheadAggregate(agg) {
|
|
|
255
263
|
total_tool_calls: agg.totalToolCalls,
|
|
256
264
|
tool_calls_per_success: agg.toolCallsPerSuccess,
|
|
257
265
|
cost_per_success: agg.costPerSuccess,
|
|
266
|
+
search_engagement_rate: agg.searchEngagementRate,
|
|
267
|
+
show_engagement_rate: agg.showEngagementRate,
|
|
268
|
+
feedback_engagement_rate: agg.feedbackEngagementRate,
|
|
269
|
+
search_to_show_ratio: agg.searchToShowRatio,
|
|
270
|
+
mean_positive_feedback_count: agg.meanPositiveFeedbackCount,
|
|
271
|
+
mean_negative_feedback_count: agg.meanNegativeFeedbackCount,
|
|
258
272
|
};
|
|
259
273
|
}
|
|
260
274
|
/**
|
|
@@ -331,6 +345,7 @@ function serialiseCorpus(c) {
|
|
|
331
345
|
return {
|
|
332
346
|
pass_rate: c.passRate,
|
|
333
347
|
tokens_per_pass: c.tokensPerPass,
|
|
348
|
+
tokens_per_run: c.tokensPerRun,
|
|
334
349
|
wallclock_ms: c.wallclockMs,
|
|
335
350
|
};
|
|
336
351
|
}
|
|
@@ -338,6 +353,7 @@ function serialiseDelta(d) {
|
|
|
338
353
|
return {
|
|
339
354
|
pass_rate: d.passRate,
|
|
340
355
|
tokens_per_pass: d.tokensPerPass,
|
|
356
|
+
tokens_per_run: d.tokensPerRun,
|
|
341
357
|
wallclock_ms: d.wallclockMs,
|
|
342
358
|
};
|
|
343
359
|
}
|
|
@@ -426,6 +442,7 @@ function serialisePerTaskMetrics(m) {
|
|
|
426
442
|
pass_rate: m.passRate,
|
|
427
443
|
pass_at_1: m.passAt1,
|
|
428
444
|
tokens_per_pass: m.tokensPerPass,
|
|
445
|
+
tokens_per_run: m.tokensPerRun,
|
|
429
446
|
wallclock_ms: m.wallclockMs,
|
|
430
447
|
pass_rate_stdev: m.passRateStdev,
|
|
431
448
|
budget_exceeded_count: m.budgetExceededCount,
|
|
@@ -511,23 +528,43 @@ function buildUtilityMarkdown(input) {
|
|
|
511
528
|
lines.push("");
|
|
512
529
|
lines.push(`- correct_asset_loaded: ${formatPercent(input.trajectoryAkm.correctAssetLoaded)}`);
|
|
513
530
|
lines.push(`- feedback_recorded: ${formatPercent(input.trajectoryAkm.feedbackRecorded)}`);
|
|
531
|
+
// Per-run trajectory detail: when allRuns is present emit a compact table
|
|
532
|
+
// so operators can distinguish null (harness error — no events captured)
|
|
533
|
+
// from false (agent ran, behaviour not observed) from true (confirmed).
|
|
534
|
+
// Symbols: "—" = null, "✗" = false, "✓" = true.
|
|
535
|
+
const akmRuns = (input.allRuns ?? []).filter((r) => r.arm === "akm");
|
|
536
|
+
if (akmRuns.length > 0) {
|
|
537
|
+
lines.push("");
|
|
538
|
+
lines.push("| task | seed | correct_asset_loaded | feedback_recorded |");
|
|
539
|
+
lines.push("|------|------|----------------------|-------------------|");
|
|
540
|
+
for (const r of akmRuns) {
|
|
541
|
+
lines.push(`| ${r.taskId} | ${r.seed} | ${formatTrajBool(r.trajectory.correctAssetLoaded)} | ${formatTrajBool(r.trajectory.feedbackRecorded)} |`);
|
|
542
|
+
}
|
|
543
|
+
}
|
|
514
544
|
lines.push("");
|
|
515
545
|
lines.push("## Per-task pass rates");
|
|
516
546
|
lines.push("");
|
|
517
547
|
// #261: synthetic column is rendered only when the synthetic arm ran.
|
|
518
548
|
// The default header/row stays identical to the pre-#261 output.
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
549
|
+
// Baseline column is rendered only when `baselineByTaskId` was supplied
|
|
550
|
+
// by the caller; legacy reports without it produce byte-identical output.
|
|
551
|
+
const includeSynthCol = input.aggregateSynth !== undefined;
|
|
552
|
+
const baselineMap = input.baselineByTaskId;
|
|
553
|
+
const includeBaselineCol = baselineMap !== undefined;
|
|
554
|
+
const baseColHeader = includeBaselineCol ? " baseline | vs base |" : "";
|
|
555
|
+
const baseColSep = includeBaselineCol ? "----------|---------|" : "";
|
|
556
|
+
if (includeSynthCol) {
|
|
557
|
+
lines.push(`| task | noakm | synthetic | akm | delta |${baseColHeader}`);
|
|
558
|
+
lines.push(`|------|-------|-----------|-----|-------|${baseColSep}`);
|
|
522
559
|
}
|
|
523
560
|
else {
|
|
524
|
-
lines.push(
|
|
525
|
-
lines.push(
|
|
561
|
+
lines.push(`| task | noakm | akm | delta |${baseColHeader}`);
|
|
562
|
+
lines.push(`|------|-------|-----|-------|${baseColSep}`);
|
|
526
563
|
}
|
|
527
564
|
// Sort tasks alphabetically for byte-stable markdown output.
|
|
528
565
|
const sorted = [...input.tasks].sort((a, b) => a.id.localeCompare(b.id));
|
|
529
566
|
for (const t of sorted) {
|
|
530
|
-
lines.push(taskRow(t,
|
|
567
|
+
lines.push(taskRow(t, includeSynthCol, baselineMap));
|
|
531
568
|
}
|
|
532
569
|
// Corpus-coverage section (#262). Renders only when at least one task was
|
|
533
570
|
// tagged with a `memory_ability`; without tags the section adds no signal
|
|
@@ -650,15 +687,29 @@ function deltaRow(d) {
|
|
|
650
687
|
const tpp = d.tokensPerPass === null ? "n/a" : signed(d.tokensPerPass.toFixed(0));
|
|
651
688
|
return `| **delta** | ${signed(d.passRate.toFixed(2))} | ${tpp} | ${signed(d.wallclockMs.toFixed(0))} |`;
|
|
652
689
|
}
|
|
653
|
-
function taskRow(t, includeSynthetic = false) {
|
|
690
|
+
/**
 * Render one markdown row of the per-task pass-rate table.
 *
 * @param {object} t - Task entry; reads `t.id`, `t.noakm.passRate`,
 *   `t.akm.passRate`, `t.delta.passRate` and, optionally, `t.synthetic.passRate`.
 * @param {boolean} [includeSynthetic=false] - Emit the synthetic-arm column.
 * @param {object} [baselineByTaskId] - Optional map of task id → baseline pass
 *   rate. When supplied, two trailing cells (baseline, akm − baseline) are added.
 * @returns {string} The row, starting and ending with `|`.
 */
function taskRow(t, includeSynthetic = false, baselineByTaskId) {
    // Baseline cells are appended only when a baseline map is provided. A task
    // with no usable numeric entry gets a pair of "n/a" cells so the columns
    // stay aligned with the header.
    let baselineCells = "";
    if (baselineByTaskId) {
        const base = baselineByTaskId[t.id];
        // Number.isFinite rejects undefined, null, non-numbers and NaN in one
        // check, so a malformed entry (or an inherited member such as
        // "constructor") renders as n/a instead of throwing on `.toFixed`.
        if (!Number.isFinite(base)) {
            baselineCells = " n/a | n/a |";
        }
        else {
            const delta = t.akm.passRate - base;
            baselineCells = ` ${base.toFixed(2)} | ${signed(delta.toFixed(2))} |`;
        }
    }
    if (includeSynthetic) {
        // #261: render the synthetic-arm pass-rate when present; "n/a" when the
        // arm did not run for this task. A missing arm is NOT a zero-pass arm —
        // a 0.00 cell would be misleading because the model never tried.
        const synth = t.synthetic ? t.synthetic.passRate.toFixed(2) : "n/a";
        return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${synth} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |${baselineCells}`;
    }
    return `| ${t.id} | ${t.noakm.passRate.toFixed(2)} | ${t.akm.passRate.toFixed(2)} | ${signed(t.delta.passRate.toFixed(2))} |${baselineCells}`;
}
|
|
663
714
|
function signed(text) {
|
|
664
715
|
if (text.startsWith("-"))
|
|
@@ -672,6 +723,19 @@ function formatPercent(value) {
|
|
|
672
723
|
return "n/a";
|
|
673
724
|
return `${(value * 100).toFixed(1)}%`;
|
|
674
725
|
}
|
|
726
|
+
/**
 * Render a `boolean | null` trajectory field for markdown tables.
 *
 * Three-state semantics:
 *   - `null` / `undefined` → `"—"` — no trajectory data (harness error;
 *     events.jsonl not captured, or the field is absent from the run).
 *   - `false` → `"✗"` — agent ran but the behaviour was not observed.
 *   - `true`  → `"✓"` — behaviour confirmed.
 *
 * @param {boolean | null | undefined} value - Trajectory flag to render.
 * @returns {string} One of `"—"`, `"✗"`, `"✓"`.
 */
export function formatTrajBool(value) {
    // Loose `== null` matches both null and undefined on purpose: a missing
    // field is "no data", and must not be reported as an observed negative.
    if (value == null)
        return "—";
    return value ? "✓" : "✗";
}
|
|
675
739
|
// ── Compare rendering (§8) ─────────────────────────────────────────────────
|
|
676
740
|
/**
|
|
677
741
|
* Render a CompareResult as a deterministic markdown diff.
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*/
|
|
4
4
|
import { describe, expect, test } from "bun:test";
|
|
5
5
|
import fs from "node:fs";
|
|
6
|
-
import { renderJsonReport, renderMarkdownSummary, renderUtilityReport, resolveGitBranch, resolveGitCommit, serializeRunForReport, } from "./report";
|
|
6
|
+
import { formatTrajBool, renderJsonReport, renderMarkdownSummary, renderUtilityReport, resolveGitBranch, resolveGitCommit, serializeRunForReport, } from "./report";
|
|
7
7
|
import { benchMkdtemp } from "./tmp";
|
|
8
8
|
const sample = {
|
|
9
9
|
timestamp: "2026-04-27T12:00:00Z",
|
|
@@ -66,6 +66,7 @@ function pt(passRate, tokens, wall, count = 5) {
|
|
|
66
66
|
passRate,
|
|
67
67
|
passAt1: passes > 0 ? 1 : 0,
|
|
68
68
|
tokensPerPass: tokens,
|
|
69
|
+
tokensPerRun: tokens,
|
|
69
70
|
wallclockMs: wall,
|
|
70
71
|
passRateStdev: 0,
|
|
71
72
|
budgetExceededCount: 0,
|
|
@@ -80,9 +81,9 @@ const utilSample = {
|
|
|
80
81
|
commit: "deadbee",
|
|
81
82
|
model: "anthropic/claude-opus-4-7",
|
|
82
83
|
corpus: { domains: 3, tasks: 2, slice: "all", seedsPerArm: 5 },
|
|
83
|
-
aggregateNoakm: { passRate: 0.4, tokensPerPass: 18000, wallclockMs: 41000 },
|
|
84
|
-
aggregateAkm: { passRate: 0.7, tokensPerPass: 14000, wallclockMs: 36000 },
|
|
85
|
-
aggregateDelta: { passRate: 0.3, tokensPerPass: -4000, wallclockMs: -5000 },
|
|
84
|
+
aggregateNoakm: { passRate: 0.4, tokensPerPass: 18000, tokensPerRun: null, wallclockMs: 41000 },
|
|
85
|
+
aggregateAkm: { passRate: 0.7, tokensPerPass: 14000, tokensPerRun: null, wallclockMs: 36000 },
|
|
86
|
+
aggregateDelta: { passRate: 0.3, tokensPerPass: -4000, tokensPerRun: null, wallclockMs: -5000 },
|
|
86
87
|
trajectoryAkm: { correctAssetLoaded: 0.78, feedbackRecorded: 0.65 },
|
|
87
88
|
failureModes: { byLabel: {}, byTask: {} },
|
|
88
89
|
tasks: [
|
|
@@ -90,13 +91,13 @@ const utilSample = {
|
|
|
90
91
|
id: "domain-a/task-1",
|
|
91
92
|
noakm: pt(0.4, 20000, 40000),
|
|
92
93
|
akm: pt(0.8, 13000, 35000),
|
|
93
|
-
delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
|
|
94
|
+
delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
|
|
94
95
|
},
|
|
95
96
|
{
|
|
96
97
|
id: "domain-b/task-2",
|
|
97
98
|
noakm: pt(0.4, null, 42000),
|
|
98
99
|
akm: pt(0.6, 15000, 37000),
|
|
99
|
-
delta: { passRate: 0.2, tokensPerPass: null, wallclockMs: -5000 },
|
|
100
|
+
delta: { passRate: 0.2, tokensPerPass: null, tokensPerRun: null, wallclockMs: -5000 },
|
|
100
101
|
},
|
|
101
102
|
],
|
|
102
103
|
warnings: [],
|
|
@@ -253,6 +254,54 @@ describe("serializeRunForReport", () => {
|
|
|
253
254
|
expect(row.failure_mode).toBe("wrong_asset");
|
|
254
255
|
});
|
|
255
256
|
});
|
|
257
|
+
// ── formatTrajBool (M3) ───────────────────────────────────────────────────
|
|
258
|
+
describe("formatTrajBool", () => {
|
|
259
|
+
test("null → '—' (harness error, no trajectory data)", () => {
|
|
260
|
+
expect(formatTrajBool(null)).toBe("—");
|
|
261
|
+
});
|
|
262
|
+
test("false → '✗' (agent ran, behaviour not observed)", () => {
|
|
263
|
+
expect(formatTrajBool(false)).toBe("✗");
|
|
264
|
+
});
|
|
265
|
+
test("true → '✓' (behaviour confirmed)", () => {
|
|
266
|
+
expect(formatTrajBool(true)).toBe("✓");
|
|
267
|
+
});
|
|
268
|
+
});
|
|
269
|
+
describe("renderUtilityReport per-run trajectory table (M3)", () => {
|
|
270
|
+
test("markdown includes per-run table when allRuns has akm runs", () => {
|
|
271
|
+
const allRuns = [
|
|
272
|
+
makeRun({
|
|
273
|
+
taskId: "domain-a/task-1",
|
|
274
|
+
arm: "akm",
|
|
275
|
+
seed: 0,
|
|
276
|
+
trajectory: { correctAssetLoaded: true, feedbackRecorded: false },
|
|
277
|
+
}),
|
|
278
|
+
makeRun({
|
|
279
|
+
taskId: "domain-a/task-1",
|
|
280
|
+
arm: "akm",
|
|
281
|
+
seed: 1,
|
|
282
|
+
trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
|
|
283
|
+
}),
|
|
284
|
+
// noakm run should be excluded from the table
|
|
285
|
+
makeRun({
|
|
286
|
+
taskId: "domain-a/task-1",
|
|
287
|
+
arm: "noakm",
|
|
288
|
+
seed: 0,
|
|
289
|
+
trajectory: { correctAssetLoaded: false, feedbackRecorded: false },
|
|
290
|
+
}),
|
|
291
|
+
];
|
|
292
|
+
const report = { ...utilSample, allRuns };
|
|
293
|
+
const { markdown } = renderUtilityReport(report);
|
|
294
|
+
expect(markdown).toContain("| task | seed | correct_asset_loaded | feedback_recorded |");
|
|
295
|
+
expect(markdown).toContain("domain-a/task-1 | 0 | ✓ | ✗");
|
|
296
|
+
expect(markdown).toContain("domain-a/task-1 | 1 | — | —");
|
|
297
|
+
// noakm run must NOT appear in the akm-only trajectory table
|
|
298
|
+
// (the table is gated on arm === "akm")
|
|
299
|
+
});
|
|
300
|
+
test("markdown has no per-run trajectory table when allRuns is absent", () => {
|
|
301
|
+
const { markdown } = renderUtilityReport(utilSample);
|
|
302
|
+
expect(markdown).not.toContain("| task | seed | correct_asset_loaded | feedback_recorded |");
|
|
303
|
+
});
|
|
304
|
+
});
|
|
256
305
|
describe("renderUtilityReport runs[] persistence (#249)", () => {
|
|
257
306
|
test("emits one row per (task, arm, seed) when allRuns is supplied", () => {
|
|
258
307
|
const allRuns = [
|
|
@@ -374,13 +423,13 @@ describe("renderUtilityReport negative-transfer (#260)", () => {
|
|
|
374
423
|
id: "domain-a/task-1",
|
|
375
424
|
noakm: pt(0.4, 20000, 40000),
|
|
376
425
|
akm: pt(0.8, 13000, 35000),
|
|
377
|
-
delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
|
|
426
|
+
delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
|
|
378
427
|
},
|
|
379
428
|
{
|
|
380
429
|
id: "domain-b/task-2",
|
|
381
430
|
noakm: pt(0.6, 20000, 40000),
|
|
382
431
|
akm: pt(0.2, 25000, 38000),
|
|
383
|
-
delta: { passRate: -0.4, tokensPerPass: 5000, wallclockMs: -2000 },
|
|
432
|
+
delta: { passRate: -0.4, tokensPerPass: 5000, tokensPerRun: null, wallclockMs: -2000 },
|
|
384
433
|
},
|
|
385
434
|
],
|
|
386
435
|
};
|
|
@@ -441,13 +490,13 @@ describe("renderUtilityReport negative-transfer (#260)", () => {
|
|
|
441
490
|
id: "domain-a/task-1",
|
|
442
491
|
noakm: pt(0.4, 20000, 40000),
|
|
443
492
|
akm: pt(0.8, 13000, 35000),
|
|
444
|
-
delta: { passRate: 0.4, tokensPerPass: -7000, wallclockMs: -5000 },
|
|
493
|
+
delta: { passRate: 0.4, tokensPerPass: -7000, tokensPerRun: null, wallclockMs: -5000 },
|
|
445
494
|
},
|
|
446
495
|
{
|
|
447
496
|
id: "domain-b/task-2",
|
|
448
497
|
noakm: pt(0.6, 20000, 40000),
|
|
449
498
|
akm: pt(0.2, 25000, 38000),
|
|
450
|
-
delta: { passRate: -0.4, tokensPerPass: 5000, wallclockMs: -2000 },
|
|
499
|
+
delta: { passRate: -0.4, tokensPerPass: 5000, tokensPerRun: null, wallclockMs: -2000 },
|
|
451
500
|
},
|
|
452
501
|
],
|
|
453
502
|
akmRuns,
|