akm-cli 0.7.0-rc1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/cli.js +100 -16
- package/dist/src/commands/config-cli.js +42 -0
- package/dist/src/commands/history.js +78 -7
- package/dist/src/commands/registry-search.js +69 -6
- package/dist/src/commands/search.js +30 -3
- package/dist/src/commands/show.js +29 -0
- package/dist/src/commands/source-add.js +5 -1
- package/dist/src/commands/source-manage.js +7 -1
- package/dist/src/core/config.js +28 -0
- package/dist/src/indexer/db-search.js +1 -0
- package/dist/src/indexer/indexer.js +16 -2
- package/dist/src/indexer/matchers.js +1 -1
- package/dist/src/indexer/search-source.js +4 -2
- package/dist/src/integrations/agent/profiles.js +1 -1
- package/dist/src/integrations/agent/spawn.js +67 -16
- package/dist/src/integrations/github.js +9 -3
- package/dist/src/llm/embedders/remote.js +37 -3
- package/dist/src/output/cli-hints.js +15 -2
- package/dist/src/output/renderers.js +3 -1
- package/dist/src/output/shapes.js +8 -1
- package/dist/src/output/text.js +156 -3
- package/dist/src/registry/build-index.js +5 -4
- package/dist/src/registry/providers/static-index.js +3 -1
- package/dist/src/setup/setup.js +9 -0
- package/dist/src/wiki/wiki.js +54 -6
- package/dist/src/workflows/runs.js +37 -3
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
- package/dist/tests/bench/attribution.test.js +24 -23
- package/dist/tests/bench/cleanup.js +31 -0
- package/dist/tests/bench/cli.js +366 -31
- package/dist/tests/bench/cli.test.js +282 -14
- package/dist/tests/bench/corpus.js +3 -0
- package/dist/tests/bench/corpus.test.js +10 -10
- package/dist/tests/bench/doctor.js +525 -0
- package/dist/tests/bench/driver.js +77 -22
- package/dist/tests/bench/driver.test.js +142 -1
- package/dist/tests/bench/environment.js +233 -0
- package/dist/tests/bench/environment.test.js +199 -0
- package/dist/tests/bench/evolve.js +67 -0
- package/dist/tests/bench/evolve.test.js +12 -4
- package/dist/tests/bench/failure-modes.test.js +52 -3
- package/dist/tests/bench/feedback-integrity.test.js +3 -2
- package/dist/tests/bench/leakage.test.js +105 -2
- package/dist/tests/bench/learning-curve.test.js +3 -2
- package/dist/tests/bench/metrics.js +102 -26
- package/dist/tests/bench/metrics.test.js +10 -4
- package/dist/tests/bench/opencode-config.js +194 -0
- package/dist/tests/bench/opencode-config.test.js +370 -0
- package/dist/tests/bench/report.js +73 -9
- package/dist/tests/bench/report.test.js +59 -10
- package/dist/tests/bench/run-config.js +355 -0
- package/dist/tests/bench/run-config.test.js +298 -0
- package/dist/tests/bench/run-curate-test.js +32 -0
- package/dist/tests/bench/run-failing-tasks.js +56 -0
- package/dist/tests/bench/run-full-bench.js +51 -0
- package/dist/tests/bench/run-items36-targeted.js +69 -0
- package/dist/tests/bench/run-nano-quick.js +42 -0
- package/dist/tests/bench/run-waveg-targeted.js +62 -0
- package/dist/tests/bench/runner.js +257 -94
- package/dist/tests/bench/tmp.js +90 -0
- package/dist/tests/bench/trajectory.js +2 -2
- package/dist/tests/bench/verifier.js +6 -1
- package/dist/tests/bench/workflow-spec.js +11 -24
- package/dist/tests/bench/workflow-spec.test.js +1 -1
- package/dist/tests/bench/workflow-trace.js +34 -0
- package/dist/tests/cli-errors.test.js +1 -0
- package/dist/tests/commands/history.test.js +195 -0
- package/dist/tests/config.test.js +25 -0
- package/dist/tests/e2e.test.js +23 -2
- package/dist/tests/fixtures/stashes/load.js +1 -1
- package/dist/tests/fixtures/stashes/load.test.js +11 -2
- package/dist/tests/indexer.test.js +12 -1
- package/dist/tests/output-baseline.test.js +2 -1
- package/dist/tests/output-shapes-unit.test.js +3 -1
- package/dist/tests/registry-build-index.test.js +17 -1
- package/dist/tests/registry-providers/static-index.test.js +34 -0
- package/dist/tests/registry-search.test.js +200 -0
- package/dist/tests/remember-frontmatter.test.js +11 -13
- package/dist/tests/source-qa-fixes.test.js +18 -0
- package/dist/tests/source-registry.test.js +3 -3
- package/dist/tests/source-source.test.js +61 -1
- package/dist/tests/workflow-qa-fixes.test.js +18 -0
- package/package.json +1 -1
|
@@ -88,6 +88,7 @@ export function aggregatePerTask(results) {
|
|
|
88
88
|
harnessErrorCount: 0,
|
|
89
89
|
count: 0,
|
|
90
90
|
runsWithMeasuredTokens: 0,
|
|
91
|
+
tokensPerRun: null,
|
|
91
92
|
};
|
|
92
93
|
}
|
|
93
94
|
let passes = 0;
|
|
@@ -97,12 +98,16 @@ export function aggregatePerTask(results) {
|
|
|
97
98
|
let budgetExceeded = 0;
|
|
98
99
|
let harnessError = 0;
|
|
99
100
|
let runsWithMeasuredTokens = 0;
|
|
101
|
+
let totalTokensInMeasuredRuns = 0;
|
|
102
|
+
let measuredRuns = 0;
|
|
100
103
|
// For the standard deviation we need a fixed-iteration buffer of pass/fail.
|
|
101
104
|
const passSamples = [];
|
|
102
105
|
for (const r of results) {
|
|
103
106
|
totalWallclock += r.wallclockMs;
|
|
104
107
|
if (isMeasured(r)) {
|
|
105
108
|
runsWithMeasuredTokens += 1;
|
|
109
|
+
measuredRuns += 1;
|
|
110
|
+
totalTokensInMeasuredRuns += r.tokens.input + r.tokens.output;
|
|
106
111
|
}
|
|
107
112
|
const isPass = r.outcome === "pass" ? 1 : 0;
|
|
108
113
|
passSamples.push(isPass);
|
|
@@ -135,6 +140,7 @@ export function aggregatePerTask(results) {
|
|
|
135
140
|
harnessErrorCount: harnessError,
|
|
136
141
|
count: results.length,
|
|
137
142
|
runsWithMeasuredTokens,
|
|
143
|
+
tokensPerRun: measuredRuns === 0 ? null : totalTokensInMeasuredRuns / measuredRuns,
|
|
138
144
|
};
|
|
139
145
|
}
|
|
140
146
|
/** Sample standard deviation. Returns 0 for length ≤ 1 (no spread to measure). */
|
|
@@ -156,13 +162,15 @@ function stdev(values) {
|
|
|
156
162
|
export function aggregateCorpus(perTask) {
|
|
157
163
|
const tasks = Object.values(perTask);
|
|
158
164
|
if (tasks.length === 0) {
|
|
159
|
-
return { passRate: 0, tokensPerPass: null, wallclockMs: 0 };
|
|
165
|
+
return { passRate: 0, tokensPerPass: null, wallclockMs: 0, tokensPerRun: null };
|
|
160
166
|
}
|
|
161
167
|
const passRate = tasks.reduce((a, t) => a + t.passRate, 0) / tasks.length;
|
|
162
168
|
const wallclockMs = tasks.reduce((a, t) => a + t.wallclockMs, 0) / tasks.length;
|
|
163
169
|
const tppValues = tasks.map((t) => t.tokensPerPass).filter((v) => v !== null);
|
|
164
170
|
const tokensPerPass = tppValues.length === 0 ? null : tppValues.reduce((a, b) => a + b, 0) / tppValues.length;
|
|
165
|
-
|
|
171
|
+
const tprValues = tasks.map((t) => t.tokensPerRun).filter((v) => v !== null);
|
|
172
|
+
const tokensPerRun = tprValues.length === 0 ? null : tprValues.reduce((a, b) => a + b, 0) / tprValues.length;
|
|
173
|
+
return { passRate, tokensPerPass, wallclockMs, tokensPerRun };
|
|
166
174
|
}
|
|
167
175
|
/**
|
|
168
176
|
* Compute the akm − noakm delta. Negative `tokensPerPass`/`wallclockMs` mean
|
|
@@ -174,6 +182,7 @@ export function computeCorpusDelta(noakm, akm) {
|
|
|
174
182
|
passRate: akm.passRate - noakm.passRate,
|
|
175
183
|
tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
|
|
176
184
|
wallclockMs: akm.wallclockMs - noakm.wallclockMs,
|
|
185
|
+
tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
|
|
177
186
|
};
|
|
178
187
|
}
|
|
179
188
|
/** Per-task delta with the same null-safety as the corpus delta. */
|
|
@@ -182,6 +191,7 @@ export function computePerTaskDelta(noakm, akm) {
|
|
|
182
191
|
passRate: akm.passRate - noakm.passRate,
|
|
183
192
|
tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
|
|
184
193
|
wallclockMs: akm.wallclockMs - noakm.wallclockMs,
|
|
194
|
+
tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
|
|
185
195
|
};
|
|
186
196
|
}
|
|
187
197
|
/**
|
|
@@ -1126,51 +1136,78 @@ const SEARCH_RANK_CUTOFF = 5;
|
|
|
1126
1136
|
/** Cap on the number of characters of `verifierStdout` we substring-scan. Mirrors trajectory.ts. */
|
|
1127
1137
|
const FAILURE_MODE_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
|
|
1128
1138
|
/**
|
|
1129
|
-
* Classify a single failed run into one of the
|
|
1130
|
-
*
|
|
1131
|
-
*
|
|
1139
|
+
* Classify a single failed run into one of the §6.6 labels. Pure function —
|
|
1140
|
+
* consults `runResult.trajectory.correctAssetLoaded` first (trajectory data
|
|
1141
|
+
* is authoritative when present), then falls back to string-matching
|
|
1142
|
+
* `runResult.events[]` and `runResult.verifierStdout`. Never calls an LLM,
|
|
1143
|
+
* never touches the filesystem.
|
|
1132
1144
|
*
|
|
1133
1145
|
* Decision tree (priority order — first match wins):
|
|
1134
1146
|
* 1. Run not failed (`pass`, `budget_exceeded`, `harness_error`) → `null`.
|
|
1135
|
-
* 2.
|
|
1136
|
-
*
|
|
1137
|
-
*
|
|
1138
|
-
*
|
|
1139
|
-
*
|
|
1140
|
-
*
|
|
1141
|
-
*
|
|
1142
|
-
*
|
|
1143
|
-
*
|
|
1144
|
-
*
|
|
1145
|
-
*
|
|
1146
|
-
*
|
|
1147
|
+
* 2. `trajectory.correctAssetLoaded === true` → the agent loaded the gold
|
|
1148
|
+
* asset but still failed. This is `loaded_ignored` (agent wrote from
|
|
1149
|
+
* memory instead of applying asset content). This short-circuit fixes
|
|
1150
|
+
* the 2026-05-03 baseline bug where 24/25 `search_no_gold` labels were
|
|
1151
|
+
* wrong because the classifier didn't consult trajectory data.
|
|
1152
|
+
* 3. No `akm search` call in the trace:
|
|
1153
|
+
* a. If task has no `goldRef` (so `correctAssetLoaded` is always null)
|
|
1154
|
+
* → `no_events` (trajectory metric undefined; cannot distinguish
|
|
1155
|
+
* "agent ran but events absent" from "agent never ran").
|
|
1156
|
+
* b. Otherwise → `no_search`.
|
|
1157
|
+
* 4. Search ran, no goldRef → `unrelated_bug`.
|
|
1158
|
+
* 5. Search ran; gold ref absent from results → `search_no_gold`.
|
|
1159
|
+
* (Only reachable when `correctAssetLoaded` is false or null, since
|
|
1160
|
+
* true is handled in step 2.)
|
|
1161
|
+
* 6. Gold ref present at rank > 5 → `search_low_rank`.
|
|
1162
|
+
* 7. `akm show` invoked on a non-gold ref AND gold ref never loaded
|
|
1163
|
+
* → `loaded_wrong`.
|
|
1164
|
+
* 8. Gold ref loaded; verifier output suggests the action contradicts the
|
|
1165
|
+
* asset's guidance → `loaded_ignored`.
|
|
1166
|
+
* 9. Gold ref loaded and apparently followed → `followed_wrong`.
|
|
1167
|
+
* 10. Default → `unrelated_bug`.
|
|
1147
1168
|
*/
|
|
1148
1169
|
export function classifyFailureMode(taskMeta, runResult) {
|
|
1149
1170
|
if (runResult.outcome !== "fail")
|
|
1150
1171
|
return null;
|
|
1151
|
-
const trace = collectTrace(runResult);
|
|
1152
1172
|
const goldRef = taskMeta.goldRef;
|
|
1153
|
-
|
|
1173
|
+
const correctAssetLoaded = runResult.trajectory?.correctAssetLoaded;
|
|
1174
|
+
// 1. Trajectory short-circuit: if events data confirms the gold asset was
|
|
1175
|
+
// loaded, the failure must be compliance-related, not discovery-related.
|
|
1176
|
+
// Return `loaded_ignored` immediately without scanning stdout.
|
|
1177
|
+
if (correctAssetLoaded === true) {
|
|
1178
|
+
return "loaded_ignored";
|
|
1179
|
+
}
|
|
1180
|
+
const trace = collectTrace(runResult);
|
|
1181
|
+
// 2. no_search / no_events — no `akm search` invocation anywhere in the trace.
|
|
1154
1182
|
if (!hasAkmSearch(trace, runResult)) {
|
|
1183
|
+
// When there is no goldRef, correctAssetLoaded is always null (the metric
|
|
1184
|
+
// is undefined). We cannot tell whether the agent genuinely didn't search
|
|
1185
|
+
// or whether events data was simply absent. Use `no_events` to surface
|
|
1186
|
+
// this ambiguity rather than conflating it with `no_search`.
|
|
1187
|
+
if (!goldRef) {
|
|
1188
|
+
return "no_events";
|
|
1189
|
+
}
|
|
1155
1190
|
return "no_search";
|
|
1156
1191
|
}
|
|
1157
1192
|
// Without a gold ref the search-based and load-based checks are undefined.
|
|
1158
|
-
// We can only distinguish "no_search" from everything else.
|
|
1193
|
+
// We can only distinguish "no_search" / "no_events" from everything else.
|
|
1159
1194
|
if (!goldRef) {
|
|
1160
1195
|
return "unrelated_bug";
|
|
1161
1196
|
}
|
|
1162
1197
|
const searchRank = findGoldSearchRank(trace, goldRef);
|
|
1163
|
-
//
|
|
1198
|
+
// 3. search_no_gold — search ran (precondition above) but gold ref absent.
|
|
1199
|
+
// Only reachable when correctAssetLoaded is false or null (trajectory
|
|
1200
|
+
// data indicates gold was not loaded), because true is handled above.
|
|
1164
1201
|
if (searchRank === null) {
|
|
1165
1202
|
return "search_no_gold";
|
|
1166
1203
|
}
|
|
1167
|
-
//
|
|
1204
|
+
// 4. search_low_rank — present but below the cutoff.
|
|
1168
1205
|
if (searchRank > SEARCH_RANK_CUTOFF) {
|
|
1169
1206
|
return "search_low_rank";
|
|
1170
1207
|
}
|
|
1171
1208
|
const goldLoaded = hasAkmShow(trace, runResult, goldRef);
|
|
1172
1209
|
const otherRefLoaded = hasAkmShowOtherRef(trace, runResult, goldRef);
|
|
1173
|
-
//
|
|
1210
|
+
// 5. loaded_wrong — agent showed a non-gold ref AND never loaded the gold.
|
|
1174
1211
|
if (otherRefLoaded && !goldLoaded) {
|
|
1175
1212
|
return "loaded_wrong";
|
|
1176
1213
|
}
|
|
@@ -1181,7 +1218,7 @@ export function classifyFailureMode(taskMeta, runResult) {
|
|
|
1181
1218
|
// table has no row for "found but never opened" — treat as unrelated_bug.
|
|
1182
1219
|
return "unrelated_bug";
|
|
1183
1220
|
}
|
|
1184
|
-
//
|
|
1221
|
+
// 6. loaded_ignored — verifier diagnostic indicates the action contradicts
|
|
1185
1222
|
// the loaded asset. Conservative heuristic: look for explicit "ignored"
|
|
1186
1223
|
// or "not applied" markers in the verifier stdout. Without an LLM we
|
|
1187
1224
|
// cannot detect subtler contradictions, so this branch only fires when
|
|
@@ -1189,7 +1226,7 @@ export function classifyFailureMode(taskMeta, runResult) {
|
|
|
1189
1226
|
if (verifierIndicatesIgnored(runResult.verifierStdout)) {
|
|
1190
1227
|
return "loaded_ignored";
|
|
1191
1228
|
}
|
|
1192
|
-
//
|
|
1229
|
+
// 7. followed_wrong — gold loaded, apparently followed, verifier still
|
|
1193
1230
|
// failed. The §6.6 spec maps this to "the asset itself is wrong".
|
|
1194
1231
|
return "followed_wrong";
|
|
1195
1232
|
}
|
|
@@ -1992,6 +2029,8 @@ function perRun(run, taskMetadata) {
|
|
|
1992
2029
|
let searchCount = 0;
|
|
1993
2030
|
let showCount = 0;
|
|
1994
2031
|
let feedbackCount = 0;
|
|
2032
|
+
let positiveFeedbackCount = 0;
|
|
2033
|
+
let negativeFeedbackCount = 0;
|
|
1995
2034
|
const uniqueShowRefs = new Set();
|
|
1996
2035
|
for (const ev of events) {
|
|
1997
2036
|
if (ev.type === "akm_search")
|
|
@@ -2002,8 +2041,17 @@ function perRun(run, taskMetadata) {
|
|
|
2002
2041
|
uniqueShowRefs.add(ev.assetRef);
|
|
2003
2042
|
}
|
|
2004
2043
|
}
|
|
2005
|
-
else if (ev.type === "akm_feedback")
|
|
2044
|
+
else if (ev.type === "akm_feedback") {
|
|
2006
2045
|
feedbackCount += 1;
|
|
2046
|
+
// Polarity is carried in args as "--positive" or "--negative".
|
|
2047
|
+
// Events sourced from events.jsonl also have args populated by
|
|
2048
|
+
// normalizeRunToTrace. Absence of both flags is treated as unknown
|
|
2049
|
+
// (contributes to feedbackCount but not to either polarity counter).
|
|
2050
|
+
if (ev.args?.includes("--positive"))
|
|
2051
|
+
positiveFeedbackCount += 1;
|
|
2052
|
+
else if (ev.args?.includes("--negative"))
|
|
2053
|
+
negativeFeedbackCount += 1;
|
|
2054
|
+
}
|
|
2007
2055
|
}
|
|
2008
2056
|
const totalToolCalls = searchCount + showCount + feedbackCount;
|
|
2009
2057
|
// Run-start anchor: earliest parseable ts in the trace. We use the trace
|
|
@@ -2049,6 +2097,8 @@ function perRun(run, taskMetadata) {
|
|
|
2049
2097
|
searchCount,
|
|
2050
2098
|
showCount,
|
|
2051
2099
|
feedbackCount,
|
|
2100
|
+
positiveFeedbackCount,
|
|
2101
|
+
negativeFeedbackCount,
|
|
2052
2102
|
totalToolCalls,
|
|
2053
2103
|
assetsLoadedCount: uniqueShowRefs.size,
|
|
2054
2104
|
irrelevantAssetsLoadedCount,
|
|
@@ -2087,6 +2137,12 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
|
|
|
2087
2137
|
totalToolCalls: 0,
|
|
2088
2138
|
toolCallsPerSuccess: null,
|
|
2089
2139
|
costPerSuccess: null,
|
|
2140
|
+
searchEngagementRate: 0,
|
|
2141
|
+
showEngagementRate: 0,
|
|
2142
|
+
feedbackEngagementRate: 0,
|
|
2143
|
+
searchToShowRatio: null,
|
|
2144
|
+
meanPositiveFeedbackCount: 0,
|
|
2145
|
+
meanNegativeFeedbackCount: 0,
|
|
2090
2146
|
};
|
|
2091
2147
|
}
|
|
2092
2148
|
let searchSum = 0;
|
|
@@ -2115,12 +2171,25 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
|
|
|
2115
2171
|
let parsedPassTokenSum = 0;
|
|
2116
2172
|
let parsedPassCount = 0;
|
|
2117
2173
|
let anyPassMissingMeasurement = false;
|
|
2174
|
+
let searchEngagedRuns = 0;
|
|
2175
|
+
let showEngagedRuns = 0;
|
|
2176
|
+
let feedbackEngagedRuns = 0;
|
|
2177
|
+
let positiveFeedbackSum = 0;
|
|
2178
|
+
let negativeFeedbackSum = 0;
|
|
2118
2179
|
for (const row of perRun) {
|
|
2119
2180
|
searchSum += row.searchCount;
|
|
2120
2181
|
showSum += row.showCount;
|
|
2121
2182
|
feedbackSum += row.feedbackCount;
|
|
2122
2183
|
toolCallsSum += row.totalToolCalls;
|
|
2123
2184
|
assetsSum += row.assetsLoadedCount;
|
|
2185
|
+
if (row.searchCount > 0)
|
|
2186
|
+
searchEngagedRuns += 1;
|
|
2187
|
+
if (row.showCount > 0)
|
|
2188
|
+
showEngagedRuns += 1;
|
|
2189
|
+
if (row.feedbackCount > 0)
|
|
2190
|
+
feedbackEngagedRuns += 1;
|
|
2191
|
+
positiveFeedbackSum += row.positiveFeedbackCount;
|
|
2192
|
+
negativeFeedbackSum += row.negativeFeedbackCount;
|
|
2124
2193
|
if (row.irrelevantAssetsLoadedCount !== null) {
|
|
2125
2194
|
irrelevantSum += row.irrelevantAssetsLoadedCount;
|
|
2126
2195
|
irrelevantCount += 1;
|
|
@@ -2166,6 +2235,7 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
|
|
|
2166
2235
|
const costPerSuccess = passingRuns === 0 || anyPassMissingMeasurement || parsedPassCount === 0
|
|
2167
2236
|
? null
|
|
2168
2237
|
: parsedPassTokenSum / parsedPassCount;
|
|
2238
|
+
const searchToShowRatio = searchSum === 0 ? null : showSum / searchSum;
|
|
2169
2239
|
return {
|
|
2170
2240
|
totalRuns: n,
|
|
2171
2241
|
passingRuns,
|
|
@@ -2182,6 +2252,12 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
|
|
|
2182
2252
|
totalToolCalls: toolCallsSum,
|
|
2183
2253
|
toolCallsPerSuccess,
|
|
2184
2254
|
costPerSuccess,
|
|
2255
|
+
searchEngagementRate: searchEngagedRuns / n,
|
|
2256
|
+
showEngagementRate: showEngagedRuns / n,
|
|
2257
|
+
feedbackEngagementRate: feedbackEngagedRuns / n,
|
|
2258
|
+
searchToShowRatio,
|
|
2259
|
+
meanPositiveFeedbackCount: positiveFeedbackSum / n,
|
|
2260
|
+
meanNegativeFeedbackCount: negativeFeedbackSum / n,
|
|
2185
2261
|
};
|
|
2186
2262
|
}
|
|
2187
2263
|
/**
|
|
@@ -11,6 +11,7 @@ function ptm(overrides = {}) {
|
|
|
11
11
|
passRate: 0,
|
|
12
12
|
passAt1: 0,
|
|
13
13
|
tokensPerPass: null,
|
|
14
|
+
tokensPerRun: null,
|
|
14
15
|
wallclockMs: 0,
|
|
15
16
|
passRateStdev: 0,
|
|
16
17
|
budgetExceededCount: 0,
|
|
@@ -209,6 +210,7 @@ describe("aggregateCorpus", () => {
|
|
|
209
210
|
passRate: 1,
|
|
210
211
|
passAt1: 1,
|
|
211
212
|
tokensPerPass: 1000,
|
|
213
|
+
tokensPerRun: 1000,
|
|
212
214
|
wallclockMs: 1000,
|
|
213
215
|
passRateStdev: 0,
|
|
214
216
|
budgetExceededCount: 0,
|
|
@@ -220,6 +222,7 @@ describe("aggregateCorpus", () => {
|
|
|
220
222
|
passRate: 0,
|
|
221
223
|
passAt1: 0,
|
|
222
224
|
tokensPerPass: null,
|
|
225
|
+
tokensPerRun: null,
|
|
223
226
|
wallclockMs: 2000,
|
|
224
227
|
passRateStdev: 0,
|
|
225
228
|
budgetExceededCount: 0,
|
|
@@ -239,6 +242,7 @@ describe("aggregateCorpus", () => {
|
|
|
239
242
|
passRate: 0,
|
|
240
243
|
passAt1: 0,
|
|
241
244
|
tokensPerPass: null,
|
|
245
|
+
tokensPerRun: null,
|
|
242
246
|
wallclockMs: 1000,
|
|
243
247
|
passRateStdev: 0,
|
|
244
248
|
budgetExceededCount: 0,
|
|
@@ -258,16 +262,16 @@ describe("aggregateCorpus", () => {
|
|
|
258
262
|
});
|
|
259
263
|
describe("delta helpers", () => {
|
|
260
264
|
test("computeCorpusDelta — akm − noakm", () => {
|
|
261
|
-
const noakm = { passRate: 0.3, tokensPerPass: 18000, wallclockMs: 4000 };
|
|
262
|
-
const akm = { passRate: 0.7, tokensPerPass: 14000, wallclockMs: 3000 };
|
|
265
|
+
const noakm = { passRate: 0.3, tokensPerPass: 18000, tokensPerRun: null, wallclockMs: 4000 };
|
|
266
|
+
const akm = { passRate: 0.7, tokensPerPass: 14000, tokensPerRun: null, wallclockMs: 3000 };
|
|
263
267
|
const d = computeCorpusDelta(noakm, akm);
|
|
264
268
|
expect(d.passRate).toBeCloseTo(0.4);
|
|
265
269
|
expect(d.tokensPerPass).toBeCloseTo(-4000);
|
|
266
270
|
expect(d.wallclockMs).toBeCloseTo(-1000);
|
|
267
271
|
});
|
|
268
272
|
test("computeCorpusDelta — null tokensPerPass propagates", () => {
|
|
269
|
-
const noakm = { passRate: 0, tokensPerPass: null, wallclockMs: 1 };
|
|
270
|
-
const akm = { passRate: 1, tokensPerPass: 5, wallclockMs: 2 };
|
|
273
|
+
const noakm = { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 1 };
|
|
274
|
+
const akm = { passRate: 1, tokensPerPass: 5, tokensPerRun: null, wallclockMs: 2 };
|
|
271
275
|
expect(computeCorpusDelta(noakm, akm).tokensPerPass).toBeNull();
|
|
272
276
|
});
|
|
273
277
|
test("computePerTaskDelta — same null-safety rule", () => {
|
|
@@ -275,6 +279,7 @@ describe("delta helpers", () => {
|
|
|
275
279
|
passRate: 0,
|
|
276
280
|
passAt1: 0,
|
|
277
281
|
tokensPerPass: null,
|
|
282
|
+
tokensPerRun: null,
|
|
278
283
|
wallclockMs: 0,
|
|
279
284
|
passRateStdev: 0,
|
|
280
285
|
budgetExceededCount: 0,
|
|
@@ -286,6 +291,7 @@ describe("delta helpers", () => {
|
|
|
286
291
|
passRate: 1,
|
|
287
292
|
passAt1: 1,
|
|
288
293
|
tokensPerPass: 1000,
|
|
294
|
+
tokensPerRun: null,
|
|
289
295
|
wallclockMs: 100,
|
|
290
296
|
passRateStdev: 0,
|
|
291
297
|
budgetExceededCount: 0,
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* opencode-config.ts — config-driven opencode provider materialisation.
|
|
3
|
+
*
|
|
4
|
+
* Loads the operator's bench provider file (committed fixture or
|
|
5
|
+
* gitignored `.local.json` overlay), validates it for safety (no hard-coded
|
|
6
|
+
* credentials, no extra top-level keys), and writes a minimal
|
|
7
|
+
* `opencode.json` into the per-run isolated `OPENCODE_CONFIG` directory.
|
|
8
|
+
*
|
|
9
|
+
* Design: `tests/bench/BENCH.md` §"Config-driven opencode provider".
|
|
10
|
+
*/
|
|
11
|
+
import fs from "node:fs";
|
|
12
|
+
import path from "node:path";
|
|
13
|
+
/**
|
|
14
|
+
* Error class for bench provider-config problems.
|
|
15
|
+
*
|
|
16
|
+
* `isUsageError: true` → the caller should exit 2 (USAGE).
|
|
17
|
+
* `isUsageError: false` → the caller should exit 78 (CONFIG).
|
|
18
|
+
*/
|
|
19
|
+
export class BenchConfigError extends Error {
|
|
20
|
+
code = "BENCH_CONFIG";
|
|
21
|
+
isUsageError;
|
|
22
|
+
constructor(message, isUsageError) {
|
|
23
|
+
super(message);
|
|
24
|
+
this.name = "BenchConfigError";
|
|
25
|
+
this.isUsageError = isUsageError;
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Top-level keys that belong in a full opencode user-config but are FORBIDDEN
|
|
30
|
+
* in the bench provider file. The bench file is intentionally minimal — it
|
|
31
|
+
* only specifies provider entries. Any of these keys in the file means the
|
|
32
|
+
* operator has pasted a full opencode config into the bench slot, which could
|
|
33
|
+
* contain credentials, plugins, or permission overrides that the bench MUST
|
|
34
|
+
* NOT inherit.
|
|
35
|
+
*/
|
|
36
|
+
const FORBIDDEN_TOPLEVEL_KEYS = new Set([
|
|
37
|
+
"plugin",
|
|
38
|
+
"mcp",
|
|
39
|
+
"permission",
|
|
40
|
+
"disabled_providers",
|
|
41
|
+
"small_model",
|
|
42
|
+
"snapshot",
|
|
43
|
+
]);
|
|
44
|
+
/**
|
|
45
|
+
* Regex that an `apiKey` string value MUST match when present. The only
|
|
46
|
+
* allowed form is an env-ref placeholder: `{env:VAR_NAME}`.
|
|
47
|
+
*/
|
|
48
|
+
const ENV_REF_RE = /^\{env:[A-Z_][A-Z0-9_]*\}$/;
|
|
49
|
+
/** Heuristic to detect literal API credentials accidentally pasted into the file. */
|
|
50
|
+
const CREDENTIAL_RE = /^sk-[A-Za-z0-9_-]{20,}$/;
|
|
51
|
+
/**
|
|
52
|
+
* Recursively scan `node` for credential heuristic violations and literal
|
|
53
|
+
* `apiKey` values that are not env-refs. Throws `BenchConfigError` on the
|
|
54
|
+
* first violation found.
|
|
55
|
+
*
|
|
56
|
+
* @param node The value to scan (any JSON value).
|
|
57
|
+
* @param jspath JSON-path-like string for error messages, e.g. `providers.myProvider.apiKey`.
|
|
58
|
+
*/
|
|
59
|
+
function scanForCredentials(node, jspath) {
|
|
60
|
+
if (typeof node === "string") {
|
|
61
|
+
// Heuristic: reject anything that looks like an OpenAI/Anthropic-style key.
|
|
62
|
+
if (CREDENTIAL_RE.test(node)) {
|
|
63
|
+
throw new BenchConfigError(`bench provider file: credential heuristic triggered at "${jspath}" — literal API key detected; use {env:VAR_NAME} instead`, false);
|
|
64
|
+
}
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
if (Array.isArray(node)) {
|
|
68
|
+
for (let i = 0; i < node.length; i++) {
|
|
69
|
+
scanForCredentials(node[i], `${jspath}[${i}]`);
|
|
70
|
+
}
|
|
71
|
+
return;
|
|
72
|
+
}
|
|
73
|
+
if (node !== null && typeof node === "object") {
|
|
74
|
+
for (const [key, value] of Object.entries(node)) {
|
|
75
|
+
const childPath = `${jspath}.${key}`;
|
|
76
|
+
// apiKey must be an env-ref if present as a string.
|
|
77
|
+
if (key === "apiKey" && typeof value === "string") {
|
|
78
|
+
if (!ENV_REF_RE.test(value)) {
|
|
79
|
+
throw new BenchConfigError(`bench provider file: "${childPath}" must be an env-ref (e.g. {env:MY_API_KEY}), not a literal value`, false);
|
|
80
|
+
}
|
|
81
|
+
// An env-ref is fine — don't recurse further into it.
|
|
82
|
+
continue;
|
|
83
|
+
}
|
|
84
|
+
scanForCredentials(value, childPath);
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Load and validate a bench opencode providers JSON file.
|
|
90
|
+
*
|
|
91
|
+
* Throws:
|
|
92
|
+
* - `BenchConfigError(isUsageError: true)` if the file does not exist.
|
|
93
|
+
* - `BenchConfigError(isUsageError: false)` if JSON parse fails or the file
|
|
94
|
+
* fails validation (bad schema version, forbidden top-level keys, detected
|
|
95
|
+
* credentials).
|
|
96
|
+
*/
|
|
97
|
+
export function loadOpencodeProviders(absPath) {
|
|
98
|
+
// ── File existence ────────────────────────────────────────────────────────
|
|
99
|
+
let raw;
|
|
100
|
+
try {
|
|
101
|
+
raw = fs.readFileSync(absPath, "utf8");
|
|
102
|
+
}
|
|
103
|
+
catch (err) {
|
|
104
|
+
const isEnoent = err.code === "ENOENT";
|
|
105
|
+
if (isEnoent) {
|
|
106
|
+
throw new BenchConfigError(`bench provider file not found: ${absPath}`, true);
|
|
107
|
+
}
|
|
108
|
+
throw new BenchConfigError(`bench provider file: could not read "${absPath}": ${err instanceof Error ? err.message : String(err)}`, false);
|
|
109
|
+
}
|
|
110
|
+
// ── JSON parse ────────────────────────────────────────────────────────────
|
|
111
|
+
let parsed;
|
|
112
|
+
try {
|
|
113
|
+
parsed = JSON.parse(raw);
|
|
114
|
+
}
|
|
115
|
+
catch (err) {
|
|
116
|
+
throw new BenchConfigError(`bench provider file: JSON parse error in "${absPath}": ${err instanceof Error ? err.message : String(err)}`, false);
|
|
117
|
+
}
|
|
118
|
+
if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
119
|
+
throw new BenchConfigError(`bench provider file: root must be a JSON object (got ${Array.isArray(parsed) ? "array" : typeof parsed})`, false);
|
|
120
|
+
}
|
|
121
|
+
const obj = parsed;
|
|
122
|
+
// ── Forbidden top-level keys ──────────────────────────────────────────────
|
|
123
|
+
for (const key of Object.keys(obj)) {
|
|
124
|
+
if (FORBIDDEN_TOPLEVEL_KEYS.has(key)) {
|
|
125
|
+
throw new BenchConfigError(`bench provider file: forbidden top-level key "${key}" — the bench provider file must contain only "schemaVersion", "defaultModel", and "providers"`, false);
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
// ── schemaVersion ─────────────────────────────────────────────────────────
|
|
129
|
+
if (obj.schemaVersion !== 1) {
|
|
130
|
+
throw new BenchConfigError(`bench provider file: unsupported schemaVersion ${JSON.stringify(obj.schemaVersion)}; expected 1`, false);
|
|
131
|
+
}
|
|
132
|
+
// ── providers ─────────────────────────────────────────────────────────────
|
|
133
|
+
if (obj.providers === null || typeof obj.providers !== "object" || Array.isArray(obj.providers)) {
|
|
134
|
+
throw new BenchConfigError(`bench provider file: "providers" must be an object`, false);
|
|
135
|
+
}
|
|
136
|
+
const providers = obj.providers;
|
|
137
|
+
// ── Credential scan ───────────────────────────────────────────────────────
|
|
138
|
+
scanForCredentials(providers, "providers");
|
|
139
|
+
return {
|
|
140
|
+
source: absPath,
|
|
141
|
+
providers,
|
|
142
|
+
...(typeof obj.defaultModel === "string" ? { defaultModel: obj.defaultModel } : {}),
|
|
143
|
+
};
|
|
144
|
+
}
|
|
145
|
+
/**
|
|
146
|
+
* Given a model ID (e.g. `"don/mlx-community/qwen3.6-35b-a3b"`), split on
|
|
147
|
+
* the first `/` to get the provider key and look it up in `loaded.providers`.
|
|
148
|
+
*
|
|
149
|
+
* Throws `BenchConfigError` if the provider key is not found.
|
|
150
|
+
*/
|
|
151
|
+
export function selectProviderForModel(loaded, modelId) {
|
|
152
|
+
const slashIdx = modelId.indexOf("/");
|
|
153
|
+
const providerKey = slashIdx === -1 ? modelId : modelId.slice(0, slashIdx);
|
|
154
|
+
if (!(providerKey in loaded.providers)) {
|
|
155
|
+
throw new BenchConfigError(`bench provider file: model ID "${modelId}" maps to provider key "${providerKey}", which is not present in ${loaded.source}; available: ${Object.keys(loaded.providers).join(", ") || "(none)"}`, false);
|
|
156
|
+
}
|
|
157
|
+
return { providerKey, entry: loaded.providers[providerKey] };
|
|
158
|
+
}
|
|
159
|
+
/**
|
|
160
|
+
* Write a minimal `opencode.json` into `opencodeConfigDir` for the given
|
|
161
|
+
* provider selection. The file contains exactly two top-level keys:
|
|
162
|
+
* `$schema` and `provider`.
|
|
163
|
+
*
|
|
164
|
+
* Written with mode `0o600` so the file is not world-readable (it may
|
|
165
|
+
* contain env-ref placeholders that hint at secret variable names).
|
|
166
|
+
*/
|
|
167
|
+
export function materializeOpencodeConfig(opencodeConfigDir, selected,
|
|
168
|
+
/** Full model id (e.g. "don/mlx-community/qwen3.6-35b-a3b") written as the
|
|
169
|
+
* top-level `model` key so opencode uses it without a --model flag. */
|
|
170
|
+
modelId) {
|
|
171
|
+
const config = {
|
|
172
|
+
$schema: "https://opencode.ai/config.json",
|
|
173
|
+
model: modelId,
|
|
174
|
+
provider: {
|
|
175
|
+
[selected.providerKey]: selected.entry,
|
|
176
|
+
},
|
|
177
|
+
// Explicitly allow all tools so opencode run (non-interactive) doesn't
|
|
178
|
+
// silently skip bash/file operations due to missing permission config.
|
|
179
|
+
permission: {
|
|
180
|
+
bash: "allow",
|
|
181
|
+
edit: "allow",
|
|
182
|
+
write: "allow",
|
|
183
|
+
read: "allow",
|
|
184
|
+
webfetch: "allow",
|
|
185
|
+
},
|
|
186
|
+
// Disable operator plugins during bench runs. Plugins like akm-opencode
|
|
187
|
+
// run their own session lifecycle hooks (warmIndexInBackground, akm setup
|
|
188
|
+
// prompts, AKM_STASH_DIR overrides in shell.env) that interfere with the
|
|
189
|
+
// bench's isolated fixture stash and cause stash mismatch failures.
|
|
190
|
+
plugin: [],
|
|
191
|
+
};
|
|
192
|
+
const outPath = path.join(opencodeConfigDir, "opencode.json");
|
|
193
|
+
fs.writeFileSync(outPath, JSON.stringify(config, null, 2), { mode: 0o600 });
|
|
194
|
+
}
|