akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/dist/src/cli.js +100 -16
  2. package/dist/src/commands/config-cli.js +42 -0
  3. package/dist/src/commands/history.js +78 -7
  4. package/dist/src/commands/registry-search.js +69 -6
  5. package/dist/src/commands/search.js +30 -3
  6. package/dist/src/commands/show.js +29 -0
  7. package/dist/src/commands/source-add.js +5 -1
  8. package/dist/src/commands/source-manage.js +7 -1
  9. package/dist/src/core/config.js +28 -0
  10. package/dist/src/indexer/db-search.js +1 -0
  11. package/dist/src/indexer/indexer.js +16 -2
  12. package/dist/src/indexer/matchers.js +1 -1
  13. package/dist/src/indexer/search-source.js +4 -2
  14. package/dist/src/integrations/agent/profiles.js +1 -1
  15. package/dist/src/integrations/agent/spawn.js +67 -16
  16. package/dist/src/integrations/github.js +9 -3
  17. package/dist/src/llm/embedders/remote.js +37 -3
  18. package/dist/src/output/cli-hints.js +15 -2
  19. package/dist/src/output/renderers.js +3 -1
  20. package/dist/src/output/shapes.js +8 -1
  21. package/dist/src/output/text.js +156 -3
  22. package/dist/src/registry/build-index.js +5 -4
  23. package/dist/src/registry/providers/static-index.js +3 -1
  24. package/dist/src/setup/setup.js +9 -0
  25. package/dist/src/wiki/wiki.js +54 -6
  26. package/dist/src/workflows/runs.js +37 -3
  27. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
  28. package/dist/tests/bench/attribution.test.js +24 -23
  29. package/dist/tests/bench/cleanup.js +31 -0
  30. package/dist/tests/bench/cli.js +366 -31
  31. package/dist/tests/bench/cli.test.js +282 -14
  32. package/dist/tests/bench/corpus.js +3 -0
  33. package/dist/tests/bench/corpus.test.js +10 -10
  34. package/dist/tests/bench/doctor.js +525 -0
  35. package/dist/tests/bench/driver.js +77 -22
  36. package/dist/tests/bench/driver.test.js +142 -1
  37. package/dist/tests/bench/environment.js +233 -0
  38. package/dist/tests/bench/environment.test.js +199 -0
  39. package/dist/tests/bench/evolve.js +67 -0
  40. package/dist/tests/bench/evolve.test.js +12 -4
  41. package/dist/tests/bench/failure-modes.test.js +52 -3
  42. package/dist/tests/bench/feedback-integrity.test.js +3 -2
  43. package/dist/tests/bench/leakage.test.js +105 -2
  44. package/dist/tests/bench/learning-curve.test.js +3 -2
  45. package/dist/tests/bench/metrics.js +102 -26
  46. package/dist/tests/bench/metrics.test.js +10 -4
  47. package/dist/tests/bench/opencode-config.js +194 -0
  48. package/dist/tests/bench/opencode-config.test.js +370 -0
  49. package/dist/tests/bench/report.js +73 -9
  50. package/dist/tests/bench/report.test.js +59 -10
  51. package/dist/tests/bench/run-config.js +355 -0
  52. package/dist/tests/bench/run-config.test.js +298 -0
  53. package/dist/tests/bench/run-curate-test.js +32 -0
  54. package/dist/tests/bench/run-failing-tasks.js +56 -0
  55. package/dist/tests/bench/run-full-bench.js +51 -0
  56. package/dist/tests/bench/run-items36-targeted.js +69 -0
  57. package/dist/tests/bench/run-nano-quick.js +42 -0
  58. package/dist/tests/bench/run-waveg-targeted.js +62 -0
  59. package/dist/tests/bench/runner.js +257 -94
  60. package/dist/tests/bench/tmp.js +90 -0
  61. package/dist/tests/bench/trajectory.js +2 -2
  62. package/dist/tests/bench/verifier.js +6 -1
  63. package/dist/tests/bench/workflow-spec.js +11 -24
  64. package/dist/tests/bench/workflow-spec.test.js +1 -1
  65. package/dist/tests/bench/workflow-trace.js +34 -0
  66. package/dist/tests/cli-errors.test.js +1 -0
  67. package/dist/tests/commands/history.test.js +195 -0
  68. package/dist/tests/config.test.js +25 -0
  69. package/dist/tests/e2e.test.js +23 -2
  70. package/dist/tests/fixtures/stashes/load.js +1 -1
  71. package/dist/tests/fixtures/stashes/load.test.js +11 -2
  72. package/dist/tests/indexer.test.js +12 -1
  73. package/dist/tests/output-baseline.test.js +2 -1
  74. package/dist/tests/output-shapes-unit.test.js +3 -1
  75. package/dist/tests/registry-build-index.test.js +17 -1
  76. package/dist/tests/registry-providers/static-index.test.js +34 -0
  77. package/dist/tests/registry-search.test.js +200 -0
  78. package/dist/tests/remember-frontmatter.test.js +11 -13
  79. package/dist/tests/source-qa-fixes.test.js +18 -0
  80. package/dist/tests/source-registry.test.js +3 -3
  81. package/dist/tests/source-source.test.js +61 -1
  82. package/dist/tests/workflow-qa-fixes.test.js +18 -0
  83. package/package.json +1 -1
@@ -88,6 +88,7 @@ export function aggregatePerTask(results) {
88
88
  harnessErrorCount: 0,
89
89
  count: 0,
90
90
  runsWithMeasuredTokens: 0,
91
+ tokensPerRun: null,
91
92
  };
92
93
  }
93
94
  let passes = 0;
@@ -97,12 +98,16 @@ export function aggregatePerTask(results) {
97
98
  let budgetExceeded = 0;
98
99
  let harnessError = 0;
99
100
  let runsWithMeasuredTokens = 0;
101
+ let totalTokensInMeasuredRuns = 0;
102
+ let measuredRuns = 0;
100
103
  // For the standard deviation we need a fixed-iteration buffer of pass/fail.
101
104
  const passSamples = [];
102
105
  for (const r of results) {
103
106
  totalWallclock += r.wallclockMs;
104
107
  if (isMeasured(r)) {
105
108
  runsWithMeasuredTokens += 1;
109
+ measuredRuns += 1;
110
+ totalTokensInMeasuredRuns += r.tokens.input + r.tokens.output;
106
111
  }
107
112
  const isPass = r.outcome === "pass" ? 1 : 0;
108
113
  passSamples.push(isPass);
@@ -135,6 +140,7 @@ export function aggregatePerTask(results) {
135
140
  harnessErrorCount: harnessError,
136
141
  count: results.length,
137
142
  runsWithMeasuredTokens,
143
+ tokensPerRun: measuredRuns === 0 ? null : totalTokensInMeasuredRuns / measuredRuns,
138
144
  };
139
145
  }
140
146
  /** Sample standard deviation. Returns 0 for length ≤ 1 (no spread to measure). */
@@ -156,13 +162,15 @@ function stdev(values) {
156
162
  export function aggregateCorpus(perTask) {
157
163
  const tasks = Object.values(perTask);
158
164
  if (tasks.length === 0) {
159
- return { passRate: 0, tokensPerPass: null, wallclockMs: 0 };
165
+ return { passRate: 0, tokensPerPass: null, wallclockMs: 0, tokensPerRun: null };
160
166
  }
161
167
  const passRate = tasks.reduce((a, t) => a + t.passRate, 0) / tasks.length;
162
168
  const wallclockMs = tasks.reduce((a, t) => a + t.wallclockMs, 0) / tasks.length;
163
169
  const tppValues = tasks.map((t) => t.tokensPerPass).filter((v) => v !== null);
164
170
  const tokensPerPass = tppValues.length === 0 ? null : tppValues.reduce((a, b) => a + b, 0) / tppValues.length;
165
- return { passRate, tokensPerPass, wallclockMs };
171
+ const tprValues = tasks.map((t) => t.tokensPerRun).filter((v) => v !== null);
172
+ const tokensPerRun = tprValues.length === 0 ? null : tprValues.reduce((a, b) => a + b, 0) / tprValues.length;
173
+ return { passRate, tokensPerPass, wallclockMs, tokensPerRun };
166
174
  }
167
175
  /**
168
176
  * Compute the akm − noakm delta. Negative `tokensPerPass`/`wallclockMs` mean
@@ -174,6 +182,7 @@ export function computeCorpusDelta(noakm, akm) {
174
182
  passRate: akm.passRate - noakm.passRate,
175
183
  tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
176
184
  wallclockMs: akm.wallclockMs - noakm.wallclockMs,
185
+ tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
177
186
  };
178
187
  }
179
188
  /** Per-task delta with the same null-safety as the corpus delta. */
@@ -182,6 +191,7 @@ export function computePerTaskDelta(noakm, akm) {
182
191
  passRate: akm.passRate - noakm.passRate,
183
192
  tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
184
193
  wallclockMs: akm.wallclockMs - noakm.wallclockMs,
194
+ tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
185
195
  };
186
196
  }
187
197
  /**
@@ -1126,51 +1136,78 @@ const SEARCH_RANK_CUTOFF = 5;
1126
1136
  /** Cap on the number of characters of `verifierStdout` we substring-scan. Mirrors trajectory.ts. */
1127
1137
  const FAILURE_MODE_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
1128
1138
  /**
1129
- * Classify a single failed run into one of the seven §6.6 labels. Pure
1130
- * function — string-matches `runResult.events[]` and `runResult.verifierStdout`,
1131
- * never calls an LLM, never touches the filesystem.
1139
+ * Classify a single failed run into one of the §6.6 labels. Pure function —
1140
+ * consults `runResult.trajectory.correctAssetLoaded` first (trajectory data
1141
+ * is authoritative when present), then falls back to string-matching
1142
+ * `runResult.events[]` and `runResult.verifierStdout`. Never calls an LLM,
1143
+ * never touches the filesystem.
1132
1144
  *
1133
1145
  * Decision tree (priority order — first match wins):
1134
1146
  * 1. Run not failed (`pass`, `budget_exceeded`, `harness_error`) → `null`.
1135
- * 2. No `akm search` call in the trace → `no_search`.
1136
- * 3. Search ran; gold ref absent from search results → `search_no_gold`.
1137
- * 4. Gold ref present in search results at rank > 5 → `search_low_rank`.
1138
- * 5. `akm show` invoked on a non-gold ref AND gold ref never loaded → `loaded_wrong`.
1139
- * 6. Gold ref loaded; verifier output suggests the action contradicts the
1140
- * asset's guidance (heuristic: verifier mentions the gold pattern was
1141
- * explicitly NOT followed) → `loaded_ignored`.
1142
- * 7. Gold ref loaded and apparently followed → `followed_wrong`.
1143
- * 8. Default → `unrelated_bug`.
1144
- *
1145
- * Tasks without `goldRef`: rules that depend on the gold ref (3-7) are
1146
- * skipped; only `no_search` and `unrelated_bug` are reachable.
1147
+ * 2. `trajectory.correctAssetLoaded === true` → the agent loaded the gold
1148
+ * asset but still failed. This is `loaded_ignored` (agent wrote from
1149
+ * memory instead of applying asset content). This short-circuit fixes
1150
+ * the 2026-05-03 baseline bug where 24/25 `search_no_gold` labels were
1151
+ * wrong because the classifier didn't consult trajectory data.
1152
+ * 3. No `akm search` call in the trace:
1153
+ * a. If task has no `goldRef` (so `correctAssetLoaded` is always null)
1154
+ * → `no_events` (trajectory metric undefined; cannot distinguish
1155
+ * "agent ran but events absent" from "agent never ran").
1156
+ * b. Otherwise → `no_search`.
1157
+ * 4. Search ran, no goldRef → `unrelated_bug`.
1158
+ * 5. Search ran; gold ref absent from results → `search_no_gold`.
1159
+ * (Only reachable when `correctAssetLoaded` is false or null, since
1160
+ * true is handled in step 2.)
1161
+ * 6. Gold ref present at rank > 5 → `search_low_rank`.
1162
+ * 7. `akm show` invoked on a non-gold ref AND gold ref never loaded
1163
+ * → `loaded_wrong`.
1164
+ * 8. Gold ref loaded; verifier output suggests the action contradicts the
1165
+ * asset's guidance → `loaded_ignored`.
1166
+ * 9. Gold ref loaded and apparently followed → `followed_wrong`.
1167
+ * 10. Default → `unrelated_bug`.
1147
1168
  */
1148
1169
  export function classifyFailureMode(taskMeta, runResult) {
1149
1170
  if (runResult.outcome !== "fail")
1150
1171
  return null;
1151
- const trace = collectTrace(runResult);
1152
1172
  const goldRef = taskMeta.goldRef;
1153
- // 1. no_search — no `akm search` invocation anywhere in the trace.
1173
+ const correctAssetLoaded = runResult.trajectory?.correctAssetLoaded;
1174
+ // 1. Trajectory short-circuit: if events data confirms the gold asset was
1175
+ // loaded, the failure must be compliance-related, not discovery-related.
1176
+ // Return `loaded_ignored` immediately without scanning stdout.
1177
+ if (correctAssetLoaded === true) {
1178
+ return "loaded_ignored";
1179
+ }
1180
+ const trace = collectTrace(runResult);
1181
+ // 2. no_search / no_events — no `akm search` invocation anywhere in the trace.
1154
1182
  if (!hasAkmSearch(trace, runResult)) {
1183
+ // When there is no goldRef, correctAssetLoaded is always null (the metric
1184
+ // is undefined). We cannot tell whether the agent genuinely didn't search
1185
+ // or whether events data was simply absent. Use `no_events` to surface
1186
+ // this ambiguity rather than conflating it with `no_search`.
1187
+ if (!goldRef) {
1188
+ return "no_events";
1189
+ }
1155
1190
  return "no_search";
1156
1191
  }
1157
1192
  // Without a gold ref the search-based and load-based checks are undefined.
1158
- // We can only distinguish "no_search" from everything else.
1193
+ // We can only distinguish "no_search" / "no_events" from everything else.
1159
1194
  if (!goldRef) {
1160
1195
  return "unrelated_bug";
1161
1196
  }
1162
1197
  const searchRank = findGoldSearchRank(trace, goldRef);
1163
- // 2. search_no_gold — search ran (precondition above) but gold ref absent.
1198
+ // 3. search_no_gold — search ran (precondition above) but gold ref absent.
1199
+ // Only reachable when correctAssetLoaded is false or null (trajectory
1200
+ // data indicates gold was not loaded), because true is handled above.
1164
1201
  if (searchRank === null) {
1165
1202
  return "search_no_gold";
1166
1203
  }
1167
- // 3. search_low_rank — present but below the cutoff.
1204
+ // 4. search_low_rank — present but below the cutoff.
1168
1205
  if (searchRank > SEARCH_RANK_CUTOFF) {
1169
1206
  return "search_low_rank";
1170
1207
  }
1171
1208
  const goldLoaded = hasAkmShow(trace, runResult, goldRef);
1172
1209
  const otherRefLoaded = hasAkmShowOtherRef(trace, runResult, goldRef);
1173
- // 4. loaded_wrong — agent showed a non-gold ref AND never loaded the gold.
1210
+ // 5. loaded_wrong — agent showed a non-gold ref AND never loaded the gold.
1174
1211
  if (otherRefLoaded && !goldLoaded) {
1175
1212
  return "loaded_wrong";
1176
1213
  }
@@ -1181,7 +1218,7 @@ export function classifyFailureMode(taskMeta, runResult) {
1181
1218
  // table has no row for "found but never opened" — treat as unrelated_bug.
1182
1219
  return "unrelated_bug";
1183
1220
  }
1184
- // 5. loaded_ignored — verifier diagnostic indicates the action contradicts
1221
+ // 6. loaded_ignored — verifier diagnostic indicates the action contradicts
1185
1222
  // the loaded asset. Conservative heuristic: look for explicit "ignored"
1186
1223
  // or "not applied" markers in the verifier stdout. Without an LLM we
1187
1224
  // cannot detect subtler contradictions, so this branch only fires when
@@ -1189,7 +1226,7 @@ export function classifyFailureMode(taskMeta, runResult) {
1189
1226
  if (verifierIndicatesIgnored(runResult.verifierStdout)) {
1190
1227
  return "loaded_ignored";
1191
1228
  }
1192
- // 6. followed_wrong — gold loaded, apparently followed, verifier still
1229
+ // 7. followed_wrong — gold loaded, apparently followed, verifier still
1193
1230
  // failed. The §6.6 spec maps this to "the asset itself is wrong".
1194
1231
  return "followed_wrong";
1195
1232
  }
@@ -1992,6 +2029,8 @@ function perRun(run, taskMetadata) {
1992
2029
  let searchCount = 0;
1993
2030
  let showCount = 0;
1994
2031
  let feedbackCount = 0;
2032
+ let positiveFeedbackCount = 0;
2033
+ let negativeFeedbackCount = 0;
1995
2034
  const uniqueShowRefs = new Set();
1996
2035
  for (const ev of events) {
1997
2036
  if (ev.type === "akm_search")
@@ -2002,8 +2041,17 @@ function perRun(run, taskMetadata) {
2002
2041
  uniqueShowRefs.add(ev.assetRef);
2003
2042
  }
2004
2043
  }
2005
- else if (ev.type === "akm_feedback")
2044
+ else if (ev.type === "akm_feedback") {
2006
2045
  feedbackCount += 1;
2046
+ // Polarity is carried in args as "--positive" or "--negative".
2047
+ // Events sourced from events.jsonl also have args populated by
2048
+ // normalizeRunToTrace. Absence of both flags is treated as unknown
2049
+ // (contributes to feedbackCount but not to either polarity counter).
2050
+ if (ev.args?.includes("--positive"))
2051
+ positiveFeedbackCount += 1;
2052
+ else if (ev.args?.includes("--negative"))
2053
+ negativeFeedbackCount += 1;
2054
+ }
2007
2055
  }
2008
2056
  const totalToolCalls = searchCount + showCount + feedbackCount;
2009
2057
  // Run-start anchor: earliest parseable ts in the trace. We use the trace
@@ -2049,6 +2097,8 @@ function perRun(run, taskMetadata) {
2049
2097
  searchCount,
2050
2098
  showCount,
2051
2099
  feedbackCount,
2100
+ positiveFeedbackCount,
2101
+ negativeFeedbackCount,
2052
2102
  totalToolCalls,
2053
2103
  assetsLoadedCount: uniqueShowRefs.size,
2054
2104
  irrelevantAssetsLoadedCount,
@@ -2087,6 +2137,12 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
2087
2137
  totalToolCalls: 0,
2088
2138
  toolCallsPerSuccess: null,
2089
2139
  costPerSuccess: null,
2140
+ searchEngagementRate: 0,
2141
+ showEngagementRate: 0,
2142
+ feedbackEngagementRate: 0,
2143
+ searchToShowRatio: null,
2144
+ meanPositiveFeedbackCount: 0,
2145
+ meanNegativeFeedbackCount: 0,
2090
2146
  };
2091
2147
  }
2092
2148
  let searchSum = 0;
@@ -2115,12 +2171,25 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
2115
2171
  let parsedPassTokenSum = 0;
2116
2172
  let parsedPassCount = 0;
2117
2173
  let anyPassMissingMeasurement = false;
2174
+ let searchEngagedRuns = 0;
2175
+ let showEngagedRuns = 0;
2176
+ let feedbackEngagedRuns = 0;
2177
+ let positiveFeedbackSum = 0;
2178
+ let negativeFeedbackSum = 0;
2118
2179
  for (const row of perRun) {
2119
2180
  searchSum += row.searchCount;
2120
2181
  showSum += row.showCount;
2121
2182
  feedbackSum += row.feedbackCount;
2122
2183
  toolCallsSum += row.totalToolCalls;
2123
2184
  assetsSum += row.assetsLoadedCount;
2185
+ if (row.searchCount > 0)
2186
+ searchEngagedRuns += 1;
2187
+ if (row.showCount > 0)
2188
+ showEngagedRuns += 1;
2189
+ if (row.feedbackCount > 0)
2190
+ feedbackEngagedRuns += 1;
2191
+ positiveFeedbackSum += row.positiveFeedbackCount;
2192
+ negativeFeedbackSum += row.negativeFeedbackCount;
2124
2193
  if (row.irrelevantAssetsLoadedCount !== null) {
2125
2194
  irrelevantSum += row.irrelevantAssetsLoadedCount;
2126
2195
  irrelevantCount += 1;
@@ -2166,6 +2235,7 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
2166
2235
  const costPerSuccess = passingRuns === 0 || anyPassMissingMeasurement || parsedPassCount === 0
2167
2236
  ? null
2168
2237
  : parsedPassTokenSum / parsedPassCount;
2238
+ const searchToShowRatio = searchSum === 0 ? null : showSum / searchSum;
2169
2239
  return {
2170
2240
  totalRuns: n,
2171
2241
  passingRuns,
@@ -2182,6 +2252,12 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
2182
2252
  totalToolCalls: toolCallsSum,
2183
2253
  toolCallsPerSuccess,
2184
2254
  costPerSuccess,
2255
+ searchEngagementRate: searchEngagedRuns / n,
2256
+ showEngagementRate: showEngagedRuns / n,
2257
+ feedbackEngagementRate: feedbackEngagedRuns / n,
2258
+ searchToShowRatio,
2259
+ meanPositiveFeedbackCount: positiveFeedbackSum / n,
2260
+ meanNegativeFeedbackCount: negativeFeedbackSum / n,
2185
2261
  };
2186
2262
  }
2187
2263
  /**
@@ -11,6 +11,7 @@ function ptm(overrides = {}) {
11
11
  passRate: 0,
12
12
  passAt1: 0,
13
13
  tokensPerPass: null,
14
+ tokensPerRun: null,
14
15
  wallclockMs: 0,
15
16
  passRateStdev: 0,
16
17
  budgetExceededCount: 0,
@@ -209,6 +210,7 @@ describe("aggregateCorpus", () => {
209
210
  passRate: 1,
210
211
  passAt1: 1,
211
212
  tokensPerPass: 1000,
213
+ tokensPerRun: 1000,
212
214
  wallclockMs: 1000,
213
215
  passRateStdev: 0,
214
216
  budgetExceededCount: 0,
@@ -220,6 +222,7 @@ describe("aggregateCorpus", () => {
220
222
  passRate: 0,
221
223
  passAt1: 0,
222
224
  tokensPerPass: null,
225
+ tokensPerRun: null,
223
226
  wallclockMs: 2000,
224
227
  passRateStdev: 0,
225
228
  budgetExceededCount: 0,
@@ -239,6 +242,7 @@ describe("aggregateCorpus", () => {
239
242
  passRate: 0,
240
243
  passAt1: 0,
241
244
  tokensPerPass: null,
245
+ tokensPerRun: null,
242
246
  wallclockMs: 1000,
243
247
  passRateStdev: 0,
244
248
  budgetExceededCount: 0,
@@ -258,16 +262,16 @@ describe("aggregateCorpus", () => {
258
262
  });
259
263
  describe("delta helpers", () => {
260
264
  test("computeCorpusDelta — akm − noakm", () => {
261
- const noakm = { passRate: 0.3, tokensPerPass: 18000, wallclockMs: 4000 };
262
- const akm = { passRate: 0.7, tokensPerPass: 14000, wallclockMs: 3000 };
265
+ const noakm = { passRate: 0.3, tokensPerPass: 18000, tokensPerRun: null, wallclockMs: 4000 };
266
+ const akm = { passRate: 0.7, tokensPerPass: 14000, tokensPerRun: null, wallclockMs: 3000 };
263
267
  const d = computeCorpusDelta(noakm, akm);
264
268
  expect(d.passRate).toBeCloseTo(0.4);
265
269
  expect(d.tokensPerPass).toBeCloseTo(-4000);
266
270
  expect(d.wallclockMs).toBeCloseTo(-1000);
267
271
  });
268
272
  test("computeCorpusDelta — null tokensPerPass propagates", () => {
269
- const noakm = { passRate: 0, tokensPerPass: null, wallclockMs: 1 };
270
- const akm = { passRate: 1, tokensPerPass: 5, wallclockMs: 2 };
273
+ const noakm = { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 1 };
274
+ const akm = { passRate: 1, tokensPerPass: 5, tokensPerRun: null, wallclockMs: 2 };
271
275
  expect(computeCorpusDelta(noakm, akm).tokensPerPass).toBeNull();
272
276
  });
273
277
  test("computePerTaskDelta — same null-safety rule", () => {
@@ -275,6 +279,7 @@ describe("delta helpers", () => {
275
279
  passRate: 0,
276
280
  passAt1: 0,
277
281
  tokensPerPass: null,
282
+ tokensPerRun: null,
278
283
  wallclockMs: 0,
279
284
  passRateStdev: 0,
280
285
  budgetExceededCount: 0,
@@ -286,6 +291,7 @@ describe("delta helpers", () => {
286
291
  passRate: 1,
287
292
  passAt1: 1,
288
293
  tokensPerPass: 1000,
294
+ tokensPerRun: null,
289
295
  wallclockMs: 100,
290
296
  passRateStdev: 0,
291
297
  budgetExceededCount: 0,
@@ -0,0 +1,194 @@
1
+ /**
2
+ * opencode-config.ts — config-driven opencode provider materialisation.
3
+ *
4
+ * Loads the operator's bench provider file (committed fixture or
5
+ * gitignored `.local.json` overlay), validates it for safety (no hard-coded
6
+ * credentials, no extra top-level keys), and writes a minimal
7
+ * `opencode.json` into the per-run isolated `OPENCODE_CONFIG` directory.
8
+ *
9
+ * Design: `tests/bench/BENCH.md` §"Config-driven opencode provider".
10
+ */
11
+ import fs from "node:fs";
12
+ import path from "node:path";
13
+ /**
14
+ * Error class for bench provider-config problems.
15
+ *
16
+ * `isUsageError: true` → the caller should exit 2 (USAGE).
17
+ * `isUsageError: false` → the caller should exit 78 (CONFIG).
18
+ */
19
+ export class BenchConfigError extends Error {
20
+ code = "BENCH_CONFIG";
21
+ isUsageError;
22
+ constructor(message, isUsageError) {
23
+ super(message);
24
+ this.name = "BenchConfigError";
25
+ this.isUsageError = isUsageError;
26
+ }
27
+ }
28
+ /**
29
+ * Top-level keys that belong in a full opencode user-config but are FORBIDDEN
30
+ * in the bench provider file. The bench file is intentionally minimal — it
31
+ * only specifies provider entries. Any of these keys in the file means the
32
+ * operator has pasted a full opencode config into the bench slot, which could
33
+ * contain credentials, plugins, or permission overrides that the bench MUST
34
+ * NOT inherit.
35
+ */
36
+ const FORBIDDEN_TOPLEVEL_KEYS = new Set([
37
+ "plugin",
38
+ "mcp",
39
+ "permission",
40
+ "disabled_providers",
41
+ "small_model",
42
+ "snapshot",
43
+ ]);
44
+ /**
45
+ * Regex that an `apiKey` string value MUST match when present. The only
46
+ * allowed form is an env-ref placeholder: `{env:VAR_NAME}`.
47
+ */
48
+ const ENV_REF_RE = /^\{env:[A-Z_][A-Z0-9_]*\}$/;
49
+ /** Heuristic to detect literal API credentials accidentally pasted into the file. */
50
+ const CREDENTIAL_RE = /^sk-[A-Za-z0-9_-]{20,}$/;
51
+ /**
52
+ * Recursively scan `node` for credential heuristic violations and literal
53
+ * `apiKey` values that are not env-refs. Throws `BenchConfigError` on the
54
+ * first violation found.
55
+ *
56
+ * @param node The value to scan (any JSON value).
57
+ * @param jspath JSON-path-like string for error messages, e.g. `providers.myProvider.apiKey`.
58
+ */
59
+ function scanForCredentials(node, jspath) {
60
+ if (typeof node === "string") {
61
+ // Heuristic: reject anything that looks like an OpenAI/Anthropic-style key.
62
+ if (CREDENTIAL_RE.test(node)) {
63
+ throw new BenchConfigError(`bench provider file: credential heuristic triggered at "${jspath}" — literal API key detected; use {env:VAR_NAME} instead`, false);
64
+ }
65
+ return;
66
+ }
67
+ if (Array.isArray(node)) {
68
+ for (let i = 0; i < node.length; i++) {
69
+ scanForCredentials(node[i], `${jspath}[${i}]`);
70
+ }
71
+ return;
72
+ }
73
+ if (node !== null && typeof node === "object") {
74
+ for (const [key, value] of Object.entries(node)) {
75
+ const childPath = `${jspath}.${key}`;
76
+ // apiKey must be an env-ref if present as a string.
77
+ if (key === "apiKey" && typeof value === "string") {
78
+ if (!ENV_REF_RE.test(value)) {
79
+ throw new BenchConfigError(`bench provider file: "${childPath}" must be an env-ref (e.g. {env:MY_API_KEY}), not a literal value`, false);
80
+ }
81
+ // An env-ref is fine — don't recurse further into it.
82
+ continue;
83
+ }
84
+ scanForCredentials(value, childPath);
85
+ }
86
+ }
87
+ }
88
+ /**
89
+ * Load and validate a bench opencode providers JSON file.
90
+ *
91
+ * Throws:
92
+ * - `BenchConfigError(isUsageError: true)` if the file does not exist.
93
+ * - `BenchConfigError(isUsageError: false)` if JSON parse fails or the file
94
+ * fails validation (bad schema version, forbidden top-level keys, detected
95
+ * credentials).
96
+ */
97
+ export function loadOpencodeProviders(absPath) {
98
+ // ── File existence ────────────────────────────────────────────────────────
99
+ let raw;
100
+ try {
101
+ raw = fs.readFileSync(absPath, "utf8");
102
+ }
103
+ catch (err) {
104
+ const isEnoent = err.code === "ENOENT";
105
+ if (isEnoent) {
106
+ throw new BenchConfigError(`bench provider file not found: ${absPath}`, true);
107
+ }
108
+ throw new BenchConfigError(`bench provider file: could not read "${absPath}": ${err instanceof Error ? err.message : String(err)}`, false);
109
+ }
110
+ // ── JSON parse ────────────────────────────────────────────────────────────
111
+ let parsed;
112
+ try {
113
+ parsed = JSON.parse(raw);
114
+ }
115
+ catch (err) {
116
+ throw new BenchConfigError(`bench provider file: JSON parse error in "${absPath}": ${err instanceof Error ? err.message : String(err)}`, false);
117
+ }
118
+ if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
119
+ throw new BenchConfigError(`bench provider file: root must be a JSON object (got ${Array.isArray(parsed) ? "array" : typeof parsed})`, false);
120
+ }
121
+ const obj = parsed;
122
+ // ── Forbidden top-level keys ──────────────────────────────────────────────
123
+ for (const key of Object.keys(obj)) {
124
+ if (FORBIDDEN_TOPLEVEL_KEYS.has(key)) {
125
+ throw new BenchConfigError(`bench provider file: forbidden top-level key "${key}" — the bench provider file must contain only "schemaVersion", "defaultModel", and "providers"`, false);
126
+ }
127
+ }
128
+ // ── schemaVersion ─────────────────────────────────────────────────────────
129
+ if (obj.schemaVersion !== 1) {
130
+ throw new BenchConfigError(`bench provider file: unsupported schemaVersion ${JSON.stringify(obj.schemaVersion)}; expected 1`, false);
131
+ }
132
+ // ── providers ─────────────────────────────────────────────────────────────
133
+ if (obj.providers === null || typeof obj.providers !== "object" || Array.isArray(obj.providers)) {
134
+ throw new BenchConfigError(`bench provider file: "providers" must be an object`, false);
135
+ }
136
+ const providers = obj.providers;
137
+ // ── Credential scan ───────────────────────────────────────────────────────
138
+ scanForCredentials(providers, "providers");
139
+ return {
140
+ source: absPath,
141
+ providers,
142
+ ...(typeof obj.defaultModel === "string" ? { defaultModel: obj.defaultModel } : {}),
143
+ };
144
+ }
145
+ /**
146
+ * Given a model ID (e.g. `"don/mlx-community/qwen3.6-35b-a3b"`), split on
147
+ * the first `/` to get the provider key and look it up in `loaded.providers`.
148
+ *
149
+ * Throws `BenchConfigError` if the provider key is not found.
150
+ */
151
+ export function selectProviderForModel(loaded, modelId) {
152
+ const slashIdx = modelId.indexOf("/");
153
+ const providerKey = slashIdx === -1 ? modelId : modelId.slice(0, slashIdx);
154
+ if (!(providerKey in loaded.providers)) {
155
+ throw new BenchConfigError(`bench provider file: model ID "${modelId}" maps to provider key "${providerKey}", which is not present in ${loaded.source}; available: ${Object.keys(loaded.providers).join(", ") || "(none)"}`, false);
156
+ }
157
+ return { providerKey, entry: loaded.providers[providerKey] };
158
+ }
159
+ /**
160
+ * Write a minimal `opencode.json` into `opencodeConfigDir` for the given
161
+ * provider selection. The file contains exactly two top-level keys:
162
+ * `$schema` and `provider`.
163
+ *
164
+ * Written with mode `0o600` so the file is not world-readable (it may
165
+ * contain env-ref placeholders that hint at secret variable names).
166
+ */
167
+ export function materializeOpencodeConfig(opencodeConfigDir, selected,
168
+ /** Full model id (e.g. "don/mlx-community/qwen3.6-35b-a3b") written as the
169
+ * top-level `model` key so opencode uses it without a --model flag. */
170
+ modelId) {
171
+ const config = {
172
+ $schema: "https://opencode.ai/config.json",
173
+ model: modelId,
174
+ provider: {
175
+ [selected.providerKey]: selected.entry,
176
+ },
177
+ // Explicitly allow all tools so opencode run (non-interactive) doesn't
178
+ // silently skip bash/file operations due to missing permission config.
179
+ permission: {
180
+ bash: "allow",
181
+ edit: "allow",
182
+ write: "allow",
183
+ read: "allow",
184
+ webfetch: "allow",
185
+ },
186
+ // Disable operator plugins during bench runs. Plugins like akm-opencode
187
+ // run their own session lifecycle hooks (warmIndexInBackground, akm setup
188
+ // prompts, AKM_STASH_DIR overrides in shell.env) that interfere with the
189
+ // bench's isolated fixture stash and cause stash mismatch failures.
190
+ plugin: [],
191
+ };
192
+ const outPath = path.join(opencodeConfigDir, "opencode.json");
193
+ fs.writeFileSync(outPath, JSON.stringify(config, null, 2), { mode: 0o600 });
194
+ }