npm - akm-cli - Versions diffs - 0.7.0-rc1 → 0.7.0 - Mend

akm-cli 0.7.0-rc1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (83) hide show

package/dist/src/cli.js +100 -16
package/dist/src/commands/config-cli.js +42 -0
package/dist/src/commands/history.js +78 -7
package/dist/src/commands/registry-search.js +69 -6
package/dist/src/commands/search.js +30 -3
package/dist/src/commands/show.js +29 -0
package/dist/src/commands/source-add.js +5 -1
package/dist/src/commands/source-manage.js +7 -1
package/dist/src/core/config.js +28 -0
package/dist/src/indexer/db-search.js +1 -0
package/dist/src/indexer/indexer.js +16 -2
package/dist/src/indexer/matchers.js +1 -1
package/dist/src/indexer/search-source.js +4 -2
package/dist/src/integrations/agent/profiles.js +1 -1
package/dist/src/integrations/agent/spawn.js +67 -16
package/dist/src/integrations/github.js +9 -3
package/dist/src/llm/embedders/remote.js +37 -3
package/dist/src/output/cli-hints.js +15 -2
package/dist/src/output/renderers.js +3 -1
package/dist/src/output/shapes.js +8 -1
package/dist/src/output/text.js +156 -3
package/dist/src/registry/build-index.js +5 -4
package/dist/src/registry/providers/static-index.js +3 -1
package/dist/src/setup/setup.js +9 -0
package/dist/src/wiki/wiki.js +54 -6
package/dist/src/workflows/runs.js +37 -3
package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
package/dist/tests/bench/attribution.test.js +24 -23
package/dist/tests/bench/cleanup.js +31 -0
package/dist/tests/bench/cli.js +366 -31
package/dist/tests/bench/cli.test.js +282 -14
package/dist/tests/bench/corpus.js +3 -0
package/dist/tests/bench/corpus.test.js +10 -10
package/dist/tests/bench/doctor.js +525 -0
package/dist/tests/bench/driver.js +77 -22
package/dist/tests/bench/driver.test.js +142 -1
package/dist/tests/bench/environment.js +233 -0
package/dist/tests/bench/environment.test.js +199 -0
package/dist/tests/bench/evolve.js +67 -0
package/dist/tests/bench/evolve.test.js +12 -4
package/dist/tests/bench/failure-modes.test.js +52 -3
package/dist/tests/bench/feedback-integrity.test.js +3 -2
package/dist/tests/bench/leakage.test.js +105 -2
package/dist/tests/bench/learning-curve.test.js +3 -2
package/dist/tests/bench/metrics.js +102 -26
package/dist/tests/bench/metrics.test.js +10 -4
package/dist/tests/bench/opencode-config.js +194 -0
package/dist/tests/bench/opencode-config.test.js +370 -0
package/dist/tests/bench/report.js +73 -9
package/dist/tests/bench/report.test.js +59 -10
package/dist/tests/bench/run-config.js +355 -0
package/dist/tests/bench/run-config.test.js +298 -0
package/dist/tests/bench/run-curate-test.js +32 -0
package/dist/tests/bench/run-failing-tasks.js +56 -0
package/dist/tests/bench/run-full-bench.js +51 -0
package/dist/tests/bench/run-items36-targeted.js +69 -0
package/dist/tests/bench/run-nano-quick.js +42 -0
package/dist/tests/bench/run-waveg-targeted.js +62 -0
package/dist/tests/bench/runner.js +257 -94
package/dist/tests/bench/tmp.js +90 -0
package/dist/tests/bench/trajectory.js +2 -2
package/dist/tests/bench/verifier.js +6 -1
package/dist/tests/bench/workflow-spec.js +11 -24
package/dist/tests/bench/workflow-spec.test.js +1 -1
package/dist/tests/bench/workflow-trace.js +34 -0
package/dist/tests/cli-errors.test.js +1 -0
package/dist/tests/commands/history.test.js +195 -0
package/dist/tests/config.test.js +25 -0
package/dist/tests/e2e.test.js +23 -2
package/dist/tests/fixtures/stashes/load.js +1 -1
package/dist/tests/fixtures/stashes/load.test.js +11 -2
package/dist/tests/indexer.test.js +12 -1
package/dist/tests/output-baseline.test.js +2 -1
package/dist/tests/output-shapes-unit.test.js +3 -1
package/dist/tests/registry-build-index.test.js +17 -1
package/dist/tests/registry-providers/static-index.test.js +34 -0
package/dist/tests/registry-search.test.js +200 -0
package/dist/tests/remember-frontmatter.test.js +11 -13
package/dist/tests/source-qa-fixes.test.js +18 -0
package/dist/tests/source-registry.test.js +3 -3
package/dist/tests/source-source.test.js +61 -1
package/dist/tests/workflow-qa-fixes.test.js +18 -0
package/package.json +1 -1

package/dist/tests/bench/metrics.js CHANGED Viewed

@@ -88,6 +88,7 @@ export function aggregatePerTask(results) {
             harnessErrorCount: 0,
             count: 0,
             runsWithMeasuredTokens: 0,
+            tokensPerRun: null,
         };
     }
     let passes = 0;
@@ -97,12 +98,16 @@ export function aggregatePerTask(results) {
     let budgetExceeded = 0;
     let harnessError = 0;
     let runsWithMeasuredTokens = 0;
+    let totalTokensInMeasuredRuns = 0;
+    let measuredRuns = 0;
     // For the standard deviation we need a fixed-iteration buffer of pass/fail.
     const passSamples = [];
     for (const r of results) {
         totalWallclock += r.wallclockMs;
         if (isMeasured(r)) {
             runsWithMeasuredTokens += 1;
+            measuredRuns += 1;
+            totalTokensInMeasuredRuns += r.tokens.input + r.tokens.output;
         }
         const isPass = r.outcome === "pass" ? 1 : 0;
         passSamples.push(isPass);
@@ -135,6 +140,7 @@ export function aggregatePerTask(results) {
         harnessErrorCount: harnessError,
         count: results.length,
         runsWithMeasuredTokens,
+        tokensPerRun: measuredRuns === 0 ? null : totalTokensInMeasuredRuns / measuredRuns,
     };
 }
 /** Sample standard deviation. Returns 0 for length ≤ 1 (no spread to measure). */
@@ -156,13 +162,15 @@ function stdev(values) {
 export function aggregateCorpus(perTask) {
     const tasks = Object.values(perTask);
     if (tasks.length === 0) {
-        return { passRate: 0, tokensPerPass: null, wallclockMs: 0 };
+        return { passRate: 0, tokensPerPass: null, wallclockMs: 0, tokensPerRun: null };
     }
     const passRate = tasks.reduce((a, t) => a + t.passRate, 0) / tasks.length;
     const wallclockMs = tasks.reduce((a, t) => a + t.wallclockMs, 0) / tasks.length;
     const tppValues = tasks.map((t) => t.tokensPerPass).filter((v) => v !== null);
     const tokensPerPass = tppValues.length === 0 ? null : tppValues.reduce((a, b) => a + b, 0) / tppValues.length;
-    return { passRate, tokensPerPass, wallclockMs };
+    const tprValues = tasks.map((t) => t.tokensPerRun).filter((v) => v !== null);
+    const tokensPerRun = tprValues.length === 0 ? null : tprValues.reduce((a, b) => a + b, 0) / tprValues.length;
+    return { passRate, tokensPerPass, wallclockMs, tokensPerRun };
 }
 /**
  * Compute the akm − noakm delta. Negative `tokensPerPass`/`wallclockMs` mean
@@ -174,6 +182,7 @@ export function computeCorpusDelta(noakm, akm) {
         passRate: akm.passRate - noakm.passRate,
         tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
         wallclockMs: akm.wallclockMs - noakm.wallclockMs,
+        tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
     };
 }
 /** Per-task delta with the same null-safety as the corpus delta. */
@@ -182,6 +191,7 @@ export function computePerTaskDelta(noakm, akm) {
         passRate: akm.passRate - noakm.passRate,
         tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
         wallclockMs: akm.wallclockMs - noakm.wallclockMs,
+        tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
     };
 }
 /**
@@ -1126,51 +1136,78 @@ const SEARCH_RANK_CUTOFF = 5;
 /** Cap on the number of characters of `verifierStdout` we substring-scan. Mirrors trajectory.ts. */
 const FAILURE_MODE_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
 /**
- * Classify a single failed run into one of the seven §6.6 labels. Pure
- * function — string-matches `runResult.events[]` and `runResult.verifierStdout`,
- * never calls an LLM, never touches the filesystem.
+ * Classify a single failed run into one of the §6.6 labels. Pure function —
+ * consults `runResult.trajectory.correctAssetLoaded` first (trajectory data
+ * is authoritative when present), then falls back to string-matching
+ * `runResult.events[]` and `runResult.verifierStdout`. Never calls an LLM,
+ * never touches the filesystem.
  *
  * Decision tree (priority order — first match wins):
  *   1. Run not failed (`pass`, `budget_exceeded`, `harness_error`) → `null`.
- *   2. No `akm search` call in the trace → `no_search`.
- *   3. Search ran; gold ref absent from search results → `search_no_gold`.
- *   4. Gold ref present in search results at rank > 5 → `search_low_rank`.
- *   5. `akm show` invoked on a non-gold ref AND gold ref never loaded → `loaded_wrong`.
- *   6. Gold ref loaded; verifier output suggests the action contradicts the
- *      asset's guidance (heuristic: verifier mentions the gold pattern was
- *      explicitly NOT followed) → `loaded_ignored`.
- *   7. Gold ref loaded and apparently followed → `followed_wrong`.
- *   8. Default → `unrelated_bug`.
- *
- * Tasks without `goldRef`: rules that depend on the gold ref (3-7) are
- * skipped; only `no_search` and `unrelated_bug` are reachable.
+ *   2. `trajectory.correctAssetLoaded === true` → the agent loaded the gold
+ *      asset but still failed. This is `loaded_ignored` (agent wrote from
+ *      memory instead of applying asset content). This short-circuit fixes
+ *      the 2026-05-03 baseline bug where 24/25 `search_no_gold` labels were
+ *      wrong because the classifier didn't consult trajectory data.
+ *   3. No `akm search` call in the trace:
+ *      a. If task has no `goldRef` (so `correctAssetLoaded` is always null)
+ *         → `no_events` (trajectory metric undefined; cannot distinguish
+ *         "agent ran but events absent" from "agent never ran").
+ *      b. Otherwise → `no_search`.
+ *   4. Search ran, no goldRef → `unrelated_bug`.
+ *   5. Search ran; gold ref absent from results → `search_no_gold`.
+ *      (Only reachable when `correctAssetLoaded` is false or null, since
+ *      true is handled in step 2.)
+ *   6. Gold ref present at rank > 5 → `search_low_rank`.
+ *   7. `akm show` invoked on a non-gold ref AND gold ref never loaded
+ *      → `loaded_wrong`.
+ *   8. Gold ref loaded; verifier output suggests the action contradicts the
+ *      asset's guidance → `loaded_ignored`.
+ *   9. Gold ref loaded and apparently followed → `followed_wrong`.
+ *  10. Default → `unrelated_bug`.
  */
 export function classifyFailureMode(taskMeta, runResult) {
     if (runResult.outcome !== "fail")
         return null;
-    const trace = collectTrace(runResult);
     const goldRef = taskMeta.goldRef;
-    // 1. no_search — no `akm search` invocation anywhere in the trace.
+    const correctAssetLoaded = runResult.trajectory?.correctAssetLoaded;
+    // 1. Trajectory short-circuit: if events data confirms the gold asset was
+    //    loaded, the failure must be compliance-related, not discovery-related.
+    //    Return `loaded_ignored` immediately without scanning stdout.
+    if (correctAssetLoaded === true) {
+        return "loaded_ignored";
+    }
+    const trace = collectTrace(runResult);
+    // 2. no_search / no_events — no `akm search` invocation anywhere in the trace.
     if (!hasAkmSearch(trace, runResult)) {
+        // When there is no goldRef, correctAssetLoaded is always null (the metric
+        // is undefined). We cannot tell whether the agent genuinely didn't search
+        // or whether events data was simply absent. Use `no_events` to surface
+        // this ambiguity rather than conflating it with `no_search`.
+        if (!goldRef) {
+            return "no_events";
+        }
         return "no_search";
     }
     // Without a gold ref the search-based and load-based checks are undefined.
-    // We can only distinguish "no_search" from everything else.
+    // We can only distinguish "no_search" / "no_events" from everything else.
     if (!goldRef) {
         return "unrelated_bug";
     }
     const searchRank = findGoldSearchRank(trace, goldRef);
-    // 2. search_no_gold — search ran (precondition above) but gold ref absent.
+    // 3. search_no_gold — search ran (precondition above) but gold ref absent.
+    //    Only reachable when correctAssetLoaded is false or null (trajectory
+    //    data indicates gold was not loaded), because true is handled above.
     if (searchRank === null) {
         return "search_no_gold";
     }
-    // 3. search_low_rank — present but below the cutoff.
+    // 4. search_low_rank — present but below the cutoff.
     if (searchRank > SEARCH_RANK_CUTOFF) {
         return "search_low_rank";
     }
     const goldLoaded = hasAkmShow(trace, runResult, goldRef);
     const otherRefLoaded = hasAkmShowOtherRef(trace, runResult, goldRef);
-    // 4. loaded_wrong — agent showed a non-gold ref AND never loaded the gold.
+    // 5. loaded_wrong — agent showed a non-gold ref AND never loaded the gold.
     if (otherRefLoaded && !goldLoaded) {
         return "loaded_wrong";
     }
@@ -1181,7 +1218,7 @@ export function classifyFailureMode(taskMeta, runResult) {
         // table has no row for "found but never opened" — treat as unrelated_bug.
         return "unrelated_bug";
     }
-    // 5. loaded_ignored — verifier diagnostic indicates the action contradicts
+    // 6. loaded_ignored — verifier diagnostic indicates the action contradicts
     //    the loaded asset. Conservative heuristic: look for explicit "ignored"
     //    or "not applied" markers in the verifier stdout. Without an LLM we
     //    cannot detect subtler contradictions, so this branch only fires when
@@ -1189,7 +1226,7 @@ export function classifyFailureMode(taskMeta, runResult) {
     if (verifierIndicatesIgnored(runResult.verifierStdout)) {
         return "loaded_ignored";
     }
-    // 6. followed_wrong — gold loaded, apparently followed, verifier still
+    // 7. followed_wrong — gold loaded, apparently followed, verifier still
     //    failed. The §6.6 spec maps this to "the asset itself is wrong".
     return "followed_wrong";
 }
@@ -1992,6 +2029,8 @@ function perRun(run, taskMetadata) {
     let searchCount = 0;
     let showCount = 0;
     let feedbackCount = 0;
+    let positiveFeedbackCount = 0;
+    let negativeFeedbackCount = 0;
     const uniqueShowRefs = new Set();
     for (const ev of events) {
         if (ev.type === "akm_search")
@@ -2002,8 +2041,17 @@ function perRun(run, taskMetadata) {
                 uniqueShowRefs.add(ev.assetRef);
             }
         }
-        else if (ev.type === "akm_feedback")
+        else if (ev.type === "akm_feedback") {
             feedbackCount += 1;
+            // Polarity is carried in args as "--positive" or "--negative".
+            // Events sourced from events.jsonl also have args populated by
+            // normalizeRunToTrace. Absence of both flags is treated as unknown
+            // (contributes to feedbackCount but not to either polarity counter).
+            if (ev.args?.includes("--positive"))
+                positiveFeedbackCount += 1;
+            else if (ev.args?.includes("--negative"))
+                negativeFeedbackCount += 1;
+        }
     }
     const totalToolCalls = searchCount + showCount + feedbackCount;
     // Run-start anchor: earliest parseable ts in the trace. We use the trace
@@ -2049,6 +2097,8 @@ function perRun(run, taskMetadata) {
         searchCount,
         showCount,
         feedbackCount,
+        positiveFeedbackCount,
+        negativeFeedbackCount,
         totalToolCalls,
         assetsLoadedCount: uniqueShowRefs.size,
         irrelevantAssetsLoadedCount,
@@ -2087,6 +2137,12 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
             totalToolCalls: 0,
             toolCallsPerSuccess: null,
             costPerSuccess: null,
+            searchEngagementRate: 0,
+            showEngagementRate: 0,
+            feedbackEngagementRate: 0,
+            searchToShowRatio: null,
+            meanPositiveFeedbackCount: 0,
+            meanNegativeFeedbackCount: 0,
         };
     }
     let searchSum = 0;
@@ -2115,12 +2171,25 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
     let parsedPassTokenSum = 0;
     let parsedPassCount = 0;
     let anyPassMissingMeasurement = false;
+    let searchEngagedRuns = 0;
+    let showEngagedRuns = 0;
+    let feedbackEngagedRuns = 0;
+    let positiveFeedbackSum = 0;
+    let negativeFeedbackSum = 0;
     for (const row of perRun) {
         searchSum += row.searchCount;
         showSum += row.showCount;
         feedbackSum += row.feedbackCount;
         toolCallsSum += row.totalToolCalls;
         assetsSum += row.assetsLoadedCount;
+        if (row.searchCount > 0)
+            searchEngagedRuns += 1;
+        if (row.showCount > 0)
+            showEngagedRuns += 1;
+        if (row.feedbackCount > 0)
+            feedbackEngagedRuns += 1;
+        positiveFeedbackSum += row.positiveFeedbackCount;
+        negativeFeedbackSum += row.negativeFeedbackCount;
         if (row.irrelevantAssetsLoadedCount !== null) {
             irrelevantSum += row.irrelevantAssetsLoadedCount;
             irrelevantCount += 1;
@@ -2166,6 +2235,7 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
     const costPerSuccess = passingRuns === 0 || anyPassMissingMeasurement || parsedPassCount === 0
         ? null
         : parsedPassTokenSum / parsedPassCount;
+    const searchToShowRatio = searchSum === 0 ? null : showSum / searchSum;
     return {
         totalRuns: n,
         passingRuns,
@@ -2182,6 +2252,12 @@ export function aggregateAkmOverhead(perRun, rawRuns = []) {
         totalToolCalls: toolCallsSum,
         toolCallsPerSuccess,
         costPerSuccess,
+        searchEngagementRate: searchEngagedRuns / n,
+        showEngagementRate: showEngagedRuns / n,
+        feedbackEngagementRate: feedbackEngagedRuns / n,
+        searchToShowRatio,
+        meanPositiveFeedbackCount: positiveFeedbackSum / n,
+        meanNegativeFeedbackCount: negativeFeedbackSum / n,
     };
 }
 /**

package/dist/tests/bench/metrics.test.js CHANGED Viewed

@@ -11,6 +11,7 @@ function ptm(overrides = {}) {
         passRate: 0,
         passAt1: 0,
         tokensPerPass: null,
+        tokensPerRun: null,
         wallclockMs: 0,
         passRateStdev: 0,
         budgetExceededCount: 0,
@@ -209,6 +210,7 @@ describe("aggregateCorpus", () => {
                 passRate: 1,
                 passAt1: 1,
                 tokensPerPass: 1000,
+                tokensPerRun: 1000,
                 wallclockMs: 1000,
                 passRateStdev: 0,
                 budgetExceededCount: 0,
@@ -220,6 +222,7 @@ describe("aggregateCorpus", () => {
                 passRate: 0,
                 passAt1: 0,
                 tokensPerPass: null,
+                tokensPerRun: null,
                 wallclockMs: 2000,
                 passRateStdev: 0,
                 budgetExceededCount: 0,
@@ -239,6 +242,7 @@ describe("aggregateCorpus", () => {
                 passRate: 0,
                 passAt1: 0,
                 tokensPerPass: null,
+                tokensPerRun: null,
                 wallclockMs: 1000,
                 passRateStdev: 0,
                 budgetExceededCount: 0,
@@ -258,16 +262,16 @@ describe("aggregateCorpus", () => {
 });
 describe("delta helpers", () => {
     test("computeCorpusDelta — akm − noakm", () => {
-        const noakm = { passRate: 0.3, tokensPerPass: 18000, wallclockMs: 4000 };
-        const akm = { passRate: 0.7, tokensPerPass: 14000, wallclockMs: 3000 };
+        const noakm = { passRate: 0.3, tokensPerPass: 18000, tokensPerRun: null, wallclockMs: 4000 };
+        const akm = { passRate: 0.7, tokensPerPass: 14000, tokensPerRun: null, wallclockMs: 3000 };
         const d = computeCorpusDelta(noakm, akm);
         expect(d.passRate).toBeCloseTo(0.4);
         expect(d.tokensPerPass).toBeCloseTo(-4000);
         expect(d.wallclockMs).toBeCloseTo(-1000);
     });
     test("computeCorpusDelta — null tokensPerPass propagates", () => {
-        const noakm = { passRate: 0, tokensPerPass: null, wallclockMs: 1 };
-        const akm = { passRate: 1, tokensPerPass: 5, wallclockMs: 2 };
+        const noakm = { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 1 };
+        const akm = { passRate: 1, tokensPerPass: 5, tokensPerRun: null, wallclockMs: 2 };
         expect(computeCorpusDelta(noakm, akm).tokensPerPass).toBeNull();
     });
     test("computePerTaskDelta — same null-safety rule", () => {
@@ -275,6 +279,7 @@ describe("delta helpers", () => {
             passRate: 0,
             passAt1: 0,
             tokensPerPass: null,
+            tokensPerRun: null,
             wallclockMs: 0,
             passRateStdev: 0,
             budgetExceededCount: 0,
@@ -286,6 +291,7 @@ describe("delta helpers", () => {
             passRate: 1,
             passAt1: 1,
             tokensPerPass: 1000,
+            tokensPerRun: null,
             wallclockMs: 100,
             passRateStdev: 0,
             budgetExceededCount: 0,

package/dist/tests/bench/opencode-config.js ADDED Viewed

@@ -0,0 +1,194 @@
+/**
+ * opencode-config.ts — config-driven opencode provider materialisation.
+ *
+ * Loads the operator's bench provider file (committed fixture or
+ * gitignored `.local.json` overlay), validates it for safety (no hard-coded
+ * credentials, no forbidden top-level keys), and writes a minimal
+ * `opencode.json` into the per-run isolated `OPENCODE_CONFIG` directory.
+ *
+ * Design: `tests/bench/BENCH.md` §"Config-driven opencode provider".
+ */
+import fs from "node:fs";
+import path from "node:path";
+/**
+ * Error class for bench provider-config problems.
+ *
+ * Every instance carries `code = "BENCH_CONFIG"` and `name = "BenchConfigError"`.
+ *
+ * `isUsageError: true`  → the caller should exit 2 (USAGE).
+ * `isUsageError: false` → the caller should exit 78 (CONFIG).
+ *
+ * @param message      Human-readable description, passed through to `Error`.
+ * @param isUsageError Stored unchanged on the instance; selects the exit code
+ *                     as described above.
+ */
+export class BenchConfigError extends Error {
+    code = "BENCH_CONFIG";
+    isUsageError;
+    constructor(message, isUsageError) {
+        super(message);
+        this.name = "BenchConfigError";
+        this.isUsageError = isUsageError;
+    }
+}
+/**
+ * Top-level keys that belong in a full opencode user-config but are FORBIDDEN
+ * in the bench provider file. The bench file is intentionally minimal — it
+ * only specifies provider entries. Any of these keys in the file means the
+ * operator has pasted a full opencode config into the bench slot, which could
+ * contain credentials, plugins, or permission overrides that the bench MUST
+ * NOT inherit.
+ *
+ * This is a deny-list: `loadOpencodeProviders` rejects a file only when one of
+ * these exact key names appears at the root. It does not compare the root keys
+ * against an allow-list, so other unrecognised keys are not rejected by this
+ * check.
+ */
+const FORBIDDEN_TOPLEVEL_KEYS = new Set([
+    "plugin",
+    "mcp",
+    "permission",
+    "disabled_providers",
+    "small_model",
+    "snapshot",
+]);
+/**
+ * Regex that an `apiKey` string value MUST match when present. The only
+ * allowed form is an env-ref placeholder: `{env:VAR_NAME}`.
+ *
+ * The pattern is anchored at both ends. `VAR_NAME` must start with an
+ * uppercase ASCII letter or underscore, followed by any number of uppercase
+ * letters, digits, or underscores — so a lowercase name such as
+ * `{env:my_key}` does not match.
+ */
+const ENV_REF_RE = /^\{env:[A-Z_][A-Z0-9_]*\}$/;
+/**
+ * Heuristic to detect literal API credentials accidentally pasted into the file.
+ *
+ * Matches a whole string that starts with `sk-` followed by at least 20
+ * characters drawn from ASCII letters, digits, `_`, and `-`. Because the
+ * pattern is anchored at both ends, a key surrounded by any other text
+ * (for example a leading `Bearer `) does not match.
+ */
+const CREDENTIAL_RE = /^sk-[A-Za-z0-9_-]{20,}$/;
+/**
+ * Recursively scan `node` for credential heuristic violations and literal
+ * `apiKey` values that are not env-refs. Throws `BenchConfigError` on the
+ * first violation found.
+ *
+ * Behaviour by value type:
+ * - string: rejected when the whole string matches `CREDENTIAL_RE`.
+ * - array: each element is scanned, with `[i]` appended to the path.
+ * - non-null object: each entry from `Object.entries` is scanned. A string
+ *   value under a key named exactly `apiKey` (at any depth) must match
+ *   `ENV_REF_RE`; a matching value is accepted without further scanning.
+ *   A non-string `apiKey` value is not special-cased and is scanned like any
+ *   other value.
+ * - anything else (numbers, booleans, `null`): ignored.
+ *
+ * Both kinds of violation are raised with `isUsageError = false`.
+ *
+ * @param node   The value to scan (any JSON value).
+ * @param jspath JSON-path-like string for error messages, e.g. `providers.myProvider.apiKey`.
+ * @returns Nothing; the function either returns normally or throws.
+ */
+function scanForCredentials(node, jspath) {
+    if (typeof node === "string") {
+        // Heuristic: reject anything that looks like an OpenAI/Anthropic-style key.
+        if (CREDENTIAL_RE.test(node)) {
+            throw new BenchConfigError(`bench provider file: credential heuristic triggered at "${jspath}" — literal API key detected; use {env:VAR_NAME} instead`, false);
+        }
+        return;
+    }
+    if (Array.isArray(node)) {
+        for (let i = 0; i < node.length; i++) {
+            scanForCredentials(node[i], `${jspath}[${i}]`);
+        }
+        return;
+    }
+    if (node !== null && typeof node === "object") {
+        for (const [key, value] of Object.entries(node)) {
+            const childPath = `${jspath}.${key}`;
+            // apiKey must be an env-ref if present as a string.
+            if (key === "apiKey" && typeof value === "string") {
+                if (!ENV_REF_RE.test(value)) {
+                    throw new BenchConfigError(`bench provider file: "${childPath}" must be an env-ref (e.g. {env:MY_API_KEY}), not a literal value`, false);
+                }
+                // An env-ref is fine — don't recurse further into it.
+                continue;
+            }
+            scanForCredentials(value, childPath);
+        }
+    }
+}
+/**
+ * Load and validate a bench opencode providers JSON file.
+ *
+ * Validation order: read the file, parse JSON, require an object root, reject
+ * forbidden top-level keys, require `schemaVersion === 1`, require `providers`
+ * to be a non-null, non-array object, then run the credential scan over
+ * `providers`. Only the `providers` subtree is scanned for credentials; other
+ * top-level values are not.
+ *
+ * @param absPath Path of the providers file, passed as-is to `fs.readFileSync`.
+ * @returns `{ source, providers }`, plus `defaultModel` when the file has a
+ *          string `defaultModel`. `source` is `absPath`.
+ *
+ * Throws:
+ * - `BenchConfigError(isUsageError: true)` if the file does not exist (`ENOENT`).
+ * - `BenchConfigError(isUsageError: false)` if the file cannot be read for any
+ *   other reason, if JSON parse fails, or if the file fails validation
+ *   (non-object root, forbidden top-level keys, bad schema version,
+ *   non-object `providers`, detected credentials).
+ */
+export function loadOpencodeProviders(absPath) {
+    // ── File existence ────────────────────────────────────────────────────────
+    let raw;
+    try {
+        raw = fs.readFileSync(absPath, "utf8");
+    }
+    catch (err) {
+        const isEnoent = err.code === "ENOENT";
+        if (isEnoent) {
+            throw new BenchConfigError(`bench provider file not found: ${absPath}`, true);
+        }
+        throw new BenchConfigError(`bench provider file: could not read "${absPath}": ${err instanceof Error ? err.message : String(err)}`, false);
+    }
+    // ── JSON parse ────────────────────────────────────────────────────────────
+    let parsed;
+    try {
+        parsed = JSON.parse(raw);
+    }
+    catch (err) {
+        throw new BenchConfigError(`bench provider file: JSON parse error in "${absPath}": ${err instanceof Error ? err.message : String(err)}`, false);
+    }
+    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
+        throw new BenchConfigError(`bench provider file: root must be a JSON object (got ${Array.isArray(parsed) ? "array" : typeof parsed})`, false);
+    }
+    const obj = parsed;
+    // ── Forbidden top-level keys ──────────────────────────────────────────────
+    for (const key of Object.keys(obj)) {
+        if (FORBIDDEN_TOPLEVEL_KEYS.has(key)) {
+            throw new BenchConfigError(`bench provider file: forbidden top-level key "${key}" — the bench provider file must contain only "schemaVersion", "defaultModel", and "providers"`, false);
+        }
+    }
+    // ── schemaVersion ─────────────────────────────────────────────────────────
+    if (obj.schemaVersion !== 1) {
+        throw new BenchConfigError(`bench provider file: unsupported schemaVersion ${JSON.stringify(obj.schemaVersion)}; expected 1`, false);
+    }
+    // ── providers ─────────────────────────────────────────────────────────────
+    if (obj.providers === null || typeof obj.providers !== "object" || Array.isArray(obj.providers)) {
+        throw new BenchConfigError(`bench provider file: "providers" must be an object`, false);
+    }
+    const providers = obj.providers;
+    // ── Credential scan ───────────────────────────────────────────────────────
+    scanForCredentials(providers, "providers");
+    return {
+        source: absPath,
+        providers,
+        ...(typeof obj.defaultModel === "string" ? { defaultModel: obj.defaultModel } : {}),
+    };
+}
+/**
+ * Given a model ID (e.g. `"don/mlx-community/qwen3.6-35b-a3b"`), split on
+ * the first `/` to get the provider key and look it up in `loaded.providers`.
+ * A model ID that contains no `/` is used whole as the provider key.
+ *
+ * Throws `BenchConfigError` (with `isUsageError = false`) if the provider key
+ * is not found; the message lists the available provider keys.
+ *
+ * NOTE(review): the lookup uses the `in` operator, which also matches
+ * inherited property names (e.g. `toString` on a plain object) — confirm
+ * whether an own-property check is wanted here.
+ *
+ * @param loaded  Object with `providers` and `source`, as returned by
+ *                `loadOpencodeProviders`.
+ * @param modelId Model ID whose leading segment names the provider.
+ * @returns `{ providerKey, entry }` where `entry` is `loaded.providers[providerKey]`.
+ */
+export function selectProviderForModel(loaded, modelId) {
+    const slashIdx = modelId.indexOf("/");
+    const providerKey = slashIdx === -1 ? modelId : modelId.slice(0, slashIdx);
+    if (!(providerKey in loaded.providers)) {
+        throw new BenchConfigError(`bench provider file: model ID "${modelId}" maps to provider key "${providerKey}", which is not present in ${loaded.source}; available: ${Object.keys(loaded.providers).join(", ") || "(none)"}`, false);
+    }
+    return { providerKey, entry: loaded.providers[providerKey] };
+}
+/**
+ * Write a minimal `opencode.json` into `opencodeConfigDir` for the given
+ * provider selection. The file contains five top-level keys:
+ * `$schema`, `model` (the given `modelId`), `provider` (holding only the
+ * selected provider entry), `permission` (bash, edit, write, read, and
+ * webfetch all set to "allow"), and `plugin` (an empty array).
+ *
+ * Written with mode `0o600` so the file is not world-readable (it may
+ * contain env-ref placeholders that hint at secret variable names).
+ *
+ * @param opencodeConfigDir Directory that receives `opencode.json`.
+ * @param selected          `{ providerKey, entry }` pair; `entry` is written
+ *                          under `provider[providerKey]`.
+ * @param modelId           Full model id written as the top-level `model` key.
+ * @returns Nothing; the only effect is the synchronous file write.
+ */
+export function materializeOpencodeConfig(opencodeConfigDir, selected,
+/** Full model id (e.g. "don/mlx-community/qwen3.6-35b-a3b") written as the
+ *  top-level `model` key so opencode uses it without a --model flag. */
+modelId) {
+    const config = {
+        $schema: "https://opencode.ai/config.json",
+        model: modelId,
+        provider: {
+            [selected.providerKey]: selected.entry,
+        },
+        // Explicitly allow all tools so opencode run (non-interactive) doesn't
+        // silently skip bash/file operations due to missing permission config.
+        permission: {
+            bash: "allow",
+            edit: "allow",
+            write: "allow",
+            read: "allow",
+            webfetch: "allow",
+        },
+        // Disable operator plugins during bench runs. Plugins like akm-opencode
+        // run their own session lifecycle hooks (warmIndexInBackground, akm setup
+        // prompts, AKM_STASH_DIR overrides in shell.env) that interfere with the
+        // bench's isolated fixture stash and cause stash mismatch failures.
+        plugin: [],
+    };
+    const outPath = path.join(opencodeConfigDir, "opencode.json");
+    fs.writeFileSync(outPath, JSON.stringify(config, null, 2), { mode: 0o600 });
+}