akm-cli 0.6.0 → 0.7.0-rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. package/CHANGELOG.md +66 -0
  2. package/dist/{cli.js → src/cli.js} +672 -29
  3. package/dist/{commands → src/commands}/config-cli.js +5 -4
  4. package/dist/src/commands/distill.js +283 -0
  5. package/dist/src/commands/events.js +108 -0
  6. package/dist/src/commands/history.js +120 -0
  7. package/dist/{commands → src/commands}/installed-stashes.js +28 -2
  8. package/dist/src/commands/proposal.js +119 -0
  9. package/dist/src/commands/propose.js +171 -0
  10. package/dist/src/commands/reflect.js +193 -0
  11. package/dist/{commands → src/commands}/registry-search.js +2 -1
  12. package/dist/{commands → src/commands}/remember.js +12 -0
  13. package/dist/{commands → src/commands}/search.js +74 -1
  14. package/dist/{commands → src/commands}/self-update.js +4 -3
  15. package/dist/{commands → src/commands}/show.js +67 -2
  16. package/dist/{core → src/core}/asset-ref.js +5 -5
  17. package/dist/{core → src/core}/asset-spec.js +12 -0
  18. package/dist/{core → src/core}/common.js +1 -1
  19. package/dist/{core → src/core}/config.js +175 -121
  20. package/dist/{core → src/core}/errors.js +4 -0
  21. package/dist/src/core/events.js +239 -0
  22. package/dist/src/core/lesson-lint.js +86 -0
  23. package/dist/src/core/proposals.js +406 -0
  24. package/dist/src/core/warn.js +72 -0
  25. package/dist/{core → src/core}/write-source.js +80 -5
  26. package/dist/{indexer → src/indexer}/db-search.js +119 -27
  27. package/dist/{indexer → src/indexer}/db.js +76 -23
  28. package/dist/{indexer → src/indexer}/file-context.js +0 -3
  29. package/dist/src/indexer/graph-boost.js +179 -0
  30. package/dist/src/indexer/graph-extraction.js +212 -0
  31. package/dist/{indexer → src/indexer}/indexer.js +73 -6
  32. package/dist/src/indexer/memory-inference.js +263 -0
  33. package/dist/{indexer → src/indexer}/metadata.js +114 -11
  34. package/dist/src/integrations/agent/config.js +292 -0
  35. package/dist/src/integrations/agent/detect.js +94 -0
  36. package/dist/src/integrations/agent/index.js +17 -0
  37. package/dist/src/integrations/agent/profiles.js +65 -0
  38. package/dist/src/integrations/agent/prompts.js +167 -0
  39. package/dist/src/integrations/agent/spawn.js +221 -0
  40. package/dist/{integrations → src/integrations}/lockfile.js +0 -26
  41. package/dist/{llm → src/llm}/client.js +33 -2
  42. package/dist/src/llm/feature-gate.js +108 -0
  43. package/dist/src/llm/graph-extract.js +107 -0
  44. package/dist/src/llm/index-passes.js +35 -0
  45. package/dist/src/llm/memory-infer.js +86 -0
  46. package/dist/{output → src/output}/renderers.js +60 -1
  47. package/dist/src/output/shapes.js +516 -0
  48. package/dist/{output → src/output}/text.js +447 -4
  49. package/dist/{registry → src/registry}/build-index.js +14 -4
  50. package/dist/{registry → src/registry}/factory.js +0 -8
  51. package/dist/{registry → src/registry}/providers/static-index.js +3 -2
  52. package/dist/{registry → src/registry}/resolve.js +68 -2
  53. package/dist/{setup → src/setup}/setup.js +43 -5
  54. package/dist/{sources → src/sources}/providers/git.js +7 -15
  55. package/dist/{wiki → src/wiki}/wiki.js +9 -11
  56. package/dist/tests/add-website-source.test.js +119 -0
  57. package/dist/tests/agent/agent-config-loader.test.js +70 -0
  58. package/dist/tests/agent/agent-config.test.js +221 -0
  59. package/dist/tests/agent/agent-detect.test.js +100 -0
  60. package/dist/tests/agent/agent-spawn.test.js +234 -0
  61. package/dist/tests/agent-output.test.js +186 -0
  62. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +103 -0
  63. package/dist/tests/architecture/agent-spawn-seam.test.js +193 -0
  64. package/dist/tests/architecture/llm-stateless-seam.test.js +112 -0
  65. package/dist/tests/asset-ref.test.js +192 -0
  66. package/dist/tests/asset-registry.test.js +103 -0
  67. package/dist/tests/asset-spec.test.js +241 -0
  68. package/dist/tests/bench/attribution.test.js +995 -0
  69. package/dist/tests/bench/cleanup-sigint.test.js +83 -0
  70. package/dist/tests/bench/cleanup.js +203 -0
  71. package/dist/tests/bench/cleanup.test.js +166 -0
  72. package/dist/tests/bench/cli.js +683 -0
  73. package/dist/tests/bench/cli.test.js +177 -0
  74. package/dist/tests/bench/compare.test.js +556 -0
  75. package/dist/tests/bench/corpus.js +314 -0
  76. package/dist/tests/bench/corpus.test.js +258 -0
  77. package/dist/tests/bench/driver.js +346 -0
  78. package/dist/tests/bench/driver.test.js +443 -0
  79. package/dist/tests/bench/evolve-metrics.js +179 -0
  80. package/dist/tests/bench/evolve-metrics.test.js +187 -0
  81. package/dist/tests/bench/evolve.js +580 -0
  82. package/dist/tests/bench/evolve.test.js +616 -0
  83. package/dist/tests/bench/failure-modes.test.js +300 -0
  84. package/dist/tests/bench/feedback-integrity.test.js +456 -0
  85. package/dist/tests/bench/leakage.test.js +125 -0
  86. package/dist/tests/bench/learning-curve.test.js +133 -0
  87. package/dist/tests/bench/metrics.js +2319 -0
  88. package/dist/tests/bench/metrics.test.js +1144 -0
  89. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +43 -0
  90. package/dist/tests/bench/report.js +1821 -0
  91. package/dist/tests/bench/report.test.js +989 -0
  92. package/dist/tests/bench/runner.js +536 -0
  93. package/dist/tests/bench/runner.test.js +958 -0
  94. package/dist/tests/bench/search-bridge.test.js +331 -0
  95. package/dist/tests/bench/tmp.js +41 -0
  96. package/dist/tests/bench/trajectory.js +116 -0
  97. package/dist/tests/bench/trajectory.test.js +127 -0
  98. package/dist/tests/bench/verifier.js +109 -0
  99. package/dist/tests/bench/verifier.test.js +118 -0
  100. package/dist/tests/bench/workflow-evaluator.js +557 -0
  101. package/dist/tests/bench/workflow-evaluator.test.js +421 -0
  102. package/dist/tests/bench/workflow-spec.js +358 -0
  103. package/dist/tests/bench/workflow-spec.test.js +363 -0
  104. package/dist/tests/bench/workflow-trace.js +438 -0
  105. package/dist/tests/bench/workflow-trace.test.js +254 -0
  106. package/dist/tests/benchmark-search-quality.js +536 -0
  107. package/dist/tests/benchmark-suite.js +1441 -0
  108. package/dist/tests/capture-cli.test.js +112 -0
  109. package/dist/tests/cli-errors.test.js +203 -0
  110. package/dist/tests/commands/events.test.js +370 -0
  111. package/dist/tests/commands/history.test.js +223 -0
  112. package/dist/tests/commands/import.test.js +103 -0
  113. package/dist/tests/commands/proposal-cli.test.js +209 -0
  114. package/dist/tests/commands/reflect-propose-cli.test.js +333 -0
  115. package/dist/tests/commands/remember.test.js +97 -0
  116. package/dist/tests/commands/scope-flags.test.js +300 -0
  117. package/dist/tests/commands/search.test.js +537 -0
  118. package/dist/tests/commands/show-indexer-parity.test.js +117 -0
  119. package/dist/tests/commands/show.test.js +294 -0
  120. package/dist/tests/common.test.js +266 -0
  121. package/dist/tests/completions.test.js +142 -0
  122. package/dist/tests/config-cli.test.js +193 -0
  123. package/dist/tests/config-llm-features.test.js +139 -0
  124. package/dist/tests/config.test.js +544 -0
  125. package/dist/tests/contracts/migration-baseline.test.js +43 -0
  126. package/dist/tests/contracts/reflect-propose-envelope.test.js +139 -0
  127. package/dist/tests/contracts/spec-helpers.js +46 -0
  128. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +228 -0
  129. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +56 -0
  130. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +34 -0
  131. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +94 -0
  132. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +39 -0
  133. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +44 -0
  134. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +47 -0
  135. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +40 -0
  136. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +58 -0
  137. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +34 -0
  138. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +75 -0
  139. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +36 -0
  140. package/dist/tests/core/write-source.test.js +366 -0
  141. package/dist/tests/curate-command.test.js +87 -0
  142. package/dist/tests/db-scoring.test.js +201 -0
  143. package/dist/tests/db.test.js +654 -0
  144. package/dist/tests/distill-cli-flag.test.js +208 -0
  145. package/dist/tests/distill.test.js +515 -0
  146. package/dist/tests/docker-install.test.js +120 -0
  147. package/dist/tests/e2e.test.js +1398 -0
  148. package/dist/tests/embedder.test.js +340 -0
  149. package/dist/tests/embedding-model-config.test.js +379 -0
  150. package/dist/tests/feedback-command.test.js +172 -0
  151. package/dist/tests/file-context.test.js +552 -0
  152. package/dist/tests/fixtures/scripts/git/summarize-diff.js +9 -0
  153. package/dist/tests/fixtures/scripts/lint/eslint-check.js +7 -0
  154. package/dist/tests/fixtures/stashes/load.js +166 -0
  155. package/dist/tests/fixtures/stashes/load.test.js +88 -0
  156. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +12 -0
  157. package/dist/tests/frontmatter.test.js +190 -0
  158. package/dist/tests/fts-field-weighting.test.js +254 -0
  159. package/dist/tests/fuzzy-search.test.js +230 -0
  160. package/dist/tests/git-provider-clone.test.js +45 -0
  161. package/dist/tests/github.test.js +161 -0
  162. package/dist/tests/graph-boost-ranking.test.js +305 -0
  163. package/dist/tests/graph-extraction.test.js +282 -0
  164. package/dist/tests/helpers/usage-events.js +8 -0
  165. package/dist/tests/index-pass-llm.test.js +161 -0
  166. package/dist/tests/indexer.test.js +559 -0
  167. package/dist/tests/info-command.test.js +166 -0
  168. package/dist/tests/init.test.js +69 -0
  169. package/dist/tests/install-script.test.js +246 -0
  170. package/dist/tests/integration/agent-real-profile.test.js +94 -0
  171. package/dist/tests/issue-36-repro.test.js +304 -0
  172. package/dist/tests/issues-191-194.test.js +160 -0
  173. package/dist/tests/lesson-lint.test.js +111 -0
  174. package/dist/tests/llm-client.test.js +115 -0
  175. package/dist/tests/llm-feature-gate.test.js +151 -0
  176. package/dist/tests/llm.test.js +139 -0
  177. package/dist/tests/lockfile.test.js +216 -0
  178. package/dist/tests/manifest.test.js +205 -0
  179. package/dist/tests/markdown.test.js +126 -0
  180. package/dist/tests/matchers-unit.test.js +189 -0
  181. package/dist/tests/memory-inference.test.js +299 -0
  182. package/dist/tests/merge-scoring.test.js +136 -0
  183. package/dist/tests/metadata.test.js +313 -0
  184. package/dist/tests/migration-help.test.js +89 -0
  185. package/dist/tests/origin-resolve.test.js +124 -0
  186. package/dist/tests/output-baseline.test.js +217 -0
  187. package/dist/tests/output-shapes-unit.test.js +476 -0
  188. package/dist/tests/parallel-search.test.js +272 -0
  189. package/dist/tests/parameter-metadata.test.js +365 -0
  190. package/dist/tests/paths.test.js +177 -0
  191. package/dist/tests/progressive-disclosure.test.js +280 -0
  192. package/dist/tests/proposals.test.js +279 -0
  193. package/dist/tests/proposed-quality.test.js +271 -0
  194. package/dist/tests/provider-registry.test.js +32 -0
  195. package/dist/tests/ranking-regression.test.js +548 -0
  196. package/dist/tests/reflect-propose.test.js +455 -0
  197. package/dist/tests/registry-build-index.test.js +378 -0
  198. package/dist/tests/registry-cli.test.js +290 -0
  199. package/dist/tests/registry-index-v2.test.js +430 -0
  200. package/dist/tests/registry-install.test.js +728 -0
  201. package/dist/tests/registry-providers/parity.test.js +189 -0
  202. package/dist/tests/registry-providers/skills-sh.test.js +309 -0
  203. package/dist/tests/registry-providers/static-index.test.js +204 -0
  204. package/dist/tests/registry-resolve.test.js +126 -0
  205. package/dist/tests/registry-search.test.js +723 -0
  206. package/dist/tests/remember-frontmatter.test.js +380 -0
  207. package/dist/tests/remember-unit.test.js +123 -0
  208. package/dist/tests/ripgrep-install.test.js +251 -0
  209. package/dist/tests/ripgrep-resolve.test.js +108 -0
  210. package/dist/tests/ripgrep.test.js +163 -0
  211. package/dist/tests/save-command.test.js +94 -0
  212. package/dist/tests/save-trust-qa-fixes.test.js +270 -0
  213. package/dist/tests/scoring-pipeline.test.js +648 -0
  214. package/dist/tests/search-include-proposed-cli.test.js +118 -0
  215. package/dist/tests/self-update.test.js +442 -0
  216. package/dist/tests/semantic-search-e2e.test.js +512 -0
  217. package/dist/tests/semantic-status.test.js +471 -0
  218. package/dist/tests/setup-run.integration.js +877 -0
  219. package/dist/tests/setup-wizard.test.js +198 -0
  220. package/dist/tests/setup.test.js +131 -0
  221. package/dist/tests/source-add.test.js +11 -0
  222. package/dist/tests/source-clone.test.js +254 -0
  223. package/dist/tests/source-manage.test.js +366 -0
  224. package/dist/tests/source-providers/filesystem.test.js +82 -0
  225. package/dist/tests/source-providers/git.test.js +252 -0
  226. package/dist/tests/source-providers/website.test.js +128 -0
  227. package/dist/tests/source-qa-fixes.test.js +268 -0
  228. package/dist/tests/source-registry.test.js +350 -0
  229. package/dist/tests/source-resolve.test.js +100 -0
  230. package/dist/tests/source-source.test.js +221 -0
  231. package/dist/tests/source.test.js +533 -0
  232. package/dist/tests/tar-utils-scan.test.js +73 -0
  233. package/dist/tests/toggle-components.test.js +73 -0
  234. package/dist/tests/usage-telemetry.test.js +265 -0
  235. package/dist/tests/utility-scoring.test.js +558 -0
  236. package/dist/tests/vault-load-error.test.js +78 -0
  237. package/dist/tests/vault-qa-fixes.test.js +194 -0
  238. package/dist/tests/vault.test.js +429 -0
  239. package/dist/tests/vector-search.test.js +608 -0
  240. package/dist/tests/walker.test.js +252 -0
  241. package/dist/tests/wave2-cluster-bc.test.js +228 -0
  242. package/dist/tests/wave2-cluster-d.test.js +180 -0
  243. package/dist/tests/wave2-cluster-e.test.js +179 -0
  244. package/dist/tests/wiki-qa-fixes.test.js +270 -0
  245. package/dist/tests/wiki.test.js +529 -0
  246. package/dist/tests/workflow-cli.test.js +271 -0
  247. package/dist/tests/workflow-markdown.test.js +171 -0
  248. package/dist/tests/workflow-path-escape.test.js +132 -0
  249. package/dist/tests/workflow-qa-fixes.test.js +377 -0
  250. package/dist/tests/workflows/indexer-rejection.test.js +213 -0
  251. package/docs/README.md +8 -0
  252. package/docs/migration/release-notes/0.7.0.md +244 -0
  253. package/package.json +2 -2
  254. package/dist/core/warn.js +0 -27
  255. package/dist/output/shapes.js +0 -212
  256. /package/dist/{commands → src/commands}/completions.js +0 -0
  257. /package/dist/{commands → src/commands}/curate.js +0 -0
  258. /package/dist/{commands → src/commands}/info.js +0 -0
  259. /package/dist/{commands → src/commands}/init.js +0 -0
  260. /package/dist/{commands → src/commands}/install-audit.js +0 -0
  261. /package/dist/{commands → src/commands}/migration-help.js +0 -0
  262. /package/dist/{commands → src/commands}/source-add.js +0 -0
  263. /package/dist/{commands → src/commands}/source-clone.js +0 -0
  264. /package/dist/{commands → src/commands}/source-manage.js +0 -0
  265. /package/dist/{commands → src/commands}/vault.js +0 -0
  266. /package/dist/{core → src/core}/asset-registry.js +0 -0
  267. /package/dist/{core → src/core}/frontmatter.js +0 -0
  268. /package/dist/{core → src/core}/markdown.js +0 -0
  269. /package/dist/{core → src/core}/paths.js +0 -0
  270. /package/dist/{indexer → src/indexer}/manifest.js +0 -0
  271. /package/dist/{indexer → src/indexer}/matchers.js +0 -0
  272. /package/dist/{indexer → src/indexer}/search-fields.js +0 -0
  273. /package/dist/{indexer → src/indexer}/search-source.js +0 -0
  274. /package/dist/{indexer → src/indexer}/semantic-status.js +0 -0
  275. /package/dist/{indexer → src/indexer}/usage-events.js +0 -0
  276. /package/dist/{indexer → src/indexer}/walker.js +0 -0
  277. /package/dist/{integrations → src/integrations}/github.js +0 -0
  278. /package/dist/{llm → src/llm}/embedder.js +0 -0
  279. /package/dist/{llm → src/llm}/embedders/cache.js +0 -0
  280. /package/dist/{llm → src/llm}/embedders/local.js +0 -0
  281. /package/dist/{llm → src/llm}/embedders/remote.js +0 -0
  282. /package/dist/{llm → src/llm}/embedders/types.js +0 -0
  283. /package/dist/{llm → src/llm}/metadata-enhance.js +0 -0
  284. /package/dist/{output → src/output}/cli-hints.js +0 -0
  285. /package/dist/{output → src/output}/context.js +0 -0
  286. /package/dist/{registry → src/registry}/create-provider-registry.js +0 -0
  287. /package/dist/{registry → src/registry}/origin-resolve.js +0 -0
  288. /package/dist/{registry → src/registry}/providers/index.js +0 -0
  289. /package/dist/{registry → src/registry}/providers/skills-sh.js +0 -0
  290. /package/dist/{registry → src/registry}/providers/types.js +0 -0
  291. /package/dist/{registry → src/registry}/types.js +0 -0
  292. /package/dist/{setup → src/setup}/detect.js +0 -0
  293. /package/dist/{setup → src/setup}/ripgrep-install.js +0 -0
  294. /package/dist/{setup → src/setup}/ripgrep-resolve.js +0 -0
  295. /package/dist/{setup → src/setup}/steps.js +0 -0
  296. /package/dist/{sources → src/sources}/include.js +0 -0
  297. /package/dist/{sources → src/sources}/provider-factory.js +0 -0
  298. /package/dist/{sources → src/sources}/provider.js +0 -0
  299. /package/dist/{sources → src/sources}/providers/filesystem.js +0 -0
  300. /package/dist/{sources → src/sources}/providers/index.js +0 -0
  301. /package/dist/{sources → src/sources}/providers/install-types.js +0 -0
  302. /package/dist/{sources → src/sources}/providers/npm.js +0 -0
  303. /package/dist/{sources → src/sources}/providers/provider-utils.js +0 -0
  304. /package/dist/{sources → src/sources}/providers/sync-from-ref.js +0 -0
  305. /package/dist/{sources → src/sources}/providers/tar-utils.js +0 -0
  306. /package/dist/{sources → src/sources}/providers/website.js +0 -0
  307. /package/dist/{sources → src/sources}/resolve.js +0 -0
  308. /package/dist/{sources → src/sources}/types.js +0 -0
  309. /package/dist/{templates → src/templates}/wiki-templates.js +0 -0
  310. /package/dist/{version.js → src/version.js} +0 -0
  311. /package/dist/{workflows → src/workflows}/authoring.js +0 -0
  312. /package/dist/{workflows → src/workflows}/cli.js +0 -0
  313. /package/dist/{workflows → src/workflows}/db.js +0 -0
  314. /package/dist/{workflows → src/workflows}/document-cache.js +0 -0
  315. /package/dist/{workflows → src/workflows}/parser.js +0 -0
  316. /package/dist/{workflows → src/workflows}/renderer.js +0 -0
  317. /package/dist/{workflows → src/workflows}/runs.js +0 -0
  318. /package/dist/{workflows → src/workflows}/schema.js +0 -0
  319. /package/dist/{workflows → src/workflows}/validator.js +0 -0
@@ -0,0 +1,346 @@
1
+ /**
2
+ * akm-bench driver — `runOne(options)` executes a single (task, arm, seed)
3
+ * triple end-to-end and returns a v1 RunResult envelope.
4
+ *
5
+ * See `docs/technical/benchmark.md` §5.2 for the locked schema and §7.1/§7.2
6
+ * for the isolation/budget rules. The shapes here are the v1 contract that
7
+ * #238/#239/#240/#243 will extend without breaking.
8
+ *
9
+ * Design notes:
10
+ * • The driver invokes opencode through `runAgent` with the built-in
11
+ * `opencode` profile. No new harness abstraction.
12
+ * • Per-run isolation: every run gets fresh tmpdirs for `XDG_CACHE_HOME`,
13
+ * `XDG_CONFIG_HOME`, `OPENCODE_CONFIG`, and (when `stashDir` is provided)
14
+ * `AKM_STASH_DIR`. The operator's personal opencode/akm config is NEVER
15
+ * read or written.
16
+ * • Hard budgets: `budgetWallMs` is enforced via `runAgent`'s timeout. A
17
+ * timeout produces `outcome: "budget_exceeded"`, which is a distinct
18
+ * state from `fail` so cost regressions stay visible.
19
+ * • This issue (#236) does not need a real opencode call to work end-to-end.
20
+ * The harness shape, isolation, and result envelope must be correct and
21
+ * unit-testable with an injected fake spawn.
22
+ */
23
+ import fs from "node:fs";
24
+ import path from "node:path";
25
+ import { BUILTIN_AGENT_PROFILE_NAMES, getBuiltinAgentProfile } from "../../src/integrations/agent/profiles";
26
+ import { runAgent } from "../../src/integrations/agent/spawn";
27
+ import { benchMkdtemp } from "./tmp";
28
+ import { runVerifier } from "./verifier";
29
/**
 * Env names that carry operator configuration and therefore must never be
 * inherited by a per-run child. Only declared here; it is re-exported at the
 * bottom of the module for the isolation unit test.
 */
const ISOLATED_ENV_NAMES = ["OPENCODE_CONFIG", "AKM_STASH_DIR", "XDG_CACHE_HOME", "XDG_CONFIG_HOME"];
/**
 * Env names removed from the env source before it is handed to `runAgent`.
 *
 *   • `OPENCODE_API_KEY` / `ANTHROPIC_API_KEY` — real-money credentials. The
 *     bench is hermetic by design, so credentials have to arrive through the
 *     bench's own config surface rather than by inheritance.
 *   • `AKM_CONFIG_DIR` — points akm at the operator's config; letting it
 *     through would defeat the per-run XDG tmpdirs.
 *
 * Recurrence guard for #271.
 */
const SCRUBBED_OPERATOR_ENV_NAMES = ["OPENCODE_API_KEY", "ANTHROPIC_API_KEY", "AKM_CONFIG_DIR"];
/**
 * Produce the `envSource` object for `runAgent`.
 *
 * @param {Record<string, string | undefined>} [source] Environment to copy;
 *   `process.env` is used when this is `null` or `undefined`.
 * @returns {Record<string, string | undefined>} A shallow copy of the source
 *   with every name in `SCRUBBED_OPERATOR_ENV_NAMES` removed. The copy is
 *   independent of the source, so callers may mutate it freely.
 */
export function buildSanitizedEnvSource(source) {
    const sanitized = { ...(source ?? process.env) };
    SCRUBBED_OPERATOR_ENV_NAMES.forEach((scrubbedName) => {
        delete sanitized[scrubbedName];
    });
    return sanitized;
}
69
/**
 * Create the per-run isolation directory tree.
 *
 * A fresh root is obtained from `benchMkdtemp("akm-bench-run-")`, and three
 * subdirectories (`cache`, `config`, `opencode-config`) are created beneath it.
 *
 * @param {string | undefined} stashDir Stash directory to carry through
 *   unchanged as `akmStashDir`; it is not created or validated here.
 * @returns {{ root: string, cacheHome: string, configHome: string,
 *   opencodeConfig: string, akmStashDir: string | undefined }}
 */
export function createIsolationDirs(stashDir) {
    const root = benchMkdtemp("akm-bench-run-");
    const [cacheHome, configHome, opencodeConfig] = ["cache", "config", "opencode-config"].map((leaf) => {
        const dir = path.join(root, leaf);
        fs.mkdirSync(dir, { recursive: true });
        return dir;
    });
    return { root, cacheHome, configHome, opencodeConfig, akmStashDir: stashDir };
}
85
/**
 * Assemble the explicit env overrides handed to `runAgent`.
 *
 * `XDG_CACHE_HOME`, `XDG_CONFIG_HOME` and `OPENCODE_CONFIG` are pinned to the
 * run's isolation directories, and `BENCH_OPENCODE_MODEL` carries the model.
 * `AKM_STASH_DIR` is present only when `dirs.akmStashDir` is truthy.
 *
 * @param {{ cacheHome: string, configHome: string, opencodeConfig: string,
 *   akmStashDir?: string }} dirs Directories from `createIsolationDirs`.
 * @param {string} model Model identifier for the run.
 * @returns {Record<string, string>} The pinned env entries.
 */
export function buildIsolatedEnv(dirs, model) {
    const { cacheHome, configHome, opencodeConfig, akmStashDir } = dirs;
    return {
        XDG_CACHE_HOME: cacheHome,
        XDG_CONFIG_HOME: configHome,
        OPENCODE_CONFIG: opencodeConfig,
        BENCH_OPENCODE_MODEL: model,
        ...(akmStashDir ? { AKM_STASH_DIR: akmStashDir } : {}),
    };
}
97
/**
 * Remove `AKM_STASH_DIR` from a child env object.
 *
 * Used on the synthetic-arm path (#261) so that arm's env never carries a
 * stash, even if a wider env was merged in earlier. Recurrence guard for the
 * #243 fixup pattern.
 *
 * @param {Record<string, string | undefined>} env Env object, mutated in place.
 * @returns {Record<string, string | undefined>} The same `env` object, so the
 *   call can be chained.
 */
export function stripAkmStashDir(env) {
    const stashKey = "AKM_STASH_DIR";
    delete env[stashKey];
    return env;
}
110
/**
 * Patterns tried, in order, for each token counter.
 *
 *   1. Explicit key next to the number: `input_tokens: 12`, `tokens-input=12`,
 *      `Input tokens = 12`.
 *   2. Summary line: `tokens: input=12 output=34`. The explicit-key pattern
 *      cannot match this shape because the key and the word "tokens" are
 *      separated by `: `, so it needs its own pattern.
 */
const TOKEN_USAGE_PATTERNS = {
    input: [
        /(?:input[_\s-]?tokens?|tokens?[_\s-]?input)[\s:=]+(\d+)/i,
        /\btokens?\s*[:=][^\r\n]*?\binput\s*[:=]\s*(\d+)/i,
    ],
    output: [
        /(?:output[_\s-]?tokens?|tokens?[_\s-]?output)[\s:=]+(\d+)/i,
        /\btokens?\s*[:=][^\r\n]*?\boutput\s*[:=]\s*(\d+)/i,
    ],
};
/** Return the first count matched by `patterns` in `text`, or `null` if none match. */
function findTokenCount(text, patterns) {
    for (const pattern of patterns) {
        const match = text.match(pattern);
        if (match) {
            return Number.parseInt(match[1], 10);
        }
    }
    return null;
}
/**
 * Best-effort token-usage parser for opencode stdout.
 *
 * Returns numeric token counts plus a measurement status so callers can tell
 * a real zero (`"parsed"`) from an absent or unparseable report (`"missing"`,
 * where both counts default to 0 and downstream aggregation MUST skip the run
 * rather than treat that 0 as measured).
 *
 * This parser never emits `"unsupported"`; that label is stamped by the
 * caller for arms that do not run a token-reporting agent.
 *
 * @param {string} stdout Captured agent stdout. A non-string value is treated
 *   as an absent report instead of throwing.
 * @returns {{ input: number, output: number, measurement: "parsed" | "missing" }}
 *   When only one of the two counters is found, the other is reported as 0
 *   and the measurement is still `"parsed"`.
 */
export function parseTokenUsage(stdout) {
    const missing = { input: 0, output: 0, measurement: "missing" };
    if (typeof stdout !== "string") {
        return missing;
    }
    const input = findTokenCount(stdout, TOKEN_USAGE_PATTERNS.input);
    const output = findTokenCount(stdout, TOKEN_USAGE_PATTERNS.output);
    if (input === null && output === null) {
        return missing;
    }
    return {
        input: input ?? 0,
        output: output ?? 0,
        measurement: "parsed",
    };
}
136
/**
 * Maximum bytes read from events.jsonl per run. A runaway agent producing
 * GBs of structured-log output would otherwise OOM the bench. Trajectory
 * parsing operates on the prefix; a warning is appended when the cap is hit
 * so the report surfaces the truncation.
 */
export const EVENTS_READ_CAP_BYTES = 16 * 1024 * 1024;
/**
 * Read the events.jsonl file produced by a run, if any. The file is looked up
 * at `<cacheHome>/akm/events.jsonl`.
 *
 * At most `EVENTS_READ_CAP_BYTES` are read. When the file is larger, only the
 * prefix is parsed, the partial trailing line is dropped, and a warning is
 * pushed to `opts.warnings` (when supplied).
 *
 * The reader is best-effort and never throws for filesystem problems: a
 * missing file yields `[]` silently, and any other open/read failure yields
 * `[]` plus a warning. Lines that are blank, are not valid JSON, or are valid
 * JSON but not a plain object are skipped.
 *
 * @param {string} cacheHome The run's `XDG_CACHE_HOME` directory.
 * @param {{ warnings?: string[] }} [opts] Optional sink for warnings.
 * @returns {object[]} Parsed events in file order. Each event keeps its own
 *   `id` when it has one; otherwise it receives its zero-based position among
 *   the accepted events.
 */
export function readRunEvents(cacheHome, opts) {
    const eventsPath = path.join(cacheHome, "akm", "events.jsonl");
    if (!fs.existsSync(eventsPath)) {
        return [];
    }
    const cap = EVENTS_READ_CAP_BYTES;
    let totalSize = 0;
    let text = "";
    try {
        // A single descriptor serves both the size check and the read, so the
        // two cannot disagree about which file they are looking at.
        const fd = fs.openSync(eventsPath, "r");
        try {
            totalSize = fs.fstatSync(fd).size;
            const wanted = Math.min(totalSize, cap);
            const buf = Buffer.alloc(wanted);
            let filled = 0;
            // readSync may return fewer bytes than requested; loop until the
            // buffer is full or EOF so no zero padding reaches the parser.
            while (filled < wanted) {
                const bytesRead = fs.readSync(fd, buf, filled, wanted - filled, filled);
                if (bytesRead === 0) {
                    break;
                }
                filled += bytesRead;
            }
            text = buf.toString("utf8", 0, filled);
        }
        finally {
            fs.closeSync(fd);
        }
    }
    catch (err) {
        // Best-effort reader: report the failure instead of crashing the run.
        opts?.warnings?.push(`events.jsonl unreadable: ${err instanceof Error ? err.message : String(err)}`);
        return [];
    }
    if (totalSize > cap) {
        // Drop the partial trailing line so we don't try to parse half a record.
        const lastNl = text.lastIndexOf("\n");
        if (lastNl !== -1) {
            text = text.slice(0, lastNl);
        }
        if (opts?.warnings) {
            opts.warnings.push(`events.jsonl truncated: ${totalSize} bytes exceeds ${cap}-byte cap; trajectory computed from the prefix.`);
        }
    }
    const events = [];
    let nextId = 0;
    for (const line of text.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) {
            continue;
        }
        let parsed;
        try {
            parsed = JSON.parse(trimmed);
        }
        catch {
            // Skip malformed lines — events stream is best-effort upstream.
            continue;
        }
        // A bare number, string, array or null is valid JSON but not an event
        // record; spreading it would fabricate a junk entry.
        if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
            continue;
        }
        events.push({ ...parsed, id: parsed.id ?? nextId });
        nextId += 1;
    }
    return events;
}
206
/**
 * Build the prompt sent to the agent when the caller supplies none.
 *
 * The prompt is three header lines (task, arm, workspace) joined by newlines.
 * The `akm` arm gets a fourth line pointing the agent at the stash tooling.
 *
 * @param {{ taskId: string, arm: string, workspace: string }} options
 * @returns {string} The newline-joined prompt text.
 */
function defaultPrompt(options) {
    const { taskId, arm, workspace } = options;
    const lines = [`Task: ${taskId}`, `Arm: ${arm}`, `Workspace: ${workspace}`];
    if (arm === "akm") {
        lines.push("An akm stash is configured via AKM_STASH_DIR. Use `akm search` and `akm show` to find relevant assets before acting.");
    }
    return lines.join("\n");
}
219
/** Render an unknown thrown value as a one-line message for the result envelope. */
function describeHarnessFailure(err) {
    return err instanceof Error ? err.message : String(err);
}
/**
 * Run a single (task, arm, seed) and return the v1 RunResult envelope.
 *
 * The function never throws on infrastructure failures: a failure in profile
 * lookup, isolation-dir creation, the agent call, event reading or the
 * verifier is captured into the returned result as `outcome: "harness_error"`
 * with a `harness: …` explanation in `verifierStdout`. A failure while
 * removing the isolation tmpdir is reported through `options.warnings` (when
 * supplied) and does not change the outcome.
 *
 * Outcomes:
 *   • `budget_exceeded` — the agent timed out, or parsed token usage exceeds
 *     `options.budgetTokens`.
 *   • `harness_error`   — the harness itself failed, the agent reported
 *     `spawn_failed` / `parse_error`, or the verifier exited 127.
 *   • `pass` / `fail`   — the verifier exited 0 / non-zero.
 *
 * @param {object} options Run description. Fields read here: `taskId`, `arm`,
 *   `seed`, `model`, `stashDir`, `prompt`, `workspace`, `budgetWallMs`,
 *   `budgetTokens`, `taskDir`, `verifier`, `expectedMatch`, `spawn`
 *   (optional injected spawn), `warnings` (optional string sink).
 * @returns {Promise<object>} The RunResult envelope (`schemaVersion: 1`).
 */
export async function runOne(options) {
    // Stamp a baseline result; fields are filled in as the run progresses.
    const result = {
        schemaVersion: 1,
        taskId: options.taskId,
        arm: options.arm,
        seed: options.seed,
        model: options.model,
        outcome: "harness_error",
        tokens: { input: 0, output: 0 },
        tokenMeasurement: "missing",
        wallclockMs: 0,
        trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
        events: [],
        verifierStdout: "",
        verifierExitCode: -1,
        assetsLoaded: [],
    };
    // The profile lookup is wrapped so a throwing or missing profile becomes
    // a harness_error result rather than an exception.
    let profile;
    try {
        profile = getBuiltinAgentProfile("opencode");
    }
    catch (err) {
        result.verifierStdout = `harness: getBuiltinAgentProfile("opencode") threw: ${err instanceof Error ? err.message : String(err)}`;
        return result;
    }
    if (!profile) {
        result.verifierStdout = `harness: built-in agent profile "opencode" missing; available: ${BUILTIN_AGENT_PROFILE_NAMES.join(", ")}`;
        return result;
    }
    // #261: synthetic-arm runs MUST NOT carry AKM_STASH_DIR. A stashDir
    // supplied by mistake is ignored for that arm, and the key is also
    // deleted from the built env below. Recurrence guard for the #243 fixup
    // pattern.
    const stashDir = options.arm === "synthetic" ? undefined : options.stashDir;
    let dirs;
    try {
        dirs = createIsolationDirs(stashDir);
    }
    catch (err) {
        // Nothing was created that we know the path of, so there is no
        // teardown to do.
        result.verifierStdout = `harness: failed to create isolation dirs: ${describeHarnessFailure(err)}`;
        return result;
    }
    try {
        const env = buildIsolatedEnv(dirs, options.model);
        if (options.arm === "synthetic") {
            stripAkmStashDir(env);
        }
        const agentResult = await runAgent(profile, options.prompt ?? defaultPrompt(options), {
            env,
            // #271: scrub operator credentials + config-dir hints from the env
            // source BEFORE profile.envPassthrough copies them into the child.
            envSource: buildSanitizedEnvSource(),
            cwd: options.workspace,
            timeoutMs: options.budgetWallMs,
            stdio: "captured",
            ...(options.spawn ? { spawn: options.spawn } : {}),
        });
        result.wallclockMs = agentResult.durationMs;
        const parsed = parseTokenUsage(agentResult.stdout);
        result.tokens = { input: parsed.input, output: parsed.output };
        result.tokenMeasurement = parsed.measurement;
        result.events = readRunEvents(dirs.cacheHome, { warnings: options.warnings });
        if (!agentResult.ok) {
            if (agentResult.reason === "timeout") {
                result.outcome = "budget_exceeded";
                return result;
            }
            // spawn_failed / parse_error mean the harness itself broke; the
            // verifier never saw a meaningful workspace.
            if (agentResult.reason === "spawn_failed" || agentResult.reason === "parse_error") {
                result.outcome = "harness_error";
                return result;
            }
            // non_zero_exit intentionally falls through to the verifier. Per
            // spec §5.3 the agent is the system under test, not the judge, so
            // its exit code does not gate verification.
        }
        // Token-budget enforcement is best-effort: only a "parsed"
        // measurement (issue #252) can trip the cap. A "missing" or
        // "unsupported" measurement leaves the decision to the verifier.
        if (result.tokenMeasurement === "parsed") {
            const totalTokens = result.tokens.input + result.tokens.output;
            if (totalTokens > options.budgetTokens) {
                result.outcome = "budget_exceeded";
                return result;
            }
        }
        const verifierResult = await runVerifier(options.taskDir, options.workspace, options.verifier, {
            agentStdout: agentResult.stdout,
            expectedMatch: options.expectedMatch,
            ...(options.spawn ? { spawn: options.spawn } : {}),
        });
        result.verifierStdout = verifierResult.stdout;
        result.verifierExitCode = verifierResult.exitCode;
        if (verifierResult.exitCode === 127) {
            // Missing runtime (e.g. pytest not on PATH) — not the agent's fault.
            result.outcome = "harness_error";
        }
        else {
            result.outcome = verifierResult.exitCode === 0 ? "pass" : "fail";
        }
        return result;
    }
    catch (err) {
        // Anything thrown by the agent call, event reader or verifier is an
        // infrastructure failure; surface it in the envelope instead of
        // rejecting the promise.
        result.outcome = "harness_error";
        result.verifierStdout = `harness: run failed: ${describeHarnessFailure(err)}`;
        return result;
    }
    finally {
        // Always tear down the isolation tmpdir; events were already copied
        // into `result.events`. A throw here would replace the returned
        // result, so cleanup failures are downgraded to a warning.
        try {
            fs.rmSync(dirs.root, { recursive: true, force: true });
        }
        catch (err) {
            options.warnings?.push(`harness: failed to remove isolation dir ${dirs.root}: ${describeHarnessFailure(err)}`);
        }
    }
}
340
+ /** Exposed for the unit test that asserts operator env never leaks. */
341
+ export const _ISOLATED_ENV_NAMES = ISOLATED_ENV_NAMES;
342
+ /**
343
+ * Exposed for the #271 regression test that asserts operator credentials +
344
+ * `AKM_CONFIG_DIR` never reach a bench-arm child via profile.envPassthrough.
345
+ */
346
+ export const _SCRUBBED_OPERATOR_ENV_NAMES = SCRUBBED_OPERATOR_ENV_NAMES;