akm-cli 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (327) hide show
  1. package/package.json +8 -8
  2. package/dist/tests/add-website-source.test.js +0 -119
  3. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  4. package/dist/tests/agent/agent-config.test.js +0 -221
  5. package/dist/tests/agent/agent-detect.test.js +0 -100
  6. package/dist/tests/agent/agent-spawn.test.js +0 -234
  7. package/dist/tests/agent-output.test.js +0 -186
  8. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  9. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  10. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  11. package/dist/tests/asset-ref.test.js +0 -192
  12. package/dist/tests/asset-registry.test.js +0 -103
  13. package/dist/tests/asset-spec.test.js +0 -241
  14. package/dist/tests/bench/attribution.test.js +0 -996
  15. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  16. package/dist/tests/bench/cleanup.js +0 -234
  17. package/dist/tests/bench/cleanup.test.js +0 -166
  18. package/dist/tests/bench/cli.js +0 -1018
  19. package/dist/tests/bench/cli.test.js +0 -445
  20. package/dist/tests/bench/compare.test.js +0 -556
  21. package/dist/tests/bench/corpus.js +0 -317
  22. package/dist/tests/bench/corpus.test.js +0 -258
  23. package/dist/tests/bench/doctor.js +0 -525
  24. package/dist/tests/bench/driver.js +0 -401
  25. package/dist/tests/bench/driver.test.js +0 -584
  26. package/dist/tests/bench/environment.js +0 -233
  27. package/dist/tests/bench/environment.test.js +0 -199
  28. package/dist/tests/bench/evolve-metrics.js +0 -179
  29. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  30. package/dist/tests/bench/evolve.js +0 -647
  31. package/dist/tests/bench/evolve.test.js +0 -624
  32. package/dist/tests/bench/failure-modes.test.js +0 -349
  33. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  34. package/dist/tests/bench/leakage.test.js +0 -228
  35. package/dist/tests/bench/learning-curve.test.js +0 -134
  36. package/dist/tests/bench/metrics.js +0 -2395
  37. package/dist/tests/bench/metrics.test.js +0 -1150
  38. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  39. package/dist/tests/bench/opencode-config.js +0 -194
  40. package/dist/tests/bench/opencode-config.test.js +0 -370
  41. package/dist/tests/bench/report.js +0 -1885
  42. package/dist/tests/bench/report.test.js +0 -1038
  43. package/dist/tests/bench/run-config.js +0 -355
  44. package/dist/tests/bench/run-config.test.js +0 -298
  45. package/dist/tests/bench/run-curate-test.js +0 -32
  46. package/dist/tests/bench/run-failing-tasks.js +0 -56
  47. package/dist/tests/bench/run-full-bench.js +0 -51
  48. package/dist/tests/bench/run-items36-targeted.js +0 -69
  49. package/dist/tests/bench/run-nano-quick.js +0 -42
  50. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  51. package/dist/tests/bench/runner.js +0 -699
  52. package/dist/tests/bench/runner.test.js +0 -958
  53. package/dist/tests/bench/search-bridge.test.js +0 -331
  54. package/dist/tests/bench/tmp.js +0 -131
  55. package/dist/tests/bench/trajectory.js +0 -116
  56. package/dist/tests/bench/trajectory.test.js +0 -127
  57. package/dist/tests/bench/verifier.js +0 -114
  58. package/dist/tests/bench/verifier.test.js +0 -118
  59. package/dist/tests/bench/workflow-evaluator.js +0 -557
  60. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  61. package/dist/tests/bench/workflow-spec.js +0 -345
  62. package/dist/tests/bench/workflow-spec.test.js +0 -363
  63. package/dist/tests/bench/workflow-trace.js +0 -472
  64. package/dist/tests/bench/workflow-trace.test.js +0 -254
  65. package/dist/tests/benchmark-search-quality.js +0 -536
  66. package/dist/tests/benchmark-suite.js +0 -1441
  67. package/dist/tests/capture-cli.test.js +0 -112
  68. package/dist/tests/cli-errors.test.js +0 -204
  69. package/dist/tests/commands/events.test.js +0 -370
  70. package/dist/tests/commands/history.test.js +0 -418
  71. package/dist/tests/commands/import.test.js +0 -103
  72. package/dist/tests/commands/proposal-cli.test.js +0 -209
  73. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  74. package/dist/tests/commands/remember.test.js +0 -97
  75. package/dist/tests/commands/scope-flags.test.js +0 -300
  76. package/dist/tests/commands/search.test.js +0 -537
  77. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  78. package/dist/tests/commands/show.test.js +0 -294
  79. package/dist/tests/common.test.js +0 -266
  80. package/dist/tests/completions.test.js +0 -142
  81. package/dist/tests/config-cli.test.js +0 -193
  82. package/dist/tests/config-llm-features.test.js +0 -139
  83. package/dist/tests/config.test.js +0 -569
  84. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  85. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  86. package/dist/tests/contracts/spec-helpers.js +0 -46
  87. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  88. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  89. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  90. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  91. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  92. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  93. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  94. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  95. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  96. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  97. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  98. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  99. package/dist/tests/core/write-source.test.js +0 -366
  100. package/dist/tests/curate-command.test.js +0 -87
  101. package/dist/tests/db-scoring.test.js +0 -201
  102. package/dist/tests/db.test.js +0 -654
  103. package/dist/tests/distill-cli-flag.test.js +0 -208
  104. package/dist/tests/distill.test.js +0 -515
  105. package/dist/tests/docker-install.test.js +0 -120
  106. package/dist/tests/e2e.test.js +0 -1419
  107. package/dist/tests/embedder.test.js +0 -340
  108. package/dist/tests/embedding-model-config.test.js +0 -379
  109. package/dist/tests/feedback-command.test.js +0 -172
  110. package/dist/tests/file-context.test.js +0 -552
  111. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  112. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  113. package/dist/tests/fixtures/stashes/load.js +0 -166
  114. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  115. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  116. package/dist/tests/frontmatter.test.js +0 -190
  117. package/dist/tests/fts-field-weighting.test.js +0 -254
  118. package/dist/tests/fuzzy-search.test.js +0 -230
  119. package/dist/tests/git-provider-clone.test.js +0 -45
  120. package/dist/tests/github.test.js +0 -161
  121. package/dist/tests/graph-boost-ranking.test.js +0 -305
  122. package/dist/tests/graph-extraction.test.js +0 -282
  123. package/dist/tests/helpers/usage-events.js +0 -8
  124. package/dist/tests/index-pass-llm.test.js +0 -161
  125. package/dist/tests/indexer.test.js +0 -570
  126. package/dist/tests/info-command.test.js +0 -166
  127. package/dist/tests/init.test.js +0 -69
  128. package/dist/tests/install-script.test.js +0 -246
  129. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  130. package/dist/tests/issue-36-repro.test.js +0 -304
  131. package/dist/tests/issues-191-194.test.js +0 -160
  132. package/dist/tests/lesson-lint.test.js +0 -111
  133. package/dist/tests/llm-client.test.js +0 -115
  134. package/dist/tests/llm-feature-gate.test.js +0 -151
  135. package/dist/tests/llm.test.js +0 -139
  136. package/dist/tests/lockfile.test.js +0 -216
  137. package/dist/tests/manifest.test.js +0 -205
  138. package/dist/tests/markdown.test.js +0 -126
  139. package/dist/tests/matchers-unit.test.js +0 -189
  140. package/dist/tests/memory-inference.test.js +0 -299
  141. package/dist/tests/merge-scoring.test.js +0 -136
  142. package/dist/tests/metadata.test.js +0 -313
  143. package/dist/tests/migration-help.test.js +0 -89
  144. package/dist/tests/origin-resolve.test.js +0 -124
  145. package/dist/tests/output-baseline.test.js +0 -218
  146. package/dist/tests/output-shapes-unit.test.js +0 -478
  147. package/dist/tests/parallel-search.test.js +0 -272
  148. package/dist/tests/parameter-metadata.test.js +0 -365
  149. package/dist/tests/paths.test.js +0 -177
  150. package/dist/tests/progressive-disclosure.test.js +0 -280
  151. package/dist/tests/proposals.test.js +0 -279
  152. package/dist/tests/proposed-quality.test.js +0 -271
  153. package/dist/tests/provider-registry.test.js +0 -32
  154. package/dist/tests/ranking-regression.test.js +0 -548
  155. package/dist/tests/reflect-propose.test.js +0 -455
  156. package/dist/tests/registry-build-index.test.js +0 -394
  157. package/dist/tests/registry-cli.test.js +0 -290
  158. package/dist/tests/registry-index-v2.test.js +0 -430
  159. package/dist/tests/registry-install.test.js +0 -728
  160. package/dist/tests/registry-providers/parity.test.js +0 -189
  161. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  162. package/dist/tests/registry-providers/static-index.test.js +0 -238
  163. package/dist/tests/registry-resolve.test.js +0 -126
  164. package/dist/tests/registry-search.test.js +0 -923
  165. package/dist/tests/remember-frontmatter.test.js +0 -378
  166. package/dist/tests/remember-unit.test.js +0 -123
  167. package/dist/tests/ripgrep-install.test.js +0 -251
  168. package/dist/tests/ripgrep-resolve.test.js +0 -108
  169. package/dist/tests/ripgrep.test.js +0 -163
  170. package/dist/tests/save-command.test.js +0 -94
  171. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  172. package/dist/tests/scoring-pipeline.test.js +0 -648
  173. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  174. package/dist/tests/self-update.test.js +0 -442
  175. package/dist/tests/semantic-search-e2e.test.js +0 -512
  176. package/dist/tests/semantic-status.test.js +0 -471
  177. package/dist/tests/setup-run.integration.js +0 -877
  178. package/dist/tests/setup-wizard.test.js +0 -198
  179. package/dist/tests/setup.test.js +0 -131
  180. package/dist/tests/source-add.test.js +0 -11
  181. package/dist/tests/source-clone.test.js +0 -254
  182. package/dist/tests/source-manage.test.js +0 -366
  183. package/dist/tests/source-providers/filesystem.test.js +0 -82
  184. package/dist/tests/source-providers/git.test.js +0 -252
  185. package/dist/tests/source-providers/website.test.js +0 -128
  186. package/dist/tests/source-qa-fixes.test.js +0 -286
  187. package/dist/tests/source-registry.test.js +0 -350
  188. package/dist/tests/source-resolve.test.js +0 -100
  189. package/dist/tests/source-source.test.js +0 -281
  190. package/dist/tests/source.test.js +0 -533
  191. package/dist/tests/tar-utils-scan.test.js +0 -73
  192. package/dist/tests/toggle-components.test.js +0 -73
  193. package/dist/tests/usage-telemetry.test.js +0 -265
  194. package/dist/tests/utility-scoring.test.js +0 -558
  195. package/dist/tests/vault-load-error.test.js +0 -78
  196. package/dist/tests/vault-qa-fixes.test.js +0 -194
  197. package/dist/tests/vault.test.js +0 -429
  198. package/dist/tests/vector-search.test.js +0 -608
  199. package/dist/tests/walker.test.js +0 -252
  200. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  201. package/dist/tests/wave2-cluster-d.test.js +0 -180
  202. package/dist/tests/wave2-cluster-e.test.js +0 -179
  203. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  204. package/dist/tests/wiki.test.js +0 -529
  205. package/dist/tests/workflow-cli.test.js +0 -271
  206. package/dist/tests/workflow-markdown.test.js +0 -171
  207. package/dist/tests/workflow-path-escape.test.js +0 -132
  208. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  209. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  210. /package/dist/{src/cli.js → cli.js} +0 -0
  211. /package/dist/{src/commands → commands}/completions.js +0 -0
  212. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  213. /package/dist/{src/commands → commands}/curate.js +0 -0
  214. /package/dist/{src/commands → commands}/distill.js +0 -0
  215. /package/dist/{src/commands → commands}/events.js +0 -0
  216. /package/dist/{src/commands → commands}/history.js +0 -0
  217. /package/dist/{src/commands → commands}/info.js +0 -0
  218. /package/dist/{src/commands → commands}/init.js +0 -0
  219. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  220. /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
  221. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  222. /package/dist/{src/commands → commands}/proposal.js +0 -0
  223. /package/dist/{src/commands → commands}/propose.js +0 -0
  224. /package/dist/{src/commands → commands}/reflect.js +0 -0
  225. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  226. /package/dist/{src/commands → commands}/remember.js +0 -0
  227. /package/dist/{src/commands → commands}/search.js +0 -0
  228. /package/dist/{src/commands → commands}/self-update.js +0 -0
  229. /package/dist/{src/commands → commands}/show.js +0 -0
  230. /package/dist/{src/commands → commands}/source-add.js +0 -0
  231. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  232. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  233. /package/dist/{src/commands → commands}/vault.js +0 -0
  234. /package/dist/{src/core → core}/asset-ref.js +0 -0
  235. /package/dist/{src/core → core}/asset-registry.js +0 -0
  236. /package/dist/{src/core → core}/asset-spec.js +0 -0
  237. /package/dist/{src/core → core}/common.js +0 -0
  238. /package/dist/{src/core → core}/config.js +0 -0
  239. /package/dist/{src/core → core}/errors.js +0 -0
  240. /package/dist/{src/core → core}/events.js +0 -0
  241. /package/dist/{src/core → core}/frontmatter.js +0 -0
  242. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  243. /package/dist/{src/core → core}/markdown.js +0 -0
  244. /package/dist/{src/core → core}/paths.js +0 -0
  245. /package/dist/{src/core → core}/proposals.js +0 -0
  246. /package/dist/{src/core → core}/warn.js +0 -0
  247. /package/dist/{src/core → core}/write-source.js +0 -0
  248. /package/dist/{src/indexer → indexer}/db-search.js +0 -0
  249. /package/dist/{src/indexer → indexer}/db.js +0 -0
  250. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  251. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  252. /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
  253. /package/dist/{src/indexer → indexer}/indexer.js +0 -0
  254. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  255. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  256. /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
  257. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  258. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  259. /package/dist/{src/indexer → indexer}/search-source.js +0 -0
  260. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  261. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  262. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  263. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  264. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  265. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  266. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  267. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  268. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  269. /package/dist/{src/integrations → integrations}/github.js +0 -0
  270. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  271. /package/dist/{src/llm → llm}/client.js +0 -0
  272. /package/dist/{src/llm → llm}/embedder.js +0 -0
  273. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  274. /package/dist/{src/llm → llm}/embedders/local.js +0 -0
  275. /package/dist/{src/llm → llm}/embedders/remote.js +0 -0
  276. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  277. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  278. /package/dist/{src/llm → llm}/graph-extract.js +0 -0
  279. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  280. /package/dist/{src/llm → llm}/memory-infer.js +0 -0
  281. /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
  282. /package/dist/{src/output → output}/cli-hints.js +0 -0
  283. /package/dist/{src/output → output}/context.js +0 -0
  284. /package/dist/{src/output → output}/renderers.js +0 -0
  285. /package/dist/{src/output → output}/shapes.js +0 -0
  286. /package/dist/{src/output → output}/text.js +0 -0
  287. /package/dist/{src/registry → registry}/build-index.js +0 -0
  288. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  289. /package/dist/{src/registry → registry}/factory.js +0 -0
  290. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  291. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  292. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  293. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  294. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  295. /package/dist/{src/registry → registry}/resolve.js +0 -0
  296. /package/dist/{src/registry → registry}/types.js +0 -0
  297. /package/dist/{src/setup → setup}/detect.js +0 -0
  298. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  299. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  300. /package/dist/{src/setup → setup}/setup.js +0 -0
  301. /package/dist/{src/setup → setup}/steps.js +0 -0
  302. /package/dist/{src/sources → sources}/include.js +0 -0
  303. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  304. /package/dist/{src/sources → sources}/provider.js +0 -0
  305. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  306. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  307. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  308. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  309. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  310. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/website.js +0 -0
  314. /package/dist/{src/sources → sources}/resolve.js +0 -0
  315. /package/dist/{src/sources → sources}/types.js +0 -0
  316. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  317. /package/dist/{src/version.js → version.js} +0 -0
  318. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  319. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  320. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  321. /package/dist/{src/workflows → workflows}/db.js +0 -0
  322. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  323. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  324. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  325. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  326. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  327. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,401 +0,0 @@
1
- /**
2
- * akm-bench driver — `runOne(options)` executes a single (task, arm, seed)
3
- * triple end-to-end and returns a v1 RunResult envelope.
4
- *
5
- * See `docs/technical/benchmark.md` §5.2 for the locked schema and §7.1/§7.2
6
- * for the isolation/budget rules. The shapes here are the v1 contract that
7
- * #238/#239/#240/#243 will extend without breaking.
8
- *
9
- * Design notes:
10
- * • The driver invokes opencode through `runAgent` with the built-in
11
- * `opencode` profile. No new harness abstraction.
12
- * • Per-run isolation: every run gets fresh tmpdirs for `XDG_CACHE_HOME`,
13
- * `XDG_CONFIG_HOME`, `OPENCODE_CONFIG`, and (when `stashDir` is provided)
14
- * `AKM_STASH_DIR`. The operator's personal opencode/akm config is NEVER
15
- * read or written.
16
- * • Hard budgets: `budgetWallMs` is enforced via `runAgent`'s timeout. A
17
- * timeout produces `outcome: "budget_exceeded"`, which is a distinct
18
- * state from `fail` so cost regressions stay visible.
19
- * • This issue (#236) does not need a real opencode call to work end-to-end.
20
- * The harness shape, isolation, and result envelope must be correct and
21
- * unit-testable with an injected fake spawn.
22
- */
23
- import fs from "node:fs";
24
- import os from "node:os";
25
- import path from "node:path";
26
- import { BUILTIN_AGENT_PROFILE_NAMES, getBuiltinAgentProfile } from "../../src/integrations/agent/profiles";
27
- import { runAgent } from "../../src/integrations/agent/spawn";
28
- import { setupBenchEnvironment } from "./environment";
29
- import { benchMkdtemp } from "./tmp";
30
- import { runVerifier } from "./verifier";
31
/**
 * Environment variable names that carry operator configuration and therefore
 * MUST NOT leak into per-run children.
 */
const ISOLATED_ENV_NAMES = [
    "OPENCODE_CONFIG",
    "AKM_STASH_DIR",
    "XDG_CACHE_HOME",
    "XDG_CONFIG_HOME",
];
33
/**
 * Operator-environment variable names that are removed from `envSource`
 * before the bench driver hands it to `runAgent`. They belong to the
 * operator's interactive environment, not to a bench-arm child:
 *
 *   • `OPENCODE_API_KEY` / `ANTHROPIC_API_KEY` — real-money credentials. The
 *     opencode profile lists `OPENCODE_API_KEY` in `envPassthrough`, so
 *     without explicit scrubbing the operator's key would be forwarded into
 *     every (task × arm × seed) child. Credentials must come from the bench's
 *     own config surface, never by inheritance.
 *   • `AKM_CONFIG_DIR` — points akm at the operator's stash config. Letting
 *     it through would defeat the per-run isolation tmpdirs that
 *     `createIsolationDirs` materialises.
 *
 * Recurrence guard for #271 (mirrors the #243/#251 fixup pattern).
 */
const SCRUBBED_OPERATOR_ENV_NAMES = ["OPENCODE_API_KEY", "ANTHROPIC_API_KEY", "AKM_CONFIG_DIR"];
/**
 * Produce the `envSource` object given to `runAgent`.
 *
 * @param {Record<string, string | undefined>} [source] Environment to start
 *   from; when null/undefined, `process.env` is used.
 * @returns {Record<string, string | undefined>} A shallow copy of the source
 *   with every name in `SCRUBBED_OPERATOR_ENV_NAMES` deleted. The copy can be
 *   mutated freely without touching `source` or the real `process.env`.
 */
export function buildSanitizedEnvSource(source) {
    const sanitized = { ...(source ?? process.env) };
    SCRUBBED_OPERATOR_ENV_NAMES.forEach((scrubbedName) => {
        delete sanitized[scrubbedName];
    });
    return sanitized;
}
71
/**
 * Create the per-run isolation directory tree under a fresh root obtained
 * from `benchMkdtemp("akm-bench-run-")`.
 *
 * Three sub-directories are created: `cache`, `config` and `opencode-config`.
 * Inside `config`, an `opencode` entry is then materialised: a symlink to
 * `<home>/.config/opencode` when that directory exists, otherwise an empty
 * directory.
 *
 * @param {string | undefined} stashDir Stash directory to record on the
 *   result; it is passed through untouched as `akmStashDir`.
 * @returns {{ root: string, cacheHome: string, configHome: string,
 *   opencodeConfig: string, akmStashDir: string | undefined }}
 */
export function createIsolationDirs(stashDir) {
    const root = benchMkdtemp("akm-bench-run-");
    const [cacheHome, configHome, opencodeConfig] = ["cache", "config", "opencode-config"].map((leaf) => {
        const dir = path.join(root, leaf);
        fs.mkdirSync(dir, { recursive: true });
        return dir;
    });
    // Overriding XDG_CONFIG_HOME would otherwise leave opencode with an empty
    // config dir, so its installed npm provider packages (node_modules, e.g.
    // @ai-sdk/openai-compatible) would fail to load. Linking the real dir in
    // keeps those resolvable; OPENCODE_CONFIG still points at the materialised
    // file, which opencode reads in preference to
    // XDG_CONFIG_HOME/opencode/opencode.json.
    const operatorOpencodeDir = path.join(os.homedir(), ".config", "opencode");
    const isolatedOpencodeDir = path.join(configHome, "opencode");
    if (!fs.existsSync(operatorOpencodeDir)) {
        fs.mkdirSync(isolatedOpencodeDir, { recursive: true });
    }
    else {
        fs.symlinkSync(operatorOpencodeDir, isolatedOpencodeDir);
    }
    return { root, cacheHome, configHome, opencodeConfig, akmStashDir: stashDir };
}
101
/**
 * Assemble the environment overrides passed to `runAgent` for one run.
 *
 * The XDG cache/config homes and `OPENCODE_CONFIG` are pinned to the run's
 * isolation directories; `OPENCODE_CONFIG` is `<opencodeConfig>/opencode.json`.
 * `AKM_STASH_DIR` is included only when `dirs.akmStashDir` is truthy.
 *
 * @param {{ cacheHome: string, configHome: string, opencodeConfig: string,
 *   akmStashDir?: string }} dirs Isolation directories for this run.
 * @param {string} model Value exported as `BENCH_OPENCODE_MODEL`.
 * @returns {Record<string, string>} The pinned environment entries.
 */
export function buildIsolatedEnv(dirs, model) {
    const { cacheHome, configHome, opencodeConfig, akmStashDir } = dirs;
    return {
        XDG_CACHE_HOME: cacheHome,
        XDG_CONFIG_HOME: configHome,
        OPENCODE_CONFIG: path.join(opencodeConfig, "opencode.json"),
        BENCH_OPENCODE_MODEL: model,
        ...(akmStashDir ? { AKM_STASH_DIR: akmStashDir } : {}),
    };
}
113
/**
 * Remove `AKM_STASH_DIR` from a child environment object.
 *
 * Used by the synthetic-arm spawn path (#261): even when the harness builds a
 * wider env via `{ ...process.env, ...env }`, the operator's real
 * `AKM_STASH_DIR` must not reach the child. A synthetic-arm child must NEVER
 * inherit a stash (recurrence guard for the #243 fixup pattern).
 *
 * @param {Record<string, string | undefined>} env Environment object; it is
 *   mutated in place.
 * @returns {Record<string, string | undefined>} The same `env` instance, so
 *   the call can be chained.
 */
export function stripAkmStashDir(env) {
    const stashKey = "AKM_STASH_DIR";
    delete env[stashKey];
    return env;
}
126
/**
 * Best-effort token-usage parser for opencode stdout.
 *
 * Returns numeric token counts together with a measurement status so callers
 * can tell a real zero (`"parsed"`) apart from an absent or unparseable
 * report (`"missing"`, both counts default to 0 and downstream aggregation
 * MUST skip the run rather than treat that 0 as measured).
 *
 * Two spellings are recognised for each count, tried in this order:
 *   1. A self-labelled count such as `input_tokens: 12`, `input tokens=12`
 *      or `tokens_input 12`.
 *   2. The grouped form `tokens: input=1234 output=5678`, where a `tokens`
 *      label introduces bare `input` / `output` keys on the same line.
 *
 * This parser never emits `"unsupported"`; that label is stamped by callers
 * for arms that do not run a token-reporting agent (e.g. the synthetic arm).
 *
 * @param {string} stdout Captured agent stdout. Any non-string value is
 *   treated as "nothing to parse".
 * @returns {{ input: number, output: number, measurement: "parsed" | "missing" }}
 *   When only one of the two counts is found the other is reported as 0 and
 *   the measurement is still `"parsed"`.
 */
export function parseTokenUsage(stdout) {
    if (typeof stdout !== "string") {
        return { input: 0, output: 0, measurement: "missing" };
    }
    const input = firstTokenCount(stdout, [
        /(?:input[_\s-]?tokens?|tokens?[_\s-]?input)[\s:=]+(\d+)/i,
        /\btokens?[ \t]*[:=][^\n]*?\binput[\s:=]+(\d+)/i,
    ]);
    const output = firstTokenCount(stdout, [
        /(?:output[_\s-]?tokens?|tokens?[_\s-]?output)[\s:=]+(\d+)/i,
        /\btokens?[ \t]*[:=][^\n]*?\boutput[\s:=]+(\d+)/i,
    ]);
    if (input === null && output === null) {
        return { input: 0, output: 0, measurement: "missing" };
    }
    return {
        input: input ?? 0,
        output: output ?? 0,
        measurement: "parsed",
    };
}
/** Return the integer captured by the first pattern that matches, or null. */
function firstTokenCount(text, patterns) {
    for (const pattern of patterns) {
        const hit = text.match(pattern);
        if (hit) {
            return Number.parseInt(hit[1], 10);
        }
    }
    return null;
}
152
/**
 * Maximum bytes read from events.jsonl per run. A runaway agent producing
 * GBs of structured-log output would otherwise OOM the bench. Trajectory
 * parsing operates on the prefix; a warning is appended when the cap is
 * hit so the report surfaces the truncation.
 */
export const EVENTS_READ_CAP_BYTES = 16 * 1024 * 1024;
/**
 * Read the events produced by one run from
 * `<cacheHome>/akm/events.jsonl` (one JSON object per line).
 *
 * At most `EVENTS_READ_CAP_BYTES` bytes are read. When the file is larger,
 * only the prefix is parsed, the partial trailing line is dropped, and a
 * truncation warning is pushed onto `opts.warnings` (when supplied).
 *
 * The reader is best-effort and does not throw for file-system failures:
 *   • a missing file yields `[]`;
 *   • any other open/stat/read failure yields `[]` and, when `opts.warnings`
 *     is supplied, a warning describing the failure;
 *   • blank lines, lines that are not valid JSON, and lines whose JSON value
 *     is not a plain object (`null`, numbers, strings, booleans, arrays) are
 *     skipped.
 *
 * @param {string} cacheHome The run's `XDG_CACHE_HOME` directory.
 * @param {{ warnings?: string[] }} [opts] Optional sink for warnings.
 * @returns {object[]} Parsed event records in file order. A record without
 *   its own `id` receives the zero-based count of records accepted before it.
 */
export function readRunEvents(cacheHome, opts) {
    const eventsPath = path.join(cacheHome, "akm", "events.jsonl");
    const warnings = opts?.warnings;
    let text;
    try {
        text = readEventsText(eventsPath, EVENTS_READ_CAP_BYTES, warnings);
    }
    catch (err) {
        // Absence is the normal "no events" case; anything else is surfaced
        // as a warning instead of aborting the run.
        if (err?.code !== "ENOENT" && warnings) {
            warnings.push(`events.jsonl unreadable: ${err instanceof Error ? err.message : String(err)}`);
        }
        return [];
    }
    const events = [];
    let nextId = 0;
    for (const line of text.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) {
            continue;
        }
        let record;
        try {
            record = JSON.parse(trimmed);
        }
        catch {
            // Skip malformed lines — events stream is best-effort upstream.
            continue;
        }
        // Only plain objects are events; spreading a scalar or an array would
        // fabricate a record and shift the fallback ids of later events.
        if (record === null || typeof record !== "object" || Array.isArray(record)) {
            continue;
        }
        events.push({ ...record, id: record.id ?? nextId });
        nextId += 1;
    }
    return events;
}
/**
 * Read up to `cap` bytes of `eventsPath` as UTF-8 text through a single file
 * descriptor, so the size check and the read observe the same file.
 * Throws whatever the underlying `fs` call throws.
 */
function readEventsText(eventsPath, cap, warnings) {
    const fd = fs.openSync(eventsPath, "r");
    try {
        const totalSize = fs.fstatSync(fd).size;
        if (totalSize <= cap) {
            return fs.readFileSync(fd, "utf8");
        }
        // Read into a fixed buffer rather than loading the whole file just
        // to throw most of it away.
        const buf = Buffer.alloc(cap);
        let filled = 0;
        while (filled < cap) {
            const bytesRead = fs.readSync(fd, buf, filled, cap - filled, filled);
            if (bytesRead === 0) {
                break;
            }
            filled += bytesRead;
        }
        // Decode only what was actually read so a short read cannot leave
        // NUL padding in the text.
        let text = buf.toString("utf8", 0, filled);
        // Drop the partial trailing line so we don't try to parse half a record.
        const lastNl = text.lastIndexOf("\n");
        if (lastNl !== -1) {
            text = text.slice(0, lastNl);
        }
        if (warnings) {
            warnings.push(`events.jsonl truncated: ${totalSize} bytes exceeds ${cap}-byte cap; trajectory computed from the prefix.`);
        }
        return text;
    }
    finally {
        fs.closeSync(fd);
    }
}
222
/**
 * Build the prompt forwarded to opencode when the caller does not supply one.
 *
 * Non-akm arms get a minimal three-line prompt (task id, arm, workspace). The
 * task title is deliberately left out there: with it in the prompt the model
 * tends to answer from the prompt alone and skip the workspace README.md,
 * which breaks tasks whose specific parameter values only appear in the
 * workspace files.
 *
 * The akm arm gets a step-by-step script that makes the model run the akm CLI
 * through bash (search, show, read README, write answer, feedback) before it
 * writes any output.
 *
 * @param {{ arm: string, taskId: string, workspace: string,
 *   taskTitle?: string, akmKeywords?: string }} options Run options. Search
 *   keywords come from `akmKeywords` when set, otherwise from the part of
 *   `taskId` before the first "/" with dashes turned into spaces.
 * @returns {string} The newline-joined prompt text.
 */
function defaultPrompt(options) {
    const { arm, taskId, workspace, taskTitle, akmKeywords } = options;
    const workspaceLine = `Workspace: ${workspace}`;
    if (arm !== "akm") {
        return `Task: ${taskId}\nArm: ${arm}\n${workspaceLine}`;
    }
    const titleSuffix = taskTitle ? `\n${taskTitle}` : "";
    // Prefer the explicit keywords; fall back to the task domain.
    const keywords = akmKeywords ?? taskId.split("/")[0].split("-").join(" ");
    const [leadKeyword] = keywords.split(" ");
    // Each step is an explicit bash invocation so the model cannot jump to
    // writing the answer without executing the commands first.
    const sections = [
        [`You have access to a knowledge stash via the akm CLI tool.`],
        [`Step 1 — open a terminal and execute this bash command:`, ` bash: akm search ${keywords}`],
        [`Step 2 — from the search results, execute:`, ` bash: akm show <ref> (e.g. akm show skill:${leadKeyword})`],
        [
            `Step 3 — read README.md in the workspace to understand the specific task requirements:`,
            ` bash: cat ${workspace}/README.md`,
        ],
        [
            `Step 4 — using the skill content from step 2 and the task requirements from step 3,`,
            `write the answer to ${workspace}/commands.txt`,
        ],
        [`Step 5 — execute:`, ` bash: akm feedback <ref> --positive (or --negative)`],
        [`DO NOT write commands.txt before running steps 1 and 2.`],
        [`Task: ${taskId}${titleSuffix}`, workspaceLine],
    ];
    return sections.map((section) => section.join("\n")).join("\n\n");
}
263
/**
 * Run a single (task, arm, seed) and return the v1 RunResult envelope.
 *
 * The function never throws on infrastructure failures — every error path
 * is captured into the returned RunResult with a stable outcome value
 * ("pass", "fail", "budget_exceeded" or "harness_error").
 *
 * @param {object} options - Run options. Reads `taskId`, `arm`, `seed`,
 *   `model`, `stashDir`, `indexCacheHome`, `opencodeProviders`, `warnings`,
 *   `prompt`, `workspace`, `budgetWallMs`, `budgetTokens`, `taskDir`,
 *   `verifier`, `expectedMatch`, and the optional test-injected `spawn`.
 * @returns {Promise<object>} The RunResult for this run.
 */
export async function runOne(options) {
    // Stamp a baseline result; we mutate fields below as the run progresses.
    const result = {
        schemaVersion: 1,
        taskId: options.taskId,
        arm: options.arm,
        seed: options.seed,
        model: options.model,
        outcome: "harness_error",
        tokens: { input: 0, output: 0 },
        tokenMeasurement: "missing",
        wallclockMs: 0,
        trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
        events: [],
        verifierStdout: "",
        verifierExitCode: -1,
        assetsLoaded: [],
    };
    // Look up the built-in opencode profile defensively. The lookup is a pure
    // map read today, but wrapping it preserves the doc-comment guarantee that
    // runOne never throws on infrastructure failures even if the registry
    // shape changes. A missing/throwing profile becomes harness_error.
    let profile;
    try {
        profile = getBuiltinAgentProfile("opencode");
    }
    catch (err) {
        result.verifierStdout = `harness: getBuiltinAgentProfile("opencode") threw: ${err instanceof Error ? err.message : String(err)}`;
        return result;
    }
    if (!profile) {
        result.verifierStdout = `harness: built-in agent profile "opencode" missing; available: ${BUILTIN_AGENT_PROFILE_NAMES.join(", ")}`;
        return result;
    }
    // Set up the complete bench environment: isolation dirs, opencode.json
    // (with BENCH_OPENCODE_INVARIANTS), akm config.json, and FTS5 index.
    // `dryRun: true` when a test-injected spawn is present — the fake stash
    // doesn't exist on disk so the akm config and index writes are skipped.
    let benchEnv;
    try {
        benchEnv = setupBenchEnvironment({
            model: options.model,
            arm: options.arm,
            stashDir: options.stashDir,
            indexCacheHome: options.indexCacheHome,
            providers: options.opencodeProviders,
            dryRun: !!options.spawn,
            warnings: options.warnings,
        });
    }
    catch (err) {
        result.verifierStdout = `harness: environment setup failed: ${err instanceof Error ? err.message : String(err)}`;
        return result;
    }
    const { dirs, env } = benchEnv;
    try {
        result.startedAt = new Date().toISOString();
        const agentResult = await runAgent(profile, options.prompt ?? defaultPrompt(options), {
            env,
            // #271: scrub operator credentials + config-dir hints from the env
            // source BEFORE profile.envPassthrough copies them into the child.
            // Without this, OPENCODE_API_KEY (in opencode's passthrough list) and
            // AKM_CONFIG_DIR (read by akm at startup) would leak the operator's
            // interactive environment into every bench child.
            envSource: buildSanitizedEnvSource(),
            cwd: options.workspace,
            timeoutMs: options.budgetWallMs,
            stdio: "captured",
            ...(options.spawn ? { spawn: options.spawn } : {}),
        });
        result.finishedAt = new Date().toISOString();
        result.wallclockMs = agentResult.durationMs;
        const parsed = parseTokenUsage(agentResult.stdout);
        result.tokens = { input: parsed.input, output: parsed.output };
        result.tokenMeasurement = parsed.measurement;
        result.events = readRunEvents(dirs.cacheHome, { warnings: options.warnings });
        if (!agentResult.ok) {
            if (agentResult.reason === "timeout") {
                result.outcome = "budget_exceeded";
                return result;
            }
            // spawn_failed / parse_error mean the harness itself broke; the
            // verifier never saw the workspace.
            if (agentResult.reason === "spawn_failed" || agentResult.reason === "parse_error") {
                result.outcome = "harness_error";
                return result;
            }
            // non_zero_exit from the agent: intentionally falls through to the
            // verifier path. Per spec §5.3 ("deterministic verifiers, never LLM"),
            // the agent is the system under test, not the judge — its exit code
            // does not gate verification. The verifier always runs against
            // whatever workspace state the agent left behind, even on a crash.
        }
        // Token-budget enforcement is best-effort: only mark `budget_exceeded`
        // if measurement was actually parsed (issue #252) AND the total exceeds
        // the cap. A `"missing"` / `"unsupported"` measurement MUST NOT silently
        // mask a budget overrun as a pass — it leaves the verifier to decide.
        if (result.tokenMeasurement === "parsed") {
            const totalTokens = result.tokens.input + result.tokens.output;
            if (totalTokens > options.budgetTokens) {
                result.outcome = "budget_exceeded";
                return result;
            }
        }
        const verifierResult = await runVerifier(options.taskDir, options.workspace, options.verifier, {
            agentStdout: agentResult.stdout,
            expectedMatch: options.expectedMatch,
            ...(options.spawn ? { spawn: options.spawn } : {}),
        });
        result.verifierStdout = verifierResult.stdout;
        result.verifierExitCode = verifierResult.exitCode;
        if (verifierResult.exitCode === 127) {
            // Missing runtime (e.g. pytest not on PATH) — not the agent's fault.
            result.outcome = "harness_error";
        }
        else {
            result.outcome = verifierResult.exitCode === 0 ? "pass" : "fail";
        }
        return result;
    }
    catch (err) {
        // Honour the never-throws contract: an exception from the agent run,
        // token parsing, event read-out or the verifier is reported as a
        // harness_error result instead of escaping to the caller.
        result.outcome = "harness_error";
        result.verifierStdout = `harness: run failed: ${err instanceof Error ? err.message : String(err)}`;
        return result;
    }
    finally {
        // Always tear down the isolation tmpdir. Events are read out before
        // deletion (see readRunEvents above), so this is safe.
        // NOTE(review): a throw from teardown() would still escape and replace
        // the returned result — confirm teardown is itself non-throwing.
        benchEnv.teardown();
    }
}
395
- /** Re-exports `ISOLATED_ENV_NAMES` under an underscore-prefixed name for the unit test that asserts operator env never leaks. */
396
- export const _ISOLATED_ENV_NAMES = ISOLATED_ENV_NAMES;
397
- /**
398
- * Re-exports `SCRUBBED_OPERATOR_ENV_NAMES` under an underscore-prefixed name for the #271 regression test that asserts operator credentials +
399
- * `AKM_CONFIG_DIR` never reach a bench-arm child via profile.envPassthrough.
400
- */
401
- export const _SCRUBBED_OPERATOR_ENV_NAMES = SCRUBBED_OPERATOR_ENV_NAMES;