akm-cli 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (333) hide show
  1. package/CHANGELOG.md +66 -0
  2. package/dist/{cli.js → src/cli.js} +712 -34
  3. package/dist/{commands → src/commands}/config-cli.js +47 -4
  4. package/dist/src/commands/distill.js +283 -0
  5. package/dist/src/commands/events.js +108 -0
  6. package/dist/src/commands/history.js +191 -0
  7. package/dist/{commands → src/commands}/installed-stashes.js +1 -1
  8. package/dist/src/commands/proposal.js +119 -0
  9. package/dist/src/commands/propose.js +171 -0
  10. package/dist/src/commands/reflect.js +193 -0
  11. package/dist/{commands → src/commands}/registry-search.js +71 -7
  12. package/dist/{commands → src/commands}/remember.js +12 -0
  13. package/dist/{commands → src/commands}/search.js +104 -4
  14. package/dist/{commands → src/commands}/self-update.js +4 -3
  15. package/dist/{commands → src/commands}/show.js +73 -0
  16. package/dist/{commands → src/commands}/source-add.js +5 -1
  17. package/dist/{commands → src/commands}/source-manage.js +7 -1
  18. package/dist/{core → src/core}/asset-ref.js +5 -5
  19. package/dist/{core → src/core}/asset-spec.js +12 -0
  20. package/dist/{core → src/core}/common.js +1 -1
  21. package/dist/{core → src/core}/config.js +203 -121
  22. package/dist/{core → src/core}/errors.js +4 -0
  23. package/dist/src/core/events.js +239 -0
  24. package/dist/src/core/lesson-lint.js +86 -0
  25. package/dist/src/core/proposals.js +406 -0
  26. package/dist/src/core/warn.js +72 -0
  27. package/dist/{core → src/core}/write-source.js +80 -5
  28. package/dist/{indexer → src/indexer}/db-search.js +114 -24
  29. package/dist/{indexer → src/indexer}/db.js +76 -23
  30. package/dist/{indexer → src/indexer}/file-context.js +0 -3
  31. package/dist/src/indexer/graph-boost.js +179 -0
  32. package/dist/src/indexer/graph-extraction.js +212 -0
  33. package/dist/{indexer → src/indexer}/indexer.js +88 -7
  34. package/dist/{indexer → src/indexer}/matchers.js +1 -1
  35. package/dist/src/indexer/memory-inference.js +263 -0
  36. package/dist/{indexer → src/indexer}/metadata.js +111 -3
  37. package/dist/{indexer → src/indexer}/search-source.js +4 -2
  38. package/dist/src/integrations/agent/config.js +292 -0
  39. package/dist/src/integrations/agent/detect.js +94 -0
  40. package/dist/src/integrations/agent/index.js +17 -0
  41. package/dist/src/integrations/agent/profiles.js +65 -0
  42. package/dist/src/integrations/agent/prompts.js +167 -0
  43. package/dist/src/integrations/agent/spawn.js +272 -0
  44. package/dist/{integrations → src/integrations}/github.js +9 -3
  45. package/dist/{integrations → src/integrations}/lockfile.js +0 -26
  46. package/dist/{llm → src/llm}/client.js +33 -2
  47. package/dist/{llm → src/llm}/embedders/remote.js +37 -3
  48. package/dist/src/llm/feature-gate.js +108 -0
  49. package/dist/src/llm/graph-extract.js +107 -0
  50. package/dist/src/llm/index-passes.js +35 -0
  51. package/dist/src/llm/memory-infer.js +86 -0
  52. package/dist/{output → src/output}/cli-hints.js +15 -2
  53. package/dist/{output → src/output}/renderers.js +63 -2
  54. package/dist/src/output/shapes.js +523 -0
  55. package/dist/src/output/text.js +1116 -0
  56. package/dist/{registry → src/registry}/build-index.js +19 -8
  57. package/dist/{registry → src/registry}/factory.js +0 -8
  58. package/dist/{registry → src/registry}/providers/static-index.js +6 -3
  59. package/dist/{registry → src/registry}/resolve.js +68 -2
  60. package/dist/{setup → src/setup}/setup.js +52 -5
  61. package/dist/{sources → src/sources}/providers/git.js +7 -15
  62. package/dist/{wiki → src/wiki}/wiki.js +54 -6
  63. package/dist/{workflows → src/workflows}/runs.js +37 -3
  64. package/dist/tests/add-website-source.test.js +119 -0
  65. package/dist/tests/agent/agent-config-loader.test.js +70 -0
  66. package/dist/tests/agent/agent-config.test.js +221 -0
  67. package/dist/tests/agent/agent-detect.test.js +100 -0
  68. package/dist/tests/agent/agent-spawn.test.js +234 -0
  69. package/dist/tests/agent-output.test.js +186 -0
  70. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +103 -0
  71. package/dist/tests/architecture/agent-spawn-seam.test.js +193 -0
  72. package/dist/tests/architecture/llm-stateless-seam.test.js +112 -0
  73. package/dist/tests/asset-ref.test.js +192 -0
  74. package/dist/tests/asset-registry.test.js +103 -0
  75. package/dist/tests/asset-spec.test.js +241 -0
  76. package/dist/tests/bench/attribution.test.js +996 -0
  77. package/dist/tests/bench/cleanup-sigint.test.js +83 -0
  78. package/dist/tests/bench/cleanup.js +234 -0
  79. package/dist/tests/bench/cleanup.test.js +166 -0
  80. package/dist/tests/bench/cli.js +1018 -0
  81. package/dist/tests/bench/cli.test.js +445 -0
  82. package/dist/tests/bench/compare.test.js +556 -0
  83. package/dist/tests/bench/corpus.js +317 -0
  84. package/dist/tests/bench/corpus.test.js +258 -0
  85. package/dist/tests/bench/doctor.js +525 -0
  86. package/dist/tests/bench/driver.js +401 -0
  87. package/dist/tests/bench/driver.test.js +584 -0
  88. package/dist/tests/bench/environment.js +233 -0
  89. package/dist/tests/bench/environment.test.js +199 -0
  90. package/dist/tests/bench/evolve-metrics.js +179 -0
  91. package/dist/tests/bench/evolve-metrics.test.js +187 -0
  92. package/dist/tests/bench/evolve.js +647 -0
  93. package/dist/tests/bench/evolve.test.js +624 -0
  94. package/dist/tests/bench/failure-modes.test.js +349 -0
  95. package/dist/tests/bench/feedback-integrity.test.js +457 -0
  96. package/dist/tests/bench/leakage.test.js +228 -0
  97. package/dist/tests/bench/learning-curve.test.js +134 -0
  98. package/dist/tests/bench/metrics.js +2395 -0
  99. package/dist/tests/bench/metrics.test.js +1150 -0
  100. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +43 -0
  101. package/dist/tests/bench/opencode-config.js +194 -0
  102. package/dist/tests/bench/opencode-config.test.js +370 -0
  103. package/dist/tests/bench/report.js +1885 -0
  104. package/dist/tests/bench/report.test.js +1038 -0
  105. package/dist/tests/bench/run-config.js +355 -0
  106. package/dist/tests/bench/run-config.test.js +298 -0
  107. package/dist/tests/bench/run-curate-test.js +32 -0
  108. package/dist/tests/bench/run-failing-tasks.js +56 -0
  109. package/dist/tests/bench/run-full-bench.js +51 -0
  110. package/dist/tests/bench/run-items36-targeted.js +69 -0
  111. package/dist/tests/bench/run-nano-quick.js +42 -0
  112. package/dist/tests/bench/run-waveg-targeted.js +62 -0
  113. package/dist/tests/bench/runner.js +699 -0
  114. package/dist/tests/bench/runner.test.js +958 -0
  115. package/dist/tests/bench/search-bridge.test.js +331 -0
  116. package/dist/tests/bench/tmp.js +131 -0
  117. package/dist/tests/bench/trajectory.js +116 -0
  118. package/dist/tests/bench/trajectory.test.js +127 -0
  119. package/dist/tests/bench/verifier.js +114 -0
  120. package/dist/tests/bench/verifier.test.js +118 -0
  121. package/dist/tests/bench/workflow-evaluator.js +557 -0
  122. package/dist/tests/bench/workflow-evaluator.test.js +421 -0
  123. package/dist/tests/bench/workflow-spec.js +345 -0
  124. package/dist/tests/bench/workflow-spec.test.js +363 -0
  125. package/dist/tests/bench/workflow-trace.js +472 -0
  126. package/dist/tests/bench/workflow-trace.test.js +254 -0
  127. package/dist/tests/benchmark-search-quality.js +536 -0
  128. package/dist/tests/benchmark-suite.js +1441 -0
  129. package/dist/tests/capture-cli.test.js +112 -0
  130. package/dist/tests/cli-errors.test.js +204 -0
  131. package/dist/tests/commands/events.test.js +370 -0
  132. package/dist/tests/commands/history.test.js +418 -0
  133. package/dist/tests/commands/import.test.js +103 -0
  134. package/dist/tests/commands/proposal-cli.test.js +209 -0
  135. package/dist/tests/commands/reflect-propose-cli.test.js +333 -0
  136. package/dist/tests/commands/remember.test.js +97 -0
  137. package/dist/tests/commands/scope-flags.test.js +300 -0
  138. package/dist/tests/commands/search.test.js +537 -0
  139. package/dist/tests/commands/show-indexer-parity.test.js +117 -0
  140. package/dist/tests/commands/show.test.js +294 -0
  141. package/dist/tests/common.test.js +266 -0
  142. package/dist/tests/completions.test.js +142 -0
  143. package/dist/tests/config-cli.test.js +193 -0
  144. package/dist/tests/config-llm-features.test.js +139 -0
  145. package/dist/tests/config.test.js +569 -0
  146. package/dist/tests/contracts/migration-baseline.test.js +43 -0
  147. package/dist/tests/contracts/reflect-propose-envelope.test.js +139 -0
  148. package/dist/tests/contracts/spec-helpers.js +46 -0
  149. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +228 -0
  150. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +56 -0
  151. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +34 -0
  152. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +94 -0
  153. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +39 -0
  154. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +44 -0
  155. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +47 -0
  156. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +40 -0
  157. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +58 -0
  158. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +34 -0
  159. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +75 -0
  160. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +36 -0
  161. package/dist/tests/core/write-source.test.js +366 -0
  162. package/dist/tests/curate-command.test.js +87 -0
  163. package/dist/tests/db-scoring.test.js +201 -0
  164. package/dist/tests/db.test.js +654 -0
  165. package/dist/tests/distill-cli-flag.test.js +208 -0
  166. package/dist/tests/distill.test.js +515 -0
  167. package/dist/tests/docker-install.test.js +120 -0
  168. package/dist/tests/e2e.test.js +1419 -0
  169. package/dist/tests/embedder.test.js +340 -0
  170. package/dist/tests/embedding-model-config.test.js +379 -0
  171. package/dist/tests/feedback-command.test.js +172 -0
  172. package/dist/tests/file-context.test.js +552 -0
  173. package/dist/tests/fixtures/scripts/git/summarize-diff.js +9 -0
  174. package/dist/tests/fixtures/scripts/lint/eslint-check.js +7 -0
  175. package/dist/tests/fixtures/stashes/load.js +166 -0
  176. package/dist/tests/fixtures/stashes/load.test.js +97 -0
  177. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +12 -0
  178. package/dist/tests/frontmatter.test.js +190 -0
  179. package/dist/tests/fts-field-weighting.test.js +254 -0
  180. package/dist/tests/fuzzy-search.test.js +230 -0
  181. package/dist/tests/git-provider-clone.test.js +45 -0
  182. package/dist/tests/github.test.js +161 -0
  183. package/dist/tests/graph-boost-ranking.test.js +305 -0
  184. package/dist/tests/graph-extraction.test.js +282 -0
  185. package/dist/tests/helpers/usage-events.js +8 -0
  186. package/dist/tests/index-pass-llm.test.js +161 -0
  187. package/dist/tests/indexer.test.js +570 -0
  188. package/dist/tests/info-command.test.js +166 -0
  189. package/dist/tests/init.test.js +69 -0
  190. package/dist/tests/install-script.test.js +246 -0
  191. package/dist/tests/integration/agent-real-profile.test.js +94 -0
  192. package/dist/tests/issue-36-repro.test.js +304 -0
  193. package/dist/tests/issues-191-194.test.js +160 -0
  194. package/dist/tests/lesson-lint.test.js +111 -0
  195. package/dist/tests/llm-client.test.js +115 -0
  196. package/dist/tests/llm-feature-gate.test.js +151 -0
  197. package/dist/tests/llm.test.js +139 -0
  198. package/dist/tests/lockfile.test.js +216 -0
  199. package/dist/tests/manifest.test.js +205 -0
  200. package/dist/tests/markdown.test.js +126 -0
  201. package/dist/tests/matchers-unit.test.js +189 -0
  202. package/dist/tests/memory-inference.test.js +299 -0
  203. package/dist/tests/merge-scoring.test.js +136 -0
  204. package/dist/tests/metadata.test.js +313 -0
  205. package/dist/tests/migration-help.test.js +89 -0
  206. package/dist/tests/origin-resolve.test.js +124 -0
  207. package/dist/tests/output-baseline.test.js +218 -0
  208. package/dist/tests/output-shapes-unit.test.js +478 -0
  209. package/dist/tests/parallel-search.test.js +272 -0
  210. package/dist/tests/parameter-metadata.test.js +365 -0
  211. package/dist/tests/paths.test.js +177 -0
  212. package/dist/tests/progressive-disclosure.test.js +280 -0
  213. package/dist/tests/proposals.test.js +279 -0
  214. package/dist/tests/proposed-quality.test.js +271 -0
  215. package/dist/tests/provider-registry.test.js +32 -0
  216. package/dist/tests/ranking-regression.test.js +548 -0
  217. package/dist/tests/reflect-propose.test.js +455 -0
  218. package/dist/tests/registry-build-index.test.js +394 -0
  219. package/dist/tests/registry-cli.test.js +290 -0
  220. package/dist/tests/registry-index-v2.test.js +430 -0
  221. package/dist/tests/registry-install.test.js +728 -0
  222. package/dist/tests/registry-providers/parity.test.js +189 -0
  223. package/dist/tests/registry-providers/skills-sh.test.js +309 -0
  224. package/dist/tests/registry-providers/static-index.test.js +238 -0
  225. package/dist/tests/registry-resolve.test.js +126 -0
  226. package/dist/tests/registry-search.test.js +923 -0
  227. package/dist/tests/remember-frontmatter.test.js +378 -0
  228. package/dist/tests/remember-unit.test.js +123 -0
  229. package/dist/tests/ripgrep-install.test.js +251 -0
  230. package/dist/tests/ripgrep-resolve.test.js +108 -0
  231. package/dist/tests/ripgrep.test.js +163 -0
  232. package/dist/tests/save-command.test.js +94 -0
  233. package/dist/tests/save-trust-qa-fixes.test.js +270 -0
  234. package/dist/tests/scoring-pipeline.test.js +648 -0
  235. package/dist/tests/search-include-proposed-cli.test.js +118 -0
  236. package/dist/tests/self-update.test.js +442 -0
  237. package/dist/tests/semantic-search-e2e.test.js +512 -0
  238. package/dist/tests/semantic-status.test.js +471 -0
  239. package/dist/tests/setup-run.integration.js +877 -0
  240. package/dist/tests/setup-wizard.test.js +198 -0
  241. package/dist/tests/setup.test.js +131 -0
  242. package/dist/tests/source-add.test.js +11 -0
  243. package/dist/tests/source-clone.test.js +254 -0
  244. package/dist/tests/source-manage.test.js +366 -0
  245. package/dist/tests/source-providers/filesystem.test.js +82 -0
  246. package/dist/tests/source-providers/git.test.js +252 -0
  247. package/dist/tests/source-providers/website.test.js +128 -0
  248. package/dist/tests/source-qa-fixes.test.js +286 -0
  249. package/dist/tests/source-registry.test.js +350 -0
  250. package/dist/tests/source-resolve.test.js +100 -0
  251. package/dist/tests/source-source.test.js +281 -0
  252. package/dist/tests/source.test.js +533 -0
  253. package/dist/tests/tar-utils-scan.test.js +73 -0
  254. package/dist/tests/toggle-components.test.js +73 -0
  255. package/dist/tests/usage-telemetry.test.js +265 -0
  256. package/dist/tests/utility-scoring.test.js +558 -0
  257. package/dist/tests/vault-load-error.test.js +78 -0
  258. package/dist/tests/vault-qa-fixes.test.js +194 -0
  259. package/dist/tests/vault.test.js +429 -0
  260. package/dist/tests/vector-search.test.js +608 -0
  261. package/dist/tests/walker.test.js +252 -0
  262. package/dist/tests/wave2-cluster-bc.test.js +228 -0
  263. package/dist/tests/wave2-cluster-d.test.js +180 -0
  264. package/dist/tests/wave2-cluster-e.test.js +179 -0
  265. package/dist/tests/wiki-qa-fixes.test.js +270 -0
  266. package/dist/tests/wiki.test.js +529 -0
  267. package/dist/tests/workflow-cli.test.js +271 -0
  268. package/dist/tests/workflow-markdown.test.js +171 -0
  269. package/dist/tests/workflow-path-escape.test.js +132 -0
  270. package/dist/tests/workflow-qa-fixes.test.js +395 -0
  271. package/dist/tests/workflows/indexer-rejection.test.js +213 -0
  272. package/docs/README.md +8 -0
  273. package/docs/migration/release-notes/0.7.0.md +244 -0
  274. package/package.json +2 -2
  275. package/dist/core/warn.js +0 -27
  276. package/dist/output/shapes.js +0 -212
  277. package/dist/output/text.js +0 -520
  278. /package/dist/{commands → src/commands}/completions.js +0 -0
  279. /package/dist/{commands → src/commands}/curate.js +0 -0
  280. /package/dist/{commands → src/commands}/info.js +0 -0
  281. /package/dist/{commands → src/commands}/init.js +0 -0
  282. /package/dist/{commands → src/commands}/install-audit.js +0 -0
  283. /package/dist/{commands → src/commands}/migration-help.js +0 -0
  284. /package/dist/{commands → src/commands}/source-clone.js +0 -0
  285. /package/dist/{commands → src/commands}/vault.js +0 -0
  286. /package/dist/{core → src/core}/asset-registry.js +0 -0
  287. /package/dist/{core → src/core}/frontmatter.js +0 -0
  288. /package/dist/{core → src/core}/markdown.js +0 -0
  289. /package/dist/{core → src/core}/paths.js +0 -0
  290. /package/dist/{indexer → src/indexer}/manifest.js +0 -0
  291. /package/dist/{indexer → src/indexer}/search-fields.js +0 -0
  292. /package/dist/{indexer → src/indexer}/semantic-status.js +0 -0
  293. /package/dist/{indexer → src/indexer}/usage-events.js +0 -0
  294. /package/dist/{indexer → src/indexer}/walker.js +0 -0
  295. /package/dist/{llm → src/llm}/embedder.js +0 -0
  296. /package/dist/{llm → src/llm}/embedders/cache.js +0 -0
  297. /package/dist/{llm → src/llm}/embedders/local.js +0 -0
  298. /package/dist/{llm → src/llm}/embedders/types.js +0 -0
  299. /package/dist/{llm → src/llm}/metadata-enhance.js +0 -0
  300. /package/dist/{output → src/output}/context.js +0 -0
  301. /package/dist/{registry → src/registry}/create-provider-registry.js +0 -0
  302. /package/dist/{registry → src/registry}/origin-resolve.js +0 -0
  303. /package/dist/{registry → src/registry}/providers/index.js +0 -0
  304. /package/dist/{registry → src/registry}/providers/skills-sh.js +0 -0
  305. /package/dist/{registry → src/registry}/providers/types.js +0 -0
  306. /package/dist/{registry → src/registry}/types.js +0 -0
  307. /package/dist/{setup → src/setup}/detect.js +0 -0
  308. /package/dist/{setup → src/setup}/ripgrep-install.js +0 -0
  309. /package/dist/{setup → src/setup}/ripgrep-resolve.js +0 -0
  310. /package/dist/{setup → src/setup}/steps.js +0 -0
  311. /package/dist/{sources → src/sources}/include.js +0 -0
  312. /package/dist/{sources → src/sources}/provider-factory.js +0 -0
  313. /package/dist/{sources → src/sources}/provider.js +0 -0
  314. /package/dist/{sources → src/sources}/providers/filesystem.js +0 -0
  315. /package/dist/{sources → src/sources}/providers/index.js +0 -0
  316. /package/dist/{sources → src/sources}/providers/install-types.js +0 -0
  317. /package/dist/{sources → src/sources}/providers/npm.js +0 -0
  318. /package/dist/{sources → src/sources}/providers/provider-utils.js +0 -0
  319. /package/dist/{sources → src/sources}/providers/sync-from-ref.js +0 -0
  320. /package/dist/{sources → src/sources}/providers/tar-utils.js +0 -0
  321. /package/dist/{sources → src/sources}/providers/website.js +0 -0
  322. /package/dist/{sources → src/sources}/resolve.js +0 -0
  323. /package/dist/{sources → src/sources}/types.js +0 -0
  324. /package/dist/{templates → src/templates}/wiki-templates.js +0 -0
  325. /package/dist/{version.js → src/version.js} +0 -0
  326. /package/dist/{workflows → src/workflows}/authoring.js +0 -0
  327. /package/dist/{workflows → src/workflows}/cli.js +0 -0
  328. /package/dist/{workflows → src/workflows}/db.js +0 -0
  329. /package/dist/{workflows → src/workflows}/document-cache.js +0 -0
  330. /package/dist/{workflows → src/workflows}/parser.js +0 -0
  331. /package/dist/{workflows → src/workflows}/renderer.js +0 -0
  332. /package/dist/{workflows → src/workflows}/schema.js +0 -0
  333. /package/dist/{workflows → src/workflows}/validator.js +0 -0
@@ -0,0 +1,647 @@
1
+ /**
2
+ * akm-bench `evolve` — Track B longitudinal three-phase runner (spec §4 + §6.4).
3
+ *
4
+ * `runEvolve()` orchestrates three phases against a single eval-domain corpus:
5
+ *
6
+ * • Phase 1 (signal accumulation): run K seeds × tasks (train slice only)
7
+ * under the akm arm, then record `akm feedback <gold_ref> --positive` /
8
+ * `--negative` events per outcome.
9
+ * • Phase 2 (evolve): for every asset whose negative feedback crosses the
10
+ * threshold, invoke `akm distill` and `akm reflect`, validate every
11
+ * resulting proposal via `akm proposal show --json`, then accept or
12
+ * reject per lint outcome. After processing, rebuild the index.
13
+ * • Phase 3 (re-evaluate): run the eval slice under THREE arms — `pre` (the
14
+ * original un-evolved fixture), `post` (the evolved fixture), `synthetic`
15
+ * (no stash, scratchpad-only "Bring Your Own Skills" prompt).
16
+ *
17
+ * Leakage prevention (spec §7.4): before invoking distill we compute the set
18
+ * of eval-slice gold refs and pass it to `akm distill` via
19
+ * `--exclude-feedback-from <csv>` (#267). `akmDistill` filters those
20
+ * feedback events out of its LLM input before constructing the prompt.
21
+ * Refs in the exclusion list still see distillation run — but distillation
22
+ * runs from asset content alone, with no feedback signal that could have
23
+ * leaked from the eval slice. The proposal log + Phase 1 feedback stream
24
+ * are also filtered before computeProposalQualityMetrics ever sees them.
25
+ *
26
+ * Test seams: every external interaction is funnelled through one of three
27
+ * injectable options (two functions and one boolean flag):
28
+ * - `spawn` — forwarded to `runOne` (drives the agent harness).
29
+ * - `akmCli(args, cwd, env)` — invoked for every `akm <verb>` subprocess.
30
+ * - `materialiseStash` — when false, `runUtility` doesn't touch
31
+ * fixtures/stashes/.
32
+ * Tests inject fakes; production wires the real `Bun.spawnSync` and the
33
+ * real `loadFixtureStash`.
34
+ */
35
+ import path from "node:path";
36
+ import { loadFixtureStash } from "../fixtures/stashes/load";
37
+ import { registerCleanup } from "./cleanup";
38
+ import { computeLessonMetrics } from "./evolve-metrics";
39
+ import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
40
+ import { runUtility } from "./runner";
41
+ import { benchMkdtemp } from "./tmp";
42
+ /**
43
+ * Drive the three-phase Track B runner.
44
+ *
45
+ * Pre: `tasks` is already filtered to one domain (or `all`). The runner
46
+ * partitions internally on `task.slice`.
47
+ *
48
+ * Sandboxing: at the start of every real run the runner materialises one
49
+ * dedicated tmp stash per fixture (the `evolveStash`) plus a fresh sibling
50
+ * snapshot per fixture (the `preStash`). Phase 1 + Phase 2 pin
51
+ * `AKM_STASH_DIR` to the appropriate `evolveStash` for every spawned `akm`
52
+ * invocation; Phase 3's pre arm uses `preStash`, the post arm uses
53
+ * `evolveStash`, and the synthetic arm uses no stash. The operator's real
54
+ * `process.env.AKM_STASH_DIR` is never read or written by `runEvolve`. All
55
+ * stashes are cleaned up in a top-level try/finally.
56
+ */
57
+ export async function runEvolve(options) {
58
+ const seedsPerArm = options.seedsPerArm ?? 5;
59
+ const budgetTokens = options.budgetTokens ?? 30000;
60
+ const budgetWallMs = options.budgetWallMs ?? 120000;
61
+ const negativeThreshold = options.negativeThreshold ?? { absoluteCount: 2, ratio: 0.5 };
62
+ const materialiseStash = options.materialiseStash ?? true;
63
+ const akmCli = options.akmCli ?? defaultAkmCli;
64
+ const warnings = [];
65
+ const trainTasks = options.tasks.filter((t) => effectiveSlice(t) === "train");
66
+ const evalTasks = options.tasks.filter((t) => effectiveSlice(t) === "eval");
67
+ // Use the first task's domain (or "all") as the corpus label. The CLI
68
+ // already filtered to one domain; this is just for the report header.
69
+ const domain = uniqueDomain(options.tasks);
70
+ // ── Sandbox setup: per-fixture evolveStash + preStash. ───────────────────
71
+ // We materialise one tmp stash per unique `task.stash` so Phase 1
72
+ // accumulates feedback into the same on-disk stash that Phase 2 mutates,
73
+ // and that Phase 3's post arm reads back. The operator's real
74
+ // AKM_STASH_DIR is never touched. The pre arm gets a fresh snapshot of
75
+ // the same starting fixture (no Phase 2 mutations applied).
76
+ const fixtureNames = new Set();
77
+ for (const t of options.tasks)
78
+ fixtureNames.add(t.stash);
79
+ const evolveStashes = new Map();
80
+ const preStashes = new Map();
81
+ const evolveDirByFixture = new Map();
82
+ const preDirByFixture = new Map();
83
+ /** Per-fixture XDG_CACHE_HOME dirs allocated for evolve-stash indexing. */
84
+ const evolveCacheDirByFixture = new Map();
85
+ // SIGINT trap (#267): every per-fixture stash registers its cleanup with
86
+ // the shared registry so an external Ctrl-C reaps the tmp dirs even when
87
+ // the top-level try/finally never runs. We deregister in the matching
88
+ // finally block before invoking the synchronous cleanup so the handler
89
+ // doesn't double-fire.
90
+ const stashDeregistrations = [];
91
+ if (materialiseStash) {
92
+ for (const name of fixtureNames) {
93
+ try {
94
+ const evolved = loadFixtureStash(name, { skipIndex: false });
95
+ evolveStashes.set(name, evolved);
96
+ evolveDirByFixture.set(name, evolved.stashDir);
97
+ // Allocate a per-fixture cache dir for the evolve-stash re-index.
98
+ // `loadFixtureStash` used its own isolated XDG_CACHE_HOME; subsequent
99
+ // `akmCli` calls (feedback, distill, reflect) must look in the same
100
+ // cache. We allocate a fresh bench cache dir and pass it through
101
+ // `indexEvolveStash` + `envForRef` so the FTS5 DB is in a known place.
102
+ evolveCacheDirByFixture.set(name, benchMkdtemp(`akm-evolve-cache-${name}-`));
103
+ stashDeregistrations.push(registerCleanup(() => {
104
+ try {
105
+ evolved.cleanup();
106
+ }
107
+ catch {
108
+ /* swallow */
109
+ }
110
+ }));
111
+ }
112
+ catch (err) {
113
+ warnings.push(`evolve: failed to materialise evolve stash for fixture "${name}": ${err.message}`);
114
+ }
115
+ try {
116
+ const pre = loadFixtureStash(name, { skipIndex: false });
117
+ preStashes.set(name, pre);
118
+ preDirByFixture.set(name, pre.stashDir);
119
+ stashDeregistrations.push(registerCleanup(() => {
120
+ try {
121
+ pre.cleanup();
122
+ }
123
+ catch {
124
+ /* swallow */
125
+ }
126
+ }));
127
+ }
128
+ catch (err) {
129
+ warnings.push(`evolve: failed to materialise pre stash for fixture "${name}": ${err.message}`);
130
+ }
131
+ }
132
+ }
133
+ // Resolve the evolveStash dir for a given asset ref. We map ref → fixture
134
+ // by looking up which task's gold ref it matches; if no task owns it, we
135
+ // fall back to the first available evolveStash (if several tasks share a
136
+ // gold ref, the last one listed wins). The simple — and most common — case is a single fixture
137
+ // per `--tasks <domain>` invocation.
138
+ const refToFixture = new Map();
139
+ for (const t of options.tasks) {
140
+ if (t.goldRef)
141
+ refToFixture.set(t.goldRef, t.stash);
142
+ }
143
+ const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
144
+ const fallbackEvolveCacheDir = [...evolveCacheDirByFixture.values()][0];
145
+ function envForRef(ref) {
146
+ const baseEnv = { ...process.env };
147
+ if (!materialiseStash) {
148
+ // Tests opt out of fixture materialisation entirely; we still strip
149
+ // the operator's AKM_STASH_DIR so the fake CLI sees a known sentinel.
150
+ delete baseEnv.AKM_STASH_DIR;
151
+ return baseEnv;
152
+ }
153
+ const fixture = ref ? refToFixture.get(ref) : undefined;
154
+ const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
155
+ const cacheDir = (fixture && evolveCacheDirByFixture.get(fixture)) ?? fallbackEvolveCacheDir;
156
+ if (dir)
157
+ baseEnv.AKM_STASH_DIR = dir;
158
+ else
159
+ delete baseEnv.AKM_STASH_DIR;
160
+ if (cacheDir)
161
+ baseEnv.XDG_CACHE_HOME = cacheDir;
162
+ return baseEnv;
163
+ }
164
+ // ── Phase 1 pre-flight: index each evolve stash in its dedicated cache. ───
165
+ // `loadFixtureStash` already ran `akm index` but used an isolated
166
+ // XDG_CACHE_HOME that subsequent `akmCli` calls (feedback, distill, reflect)
167
+ // cannot see. Re-running `akm index` here via `akmCli` with the same
168
+ // AKM_STASH_DIR + XDG_CACHE_HOME that `envForRef` will produce ensures the
169
+ // FTS5 database is populated where Phase 1 feedback will look.
170
+ // Non-zero exit adds a warning but does not abort — Phase 1 can still run
171
+ // with degraded feedback if the index step fails.
172
+ if (materialiseStash) {
173
+ const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
174
+ for (const [fixtureName, stashDir] of evolveDirByFixture) {
175
+ const cacheDir = evolveCacheDirByFixture.get(fixtureName);
176
+ if (!cacheDir)
177
+ continue;
178
+ try {
179
+ const result = await indexEvolveStash(stashDir, cacheDir, akmCli, phase1Cwd);
180
+ if (!result.ok) {
181
+ warnings.push(`evolve: pre-flight akm index failed for stash ${stashDir}: ${result.stderr.trim()}`);
182
+ }
183
+ }
184
+ catch (err) {
185
+ warnings.push(`evolve: pre-flight akm index threw for stash ${stashDir}: ${err.message}`);
186
+ }
187
+ }
188
+ }
189
+ let preReport;
190
+ let postReport;
191
+ let syntheticReport;
192
+ let phase1Report;
193
+ const feedbackLog = [];
194
+ const proposalLog = [];
195
+ try {
196
+ // ── Phase 1: accumulate signal on the train slice (akm arm only). ─────
197
+ phase1Report = await runUtility({
198
+ tasks: trainTasks,
199
+ arms: ["akm"],
200
+ model: options.model,
201
+ seedsPerArm,
202
+ budgetTokens,
203
+ budgetWallMs,
204
+ slice: "train",
205
+ ...(options.spawn ? { spawn: options.spawn } : {}),
206
+ // We pre-materialised the per-fixture evolve stash above; tell the
207
+ // runner to forward those dirs and skip its own per-task materialise.
208
+ materialiseStash,
209
+ ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
210
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
211
+ ...(options.branch ? { branch: options.branch } : {}),
212
+ ...(options.commit ? { commit: options.commit } : {}),
213
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
214
+ });
215
+ // Issue feedback events per (task, seed) outcome on the akm arm.
216
+ const feedbackByRef = new Map();
217
+ const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
218
+ for (const run of phase1Report.akmRuns ?? []) {
219
+ const taskMeta = options.tasks.find((t) => t.id === run.taskId);
220
+ const goldRef = taskMeta?.goldRef;
221
+ if (!goldRef)
222
+ continue;
223
+ if (run.outcome === "harness_error")
224
+ continue;
225
+ const signal = run.outcome === "pass" ? "positive" : "negative";
226
+ const args = ["feedback", goldRef, signal === "positive" ? "--positive" : "--negative"];
227
+ // Wrap in try/catch so a single throwing akmCli (e.g. subprocess
228
+ // crash) cannot leave `feedbackByRef` partially populated and let
229
+ // Phase 2 proceed on corrupt state.
230
+ try {
231
+ const cliResult = await akmCli(args, phase1Cwd, envForRef(goldRef));
232
+ feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: cliResult.exitCode === 0 });
233
+ if (cliResult.exitCode !== 0) {
234
+ warnings.push(`phase1: akm feedback for ${goldRef} (${signal}) failed: ${cliResult.stderr.trim()}`);
235
+ }
236
+ }
237
+ catch (err) {
238
+ feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: false });
239
+ warnings.push(`phase1.feedback_dispatch_failed: ${goldRef} ${err.message}`);
240
+ }
241
+ const counts = feedbackByRef.get(goldRef) ?? { positive: 0, negative: 0 };
242
+ if (signal === "positive")
243
+ counts.positive += 1;
244
+ else
245
+ counts.negative += 1;
246
+ feedbackByRef.set(goldRef, counts);
247
+ }
248
+ // ── Phase 2: evolve. ────────────────────────────────────────────────────
249
+ const evalGoldRefs = new Set();
250
+ for (const t of evalTasks) {
251
+ if (t.goldRef)
252
+ evalGoldRefs.add(t.goldRef);
253
+ }
254
+ const refsToEvolve = [];
255
+ for (const [ref, counts] of feedbackByRef.entries()) {
256
+ if (crossesNegativeThreshold(counts, negativeThreshold))
257
+ refsToEvolve.push(ref);
258
+ }
259
+ refsToEvolve.sort();
260
+ // §7.4 leakage prevention (#267): instead of hard-skipping refs that
261
+ // overlap eval-slice gold refs, we now pass the gold-ref set through
262
+ // `--exclude-feedback-from` (and the matching env var) so `akm distill`
263
+ // filters those events out of its LLM input. The behaviour collapses
264
+ // back to "no useful feedback shown" for refs that ARE the gold ref —
265
+ // distill then runs from asset content only, which is what we want.
266
+ const evalGoldRefList = [...evalGoldRefs].sort();
267
+ const excludeFeedbackCsv = evalGoldRefList.join(",");
268
+ for (const ref of refsToEvolve) {
269
+ // The env var fallback is the contract `akm distill` honours; it lets
270
+ // the bench keep working even if a hypothetical caller invokes
271
+ // distill via a wrapper that mangles flags.
272
+ const evolveEnv = {
273
+ ...envForRef(ref),
274
+ AKM_BENCH_EXCLUDE_GOLD_REFS: excludeFeedbackCsv,
275
+ ...(excludeFeedbackCsv ? { AKM_DISTILL_EXCLUDE_FEEDBACK_FROM: excludeFeedbackCsv } : {}),
276
+ };
277
+ // Pass the eval-gold list explicitly via the CLI flag so the contract
278
+ // is observable in test logs (the env var is a fallback for harnesses
279
+ // that strip flags). Reflect doesn't accept this flag — it's a distill
280
+ // concern only.
281
+ const distillArgs = ["distill", ref];
282
+ if (excludeFeedbackCsv) {
283
+ distillArgs.push("--exclude-feedback-from", excludeFeedbackCsv);
284
+ }
285
+ const distillResult = await akmCli(distillArgs, phase1Cwd, evolveEnv);
286
+ if (distillResult.exitCode !== 0) {
287
+ warnings.push(`phase2: akm distill ${ref} failed: ${distillResult.stderr.trim()}`);
288
+ }
289
+ else if (evalGoldRefs.has(ref) && excludeFeedbackCsv) {
290
+ // Per-ref leakage info — replaces the previous "skipped" message.
291
+ // Operator can audit which refs ran through the filter and confirm
292
+ // distillation didn't see leaked feedback.
293
+ warnings.push(`phase2: filtered eval-slice gold-ref feedback from distill input for ${ref} (--exclude-feedback-from ${excludeFeedbackCsv}).`);
294
+ }
295
+ const reflectResult = await akmCli(["reflect", ref], phase1Cwd, evolveEnv);
296
+ if (reflectResult.exitCode !== 0) {
297
+ // `reflect` requires `agent.default` to be configured — a missing
298
+ // config is non-fatal for the bench; we record and continue.
299
+ warnings.push(`phase2: akm reflect ${ref} skipped/failed: ${reflectResult.stderr.trim()}`);
300
+ }
301
+ }
302
+ // Walk the proposal queue per fixture (each evolveStash has its own
303
+ // proposal log on disk). When we materialised stashes we iterate every
304
+ // fixture that produced proposals; in the common single-fixture case
305
+ // this is one pass.
306
+ const proposalFixtures = materialiseStash ? [...evolveDirByFixture.keys()] : [undefined];
307
+ for (const fixtureName of proposalFixtures) {
308
+ const proposalEnv = { ...process.env };
309
+ if (materialiseStash && fixtureName) {
310
+ const dir = evolveDirByFixture.get(fixtureName);
311
+ if (dir)
312
+ proposalEnv.AKM_STASH_DIR = dir;
313
+ const cacheDir = evolveCacheDirByFixture.get(fixtureName);
314
+ if (cacheDir)
315
+ proposalEnv.XDG_CACHE_HOME = cacheDir;
316
+ }
317
+ else if (!materialiseStash) {
318
+ delete proposalEnv.AKM_STASH_DIR;
319
+ }
320
+ const listResult = await akmCli(["proposal", "list", "--json"], phase1Cwd, proposalEnv);
321
+ const proposals = parseProposalList(listResult.stdout);
322
+ for (const p of proposals) {
323
+ const showResult = await akmCli(["proposal", "show", p.id, "--json"], phase1Cwd, proposalEnv);
324
+ const lintInfo = parseProposalShow(showResult.stdout);
325
+ const lintPass = lintInfo.lintPass;
326
+ if (lintPass) {
327
+ const acceptResult = await akmCli(["proposal", "accept", p.id], phase1Cwd, proposalEnv);
328
+ proposalLog.push({
329
+ proposalId: p.id,
330
+ assetRef: p.assetRef,
331
+ kind: p.kind,
332
+ lintPass: true,
333
+ decision: acceptResult.exitCode === 0 ? "accept" : "reject",
334
+ ...(acceptResult.exitCode === 0 ? {} : { rejectReason: `accept failed: ${acceptResult.stderr.trim()}` }),
335
+ });
336
+ }
337
+ else {
338
+ const reason = lintInfo.lintMessage ?? "lint failed";
339
+ const rejectResult = await akmCli(["proposal", "reject", p.id, "--reason", `lint failed: ${reason}`], phase1Cwd, proposalEnv);
340
+ proposalLog.push({
341
+ proposalId: p.id,
342
+ assetRef: p.assetRef,
343
+ kind: p.kind,
344
+ lintPass: false,
345
+ decision: "reject",
346
+ rejectReason: reason,
347
+ });
348
+ if (rejectResult.exitCode !== 0) {
349
+ warnings.push(`phase2: akm proposal reject ${p.id} failed: ${rejectResult.stderr.trim()}`);
350
+ }
351
+ }
352
+ }
353
+ // Rebuild the index so accepted lessons surface in Phase 3.
354
+ const indexResult = await akmCli(["index"], phase1Cwd, proposalEnv);
355
+ if (indexResult.exitCode !== 0) {
356
+ warnings.push(`phase2: akm index rebuild failed: ${indexResult.stderr.trim()}`);
357
+ }
358
+ }
359
+ // ── Phase 3: re-evaluate (eval slice). ─────────────────────────────────
360
+ // pre arm: fresh snapshot of the starting fixture (no Phase 2 mutations
361
+ // applied). post arm: the mutated evolveStash so accepted lessons reach
362
+ // the eval slice. synthetic arm: no stash.
363
+ preReport = await runUtility({
364
+ tasks: evalTasks,
365
+ arms: ["akm"],
366
+ model: options.model,
367
+ seedsPerArm,
368
+ budgetTokens,
369
+ budgetWallMs,
370
+ slice: "eval",
371
+ ...(options.spawn ? { spawn: options.spawn } : {}),
372
+ materialiseStash,
373
+ ...(materialiseStash ? { stashDirByFixture: preDirByFixture } : {}),
374
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
375
+ ...(options.branch ? { branch: options.branch } : {}),
376
+ ...(options.commit ? { commit: options.commit } : {}),
377
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
378
+ });
379
+ postReport = await runUtility({
380
+ tasks: evalTasks,
381
+ arms: ["akm"],
382
+ model: options.model,
383
+ seedsPerArm,
384
+ budgetTokens,
385
+ budgetWallMs,
386
+ slice: "eval",
387
+ // Stamp arm metadata so spawn fakes can distinguish pre-vs-post via
388
+ // an env probe. We thread it via a fresh `spawn` wrapper when one
389
+ // was supplied.
390
+ materialiseStash,
391
+ ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
392
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
393
+ ...(options.branch ? { branch: options.branch } : {}),
394
+ ...(options.commit ? { commit: options.commit } : {}),
395
+ ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
396
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
397
+ });
398
+ // synthetic: no stash. We pass a spawn wrapper that strips
399
+ // AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
400
+ // fakes (and a future real harness) can branch. #267 — also forward a
401
+ // per-task scratchpad prompt via the runner's `buildPrompt` seam so the
402
+ // synthetic arm actually exercises the BYOS prompt path rather than
403
+ // relying on the noakm default.
404
+ syntheticReport = await runUtility({
405
+ tasks: evalTasks,
406
+ arms: ["akm"],
407
+ model: options.model,
408
+ seedsPerArm,
409
+ budgetTokens,
410
+ budgetWallMs,
411
+ slice: "eval",
412
+ materialiseStash: false,
413
+ buildPrompt: (task, _arm) => buildSyntheticPrompt(task.id),
414
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
415
+ ...(options.branch ? { branch: options.branch } : {}),
416
+ ...(options.commit ? { commit: options.commit } : {}),
417
+ ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
418
+ ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
419
+ });
420
+ }
421
+ finally {
422
+ // Deregister BEFORE running cleanup so a SIGINT during teardown
423
+ // doesn't double-fire the cleanup fns (per cleanup.ts contract).
424
+ for (const deregister of stashDeregistrations)
425
+ deregister();
426
+ for (const s of evolveStashes.values()) {
427
+ try {
428
+ s.cleanup();
429
+ }
430
+ catch {
431
+ /* swallow — best-effort tmp cleanup */
432
+ }
433
+ }
434
+ for (const s of preStashes.values()) {
435
+ try {
436
+ s.cleanup();
437
+ }
438
+ catch {
439
+ /* swallow — best-effort tmp cleanup */
440
+ }
441
+ }
442
+ }
443
+ // ── Compute aggregates. ──────────────────────────────────────────────────
444
+ const proposalsMetrics = computeProposalQualityMetrics(proposalLog);
445
+ const longitudinal = computeLongitudinalMetrics(preReport, postReport, syntheticReport);
446
+ const feedbackIntegrity = computeFeedbackIntegrity({ phase1: phase1Report, feedbackLog });
447
+ // #264 — lesson quality + reuse metrics. The runner doesn't (yet) read
448
+ // accepted lesson bodies off disk or load verifier source text; we pass
449
+ // empty maps so the leakage check defaults to "low" until the read seam
450
+ // lands. Reuse + negative-transfer attribution work today off the
451
+ // pre/post arm `assetsLoaded` stream.
452
+ const lessons = computeLessonMetrics({
453
+ proposalLog,
454
+ feedbackLog,
455
+ preRuns: preReport.akmRuns ?? [],
456
+ postRuns: postReport.akmRuns ?? [],
457
+ });
458
+ return {
459
+ timestamp: options.timestamp ?? new Date().toISOString(),
460
+ branch: options.branch ?? preReport.branch,
461
+ commit: options.commit ?? preReport.commit,
462
+ model: options.model,
463
+ domain,
464
+ seedsPerArm,
465
+ feedbackLog,
466
+ proposalLog,
467
+ proposals: proposalsMetrics,
468
+ lessons,
469
+ longitudinal,
470
+ feedbackIntegrity,
471
+ phase1: phase1Report,
472
+ arms: { pre: preReport, post: postReport, synthetic: syntheticReport },
473
+ warnings: [
474
+ ...warnings,
475
+ ...phase1Report.warnings,
476
+ ...preReport.warnings,
477
+ ...postReport.warnings,
478
+ ...syntheticReport.warnings,
479
+ ],
480
+ };
481
+ }
482
/**
 * Default subprocess invoker used for real runs (tests inject a fake).
 *
 * Synchronously spawns `bun run <repo>/src/cli.ts <args>` in `cwd`, with
 * `env` layered over the current process environment, and captures both
 * output streams as text.
 *
 * @param {string[]} args - CLI arguments appended after the cli entry point.
 * @param {string} cwd - Working directory for the child process.
 * @param {Record<string, string | undefined>} env - Overrides merged on top of `process.env`.
 * @returns {Promise<{exitCode: number, stdout: string, stderr: string}>}
 *   `exitCode` falls back to -1 when the child reports none.
 */
async function defaultAkmCli(args, cwd, env) {
    const cliEntry = path.resolve(__dirname, "..", "..", "src", "cli.ts");
    // A missing pipe decodes to the empty string.
    const toText = (bytes) => (bytes ? new TextDecoder().decode(bytes) : "");
    const child = Bun.spawnSync({
        cmd: ["bun", "run", cliEntry, ...args],
        cwd,
        env: { ...process.env, ...env },
        stdout: "pipe",
        stderr: "pipe",
    });
    const stdout = toText(child.stdout);
    const stderr = toText(child.stderr);
    return { exitCode: child.exitCode ?? -1, stdout, stderr };
}
499
/**
 * Decide whether an asset's feedback tally crosses the negative threshold.
 *
 * Two independent triggers, either of which is sufficient:
 *   1. the negative count is at least `threshold.absoluteCount`, or
 *   2. the share of negatives among all feedback is strictly greater than
 *      `threshold.ratio`.
 *
 * @param {{positive: number, negative: number}} counts - Feedback tally for one ref.
 * @param {{absoluteCount: number, ratio: number}} threshold - Trigger levels.
 * @returns {boolean} `true` when the ref crosses the threshold.
 */
function crossesNegativeThreshold(counts, threshold) {
    const { positive, negative } = counts;
    const meetsAbsolute = negative >= threshold.absoluteCount;
    if (meetsAbsolute) {
        return true;
    }
    const totalFeedback = positive + negative;
    // With no feedback at all the ratio is undefined; treat it as "not crossed".
    return totalFeedback !== 0 && negative / totalFeedback > threshold.ratio;
}
513
/**
 * Best-effort train/eval partition for a task.
 *
 * An explicit `task.slice` always wins. Otherwise the task id is folded with a
 * 32-bit multiply-by-31 rolling hash over its UTF-16 code units, and the
 * parity of the absolute hash value picks the slice (even → "train",
 * odd → "eval"). The fallback is inlined here to avoid an import cycle.
 *
 * NOTE(review): the previous comment described this as mirroring
 * corpus.effectiveSlice via "SHA-1 first byte parity"; the code below is not
 * SHA-1, so confirm the two partitions are actually meant to agree.
 *
 * @param {{id: string, slice?: string}} task
 * @returns {string} The explicit slice, or "train" / "eval" from the id hash.
 */
function effectiveSlice(task) {
    if (task.slice) {
        return task.slice;
    }
    const id = task.id;
    let hash = 0;
    let pos = 0;
    while (pos < id.length) {
        // `| 0` keeps the accumulator in signed 32-bit range on every step.
        hash = (hash * 31 + id.charCodeAt(pos)) | 0;
        pos += 1;
    }
    const isEven = Math.abs(hash) % 2 === 0;
    return isEven ? "train" : "eval";
}
524
/**
 * Collapse the tasks' domains into a single label.
 *
 * @param {Array<{domain?: string}>} tasks
 * @returns {string} The shared domain when every task has the same one;
 *   otherwise (no tasks, mixed domains, or a nullish shared domain) "all".
 */
function uniqueDomain(tasks) {
    const domains = new Set();
    for (const task of tasks) {
        domains.add(task.domain);
    }
    if (domains.size !== 1) {
        return "all";
    }
    const [onlyDomain] = domains;
    return onlyDomain ?? "all";
}
530
/**
 * Decorate a spawn function so every child it launches is tagged with the
 * evolve arm it belongs to.
 *
 * The returned wrapper copies `opts.env` (the caller's object is never
 * mutated) and then:
 *   - sets `BENCH_EVOLVE_ARM` to `arm`;
 *   - sets `BENCH_EVOLVE_SCRATCHPAD` to "1" when `scratchpad` is true;
 *   - sets `AKM_STASH_DIR` to `stashDir` when one is given;
 *   - removes `AKM_STASH_DIR` entirely for the "synthetic" arm, regardless of
 *     `stashDir` or any inherited value.
 *
 * @param {(cmd: unknown, opts: object) => unknown} inner - Spawn function to wrap.
 * @param {string} arm - Arm label exposed to the child.
 * @param {string} [stashDir] - Stash directory to pin for the child, if any.
 * @param {boolean} [scratchpad=false] - Whether to flag the scratchpad mode.
 * @returns {(cmd: unknown, opts: object) => unknown} Wrapper returning whatever `inner` returns.
 */
function wrapSpawnWithArm(inner, arm, stashDir, scratchpad = false) {
    const isSynthetic = arm === "synthetic";
    return (cmd, opts) => {
        const childEnv = { ...(opts.env ?? {}), BENCH_EVOLVE_ARM: arm };
        if (scratchpad) {
            childEnv.BENCH_EVOLVE_SCRATCHPAD = "1";
        }
        if (isSynthetic) {
            // The synthetic arm must never see a stash, inherited or pinned.
            delete childEnv.AKM_STASH_DIR;
        }
        else if (stashDir) {
            childEnv.AKM_STASH_DIR = stashDir;
        }
        return inner(cmd, { ...opts, env: childEnv });
    };
}
550
/**
 * Tolerant parser for `akm proposal list --json` stdout.
 *
 * Accepts either a bare JSON array or an object carrying a `proposals` array.
 * Anything else — blank output, invalid JSON, or a JSON value with no usable
 * array (including the literal `null`) — yields an empty list rather than
 * throwing.
 *
 * Per entry:
 *   - `id` must be a string;
 *   - the asset ref is the first string found among `target_ref`,
 *     `targetRef`, `ref`;
 *   - `kind` is read from `kind`, falling back to `source`, and normalised:
 *     "lesson"/"distill" → "lesson", "revision"/"reflect" → "revision",
 *     anything else → "unknown".
 * Entries that are not objects, or that lack an id or asset ref, are dropped.
 *
 * @param {string} stdout - Raw stdout of the list command.
 * @returns {Array<{id: string, assetRef: string, kind: "lesson" | "revision" | "unknown"}>}
 */
function parseProposalList(stdout) {
    if (!stdout.trim())
        return [];
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch {
        return [];
    }
    // `JSON.parse("null")` is valid and returns null; the optional chain keeps
    // that case on the "no proposals" path instead of raising a TypeError.
    const arr = Array.isArray(parsed)
        ? parsed
        : Array.isArray(parsed?.proposals)
            ? parsed.proposals
            : [];
    const out = [];
    for (const item of arr) {
        if (!item || typeof item !== "object")
            continue;
        const rec = item;
        const id = typeof rec.id === "string" ? rec.id : null;
        const assetRef = typeof rec.target_ref === "string"
            ? rec.target_ref
            : typeof rec.targetRef === "string"
                ? rec.targetRef
                : typeof rec.ref === "string"
                    ? rec.ref
                    : null;
        const kindRaw = typeof rec.kind === "string" ? rec.kind : typeof rec.source === "string" ? rec.source : "unknown";
        const kind = kindRaw === "lesson" || kindRaw === "distill"
            ? "lesson"
            : kindRaw === "revision" || kindRaw === "reflect"
                ? "revision"
                : "unknown";
        if (!id || !assetRef)
            continue;
        out.push({ id, assetRef, kind });
    }
    return out;
}
591
/**
 * Extract the lint verdict from `akm proposal show <id> --json` stdout.
 *
 * The proposal counts as lint-passing when any of `lint_pass`, `lintPass`, or
 * `lint.pass` is exactly `true`. When `lint.issues` is a non-empty array its
 * entries are joined with "; " into `lintMessage` (strings as-is, objects via
 * their `message`, anything else JSON-stringified).
 *
 * Never throws on bad output: blank stdout, invalid JSON, and JSON that is
 * not an object (for example the literal `null`) all yield `lintPass: false`
 * with an explanatory `lintMessage`.
 *
 * @param {string} stdout - Raw stdout of the show command.
 * @returns {{lintPass: boolean, lintMessage?: string}}
 */
function parseProposalShow(stdout) {
    if (!stdout.trim())
        return { lintPass: false, lintMessage: "empty proposal show output" };
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch (err) {
        return { lintPass: false, lintMessage: `proposal show: parse error (${err.message})` };
    }
    // Valid JSON is not necessarily an object: `null` would make the property
    // reads below throw, so treat every non-object payload as a lint failure.
    if (parsed === null || typeof parsed !== "object") {
        return { lintPass: false, lintMessage: "proposal show: unexpected non-object payload" };
    }
    const lintRaw = parsed.lint;
    const hasLintObject = typeof lintRaw === "object" && lintRaw !== null;
    const lintPass = parsed.lint_pass === true ||
        parsed.lintPass === true ||
        (hasLintObject && lintRaw.pass === true);
    let lintMessage;
    if (hasLintObject) {
        const issues = lintRaw.issues;
        if (Array.isArray(issues) && issues.length > 0) {
            lintMessage = issues
                .map((i) => (typeof i === "string" ? i : (i?.message ?? JSON.stringify(i))))
                .join("; ");
        }
    }
    return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
}
616
/**
 * Run `akm index` against the evolve stash so the index lives in the cache
 * directory that the later `akmCli` calls are pointed at.
 *
 * The command is invoked through the supplied `akmCli` with the current
 * process environment plus `AKM_STASH_DIR=stashDir` and
 * `XDG_CACHE_HOME=cacheDir`. Per the original authors' note, the index built
 * while loading the fixture stash sits in a separate, isolated cache that
 * those later calls cannot see, hence this explicit re-index.
 *
 * Exported for tests.
 *
 * @param {string} stashDir - Stash directory to index.
 * @param {string} cacheDir - Cache home that should receive the index.
 * @param {(args: string[], cwd: string, env: object) => Promise<{exitCode: number, stderr: string}>} akmCli
 *   CLI invoker (real or fake).
 * @param {string} cwd - Working directory passed to the invoker.
 * @returns {Promise<{ok: boolean, stderr: string}>} `ok` is true only for exit
 *   code 0; `stderr` is always passed through from the invoker's result.
 */
export async function indexEvolveStash(stashDir, cacheDir, akmCli, cwd) {
    const overrides = { AKM_STASH_DIR: stashDir, XDG_CACHE_HOME: cacheDir };
    const outcome = await akmCli(["index"], cwd, { ...process.env, ...overrides });
    const succeeded = outcome.exitCode === 0;
    return { ok: succeeded, stderr: outcome.stderr };
}
638
/**
 * Build the prompt used for the synthetic ("Bring Your Own Skills") arm.
 * Exported so tests can assert on the exact prompt text.
 *
 * @param {string} taskId - Task identifier interpolated into the first line.
 * @returns {string} Five newline-separated lines: task header, arm label, and
 *   the three-line scratchpad instruction.
 */
export function buildSyntheticPrompt(taskId) {
    const header = `Task: ${taskId}`;
    const armLabel = "Arm: synthetic (Bring Your Own Skills)";
    const instructions = [
        "No akm stash is available. Before solving the task, write a short scratchpad of the skills",
        "and steps you intend to use, then proceed. Cite the scratchpad in your trace so the verifier",
        "can attribute the approach to your own reasoning rather than retrieved guidance.",
    ];
    const lines = [header, armLabel, ...instructions];
    return lines.join("\n");
}