akm-cli 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332)
  1. package/CHANGELOG.md +8 -0
  2. package/dist/{src/cli.js → cli.js} +22 -8
  3. package/dist/{src/commands → commands}/installed-stashes.js +1 -1
  4. package/dist/{src/commands → commands}/source-add.js +1 -1
  5. package/dist/{src/core → core}/common.js +16 -1
  6. package/dist/{src/core → core}/config.js +5 -2
  7. package/dist/{src/indexer → indexer}/db-search.js +16 -1
  8. package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
  9. package/dist/{src/indexer → indexer}/indexer.js +27 -11
  10. package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
  11. package/dist/{src/indexer → indexer}/search-source.js +1 -1
  12. package/dist/{src/llm → llm}/client.js +61 -1
  13. package/dist/{src/llm → llm}/embedder.js +8 -5
  14. package/dist/{src/llm → llm}/embedders/local.js +8 -2
  15. package/dist/{src/llm → llm}/embedders/remote.js +4 -2
  16. package/dist/{src/llm → llm}/graph-extract.js +4 -4
  17. package/dist/llm/memory-infer.js +114 -0
  18. package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
  19. package/dist/{src/output → output}/cli-hints.js +2 -0
  20. package/dist/{src/setup → setup}/setup.js +30 -20
  21. package/dist/sources/providers/website.js +27 -0
  22. package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
  23. package/docs/README.md +7 -0
  24. package/docs/migration/release-notes/0.7.0.md +14 -0
  25. package/package.json +11 -8
  26. package/dist/src/llm/memory-infer.js +0 -86
  27. package/dist/tests/add-website-source.test.js +0 -119
  28. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  29. package/dist/tests/agent/agent-config.test.js +0 -221
  30. package/dist/tests/agent/agent-detect.test.js +0 -100
  31. package/dist/tests/agent/agent-spawn.test.js +0 -234
  32. package/dist/tests/agent-output.test.js +0 -186
  33. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  34. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  35. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  36. package/dist/tests/asset-ref.test.js +0 -192
  37. package/dist/tests/asset-registry.test.js +0 -103
  38. package/dist/tests/asset-spec.test.js +0 -241
  39. package/dist/tests/bench/attribution.test.js +0 -996
  40. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  41. package/dist/tests/bench/cleanup.js +0 -234
  42. package/dist/tests/bench/cleanup.test.js +0 -166
  43. package/dist/tests/bench/cli.js +0 -1018
  44. package/dist/tests/bench/cli.test.js +0 -445
  45. package/dist/tests/bench/compare.test.js +0 -556
  46. package/dist/tests/bench/corpus.js +0 -317
  47. package/dist/tests/bench/corpus.test.js +0 -258
  48. package/dist/tests/bench/doctor.js +0 -525
  49. package/dist/tests/bench/driver.js +0 -401
  50. package/dist/tests/bench/driver.test.js +0 -584
  51. package/dist/tests/bench/environment.js +0 -233
  52. package/dist/tests/bench/environment.test.js +0 -199
  53. package/dist/tests/bench/evolve-metrics.js +0 -179
  54. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  55. package/dist/tests/bench/evolve.js +0 -647
  56. package/dist/tests/bench/evolve.test.js +0 -624
  57. package/dist/tests/bench/failure-modes.test.js +0 -349
  58. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  59. package/dist/tests/bench/leakage.test.js +0 -228
  60. package/dist/tests/bench/learning-curve.test.js +0 -134
  61. package/dist/tests/bench/metrics.js +0 -2395
  62. package/dist/tests/bench/metrics.test.js +0 -1150
  63. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  64. package/dist/tests/bench/opencode-config.js +0 -194
  65. package/dist/tests/bench/opencode-config.test.js +0 -370
  66. package/dist/tests/bench/report.js +0 -1885
  67. package/dist/tests/bench/report.test.js +0 -1038
  68. package/dist/tests/bench/run-config.js +0 -355
  69. package/dist/tests/bench/run-config.test.js +0 -298
  70. package/dist/tests/bench/run-curate-test.js +0 -32
  71. package/dist/tests/bench/run-failing-tasks.js +0 -56
  72. package/dist/tests/bench/run-full-bench.js +0 -51
  73. package/dist/tests/bench/run-items36-targeted.js +0 -69
  74. package/dist/tests/bench/run-nano-quick.js +0 -42
  75. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  76. package/dist/tests/bench/runner.js +0 -699
  77. package/dist/tests/bench/runner.test.js +0 -958
  78. package/dist/tests/bench/search-bridge.test.js +0 -331
  79. package/dist/tests/bench/tmp.js +0 -131
  80. package/dist/tests/bench/trajectory.js +0 -116
  81. package/dist/tests/bench/trajectory.test.js +0 -127
  82. package/dist/tests/bench/verifier.js +0 -114
  83. package/dist/tests/bench/verifier.test.js +0 -118
  84. package/dist/tests/bench/workflow-evaluator.js +0 -557
  85. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  86. package/dist/tests/bench/workflow-spec.js +0 -345
  87. package/dist/tests/bench/workflow-spec.test.js +0 -363
  88. package/dist/tests/bench/workflow-trace.js +0 -472
  89. package/dist/tests/bench/workflow-trace.test.js +0 -254
  90. package/dist/tests/benchmark-search-quality.js +0 -536
  91. package/dist/tests/benchmark-suite.js +0 -1441
  92. package/dist/tests/capture-cli.test.js +0 -112
  93. package/dist/tests/cli-errors.test.js +0 -204
  94. package/dist/tests/commands/events.test.js +0 -370
  95. package/dist/tests/commands/history.test.js +0 -418
  96. package/dist/tests/commands/import.test.js +0 -103
  97. package/dist/tests/commands/proposal-cli.test.js +0 -209
  98. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  99. package/dist/tests/commands/remember.test.js +0 -97
  100. package/dist/tests/commands/scope-flags.test.js +0 -300
  101. package/dist/tests/commands/search.test.js +0 -537
  102. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  103. package/dist/tests/commands/show.test.js +0 -294
  104. package/dist/tests/common.test.js +0 -266
  105. package/dist/tests/completions.test.js +0 -142
  106. package/dist/tests/config-cli.test.js +0 -193
  107. package/dist/tests/config-llm-features.test.js +0 -139
  108. package/dist/tests/config.test.js +0 -569
  109. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  110. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  111. package/dist/tests/contracts/spec-helpers.js +0 -46
  112. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  113. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  114. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  115. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  116. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  117. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  118. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  119. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  120. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  121. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  122. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  123. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  124. package/dist/tests/core/write-source.test.js +0 -366
  125. package/dist/tests/curate-command.test.js +0 -87
  126. package/dist/tests/db-scoring.test.js +0 -201
  127. package/dist/tests/db.test.js +0 -654
  128. package/dist/tests/distill-cli-flag.test.js +0 -208
  129. package/dist/tests/distill.test.js +0 -515
  130. package/dist/tests/docker-install.test.js +0 -120
  131. package/dist/tests/e2e.test.js +0 -1419
  132. package/dist/tests/embedder.test.js +0 -340
  133. package/dist/tests/embedding-model-config.test.js +0 -379
  134. package/dist/tests/feedback-command.test.js +0 -172
  135. package/dist/tests/file-context.test.js +0 -552
  136. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  137. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  138. package/dist/tests/fixtures/stashes/load.js +0 -166
  139. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  140. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  141. package/dist/tests/frontmatter.test.js +0 -190
  142. package/dist/tests/fts-field-weighting.test.js +0 -254
  143. package/dist/tests/fuzzy-search.test.js +0 -230
  144. package/dist/tests/git-provider-clone.test.js +0 -45
  145. package/dist/tests/github.test.js +0 -161
  146. package/dist/tests/graph-boost-ranking.test.js +0 -305
  147. package/dist/tests/graph-extraction.test.js +0 -282
  148. package/dist/tests/helpers/usage-events.js +0 -8
  149. package/dist/tests/index-pass-llm.test.js +0 -161
  150. package/dist/tests/indexer.test.js +0 -570
  151. package/dist/tests/info-command.test.js +0 -166
  152. package/dist/tests/init.test.js +0 -69
  153. package/dist/tests/install-script.test.js +0 -246
  154. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  155. package/dist/tests/issue-36-repro.test.js +0 -304
  156. package/dist/tests/issues-191-194.test.js +0 -160
  157. package/dist/tests/lesson-lint.test.js +0 -111
  158. package/dist/tests/llm-client.test.js +0 -115
  159. package/dist/tests/llm-feature-gate.test.js +0 -151
  160. package/dist/tests/llm.test.js +0 -139
  161. package/dist/tests/lockfile.test.js +0 -216
  162. package/dist/tests/manifest.test.js +0 -205
  163. package/dist/tests/markdown.test.js +0 -126
  164. package/dist/tests/matchers-unit.test.js +0 -189
  165. package/dist/tests/memory-inference.test.js +0 -299
  166. package/dist/tests/merge-scoring.test.js +0 -136
  167. package/dist/tests/metadata.test.js +0 -313
  168. package/dist/tests/migration-help.test.js +0 -89
  169. package/dist/tests/origin-resolve.test.js +0 -124
  170. package/dist/tests/output-baseline.test.js +0 -218
  171. package/dist/tests/output-shapes-unit.test.js +0 -478
  172. package/dist/tests/parallel-search.test.js +0 -272
  173. package/dist/tests/parameter-metadata.test.js +0 -365
  174. package/dist/tests/paths.test.js +0 -177
  175. package/dist/tests/progressive-disclosure.test.js +0 -280
  176. package/dist/tests/proposals.test.js +0 -279
  177. package/dist/tests/proposed-quality.test.js +0 -271
  178. package/dist/tests/provider-registry.test.js +0 -32
  179. package/dist/tests/ranking-regression.test.js +0 -548
  180. package/dist/tests/reflect-propose.test.js +0 -455
  181. package/dist/tests/registry-build-index.test.js +0 -394
  182. package/dist/tests/registry-cli.test.js +0 -290
  183. package/dist/tests/registry-index-v2.test.js +0 -430
  184. package/dist/tests/registry-install.test.js +0 -728
  185. package/dist/tests/registry-providers/parity.test.js +0 -189
  186. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  187. package/dist/tests/registry-providers/static-index.test.js +0 -238
  188. package/dist/tests/registry-resolve.test.js +0 -126
  189. package/dist/tests/registry-search.test.js +0 -923
  190. package/dist/tests/remember-frontmatter.test.js +0 -378
  191. package/dist/tests/remember-unit.test.js +0 -123
  192. package/dist/tests/ripgrep-install.test.js +0 -251
  193. package/dist/tests/ripgrep-resolve.test.js +0 -108
  194. package/dist/tests/ripgrep.test.js +0 -163
  195. package/dist/tests/save-command.test.js +0 -94
  196. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  197. package/dist/tests/scoring-pipeline.test.js +0 -648
  198. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  199. package/dist/tests/self-update.test.js +0 -442
  200. package/dist/tests/semantic-search-e2e.test.js +0 -512
  201. package/dist/tests/semantic-status.test.js +0 -471
  202. package/dist/tests/setup-run.integration.js +0 -877
  203. package/dist/tests/setup-wizard.test.js +0 -198
  204. package/dist/tests/setup.test.js +0 -131
  205. package/dist/tests/source-add.test.js +0 -11
  206. package/dist/tests/source-clone.test.js +0 -254
  207. package/dist/tests/source-manage.test.js +0 -366
  208. package/dist/tests/source-providers/filesystem.test.js +0 -82
  209. package/dist/tests/source-providers/git.test.js +0 -252
  210. package/dist/tests/source-providers/website.test.js +0 -128
  211. package/dist/tests/source-qa-fixes.test.js +0 -286
  212. package/dist/tests/source-registry.test.js +0 -350
  213. package/dist/tests/source-resolve.test.js +0 -100
  214. package/dist/tests/source-source.test.js +0 -281
  215. package/dist/tests/source.test.js +0 -533
  216. package/dist/tests/tar-utils-scan.test.js +0 -73
  217. package/dist/tests/toggle-components.test.js +0 -73
  218. package/dist/tests/usage-telemetry.test.js +0 -265
  219. package/dist/tests/utility-scoring.test.js +0 -558
  220. package/dist/tests/vault-load-error.test.js +0 -78
  221. package/dist/tests/vault-qa-fixes.test.js +0 -194
  222. package/dist/tests/vault.test.js +0 -429
  223. package/dist/tests/vector-search.test.js +0 -608
  224. package/dist/tests/walker.test.js +0 -252
  225. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  226. package/dist/tests/wave2-cluster-d.test.js +0 -180
  227. package/dist/tests/wave2-cluster-e.test.js +0 -179
  228. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  229. package/dist/tests/wiki.test.js +0 -529
  230. package/dist/tests/workflow-cli.test.js +0 -271
  231. package/dist/tests/workflow-markdown.test.js +0 -171
  232. package/dist/tests/workflow-path-escape.test.js +0 -132
  233. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  234. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  235. /package/dist/{src/commands → commands}/completions.js +0 -0
  236. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  237. /package/dist/{src/commands → commands}/curate.js +0 -0
  238. /package/dist/{src/commands → commands}/distill.js +0 -0
  239. /package/dist/{src/commands → commands}/events.js +0 -0
  240. /package/dist/{src/commands → commands}/history.js +0 -0
  241. /package/dist/{src/commands → commands}/info.js +0 -0
  242. /package/dist/{src/commands → commands}/init.js +0 -0
  243. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  244. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  245. /package/dist/{src/commands → commands}/proposal.js +0 -0
  246. /package/dist/{src/commands → commands}/propose.js +0 -0
  247. /package/dist/{src/commands → commands}/reflect.js +0 -0
  248. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  249. /package/dist/{src/commands → commands}/remember.js +0 -0
  250. /package/dist/{src/commands → commands}/search.js +0 -0
  251. /package/dist/{src/commands → commands}/self-update.js +0 -0
  252. /package/dist/{src/commands → commands}/show.js +0 -0
  253. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  254. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  255. /package/dist/{src/commands → commands}/vault.js +0 -0
  256. /package/dist/{src/core → core}/asset-ref.js +0 -0
  257. /package/dist/{src/core → core}/asset-registry.js +0 -0
  258. /package/dist/{src/core → core}/asset-spec.js +0 -0
  259. /package/dist/{src/core → core}/errors.js +0 -0
  260. /package/dist/{src/core → core}/events.js +0 -0
  261. /package/dist/{src/core → core}/frontmatter.js +0 -0
  262. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  263. /package/dist/{src/core → core}/markdown.js +0 -0
  264. /package/dist/{src/core → core}/paths.js +0 -0
  265. /package/dist/{src/core → core}/proposals.js +0 -0
  266. /package/dist/{src/core → core}/warn.js +0 -0
  267. /package/dist/{src/core → core}/write-source.js +0 -0
  268. /package/dist/{src/indexer → indexer}/db.js +0 -0
  269. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  270. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  271. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  272. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  273. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  274. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  275. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  276. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  277. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  278. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  279. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  280. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  281. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  282. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  283. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  284. /package/dist/{src/integrations → integrations}/github.js +0 -0
  285. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  286. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  287. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  288. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  289. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  290. /package/dist/{src/output → output}/context.js +0 -0
  291. /package/dist/{src/output → output}/renderers.js +0 -0
  292. /package/dist/{src/output → output}/shapes.js +0 -0
  293. /package/dist/{src/output → output}/text.js +0 -0
  294. /package/dist/{src/registry → registry}/build-index.js +0 -0
  295. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  296. /package/dist/{src/registry → registry}/factory.js +0 -0
  297. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  298. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  299. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  300. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  301. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  302. /package/dist/{src/registry → registry}/resolve.js +0 -0
  303. /package/dist/{src/registry → registry}/types.js +0 -0
  304. /package/dist/{src/setup → setup}/detect.js +0 -0
  305. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  306. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  307. /package/dist/{src/setup → setup}/steps.js +0 -0
  308. /package/dist/{src/sources → sources}/include.js +0 -0
  309. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  310. /package/dist/{src/sources → sources}/provider.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  314. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  315. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  316. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  317. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  318. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  319. /package/dist/{src/sources → sources}/resolve.js +0 -0
  320. /package/dist/{src/sources → sources}/types.js +0 -0
  321. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  322. /package/dist/{src/version.js → version.js} +0 -0
  323. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  324. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  325. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  326. /package/dist/{src/workflows → workflows}/db.js +0 -0
  327. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  328. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  329. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  330. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  331. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  332. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,647 +0,0 @@
1
- /**
2
- * akm-bench `evolve` — Track B longitudinal three-phase runner (spec §4 + §6.4).
3
- *
4
- * `runEvolve()` orchestrates three phases against a single eval-domain corpus:
5
- *
6
- * • Phase 1 (signal accumulation): run K seeds × tasks (train slice only)
7
- * under the akm arm, then record `akm feedback <gold_ref> --positive` /
8
- * `--negative` events per outcome.
9
- * • Phase 2 (evolve): for every asset whose negative feedback crosses the
10
- * threshold, invoke `akm distill` and `akm reflect`, validate every
11
- * resulting proposal via `akm proposal show --json`, then accept or
12
- * reject per lint outcome. After processing, rebuild the index.
13
- * • Phase 3 (re-evaluate): run the eval slice under THREE arms — `pre` (the
14
- * original un-evolved fixture), `post` (the evolved fixture), `synthetic`
15
- * (no stash, scratchpad-only "Bring Your Own Skills" prompt).
16
- *
17
- * Leakage prevention (spec §7.4): before invoking distill we compute the set
18
- * of eval-slice gold refs and pass it to `akm distill` via
19
- * `--exclude-feedback-from <csv>` (#267). `akmDistill` filters those
20
- * feedback events out of its LLM input before constructing the prompt.
21
- * Refs in the exclusion list still see distillation run — but distillation
22
- * runs from asset content alone, with no feedback signal that could have
23
- * leaked from the eval slice. The proposal log + Phase 1 feedback stream
24
- * are also filtered before computeProposalQualityMetrics ever sees them.
25
- *
26
- * Test seams: every external interaction is funnelled through one of three
27
- * injectable functions:
28
- * - `spawn` — forwarded to `runOne` (drives the agent harness).
29
- * - `akmCli(args, cwd, env)` — invoked for every `akm <verb>` subprocess.
30
- * - `materialiseStash` — when false, `runUtility` doesn't touch
31
- * fixtures/stashes/.
32
- * Tests inject fakes; production wires the real `Bun.spawnSync` and the
33
- * real `loadFixtureStash`.
34
- */
35
- import path from "node:path";
36
- import { loadFixtureStash } from "../fixtures/stashes/load";
37
- import { registerCleanup } from "./cleanup";
38
- import { computeLessonMetrics } from "./evolve-metrics";
39
- import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
40
- import { runUtility } from "./runner";
41
- import { benchMkdtemp } from "./tmp";
42
- /**
43
- * Drive the three-phase Track B runner.
44
- *
45
- * Pre: `tasks` is already filtered to one domain (or `all`). The runner
46
- * partitions internally on `task.slice`.
47
- *
48
- * Sandboxing: at the start of every real run the runner materialises one
49
- * dedicated tmp stash per fixture (the `evolveStash`) plus a fresh sibling
50
- * snapshot per fixture (the `preStash`). Phase 1 + Phase 2 pin
51
- * `AKM_STASH_DIR` to the appropriate `evolveStash` for every spawned `akm`
52
- * invocation; Phase 3's pre arm uses `preStash`, the post arm uses
53
- * `evolveStash`, and the synthetic arm uses no stash. The operator's real
54
- * `process.env.AKM_STASH_DIR` is never read or written by `runEvolve`. All
55
- * stashes are cleaned up in a top-level try/finally.
56
- */
57
- export async function runEvolve(options) {
58
- const seedsPerArm = options.seedsPerArm ?? 5;
59
- const budgetTokens = options.budgetTokens ?? 30000;
60
- const budgetWallMs = options.budgetWallMs ?? 120000;
61
- const negativeThreshold = options.negativeThreshold ?? { absoluteCount: 2, ratio: 0.5 };
62
- const materialiseStash = options.materialiseStash ?? true;
63
- const akmCli = options.akmCli ?? defaultAkmCli;
64
- const warnings = [];
65
- const trainTasks = options.tasks.filter((t) => effectiveSlice(t) === "train");
66
- const evalTasks = options.tasks.filter((t) => effectiveSlice(t) === "eval");
67
- // Use the first task's domain (or "all") as the corpus label. The CLI
68
- // already filtered to one domain; this is just for the report header.
69
- const domain = uniqueDomain(options.tasks);
70
- // ── Sandbox setup: per-fixture evolveStash + preStash. ───────────────────
71
- // We materialise one tmp stash per unique `task.stash` so Phase 1
72
- // accumulates feedback into the same on-disk stash that Phase 2 mutates,
73
- // and that Phase 3's post arm reads back. The operator's real
74
- // AKM_STASH_DIR is never touched. The pre arm gets a fresh snapshot of
75
- // the same starting fixture (no Phase 2 mutations applied).
76
- const fixtureNames = new Set();
77
- for (const t of options.tasks)
78
- fixtureNames.add(t.stash);
79
- const evolveStashes = new Map();
80
- const preStashes = new Map();
81
- const evolveDirByFixture = new Map();
82
- const preDirByFixture = new Map();
83
- /** Per-fixture XDG_CACHE_HOME dirs allocated for evolve-stash indexing. */
84
- const evolveCacheDirByFixture = new Map();
85
- // SIGINT trap (#267): every per-fixture stash registers its cleanup with
86
- // the shared registry so an external Ctrl-C reaps the tmp dirs even when
87
- // the top-level try/finally never runs. We deregister in the matching
88
- // finally block before invoking the synchronous cleanup so the handler
89
- // doesn't double-fire.
90
- const stashDeregistrations = [];
91
- if (materialiseStash) {
92
- for (const name of fixtureNames) {
93
- try {
94
- const evolved = loadFixtureStash(name, { skipIndex: false });
95
- evolveStashes.set(name, evolved);
96
- evolveDirByFixture.set(name, evolved.stashDir);
97
- // Allocate a per-fixture cache dir for the evolve-stash re-index.
98
- // `loadFixtureStash` used its own isolated XDG_CACHE_HOME; subsequent
99
- // `akmCli` calls (feedback, distill, reflect) must look in the same
100
- // cache. We allocate a fresh bench cache dir and pass it through
101
- // `indexEvolveStash` + `envForRef` so the FTS5 DB is in a known place.
102
- evolveCacheDirByFixture.set(name, benchMkdtemp(`akm-evolve-cache-${name}-`));
103
- stashDeregistrations.push(registerCleanup(() => {
104
- try {
105
- evolved.cleanup();
106
- }
107
- catch {
108
- /* swallow */
109
- }
110
- }));
111
- }
112
- catch (err) {
113
- warnings.push(`evolve: failed to materialise evolve stash for fixture "${name}": ${err.message}`);
114
- }
115
- try {
116
- const pre = loadFixtureStash(name, { skipIndex: false });
117
- preStashes.set(name, pre);
118
- preDirByFixture.set(name, pre.stashDir);
119
- stashDeregistrations.push(registerCleanup(() => {
120
- try {
121
- pre.cleanup();
122
- }
123
- catch {
124
- /* swallow */
125
- }
126
- }));
127
- }
128
- catch (err) {
129
- warnings.push(`evolve: failed to materialise pre stash for fixture "${name}": ${err.message}`);
130
- }
131
- }
132
- }
133
- // Resolve the evolveStash dir for a given asset ref. We map ref → fixture
134
- // by looking up which task's gold ref it matches; if no task owns it (or
135
- // multiple do, which is unusual), we fall back to the first available
136
- // evolveStash. The simple — and most common — case is a single fixture
137
- // per `--tasks <domain>` invocation.
138
- const refToFixture = new Map();
139
- for (const t of options.tasks) {
140
- if (t.goldRef)
141
- refToFixture.set(t.goldRef, t.stash);
142
- }
143
- const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
144
- const fallbackEvolveCacheDir = [...evolveCacheDirByFixture.values()][0];
145
- function envForRef(ref) {
146
- const baseEnv = { ...process.env };
147
- if (!materialiseStash) {
148
- // Tests opt out of fixture materialisation entirely; we still strip
149
- // the operator's AKM_STASH_DIR so the fake CLI sees a known sentinel.
150
- delete baseEnv.AKM_STASH_DIR;
151
- return baseEnv;
152
- }
153
- const fixture = ref ? refToFixture.get(ref) : undefined;
154
- const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
155
- const cacheDir = (fixture && evolveCacheDirByFixture.get(fixture)) ?? fallbackEvolveCacheDir;
156
- if (dir)
157
- baseEnv.AKM_STASH_DIR = dir;
158
- else
159
- delete baseEnv.AKM_STASH_DIR;
160
- if (cacheDir)
161
- baseEnv.XDG_CACHE_HOME = cacheDir;
162
- return baseEnv;
163
- }
164
- // ── Phase 1 pre-flight: index each evolve stash in its dedicated cache. ───
165
- // `loadFixtureStash` already ran `akm index` but used an isolated
166
- // XDG_CACHE_HOME that subsequent `akmCli` calls (feedback, distill, reflect)
167
- // cannot see. Re-running `akm index` here via `akmCli` with the same
168
- // AKM_STASH_DIR + XDG_CACHE_HOME that `envForRef` will produce ensures the
169
- // FTS5 database is populated where Phase 1 feedback will look.
170
- // Non-zero exit adds a warning but does not abort — Phase 1 can still run
171
- // with degraded feedback if the index step fails.
172
- if (materialiseStash) {
173
- const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
174
- for (const [fixtureName, stashDir] of evolveDirByFixture) {
175
- const cacheDir = evolveCacheDirByFixture.get(fixtureName);
176
- if (!cacheDir)
177
- continue;
178
- try {
179
- const result = await indexEvolveStash(stashDir, cacheDir, akmCli, phase1Cwd);
180
- if (!result.ok) {
181
- warnings.push(`evolve: pre-flight akm index failed for stash ${stashDir}: ${result.stderr.trim()}`);
182
- }
183
- }
184
- catch (err) {
185
- warnings.push(`evolve: pre-flight akm index threw for stash ${stashDir}: ${err.message}`);
186
- }
187
- }
188
- }
189
- let preReport;
190
- let postReport;
191
- let syntheticReport;
192
- let phase1Report;
193
- const feedbackLog = [];
194
- const proposalLog = [];
195
- try {
196
- // ── Phase 1: accumulate signal on the train slice (akm arm only). ─────
197
- phase1Report = await runUtility({
198
- tasks: trainTasks,
199
- arms: ["akm"],
200
- model: options.model,
201
- seedsPerArm,
202
- budgetTokens,
203
- budgetWallMs,
204
- slice: "train",
205
- ...(options.spawn ? { spawn: options.spawn } : {}),
206
- // We pre-materialised the per-fixture evolve stash above; tell the
207
- // runner to forward those dirs and skip its own per-task materialise.
208
- materialiseStash,
209
- ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
210
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
211
- ...(options.branch ? { branch: options.branch } : {}),
212
- ...(options.commit ? { commit: options.commit } : {}),
213
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
214
- });
215
- // Issue feedback events per (task, seed) outcome on the akm arm.
216
- const feedbackByRef = new Map();
217
- const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
218
- for (const run of phase1Report.akmRuns ?? []) {
219
- const taskMeta = options.tasks.find((t) => t.id === run.taskId);
220
- const goldRef = taskMeta?.goldRef;
221
- if (!goldRef)
222
- continue;
223
- if (run.outcome === "harness_error")
224
- continue;
225
- const signal = run.outcome === "pass" ? "positive" : "negative";
226
- const args = ["feedback", goldRef, signal === "positive" ? "--positive" : "--negative"];
227
- // Wrap in try/catch so a single throwing akmCli (e.g. subprocess
228
- // crash) cannot leave `feedbackByRef` partially populated and let
229
- // Phase 2 proceed on corrupt state.
230
- try {
231
- const cliResult = await akmCli(args, phase1Cwd, envForRef(goldRef));
232
- feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: cliResult.exitCode === 0 });
233
- if (cliResult.exitCode !== 0) {
234
- warnings.push(`phase1: akm feedback for ${goldRef} (${signal}) failed: ${cliResult.stderr.trim()}`);
235
- }
236
- }
237
- catch (err) {
238
- feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: false });
239
- warnings.push(`phase1.feedback_dispatch_failed: ${goldRef} ${err.message}`);
240
- }
241
- const counts = feedbackByRef.get(goldRef) ?? { positive: 0, negative: 0 };
242
- if (signal === "positive")
243
- counts.positive += 1;
244
- else
245
- counts.negative += 1;
246
- feedbackByRef.set(goldRef, counts);
247
- }
248
- // ── Phase 2: evolve. ────────────────────────────────────────────────────
249
- const evalGoldRefs = new Set();
250
- for (const t of evalTasks) {
251
- if (t.goldRef)
252
- evalGoldRefs.add(t.goldRef);
253
- }
254
- const refsToEvolve = [];
255
- for (const [ref, counts] of feedbackByRef.entries()) {
256
- if (crossesNegativeThreshold(counts, negativeThreshold))
257
- refsToEvolve.push(ref);
258
- }
259
- refsToEvolve.sort();
260
- // §7.4 leakage prevention (#267): instead of hard-skipping refs that
261
- // overlap eval-slice gold refs, we now pass the gold-ref set through
262
- // `--exclude-feedback-from` (and the matching env var) so `akm distill`
263
- // filters those events out of its LLM input. The behaviour collapses
264
- // back to "no useful feedback shown" for refs that ARE the gold ref —
265
- // distill then runs from asset content only, which is what we want.
266
- const evalGoldRefList = [...evalGoldRefs].sort();
267
- const excludeFeedbackCsv = evalGoldRefList.join(",");
268
- for (const ref of refsToEvolve) {
269
- // The env var fallback is the contract `akm distill` honours; it lets
270
- // the bench keep working even if a hypothetical caller invokes
271
- // distill via a wrapper that mangles flags.
272
- const evolveEnv = {
273
- ...envForRef(ref),
274
- AKM_BENCH_EXCLUDE_GOLD_REFS: excludeFeedbackCsv,
275
- ...(excludeFeedbackCsv ? { AKM_DISTILL_EXCLUDE_FEEDBACK_FROM: excludeFeedbackCsv } : {}),
276
- };
277
- // Pass the eval-gold list explicitly via the CLI flag so the contract
278
- // is observable in test logs (the env var is a fallback for harnesses
279
- // that strip flags). Reflect doesn't accept this flag — it's a distill
280
- // concern only.
281
- const distillArgs = ["distill", ref];
282
- if (excludeFeedbackCsv) {
283
- distillArgs.push("--exclude-feedback-from", excludeFeedbackCsv);
284
- }
285
- const distillResult = await akmCli(distillArgs, phase1Cwd, evolveEnv);
286
- if (distillResult.exitCode !== 0) {
287
- warnings.push(`phase2: akm distill ${ref} failed: ${distillResult.stderr.trim()}`);
288
- }
289
- else if (evalGoldRefs.has(ref) && excludeFeedbackCsv) {
290
- // Per-ref leakage info — replaces the previous "skipped" message.
291
- // Operator can audit which refs ran through the filter and confirm
292
- // distillation didn't see leaked feedback.
293
- warnings.push(`phase2: filtered eval-slice gold-ref feedback from distill input for ${ref} (--exclude-feedback-from ${excludeFeedbackCsv}).`);
294
- }
295
- const reflectResult = await akmCli(["reflect", ref], phase1Cwd, evolveEnv);
296
- if (reflectResult.exitCode !== 0) {
297
- // `reflect` requires `agent.default` to be configured — a missing
298
- // config is non-fatal for the bench; we record and continue.
299
- warnings.push(`phase2: akm reflect ${ref} skipped/failed: ${reflectResult.stderr.trim()}`);
300
- }
301
- }
302
- // Walk the proposal queue per fixture (each evolveStash has its own
303
- // proposal log on disk). When we materialised stashes we iterate every
304
- // fixture that produced proposals; in the common single-fixture case
305
- // this is one pass.
306
- const proposalFixtures = materialiseStash ? [...evolveDirByFixture.keys()] : [undefined];
307
- for (const fixtureName of proposalFixtures) {
308
- const proposalEnv = { ...process.env };
309
- if (materialiseStash && fixtureName) {
310
- const dir = evolveDirByFixture.get(fixtureName);
311
- if (dir)
312
- proposalEnv.AKM_STASH_DIR = dir;
313
- const cacheDir = evolveCacheDirByFixture.get(fixtureName);
314
- if (cacheDir)
315
- proposalEnv.XDG_CACHE_HOME = cacheDir;
316
- }
317
- else if (!materialiseStash) {
318
- delete proposalEnv.AKM_STASH_DIR;
319
- }
320
- const listResult = await akmCli(["proposal", "list", "--json"], phase1Cwd, proposalEnv);
321
- const proposals = parseProposalList(listResult.stdout);
322
- for (const p of proposals) {
323
- const showResult = await akmCli(["proposal", "show", p.id, "--json"], phase1Cwd, proposalEnv);
324
- const lintInfo = parseProposalShow(showResult.stdout);
325
- const lintPass = lintInfo.lintPass;
326
- if (lintPass) {
327
- const acceptResult = await akmCli(["proposal", "accept", p.id], phase1Cwd, proposalEnv);
328
- proposalLog.push({
329
- proposalId: p.id,
330
- assetRef: p.assetRef,
331
- kind: p.kind,
332
- lintPass: true,
333
- decision: acceptResult.exitCode === 0 ? "accept" : "reject",
334
- ...(acceptResult.exitCode === 0 ? {} : { rejectReason: `accept failed: ${acceptResult.stderr.trim()}` }),
335
- });
336
- }
337
- else {
338
- const reason = lintInfo.lintMessage ?? "lint failed";
339
- const rejectResult = await akmCli(["proposal", "reject", p.id, "--reason", `lint failed: ${reason}`], phase1Cwd, proposalEnv);
340
- proposalLog.push({
341
- proposalId: p.id,
342
- assetRef: p.assetRef,
343
- kind: p.kind,
344
- lintPass: false,
345
- decision: "reject",
346
- rejectReason: reason,
347
- });
348
- if (rejectResult.exitCode !== 0) {
349
- warnings.push(`phase2: akm proposal reject ${p.id} failed: ${rejectResult.stderr.trim()}`);
350
- }
351
- }
352
- }
353
- // Rebuild the index so accepted lessons surface in Phase 3.
354
- const indexResult = await akmCli(["index"], phase1Cwd, proposalEnv);
355
- if (indexResult.exitCode !== 0) {
356
- warnings.push(`phase2: akm index rebuild failed: ${indexResult.stderr.trim()}`);
357
- }
358
- }
359
- // ── Phase 3: re-evaluate (eval slice). ─────────────────────────────────
360
- // pre arm: fresh snapshot of the starting fixture (no Phase 2 mutations
361
- // applied). post arm: the mutated evolveStash so accepted lessons reach
362
- // the eval slice. synthetic arm: no stash.
363
- preReport = await runUtility({
364
- tasks: evalTasks,
365
- arms: ["akm"],
366
- model: options.model,
367
- seedsPerArm,
368
- budgetTokens,
369
- budgetWallMs,
370
- slice: "eval",
371
- ...(options.spawn ? { spawn: options.spawn } : {}),
372
- materialiseStash,
373
- ...(materialiseStash ? { stashDirByFixture: preDirByFixture } : {}),
374
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
375
- ...(options.branch ? { branch: options.branch } : {}),
376
- ...(options.commit ? { commit: options.commit } : {}),
377
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
378
- });
379
- postReport = await runUtility({
380
- tasks: evalTasks,
381
- arms: ["akm"],
382
- model: options.model,
383
- seedsPerArm,
384
- budgetTokens,
385
- budgetWallMs,
386
- slice: "eval",
387
- // Stamp arm metadata so spawn fakes can distinguish pre-vs-post via
388
- // an env probe. We thread it via a fresh `spawn` wrapper when one
389
- // was supplied.
390
- materialiseStash,
391
- ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
392
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
393
- ...(options.branch ? { branch: options.branch } : {}),
394
- ...(options.commit ? { commit: options.commit } : {}),
395
- ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
396
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
397
- });
398
- // synthetic: no stash. We pass a spawn wrapper that strips
399
- // AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
400
- // fakes (and a future real harness) can branch. #267 — also forward a
401
- // per-task scratchpad prompt via the runner's `buildPrompt` seam so the
402
- // synthetic arm actually exercises the BYOS prompt path rather than
403
- // relying on the noakm default.
404
- syntheticReport = await runUtility({
405
- tasks: evalTasks,
406
- arms: ["akm"],
407
- model: options.model,
408
- seedsPerArm,
409
- budgetTokens,
410
- budgetWallMs,
411
- slice: "eval",
412
- materialiseStash: false,
413
- buildPrompt: (task, _arm) => buildSyntheticPrompt(task.id),
414
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
415
- ...(options.branch ? { branch: options.branch } : {}),
416
- ...(options.commit ? { commit: options.commit } : {}),
417
- ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
418
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
419
- });
420
- }
421
- finally {
422
- // Deregister BEFORE running cleanup so a SIGINT during teardown
423
- // doesn't double-fire the cleanup fns (per cleanup.ts contract).
424
- for (const deregister of stashDeregistrations)
425
- deregister();
426
- for (const s of evolveStashes.values()) {
427
- try {
428
- s.cleanup();
429
- }
430
- catch {
431
- /* swallow — best-effort tmp cleanup */
432
- }
433
- }
434
- for (const s of preStashes.values()) {
435
- try {
436
- s.cleanup();
437
- }
438
- catch {
439
- /* swallow — best-effort tmp cleanup */
440
- }
441
- }
442
- }
443
- // ── Compute aggregates. ──────────────────────────────────────────────────
444
- const proposalsMetrics = computeProposalQualityMetrics(proposalLog);
445
- const longitudinal = computeLongitudinalMetrics(preReport, postReport, syntheticReport);
446
- const feedbackIntegrity = computeFeedbackIntegrity({ phase1: phase1Report, feedbackLog });
447
- // #264 — lesson quality + reuse metrics. The runner doesn't (yet) read
448
- // accepted lesson bodies off disk or load verifier source text; we pass
449
- // empty maps so the leakage check defaults to "low" until the read seam
450
- // lands. Reuse + negative-transfer attribution work today off the
451
- // pre/post arm `assetsLoaded` stream.
452
- const lessons = computeLessonMetrics({
453
- proposalLog,
454
- feedbackLog,
455
- preRuns: preReport.akmRuns ?? [],
456
- postRuns: postReport.akmRuns ?? [],
457
- });
458
- return {
459
- timestamp: options.timestamp ?? new Date().toISOString(),
460
- branch: options.branch ?? preReport.branch,
461
- commit: options.commit ?? preReport.commit,
462
- model: options.model,
463
- domain,
464
- seedsPerArm,
465
- feedbackLog,
466
- proposalLog,
467
- proposals: proposalsMetrics,
468
- lessons,
469
- longitudinal,
470
- feedbackIntegrity,
471
- phase1: phase1Report,
472
- arms: { pre: preReport, post: postReport, synthetic: syntheticReport },
473
- warnings: [
474
- ...warnings,
475
- ...phase1Report.warnings,
476
- ...preReport.warnings,
477
- ...postReport.warnings,
478
- ...syntheticReport.warnings,
479
- ],
480
- };
481
- }
482
/**
 * Default subprocess invoker: synchronously runs `bun run <cli.ts> <args>` in
 * `cwd` and hands back the captured output. Real runs use this; tests inject a
 * fake with the same signature.
 *
 * @param {string[]} args - Arguments appended after the CLI script path.
 * @param {string} cwd - Working directory for the child process.
 * @param {object} env - Variables layered on top of `process.env` (later keys win).
 * @returns {Promise<{ exitCode: number, stdout: string, stderr: string }>}
 *   `exitCode` is -1 when the spawn result carries no exit code.
 */
async function defaultAkmCli(args, cwd, env) {
    // Missing output buffers are reported as empty strings.
    const decode = (bytes) => (bytes ? new TextDecoder().decode(bytes) : "");
    const cliEntry = path.resolve(__dirname, "..", "..", "src", "cli.ts");
    const childEnv = { ...process.env, ...env };
    const child = Bun.spawnSync({
        cmd: ["bun", "run", cliEntry, ...args],
        cwd,
        env: childEnv,
        stdout: "pipe",
        stderr: "pipe",
    });
    return {
        exitCode: child.exitCode ?? -1,
        stdout: decode(child.stdout),
        stderr: decode(child.stderr),
    };
}
499
/**
 * Decide whether an asset's feedback tally crosses the negative threshold.
 *
 * Two independent triggers exist and either one is enough:
 *  - the negative count is at least `threshold.absoluteCount`, or
 *  - the share of negatives among all feedback is strictly above `threshold.ratio`.
 *
 * @param {{ positive: number, negative: number }} counts - Feedback tally.
 * @param {{ absoluteCount: number, ratio: number }} threshold - Trigger limits.
 * @returns {boolean} `true` when either trigger fires.
 */
function crossesNegativeThreshold(counts, threshold) {
    const { positive, negative } = counts;
    if (negative >= threshold.absoluteCount) {
        return true;
    }
    const total = positive + negative;
    // No feedback at all means there is no ratio to compare.
    return total !== 0 && negative / total > threshold.ratio;
}
513
/**
 * Best-effort train/eval partition for a task.
 *
 * An explicit, truthy `task.slice` always wins. Otherwise the task id is
 * folded into a 32-bit rolling hash (multiplier 31 over UTF-16 code units) and
 * the parity of that hash picks the slice: even -> "train", odd -> "eval".
 *
 * NOTE(review): the earlier comment described this as mirroring
 * `corpus.effectiveSlice` ("SHA-1 first byte parity"), inlined to avoid an
 * import cycle. The code here is not SHA-1 — confirm both partitions agree.
 *
 * @param {{ id: string, slice?: string }} task
 * @returns {string} The explicit slice, or "train" / "eval" from the id hash.
 */
function effectiveSlice(task) {
    const { slice, id } = task;
    if (slice) {
        return slice;
    }
    let hash = 0;
    let pos = 0;
    while (pos < id.length) {
        // `| 0` keeps the accumulator within signed 32-bit range.
        hash = (hash * 31 + id.charCodeAt(pos)) | 0;
        pos += 1;
    }
    const isEven = Math.abs(hash) % 2 === 0;
    return isEven ? "train" : "eval";
}
524
/**
 * Collapse the tasks' `domain` values into a single label.
 *
 * @param {Array<{ domain?: string }>} tasks
 * @returns {string} The shared domain when every task has the same non-nullish
 *   one; "all" otherwise (mixed domains, no tasks, or a nullish shared domain).
 */
function uniqueDomain(tasks) {
    const domains = new Set();
    for (const task of tasks) {
        domains.add(task.domain);
    }
    if (domains.size !== 1) {
        return "all";
    }
    const [only] = domains;
    return only ?? "all";
}
530
/**
 * Decorate a spawn function so every child it launches is tagged with the
 * evolve arm it belongs to.
 *
 * The returned function copies `opts.env` (never mutating the caller's object)
 * and then:
 *  - sets `BENCH_EVOLVE_ARM` to `arm`;
 *  - sets `BENCH_EVOLVE_SCRATCHPAD` to "1" when `scratchpad` is truthy;
 *  - sets `AKM_STASH_DIR` to `stashDir` when one is given;
 *  - removes `AKM_STASH_DIR` entirely for the "synthetic" arm, even if
 *    `stashDir` or the incoming env provided one.
 *
 * @param {Function} inner - Spawn function called as `inner(cmd, opts)`.
 * @param {string} arm - Arm label written to the child environment.
 * @param {string} [stashDir] - Optional stash directory to expose to the child.
 * @param {boolean} [scratchpad=false] - Whether to flag scratchpad mode.
 * @returns {Function} Wrapper with the same `(cmd, opts)` call shape as `inner`.
 */
function wrapSpawnWithArm(inner, arm, stashDir, scratchpad = false) {
    const isSynthetic = arm === "synthetic";
    return (cmd, opts) => {
        const childEnv = {
            ...(opts.env ?? {}),
            BENCH_EVOLVE_ARM: arm,
            ...(scratchpad ? { BENCH_EVOLVE_SCRATCHPAD: "1" } : {}),
            ...(stashDir ? { AKM_STASH_DIR: stashDir } : {}),
        };
        if (isSynthetic) {
            // The synthetic arm must never see a stash, whatever was passed in.
            delete childEnv.AKM_STASH_DIR;
        }
        return inner(cmd, { ...opts, env: childEnv });
    };
}
550
/**
 * Tolerant parser for `akm proposal list --json` stdout.
 *
 * Accepts either a bare JSON array or an object with a `proposals` array.
 * Anything else — blank or non-string output, invalid JSON, or a JSON value
 * such as `null` or a number — yields an empty list instead of throwing, so an
 * unexpected CLI payload cannot abort the caller.
 *
 * Per entry:
 *  - `id` must be a non-empty string;
 *  - the asset ref is the first string among `target_ref`, `targetRef`, `ref`;
 *  - `kind` (or, failing that, `source`) is normalised: "lesson"/"distill" ->
 *    "lesson", "revision"/"reflect" -> "revision", everything else -> "unknown".
 * Entries without an id or asset ref are dropped.
 *
 * @param {string} stdout - Raw stdout of the list command.
 * @returns {Array<{ id: string, assetRef: string, kind: string }>}
 */
function parseProposalList(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return [];
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch {
        return [];
    }
    // `parsed` may legitimately be null (stdout was the JSON text `null`), so
    // the `proposals` lookup has to be null-safe.
    const arr = Array.isArray(parsed)
        ? parsed
        : Array.isArray(parsed?.proposals)
            ? parsed.proposals
            : [];
    const out = [];
    for (const item of arr) {
        if (!item || typeof item !== "object")
            continue;
        const rec = item;
        const id = typeof rec.id === "string" ? rec.id : null;
        const assetRef = typeof rec.target_ref === "string"
            ? rec.target_ref
            : typeof rec.targetRef === "string"
                ? rec.targetRef
                : typeof rec.ref === "string"
                    ? rec.ref
                    : null;
        const kindRaw = typeof rec.kind === "string" ? rec.kind : typeof rec.source === "string" ? rec.source : "unknown";
        const kind = kindRaw === "lesson" || kindRaw === "distill"
            ? "lesson"
            : kindRaw === "revision" || kindRaw === "reflect"
                ? "revision"
                : "unknown";
        if (!id || !assetRef)
            continue;
        out.push({ id, assetRef, kind });
    }
    return out;
}
591
/**
 * Extract the lint verdict from `akm proposal show --json` stdout.
 *
 * Lint passes only when the payload says so explicitly via `lint_pass: true`,
 * `lintPass: true`, or `lint.pass: true`. When `lint.issues` is a non-empty
 * array its entries are joined with "; " into `lintMessage` (strings as-is,
 * objects via their `message`, anything else JSON-stringified).
 *
 * Never throws: blank or non-string output and invalid JSON produce a failing
 * verdict with an explanatory message, and a JSON value that is not an object
 * (for example `null`) is treated as a payload with no lint information.
 *
 * @param {string} stdout - Raw stdout of the show command.
 * @returns {{ lintPass: boolean, lintMessage?: string }}
 */
function parseProposalShow(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return { lintPass: false, lintMessage: "empty proposal show output" };
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch (err) {
        return { lintPass: false, lintMessage: `proposal show: parse error (${err.message})` };
    }
    // JSON.parse can return null or a primitive; reading properties off null
    // would throw, so fall back to an empty record (no lint data => lint fails).
    const record = parsed !== null && typeof parsed === "object" ? parsed : {};
    const lintRaw = record.lint;
    const hasLintObject = typeof lintRaw === "object" && lintRaw !== null;
    const lintPass = record.lint_pass === true ||
        record.lintPass === true ||
        (hasLintObject && lintRaw.pass === true);
    let lintMessage;
    if (hasLintObject) {
        const issues = lintRaw.issues;
        if (Array.isArray(issues) && issues.length > 0) {
            lintMessage = issues
                .map((i) => (typeof i === "string" ? i : (i?.message ?? JSON.stringify(i))))
                .join("; ");
        }
    }
    return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
}
616
/**
 * Run `akm index` against the evolve stash so the index lives in the cache
 * directory that later `akmCli` calls will use.
 *
 * Per the original notes, `loadFixtureStash` indexes into an isolated
 * XDG_CACHE_HOME that subsequent `akmCli` calls cannot see; re-indexing with
 * the same `stashDir` + `cacheDir` that `envForRef` forwards lets `akm
 * feedback` (and later `akm distill` / `akm reflect`) resolve refs.
 *
 * Exported for tests.
 *
 * @param {string} stashDir - Value exported to the child as `AKM_STASH_DIR`.
 * @param {string} cacheDir - Value exported to the child as `XDG_CACHE_HOME`.
 * @param {Function} akmCli - Invoker called as `akmCli(args, cwd, env)`.
 * @param {string} cwd - Working directory forwarded to `akmCli`.
 * @returns {Promise<{ ok: boolean, stderr: string }>} `ok` is true only for
 *   exit code 0; `stderr` is passed through unchanged in both cases.
 */
export async function indexEvolveStash(stashDir, cacheDir, akmCli, cwd) {
    // Overrides go last so they win over anything inherited from process.env.
    const indexEnv = Object.assign({}, process.env, {
        AKM_STASH_DIR: stashDir,
        XDG_CACHE_HOME: cacheDir,
    });
    const { exitCode, stderr } = await akmCli(["index"], cwd, indexEnv);
    return { ok: exitCode === 0, stderr };
}
638
/**
 * Build the prompt used for the synthetic ("Bring Your Own Skills") arm.
 * Exposed so tests can assert the exact prompt text.
 *
 * @param {string} taskId - Task identifier placed on the first line.
 * @returns {string} Five newline-separated lines: the task header, the arm
 *   label, and three lines of scratchpad instructions.
 */
export function buildSyntheticPrompt(taskId) {
    // Fixed instruction text; only the header line varies per task.
    const instructions = "Arm: synthetic (Bring Your Own Skills)\n" +
        "No akm stash is available. Before solving the task, write a short scratchpad of the skills\n" +
        "and steps you intend to use, then proceed. Cite the scratchpad in your trace so the verifier\n" +
        "can attribute the approach to your own reasoning rather than retrieved guidance.";
    return `Task: ${taskId}\n${instructions}`;
}