akm-cli 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (327)
  1. package/package.json +8 -8
  2. package/dist/tests/add-website-source.test.js +0 -119
  3. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  4. package/dist/tests/agent/agent-config.test.js +0 -221
  5. package/dist/tests/agent/agent-detect.test.js +0 -100
  6. package/dist/tests/agent/agent-spawn.test.js +0 -234
  7. package/dist/tests/agent-output.test.js +0 -186
  8. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  9. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  10. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  11. package/dist/tests/asset-ref.test.js +0 -192
  12. package/dist/tests/asset-registry.test.js +0 -103
  13. package/dist/tests/asset-spec.test.js +0 -241
  14. package/dist/tests/bench/attribution.test.js +0 -996
  15. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  16. package/dist/tests/bench/cleanup.js +0 -234
  17. package/dist/tests/bench/cleanup.test.js +0 -166
  18. package/dist/tests/bench/cli.js +0 -1018
  19. package/dist/tests/bench/cli.test.js +0 -445
  20. package/dist/tests/bench/compare.test.js +0 -556
  21. package/dist/tests/bench/corpus.js +0 -317
  22. package/dist/tests/bench/corpus.test.js +0 -258
  23. package/dist/tests/bench/doctor.js +0 -525
  24. package/dist/tests/bench/driver.js +0 -401
  25. package/dist/tests/bench/driver.test.js +0 -584
  26. package/dist/tests/bench/environment.js +0 -233
  27. package/dist/tests/bench/environment.test.js +0 -199
  28. package/dist/tests/bench/evolve-metrics.js +0 -179
  29. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  30. package/dist/tests/bench/evolve.js +0 -647
  31. package/dist/tests/bench/evolve.test.js +0 -624
  32. package/dist/tests/bench/failure-modes.test.js +0 -349
  33. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  34. package/dist/tests/bench/leakage.test.js +0 -228
  35. package/dist/tests/bench/learning-curve.test.js +0 -134
  36. package/dist/tests/bench/metrics.js +0 -2395
  37. package/dist/tests/bench/metrics.test.js +0 -1150
  38. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  39. package/dist/tests/bench/opencode-config.js +0 -194
  40. package/dist/tests/bench/opencode-config.test.js +0 -370
  41. package/dist/tests/bench/report.js +0 -1885
  42. package/dist/tests/bench/report.test.js +0 -1038
  43. package/dist/tests/bench/run-config.js +0 -355
  44. package/dist/tests/bench/run-config.test.js +0 -298
  45. package/dist/tests/bench/run-curate-test.js +0 -32
  46. package/dist/tests/bench/run-failing-tasks.js +0 -56
  47. package/dist/tests/bench/run-full-bench.js +0 -51
  48. package/dist/tests/bench/run-items36-targeted.js +0 -69
  49. package/dist/tests/bench/run-nano-quick.js +0 -42
  50. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  51. package/dist/tests/bench/runner.js +0 -699
  52. package/dist/tests/bench/runner.test.js +0 -958
  53. package/dist/tests/bench/search-bridge.test.js +0 -331
  54. package/dist/tests/bench/tmp.js +0 -131
  55. package/dist/tests/bench/trajectory.js +0 -116
  56. package/dist/tests/bench/trajectory.test.js +0 -127
  57. package/dist/tests/bench/verifier.js +0 -114
  58. package/dist/tests/bench/verifier.test.js +0 -118
  59. package/dist/tests/bench/workflow-evaluator.js +0 -557
  60. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  61. package/dist/tests/bench/workflow-spec.js +0 -345
  62. package/dist/tests/bench/workflow-spec.test.js +0 -363
  63. package/dist/tests/bench/workflow-trace.js +0 -472
  64. package/dist/tests/bench/workflow-trace.test.js +0 -254
  65. package/dist/tests/benchmark-search-quality.js +0 -536
  66. package/dist/tests/benchmark-suite.js +0 -1441
  67. package/dist/tests/capture-cli.test.js +0 -112
  68. package/dist/tests/cli-errors.test.js +0 -204
  69. package/dist/tests/commands/events.test.js +0 -370
  70. package/dist/tests/commands/history.test.js +0 -418
  71. package/dist/tests/commands/import.test.js +0 -103
  72. package/dist/tests/commands/proposal-cli.test.js +0 -209
  73. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  74. package/dist/tests/commands/remember.test.js +0 -97
  75. package/dist/tests/commands/scope-flags.test.js +0 -300
  76. package/dist/tests/commands/search.test.js +0 -537
  77. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  78. package/dist/tests/commands/show.test.js +0 -294
  79. package/dist/tests/common.test.js +0 -266
  80. package/dist/tests/completions.test.js +0 -142
  81. package/dist/tests/config-cli.test.js +0 -193
  82. package/dist/tests/config-llm-features.test.js +0 -139
  83. package/dist/tests/config.test.js +0 -569
  84. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  85. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  86. package/dist/tests/contracts/spec-helpers.js +0 -46
  87. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  88. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  89. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  90. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  91. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  92. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  93. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  94. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  95. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  96. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  97. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  98. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  99. package/dist/tests/core/write-source.test.js +0 -366
  100. package/dist/tests/curate-command.test.js +0 -87
  101. package/dist/tests/db-scoring.test.js +0 -201
  102. package/dist/tests/db.test.js +0 -654
  103. package/dist/tests/distill-cli-flag.test.js +0 -208
  104. package/dist/tests/distill.test.js +0 -515
  105. package/dist/tests/docker-install.test.js +0 -120
  106. package/dist/tests/e2e.test.js +0 -1419
  107. package/dist/tests/embedder.test.js +0 -340
  108. package/dist/tests/embedding-model-config.test.js +0 -379
  109. package/dist/tests/feedback-command.test.js +0 -172
  110. package/dist/tests/file-context.test.js +0 -552
  111. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  112. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  113. package/dist/tests/fixtures/stashes/load.js +0 -166
  114. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  115. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  116. package/dist/tests/frontmatter.test.js +0 -190
  117. package/dist/tests/fts-field-weighting.test.js +0 -254
  118. package/dist/tests/fuzzy-search.test.js +0 -230
  119. package/dist/tests/git-provider-clone.test.js +0 -45
  120. package/dist/tests/github.test.js +0 -161
  121. package/dist/tests/graph-boost-ranking.test.js +0 -305
  122. package/dist/tests/graph-extraction.test.js +0 -282
  123. package/dist/tests/helpers/usage-events.js +0 -8
  124. package/dist/tests/index-pass-llm.test.js +0 -161
  125. package/dist/tests/indexer.test.js +0 -570
  126. package/dist/tests/info-command.test.js +0 -166
  127. package/dist/tests/init.test.js +0 -69
  128. package/dist/tests/install-script.test.js +0 -246
  129. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  130. package/dist/tests/issue-36-repro.test.js +0 -304
  131. package/dist/tests/issues-191-194.test.js +0 -160
  132. package/dist/tests/lesson-lint.test.js +0 -111
  133. package/dist/tests/llm-client.test.js +0 -115
  134. package/dist/tests/llm-feature-gate.test.js +0 -151
  135. package/dist/tests/llm.test.js +0 -139
  136. package/dist/tests/lockfile.test.js +0 -216
  137. package/dist/tests/manifest.test.js +0 -205
  138. package/dist/tests/markdown.test.js +0 -126
  139. package/dist/tests/matchers-unit.test.js +0 -189
  140. package/dist/tests/memory-inference.test.js +0 -299
  141. package/dist/tests/merge-scoring.test.js +0 -136
  142. package/dist/tests/metadata.test.js +0 -313
  143. package/dist/tests/migration-help.test.js +0 -89
  144. package/dist/tests/origin-resolve.test.js +0 -124
  145. package/dist/tests/output-baseline.test.js +0 -218
  146. package/dist/tests/output-shapes-unit.test.js +0 -478
  147. package/dist/tests/parallel-search.test.js +0 -272
  148. package/dist/tests/parameter-metadata.test.js +0 -365
  149. package/dist/tests/paths.test.js +0 -177
  150. package/dist/tests/progressive-disclosure.test.js +0 -280
  151. package/dist/tests/proposals.test.js +0 -279
  152. package/dist/tests/proposed-quality.test.js +0 -271
  153. package/dist/tests/provider-registry.test.js +0 -32
  154. package/dist/tests/ranking-regression.test.js +0 -548
  155. package/dist/tests/reflect-propose.test.js +0 -455
  156. package/dist/tests/registry-build-index.test.js +0 -394
  157. package/dist/tests/registry-cli.test.js +0 -290
  158. package/dist/tests/registry-index-v2.test.js +0 -430
  159. package/dist/tests/registry-install.test.js +0 -728
  160. package/dist/tests/registry-providers/parity.test.js +0 -189
  161. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  162. package/dist/tests/registry-providers/static-index.test.js +0 -238
  163. package/dist/tests/registry-resolve.test.js +0 -126
  164. package/dist/tests/registry-search.test.js +0 -923
  165. package/dist/tests/remember-frontmatter.test.js +0 -378
  166. package/dist/tests/remember-unit.test.js +0 -123
  167. package/dist/tests/ripgrep-install.test.js +0 -251
  168. package/dist/tests/ripgrep-resolve.test.js +0 -108
  169. package/dist/tests/ripgrep.test.js +0 -163
  170. package/dist/tests/save-command.test.js +0 -94
  171. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  172. package/dist/tests/scoring-pipeline.test.js +0 -648
  173. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  174. package/dist/tests/self-update.test.js +0 -442
  175. package/dist/tests/semantic-search-e2e.test.js +0 -512
  176. package/dist/tests/semantic-status.test.js +0 -471
  177. package/dist/tests/setup-run.integration.js +0 -877
  178. package/dist/tests/setup-wizard.test.js +0 -198
  179. package/dist/tests/setup.test.js +0 -131
  180. package/dist/tests/source-add.test.js +0 -11
  181. package/dist/tests/source-clone.test.js +0 -254
  182. package/dist/tests/source-manage.test.js +0 -366
  183. package/dist/tests/source-providers/filesystem.test.js +0 -82
  184. package/dist/tests/source-providers/git.test.js +0 -252
  185. package/dist/tests/source-providers/website.test.js +0 -128
  186. package/dist/tests/source-qa-fixes.test.js +0 -286
  187. package/dist/tests/source-registry.test.js +0 -350
  188. package/dist/tests/source-resolve.test.js +0 -100
  189. package/dist/tests/source-source.test.js +0 -281
  190. package/dist/tests/source.test.js +0 -533
  191. package/dist/tests/tar-utils-scan.test.js +0 -73
  192. package/dist/tests/toggle-components.test.js +0 -73
  193. package/dist/tests/usage-telemetry.test.js +0 -265
  194. package/dist/tests/utility-scoring.test.js +0 -558
  195. package/dist/tests/vault-load-error.test.js +0 -78
  196. package/dist/tests/vault-qa-fixes.test.js +0 -194
  197. package/dist/tests/vault.test.js +0 -429
  198. package/dist/tests/vector-search.test.js +0 -608
  199. package/dist/tests/walker.test.js +0 -252
  200. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  201. package/dist/tests/wave2-cluster-d.test.js +0 -180
  202. package/dist/tests/wave2-cluster-e.test.js +0 -179
  203. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  204. package/dist/tests/wiki.test.js +0 -529
  205. package/dist/tests/workflow-cli.test.js +0 -271
  206. package/dist/tests/workflow-markdown.test.js +0 -171
  207. package/dist/tests/workflow-path-escape.test.js +0 -132
  208. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  209. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  210. /package/dist/{src/cli.js → cli.js} +0 -0
  211. /package/dist/{src/commands → commands}/completions.js +0 -0
  212. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  213. /package/dist/{src/commands → commands}/curate.js +0 -0
  214. /package/dist/{src/commands → commands}/distill.js +0 -0
  215. /package/dist/{src/commands → commands}/events.js +0 -0
  216. /package/dist/{src/commands → commands}/history.js +0 -0
  217. /package/dist/{src/commands → commands}/info.js +0 -0
  218. /package/dist/{src/commands → commands}/init.js +0 -0
  219. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  220. /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
  221. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  222. /package/dist/{src/commands → commands}/proposal.js +0 -0
  223. /package/dist/{src/commands → commands}/propose.js +0 -0
  224. /package/dist/{src/commands → commands}/reflect.js +0 -0
  225. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  226. /package/dist/{src/commands → commands}/remember.js +0 -0
  227. /package/dist/{src/commands → commands}/search.js +0 -0
  228. /package/dist/{src/commands → commands}/self-update.js +0 -0
  229. /package/dist/{src/commands → commands}/show.js +0 -0
  230. /package/dist/{src/commands → commands}/source-add.js +0 -0
  231. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  232. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  233. /package/dist/{src/commands → commands}/vault.js +0 -0
  234. /package/dist/{src/core → core}/asset-ref.js +0 -0
  235. /package/dist/{src/core → core}/asset-registry.js +0 -0
  236. /package/dist/{src/core → core}/asset-spec.js +0 -0
  237. /package/dist/{src/core → core}/common.js +0 -0
  238. /package/dist/{src/core → core}/config.js +0 -0
  239. /package/dist/{src/core → core}/errors.js +0 -0
  240. /package/dist/{src/core → core}/events.js +0 -0
  241. /package/dist/{src/core → core}/frontmatter.js +0 -0
  242. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  243. /package/dist/{src/core → core}/markdown.js +0 -0
  244. /package/dist/{src/core → core}/paths.js +0 -0
  245. /package/dist/{src/core → core}/proposals.js +0 -0
  246. /package/dist/{src/core → core}/warn.js +0 -0
  247. /package/dist/{src/core → core}/write-source.js +0 -0
  248. /package/dist/{src/indexer → indexer}/db-search.js +0 -0
  249. /package/dist/{src/indexer → indexer}/db.js +0 -0
  250. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  251. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  252. /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
  253. /package/dist/{src/indexer → indexer}/indexer.js +0 -0
  254. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  255. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  256. /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
  257. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  258. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  259. /package/dist/{src/indexer → indexer}/search-source.js +0 -0
  260. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  261. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  262. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  263. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  264. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  265. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  266. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  267. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  268. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  269. /package/dist/{src/integrations → integrations}/github.js +0 -0
  270. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  271. /package/dist/{src/llm → llm}/client.js +0 -0
  272. /package/dist/{src/llm → llm}/embedder.js +0 -0
  273. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  274. /package/dist/{src/llm → llm}/embedders/local.js +0 -0
  275. /package/dist/{src/llm → llm}/embedders/remote.js +0 -0
  276. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  277. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  278. /package/dist/{src/llm → llm}/graph-extract.js +0 -0
  279. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  280. /package/dist/{src/llm → llm}/memory-infer.js +0 -0
  281. /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
  282. /package/dist/{src/output → output}/cli-hints.js +0 -0
  283. /package/dist/{src/output → output}/context.js +0 -0
  284. /package/dist/{src/output → output}/renderers.js +0 -0
  285. /package/dist/{src/output → output}/shapes.js +0 -0
  286. /package/dist/{src/output → output}/text.js +0 -0
  287. /package/dist/{src/registry → registry}/build-index.js +0 -0
  288. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  289. /package/dist/{src/registry → registry}/factory.js +0 -0
  290. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  291. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  292. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  293. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  294. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  295. /package/dist/{src/registry → registry}/resolve.js +0 -0
  296. /package/dist/{src/registry → registry}/types.js +0 -0
  297. /package/dist/{src/setup → setup}/detect.js +0 -0
  298. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  299. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  300. /package/dist/{src/setup → setup}/setup.js +0 -0
  301. /package/dist/{src/setup → setup}/steps.js +0 -0
  302. /package/dist/{src/sources → sources}/include.js +0 -0
  303. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  304. /package/dist/{src/sources → sources}/provider.js +0 -0
  305. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  306. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  307. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  308. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  309. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  310. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/website.js +0 -0
  314. /package/dist/{src/sources → sources}/resolve.js +0 -0
  315. /package/dist/{src/sources → sources}/types.js +0 -0
  316. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  317. /package/dist/{src/version.js → version.js} +0 -0
  318. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  319. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  320. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  321. /package/dist/{src/workflows → workflows}/db.js +0 -0
  322. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  323. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  324. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  325. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  326. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  327. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,647 +0,0 @@
1
- /**
2
- * akm-bench `evolve` — Track B longitudinal three-phase runner (spec §4 + §6.4).
3
- *
4
- * `runEvolve()` orchestrates three phases against a single eval-domain corpus:
5
- *
6
- * • Phase 1 (signal accumulation): run K seeds × tasks (train slice only)
7
- * under the akm arm, then record `akm feedback <gold_ref> --positive` /
8
- * `--negative` events per outcome.
9
- * • Phase 2 (evolve): for every asset whose negative feedback crosses the
10
- * threshold, invoke `akm distill` and `akm reflect`, validate every
11
- * resulting proposal via `akm proposal show --json`, then accept or
12
- * reject per lint outcome. After processing, rebuild the index.
13
- * • Phase 3 (re-evaluate): run the eval slice under THREE arms — `pre` (the
14
- * original un-evolved fixture), `post` (the evolved fixture), `synthetic`
15
- * (no stash, scratchpad-only "Bring Your Own Skills" prompt).
16
- *
17
- * Leakage prevention (spec §7.4): before invoking distill we compute the set
18
- * of eval-slice gold refs and pass it to `akm distill` via
19
- * `--exclude-feedback-from <csv>` (#267). `akmDistill` filters those
20
- * feedback events out of its LLM input before constructing the prompt.
21
- * Refs in the exclusion list still see distillation run — but distillation
22
- * runs from asset content alone, with no feedback signal that could have
23
- * leaked from the eval slice. The proposal log + Phase 1 feedback stream
24
- * are also filtered before computeProposalQualityMetrics ever sees them.
25
- *
26
- * Test seams: every external interaction is funnelled through one of three
27
- * injectable functions:
28
- * - `spawn` — forwarded to `runOne` (drives the agent harness).
29
- * - `akmCli(args, cwd, env)` — invoked for every `akm <verb>` subprocess.
30
- * - `materialiseStash` — when false, `runUtility` doesn't touch
31
- * fixtures/stashes/.
32
- * Tests inject fakes; production wires the real `Bun.spawnSync` and the
33
- * real `loadFixtureStash`.
34
- */
35
- import path from "node:path";
36
- import { loadFixtureStash } from "../fixtures/stashes/load";
37
- import { registerCleanup } from "./cleanup";
38
- import { computeLessonMetrics } from "./evolve-metrics";
39
- import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
40
- import { runUtility } from "./runner";
41
- import { benchMkdtemp } from "./tmp";
42
- /**
43
- * Drive the three-phase Track B runner.
44
- *
45
- * Pre: `tasks` is already filtered to one domain (or `all`). The runner
46
- * partitions internally on `task.slice`.
47
- *
48
- * Sandboxing: at the start of every real run the runner materialises one
49
- * dedicated tmp stash per fixture (the `evolveStash`) plus a fresh sibling
50
- * snapshot per fixture (the `preStash`). Phase 1 + Phase 2 pin
51
- * `AKM_STASH_DIR` to the appropriate `evolveStash` for every spawned `akm`
52
- * invocation; Phase 3's pre arm uses `preStash`, the post arm uses
53
- * `evolveStash`, and the synthetic arm uses no stash. The operator's real
54
- * `process.env.AKM_STASH_DIR` is never read or written by `runEvolve`. All
55
- * stashes are cleaned up in a top-level try/finally.
56
- */
57
- export async function runEvolve(options) {
58
- const seedsPerArm = options.seedsPerArm ?? 5;
59
- const budgetTokens = options.budgetTokens ?? 30000;
60
- const budgetWallMs = options.budgetWallMs ?? 120000;
61
- const negativeThreshold = options.negativeThreshold ?? { absoluteCount: 2, ratio: 0.5 };
62
- const materialiseStash = options.materialiseStash ?? true;
63
- const akmCli = options.akmCli ?? defaultAkmCli;
64
- const warnings = [];
65
- const trainTasks = options.tasks.filter((t) => effectiveSlice(t) === "train");
66
- const evalTasks = options.tasks.filter((t) => effectiveSlice(t) === "eval");
67
- // Use the first task's domain (or "all") as the corpus label. The CLI
68
- // already filtered to one domain; this is just for the report header.
69
- const domain = uniqueDomain(options.tasks);
70
- // ── Sandbox setup: per-fixture evolveStash + preStash. ───────────────────
71
- // We materialise one tmp stash per unique `task.stash` so Phase 1
72
- // accumulates feedback into the same on-disk stash that Phase 2 mutates,
73
- // and that Phase 3's post arm reads back. The operator's real
74
- // AKM_STASH_DIR is never touched. The pre arm gets a fresh snapshot of
75
- // the same starting fixture (no Phase 2 mutations applied).
76
- const fixtureNames = new Set();
77
- for (const t of options.tasks)
78
- fixtureNames.add(t.stash);
79
- const evolveStashes = new Map();
80
- const preStashes = new Map();
81
- const evolveDirByFixture = new Map();
82
- const preDirByFixture = new Map();
83
- /** Per-fixture XDG_CACHE_HOME dirs allocated for evolve-stash indexing. */
84
- const evolveCacheDirByFixture = new Map();
85
- // SIGINT trap (#267): every per-fixture stash registers its cleanup with
86
- // the shared registry so an external Ctrl-C reaps the tmp dirs even when
87
- // the top-level try/finally never runs. We deregister in the matching
88
- // finally block before invoking the synchronous cleanup so the handler
89
- // doesn't double-fire.
90
- const stashDeregistrations = [];
91
- if (materialiseStash) {
92
- for (const name of fixtureNames) {
93
- try {
94
- const evolved = loadFixtureStash(name, { skipIndex: false });
95
- evolveStashes.set(name, evolved);
96
- evolveDirByFixture.set(name, evolved.stashDir);
97
- // Allocate a per-fixture cache dir for the evolve-stash re-index.
98
- // `loadFixtureStash` used its own isolated XDG_CACHE_HOME; subsequent
99
- // `akmCli` calls (feedback, distill, reflect) must look in the same
100
- // cache. We allocate a fresh bench cache dir and pass it through
101
- // `indexEvolveStash` + `envForRef` so the FTS5 DB is in a known place.
102
- evolveCacheDirByFixture.set(name, benchMkdtemp(`akm-evolve-cache-${name}-`));
103
- stashDeregistrations.push(registerCleanup(() => {
104
- try {
105
- evolved.cleanup();
106
- }
107
- catch {
108
- /* swallow */
109
- }
110
- }));
111
- }
112
- catch (err) {
113
- warnings.push(`evolve: failed to materialise evolve stash for fixture "${name}": ${err.message}`);
114
- }
115
- try {
116
- const pre = loadFixtureStash(name, { skipIndex: false });
117
- preStashes.set(name, pre);
118
- preDirByFixture.set(name, pre.stashDir);
119
- stashDeregistrations.push(registerCleanup(() => {
120
- try {
121
- pre.cleanup();
122
- }
123
- catch {
124
- /* swallow */
125
- }
126
- }));
127
- }
128
- catch (err) {
129
- warnings.push(`evolve: failed to materialise pre stash for fixture "${name}": ${err.message}`);
130
- }
131
- }
132
- }
133
- // Resolve the evolveStash dir for a given asset ref. We map ref → fixture
134
- // by looking up which task's gold ref it matches; if no task owns it (or
135
- // multiple do, which is unusual), we fall back to the first available
136
- // evolveStash. The simple — and most common — case is a single fixture
137
- // per `--tasks <domain>` invocation.
138
- const refToFixture = new Map();
139
- for (const t of options.tasks) {
140
- if (t.goldRef)
141
- refToFixture.set(t.goldRef, t.stash);
142
- }
143
- const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
144
- const fallbackEvolveCacheDir = [...evolveCacheDirByFixture.values()][0];
145
- function envForRef(ref) {
146
- const baseEnv = { ...process.env };
147
- if (!materialiseStash) {
148
- // Tests opt out of fixture materialisation entirely; we still strip
149
- // the operator's AKM_STASH_DIR so the fake CLI sees a known sentinel.
150
- delete baseEnv.AKM_STASH_DIR;
151
- return baseEnv;
152
- }
153
- const fixture = ref ? refToFixture.get(ref) : undefined;
154
- const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
155
- const cacheDir = (fixture && evolveCacheDirByFixture.get(fixture)) ?? fallbackEvolveCacheDir;
156
- if (dir)
157
- baseEnv.AKM_STASH_DIR = dir;
158
- else
159
- delete baseEnv.AKM_STASH_DIR;
160
- if (cacheDir)
161
- baseEnv.XDG_CACHE_HOME = cacheDir;
162
- return baseEnv;
163
- }
164
- // ── Phase 1 pre-flight: index each evolve stash in its dedicated cache. ───
165
- // `loadFixtureStash` already ran `akm index` but used an isolated
166
- // XDG_CACHE_HOME that subsequent `akmCli` calls (feedback, distill, reflect)
167
- // cannot see. Re-running `akm index` here via `akmCli` with the same
168
- // AKM_STASH_DIR + XDG_CACHE_HOME that `envForRef` will produce ensures the
169
- // FTS5 database is populated where Phase 1 feedback will look.
170
- // Non-zero exit adds a warning but does not abort — Phase 1 can still run
171
- // with degraded feedback if the index step fails.
172
- if (materialiseStash) {
173
- const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
174
- for (const [fixtureName, stashDir] of evolveDirByFixture) {
175
- const cacheDir = evolveCacheDirByFixture.get(fixtureName);
176
- if (!cacheDir)
177
- continue;
178
- try {
179
- const result = await indexEvolveStash(stashDir, cacheDir, akmCli, phase1Cwd);
180
- if (!result.ok) {
181
- warnings.push(`evolve: pre-flight akm index failed for stash ${stashDir}: ${result.stderr.trim()}`);
182
- }
183
- }
184
- catch (err) {
185
- warnings.push(`evolve: pre-flight akm index threw for stash ${stashDir}: ${err.message}`);
186
- }
187
- }
188
- }
189
- let preReport;
190
- let postReport;
191
- let syntheticReport;
192
- let phase1Report;
193
- const feedbackLog = [];
194
- const proposalLog = [];
195
- try {
196
- // ── Phase 1: accumulate signal on the train slice (akm arm only). ─────
197
- phase1Report = await runUtility({
198
- tasks: trainTasks,
199
- arms: ["akm"],
200
- model: options.model,
201
- seedsPerArm,
202
- budgetTokens,
203
- budgetWallMs,
204
- slice: "train",
205
- ...(options.spawn ? { spawn: options.spawn } : {}),
206
- // We pre-materialised the per-fixture evolve stash above; tell the
207
- // runner to forward those dirs and skip its own per-task materialise.
208
- materialiseStash,
209
- ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
210
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
211
- ...(options.branch ? { branch: options.branch } : {}),
212
- ...(options.commit ? { commit: options.commit } : {}),
213
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
214
- });
215
- // Issue feedback events per (task, seed) outcome on the akm arm.
216
- const feedbackByRef = new Map();
217
- const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
218
- for (const run of phase1Report.akmRuns ?? []) {
219
- const taskMeta = options.tasks.find((t) => t.id === run.taskId);
220
- const goldRef = taskMeta?.goldRef;
221
- if (!goldRef)
222
- continue;
223
- if (run.outcome === "harness_error")
224
- continue;
225
- const signal = run.outcome === "pass" ? "positive" : "negative";
226
- const args = ["feedback", goldRef, signal === "positive" ? "--positive" : "--negative"];
227
- // Wrap in try/catch so a single throwing akmCli (e.g. subprocess
228
- // crash) cannot leave `feedbackByRef` partially populated and let
229
- // Phase 2 proceed on corrupt state.
230
- try {
231
- const cliResult = await akmCli(args, phase1Cwd, envForRef(goldRef));
232
- feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: cliResult.exitCode === 0 });
233
- if (cliResult.exitCode !== 0) {
234
- warnings.push(`phase1: akm feedback for ${goldRef} (${signal}) failed: ${cliResult.stderr.trim()}`);
235
- }
236
- }
237
- catch (err) {
238
- feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: false });
239
- warnings.push(`phase1.feedback_dispatch_failed: ${goldRef} ${err.message}`);
240
- }
241
- const counts = feedbackByRef.get(goldRef) ?? { positive: 0, negative: 0 };
242
- if (signal === "positive")
243
- counts.positive += 1;
244
- else
245
- counts.negative += 1;
246
- feedbackByRef.set(goldRef, counts);
247
- }
248
- // ── Phase 2: evolve. ────────────────────────────────────────────────────
249
- const evalGoldRefs = new Set();
250
- for (const t of evalTasks) {
251
- if (t.goldRef)
252
- evalGoldRefs.add(t.goldRef);
253
- }
254
- const refsToEvolve = [];
255
- for (const [ref, counts] of feedbackByRef.entries()) {
256
- if (crossesNegativeThreshold(counts, negativeThreshold))
257
- refsToEvolve.push(ref);
258
- }
259
- refsToEvolve.sort();
260
- // §7.4 leakage prevention (#267): instead of hard-skipping refs that
261
- // overlap eval-slice gold refs, we now pass the gold-ref set through
262
- // `--exclude-feedback-from` (and the matching env var) so `akm distill`
263
- // filters those events out of its LLM input. The behaviour collapses
264
- // back to "no useful feedback shown" for refs that ARE the gold ref —
265
- // distill then runs from asset content only, which is what we want.
266
- const evalGoldRefList = [...evalGoldRefs].sort();
267
- const excludeFeedbackCsv = evalGoldRefList.join(",");
268
- for (const ref of refsToEvolve) {
269
- // The env var fallback is the contract `akm distill` honours; it lets
270
- // the bench keep working even if a hypothetical caller invokes
271
- // distill via a wrapper that mangles flags.
272
- const evolveEnv = {
273
- ...envForRef(ref),
274
- AKM_BENCH_EXCLUDE_GOLD_REFS: excludeFeedbackCsv,
275
- ...(excludeFeedbackCsv ? { AKM_DISTILL_EXCLUDE_FEEDBACK_FROM: excludeFeedbackCsv } : {}),
276
- };
277
- // Pass the eval-gold list explicitly via the CLI flag so the contract
278
- // is observable in test logs (the env var is a fallback for harnesses
279
- // that strip flags). Reflect doesn't accept this flag — it's a distill
280
- // concern only.
281
- const distillArgs = ["distill", ref];
282
- if (excludeFeedbackCsv) {
283
- distillArgs.push("--exclude-feedback-from", excludeFeedbackCsv);
284
- }
285
- const distillResult = await akmCli(distillArgs, phase1Cwd, evolveEnv);
286
- if (distillResult.exitCode !== 0) {
287
- warnings.push(`phase2: akm distill ${ref} failed: ${distillResult.stderr.trim()}`);
288
- }
289
- else if (evalGoldRefs.has(ref) && excludeFeedbackCsv) {
290
- // Per-ref leakage info — replaces the previous "skipped" message.
291
- // Operator can audit which refs ran through the filter and confirm
292
- // distillation didn't see leaked feedback.
293
- warnings.push(`phase2: filtered eval-slice gold-ref feedback from distill input for ${ref} (--exclude-feedback-from ${excludeFeedbackCsv}).`);
294
- }
295
- const reflectResult = await akmCli(["reflect", ref], phase1Cwd, evolveEnv);
296
- if (reflectResult.exitCode !== 0) {
297
- // `reflect` requires `agent.default` to be configured — a missing
298
- // config is non-fatal for the bench; we record and continue.
299
- warnings.push(`phase2: akm reflect ${ref} skipped/failed: ${reflectResult.stderr.trim()}`);
300
- }
301
- }
302
- // Walk the proposal queue per fixture (each evolveStash has its own
303
- // proposal log on disk). When we materialised stashes we iterate every
304
- // fixture that produced proposals; in the common single-fixture case
305
- // this is one pass.
306
- const proposalFixtures = materialiseStash ? [...evolveDirByFixture.keys()] : [undefined];
307
- for (const fixtureName of proposalFixtures) {
308
- const proposalEnv = { ...process.env };
309
- if (materialiseStash && fixtureName) {
310
- const dir = evolveDirByFixture.get(fixtureName);
311
- if (dir)
312
- proposalEnv.AKM_STASH_DIR = dir;
313
- const cacheDir = evolveCacheDirByFixture.get(fixtureName);
314
- if (cacheDir)
315
- proposalEnv.XDG_CACHE_HOME = cacheDir;
316
- }
317
- else if (!materialiseStash) {
318
- delete proposalEnv.AKM_STASH_DIR;
319
- }
320
- const listResult = await akmCli(["proposal", "list", "--json"], phase1Cwd, proposalEnv);
321
- const proposals = parseProposalList(listResult.stdout);
322
- for (const p of proposals) {
323
- const showResult = await akmCli(["proposal", "show", p.id, "--json"], phase1Cwd, proposalEnv);
324
- const lintInfo = parseProposalShow(showResult.stdout);
325
- const lintPass = lintInfo.lintPass;
326
- if (lintPass) {
327
- const acceptResult = await akmCli(["proposal", "accept", p.id], phase1Cwd, proposalEnv);
328
- proposalLog.push({
329
- proposalId: p.id,
330
- assetRef: p.assetRef,
331
- kind: p.kind,
332
- lintPass: true,
333
- decision: acceptResult.exitCode === 0 ? "accept" : "reject",
334
- ...(acceptResult.exitCode === 0 ? {} : { rejectReason: `accept failed: ${acceptResult.stderr.trim()}` }),
335
- });
336
- }
337
- else {
338
- const reason = lintInfo.lintMessage ?? "lint failed";
339
- const rejectResult = await akmCli(["proposal", "reject", p.id, "--reason", `lint failed: ${reason}`], phase1Cwd, proposalEnv);
340
- proposalLog.push({
341
- proposalId: p.id,
342
- assetRef: p.assetRef,
343
- kind: p.kind,
344
- lintPass: false,
345
- decision: "reject",
346
- rejectReason: reason,
347
- });
348
- if (rejectResult.exitCode !== 0) {
349
- warnings.push(`phase2: akm proposal reject ${p.id} failed: ${rejectResult.stderr.trim()}`);
350
- }
351
- }
352
- }
353
- // Rebuild the index so accepted lessons surface in Phase 3.
354
- const indexResult = await akmCli(["index"], phase1Cwd, proposalEnv);
355
- if (indexResult.exitCode !== 0) {
356
- warnings.push(`phase2: akm index rebuild failed: ${indexResult.stderr.trim()}`);
357
- }
358
- }
359
- // ── Phase 3: re-evaluate (eval slice). ─────────────────────────────────
360
- // pre arm: fresh snapshot of the starting fixture (no Phase 2 mutations
361
- // applied). post arm: the mutated evolveStash so accepted lessons reach
362
- // the eval slice. synthetic arm: no stash.
363
- preReport = await runUtility({
364
- tasks: evalTasks,
365
- arms: ["akm"],
366
- model: options.model,
367
- seedsPerArm,
368
- budgetTokens,
369
- budgetWallMs,
370
- slice: "eval",
371
- ...(options.spawn ? { spawn: options.spawn } : {}),
372
- materialiseStash,
373
- ...(materialiseStash ? { stashDirByFixture: preDirByFixture } : {}),
374
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
375
- ...(options.branch ? { branch: options.branch } : {}),
376
- ...(options.commit ? { commit: options.commit } : {}),
377
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
378
- });
379
- postReport = await runUtility({
380
- tasks: evalTasks,
381
- arms: ["akm"],
382
- model: options.model,
383
- seedsPerArm,
384
- budgetTokens,
385
- budgetWallMs,
386
- slice: "eval",
387
- // Stamp arm metadata so spawn fakes can distinguish pre-vs-post via
388
- // an env probe. We thread it via a fresh `spawn` wrapper when one
389
- // was supplied.
390
- materialiseStash,
391
- ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
392
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
393
- ...(options.branch ? { branch: options.branch } : {}),
394
- ...(options.commit ? { commit: options.commit } : {}),
395
- ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
396
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
397
- });
398
- // synthetic: no stash. We pass a spawn wrapper that strips
399
- // AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
400
- // fakes (and a future real harness) can branch. #267 — also forward a
401
- // per-task scratchpad prompt via the runner's `buildPrompt` seam so the
402
- // synthetic arm actually exercises the BYOS prompt path rather than
403
- // relying on the noakm default.
404
- syntheticReport = await runUtility({
405
- tasks: evalTasks,
406
- arms: ["akm"],
407
- model: options.model,
408
- seedsPerArm,
409
- budgetTokens,
410
- budgetWallMs,
411
- slice: "eval",
412
- materialiseStash: false,
413
- buildPrompt: (task, _arm) => buildSyntheticPrompt(task.id),
414
- ...(options.timestamp ? { timestamp: options.timestamp } : {}),
415
- ...(options.branch ? { branch: options.branch } : {}),
416
- ...(options.commit ? { commit: options.commit } : {}),
417
- ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
418
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
419
- });
420
- }
421
- finally {
422
- // Deregister BEFORE running cleanup so a SIGINT during teardown
423
- // doesn't double-fire the cleanup fns (per cleanup.ts contract).
424
- for (const deregister of stashDeregistrations)
425
- deregister();
426
- for (const s of evolveStashes.values()) {
427
- try {
428
- s.cleanup();
429
- }
430
- catch {
431
- /* swallow — best-effort tmp cleanup */
432
- }
433
- }
434
- for (const s of preStashes.values()) {
435
- try {
436
- s.cleanup();
437
- }
438
- catch {
439
- /* swallow — best-effort tmp cleanup */
440
- }
441
- }
442
- }
443
- // ── Compute aggregates. ──────────────────────────────────────────────────
444
- const proposalsMetrics = computeProposalQualityMetrics(proposalLog);
445
- const longitudinal = computeLongitudinalMetrics(preReport, postReport, syntheticReport);
446
- const feedbackIntegrity = computeFeedbackIntegrity({ phase1: phase1Report, feedbackLog });
447
- // #264 — lesson quality + reuse metrics. The runner doesn't (yet) read
448
- // accepted lesson bodies off disk or load verifier source text; we pass
449
- // empty maps so the leakage check defaults to "low" until the read seam
450
- // lands. Reuse + negative-transfer attribution work today off the
451
- // pre/post arm `assetsLoaded` stream.
452
- const lessons = computeLessonMetrics({
453
- proposalLog,
454
- feedbackLog,
455
- preRuns: preReport.akmRuns ?? [],
456
- postRuns: postReport.akmRuns ?? [],
457
- });
458
- return {
459
- timestamp: options.timestamp ?? new Date().toISOString(),
460
- branch: options.branch ?? preReport.branch,
461
- commit: options.commit ?? preReport.commit,
462
- model: options.model,
463
- domain,
464
- seedsPerArm,
465
- feedbackLog,
466
- proposalLog,
467
- proposals: proposalsMetrics,
468
- lessons,
469
- longitudinal,
470
- feedbackIntegrity,
471
- phase1: phase1Report,
472
- arms: { pre: preReport, post: postReport, synthetic: syntheticReport },
473
- warnings: [
474
- ...warnings,
475
- ...phase1Report.warnings,
476
- ...preReport.warnings,
477
- ...postReport.warnings,
478
- ...syntheticReport.warnings,
479
- ],
480
- };
481
- }
482
/**
 * Default subprocess invoker used when no fake `akmCli` is injected.
 *
 * Synchronously spawns `bun run <repo>/src/cli.ts <args>` in `cwd`, layering
 * `env` on top of the current process environment, and captures both output
 * streams as text.
 *
 * @param {string[]} args - CLI arguments appended after the cli entry point.
 * @param {string} cwd - Working directory for the child process.
 * @param {Record<string, string | undefined>} env - Overrides merged over `process.env`.
 * @returns {Promise<{ exitCode: number, stdout: string, stderr: string }>}
 *   `exitCode` is `-1` when the runtime reports no exit code.
 */
async function defaultAkmCli(args, cwd, env) {
    const cliEntry = path.resolve(__dirname, "..", "..", "src", "cli.ts");
    // A missing stream is reported as an empty string rather than undefined.
    const toText = (buffer) => (buffer ? new TextDecoder().decode(buffer) : "");
    const child = Bun.spawnSync({
        cmd: ["bun", "run", cliEntry, ...args],
        cwd,
        env: { ...process.env, ...env },
        stdout: "pipe",
        stderr: "pipe",
    });
    const stdout = toText(child.stdout);
    const stderr = toText(child.stderr);
    return { exitCode: child.exitCode ?? -1, stdout, stderr };
}
499
/**
 * Decide whether an asset's feedback tally crosses the negative threshold.
 *
 * Either condition is sufficient on its own:
 *  - the negative count is at least `threshold.absoluteCount`, or
 *  - the share of negatives among all feedback is strictly greater than
 *    `threshold.ratio`.
 *
 * @param {{ positive: number, negative: number }} counts - Feedback tally.
 * @param {{ absoluteCount: number, ratio: number }} threshold - Trigger limits.
 * @returns {boolean} `true` when the asset should be evolved.
 */
function crossesNegativeThreshold(counts, threshold) {
    const { positive, negative } = counts;
    if (negative >= threshold.absoluteCount) {
        return true;
    }
    const total = positive + negative;
    // With no feedback at all the ratio is undefined; treat it as "not crossed".
    return total !== 0 && negative / total > threshold.ratio;
}
513
/**
 * Best-effort train/eval partition for a task.
 *
 * An explicit, truthy `task.slice` always wins. Otherwise the task id is
 * folded through a 31-multiplier rolling hash over its UTF-16 code units
 * (wrapped to a signed 32-bit integer on every step); an even hash maps to
 * "train" and an odd one to "eval".
 *
 * NOTE(review): the previous comment described this as mirroring
 * `corpus.effectiveSlice` ("SHA-1 first byte parity"), but the fallback here
 * is a different hash — confirm the two partitions are meant to agree.
 *
 * @param {{ id: string, slice?: string }} task
 * @returns {string} The explicit slice, or "train" / "eval" from the id hash.
 */
function effectiveSlice(task) {
    const { slice, id } = task;
    if (slice) {
        return slice;
    }
    let hash = 0;
    let idx = 0;
    while (idx < id.length) {
        hash = (hash * 31 + id.charCodeAt(idx)) | 0;
        idx += 1;
    }
    // `hash` is an int32, so its low bit gives the same parity as |hash| % 2.
    return (hash & 1) === 0 ? "train" : "eval";
}
524
/**
 * Collapse the tasks' `domain` values into a single label.
 *
 * @param {Array<{ domain?: string }>} tasks
 * @returns {string} The shared domain when every task has the same one,
 *   otherwise "all" (also used for an empty list or a nullish shared domain).
 */
function uniqueDomain(tasks) {
    const domains = new Set(tasks.map((task) => task.domain));
    if (domains.size !== 1) {
        return "all";
    }
    const [only] = domains;
    return only ?? "all";
}
530
/**
 * Decorate a spawn function so every child environment is tagged with the
 * evolve arm it belongs to.
 *
 * The wrapper copies `opts.env` (never mutating the caller's object) and then:
 *  - sets `BENCH_EVOLVE_ARM` to `arm`;
 *  - sets `BENCH_EVOLVE_SCRATCHPAD` to "1" when `scratchpad` is true;
 *  - for the "synthetic" arm, removes `AKM_STASH_DIR` entirely;
 *  - for any other arm, overrides `AKM_STASH_DIR` when `stashDir` is given.
 *
 * @param {(cmd: unknown, opts: object) => unknown} inner - Spawn function to wrap.
 * @param {string} arm - Arm label written to `BENCH_EVOLVE_ARM`.
 * @param {string} [stashDir] - Optional stash directory override.
 * @param {boolean} [scratchpad=false] - Whether to flag scratchpad mode.
 * @returns {(cmd: unknown, opts: object) => unknown} Wrapped spawn function
 *   that forwards `inner`'s return value.
 */
function wrapSpawnWithArm(inner, arm, stashDir, scratchpad = false) {
    return (cmd, opts) => {
        const env = {
            ...(opts.env ?? {}),
            BENCH_EVOLVE_ARM: arm,
            ...(scratchpad ? { BENCH_EVOLVE_SCRATCHPAD: "1" } : {}),
        };
        if (arm === "synthetic") {
            // The synthetic arm must never see a stash, even an inherited one.
            delete env.AKM_STASH_DIR;
        }
        else if (stashDir) {
            env.AKM_STASH_DIR = stashDir;
        }
        return inner(cmd, { ...opts, env });
    };
}
550
/**
 * Tolerant parser for `akm proposal list --json` stdout.
 *
 * Accepts either a bare JSON array or an object with a `proposals` array.
 * Anything else — empty or non-string output, invalid JSON, or a JSON
 * primitive such as `null` — yields an empty list instead of throwing.
 *
 * Each entry must carry a string `id` and a string asset ref (first of
 * `target_ref`, `targetRef`, `ref`); entries missing either are dropped.
 * The kind comes from `kind` (or `source` when `kind` is not a string) and is
 * normalised: "lesson"/"distill" → "lesson", "revision"/"reflect" →
 * "revision", anything else → "unknown".
 *
 * @param {string} stdout - Raw stdout of the list command.
 * @returns {Array<{ id: string, assetRef: string, kind: "lesson" | "revision" | "unknown" }>}
 */
function parseProposalList(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return [];
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch {
        return [];
    }
    // `JSON.parse("null")` returns null; optional chaining keeps the
    // `proposals` probe from throwing on it.
    const arr = Array.isArray(parsed)
        ? parsed
        : Array.isArray(parsed?.proposals)
            ? parsed.proposals
            : [];
    const out = [];
    for (const item of arr) {
        if (!item || typeof item !== "object")
            continue;
        const rec = item;
        const id = typeof rec.id === "string" ? rec.id : null;
        const assetRef = typeof rec.target_ref === "string"
            ? rec.target_ref
            : typeof rec.targetRef === "string"
                ? rec.targetRef
                : typeof rec.ref === "string"
                    ? rec.ref
                    : null;
        if (!id || !assetRef)
            continue;
        // `kind` takes precedence over `source`, even when it is an unrecognised string.
        const kindRaw = typeof rec.kind === "string" ? rec.kind : typeof rec.source === "string" ? rec.source : "unknown";
        const kind = kindRaw === "lesson" || kindRaw === "distill"
            ? "lesson"
            : kindRaw === "revision" || kindRaw === "reflect"
                ? "revision"
                : "unknown";
        out.push({ id, assetRef, kind });
    }
    return out;
}
591
/**
 * Parse `akm proposal show <id> --json` stdout into a lint verdict.
 *
 * Lint passes when any of `lint_pass === true`, `lintPass === true`, or
 * `lint.pass === true` holds. When `lint.issues` is a non-empty array, the
 * issues are joined with "; " into `lintMessage` (strings verbatim, objects
 * via their `message`, otherwise their JSON form).
 *
 * Unusable output never throws: empty or non-string stdout, invalid JSON, and
 * non-object payloads (e.g. the JSON literal `null`) all return
 * `lintPass: false` with an explanatory `lintMessage`, so the caller can
 * record a rejection instead of aborting the proposal walk.
 *
 * @param {string} stdout - Raw stdout of the show command.
 * @returns {{ lintPass: boolean, lintMessage?: string }}
 */
function parseProposalShow(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return { lintPass: false, lintMessage: "empty proposal show output" };
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch (err) {
        return { lintPass: false, lintMessage: `proposal show: parse error (${err.message})` };
    }
    // Valid JSON is not necessarily an object: `null` would make the property
    // reads below throw, and other primitives carry no lint information.
    if (parsed === null || typeof parsed !== "object")
        return { lintPass: false, lintMessage: "proposal show: unexpected non-object payload" };
    const lintRaw = parsed.lint;
    const hasLintObject = typeof lintRaw === "object" && lintRaw !== null;
    const lintPass = parsed.lint_pass === true ||
        parsed.lintPass === true ||
        (hasLintObject && lintRaw.pass === true);
    let lintMessage;
    if (hasLintObject) {
        const issues = lintRaw.issues;
        if (Array.isArray(issues) && issues.length > 0) {
            lintMessage = issues
                .map((i) => (typeof i === "string" ? i : (i?.message ?? JSON.stringify(i))))
                .join("; ");
        }
    }
    return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
}
616
/**
 * Run `akm index` against an evolve stash using the supplied CLI invoker.
 *
 * The child environment is the current `process.env` with `AKM_STASH_DIR`
 * set to `stashDir` and `XDG_CACHE_HOME` set to `cacheDir`. Per the original
 * note, this is intended to populate the index in the same cache directory
 * that later `akmCli` calls (feedback / distill / reflect) are pointed at —
 * that wiring lives in the caller and is not visible here.
 *
 * Exported for tests.
 *
 * @param {string} stashDir - Stash directory to index.
 * @param {string} cacheDir - Cache root handed to the CLI.
 * @param {(args: string[], cwd: string, env: object) => Promise<{ exitCode: number, stderr: string }>} akmCli
 *   CLI invoker (real or fake).
 * @param {string} cwd - Working directory for the CLI call.
 * @returns {Promise<{ ok: boolean, stderr: string }>} `ok` is true only for
 *   exit code 0; `stderr` is always passed through.
 */
export async function indexEvolveStash(stashDir, cacheDir, akmCli, cwd) {
    const { exitCode, stderr } = await akmCli(["index"], cwd, {
        ...process.env,
        AKM_STASH_DIR: stashDir,
        XDG_CACHE_HOME: cacheDir,
    });
    return { ok: exitCode === 0, stderr };
}
638
/**
 * Build the prompt for the synthetic ("Bring Your Own Skills") arm.
 *
 * Exposed for tests so the prompt construction can be asserted.
 *
 * @param {string} taskId - Task identifier interpolated into the first line.
 * @returns {string} A five-line, newline-joined prompt with no trailing newline.
 */
export function buildSyntheticPrompt(taskId) {
    const header = `Task: ${taskId}`;
    const armLine = "Arm: synthetic (Bring Your Own Skills)";
    const instructions = [
        "No akm stash is available. Before solving the task, write a short scratchpad of the skills",
        "and steps you intend to use, then proceed. Cite the scratchpad in your trace so the verifier",
        "can attribute the approach to your own reasoning rather than retrieved guidance.",
    ];
    return [header, armLine, ...instructions].join("\n");
}