akm-cli 0.6.0 → 0.7.0-rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/CHANGELOG.md +66 -0
  2. package/dist/{cli.js → src/cli.js} +672 -29
  3. package/dist/{commands → src/commands}/config-cli.js +5 -4
  4. package/dist/src/commands/distill.js +283 -0
  5. package/dist/src/commands/events.js +108 -0
  6. package/dist/src/commands/history.js +120 -0
  7. package/dist/{commands → src/commands}/installed-stashes.js +28 -2
  8. package/dist/src/commands/proposal.js +119 -0
  9. package/dist/src/commands/propose.js +171 -0
  10. package/dist/src/commands/reflect.js +193 -0
  11. package/dist/{commands → src/commands}/registry-search.js +2 -1
  12. package/dist/{commands → src/commands}/remember.js +12 -0
  13. package/dist/{commands → src/commands}/search.js +74 -1
  14. package/dist/{commands → src/commands}/self-update.js +4 -3
  15. package/dist/{commands → src/commands}/show.js +67 -2
  16. package/dist/{core → src/core}/asset-ref.js +5 -5
  17. package/dist/{core → src/core}/asset-spec.js +12 -0
  18. package/dist/{core → src/core}/common.js +1 -1
  19. package/dist/{core → src/core}/config.js +175 -121
  20. package/dist/{core → src/core}/errors.js +4 -0
  21. package/dist/src/core/events.js +239 -0
  22. package/dist/src/core/lesson-lint.js +86 -0
  23. package/dist/src/core/proposals.js +406 -0
  24. package/dist/src/core/warn.js +72 -0
  25. package/dist/{core → src/core}/write-source.js +80 -5
  26. package/dist/{indexer → src/indexer}/db-search.js +119 -27
  27. package/dist/{indexer → src/indexer}/db.js +76 -23
  28. package/dist/{indexer → src/indexer}/file-context.js +0 -3
  29. package/dist/src/indexer/graph-boost.js +179 -0
  30. package/dist/src/indexer/graph-extraction.js +212 -0
  31. package/dist/{indexer → src/indexer}/indexer.js +73 -6
  32. package/dist/src/indexer/memory-inference.js +263 -0
  33. package/dist/{indexer → src/indexer}/metadata.js +114 -11
  34. package/dist/src/integrations/agent/config.js +292 -0
  35. package/dist/src/integrations/agent/detect.js +94 -0
  36. package/dist/src/integrations/agent/index.js +17 -0
  37. package/dist/src/integrations/agent/profiles.js +65 -0
  38. package/dist/src/integrations/agent/prompts.js +167 -0
  39. package/dist/src/integrations/agent/spawn.js +221 -0
  40. package/dist/{integrations → src/integrations}/lockfile.js +0 -26
  41. package/dist/{llm → src/llm}/client.js +33 -2
  42. package/dist/src/llm/feature-gate.js +108 -0
  43. package/dist/src/llm/graph-extract.js +107 -0
  44. package/dist/src/llm/index-passes.js +35 -0
  45. package/dist/src/llm/memory-infer.js +86 -0
  46. package/dist/{output → src/output}/renderers.js +60 -1
  47. package/dist/src/output/shapes.js +516 -0
  48. package/dist/{output → src/output}/text.js +447 -4
  49. package/dist/{registry → src/registry}/build-index.js +14 -4
  50. package/dist/{registry → src/registry}/factory.js +0 -8
  51. package/dist/{registry → src/registry}/providers/static-index.js +3 -2
  52. package/dist/{registry → src/registry}/resolve.js +68 -2
  53. package/dist/{setup → src/setup}/setup.js +43 -5
  54. package/dist/{sources → src/sources}/providers/git.js +7 -15
  55. package/dist/{wiki → src/wiki}/wiki.js +9 -11
  56. package/dist/tests/add-website-source.test.js +119 -0
  57. package/dist/tests/agent/agent-config-loader.test.js +70 -0
  58. package/dist/tests/agent/agent-config.test.js +221 -0
  59. package/dist/tests/agent/agent-detect.test.js +100 -0
  60. package/dist/tests/agent/agent-spawn.test.js +234 -0
  61. package/dist/tests/agent-output.test.js +186 -0
  62. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +103 -0
  63. package/dist/tests/architecture/agent-spawn-seam.test.js +193 -0
  64. package/dist/tests/architecture/llm-stateless-seam.test.js +112 -0
  65. package/dist/tests/asset-ref.test.js +192 -0
  66. package/dist/tests/asset-registry.test.js +103 -0
  67. package/dist/tests/asset-spec.test.js +241 -0
  68. package/dist/tests/bench/attribution.test.js +995 -0
  69. package/dist/tests/bench/cleanup-sigint.test.js +83 -0
  70. package/dist/tests/bench/cleanup.js +203 -0
  71. package/dist/tests/bench/cleanup.test.js +166 -0
  72. package/dist/tests/bench/cli.js +683 -0
  73. package/dist/tests/bench/cli.test.js +177 -0
  74. package/dist/tests/bench/compare.test.js +556 -0
  75. package/dist/tests/bench/corpus.js +314 -0
  76. package/dist/tests/bench/corpus.test.js +258 -0
  77. package/dist/tests/bench/driver.js +346 -0
  78. package/dist/tests/bench/driver.test.js +443 -0
  79. package/dist/tests/bench/evolve-metrics.js +179 -0
  80. package/dist/tests/bench/evolve-metrics.test.js +187 -0
  81. package/dist/tests/bench/evolve.js +580 -0
  82. package/dist/tests/bench/evolve.test.js +616 -0
  83. package/dist/tests/bench/failure-modes.test.js +300 -0
  84. package/dist/tests/bench/feedback-integrity.test.js +456 -0
  85. package/dist/tests/bench/leakage.test.js +125 -0
  86. package/dist/tests/bench/learning-curve.test.js +133 -0
  87. package/dist/tests/bench/metrics.js +2319 -0
  88. package/dist/tests/bench/metrics.test.js +1144 -0
  89. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +43 -0
  90. package/dist/tests/bench/report.js +1821 -0
  91. package/dist/tests/bench/report.test.js +989 -0
  92. package/dist/tests/bench/runner.js +536 -0
  93. package/dist/tests/bench/runner.test.js +958 -0
  94. package/dist/tests/bench/search-bridge.test.js +331 -0
  95. package/dist/tests/bench/tmp.js +41 -0
  96. package/dist/tests/bench/trajectory.js +116 -0
  97. package/dist/tests/bench/trajectory.test.js +127 -0
  98. package/dist/tests/bench/verifier.js +109 -0
  99. package/dist/tests/bench/verifier.test.js +118 -0
  100. package/dist/tests/bench/workflow-evaluator.js +557 -0
  101. package/dist/tests/bench/workflow-evaluator.test.js +421 -0
  102. package/dist/tests/bench/workflow-spec.js +358 -0
  103. package/dist/tests/bench/workflow-spec.test.js +363 -0
  104. package/dist/tests/bench/workflow-trace.js +438 -0
  105. package/dist/tests/bench/workflow-trace.test.js +254 -0
  106. package/dist/tests/benchmark-search-quality.js +536 -0
  107. package/dist/tests/benchmark-suite.js +1441 -0
  108. package/dist/tests/capture-cli.test.js +112 -0
  109. package/dist/tests/cli-errors.test.js +203 -0
  110. package/dist/tests/commands/events.test.js +370 -0
  111. package/dist/tests/commands/history.test.js +223 -0
  112. package/dist/tests/commands/import.test.js +103 -0
  113. package/dist/tests/commands/proposal-cli.test.js +209 -0
  114. package/dist/tests/commands/reflect-propose-cli.test.js +333 -0
  115. package/dist/tests/commands/remember.test.js +97 -0
  116. package/dist/tests/commands/scope-flags.test.js +300 -0
  117. package/dist/tests/commands/search.test.js +537 -0
  118. package/dist/tests/commands/show-indexer-parity.test.js +117 -0
  119. package/dist/tests/commands/show.test.js +294 -0
  120. package/dist/tests/common.test.js +266 -0
  121. package/dist/tests/completions.test.js +142 -0
  122. package/dist/tests/config-cli.test.js +193 -0
  123. package/dist/tests/config-llm-features.test.js +139 -0
  124. package/dist/tests/config.test.js +544 -0
  125. package/dist/tests/contracts/migration-baseline.test.js +43 -0
  126. package/dist/tests/contracts/reflect-propose-envelope.test.js +139 -0
  127. package/dist/tests/contracts/spec-helpers.js +46 -0
  128. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +228 -0
  129. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +56 -0
  130. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +34 -0
  131. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +94 -0
  132. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +39 -0
  133. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +44 -0
  134. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +47 -0
  135. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +40 -0
  136. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +58 -0
  137. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +34 -0
  138. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +75 -0
  139. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +36 -0
  140. package/dist/tests/core/write-source.test.js +366 -0
  141. package/dist/tests/curate-command.test.js +87 -0
  142. package/dist/tests/db-scoring.test.js +201 -0
  143. package/dist/tests/db.test.js +654 -0
  144. package/dist/tests/distill-cli-flag.test.js +208 -0
  145. package/dist/tests/distill.test.js +515 -0
  146. package/dist/tests/docker-install.test.js +120 -0
  147. package/dist/tests/e2e.test.js +1398 -0
  148. package/dist/tests/embedder.test.js +340 -0
  149. package/dist/tests/embedding-model-config.test.js +379 -0
  150. package/dist/tests/feedback-command.test.js +172 -0
  151. package/dist/tests/file-context.test.js +552 -0
  152. package/dist/tests/fixtures/scripts/git/summarize-diff.js +9 -0
  153. package/dist/tests/fixtures/scripts/lint/eslint-check.js +7 -0
  154. package/dist/tests/fixtures/stashes/load.js +166 -0
  155. package/dist/tests/fixtures/stashes/load.test.js +88 -0
  156. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +12 -0
  157. package/dist/tests/frontmatter.test.js +190 -0
  158. package/dist/tests/fts-field-weighting.test.js +254 -0
  159. package/dist/tests/fuzzy-search.test.js +230 -0
  160. package/dist/tests/git-provider-clone.test.js +45 -0
  161. package/dist/tests/github.test.js +161 -0
  162. package/dist/tests/graph-boost-ranking.test.js +305 -0
  163. package/dist/tests/graph-extraction.test.js +282 -0
  164. package/dist/tests/helpers/usage-events.js +8 -0
  165. package/dist/tests/index-pass-llm.test.js +161 -0
  166. package/dist/tests/indexer.test.js +559 -0
  167. package/dist/tests/info-command.test.js +166 -0
  168. package/dist/tests/init.test.js +69 -0
  169. package/dist/tests/install-script.test.js +246 -0
  170. package/dist/tests/integration/agent-real-profile.test.js +94 -0
  171. package/dist/tests/issue-36-repro.test.js +304 -0
  172. package/dist/tests/issues-191-194.test.js +160 -0
  173. package/dist/tests/lesson-lint.test.js +111 -0
  174. package/dist/tests/llm-client.test.js +115 -0
  175. package/dist/tests/llm-feature-gate.test.js +151 -0
  176. package/dist/tests/llm.test.js +139 -0
  177. package/dist/tests/lockfile.test.js +216 -0
  178. package/dist/tests/manifest.test.js +205 -0
  179. package/dist/tests/markdown.test.js +126 -0
  180. package/dist/tests/matchers-unit.test.js +189 -0
  181. package/dist/tests/memory-inference.test.js +299 -0
  182. package/dist/tests/merge-scoring.test.js +136 -0
  183. package/dist/tests/metadata.test.js +313 -0
  184. package/dist/tests/migration-help.test.js +89 -0
  185. package/dist/tests/origin-resolve.test.js +124 -0
  186. package/dist/tests/output-baseline.test.js +217 -0
  187. package/dist/tests/output-shapes-unit.test.js +476 -0
  188. package/dist/tests/parallel-search.test.js +272 -0
  189. package/dist/tests/parameter-metadata.test.js +365 -0
  190. package/dist/tests/paths.test.js +177 -0
  191. package/dist/tests/progressive-disclosure.test.js +280 -0
  192. package/dist/tests/proposals.test.js +279 -0
  193. package/dist/tests/proposed-quality.test.js +271 -0
  194. package/dist/tests/provider-registry.test.js +32 -0
  195. package/dist/tests/ranking-regression.test.js +548 -0
  196. package/dist/tests/reflect-propose.test.js +455 -0
  197. package/dist/tests/registry-build-index.test.js +378 -0
  198. package/dist/tests/registry-cli.test.js +290 -0
  199. package/dist/tests/registry-index-v2.test.js +430 -0
  200. package/dist/tests/registry-install.test.js +728 -0
  201. package/dist/tests/registry-providers/parity.test.js +189 -0
  202. package/dist/tests/registry-providers/skills-sh.test.js +309 -0
  203. package/dist/tests/registry-providers/static-index.test.js +204 -0
  204. package/dist/tests/registry-resolve.test.js +126 -0
  205. package/dist/tests/registry-search.test.js +723 -0
  206. package/dist/tests/remember-frontmatter.test.js +380 -0
  207. package/dist/tests/remember-unit.test.js +123 -0
  208. package/dist/tests/ripgrep-install.test.js +251 -0
  209. package/dist/tests/ripgrep-resolve.test.js +108 -0
  210. package/dist/tests/ripgrep.test.js +163 -0
  211. package/dist/tests/save-command.test.js +94 -0
  212. package/dist/tests/save-trust-qa-fixes.test.js +270 -0
  213. package/dist/tests/scoring-pipeline.test.js +648 -0
  214. package/dist/tests/search-include-proposed-cli.test.js +118 -0
  215. package/dist/tests/self-update.test.js +442 -0
  216. package/dist/tests/semantic-search-e2e.test.js +512 -0
  217. package/dist/tests/semantic-status.test.js +471 -0
  218. package/dist/tests/setup-run.integration.js +877 -0
  219. package/dist/tests/setup-wizard.test.js +198 -0
  220. package/dist/tests/setup.test.js +131 -0
  221. package/dist/tests/source-add.test.js +11 -0
  222. package/dist/tests/source-clone.test.js +254 -0
  223. package/dist/tests/source-manage.test.js +366 -0
  224. package/dist/tests/source-providers/filesystem.test.js +82 -0
  225. package/dist/tests/source-providers/git.test.js +252 -0
  226. package/dist/tests/source-providers/website.test.js +128 -0
  227. package/dist/tests/source-qa-fixes.test.js +268 -0
  228. package/dist/tests/source-registry.test.js +350 -0
  229. package/dist/tests/source-resolve.test.js +100 -0
  230. package/dist/tests/source-source.test.js +221 -0
  231. package/dist/tests/source.test.js +533 -0
  232. package/dist/tests/tar-utils-scan.test.js +73 -0
  233. package/dist/tests/toggle-components.test.js +73 -0
  234. package/dist/tests/usage-telemetry.test.js +265 -0
  235. package/dist/tests/utility-scoring.test.js +558 -0
  236. package/dist/tests/vault-load-error.test.js +78 -0
  237. package/dist/tests/vault-qa-fixes.test.js +194 -0
  238. package/dist/tests/vault.test.js +429 -0
  239. package/dist/tests/vector-search.test.js +608 -0
  240. package/dist/tests/walker.test.js +252 -0
  241. package/dist/tests/wave2-cluster-bc.test.js +228 -0
  242. package/dist/tests/wave2-cluster-d.test.js +180 -0
  243. package/dist/tests/wave2-cluster-e.test.js +179 -0
  244. package/dist/tests/wiki-qa-fixes.test.js +270 -0
  245. package/dist/tests/wiki.test.js +529 -0
  246. package/dist/tests/workflow-cli.test.js +271 -0
  247. package/dist/tests/workflow-markdown.test.js +171 -0
  248. package/dist/tests/workflow-path-escape.test.js +132 -0
  249. package/dist/tests/workflow-qa-fixes.test.js +377 -0
  250. package/dist/tests/workflows/indexer-rejection.test.js +213 -0
  251. package/docs/README.md +8 -0
  252. package/docs/migration/release-notes/0.7.0.md +244 -0
  253. package/package.json +2 -2
  254. package/dist/core/warn.js +0 -27
  255. package/dist/output/shapes.js +0 -212
  256. /package/dist/{commands → src/commands}/completions.js +0 -0
  257. /package/dist/{commands → src/commands}/curate.js +0 -0
  258. /package/dist/{commands → src/commands}/info.js +0 -0
  259. /package/dist/{commands → src/commands}/init.js +0 -0
  260. /package/dist/{commands → src/commands}/install-audit.js +0 -0
  261. /package/dist/{commands → src/commands}/migration-help.js +0 -0
  262. /package/dist/{commands → src/commands}/source-add.js +0 -0
  263. /package/dist/{commands → src/commands}/source-clone.js +0 -0
  264. /package/dist/{commands → src/commands}/source-manage.js +0 -0
  265. /package/dist/{commands → src/commands}/vault.js +0 -0
  266. /package/dist/{core → src/core}/asset-registry.js +0 -0
  267. /package/dist/{core → src/core}/frontmatter.js +0 -0
  268. /package/dist/{core → src/core}/markdown.js +0 -0
  269. /package/dist/{core → src/core}/paths.js +0 -0
  270. /package/dist/{indexer → src/indexer}/manifest.js +0 -0
  271. /package/dist/{indexer → src/indexer}/matchers.js +0 -0
  272. /package/dist/{indexer → src/indexer}/search-fields.js +0 -0
  273. /package/dist/{indexer → src/indexer}/search-source.js +0 -0
  274. /package/dist/{indexer → src/indexer}/semantic-status.js +0 -0
  275. /package/dist/{indexer → src/indexer}/usage-events.js +0 -0
  276. /package/dist/{indexer → src/indexer}/walker.js +0 -0
  277. /package/dist/{integrations → src/integrations}/github.js +0 -0
  278. /package/dist/{llm → src/llm}/embedder.js +0 -0
  279. /package/dist/{llm → src/llm}/embedders/cache.js +0 -0
  280. /package/dist/{llm → src/llm}/embedders/local.js +0 -0
  281. /package/dist/{llm → src/llm}/embedders/remote.js +0 -0
  282. /package/dist/{llm → src/llm}/embedders/types.js +0 -0
  283. /package/dist/{llm → src/llm}/metadata-enhance.js +0 -0
  284. /package/dist/{output → src/output}/cli-hints.js +0 -0
  285. /package/dist/{output → src/output}/context.js +0 -0
  286. /package/dist/{registry → src/registry}/create-provider-registry.js +0 -0
  287. /package/dist/{registry → src/registry}/origin-resolve.js +0 -0
  288. /package/dist/{registry → src/registry}/providers/index.js +0 -0
  289. /package/dist/{registry → src/registry}/providers/skills-sh.js +0 -0
  290. /package/dist/{registry → src/registry}/providers/types.js +0 -0
  291. /package/dist/{registry → src/registry}/types.js +0 -0
  292. /package/dist/{setup → src/setup}/detect.js +0 -0
  293. /package/dist/{setup → src/setup}/ripgrep-install.js +0 -0
  294. /package/dist/{setup → src/setup}/ripgrep-resolve.js +0 -0
  295. /package/dist/{setup → src/setup}/steps.js +0 -0
  296. /package/dist/{sources → src/sources}/include.js +0 -0
  297. /package/dist/{sources → src/sources}/provider-factory.js +0 -0
  298. /package/dist/{sources → src/sources}/provider.js +0 -0
  299. /package/dist/{sources → src/sources}/providers/filesystem.js +0 -0
  300. /package/dist/{sources → src/sources}/providers/index.js +0 -0
  301. /package/dist/{sources → src/sources}/providers/install-types.js +0 -0
  302. /package/dist/{sources → src/sources}/providers/npm.js +0 -0
  303. /package/dist/{sources → src/sources}/providers/provider-utils.js +0 -0
  304. /package/dist/{sources → src/sources}/providers/sync-from-ref.js +0 -0
  305. /package/dist/{sources → src/sources}/providers/tar-utils.js +0 -0
  306. /package/dist/{sources → src/sources}/providers/website.js +0 -0
  307. /package/dist/{sources → src/sources}/resolve.js +0 -0
  308. /package/dist/{sources → src/sources}/types.js +0 -0
  309. /package/dist/{templates → src/templates}/wiki-templates.js +0 -0
  310. /package/dist/{version.js → src/version.js} +0 -0
  311. /package/dist/{workflows → src/workflows}/authoring.js +0 -0
  312. /package/dist/{workflows → src/workflows}/cli.js +0 -0
  313. /package/dist/{workflows → src/workflows}/db.js +0 -0
  314. /package/dist/{workflows → src/workflows}/document-cache.js +0 -0
  315. /package/dist/{workflows → src/workflows}/parser.js +0 -0
  316. /package/dist/{workflows → src/workflows}/renderer.js +0 -0
  317. /package/dist/{workflows → src/workflows}/runs.js +0 -0
  318. /package/dist/{workflows → src/workflows}/schema.js +0 -0
  319. /package/dist/{workflows → src/workflows}/validator.js +0 -0
@@ -0,0 +1,580 @@
1
+ /**
2
+ * akm-bench `evolve` — Track B longitudinal three-phase runner (spec §4 + §6.4).
3
+ *
4
+ * `runEvolve()` orchestrates three phases against a single eval-domain corpus:
5
+ *
6
+ * • Phase 1 (signal accumulation): run K seeds × tasks (train slice only)
7
+ * under the akm arm, then record `akm feedback <gold_ref> --positive` /
8
+ * `--negative` events per outcome.
9
+ * • Phase 2 (evolve): for every asset whose negative feedback crosses the
10
+ * threshold, invoke `akm distill` and `akm reflect`, validate every
11
+ * resulting proposal via `akm proposal show --json`, then accept or
12
+ * reject per lint outcome. After processing, rebuild the index.
13
+ * • Phase 3 (re-evaluate): run the eval slice under THREE arms — `pre` (the
14
+ * original un-evolved fixture), `post` (the evolved fixture), `synthetic`
15
+ * (no stash, scratchpad-only "Bring Your Own Skills" prompt).
16
+ *
17
+ * Leakage prevention (spec §7.4): before invoking distill we compute the set
18
+ * of eval-slice gold refs and pass it to `akm distill` via
19
+ * `--exclude-feedback-from <csv>` (#267). `akmDistill` filters those
20
+ * feedback events out of its LLM input before constructing the prompt.
21
+ * Refs in the exclusion list still see distillation run — but distillation
22
+ * runs from asset content alone, with no feedback signal that could have
23
+ * leaked from the eval slice. The proposal log + Phase 1 feedback stream
24
+ * are also filtered before computeProposalQualityMetrics ever sees them.
25
+ *
26
+ * Test seams: every external interaction is funnelled through one of three
27
+ * injectable options (two functions and one boolean flag):
28
+ * - `spawn` — forwarded to `runOne` (drives the agent harness).
29
+ * - `akmCli(args, cwd, env)` — invoked for every `akm <verb>` subprocess.
30
+ * - `materialiseStash` — when false, `runUtility` doesn't touch
31
+ * fixtures/stashes/.
32
+ * Tests inject fakes; production wires the real `Bun.spawnSync` and the
33
+ * real `loadFixtureStash`.
34
+ */
35
+ import path from "node:path";
36
+ import { loadFixtureStash } from "../fixtures/stashes/load";
37
+ import { registerCleanup } from "./cleanup";
38
+ import { computeLessonMetrics } from "./evolve-metrics";
39
+ import { computeFeedbackIntegrity, computeLongitudinalMetrics, computeProposalQualityMetrics, } from "./metrics";
40
+ import { runUtility } from "./runner";
41
+ /**
42
+ * Drive the three-phase Track B runner.
43
+ *
44
+ * Pre: `tasks` is already filtered to one domain (or `all`). The runner
45
+ * partitions internally on `task.slice`.
46
+ *
47
+ * Sandboxing: at the start of every real run the runner materialises one
48
+ * dedicated tmp stash per fixture (the `evolveStash`) plus a fresh sibling
49
+ * snapshot per fixture (the `preStash`). Phase 1 + Phase 2 pin
50
+ * `AKM_STASH_DIR` to the appropriate `evolveStash` for every spawned `akm`
51
+ * invocation; Phase 3's pre arm uses `preStash`, the post arm uses
52
+ * `evolveStash`, and the synthetic arm uses no stash. The operator's real
53
+ * `process.env.AKM_STASH_DIR` is never read or written by `runEvolve`. All
54
+ * stashes are cleaned up in a top-level try/finally.
55
+ */
56
+ export async function runEvolve(options) {
57
+ const seedsPerArm = options.seedsPerArm ?? 5;
58
+ const budgetTokens = options.budgetTokens ?? 30000;
59
+ const budgetWallMs = options.budgetWallMs ?? 120000;
60
+ const negativeThreshold = options.negativeThreshold ?? { absoluteCount: 2, ratio: 0.5 };
61
+ const materialiseStash = options.materialiseStash ?? true;
62
+ const akmCli = options.akmCli ?? defaultAkmCli;
63
+ const warnings = [];
64
+ const trainTasks = options.tasks.filter((t) => effectiveSlice(t) === "train");
65
+ const evalTasks = options.tasks.filter((t) => effectiveSlice(t) === "eval");
66
+ // Use the first task's domain (or "all") as the corpus label. The CLI
67
+ // already filtered to one domain; this is just for the report header.
68
+ const domain = uniqueDomain(options.tasks);
69
+ // ── Sandbox setup: per-fixture evolveStash + preStash. ───────────────────
70
+ // We materialise one tmp stash per unique `task.stash` so Phase 1
71
+ // accumulates feedback into the same on-disk stash that Phase 2 mutates,
72
+ // and that Phase 3's post arm reads back. The operator's real
73
+ // AKM_STASH_DIR is never touched. The pre arm gets a fresh snapshot of
74
+ // the same starting fixture (no Phase 2 mutations applied).
75
+ const fixtureNames = new Set();
76
+ for (const t of options.tasks)
77
+ fixtureNames.add(t.stash);
78
+ const evolveStashes = new Map();
79
+ const preStashes = new Map();
80
+ const evolveDirByFixture = new Map();
81
+ const preDirByFixture = new Map();
82
+ // SIGINT trap (#267): every per-fixture stash registers its cleanup with
83
+ // the shared registry so an external Ctrl-C reaps the tmp dirs even when
84
+ // the top-level try/finally never runs. We deregister in the matching
85
+ // finally block before invoking the synchronous cleanup so the handler
86
+ // doesn't double-fire.
87
+ const stashDeregistrations = [];
88
+ if (materialiseStash) {
89
+ for (const name of fixtureNames) {
90
+ try {
91
+ const evolved = loadFixtureStash(name, { skipIndex: false });
92
+ evolveStashes.set(name, evolved);
93
+ evolveDirByFixture.set(name, evolved.stashDir);
94
+ stashDeregistrations.push(registerCleanup(() => {
95
+ try {
96
+ evolved.cleanup();
97
+ }
98
+ catch {
99
+ /* swallow */
100
+ }
101
+ }));
102
+ }
103
+ catch (err) {
104
+ warnings.push(`evolve: failed to materialise evolve stash for fixture "${name}": ${err.message}`);
105
+ }
106
+ try {
107
+ const pre = loadFixtureStash(name, { skipIndex: false });
108
+ preStashes.set(name, pre);
109
+ preDirByFixture.set(name, pre.stashDir);
110
+ stashDeregistrations.push(registerCleanup(() => {
111
+ try {
112
+ pre.cleanup();
113
+ }
114
+ catch {
115
+ /* swallow */
116
+ }
117
+ }));
118
+ }
119
+ catch (err) {
120
+ warnings.push(`evolve: failed to materialise pre stash for fixture "${name}": ${err.message}`);
121
+ }
122
+ }
123
+ }
124
+ // Resolve the evolveStash dir for a given asset ref. We map ref → fixture
125
+ // by looking up which task's gold ref it matches (when several tasks share a
126
+ // ref, the last one listed wins); if no task owns it, we fall back to the first available
127
+ // evolveStash. The simple — and most common — case is a single fixture
128
+ // per `--tasks <domain>` invocation.
129
+ const refToFixture = new Map();
130
+ for (const t of options.tasks) {
131
+ if (t.goldRef)
132
+ refToFixture.set(t.goldRef, t.stash);
133
+ }
134
+ const fallbackEvolveDir = [...evolveDirByFixture.values()][0];
135
+ function envForRef(ref) {
136
+ const baseEnv = { ...process.env };
137
+ if (!materialiseStash) {
138
+ // Tests opt out of fixture materialisation entirely; we still strip
139
+ // the operator's AKM_STASH_DIR so the fake CLI sees a known sentinel.
140
+ delete baseEnv.AKM_STASH_DIR;
141
+ return baseEnv;
142
+ }
143
+ const fixture = ref ? refToFixture.get(ref) : undefined;
144
+ const dir = (fixture && evolveDirByFixture.get(fixture)) ?? fallbackEvolveDir;
145
+ if (dir)
146
+ baseEnv.AKM_STASH_DIR = dir;
147
+ else
148
+ delete baseEnv.AKM_STASH_DIR;
149
+ return baseEnv;
150
+ }
151
+ let preReport;
152
+ let postReport;
153
+ let syntheticReport;
154
+ let phase1Report;
155
+ const feedbackLog = [];
156
+ const proposalLog = [];
157
+ try {
158
+ // ── Phase 1: accumulate signal on the train slice (akm arm only). ─────
159
+ phase1Report = await runUtility({
160
+ tasks: trainTasks,
161
+ arms: ["akm"],
162
+ model: options.model,
163
+ seedsPerArm,
164
+ budgetTokens,
165
+ budgetWallMs,
166
+ slice: "train",
167
+ ...(options.spawn ? { spawn: options.spawn } : {}),
168
+ // We pre-materialised the per-fixture evolve stash above; tell the
169
+ // runner to forward those dirs and skip its own per-task materialise.
170
+ materialiseStash,
171
+ ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
172
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
173
+ ...(options.branch ? { branch: options.branch } : {}),
174
+ ...(options.commit ? { commit: options.commit } : {}),
175
+ });
176
+ // Issue feedback events per (task, seed) outcome on the akm arm.
177
+ const feedbackByRef = new Map();
178
+ const phase1Cwd = options.tasks[0]?.taskDir ?? process.cwd();
179
+ for (const run of phase1Report.akmRuns ?? []) {
180
+ const taskMeta = options.tasks.find((t) => t.id === run.taskId);
181
+ const goldRef = taskMeta?.goldRef;
182
+ if (!goldRef)
183
+ continue;
184
+ if (run.outcome === "harness_error")
185
+ continue;
186
+ const signal = run.outcome === "pass" ? "positive" : "negative";
187
+ const args = ["feedback", goldRef, signal === "positive" ? "--positive" : "--negative"];
188
+ // Wrap in try/catch so a single throwing akmCli (e.g. subprocess
189
+ // crash) cannot leave `feedbackByRef` partially populated and let
190
+ // Phase 2 proceed on corrupt state.
191
+ try {
192
+ const cliResult = await akmCli(args, phase1Cwd, envForRef(goldRef));
193
+ feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: cliResult.exitCode === 0 });
194
+ if (cliResult.exitCode !== 0) {
195
+ warnings.push(`phase1: akm feedback for ${goldRef} (${signal}) failed: ${cliResult.stderr.trim()}`);
196
+ }
197
+ }
198
+ catch (err) {
199
+ feedbackLog.push({ taskId: run.taskId, seed: run.seed, goldRef, signal, ok: false });
200
+ warnings.push(`phase1.feedback_dispatch_failed: ${goldRef} ${err.message}`);
201
+ }
202
+ const counts = feedbackByRef.get(goldRef) ?? { positive: 0, negative: 0 };
203
+ if (signal === "positive")
204
+ counts.positive += 1;
205
+ else
206
+ counts.negative += 1;
207
+ feedbackByRef.set(goldRef, counts);
208
+ }
209
+ // ── Phase 2: evolve. ────────────────────────────────────────────────────
210
+ const evalGoldRefs = new Set();
211
+ for (const t of evalTasks) {
212
+ if (t.goldRef)
213
+ evalGoldRefs.add(t.goldRef);
214
+ }
215
+ const refsToEvolve = [];
216
+ for (const [ref, counts] of feedbackByRef.entries()) {
217
+ if (crossesNegativeThreshold(counts, negativeThreshold))
218
+ refsToEvolve.push(ref);
219
+ }
220
+ refsToEvolve.sort();
221
+ // §7.4 leakage prevention (#267): instead of hard-skipping refs that
222
+ // overlap eval-slice gold refs, we now pass the gold-ref set through
223
+ // `--exclude-feedback-from` (and the matching env var) so `akm distill`
224
+ // filters those events out of its LLM input. The behaviour collapses
225
+ // back to "no useful feedback shown" for refs that ARE the gold ref —
226
+ // distill then runs from asset content only, which is what we want.
227
+ const evalGoldRefList = [...evalGoldRefs].sort();
228
+ const excludeFeedbackCsv = evalGoldRefList.join(",");
229
+ for (const ref of refsToEvolve) {
230
+ // The env var fallback is the contract `akm distill` honours; it lets
231
+ // the bench keep working even if a hypothetical caller invokes
232
+ // distill via a wrapper that mangles flags.
233
+ const evolveEnv = {
234
+ ...envForRef(ref),
235
+ AKM_BENCH_EXCLUDE_GOLD_REFS: excludeFeedbackCsv,
236
+ ...(excludeFeedbackCsv ? { AKM_DISTILL_EXCLUDE_FEEDBACK_FROM: excludeFeedbackCsv } : {}),
237
+ };
238
+ // Pass the eval-gold list explicitly via the CLI flag so the contract
239
+ // is observable in test logs (the env var is a fallback for harnesses
240
+ // that strip flags). Reflect doesn't accept this flag — it's a distill
241
+ // concern only.
242
+ const distillArgs = ["distill", ref];
243
+ if (excludeFeedbackCsv) {
244
+ distillArgs.push("--exclude-feedback-from", excludeFeedbackCsv);
245
+ }
246
+ const distillResult = await akmCli(distillArgs, phase1Cwd, evolveEnv);
247
+ if (distillResult.exitCode !== 0) {
248
+ warnings.push(`phase2: akm distill ${ref} failed: ${distillResult.stderr.trim()}`);
249
+ }
250
+ else if (evalGoldRefs.has(ref) && excludeFeedbackCsv) {
251
+ // Per-ref leakage info — replaces the previous "skipped" message.
252
+ // Operator can audit which refs ran through the filter and confirm
253
+ // distillation didn't see leaked feedback.
254
+ warnings.push(`phase2: filtered eval-slice gold-ref feedback from distill input for ${ref} (--exclude-feedback-from ${excludeFeedbackCsv}).`);
255
+ }
256
+ const reflectResult = await akmCli(["reflect", ref], phase1Cwd, evolveEnv);
257
+ if (reflectResult.exitCode !== 0) {
258
+ // `reflect` requires `agent.default` to be configured — a missing
259
+ // config is non-fatal for the bench; we record and continue.
260
+ warnings.push(`phase2: akm reflect ${ref} skipped/failed: ${reflectResult.stderr.trim()}`);
261
+ }
262
+ }
263
+ // Walk the proposal queue per fixture (each evolveStash has its own
264
+ // proposal log on disk). When we materialised stashes we iterate every
265
+ // fixture that produced proposals; in the common single-fixture case
266
+ // this is one pass.
267
+ const proposalFixtures = materialiseStash ? [...evolveDirByFixture.keys()] : [undefined];
268
+ for (const fixtureName of proposalFixtures) {
269
+ const proposalEnv = { ...process.env };
270
+ if (materialiseStash && fixtureName) {
271
+ const dir = evolveDirByFixture.get(fixtureName);
272
+ if (dir)
273
+ proposalEnv.AKM_STASH_DIR = dir;
274
+ }
275
+ else if (!materialiseStash) {
276
+ delete proposalEnv.AKM_STASH_DIR;
277
+ }
278
+ const listResult = await akmCli(["proposal", "list", "--json"], phase1Cwd, proposalEnv);
279
+ const proposals = parseProposalList(listResult.stdout);
280
+ for (const p of proposals) {
281
+ const showResult = await akmCli(["proposal", "show", p.id, "--json"], phase1Cwd, proposalEnv);
282
+ const lintInfo = parseProposalShow(showResult.stdout);
283
+ const lintPass = lintInfo.lintPass;
284
+ if (lintPass) {
285
+ const acceptResult = await akmCli(["proposal", "accept", p.id], phase1Cwd, proposalEnv);
286
+ proposalLog.push({
287
+ proposalId: p.id,
288
+ assetRef: p.assetRef,
289
+ kind: p.kind,
290
+ lintPass: true,
291
+ decision: acceptResult.exitCode === 0 ? "accept" : "reject",
292
+ ...(acceptResult.exitCode === 0 ? {} : { rejectReason: `accept failed: ${acceptResult.stderr.trim()}` }),
293
+ });
294
+ }
295
+ else {
296
+ const reason = lintInfo.lintMessage ?? "lint failed";
297
+ const rejectResult = await akmCli(["proposal", "reject", p.id, "--reason", `lint failed: ${reason}`], phase1Cwd, proposalEnv);
298
+ proposalLog.push({
299
+ proposalId: p.id,
300
+ assetRef: p.assetRef,
301
+ kind: p.kind,
302
+ lintPass: false,
303
+ decision: "reject",
304
+ rejectReason: reason,
305
+ });
306
+ if (rejectResult.exitCode !== 0) {
307
+ warnings.push(`phase2: akm proposal reject ${p.id} failed: ${rejectResult.stderr.trim()}`);
308
+ }
309
+ }
310
+ }
311
+ // Rebuild the index so accepted lessons surface in Phase 3.
312
+ const indexResult = await akmCli(["index"], phase1Cwd, proposalEnv);
313
+ if (indexResult.exitCode !== 0) {
314
+ warnings.push(`phase2: akm index rebuild failed: ${indexResult.stderr.trim()}`);
315
+ }
316
+ }
317
+ // ── Phase 3: re-evaluate (eval slice). ─────────────────────────────────
318
+ // pre arm: fresh snapshot of the starting fixture (no Phase 2 mutations
319
+ // applied). post arm: the mutated evolveStash so accepted lessons reach
320
+ // the eval slice. synthetic arm: no stash.
321
+ preReport = await runUtility({
322
+ tasks: evalTasks,
323
+ arms: ["akm"],
324
+ model: options.model,
325
+ seedsPerArm,
326
+ budgetTokens,
327
+ budgetWallMs,
328
+ slice: "eval",
329
+ ...(options.spawn ? { spawn: options.spawn } : {}),
330
+ materialiseStash,
331
+ ...(materialiseStash ? { stashDirByFixture: preDirByFixture } : {}),
332
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
333
+ ...(options.branch ? { branch: options.branch } : {}),
334
+ ...(options.commit ? { commit: options.commit } : {}),
335
+ });
336
+ postReport = await runUtility({
337
+ tasks: evalTasks,
338
+ arms: ["akm"],
339
+ model: options.model,
340
+ seedsPerArm,
341
+ budgetTokens,
342
+ budgetWallMs,
343
+ slice: "eval",
344
+ // Stamp arm metadata so spawn fakes can distinguish pre-vs-post via
345
+ // an env probe. We thread it via a fresh `spawn` wrapper when one
346
+ // was supplied.
347
+ materialiseStash,
348
+ ...(materialiseStash ? { stashDirByFixture: evolveDirByFixture } : {}),
349
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
350
+ ...(options.branch ? { branch: options.branch } : {}),
351
+ ...(options.commit ? { commit: options.commit } : {}),
352
+ ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "post") } : {}),
353
+ });
354
+ // synthetic: no stash. We pass a spawn wrapper that strips
355
+ // AKM_STASH_DIR and injects the "Bring Your Own Skills" tag so test
356
+ // fakes (and a future real harness) can branch. #267 — also forward a
357
+ // per-task scratchpad prompt via the runner's `buildPrompt` seam so the
358
+ // synthetic arm actually exercises the BYOS prompt path rather than
359
+ // relying on the noakm default.
360
+ syntheticReport = await runUtility({
361
+ tasks: evalTasks,
362
+ arms: ["akm"],
363
+ model: options.model,
364
+ seedsPerArm,
365
+ budgetTokens,
366
+ budgetWallMs,
367
+ slice: "eval",
368
+ materialiseStash: false,
369
+ buildPrompt: (task, _arm) => buildSyntheticPrompt(task.id),
370
+ ...(options.timestamp ? { timestamp: options.timestamp } : {}),
371
+ ...(options.branch ? { branch: options.branch } : {}),
372
+ ...(options.commit ? { commit: options.commit } : {}),
373
+ ...(options.spawn ? { spawn: wrapSpawnWithArm(options.spawn, "synthetic", undefined, true) } : {}),
374
+ });
375
+ }
376
+ finally {
377
+ // Deregister BEFORE running cleanup so a SIGINT during teardown
378
+ // doesn't double-fire the cleanup fns (per cleanup.ts contract).
379
+ for (const deregister of stashDeregistrations)
380
+ deregister();
381
+ for (const s of evolveStashes.values()) {
382
+ try {
383
+ s.cleanup();
384
+ }
385
+ catch {
386
+ /* swallow — best-effort tmp cleanup */
387
+ }
388
+ }
389
+ for (const s of preStashes.values()) {
390
+ try {
391
+ s.cleanup();
392
+ }
393
+ catch {
394
+ /* swallow — best-effort tmp cleanup */
395
+ }
396
+ }
397
+ }
398
+ // ── Compute aggregates. ──────────────────────────────────────────────────
399
+ const proposalsMetrics = computeProposalQualityMetrics(proposalLog);
400
+ const longitudinal = computeLongitudinalMetrics(preReport, postReport, syntheticReport);
401
+ const feedbackIntegrity = computeFeedbackIntegrity({ phase1: phase1Report, feedbackLog });
402
+ // #264 — lesson quality + reuse metrics. The runner doesn't (yet) read
403
+ // accepted lesson bodies off disk or load verifier source text; we pass
404
+ // empty maps so the leakage check defaults to "low" until the read seam
405
+ // lands. Reuse + negative-transfer attribution work today off the
406
+ // pre/post arm `assetsLoaded` stream.
407
+ const lessons = computeLessonMetrics({
408
+ proposalLog,
409
+ feedbackLog,
410
+ preRuns: preReport.akmRuns ?? [],
411
+ postRuns: postReport.akmRuns ?? [],
412
+ });
413
+ return {
414
+ timestamp: options.timestamp ?? new Date().toISOString(),
415
+ branch: options.branch ?? preReport.branch,
416
+ commit: options.commit ?? preReport.commit,
417
+ model: options.model,
418
+ domain,
419
+ seedsPerArm,
420
+ feedbackLog,
421
+ proposalLog,
422
+ proposals: proposalsMetrics,
423
+ lessons,
424
+ longitudinal,
425
+ feedbackIntegrity,
426
+ phase1: phase1Report,
427
+ arms: { pre: preReport, post: postReport, synthetic: syntheticReport },
428
+ warnings: [
429
+ ...warnings,
430
+ ...phase1Report.warnings,
431
+ ...preReport.warnings,
432
+ ...postReport.warnings,
433
+ ...syntheticReport.warnings,
434
+ ],
435
+ };
436
+ }
437
/**
 * Default `akm` subprocess runner.
 *
 * Synchronously executes `bun run <cli.ts> <args>` from `cwd`, where the CLI
 * entry point is resolved two directories above this module under `src/`.
 * Real runs use this invoker; tests inject a fake in its place.
 *
 * @param args CLI arguments appended after the entry script.
 * @param cwd  Working directory handed to the child process.
 * @param env  Environment overrides, layered on top of `process.env`.
 * @returns `{ exitCode, stdout, stderr }`; `exitCode` is `-1` when the
 *          process reports none, and missing output streams become `""`.
 */
async function defaultAkmCli(args, cwd, env) {
    const entryScript = path.resolve(__dirname, "..", "..", "src", "cli.ts");
    // Decode a captured stream, treating an absent buffer as empty output.
    const decodeStream = (bytes) => (bytes ? new TextDecoder().decode(bytes) : "");
    const child = Bun.spawnSync({
        cmd: ["bun", "run", entryScript, ...args],
        cwd,
        env: { ...process.env, ...env },
        stdout: "pipe",
        stderr: "pipe",
    });
    return {
        exitCode: child.exitCode ?? -1,
        stdout: decodeStream(child.stdout),
        stderr: decodeStream(child.stderr),
    };
}
454
/**
 * Decide whether an asset's feedback tally crosses the negative threshold.
 *
 * Two independent triggers, either of which is sufficient:
 *   1. the negative count is at least `threshold.absoluteCount`;
 *   2. the share of negatives among all feedback (positive + negative) is
 *      strictly greater than `threshold.ratio`.
 * An asset with no feedback at all never crosses via the ratio trigger.
 *
 * @param counts    `{ positive, negative }` feedback tallies.
 * @param threshold `{ absoluteCount, ratio }` limits.
 * @returns `true` when either trigger fires.
 */
function crossesNegativeThreshold(counts, threshold) {
    const { positive, negative } = counts;
    if (negative >= threshold.absoluteCount) {
        return true;
    }
    const totalFeedback = positive + negative;
    // Guard the division: zero feedback means there is no ratio to compare.
    return totalFeedback !== 0 && negative / totalFeedback > threshold.ratio;
}
468
/**
 * Best-effort train/eval partition for a task.
 *
 * An explicit, truthy `task.slice` always wins. Otherwise the slice is
 * derived from the task id: a 32-bit rolling hash (multiplier 31 over the
 * id's UTF-16 code units) is computed and its parity picks the slice —
 * even → "train", odd → "eval". The fallback is inlined here to avoid an
 * import cycle with the corpus module.
 *
 * NOTE(review): the previous comment described this as mirroring
 * corpus.effectiveSlice's "SHA-1 first byte parity"; the code below is not
 * SHA-1, so confirm the two partitions actually agree.
 */
function effectiveSlice(task) {
    const { slice: declaredSlice } = task;
    if (declaredSlice) {
        return declaredSlice;
    }
    // `| 0` keeps the accumulator in signed 32-bit range on every step.
    const idHash = task.id
        .split("")
        .reduce((acc, unit) => (acc * 31 + unit.charCodeAt(0)) | 0, 0);
    return Math.abs(idHash) % 2 !== 0 ? "eval" : "train";
}
479
/**
 * Collapse the tasks' `domain` values to a single label.
 *
 * @param tasks Array of task records, each optionally carrying `domain`.
 * @returns The shared domain when every task has the same one; `"all"` when
 *          the tasks span several domains, when there are no tasks, or when
 *          the single shared value is null/undefined.
 */
function uniqueDomain(tasks) {
    const domains = new Set(tasks.map(({ domain }) => domain));
    if (domains.size !== 1) {
        return "all";
    }
    const [onlyDomain] = domains;
    return onlyDomain ?? "all";
}
485
/**
 * Decorate a spawn function so each child process is tagged with the Phase 3
 * arm it belongs to.
 *
 * The returned function copies `opts.env` (never mutating the caller's
 * object) and then:
 *   - sets `BENCH_EVOLVE_ARM` to `arm`;
 *   - sets `BENCH_EVOLVE_SCRATCHPAD="1"` when `scratchpad` is truthy;
 *   - for the "synthetic" arm, removes `AKM_STASH_DIR` entirely (even if a
 *     `stashDir` was supplied); for any other arm, sets `AKM_STASH_DIR` to
 *     `stashDir` when one is given.
 * It then delegates to `inner` with the same command and the amended options.
 *
 * @param inner      Spawn implementation to delegate to.
 * @param arm        Arm label written to `BENCH_EVOLVE_ARM`.
 * @param stashDir   Optional stash directory to expose to the child.
 * @param scratchpad Whether to raise the scratchpad flag (default `false`).
 * @returns A spawn-compatible `(cmd, opts)` function.
 */
function wrapSpawnWithArm(inner, arm, stashDir, scratchpad = false) {
    return (cmd, opts) => {
        const childEnv = { ...(opts.env ?? {}), BENCH_EVOLVE_ARM: arm };
        if (scratchpad) {
            childEnv.BENCH_EVOLVE_SCRATCHPAD = "1";
        }
        if (arm === "synthetic") {
            // The synthetic arm must never see a stash, whatever was inherited.
            delete childEnv.AKM_STASH_DIR;
        }
        else if (stashDir) {
            childEnv.AKM_STASH_DIR = stashDir;
        }
        return inner(cmd, { ...opts, env: childEnv });
    };
}
505
/**
 * Tolerant parser for `akm proposal list --json` stdout.
 *
 * Accepts either a bare JSON array of proposals or an object with a
 * `proposals` array. Anything else — non-string input, blank output, invalid
 * JSON, or a JSON value of another shape (including the literal `null`) —
 * yields an empty list rather than throwing, so a malformed CLI response
 * cannot abort the caller.
 *
 * Each usable entry must have a string `id` and a non-empty asset ref, taken
 * from the first string found among `target_ref`, `targetRef`, `ref`.
 * The kind comes from `kind` (or `source` as a fallback) and is normalised:
 * "lesson"/"distill" → "lesson", "revision"/"reflect" → "revision",
 * everything else → "unknown".
 *
 * @param stdout Raw stdout captured from the CLI.
 * @returns Array of `{ id, assetRef, kind }` records.
 */
function parseProposalList(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return [];
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch {
        return [];
    }
    // `parsed` may legitimately be null or a primitive (e.g. stdout "null");
    // optional chaining keeps the shape probe from throwing on those.
    const arr = Array.isArray(parsed)
        ? parsed
        : Array.isArray(parsed?.proposals)
            ? parsed.proposals
            : [];
    const out = [];
    for (const item of arr) {
        if (!item || typeof item !== "object")
            continue;
        const rec = item;
        const id = typeof rec.id === "string" ? rec.id : null;
        const assetRef = [rec.target_ref, rec.targetRef, rec.ref].find((v) => typeof v === "string") ?? null;
        const kindRaw = typeof rec.kind === "string" ? rec.kind : typeof rec.source === "string" ? rec.source : "unknown";
        let kind = "unknown";
        if (kindRaw === "lesson" || kindRaw === "distill")
            kind = "lesson";
        else if (kindRaw === "revision" || kindRaw === "reflect")
            kind = "revision";
        // Entries without an id or ref cannot be acted on downstream; drop them.
        if (!id || !assetRef)
            continue;
        out.push({ id, assetRef, kind });
    }
    return out;
}
546
/**
 * Extract the lint verdict from `akm proposal show <id> --json` stdout.
 *
 * Never throws: missing/blank output, invalid JSON, and JSON that is not an
 * object (including the literal `null`) all come back as a failed lint with
 * a diagnostic `lintMessage`, so the caller can reject the proposal instead
 * of crashing.
 *
 * A proposal passes lint when any of `lint_pass`, `lintPass`, or `lint.pass`
 * is exactly `true`. When `lint.issues` is a non-empty array, its entries are
 * joined with "; " into `lintMessage` (strings verbatim, objects via their
 * `message` property, anything else JSON-encoded).
 *
 * @param stdout Raw stdout captured from the CLI.
 * @returns `{ lintPass }`, plus `lintMessage` when one is available.
 */
function parseProposalShow(stdout) {
    if (typeof stdout !== "string" || !stdout.trim())
        return { lintPass: false, lintMessage: "empty proposal show output" };
    let parsed;
    try {
        parsed = JSON.parse(stdout);
    }
    catch (err) {
        return { lintPass: false, lintMessage: `proposal show: parse error (${err.message})` };
    }
    // JSON.parse can return null or a primitive; property access on null
    // would throw, so treat any non-object payload as an explicit failure.
    if (parsed === null || typeof parsed !== "object")
        return { lintPass: false, lintMessage: "proposal show: unexpected non-object JSON payload" };
    const lint = typeof parsed.lint === "object" && parsed.lint !== null ? parsed.lint : undefined;
    const lintPass = parsed.lint_pass === true || parsed.lintPass === true || lint?.pass === true;
    let lintMessage;
    const issues = lint?.issues;
    if (Array.isArray(issues) && issues.length > 0) {
        lintMessage = issues
            .map((i) => (typeof i === "string" ? i : (i?.message ?? JSON.stringify(i))))
            .join("; ");
    }
    return { lintPass, ...(lintMessage ? { lintMessage } : {}) };
}
571
/**
 * Build the prompt used for the synthetic ("Bring Your Own Skills") arm.
 *
 * The prompt is five newline-separated lines: a task header, the arm tag,
 * and three lines of instructions asking the agent to write its own
 * scratchpad because no akm stash is available. Exported so tests can assert
 * the exact prompt text.
 *
 * @param taskId Identifier interpolated into the `Task:` header line.
 * @returns The complete prompt string.
 */
export function buildSyntheticPrompt(taskId) {
    const header = `Task: ${taskId}`;
    const armTag = "Arm: synthetic (Bring Your Own Skills)";
    const instructions = "No akm stash is available. Before solving the task, write a short scratchpad of the skills\n" +
        "and steps you intend to use, then proceed. Cite the scratchpad in your trace so the verifier\n" +
        "can attribute the approach to your own reasoning rather than retrieved guidance.";
    return `${header}\n${armTag}\n${instructions}`;
}