akm-cli 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/dist/{src/cli.js → cli.js} +22 -8
  3. package/dist/{src/commands → commands}/installed-stashes.js +1 -1
  4. package/dist/{src/commands → commands}/source-add.js +1 -1
  5. package/dist/{src/core → core}/common.js +16 -1
  6. package/dist/{src/core → core}/config.js +5 -2
  7. package/dist/{src/indexer → indexer}/db-search.js +16 -1
  8. package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
  9. package/dist/{src/indexer → indexer}/indexer.js +27 -11
  10. package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
  11. package/dist/{src/indexer → indexer}/search-source.js +1 -1
  12. package/dist/{src/llm → llm}/client.js +61 -1
  13. package/dist/{src/llm → llm}/embedder.js +8 -5
  14. package/dist/{src/llm → llm}/embedders/local.js +8 -2
  15. package/dist/{src/llm → llm}/embedders/remote.js +4 -2
  16. package/dist/{src/llm → llm}/graph-extract.js +4 -4
  17. package/dist/llm/memory-infer.js +114 -0
  18. package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
  19. package/dist/{src/output → output}/cli-hints.js +2 -0
  20. package/dist/{src/setup → setup}/setup.js +30 -20
  21. package/dist/sources/providers/website.js +27 -0
  22. package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
  23. package/docs/README.md +7 -0
  24. package/docs/migration/release-notes/0.7.0.md +14 -0
  25. package/package.json +11 -8
  26. package/dist/src/llm/memory-infer.js +0 -86
  27. package/dist/tests/add-website-source.test.js +0 -119
  28. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  29. package/dist/tests/agent/agent-config.test.js +0 -221
  30. package/dist/tests/agent/agent-detect.test.js +0 -100
  31. package/dist/tests/agent/agent-spawn.test.js +0 -234
  32. package/dist/tests/agent-output.test.js +0 -186
  33. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  34. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  35. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  36. package/dist/tests/asset-ref.test.js +0 -192
  37. package/dist/tests/asset-registry.test.js +0 -103
  38. package/dist/tests/asset-spec.test.js +0 -241
  39. package/dist/tests/bench/attribution.test.js +0 -996
  40. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  41. package/dist/tests/bench/cleanup.js +0 -234
  42. package/dist/tests/bench/cleanup.test.js +0 -166
  43. package/dist/tests/bench/cli.js +0 -1018
  44. package/dist/tests/bench/cli.test.js +0 -445
  45. package/dist/tests/bench/compare.test.js +0 -556
  46. package/dist/tests/bench/corpus.js +0 -317
  47. package/dist/tests/bench/corpus.test.js +0 -258
  48. package/dist/tests/bench/doctor.js +0 -525
  49. package/dist/tests/bench/driver.js +0 -401
  50. package/dist/tests/bench/driver.test.js +0 -584
  51. package/dist/tests/bench/environment.js +0 -233
  52. package/dist/tests/bench/environment.test.js +0 -199
  53. package/dist/tests/bench/evolve-metrics.js +0 -179
  54. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  55. package/dist/tests/bench/evolve.js +0 -647
  56. package/dist/tests/bench/evolve.test.js +0 -624
  57. package/dist/tests/bench/failure-modes.test.js +0 -349
  58. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  59. package/dist/tests/bench/leakage.test.js +0 -228
  60. package/dist/tests/bench/learning-curve.test.js +0 -134
  61. package/dist/tests/bench/metrics.js +0 -2395
  62. package/dist/tests/bench/metrics.test.js +0 -1150
  63. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  64. package/dist/tests/bench/opencode-config.js +0 -194
  65. package/dist/tests/bench/opencode-config.test.js +0 -370
  66. package/dist/tests/bench/report.js +0 -1885
  67. package/dist/tests/bench/report.test.js +0 -1038
  68. package/dist/tests/bench/run-config.js +0 -355
  69. package/dist/tests/bench/run-config.test.js +0 -298
  70. package/dist/tests/bench/run-curate-test.js +0 -32
  71. package/dist/tests/bench/run-failing-tasks.js +0 -56
  72. package/dist/tests/bench/run-full-bench.js +0 -51
  73. package/dist/tests/bench/run-items36-targeted.js +0 -69
  74. package/dist/tests/bench/run-nano-quick.js +0 -42
  75. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  76. package/dist/tests/bench/runner.js +0 -699
  77. package/dist/tests/bench/runner.test.js +0 -958
  78. package/dist/tests/bench/search-bridge.test.js +0 -331
  79. package/dist/tests/bench/tmp.js +0 -131
  80. package/dist/tests/bench/trajectory.js +0 -116
  81. package/dist/tests/bench/trajectory.test.js +0 -127
  82. package/dist/tests/bench/verifier.js +0 -114
  83. package/dist/tests/bench/verifier.test.js +0 -118
  84. package/dist/tests/bench/workflow-evaluator.js +0 -557
  85. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  86. package/dist/tests/bench/workflow-spec.js +0 -345
  87. package/dist/tests/bench/workflow-spec.test.js +0 -363
  88. package/dist/tests/bench/workflow-trace.js +0 -472
  89. package/dist/tests/bench/workflow-trace.test.js +0 -254
  90. package/dist/tests/benchmark-search-quality.js +0 -536
  91. package/dist/tests/benchmark-suite.js +0 -1441
  92. package/dist/tests/capture-cli.test.js +0 -112
  93. package/dist/tests/cli-errors.test.js +0 -204
  94. package/dist/tests/commands/events.test.js +0 -370
  95. package/dist/tests/commands/history.test.js +0 -418
  96. package/dist/tests/commands/import.test.js +0 -103
  97. package/dist/tests/commands/proposal-cli.test.js +0 -209
  98. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  99. package/dist/tests/commands/remember.test.js +0 -97
  100. package/dist/tests/commands/scope-flags.test.js +0 -300
  101. package/dist/tests/commands/search.test.js +0 -537
  102. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  103. package/dist/tests/commands/show.test.js +0 -294
  104. package/dist/tests/common.test.js +0 -266
  105. package/dist/tests/completions.test.js +0 -142
  106. package/dist/tests/config-cli.test.js +0 -193
  107. package/dist/tests/config-llm-features.test.js +0 -139
  108. package/dist/tests/config.test.js +0 -569
  109. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  110. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  111. package/dist/tests/contracts/spec-helpers.js +0 -46
  112. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  113. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  114. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  115. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  116. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  117. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  118. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  119. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  120. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  121. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  122. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  123. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  124. package/dist/tests/core/write-source.test.js +0 -366
  125. package/dist/tests/curate-command.test.js +0 -87
  126. package/dist/tests/db-scoring.test.js +0 -201
  127. package/dist/tests/db.test.js +0 -654
  128. package/dist/tests/distill-cli-flag.test.js +0 -208
  129. package/dist/tests/distill.test.js +0 -515
  130. package/dist/tests/docker-install.test.js +0 -120
  131. package/dist/tests/e2e.test.js +0 -1419
  132. package/dist/tests/embedder.test.js +0 -340
  133. package/dist/tests/embedding-model-config.test.js +0 -379
  134. package/dist/tests/feedback-command.test.js +0 -172
  135. package/dist/tests/file-context.test.js +0 -552
  136. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  137. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  138. package/dist/tests/fixtures/stashes/load.js +0 -166
  139. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  140. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  141. package/dist/tests/frontmatter.test.js +0 -190
  142. package/dist/tests/fts-field-weighting.test.js +0 -254
  143. package/dist/tests/fuzzy-search.test.js +0 -230
  144. package/dist/tests/git-provider-clone.test.js +0 -45
  145. package/dist/tests/github.test.js +0 -161
  146. package/dist/tests/graph-boost-ranking.test.js +0 -305
  147. package/dist/tests/graph-extraction.test.js +0 -282
  148. package/dist/tests/helpers/usage-events.js +0 -8
  149. package/dist/tests/index-pass-llm.test.js +0 -161
  150. package/dist/tests/indexer.test.js +0 -570
  151. package/dist/tests/info-command.test.js +0 -166
  152. package/dist/tests/init.test.js +0 -69
  153. package/dist/tests/install-script.test.js +0 -246
  154. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  155. package/dist/tests/issue-36-repro.test.js +0 -304
  156. package/dist/tests/issues-191-194.test.js +0 -160
  157. package/dist/tests/lesson-lint.test.js +0 -111
  158. package/dist/tests/llm-client.test.js +0 -115
  159. package/dist/tests/llm-feature-gate.test.js +0 -151
  160. package/dist/tests/llm.test.js +0 -139
  161. package/dist/tests/lockfile.test.js +0 -216
  162. package/dist/tests/manifest.test.js +0 -205
  163. package/dist/tests/markdown.test.js +0 -126
  164. package/dist/tests/matchers-unit.test.js +0 -189
  165. package/dist/tests/memory-inference.test.js +0 -299
  166. package/dist/tests/merge-scoring.test.js +0 -136
  167. package/dist/tests/metadata.test.js +0 -313
  168. package/dist/tests/migration-help.test.js +0 -89
  169. package/dist/tests/origin-resolve.test.js +0 -124
  170. package/dist/tests/output-baseline.test.js +0 -218
  171. package/dist/tests/output-shapes-unit.test.js +0 -478
  172. package/dist/tests/parallel-search.test.js +0 -272
  173. package/dist/tests/parameter-metadata.test.js +0 -365
  174. package/dist/tests/paths.test.js +0 -177
  175. package/dist/tests/progressive-disclosure.test.js +0 -280
  176. package/dist/tests/proposals.test.js +0 -279
  177. package/dist/tests/proposed-quality.test.js +0 -271
  178. package/dist/tests/provider-registry.test.js +0 -32
  179. package/dist/tests/ranking-regression.test.js +0 -548
  180. package/dist/tests/reflect-propose.test.js +0 -455
  181. package/dist/tests/registry-build-index.test.js +0 -394
  182. package/dist/tests/registry-cli.test.js +0 -290
  183. package/dist/tests/registry-index-v2.test.js +0 -430
  184. package/dist/tests/registry-install.test.js +0 -728
  185. package/dist/tests/registry-providers/parity.test.js +0 -189
  186. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  187. package/dist/tests/registry-providers/static-index.test.js +0 -238
  188. package/dist/tests/registry-resolve.test.js +0 -126
  189. package/dist/tests/registry-search.test.js +0 -923
  190. package/dist/tests/remember-frontmatter.test.js +0 -378
  191. package/dist/tests/remember-unit.test.js +0 -123
  192. package/dist/tests/ripgrep-install.test.js +0 -251
  193. package/dist/tests/ripgrep-resolve.test.js +0 -108
  194. package/dist/tests/ripgrep.test.js +0 -163
  195. package/dist/tests/save-command.test.js +0 -94
  196. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  197. package/dist/tests/scoring-pipeline.test.js +0 -648
  198. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  199. package/dist/tests/self-update.test.js +0 -442
  200. package/dist/tests/semantic-search-e2e.test.js +0 -512
  201. package/dist/tests/semantic-status.test.js +0 -471
  202. package/dist/tests/setup-run.integration.js +0 -877
  203. package/dist/tests/setup-wizard.test.js +0 -198
  204. package/dist/tests/setup.test.js +0 -131
  205. package/dist/tests/source-add.test.js +0 -11
  206. package/dist/tests/source-clone.test.js +0 -254
  207. package/dist/tests/source-manage.test.js +0 -366
  208. package/dist/tests/source-providers/filesystem.test.js +0 -82
  209. package/dist/tests/source-providers/git.test.js +0 -252
  210. package/dist/tests/source-providers/website.test.js +0 -128
  211. package/dist/tests/source-qa-fixes.test.js +0 -286
  212. package/dist/tests/source-registry.test.js +0 -350
  213. package/dist/tests/source-resolve.test.js +0 -100
  214. package/dist/tests/source-source.test.js +0 -281
  215. package/dist/tests/source.test.js +0 -533
  216. package/dist/tests/tar-utils-scan.test.js +0 -73
  217. package/dist/tests/toggle-components.test.js +0 -73
  218. package/dist/tests/usage-telemetry.test.js +0 -265
  219. package/dist/tests/utility-scoring.test.js +0 -558
  220. package/dist/tests/vault-load-error.test.js +0 -78
  221. package/dist/tests/vault-qa-fixes.test.js +0 -194
  222. package/dist/tests/vault.test.js +0 -429
  223. package/dist/tests/vector-search.test.js +0 -608
  224. package/dist/tests/walker.test.js +0 -252
  225. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  226. package/dist/tests/wave2-cluster-d.test.js +0 -180
  227. package/dist/tests/wave2-cluster-e.test.js +0 -179
  228. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  229. package/dist/tests/wiki.test.js +0 -529
  230. package/dist/tests/workflow-cli.test.js +0 -271
  231. package/dist/tests/workflow-markdown.test.js +0 -171
  232. package/dist/tests/workflow-path-escape.test.js +0 -132
  233. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  234. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  235. /package/dist/{src/commands → commands}/completions.js +0 -0
  236. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  237. /package/dist/{src/commands → commands}/curate.js +0 -0
  238. /package/dist/{src/commands → commands}/distill.js +0 -0
  239. /package/dist/{src/commands → commands}/events.js +0 -0
  240. /package/dist/{src/commands → commands}/history.js +0 -0
  241. /package/dist/{src/commands → commands}/info.js +0 -0
  242. /package/dist/{src/commands → commands}/init.js +0 -0
  243. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  244. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  245. /package/dist/{src/commands → commands}/proposal.js +0 -0
  246. /package/dist/{src/commands → commands}/propose.js +0 -0
  247. /package/dist/{src/commands → commands}/reflect.js +0 -0
  248. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  249. /package/dist/{src/commands → commands}/remember.js +0 -0
  250. /package/dist/{src/commands → commands}/search.js +0 -0
  251. /package/dist/{src/commands → commands}/self-update.js +0 -0
  252. /package/dist/{src/commands → commands}/show.js +0 -0
  253. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  254. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  255. /package/dist/{src/commands → commands}/vault.js +0 -0
  256. /package/dist/{src/core → core}/asset-ref.js +0 -0
  257. /package/dist/{src/core → core}/asset-registry.js +0 -0
  258. /package/dist/{src/core → core}/asset-spec.js +0 -0
  259. /package/dist/{src/core → core}/errors.js +0 -0
  260. /package/dist/{src/core → core}/events.js +0 -0
  261. /package/dist/{src/core → core}/frontmatter.js +0 -0
  262. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  263. /package/dist/{src/core → core}/markdown.js +0 -0
  264. /package/dist/{src/core → core}/paths.js +0 -0
  265. /package/dist/{src/core → core}/proposals.js +0 -0
  266. /package/dist/{src/core → core}/warn.js +0 -0
  267. /package/dist/{src/core → core}/write-source.js +0 -0
  268. /package/dist/{src/indexer → indexer}/db.js +0 -0
  269. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  270. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  271. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  272. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  273. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  274. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  275. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  276. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  277. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  278. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  279. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  280. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  281. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  282. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  283. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  284. /package/dist/{src/integrations → integrations}/github.js +0 -0
  285. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  286. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  287. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  288. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  289. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  290. /package/dist/{src/output → output}/context.js +0 -0
  291. /package/dist/{src/output → output}/renderers.js +0 -0
  292. /package/dist/{src/output → output}/shapes.js +0 -0
  293. /package/dist/{src/output → output}/text.js +0 -0
  294. /package/dist/{src/registry → registry}/build-index.js +0 -0
  295. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  296. /package/dist/{src/registry → registry}/factory.js +0 -0
  297. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  298. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  299. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  300. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  301. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  302. /package/dist/{src/registry → registry}/resolve.js +0 -0
  303. /package/dist/{src/registry → registry}/types.js +0 -0
  304. /package/dist/{src/setup → setup}/detect.js +0 -0
  305. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  306. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  307. /package/dist/{src/setup → setup}/steps.js +0 -0
  308. /package/dist/{src/sources → sources}/include.js +0 -0
  309. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  310. /package/dist/{src/sources → sources}/provider.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  314. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  315. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  316. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  317. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  318. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  319. /package/dist/{src/sources → sources}/resolve.js +0 -0
  320. /package/dist/{src/sources → sources}/types.js +0 -0
  321. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  322. /package/dist/{src/version.js → version.js} +0 -0
  323. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  324. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  325. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  326. /package/dist/{src/workflows → workflows}/db.js +0 -0
  327. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  328. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  329. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  330. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  331. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  332. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,699 +0,0 @@
1
- /**
2
- * akm-bench K-seed runner (spec §5 + §6).
3
- *
4
- * `runUtility(options)` is the single entry point used by both the CLI
5
- * dispatcher (`tests/bench/cli.ts utility`) and unit tests. It expands the
6
- * caller's `(tasks × arms × seeds)` cartesian product, calls `runOne` for
7
- * each triple, splices the trajectory record back in, and returns a
8
- * `UtilityRunReport` that `renderUtilityReport` can stamp into JSON +
9
- * markdown.
10
- *
11
- * Per-(arm, seed) isolation:
12
- * • Workspace: each (task, arm, seed) gets a fresh tmp dir seeded from the
13
- * task's `workspace/` template so runs cannot pollute each other.
14
- * • Stash: only the `akm` arm materialises a stash via `loadFixtureStash`.
15
- * We materialise once per task (the stash content is identical across
16
- * the K seeds) and reuse it.
17
- *
18
- * Cleanup: every tmp resource is wrapped in `try/finally`. We never leak
19
- * tmp dirs even on harness exceptions.
20
- */
21
- import { createHash } from "node:crypto";
22
- import fs from "node:fs";
23
- import path from "node:path";
24
- import { warn } from "../../src/core/warn";
25
- import { computeFixtureContentHash, loadFixtureStash } from "../fixtures/stashes/load";
26
- import { registerCleanup } from "./cleanup";
27
- import { computeTaskCorpusHash, readTaskBody } from "./corpus";
28
- import { runOne } from "./driver";
29
- import { validateFixtureCorpus } from "./environment";
30
- import { aggregateCorpus, aggregateFailureModes, aggregatePerTask, aggregateTrajectory, classifyFailureMode, computeCorpusDelta, computePerAssetAttribution, computePerTaskDelta, computeSearchBridge, extractAssetLoads, extractGoldRanks, } from "./metrics";
31
- import { resolveGitBranch, resolveGitCommit } from "./report";
32
- import { benchMkdtemp, benchTmpRoot } from "./tmp";
33
- import { computeTrajectory } from "./trajectory";
34
- import { evaluateRunAgainstAllSpecs, } from "./workflow-evaluator";
35
- import { loadAllWorkflowSpecs } from "./workflow-spec";
36
- import { normalizeRunToTrace } from "./workflow-trace";
37
/** Number of completed runs between successive partial-checkpoint writes. */
const CHECKPOINT_INTERVAL = 5;
/** Age, in milliseconds, beyond which a partial checkpoint file is reaped (24 hours). */
const PARTIAL_MAX_AGE_MS = 1000 * 60 * 60 * 24;
41
/**
 * Write a single heartbeat line to stderr for a finished (task, arm, seed) run.
 *
 * The line is written directly to `process.stderr`, so it does not depend on
 * whatever is being printed to stdout.
 *
 * Line shape: `[<completed>/<total>] <taskId> <arm> <outcome> <seconds>s`,
 * where `<seconds>` is `run.wallclockMs / 1000` rounded to the nearest integer.
 *
 * @param {number} completed - Count of runs finished so far.
 * @param {number} total - Total number of runs planned.
 * @param {object} run - The finished run; `taskId`, `arm`, `outcome` and
 *   `wallclockMs` are read from it.
 */
function emitProgress(completed, total, run) {
    const { wallclockMs, taskId, arm, outcome } = run;
    const elapsedSeconds = Math.round(wallclockMs / 1000);
    const line = `[${completed}/${total}] ${taskId} ${arm} ${outcome} ${elapsedSeconds}s\n`;
    process.stderr.write(line);
}
52
/**
 * Best-effort snapshot of the runs completed so far.
 *
 * Serialises an envelope (`partial: true`, a `summary.total_runs_completed`
 * counter, the timestamp, and a trimmed record per run) to
 * `bench-partial-<timestamp>.json` inside the directory returned by
 * `benchTmpRoot()`. Every `:` and `.` in the timestamp is replaced with `-`
 * to build the file name. This function never deletes older partial files.
 *
 * @param {Array<object>} runs - Completed runs; `taskId`, `arm`, `seed`,
 *   `model`, `outcome` and `wallclockMs` are copied from each.
 * @param {string} timestamp - Run timestamp, used in the file name and body.
 * @returns {void}
 */
function writePartialCheckpoint(runs, timestamp) {
    // Project a run onto the snake_case record stored in the checkpoint.
    const toRecord = (run) => ({
        task_id: run.taskId,
        arm: run.arm,
        seed: run.seed,
        model: run.model,
        outcome: run.outcome,
        wallclock_ms: run.wallclockMs,
    });
    try {
        const directory = benchTmpRoot();
        const safeStamp = timestamp.replace(/[:.]/g, "-");
        const target = path.join(directory, `bench-partial-${safeStamp}.json`);
        const summary = { total_runs_completed: runs.length };
        const payload = JSON.stringify({ partial: true, summary, timestamp, runs: runs.map(toRecord) }, null, 2);
        fs.writeFileSync(target, payload, "utf8");
    }
    catch {
        // Deliberately ignored: a failed checkpoint write must never abort a run.
    }
}
84
/**
 * Delete stale partial checkpoint files from the directory returned by
 * `benchTmpRoot()`.
 *
 * Only entries whose name starts with `bench-partial-` are considered. One is
 * unlinked when its mtime is more than `PARTIAL_MAX_AGE_MS` older than the
 * time captured at the start of the sweep. The sweep is best-effort: a
 * failure on one file does not stop the others, and a failure to list the
 * directory is ignored.
 *
 * @returns {void}
 */
function cleanupOldPartials() {
    // Stat and (if stale) unlink a single file, ignoring any per-file error.
    const reapIfStale = (filePath, referenceTime) => {
        try {
            const { mtimeMs } = fs.statSync(filePath);
            if (referenceTime - mtimeMs > PARTIAL_MAX_AGE_MS) {
                fs.unlinkSync(filePath);
            }
        }
        catch {
            /* ignore per-file errors */
        }
    };
    try {
        const directory = benchTmpRoot();
        const sweepStartedAt = Date.now();
        const partialNames = fs.readdirSync(directory).filter((name) => name.startsWith("bench-partial-"));
        for (const name of partialNames) {
            reapIfStale(path.join(directory, name), sweepStartedAt);
        }
    }
    catch {
        /* ignore — cleanup is best-effort */
    }
}
112
/** Path segments, relative to this module's directory, of the bundled workflow specs. */
const DEFAULT_WORKFLOWS_SEGMENTS = ["..", "fixtures", "bench", "workflows"];
/**
 * Fallback workflows directory: `fixtures/bench/workflows` under the parent
 * of this module's directory. `runUtility` uses it only when the caller does
 * not provide `options.workflowsDir`.
 */
const DEFAULT_WORKFLOWS_DIR = path.resolve(__dirname, ...DEFAULT_WORKFLOWS_SEGMENTS);
119
/**
 * Run `fn` over `items` in sequential batches of at most `n` concurrent calls.
 *
 * Batches execute one after another; within a batch every call is started
 * together and awaited with `Promise.all`. This gives bounded concurrency
 * without a full work queue.
 *
 * @param {Array} items - Work items, processed in order of batch.
 * @param {number} n - Maximum batch size. Any value that is not `>= 1`
 *   (0, negatives, `NaN`, `undefined`) falls back to 1, i.e. sequential
 *   execution; fractional values are floored.
 * @param {Function} fn - Called for each item. It is handed to
 *   `Array.prototype.map` on the batch, so it also receives the index within
 *   the batch and the batch array.
 * @returns {Promise<void>} Resolves once every batch has settled
 *   successfully. Rejects with the first rejection of a batch, in which case
 *   later batches are not started.
 */
async function runInBatches(items, n, fn) {
    // The stride must be a positive integer: a stride of 0 would never
    // advance (infinite loop), and a NaN stride would produce one empty batch
    // and then end the loop, silently skipping every item.
    const batchSize = n >= 1 ? Math.floor(n) : 1;
    for (let start = 0; start < items.length; start += batchSize) {
        await Promise.all(items.slice(start, start + batchSize).map(fn));
    }
}
129
- /**
130
- * Run K seeds × len(arms) × len(tasks) and return the §13.3 report.
131
- *
132
- * The function is robust to per-run failures — `runOne` already captures
133
- * every failure path into a RunResult, so the runner only has to worry
134
- * about its own infrastructure (stash materialisation, workspace copy).
135
- * Those failures are recorded as `harness_error` runs.
136
- *
137
- * When `options.parallel > 1`, work items are batched and run concurrently
138
- * via `runInBatches`. The shared `warnings`, `goldRankRecords`, and
139
- * `workflowChecks` arrays are updated atomically at the end of each item so
140
- * no JS-level races occur (Node/Bun is single-threaded).
141
- */
142
- export async function runUtility(options) {
143
- const seedsPerArm = options.seedsPerArm ?? 5;
144
- const budgetTokens = options.budgetTokens ?? 30000;
145
- const budgetWallMs = options.budgetWallMs ?? 120000;
146
- const slice = options.slice ?? "all";
147
- const materialiseStash = options.materialiseStash ?? true;
148
- // Clamp parallel to [1, 8].
149
- const parallel = Math.min(8, Math.max(1, options.parallel ?? 1));
150
- if (parallel > 4 && !options.forceParallel) {
151
- process.stderr.write(`bench: --parallel ${parallel} exceeds 4; high concurrency may overwhelm local providers. ` +
152
- `Pass --force-parallel to suppress this warning.\n`);
153
- }
154
- // Clean up orphaned partial files from prior crashed runs (best-effort).
155
- cleanupOldPartials();
156
- const grouped = new Map();
157
- const warnings = [];
158
- // Validate all task stash references before starting any work. Missing
159
- // fixtures produce harness_error at run time; better to surface them loudly
160
- // at startup with the fixture name than to discover them per-seed mid-run.
161
- if (materialiseStash && options.arms.includes("akm")) {
162
- const { missing } = validateFixtureCorpus(options.tasks);
163
- for (const [fixture, taskIds] of missing) {
164
- const w = `fixture "${fixture}" missing MANIFEST.json — tasks will harness_error: ${taskIds.join(", ")}`;
165
- process.stderr.write(`bench: WARNING: ${w}\n`);
166
- warnings.push(w);
167
- }
168
- }
169
- const goldRankRecords = [];
170
- // Progress tracking: compute total run count upfront so progress lines show
171
- // `[7/40]` rather than an unbounded counter.
172
- const armsForProgress = options.includeSynthetic
173
- ? [...new Set([...options.arms, "synthetic"])]
174
- : options.arms;
175
- const totalRuns = options.tasks.length * armsForProgress.length * seedsPerArm;
176
- let completedRuns = 0;
177
- // Partial checkpoint accumulator: collects all RunResults as they land so
178
- // we can write a partial envelope periodically without keeping duplicates.
179
- const allCompletedRuns = [];
180
- const runTimestamp = options.timestamp ?? new Date().toISOString();
181
- // #257: load workflow specs ONCE per runUtility call. Skipped when the
182
- // caller passes an empty `workflowsDir` string (test escape hatch). Errors
183
- // are surfaced as warnings — workflow evaluation is best-effort and a
184
- // missing/malformed spec must not abort the whole bench run.
185
- const workflowSpecs = [];
186
- const workflowsDir = options.workflowsDir ?? DEFAULT_WORKFLOWS_DIR;
187
- if (workflowsDir.length > 0) {
188
- try {
189
- const loaded = loadAllWorkflowSpecs(workflowsDir);
190
- workflowSpecs.push(...loaded);
191
- }
192
- catch (err) {
193
- const msg = err instanceof Error ? err.message : String(err);
194
- warnings.push(`workflow specs: failed to load from "${workflowsDir}": ${msg}`);
195
- warn(`[runUtility] workflow specs unavailable: ${msg}`);
196
- }
197
- }
198
- const workflowChecks = [];
199
- for (const task of options.tasks) {
200
- const taskRuns = new Map();
201
- grouped.set(task.id, taskRuns);
202
- // Resolve a caller-supplied stash override before materialising. When
203
- // `stashDirByFixture` provides a directory for this task's fixture, we
204
- // skip `loadFixtureStash` entirely and forward the override.
205
- const overrideStashDir = options.stashDirByFixture?.get(task.stash);
206
- // Materialise the akm-arm stash once per task. We share it across the K
207
- // seeds because the stash content is identical and re-running `akm
208
- // index` for every seed is wasted work.
209
- let stash;
210
- let stashError;
211
- if (options.arms.includes("akm") && materialiseStash && !overrideStashDir) {
212
- try {
213
- stash = loadFixtureStash(task.stash);
214
- }
215
- catch (err) {
216
- stashError = err instanceof Error ? err.message : String(err);
217
- warnings.push(`task ${task.id}: stash "${task.stash}" failed to load: ${stashError}`);
218
- }
219
- }
220
- // SIGINT/SIGTERM trap (#267): register the per-task stash cleanup so an
221
- // external signal mid-run reaps the tmp dir we just created.
222
- const stashSnapshot = stash;
223
- const deregisterStash = stashSnapshot
224
- ? registerCleanup(() => {
225
- try {
226
- stashSnapshot.cleanup();
227
- }
228
- catch {
229
- /* swallow */
230
- }
231
- })
232
- : () => { };
233
- // #261: when `includeSynthetic` is set, splice the synthetic arm into the
234
- // per-task arm iteration alongside whatever the caller asked for. We
235
- // dedupe so a caller that already passes `synthetic` in `arms` does not
236
- // see it run twice. Pre-#261 callers (no flag, no `synthetic` in arms)
237
- // see the old loop verbatim — that's the byte-identical default contract.
238
- const armsForTask = (() => {
239
- if (!options.includeSynthetic)
240
- return options.arms;
241
- if (options.arms.includes("synthetic"))
242
- return options.arms;
243
- return [...options.arms, "synthetic"];
244
- })();
245
- const workItems = [];
246
- for (const arm of armsForTask) {
247
- taskRuns.set(arm, []);
248
- for (let seed = 0; seed < seedsPerArm; seed += 1) {
249
- workItems.push({ arm, seed });
250
- }
251
- }
252
- // Per-run worker: resolves stash/prompt, executes runOneIsolated, then
253
- // splices the result into the shared accumulators. Because Bun/Node is
254
- // single-threaded these splices are race-free even across concurrent
255
- // awaits — only one microtask runs at a time between yield points.
256
- const runItem = async ({ arm, seed }) => {
257
- // Resolve the stashDir we'll forward to the agent. The akm arm
258
- // always carries a stashDir so AKM_STASH_DIR is set in the child
259
- // env — this is how downstream tooling (and the trajectory parser
260
- // event-stream lookup) distinguishes arms. When the operator opted
261
- // out of fixture materialisation (tests, dry-run), we still pass a
262
- // stable placeholder so the env keys are wired correctly.
263
- let stashDir;
264
- if (arm === "akm") {
265
- // Resolution order (must match the issue #251 acceptance criteria):
266
- // 1. Per-task explicit override (used by `runMaskedCorpus` to
267
- // point at a tmp stash with one asset removed). Highest
268
- // priority because attribution correctness depends on this
269
- // branch never being shadowed by the `__no-stash__`
270
- // placeholder fallback.
271
- // 2. Per-(task, arm)-call `stashDirByFixture` override (Phase
272
- // 3 evolve persistence).
273
- // 3. Per-task materialised fixture stash from `loadFixtureStash`.
274
- // 4. `materialiseStash: false` placeholder so AKM_STASH_DIR is
275
- // still wired into the child env.
276
- if (task.stashDirOverride)
277
- stashDir = task.stashDirOverride;
278
- else if (overrideStashDir)
279
- stashDir = overrideStashDir;
280
- else if (stash)
281
- stashDir = stash.stashDir;
282
- else if (!materialiseStash)
283
- stashDir = path.join(task.taskDir, "__no-stash__");
284
- }
285
- // Build the prompt-override (#267). The builder is invoked once
286
- // per (task, arm) — seeds share a prompt. `undefined` keeps the
287
- // driver's default prompt in play.
288
- //
289
- // #261: the synthetic arm has a scratch-notes prompt contract —
290
- // the model is told no AKM stash is available and instructed to
291
- // write/use its own procedural notes. When the caller does not
292
- // supply a `buildPrompt` override for the synthetic arm we fall
293
- // back to a built-in scratch-notes prompt so the contract is
294
- // honoured by every utility-track caller, not just `runEvolve`.
295
- let promptOverride = options.buildPrompt?.(task, arm);
296
- if (promptOverride === undefined && arm === "synthetic") {
297
- promptOverride = buildUtilitySyntheticPrompt(task.id);
298
- }
299
- // Collect per-run warnings separately and merge after the run so
300
- // concurrent runs don't interleave partial warning sequences.
301
- const runWarnings = [];
302
- const run = await runOneIsolated({
303
- task,
304
- arm,
305
- seed,
306
- model: options.model,
307
- stashDir,
308
- budgetTokens,
309
- budgetWallMs,
310
- spawn: options.spawn,
311
- warnings: runWarnings,
312
- ...(promptOverride !== undefined ? { prompt: promptOverride } : {}),
313
- ...(options.opencodeProviders ? { opencodeProviders: options.opencodeProviders } : {}),
314
- ...(stash?.indexCacheHome ? { indexCacheHome: stash.indexCacheHome } : {}),
315
- });
316
- // Merge per-run warnings into the shared array.
317
- if (runWarnings.length > 0)
318
- warnings.push(...runWarnings);
319
- taskRuns.get(arm)?.push(run);
320
- // Emit a compact progress line to stderr (unconditional — even under
321
- // --json so operators have a heartbeat during long runs).
322
- completedRuns += 1;
323
- emitProgress(completedRuns, totalRuns, run);
324
- // Accumulate for partial checkpointing.
325
- allCompletedRuns.push(run);
326
- if (completedRuns % CHECKPOINT_INTERVAL === 0) {
327
- writePartialCheckpoint(allCompletedRuns, runTimestamp);
328
- }
329
- // §6.7 search-pipeline bridge: only the akm arm consults the stash,
330
- // and we only attribute ranks for tasks with a gold ref. Both
331
- // guards mean noakm and gold-less runs are silently excluded.
332
- if (arm === "akm" && task.goldRef) {
333
- const searches = extractGoldRanks(run, task.goldRef);
334
- goldRankRecords.push({
335
- taskId: task.id,
336
- arm,
337
- seed,
338
- outcome: run.outcome,
339
- goldRef: task.goldRef,
340
- searches,
341
- });
342
- }
343
- // #257: evaluate the akm-arm run against every workflow spec. The
344
- // evaluator's `specApplies` filter handles applicability (arm,
345
- // domain, gold ref, repeated-failures threshold), so we hand it the
346
- // entire spec list and append whatever it returns. noakm/synthetic
347
- // arms are not evaluated — workflow specs target the akm arm.
348
- if (arm === "akm" && workflowSpecs.length > 0) {
349
- const trace = normalizeRunToTrace(run, {
350
- warnings: runWarnings,
351
- harness: {
352
- agentStartedTs: run.startedAt,
353
- agentFinishedTs: run.finishedAt,
354
- },
355
- });
356
- const runCtx = {
357
- arm: run.arm,
358
- taskId: run.taskId,
359
- seed: run.seed,
360
- outcome: run.outcome,
361
- };
362
- const taskMetadata = buildWorkflowTaskMetadata(task, trace);
363
- const checks = evaluateRunAgainstAllSpecs(trace, workflowSpecs, runCtx, taskMetadata);
364
- workflowChecks.push(...checks);
365
- }
366
- };
367
- try {
368
- await runInBatches(workItems, parallel, runItem);
369
- }
370
- finally {
371
- // Deregister BEFORE running cleanup so a SIGINT arriving during this
372
- // block doesn't double-fire the cleanup (per cleanup.ts contract).
373
- deregisterStash();
374
- stash?.cleanup();
375
- }
376
- }
377
- return buildReport({
378
- grouped,
379
- options,
380
- seedsPerArm,
381
- slice,
382
- warnings,
383
- goldRankRecords,
384
- workflowChecks,
385
- });
386
- }
387
/**
 * Build the task-metadata object passed alongside a trace to the workflow
 * evaluator.
 *
 * The result always carries a `flags` object with two booleans:
 *   - `search_has_relevant_result`: result of `searchResultIncludesGoldRef`
 *     for this trace and the task's gold ref.
 *   - `task_has_tests`: result of `taskHasTests` for this task.
 * `goldRef` is included (ahead of `flags`) only when the task defines one.
 *
 * @param {object} task  Task descriptor; `goldRef` is read here.
 * @param {object} trace Normalised trace forwarded to the search check.
 * @returns {{ goldRef?: *, flags: object }}
 */
function buildWorkflowTaskMetadata(task, trace) {
    const flags = {
        search_has_relevant_result: searchResultIncludesGoldRef(trace, task.goldRef),
        task_has_tests: taskHasTests(task),
    };
    if (task.goldRef === undefined) {
        return { flags };
    }
    return { goldRef: task.goldRef, flags };
}
397
/**
 * Report whether any `akm_search` event in the trace lists the gold ref among
 * its `resultRefs`.
 *
 * A falsy `goldRef` short-circuits to `false` before the trace is touched.
 * Events of any other type, and search events without `resultRefs`, never
 * match.
 *
 * @param {{ events: Iterable<object> }} trace
 * @param {string | undefined} goldRef
 * @returns {boolean}
 */
function searchResultIncludesGoldRef(trace, goldRef) {
    if (!goldRef) {
        return false;
    }
    for (const { type, resultRefs } of trace.events) {
        if (type === "akm_search" && resultRefs?.includes(goldRef)) {
            return true;
        }
    }
    return false;
}
408
/**
 * Decide whether a task ships tests.
 *
 * A task whose verifier is `"pytest"` counts as having tests without touching
 * the filesystem. Otherwise the task's `tests` directory must exist and
 * contain at least one entry whose name ends in `.py` or `.sh`. Any error
 * while listing the directory is treated as "no tests".
 *
 * @param {{ verifier: string, taskDir: string }} task
 * @returns {boolean}
 */
function taskHasTests(task) {
    if (task.verifier === "pytest") {
        return true;
    }
    const testsDir = path.join(task.taskDir, "tests");
    if (!fs.existsSync(testsDir)) {
        return false;
    }
    const isTestFile = (name) => [".py", ".sh"].some((suffix) => name.endsWith(suffix));
    let names;
    try {
        names = fs.readdirSync(testsDir);
    }
    catch {
        // Unreadable (or non-directory) `tests` path: report no tests.
        return false;
    }
    return names.some(isTestFile);
}
421
/**
 * Execute one (task, arm, seed) run inside a throwaway workspace.
 *
 * Steps: create a temp workspace, register its removal with the cleanup
 * registry, seed it from the task's template, call `runOne`, then enrich the
 * driver result with `trajectory`, `assetsLoaded` and `failureMode`. The
 * workspace is always removed on the way out, whether the run succeeded or
 * threw.
 *
 * @param {object} args Run inputs: `task`, `arm`, `seed`, `model`,
 *   `budgetTokens`, `budgetWallMs`, `warnings`, plus optional `stashDir`,
 *   `spawn`, `prompt`, `opencodeProviders` and `indexCacheHome`.
 * @returns {Promise<object>} The `runOne` result extended with `trajectory`,
 *   `assetsLoaded` and `failureMode`.
 */
async function runOneIsolated(args) {
    const { task, arm, warnings } = args;
    const workspace = benchMkdtemp(`akm-bench-ws-${task.domain}-`);
    const removeWorkspace = () => fs.rmSync(workspace, { recursive: true, force: true });
    // Signal trap: the registered handler removes the workspace if an external
    // signal interrupts the run. It is deregistered in `finally` before the
    // synchronous removal so the cleanup cannot fire twice.
    const deregisterWorkspace = registerCleanup(() => {
        try {
            removeWorkspace();
        }
        catch {
            /* swallow */
        }
    });
    // Contributes `{ [key]: value }` only when the value is truthy.
    const whenSet = (key, value) => (value ? { [key]: value } : {});
    try {
        seedWorkspace(task.taskDir, workspace);
        const result = await runOne({
            track: "utility",
            arm,
            taskId: task.id,
            taskTitle: task.title,
            workspace,
            model: args.model,
            seed: args.seed,
            budgetTokens: args.budgetTokens,
            budgetWallMs: args.budgetWallMs,
            verifier: task.verifier,
            taskDir: task.taskDir,
            ...whenSet("expectedMatch", task.expectedMatch),
            ...whenSet("akmKeywords", task.akmKeywords),
            ...whenSet("stashDir", args.stashDir),
            ...whenSet("spawn", args.spawn),
            // An empty-string prompt is still a valid override, hence the
            // explicit `undefined` check instead of a truthiness test.
            ...(args.prompt !== undefined ? { prompt: args.prompt } : {}),
            warnings,
            ...whenSet("opencodeProviders", args.opencodeProviders),
            ...whenSet("indexCacheHome", args.indexCacheHome),
        });
        // The driver leaves the trajectory metric unfilled; compute it here.
        const trajectory = computeTrajectory({ goldRef: task.goldRef }, result, { warnings });
        // Per-asset attribution is derived from the result for every run
        // (spec §6.5); the driver itself reports no loaded assets.
        const assetsLoaded = extractAssetLoads(result);
        // Only the akm arm is labelled with a failure mode (§6.6); every other
        // arm carries `null`.
        let failureMode = null;
        if (arm === "akm") {
            failureMode = classifyFailureMode(task, { ...result, trajectory, assetsLoaded });
        }
        return { ...result, trajectory, assetsLoaded, failureMode };
    }
    finally {
        deregisterWorkspace();
        removeWorkspace();
    }
}
482
/**
 * Populate a per-run directory from the task's `workspace/` template.
 *
 * When the task directory has no `workspace/` subdirectory nothing is copied
 * and `dest` is left untouched, so the run starts from an empty cwd.
 *
 * @param {string} taskDir Directory holding the task definition.
 * @param {string} dest    Destination directory for the copied template.
 * @returns {void}
 */
function seedWorkspace(taskDir, dest) {
    const template = path.join(taskDir, "workspace");
    if (fs.existsSync(template)) {
        copyDirRecursive(template, dest);
    }
}
493
/**
 * Built-in prompt for the synthetic arm (#261).
 *
 * Track A `runUtility` falls back to this text when the caller enables
 * `includeSynthetic: true` without providing its own `buildPrompt` override
 * for the synthetic arm.
 *
 * The text is a scratch-notes contract: it tells the model that no AKM stash
 * is available and asks it to write down its own procedural notes before
 * solving the task. Track B's `buildSyntheticPrompt(taskId)` has the same
 * shape; the wording is duplicated here on purpose so Track A carries no
 * module-level dependency on `evolve.ts`.
 *
 * Exported for tests.
 *
 * @param {string} taskId Identifier interpolated into the first prompt line.
 * @returns {string} Six newline-separated prompt lines.
 */
export function buildUtilitySyntheticPrompt(taskId) {
    const promptLines = [`Task: ${taskId}`];
    promptLines.push(
        "Arm: synthetic (Bring Your Own Skills)",
        "No akm stash is available; AKM_STASH_DIR is intentionally absent. Before solving",
        "the task, write a short scratchpad of the skills and steps you intend to use,",
        "then proceed. Cite the scratchpad in your trace so the verifier can attribute",
        "the approach to your own reasoning rather than retrieved guidance.",
    );
    return promptLines.join("\n");
}
516
/**
 * Recursively copy a directory tree from `src` into `dest`.
 *
 * `dest` is created (including missing parents) before anything is copied.
 * Entries named `.gitkeep` are skipped at every level. Only regular files and
 * directories are copied; any other entry type is ignored.
 *
 * @param {string} src  Existing directory to copy from.
 * @param {string} dest Directory to copy into.
 * @returns {void}
 */
function copyDirRecursive(src, dest) {
    fs.mkdirSync(dest, { recursive: true });
    fs.readdirSync(src, { withFileTypes: true })
        .filter((entry) => entry.name !== ".gitkeep")
        .forEach((entry) => {
            const from = path.join(src, entry.name);
            const to = path.join(dest, entry.name);
            if (entry.isDirectory()) {
                copyDirRecursive(from, to);
                return;
            }
            if (entry.isFile()) {
                fs.copyFileSync(from, to);
            }
        });
}
530
/**
 * Fold the collected runs into the utility report object.
 *
 * Per task it aggregates the noakm and akm arms (and the synthetic arm when
 * `options.includeSynthetic === true`), computes the per-task delta and the
 * mean workflow compliance, then derives corpus-level aggregates, the akm
 * trajectory and failure-mode summaries, the search bridge, and the
 * corpus/fixture identity hashes.
 *
 * Side effect: appends to `args.warnings` when a fixture cannot be hashed.
 *
 * @param {object} args
 * @param {Map<string, Map<string, object[]>>} args.grouped Runs keyed by task id, then arm.
 * @param {object} args.options Caller options (`tasks`, `model`, and optional
 *   `includeSynthetic`, `branch`, `commit`, `timestamp`, `baselineByTaskId`).
 * @param {number} args.seedsPerArm
 * @param {*} args.slice Stamped verbatim into `corpus.slice`.
 * @param {string[]} args.warnings Shared warnings array (may be appended to).
 * @param {object[]} args.goldRankRecords
 * @param {object[]} args.workflowChecks
 * @returns {object} The report, including `perAsset` and, when supplied, `baselineByTaskId`.
 */
function buildReport(args) {
    const tasks = [];
    const noakmPerTask = {};
    const akmPerTask = {};
    const synthPerTask = {};
    const akmRunsAll = [];
    const allRuns = [];
    const includeSynth = args.options.includeSynthetic === true;
    // #257: index workflow checks by taskId so a per-task mean compliance can
    // be attached to each task entry.
    const checksByTask = new Map();
    for (const check of args.workflowChecks) {
        const bucket = checksByTask.get(check.taskId);
        if (bucket)
            bucket.push(check);
        else
            checksByTask.set(check.taskId, [check]);
    }
    for (const task of args.options.tasks) {
        const taskRuns = args.grouped.get(task.id);
        const noakmRuns = taskRuns?.get("noakm") ?? [];
        const akmRuns = taskRuns?.get("akm") ?? [];
        // #261: synthetic-arm runs are only consulted when the caller opted in.
        const synthRuns = includeSynth ? (taskRuns?.get("synthetic") ?? []) : [];
        const noakmMetrics = aggregatePerTask(noakmRuns);
        const akmMetrics = aggregatePerTask(akmRuns);
        const delta = computePerTaskDelta(noakmMetrics, akmMetrics);
        // Aggregate the synthetic arm exactly once and reuse the result for
        // both `synthPerTask` and the task entry, the same way the noakm/akm
        // metrics are shared. When the arm is disabled nothing is computed and
        // `synthPerTask[task.id]` stays unset, so "arm not run" remains
        // distinguishable from "arm ran with 0 passes".
        const synthMetrics = includeSynth ? aggregatePerTask(synthRuns) : undefined;
        noakmPerTask[task.id] = noakmMetrics;
        akmPerTask[task.id] = akmMetrics;
        if (includeSynth) {
            synthPerTask[task.id] = synthMetrics;
        }
        akmRunsAll.push(...akmRuns);
        // Preserve arm order (noakm, synthetic when enabled, then akm) so the
        // persisted runs[] array is deterministic across reruns (#249). The
        // synthetic block is omitted entirely when the arm is disabled so the
        // pre-#261 envelope is unchanged.
        if (includeSynth) {
            allRuns.push(...noakmRuns, ...synthRuns, ...akmRuns);
        }
        else {
            allRuns.push(...noakmRuns, ...akmRuns);
        }
        // #257: per-task workflow compliance is the mean `score` over checks
        // whose status is not `not_applicable`. It stays undefined when no
        // check applies, so "not measured" differs from "measured at 0".
        const taskChecks = checksByTask.get(task.id) ?? [];
        const applicableTaskChecks = taskChecks.filter((c) => c.status !== "not_applicable");
        let workflowCompliance;
        if (applicableTaskChecks.length > 0) {
            const scoreSum = applicableTaskChecks.reduce((sum, c) => sum + c.score, 0);
            workflowCompliance = scoreSum / applicableTaskChecks.length;
        }
        tasks.push({
            id: task.id,
            noakm: noakmMetrics,
            akm: akmMetrics,
            delta,
            ...(includeSynth ? { synthetic: synthMetrics } : {}),
            ...(workflowCompliance !== undefined ? { workflowCompliance } : {}),
        });
    }
    const aggregateNoakm = aggregateCorpus(noakmPerTask);
    const aggregateAkm = aggregateCorpus(akmPerTask);
    const aggregateDelta = computeCorpusDelta(aggregateNoakm, aggregateAkm);
    // #261: the synthetic-arm aggregate exists only when the caller opted in.
    const aggregateSynth = includeSynth ? aggregateCorpus(synthPerTask) : undefined;
    const trajectoryAkm = aggregateTrajectory(akmRunsAll);
    // Failure-mode aggregate (§6.6): only akm-arm runs with a truthy
    // `failureMode` contribute an entry.
    const failureEntries = [];
    for (const run of akmRunsAll) {
        if (run.failureMode)
            failureEntries.push({ taskId: run.taskId, mode: run.failureMode });
    }
    const failureModes = aggregateFailureModes(failureEntries);
    const domains = new Set(args.options.tasks.map((t) => t.domain)).size;
    // Caller-supplied identity wins; otherwise fall back to the git/clock
    // resolvers.
    const branch = args.options.branch ?? resolveGitBranch();
    const commit = args.options.commit ?? resolveGitCommit();
    const timestamp = args.options.timestamp ?? new Date().toISOString();
    // §6.7: compute the search-pipeline bridge once over the whole corpus.
    const searchBridge = computeSearchBridge({ goldRankRecords: args.goldRankRecords });
    // #250: stamp deterministic corpus + fixture identity into the report so
    // `bench compare` can refuse cross-corpus / cross-fixture diffs unless the
    // operator opts in via --allow-corpus-mismatch / --allow-fixture-mismatch.
    const selectedTaskIds = [...args.options.tasks.map((t) => t.id)].sort();
    const taskBodies = new Map();
    for (const t of args.options.tasks)
        taskBodies.set(t.id, readTaskBody(t.taskDir));
    const taskCorpusHash = computeTaskCorpusHash(selectedTaskIds, taskBodies);
    const fixtureNames = [...new Set(args.options.tasks.map((t) => t.stash))].sort();
    const fixtures = {};
    for (const name of fixtureNames) {
        try {
            fixtures[name] = computeFixtureContentHash(name);
        }
        catch (err) {
            // A fixture that cannot be hashed is recorded as a warning; the
            // remaining fixtures are still stamped.
            args.warnings.push(`corpus stamp: cannot hash fixture "${name}": ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // Combined fixture-content hash over `<name>\0<hash>\0` records. The order
    // is stable because `fixtureNames` is sorted; a fixture that failed to
    // hash contributes an empty hash string.
    const combinedHash = createHash("sha256");
    for (const name of fixtureNames) {
        combinedHash.update(name);
        combinedHash.update("\0");
        combinedHash.update(fixtures[name] ?? "");
        combinedHash.update("\0");
    }
    const fixtureContentHash = combinedHash.digest("hex");
    const baseReport = {
        timestamp,
        branch,
        commit,
        model: args.options.model,
        corpus: {
            domains,
            tasks: args.options.tasks.length,
            slice: args.slice,
            seedsPerArm: args.seedsPerArm,
            selectedTaskIds,
            taskCorpusHash,
            fixtures,
            fixtureContentHash,
        },
        aggregateNoakm,
        aggregateAkm,
        aggregateDelta,
        ...(aggregateSynth ? { aggregateSynth } : {}),
        trajectoryAkm,
        failureModes,
        tasks,
        warnings: args.warnings,
        akmRuns: akmRunsAll,
        allRuns,
        taskMetadata: args.options.tasks,
        goldRankRecords: args.goldRankRecords,
        searchBridge,
        workflowChecks: args.workflowChecks,
    };
    // Per-asset attribution (§6.5) is post-processing over the report just
    // assembled; it needs no extra runs.
    baseReport.perAsset = computePerAssetAttribution(baseReport);
    // Optional baseline pass-rate map: copied onto the report only when the
    // caller supplied one, so the report shape is otherwise unchanged.
    if (args.options.baselineByTaskId) {
        baseReport.baselineByTaskId = { ...args.options.baselineByTaskId };
    }
    return baseReport;
}