akm-cli 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/dist/{src/cli.js → cli.js} +22 -8
  3. package/dist/{src/commands → commands}/installed-stashes.js +1 -1
  4. package/dist/{src/commands → commands}/source-add.js +1 -1
  5. package/dist/{src/core → core}/common.js +16 -1
  6. package/dist/{src/core → core}/config.js +5 -2
  7. package/dist/{src/indexer → indexer}/db-search.js +16 -1
  8. package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
  9. package/dist/{src/indexer → indexer}/indexer.js +27 -11
  10. package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
  11. package/dist/{src/indexer → indexer}/search-source.js +1 -1
  12. package/dist/{src/llm → llm}/client.js +61 -1
  13. package/dist/{src/llm → llm}/embedder.js +8 -5
  14. package/dist/{src/llm → llm}/embedders/local.js +8 -2
  15. package/dist/{src/llm → llm}/embedders/remote.js +4 -2
  16. package/dist/{src/llm → llm}/graph-extract.js +4 -4
  17. package/dist/llm/memory-infer.js +114 -0
  18. package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
  19. package/dist/{src/output → output}/cli-hints.js +2 -0
  20. package/dist/{src/setup → setup}/setup.js +30 -20
  21. package/dist/sources/providers/website.js +27 -0
  22. package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
  23. package/docs/README.md +7 -0
  24. package/docs/migration/release-notes/0.7.0.md +14 -0
  25. package/package.json +11 -8
  26. package/dist/src/llm/memory-infer.js +0 -86
  27. package/dist/tests/add-website-source.test.js +0 -119
  28. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  29. package/dist/tests/agent/agent-config.test.js +0 -221
  30. package/dist/tests/agent/agent-detect.test.js +0 -100
  31. package/dist/tests/agent/agent-spawn.test.js +0 -234
  32. package/dist/tests/agent-output.test.js +0 -186
  33. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  34. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  35. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  36. package/dist/tests/asset-ref.test.js +0 -192
  37. package/dist/tests/asset-registry.test.js +0 -103
  38. package/dist/tests/asset-spec.test.js +0 -241
  39. package/dist/tests/bench/attribution.test.js +0 -996
  40. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  41. package/dist/tests/bench/cleanup.js +0 -234
  42. package/dist/tests/bench/cleanup.test.js +0 -166
  43. package/dist/tests/bench/cli.js +0 -1018
  44. package/dist/tests/bench/cli.test.js +0 -445
  45. package/dist/tests/bench/compare.test.js +0 -556
  46. package/dist/tests/bench/corpus.js +0 -317
  47. package/dist/tests/bench/corpus.test.js +0 -258
  48. package/dist/tests/bench/doctor.js +0 -525
  49. package/dist/tests/bench/driver.js +0 -401
  50. package/dist/tests/bench/driver.test.js +0 -584
  51. package/dist/tests/bench/environment.js +0 -233
  52. package/dist/tests/bench/environment.test.js +0 -199
  53. package/dist/tests/bench/evolve-metrics.js +0 -179
  54. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  55. package/dist/tests/bench/evolve.js +0 -647
  56. package/dist/tests/bench/evolve.test.js +0 -624
  57. package/dist/tests/bench/failure-modes.test.js +0 -349
  58. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  59. package/dist/tests/bench/leakage.test.js +0 -228
  60. package/dist/tests/bench/learning-curve.test.js +0 -134
  61. package/dist/tests/bench/metrics.js +0 -2395
  62. package/dist/tests/bench/metrics.test.js +0 -1150
  63. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  64. package/dist/tests/bench/opencode-config.js +0 -194
  65. package/dist/tests/bench/opencode-config.test.js +0 -370
  66. package/dist/tests/bench/report.js +0 -1885
  67. package/dist/tests/bench/report.test.js +0 -1038
  68. package/dist/tests/bench/run-config.js +0 -355
  69. package/dist/tests/bench/run-config.test.js +0 -298
  70. package/dist/tests/bench/run-curate-test.js +0 -32
  71. package/dist/tests/bench/run-failing-tasks.js +0 -56
  72. package/dist/tests/bench/run-full-bench.js +0 -51
  73. package/dist/tests/bench/run-items36-targeted.js +0 -69
  74. package/dist/tests/bench/run-nano-quick.js +0 -42
  75. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  76. package/dist/tests/bench/runner.js +0 -699
  77. package/dist/tests/bench/runner.test.js +0 -958
  78. package/dist/tests/bench/search-bridge.test.js +0 -331
  79. package/dist/tests/bench/tmp.js +0 -131
  80. package/dist/tests/bench/trajectory.js +0 -116
  81. package/dist/tests/bench/trajectory.test.js +0 -127
  82. package/dist/tests/bench/verifier.js +0 -114
  83. package/dist/tests/bench/verifier.test.js +0 -118
  84. package/dist/tests/bench/workflow-evaluator.js +0 -557
  85. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  86. package/dist/tests/bench/workflow-spec.js +0 -345
  87. package/dist/tests/bench/workflow-spec.test.js +0 -363
  88. package/dist/tests/bench/workflow-trace.js +0 -472
  89. package/dist/tests/bench/workflow-trace.test.js +0 -254
  90. package/dist/tests/benchmark-search-quality.js +0 -536
  91. package/dist/tests/benchmark-suite.js +0 -1441
  92. package/dist/tests/capture-cli.test.js +0 -112
  93. package/dist/tests/cli-errors.test.js +0 -204
  94. package/dist/tests/commands/events.test.js +0 -370
  95. package/dist/tests/commands/history.test.js +0 -418
  96. package/dist/tests/commands/import.test.js +0 -103
  97. package/dist/tests/commands/proposal-cli.test.js +0 -209
  98. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  99. package/dist/tests/commands/remember.test.js +0 -97
  100. package/dist/tests/commands/scope-flags.test.js +0 -300
  101. package/dist/tests/commands/search.test.js +0 -537
  102. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  103. package/dist/tests/commands/show.test.js +0 -294
  104. package/dist/tests/common.test.js +0 -266
  105. package/dist/tests/completions.test.js +0 -142
  106. package/dist/tests/config-cli.test.js +0 -193
  107. package/dist/tests/config-llm-features.test.js +0 -139
  108. package/dist/tests/config.test.js +0 -569
  109. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  110. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  111. package/dist/tests/contracts/spec-helpers.js +0 -46
  112. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  113. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  114. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  115. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  116. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  117. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  118. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  119. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  120. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  121. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  122. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  123. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  124. package/dist/tests/core/write-source.test.js +0 -366
  125. package/dist/tests/curate-command.test.js +0 -87
  126. package/dist/tests/db-scoring.test.js +0 -201
  127. package/dist/tests/db.test.js +0 -654
  128. package/dist/tests/distill-cli-flag.test.js +0 -208
  129. package/dist/tests/distill.test.js +0 -515
  130. package/dist/tests/docker-install.test.js +0 -120
  131. package/dist/tests/e2e.test.js +0 -1419
  132. package/dist/tests/embedder.test.js +0 -340
  133. package/dist/tests/embedding-model-config.test.js +0 -379
  134. package/dist/tests/feedback-command.test.js +0 -172
  135. package/dist/tests/file-context.test.js +0 -552
  136. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  137. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  138. package/dist/tests/fixtures/stashes/load.js +0 -166
  139. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  140. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  141. package/dist/tests/frontmatter.test.js +0 -190
  142. package/dist/tests/fts-field-weighting.test.js +0 -254
  143. package/dist/tests/fuzzy-search.test.js +0 -230
  144. package/dist/tests/git-provider-clone.test.js +0 -45
  145. package/dist/tests/github.test.js +0 -161
  146. package/dist/tests/graph-boost-ranking.test.js +0 -305
  147. package/dist/tests/graph-extraction.test.js +0 -282
  148. package/dist/tests/helpers/usage-events.js +0 -8
  149. package/dist/tests/index-pass-llm.test.js +0 -161
  150. package/dist/tests/indexer.test.js +0 -570
  151. package/dist/tests/info-command.test.js +0 -166
  152. package/dist/tests/init.test.js +0 -69
  153. package/dist/tests/install-script.test.js +0 -246
  154. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  155. package/dist/tests/issue-36-repro.test.js +0 -304
  156. package/dist/tests/issues-191-194.test.js +0 -160
  157. package/dist/tests/lesson-lint.test.js +0 -111
  158. package/dist/tests/llm-client.test.js +0 -115
  159. package/dist/tests/llm-feature-gate.test.js +0 -151
  160. package/dist/tests/llm.test.js +0 -139
  161. package/dist/tests/lockfile.test.js +0 -216
  162. package/dist/tests/manifest.test.js +0 -205
  163. package/dist/tests/markdown.test.js +0 -126
  164. package/dist/tests/matchers-unit.test.js +0 -189
  165. package/dist/tests/memory-inference.test.js +0 -299
  166. package/dist/tests/merge-scoring.test.js +0 -136
  167. package/dist/tests/metadata.test.js +0 -313
  168. package/dist/tests/migration-help.test.js +0 -89
  169. package/dist/tests/origin-resolve.test.js +0 -124
  170. package/dist/tests/output-baseline.test.js +0 -218
  171. package/dist/tests/output-shapes-unit.test.js +0 -478
  172. package/dist/tests/parallel-search.test.js +0 -272
  173. package/dist/tests/parameter-metadata.test.js +0 -365
  174. package/dist/tests/paths.test.js +0 -177
  175. package/dist/tests/progressive-disclosure.test.js +0 -280
  176. package/dist/tests/proposals.test.js +0 -279
  177. package/dist/tests/proposed-quality.test.js +0 -271
  178. package/dist/tests/provider-registry.test.js +0 -32
  179. package/dist/tests/ranking-regression.test.js +0 -548
  180. package/dist/tests/reflect-propose.test.js +0 -455
  181. package/dist/tests/registry-build-index.test.js +0 -394
  182. package/dist/tests/registry-cli.test.js +0 -290
  183. package/dist/tests/registry-index-v2.test.js +0 -430
  184. package/dist/tests/registry-install.test.js +0 -728
  185. package/dist/tests/registry-providers/parity.test.js +0 -189
  186. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  187. package/dist/tests/registry-providers/static-index.test.js +0 -238
  188. package/dist/tests/registry-resolve.test.js +0 -126
  189. package/dist/tests/registry-search.test.js +0 -923
  190. package/dist/tests/remember-frontmatter.test.js +0 -378
  191. package/dist/tests/remember-unit.test.js +0 -123
  192. package/dist/tests/ripgrep-install.test.js +0 -251
  193. package/dist/tests/ripgrep-resolve.test.js +0 -108
  194. package/dist/tests/ripgrep.test.js +0 -163
  195. package/dist/tests/save-command.test.js +0 -94
  196. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  197. package/dist/tests/scoring-pipeline.test.js +0 -648
  198. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  199. package/dist/tests/self-update.test.js +0 -442
  200. package/dist/tests/semantic-search-e2e.test.js +0 -512
  201. package/dist/tests/semantic-status.test.js +0 -471
  202. package/dist/tests/setup-run.integration.js +0 -877
  203. package/dist/tests/setup-wizard.test.js +0 -198
  204. package/dist/tests/setup.test.js +0 -131
  205. package/dist/tests/source-add.test.js +0 -11
  206. package/dist/tests/source-clone.test.js +0 -254
  207. package/dist/tests/source-manage.test.js +0 -366
  208. package/dist/tests/source-providers/filesystem.test.js +0 -82
  209. package/dist/tests/source-providers/git.test.js +0 -252
  210. package/dist/tests/source-providers/website.test.js +0 -128
  211. package/dist/tests/source-qa-fixes.test.js +0 -286
  212. package/dist/tests/source-registry.test.js +0 -350
  213. package/dist/tests/source-resolve.test.js +0 -100
  214. package/dist/tests/source-source.test.js +0 -281
  215. package/dist/tests/source.test.js +0 -533
  216. package/dist/tests/tar-utils-scan.test.js +0 -73
  217. package/dist/tests/toggle-components.test.js +0 -73
  218. package/dist/tests/usage-telemetry.test.js +0 -265
  219. package/dist/tests/utility-scoring.test.js +0 -558
  220. package/dist/tests/vault-load-error.test.js +0 -78
  221. package/dist/tests/vault-qa-fixes.test.js +0 -194
  222. package/dist/tests/vault.test.js +0 -429
  223. package/dist/tests/vector-search.test.js +0 -608
  224. package/dist/tests/walker.test.js +0 -252
  225. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  226. package/dist/tests/wave2-cluster-d.test.js +0 -180
  227. package/dist/tests/wave2-cluster-e.test.js +0 -179
  228. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  229. package/dist/tests/wiki.test.js +0 -529
  230. package/dist/tests/workflow-cli.test.js +0 -271
  231. package/dist/tests/workflow-markdown.test.js +0 -171
  232. package/dist/tests/workflow-path-escape.test.js +0 -132
  233. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  234. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  235. /package/dist/{src/commands → commands}/completions.js +0 -0
  236. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  237. /package/dist/{src/commands → commands}/curate.js +0 -0
  238. /package/dist/{src/commands → commands}/distill.js +0 -0
  239. /package/dist/{src/commands → commands}/events.js +0 -0
  240. /package/dist/{src/commands → commands}/history.js +0 -0
  241. /package/dist/{src/commands → commands}/info.js +0 -0
  242. /package/dist/{src/commands → commands}/init.js +0 -0
  243. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  244. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  245. /package/dist/{src/commands → commands}/proposal.js +0 -0
  246. /package/dist/{src/commands → commands}/propose.js +0 -0
  247. /package/dist/{src/commands → commands}/reflect.js +0 -0
  248. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  249. /package/dist/{src/commands → commands}/remember.js +0 -0
  250. /package/dist/{src/commands → commands}/search.js +0 -0
  251. /package/dist/{src/commands → commands}/self-update.js +0 -0
  252. /package/dist/{src/commands → commands}/show.js +0 -0
  253. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  254. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  255. /package/dist/{src/commands → commands}/vault.js +0 -0
  256. /package/dist/{src/core → core}/asset-ref.js +0 -0
  257. /package/dist/{src/core → core}/asset-registry.js +0 -0
  258. /package/dist/{src/core → core}/asset-spec.js +0 -0
  259. /package/dist/{src/core → core}/errors.js +0 -0
  260. /package/dist/{src/core → core}/events.js +0 -0
  261. /package/dist/{src/core → core}/frontmatter.js +0 -0
  262. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  263. /package/dist/{src/core → core}/markdown.js +0 -0
  264. /package/dist/{src/core → core}/paths.js +0 -0
  265. /package/dist/{src/core → core}/proposals.js +0 -0
  266. /package/dist/{src/core → core}/warn.js +0 -0
  267. /package/dist/{src/core → core}/write-source.js +0 -0
  268. /package/dist/{src/indexer → indexer}/db.js +0 -0
  269. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  270. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  271. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  272. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  273. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  274. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  275. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  276. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  277. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  278. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  279. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  280. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  281. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  282. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  283. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  284. /package/dist/{src/integrations → integrations}/github.js +0 -0
  285. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  286. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  287. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  288. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  289. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  290. /package/dist/{src/output → output}/context.js +0 -0
  291. /package/dist/{src/output → output}/renderers.js +0 -0
  292. /package/dist/{src/output → output}/shapes.js +0 -0
  293. /package/dist/{src/output → output}/text.js +0 -0
  294. /package/dist/{src/registry → registry}/build-index.js +0 -0
  295. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  296. /package/dist/{src/registry → registry}/factory.js +0 -0
  297. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  298. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  299. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  300. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  301. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  302. /package/dist/{src/registry → registry}/resolve.js +0 -0
  303. /package/dist/{src/registry → registry}/types.js +0 -0
  304. /package/dist/{src/setup → setup}/detect.js +0 -0
  305. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  306. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  307. /package/dist/{src/setup → setup}/steps.js +0 -0
  308. /package/dist/{src/sources → sources}/include.js +0 -0
  309. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  310. /package/dist/{src/sources → sources}/provider.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  314. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  315. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  316. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  317. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  318. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  319. /package/dist/{src/sources → sources}/resolve.js +0 -0
  320. /package/dist/{src/sources → sources}/types.js +0 -0
  321. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  322. /package/dist/{src/version.js → version.js} +0 -0
  323. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  324. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  325. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  326. /package/dist/{src/workflows → workflows}/db.js +0 -0
  327. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  328. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  329. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  330. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  331. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  332. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,2395 +0,0 @@
1
- /**
2
- * akm-bench metrics (spec §6).
3
- *
4
- * Outcome metrics (§6.1) and trajectory metrics (§6.2). Both are pure
5
- * functions over `RunResult[]` slices so the runner can compose them
6
- * however it likes. The §6.3+ catalog (proposal-quality, longitudinal,
7
- * attribution, failure-mode taxonomy) lands in #239/#240/#243.
8
- *
9
- * The failure-mode taxonomy classifier (§6.6) lives in this file
10
- * (`classifyFailureMode`).
11
- *
12
- * Search-pipeline bridge metrics (§6.7) are below: they tie the synthetic
13
- * MRR/Recall@K view in `tests/benchmark-suite.ts` to real-task pass rate
14
- * by logging gold-rank-of-search per `akm search` invocation and slicing
15
- * pass-rate by the rank of the agent's *chosen* search.
16
- */
17
- import fs from "node:fs";
18
- import path from "node:path";
19
- import { safeRealpath } from "../../src/core/common";
20
- import { MEMORY_ABILITY_VALUES } from "./corpus";
21
- import { serializeRunForReport } from "./report";
22
- import { benchMkdtemp } from "./tmp";
23
- import { normalizeRunToTrace } from "./workflow-trace";
24
- /**
25
- * Aggregate outcome metrics over a flat list of RunResults.
26
- *
27
- * Aggregations across multiple arms are the caller's responsibility — pass
28
- * each arm's slice in separately. Backward-compatible v1 contract; the
29
- * richer per-task / corpus shapes below subsume this.
30
- */
31
- export function computeOutcomeAggregate(results) {
32
- if (results.length === 0) {
33
- return { passRate: 0, tokensPerPass: 0, wallclockMs: 0, budgetExceeded: 0, runsWithMeasuredTokens: 0 };
34
- }
35
- let passes = 0;
36
- let budgetExceeded = 0;
37
- let totalTokensInMeasuredPasses = 0;
38
- let measuredPasses = 0;
39
- let runsWithMeasuredTokens = 0;
40
- let totalWallclock = 0;
41
- for (const r of results) {
42
- totalWallclock += r.wallclockMs;
43
- if (isMeasured(r)) {
44
- runsWithMeasuredTokens += 1;
45
- }
46
- if (r.outcome === "pass") {
47
- passes += 1;
48
- // Only fold tokens into the mean when we actually measured them
49
- // (issue #252) — otherwise a `0` would silently understate cost.
50
- if (isMeasured(r)) {
51
- measuredPasses += 1;
52
- totalTokensInMeasuredPasses += r.tokens.input + r.tokens.output;
53
- }
54
- }
55
- else if (r.outcome === "budget_exceeded") {
56
- budgetExceeded += 1;
57
- }
58
- }
59
- return {
60
- passRate: passes / results.length,
61
- tokensPerPass: measuredPasses === 0 ? 0 : totalTokensInMeasuredPasses / measuredPasses,
62
- wallclockMs: totalWallclock / results.length,
63
- budgetExceeded,
64
- runsWithMeasuredTokens,
65
- };
66
- }
67
- /**
68
- * Treat older artefacts without `tokenMeasurement` as `"parsed"` for backward
69
- * compatibility — pre-#252 reports always returned numeric zero, and rejecting
70
- * them entirely would break compare/attribute over historical runs.
71
- */
72
- function isMeasured(r) {
73
- return (r.tokenMeasurement ?? "parsed") === "parsed";
74
- }
75
- /**
76
- * Aggregate K seed runs of one (task, arm) pair into PerTaskMetrics. Returns
77
- * a zeroed envelope on empty input — callers decide whether to skip or render.
78
- */
79
- export function aggregatePerTask(results) {
80
- if (results.length === 0) {
81
- return {
82
- passRate: 0,
83
- passAt1: 0,
84
- tokensPerPass: null,
85
- wallclockMs: 0,
86
- passRateStdev: 0,
87
- budgetExceededCount: 0,
88
- harnessErrorCount: 0,
89
- count: 0,
90
- runsWithMeasuredTokens: 0,
91
- tokensPerRun: null,
92
- };
93
- }
94
- let passes = 0;
95
- let measuredPasses = 0;
96
- let totalTokensInMeasuredPasses = 0;
97
- let totalWallclock = 0;
98
- let budgetExceeded = 0;
99
- let harnessError = 0;
100
- let runsWithMeasuredTokens = 0;
101
- let totalTokensInMeasuredRuns = 0;
102
- let measuredRuns = 0;
103
- // For the standard deviation we need a fixed-iteration buffer of pass/fail.
104
- const passSamples = [];
105
- for (const r of results) {
106
- totalWallclock += r.wallclockMs;
107
- if (isMeasured(r)) {
108
- runsWithMeasuredTokens += 1;
109
- measuredRuns += 1;
110
- totalTokensInMeasuredRuns += r.tokens.input + r.tokens.output;
111
- }
112
- const isPass = r.outcome === "pass" ? 1 : 0;
113
- passSamples.push(isPass);
114
- if (isPass === 1) {
115
- passes += 1;
116
- // Only count tokens for measured passes (issue #252). A pass with
117
- // missing measurement contributes to `passRate` but NOT to
118
- // `tokensPerPass` — preserving "tokens per measured pass" semantics.
119
- if (isMeasured(r)) {
120
- measuredPasses += 1;
121
- totalTokensInMeasuredPasses += r.tokens.input + r.tokens.output;
122
- }
123
- }
124
- else if (r.outcome === "budget_exceeded") {
125
- budgetExceeded += 1;
126
- }
127
- else if (r.outcome === "harness_error") {
128
- harnessError += 1;
129
- }
130
- }
131
- const seed0 = results.find((r) => r.seed === 0) ?? results[0];
132
- const passAt1 = seed0 && seed0.outcome === "pass" ? 1 : 0;
133
- return {
134
- passRate: passes / results.length,
135
- passAt1,
136
- tokensPerPass: measuredPasses === 0 ? null : totalTokensInMeasuredPasses / measuredPasses,
137
- wallclockMs: totalWallclock / results.length,
138
- passRateStdev: stdev(passSamples),
139
- budgetExceededCount: budgetExceeded,
140
- harnessErrorCount: harnessError,
141
- count: results.length,
142
- runsWithMeasuredTokens,
143
- tokensPerRun: measuredRuns === 0 ? null : totalTokensInMeasuredRuns / measuredRuns,
144
- };
145
- }
146
- /** Sample standard deviation. Returns 0 for length ≤ 1 (no spread to measure). */
147
- function stdev(values) {
148
- if (values.length <= 1)
149
- return 0;
150
- const mean = values.reduce((a, b) => a + b, 0) / values.length;
151
- const sumSq = values.reduce((acc, v) => acc + (v - mean) * (v - mean), 0);
152
- // Sample stdev (Bessel's correction) — n-1 denominator.
153
- return Math.sqrt(sumSq / (values.length - 1));
154
- }
155
- /**
156
- * Mean across per-task metrics. Each task contributes once, regardless of
157
- * how many seeds it ran (K is already collapsed in `aggregatePerTask`).
158
- *
159
- * `tokensPerPass`: tasks where `tokensPerPass` is `null` (no passes) are
160
- * dropped from that mean. The result is `null` if every task failed.
161
- */
162
- export function aggregateCorpus(perTask) {
163
- const tasks = Object.values(perTask);
164
- if (tasks.length === 0) {
165
- return { passRate: 0, tokensPerPass: null, wallclockMs: 0, tokensPerRun: null };
166
- }
167
- const passRate = tasks.reduce((a, t) => a + t.passRate, 0) / tasks.length;
168
- const wallclockMs = tasks.reduce((a, t) => a + t.wallclockMs, 0) / tasks.length;
169
- const tppValues = tasks.map((t) => t.tokensPerPass).filter((v) => v !== null);
170
- const tokensPerPass = tppValues.length === 0 ? null : tppValues.reduce((a, b) => a + b, 0) / tppValues.length;
171
- const tprValues = tasks.map((t) => t.tokensPerRun).filter((v) => v !== null);
172
- const tokensPerRun = tprValues.length === 0 ? null : tprValues.reduce((a, b) => a + b, 0) / tprValues.length;
173
- return { passRate, tokensPerPass, wallclockMs, tokensPerRun };
174
- }
175
- /**
176
- * Compute the akm − noakm delta. Negative `tokensPerPass`/`wallclockMs` mean
177
- * akm was cheaper / faster; positive means it cost more. Pass-rate uses the
178
- * opposite convention (positive = akm wins).
179
- */
180
- export function computeCorpusDelta(noakm, akm) {
181
- return {
182
- passRate: akm.passRate - noakm.passRate,
183
- tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
184
- wallclockMs: akm.wallclockMs - noakm.wallclockMs,
185
- tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
186
- };
187
- }
188
- /** Per-task delta with the same null-safety as the corpus delta. */
189
- export function computePerTaskDelta(noakm, akm) {
190
- return {
191
- passRate: akm.passRate - noakm.passRate,
192
- tokensPerPass: akm.tokensPerPass === null || noakm.tokensPerPass === null ? null : akm.tokensPerPass - noakm.tokensPerPass,
193
- wallclockMs: akm.wallclockMs - noakm.wallclockMs,
194
- tokensPerRun: akm.tokensPerRun === null || noakm.tokensPerRun === null ? null : akm.tokensPerRun - noakm.tokensPerRun,
195
- };
196
- }
197
- /**
198
- * Extract the domain prefix from a task ID. The corpus convention is
199
- * `<domain>/<task-name>`; we split on the first `/`. Tasks lacking a slash
200
- * fall back to the literal `unknown` bucket so they aggregate predictably
201
- * rather than producing per-task domains-of-one.
202
- */
203
- export function domainOfTaskId(taskId) {
204
- const idx = taskId.indexOf("/");
205
- if (idx <= 0)
206
- return "unknown";
207
- return taskId.slice(0, idx);
208
- }
209
- /**
210
- * Compute the negative-transfer aggregate over a set of per-task entries
211
- * (one entry per task; both arms already aggregated into PerTaskMetrics).
212
- *
213
- * A task is "regressed" when `akm.passRate < noakm.passRate`. Ties (equal
214
- * pass rate, including 0=0) are NOT regressions. `topRegressedTasks` is
215
- * sorted by `severity` descending then `taskId` ascending so output is
216
- * deterministic.
217
- */
218
- export function computeNegativeTransfer(tasks) {
219
- const regressed = [];
220
- for (const t of tasks) {
221
- const delta = t.akm.passRate - t.noakm.passRate;
222
- if (delta >= 0)
223
- continue;
224
- regressed.push({
225
- taskId: t.id,
226
- domain: domainOfTaskId(t.id),
227
- noakmPassRate: t.noakm.passRate,
228
- akmPassRate: t.akm.passRate,
229
- delta,
230
- severity: -delta,
231
- });
232
- }
233
- regressed.sort((a, b) => {
234
- if (b.severity !== a.severity)
235
- return b.severity - a.severity;
236
- return a.taskId.localeCompare(b.taskId);
237
- });
238
- const severity = regressed.reduce((acc, r) => acc + r.severity, 0);
239
- return { count: regressed.length, severity, topRegressedTasks: regressed };
240
- }
241
- /**
242
- * Compute per-domain aggregates over a set of per-task entries. Each task
243
- * contributes once to its domain (K seeds already collapsed). Output rows
244
- * are sorted by `domain` ascending so JSON / markdown are byte-stable.
245
- *
246
- * Domain extraction uses `domainOfTaskId` (split on first `/`).
247
- */
248
- export function computeDomainAggregates(tasks) {
249
- const buckets = new Map();
250
- for (const t of tasks) {
251
- const d = domainOfTaskId(t.id);
252
- let arr = buckets.get(d);
253
- if (!arr) {
254
- arr = [];
255
- buckets.set(d, arr);
256
- }
257
- arr.push(t);
258
- }
259
- const rows = [];
260
- for (const [domain, group] of buckets) {
261
- const n = group.length;
262
- let noakmSum = 0;
263
- let akmSum = 0;
264
- let wallNoakm = 0;
265
- let wallAkm = 0;
266
- let regressionCount = 0;
267
- const noakmTpp = [];
268
- const akmTpp = [];
269
- for (const t of group) {
270
- noakmSum += t.noakm.passRate;
271
- akmSum += t.akm.passRate;
272
- wallNoakm += t.noakm.wallclockMs;
273
- wallAkm += t.akm.wallclockMs;
274
- if (t.akm.passRate < t.noakm.passRate)
275
- regressionCount += 1;
276
- if (t.noakm.tokensPerPass !== null)
277
- noakmTpp.push(t.noakm.tokensPerPass);
278
- if (t.akm.tokensPerPass !== null)
279
- akmTpp.push(t.akm.tokensPerPass);
280
- }
281
- const passRateNoakm = noakmSum / n;
282
- const passRateAkm = akmSum / n;
283
- const meanNoakmTpp = noakmTpp.length === 0 ? null : noakmTpp.reduce((a, b) => a + b, 0) / noakmTpp.length;
284
- const meanAkmTpp = akmTpp.length === 0 ? null : akmTpp.reduce((a, b) => a + b, 0) / akmTpp.length;
285
- const tokensPerPassDelta = meanNoakmTpp === null || meanAkmTpp === null ? null : meanAkmTpp - meanNoakmTpp;
286
- rows.push({
287
- domain,
288
- taskCount: n,
289
- regressionCount,
290
- passRateNoakm,
291
- passRateAkm,
292
- passRateDelta: passRateAkm - passRateNoakm,
293
- tokensPerPassDelta,
294
- wallclockMsDelta: wallAkm / n - wallNoakm / n,
295
- });
296
- }
297
- rows.sort((a, b) => a.domain.localeCompare(b.domain));
298
- return rows;
299
- }
300
- /**
301
- * Compute asset-regression-candidate rows (#260). Walks the AKM-arm runs,
302
- * keeps only those whose `taskId` is in `regressedTaskIds`, and tallies how
303
- * often each loaded asset shows up. `regressedTaskCount` (distinct task IDs
304
- * touched) is the primary sort key — assets that hurt many tasks are more
305
- * actionable than assets that flooded one task across seeds.
306
- *
307
- * Sort: regressedTaskCount desc, totalLoadCount desc, assetRef asc.
308
- */
309
- export function computeAssetRegressionCandidates(regressedTaskIds, akmRuns) {
310
- const regressed = new Set(regressedTaskIds);
311
- if (regressed.size === 0)
312
- return [];
313
- const taskIdsByAsset = new Map();
314
- const totalLoadByAsset = new Map();
315
- for (const run of akmRuns) {
316
- if (!regressed.has(run.taskId))
317
- continue;
318
- const assets = run.assetsLoaded ?? [];
319
- for (const ref of assets) {
320
- let bucket = taskIdsByAsset.get(ref);
321
- if (!bucket) {
322
- bucket = new Set();
323
- taskIdsByAsset.set(ref, bucket);
324
- }
325
- bucket.add(run.taskId);
326
- totalLoadByAsset.set(ref, (totalLoadByAsset.get(ref) ?? 0) + 1);
327
- }
328
- }
329
- const rows = [];
330
- for (const [assetRef, taskIds] of taskIdsByAsset) {
331
- rows.push({
332
- assetRef,
333
- regressedTaskCount: taskIds.size,
334
- regressedTaskIds: [...taskIds].sort(),
335
- totalLoadCount: totalLoadByAsset.get(assetRef) ?? 0,
336
- });
337
- }
338
- rows.sort((a, b) => {
339
- if (b.regressedTaskCount !== a.regressedTaskCount)
340
- return b.regressedTaskCount - a.regressedTaskCount;
341
- if (b.totalLoadCount !== a.totalLoadCount)
342
- return b.totalLoadCount - a.totalLoadCount;
343
- return a.assetRef.localeCompare(b.assetRef);
344
- });
345
- return rows;
346
- }
347
- // ── Per-asset attribution (§6.5) ───────────────────────────────────────────
348
- /**
349
- * Extract the unique asset refs an agent loaded during a run by scanning
350
- * `events[]` and `verifierStdout` for `akm show <ref>` invocations.
351
- *
352
- * Detection strategy (all heuristic, all conservative):
353
- * 1. `event.eventType === "show"` with `event.ref` (forward-compat — akm
354
- * itself does not currently emit `show` events).
355
- * 2. Substring match on `akm show <ref>` in stdout. The ref shape is
356
- * `[origin//]type:name` per the v1 contract; we accept word-boundary
357
- * terminators after the name.
358
- * 3. Tool-call JSON `{"args":["show","<ref>"]}` — the form opencode logs
359
- * when the agent invokes the akm CLI as a tool. We extract refs that
360
- * look like asset refs from the args array entries adjacent to "show".
361
- *
362
- * Returns refs in first-seen order, deduplicated. Bounded scan: stdout is
363
- * truncated at 16 MiB (the same cap the trajectory parser uses) to keep
364
- * runaway agents from OOMing the bench.
365
- */
366
- const ASSET_LOAD_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
367
- // Asset ref grammar: optional `origin//` prefix, type:name, where type and
368
- // name are lowercase letters, digits, `_`, `-`. We deliberately do NOT match
369
- // `://` schemes (those are install locators, not asset refs). The character
370
- // class is intentionally tight so we don't mis-pickup arbitrary words after
371
- // `akm show`. The `name` segment is restricted to `[A-Za-z0-9_-]+` (no `/`,
372
- // no `.`) — the v1 grammar in src/core/asset-ref.ts permits `/` and `.` in
373
- // names (e.g. `script:db/migrate/run.sh`), but the masker treats names as
374
- // untrusted input and rejects any traversal-shaped value, so the bench-side
375
- // scanner does not need (or want) to extract such refs from agent stdout.
376
- // Limiting the regex here is defense-in-depth against a prompt-injected
377
- // agent emitting `akm show "skill:../../etc"` and us pulling that ref into
378
- // the masking flow.
379
- const ASSET_REF_PATTERN = /(?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+/g;
380
- export function extractAssetLoads(runResult) {
381
- const seen = new Set();
382
- const out = [];
383
- const push = (ref) => {
384
- if (!ref)
385
- return;
386
- if (seen.has(ref))
387
- return;
388
- seen.add(ref);
389
- out.push(ref);
390
- };
391
- // 1. Events stream.
392
- for (const event of runResult.events) {
393
- if (event.eventType === "show" && typeof event.ref === "string") {
394
- push(event.ref);
395
- }
396
- const meta = event.metadata;
397
- if (meta && typeof meta === "object" && event.eventType === "show") {
398
- const candidate = meta.ref;
399
- if (typeof candidate === "string")
400
- push(candidate);
401
- }
402
- }
403
- // 2 & 3. Stdout scanning. Bound the scan so a runaway agent stdout cannot
404
- // OOM the bench. Truncation is silent — the trajectory parser already
405
- // surfaces a warning for the same data on its own scan.
406
- let haystack = runResult.verifierStdout || "";
407
- if (haystack.length > ASSET_LOAD_STDOUT_SCAN_CAP) {
408
- haystack = haystack.slice(0, ASSET_LOAD_STDOUT_SCAN_CAP);
409
- }
410
- // `akm show <ref>` literal form. Accept optional quoting around the ref so
411
- // shell traces like `akm show "skill:foo"` work too.
412
- const literalRe = /akm\s+show\s+["']?((?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+)["']?/g;
413
- for (const literalMatch of haystack.matchAll(literalRe)) {
414
- push(literalMatch[1]);
415
- }
416
- // Tool-call JSON form. `"args":[..., "show", "<ref>", ...]`. We extract
417
- // every refish token in the haystack that follows a "show" arg in JSON-y
418
- // form. A second cheap pass keeps the pattern simple.
419
- const toolCallRe = /"show"\s*,\s*"((?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+)"/g;
420
- for (const toolCallMatch of haystack.matchAll(toolCallRe)) {
421
- push(toolCallMatch[1]);
422
- }
423
- return out;
424
- }
425
- // Suppress the unused warning for `ASSET_REF_PATTERN` above. The constant is
426
- // retained as the documentation seam called out by the #251 review addenda,
427
- // even though `extractAssetLoads` uses inline regexes for its two scan forms.
428
- void ASSET_REF_PATTERN;
429
- /**
430
- * Anchored variant of `ASSET_REF_PATTERN` for whole-string validation.
431
- *
432
- * Used by `materialiseMaskedStash` (#251) to gate every asset ref BEFORE we
433
- * touch the filesystem. The base `ASSET_REF_PATTERN` is `/g`-flagged for
434
- * scanning agent stdout; we re-anchor here so a hostile string like
435
- * `skill:foo/../../etc` is rejected as a whole even though the regex would
436
- * happily match a `skill:foo` substring under `/g`.
437
- *
438
- * Rejects `..`, absolute paths, drive letters, null bytes, `/`, `\`, and
439
- * anything else outside the v1 ref grammar (mirrors src/core/asset-ref.ts).
440
- */
441
- const ASSET_REF_ANCHORED = /^(?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+$/;
442
- /**
443
- * Reject hostile asset refs before they reach any `fs.rmSync` call. The ref
444
- * comes from agent stdout (untrusted; the agent could be prompt-injected) so
445
- * we apply the anchored grammar pattern first, then the per-segment shape
446
- * check after the colon-split. Defense in depth — each layer is sufficient
447
- * on its own; the layered structure makes a future grammar relax safe.
448
- */
449
- function isSafeAssetRef(ref) {
450
- if (!ref)
451
- return false;
452
- if (ref.includes("\0"))
453
- return false;
454
- return ASSET_REF_ANCHORED.test(ref);
455
- }
456
- /**
457
- * Aggregate per-asset load + pass counts across all akm-arm runs in a report.
458
- *
459
- * Sort order (stable, deterministic):
460
- * 1. loadCount descending (most-used first)
461
- * 2. loadPassRate descending (working assets above broken ones at the same load count)
462
- * 3. assetRef ascending (alphabetical tiebreak)
463
- *
464
- * Only `arm === "akm"` runs contribute. The `noakm` arm has no stash and
465
- * cannot load assets, so including it would zero-bias the rates.
466
- */
467
- export function computePerAssetAttribution(report) {
468
- const passing = new Map();
469
- const failing = new Map();
470
- let totalAkmRuns = 0;
471
- // The §13.3 task entry doesn't carry RunResults — we read them from the
472
- // shared akm-arm runs collection that the runner stamps onto `report.akmRuns`.
473
- const akmRuns = collectAkmRuns(report);
474
- for (const r of akmRuns) {
475
- totalAkmRuns += 1;
476
- const isPass = r.outcome === "pass";
477
- for (const ref of r.assetsLoaded ?? []) {
478
- const bucket = isPass ? passing : failing;
479
- bucket.set(ref, (bucket.get(ref) ?? 0) + 1);
480
- }
481
- }
482
- const refs = new Set([...passing.keys(), ...failing.keys()]);
483
- const rows = [];
484
- for (const ref of refs) {
485
- const p = passing.get(ref) ?? 0;
486
- const f = failing.get(ref) ?? 0;
487
- const total = p + f;
488
- rows.push({
489
- assetRef: ref,
490
- loadCountPassing: p,
491
- loadCountFailing: f,
492
- loadCount: total,
493
- loadPassRate: total === 0 ? null : p / total,
494
- });
495
- }
496
- rows.sort((a, b) => {
497
- if (b.loadCount !== a.loadCount)
498
- return b.loadCount - a.loadCount;
499
- const ar = a.loadPassRate ?? -1;
500
- const br = b.loadPassRate ?? -1;
501
- if (br !== ar)
502
- return br - ar;
503
- return a.assetRef.localeCompare(b.assetRef);
504
- });
505
- return { rows, totalAkmRuns };
506
- }
507
- /**
508
- * Pull the akm-arm RunResults out of a UtilityRunReport. The runner stamps
509
- * them into the optional `akmRuns` field on the report so attribution can
510
- * post-process them without re-running.
511
- */
512
- function collectAkmRuns(report) {
513
- if (Array.isArray(report.akmRuns))
514
- return report.akmRuns;
515
- return [];
516
- }
517
- // ── runs[] serialisation (#249) ────────────────────────────────────────────
518
- /**
519
- * Project a list of RunResults onto the compact `runs[]` rows persisted
520
- * inside the §13.3 JSON envelope (#249). One row per (task, arm, seed)
521
- * triple; the renderer walks the input order verbatim, which the runner
522
- * already builds deterministically (per-task block, noakm before akm,
523
- * seeds in ascending order).
524
- *
525
- * Aggregate metrics (per-task, trajectory, failure-mode, search-bridge,
526
- * attribution) MUST be recomputable from these rows + task metadata. This
527
- * helper is the canonical projection — keep it in lockstep with the field
528
- * list in the issue body.
529
- */
530
- export function aggregateRunsForReport(runs) {
531
- return runs.map(serializeRunForReport);
532
- }
533
- /**
534
- * Hydrate a persisted `runs[]` row back into the `RunResult` shape that
535
- * downstream metrics helpers (`computePerAssetAttribution`, `aggregateCorpus`,
536
- * etc.) expect. Used by `bench attribute` / `bench compare` when they read a
537
- * §13.3 envelope from disk: the persisted row carries a compact subset, but
538
- * it carries everything those helpers need.
539
- *
540
- * Fields the row deliberately does NOT carry are filled with safe defaults:
541
- * • `events: []` — events.jsonl is not persisted; downstream attribution
542
- * only consults `assetsLoaded` and `verifierStdout`.
543
- * • `verifierStdout: ""` — full stdout is intentionally omitted from the
544
- * envelope (#249 acceptance criterion). `assetsLoaded` already carries
545
- * the post-hoc extraction the agent run produced.
546
- * • `schemaVersion: 1` — the report schema implies it.
547
- *
548
- * Tokens are passed through as-is so a future `measurement` field added by
549
- * #252 lands on the rehydrated row automatically. TODO(#252): keep this
550
- * spread.
551
- */
552
- export function rehydrateRunFromSerialized(row) {
553
- // The compact row uses a permissive Record shape for tokens (see
554
- // RunRecordSerialized). Coerce defensively so older artefacts with only
555
- // {input, output} hydrate cleanly.
556
- const tok = row.tokens;
557
- return {
558
- schemaVersion: 1,
559
- taskId: row.task_id,
560
- arm: row.arm,
561
- seed: row.seed,
562
- model: row.model,
563
- outcome: row.outcome,
564
- tokens: {
565
- ...tok,
566
- input: typeof tok.input === "number" ? tok.input : 0,
567
- output: typeof tok.output === "number" ? tok.output : 0,
568
- },
569
- wallclockMs: row.wallclock_ms,
570
- trajectory: {
571
- correctAssetLoaded: row.trajectory.correct_asset_loaded,
572
- feedbackRecorded: row.trajectory.feedback_recorded,
573
- },
574
- events: [],
575
- verifierStdout: "",
576
- verifierExitCode: row.verifier_exit_code,
577
- assetsLoaded: [...row.assets_loaded],
578
- failureMode: (row.failure_mode ?? null),
579
- };
580
- }
581
- /**
582
- * Pick the top-N most-loaded assets from a base report and re-run the corpus
583
- * with each one masked from its source stash. Returns a marginal-contribution
584
- * row per masked asset.
585
- *
586
- * Cost: N * (tasks × arms × seedsPerArm) re-runs. Operators clamp N before
587
- * calling — but we also clamp internally if `topN` exceeds the unique-asset
588
- * count to avoid surprising no-op runs.
589
- *
590
- * Source-fixture safety: every masked re-run materialises a fresh tmp copy
591
- * of the fixture stash, deletes the masked asset's files there, and points
592
- * the re-run at the tmp dir. The shipped fixture in `tests/fixtures/stashes/`
593
- * is NEVER mutated.
594
- */
595
- export async function runMaskedCorpus(opts) {
596
- const baseReport = opts.baseReport;
597
- const fixturesRoot = opts.fixturesRoot ?? path.resolve(__dirname, "..", "fixtures", "stashes");
598
- const attribution = computePerAssetAttribution(baseReport);
599
- const desired = Math.max(1, opts.topN ?? 5);
600
- const clamped = Math.min(desired, attribution.rows.length);
601
- const baseAkmPassRate = baseReport.aggregateAkm.passRate;
602
- const top = attribution.rows.slice(0, clamped);
603
- const attributions = [];
604
- const maskedRefs = [];
605
- for (const row of top) {
606
- const maskedTasks = [];
607
- const tmpDirs = [];
608
- try {
609
- for (const baseTask of baseReport.taskMetadata ?? []) {
610
- const maskedStashDir = materialiseMaskedStash(fixturesRoot, baseTask.stash, row.assetRef);
611
- if (maskedStashDir)
612
- tmpDirs.push(maskedStashDir);
613
- // Issue #251: forward the masked stashDir via the explicit
614
- // `stashDirOverride` field on the cloned TaskMetadata. We MUST NOT
615
- // mutate `baseTask.stash` (the fixture name) — the runner uses that
616
- // to call `loadFixtureStash`, and overloading it breaks the
617
- // `__no-stash__` resolution branch in runner.ts. The runner's AKM-arm
618
- // branch checks `task.stashDirOverride` first.
619
- //
620
- // When `materialiseMaskedStash` returned `null` (asset not present in
621
- // this fixture, or hostile ref shape rejected by the validator), we
622
- // intentionally leave both fields untouched. The runner falls back to
623
- // the normal materialisation flow against the unchanged source
624
- // fixture — so the re-run still happens, but the result mirrors the
625
- // base. This is a meaningful diagnostic (the ref didn't bind in this
626
- // fixture) and is the same accounting `cost-accounting`-style tests
627
- // assert against.
628
- if (maskedStashDir) {
629
- maskedTasks.push({ ...baseTask, stashDirOverride: maskedStashDir });
630
- }
631
- else {
632
- maskedTasks.push({ ...baseTask });
633
- }
634
- }
635
- const maskedReport = await opts.runUtility({
636
- ...opts.baseOptions,
637
- tasks: maskedTasks,
638
- // The masked stash already has the correct content on disk, and the
639
- // runner now resolves it via `task.stashDirOverride`. We still pass
640
- // `materialiseStash: false` so the runner does not call
641
- // `loadFixtureStash` against the (unmasked) named fixture — that
642
- // would waste work and risk re-indexing the source dir.
643
- materialiseStash: false,
644
- });
645
- const maskedPassRate = maskedReport.aggregateAkm.passRate;
646
- attributions.push({
647
- assetRef: row.assetRef,
648
- basePassRate: baseAkmPassRate,
649
- maskedPassRate,
650
- marginalContribution: baseAkmPassRate - maskedPassRate,
651
- });
652
- maskedRefs.push(row.assetRef);
653
- }
654
- finally {
655
- // Cleanup runs in BOTH success and failure paths (acceptance criterion).
656
- // Best-effort: a tmpfs failure here is logged via the `try/catch` below
657
- // and the host OS reaps the tmp dir on reboot.
658
- for (const dir of tmpDirs) {
659
- try {
660
- fs.rmSync(dir, { recursive: true, force: true });
661
- }
662
- catch {
663
- // Best-effort cleanup; tmpfs cleanup will handle leaks.
664
- }
665
- }
666
- }
667
- }
668
- return {
669
- baseReport,
670
- attributions,
671
- runsPerformed: clamped,
672
- maskingStrategy: "leave-one-out",
673
- maskedRefs,
674
- };
675
- }
676
- /**
677
- * Copy a fixture stash into a fresh tmp dir, delete every file matching the
678
- * masked asset ref, and return the tmp dir path. Returns `null` if the named
679
- * asset is not present in the fixture (we still re-run, but the result will
680
- * mirror the base — which is itself a meaningful diagnostic).
681
- *
682
- * The masking heuristic:
683
- * 1. Walk `<stash>/*<...>/.stash.json` files.
684
- * 2. For each entry whose `name` + `type` matches the asset ref, drop the
685
- * entry and delete its `filename` if present.
686
- * 3. Rewrite the `.stash.json` with the trimmed entries (or remove it if
687
- * it is now empty).
688
- */
689
- export function materialiseMaskedStash(fixturesRoot, stashName, assetRef) {
690
- // #271: validate stashName containment BEFORE touching the filesystem.
691
- // `stashName` originates from a task YAML which, while authored, is part
692
- // of the fixture corpus the bench loads; a fixture with `stash: "../../etc"`
693
- // would otherwise resolve outside `fixturesRoot` and let masking edits or
694
- // copies escape the bench sandbox. path.relative gives the cleanest
695
- // containment check (handles `..` AND absolute path injection in one go).
696
- const fixturesRootResolved = path.resolve(fixturesRoot);
697
- const sourceDir = path.resolve(fixturesRootResolved, stashName);
698
- const rel = path.relative(fixturesRootResolved, sourceDir);
699
- if (rel.startsWith("..") || path.isAbsolute(rel))
700
- return null;
701
- if (!fs.existsSync(path.join(sourceDir, "MANIFEST.json")))
702
- return null;
703
- // Issue #251 review addendum: validate the WHOLE ref against the anchored
704
- // grammar before we touch the filesystem. The downstream `isSafeAssetNameSegment`
705
- // + `isPathContained` checks are still applied — this is defense in depth.
706
- if (!isSafeAssetRef(assetRef))
707
- return null;
708
- const colonIdx = assetRef.indexOf(":");
709
- if (colonIdx < 0) {
710
- // Malformed ref: still produce a tmp copy with no edits so the caller's
711
- // re-run sees the unmodified fixture.
712
- const tmpRoot = benchMkdtemp(`akm-bench-masked-${stashName}-`);
713
- copyDirRecursive(sourceDir, tmpRoot);
714
- return tmpRoot;
715
- }
716
- const typeWithOrigin = assetRef.slice(0, colonIdx);
717
- const name = assetRef.slice(colonIdx + 1);
718
- const type = typeWithOrigin.includes("//") ? (typeWithOrigin.split("//")[1] ?? typeWithOrigin) : typeWithOrigin;
719
- // SECURITY: the asset ref originates from agent stdout (untrusted; the
720
- // agent could be prompt-injected). The masking heuristic below will
721
- // `fs.rmSync` files under the tmp stash dir whose names are derived from
722
- // `name`. A traversal-shaped name (`../etc`, `/abs/path`, `..\\..`) would
723
- // escape the tmp root and delete arbitrary disk content. Reject those
724
- // shapes BEFORE we materialise — and re-validate after path-resolving
725
- // each candidate. Mirrors src/core/asset-ref.ts validateName().
726
- if (!isSafeAssetNameSegment(name))
727
- return null;
728
- const tmpRoot = benchMkdtemp(`akm-bench-masked-${stashName}-`);
729
- copyDirRecursive(sourceDir, tmpRoot);
730
- // Walk every .stash.json under the tmp root and edit in place.
731
- walkStashJsonFiles(tmpRoot, (jsonPath) => {
732
- let raw;
733
- try {
734
- raw = fs.readFileSync(jsonPath, "utf8");
735
- }
736
- catch {
737
- return;
738
- }
739
- let parsed;
740
- try {
741
- parsed = JSON.parse(raw);
742
- }
743
- catch {
744
- return;
745
- }
746
- const entries = parsed.entries ?? [];
747
- const kept = [];
748
- const jsonDir = path.dirname(jsonPath);
749
- for (const entry of entries) {
750
- if (entry.type === type && entry.name === name) {
751
- // Remove the entry's content file(s). The on-disk `filename` is read
752
- // from the fixture .stash.json (trusted) but the value still passes
753
- // through path.relative containment so a malicious fixture can't use
754
- // this path to escape either.
755
- const filename = entry.filename;
756
- if (typeof filename === "string" && isSafeAssetNameSegment(filename)) {
757
- const target = path.resolve(jsonDir, filename);
758
- if (isPathContained(tmpRoot, target)) {
759
- try {
760
- fs.rmSync(target, { force: true });
761
- }
762
- catch {
763
- // ignore
764
- }
765
- }
766
- }
767
- // Some fixtures keep a per-asset directory (e.g. skills/<name>/SKILL.md).
768
- const dirCandidate = path.resolve(jsonDir, name);
769
- if (isPathContained(tmpRoot, dirCandidate) &&
770
- fs.existsSync(dirCandidate) &&
771
- fs.statSync(dirCandidate).isDirectory()) {
772
- try {
773
- fs.rmSync(dirCandidate, { recursive: true, force: true });
774
- }
775
- catch {
776
- // ignore
777
- }
778
- }
779
- continue;
780
- }
781
- kept.push(entry);
782
- }
783
- if (kept.length === entries.length)
784
- return; // nothing changed
785
- if (kept.length === 0) {
786
- try {
787
- fs.rmSync(jsonPath, { force: true });
788
- }
789
- catch {
790
- // ignore
791
- }
792
- }
793
- else {
794
- fs.writeFileSync(jsonPath, `${JSON.stringify({ ...parsed, entries: kept }, null, 2)}\n`);
795
- }
796
- });
797
- return tmpRoot;
798
- }
799
- /**
800
- * Reject any segment that could escape the tmp stash root when used as a
801
- * relative path component:
802
- * - empty string
803
- * - any `/` or `\\` (path separators)
804
- * - a `..` segment in any form
805
- * - a leading `/` (POSIX absolute) or `C:` (Windows drive)
806
- * - any null byte
807
- *
808
- * Mirrors src/core/asset-ref.ts validateName(), but returns a boolean
809
- * (callers map this to "skip" rather than "throw").
810
- */
811
- function isSafeAssetNameSegment(value) {
812
- if (!value)
813
- return false;
814
- if (value.includes("\0"))
815
- return false;
816
- if (value.includes("/") || value.includes("\\"))
817
- return false;
818
- if (value === ".." || value === ".")
819
- return false;
820
- if (/^[A-Za-z]:/.test(value))
821
- return false;
822
- return true;
823
- }
824
- /**
825
- * After resolving a target path, confirm it lives under `root`. Defense in
826
- * depth: even if a traversal-shaped name slipped past the segment check,
827
- * this catches escapes via symlinks or odd `path.join` semantics.
828
- *
829
- * #271: aligned with `isWithin` in `src/core/common.ts` — both inputs go
830
- * through `safeRealpath` so a symlink inside `root` that points outside
831
- * cannot fool the `path.relative` containment check. The shared helper
832
- * also handles not-yet-existing children (walks up to the closest existing
833
- * ancestor and resolves symlinks there) so we keep the existing semantics
834
- * for `target` paths the masking heuristic is about to create.
835
- */
836
- export function isPathContained(root, target) {
837
- const rootResolved = safeRealpath(root);
838
- const targetResolved = safeRealpath(target);
839
- const rel = path.relative(rootResolved, targetResolved);
840
- if (rel === "")
841
- return true;
842
- if (rel.startsWith(".."))
843
- return false;
844
- if (path.isAbsolute(rel))
845
- return false;
846
- return true;
847
- }
848
- function walkStashJsonFiles(root, visit) {
849
- const stack = [root];
850
- while (stack.length > 0) {
851
- const cur = stack.pop();
852
- if (!cur)
853
- continue;
854
- let entries;
855
- try {
856
- entries = fs.readdirSync(cur, { withFileTypes: true });
857
- }
858
- catch {
859
- continue;
860
- }
861
- for (const entry of entries) {
862
- const abs = path.join(cur, entry.name);
863
- if (entry.isDirectory())
864
- stack.push(abs);
865
- else if (entry.isFile() && entry.name === ".stash.json")
866
- visit(abs);
867
- }
868
- }
869
- }
870
- function copyDirRecursive(src, dest) {
871
- fs.mkdirSync(dest, { recursive: true });
872
- const entries = fs.readdirSync(src, { withFileTypes: true });
873
- for (const entry of entries) {
874
- const s = path.join(src, entry.name);
875
- const d = path.join(dest, entry.name);
876
- if (entry.isDirectory())
877
- copyDirRecursive(s, d);
878
- else if (entry.isFile())
879
- fs.copyFileSync(s, d);
880
- }
881
- }
882
- /** Aggregate trajectory booleans across a bag of runs. */
883
- export function aggregateTrajectory(results) {
884
- if (results.length === 0) {
885
- return { correctAssetLoaded: null, feedbackRecorded: 0 };
886
- }
887
- let knownAsset = 0;
888
- let assetLoaded = 0;
889
- let feedback = 0;
890
- for (const r of results) {
891
- if (r.trajectory.correctAssetLoaded !== null) {
892
- knownAsset += 1;
893
- if (r.trajectory.correctAssetLoaded)
894
- assetLoaded += 1;
895
- }
896
- if (r.trajectory.feedbackRecorded === true)
897
- feedback += 1;
898
- }
899
- return {
900
- correctAssetLoaded: knownAsset === 0 ? null : assetLoaded / knownAsset,
901
- feedbackRecorded: feedback / results.length,
902
- };
903
- }
904
- /**
905
- * Sign threshold below which a delta is rendered as `flat`. `pass_rate` is
906
- * normalised to `[0, 1]`, so a 0.005 (0.5pp) tolerance keeps tiny K-seed
907
- * sampling jitter from looking like a regression.
908
- */
909
- const PASS_RATE_FLAT_TOLERANCE = 0.005;
910
- /** `tokens_per_pass` and `wallclock_ms` use raw counts; 0 is the only "flat". */
911
- const COUNT_FLAT_TOLERANCE = 0;
912
- function classifyPassRate(delta) {
913
- if (delta === null)
914
- return "flat";
915
- if (Math.abs(delta) <= PASS_RATE_FLAT_TOLERANCE)
916
- return "flat";
917
- return delta > 0 ? "improve" : "regress";
918
- }
919
- function classifyCount(delta, lowerIsBetter) {
920
- if (delta === null)
921
- return "flat";
922
- if (Math.abs(delta) <= COUNT_FLAT_TOLERANCE)
923
- return "flat";
924
- if (lowerIsBetter)
925
- return delta < 0 ? "improve" : "regress";
926
- return delta > 0 ? "improve" : "regress";
927
- }
928
- function readModel(r) {
929
- return r.agent?.model ?? "<unknown>";
930
- }
931
- function readFixtureHash(r) {
932
- const v = r.corpus?.fixtureContentHash;
933
- return v === undefined || v === null ? null : v;
934
- }
935
- function readTaskCorpusHash(r) {
936
- const v = r.corpus?.taskCorpusHash;
937
- return v === undefined || v === null ? null : v;
938
- }
939
- function readSelectedTaskIds(r) {
940
- const v = r.corpus?.selectedTaskIds;
941
- return Array.isArray(v) ? v : null;
942
- }
943
- function arraysEqualIgnoringOrder(a, b) {
944
- if (a.length !== b.length)
945
- return false;
946
- const sa = [...a].sort();
947
- const sb = [...b].sort();
948
- for (let i = 0; i < sa.length; i += 1)
949
- if (sa[i] !== sb[i])
950
- return false;
951
- return true;
952
- }
953
- function akmAgg(r) {
954
- const a = r.aggregate?.akm ?? {};
955
- return {
956
- pass_rate: a.pass_rate ?? 0,
957
- tokens_per_pass: a.tokens_per_pass ?? null,
958
- wallclock_ms: a.wallclock_ms ?? 0,
959
- };
960
- }
961
- /**
962
- * Diff two parsed UtilityRunReport JSONs.
963
- *
964
- * Refusal cases:
965
- * • Either side missing `schemaVersion: 1` or `track: "utility"` →
966
- * `schema_mismatch` / `track_mismatch`.
967
- * • `agent.model` differs → `model_mismatch`.
968
- * • Both sides report a `corpus.fixtureContentHash` and they differ →
969
- * `hash_mismatch`. Missing hash on either side proceeds with a warning
970
- * (Wave A may add it; older reports won't have it).
971
- *
972
- * On success the per-task table includes rows for every task in either side,
973
- * plus aggregate deltas computed against the akm arm only (the noakm arm is
974
- * the control — its delta is meaningless). `pass_rate` is in `[0, 1]`,
975
- * higher is better; `tokens_per_pass` and `wallclock_ms` are counts, lower
976
- * is better.
977
- */
978
- export function compareReports(base, current, options = {}) {
979
- // Schema-version gate.
980
- if (base.schemaVersion !== 1 || current.schemaVersion !== 1) {
981
- return {
982
- ok: false,
983
- reason: "schema_mismatch",
984
- message: `compare requires schemaVersion=1 on both sides; got base=${String(base.schemaVersion)}, current=${String(current.schemaVersion)}`,
985
- };
986
- }
987
- // Track gate. Cross-track diffs are nonsensical.
988
- if (base.track !== "utility" || current.track !== "utility") {
989
- return {
990
- ok: false,
991
- reason: "track_mismatch",
992
- message: `compare only supports track="utility"; got base="${String(base.track)}", current="${String(current.track)}"`,
993
- };
994
- }
995
- const baseModel = readModel(base);
996
- const currentModel = readModel(current);
997
- if (baseModel !== currentModel) {
998
- return {
999
- ok: false,
1000
- reason: "model_mismatch",
1001
- message: `cannot compare across different models: base="${baseModel}", current="${currentModel}". Rerun on the same model.`,
1002
- baseModel,
1003
- currentModel,
1004
- };
1005
- }
1006
- const baseHash = readFixtureHash(base);
1007
- const currentHash = readFixtureHash(current);
1008
- const warnings = [];
1009
- // #250 — task corpus hash + selected task IDs. Refused unless either side
1010
- // is legacy (missing the hash) or the operator passed
1011
- // `allowCorpusMismatch`. Legacy reports (no taskCorpusHash) degrade to a
1012
- // warning so older artefacts can still be diffed.
1013
- const baseTaskHash = readTaskCorpusHash(base);
1014
- const currentTaskHash = readTaskCorpusHash(current);
1015
- const baseIds = readSelectedTaskIds(base);
1016
- const currentIds = readSelectedTaskIds(current);
1017
- if (baseTaskHash !== null && currentTaskHash !== null && baseTaskHash !== currentTaskHash) {
1018
- if (!options.allowCorpusMismatch) {
1019
- return {
1020
- ok: false,
1021
- reason: "corpus_mismatch",
1022
- message: `cannot compare across different task corpora: base taskCorpusHash="${baseTaskHash}", current="${currentTaskHash}". Rerun against the same task selection or pass --allow-corpus-mismatch to override.`,
1023
- baseModel,
1024
- currentModel,
1025
- baseTaskCorpusHash: baseTaskHash,
1026
- currentTaskCorpusHash: currentTaskHash,
1027
- ...(baseIds ? { baseSelectedTaskIds: baseIds } : {}),
1028
- ...(currentIds ? { currentSelectedTaskIds: currentIds } : {}),
1029
- };
1030
- }
1031
- warnings.push(`task corpus hashes differ (base="${baseTaskHash}", current="${currentTaskHash}") — diff requested via --allow-corpus-mismatch`);
1032
- }
1033
- else if (baseTaskHash === null &&
1034
- currentTaskHash === null &&
1035
- baseIds !== null &&
1036
- currentIds !== null &&
1037
- !arraysEqualIgnoringOrder(baseIds, currentIds)) {
1038
- // Both sides legacy (no taskCorpusHash) but both expose selectedTaskIds
1039
- // and they differ. We can still detect a mismatched corpus from the ID
1040
- // list alone — refuse unless the operator opted in.
1041
- if (!options.allowCorpusMismatch) {
1042
- return {
1043
- ok: false,
1044
- reason: "corpus_mismatch",
1045
- message: `cannot compare across different selected task IDs. Rerun against the same task selection or pass --allow-corpus-mismatch to override.`,
1046
- baseModel,
1047
- currentModel,
1048
- baseSelectedTaskIds: baseIds,
1049
- currentSelectedTaskIds: currentIds,
1050
- };
1051
- }
1052
- warnings.push("selected task IDs differ — diff requested via --allow-corpus-mismatch");
1053
- }
1054
- if (baseTaskHash === null)
1055
- warnings.push("base report has no corpus.taskCorpusHash; proceeding without task-corpus-pin check");
1056
- if (currentTaskHash === null)
1057
- warnings.push("current report has no corpus.taskCorpusHash; proceeding without task-corpus-pin check");
1058
- if (baseHash !== null && currentHash !== null && baseHash !== currentHash) {
1059
- if (!options.allowFixtureMismatch) {
1060
- return {
1061
- ok: false,
1062
- reason: "hash_mismatch",
1063
- message: `cannot compare across different fixture-content hashes: base="${baseHash}", current="${currentHash}". Rerun against matching fixtures or pass --allow-fixture-mismatch to override.`,
1064
- baseModel,
1065
- currentModel,
1066
- baseFixtureContentHash: baseHash,
1067
- currentFixtureContentHash: currentHash,
1068
- };
1069
- }
1070
- warnings.push(`fixture-content hashes differ (base="${baseHash}", current="${currentHash}") — diff requested via --allow-fixture-mismatch`);
1071
- }
1072
- if (baseHash === null)
1073
- warnings.push("base report has no corpus.fixtureContentHash; proceeding without fixture-pin check");
1074
- if (currentHash === null)
1075
- warnings.push("current report has no corpus.fixtureContentHash; proceeding without fixture-pin check");
1076
- // Aggregate (akm arm is the one that matters — noakm is the control).
1077
- const ba = akmAgg(base);
1078
- const ca = akmAgg(current);
1079
- const passRateDelta = ca.pass_rate - ba.pass_rate;
1080
- const tokensPerPassDelta = ba.tokens_per_pass === null || ca.tokens_per_pass === null ? null : ca.tokens_per_pass - ba.tokens_per_pass;
1081
- const wallclockMsDelta = ca.wallclock_ms - ba.wallclock_ms;
1082
- const aggregate = {
1083
- passRateDelta,
1084
- passRateSign: classifyPassRate(passRateDelta),
1085
- tokensPerPassDelta,
1086
- tokensPerPassSign: classifyCount(tokensPerPassDelta, true),
1087
- wallclockMsDelta,
1088
- wallclockMsSign: classifyCount(wallclockMsDelta, true),
1089
- };
1090
- // Per-task rows. Outer-join on task id.
1091
- const baseTasks = new Map();
1092
- for (const t of base.tasks ?? [])
1093
- baseTasks.set(t.id, t);
1094
- const currentTasks = new Map();
1095
- for (const t of current.tasks ?? [])
1096
- currentTasks.set(t.id, t);
1097
- const allIds = new Set();
1098
- for (const id of baseTasks.keys())
1099
- allIds.add(id);
1100
- for (const id of currentTasks.keys())
1101
- allIds.add(id);
1102
- const perTask = [];
1103
- for (const id of [...allIds].sort()) {
1104
- const b = baseTasks.get(id);
1105
- const c = currentTasks.get(id);
1106
- const bM = b?.akm ?? null;
1107
- const cM = c?.akm ?? null;
1108
- const presence = b !== undefined && c !== undefined ? "both" : b !== undefined ? "base-only" : "current-only";
1109
- const passRateDelta_ = bM !== null && cM !== null ? cM.pass_rate - bM.pass_rate : null;
1110
- const tokensPerPassDelta_ = bM !== null && cM !== null && bM.tokens_per_pass !== null && cM.tokens_per_pass !== null
1111
- ? cM.tokens_per_pass - bM.tokens_per_pass
1112
- : null;
1113
- const wallclockMsDelta_ = bM !== null && cM !== null ? cM.wallclock_ms - bM.wallclock_ms : null;
1114
- perTask.push({
1115
- id,
1116
- presence,
1117
- baseMetrics: bM,
1118
- currentMetrics: cM,
1119
- delta: { passRate: passRateDelta_, tokensPerPass: tokensPerPassDelta_, wallclockMs: wallclockMsDelta_ },
1120
- signMarker: classifyPassRate(passRateDelta_),
1121
- });
1122
- }
1123
- return {
1124
- ok: true,
1125
- baseModel,
1126
- currentModel,
1127
- baseFixtureContentHash: baseHash,
1128
- currentFixtureContentHash: currentHash,
1129
- warnings,
1130
- aggregate,
1131
- perTask,
1132
- };
1133
- }
1134
- /** Maximum rank at which the gold ref still counts as "found"; > this is `search_low_rank`. */
1135
- const SEARCH_RANK_CUTOFF = 5;
1136
- /** Cap on the number of characters of `verifierStdout` we substring-scan. Mirrors trajectory.ts. */
1137
- const FAILURE_MODE_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
1138
- /**
1139
- * Classify a single failed run into one of the §6.6 labels. Pure function —
1140
- * consults `runResult.trajectory.correctAssetLoaded` first (trajectory data
1141
- * is authoritative when present), then falls back to string-matching
1142
- * `runResult.events[]` and `runResult.verifierStdout`. Never calls an LLM,
1143
- * never touches the filesystem.
1144
- *
1145
- * Decision tree (priority order — first match wins):
1146
- * 1. Run not failed (`pass`, `budget_exceeded`, `harness_error`) → `null`.
1147
- * 2. `trajectory.correctAssetLoaded === true` → the agent loaded the gold
1148
- * asset but still failed. This is `loaded_ignored` (agent wrote from
1149
- * memory instead of applying asset content). This short-circuit fixes
1150
- * the 2026-05-03 baseline bug where 24/25 `search_no_gold` labels were
1151
- * wrong because the classifier didn't consult trajectory data.
1152
- * 3. No `akm search` call in the trace:
1153
- * a. If task has no `goldRef` (so `correctAssetLoaded` is always null)
1154
- * → `no_events` (trajectory metric undefined; cannot distinguish
1155
- * "agent ran but events absent" from "agent never ran").
1156
- * b. Otherwise → `no_search`.
1157
- * 4. Search ran, no goldRef → `unrelated_bug`.
1158
- * 5. Search ran; gold ref absent from results → `search_no_gold`.
1159
- * (Only reachable when `correctAssetLoaded` is false or null, since
1160
- * true is handled in step 2.)
1161
- * 6. Gold ref present at rank > 5 → `search_low_rank`.
1162
- * 7. `akm show` invoked on a non-gold ref AND gold ref never loaded
1163
- * → `loaded_wrong`.
1164
- * 8. Gold ref loaded; verifier output suggests the action contradicts the
1165
- * asset's guidance → `loaded_ignored`.
1166
- * 9. Gold ref loaded and apparently followed → `followed_wrong`.
1167
- * 10. Default → `unrelated_bug`.
1168
- */
1169
- export function classifyFailureMode(taskMeta, runResult) {
1170
- if (runResult.outcome !== "fail")
1171
- return null;
1172
- const goldRef = taskMeta.goldRef;
1173
- const correctAssetLoaded = runResult.trajectory?.correctAssetLoaded;
1174
- // 1. Trajectory short-circuit: if events data confirms the gold asset was
1175
- // loaded, the failure must be compliance-related, not discovery-related.
1176
- // Return `loaded_ignored` immediately without scanning stdout.
1177
- if (correctAssetLoaded === true) {
1178
- return "loaded_ignored";
1179
- }
1180
- const trace = collectTrace(runResult);
1181
- // 2. no_search / no_events — no `akm search` invocation anywhere in the trace.
1182
- if (!hasAkmSearch(trace, runResult)) {
1183
- // When there is no goldRef, correctAssetLoaded is always null (the metric
1184
- // is undefined). We cannot tell whether the agent genuinely didn't search
1185
- // or whether events data was simply absent. Use `no_events` to surface
1186
- // this ambiguity rather than conflating it with `no_search`.
1187
- if (!goldRef) {
1188
- return "no_events";
1189
- }
1190
- return "no_search";
1191
- }
1192
- // Without a gold ref the search-based and load-based checks are undefined.
1193
- // We can only distinguish "no_search" / "no_events" from everything else.
1194
- if (!goldRef) {
1195
- return "unrelated_bug";
1196
- }
1197
- const searchRank = findGoldSearchRank(trace, goldRef);
1198
- // 3. search_no_gold — search ran (precondition above) but gold ref absent.
1199
- // Only reachable when correctAssetLoaded is false or null (trajectory
1200
- // data indicates gold was not loaded), because true is handled above.
1201
- if (searchRank === null) {
1202
- return "search_no_gold";
1203
- }
1204
- // 4. search_low_rank — present but below the cutoff.
1205
- if (searchRank > SEARCH_RANK_CUTOFF) {
1206
- return "search_low_rank";
1207
- }
1208
- const goldLoaded = hasAkmShow(trace, runResult, goldRef);
1209
- const otherRefLoaded = hasAkmShowOtherRef(trace, runResult, goldRef);
1210
- // 5. loaded_wrong — agent showed a non-gold ref AND never loaded the gold.
1211
- if (otherRefLoaded && !goldLoaded) {
1212
- return "loaded_wrong";
1213
- }
1214
- // The remaining branches all assume the gold was loaded.
1215
- if (!goldLoaded) {
1216
- // Gold ref was found in search at an acceptable rank, but the agent
1217
- // never loaded anything (gold or otherwise) before failing. The taxonomy
1218
- // table has no row for "found but never opened" — treat as unrelated_bug.
1219
- return "unrelated_bug";
1220
- }
1221
- // 6. loaded_ignored — verifier diagnostic indicates the action contradicts
1222
- // the loaded asset. Conservative heuristic: look for explicit "ignored"
1223
- // or "not applied" markers in the verifier stdout. Without an LLM we
1224
- // cannot detect subtler contradictions, so this branch only fires when
1225
- // the verifier itself flagged the contradiction.
1226
- if (verifierIndicatesIgnored(runResult.verifierStdout)) {
1227
- return "loaded_ignored";
1228
- }
1229
- // 7. followed_wrong — gold loaded, apparently followed, verifier still
1230
- // failed. The §6.6 spec maps this to "the asset itself is wrong".
1231
- return "followed_wrong";
1232
- }
1233
- /** Build a `FailureModeAggregate` from a list of (taskId, label) pairs. */
1234
- export function aggregateFailureModes(entries) {
1235
- const byLabel = {};
1236
- const byTask = {};
1237
- for (const { taskId, mode } of entries) {
1238
- byLabel[mode] = (byLabel[mode] ?? 0) + 1;
1239
- if (!byTask[taskId])
1240
- byTask[taskId] = {};
1241
- byTask[taskId][mode] = (byTask[taskId][mode] ?? 0) + 1;
1242
- }
1243
- return { byLabel, byTask };
1244
- }
1245
- // ── Failure-mode classifier helpers ────────────────────────────────────────
1246
- /**
1247
- * Concatenated string used for substring scans. We pre-build this once per
1248
- * classify call so the helper functions can share it. Stdout is capped per
1249
- * the trajectory parser convention to keep runaway agents from OOMing the
1250
- * bench.
1251
- */
1252
- function collectTrace(runResult) {
1253
- const stdout = runResult.verifierStdout ?? "";
1254
- const capped = stdout.length > FAILURE_MODE_STDOUT_SCAN_CAP ? stdout.slice(0, FAILURE_MODE_STDOUT_SCAN_CAP) : stdout;
1255
- return capped;
1256
- }
1257
- /** Does the trace contain any `akm search` invocation (CLI form OR event)? */
1258
- function hasAkmSearch(trace, runResult) {
1259
- // Tool-call CLI form, e.g. `akm search "deploy homelab"`.
1260
- if (/\bakm\s+search\b/.test(trace))
1261
- return true;
1262
- // Tool-call JSON form, e.g. `"args":["search","..."]`.
1263
- if (trace.includes(`"search"`) && /["']search["']/.test(trace))
1264
- return true;
1265
- // Event-stream form (search verbs aren't currently emitted but the field
1266
- // is forward-compatible — see core/events.ts).
1267
- for (const event of runResult.events) {
1268
- if (event.eventType === "search" || event.eventType === "search_invoked")
1269
- return true;
1270
- }
1271
- return false;
1272
- }
1273
- /**
1274
- * Find the 1-based rank of `goldRef` in the search results captured in the
1275
- * trace, or `null` if not present. Best-effort heuristics:
1276
- * 1. Look for an `akm search` block followed by a numbered list (`1. skill:foo`).
1277
- * 2. Look for a JSON-ish results array containing the ref.
1278
- * 3. Fall back to substring presence — if the ref appears anywhere after
1279
- * a search invocation, treat it as rank-unknown. We err on the side of
1280
- * `1` (best case for the agent) so the classifier doesn't false-positive
1281
- * on `search_low_rank`.
1282
- */
1283
- function findGoldSearchRank(trace, goldRef) {
1284
- // Locate the first `akm search` invocation; restrict the rank search to
1285
- // text after it so we don't pick up `akm show` output.
1286
- const searchMatch = trace.match(/\bakm\s+search\b/);
1287
- if (!searchMatch || searchMatch.index === undefined) {
1288
- // Caller already verified search ran; if our regex disagrees, fall back
1289
- // to scanning the full trace.
1290
- return findRefRankInText(trace, goldRef);
1291
- }
1292
- const after = trace.slice(searchMatch.index);
1293
- return findRefRankInText(after, goldRef);
1294
- }
1295
- function findRefRankInText(text, goldRef) {
1296
- const escaped = goldRef.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
1297
- // Numbered list: lines of the form `<rank>. <ref>` or `<rank>) <ref>`.
1298
- const numberedRe = /^\s*(\d{1,3})[.)]\s+([^\s]+)/gm;
1299
- let match;
1300
- while (true) {
1301
- match = numberedRe.exec(text);
1302
- if (match === null)
1303
- break;
1304
- const ref = match[2];
1305
- if (refsMatch(ref, goldRef)) {
1306
- return Number.parseInt(match[1], 10);
1307
- }
1308
- }
1309
- // JSON array form: `"results":["a","b","skill:foo"]`. Estimate rank by
1310
- // splitting on commas after the bracket. Best-effort.
1311
- const jsonRe = /"results"\s*:\s*\[([^\]]+)\]/;
1312
- const jsonMatch = text.match(jsonRe);
1313
- if (jsonMatch) {
1314
- const items = jsonMatch[1].split(",").map((s) => s.trim().replace(/^["']|["']$/g, ""));
1315
- const idx = items.findIndex((item) => refsMatch(item, goldRef));
1316
- if (idx >= 0)
1317
- return idx + 1;
1318
- }
1319
- // Substring presence — assume rank 1 (best case for the agent, conservative
1320
- // for the `search_low_rank` rule).
1321
- const refRe = new RegExp(`\\b${escaped}\\b`);
1322
- if (refRe.test(text))
1323
- return 1;
1324
- return null;
1325
- }
1326
- /** True when `candidate` is `goldRef` or a strict ref-extension thereof. */
1327
- function refsMatch(candidate, goldRef) {
1328
- if (candidate === goldRef)
1329
- return true;
1330
- if (candidate.endsWith(`//${goldRef}`))
1331
- return true;
1332
- if (candidate.startsWith(`${goldRef}/`))
1333
- return true;
1334
- return false;
1335
- }
1336
- /** Did the agent invoke `akm show <goldRef>` at any point? */
1337
- function hasAkmShow(trace, runResult, goldRef) {
1338
- const escaped = goldRef.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
1339
- // CLI form, exact ref. Also matches origin-prefixed variants like
1340
- // `akm show team//skill:foo` because the `[\w/]*//` prefix is optional.
1341
- const cliRe = new RegExp(`\\bakm\\s+show\\s+["']?(?:[\\w-]+//)?${escaped}(?:\\b|\\W)`);
1342
- if (cliRe.test(trace))
1343
- return true;
1344
- // Tool-call JSON form: `"args":["show","skill:foo"]`.
1345
- if (trace.includes(`"show"`) && trace.includes(goldRef))
1346
- return true;
1347
- // Event-stream metadata.ref.
1348
- for (const event of runResult.events) {
1349
- if (typeof event.ref === "string" && refsMatch(event.ref, goldRef)) {
1350
- // Only count "show" or "load" eventTypes; a `feedback` event mentioning
1351
- // the ref doesn't mean the agent loaded it during this run.
1352
- if (event.eventType === "show" || event.eventType === "load" || event.eventType === "tool_call")
1353
- return true;
1354
- }
1355
- const meta = event.metadata;
1356
- if (meta && typeof meta === "object") {
1357
- const candidate = meta.ref;
1358
- if (typeof candidate === "string" && refsMatch(candidate, goldRef)) {
1359
- if (event.eventType === "show" || event.eventType === "load" || event.eventType === "tool_call")
1360
- return true;
1361
- }
1362
- }
1363
- }
1364
- return false;
1365
- }
1366
- /** Did the agent invoke `akm show <ref>` for some ref OTHER than `goldRef`? */
1367
- function hasAkmShowOtherRef(trace, runResult, goldRef) {
1368
- // CLI form: capture the ref argument and reject when it matches the gold.
1369
- const cliRe = /\bakm\s+show\s+["']?([^\s"'`]+)/g;
1370
- let match;
1371
- while (true) {
1372
- match = cliRe.exec(trace);
1373
- if (match === null)
1374
- break;
1375
- if (!refsMatch(match[1], goldRef))
1376
- return true;
1377
- }
1378
- // Tool-call JSON form: `"args":["show","..."]`. Best-effort scan.
1379
- const jsonRe = /\["show",\s*"([^"]+)"/g;
1380
- while (true) {
1381
- match = jsonRe.exec(trace);
1382
- if (match === null)
1383
- break;
1384
- if (!refsMatch(match[1], goldRef))
1385
- return true;
1386
- }
1387
- // Event-stream form.
1388
- for (const event of runResult.events) {
1389
- if (event.eventType !== "show" && event.eventType !== "load" && event.eventType !== "tool_call")
1390
- continue;
1391
- if (typeof event.ref === "string" && !refsMatch(event.ref, goldRef))
1392
- return true;
1393
- const meta = event.metadata;
1394
- if (meta && typeof meta === "object") {
1395
- const candidate = meta.ref;
1396
- if (typeof candidate === "string" && !refsMatch(candidate, goldRef))
1397
- return true;
1398
- }
1399
- }
1400
- return false;
1401
- }
1402
- /**
1403
- * Conservative heuristic for the `loaded_ignored` branch. Without an LLM we
1404
- * cannot reliably decide whether an arbitrary action contradicts arbitrary
1405
- * asset content; we only fire when the verifier's own diagnostic explicitly
1406
- * flags the gold-asset guidance as ignored.
1407
- *
1408
- * The verifier stdout strings are deterministic — they come from
1409
- * `runVerifier` and the per-task `verify.sh` scripts. Tasks that want to
1410
- * surface this label should emit one of the agreed-upon markers below.
1411
- */
1412
- function verifierIndicatesIgnored(verifierStdout) {
1413
- if (!verifierStdout)
1414
- return false;
1415
- const lower = verifierStdout.toLowerCase();
1416
- return (lower.includes("ignored gold guidance") ||
1417
- lower.includes("guidance ignored") ||
1418
- lower.includes("did not follow loaded asset") ||
1419
- lower.includes("contradicts loaded asset"));
1420
- }
1421
- /** Cap on the number of result refs we extract per `akm search` invocation. */
1422
- const TOP_K = 10;
1423
- /**
1424
- * Extract the gold rank for every `akm search` invocation in a run.
1425
- *
1426
- * The parser scans `runResult.verifierStdout` (which carries the captured
1427
- * agent stdout including its tool-call trace) for `akm search` commands
1428
- * and the result lists that follow them. The first 10 hits are considered;
1429
- * if the gold ref appears, `rankOfGold` is its 1-based position, else
1430
- * `null`.
1431
- *
1432
- * Pure function: never reads from disk and never mutates inputs. When
1433
- * `goldRef` is undefined the function returns `[]` — we only attribute
1434
- * ranks for tasks that actually have a gold asset.
1435
- */
1436
- export function extractGoldRanks(runResult, goldRef) {
1437
- if (!goldRef)
1438
- return [];
1439
- const haystack = runResult.verifierStdout;
1440
- if (!haystack)
1441
- return [];
1442
- const events = [];
1443
- // Walk the stdout linearly. A search invocation looks like
1444
- // `akm search "<query>"` or `akm search <query>`
1445
- // and the subsequent block carries the result list. A new `akm` command
1446
- // (or end of stdout) terminates the previous search's result block.
1447
- const lines = haystack.split(/\r?\n/);
1448
- let active = null;
1449
- // Regex for an `akm search` invocation. Captures the rest of the line
1450
- // after `search ` so we can pick up the query whether it's quoted or not.
1451
- const searchInvocationRe = /\bakm\s+search\s+(.+?)(?:\s+--|$)/;
1452
- // A different `akm <verb>` (not `search`) terminates the active block.
1453
- const akmInvocationRe = /\bakm\s+(\w+)/;
1454
- for (const rawLine of lines) {
1455
- const line = rawLine.trim();
1456
- if (!line)
1457
- continue;
1458
- const searchMatch = line.match(searchInvocationRe);
1459
- if (searchMatch) {
1460
- // Flush any active block before starting a new one.
1461
- if (active) {
1462
- active.rankOfGold = computeRank(active.results, goldRef);
1463
- events.push(active);
1464
- }
1465
- const query = stripQuotes(searchMatch[1].trim());
1466
- active = { query, results: [], rankOfGold: null };
1467
- // Some traces inline the JSON result on the same line — try to extract.
1468
- collectRefsFromLine(line, active.results);
1469
- continue;
1470
- }
1471
- if (!active)
1472
- continue;
1473
- // A non-search akm invocation closes the active search block.
1474
- const akmMatch = line.match(akmInvocationRe);
1475
- if (akmMatch && akmMatch[1] !== "search") {
1476
- active.rankOfGold = computeRank(active.results, goldRef);
1477
- events.push(active);
1478
- active = null;
1479
- continue;
1480
- }
1481
- collectRefsFromLine(line, active.results);
1482
- }
1483
- if (active) {
1484
- active.rankOfGold = computeRank(active.results, goldRef);
1485
- events.push(active);
1486
- }
1487
- return events;
1488
- }
1489
- /** Trim leading/trailing single or double quotes from a query string. */
1490
- function stripQuotes(s) {
1491
- if (s.length >= 2) {
1492
- const first = s[0];
1493
- const last = s[s.length - 1];
1494
- if ((first === '"' && last === '"') || (first === "'" && last === "'")) {
1495
- return s.slice(1, -1);
1496
- }
1497
- }
1498
- return s;
1499
- }
1500
- /**
1501
- * Pull asset refs from a single line into `out`. Matches both plain
1502
- * `ref: <ref>` lines (text mode) and `"ref":"<ref>"` (JSON mode). We
1503
- * stop at TOP_K results to mirror the spec's top-10 cutoff.
1504
- */
1505
- function collectRefsFromLine(line, out) {
1506
- if (out.length >= TOP_K)
1507
- return;
1508
- // JSON form: `"ref":"skill:foo"` or `"ref": "skill:foo"`. Multiple per line possible.
1509
- const jsonRe = /"ref"\s*:\s*"([^"]+)"/g;
1510
- let m;
1511
- m = jsonRe.exec(line);
1512
- while (m !== null) {
1513
- if (out.length >= TOP_K)
1514
- return;
1515
- out.push(m[1]);
1516
- m = jsonRe.exec(line);
1517
- }
1518
- // Plain text form: ` ref: skill:foo`. Only treat the line as a ref-bearing
1519
- // line if it starts with `ref:` (after whitespace). Avoids picking up
1520
- // every `:` in arbitrary stdout.
1521
- const textRe = /^ref:\s*([^\s,]+)/;
1522
- const tm = line.match(textRe);
1523
- if (tm && out.length < TOP_K) {
1524
- out.push(tm[1]);
1525
- }
1526
- }
1527
- /**
1528
- * 1-based rank of `goldRef` in `results`, or `null` if absent within the
1529
- * top 10. We use `matchesGold` for prefix-tolerant matching so
1530
- * `team//skill:foo` counts as `skill:foo` (mirrors trajectory parser).
1531
- */
1532
- function computeRank(results, goldRef) {
1533
- const cap = Math.min(results.length, TOP_K);
1534
- for (let i = 0; i < cap; i += 1) {
1535
- if (matchesGold(results[i], goldRef))
1536
- return i + 1;
1537
- }
1538
- return null;
1539
- }
1540
- function matchesGold(candidate, gold) {
1541
- if (candidate === gold)
1542
- return true;
1543
- if (candidate.endsWith(`//${gold}`))
1544
- return true;
1545
- if (candidate.startsWith(`${gold}/`))
1546
- return true;
1547
- return false;
1548
- }
1549
- /**
1550
- * Aggregate gold-rank records across all akm-arm runs in the corpus.
1551
- *
1552
- * The function operates on `report.goldRankRecords`, which the runner
1553
- * populates per (task, arm, seed). When the corpus has no gold-ref tasks
1554
- * at all (every record list is empty), every metric collapses to a zero
1555
- * envelope and the `passRateByRank` table is empty — the renderer turns
1556
- * that into a single "(N/A)" sentence.
1557
- */
1558
- export function computeSearchBridge(report) {
1559
- const records = report.goldRankRecords ?? [];
1560
- // Histogram + percentile inputs across every search.
1561
- const histogram = emptyHistogram();
1562
- const allRanks = [];
1563
- let totalSearches = 0;
1564
- for (const rec of records) {
1565
- for (const ev of rec.searches) {
1566
- totalSearches += 1;
1567
- allRanks.push(ev.rankOfGold);
1568
- const bucket = ev.rankOfGold === null ? "missing" : String(ev.rankOfGold);
1569
- histogram[bucket] = (histogram[bucket] ?? 0) + 1;
1570
- }
1571
- }
1572
- const goldAtRank1 = totalSearches === 0 ? 0 : (histogram["1"] ?? 0) / totalSearches;
1573
- const goldMissing = totalSearches === 0 ? 0 : (histogram.missing ?? 0) / totalSearches;
1574
- const goldRankP50 = totalSearches === 0 ? null : percentile(allRanks, 50);
1575
- const goldRankP90 = totalSearches === 0 ? null : percentile(allRanks, 90);
1576
- // pass_rate_by_rank — split runs by the rank in *the search the agent
1577
- // actually ran*. We use the last `akm search` of the run (or "missing"
1578
- // when no search at all happened, or "missing" when the agent searched
1579
- // but gold wasn't in the top 10 in that final search). Runs without any
1580
- // `akm search` invocation are dropped from this slice — `pass_rate_by_rank`
1581
- // only describes what happened given a search.
1582
- const passRateBuckets = new Map();
1583
- for (const rec of records) {
1584
- if (rec.searches.length === 0)
1585
- continue;
1586
- const chosen = rec.searches[rec.searches.length - 1];
1587
- const bucket = chosen.rankOfGold === null ? "missing" : String(chosen.rankOfGold);
1588
- const slot = passRateBuckets.get(bucket) ?? { passes: 0, total: 0 };
1589
- slot.total += 1;
1590
- if (rec.outcome === "pass")
1591
- slot.passes += 1;
1592
- passRateBuckets.set(bucket, slot);
1593
- }
1594
- const passRateByRank = [];
1595
- for (const rank of histogramKeys()) {
1596
- const slot = passRateBuckets.get(rank);
1597
- if (!slot)
1598
- continue;
1599
- passRateByRank.push({
1600
- rank,
1601
- passRate: slot.total === 0 ? 0 : slot.passes / slot.total,
1602
- runCount: slot.total,
1603
- });
1604
- }
1605
- return {
1606
- goldRankDistribution: histogram,
1607
- goldRankP50,
1608
- goldRankP90,
1609
- goldAtRank1,
1610
- goldMissing,
1611
- passRateByRank,
1612
- runsObserved: records.length,
1613
- searchesObserved: totalSearches,
1614
- };
1615
- }
1616
- /** Ordered keys used for both the histogram and the pass_rate_by_rank table. */
1617
- export function histogramKeys() {
1618
- return ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "missing"];
1619
- }
1620
- function emptyHistogram() {
1621
- const out = {};
1622
- for (const k of histogramKeys())
1623
- out[k] = 0;
1624
- return out;
1625
- }
1626
- /**
1627
- * Linear-interpolated percentile over a list of ranks. `null` ranks are
1628
- * treated as `Infinity` so the missing bucket pushes percentiles up
1629
- * correctly. Returns `Infinity` when the percentile lands in the missing
1630
- * region; the renderer surfaces that as the literal `"missing"` token so
1631
- * downstream JSON consumers don't choke on `Infinity`.
1632
- */
1633
- function percentile(ranks, p) {
1634
- if (ranks.length === 0)
1635
- return Number.NaN;
1636
- const sorted = ranks.map((r) => (r === null ? Number.POSITIVE_INFINITY : r)).sort((a, b) => a - b);
1637
- // Nearest-rank method (avoids interpolation between Infinity and a finite).
1638
- // index = ceil(p/100 * N) - 1, clamped to [0, N-1].
1639
- const idx = Math.min(sorted.length - 1, Math.max(0, Math.ceil((p / 100) * sorted.length) - 1));
1640
- return sorted[idx];
1641
- }
1642
- /**
1643
- * Aggregate proposal-quality metrics from the evolve runner's proposal log.
1644
- * Pure function — does not touch disk and does not invoke any subprocess.
1645
- */
1646
- export function computeProposalQualityMetrics(proposalLog) {
1647
- const byRef = new Map();
1648
- let totalAccepted = 0;
1649
- let totalLintPass = 0;
1650
- for (const entry of proposalLog) {
1651
- let row = byRef.get(entry.assetRef);
1652
- if (!row) {
1653
- row = { assetRef: entry.assetRef, proposalCount: 0, lintPassCount: 0, acceptedCount: 0 };
1654
- byRef.set(entry.assetRef, row);
1655
- }
1656
- row.proposalCount += 1;
1657
- if (entry.lintPass) {
1658
- row.lintPassCount += 1;
1659
- totalLintPass += 1;
1660
- }
1661
- if (entry.decision === "accept") {
1662
- row.acceptedCount += 1;
1663
- totalAccepted += 1;
1664
- }
1665
- }
1666
- const rows = [...byRef.values()].sort((a, b) => a.assetRef.localeCompare(b.assetRef));
1667
- const totalProposals = proposalLog.length;
1668
- return {
1669
- rows,
1670
- totalProposals,
1671
- totalAccepted,
1672
- acceptanceRate: totalProposals === 0 ? 0 : totalAccepted / totalProposals,
1673
- lintPassRate: totalProposals === 0 ? 0 : totalLintPass / totalProposals,
1674
- };
1675
- }
1676
- /**
1677
- * Compute longitudinal metrics from three §13.3 utility-shaped reports. Each
1678
- * input report is expected to share the same eval-slice corpus, with one arm
1679
- * driving the akm side: `pre` = pre-evolve stash, `post` = evolved stash,
1680
- * `synthetic` = no-stash scratchpad arm.
1681
- *
1682
- * The "arm" we read off each report is `aggregateAkm.passRate` — the runners
1683
- * produce the akm arm for all three (synthetic is just the akm arm with a
1684
- * stripped stashDir; pre/post differ by stash content). `seedsPerArm` for
1685
- * the degradation threshold is taken from the post report's corpus envelope.
1686
- */
1687
- export function computeLongitudinalMetrics(preReport, postReport, syntheticReport) {
1688
- const prePassRate = preReport.aggregateAkm.passRate;
1689
- const postPassRate = postReport.aggregateAkm.passRate;
1690
- const syntheticPassRate = syntheticReport.aggregateAkm.passRate;
1691
- const seedsPerArm = Math.max(1, postReport.corpus.seedsPerArm);
1692
- const oneSeedFraction = 1 / seedsPerArm;
1693
- // Per-task degradation: outer-join pre and post on task id.
1694
- const preTasks = new Map();
1695
- for (const t of preReport.tasks)
1696
- preTasks.set(t.id, t);
1697
- const postTasks = new Map();
1698
- for (const t of postReport.tasks)
1699
- postTasks.set(t.id, t);
1700
- // Index post failure-mode labels by task id (one mode per task — first
1701
- // failed run wins; matches the §6.6 by-task aggregate's natural ordering).
1702
- const postFailureByTask = {};
1703
- const postFailureByTaskMap = postReport.failureModes?.byTask ?? {};
1704
- for (const [taskId, byMode] of Object.entries(postFailureByTaskMap)) {
1705
- const labels = Object.keys(byMode);
1706
- if (labels.length > 0)
1707
- postFailureByTask[taskId] = labels[0];
1708
- }
1709
- const degradations = [];
1710
- const allIds = new Set();
1711
- for (const id of preTasks.keys())
1712
- allIds.add(id);
1713
- for (const id of postTasks.keys())
1714
- allIds.add(id);
1715
- for (const id of [...allIds].sort()) {
1716
- const pre = preTasks.get(id);
1717
- const post = postTasks.get(id);
1718
- if (!pre || !post)
1719
- continue;
1720
- const preRate = pre.akm.passRate;
1721
- const postRate = post.akm.passRate;
1722
- const dropped = preRate - postRate;
1723
- if (dropped > oneSeedFraction) {
1724
- degradations.push({
1725
- taskId: id,
1726
- prePassRate: preRate,
1727
- postPassRate: postRate,
1728
- delta: postRate - preRate,
1729
- failureMode: postFailureByTask[id] ?? null,
1730
- });
1731
- }
1732
- }
1733
- return {
1734
- improvementSlope: postPassRate - prePassRate,
1735
- overSyntheticLift: postPassRate - syntheticPassRate,
1736
- degradationCount: degradations.length,
1737
- degradations,
1738
- prePassRate,
1739
- postPassRate,
1740
- syntheticPassRate,
1741
- };
1742
- }
1743
/** Threshold above `pass_rate[0]` that defines "improvement" for §6.4. */
export const LEARNING_IMPROVEMENT_THRESHOLD = 0.05;
/**
 * Summarise a sequence of learning episodes (§6.4).
 *
 * Sorts a copy of `episodes` by `episode_index`, recomputes each episode's
 * `delta_from_previous_episode` (so the contract holds regardless of what
 * the caller stamped on the input record), and derives:
 *  - `learning_slope`: least-squares slope of pass_rate over episode_index
 *    (0 when fewer than two episodes, or when x has zero variance);
 *  - `time_to_improvement`: the first episode_index after episode 0 whose
 *    pass_rate exceeds the baseline by more than the threshold, or null.
 *
 * Pure: the input array and its elements are never mutated.
 */
export function computeLearningCurve(episodes) {
    // Defensive stable sort — callers may hand us unordered episodes.
    const ordered = [...episodes].sort((left, right) => left.episode_index - right.episode_index);
    const normalised = ordered.map((episode, idx) => ({
        ...episode,
        delta_from_previous_episode: idx === 0 ? 0 : episode.pass_rate - ordered[idx - 1].pass_rate,
    }));
    const passRates = normalised.map((episode) => episode.pass_rate);
    const count = normalised.length;
    // Least-squares slope: sum(dx*dy) / sum(dx^2). A single episode has
    // zero x-variance; report 0 (no observable trend) rather than NaN.
    let slope = 0;
    if (count >= 2) {
        const indices = normalised.map((episode) => episode.episode_index);
        const meanX = indices.reduce((acc, v) => acc + v, 0) / count;
        const meanY = passRates.reduce((acc, v) => acc + v, 0) / count;
        let numerator = 0;
        let denominator = 0;
        indices.forEach((x, i) => {
            const dx = x - meanX;
            numerator += dx * (passRates[i] - meanY);
            denominator += dx * dx;
        });
        slope = denominator === 0 ? 0 : numerator / denominator;
    }
    // Improvement is only meaningful relative to the episode-0 baseline,
    // so episode 0 itself can never be the answer.
    let firstImprovement = null;
    if (count >= 2) {
        const baseline = passRates[0];
        for (let i = 1; i < count; i += 1) {
            if (passRates[i] > baseline + LEARNING_IMPROVEMENT_THRESHOLD) {
                firstImprovement = normalised[i].episode_index;
                break;
            }
        }
    }
    return {
        episodes: normalised,
        pass_rate_by_episode: passRates,
        learning_slope: slope,
        time_to_improvement: firstImprovement,
    };
}
1795
/**
 * Compute the §6.8 feedback-signal integrity confusion matrix.
 *
 * Pure function — no disk access, no subprocesses. Feedback events are
 * joined to runs by `(taskId, seed)`, so each event is attributed to the
 * run that produced it rather than to a later run touching the same gold
 * ref. When one gold ref appears across multiple Phase 1 runs, the
 * per-asset row aggregates across all of them, but each individual
 * event's matrix cell is decided by its own run's outcome.
 *
 * NaN-safety: per-asset rates are `null` whenever their denominator is
 * zero (`false_positive_rate` when FP+TN===0, `false_negative_rate` when
 * FN+TP===0, `feedback_agreement` when the row has no events — not
 * reachable via this function, but defended against for future callers
 * passing curated subsets).
 */
export function computeFeedbackIntegrity(input) {
    const akmRuns = input.phase1.akmRuns ?? [];
    // O(1) outcome lookup per feedback event. Duplicate (taskId, seed)
    // keys shouldn't occur (runner emits unique seeds per task) but the
    // first wins if they do.
    const runOutcomeByKey = new Map();
    for (const run of akmRuns) {
        const key = `${run.taskId}::${run.seed}`;
        if (!runOutcomeByKey.has(key)) {
            runOutcomeByKey.set(key, run.outcome);
        }
    }
    // Map a (signal, passed) pair onto its confusion-matrix cell; null for
    // unrecognised signals, which then label nothing.
    const cellFor = (signal, passed) => {
        if (signal === "positive") {
            return passed ? "truePositive" : "falsePositive";
        }
        if (signal === "negative") {
            return passed ? "falseNegative" : "trueNegative";
        }
        return null;
    };
    const totals = { truePositive: 0, falsePositive: 0, trueNegative: 0, falseNegative: 0 };
    const perRef = new Map();
    // Coverage counts attempted dispatches: a failed dispatch (`ok===false`)
    // still marks its run as covered for §6.8 (the operator wanted feedback
    // but the CLI failed; that is surfaced in the warnings list elsewhere) —
    // it just labels no matrix cell.
    const coveredKeys = new Set();
    for (const fb of input.feedbackLog) {
        const key = `${fb.taskId}::${fb.seed}`;
        coveredKeys.add(key);
        if (!fb.ok) {
            continue;
        }
        const outcome = runOutcomeByKey.get(key);
        // Drop events whose run is unknown (defensive), and harness_error
        // runs (the bench skips dispatching feedback for them; if a fake
        // test injects one we refuse to mislabel it).
        if (outcome === undefined || outcome === "harness_error") {
            continue;
        }
        const cell = cellFor(fb.signal, outcome === "pass");
        if (cell === null) {
            continue;
        }
        let row = perRef.get(fb.goldRef);
        if (row === undefined) {
            row = { truePositive: 0, falsePositive: 0, trueNegative: 0, falseNegative: 0 };
            perRef.set(fb.goldRef, row);
        }
        row[cell] += 1;
        totals[cell] += 1;
    }
    const labelled = totals.truePositive + totals.falsePositive + totals.trueNegative + totals.falseNegative;
    const totalPhase1Runs = akmRuns.length;
    const aggregate = {
        ...totals,
        feedback_agreement: labelled === 0 ? 0 : (totals.truePositive + totals.trueNegative) / labelled,
        false_positive_rate: totals.falsePositive + totals.trueNegative === 0
            ? 0
            : totals.falsePositive / (totals.falsePositive + totals.trueNegative),
        false_negative_rate: totals.falseNegative + totals.truePositive === 0
            ? 0
            : totals.falseNegative / (totals.falseNegative + totals.truePositive),
        feedback_coverage: totalPhase1Runs === 0 ? 0 : coveredKeys.size / totalPhase1Runs,
    };
    // Per-asset rows, sorted by ref for byte-stable output.
    const perAsset = [...perRef.entries()]
        .sort(([refA], [refB]) => refA.localeCompare(refB))
        .map(([ref, row]) => {
            const total = row.truePositive + row.falsePositive + row.trueNegative + row.falseNegative;
            const fpDenom = row.falsePositive + row.trueNegative;
            const fnDenom = row.falseNegative + row.truePositive;
            return {
                ref,
                ...row,
                feedback_agreement: total === 0 ? null : (row.truePositive + row.trueNegative) / total,
                false_positive_rate: fpDenom === 0 ? null : row.falsePositive / fpDenom,
                false_negative_rate: fnDenom === 0 ? null : row.falseNegative / fnDenom,
            };
        });
    return { aggregate, perAsset };
}
1902
/**
 * Bucket `entries` under `pickKey(entry)` and produce one aggregate row per
 * bucket (pass rates, negative-transfer count, mean workflow compliance).
 * Entries yielding a falsy key are excluded entirely. Rows are sorted by
 * category for deterministic output.
 */
function aggregateByKey(entries, pickKey) {
    const buckets = new Map();
    for (const entry of entries) {
        const key = pickKey(entry);
        if (!key) {
            continue;
        }
        const existing = buckets.get(key);
        if (existing) {
            existing.push(entry);
        } else {
            buckets.set(key, [entry]);
        }
    }
    const rows = [...buckets.entries()].map(([category, group]) => {
        const taskCount = group.length;
        let noakmTotal = 0;
        let akmTotal = 0;
        let regressions = 0;
        let complianceTotal = 0;
        let complianceSamples = 0;
        for (const task of group) {
            noakmTotal += task.noakm.passRate;
            akmTotal += task.akm.passRate;
            if (task.akm.passRate < task.noakm.passRate) {
                regressions += 1;
            }
            // workflowCompliance is optional; only finite numbers contribute
            // to the mean.
            if (typeof task.workflowCompliance === "number" && Number.isFinite(task.workflowCompliance)) {
                complianceTotal += task.workflowCompliance;
                complianceSamples += 1;
            }
        }
        return {
            category,
            taskCount,
            passRateNoakm: noakmTotal / taskCount,
            passRateAkm: akmTotal / taskCount,
            passRateDelta: akmTotal / taskCount - noakmTotal / taskCount,
            negativeTransferCount: regressions,
            workflowCompliance: complianceSamples === 0 ? null : complianceTotal / complianceSamples,
        };
    });
    rows.sort((a, b) => a.category.localeCompare(b.category));
    return rows;
}
1946
/**
 * Aggregate per-task entries by `memoryAbility` (#262). Entries without a
 * `memoryAbility` tag are dropped so the report only surfaces categories
 * with explicit coverage; rows come back sorted by category for
 * byte-stable JSON.
 *
 * The closed set of memory-ability values is exported as
 * {@link MEMORY_ABILITY_VALUES} from `corpus.ts`.
 */
export function aggregateByMemoryAbility(entries) {
    const pickMemoryAbility = (entry) => entry.memoryAbility;
    return aggregateByKey(entries, pickMemoryAbility);
}
1957
/**
 * Aggregate per-task entries by `taskFamily` (#262). Entries without a
 * `taskFamily` tag are dropped. `taskFamily` follows the
 * `<domain>/<short-name>` grammar — tasks sharing a family are expected to
 * transfer knowledge between each other. Rows come back sorted by category
 * for byte-stable JSON.
 */
export function aggregateByTaskFamily(entries) {
    const pickTaskFamily = (entry) => entry.taskFamily;
    return aggregateByKey(entries, pickTaskFamily);
}
1966
/**
 * Count corpus coverage by memory ability and task family.
 *
 * Every value in `MEMORY_ABILITY_VALUES` is pre-seeded at 0 so the report
 * always lists the full closed set; tasks missing the tag fall into the
 * `untagged` bucket. Task families are open-ended, so an `untagged` key is
 * added there only when at least one task lacks a family.
 */
export function computeCorpusCoverage(tasks) {
    const memoryAbilityCounts = { untagged: 0 };
    for (const ability of MEMORY_ABILITY_VALUES) {
        memoryAbilityCounts[ability] = 0;
    }
    const taskFamilyCounts = {};
    let familiesWithoutTag = 0;
    for (const { memoryAbility, taskFamily } of tasks) {
        if (memoryAbility) {
            memoryAbilityCounts[memoryAbility] = (memoryAbilityCounts[memoryAbility] ?? 0) + 1;
        } else {
            memoryAbilityCounts.untagged += 1;
        }
        if (taskFamily) {
            taskFamilyCounts[taskFamily] = (taskFamilyCounts[taskFamily] ?? 0) + 1;
        } else {
            familiesWithoutTag += 1;
        }
    }
    if (familiesWithoutTag > 0) {
        taskFamilyCounts.untagged = familiesWithoutTag;
    }
    return {
        totalTasks: tasks.length,
        memoryAbilityCounts,
        taskFamilyCounts,
    };
}
1997
/**
 * Verb counts considered "AKM tool calls" for `totalToolCalls`. We
 * deliberately keep this list small — each verb folded in MUST be a
 * user-initiated CLI invocation, not a background bookkeeping event.
 * Adding new verbs here is additive and changes only `totalToolCalls`.
 *
 * NOTE(review): the per-run trace walk matches these verbs with direct
 * string comparisons (`ev.type === "akm_search"` etc.) rather than
 * consulting this set — keep the two in sync when adding a verb.
 */
export const AKM_TOOL_CALL_TYPES = new Set([
    "akm_search",
    "akm_show",
    "akm_feedback",
]);
2008
/**
 * Compute per-run AKM overhead records by replaying #254's normalised trace.
 *
 * Pure function: never mutates `runs` and never reads disk. The optional
 * `options.taskMetadata` lookup is used only to label loads as relevant /
 * irrelevant and to compute `timeToFirstCorrectAssetMs`.
 *
 * The result has exactly one element per input run, in input order. Runs
 * whose trace contains no AKM events still produce a record with all
 * counts at zero and timings at `null`.
 */
export function computeAkmOverhead(runs, options = {}) {
    return runs.map((run) => perRun(run, options.taskMetadata));
}
2026
/**
 * Build one AKM overhead record for a single run by walking its normalised
 * trace (#254). Counts search/show/feedback events, derives engagement
 * timings relative to the earliest parseable trace timestamp, and — when
 * task metadata is available — labels loaded assets as relevant or not.
 */
function perRun(run, taskMetadata) {
    const { events } = normalizeRunToTrace(run);
    const counters = {
        search: 0,
        show: 0,
        feedback: 0,
        positive: 0,
        negative: 0,
    };
    const shownRefs = new Set();
    for (const event of events) {
        switch (event.type) {
            case "akm_search":
                counters.search += 1;
                break;
            case "akm_show":
                counters.show += 1;
                if (typeof event.assetRef === "string" && event.assetRef.length > 0) {
                    shownRefs.add(event.assetRef);
                }
                break;
            case "akm_feedback":
                counters.feedback += 1;
                // Polarity is carried in args as "--positive" or "--negative"
                // (normalizeRunToTrace populates args for events.jsonl-sourced
                // events too). Absence of both flags is treated as unknown:
                // it contributes to the feedback count but to neither polarity.
                if (event.args?.includes("--positive")) {
                    counters.positive += 1;
                } else if (event.args?.includes("--negative")) {
                    counters.negative += 1;
                }
                break;
            default:
                break;
        }
    }
    // Run-start anchor: earliest parseable ts in the trace. Using the trace
    // (not RunResult.events directly) lets harness lifecycle markers anchor
    // stdout-derived events that lack a native ts.
    const runStartMs = earliestEventMs(events);
    const timeToFirstSearchMs = computeFirstEventOffsetMs(events, runStartMs, (event) => event.type === "akm_search");
    // Resolve task metadata once. Without it we cannot judge relevance, so
    // the irrelevant-loads count is surfaced as null rather than zero.
    const meta = taskMetadata?.get(run.taskId);
    const goldRef = meta?.goldRef;
    const hasGoldRef = typeof goldRef === "string" && goldRef.length > 0;
    const knownRelevant = new Set((meta?.expectedTransferFrom ?? []).filter((ref) => typeof ref === "string" && ref.length > 0));
    if (hasGoldRef) {
        knownRelevant.add(goldRef);
    }
    let irrelevantAssetsLoadedCount = null;
    if (meta) {
        irrelevantAssetsLoadedCount = [...shownRefs].filter((ref) => !knownRelevant.has(ref)).length;
    }
    const timeToFirstCorrectAssetMs = hasGoldRef
        ? computeFirstEventOffsetMs(events, runStartMs, (event) => event.type === "akm_show" && event.assetRef === goldRef)
        : null;
    return {
        taskId: run.taskId,
        arm: run.arm,
        seed: run.seed,
        outcome: run.outcome,
        searchCount: counters.search,
        showCount: counters.show,
        feedbackCount: counters.feedback,
        positiveFeedbackCount: counters.positive,
        negativeFeedbackCount: counters.negative,
        totalToolCalls: counters.search + counters.show + counters.feedback,
        assetsLoadedCount: shownRefs.size,
        irrelevantAssetsLoadedCount,
        timeToFirstSearchMs,
        timeToFirstCorrectAssetMs,
        // Byte sizes are not yet wired through the trace (#254 does not
        // capture payload sizes). Callers MUST treat null as "unavailable",
        // not zero.
        contextBytesLoaded: null,
        assetBytesLoaded: null,
    };
}
2113
/**
 * Aggregate per-run AKM overhead records into the corpus-wide block (#263).
 *
 * Pure: never mutates `perRun`. When `perRun` is empty, returns a zero/null
 * envelope so callers can render a "no AKM activity" section without
 * branching. `passingRuns === 0` always implies `toolCallsPerSuccess === null`
 * and `costPerSuccess === null`.
 *
 * @param {Array} perRun - per-run overhead records (see computeAkmOverhead).
 * @param {Array} rawRuns - raw run results matched to `perRun` rows by
 *   (taskId, arm, seed); used only for the token-based cost-per-success.
 * @returns {object} the corpus-wide overhead aggregate.
 */
export function aggregateAkmOverhead(perRun, rawRuns = []) {
    const n = perRun.length;
    if (n === 0) {
        return {
            totalRuns: 0,
            passingRuns: 0,
            meanSearchCount: 0,
            meanShowCount: 0,
            meanFeedbackCount: 0,
            meanToolCalls: 0,
            meanAssetsLoaded: 0,
            meanIrrelevantAssetsLoaded: null,
            meanTimeToFirstSearchMs: null,
            meanTimeToFirstCorrectAssetMs: null,
            meanContextBytesLoaded: null,
            meanAssetBytesLoaded: null,
            totalToolCalls: 0,
            toolCallsPerSuccess: null,
            costPerSuccess: null,
            searchEngagementRate: 0,
            showEngagementRate: 0,
            feedbackEngagementRate: 0,
            searchToShowRatio: null,
            meanPositiveFeedbackCount: 0,
            meanNegativeFeedbackCount: 0,
        };
    }
    let searchSum = 0;
    let showSum = 0;
    let feedbackSum = 0;
    let toolCallsSum = 0;
    let assetsSum = 0;
    let irrelevantSum = 0;
    let irrelevantCount = 0;
    let firstSearchSum = 0;
    let firstSearchCount = 0;
    let firstCorrectSum = 0;
    let firstCorrectCount = 0;
    let contextBytesSum = 0;
    let contextBytesCount = 0;
    let assetBytesSum = 0;
    let assetBytesCount = 0;
    // Join key for the rawRuns lookup. Fields are separated with "::"
    // (mirroring computeFeedbackIntegrity's keying) so distinct tuples such
    // as ("a", "b1", 2) and ("ab", "1", 2) cannot collide the way plain
    // string concatenation would.
    const runKey = (r) => `${r.taskId}::${r.arm}::${r.seed}`;
    // Build a quick lookup for token measurement off `rawRuns` so the cost-
    // per-success calc can honour the parsed/missing/unsupported distinction
    // without forcing the caller to project tokens onto AkmOverheadPerRun.
    const rawByKey = new Map();
    for (const r of rawRuns) {
        rawByKey.set(runKey(r), r);
    }
    let passingRuns = 0;
    let parsedPassTokenSum = 0;
    let parsedPassCount = 0;
    let anyPassMissingMeasurement = false;
    let searchEngagedRuns = 0;
    let showEngagedRuns = 0;
    let feedbackEngagedRuns = 0;
    let positiveFeedbackSum = 0;
    let negativeFeedbackSum = 0;
    for (const row of perRun) {
        searchSum += row.searchCount;
        showSum += row.showCount;
        feedbackSum += row.feedbackCount;
        toolCallsSum += row.totalToolCalls;
        assetsSum += row.assetsLoadedCount;
        if (row.searchCount > 0)
            searchEngagedRuns += 1;
        if (row.showCount > 0)
            showEngagedRuns += 1;
        if (row.feedbackCount > 0)
            feedbackEngagedRuns += 1;
        positiveFeedbackSum += row.positiveFeedbackCount;
        negativeFeedbackSum += row.negativeFeedbackCount;
        // Nullable fields contribute to their mean only when present.
        if (row.irrelevantAssetsLoadedCount !== null) {
            irrelevantSum += row.irrelevantAssetsLoadedCount;
            irrelevantCount += 1;
        }
        if (row.timeToFirstSearchMs !== null) {
            firstSearchSum += row.timeToFirstSearchMs;
            firstSearchCount += 1;
        }
        if (row.timeToFirstCorrectAssetMs !== null) {
            firstCorrectSum += row.timeToFirstCorrectAssetMs;
            firstCorrectCount += 1;
        }
        if (row.contextBytesLoaded !== null) {
            contextBytesSum += row.contextBytesLoaded;
            contextBytesCount += 1;
        }
        if (row.assetBytesLoaded !== null) {
            assetBytesSum += row.assetBytesLoaded;
            assetBytesCount += 1;
        }
        if (row.outcome === "pass") {
            passingRuns += 1;
            const raw = rawByKey.get(runKey(row));
            // Treat absent tokenMeasurement as `parsed` for backward compat
            // with older artefacts (mirrors `isMeasured` behaviour above).
            const measurement = raw?.tokenMeasurement ?? "parsed";
            if (raw && measurement === "parsed") {
                parsedPassTokenSum += raw.tokens.input + raw.tokens.output;
                parsedPassCount += 1;
            }
            else if (raw) {
                anyPassMissingMeasurement = true;
            }
            else {
                // No matching raw run supplied — cannot honour cost-per-success.
                anyPassMissingMeasurement = true;
            }
        }
    }
    const toolCallsPerSuccess = passingRuns === 0 ? null : toolCallsSum / passingRuns;
    // Cost-per-success: null unless EVERY passing run has parsed measurement.
    // Mixed measurement statuses cannot be averaged honestly (issue #252).
    const costPerSuccess = passingRuns === 0 || anyPassMissingMeasurement || parsedPassCount === 0
        ? null
        : parsedPassTokenSum / parsedPassCount;
    const searchToShowRatio = searchSum === 0 ? null : showSum / searchSum;
    return {
        totalRuns: n,
        passingRuns,
        meanSearchCount: searchSum / n,
        meanShowCount: showSum / n,
        meanFeedbackCount: feedbackSum / n,
        meanToolCalls: toolCallsSum / n,
        meanAssetsLoaded: assetsSum / n,
        meanIrrelevantAssetsLoaded: irrelevantCount === 0 ? null : irrelevantSum / irrelevantCount,
        meanTimeToFirstSearchMs: firstSearchCount === 0 ? null : firstSearchSum / firstSearchCount,
        meanTimeToFirstCorrectAssetMs: firstCorrectCount === 0 ? null : firstCorrectSum / firstCorrectCount,
        meanContextBytesLoaded: contextBytesCount === 0 ? null : contextBytesSum / contextBytesCount,
        meanAssetBytesLoaded: assetBytesCount === 0 ? null : assetBytesSum / assetBytesCount,
        totalToolCalls: toolCallsSum,
        toolCallsPerSuccess,
        costPerSuccess,
        searchEngagementRate: searchEngagedRuns / n,
        showEngagementRate: showEngagedRuns / n,
        feedbackEngagementRate: feedbackEngagedRuns / n,
        searchToShowRatio,
        meanPositiveFeedbackCount: positiveFeedbackSum / n,
        meanNegativeFeedbackCount: negativeFeedbackSum / n,
    };
}
2263
/**
 * Bucket a workflow check status onto pass / non-pass for reliability.
 *
 * Reliability is a strict pass-or-not metric (issue #258): every status
 * other than `pass` (including `partial`, `fail`, `harness_error`) buckets
 * to `non_pass`. `not_applicable` returns `null` so the caller can drop
 * the whole (task, seed) pair from both numerator and denominator.
 */
function bucketReliabilityStatus(status) {
    switch (status) {
        case "not_applicable":
            return null;
        case "pass":
            return "pass";
        default:
            return "non_pass";
    }
}
2279
/**
 * Compute workflow reliability metrics (`pass@k` and `pass^k`) per workflow
 * and corpus-wide from a flat list of `WorkflowCheckResult`.
 *
 * Methodology (per #258 review addendum):
 * 1. Filter out `not_applicable` checks entirely.
 * 2. For each `(workflow_id, task_id)` group, collapse seeds to the set
 *    of statuses observed.
 * 3. `pass_at_k` per task = 1 if at least one seed is `pass`, else 0.
 * 4. `pass_all_k` per task = 1 if every seed is `pass`, else 0.
 * 5. Per-workflow row averages over its task set.
 * 6. Corpus rollup averages over every (workflow, task) group equally.
 *
 * Pure: never mutates `checks`. Returns a stable shape for empty input.
 */
export function computeWorkflowReliability(checks) {
    // workflow_id → (task_id → statuses across seeds). Maps iterate in
    // insertion order, so the output is deterministic for deterministic
    // input.
    const grouped = new Map();
    for (const check of checks) {
        if (bucketReliabilityStatus(check.status) === null) {
            continue;
        }
        let perWorkflow = grouped.get(check.workflowId);
        if (perWorkflow === undefined) {
            perWorkflow = new Map();
            grouped.set(check.workflowId, perWorkflow);
        }
        const statuses = perWorkflow.get(check.taskId) ?? [];
        statuses.push(check.status);
        perWorkflow.set(check.taskId, statuses);
    }
    const byWorkflow = {};
    let corpusPassAtK = 0;
    let corpusPassAllK = 0;
    let corpusGroups = 0;
    const corpusTasks = new Set();
    for (const [workflowId, perTask] of grouped) {
        let anyPassCount = 0;
        let allPassCount = 0;
        let maxSeeds = 0;
        for (const [taskId, statuses] of perTask) {
            maxSeeds = Math.max(maxSeeds, statuses.length);
            const anyPass = statuses.some((s) => s === "pass");
            const allPass = statuses.every((s) => s === "pass");
            anyPassCount += anyPass ? 1 : 0;
            allPassCount += allPass ? 1 : 0;
            corpusPassAtK += anyPass ? 1 : 0;
            corpusPassAllK += allPass ? 1 : 0;
            corpusGroups += 1;
            corpusTasks.add(taskId);
        }
        const taskCount = perTask.size;
        byWorkflow[workflowId] = {
            workflow_id: workflowId,
            pass_at_k: taskCount === 0 ? 0 : anyPassCount / taskCount,
            pass_all_k: taskCount === 0 ? 0 : allPassCount / taskCount,
            tasks: taskCount,
            k: maxSeeds,
        };
    }
    const corpus = {
        pass_at_k: corpusGroups === 0 ? 0 : corpusPassAtK / corpusGroups,
        pass_all_k: corpusGroups === 0 ? 0 : corpusPassAllK / corpusGroups,
        groups: corpusGroups,
        tasks: corpusTasks.size,
    };
    return { byWorkflow, corpus };
}
2353
/** Earliest parseable ts (ms epoch) among events; null when none. */
function earliestEventMs(events) {
    let minMs = null;
    for (const { ts } of events) {
        const parsed = parseTsToMs(ts);
        // Unparseable timestamps are simply skipped.
        if (parsed !== null && (minMs === null || parsed < minMs)) {
            minMs = parsed;
        }
    }
    return minMs;
}
2365
/**
 * Offset (ms) of the first event matching `predicate` relative to
 * `runStartMs`. Matching events with an unparseable ts are skipped in the
 * search. Returns `null` when `runStartMs` is null, when no matching event
 * has a parseable ts, or when the offset would be negative — a clock
 * inversion we refuse to silently coerce to zero.
 */
function computeFirstEventOffsetMs(events, runStartMs, predicate) {
    if (runStartMs === null) {
        return null;
    }
    const match = events.find((event) => predicate(event) && parseTsToMs(event.ts) !== null);
    if (match === undefined) {
        return null;
    }
    const offset = parseTsToMs(match.ts) - runStartMs;
    return offset < 0 ? null : offset;
}
2387
/** Parse an ISO ts to ms-epoch; null when missing or unparseable. */
function parseTsToMs(ts) {
    if (typeof ts !== "string" || ts === "") {
        return null;
    }
    const parsed = Date.parse(ts);
    return Number.isNaN(parsed) ? null : parsed;
}