akm-cli 0.6.1 → 0.7.0-rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. package/CHANGELOG.md +66 -0
  2. package/dist/{cli.js → src/cli.js} +620 -26
  3. package/dist/{commands → src/commands}/config-cli.js +5 -4
  4. package/dist/src/commands/distill.js +283 -0
  5. package/dist/src/commands/events.js +108 -0
  6. package/dist/src/commands/history.js +120 -0
  7. package/dist/{commands → src/commands}/installed-stashes.js +1 -1
  8. package/dist/src/commands/proposal.js +119 -0
  9. package/dist/src/commands/propose.js +171 -0
  10. package/dist/src/commands/reflect.js +193 -0
  11. package/dist/{commands → src/commands}/registry-search.js +2 -1
  12. package/dist/{commands → src/commands}/remember.js +12 -0
  13. package/dist/{commands → src/commands}/search.js +74 -1
  14. package/dist/{commands → src/commands}/self-update.js +4 -3
  15. package/dist/{commands → src/commands}/show.js +44 -0
  16. package/dist/{core → src/core}/asset-ref.js +5 -5
  17. package/dist/{core → src/core}/asset-spec.js +12 -0
  18. package/dist/{core → src/core}/common.js +1 -1
  19. package/dist/{core → src/core}/config.js +175 -121
  20. package/dist/{core → src/core}/errors.js +4 -0
  21. package/dist/src/core/events.js +239 -0
  22. package/dist/src/core/lesson-lint.js +86 -0
  23. package/dist/src/core/proposals.js +406 -0
  24. package/dist/src/core/warn.js +72 -0
  25. package/dist/{core → src/core}/write-source.js +80 -5
  26. package/dist/{indexer → src/indexer}/db-search.js +113 -24
  27. package/dist/{indexer → src/indexer}/db.js +76 -23
  28. package/dist/{indexer → src/indexer}/file-context.js +0 -3
  29. package/dist/src/indexer/graph-boost.js +179 -0
  30. package/dist/src/indexer/graph-extraction.js +212 -0
  31. package/dist/{indexer → src/indexer}/indexer.js +73 -6
  32. package/dist/src/indexer/memory-inference.js +263 -0
  33. package/dist/{indexer → src/indexer}/metadata.js +111 -3
  34. package/dist/src/integrations/agent/config.js +292 -0
  35. package/dist/src/integrations/agent/detect.js +94 -0
  36. package/dist/src/integrations/agent/index.js +17 -0
  37. package/dist/src/integrations/agent/profiles.js +65 -0
  38. package/dist/src/integrations/agent/prompts.js +167 -0
  39. package/dist/src/integrations/agent/spawn.js +221 -0
  40. package/dist/{integrations → src/integrations}/lockfile.js +0 -26
  41. package/dist/{llm → src/llm}/client.js +33 -2
  42. package/dist/src/llm/feature-gate.js +108 -0
  43. package/dist/src/llm/graph-extract.js +107 -0
  44. package/dist/src/llm/index-passes.js +35 -0
  45. package/dist/src/llm/memory-infer.js +86 -0
  46. package/dist/{output → src/output}/renderers.js +60 -1
  47. package/dist/src/output/shapes.js +516 -0
  48. package/dist/{output → src/output}/text.js +447 -4
  49. package/dist/{registry → src/registry}/build-index.js +14 -4
  50. package/dist/{registry → src/registry}/factory.js +0 -8
  51. package/dist/{registry → src/registry}/providers/static-index.js +3 -2
  52. package/dist/{registry → src/registry}/resolve.js +68 -2
  53. package/dist/{setup → src/setup}/setup.js +43 -5
  54. package/dist/{sources → src/sources}/providers/git.js +7 -15
  55. package/dist/tests/add-website-source.test.js +119 -0
  56. package/dist/tests/agent/agent-config-loader.test.js +70 -0
  57. package/dist/tests/agent/agent-config.test.js +221 -0
  58. package/dist/tests/agent/agent-detect.test.js +100 -0
  59. package/dist/tests/agent/agent-spawn.test.js +234 -0
  60. package/dist/tests/agent-output.test.js +186 -0
  61. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +103 -0
  62. package/dist/tests/architecture/agent-spawn-seam.test.js +193 -0
  63. package/dist/tests/architecture/llm-stateless-seam.test.js +112 -0
  64. package/dist/tests/asset-ref.test.js +192 -0
  65. package/dist/tests/asset-registry.test.js +103 -0
  66. package/dist/tests/asset-spec.test.js +241 -0
  67. package/dist/tests/bench/attribution.test.js +995 -0
  68. package/dist/tests/bench/cleanup-sigint.test.js +83 -0
  69. package/dist/tests/bench/cleanup.js +203 -0
  70. package/dist/tests/bench/cleanup.test.js +166 -0
  71. package/dist/tests/bench/cli.js +683 -0
  72. package/dist/tests/bench/cli.test.js +177 -0
  73. package/dist/tests/bench/compare.test.js +556 -0
  74. package/dist/tests/bench/corpus.js +314 -0
  75. package/dist/tests/bench/corpus.test.js +258 -0
  76. package/dist/tests/bench/driver.js +346 -0
  77. package/dist/tests/bench/driver.test.js +443 -0
  78. package/dist/tests/bench/evolve-metrics.js +179 -0
  79. package/dist/tests/bench/evolve-metrics.test.js +187 -0
  80. package/dist/tests/bench/evolve.js +580 -0
  81. package/dist/tests/bench/evolve.test.js +616 -0
  82. package/dist/tests/bench/failure-modes.test.js +300 -0
  83. package/dist/tests/bench/feedback-integrity.test.js +456 -0
  84. package/dist/tests/bench/leakage.test.js +125 -0
  85. package/dist/tests/bench/learning-curve.test.js +133 -0
  86. package/dist/tests/bench/metrics.js +2319 -0
  87. package/dist/tests/bench/metrics.test.js +1144 -0
  88. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +43 -0
  89. package/dist/tests/bench/report.js +1821 -0
  90. package/dist/tests/bench/report.test.js +989 -0
  91. package/dist/tests/bench/runner.js +536 -0
  92. package/dist/tests/bench/runner.test.js +958 -0
  93. package/dist/tests/bench/search-bridge.test.js +331 -0
  94. package/dist/tests/bench/tmp.js +41 -0
  95. package/dist/tests/bench/trajectory.js +116 -0
  96. package/dist/tests/bench/trajectory.test.js +127 -0
  97. package/dist/tests/bench/verifier.js +109 -0
  98. package/dist/tests/bench/verifier.test.js +118 -0
  99. package/dist/tests/bench/workflow-evaluator.js +557 -0
  100. package/dist/tests/bench/workflow-evaluator.test.js +421 -0
  101. package/dist/tests/bench/workflow-spec.js +358 -0
  102. package/dist/tests/bench/workflow-spec.test.js +363 -0
  103. package/dist/tests/bench/workflow-trace.js +438 -0
  104. package/dist/tests/bench/workflow-trace.test.js +254 -0
  105. package/dist/tests/benchmark-search-quality.js +536 -0
  106. package/dist/tests/benchmark-suite.js +1441 -0
  107. package/dist/tests/capture-cli.test.js +112 -0
  108. package/dist/tests/cli-errors.test.js +203 -0
  109. package/dist/tests/commands/events.test.js +370 -0
  110. package/dist/tests/commands/history.test.js +223 -0
  111. package/dist/tests/commands/import.test.js +103 -0
  112. package/dist/tests/commands/proposal-cli.test.js +209 -0
  113. package/dist/tests/commands/reflect-propose-cli.test.js +333 -0
  114. package/dist/tests/commands/remember.test.js +97 -0
  115. package/dist/tests/commands/scope-flags.test.js +300 -0
  116. package/dist/tests/commands/search.test.js +537 -0
  117. package/dist/tests/commands/show-indexer-parity.test.js +117 -0
  118. package/dist/tests/commands/show.test.js +294 -0
  119. package/dist/tests/common.test.js +266 -0
  120. package/dist/tests/completions.test.js +142 -0
  121. package/dist/tests/config-cli.test.js +193 -0
  122. package/dist/tests/config-llm-features.test.js +139 -0
  123. package/dist/tests/config.test.js +544 -0
  124. package/dist/tests/contracts/migration-baseline.test.js +43 -0
  125. package/dist/tests/contracts/reflect-propose-envelope.test.js +139 -0
  126. package/dist/tests/contracts/spec-helpers.js +46 -0
  127. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +228 -0
  128. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +56 -0
  129. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +34 -0
  130. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +94 -0
  131. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +39 -0
  132. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +44 -0
  133. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +47 -0
  134. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +40 -0
  135. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +58 -0
  136. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +34 -0
  137. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +75 -0
  138. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +36 -0
  139. package/dist/tests/core/write-source.test.js +366 -0
  140. package/dist/tests/curate-command.test.js +87 -0
  141. package/dist/tests/db-scoring.test.js +201 -0
  142. package/dist/tests/db.test.js +654 -0
  143. package/dist/tests/distill-cli-flag.test.js +208 -0
  144. package/dist/tests/distill.test.js +515 -0
  145. package/dist/tests/docker-install.test.js +120 -0
  146. package/dist/tests/e2e.test.js +1398 -0
  147. package/dist/tests/embedder.test.js +340 -0
  148. package/dist/tests/embedding-model-config.test.js +379 -0
  149. package/dist/tests/feedback-command.test.js +172 -0
  150. package/dist/tests/file-context.test.js +552 -0
  151. package/dist/tests/fixtures/scripts/git/summarize-diff.js +9 -0
  152. package/dist/tests/fixtures/scripts/lint/eslint-check.js +7 -0
  153. package/dist/tests/fixtures/stashes/load.js +166 -0
  154. package/dist/tests/fixtures/stashes/load.test.js +88 -0
  155. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +12 -0
  156. package/dist/tests/frontmatter.test.js +190 -0
  157. package/dist/tests/fts-field-weighting.test.js +254 -0
  158. package/dist/tests/fuzzy-search.test.js +230 -0
  159. package/dist/tests/git-provider-clone.test.js +45 -0
  160. package/dist/tests/github.test.js +161 -0
  161. package/dist/tests/graph-boost-ranking.test.js +305 -0
  162. package/dist/tests/graph-extraction.test.js +282 -0
  163. package/dist/tests/helpers/usage-events.js +8 -0
  164. package/dist/tests/index-pass-llm.test.js +161 -0
  165. package/dist/tests/indexer.test.js +559 -0
  166. package/dist/tests/info-command.test.js +166 -0
  167. package/dist/tests/init.test.js +69 -0
  168. package/dist/tests/install-script.test.js +246 -0
  169. package/dist/tests/integration/agent-real-profile.test.js +94 -0
  170. package/dist/tests/issue-36-repro.test.js +304 -0
  171. package/dist/tests/issues-191-194.test.js +160 -0
  172. package/dist/tests/lesson-lint.test.js +111 -0
  173. package/dist/tests/llm-client.test.js +115 -0
  174. package/dist/tests/llm-feature-gate.test.js +151 -0
  175. package/dist/tests/llm.test.js +139 -0
  176. package/dist/tests/lockfile.test.js +216 -0
  177. package/dist/tests/manifest.test.js +205 -0
  178. package/dist/tests/markdown.test.js +126 -0
  179. package/dist/tests/matchers-unit.test.js +189 -0
  180. package/dist/tests/memory-inference.test.js +299 -0
  181. package/dist/tests/merge-scoring.test.js +136 -0
  182. package/dist/tests/metadata.test.js +313 -0
  183. package/dist/tests/migration-help.test.js +89 -0
  184. package/dist/tests/origin-resolve.test.js +124 -0
  185. package/dist/tests/output-baseline.test.js +217 -0
  186. package/dist/tests/output-shapes-unit.test.js +476 -0
  187. package/dist/tests/parallel-search.test.js +272 -0
  188. package/dist/tests/parameter-metadata.test.js +365 -0
  189. package/dist/tests/paths.test.js +177 -0
  190. package/dist/tests/progressive-disclosure.test.js +280 -0
  191. package/dist/tests/proposals.test.js +279 -0
  192. package/dist/tests/proposed-quality.test.js +271 -0
  193. package/dist/tests/provider-registry.test.js +32 -0
  194. package/dist/tests/ranking-regression.test.js +548 -0
  195. package/dist/tests/reflect-propose.test.js +455 -0
  196. package/dist/tests/registry-build-index.test.js +378 -0
  197. package/dist/tests/registry-cli.test.js +290 -0
  198. package/dist/tests/registry-index-v2.test.js +430 -0
  199. package/dist/tests/registry-install.test.js +728 -0
  200. package/dist/tests/registry-providers/parity.test.js +189 -0
  201. package/dist/tests/registry-providers/skills-sh.test.js +309 -0
  202. package/dist/tests/registry-providers/static-index.test.js +204 -0
  203. package/dist/tests/registry-resolve.test.js +126 -0
  204. package/dist/tests/registry-search.test.js +723 -0
  205. package/dist/tests/remember-frontmatter.test.js +380 -0
  206. package/dist/tests/remember-unit.test.js +123 -0
  207. package/dist/tests/ripgrep-install.test.js +251 -0
  208. package/dist/tests/ripgrep-resolve.test.js +108 -0
  209. package/dist/tests/ripgrep.test.js +163 -0
  210. package/dist/tests/save-command.test.js +94 -0
  211. package/dist/tests/save-trust-qa-fixes.test.js +270 -0
  212. package/dist/tests/scoring-pipeline.test.js +648 -0
  213. package/dist/tests/search-include-proposed-cli.test.js +118 -0
  214. package/dist/tests/self-update.test.js +442 -0
  215. package/dist/tests/semantic-search-e2e.test.js +512 -0
  216. package/dist/tests/semantic-status.test.js +471 -0
  217. package/dist/tests/setup-run.integration.js +877 -0
  218. package/dist/tests/setup-wizard.test.js +198 -0
  219. package/dist/tests/setup.test.js +131 -0
  220. package/dist/tests/source-add.test.js +11 -0
  221. package/dist/tests/source-clone.test.js +254 -0
  222. package/dist/tests/source-manage.test.js +366 -0
  223. package/dist/tests/source-providers/filesystem.test.js +82 -0
  224. package/dist/tests/source-providers/git.test.js +252 -0
  225. package/dist/tests/source-providers/website.test.js +128 -0
  226. package/dist/tests/source-qa-fixes.test.js +268 -0
  227. package/dist/tests/source-registry.test.js +350 -0
  228. package/dist/tests/source-resolve.test.js +100 -0
  229. package/dist/tests/source-source.test.js +221 -0
  230. package/dist/tests/source.test.js +533 -0
  231. package/dist/tests/tar-utils-scan.test.js +73 -0
  232. package/dist/tests/toggle-components.test.js +73 -0
  233. package/dist/tests/usage-telemetry.test.js +265 -0
  234. package/dist/tests/utility-scoring.test.js +558 -0
  235. package/dist/tests/vault-load-error.test.js +78 -0
  236. package/dist/tests/vault-qa-fixes.test.js +194 -0
  237. package/dist/tests/vault.test.js +429 -0
  238. package/dist/tests/vector-search.test.js +608 -0
  239. package/dist/tests/walker.test.js +252 -0
  240. package/dist/tests/wave2-cluster-bc.test.js +228 -0
  241. package/dist/tests/wave2-cluster-d.test.js +180 -0
  242. package/dist/tests/wave2-cluster-e.test.js +179 -0
  243. package/dist/tests/wiki-qa-fixes.test.js +270 -0
  244. package/dist/tests/wiki.test.js +529 -0
  245. package/dist/tests/workflow-cli.test.js +271 -0
  246. package/dist/tests/workflow-markdown.test.js +171 -0
  247. package/dist/tests/workflow-path-escape.test.js +132 -0
  248. package/dist/tests/workflow-qa-fixes.test.js +377 -0
  249. package/dist/tests/workflows/indexer-rejection.test.js +213 -0
  250. package/docs/README.md +8 -0
  251. package/docs/migration/release-notes/0.7.0.md +244 -0
  252. package/package.json +2 -2
  253. package/dist/core/warn.js +0 -27
  254. package/dist/output/shapes.js +0 -212
  255. /package/dist/{commands → src/commands}/completions.js +0 -0
  256. /package/dist/{commands → src/commands}/curate.js +0 -0
  257. /package/dist/{commands → src/commands}/info.js +0 -0
  258. /package/dist/{commands → src/commands}/init.js +0 -0
  259. /package/dist/{commands → src/commands}/install-audit.js +0 -0
  260. /package/dist/{commands → src/commands}/migration-help.js +0 -0
  261. /package/dist/{commands → src/commands}/source-add.js +0 -0
  262. /package/dist/{commands → src/commands}/source-clone.js +0 -0
  263. /package/dist/{commands → src/commands}/source-manage.js +0 -0
  264. /package/dist/{commands → src/commands}/vault.js +0 -0
  265. /package/dist/{core → src/core}/asset-registry.js +0 -0
  266. /package/dist/{core → src/core}/frontmatter.js +0 -0
  267. /package/dist/{core → src/core}/markdown.js +0 -0
  268. /package/dist/{core → src/core}/paths.js +0 -0
  269. /package/dist/{indexer → src/indexer}/manifest.js +0 -0
  270. /package/dist/{indexer → src/indexer}/matchers.js +0 -0
  271. /package/dist/{indexer → src/indexer}/search-fields.js +0 -0
  272. /package/dist/{indexer → src/indexer}/search-source.js +0 -0
  273. /package/dist/{indexer → src/indexer}/semantic-status.js +0 -0
  274. /package/dist/{indexer → src/indexer}/usage-events.js +0 -0
  275. /package/dist/{indexer → src/indexer}/walker.js +0 -0
  276. /package/dist/{integrations → src/integrations}/github.js +0 -0
  277. /package/dist/{llm → src/llm}/embedder.js +0 -0
  278. /package/dist/{llm → src/llm}/embedders/cache.js +0 -0
  279. /package/dist/{llm → src/llm}/embedders/local.js +0 -0
  280. /package/dist/{llm → src/llm}/embedders/remote.js +0 -0
  281. /package/dist/{llm → src/llm}/embedders/types.js +0 -0
  282. /package/dist/{llm → src/llm}/metadata-enhance.js +0 -0
  283. /package/dist/{output → src/output}/cli-hints.js +0 -0
  284. /package/dist/{output → src/output}/context.js +0 -0
  285. /package/dist/{registry → src/registry}/create-provider-registry.js +0 -0
  286. /package/dist/{registry → src/registry}/origin-resolve.js +0 -0
  287. /package/dist/{registry → src/registry}/providers/index.js +0 -0
  288. /package/dist/{registry → src/registry}/providers/skills-sh.js +0 -0
  289. /package/dist/{registry → src/registry}/providers/types.js +0 -0
  290. /package/dist/{registry → src/registry}/types.js +0 -0
  291. /package/dist/{setup → src/setup}/detect.js +0 -0
  292. /package/dist/{setup → src/setup}/ripgrep-install.js +0 -0
  293. /package/dist/{setup → src/setup}/ripgrep-resolve.js +0 -0
  294. /package/dist/{setup → src/setup}/steps.js +0 -0
  295. /package/dist/{sources → src/sources}/include.js +0 -0
  296. /package/dist/{sources → src/sources}/provider-factory.js +0 -0
  297. /package/dist/{sources → src/sources}/provider.js +0 -0
  298. /package/dist/{sources → src/sources}/providers/filesystem.js +0 -0
  299. /package/dist/{sources → src/sources}/providers/index.js +0 -0
  300. /package/dist/{sources → src/sources}/providers/install-types.js +0 -0
  301. /package/dist/{sources → src/sources}/providers/npm.js +0 -0
  302. /package/dist/{sources → src/sources}/providers/provider-utils.js +0 -0
  303. /package/dist/{sources → src/sources}/providers/sync-from-ref.js +0 -0
  304. /package/dist/{sources → src/sources}/providers/tar-utils.js +0 -0
  305. /package/dist/{sources → src/sources}/providers/website.js +0 -0
  306. /package/dist/{sources → src/sources}/resolve.js +0 -0
  307. /package/dist/{sources → src/sources}/types.js +0 -0
  308. /package/dist/{templates → src/templates}/wiki-templates.js +0 -0
  309. /package/dist/{version.js → src/version.js} +0 -0
  310. /package/dist/{wiki → src/wiki}/wiki.js +0 -0
  311. /package/dist/{workflows → src/workflows}/authoring.js +0 -0
  312. /package/dist/{workflows → src/workflows}/cli.js +0 -0
  313. /package/dist/{workflows → src/workflows}/db.js +0 -0
  314. /package/dist/{workflows → src/workflows}/document-cache.js +0 -0
  315. /package/dist/{workflows → src/workflows}/parser.js +0 -0
  316. /package/dist/{workflows → src/workflows}/renderer.js +0 -0
  317. /package/dist/{workflows → src/workflows}/runs.js +0 -0
  318. /package/dist/{workflows → src/workflows}/schema.js +0 -0
  319. /package/dist/{workflows → src/workflows}/validator.js +0 -0
@@ -0,0 +1,2319 @@
1
+ /**
2
+ * akm-bench metrics (spec §6).
3
+ *
4
+ * Outcome metrics (§6.1) and trajectory metrics (§6.2). Both are pure
5
+ * functions over `RunResult[]` slices so the runner can compose them
6
+ * however it likes. The §6.3+ catalog (proposal-quality, longitudinal,
7
+ * attribution, failure-mode taxonomy) lands in #239/#240/#243.
8
+ *
9
+ * The failure-mode taxonomy classifier (§6.6) lives in this file
10
+ * (`classifyFailureMode`).
11
+ *
12
+ * Search-pipeline bridge metrics (§6.7) are below: they tie the synthetic
13
+ * MRR/Recall@K view in `tests/benchmark-suite.ts` to real-task pass rate
14
+ * by logging gold-rank-of-search per `akm search` invocation and slicing
15
+ * pass-rate by the rank of the agent's *chosen* search.
16
+ */
17
+ import fs from "node:fs";
18
+ import path from "node:path";
19
+ import { safeRealpath } from "../../src/core/common";
20
+ import { MEMORY_ABILITY_VALUES } from "./corpus";
21
+ import { serializeRunForReport } from "./report";
22
+ import { benchMkdtemp } from "./tmp";
23
+ import { normalizeRunToTrace } from "./workflow-trace";
24
/**
 * Roll a flat list of RunResults up into outcome-level aggregates.
 *
 * Cross-arm aggregation is the caller's job — hand this function each
 * arm's slice separately. The return shape is the backward-compatible v1
 * contract; the richer per-task / corpus shapes below subsume it.
 */
export function computeOutcomeAggregate(results) {
    if (results.length === 0) {
        return { passRate: 0, tokensPerPass: 0, wallclockMs: 0, budgetExceeded: 0, runsWithMeasuredTokens: 0 };
    }
    const totals = {
        passes: 0,
        budgetExceeded: 0,
        measuredPassTokens: 0,
        measuredPasses: 0,
        measuredRuns: 0,
        wallclock: 0,
    };
    for (const run of results) {
        totals.wallclock += run.wallclockMs;
        if (isMeasured(run)) {
            totals.measuredRuns += 1;
        }
        switch (run.outcome) {
            case "pass":
                totals.passes += 1;
                // Fold tokens into the mean only when they were actually
                // measured (issue #252) — a silent `0` would understate cost.
                if (isMeasured(run)) {
                    totals.measuredPasses += 1;
                    totals.measuredPassTokens += run.tokens.input + run.tokens.output;
                }
                break;
            case "budget_exceeded":
                totals.budgetExceeded += 1;
                break;
            default:
                break;
        }
    }
    return {
        passRate: totals.passes / results.length,
        tokensPerPass: totals.measuredPasses === 0 ? 0 : totals.measuredPassTokens / totals.measuredPasses,
        wallclockMs: totals.wallclock / results.length,
        budgetExceeded: totals.budgetExceeded,
        runsWithMeasuredTokens: totals.measuredRuns,
    };
}
67
/**
 * Decide whether a run's token counts were actually measured.
 *
 * Artefacts predating #252 carry no `tokenMeasurement` field at all; those
 * always reported numeric zeros, so a missing field is treated as `"parsed"`
 * to keep compare/attribute working over historical runs.
 */
function isMeasured(r) {
    const measurement = r.tokenMeasurement ?? "parsed";
    return measurement === "parsed";
}
75
/**
 * Collapse K seed runs of one (task, arm) pair into PerTaskMetrics.
 * Empty input yields a zeroed envelope — callers decide whether to skip
 * it or render it.
 */
export function aggregatePerTask(results) {
    if (results.length === 0) {
        return {
            passRate: 0,
            passAt1: 0,
            tokensPerPass: null,
            wallclockMs: 0,
            passRateStdev: 0,
            budgetExceededCount: 0,
            harnessErrorCount: 0,
            count: 0,
            runsWithMeasuredTokens: 0,
        };
    }
    let passCount = 0;
    let measuredPassCount = 0;
    let measuredPassTokens = 0;
    let wallclockSum = 0;
    let budgetExceededCount = 0;
    let harnessErrorCount = 0;
    let measuredRunCount = 0;
    // Fixed-iteration pass/fail buffer feeding the standard deviation.
    const passSamples = [];
    for (const run of results) {
        wallclockSum += run.wallclockMs;
        if (isMeasured(run)) {
            measuredRunCount += 1;
        }
        const passed = run.outcome === "pass";
        passSamples.push(passed ? 1 : 0);
        if (passed) {
            passCount += 1;
            // A pass without a token measurement still moves `passRate`, but
            // is excluded from `tokensPerPass` (issue #252): that metric
            // means "tokens per *measured* pass".
            if (isMeasured(run)) {
                measuredPassCount += 1;
                measuredPassTokens += run.tokens.input + run.tokens.output;
            }
        } else if (run.outcome === "budget_exceeded") {
            budgetExceededCount += 1;
        } else if (run.outcome === "harness_error") {
            harnessErrorCount += 1;
        }
    }
    const seedZero = results.find((r) => r.seed === 0) ?? results[0];
    return {
        passRate: passCount / results.length,
        passAt1: seedZero && seedZero.outcome === "pass" ? 1 : 0,
        tokensPerPass: measuredPassCount === 0 ? null : measuredPassTokens / measuredPassCount,
        wallclockMs: wallclockSum / results.length,
        passRateStdev: stdev(passSamples),
        budgetExceededCount,
        harnessErrorCount,
        count: results.length,
        runsWithMeasuredTokens: measuredRunCount,
    };
}
140
/** Sample standard deviation (Bessel's n−1 denominator); 0 when there is ≤ 1 sample. */
function stdev(values) {
    const n = values.length;
    if (n <= 1) {
        return 0;
    }
    let sum = 0;
    for (const v of values) {
        sum += v;
    }
    const mean = sum / n;
    let sumSq = 0;
    for (const v of values) {
        sumSq += (v - mean) * (v - mean);
    }
    return Math.sqrt(sumSq / (n - 1));
}
149
/**
 * Mean over per-task metrics. Each task contributes once, no matter how
 * many seeds it ran (K is already collapsed by `aggregatePerTask`).
 *
 * For `tokensPerPass`, tasks whose value is `null` (no passes) are dropped
 * from the mean; the result is `null` when every task failed.
 */
export function aggregateCorpus(perTask) {
    const tasks = Object.values(perTask);
    if (tasks.length === 0) {
        return { passRate: 0, tokensPerPass: null, wallclockMs: 0 };
    }
    let passSum = 0;
    let wallSum = 0;
    const tppValues = [];
    for (const task of tasks) {
        passSum += task.passRate;
        wallSum += task.wallclockMs;
        if (task.tokensPerPass !== null) {
            tppValues.push(task.tokensPerPass);
        }
    }
    let tokensPerPass = null;
    if (tppValues.length > 0) {
        tokensPerPass = tppValues.reduce((a, b) => a + b, 0) / tppValues.length;
    }
    return {
        passRate: passSum / tasks.length,
        tokensPerPass,
        wallclockMs: wallSum / tasks.length,
    };
}
167
/**
 * akm − noakm delta at corpus level. For `tokensPerPass` / `wallclockMs`
 * a negative delta means akm was cheaper / faster, positive means it cost
 * more; `passRate` uses the opposite convention (positive = akm wins).
 * `tokensPerPass` is `null` when either side lacks a measured value.
 */
export function computeCorpusDelta(noakm, akm) {
    const bothTppKnown = akm.tokensPerPass !== null && noakm.tokensPerPass !== null;
    return {
        passRate: akm.passRate - noakm.passRate,
        tokensPerPass: bothTppKnown ? akm.tokensPerPass - noakm.tokensPerPass : null,
        wallclockMs: akm.wallclockMs - noakm.wallclockMs,
    };
}
179
/** Per-task akm − noakm delta; null-safe on `tokensPerPass` like the corpus delta. */
export function computePerTaskDelta(noakm, akm) {
    let tokensPerPass = null;
    if (noakm.tokensPerPass !== null && akm.tokensPerPass !== null) {
        tokensPerPass = akm.tokensPerPass - noakm.tokensPerPass;
    }
    return {
        passRate: akm.passRate - noakm.passRate,
        tokensPerPass,
        wallclockMs: akm.wallclockMs - noakm.wallclockMs,
    };
}
187
/**
 * Domain prefix of a task ID. The corpus convention is
 * `<domain>/<task-name>`, split at the first `/`. IDs without a usable
 * prefix (no slash, or a leading slash) collapse into the literal
 * `unknown` bucket so they aggregate predictably instead of minting
 * per-task domains-of-one.
 */
export function domainOfTaskId(taskId) {
    const slash = taskId.indexOf("/");
    return slash > 0 ? taskId.slice(0, slash) : "unknown";
}
199
/**
 * Negative-transfer aggregate over per-task entries (one entry per task;
 * both arms already collapsed into PerTaskMetrics).
 *
 * A task is "regressed" when `akm.passRate < noakm.passRate`; ties
 * (including 0=0) do NOT count. `topRegressedTasks` sorts by `severity`
 * descending then `taskId` ascending so output is deterministic.
 */
export function computeNegativeTransfer(tasks) {
    const regressions = [];
    for (const entry of tasks) {
        const delta = entry.akm.passRate - entry.noakm.passRate;
        if (delta < 0) {
            regressions.push({
                taskId: entry.id,
                domain: domainOfTaskId(entry.id),
                noakmPassRate: entry.noakm.passRate,
                akmPassRate: entry.akm.passRate,
                delta,
                severity: -delta,
            });
        }
    }
    regressions.sort((a, b) => b.severity - a.severity || a.taskId.localeCompare(b.taskId));
    let totalSeverity = 0;
    for (const r of regressions) {
        totalSeverity += r.severity;
    }
    return { count: regressions.length, severity: totalSeverity, topRegressedTasks: regressions };
}
231
/**
 * Per-domain aggregates over per-task entries. Each task contributes once
 * to its domain (K seeds already collapsed). Rows come back sorted by
 * `domain` ascending so the JSON / markdown output is byte-stable.
 *
 * Domain extraction uses `domainOfTaskId` (split on first `/`).
 */
export function computeDomainAggregates(tasks) {
    // Group tasks by domain; Map preserves first-seen insertion order.
    const byDomain = new Map();
    for (const task of tasks) {
        const domain = domainOfTaskId(task.id);
        const group = byDomain.get(domain);
        if (group) {
            group.push(task);
        } else {
            byDomain.set(domain, [task]);
        }
    }
    const rows = [...byDomain].map(([domain, group]) => {
        const n = group.length;
        let noakmPassSum = 0;
        let akmPassSum = 0;
        let noakmWallSum = 0;
        let akmWallSum = 0;
        let regressionCount = 0;
        const noakmTpp = [];
        const akmTpp = [];
        for (const t of group) {
            noakmPassSum += t.noakm.passRate;
            akmPassSum += t.akm.passRate;
            noakmWallSum += t.noakm.wallclockMs;
            akmWallSum += t.akm.wallclockMs;
            if (t.akm.passRate < t.noakm.passRate) {
                regressionCount += 1;
            }
            if (t.noakm.tokensPerPass !== null) {
                noakmTpp.push(t.noakm.tokensPerPass);
            }
            if (t.akm.tokensPerPass !== null) {
                akmTpp.push(t.akm.tokensPerPass);
            }
        }
        // Mean of measured tokens-per-pass values, or null if none measured.
        const mean = (xs) => (xs.length === 0 ? null : xs.reduce((a, b) => a + b, 0) / xs.length);
        const passRateNoakm = noakmPassSum / n;
        const passRateAkm = akmPassSum / n;
        const meanNoakmTpp = mean(noakmTpp);
        const meanAkmTpp = mean(akmTpp);
        return {
            domain,
            taskCount: n,
            regressionCount,
            passRateNoakm,
            passRateAkm,
            passRateDelta: passRateAkm - passRateNoakm,
            tokensPerPassDelta: meanNoakmTpp === null || meanAkmTpp === null ? null : meanAkmTpp - meanNoakmTpp,
            wallclockMsDelta: akmWallSum / n - noakmWallSum / n,
        };
    });
    rows.sort((a, b) => a.domain.localeCompare(b.domain));
    return rows;
}
290
/**
 * Asset-regression-candidate rows (#260). Scans the AKM-arm runs, keeps
 * only those whose `taskId` is in `regressedTaskIds`, and tallies how often
 * each loaded asset appears. `regressedTaskCount` (distinct task IDs
 * touched) is the primary sort key — an asset that hurts many tasks is more
 * actionable than one that flooded a single task across seeds.
 *
 * Sort: regressedTaskCount desc, totalLoadCount desc, assetRef asc.
 */
export function computeAssetRegressionCandidates(regressedTaskIds, akmRuns) {
    const regressedSet = new Set(regressedTaskIds);
    if (regressedSet.size === 0) {
        return [];
    }
    // ref -> { taskIds: Set<string>, loads: number }
    const perAsset = new Map();
    for (const run of akmRuns) {
        if (!regressedSet.has(run.taskId)) {
            continue;
        }
        for (const ref of run.assetsLoaded ?? []) {
            let entry = perAsset.get(ref);
            if (!entry) {
                entry = { taskIds: new Set(), loads: 0 };
                perAsset.set(ref, entry);
            }
            entry.taskIds.add(run.taskId);
            entry.loads += 1;
        }
    }
    const rows = [...perAsset].map(([assetRef, entry]) => ({
        assetRef,
        regressedTaskCount: entry.taskIds.size,
        regressedTaskIds: [...entry.taskIds].sort(),
        totalLoadCount: entry.loads,
    }));
    rows.sort((a, b) =>
        b.regressedTaskCount - a.regressedTaskCount ||
        b.totalLoadCount - a.totalLoadCount ||
        a.assetRef.localeCompare(b.assetRef));
    return rows;
}
337
+ // ── Per-asset attribution (§6.5) ───────────────────────────────────────────
338
+ /**
339
+ * Extract the unique asset refs an agent loaded during a run by scanning
340
+ * `events[]` and `verifierStdout` for `akm show <ref>` invocations.
341
+ *
342
+ * Detection strategy (all heuristic, all conservative):
343
+ * 1. `event.eventType === "show"` with `event.ref` (forward-compat — akm
344
+ * itself does not currently emit `show` events).
345
+ * 2. Substring match on `akm show <ref>` in stdout. The ref shape is
346
+ * `[origin//]type:name` per the v1 contract; we accept word-boundary
347
+ * terminators after the name.
348
+ * 3. Tool-call JSON `{"args":["show","<ref>"]}` — the form opencode logs
349
+ * when the agent invokes the akm CLI as a tool. We extract refs that
350
+ * look like asset refs from the args array entries adjacent to "show".
351
+ *
352
+ * Returns refs in first-seen order, deduplicated. Bounded scan: stdout is
353
+ * truncated at 16 MiB (the same cap the trajectory parser uses) to keep
354
+ * runaway agents from OOMing the bench.
355
+ */
356
+ const ASSET_LOAD_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
357
+ // Asset ref grammar: optional `origin//` prefix, type:name, where type and
358
+ // name are lowercase letters, digits, `_`, `-`. We deliberately do NOT match
359
+ // `://` schemes (those are install locators, not asset refs). The character
360
+ // class is intentionally tight so we don't mis-pickup arbitrary words after
361
+ // `akm show`. The `name` segment is restricted to `[A-Za-z0-9_-]+` (no `/`,
362
+ // no `.`) — the v1 grammar in src/core/asset-ref.ts permits `/` and `.` in
363
+ // names (e.g. `script:db/migrate/run.sh`), but the masker treats names as
364
+ // untrusted input and rejects any traversal-shaped value, so the bench-side
365
+ // scanner does not need (or want) to extract such refs from agent stdout.
366
+ // Limiting the regex here is defense-in-depth against a prompt-injected
367
+ // agent emitting `akm show "skill:../../etc"` and us pulling that ref into
368
+ // the masking flow.
369
+ const ASSET_REF_PATTERN = /(?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+/g;
370
+ export function extractAssetLoads(runResult) {
371
+ const seen = new Set();
372
+ const out = [];
373
+ const push = (ref) => {
374
+ if (!ref)
375
+ return;
376
+ if (seen.has(ref))
377
+ return;
378
+ seen.add(ref);
379
+ out.push(ref);
380
+ };
381
+ // 1. Events stream.
382
+ for (const event of runResult.events) {
383
+ if (event.eventType === "show" && typeof event.ref === "string") {
384
+ push(event.ref);
385
+ }
386
+ const meta = event.metadata;
387
+ if (meta && typeof meta === "object" && event.eventType === "show") {
388
+ const candidate = meta.ref;
389
+ if (typeof candidate === "string")
390
+ push(candidate);
391
+ }
392
+ }
393
+ // 2 & 3. Stdout scanning. Bound the scan so a runaway agent stdout cannot
394
+ // OOM the bench. Truncation is silent — the trajectory parser already
395
+ // surfaces a warning for the same data on its own scan.
396
+ let haystack = runResult.verifierStdout || "";
397
+ if (haystack.length > ASSET_LOAD_STDOUT_SCAN_CAP) {
398
+ haystack = haystack.slice(0, ASSET_LOAD_STDOUT_SCAN_CAP);
399
+ }
400
+ // `akm show <ref>` literal form. Accept optional quoting around the ref so
401
+ // shell traces like `akm show "skill:foo"` work too.
402
+ const literalRe = /akm\s+show\s+["']?((?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+)["']?/g;
403
+ for (const literalMatch of haystack.matchAll(literalRe)) {
404
+ push(literalMatch[1]);
405
+ }
406
+ // Tool-call JSON form. `"args":[..., "show", "<ref>", ...]`. We extract
407
+ // every refish token in the haystack that follows a "show" arg in JSON-y
408
+ // form. A second cheap pass keeps the pattern simple.
409
+ const toolCallRe = /"show"\s*,\s*"((?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+)"/g;
410
+ for (const toolCallMatch of haystack.matchAll(toolCallRe)) {
411
+ push(toolCallMatch[1]);
412
+ }
413
+ return out;
414
+ }
415
+ // Suppress the unused warning for `ASSET_REF_PATTERN` above. The constant is
416
+ // retained as the documentation seam called out by the #251 review addenda,
417
+ // even though `extractAssetLoads` uses inline regexes for its two scan forms.
418
+ void ASSET_REF_PATTERN;
419
+ /**
420
+ * Anchored variant of `ASSET_REF_PATTERN` for whole-string validation.
421
+ *
422
+ * Used by `materialiseMaskedStash` (#251) to gate every asset ref BEFORE we
423
+ * touch the filesystem. The base `ASSET_REF_PATTERN` is `/g`-flagged for
424
+ * scanning agent stdout; we re-anchor here so a hostile string like
425
+ * `skill:foo/../../etc` is rejected as a whole even though the regex would
426
+ * happily match a `skill:foo` substring under `/g`.
427
+ *
428
+ * Rejects `..`, absolute paths, drive letters, null bytes, `/`, `\`, and
429
+ * anything else outside the v1 ref grammar (mirrors src/core/asset-ref.ts).
430
+ */
431
+ const ASSET_REF_ANCHORED = /^(?:[a-z0-9_-]+\/\/)?[a-z][a-z0-9_-]*:[A-Za-z0-9_-]+$/;
432
+ /**
433
+ * Reject hostile asset refs before they reach any `fs.rmSync` call. The ref
434
+ * comes from agent stdout (untrusted; the agent could be prompt-injected) so
435
+ * we apply the anchored grammar pattern first, then the per-segment shape
436
+ * check after the colon-split. Defense in depth — each layer is sufficient
437
+ * on its own; the layered structure makes a future grammar relax safe.
438
+ */
439
+ function isSafeAssetRef(ref) {
440
+ if (!ref)
441
+ return false;
442
+ if (ref.includes("\0"))
443
+ return false;
444
+ return ASSET_REF_ANCHORED.test(ref);
445
+ }
446
+ /**
447
+ * Aggregate per-asset load + pass counts across all akm-arm runs in a report.
448
+ *
449
+ * Sort order (stable, deterministic):
450
+ * 1. loadCount descending (most-used first)
451
+ * 2. loadPassRate descending (working assets above broken ones at the same load count)
452
+ * 3. assetRef ascending (alphabetical tiebreak)
453
+ *
454
+ * Only `arm === "akm"` runs contribute. The `noakm` arm has no stash and
455
+ * cannot load assets, so including it would zero-bias the rates.
456
+ */
457
+ export function computePerAssetAttribution(report) {
458
+ const passing = new Map();
459
+ const failing = new Map();
460
+ let totalAkmRuns = 0;
461
+ // The §13.3 task entry doesn't carry RunResults — we read them from the
462
+ // shared akm-arm runs collection that the runner stamps onto `report.akmRuns`.
463
+ const akmRuns = collectAkmRuns(report);
464
+ for (const r of akmRuns) {
465
+ totalAkmRuns += 1;
466
+ const isPass = r.outcome === "pass";
467
+ for (const ref of r.assetsLoaded ?? []) {
468
+ const bucket = isPass ? passing : failing;
469
+ bucket.set(ref, (bucket.get(ref) ?? 0) + 1);
470
+ }
471
+ }
472
+ const refs = new Set([...passing.keys(), ...failing.keys()]);
473
+ const rows = [];
474
+ for (const ref of refs) {
475
+ const p = passing.get(ref) ?? 0;
476
+ const f = failing.get(ref) ?? 0;
477
+ const total = p + f;
478
+ rows.push({
479
+ assetRef: ref,
480
+ loadCountPassing: p,
481
+ loadCountFailing: f,
482
+ loadCount: total,
483
+ loadPassRate: total === 0 ? null : p / total,
484
+ });
485
+ }
486
+ rows.sort((a, b) => {
487
+ if (b.loadCount !== a.loadCount)
488
+ return b.loadCount - a.loadCount;
489
+ const ar = a.loadPassRate ?? -1;
490
+ const br = b.loadPassRate ?? -1;
491
+ if (br !== ar)
492
+ return br - ar;
493
+ return a.assetRef.localeCompare(b.assetRef);
494
+ });
495
+ return { rows, totalAkmRuns };
496
+ }
497
+ /**
498
+ * Pull the akm-arm RunResults out of a UtilityRunReport. The runner stamps
499
+ * them into the optional `akmRuns` field on the report so attribution can
500
+ * post-process them without re-running.
501
+ */
502
+ function collectAkmRuns(report) {
503
+ if (Array.isArray(report.akmRuns))
504
+ return report.akmRuns;
505
+ return [];
506
+ }
507
+ // ── runs[] serialisation (#249) ────────────────────────────────────────────
508
+ /**
509
+ * Project a list of RunResults onto the compact `runs[]` rows persisted
510
+ * inside the §13.3 JSON envelope (#249). One row per (task, arm, seed)
511
+ * triple; the renderer walks the input order verbatim, which the runner
512
+ * already builds deterministically (per-task block, noakm before akm,
513
+ * seeds in ascending order).
514
+ *
515
+ * Aggregate metrics (per-task, trajectory, failure-mode, search-bridge,
516
+ * attribution) MUST be recomputable from these rows + task metadata. This
517
+ * helper is the canonical projection — keep it in lockstep with the field
518
+ * list in the issue body.
519
+ */
520
+ export function aggregateRunsForReport(runs) {
521
+ return runs.map(serializeRunForReport);
522
+ }
523
+ /**
524
+ * Hydrate a persisted `runs[]` row back into the `RunResult` shape that
525
+ * downstream metrics helpers (`computePerAssetAttribution`, `aggregateCorpus`,
526
+ * etc.) expect. Used by `bench attribute` / `bench compare` when they read a
527
+ * §13.3 envelope from disk: the persisted row carries a compact subset, but
528
+ * it carries everything those helpers need.
529
+ *
530
+ * Fields the row deliberately does NOT carry are filled with safe defaults:
531
+ * • `events: []` — events.jsonl is not persisted; downstream attribution
532
+ * only consults `assetsLoaded` and `verifierStdout`.
533
+ * • `verifierStdout: ""` — full stdout is intentionally omitted from the
534
+ * envelope (#249 acceptance criterion). `assetsLoaded` already carries
535
+ * the post-hoc extraction the agent run produced.
536
+ * • `schemaVersion: 1` — the report schema implies it.
537
+ *
538
+ * Tokens are passed through as-is so a future `measurement` field added by
539
+ * #252 lands on the rehydrated row automatically. TODO(#252): keep this
540
+ * spread.
541
+ */
542
+ export function rehydrateRunFromSerialized(row) {
543
+ // The compact row uses a permissive Record shape for tokens (see
544
+ // RunRecordSerialized). Coerce defensively so older artefacts with only
545
+ // {input, output} hydrate cleanly.
546
+ const tok = row.tokens;
547
+ return {
548
+ schemaVersion: 1,
549
+ taskId: row.task_id,
550
+ arm: row.arm,
551
+ seed: row.seed,
552
+ model: row.model,
553
+ outcome: row.outcome,
554
+ tokens: {
555
+ ...tok,
556
+ input: typeof tok.input === "number" ? tok.input : 0,
557
+ output: typeof tok.output === "number" ? tok.output : 0,
558
+ },
559
+ wallclockMs: row.wallclock_ms,
560
+ trajectory: {
561
+ correctAssetLoaded: row.trajectory.correct_asset_loaded,
562
+ feedbackRecorded: row.trajectory.feedback_recorded,
563
+ },
564
+ events: [],
565
+ verifierStdout: "",
566
+ verifierExitCode: row.verifier_exit_code,
567
+ assetsLoaded: [...row.assets_loaded],
568
+ failureMode: (row.failure_mode ?? null),
569
+ };
570
+ }
571
+ /**
572
+ * Pick the top-N most-loaded assets from a base report and re-run the corpus
573
+ * with each one masked from its source stash. Returns a marginal-contribution
574
+ * row per masked asset.
575
+ *
576
+ * Cost: N * (tasks × arms × seedsPerArm) re-runs. Operators clamp N before
577
+ * calling — but we also clamp internally if `topN` exceeds the unique-asset
578
+ * count to avoid surprising no-op runs.
579
+ *
580
+ * Source-fixture safety: every masked re-run materialises a fresh tmp copy
581
+ * of the fixture stash, deletes the masked asset's files there, and points
582
+ * the re-run at the tmp dir. The shipped fixture in `tests/fixtures/stashes/`
583
+ * is NEVER mutated.
584
+ */
585
+ export async function runMaskedCorpus(opts) {
586
+ const baseReport = opts.baseReport;
587
+ const fixturesRoot = opts.fixturesRoot ?? path.resolve(__dirname, "..", "fixtures", "stashes");
588
+ const attribution = computePerAssetAttribution(baseReport);
589
+ const desired = Math.max(1, opts.topN ?? 5);
590
+ const clamped = Math.min(desired, attribution.rows.length);
591
+ const baseAkmPassRate = baseReport.aggregateAkm.passRate;
592
+ const top = attribution.rows.slice(0, clamped);
593
+ const attributions = [];
594
+ const maskedRefs = [];
595
+ for (const row of top) {
596
+ const maskedTasks = [];
597
+ const tmpDirs = [];
598
+ try {
599
+ for (const baseTask of baseReport.taskMetadata ?? []) {
600
+ const maskedStashDir = materialiseMaskedStash(fixturesRoot, baseTask.stash, row.assetRef);
601
+ if (maskedStashDir)
602
+ tmpDirs.push(maskedStashDir);
603
+ // Issue #251: forward the masked stashDir via the explicit
604
+ // `stashDirOverride` field on the cloned TaskMetadata. We MUST NOT
605
+ // mutate `baseTask.stash` (the fixture name) — the runner uses that
606
+ // to call `loadFixtureStash`, and overloading it breaks the
607
+ // `__no-stash__` resolution branch in runner.ts. The runner's AKM-arm
608
+ // branch checks `task.stashDirOverride` first.
609
+ //
610
+ // When `materialiseMaskedStash` returned `null` (asset not present in
611
+ // this fixture, or hostile ref shape rejected by the validator), we
612
+ // intentionally leave both fields untouched. The runner falls back to
613
+ // the normal materialisation flow against the unchanged source
614
+ // fixture — so the re-run still happens, but the result mirrors the
615
+ // base. This is a meaningful diagnostic (the ref didn't bind in this
616
+ // fixture) and is the same accounting `cost-accounting`-style tests
617
+ // assert against.
618
+ if (maskedStashDir) {
619
+ maskedTasks.push({ ...baseTask, stashDirOverride: maskedStashDir });
620
+ }
621
+ else {
622
+ maskedTasks.push({ ...baseTask });
623
+ }
624
+ }
625
+ const maskedReport = await opts.runUtility({
626
+ ...opts.baseOptions,
627
+ tasks: maskedTasks,
628
+ // The masked stash already has the correct content on disk, and the
629
+ // runner now resolves it via `task.stashDirOverride`. We still pass
630
+ // `materialiseStash: false` so the runner does not call
631
+ // `loadFixtureStash` against the (unmasked) named fixture — that
632
+ // would waste work and risk re-indexing the source dir.
633
+ materialiseStash: false,
634
+ });
635
+ const maskedPassRate = maskedReport.aggregateAkm.passRate;
636
+ attributions.push({
637
+ assetRef: row.assetRef,
638
+ basePassRate: baseAkmPassRate,
639
+ maskedPassRate,
640
+ marginalContribution: baseAkmPassRate - maskedPassRate,
641
+ });
642
+ maskedRefs.push(row.assetRef);
643
+ }
644
+ finally {
645
+ // Cleanup runs in BOTH success and failure paths (acceptance criterion).
646
+ // Best-effort: a tmpfs failure here is logged via the `try/catch` below
647
+ // and the host OS reaps the tmp dir on reboot.
648
+ for (const dir of tmpDirs) {
649
+ try {
650
+ fs.rmSync(dir, { recursive: true, force: true });
651
+ }
652
+ catch {
653
+ // Best-effort cleanup; tmpfs cleanup will handle leaks.
654
+ }
655
+ }
656
+ }
657
+ }
658
+ return {
659
+ baseReport,
660
+ attributions,
661
+ runsPerformed: clamped,
662
+ maskingStrategy: "leave-one-out",
663
+ maskedRefs,
664
+ };
665
+ }
666
+ /**
667
+ * Copy a fixture stash into a fresh tmp dir, delete every file matching the
668
+ * masked asset ref, and return the tmp dir path. Returns `null` if the named
669
+ * asset is not present in the fixture (we still re-run, but the result will
670
+ * mirror the base — which is itself a meaningful diagnostic).
671
+ *
672
+ * The masking heuristic:
673
+ * 1. Walk `<stash>/*<...>/.stash.json` files.
674
+ * 2. For each entry whose `name` + `type` matches the asset ref, drop the
675
+ * entry and delete its `filename` if present.
676
+ * 3. Rewrite the `.stash.json` with the trimmed entries (or remove it if
677
+ * it is now empty).
678
+ */
679
+ export function materialiseMaskedStash(fixturesRoot, stashName, assetRef) {
680
+ // #271: validate stashName containment BEFORE touching the filesystem.
681
+ // `stashName` originates from a task YAML which, while authored, is part
682
+ // of the fixture corpus the bench loads; a fixture with `stash: "../../etc"`
683
+ // would otherwise resolve outside `fixturesRoot` and let masking edits or
684
+ // copies escape the bench sandbox. path.relative gives the cleanest
685
+ // containment check (handles `..` AND absolute path injection in one go).
686
+ const fixturesRootResolved = path.resolve(fixturesRoot);
687
+ const sourceDir = path.resolve(fixturesRootResolved, stashName);
688
+ const rel = path.relative(fixturesRootResolved, sourceDir);
689
+ if (rel.startsWith("..") || path.isAbsolute(rel))
690
+ return null;
691
+ if (!fs.existsSync(path.join(sourceDir, "MANIFEST.json")))
692
+ return null;
693
+ // Issue #251 review addendum: validate the WHOLE ref against the anchored
694
+ // grammar before we touch the filesystem. The downstream `isSafeAssetNameSegment`
695
+ // + `isPathContained` checks are still applied — this is defense in depth.
696
+ if (!isSafeAssetRef(assetRef))
697
+ return null;
698
+ const colonIdx = assetRef.indexOf(":");
699
+ if (colonIdx < 0) {
700
+ // Malformed ref: still produce a tmp copy with no edits so the caller's
701
+ // re-run sees the unmodified fixture.
702
+ const tmpRoot = benchMkdtemp(`akm-bench-masked-${stashName}-`);
703
+ copyDirRecursive(sourceDir, tmpRoot);
704
+ return tmpRoot;
705
+ }
706
+ const typeWithOrigin = assetRef.slice(0, colonIdx);
707
+ const name = assetRef.slice(colonIdx + 1);
708
+ const type = typeWithOrigin.includes("//") ? (typeWithOrigin.split("//")[1] ?? typeWithOrigin) : typeWithOrigin;
709
+ // SECURITY: the asset ref originates from agent stdout (untrusted; the
710
+ // agent could be prompt-injected). The masking heuristic below will
711
+ // `fs.rmSync` files under the tmp stash dir whose names are derived from
712
+ // `name`. A traversal-shaped name (`../etc`, `/abs/path`, `..\\..`) would
713
+ // escape the tmp root and delete arbitrary disk content. Reject those
714
+ // shapes BEFORE we materialise — and re-validate after path-resolving
715
+ // each candidate. Mirrors src/core/asset-ref.ts validateName().
716
+ if (!isSafeAssetNameSegment(name))
717
+ return null;
718
+ const tmpRoot = benchMkdtemp(`akm-bench-masked-${stashName}-`);
719
+ copyDirRecursive(sourceDir, tmpRoot);
720
+ // Walk every .stash.json under the tmp root and edit in place.
721
+ walkStashJsonFiles(tmpRoot, (jsonPath) => {
722
+ let raw;
723
+ try {
724
+ raw = fs.readFileSync(jsonPath, "utf8");
725
+ }
726
+ catch {
727
+ return;
728
+ }
729
+ let parsed;
730
+ try {
731
+ parsed = JSON.parse(raw);
732
+ }
733
+ catch {
734
+ return;
735
+ }
736
+ const entries = parsed.entries ?? [];
737
+ const kept = [];
738
+ const jsonDir = path.dirname(jsonPath);
739
+ for (const entry of entries) {
740
+ if (entry.type === type && entry.name === name) {
741
+ // Remove the entry's content file(s). The on-disk `filename` is read
742
+ // from the fixture .stash.json (trusted) but the value still passes
743
+ // through path.relative containment so a malicious fixture can't use
744
+ // this path to escape either.
745
+ const filename = entry.filename;
746
+ if (typeof filename === "string" && isSafeAssetNameSegment(filename)) {
747
+ const target = path.resolve(jsonDir, filename);
748
+ if (isPathContained(tmpRoot, target)) {
749
+ try {
750
+ fs.rmSync(target, { force: true });
751
+ }
752
+ catch {
753
+ // ignore
754
+ }
755
+ }
756
+ }
757
+ // Some fixtures keep a per-asset directory (e.g. skills/<name>/SKILL.md).
758
+ const dirCandidate = path.resolve(jsonDir, name);
759
+ if (isPathContained(tmpRoot, dirCandidate) &&
760
+ fs.existsSync(dirCandidate) &&
761
+ fs.statSync(dirCandidate).isDirectory()) {
762
+ try {
763
+ fs.rmSync(dirCandidate, { recursive: true, force: true });
764
+ }
765
+ catch {
766
+ // ignore
767
+ }
768
+ }
769
+ continue;
770
+ }
771
+ kept.push(entry);
772
+ }
773
+ if (kept.length === entries.length)
774
+ return; // nothing changed
775
+ if (kept.length === 0) {
776
+ try {
777
+ fs.rmSync(jsonPath, { force: true });
778
+ }
779
+ catch {
780
+ // ignore
781
+ }
782
+ }
783
+ else {
784
+ fs.writeFileSync(jsonPath, `${JSON.stringify({ ...parsed, entries: kept }, null, 2)}\n`);
785
+ }
786
+ });
787
+ return tmpRoot;
788
+ }
789
+ /**
790
+ * Reject any segment that could escape the tmp stash root when used as a
791
+ * relative path component:
792
+ * - empty string
793
+ * - any `/` or `\\` (path separators)
794
+ * - a `..` segment in any form
795
+ * - a leading `/` (POSIX absolute) or `C:` (Windows drive)
796
+ * - any null byte
797
+ *
798
+ * Mirrors src/core/asset-ref.ts validateName(), but returns a boolean
799
+ * (callers map this to "skip" rather than "throw").
800
+ */
801
+ function isSafeAssetNameSegment(value) {
802
+ if (!value)
803
+ return false;
804
+ if (value.includes("\0"))
805
+ return false;
806
+ if (value.includes("/") || value.includes("\\"))
807
+ return false;
808
+ if (value === ".." || value === ".")
809
+ return false;
810
+ if (/^[A-Za-z]:/.test(value))
811
+ return false;
812
+ return true;
813
+ }
814
+ /**
815
+ * After resolving a target path, confirm it lives under `root`. Defense in
816
+ * depth: even if a traversal-shaped name slipped past the segment check,
817
+ * this catches escapes via symlinks or odd `path.join` semantics.
818
+ *
819
+ * #271: aligned with `isWithin` in `src/core/common.ts` — both inputs go
820
+ * through `safeRealpath` so a symlink inside `root` that points outside
821
+ * cannot fool the `path.relative` containment check. The shared helper
822
+ * also handles not-yet-existing children (walks up to the closest existing
823
+ * ancestor and resolves symlinks there) so we keep the existing semantics
824
+ * for `target` paths the masking heuristic is about to create.
825
+ */
826
+ export function isPathContained(root, target) {
827
+ const rootResolved = safeRealpath(root);
828
+ const targetResolved = safeRealpath(target);
829
+ const rel = path.relative(rootResolved, targetResolved);
830
+ if (rel === "")
831
+ return true;
832
+ if (rel.startsWith(".."))
833
+ return false;
834
+ if (path.isAbsolute(rel))
835
+ return false;
836
+ return true;
837
+ }
838
+ function walkStashJsonFiles(root, visit) {
839
+ const stack = [root];
840
+ while (stack.length > 0) {
841
+ const cur = stack.pop();
842
+ if (!cur)
843
+ continue;
844
+ let entries;
845
+ try {
846
+ entries = fs.readdirSync(cur, { withFileTypes: true });
847
+ }
848
+ catch {
849
+ continue;
850
+ }
851
+ for (const entry of entries) {
852
+ const abs = path.join(cur, entry.name);
853
+ if (entry.isDirectory())
854
+ stack.push(abs);
855
+ else if (entry.isFile() && entry.name === ".stash.json")
856
+ visit(abs);
857
+ }
858
+ }
859
+ }
860
+ function copyDirRecursive(src, dest) {
861
+ fs.mkdirSync(dest, { recursive: true });
862
+ const entries = fs.readdirSync(src, { withFileTypes: true });
863
+ for (const entry of entries) {
864
+ const s = path.join(src, entry.name);
865
+ const d = path.join(dest, entry.name);
866
+ if (entry.isDirectory())
867
+ copyDirRecursive(s, d);
868
+ else if (entry.isFile())
869
+ fs.copyFileSync(s, d);
870
+ }
871
+ }
872
+ /** Aggregate trajectory booleans across a bag of runs. */
873
+ export function aggregateTrajectory(results) {
874
+ if (results.length === 0) {
875
+ return { correctAssetLoaded: null, feedbackRecorded: 0 };
876
+ }
877
+ let knownAsset = 0;
878
+ let assetLoaded = 0;
879
+ let feedback = 0;
880
+ for (const r of results) {
881
+ if (r.trajectory.correctAssetLoaded !== null) {
882
+ knownAsset += 1;
883
+ if (r.trajectory.correctAssetLoaded)
884
+ assetLoaded += 1;
885
+ }
886
+ if (r.trajectory.feedbackRecorded === true)
887
+ feedback += 1;
888
+ }
889
+ return {
890
+ correctAssetLoaded: knownAsset === 0 ? null : assetLoaded / knownAsset,
891
+ feedbackRecorded: feedback / results.length,
892
+ };
893
+ }
894
+ /**
895
+ * Sign threshold below which a delta is rendered as `flat`. `pass_rate` is
896
+ * normalised to `[0, 1]`, so a 0.005 (0.5pp) tolerance keeps tiny K-seed
897
+ * sampling jitter from looking like a regression.
898
+ */
899
+ const PASS_RATE_FLAT_TOLERANCE = 0.005;
900
+ /** `tokens_per_pass` and `wallclock_ms` use raw counts; 0 is the only "flat". */
901
+ const COUNT_FLAT_TOLERANCE = 0;
902
+ function classifyPassRate(delta) {
903
+ if (delta === null)
904
+ return "flat";
905
+ if (Math.abs(delta) <= PASS_RATE_FLAT_TOLERANCE)
906
+ return "flat";
907
+ return delta > 0 ? "improve" : "regress";
908
+ }
909
+ function classifyCount(delta, lowerIsBetter) {
910
+ if (delta === null)
911
+ return "flat";
912
+ if (Math.abs(delta) <= COUNT_FLAT_TOLERANCE)
913
+ return "flat";
914
+ if (lowerIsBetter)
915
+ return delta < 0 ? "improve" : "regress";
916
+ return delta > 0 ? "improve" : "regress";
917
+ }
918
+ function readModel(r) {
919
+ return r.agent?.model ?? "<unknown>";
920
+ }
921
+ function readFixtureHash(r) {
922
+ const v = r.corpus?.fixtureContentHash;
923
+ return v === undefined || v === null ? null : v;
924
+ }
925
+ function readTaskCorpusHash(r) {
926
+ const v = r.corpus?.taskCorpusHash;
927
+ return v === undefined || v === null ? null : v;
928
+ }
929
+ function readSelectedTaskIds(r) {
930
+ const v = r.corpus?.selectedTaskIds;
931
+ return Array.isArray(v) ? v : null;
932
+ }
933
+ function arraysEqualIgnoringOrder(a, b) {
934
+ if (a.length !== b.length)
935
+ return false;
936
+ const sa = [...a].sort();
937
+ const sb = [...b].sort();
938
+ for (let i = 0; i < sa.length; i += 1)
939
+ if (sa[i] !== sb[i])
940
+ return false;
941
+ return true;
942
+ }
943
+ function akmAgg(r) {
944
+ const a = r.aggregate?.akm ?? {};
945
+ return {
946
+ pass_rate: a.pass_rate ?? 0,
947
+ tokens_per_pass: a.tokens_per_pass ?? null,
948
+ wallclock_ms: a.wallclock_ms ?? 0,
949
+ };
950
+ }
951
+ /**
952
+ * Diff two parsed UtilityRunReport JSONs.
953
+ *
954
+ * Refusal cases:
955
+ * • Either side missing `schemaVersion: 1` or `track: "utility"` →
956
+ * `schema_mismatch` / `track_mismatch`.
957
+ * • `agent.model` differs → `model_mismatch`.
958
+ * • Both sides report a `corpus.fixtureContentHash` and they differ →
959
+ * `hash_mismatch`. Missing hash on either side proceeds with a warning
960
+ * (Wave A may add it; older reports won't have it).
961
+ *
962
+ * On success the per-task table includes rows for every task in either side,
963
+ * plus aggregate deltas computed against the akm arm only (the noakm arm is
964
+ * the control — its delta is meaningless). `pass_rate` is in `[0, 1]`,
965
+ * higher is better; `tokens_per_pass` and `wallclock_ms` are counts, lower
966
+ * is better.
967
+ */
968
+ export function compareReports(base, current, options = {}) {
969
+ // Schema-version gate.
970
+ if (base.schemaVersion !== 1 || current.schemaVersion !== 1) {
971
+ return {
972
+ ok: false,
973
+ reason: "schema_mismatch",
974
+ message: `compare requires schemaVersion=1 on both sides; got base=${String(base.schemaVersion)}, current=${String(current.schemaVersion)}`,
975
+ };
976
+ }
977
+ // Track gate. Cross-track diffs are nonsensical.
978
+ if (base.track !== "utility" || current.track !== "utility") {
979
+ return {
980
+ ok: false,
981
+ reason: "track_mismatch",
982
+ message: `compare only supports track="utility"; got base="${String(base.track)}", current="${String(current.track)}"`,
983
+ };
984
+ }
985
+ const baseModel = readModel(base);
986
+ const currentModel = readModel(current);
987
+ if (baseModel !== currentModel) {
988
+ return {
989
+ ok: false,
990
+ reason: "model_mismatch",
991
+ message: `cannot compare across different models: base="${baseModel}", current="${currentModel}". Rerun on the same model.`,
992
+ baseModel,
993
+ currentModel,
994
+ };
995
+ }
996
+ const baseHash = readFixtureHash(base);
997
+ const currentHash = readFixtureHash(current);
998
+ const warnings = [];
999
+ // #250 — task corpus hash + selected task IDs. Refused unless either side
1000
+ // is legacy (missing the hash) or the operator passed
1001
+ // `allowCorpusMismatch`. Legacy reports (no taskCorpusHash) degrade to a
1002
+ // warning so older artefacts can still be diffed.
1003
+ const baseTaskHash = readTaskCorpusHash(base);
1004
+ const currentTaskHash = readTaskCorpusHash(current);
1005
+ const baseIds = readSelectedTaskIds(base);
1006
+ const currentIds = readSelectedTaskIds(current);
1007
+ if (baseTaskHash !== null && currentTaskHash !== null && baseTaskHash !== currentTaskHash) {
1008
+ if (!options.allowCorpusMismatch) {
1009
+ return {
1010
+ ok: false,
1011
+ reason: "corpus_mismatch",
1012
+ message: `cannot compare across different task corpora: base taskCorpusHash="${baseTaskHash}", current="${currentTaskHash}". Rerun against the same task selection or pass --allow-corpus-mismatch to override.`,
1013
+ baseModel,
1014
+ currentModel,
1015
+ baseTaskCorpusHash: baseTaskHash,
1016
+ currentTaskCorpusHash: currentTaskHash,
1017
+ ...(baseIds ? { baseSelectedTaskIds: baseIds } : {}),
1018
+ ...(currentIds ? { currentSelectedTaskIds: currentIds } : {}),
1019
+ };
1020
+ }
1021
+ warnings.push(`task corpus hashes differ (base="${baseTaskHash}", current="${currentTaskHash}") — diff requested via --allow-corpus-mismatch`);
1022
+ }
1023
+ else if (baseTaskHash === null &&
1024
+ currentTaskHash === null &&
1025
+ baseIds !== null &&
1026
+ currentIds !== null &&
1027
+ !arraysEqualIgnoringOrder(baseIds, currentIds)) {
1028
+ // Both sides legacy (no taskCorpusHash) but both expose selectedTaskIds
1029
+ // and they differ. We can still detect a mismatched corpus from the ID
1030
+ // list alone — refuse unless the operator opted in.
1031
+ if (!options.allowCorpusMismatch) {
1032
+ return {
1033
+ ok: false,
1034
+ reason: "corpus_mismatch",
1035
+ message: `cannot compare across different selected task IDs. Rerun against the same task selection or pass --allow-corpus-mismatch to override.`,
1036
+ baseModel,
1037
+ currentModel,
1038
+ baseSelectedTaskIds: baseIds,
1039
+ currentSelectedTaskIds: currentIds,
1040
+ };
1041
+ }
1042
+ warnings.push("selected task IDs differ — diff requested via --allow-corpus-mismatch");
1043
+ }
1044
+ if (baseTaskHash === null)
1045
+ warnings.push("base report has no corpus.taskCorpusHash; proceeding without task-corpus-pin check");
1046
+ if (currentTaskHash === null)
1047
+ warnings.push("current report has no corpus.taskCorpusHash; proceeding without task-corpus-pin check");
1048
+ if (baseHash !== null && currentHash !== null && baseHash !== currentHash) {
1049
+ if (!options.allowFixtureMismatch) {
1050
+ return {
1051
+ ok: false,
1052
+ reason: "hash_mismatch",
1053
+ message: `cannot compare across different fixture-content hashes: base="${baseHash}", current="${currentHash}". Rerun against matching fixtures or pass --allow-fixture-mismatch to override.`,
1054
+ baseModel,
1055
+ currentModel,
1056
+ baseFixtureContentHash: baseHash,
1057
+ currentFixtureContentHash: currentHash,
1058
+ };
1059
+ }
1060
+ warnings.push(`fixture-content hashes differ (base="${baseHash}", current="${currentHash}") — diff requested via --allow-fixture-mismatch`);
1061
+ }
1062
+ if (baseHash === null)
1063
+ warnings.push("base report has no corpus.fixtureContentHash; proceeding without fixture-pin check");
1064
+ if (currentHash === null)
1065
+ warnings.push("current report has no corpus.fixtureContentHash; proceeding without fixture-pin check");
1066
+ // Aggregate (akm arm is the one that matters — noakm is the control).
1067
+ const ba = akmAgg(base);
1068
+ const ca = akmAgg(current);
1069
+ const passRateDelta = ca.pass_rate - ba.pass_rate;
1070
+ const tokensPerPassDelta = ba.tokens_per_pass === null || ca.tokens_per_pass === null ? null : ca.tokens_per_pass - ba.tokens_per_pass;
1071
+ const wallclockMsDelta = ca.wallclock_ms - ba.wallclock_ms;
1072
+ const aggregate = {
1073
+ passRateDelta,
1074
+ passRateSign: classifyPassRate(passRateDelta),
1075
+ tokensPerPassDelta,
1076
+ tokensPerPassSign: classifyCount(tokensPerPassDelta, true),
1077
+ wallclockMsDelta,
1078
+ wallclockMsSign: classifyCount(wallclockMsDelta, true),
1079
+ };
1080
+ // Per-task rows. Outer-join on task id.
1081
+ const baseTasks = new Map();
1082
+ for (const t of base.tasks ?? [])
1083
+ baseTasks.set(t.id, t);
1084
+ const currentTasks = new Map();
1085
+ for (const t of current.tasks ?? [])
1086
+ currentTasks.set(t.id, t);
1087
+ const allIds = new Set();
1088
+ for (const id of baseTasks.keys())
1089
+ allIds.add(id);
1090
+ for (const id of currentTasks.keys())
1091
+ allIds.add(id);
1092
+ const perTask = [];
1093
+ for (const id of [...allIds].sort()) {
1094
+ const b = baseTasks.get(id);
1095
+ const c = currentTasks.get(id);
1096
+ const bM = b?.akm ?? null;
1097
+ const cM = c?.akm ?? null;
1098
+ const presence = b !== undefined && c !== undefined ? "both" : b !== undefined ? "base-only" : "current-only";
1099
+ const passRateDelta_ = bM !== null && cM !== null ? cM.pass_rate - bM.pass_rate : null;
1100
+ const tokensPerPassDelta_ = bM !== null && cM !== null && bM.tokens_per_pass !== null && cM.tokens_per_pass !== null
1101
+ ? cM.tokens_per_pass - bM.tokens_per_pass
1102
+ : null;
1103
+ const wallclockMsDelta_ = bM !== null && cM !== null ? cM.wallclock_ms - bM.wallclock_ms : null;
1104
+ perTask.push({
1105
+ id,
1106
+ presence,
1107
+ baseMetrics: bM,
1108
+ currentMetrics: cM,
1109
+ delta: { passRate: passRateDelta_, tokensPerPass: tokensPerPassDelta_, wallclockMs: wallclockMsDelta_ },
1110
+ signMarker: classifyPassRate(passRateDelta_),
1111
+ });
1112
+ }
1113
+ return {
1114
+ ok: true,
1115
+ baseModel,
1116
+ currentModel,
1117
+ baseFixtureContentHash: baseHash,
1118
+ currentFixtureContentHash: currentHash,
1119
+ warnings,
1120
+ aggregate,
1121
+ perTask,
1122
+ };
1123
+ }
1124
/** Maximum rank at which the gold ref still counts as "found"; > this is `search_low_rank`. */
const SEARCH_RANK_CUTOFF = 5;
/** Cap on the number of characters of `verifierStdout` we substring-scan. Mirrors trajectory.ts. */
const FAILURE_MODE_STDOUT_SCAN_CAP = 16 * 1024 * 1024;
/**
 * Map a single failed run onto one of the seven §6.6 failure-mode labels.
 *
 * Pure string analysis over `runResult.events[]` and `runResult.verifierStdout`;
 * never calls an LLM, never touches the filesystem.
 *
 * Priority order (first hit wins):
 *   - non-failed outcomes (`pass`, `budget_exceeded`, `harness_error`) → `null`
 *   - no `akm search` anywhere in the trace → `no_search`
 *   - search ran but the gold ref never surfaced → `search_no_gold`
 *   - gold ref surfaced at rank > SEARCH_RANK_CUTOFF → `search_low_rank`
 *   - a non-gold ref was shown while the gold was never loaded → `loaded_wrong`
 *   - verifier output explicitly flags the guidance as ignored → `loaded_ignored`
 *   - gold loaded and apparently followed → `followed_wrong`
 *   - anything else → `unrelated_bug`
 *
 * When `taskMeta.goldRef` is missing, the gold-dependent rules are all
 * skipped — only `no_search` and `unrelated_bug` are reachable.
 *
 * @param {object} taskMeta  task metadata; only `goldRef` is consulted
 * @param {object} runResult run record with `outcome`, `events`, `verifierStdout`
 * @returns {string|null} failure-mode label, or `null` for non-failed runs
 */
export function classifyFailureMode(taskMeta, runResult) {
    if (runResult.outcome !== "fail") {
        return null;
    }
    const scanText = collectTrace(runResult);
    // Rule 1: the agent never searched at all.
    if (!hasAkmSearch(scanText, runResult)) {
        return "no_search";
    }
    const gold = taskMeta.goldRef;
    // Without a gold ref the search-based and load-based checks are
    // undefined; only the default label remains.
    if (!gold) {
        return "unrelated_bug";
    }
    const rank = findGoldSearchRank(scanText, gold);
    // Rule 2: search ran (precondition above) but the gold ref was absent.
    if (rank === null) {
        return "search_no_gold";
    }
    // Rule 3: present, but too deep in the result list.
    if (rank > SEARCH_RANK_CUTOFF) {
        return "search_low_rank";
    }
    const goldLoaded = hasAkmShow(scanText, runResult, gold);
    // Rule 4: the agent opened a different ref and never opened the gold.
    if (!goldLoaded && hasAkmShowOtherRef(scanText, runResult, gold)) {
        return "loaded_wrong";
    }
    if (!goldLoaded) {
        // Gold was found in search at an acceptable rank, but nothing (gold
        // or otherwise) was ever opened before the failure. The taxonomy
        // table has no row for "found but never opened" — use the default.
        return "unrelated_bug";
    }
    // Rule 5: only fires when the verifier's own diagnostic called out the
    // contradiction — without an LLM we cannot detect subtler cases.
    if (verifierIndicatesIgnored(runResult.verifierStdout)) {
        return "loaded_ignored";
    }
    // Rule 6: gold loaded and apparently followed, yet the verifier still
    // failed. The §6.6 spec maps this to "the asset itself is wrong".
    return "followed_wrong";
}
1196
/** Build a `FailureModeAggregate` from a list of (taskId, label) pairs. */
export function aggregateFailureModes(entries) {
    const byLabel = {};
    const byTask = {};
    for (const { taskId, mode } of entries) {
        byLabel[mode] = (byLabel[mode] ?? 0) + 1;
        let taskCounts = byTask[taskId];
        if (!taskCounts) {
            taskCounts = {};
            byTask[taskId] = taskCounts;
        }
        taskCounts[mode] = (taskCounts[mode] ?? 0) + 1;
    }
    return { byLabel, byTask };
}
1208
// ── Failure-mode classifier helpers ────────────────────────────────────────
/**
 * Shared scan string for the classifier helpers, built once per classify
 * call. The verifier stdout is truncated at FAILURE_MODE_STDOUT_SCAN_CAP
 * (mirroring the trajectory parser convention) so a runaway agent's output
 * cannot OOM the bench.
 */
function collectTrace(runResult) {
    const stdout = runResult.verifierStdout ?? "";
    if (stdout.length > FAILURE_MODE_STDOUT_SCAN_CAP) {
        return stdout.slice(0, FAILURE_MODE_STDOUT_SCAN_CAP);
    }
    return stdout;
}
1220
/**
 * Does the trace contain any `akm search` invocation (CLI form OR event)?
 *
 * @param {string} trace     capped stdout scan text from `collectTrace`
 * @param {object} runResult run record whose `events[]` are also inspected
 * @returns {boolean}
 */
function hasAkmSearch(trace, runResult) {
    // Tool-call CLI form, e.g. `akm search "deploy homelab"`.
    if (/\bakm\s+search\b/.test(trace))
        return true;
    // Tool-call JSON form, e.g. `"args":["search","..."]`, including the
    // single-quoted variant `'args':['search',...]`. The previous code
    // required the double-quoted substring AND this quote-agnostic regex,
    // which made the regex dead code and missed single-quoted traces; the
    // regex alone covers both quoting styles.
    if (/["']search["']/.test(trace))
        return true;
    // Event-stream form (search verbs aren't currently emitted but the field
    // is forward-compatible — see core/events.ts).
    for (const event of runResult.events) {
        if (event.eventType === "search" || event.eventType === "search_invoked")
            return true;
    }
    return false;
}
1236
/**
 * 1-based rank of `goldRef` in the search results captured in `trace`, or
 * `null` when absent. The rank scan is restricted to text after the first
 * `akm search` invocation so `akm show` output isn't mistaken for search
 * results; if the invocation can't be located (the caller already verified
 * a search ran, so our regexes disagree), the whole trace is scanned.
 */
function findGoldSearchRank(trace, goldRef) {
    const invocation = trace.match(/\bakm\s+search\b/);
    const start = invocation?.index;
    if (start === undefined) {
        // Fall back to scanning the full trace.
        return findRefRankInText(trace, goldRef);
    }
    return findRefRankInText(trace.slice(start), goldRef);
}
1258
/**
 * Locate `goldRef` in `text` and return its 1-based rank, or `null`.
 * Three heuristics, tried in order:
 *   1. numbered-list lines (`3. skill:foo` / `3) skill:foo`);
 *   2. a JSON-ish `"results":[...]` array, rank = position after
 *      comma-splitting the bracket contents (best-effort);
 *   3. bare substring presence, reported as rank 1 — the best case for the
 *      agent, so the `search_low_rank` rule never false-positives here.
 */
function findRefRankInText(text, goldRef) {
    // Numbered list: `<rank>. <ref>` or `<rank>) <ref>`.
    const numberedRe = /^\s*(\d{1,3})[.)]\s+([^\s]+)/gm;
    for (let m = numberedRe.exec(text); m !== null; m = numberedRe.exec(text)) {
        if (refsMatch(m[2], goldRef)) {
            return Number.parseInt(m[1], 10);
        }
    }
    // JSON array form: `"results":["a","b","skill:foo"]`.
    const jsonMatch = text.match(/"results"\s*:\s*\[([^\]]+)\]/);
    if (jsonMatch) {
        const refs = jsonMatch[1]
            .split(",")
            .map((piece) => piece.trim().replace(/^["']|["']$/g, ""));
        const position = refs.findIndex((ref) => refsMatch(ref, goldRef));
        if (position >= 0) {
            return position + 1;
        }
    }
    // Bare substring presence.
    const escaped = goldRef.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    if (new RegExp(`\\b${escaped}\\b`).test(text)) {
        return 1;
    }
    return null;
}
1289
/** True when `candidate` is `goldRef` or a strict ref-extension thereof. */
function refsMatch(candidate, goldRef) {
    return (candidate === goldRef ||
        candidate.endsWith(`//${goldRef}`) ||
        candidate.startsWith(`${goldRef}/`));
}
1299
/** Did the agent invoke `akm show <goldRef>` at any point? */
function hasAkmShow(trace, runResult, goldRef) {
    const escaped = goldRef.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // CLI form, exact ref. An optional origin prefix is tolerated so
    // `akm show team//skill:foo` still counts as the gold.
    const cliRe = new RegExp(`\\bakm\\s+show\\s+["']?(?:[\\w-]+//)?${escaped}(?:\\b|\\W)`);
    if (cliRe.test(trace)) {
        return true;
    }
    // Tool-call JSON form: `"args":["show","skill:foo"]`.
    if (trace.includes(`"show"`) && trace.includes(goldRef)) {
        return true;
    }
    // Event-stream forms: the ref may sit on the event itself or under
    // `metadata.ref`. Only show/load/tool_call events count — a `feedback`
    // event mentioning the ref doesn't mean the agent loaded it this run.
    const loadLike = (event) => event.eventType === "show" ||
        event.eventType === "load" ||
        event.eventType === "tool_call";
    for (const event of runResult.events) {
        if (!loadLike(event)) {
            continue;
        }
        if (typeof event.ref === "string" && refsMatch(event.ref, goldRef)) {
            return true;
        }
        const meta = event.metadata;
        if (meta &&
            typeof meta === "object" &&
            typeof meta.ref === "string" &&
            refsMatch(meta.ref, goldRef)) {
            return true;
        }
    }
    return false;
}
1329
/**
 * Did the agent invoke `akm show <ref>` for some ref OTHER than `goldRef`?
 *
 * @param {string} trace     capped stdout scan text from `collectTrace`
 * @param {object} runResult run record whose `events[]` are also inspected
 * @param {string} goldRef   the task's gold asset ref
 * @returns {boolean}
 */
function hasAkmShowOtherRef(trace, runResult, goldRef) {
    // CLI form: capture the first token after `akm show` and reject when it
    // matches the gold. Tokens starting with `-` are option flags (e.g.
    // `akm show --json skill:foo`), not refs — the previous code counted
    // them as "some other ref", which could misclassify a gold-only run as
    // `loaded_wrong`.
    const cliRe = /\bakm\s+show\s+["']?([^\s"'`]+)/g;
    for (let m = cliRe.exec(trace); m !== null; m = cliRe.exec(trace)) {
        if (m[1].startsWith("-"))
            continue; // option flag, not a ref
        if (!refsMatch(m[1], goldRef))
            return true;
    }
    // Tool-call JSON form: `"args":["show","..."]`. Best-effort scan.
    const jsonRe = /\["show",\s*"([^"]+)"/g;
    for (let m = jsonRe.exec(trace); m !== null; m = jsonRe.exec(trace)) {
        if (!refsMatch(m[1], goldRef))
            return true;
    }
    // Event-stream form: only show/load/tool_call events count.
    for (const event of runResult.events) {
        if (event.eventType !== "show" &&
            event.eventType !== "load" &&
            event.eventType !== "tool_call")
            continue;
        if (typeof event.ref === "string" && !refsMatch(event.ref, goldRef))
            return true;
        const meta = event.metadata;
        if (meta && typeof meta === "object") {
            if (typeof meta.ref === "string" && !refsMatch(meta.ref, goldRef))
                return true;
        }
    }
    return false;
}
1365
/**
 * Conservative heuristic for the `loaded_ignored` branch. Without an LLM we
 * cannot reliably decide whether an arbitrary action contradicts arbitrary
 * asset content, so this only fires when the verifier's own diagnostic
 * explicitly flags the gold-asset guidance as ignored.
 *
 * The marker strings are deterministic — they come from `runVerifier` and
 * the per-task `verify.sh` scripts; tasks wanting to surface this label
 * emit one of the agreed-upon markers below.
 */
function verifierIndicatesIgnored(verifierStdout) {
    if (!verifierStdout) {
        return false;
    }
    const markers = [
        "ignored gold guidance",
        "guidance ignored",
        "did not follow loaded asset",
        "contradicts loaded asset",
    ];
    const lower = verifierStdout.toLowerCase();
    return markers.some((marker) => lower.includes(marker));
}
1384
/** Cap on the number of result refs we extract per `akm search` invocation. */
const TOP_K = 10;
/**
 * Extract the gold rank for every `akm search` invocation in a run.
 *
 * Scans `runResult.verifierStdout` (the captured agent stdout, including its
 * tool-call trace) line by line. Each `akm search` opens a result block; a
 * different `akm <verb>` — or end of input — closes it. Only the first
 * TOP_K refs of each block are considered; `rankOfGold` is the gold ref's
 * 1-based position there, or `null` when absent.
 *
 * Pure function: never reads from disk and never mutates inputs. When
 * `goldRef` is undefined the function returns `[]` — ranks are only
 * attributed to tasks that actually have a gold asset.
 */
export function extractGoldRanks(runResult, goldRef) {
    if (!goldRef) {
        return [];
    }
    const haystack = runResult.verifierStdout;
    if (!haystack) {
        return [];
    }
    const searches = [];
    // Close out a search block: stamp its gold rank and record it.
    const finish = (block) => {
        block.rankOfGold = computeRank(block.results, goldRef);
        searches.push(block);
    };
    // `akm search <query>` — the query may be quoted or bare; trailing
    // `--flags` are excluded from the capture.
    const searchInvocationRe = /\bakm\s+search\s+(.+?)(?:\s+--|$)/;
    // Any other `akm <verb>` terminates the active result block.
    const akmInvocationRe = /\bakm\s+(\w+)/;
    let active = null;
    for (const rawLine of haystack.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (!line) {
            continue;
        }
        const invocation = line.match(searchInvocationRe);
        if (invocation) {
            // A new search flushes any previously active block.
            if (active) {
                finish(active);
            }
            active = {
                query: stripQuotes(invocation[1].trim()),
                results: [],
                rankOfGold: null,
            };
            // Some traces inline the JSON result on the same line.
            collectRefsFromLine(line, active.results);
            continue;
        }
        if (!active) {
            continue;
        }
        const verb = line.match(akmInvocationRe);
        if (verb && verb[1] !== "search") {
            // A non-search akm invocation closes the active block.
            finish(active);
            active = null;
            continue;
        }
        collectRefsFromLine(line, active.results);
    }
    if (active) {
        finish(active);
    }
    return searches;
}
1452
/** Trim one matching pair of leading/trailing single or double quotes. */
function stripQuotes(s) {
    const wrapped = s.length >= 2 &&
        ((s.startsWith('"') && s.endsWith('"')) ||
            (s.startsWith("'") && s.endsWith("'")));
    return wrapped ? s.slice(1, -1) : s;
}
1463
/**
 * Pull asset refs from a single line into `out`, stopping at TOP_K results
 * to mirror the spec's top-10 cutoff. Understands the JSON form
 * (`"ref":"skill:foo"`, possibly several per line) and the plain-text form
 * (a line beginning `ref:` after trimming) — the latter restriction avoids
 * treating every `:` in arbitrary stdout as a ref.
 */
function collectRefsFromLine(line, out) {
    // JSON form — multiple matches per line possible.
    const jsonRe = /"ref"\s*:\s*"([^"]+)"/g;
    for (let m = jsonRe.exec(line); m !== null; m = jsonRe.exec(line)) {
        if (out.length >= TOP_K) {
            return;
        }
        out.push(m[1]);
    }
    if (out.length >= TOP_K) {
        return;
    }
    // Plain text form: ` ref: skill:foo`.
    const textMatch = line.match(/^ref:\s*([^\s,]+)/);
    if (textMatch) {
        out.push(textMatch[1]);
    }
}
1490
/**
 * 1-based rank of `goldRef` within the first TOP_K entries of `results`,
 * or `null` when absent. Uses `matchesGold` for prefix-tolerant matching
 * so `team//skill:foo` counts as `skill:foo` (mirrors trajectory parser).
 */
function computeRank(results, goldRef) {
    const limit = Math.min(TOP_K, results.length);
    let rank = null;
    for (let i = 0; i < limit && rank === null; i += 1) {
        if (matchesGold(results[i], goldRef)) {
            rank = i + 1;
        }
    }
    return rank;
}
1503
/** Exact match, `<origin>//`-prefixed match, or strict `/`-extension of `gold`. */
function matchesGold(candidate, gold) {
    if (candidate === gold) {
        return true;
    }
    return candidate.endsWith(`//${gold}`) || candidate.startsWith(`${gold}/`);
}
1512
/**
 * Aggregate gold-rank records across all akm-arm runs in the corpus.
 *
 * Operates on `report.goldRankRecords`, populated by the runner per
 * (task, arm, seed). When no record contains a search, every rate collapses
 * to 0, the percentiles are `null`, and `passRateByRank` is empty — the
 * renderer turns that into a single "(N/A)" sentence.
 */
export function computeSearchBridge(report) {
    const records = report.goldRankRecords ?? [];
    // Histogram + percentile inputs across every observed search.
    const histogram = emptyHistogram();
    const observedRanks = [];
    let searchCount = 0;
    for (const record of records) {
        for (const search of record.searches) {
            searchCount += 1;
            observedRanks.push(search.rankOfGold);
            const key = search.rankOfGold === null ? "missing" : String(search.rankOfGold);
            histogram[key] = (histogram[key] ?? 0) + 1;
        }
    }
    const noSearches = searchCount === 0;
    const goldAtRank1 = noSearches ? 0 : (histogram["1"] ?? 0) / searchCount;
    const goldMissing = noSearches ? 0 : (histogram.missing ?? 0) / searchCount;
    const goldRankP50 = noSearches ? null : percentile(observedRanks, 50);
    const goldRankP90 = noSearches ? null : percentile(observedRanks, 90);
    // pass_rate_by_rank — bucket each run by the gold rank of the LAST
    // `akm search` it actually ran ("missing" when gold wasn't in that final
    // search's top 10). Runs that never searched are dropped from this
    // slice: it only describes what happened given a search.
    const buckets = new Map();
    for (const record of records) {
        if (record.searches.length === 0) {
            continue;
        }
        const lastSearch = record.searches[record.searches.length - 1];
        const key = lastSearch.rankOfGold === null ? "missing" : String(lastSearch.rankOfGold);
        const slot = buckets.get(key) ?? { passes: 0, total: 0 };
        slot.total += 1;
        if (record.outcome === "pass") {
            slot.passes += 1;
        }
        buckets.set(key, slot);
    }
    const passRateByRank = [];
    for (const rank of histogramKeys()) {
        const slot = buckets.get(rank);
        if (!slot) {
            continue;
        }
        passRateByRank.push({
            rank,
            passRate: slot.total === 0 ? 0 : slot.passes / slot.total,
            runCount: slot.total,
        });
    }
    return {
        goldRankDistribution: histogram,
        goldRankP50,
        goldRankP90,
        goldAtRank1,
        goldMissing,
        passRateByRank,
        runsObserved: records.length,
        searchesObserved: searchCount,
    };
}
1579
/** Ordered keys used for both the histogram and the pass_rate_by_rank table. */
export function histogramKeys() {
    const keys = Array.from({ length: 10 }, (_, i) => String(i + 1));
    keys.push("missing");
    return keys;
}
1583
/** Histogram with every rank bucket (1-10 plus "missing") zeroed. */
function emptyHistogram() {
    return Object.fromEntries(histogramKeys().map((key) => [key, 0]));
}
1589
/**
 * Nearest-rank percentile over a list of ranks. `null` ranks sort as
 * `Infinity` so the missing bucket pushes percentiles up correctly; a
 * result of `Infinity` means the percentile landed in the missing region
 * (the renderer surfaces it as the literal `"missing"` token so downstream
 * JSON consumers never see `Infinity`). Empty input yields `NaN`.
 */
function percentile(ranks, p) {
    if (ranks.length === 0) {
        return Number.NaN;
    }
    const sorted = ranks
        .map((rank) => (rank === null ? Number.POSITIVE_INFINITY : rank))
        .sort((a, b) => a - b);
    // Nearest-rank method: index = ceil(p/100 * N) - 1, clamped to
    // [0, N-1]. Avoids interpolating between Infinity and a finite value.
    const raw = Math.ceil((p / 100) * sorted.length) - 1;
    const idx = Math.min(sorted.length - 1, Math.max(0, raw));
    return sorted[idx];
}
1605
/**
 * Aggregate proposal-quality metrics from the evolve runner's proposal log.
 * Pure function — does not touch disk and does not invoke any subprocess.
 * Rows are grouped by `assetRef` and sorted lexicographically; both rates
 * fall back to 0 when the log is empty.
 */
export function computeProposalQualityMetrics(proposalLog) {
    const byRef = new Map();
    let totalAccepted = 0;
    let totalLintPass = 0;
    for (const entry of proposalLog) {
        const ref = entry.assetRef;
        let row = byRef.get(ref);
        if (row === undefined) {
            row = { assetRef: ref, proposalCount: 0, lintPassCount: 0, acceptedCount: 0 };
            byRef.set(ref, row);
        }
        row.proposalCount += 1;
        if (entry.lintPass) {
            totalLintPass += 1;
            row.lintPassCount += 1;
        }
        if (entry.decision === "accept") {
            totalAccepted += 1;
            row.acceptedCount += 1;
        }
    }
    const totalProposals = proposalLog.length;
    const rows = [...byRef.values()].sort((a, b) => a.assetRef.localeCompare(b.assetRef));
    return {
        rows,
        totalProposals,
        totalAccepted,
        acceptanceRate: totalProposals === 0 ? 0 : totalAccepted / totalProposals,
        lintPassRate: totalProposals === 0 ? 0 : totalLintPass / totalProposals,
    };
}
1639
/**
 * Compute longitudinal metrics from three §13.3 utility-shaped reports.
 * Each input report is expected to share the same eval-slice corpus, with
 * one arm driving the akm side: `pre` = pre-evolve stash, `post` = evolved
 * stash, `synthetic` = no-stash scratchpad arm.
 *
 * The "arm" read off each report is `aggregateAkm.passRate` — the runners
 * produce the akm arm for all three (synthetic is the akm arm with a
 * stripped stashDir; pre/post differ by stash content). `seedsPerArm` for
 * the degradation threshold comes from the post report's corpus envelope.
 */
export function computeLongitudinalMetrics(preReport, postReport, syntheticReport) {
    const prePassRate = preReport.aggregateAkm.passRate;
    const postPassRate = postReport.aggregateAkm.passRate;
    const syntheticPassRate = syntheticReport.aggregateAkm.passRate;
    // A per-task drop bigger than one seed's worth counts as a degradation.
    const seedsPerArm = Math.max(1, postReport.corpus.seedsPerArm);
    const dropThreshold = 1 / seedsPerArm;
    const preTasks = new Map(preReport.tasks.map((t) => [t.id, t]));
    const postTasks = new Map(postReport.tasks.map((t) => [t.id, t]));
    // One failure-mode label per task — the first listed mode wins (matches
    // the §6.6 by-task aggregate's natural ordering).
    const postFailureByTask = {};
    for (const [taskId, byMode] of Object.entries(postReport.failureModes?.byTask ?? {})) {
        const modes = Object.keys(byMode);
        if (modes.length > 0) {
            postFailureByTask[taskId] = modes[0];
        }
    }
    const degradations = [];
    const allIds = new Set([...preTasks.keys(), ...postTasks.keys()]);
    for (const id of [...allIds].sort()) {
        const preTask = preTasks.get(id);
        const postTask = postTasks.get(id);
        // Degradation is only defined for tasks present in both reports.
        if (!preTask || !postTask) {
            continue;
        }
        const preRate = preTask.akm.passRate;
        const postRate = postTask.akm.passRate;
        if (preRate - postRate > dropThreshold) {
            degradations.push({
                taskId: id,
                prePassRate: preRate,
                postPassRate: postRate,
                delta: postRate - preRate,
                failureMode: postFailureByTask[id] ?? null,
            });
        }
    }
    return {
        improvementSlope: postPassRate - prePassRate,
        overSyntheticLift: postPassRate - syntheticPassRate,
        degradationCount: degradations.length,
        degradations,
        prePassRate,
        postPassRate,
        syntheticPassRate,
    };
}
1706
/** Threshold above `pass_rate[0]` that defines "improvement" for §6.4. */
export const LEARNING_IMPROVEMENT_THRESHOLD = 0.05;
/**
 * §6.4 learning-curve summary over a list of episode records.
 *
 * Episodes are stable-sorted by `episode_index` (defensive against
 * unordered inputs) and per-episode deltas are recomputed so the contract
 * holds regardless of what the caller stamped on the input records.
 */
export function computeLearningCurve(episodes) {
    const ordered = [...episodes].sort((a, b) => a.episode_index - b.episode_index);
    const normalised = ordered.map((episode, i) => ({
        ...episode,
        delta_from_previous_episode: i === 0 ? 0 : episode.pass_rate - ordered[i - 1].pass_rate,
    }));
    const passRateByEpisode = normalised.map((episode) => episode.pass_rate);
    const n = normalised.length;
    // Least-squares slope of pass_rate over episode_index:
    // sum((xi - x̄)(yi - ȳ)) / sum((xi - x̄)²). With fewer than two points
    // (or zero x-variance) there is no observable trend — report 0, not NaN.
    let learningSlope = 0;
    if (n >= 2) {
        const xs = normalised.map((episode) => episode.episode_index);
        const xMean = xs.reduce((sum, v) => sum + v, 0) / n;
        const yMean = passRateByEpisode.reduce((sum, v) => sum + v, 0) / n;
        let numerator = 0;
        let denominator = 0;
        for (let i = 0; i < n; i += 1) {
            const dx = xs[i] - xMean;
            numerator += dx * (passRateByEpisode[i] - yMean);
            denominator += dx * dx;
        }
        if (denominator !== 0) {
            learningSlope = numerator / denominator;
        }
    }
    // time_to_improvement: smallest episode_index whose pass rate clears
    // baseline + threshold. Episode 0 itself is excluded — improvement is
    // only meaningful relative to the baseline.
    let timeToImprovement = null;
    if (n >= 2) {
        const baseline = passRateByEpisode[0];
        for (let i = 1; i < n; i += 1) {
            if (passRateByEpisode[i] > baseline + LEARNING_IMPROVEMENT_THRESHOLD) {
                timeToImprovement = normalised[i].episode_index;
                break;
            }
        }
    }
    return {
        episodes: normalised,
        pass_rate_by_episode: passRateByEpisode,
        learning_slope: learningSlope,
        time_to_improvement: timeToImprovement,
    };
}
1758
/**
 * Compute the §6.8 feedback-signal integrity confusion matrix.
 *
 * Pure function — touches no disk and spawns no subprocess. Feedback events
 * are joined to runs by `(taskId, seed)` so each event is attributed to the
 * run that produced it, not to a later run that happens to reference the
 * same gold ref. A per-asset row aggregates across every run that sent
 * feedback for that ref, but each individual event's matrix cell is decided
 * by its own run's outcome.
 *
 * NaN-safety: a per-asset row with zero labelled events (unreachable through
 * this function — every row comes from at least one feedback entry — but
 * defended against future callers passing curated subsets) reports all three
 * rates as `null`. Per asset, `false_positive_rate` is `null` whenever
 * `FP+TN === 0` even if `FN+TP > 0`, and vice versa.
 */
export function computeFeedbackIntegrity(input) {
    const runs = input.phase1.akmRuns ?? [];
    // (taskId, seed) → outcome lookup for O(1) joins. Duplicate keys should
    // never occur (the runner emits unique seeds per task) but if they do,
    // the first run wins.
    const outcomeByRunKey = new Map();
    for (const run of runs) {
        const runKey = `${run.taskId}::${run.seed}`;
        if (!outcomeByRunKey.has(runKey))
            outcomeByRunKey.set(runKey, run.outcome);
    }
    const rowsByRef = new Map();
    const totals = { truePositive: 0, falsePositive: 0, trueNegative: 0, falseNegative: 0 };
    // Every (taskId, seed) that had feedback DISPATCHED counts toward
    // coverage, even when `ok === false`: the operator wanted feedback and
    // the CLI failed, which is still a covered run for §6.8 (the failure is
    // surfaced in the warnings list elsewhere).
    const coveredRunKeys = new Set();
    for (const fb of input.feedbackLog) {
        const runKey = `${fb.taskId}::${fb.seed}`;
        coveredRunKeys.add(runKey);
        // Failed dispatches never label a matrix cell.
        if (!fb.ok)
            continue;
        const outcome = outcomeByRunKey.get(runKey);
        // Unknown run: defensive drop.
        if (outcome === undefined)
            continue;
        // harness_error runs are unlabelled — the bench skips dispatching
        // feedback for them, and a fake-test injection must not mislabel
        // the matrix.
        if (outcome === "harness_error")
            continue;
        const passed = outcome === "pass";
        let row = rowsByRef.get(fb.goldRef);
        if (row === undefined) {
            row = { truePositive: 0, falsePositive: 0, trueNegative: 0, falseNegative: 0 };
            rowsByRef.set(fb.goldRef, row);
        }
        // Map (signal, outcome) onto the matrix cell; unknown signals leave
        // the (already-created) row untouched.
        let cell = null;
        if (fb.signal === "positive")
            cell = passed ? "truePositive" : "falsePositive";
        else if (fb.signal === "negative")
            cell = passed ? "falseNegative" : "trueNegative";
        if (cell !== null) {
            row[cell] += 1;
            totals[cell] += 1;
        }
    }
    const labelled = totals.truePositive + totals.falsePositive + totals.trueNegative + totals.falseNegative;
    const fpDenomAll = totals.falsePositive + totals.trueNegative;
    const fnDenomAll = totals.falseNegative + totals.truePositive;
    const aggregate = {
        truePositive: totals.truePositive,
        falsePositive: totals.falsePositive,
        trueNegative: totals.trueNegative,
        falseNegative: totals.falseNegative,
        feedback_agreement: labelled === 0 ? 0 : (totals.truePositive + totals.trueNegative) / labelled,
        false_positive_rate: fpDenomAll === 0 ? 0 : totals.falsePositive / fpDenomAll,
        false_negative_rate: fnDenomAll === 0 ? 0 : totals.falseNegative / fnDenomAll,
        feedback_coverage: runs.length === 0 ? 0 : coveredRunKeys.size / runs.length,
    };
    // Rows sorted by ref for byte-stable JSON output.
    const perAsset = [...rowsByRef.entries()]
        .sort(([refA], [refB]) => refA.localeCompare(refB))
        .map(([ref, row]) => {
            const labelledForRef = row.truePositive + row.falsePositive + row.trueNegative + row.falseNegative;
            const fpDenom = row.falsePositive + row.trueNegative;
            const fnDenom = row.falseNegative + row.truePositive;
            return {
                ref,
                truePositive: row.truePositive,
                falsePositive: row.falsePositive,
                trueNegative: row.trueNegative,
                falseNegative: row.falseNegative,
                feedback_agreement: labelledForRef === 0 ? null : (row.truePositive + row.trueNegative) / labelledForRef,
                false_positive_rate: fpDenom === 0 ? null : row.falsePositive / fpDenom,
                false_negative_rate: fnDenom === 0 ? null : row.falseNegative / fnDenom,
            };
        });
    return { aggregate, perAsset };
}
1865
/**
 * Bucket per-task entries by `pickKey(entry)` and compute one summary row
 * per bucket (pass rates for both arms, their delta, negative-transfer
 * count, and mean workflow compliance where available). Entries whose key
 * is falsy are skipped. Rows come back sorted by category name.
 */
function aggregateByKey(entries, pickKey) {
    const byCategory = new Map();
    for (const entry of entries) {
        const category = pickKey(entry);
        if (!category)
            continue;
        const bucket = byCategory.get(category);
        if (bucket)
            bucket.push(entry);
        else
            byCategory.set(category, [entry]);
    }
    const rows = [];
    for (const [category, group] of byCategory) {
        const taskCount = group.length;
        let baselineTotal = 0;
        let akmTotal = 0;
        let regressions = 0;
        let complianceTotal = 0;
        let complianceN = 0;
        for (const task of group) {
            baselineTotal += task.noakm.passRate;
            akmTotal += task.akm.passRate;
            // A task whose AKM arm underperforms its baseline arm is a
            // negative-transfer case.
            if (task.akm.passRate < task.noakm.passRate)
                regressions += 1;
            const compliance = task.workflowCompliance;
            // Only finite numeric compliance values enter the mean.
            if (typeof compliance === "number" && Number.isFinite(compliance)) {
                complianceTotal += compliance;
                complianceN += 1;
            }
        }
        rows.push({
            category,
            taskCount,
            passRateNoakm: baselineTotal / taskCount,
            passRateAkm: akmTotal / taskCount,
            passRateDelta: akmTotal / taskCount - baselineTotal / taskCount,
            negativeTransferCount: regressions,
            workflowCompliance: complianceN === 0 ? null : complianceTotal / complianceN,
        });
    }
    rows.sort((a, b) => a.category.localeCompare(b.category));
    return rows;
}
1909
/**
 * Aggregate per-task entries by `memoryAbility` (#262). Entries without the
 * tag are dropped so the report only shows categories with explicit
 * coverage. Rows are sorted by category for byte-stable JSON.
 *
 * The closed set of memory-ability values is exported as
 * {@link MEMORY_ABILITY_VALUES} from `corpus.ts`.
 */
export function aggregateByMemoryAbility(entries) {
    return aggregateByKey(entries, (entry) => entry.memoryAbility);
}
1920
/**
 * Aggregate per-task entries by `taskFamily` (#262). Entries without the
 * tag are dropped. `taskFamily` follows the `<domain>/<short-name>`
 * grammar — tasks in one family are expected to transfer knowledge to each
 * other. Rows are sorted by category for byte-stable JSON.
 */
export function aggregateByTaskFamily(entries) {
    return aggregateByKey(entries, (entry) => entry.taskFamily);
}
1929
/**
 * Tally how many corpus tasks carry each `memoryAbility` and `taskFamily`
 * tag. Every known ability (from MEMORY_ABILITY_VALUES) appears in the
 * output even at zero; task families appear only when observed, with an
 * `untagged` entry appended only when at least one task lacks a family.
 */
export function computeCorpusCoverage(tasks) {
    // Seed `untagged` first, then all known abilities, so key order (and
    // therefore serialized JSON) is stable.
    const memoryAbilityCounts = { untagged: 0 };
    for (const ability of MEMORY_ABILITY_VALUES)
        memoryAbilityCounts[ability] = 0;
    const taskFamilyCounts = {};
    let familiesUntagged = 0;
    for (const task of tasks) {
        const ability = task.memoryAbility;
        if (ability)
            memoryAbilityCounts[ability] = (memoryAbilityCounts[ability] ?? 0) + 1;
        else
            memoryAbilityCounts.untagged += 1;
        const family = task.taskFamily;
        if (family)
            taskFamilyCounts[family] = (taskFamilyCounts[family] ?? 0) + 1;
        else
            familiesUntagged += 1;
    }
    if (familiesUntagged > 0)
        taskFamilyCounts.untagged = familiesUntagged;
    return {
        totalTasks: tasks.length,
        memoryAbilityCounts,
        taskFamilyCounts,
    };
}
1960
/**
 * Event verbs counted as "AKM tool calls" for `totalToolCalls`.
 *
 * Kept deliberately small: every verb in here MUST be a user-initiated CLI
 * invocation, never a background bookkeeping event. Adding a verb is
 * additive and only changes `totalToolCalls`.
 */
export const AKM_TOOL_CALL_TYPES = new Set(["akm_search", "akm_show", "akm_feedback"]);
1971
/**
 * Compute per-run AKM overhead records by replaying #254's normalised trace.
 *
 * Pure function: never mutates `runs` and never reads disk. The optional
 * `options.taskMetadata` lookup is used only to classify loads as relevant
 * or irrelevant and to compute `timeToFirstCorrectAssetMs`.
 *
 * Output length and order match `runs`. A run whose trace has no AKM
 * events still yields a record with zero counts and null timings.
 */
export function computeAkmOverhead(runs, options = {}) {
    return runs.map((run) => perRun(run, options.taskMetadata));
}
1989
/**
 * Build one AKM overhead record for a single run from its normalised trace.
 *
 * `taskMetadata` (optional Map keyed by taskId) supplies `goldRef` and
 * `expectedTransferFrom`, which together define the "known relevant" asset
 * set; without metadata, relevance cannot be judged and
 * `irrelevantAssetsLoadedCount` is null rather than zero.
 */
function perRun(run, taskMetadata) {
    const { events } = normalizeRunToTrace(run);
    let searchCount = 0;
    let showCount = 0;
    let feedbackCount = 0;
    const loadedRefs = new Set();
    for (const event of events) {
        switch (event.type) {
            case "akm_search":
                searchCount += 1;
                break;
            case "akm_show":
                showCount += 1;
                // Only non-empty string refs count as a distinct load.
                if (typeof event.assetRef === "string" && event.assetRef.length > 0)
                    loadedRefs.add(event.assetRef);
                break;
            case "akm_feedback":
                feedbackCount += 1;
                break;
        }
    }
    // Run-start anchor: earliest parseable ts anywhere in the trace. Using
    // the trace (not RunResult.events directly) lets harness lifecycle
    // markers, when present, anchor stdout-derived events that lack their
    // own ts.
    const runStartMs = earliestEventMs(events);
    const timeToFirstSearchMs = computeFirstEventOffsetMs(events, runStartMs, (event) => event.type === "akm_search");
    const meta = taskMetadata?.get(run.taskId);
    const goldRef = meta?.goldRef;
    const relevantRefs = new Set();
    if (typeof goldRef === "string" && goldRef.length > 0)
        relevantRefs.add(goldRef);
    for (const ref of meta?.expectedTransferFrom ?? []) {
        if (typeof ref === "string" && ref.length > 0)
            relevantRefs.add(ref);
    }
    // No metadata: relevant vs irrelevant is undecidable — surface null.
    let irrelevantAssetsLoadedCount = null;
    if (meta) {
        let offTarget = 0;
        for (const ref of loadedRefs) {
            if (!relevantRefs.has(ref))
                offTarget += 1;
        }
        irrelevantAssetsLoadedCount = offTarget;
    }
    let timeToFirstCorrectAssetMs = null;
    if (typeof goldRef === "string" && goldRef.length > 0) {
        timeToFirstCorrectAssetMs = computeFirstEventOffsetMs(events, runStartMs, (event) => event.type === "akm_show" && event.assetRef === goldRef);
    }
    return {
        taskId: run.taskId,
        arm: run.arm,
        seed: run.seed,
        outcome: run.outcome,
        searchCount,
        showCount,
        feedbackCount,
        totalToolCalls: searchCount + showCount + feedbackCount,
        assetsLoadedCount: loadedRefs.size,
        irrelevantAssetsLoadedCount,
        timeToFirstSearchMs,
        timeToFirstCorrectAssetMs,
        // Byte sizes are not yet wired through the trace (#254 does not
        // capture payload sizes). Callers MUST treat null as "unavailable",
        // not zero.
        contextBytesLoaded: null,
        assetBytesLoaded: null,
    };
}
2063
/**
 * Aggregate per-run AKM overhead records into the corpus-wide block (#263).
 *
 * Pure: never mutates `perRun` or `rawRuns`. When `perRun` is empty, a
 * zero/null envelope is returned so callers can render a "no AKM activity"
 * section without branching. `passingRuns === 0` always implies
 * `toolCallsPerSuccess === null` and `costPerSuccess === null`.
 *
 * @param perRun  records produced by `computeAkmOverhead`; order irrelevant.
 * @param rawRuns optional raw run results carrying `tokens` /
 *                `tokenMeasurement`, joined to `perRun` by
 *                (taskId, arm, seed) for cost-per-success.
 */
export function aggregateAkmOverhead(perRun, rawRuns = []) {
    const n = perRun.length;
    if (n === 0) {
        return {
            totalRuns: 0,
            passingRuns: 0,
            meanSearchCount: 0,
            meanShowCount: 0,
            meanFeedbackCount: 0,
            meanToolCalls: 0,
            meanAssetsLoaded: 0,
            meanIrrelevantAssetsLoaded: null,
            meanTimeToFirstSearchMs: null,
            meanTimeToFirstCorrectAssetMs: null,
            meanContextBytesLoaded: null,
            meanAssetBytesLoaded: null,
            totalToolCalls: 0,
            toolCallsPerSuccess: null,
            costPerSuccess: null,
        };
    }
    let searchSum = 0;
    let showSum = 0;
    let feedbackSum = 0;
    let toolCallsSum = 0;
    let assetsSum = 0;
    let irrelevantSum = 0;
    let irrelevantCount = 0;
    let firstSearchSum = 0;
    let firstSearchCount = 0;
    let firstCorrectSum = 0;
    let firstCorrectCount = 0;
    let contextBytesSum = 0;
    let contextBytesCount = 0;
    let assetBytesSum = 0;
    let assetBytesCount = 0;
    // Join key for (taskId, arm, seed). The `::` delimiter prevents
    // ambiguous concatenations (e.g. taskId "a" + arm "1b" colliding with
    // taskId "a1" + arm "b") from silently matching the wrong raw run; it
    // mirrors the `taskId::seed` keying used by computeFeedbackIntegrity.
    const runKey = (r) => `${r.taskId}::${r.arm}::${r.seed}`;
    // Quick token-measurement lookup off `rawRuns`, so the cost-per-success
    // calc can honour the parsed/missing/unsupported distinction without
    // forcing callers to project tokens onto AkmOverheadPerRun.
    const rawByKey = new Map();
    for (const r of rawRuns) {
        rawByKey.set(runKey(r), r);
    }
    let passingRuns = 0;
    let parsedPassTokenSum = 0;
    let parsedPassCount = 0;
    let anyPassMissingMeasurement = false;
    for (const row of perRun) {
        searchSum += row.searchCount;
        showSum += row.showCount;
        feedbackSum += row.feedbackCount;
        toolCallsSum += row.totalToolCalls;
        assetsSum += row.assetsLoadedCount;
        // Null-valued optional fields mean "unavailable" and are excluded
        // from both sum and denominator of their means.
        if (row.irrelevantAssetsLoadedCount !== null) {
            irrelevantSum += row.irrelevantAssetsLoadedCount;
            irrelevantCount += 1;
        }
        if (row.timeToFirstSearchMs !== null) {
            firstSearchSum += row.timeToFirstSearchMs;
            firstSearchCount += 1;
        }
        if (row.timeToFirstCorrectAssetMs !== null) {
            firstCorrectSum += row.timeToFirstCorrectAssetMs;
            firstCorrectCount += 1;
        }
        if (row.contextBytesLoaded !== null) {
            contextBytesSum += row.contextBytesLoaded;
            contextBytesCount += 1;
        }
        if (row.assetBytesLoaded !== null) {
            assetBytesSum += row.assetBytesLoaded;
            assetBytesCount += 1;
        }
        if (row.outcome === "pass") {
            passingRuns += 1;
            const raw = rawByKey.get(runKey(row));
            // Treat absent tokenMeasurement as `parsed` for backward compat
            // with older artefacts (mirrors `isMeasured` behaviour above).
            const measurement = raw?.tokenMeasurement ?? "parsed";
            if (raw && measurement === "parsed") {
                parsedPassTokenSum += raw.tokens.input + raw.tokens.output;
                parsedPassCount += 1;
            }
            else {
                // Either no matching raw run was supplied, or its tokens
                // were not parsed — cost-per-success cannot be honoured.
                anyPassMissingMeasurement = true;
            }
        }
    }
    const toolCallsPerSuccess = passingRuns === 0 ? null : toolCallsSum / passingRuns;
    // Cost-per-success: null unless EVERY passing run has parsed
    // measurement. Mixed measurement statuses cannot be averaged honestly
    // (issue #252).
    const costPerSuccess = passingRuns === 0 || anyPassMissingMeasurement || parsedPassCount === 0
        ? null
        : parsedPassTokenSum / parsedPassCount;
    return {
        totalRuns: n,
        passingRuns,
        meanSearchCount: searchSum / n,
        meanShowCount: showSum / n,
        meanFeedbackCount: feedbackSum / n,
        meanToolCalls: toolCallsSum / n,
        meanAssetsLoaded: assetsSum / n,
        meanIrrelevantAssetsLoaded: irrelevantCount === 0 ? null : irrelevantSum / irrelevantCount,
        meanTimeToFirstSearchMs: firstSearchCount === 0 ? null : firstSearchSum / firstSearchCount,
        meanTimeToFirstCorrectAssetMs: firstCorrectCount === 0 ? null : firstCorrectSum / firstCorrectCount,
        meanContextBytesLoaded: contextBytesCount === 0 ? null : contextBytesSum / contextBytesCount,
        meanAssetBytesLoaded: assetBytesCount === 0 ? null : assetBytesSum / assetBytesCount,
        totalToolCalls: toolCallsSum,
        toolCallsPerSuccess,
        costPerSuccess,
    };
}
2187
/**
 * Bucket a workflow check status onto pass / non-pass for reliability.
 *
 * Reliability is strictly pass-or-not (issue #258): everything other than
 * `pass` (`partial`, `fail`, `harness_error`, ...) buckets to `non_pass`.
 * `not_applicable` maps to `null` so callers can drop the whole
 * (task, seed) pair — it contributes to neither numerator nor denominator.
 */
function bucketReliabilityStatus(status) {
    switch (status) {
        case "not_applicable":
            return null;
        case "pass":
            return "pass";
        default:
            return "non_pass";
    }
}
2203
/**
 * Compute workflow reliability metrics (`pass@k` and `pass^k`) per workflow
 * and corpus-wide from a flat list of `WorkflowCheckResult`.
 *
 * Methodology (per #258 review addendum):
 * 1. Drop `not_applicable` checks entirely.
 * 2. Group the rest by `(workflow_id, task_id)`, collapsing seeds to the
 *    list of observed statuses.
 * 3. `pass_at_k` per task = 1 iff any seed is `pass`.
 * 4. `pass_all_k` per task = 1 iff every seed is `pass`.
 * 5. A per-workflow row averages over its tasks.
 * 6. The corpus rollup weights every (workflow, task) group equally.
 *
 * Pure: never mutates `checks`. Returns a stable shape for empty input.
 */
export function computeWorkflowReliability(checks) {
    // Map<workflowId, Map<taskId, status[]>> — Map iteration is insertion
    // order, so output is deterministic for deterministic input.
    const byWorkflowTask = new Map();
    for (const check of checks) {
        // `not_applicable` contributes to neither numerator nor denominator.
        if (check.status === "not_applicable")
            continue;
        let taskMap = byWorkflowTask.get(check.workflowId);
        if (taskMap === undefined) {
            taskMap = new Map();
            byWorkflowTask.set(check.workflowId, taskMap);
        }
        const statuses = taskMap.get(check.taskId);
        if (statuses === undefined)
            taskMap.set(check.taskId, [check.status]);
        else
            statuses.push(check.status);
    }
    const byWorkflow = {};
    let corpusAnySum = 0;
    let corpusAllSum = 0;
    let corpusGroups = 0;
    const corpusTaskIds = new Set();
    for (const [workflowId, taskMap] of byWorkflowTask) {
        let anySum = 0;
        let allSum = 0;
        // k is reported as the largest seed count observed for this workflow.
        let maxSeeds = 0;
        for (const [taskId, statuses] of taskMap) {
            if (statuses.length > maxSeeds)
                maxSeeds = statuses.length;
            const anyPass = statuses.some((s) => s === "pass");
            const allPass = statuses.every((s) => s === "pass");
            if (anyPass)
                anySum += 1;
            if (allPass)
                allSum += 1;
            corpusAnySum += anyPass ? 1 : 0;
            corpusAllSum += allPass ? 1 : 0;
            corpusGroups += 1;
            corpusTaskIds.add(taskId);
        }
        const taskCount = taskMap.size;
        byWorkflow[workflowId] = {
            workflow_id: workflowId,
            pass_at_k: taskCount === 0 ? 0 : anySum / taskCount,
            pass_all_k: taskCount === 0 ? 0 : allSum / taskCount,
            tasks: taskCount,
            k: maxSeeds,
        };
    }
    const corpus = {
        pass_at_k: corpusGroups === 0 ? 0 : corpusAnySum / corpusGroups,
        pass_all_k: corpusGroups === 0 ? 0 : corpusAllSum / corpusGroups,
        groups: corpusGroups,
        tasks: corpusTaskIds.size,
    };
    return { byWorkflow, corpus };
}
2277
/** Earliest parseable ts (ms epoch) among events; null when none. */
function earliestEventMs(events) {
    let min = null;
    for (const event of events) {
        // Skip events with a missing or unparseable ts.
        if (typeof event.ts !== "string" || event.ts.length === 0)
            continue;
        const ms = Date.parse(event.ts);
        if (Number.isNaN(ms))
            continue;
        if (min === null || ms < min)
            min = ms;
    }
    return min;
}
2289
/**
 * Offset (ms) of the first event matching `predicate` relative to
 * `runStartMs`. Returns `null` when `runStartMs` is null, when no matching
 * event has a parseable ts, or when the offset would be negative — a clock
 * inversion we refuse to silently coerce to zero.
 */
function computeFirstEventOffsetMs(events, runStartMs, predicate) {
    if (runStartMs === null)
        return null;
    for (const event of events) {
        if (!predicate(event))
            continue;
        // Matching events without a parseable ts are skipped, not fatal.
        if (typeof event.ts !== "string" || event.ts.length === 0)
            continue;
        const ms = Date.parse(event.ts);
        if (Number.isNaN(ms))
            continue;
        const offset = ms - runStartMs;
        return offset < 0 ? null : offset;
    }
    return null;
}
2311
/** Parse an ISO ts to ms-epoch; null when missing or unparseable. */
function parseTsToMs(ts) {
    if (typeof ts !== "string" || ts.length === 0)
        return null;
    const parsed = Date.parse(ts);
    return Number.isNaN(parsed) ? null : parsed;
}