akm-cli 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/dist/{src/cli.js → cli.js} +22 -8
  3. package/dist/{src/commands → commands}/installed-stashes.js +1 -1
  4. package/dist/{src/commands → commands}/source-add.js +1 -1
  5. package/dist/{src/core → core}/common.js +16 -1
  6. package/dist/{src/core → core}/config.js +5 -2
  7. package/dist/{src/indexer → indexer}/db-search.js +16 -1
  8. package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
  9. package/dist/{src/indexer → indexer}/indexer.js +27 -11
  10. package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
  11. package/dist/{src/indexer → indexer}/search-source.js +1 -1
  12. package/dist/{src/llm → llm}/client.js +61 -1
  13. package/dist/{src/llm → llm}/embedder.js +8 -5
  14. package/dist/{src/llm → llm}/embedders/local.js +8 -2
  15. package/dist/{src/llm → llm}/embedders/remote.js +4 -2
  16. package/dist/{src/llm → llm}/graph-extract.js +4 -4
  17. package/dist/llm/memory-infer.js +114 -0
  18. package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
  19. package/dist/{src/output → output}/cli-hints.js +2 -0
  20. package/dist/{src/setup → setup}/setup.js +30 -20
  21. package/dist/sources/providers/website.js +27 -0
  22. package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
  23. package/docs/README.md +7 -0
  24. package/docs/migration/release-notes/0.7.0.md +14 -0
  25. package/package.json +11 -8
  26. package/dist/src/llm/memory-infer.js +0 -86
  27. package/dist/tests/add-website-source.test.js +0 -119
  28. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  29. package/dist/tests/agent/agent-config.test.js +0 -221
  30. package/dist/tests/agent/agent-detect.test.js +0 -100
  31. package/dist/tests/agent/agent-spawn.test.js +0 -234
  32. package/dist/tests/agent-output.test.js +0 -186
  33. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  34. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  35. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  36. package/dist/tests/asset-ref.test.js +0 -192
  37. package/dist/tests/asset-registry.test.js +0 -103
  38. package/dist/tests/asset-spec.test.js +0 -241
  39. package/dist/tests/bench/attribution.test.js +0 -996
  40. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  41. package/dist/tests/bench/cleanup.js +0 -234
  42. package/dist/tests/bench/cleanup.test.js +0 -166
  43. package/dist/tests/bench/cli.js +0 -1018
  44. package/dist/tests/bench/cli.test.js +0 -445
  45. package/dist/tests/bench/compare.test.js +0 -556
  46. package/dist/tests/bench/corpus.js +0 -317
  47. package/dist/tests/bench/corpus.test.js +0 -258
  48. package/dist/tests/bench/doctor.js +0 -525
  49. package/dist/tests/bench/driver.js +0 -401
  50. package/dist/tests/bench/driver.test.js +0 -584
  51. package/dist/tests/bench/environment.js +0 -233
  52. package/dist/tests/bench/environment.test.js +0 -199
  53. package/dist/tests/bench/evolve-metrics.js +0 -179
  54. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  55. package/dist/tests/bench/evolve.js +0 -647
  56. package/dist/tests/bench/evolve.test.js +0 -624
  57. package/dist/tests/bench/failure-modes.test.js +0 -349
  58. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  59. package/dist/tests/bench/leakage.test.js +0 -228
  60. package/dist/tests/bench/learning-curve.test.js +0 -134
  61. package/dist/tests/bench/metrics.js +0 -2395
  62. package/dist/tests/bench/metrics.test.js +0 -1150
  63. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  64. package/dist/tests/bench/opencode-config.js +0 -194
  65. package/dist/tests/bench/opencode-config.test.js +0 -370
  66. package/dist/tests/bench/report.js +0 -1885
  67. package/dist/tests/bench/report.test.js +0 -1038
  68. package/dist/tests/bench/run-config.js +0 -355
  69. package/dist/tests/bench/run-config.test.js +0 -298
  70. package/dist/tests/bench/run-curate-test.js +0 -32
  71. package/dist/tests/bench/run-failing-tasks.js +0 -56
  72. package/dist/tests/bench/run-full-bench.js +0 -51
  73. package/dist/tests/bench/run-items36-targeted.js +0 -69
  74. package/dist/tests/bench/run-nano-quick.js +0 -42
  75. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  76. package/dist/tests/bench/runner.js +0 -699
  77. package/dist/tests/bench/runner.test.js +0 -958
  78. package/dist/tests/bench/search-bridge.test.js +0 -331
  79. package/dist/tests/bench/tmp.js +0 -131
  80. package/dist/tests/bench/trajectory.js +0 -116
  81. package/dist/tests/bench/trajectory.test.js +0 -127
  82. package/dist/tests/bench/verifier.js +0 -114
  83. package/dist/tests/bench/verifier.test.js +0 -118
  84. package/dist/tests/bench/workflow-evaluator.js +0 -557
  85. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  86. package/dist/tests/bench/workflow-spec.js +0 -345
  87. package/dist/tests/bench/workflow-spec.test.js +0 -363
  88. package/dist/tests/bench/workflow-trace.js +0 -472
  89. package/dist/tests/bench/workflow-trace.test.js +0 -254
  90. package/dist/tests/benchmark-search-quality.js +0 -536
  91. package/dist/tests/benchmark-suite.js +0 -1441
  92. package/dist/tests/capture-cli.test.js +0 -112
  93. package/dist/tests/cli-errors.test.js +0 -204
  94. package/dist/tests/commands/events.test.js +0 -370
  95. package/dist/tests/commands/history.test.js +0 -418
  96. package/dist/tests/commands/import.test.js +0 -103
  97. package/dist/tests/commands/proposal-cli.test.js +0 -209
  98. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  99. package/dist/tests/commands/remember.test.js +0 -97
  100. package/dist/tests/commands/scope-flags.test.js +0 -300
  101. package/dist/tests/commands/search.test.js +0 -537
  102. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  103. package/dist/tests/commands/show.test.js +0 -294
  104. package/dist/tests/common.test.js +0 -266
  105. package/dist/tests/completions.test.js +0 -142
  106. package/dist/tests/config-cli.test.js +0 -193
  107. package/dist/tests/config-llm-features.test.js +0 -139
  108. package/dist/tests/config.test.js +0 -569
  109. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  110. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  111. package/dist/tests/contracts/spec-helpers.js +0 -46
  112. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  113. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  114. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  115. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  116. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  117. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  118. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  119. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  120. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  121. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  122. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  123. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  124. package/dist/tests/core/write-source.test.js +0 -366
  125. package/dist/tests/curate-command.test.js +0 -87
  126. package/dist/tests/db-scoring.test.js +0 -201
  127. package/dist/tests/db.test.js +0 -654
  128. package/dist/tests/distill-cli-flag.test.js +0 -208
  129. package/dist/tests/distill.test.js +0 -515
  130. package/dist/tests/docker-install.test.js +0 -120
  131. package/dist/tests/e2e.test.js +0 -1419
  132. package/dist/tests/embedder.test.js +0 -340
  133. package/dist/tests/embedding-model-config.test.js +0 -379
  134. package/dist/tests/feedback-command.test.js +0 -172
  135. package/dist/tests/file-context.test.js +0 -552
  136. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  137. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  138. package/dist/tests/fixtures/stashes/load.js +0 -166
  139. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  140. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  141. package/dist/tests/frontmatter.test.js +0 -190
  142. package/dist/tests/fts-field-weighting.test.js +0 -254
  143. package/dist/tests/fuzzy-search.test.js +0 -230
  144. package/dist/tests/git-provider-clone.test.js +0 -45
  145. package/dist/tests/github.test.js +0 -161
  146. package/dist/tests/graph-boost-ranking.test.js +0 -305
  147. package/dist/tests/graph-extraction.test.js +0 -282
  148. package/dist/tests/helpers/usage-events.js +0 -8
  149. package/dist/tests/index-pass-llm.test.js +0 -161
  150. package/dist/tests/indexer.test.js +0 -570
  151. package/dist/tests/info-command.test.js +0 -166
  152. package/dist/tests/init.test.js +0 -69
  153. package/dist/tests/install-script.test.js +0 -246
  154. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  155. package/dist/tests/issue-36-repro.test.js +0 -304
  156. package/dist/tests/issues-191-194.test.js +0 -160
  157. package/dist/tests/lesson-lint.test.js +0 -111
  158. package/dist/tests/llm-client.test.js +0 -115
  159. package/dist/tests/llm-feature-gate.test.js +0 -151
  160. package/dist/tests/llm.test.js +0 -139
  161. package/dist/tests/lockfile.test.js +0 -216
  162. package/dist/tests/manifest.test.js +0 -205
  163. package/dist/tests/markdown.test.js +0 -126
  164. package/dist/tests/matchers-unit.test.js +0 -189
  165. package/dist/tests/memory-inference.test.js +0 -299
  166. package/dist/tests/merge-scoring.test.js +0 -136
  167. package/dist/tests/metadata.test.js +0 -313
  168. package/dist/tests/migration-help.test.js +0 -89
  169. package/dist/tests/origin-resolve.test.js +0 -124
  170. package/dist/tests/output-baseline.test.js +0 -218
  171. package/dist/tests/output-shapes-unit.test.js +0 -478
  172. package/dist/tests/parallel-search.test.js +0 -272
  173. package/dist/tests/parameter-metadata.test.js +0 -365
  174. package/dist/tests/paths.test.js +0 -177
  175. package/dist/tests/progressive-disclosure.test.js +0 -280
  176. package/dist/tests/proposals.test.js +0 -279
  177. package/dist/tests/proposed-quality.test.js +0 -271
  178. package/dist/tests/provider-registry.test.js +0 -32
  179. package/dist/tests/ranking-regression.test.js +0 -548
  180. package/dist/tests/reflect-propose.test.js +0 -455
  181. package/dist/tests/registry-build-index.test.js +0 -394
  182. package/dist/tests/registry-cli.test.js +0 -290
  183. package/dist/tests/registry-index-v2.test.js +0 -430
  184. package/dist/tests/registry-install.test.js +0 -728
  185. package/dist/tests/registry-providers/parity.test.js +0 -189
  186. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  187. package/dist/tests/registry-providers/static-index.test.js +0 -238
  188. package/dist/tests/registry-resolve.test.js +0 -126
  189. package/dist/tests/registry-search.test.js +0 -923
  190. package/dist/tests/remember-frontmatter.test.js +0 -378
  191. package/dist/tests/remember-unit.test.js +0 -123
  192. package/dist/tests/ripgrep-install.test.js +0 -251
  193. package/dist/tests/ripgrep-resolve.test.js +0 -108
  194. package/dist/tests/ripgrep.test.js +0 -163
  195. package/dist/tests/save-command.test.js +0 -94
  196. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  197. package/dist/tests/scoring-pipeline.test.js +0 -648
  198. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  199. package/dist/tests/self-update.test.js +0 -442
  200. package/dist/tests/semantic-search-e2e.test.js +0 -512
  201. package/dist/tests/semantic-status.test.js +0 -471
  202. package/dist/tests/setup-run.integration.js +0 -877
  203. package/dist/tests/setup-wizard.test.js +0 -198
  204. package/dist/tests/setup.test.js +0 -131
  205. package/dist/tests/source-add.test.js +0 -11
  206. package/dist/tests/source-clone.test.js +0 -254
  207. package/dist/tests/source-manage.test.js +0 -366
  208. package/dist/tests/source-providers/filesystem.test.js +0 -82
  209. package/dist/tests/source-providers/git.test.js +0 -252
  210. package/dist/tests/source-providers/website.test.js +0 -128
  211. package/dist/tests/source-qa-fixes.test.js +0 -286
  212. package/dist/tests/source-registry.test.js +0 -350
  213. package/dist/tests/source-resolve.test.js +0 -100
  214. package/dist/tests/source-source.test.js +0 -281
  215. package/dist/tests/source.test.js +0 -533
  216. package/dist/tests/tar-utils-scan.test.js +0 -73
  217. package/dist/tests/toggle-components.test.js +0 -73
  218. package/dist/tests/usage-telemetry.test.js +0 -265
  219. package/dist/tests/utility-scoring.test.js +0 -558
  220. package/dist/tests/vault-load-error.test.js +0 -78
  221. package/dist/tests/vault-qa-fixes.test.js +0 -194
  222. package/dist/tests/vault.test.js +0 -429
  223. package/dist/tests/vector-search.test.js +0 -608
  224. package/dist/tests/walker.test.js +0 -252
  225. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  226. package/dist/tests/wave2-cluster-d.test.js +0 -180
  227. package/dist/tests/wave2-cluster-e.test.js +0 -179
  228. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  229. package/dist/tests/wiki.test.js +0 -529
  230. package/dist/tests/workflow-cli.test.js +0 -271
  231. package/dist/tests/workflow-markdown.test.js +0 -171
  232. package/dist/tests/workflow-path-escape.test.js +0 -132
  233. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  234. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  235. /package/dist/{src/commands → commands}/completions.js +0 -0
  236. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  237. /package/dist/{src/commands → commands}/curate.js +0 -0
  238. /package/dist/{src/commands → commands}/distill.js +0 -0
  239. /package/dist/{src/commands → commands}/events.js +0 -0
  240. /package/dist/{src/commands → commands}/history.js +0 -0
  241. /package/dist/{src/commands → commands}/info.js +0 -0
  242. /package/dist/{src/commands → commands}/init.js +0 -0
  243. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  244. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  245. /package/dist/{src/commands → commands}/proposal.js +0 -0
  246. /package/dist/{src/commands → commands}/propose.js +0 -0
  247. /package/dist/{src/commands → commands}/reflect.js +0 -0
  248. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  249. /package/dist/{src/commands → commands}/remember.js +0 -0
  250. /package/dist/{src/commands → commands}/search.js +0 -0
  251. /package/dist/{src/commands → commands}/self-update.js +0 -0
  252. /package/dist/{src/commands → commands}/show.js +0 -0
  253. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  254. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  255. /package/dist/{src/commands → commands}/vault.js +0 -0
  256. /package/dist/{src/core → core}/asset-ref.js +0 -0
  257. /package/dist/{src/core → core}/asset-registry.js +0 -0
  258. /package/dist/{src/core → core}/asset-spec.js +0 -0
  259. /package/dist/{src/core → core}/errors.js +0 -0
  260. /package/dist/{src/core → core}/events.js +0 -0
  261. /package/dist/{src/core → core}/frontmatter.js +0 -0
  262. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  263. /package/dist/{src/core → core}/markdown.js +0 -0
  264. /package/dist/{src/core → core}/paths.js +0 -0
  265. /package/dist/{src/core → core}/proposals.js +0 -0
  266. /package/dist/{src/core → core}/warn.js +0 -0
  267. /package/dist/{src/core → core}/write-source.js +0 -0
  268. /package/dist/{src/indexer → indexer}/db.js +0 -0
  269. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  270. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  271. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  272. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  273. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  274. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  275. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  276. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  277. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  278. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  279. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  280. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  281. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  282. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  283. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  284. /package/dist/{src/integrations → integrations}/github.js +0 -0
  285. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  286. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  287. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  288. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  289. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  290. /package/dist/{src/output → output}/context.js +0 -0
  291. /package/dist/{src/output → output}/renderers.js +0 -0
  292. /package/dist/{src/output → output}/shapes.js +0 -0
  293. /package/dist/{src/output → output}/text.js +0 -0
  294. /package/dist/{src/registry → registry}/build-index.js +0 -0
  295. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  296. /package/dist/{src/registry → registry}/factory.js +0 -0
  297. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  298. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  299. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  300. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  301. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  302. /package/dist/{src/registry → registry}/resolve.js +0 -0
  303. /package/dist/{src/registry → registry}/types.js +0 -0
  304. /package/dist/{src/setup → setup}/detect.js +0 -0
  305. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  306. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  307. /package/dist/{src/setup → setup}/steps.js +0 -0
  308. /package/dist/{src/sources → sources}/include.js +0 -0
  309. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  310. /package/dist/{src/sources → sources}/provider.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  314. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  315. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  316. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  317. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  318. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  319. /package/dist/{src/sources → sources}/resolve.js +0 -0
  320. /package/dist/{src/sources → sources}/types.js +0 -0
  321. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  322. /package/dist/{src/version.js → version.js} +0 -0
  323. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  324. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  325. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  326. /package/dist/{src/workflows → workflows}/db.js +0 -0
  327. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  328. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  329. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  330. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  331. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  332. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,1885 +0,0 @@
1
- /**
2
- * akm-bench report rendering (spec §13.3).
3
- *
4
- * Two report flavours coexist:
5
- *
6
- * • `renderJsonReport` / `renderMarkdownSummary` — the simple v1 envelope
7
- * introduced in #236. Kept for backward-compat with the empty-corpus
8
- * skeleton path; not used by the populated `utility` flow.
9
- *
10
- * • `renderUtilityReport` — the §13.3 shape, including per-task breakdown,
11
- * per-arm and corpus-wide aggregates, akm−noakm deltas, and the
12
- * trajectory subsection. This is what `bench utility` writes when the
13
- * corpus has tasks.
14
- */
15
- import { execSync } from "node:child_process";
16
- import { aggregateAkmOverhead, aggregateByMemoryAbility, aggregateByTaskFamily, computeAkmOverhead, computeAssetRegressionCandidates, computeCorpusCoverage, computeDomainAggregates, computeNegativeTransfer, computeWorkflowReliability, histogramKeys, } from "./metrics";
17
- /**
18
- * Pretty-print a 2-space-indented JSON envelope. The shape is the v1
19
- * contract — `bench compare` reads it and refuses to diff across mismatched
20
- * `model` fields.
21
- */
22
- export function renderJsonReport(input) {
23
- const envelope = {
24
- schemaVersion: 1,
25
- timestamp: input.timestamp,
26
- branch: input.branch,
27
- commit: input.commit,
28
- track: input.track,
29
- agent: { harness: "opencode", model: input.model },
30
- aggregate: input.arms,
31
- };
32
- return JSON.stringify(envelope, null, 2);
33
- }
34
- /**
35
- * 5-ish-line markdown summary for stderr / PR descriptions. Used by the
36
- * empty-corpus skeleton path.
37
- */
38
- export function renderMarkdownSummary(input) {
39
- const lines = [];
40
- lines.push(`# akm-bench (${input.track}) — ${input.model}`);
41
- lines.push(`branch \`${input.branch}\` @ \`${input.commit}\` — ${input.timestamp}`);
42
- for (const [arm, agg] of Object.entries(input.arms)) {
43
- lines.push(`- **${arm}**: pass_rate=${agg.passRate.toFixed(2)}, tokens_per_pass=${agg.tokensPerPass.toFixed(0)}, wallclock_ms=${agg.wallclockMs.toFixed(0)}, budget_exceeded=${agg.budgetExceeded}`);
44
- }
45
- return lines.join("\n");
46
- }
47
- /**
48
- * Project a RunResult onto its compact serialised form for the §13.3 JSON
49
- * envelope (#249). Mirrors the field list in the issue body.
50
- *
51
- * Token-shape seam: `tokens` is spread verbatim from `result.tokens` so when
52
- * #252 adds a `measurement` field the renderer doesn't need a code change.
53
- * Do NOT hardcode `{input, output}` projections here.
54
- */
55
- export function serializeRunForReport(result) {
56
- return {
57
- task_id: result.taskId,
58
- arm: result.arm,
59
- seed: result.seed,
60
- model: result.model,
61
- outcome: result.outcome,
62
- // TODO(#252): when RunResult.tokens grows a `measurement` key, this spread
63
- // carries it forward without a renderer change.
64
- tokens: { ...result.tokens },
65
- wallclock_ms: result.wallclockMs,
66
- verifier_exit_code: result.verifierExitCode,
67
- trajectory: {
68
- correct_asset_loaded: result.trajectory.correctAssetLoaded,
69
- feedback_recorded: result.trajectory.feedbackRecorded,
70
- },
71
- assets_loaded: [...(result.assetsLoaded ?? [])],
72
- failure_mode: result.failureMode ?? null,
73
- };
74
- }
75
- /**
76
- * Stamp a utility run into both the §13.3 JSON envelope and a markdown
77
- * summary. Callers wire stdout/stderr separately.
78
- *
79
- * Determinism: given identical input the function is byte-stable. Markdown
80
- * does not embed `timestamp` in the body table (only in the header), so
81
- * snapshot tests are stable across reruns.
82
- */
83
- export function renderUtilityReport(input) {
84
- const json = buildUtilityJson(input);
85
- const markdown = buildUtilityMarkdown(input);
86
- return { json, markdown };
87
- }
88
- function buildUtilityJson(input) {
89
- const includeSynth = input.aggregateSynth !== undefined;
90
- const tasks = input.tasks.map((t) => ({
91
- id: t.id,
92
- noakm: serialisePerTaskMetrics(t.noakm),
93
- akm: serialisePerTaskMetrics(t.akm),
94
- delta: serialiseDelta(t.delta),
95
- // #261: per-task synthetic block is emitted ONLY when the runner opted
96
- // into the synthetic arm AND this task carries a synthetic aggregate.
97
- // When the arm was not run we leave the key absent — a missing arm is
98
- // not a zero-pass arm.
99
- ...(includeSynth && t.synthetic ? { synthetic: serialisePerTaskMetrics(t.synthetic) } : {}),
100
- }));
101
- // Negative-transfer + domain-level diagnostics (#260). Pure post-processing
102
- // off `input.tasks` and `input.akmRuns` — runner.ts is intentionally
103
- // untouched so this slots in alongside the per-task entries that already
104
- // carry both arms via UtilityReportTaskEntry.
105
- const negativeTransfer = computeNegativeTransfer(input.tasks);
106
- const domainDeltas = computeDomainAggregates(input.tasks);
107
- const assetRegressionCandidates = computeAssetRegressionCandidates(negativeTransfer.topRegressedTasks.map((r) => r.taskId), input.akmRuns ?? []);
108
- // Token-measurement coverage (issue #252). Folds the corpus-wide picture so
109
- // operators can tell at a glance whether token economics are reliable. The
110
- // warning string mirrors what we add to `warnings[]` in markdown output.
111
- const tokenMeasurement = summariseTokenMeasurement(input);
112
- const warnings = [...input.warnings];
113
- if (tokenMeasurement.warning)
114
- warnings.push(tokenMeasurement.warning);
115
- const envelope = {
116
- schemaVersion: 1,
117
- track: "utility",
118
- branch: input.branch,
119
- commit: input.commit,
120
- timestamp: input.timestamp,
121
- agent: { harness: "opencode", model: input.model },
122
- corpus: input.corpus,
123
- aggregate: {
124
- noakm: serialiseCorpus(input.aggregateNoakm),
125
- akm: serialiseCorpus(input.aggregateAkm),
126
- delta: serialiseDelta(input.aggregateDelta),
127
- // #261: synthetic aggregate is emitted ONLY when includeSynthetic
128
- // was set on the runner. Absent otherwise — byte-identical to the
129
- // pre-#261 envelope.
130
- ...(input.aggregateSynth ? { synthetic: serialiseCorpus(input.aggregateSynth) } : {}),
131
- // #261: akm_over_synthetic_lift = passRate(akm) - passRate(synthetic).
132
- // Only computed when the synthetic arm ran. Positive => AKM beats the
133
- // synthetic-notes baseline; non-positive flags AKM is not adding value
134
- // beyond what the model can synthesise on its own.
135
- ...(input.aggregateSynth
136
- ? { akm_over_synthetic_lift: input.aggregateAkm.passRate - input.aggregateSynth.passRate }
137
- : {}),
138
- },
139
- trajectory: {
140
- akm: {
141
- correct_asset_loaded: input.trajectoryAkm.correctAssetLoaded,
142
- feedback_recorded: input.trajectoryAkm.feedbackRecorded,
143
- },
144
- },
145
- failure_modes: {
146
- by_label: input.failureModes.byLabel,
147
- by_task: input.failureModes.byTask,
148
- },
149
- token_measurement: {
150
- total_runs: tokenMeasurement.totalRuns,
151
- runs_with_measured_tokens: tokenMeasurement.measuredRuns,
152
- runs_missing_measurement: tokenMeasurement.missingRuns,
153
- runs_unsupported_measurement: tokenMeasurement.unsupportedRuns,
154
- coverage: tokenMeasurement.coverage,
155
- reliable: tokenMeasurement.reliable,
156
- },
157
- tasks,
158
- negative_transfer_count: negativeTransfer.count,
159
- negative_transfer_severity: negativeTransfer.severity,
160
- top_regressed_tasks: negativeTransfer.topRegressedTasks.map((r) => ({
161
- task_id: r.taskId,
162
- domain: r.domain,
163
- noakm_pass_rate: r.noakmPassRate,
164
- akm_pass_rate: r.akmPassRate,
165
- delta: r.delta,
166
- severity: r.severity,
167
- })),
168
- domain_level_deltas: domainDeltas.map(serialiseDomainAggregate),
169
- asset_regression_candidates: assetRegressionCandidates.map(serialiseAssetRegressionCandidate),
170
- corpus_coverage: buildCorpusCoverageBlock(input),
171
- workflow: buildWorkflowAggregate(input.workflowChecks ?? []),
172
- warnings,
173
- ...(input.searchBridge ? { searchBridge: serialiseSearchBridge(input.searchBridge) } : {}),
174
- };
175
- // Compact raw runs[] — additive top-level key (#249). One row per
176
- // (task, arm, seed) execution; both noakm and akm. Older artefacts that
177
- // pre-date this field stay valid because we only emit it when the runner
178
- // actually populated `allRuns`.
179
- if (input.allRuns) {
180
- envelope.runs = input.allRuns.map(serializeRunForReport);
181
- }
182
- // Baseline pass-rate map — additive top-level key. Emitted only when the
183
- // caller supplied a baseline through `loadBenchRunConfig`; legacy reports
184
- // stay byte-identical without it.
185
- if (input.baselineByTaskId) {
186
- envelope.baseline_by_task_id = { ...input.baselineByTaskId };
187
- }
188
- // Per-asset attribution is an additive top-level key (§6.5). Emit it only
189
- // when the runner populated it so older code paths (e.g. the empty-corpus
190
- // skeleton) don't gain the key spuriously.
191
- if (input.perAsset) {
192
- envelope.perAsset = {
193
- total_akm_runs: input.perAsset.totalAkmRuns,
194
- rows: input.perAsset.rows.map((r) => ({
195
- asset_ref: r.assetRef,
196
- load_count: r.loadCount,
197
- load_count_passing: r.loadCountPassing,
198
- load_count_failing: r.loadCountFailing,
199
- load_pass_rate: r.loadPassRate,
200
- })),
201
- };
202
- }
203
- // AKM overhead + tool-use efficiency block (#263). Computed from the akm-
204
- // arm RunResults attached to the report; missing akmRuns yields an empty
205
- // aggregate so the key shape stays stable.
206
- envelope.akm_overhead = buildAkmOverheadBlock(input);
207
- return envelope;
208
- }
209
- // ── AKM overhead block (#263) ──────────────────────────────────────────────
210
- /**
211
- * Build the §13.3 `akm_overhead` block from the akm-arm RunResults and (when
212
- * supplied) per-task metadata. `taskMetadata` lets us split irrelevant from
213
- * relevant asset loads and compute time-to-first-correct-asset; without it
214
- * those fields surface as `null` rather than misleading zeros.
215
- */
216
- function buildAkmOverheadBlock(input) {
217
- const akmRuns = input.akmRuns ?? [];
218
- const meta = new Map();
219
- for (const t of input.taskMetadata ?? []) {
220
- meta.set(t.id, { goldRef: t.goldRef, expectedTransferFrom: t.expectedTransferFrom });
221
- }
222
- const perRun = computeAkmOverhead(akmRuns, { taskMetadata: meta });
223
- const aggregate = aggregateAkmOverhead(perRun, akmRuns);
224
- return {
225
- per_run: perRun.map(serialiseAkmOverheadPerRun),
226
- aggregate: serialiseAkmOverheadAggregate(aggregate),
227
- };
228
- }
229
/** Snake-case wire shape for one akm-arm run in the `akm_overhead` block. */
function serialiseAkmOverheadPerRun(r) {
    const {
        taskId, arm, seed, outcome,
        searchCount, showCount, feedbackCount,
        positiveFeedbackCount, negativeFeedbackCount,
        totalToolCalls, assetsLoadedCount, irrelevantAssetsLoadedCount,
        timeToFirstSearchMs, timeToFirstCorrectAssetMs,
        contextBytesLoaded, assetBytesLoaded,
    } = r;
    return {
        task_id: taskId,
        arm,
        seed,
        outcome,
        search_count: searchCount,
        show_count: showCount,
        feedback_count: feedbackCount,
        positive_feedback_count: positiveFeedbackCount,
        negative_feedback_count: negativeFeedbackCount,
        total_tool_calls: totalToolCalls,
        assets_loaded_count: assetsLoadedCount,
        irrelevant_assets_loaded_count: irrelevantAssetsLoadedCount,
        time_to_first_search_ms: timeToFirstSearchMs,
        time_to_first_correct_asset_ms: timeToFirstCorrectAssetMs,
        context_bytes_loaded: contextBytesLoaded,
        asset_bytes_loaded: assetBytesLoaded,
    };
}
/** Snake-case wire shape for the `akm_overhead.aggregate` object. */
function serialiseAkmOverheadAggregate(a) {
    return {
        total_runs: a.totalRuns,
        passing_runs: a.passingRuns,
        mean_search_count: a.meanSearchCount,
        mean_show_count: a.meanShowCount,
        mean_feedback_count: a.meanFeedbackCount,
        mean_tool_calls: a.meanToolCalls,
        mean_assets_loaded: a.meanAssetsLoaded,
        mean_irrelevant_assets_loaded: a.meanIrrelevantAssetsLoaded,
        mean_time_to_first_search_ms: a.meanTimeToFirstSearchMs,
        mean_time_to_first_correct_asset_ms: a.meanTimeToFirstCorrectAssetMs,
        mean_context_bytes_loaded: a.meanContextBytesLoaded,
        mean_asset_bytes_loaded: a.meanAssetBytesLoaded,
        total_tool_calls: a.totalToolCalls,
        tool_calls_per_success: a.toolCallsPerSuccess,
        cost_per_success: a.costPerSuccess,
        search_engagement_rate: a.searchEngagementRate,
        show_engagement_rate: a.showEngagementRate,
        feedback_engagement_rate: a.feedbackEngagementRate,
        search_to_show_ratio: a.searchToShowRatio,
        mean_positive_feedback_count: a.meanPositiveFeedbackCount,
        mean_negative_feedback_count: a.meanNegativeFeedbackCount,
    };
}
274
/**
 * Render the §13.3 AKM overhead summary as a compact markdown section (#263).
 * Skipped entirely when the corpus had no akm-arm runs so the report stays
 * tight on the no-akm code path.
 */
export function renderAkmOverheadSection(input) {
    const runs = input.akmRuns ?? [];
    if (runs.length === 0)
        return "";
    const metaById = new Map();
    for (const task of input.taskMetadata ?? []) {
        metaById.set(task.id, { goldRef: task.goldRef, expectedTransferFrom: task.expectedTransferFrom });
    }
    const perRun = computeAkmOverhead(runs, { taskMetadata: metaById });
    const agg = aggregateAkmOverhead(perRun, runs);
    const out = ["## AKM overhead", ""];
    out.push(`- runs: ${agg.totalRuns} (${agg.passingRuns} passed)`);
    out.push(`- tool calls: search=${formatMean(agg.meanSearchCount)} show=${formatMean(agg.meanShowCount)} feedback=${formatMean(agg.meanFeedbackCount)} (mean per run)`);
    out.push(`- total tool calls: ${agg.totalToolCalls} (mean ${formatMean(agg.meanToolCalls)} per run)`);
    out.push(`- tool_calls_per_success: ${agg.toolCallsPerSuccess === null ? "n/a" : formatMean(agg.toolCallsPerSuccess)}`);
    out.push(`- assets loaded (mean unique per run): ${formatMean(agg.meanAssetsLoaded)}`);
    out.push(`- irrelevant assets loaded (mean per tagged run): ${formatNullableMean(agg.meanIrrelevantAssetsLoaded)}`);
    out.push(`- time_to_first_search: ${formatNullableMs(agg.meanTimeToFirstSearchMs)}`);
    out.push(`- time_to_first_correct_asset: ${formatNullableMs(agg.meanTimeToFirstCorrectAssetMs)}`);
    out.push(`- context_bytes_loaded: ${formatNullableBytes(agg.meanContextBytesLoaded)}`);
    out.push(`- asset_bytes_loaded: ${formatNullableBytes(agg.meanAssetBytesLoaded)}`);
    out.push(`- cost_per_success: ${agg.costPerSuccess === null ? "n/a" : formatMean(agg.costPerSuccess)} tokens`);
    return out.join("\n");
}
305
/** Format a mean as a two-decimal fixed-point string. */
function formatMean(value) {
    return value.toFixed(2);
}
/** Like `formatMean`, but nullable means render as "n/a". */
function formatNullableMean(value) {
    if (value === null)
        return "n/a";
    return value.toFixed(2);
}
/** Nullable millisecond value: rounded with an "ms" suffix, or "n/a". */
function formatNullableMs(value) {
    if (value === null)
        return "n/a";
    return `${Math.round(value)}ms`;
}
/** Nullable byte count: rounded with a " bytes" suffix, or "n/a". */
function formatNullableBytes(value) {
    if (value === null)
        return "n/a";
    return `${Math.round(value)} bytes`;
}
317
/**
 * §6.7 envelope. We expose `null` for percentiles that fell into the missing
 * bucket so JSON consumers don't choke on `Infinity`.
 */
function serialiseSearchBridge(s) {
    const passRateRows = s.passRateByRank.map(({ rank, passRate, runCount }) => ({
        rank,
        pass_rate: passRate,
        run_count: runCount,
    }));
    return {
        runs_observed: s.runsObserved,
        searches_observed: s.searchesObserved,
        gold_rank_distribution: s.goldRankDistribution,
        gold_rank_p50: percentileForJson(s.goldRankP50),
        gold_rank_p90: percentileForJson(s.goldRankP90),
        gold_at_rank_1: s.goldAtRank1,
        gold_missing: s.goldMissing,
        pass_rate_by_rank: passRateRows,
    };
}
/** JSON form of a percentile: null stays null, non-finite becomes "missing". */
function percentileForJson(value) {
    if (value === null)
        return null;
    return Number.isFinite(value) ? value : "missing";
}
344
/** Snake-case wire shape for one corpus-level aggregate row. */
function serialiseCorpus(agg) {
    return {
        pass_rate: agg.passRate,
        tokens_per_pass: agg.tokensPerPass,
        tokens_per_run: agg.tokensPerRun,
        wallclock_ms: agg.wallclockMs,
    };
}
/** Snake-case wire shape for an akm − noakm delta; same keys as a corpus row. */
function serialiseDelta(delta) {
    // Deltas share the corpus-row field set exactly, so reuse the serializer.
    return serialiseCorpus(delta);
}
/** Snake-case wire shape for one row of `domain_level_deltas` (#260). */
function serialiseDomainAggregate(r) {
    return {
        domain: r.domain,
        task_count: r.taskCount,
        regression_count: r.regressionCount,
        pass_rate_noakm: r.passRateNoakm,
        pass_rate_akm: r.passRateAkm,
        pass_rate_delta: r.passRateDelta,
        tokens_per_pass_delta: r.tokensPerPassDelta,
        wallclock_ms_delta: r.wallclockMsDelta,
    };
}
373
// ── Corpus coverage block (#262) ───────────────────────────────────────────
/**
 * Build the §13.3 `corpus_coverage` block from a UtilityRunReport (#262).
 * Folds three pieces:
 *  - `coverage`: counts per `memory_ability` (closed set + `untagged`) and
 *    `task_family`. Operators see at a glance which abilities the corpus
 *    covers and which are missing.
 *  - `by_memory_ability` / `by_task_family`: per-category aggregates of pass
 *    rate, akm − noakm delta, negative transfer count, and (when supplied)
 *    workflow-compliance mean.
 *
 * When the runner did not plumb `taskMetadata` (legacy code paths) we emit a
 * skeleton block with zero counts so JSON consumers don't see the key flicker
 * in and out depending on the runner version.
 */
function buildCorpusCoverageBlock(input) {
    const taskMetadata = input.taskMetadata ?? [];
    const metaById = new Map(taskMetadata.map((m) => [m.id, m]));
    const tagEntries = input.tasks.map((task) => {
        const meta = metaById.get(task.id);
        const entry = { id: task.id, noakm: task.noakm, akm: task.akm };
        // Tag fields are attached only when present so downstream aggregation
        // can distinguish "untagged" from an explicit value.
        if (meta?.memoryAbility)
            entry.memoryAbility = meta.memoryAbility;
        if (meta?.taskFamily)
            entry.taskFamily = meta.taskFamily;
        if (meta?.workflowFocus)
            entry.workflowFocus = meta.workflowFocus;
        // Only finite numeric compliance scores participate in the mean.
        if (typeof task.workflowCompliance === "number" && Number.isFinite(task.workflowCompliance)) {
            entry.workflowCompliance = task.workflowCompliance;
        }
        return entry;
    });
    return {
        coverage: computeCorpusCoverage(taskMetadata),
        by_memory_ability: aggregateByMemoryAbility(tagEntries).map(serialiseCategoryRow),
        by_task_family: aggregateByTaskFamily(tagEntries).map(serialiseCategoryRow),
    };
}
420
/** Snake-case wire shape for one per-category coverage aggregate (#262). */
function serialiseCategoryRow(r) {
    return {
        category: r.category,
        task_count: r.taskCount,
        pass_rate_noakm: r.passRateNoakm,
        pass_rate_akm: r.passRateAkm,
        pass_rate_delta: r.passRateDelta,
        negative_transfer_count: r.negativeTransferCount,
        workflow_compliance: r.workflowCompliance,
    };
}
/** Snake-case wire shape for one row of `asset_regression_candidates` (#260). */
function serialiseAssetRegressionCandidate(r) {
    return {
        asset_ref: r.assetRef,
        regressed_task_count: r.regressedTaskCount,
        regressed_task_ids: r.regressedTaskIds,
        total_load_count: r.totalLoadCount,
    };
}
/** Snake-case wire shape for one task's per-arm metric bundle. */
function serialisePerTaskMetrics(metrics) {
    return {
        pass_rate: metrics.passRate,
        pass_at_1: metrics.passAt1,
        tokens_per_pass: metrics.tokensPerPass,
        tokens_per_run: metrics.tokensPerRun,
        wallclock_ms: metrics.wallclockMs,
        pass_rate_stdev: metrics.passRateStdev,
        budget_exceeded_count: metrics.budgetExceededCount,
        harness_error_count: metrics.harnessErrorCount,
        count: metrics.count,
        runs_with_measured_tokens: metrics.runsWithMeasuredTokens,
    };
}
454
/**
 * Summarise token-measurement coverage across akm-arm runs (issue #252).
 *
 * Runs without a `tokenMeasurement` field default to "parsed" (legacy runs
 * predate the field). The summary is `reliable` only when every run parsed;
 * otherwise a human-readable `warning` explains how many runs are missing or
 * unsupported, since token economics then reflect only the measured subset.
 */
function summariseTokenMeasurement(input) {
    const runs = input.akmRuns ?? [];
    let measured = 0;
    let missing = 0;
    let unsupported = 0;
    for (const run of runs) {
        switch (run.tokenMeasurement ?? "parsed") {
            case "parsed":
                measured += 1;
                break;
            case "missing":
                missing += 1;
                break;
            case "unsupported":
                unsupported += 1;
                break;
            default:
                break; // unknown values are ignored, matching legacy behaviour
        }
    }
    const total = runs.length;
    const reliable = total > 0 && missing === 0 && unsupported === 0;
    let warning = null;
    if (total > 0 && !reliable) {
        const parts = [];
        if (missing > 0)
            parts.push(`${missing} missing`);
        if (unsupported > 0)
            parts.push(`${unsupported} unsupported`);
        warning =
            `token measurement unreliable: ${parts.join(", ")} of ${total} akm-arm runs lack parsed token usage; ` +
                `tokens_per_pass and token-budget signals reflect only the ${measured} measured runs.`;
    }
    return {
        totalRuns: total,
        measuredRuns: measured,
        missingRuns: missing,
        unsupportedRuns: unsupported,
        coverage: total === 0 ? null : measured / total,
        reliable,
        warning,
    };
}
492
/**
 * Render the full utility report as markdown: header, aggregate table,
 * trajectory, per-task pass rates, then the conditional diagnostic sections.
 * Output is deterministic for a given input (tasks sorted alphabetically;
 * optional sections are rendered only when they have content).
 */
function buildUtilityMarkdown(input) {
    const out = [];
    out.push(`# akm-bench utility — ${input.model}`, "");
    out.push(`branch \`${input.branch}\` @ \`${input.commit}\` — ${input.timestamp}`);
    out.push(`corpus: ${input.corpus.tasks} tasks across ${input.corpus.domains} domains (slice=${input.corpus.slice}, seedsPerArm=${input.corpus.seedsPerArm})`);
    out.push("", "## Aggregate", "");
    out.push("| arm | pass_rate | tokens_per_pass | wallclock_ms |");
    out.push("|-----|-----------|-----------------|--------------|");
    out.push(corpusRow("noakm", input.aggregateNoakm));
    // #261: the synthetic row sits between noakm and akm so the table reads
    // baseline → synthetic → akm. Rendered only when the synthetic arm ran.
    if (input.aggregateSynth) {
        out.push(corpusRow("synthetic", input.aggregateSynth));
    }
    out.push(corpusRow("akm", input.aggregateAkm));
    out.push(deltaRow(input.aggregateDelta));
    // #261: akm_over_synthetic_lift summary line. A non-positive lift gets a
    // warning marker so operators cannot miss the regression.
    if (input.aggregateSynth) {
        const lift = input.aggregateAkm.passRate - input.aggregateSynth.passRate;
        out.push("");
        if (lift <= 0) {
            out.push(`:warning: **akm_over_synthetic_lift = ${signedFixed(lift, 2)}** — AKM did not beat the synthetic-notes baseline.`);
        }
        else {
            out.push(`**akm_over_synthetic_lift: ${signedFixed(lift, 2)}**`);
        }
    }
    out.push("", "## Trajectory (akm)", "");
    out.push(`- correct_asset_loaded: ${formatPercent(input.trajectoryAkm.correctAssetLoaded)}`);
    out.push(`- feedback_recorded: ${formatPercent(input.trajectoryAkm.feedbackRecorded)}`);
    // Per-run trajectory detail when allRuns is present, so operators can
    // distinguish "—" (null: harness error, no events), "✗" (agent ran,
    // behaviour not observed) and "✓" (confirmed).
    const akmRuns = (input.allRuns ?? []).filter((run) => run.arm === "akm");
    if (akmRuns.length > 0) {
        out.push("", "| task | seed | correct_asset_loaded | feedback_recorded |");
        out.push("|------|------|----------------------|-------------------|");
        for (const run of akmRuns) {
            out.push(`| ${run.taskId} | ${run.seed} | ${formatTrajBool(run.trajectory.correctAssetLoaded)} | ${formatTrajBool(run.trajectory.feedbackRecorded)} |`);
        }
    }
    out.push("", "## Per-task pass rates", "");
    // #261 synthetic column and the baseline column are both opt-in; legacy
    // inputs produce byte-identical output to the pre-#261 report.
    const includeSynthCol = input.aggregateSynth !== undefined;
    const baselineMap = input.baselineByTaskId;
    const includeBaselineCol = baselineMap !== undefined;
    const baseColHeader = includeBaselineCol ? " baseline | vs base |" : "";
    const baseColSep = includeBaselineCol ? "----------|---------|" : "";
    if (includeSynthCol) {
        out.push(`| task | noakm | synthetic | akm | delta |${baseColHeader}`);
        out.push(`|------|-------|-----------|-----|-------|${baseColSep}`);
    }
    else {
        out.push(`| task | noakm | akm | delta |${baseColHeader}`);
        out.push(`|------|-------|-----|-------|${baseColSep}`);
    }
    // Alphabetical task order keeps the markdown byte-stable across runs.
    const sorted = [...input.tasks].sort((a, b) => a.id.localeCompare(b.id));
    for (const task of sorted) {
        out.push(taskRow(task, includeSynthCol, baselineMap));
    }
    // Corpus-coverage section (#262): only rendered when at least one task is
    // tagged with a memory_ability, otherwise it adds no signal.
    const coverageSection = renderCorpusCoverageSection(input);
    if (coverageSection.length > 0) {
        out.push("", coverageSection);
    }
    // Negative-transfer + domain diagnostics (#260); stays quiet ("none")
    // when no regressions were observed.
    out.push("", renderNegativeTransferSection(input));
    // Failure-mode breakdown (§6.6) sits near the bottom so the headline
    // tables stay visually anchored at the top.
    const failureSection = renderFailureModeBreakdown(input);
    if (failureSection.length > 0) {
        out.push("", failureSection);
    }
    if (input.searchBridge) {
        out.push("", renderSearchBridgeTable(input.searchBridge));
    }
    // #257: workflow compliance; the renderer returns "" when there are no
    // checks, so the blank separator is only pushed for real content.
    const workflowSection = renderWorkflowComplianceSection(input);
    if (workflowSection.length > 0) {
        out.push("", workflowSection);
    }
    // AKM overhead + tool-use efficiency (#263); "" on the no-akm path.
    const overheadSection = renderAkmOverheadSection(input);
    if (overheadSection.length > 0) {
        out.push("", overheadSection);
    }
    // Token-measurement section (issue #252): rendered whenever akm-arm runs
    // exist so token-economics trustworthiness is visible without scrolling
    // to the warnings block.
    const tokenSummary = summariseTokenMeasurement(input);
    if (tokenSummary.totalRuns > 0) {
        out.push("", "## Token measurement (akm)", "");
        const cov = tokenSummary.coverage === null ? "n/a" : `${(tokenSummary.coverage * 100).toFixed(1)}%`;
        out.push(`- runs: ${tokenSummary.totalRuns} total, ${tokenSummary.measuredRuns} measured, ${tokenSummary.missingRuns} missing, ${tokenSummary.unsupportedRuns} unsupported`);
        out.push(`- coverage: ${cov} (${tokenSummary.reliable ? "reliable" : "unreliable — see warning below"})`);
    }
    const warnings = [...input.warnings];
    if (tokenSummary.warning) {
        warnings.push(tokenSummary.warning);
    }
    if (warnings.length > 0) {
        out.push("", "## Warnings", "");
        for (const w of warnings) {
            out.push(`- ${w}`);
        }
    }
    return out.join("\n");
}
633
// ── Search-pipeline bridge (§6.7) markdown ─────────────────────────────────
/**
 * Render the §6.7 search-pipeline bridge as a markdown section.
 *
 * When the corpus has no gold-ref tasks (or simply no `akm search`
 * invocations), the section collapses to a single "(N/A)" sentence so the
 * report stays compact.
 */
export function renderSearchBridgeTable(metrics) {
    const out = ["## Search → outcome bridge", ""];
    if (metrics.searchesObserved === 0 && metrics.runsObserved === 0) {
        out.push("(no gold-ref tasks in corpus; bridge metrics N/A)");
        return out.join("\n");
    }
    // Gold-rank histogram.
    out.push("| rank | count |", "|------|-------|");
    for (const key of histogramKeys()) {
        out.push(`| ${key} | ${metrics.goldRankDistribution[key] ?? 0} |`);
    }
    // One-line percentile summary.
    out.push("");
    out.push(`p50=${formatRank(metrics.goldRankP50)}, p90=${formatRank(metrics.goldRankP90)}, gold_at_rank_1=${formatPercent(metrics.goldAtRank1)}, gold_missing=${formatPercent(metrics.goldMissing)}`);
    out.push("");
    // pass_rate_by_rank table.
    out.push("| rank | pass_rate | run_count |", "|------|-----------|-----------|");
    if (metrics.passRateByRank.length === 0) {
        out.push("| (no runs with `akm search` invocations) | — | 0 |");
    }
    else {
        for (const row of metrics.passRateByRank) {
            out.push(`| ${row.rank} | ${row.passRate.toFixed(2)} | ${row.runCount} |`);
        }
    }
    return out.join("\n");
}
675
/** Nullable rank: "n/a" for null, "missing" for non-finite, else one decimal. */
function formatRank(value) {
    if (value === null)
        return "n/a";
    return Number.isFinite(value) ? value.toFixed(1) : "missing";
}
/** One aggregate-table row for an arm; null tokens_per_pass renders "n/a". */
function corpusRow(arm, c) {
    const tokensCell = c.tokensPerPass === null ? "n/a" : c.tokensPerPass.toFixed(0);
    return `| ${arm} | ${c.passRate.toFixed(2)} | ${tokensCell} | ${c.wallclockMs.toFixed(0)} |`;
}
/** The bold delta row of the aggregate table; values carry explicit signs. */
function deltaRow(d) {
    const tokensCell = d.tokensPerPass === null ? "n/a" : signed(d.tokensPerPass.toFixed(0));
    return `| **delta** | ${signed(d.passRate.toFixed(2))} | ${tokensCell} | ${signed(d.wallclockMs.toFixed(0))} |`;
}
690
/**
 * Render one per-task pass-rate table row. The synthetic column and the
 * baseline cells are each opt-in so legacy callers get the pre-#261 shape.
 */
function taskRow(t, includeSynthetic = false, baselineByTaskId) {
    // Baseline cells render only when a baseline map was provided AND this
    // task has an entry; tasks without one get "n/a" placeholders so the
    // columns stay aligned.
    let baselineCells = "";
    if (baselineByTaskId) {
        const base = baselineByTaskId[t.id];
        baselineCells = base === undefined
            ? " n/a | n/a |"
            : ` ${base.toFixed(2)} | ${signed((t.akm.passRate - base).toFixed(2))} |`;
    }
    const noakmCell = t.noakm.passRate.toFixed(2);
    const akmCell = t.akm.passRate.toFixed(2);
    const deltaCell = signed(t.delta.passRate.toFixed(2));
    if (includeSynthetic) {
        // #261: "n/a" when the synthetic arm did not run for this task — a
        // missing arm is NOT a zero-pass arm; 0.00 would imply the model
        // tried and failed.
        const synthCell = t.synthetic ? t.synthetic.passRate.toFixed(2) : "n/a";
        return `| ${t.id} | ${noakmCell} | ${synthCell} | ${akmCell} | ${deltaCell} |${baselineCells}`;
    }
    return `| ${t.id} | ${noakmCell} | ${akmCell} | ${deltaCell} |${baselineCells}`;
}
714
/**
 * Prefix a non-negative, non-zero fixed-point string with "+".
 *
 * Mirrors `signedFixed`'s zero handling: `toFixed` renders tiny negative
 * values as "-0" / "-0.0" / "-0.00", which previously leaked through here and
 * made per-task tables show "-0.00" while compare tables showed "0.00". Such
 * negative-zero strings are normalised to their unsigned form so the
 * deterministic output never contains a misleading "+0.00" or "-0.00".
 */
function signed(text) {
    // Normalise toFixed's negative-zero strings before the sign checks.
    if (/^-0(\.0+)?$/.test(text))
        return text.slice(1);
    if (text.startsWith("-"))
        return text;
    if (text === "0" || text === "0.00" || text === "0.0")
        return text;
    return `+${text}`;
}
721
/** Nullable ratio as a one-decimal percentage string ("n/a" for null). */
function formatPercent(value) {
    if (value === null)
        return "n/a";
    return `${(value * 100).toFixed(1)}%`;
}
/**
 * Render a `boolean | null` trajectory field for markdown tables.
 *
 * Three-state semantics:
 * - `null` → `"—"` — no trajectory data (harness error; events.jsonl not captured).
 * - `false` → `"✗"` — agent ran but the behaviour was not observed.
 * - `true` → `"✓"` — behaviour confirmed.
 */
export function formatTrajBool(value) {
    if (value === null)
        return "—";
    if (value)
        return "✓";
    return "✗";
}
739
// ── Compare rendering (§8) ─────────────────────────────────────────────────
/**
 * Render a CompareResult as a deterministic markdown diff.
 *
 * Determinism: no timestamps, no run IDs, no git SHAs in the body — the diff
 * is a pure function of the two inputs' aggregated numbers and per-task
 * tables. Per-task rows are sorted alphabetically (already done by
 * `compareReports`, but re-asserted here defensively).
 *
 * Refusal cases (model mismatch, hash mismatch, schema/track issues) render
 * as a single error block instead of a diff table — there's nothing
 * actionable to show, and the operator's recovery path is in the message.
 */
export function renderCompareMarkdown(result) {
    return result.ok ? renderCompareSuccess(result) : renderCompareFailure(result);
}
758
/**
 * Render a refused compare as an error block. Each refusal reason appends
 * whatever detail it carries (models, fixture hashes, corpus-hash / task-id
 * differences) below the human-readable message.
 */
function renderCompareFailure(result) {
    const out = [`# akm-bench compare — refused (${result.reason})`, "", result.message];
    if (result.reason === "model_mismatch" && result.baseModel !== undefined && result.currentModel !== undefined) {
        out.push("", `- base model: \`${result.baseModel}\``, `- current model: \`${result.currentModel}\``);
    }
    if (result.reason === "hash_mismatch" &&
        result.baseFixtureContentHash !== undefined &&
        result.currentFixtureContentHash !== undefined) {
        out.push("", `- base fixture hash: \`${String(result.baseFixtureContentHash)}\``);
        out.push(`- current fixture hash: \`${String(result.currentFixtureContentHash)}\``);
        if (result.affectedFixtures && result.affectedFixtures.length > 0) {
            out.push("", "affected fixtures:");
            for (const fixture of result.affectedFixtures) {
                out.push(`- ${fixture}`);
            }
        }
    }
    if (result.reason === "corpus_mismatch") {
        if (result.baseTaskCorpusHash !== undefined || result.currentTaskCorpusHash !== undefined) {
            out.push("", `- base taskCorpusHash: \`${String(result.baseTaskCorpusHash ?? "n/a")}\``);
            out.push(`- current taskCorpusHash: \`${String(result.currentTaskCorpusHash ?? "n/a")}\``);
        }
        if (result.baseSelectedTaskIds && result.currentSelectedTaskIds) {
            // Symmetric difference of selected task ids, each side sorted so
            // the refusal output is deterministic.
            const inBase = new Set(result.baseSelectedTaskIds);
            const inCurrent = new Set(result.currentSelectedTaskIds);
            const onlyInCurrent = result.currentSelectedTaskIds.filter((id) => !inBase.has(id)).sort();
            const onlyInBase = result.baseSelectedTaskIds.filter((id) => !inCurrent.has(id)).sort();
            if (onlyInCurrent.length > 0) {
                out.push("", "only in current:");
                for (const id of onlyInCurrent) {
                    out.push(`- ${id}`);
                }
            }
            if (onlyInBase.length > 0) {
                out.push("", "only in base:");
                for (const id of onlyInBase) {
                    out.push(`- ${id}`);
                }
            }
        }
    }
    return out.join("\n");
}
808
/**
 * Render a successful compare: fixture hashes (when known), the aggregate
 * akm-arm delta table, per-task rows sorted alphabetically, and any warnings.
 */
function renderCompareSuccess(result) {
    const out = [`# akm-bench compare — \`${result.currentModel}\``, ""];
    if (result.baseFixtureContentHash !== null || result.currentFixtureContentHash !== null) {
        const baseHash = result.baseFixtureContentHash === null ? "n/a" : `\`${result.baseFixtureContentHash}\``;
        const currentHash = result.currentFixtureContentHash === null ? "n/a" : `\`${result.currentFixtureContentHash}\``;
        out.push(`fixture-content hash: base=${baseHash}, current=${currentHash}`, "");
    }
    out.push("## Aggregate (akm arm, current − base)", "");
    out.push("| metric | delta | direction |");
    out.push("|--------|-------|-----------|");
    out.push(`| pass_rate | ${signedFixed(result.aggregate.passRateDelta, 2)} | ${signGlyph(result.aggregate.passRateSign)} |`);
    out.push(`| tokens_per_pass | ${nullableSignedFixed(result.aggregate.tokensPerPassDelta, 0)} | ${signGlyph(result.aggregate.tokensPerPassSign)} |`);
    out.push(`| wallclock_ms | ${signedFixed(result.aggregate.wallclockMsDelta, 0)} | ${signGlyph(result.aggregate.wallclockMsSign)} |`);
    out.push("", "## Per-task (akm arm)", "");
    out.push("| task | base pass_rate | current pass_rate | delta | dir | base stdev | current stdev |");
    out.push("|------|----------------|-------------------|-------|-----|------------|---------------|");
    // Re-sort defensively even though compareReports already sorts.
    for (const row of [...result.perTask].sort((a, b) => a.id.localeCompare(b.id))) {
        out.push(perTaskCompareRow(row));
    }
    if (result.warnings.length > 0) {
        out.push("", "## Warnings", "");
        for (const w of result.warnings) {
            out.push(`- ${w}`);
        }
    }
    return out.join("\n");
}
/** One per-task compare row; absent sides render "n/a" and tag the id cell. */
function perTaskCompareRow(row) {
    const fmt = (metrics, key) => (metrics === null ? "n/a" : metrics[key].toFixed(2));
    const deltaCell = row.delta.passRate === null ? "n/a" : signedFixed(row.delta.passRate, 2);
    const idCell = row.presence === "both" ? row.id : `${row.id} _(${row.presence})_`;
    return `| ${idCell} | ${fmt(row.baseMetrics, "pass_rate")} | ${fmt(row.currentMetrics, "pass_rate")} | ${deltaCell} | ${signGlyph(row.signMarker)} | ${fmt(row.baseMetrics, "pass_rate_stdev")} | ${fmt(row.currentMetrics, "pass_rate_stdev")} |`;
}
852
/** Direction glyph for a sign marker: ▲ improve, ▼ regress, ▬ otherwise. */
function signGlyph(sign) {
    switch (sign) {
        case "improve":
            return "▲";
        case "regress":
            return "▼";
        default:
            return "▬";
    }
}
/**
 * Fixed-point string with an explicit "+" on positives. Exact zero and the
 * "-0" / "-0.00" strings toFixed produces for tiny negatives render unsigned,
 * so deterministic output never shows a misleading "+0.00" or "-0.00".
 */
function signedFixed(value, digits) {
    const fixed = value.toFixed(digits);
    if (fixed === "-0" || /^-0\.0+$/.test(fixed))
        return (0).toFixed(digits);
    if (value > 0)
        return `+${fixed}`;
    return fixed;
}
/** `signedFixed` for nullable values; null renders "n/a". */
function nullableSignedFixed(value, digits) {
    if (value === null)
        return "n/a";
    return signedFixed(value, digits);
}
874
// ── Attribution table rendering (§6.5) ─────────────────────────────────────
/**
 * Threshold for the "highly loaded" slice — assets with a load count at or
 * above this fraction of the per-table maximum get bucketed into the "well
 * used and working" / "well used and not working" callout sections.
 */
const HIGH_LOAD_THRESHOLD = 0.5;
/**
 * Threshold for "working" pass-rate. An asset is "working" if its
 * load_pass_rate is at or above this; "not working" if below.
 */
const WORKING_PASS_RATE_THRESHOLD = 0.5;
/**
 * Render a per-asset attribution table as markdown. Sort order matches
 * `computePerAssetAttribution` (load count desc, pass rate desc, ref asc).
 *
 * The output has three sections:
 *  1. Full sorted table.
 *  2. "Well-used and working" callout — high load, high pass_rate.
 *  3. "Well-used and not working" callout — high load, low pass_rate.
 *
 * The two callouts are the actionable slices: the first is what curation
 * should preserve, the second is what should be improved or removed.
 */
export function renderAttributionTable(attr) {
    const out = ["## Per-asset attribution", ""];
    out.push(`Total akm-arm runs aggregated: ${attr.totalAkmRuns}`, "");
    if (attr.rows.length === 0) {
        out.push("_No assets were loaded by the agent during akm-arm runs._");
        return out.join("\n");
    }
    out.push("| asset_ref | load_count | load_count_passing | load_count_failing | load_pass_rate |");
    out.push("|-----------|------------|--------------------|--------------------|----------------|");
    for (const row of attr.rows) {
        out.push(`| \`${row.assetRef}\` | ${row.loadCount} | ${row.loadCountPassing} | ${row.loadCountFailing} | ${formatRate(row.loadPassRate)} |`);
    }
    // The high-load cutoff is relative to the top-loaded asset's count so the
    // slices scale whether the corpus has 5 or 500 total runs.
    const topLoad = attr.rows[0]?.loadCount ?? 0;
    const highLoadCutoff = Math.max(1, Math.ceil(topLoad * HIGH_LOAD_THRESHOLD));
    const heavilyLoaded = attr.rows.filter((row) => row.loadCount >= highLoadCutoff);
    const isWorking = (row) => (row.loadPassRate ?? 0) >= WORKING_PASS_RATE_THRESHOLD;
    const calloutLine = (row) => `- \`${row.assetRef}\` (load_count=${row.loadCount}, load_pass_rate=${formatRate(row.loadPassRate)})`;
    const working = heavilyLoaded.filter(isWorking);
    const notWorking = heavilyLoaded.filter((row) => !isWorking(row));
    out.push("", "### Well-used and working", "");
    if (working.length === 0) {
        out.push("_None._");
    }
    else {
        for (const row of working) {
            out.push(calloutLine(row));
        }
    }
    out.push("", "### Well-used and NOT working", "");
    if (notWorking.length === 0) {
        out.push("_None._");
    }
    else {
        for (const row of notWorking) {
            out.push(calloutLine(row));
        }
    }
    return out.join("\n");
}
945
/** Nullable rate as a one-decimal percentage string ("n/a" for null). */
function formatRate(value) {
    if (value === null)
        return "n/a";
    return `${(value * 100).toFixed(1)}%`;
}
950
// ── Failure-mode breakdown (§6.6) ──────────────────────────────────────────
/**
 * Render the §6.6 "Failure modes" markdown section. Lines are sorted by
 * descending count (ties broken alphabetically by label so output is
 * byte-stable). Each line:
 *
 *   `<label> — <count> (<percent>% of failed runs)`
 *
 * Returns an empty string when no failed runs exist (caller decides whether
 * to append a blank section header).
 */
export function renderFailureModeBreakdown(report) {
    const entries = Object.entries(report.failureModes.byLabel);
    const totalFailures = entries.reduce((sum, [, count]) => sum + count, 0);
    if (entries.length === 0 || totalFailures === 0)
        return "";
    // Descending count, then alphabetical label, for deterministic output.
    const ordered = [...entries].sort(([labelA, countA], [labelB, countB]) => {
        if (countB !== countA)
            return countB - countA;
        return labelA.localeCompare(labelB);
    });
    const out = ["## Failure modes", ""];
    for (const [label, count] of ordered) {
        const percent = ((count / totalFailures) * 100).toFixed(1);
        out.push(`- ${label} — ${count} (${percent}% of failed runs)`);
    }
    return out.join("\n");
}
981
// ── Workflow compliance aggregation (#257) ─────────────────────────────────
/**
 * Cap on evidence entries collected per violation code. A top-violation
 * entry carries enough detail to identify which (task, seed) caused each
 * occurrence; capping the `evidence` array at `MAX_VIOLATION_EVIDENCE` per
 * code means a pathological corpus cannot blow up the report. Note the
 * reported `count` is still the TRUE occurrence count — only the evidence
 * list is truncated (see the recount pass in `buildWorkflowAggregate`).
 */
const MAX_VIOLATION_EVIDENCE = 10;
/**
 * Maximum number of top-violation entries to surface in JSON / markdown.
 * Operators care about the head of the distribution; the long tail is
 * recoverable from `workflowChecks` if needed.
 */
const MAX_TOP_VIOLATIONS = 10;
995
/**
 * Map a workflow check `status` onto the public pass/partial/fail bucket.
 *
 * `harness_error` is folded into `fail` so corrupt traces are visibly
 * counted against compliance. `not_applicable` (and any other unrecognised
 * status) yields `null`, which excludes the check from aggregate counts.
 *
 * @param {string} status - Raw check status.
 * @returns {"pass"|"partial"|"fail"|null}
 */
function bucketWorkflowStatus(status) {
    switch (status) {
        case "pass":
            return "pass";
        case "partial":
            return "partial";
        case "fail":
        case "harness_error":
            return "fail";
        default:
            return null; // not_applicable
    }
}
1012
/**
 * Compute the §257 `workflow` block from a flat list of `WorkflowCheckResult`.
 * Empty input yields an empty (zero-filled) aggregate so JSON consumers
 * always see the same shape.
 *
 * @param {Array} checks - Flat list of workflow check results; each check
 *   carries `status`, `score`, `violations[]`, `workflowId`, and run
 *   identity (`taskId`, `arm`, `seed`).
 * @returns {object} The aggregate envelope: corpus-wide rates, per-spec
 *   `by_workflow`, capped `top_violations`, a per-run `cross_tab`, and the
 *   #258 `reliability` block.
 */
function buildWorkflowAggregate(checks) {
    // #258: Compute reliability up front so all early-return paths share the
    // same shape. Reliability tolerates empty input (`groups === 0`).
    const reliabilityResult = computeWorkflowReliability(checks);
    const reliability = {
        by_workflow: reliabilityResult.byWorkflow,
        corpus: reliabilityResult.corpus,
    };
    const empty = {
        total_checks: checks.length,
        applicable_checks: 0,
        overall_compliance: 0,
        strict_pass_rate: 0,
        partial_pass_rate: 0,
        fail_rate: 0,
        violation_count: 0,
        by_workflow: {},
        top_violations: [],
        cross_tab: [],
        reliability,
    };
    if (checks.length === 0)
        return empty;
    // Bucket counts (corpus-wide) and accumulate per-spec / per-violation /
    // cross-tab in a single pass.
    let strict = 0;
    let partial = 0;
    let fail = 0;
    let scoreSum = 0;
    let applicable = 0;
    let violationCount = 0;
    const perSpecAcc = new Map();
    const violationAcc = new Map();
    const crossTabAcc = new Map();
    // We need each (task_outcome, run) bucketed against the WORST workflow
    // outcome that run produced — otherwise a run with one passing and one
    // failing spec gets double-counted across cross-tab rows. Reduce per-run.
    const runWorstOutcome = new Map();
    // Track which run keys have at least one applicable check; non-applicable
    // runs do not contribute to the cross-tab.
    // NOTE(review): this set is populated below but never read again in this
    // function — confirm whether it is dead bookkeeping or intended for a
    // later consumer before removing.
    const runHasApplicable = new Set();
    for (const c of checks) {
        const bucket = bucketWorkflowStatus(c.status);
        const runKey = `${c.taskId}::${c.arm}::${c.seed}`;
        // Per-spec: include `not_applicable` in the spec's `count` column
        // (operators want to see whether the spec ever fired) but exclude
        // it from rate denominators.
        const specEntry = perSpecAcc.get(c.workflowId) ?? {
            count: 0,
            scoreSum: 0,
            pass: 0,
            partial: 0,
            fail: 0,
            violationCount: 0,
        };
        specEntry.count += 1;
        if (bucket !== null) {
            specEntry.scoreSum += c.score;
            // `bucket` is "pass" | "partial" | "fail" — indexes the matching
            // counter field on the accumulator.
            specEntry[bucket] += 1;
        }
        specEntry.violationCount += c.violations.length;
        perSpecAcc.set(c.workflowId, specEntry);
        if (bucket === null)
            continue;
        applicable += 1;
        scoreSum += c.score;
        violationCount += c.violations.length;
        runHasApplicable.add(runKey);
        if (bucket === "pass")
            strict += 1;
        else if (bucket === "partial")
            partial += 1;
        else
            fail += 1;
        // Per-violation evidence collection. Cap evidence per code so one noisy
        // failure mode cannot dominate the section.
        for (const v of c.violations) {
            const list = violationAcc.get(v.code) ?? [];
            if (list.length < MAX_VIOLATION_EVIDENCE) {
                const ev = {
                    task_id: c.taskId,
                    arm: c.arm,
                    seed: c.seed,
                    workflow_id: c.workflowId,
                };
                // Optional fields are only attached when present so the JSON
                // stays compact and key-stable.
                if (v.message)
                    ev.message = v.message;
                if (v.expected !== undefined)
                    ev.expected = v.expected;
                if (v.observed !== undefined)
                    ev.observed = v.observed;
                list.push(ev);
            }
            violationAcc.set(v.code, list);
        }
        // Cross-tab bookkeeping: keep the WORST workflow outcome per run so we
        // get one cell per run (not per (run × spec)).
        const taskOutcome = readCheckTaskOutcome(c) ?? "unknown";
        const worst = runWorstOutcome.get(runKey);
        if (!worst) {
            runWorstOutcome.set(runKey, { taskOutcome, workflowOutcome: bucket });
        }
        else if (severityRank(bucket) > severityRank(worst.workflowOutcome)) {
            worst.workflowOutcome = bucket;
        }
    }
    // Reduce runWorstOutcome into the public cross_tab rows. We always emit
    // entries for `pass` and `fail` task outcomes so the table shape is
    // stable; additional outcomes ("budget_exceeded", "harness_error",
    // "unknown") only appear when at least one run carried them.
    const stableOutcomes = ["pass", "fail"];
    for (const [, entry] of runWorstOutcome) {
        if (!stableOutcomes.includes(entry.taskOutcome) && entry.taskOutcome !== "unknown") {
            stableOutcomes.push(entry.taskOutcome);
        }
    }
    for (const [, entry] of runWorstOutcome) {
        const counts = crossTabAcc.get(entry.taskOutcome) ?? { pass: 0, partial: 0, fail: 0 };
        counts[entry.workflowOutcome] += 1;
        crossTabAcc.set(entry.taskOutcome, counts);
    }
    const cross_tab = [];
    for (const outcome of stableOutcomes) {
        const counts = crossTabAcc.get(outcome) ?? { pass: 0, partial: 0, fail: 0 };
        cross_tab.push({
            task_outcome: outcome,
            pass: counts.pass,
            partial: counts.partial,
            fail: counts.fail,
            total: counts.pass + counts.partial + counts.fail,
        });
    }
    // Append "unknown" row only if any run actually carried it.
    if (crossTabAcc.has("unknown")) {
        const counts = crossTabAcc.get("unknown") ?? { pass: 0, partial: 0, fail: 0 };
        cross_tab.push({
            task_outcome: "unknown",
            pass: counts.pass,
            partial: counts.partial,
            fail: counts.fail,
            total: counts.pass + counts.partial + counts.fail,
        });
    }
    if (applicable === 0) {
        // Every check was `not_applicable`. Surface a non-empty `by_workflow`
        // (so operators see which specs ran) but leave the rate fields zeroed.
        const by_workflow = {};
        for (const [id, e] of perSpecAcc) {
            by_workflow[id] = {
                workflow_id: id,
                count: e.count,
                score: 0,
                pass_rate: 0,
                partial_rate: 0,
                fail_rate: 0,
                violation_count: e.violationCount,
            };
        }
        return {
            total_checks: checks.length,
            applicable_checks: 0,
            overall_compliance: 0,
            strict_pass_rate: 0,
            partial_pass_rate: 0,
            fail_rate: 0,
            violation_count: 0,
            by_workflow,
            top_violations: [],
            cross_tab,
            reliability,
        };
    }
    const by_workflow = {};
    for (const [id, e] of perSpecAcc) {
        // Rates are computed against the spec's APPLICABLE checks only;
        // `count` still includes not_applicable occurrences (see above).
        const applicableForSpec = e.pass + e.partial + e.fail;
        const score = applicableForSpec === 0 ? 0 : e.scoreSum / applicableForSpec;
        const passRate = applicableForSpec === 0 ? 0 : e.pass / applicableForSpec;
        const partialRate = applicableForSpec === 0 ? 0 : e.partial / applicableForSpec;
        const failRate = applicableForSpec === 0 ? 0 : e.fail / applicableForSpec;
        by_workflow[id] = {
            workflow_id: id,
            count: e.count,
            score,
            pass_rate: passRate,
            partial_rate: partialRate,
            fail_rate: failRate,
            violation_count: e.violationCount,
        };
    }
    // Top-violation list: sort by count desc, tie-break alphabetically by
    // code so rendering is byte-stable.
    const top_violations = [];
    for (const [code, evidence] of violationAcc) {
        top_violations.push({
            code,
            count: evidence.length, // bounded; raw count below for accuracy
            evidence,
        });
    }
    // Recount: `evidence.length` is capped at MAX_VIOLATION_EVIDENCE; we want
    // the true count for sorting/reporting. Re-derive from violationAcc by
    // scanning checks again — cheap.
    const trueCounts = new Map();
    for (const c of checks) {
        if (bucketWorkflowStatus(c.status) === null)
            continue;
        for (const v of c.violations) {
            trueCounts.set(v.code, (trueCounts.get(v.code) ?? 0) + 1);
        }
    }
    for (const tv of top_violations) {
        tv.count = trueCounts.get(tv.code) ?? tv.count;
    }
    top_violations.sort((a, b) => {
        if (b.count !== a.count)
            return b.count - a.count;
        return a.code.localeCompare(b.code);
    });
    const trimmedViolations = top_violations.slice(0, MAX_TOP_VIOLATIONS);
    return {
        total_checks: checks.length,
        applicable_checks: applicable,
        overall_compliance: scoreSum / applicable,
        strict_pass_rate: strict / applicable,
        partial_pass_rate: partial / applicable,
        fail_rate: fail / applicable,
        violation_count: violationCount,
        by_workflow,
        top_violations: trimmedViolations,
        cross_tab,
        reliability,
    };
}
1250
/**
 * Severity rank for the cross-tab "WORST workflow outcome per run"
 * reduction: fail (2) > partial (1) > pass (0). Any other value ranks 0.
 *
 * @param {"pass"|"partial"|"fail"} b - Workflow outcome bucket.
 * @returns {number} Numeric severity, higher = worse.
 */
function severityRank(b) {
    switch (b) {
        case "fail":
            return 2;
        case "partial":
            return 1;
        default:
            return 0;
    }
}
1261
/**
 * Recover the task-level outcome that produced a check, when available.
 * The check shape does not carry it directly; the runner stashes it on a
 * non-public side-channel field. Returns `undefined` when no task outcome
 * was attached (older callers, hand-written tests) or when the attached
 * value is not a string.
 *
 * @param {object} c - A workflow check result.
 * @returns {string|undefined}
 */
function readCheckTaskOutcome(c) {
    const { taskOutcome } = c;
    if (typeof taskOutcome !== "string")
        return undefined;
    return taskOutcome;
}
1270
/**
 * Render the §257 `## Workflow compliance` markdown section. Returns "" when
 * there are no checks so the report stays compact for runs without
 * applicable workflow specs.
 *
 * @param {{workflowChecks?: Array}} input - Report input; `workflowChecks`
 *   defaults to the empty list when absent.
 * @returns {string} The section markdown (possibly empty).
 */
export function renderWorkflowComplianceSection(input) {
    const checks = input.workflowChecks ?? [];
    const agg = buildWorkflowAggregate(checks);
    if (agg.total_checks === 0)
        return "";
    const lines = [];
    lines.push("## Workflow compliance");
    lines.push("");
    if (agg.applicable_checks === 0) {
        // All checks were `not_applicable`: emit a stub plus the spec list so
        // operators can see which specs loaded but never matched.
        lines.push("_No workflow specs applied to this corpus._");
        if (Object.keys(agg.by_workflow).length > 0) {
            lines.push("");
            lines.push(`Loaded specs (none matched the run): ${Object.keys(agg.by_workflow).sort().join(", ")}`);
        }
        return lines.join("\n");
    }
    // Headline rates on one line so grep/snapshot diffs stay small.
    lines.push(`overall_compliance=${agg.overall_compliance.toFixed(2)}, ` +
        `strict_pass_rate=${agg.strict_pass_rate.toFixed(2)}, ` +
        `partial_pass_rate=${agg.partial_pass_rate.toFixed(2)}, ` +
        `fail_rate=${agg.fail_rate.toFixed(2)}, ` +
        `violations=${agg.violation_count}`);
    lines.push("");
    lines.push("### By workflow");
    lines.push("");
    lines.push("| workflow_id | applicable | score | pass | partial | fail | violations |");
    lines.push("|-------------|-----------:|------:|-----:|--------:|-----:|-----------:|");
    // Alphabetical spec order keeps the table byte-stable.
    const sortedSpecs = Object.values(agg.by_workflow).sort((a, b) => a.workflow_id.localeCompare(b.workflow_id));
    for (const spec of sortedSpecs) {
        lines.push(`| ${spec.workflow_id} | ${spec.count} | ${spec.score.toFixed(2)} | ${spec.pass_rate.toFixed(2)} | ${spec.partial_rate.toFixed(2)} | ${spec.fail_rate.toFixed(2)} | ${spec.violation_count} |`);
    }
    if (agg.top_violations.length > 0) {
        lines.push("");
        lines.push("### Top violations");
        lines.push("");
        lines.push("| code | count |");
        lines.push("|------|------:|");
        for (const tv of agg.top_violations) {
            lines.push(`| ${tv.code} | ${tv.count} |`);
        }
        // Surface the first evidence pointer per top-violation so operators can
        // jump to a concrete (task, seed) without parsing the JSON envelope.
        lines.push("");
        lines.push("### Violation evidence");
        lines.push("");
        lines.push("| code | task | seed | workflow | observed |");
        lines.push("|------|------|-----:|----------|----------|");
        for (const tv of agg.top_violations) {
            for (const ev of tv.evidence) {
                const observed = ev.observed ?? ev.message ?? "";
                lines.push(`| ${tv.code} | ${ev.task_id} | ${ev.seed} | ${ev.workflow_id} | ${truncateCell(observed)} |`);
            }
        }
    }
    if (agg.cross_tab.length > 0) {
        lines.push("");
        lines.push("### Task outcome × workflow outcome");
        lines.push("");
        lines.push("| task_outcome | wf_pass | wf_partial | wf_fail | total |");
        lines.push("|--------------|--------:|-----------:|--------:|------:|");
        for (const row of agg.cross_tab) {
            lines.push(`| ${row.task_outcome} | ${row.pass} | ${row.partial} | ${row.fail} | ${row.total} |`);
        }
    }
    // #258: Reliability sub-section. Skip when no group contributed (all
    // checks were `not_applicable` or input was empty).
    const reliability = agg.reliability;
    if (reliability.corpus.groups > 0) {
        lines.push("");
        lines.push("### Reliability (pass@k / pass^k)");
        lines.push("");
        lines.push(`corpus pass@k=${reliability.corpus.pass_at_k.toFixed(2)}, ` +
            `pass^k=${reliability.corpus.pass_all_k.toFixed(2)} ` +
            `(over ${reliability.corpus.groups} workflow×task groups, ${reliability.corpus.tasks} distinct tasks)`);
        lines.push("");
        lines.push("| workflow_id | tasks | k | pass@k | pass^k |");
        lines.push("|-------------|------:|--:|-------:|-------:|");
        const sortedReliability = Object.values(reliability.by_workflow).sort((a, b) => a.workflow_id.localeCompare(b.workflow_id));
        for (const row of sortedReliability) {
            lines.push(`| ${row.workflow_id} | ${row.tasks} | ${row.k} | ${row.pass_at_k.toFixed(2)} | ${row.pass_all_k.toFixed(2)} |`);
        }
        // Inconsistency callout: workflows where the agent CAN comply
        // (pass@k high) but does not RELIABLY comply (pass^k materially lower).
        // Threshold: pass@k ≥ 0.5 AND (pass@k − pass^k) ≥ 0.25.
        const INCONSISTENCY_GAP = 0.25;
        const PASS_AT_K_FLOOR = 0.5;
        const inconsistent = sortedReliability.filter((r) => r.pass_at_k >= PASS_AT_K_FLOOR && r.pass_at_k - r.pass_all_k >= INCONSISTENCY_GAP);
        if (inconsistent.length > 0) {
            lines.push("");
            lines.push("**Inconsistent workflows** (high pass@k but low pass^k — agent can comply but does not reliably):");
            lines.push("");
            for (const row of inconsistent) {
                lines.push(`- \`${row.workflow_id}\`: pass@k=${row.pass_at_k.toFixed(2)} vs pass^k=${row.pass_all_k.toFixed(2)} (gap ${(row.pass_at_k - row.pass_all_k).toFixed(2)})`);
            }
        }
    }
    return lines.join("\n");
}
1372
/**
 * Trim a single cell so the markdown table stays scannable. We keep the
 * head 80 chars and append `…` when clamped. Pipes are escaped so the
 * cell cannot break the table row.
 *
 * @param {*} s - Cell content. Callers pass `ev.observed ?? ev.message ?? ""`
 *   where `observed` comes straight from a violation payload and is NOT
 *   guaranteed to be a string — coerce defensively instead of crashing on
 *   `.length`/`.slice`.
 * @returns {string} Escaped, possibly-truncated cell text.
 */
function truncateCell(s) {
    const str = typeof s === "string" ? s : String(s);
    if (str.length <= 80)
        return str.replace(/\|/g, "\\|");
    return `${str.slice(0, 80).replace(/\|/g, "\\|")}…`;
}
1381
// ── Negative-transfer + domain diagnostics markdown (#260) ─────────────────
/**
 * Render the §260 negative-transfer section. Stays quiet when no
 * regressions exist — emits a single `## Negative transfer\n\nnone` block so
 * the report remains scannable for green corpora. When regressions exist,
 * renders headline counts, the top-regressed-task table, the per-domain
 * delta table, and the asset-regression-candidate table.
 *
 * @param {{tasks: Array, akmRuns?: Array}} input - Task-level results plus
 *   the optional akm-arm runs used for asset-regression attribution.
 * @returns {string} The section markdown.
 */
export function renderNegativeTransferSection(input) {
    const negativeTransfer = computeNegativeTransfer(input.tasks);
    const lines = ["## Negative transfer", ""];
    if (negativeTransfer.count === 0) {
        lines.push("none");
        return lines.join("\n");
    }
    lines.push(`count=${negativeTransfer.count}, severity=${negativeTransfer.severity.toFixed(2)} (sum of noakm − akm pass rate over regressed tasks)`);
    lines.push("");
    lines.push("### Top regressed tasks");
    lines.push("");
    lines.push("| task | domain | noakm | akm | delta |");
    lines.push("|------|--------|-------|-----|-------|");
    for (const row of negativeTransfer.topRegressedTasks) {
        lines.push(`| ${row.taskId} | ${row.domain} | ${row.noakmPassRate.toFixed(2)} | ${row.akmPassRate.toFixed(2)} | ${signed(row.delta.toFixed(2))} |`);
    }
    const domainRows = computeDomainAggregates(input.tasks);
    if (domainRows.length > 0) {
        lines.push("");
        lines.push("### Domain-level deltas");
        lines.push("");
        lines.push("| domain | tasks | regressions | noakm pass | akm pass | delta | tokens delta | wallclock delta (ms) |");
        lines.push("|--------|-------|-------------|------------|----------|-------|--------------|----------------------|");
        for (const row of domainRows) {
            // tokensPerPassDelta is null when a side had zero passes — render
            // "n/a" rather than a meaningless signed zero.
            const tppDelta = row.tokensPerPassDelta === null ? "n/a" : signed(row.tokensPerPassDelta.toFixed(0));
            lines.push(`| ${row.domain} | ${row.taskCount} | ${row.regressionCount} | ${row.passRateNoakm.toFixed(2)} | ${row.passRateAkm.toFixed(2)} | ${signed(row.passRateDelta.toFixed(2))} | ${tppDelta} | ${signed(row.wallclockMsDelta.toFixed(0))} |`);
        }
    }
    // Attribute regressions to the assets loaded on the regressed tasks.
    const candidates = computeAssetRegressionCandidates(negativeTransfer.topRegressedTasks.map((r) => r.taskId), input.akmRuns ?? []);
    if (candidates.length > 0) {
        lines.push("");
        lines.push("### Asset regression candidates");
        lines.push("");
        lines.push("| asset_ref | regressed tasks | total loads |");
        lines.push("|-----------|-----------------|-------------|");
        for (const row of candidates) {
            lines.push(`| \`${row.assetRef}\` | ${row.regressedTaskCount} | ${row.totalLoadCount} |`);
        }
    }
    return lines.join("\n");
}
1430
// ── Corpus-coverage markdown (#262) ────────────────────────────────────────
/**
 * Render the §13.3 corpus_coverage markdown section (#262). Returns "" when
 * no task carries a `memory_ability` tag — at that point the section adds
 * no signal and only churns markdown snapshots.
 *
 * Sections rendered:
 * - Coverage counts per memory-ability label (closed set + `untagged`).
 * - Per-memory-ability pass-rate / akm − noakm delta / negative-transfer
 *   counts, plus workflow compliance when at least one task supplied it.
 * - A compact `## Task families` rollup when ≥ 2 families are tagged.
 *
 * @param {object} input - Forwarded verbatim to `buildCorpusCoverageBlock`.
 * @returns {string} The section markdown (possibly empty).
 */
export function renderCorpusCoverageSection(input) {
    const block = buildCorpusCoverageBlock(input);
    const taggedAbility = Object.entries(block.coverage.memoryAbilityCounts).some(([k, v]) => k !== "untagged" && v > 0);
    if (!taggedAbility)
        return "";
    const lines = [];
    lines.push("## Corpus coverage");
    lines.push("");
    lines.push("| memory_ability | tasks |");
    lines.push("|----------------|-------|");
    // Sort keys: known abilities alphabetically, `untagged` last.
    const counts = block.coverage.memoryAbilityCounts;
    const knownKeys = Object.keys(counts)
        .filter((k) => k !== "untagged")
        .sort();
    for (const k of knownKeys)
        lines.push(`| ${k} | ${counts[k]} |`);
    if ((counts.untagged ?? 0) > 0)
        lines.push(`| untagged | ${counts.untagged} |`);
    if (block.by_memory_ability.length > 0) {
        lines.push("");
        lines.push("### By memory_ability");
        lines.push("");
        // The compliance column only appears when at least one row carries a
        // non-null workflow_compliance — keeps green corpora compact.
        const anyCompliance = block.by_memory_ability.some((r) => r.workflow_compliance !== null);
        if (anyCompliance) {
            lines.push("| memory_ability | tasks | noakm | akm | delta | neg.transfer | workflow_compliance |");
            lines.push("|----------------|-------|-------|-----|-------|--------------|---------------------|");
        }
        else {
            lines.push("| memory_ability | tasks | noakm | akm | delta | neg.transfer |");
            lines.push("|----------------|-------|-------|-----|-------|--------------|");
        }
        for (const row of block.by_memory_ability) {
            const base = `| ${row.category} | ${row.task_count} | ${row.pass_rate_noakm.toFixed(2)} | ${row.pass_rate_akm.toFixed(2)} | ${signed(row.pass_rate_delta.toFixed(2))} | ${row.negative_transfer_count} |`;
            if (anyCompliance) {
                const wc = row.workflow_compliance === null ? "n/a" : row.workflow_compliance.toFixed(2);
                lines.push(`${base} ${wc} |`);
            }
            else {
                lines.push(base);
            }
        }
    }
    const families = block.by_task_family;
    if (families.length >= 2) {
        lines.push("");
        lines.push("### By task_family");
        lines.push("");
        lines.push("| task_family | tasks | noakm | akm | delta |");
        lines.push("|-------------|-------|-------|-----|-------|");
        for (const row of families) {
            lines.push(`| ${row.category} | ${row.task_count} | ${row.pass_rate_noakm.toFixed(2)} | ${row.pass_rate_akm.toFixed(2)} | ${signed(row.pass_rate_delta.toFixed(2))} |`);
        }
    }
    return lines.join("\n");
}
1498
// ── Git helpers ────────────────────────────────────────────────────────────
/**
 * Resolve the current branch name via `git rev-parse --abbrev-ref HEAD`.
 * Falls back to `"unknown"` when git is unavailable or the cwd is not a
 * repo. Tests inject `cwd` pointing at a tmp non-repo to exercise the
 * fallback.
 *
 * @param {string} [cwd] - Directory to run git in; defaults inside `tryGit`.
 * @returns {string} Branch name, or "unknown".
 */
export function resolveGitBranch(cwd) {
    const revParseArgs = ["rev-parse", "--abbrev-ref", "HEAD"];
    return tryGit(revParseArgs, cwd);
}
1507
/**
 * Resolve the short commit hash via `git rev-parse --short HEAD`. Same
 * `"unknown"` fallback rules as `resolveGitBranch`.
 *
 * @param {string} [cwd] - Directory to run git in; defaults inside `tryGit`.
 * @returns {string} Short commit hash, or "unknown".
 */
export function resolveGitCommit(cwd) {
    const revParseArgs = ["rev-parse", "--short", "HEAD"];
    return tryGit(revParseArgs, cwd);
}
1514
/**
 * Run `git <args...>` synchronously and return its trimmed stdout.
 * Any failure (git missing, not a repo, non-zero exit) — and empty
 * output — collapses to the sentinel string "unknown". stderr is
 * discarded so callers never see git noise.
 *
 * @param {string[]} args - Fixed, internally-supplied git arguments.
 * @param {string} [cwd] - Working directory; defaults to process.cwd().
 * @returns {string} Trimmed stdout, or "unknown".
 */
function tryGit(args, cwd) {
    const command = `git ${args.join(" ")}`;
    try {
        const stdout = execSync(command, {
            cwd: cwd ?? process.cwd(),
            stdio: ["ignore", "pipe", "ignore"],
            encoding: "utf8",
        });
        const trimmed = stdout.trim();
        return trimmed === "" ? "unknown" : trimmed;
    }
    catch {
        return "unknown";
    }
}
1527
/**
 * Threshold below which the markdown summary prepends a warning marker
 * and the JSON envelope's `warnings[]` carries a structured
 * `feedback_agreement_below_threshold` entry. Track B's headline numbers
 * (`improvement_slope`, `over_synthetic_lift`) are unreliable when
 * Phase 1 feedback disagrees with run outcomes more than 20% of the
 * time. Spec §6.8. Compared with `<` in `buildEvolveJson`, so an
 * agreement of exactly 0.8 does NOT trigger the warning.
 */
export const FEEDBACK_AGREEMENT_WARNING_THRESHOLD = 0.8;
1536
/**
 * Render an evolve run as the §6.3+§6.4 JSON envelope plus a markdown
 * summary. Mirrors `renderUtilityReport` — caller wires stdout/stderr.
 *
 * @param {object} input - The evolve run input envelope.
 * @returns {{json: object, markdown: string}}
 */
export function renderEvolveReport(input) {
    return {
        json: buildEvolveJson(input),
        markdown: buildEvolveMarkdown(input),
    };
}
1545
/**
 * Build the §6.3+§6.4 evolve JSON envelope from the run input. Optional
 * blocks (`lessons`, `learning`, `searchBridge`, `feedback_integrity`) are
 * only attached when their inputs are present, so the envelope's key set
 * stays minimal for runs that did not produce them.
 *
 * @param {object} input - The evolve run input envelope.
 * @returns {object} The serialisable JSON envelope.
 */
function buildEvolveJson(input) {
    // For each arm we re-render the §13.3 utility envelope so downstream
    // consumers can treat each arm exactly like a `bench utility` artefact.
    const armEnvelope = (r) => buildUtilityJson(r);
    // §6.8 — derive an additive `warnings[]` entry when the headline
    // feedback_agreement falls below the trust threshold. Copy first so the
    // caller's `input.warnings` array is never mutated.
    const augmentedWarnings = [...input.warnings];
    if (input.feedbackIntegrity) {
        const agreement = input.feedbackIntegrity.aggregate.feedback_agreement;
        if (agreement < FEEDBACK_AGREEMENT_WARNING_THRESHOLD) {
            augmentedWarnings.push(`feedback_agreement_below_threshold: ${agreement.toFixed(2)} < ${FEEDBACK_AGREEMENT_WARNING_THRESHOLD.toFixed(2)} — Track B headline numbers (improvement_slope, over_synthetic_lift) may be unreliable until AGENTS.md guidance for \`akm feedback\` is tightened.`);
        }
    }
    return {
        schemaVersion: 1,
        track: "evolve",
        branch: input.branch,
        commit: input.commit,
        timestamp: input.timestamp,
        agent: { harness: "opencode", model: input.model },
        corpus: {
            domain: input.domain,
            seedsPerArm: input.seedsPerArm,
        },
        proposals: {
            total_proposals: input.proposals.totalProposals,
            total_accepted: input.proposals.totalAccepted,
            acceptance_rate: input.proposals.acceptanceRate,
            lint_pass_rate: input.proposals.lintPassRate,
            rows: input.proposals.rows.map((r) => ({
                asset_ref: r.assetRef,
                proposal_count: r.proposalCount,
                lint_pass_count: r.lintPassCount,
                accepted_count: r.acceptedCount,
            })),
        },
        ...(input.lessons ? { lessons: serialiseLessons(input.lessons) } : {}),
        longitudinal: {
            improvement_slope: input.longitudinal.improvementSlope,
            over_synthetic_lift: input.longitudinal.overSyntheticLift,
            degradation_count: input.longitudinal.degradationCount,
            pre_pass_rate: input.longitudinal.prePassRate,
            post_pass_rate: input.longitudinal.postPassRate,
            synthetic_pass_rate: input.longitudinal.syntheticPassRate,
            degradations: input.longitudinal.degradations.map((d) => ({
                task_id: d.taskId,
                pre_pass_rate: d.prePassRate,
                post_pass_rate: d.postPassRate,
                delta: d.delta,
                failure_mode: d.failureMode,
            })),
        },
        ...(input.learningCurve ? { learning: serialiseLearningCurve(input.learningCurve) } : {}),
        arms: {
            pre: armEnvelope(input.arms.pre),
            post: armEnvelope(input.arms.post),
            synthetic: armEnvelope(input.arms.synthetic),
        },
        // Per-asset attribution comes from the POST arm only; a missing block
        // degrades to the zero-filled shape so consumers see stable keys.
        perAsset: input.arms.post.perAsset
            ? {
                total_akm_runs: input.arms.post.perAsset.totalAkmRuns,
                rows: input.arms.post.perAsset.rows.map((r) => ({
                    asset_ref: r.assetRef,
                    load_count: r.loadCount,
                    load_count_passing: r.loadCountPassing,
                    load_count_failing: r.loadCountFailing,
                    load_pass_rate: r.loadPassRate,
                })),
            }
            : { total_akm_runs: 0, rows: [] },
        failure_modes: {
            by_label: input.arms.post.failureModes.byLabel,
            by_task: input.arms.post.failureModes.byTask,
        },
        ...(input.arms.post.searchBridge ? { searchBridge: serialiseSearchBridge(input.arms.post.searchBridge) } : {}),
        ...(input.feedbackIntegrity ? { feedback_integrity: serialiseFeedbackIntegrity(input.feedbackIntegrity) } : {}),
        warnings: augmentedWarnings,
    };
}
1624
/**
 * #264 — flatten the LessonMetrics envelope into JSON. Aggregate counters
 * sit alongside `lessons[]` so consumers can pick the headline numbers off
 * without walking every row. Only the documented fields are copied; any
 * extra properties on input rows are dropped.
 *
 * @param {object} metrics - The LessonMetrics envelope.
 * @returns {object} The serialisable lessons block.
 */
function serialiseLessons(metrics) {
    const rows = metrics.lessons.map((lesson) => ({
        ref: lesson.ref,
        source_failures: lesson.source_failures,
        lint_pass: lesson.lint_pass,
        accepted: lesson.accepted,
        first_reused_on: lesson.first_reused_on,
        reuse_count: lesson.reuse_count,
        reuse_pass_rate: lesson.reuse_pass_rate,
        negative_transfer_count: lesson.negative_transfer_count,
        leakage_risk: lesson.leakage_risk,
    }));
    return {
        lessons_created_count: metrics.lessons_created_count,
        lessons_accepted_count: metrics.lessons_accepted_count,
        proposal_lint_pass_rate: metrics.proposal_lint_pass_rate,
        proposal_acceptance_rate: metrics.proposal_acceptance_rate,
        lesson_reuse_rate: metrics.lesson_reuse_rate,
        lesson_reuse_success_rate: metrics.lesson_reuse_success_rate,
        lesson_negative_transfer_count: metrics.lesson_negative_transfer_count,
        lessons: rows,
    };
}
1651
/**
 * §6.4 (issue #265) — flatten a `LearningCurve` into its JSON envelope.
 * Mirrors the suggested shape from the issue body: an `episodes[]` block
 * with per-episode rows, plus the headline `learning_slope` and
 * `time_to_improvement`. `pass_rate_by_episode` is exposed as a flat COPY
 * of the input array so tools can plot without re-projecting the rows
 * (and without aliasing the caller's array).
 *
 * @param {object} curve - The LearningCurve envelope.
 * @returns {object} The serialisable learning block.
 */
function serialiseLearningCurve(curve) {
    const episodes = curve.episodes.map((ep) => ({
        episode_index: ep.episode_index,
        pass_rate: ep.pass_rate,
        delta_from_previous_episode: ep.delta_from_previous_episode,
        cumulative_feedback_events: ep.cumulative_feedback_events,
        cumulative_proposals_created: ep.cumulative_proposals_created,
        cumulative_proposals_accepted: ep.cumulative_proposals_accepted,
        cumulative_lessons_created: ep.cumulative_lessons_created,
        lesson_reuse_rate: ep.lesson_reuse_rate,
    }));
    return {
        episodes,
        pass_rate_by_episode: [...curve.pass_rate_by_episode],
        learning_slope: curve.learning_slope,
        time_to_improvement: curve.time_to_improvement,
    };
}
1675
/**
 * §6.4 (issue #265) — render a compact "Learning curve" markdown table.
 * One row per episode plus the headline slope + time-to-improvement.
 *
 * @param {object} curve - The LearningCurve envelope.
 * @returns {string} The section markdown.
 */
export function renderLearningCurveSection(curve) {
    const tti = curve.time_to_improvement === null ? "n/a" : String(curve.time_to_improvement);
    const lines = [
        "## Learning curve",
        "",
        `learning_slope=${signedFixed(curve.learning_slope, 3)}, time_to_improvement=${tti}`,
        "",
    ];
    if (curve.episodes.length === 0) {
        lines.push("_No episodes recorded._");
        return lines.join("\n");
    }
    lines.push("| episode | pass_rate | Δ prev | feedback | proposals | accepted | lessons | reuse |");
    lines.push("|--------:|----------:|-------:|---------:|----------:|---------:|--------:|------:|");
    for (const ep of curve.episodes) {
        const reuse = ep.lesson_reuse_rate === null ? "n/a" : ep.lesson_reuse_rate.toFixed(2);
        lines.push(`| ${ep.episode_index} | ${ep.pass_rate.toFixed(2)} | ${signedFixed(ep.delta_from_previous_episode, 2)} | ${ep.cumulative_feedback_events} | ${ep.cumulative_proposals_created} | ${ep.cumulative_proposals_accepted} | ${ep.cumulative_lessons_created} | ${reuse} |`);
    }
    return lines.join("\n");
}
1696
/** §6.8 — flatten the FeedbackIntegrityMetrics envelope into JSON. */
function serialiseFeedbackIntegrity(metrics) {
    const agg = metrics.aggregate;
    // Explicit field picks (rather than spreads) so only the documented
    // counters and rates make it into the serialised payload.
    return {
        aggregate: {
            truePositive: agg.truePositive,
            falsePositive: agg.falsePositive,
            trueNegative: agg.trueNegative,
            falseNegative: agg.falseNegative,
            feedback_agreement: agg.feedback_agreement,
            false_positive_rate: agg.false_positive_rate,
            false_negative_rate: agg.false_negative_rate,
            feedback_coverage: agg.feedback_coverage,
        },
        perAsset: metrics.perAsset.map((row) => {
            const {
                ref,
                truePositive,
                falsePositive,
                trueNegative,
                falseNegative,
                feedback_agreement,
                false_positive_rate,
                false_negative_rate,
            } = row;
            return {
                ref,
                truePositive,
                falsePositive,
                trueNegative,
                falseNegative,
                feedback_agreement,
                false_positive_rate,
                false_negative_rate,
            };
        }),
    };
}
/**
 * Render the #264 lessons block — aggregate counters followed by one row
 * per lesson. Exported for tests so the rendered shape can be asserted
 * directly without going through `renderEvolveReport`.
 */
export function renderLessonsTable(metrics) {
    const summary = `created=${metrics.lessons_created_count}, accepted=${metrics.lessons_accepted_count}, reuse_rate=${metrics.lesson_reuse_rate.toFixed(2)}, reuse_success_rate=${metrics.lesson_reuse_success_rate.toFixed(2)}, negative_transfer=${metrics.lesson_negative_transfer_count}`;
    const out = ["## Lessons", "", summary, ""];
    // Placeholder body when no lessons were produced this run.
    if (metrics.lessons.length === 0) {
        return [...out, "_No lessons generated._"].join("\n");
    }
    out.push("| ref | accepted | lint | reuse | reuse_pass | first_reused_on | neg_transfer | leakage |");
    out.push("|-----|----------|------|-------|------------|-----------------|--------------|---------|");
    for (const lesson of metrics.lessons) {
        const accepted = lesson.accepted ? "yes" : "no";
        const lint = lesson.lint_pass ? "pass" : "fail";
        const firstReused = lesson.first_reused_on ?? "n/a";
        out.push(`| \`${lesson.ref}\` | ${accepted} | ${lint} | ${lesson.reuse_count} | ${lesson.reuse_pass_rate.toFixed(2)} | ${firstReused} | ${lesson.negative_transfer_count} | ${lesson.leakage_risk} |`);
    }
    return out.join("\n");
}
/**
 * Render the §6.8 confusion-matrix table — aggregate 2×2 followed by
 * per-asset breakdown. Used by `renderEvolveReport`'s markdown body and
 * exported for tests.
 */
export function renderFeedbackIntegrityTable(metrics) {
    const agg = metrics.aggregate;
    // Fixed preamble: 2×2 confusion matrix plus the aggregate rate table.
    const out = [
        "## Feedback-signal integrity",
        "",
        "| | run passed | run failed |",
        "|--------------|-----------:|-----------:|",
        `| feedback + | ${agg.truePositive} (TP) | ${agg.falsePositive} (FP) |`,
        `| feedback - | ${agg.falseNegative} (FN) | ${agg.trueNegative} (TN) |`,
        "",
        "| metric | value |",
        "|--------|-------|",
        `| feedback_agreement | ${agg.feedback_agreement.toFixed(2)} |`,
        `| false_positive_rate | ${agg.false_positive_rate.toFixed(2)} |`,
        `| false_negative_rate | ${agg.false_negative_rate.toFixed(2)} |`,
        `| feedback_coverage | ${agg.feedback_coverage.toFixed(2)} |`,
        "",
    ];
    if (metrics.perAsset.length === 0) {
        out.push("_No feedback events recorded._");
    }
    else {
        out.push("| ref | TP | FP | TN | FN | agreement | FP rate | FN rate |");
        out.push("|-----|----|----|----|----|-----------|---------|---------|");
        for (const row of metrics.perAsset) {
            out.push(`| \`${row.ref}\` | ${row.truePositive} | ${row.falsePositive} | ${row.trueNegative} | ${row.falseNegative} | ${formatNullableRate(row.feedback_agreement)} | ${formatNullableRate(row.false_positive_rate)} | ${formatNullableRate(row.false_negative_rate)} |`);
        }
    }
    return out.join("\n");
}
/** Format a nullable rate as a fixed 2-dp string, or "n/a" when null. */
function formatNullableRate(value) {
    return value === null ? "n/a" : value.toFixed(2);
}
/**
 * Assemble the full `akm-bench evolve` markdown report body.
 *
 * Sections, in order: title/provenance header, optional §6.8 low-agreement
 * warning banner, headline metrics, longitudinal table (+ degradations),
 * proposals, optional lessons block (#264), per-task pre/post/synthetic
 * comparison, optional feedback-integrity (§6.8) and learning-curve (§6.4)
 * sections, and a trailing warnings list.
 *
 * @param input report payload — model/branch/commit/timestamp/domain/
 *   seedsPerArm plus `longitudinal`, `proposals`, `arms`, `warnings`, and
 *   the optional `feedbackIntegrity`, `lessons`, `learningCurve` envelopes.
 * @returns the complete report as one newline-joined markdown string.
 */
function buildEvolveMarkdown(input) {
    const lines = [];
    // Title + provenance header.
    lines.push(`# akm-bench evolve — ${input.model}`);
    lines.push("");
    lines.push(`branch \`${input.branch}\` @ \`${input.commit}\` — ${input.timestamp}`);
    lines.push(`corpus: domain=\`${input.domain}\`, seedsPerArm=${input.seedsPerArm}`);
    lines.push("");
    // §6.8 warning marker — prepended above the headline so operators can't
    // miss it. We also still surface the structured warning in `warnings[]`.
    if (input.feedbackIntegrity &&
        input.feedbackIntegrity.aggregate.feedback_agreement < FEEDBACK_AGREEMENT_WARNING_THRESHOLD) {
        lines.push(`:warning: feedback_agreement = ${input.feedbackIntegrity.aggregate.feedback_agreement.toFixed(2)} — Track B headline numbers (improvement_slope, over_synthetic_lift) may be unreliable until AGENTS.md guidance for \`akm feedback\` is tightened.`);
        lines.push("");
    }
    // Headline: improvement_slope.
    lines.push(`**improvement_slope: ${signedFixed(input.longitudinal.improvementSlope, 2)}** (post=${input.longitudinal.postPassRate.toFixed(2)}, pre=${input.longitudinal.prePassRate.toFixed(2)})`);
    // Second line: real feedback_agreement (per #244), or placeholder when
    // metrics not supplied.
    if (input.feedbackIntegrity) {
        lines.push(`**feedback_agreement: ${input.feedbackIntegrity.aggregate.feedback_agreement.toFixed(2)}** (coverage=${input.feedbackIntegrity.aggregate.feedback_coverage.toFixed(2)})`);
    }
    else {
        lines.push("_feedback_agreement: pending (#244)_");
    }
    lines.push("");
    // Longitudinal metric table.
    lines.push("## Longitudinal");
    lines.push("");
    lines.push("| metric | value |");
    lines.push("|--------|-------|");
    lines.push(`| improvement_slope | ${signedFixed(input.longitudinal.improvementSlope, 2)} |`);
    lines.push(`| over_synthetic_lift | ${signedFixed(input.longitudinal.overSyntheticLift, 2)} |`);
    lines.push(`| degradation_count | ${input.longitudinal.degradationCount} |`);
    lines.push(`| pre_pass_rate | ${input.longitudinal.prePassRate.toFixed(2)} |`);
    lines.push(`| post_pass_rate | ${input.longitudinal.postPassRate.toFixed(2)} |`);
    lines.push(`| synthetic_pass_rate | ${input.longitudinal.syntheticPassRate.toFixed(2)} |`);
    lines.push("");
    // Per-task regressions, rendered only when any were detected.
    if (input.longitudinal.degradations.length > 0) {
        lines.push("### Degradations");
        lines.push("");
        lines.push("| task | pre | post | delta | failure_mode |");
        lines.push("|------|-----|------|-------|--------------|");
        for (const d of input.longitudinal.degradations) {
            lines.push(`| ${d.taskId} | ${d.prePassRate.toFixed(2)} | ${d.postPassRate.toFixed(2)} | ${signedFixed(d.delta, 2)} | ${d.failureMode ?? "n/a"} |`);
        }
        lines.push("");
    }
    // Proposal acceptance summary plus optional per-asset breakdown.
    lines.push("## Proposals");
    lines.push("");
    lines.push(`acceptance_rate=${input.proposals.acceptanceRate.toFixed(2)}, lint_pass_rate=${input.proposals.lintPassRate.toFixed(2)}, total=${input.proposals.totalProposals}`);
    lines.push("");
    if (input.proposals.rows.length > 0) {
        lines.push("| asset_ref | proposals | lint_pass | accepted |");
        lines.push("|-----------|-----------|-----------|----------|");
        for (const row of input.proposals.rows) {
            lines.push(`| \`${row.assetRef}\` | ${row.proposalCount} | ${row.lintPassCount} | ${row.acceptedCount} |`);
        }
        lines.push("");
    }
    else {
        lines.push("_No proposals generated._");
        lines.push("");
    }
    // Optional #264 lessons block, delegated to its own renderer.
    if (input.lessons) {
        lines.push(renderLessonsTable(input.lessons));
        lines.push("");
    }
    // Per-task comparison across the three arms. Tasks are keyed by id;
    // the union of ids is rendered in sorted order, with "n/a" filling in
    // for any arm that did not run a given task.
    lines.push("## Per-task pre → post → synthetic");
    lines.push("");
    lines.push("| task | pre | post | synthetic | post − pre |");
    lines.push("|------|-----|------|-----------|------------|");
    const preTasks = new Map();
    for (const t of input.arms.pre.tasks)
        preTasks.set(t.id, t);
    const postTasks = new Map();
    for (const t of input.arms.post.tasks)
        postTasks.set(t.id, t);
    const synthTasks = new Map();
    for (const t of input.arms.synthetic.tasks)
        synthTasks.set(t.id, t);
    const allIds = new Set([...preTasks.keys(), ...postTasks.keys(), ...synthTasks.keys()]);
    for (const id of [...allIds].sort()) {
        const pre = preTasks.get(id)?.akm.passRate;
        const post = postTasks.get(id)?.akm.passRate;
        const synth = synthTasks.get(id)?.akm.passRate;
        // Delta is only meaningful when the task ran in both pre and post.
        const delta = pre !== undefined && post !== undefined ? signedFixed(post - pre, 2) : "n/a";
        lines.push(`| ${id} | ${pre === undefined ? "n/a" : pre.toFixed(2)} | ${post === undefined ? "n/a" : post.toFixed(2)} | ${synth === undefined ? "n/a" : synth.toFixed(2)} | ${delta} |`);
    }
    // Optional §6.8 confusion-matrix and §6.4 learning-curve sections.
    if (input.feedbackIntegrity) {
        lines.push("");
        lines.push(renderFeedbackIntegrityTable(input.feedbackIntegrity));
    }
    if (input.learningCurve) {
        lines.push("");
        lines.push(renderLearningCurveSection(input.learningCurve));
    }
    // Trailing warnings list, one bullet per entry.
    if (input.warnings.length > 0) {
        lines.push("");
        lines.push("## Warnings");
        lines.push("");
        for (const w of input.warnings)
            lines.push(`- ${w}`);
    }
    return lines.join("\n");
}