akm-cli 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/dist/{src/cli.js → cli.js} +22 -8
  3. package/dist/{src/commands → commands}/installed-stashes.js +1 -1
  4. package/dist/{src/commands → commands}/source-add.js +1 -1
  5. package/dist/{src/core → core}/common.js +16 -1
  6. package/dist/{src/core → core}/config.js +5 -2
  7. package/dist/{src/indexer → indexer}/db-search.js +16 -1
  8. package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
  9. package/dist/{src/indexer → indexer}/indexer.js +27 -11
  10. package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
  11. package/dist/{src/indexer → indexer}/search-source.js +1 -1
  12. package/dist/{src/llm → llm}/client.js +61 -1
  13. package/dist/{src/llm → llm}/embedder.js +8 -5
  14. package/dist/{src/llm → llm}/embedders/local.js +8 -2
  15. package/dist/{src/llm → llm}/embedders/remote.js +4 -2
  16. package/dist/{src/llm → llm}/graph-extract.js +4 -4
  17. package/dist/llm/memory-infer.js +114 -0
  18. package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
  19. package/dist/{src/output → output}/cli-hints.js +2 -0
  20. package/dist/{src/setup → setup}/setup.js +30 -20
  21. package/dist/sources/providers/website.js +27 -0
  22. package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
  23. package/docs/README.md +7 -0
  24. package/docs/migration/release-notes/0.7.0.md +14 -0
  25. package/package.json +11 -8
  26. package/dist/src/llm/memory-infer.js +0 -86
  27. package/dist/tests/add-website-source.test.js +0 -119
  28. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  29. package/dist/tests/agent/agent-config.test.js +0 -221
  30. package/dist/tests/agent/agent-detect.test.js +0 -100
  31. package/dist/tests/agent/agent-spawn.test.js +0 -234
  32. package/dist/tests/agent-output.test.js +0 -186
  33. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  34. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  35. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  36. package/dist/tests/asset-ref.test.js +0 -192
  37. package/dist/tests/asset-registry.test.js +0 -103
  38. package/dist/tests/asset-spec.test.js +0 -241
  39. package/dist/tests/bench/attribution.test.js +0 -996
  40. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  41. package/dist/tests/bench/cleanup.js +0 -234
  42. package/dist/tests/bench/cleanup.test.js +0 -166
  43. package/dist/tests/bench/cli.js +0 -1018
  44. package/dist/tests/bench/cli.test.js +0 -445
  45. package/dist/tests/bench/compare.test.js +0 -556
  46. package/dist/tests/bench/corpus.js +0 -317
  47. package/dist/tests/bench/corpus.test.js +0 -258
  48. package/dist/tests/bench/doctor.js +0 -525
  49. package/dist/tests/bench/driver.js +0 -401
  50. package/dist/tests/bench/driver.test.js +0 -584
  51. package/dist/tests/bench/environment.js +0 -233
  52. package/dist/tests/bench/environment.test.js +0 -199
  53. package/dist/tests/bench/evolve-metrics.js +0 -179
  54. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  55. package/dist/tests/bench/evolve.js +0 -647
  56. package/dist/tests/bench/evolve.test.js +0 -624
  57. package/dist/tests/bench/failure-modes.test.js +0 -349
  58. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  59. package/dist/tests/bench/leakage.test.js +0 -228
  60. package/dist/tests/bench/learning-curve.test.js +0 -134
  61. package/dist/tests/bench/metrics.js +0 -2395
  62. package/dist/tests/bench/metrics.test.js +0 -1150
  63. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  64. package/dist/tests/bench/opencode-config.js +0 -194
  65. package/dist/tests/bench/opencode-config.test.js +0 -370
  66. package/dist/tests/bench/report.js +0 -1885
  67. package/dist/tests/bench/report.test.js +0 -1038
  68. package/dist/tests/bench/run-config.js +0 -355
  69. package/dist/tests/bench/run-config.test.js +0 -298
  70. package/dist/tests/bench/run-curate-test.js +0 -32
  71. package/dist/tests/bench/run-failing-tasks.js +0 -56
  72. package/dist/tests/bench/run-full-bench.js +0 -51
  73. package/dist/tests/bench/run-items36-targeted.js +0 -69
  74. package/dist/tests/bench/run-nano-quick.js +0 -42
  75. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  76. package/dist/tests/bench/runner.js +0 -699
  77. package/dist/tests/bench/runner.test.js +0 -958
  78. package/dist/tests/bench/search-bridge.test.js +0 -331
  79. package/dist/tests/bench/tmp.js +0 -131
  80. package/dist/tests/bench/trajectory.js +0 -116
  81. package/dist/tests/bench/trajectory.test.js +0 -127
  82. package/dist/tests/bench/verifier.js +0 -114
  83. package/dist/tests/bench/verifier.test.js +0 -118
  84. package/dist/tests/bench/workflow-evaluator.js +0 -557
  85. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  86. package/dist/tests/bench/workflow-spec.js +0 -345
  87. package/dist/tests/bench/workflow-spec.test.js +0 -363
  88. package/dist/tests/bench/workflow-trace.js +0 -472
  89. package/dist/tests/bench/workflow-trace.test.js +0 -254
  90. package/dist/tests/benchmark-search-quality.js +0 -536
  91. package/dist/tests/benchmark-suite.js +0 -1441
  92. package/dist/tests/capture-cli.test.js +0 -112
  93. package/dist/tests/cli-errors.test.js +0 -204
  94. package/dist/tests/commands/events.test.js +0 -370
  95. package/dist/tests/commands/history.test.js +0 -418
  96. package/dist/tests/commands/import.test.js +0 -103
  97. package/dist/tests/commands/proposal-cli.test.js +0 -209
  98. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  99. package/dist/tests/commands/remember.test.js +0 -97
  100. package/dist/tests/commands/scope-flags.test.js +0 -300
  101. package/dist/tests/commands/search.test.js +0 -537
  102. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  103. package/dist/tests/commands/show.test.js +0 -294
  104. package/dist/tests/common.test.js +0 -266
  105. package/dist/tests/completions.test.js +0 -142
  106. package/dist/tests/config-cli.test.js +0 -193
  107. package/dist/tests/config-llm-features.test.js +0 -139
  108. package/dist/tests/config.test.js +0 -569
  109. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  110. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  111. package/dist/tests/contracts/spec-helpers.js +0 -46
  112. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  113. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  114. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  115. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  116. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  117. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  118. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  119. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  120. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  121. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  122. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  123. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  124. package/dist/tests/core/write-source.test.js +0 -366
  125. package/dist/tests/curate-command.test.js +0 -87
  126. package/dist/tests/db-scoring.test.js +0 -201
  127. package/dist/tests/db.test.js +0 -654
  128. package/dist/tests/distill-cli-flag.test.js +0 -208
  129. package/dist/tests/distill.test.js +0 -515
  130. package/dist/tests/docker-install.test.js +0 -120
  131. package/dist/tests/e2e.test.js +0 -1419
  132. package/dist/tests/embedder.test.js +0 -340
  133. package/dist/tests/embedding-model-config.test.js +0 -379
  134. package/dist/tests/feedback-command.test.js +0 -172
  135. package/dist/tests/file-context.test.js +0 -552
  136. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  137. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  138. package/dist/tests/fixtures/stashes/load.js +0 -166
  139. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  140. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  141. package/dist/tests/frontmatter.test.js +0 -190
  142. package/dist/tests/fts-field-weighting.test.js +0 -254
  143. package/dist/tests/fuzzy-search.test.js +0 -230
  144. package/dist/tests/git-provider-clone.test.js +0 -45
  145. package/dist/tests/github.test.js +0 -161
  146. package/dist/tests/graph-boost-ranking.test.js +0 -305
  147. package/dist/tests/graph-extraction.test.js +0 -282
  148. package/dist/tests/helpers/usage-events.js +0 -8
  149. package/dist/tests/index-pass-llm.test.js +0 -161
  150. package/dist/tests/indexer.test.js +0 -570
  151. package/dist/tests/info-command.test.js +0 -166
  152. package/dist/tests/init.test.js +0 -69
  153. package/dist/tests/install-script.test.js +0 -246
  154. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  155. package/dist/tests/issue-36-repro.test.js +0 -304
  156. package/dist/tests/issues-191-194.test.js +0 -160
  157. package/dist/tests/lesson-lint.test.js +0 -111
  158. package/dist/tests/llm-client.test.js +0 -115
  159. package/dist/tests/llm-feature-gate.test.js +0 -151
  160. package/dist/tests/llm.test.js +0 -139
  161. package/dist/tests/lockfile.test.js +0 -216
  162. package/dist/tests/manifest.test.js +0 -205
  163. package/dist/tests/markdown.test.js +0 -126
  164. package/dist/tests/matchers-unit.test.js +0 -189
  165. package/dist/tests/memory-inference.test.js +0 -299
  166. package/dist/tests/merge-scoring.test.js +0 -136
  167. package/dist/tests/metadata.test.js +0 -313
  168. package/dist/tests/migration-help.test.js +0 -89
  169. package/dist/tests/origin-resolve.test.js +0 -124
  170. package/dist/tests/output-baseline.test.js +0 -218
  171. package/dist/tests/output-shapes-unit.test.js +0 -478
  172. package/dist/tests/parallel-search.test.js +0 -272
  173. package/dist/tests/parameter-metadata.test.js +0 -365
  174. package/dist/tests/paths.test.js +0 -177
  175. package/dist/tests/progressive-disclosure.test.js +0 -280
  176. package/dist/tests/proposals.test.js +0 -279
  177. package/dist/tests/proposed-quality.test.js +0 -271
  178. package/dist/tests/provider-registry.test.js +0 -32
  179. package/dist/tests/ranking-regression.test.js +0 -548
  180. package/dist/tests/reflect-propose.test.js +0 -455
  181. package/dist/tests/registry-build-index.test.js +0 -394
  182. package/dist/tests/registry-cli.test.js +0 -290
  183. package/dist/tests/registry-index-v2.test.js +0 -430
  184. package/dist/tests/registry-install.test.js +0 -728
  185. package/dist/tests/registry-providers/parity.test.js +0 -189
  186. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  187. package/dist/tests/registry-providers/static-index.test.js +0 -238
  188. package/dist/tests/registry-resolve.test.js +0 -126
  189. package/dist/tests/registry-search.test.js +0 -923
  190. package/dist/tests/remember-frontmatter.test.js +0 -378
  191. package/dist/tests/remember-unit.test.js +0 -123
  192. package/dist/tests/ripgrep-install.test.js +0 -251
  193. package/dist/tests/ripgrep-resolve.test.js +0 -108
  194. package/dist/tests/ripgrep.test.js +0 -163
  195. package/dist/tests/save-command.test.js +0 -94
  196. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  197. package/dist/tests/scoring-pipeline.test.js +0 -648
  198. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  199. package/dist/tests/self-update.test.js +0 -442
  200. package/dist/tests/semantic-search-e2e.test.js +0 -512
  201. package/dist/tests/semantic-status.test.js +0 -471
  202. package/dist/tests/setup-run.integration.js +0 -877
  203. package/dist/tests/setup-wizard.test.js +0 -198
  204. package/dist/tests/setup.test.js +0 -131
  205. package/dist/tests/source-add.test.js +0 -11
  206. package/dist/tests/source-clone.test.js +0 -254
  207. package/dist/tests/source-manage.test.js +0 -366
  208. package/dist/tests/source-providers/filesystem.test.js +0 -82
  209. package/dist/tests/source-providers/git.test.js +0 -252
  210. package/dist/tests/source-providers/website.test.js +0 -128
  211. package/dist/tests/source-qa-fixes.test.js +0 -286
  212. package/dist/tests/source-registry.test.js +0 -350
  213. package/dist/tests/source-resolve.test.js +0 -100
  214. package/dist/tests/source-source.test.js +0 -281
  215. package/dist/tests/source.test.js +0 -533
  216. package/dist/tests/tar-utils-scan.test.js +0 -73
  217. package/dist/tests/toggle-components.test.js +0 -73
  218. package/dist/tests/usage-telemetry.test.js +0 -265
  219. package/dist/tests/utility-scoring.test.js +0 -558
  220. package/dist/tests/vault-load-error.test.js +0 -78
  221. package/dist/tests/vault-qa-fixes.test.js +0 -194
  222. package/dist/tests/vault.test.js +0 -429
  223. package/dist/tests/vector-search.test.js +0 -608
  224. package/dist/tests/walker.test.js +0 -252
  225. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  226. package/dist/tests/wave2-cluster-d.test.js +0 -180
  227. package/dist/tests/wave2-cluster-e.test.js +0 -179
  228. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  229. package/dist/tests/wiki.test.js +0 -529
  230. package/dist/tests/workflow-cli.test.js +0 -271
  231. package/dist/tests/workflow-markdown.test.js +0 -171
  232. package/dist/tests/workflow-path-escape.test.js +0 -132
  233. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  234. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  235. /package/dist/{src/commands → commands}/completions.js +0 -0
  236. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  237. /package/dist/{src/commands → commands}/curate.js +0 -0
  238. /package/dist/{src/commands → commands}/distill.js +0 -0
  239. /package/dist/{src/commands → commands}/events.js +0 -0
  240. /package/dist/{src/commands → commands}/history.js +0 -0
  241. /package/dist/{src/commands → commands}/info.js +0 -0
  242. /package/dist/{src/commands → commands}/init.js +0 -0
  243. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  244. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  245. /package/dist/{src/commands → commands}/proposal.js +0 -0
  246. /package/dist/{src/commands → commands}/propose.js +0 -0
  247. /package/dist/{src/commands → commands}/reflect.js +0 -0
  248. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  249. /package/dist/{src/commands → commands}/remember.js +0 -0
  250. /package/dist/{src/commands → commands}/search.js +0 -0
  251. /package/dist/{src/commands → commands}/self-update.js +0 -0
  252. /package/dist/{src/commands → commands}/show.js +0 -0
  253. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  254. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  255. /package/dist/{src/commands → commands}/vault.js +0 -0
  256. /package/dist/{src/core → core}/asset-ref.js +0 -0
  257. /package/dist/{src/core → core}/asset-registry.js +0 -0
  258. /package/dist/{src/core → core}/asset-spec.js +0 -0
  259. /package/dist/{src/core → core}/errors.js +0 -0
  260. /package/dist/{src/core → core}/events.js +0 -0
  261. /package/dist/{src/core → core}/frontmatter.js +0 -0
  262. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  263. /package/dist/{src/core → core}/markdown.js +0 -0
  264. /package/dist/{src/core → core}/paths.js +0 -0
  265. /package/dist/{src/core → core}/proposals.js +0 -0
  266. /package/dist/{src/core → core}/warn.js +0 -0
  267. /package/dist/{src/core → core}/write-source.js +0 -0
  268. /package/dist/{src/indexer → indexer}/db.js +0 -0
  269. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  270. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  271. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  272. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  273. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  274. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  275. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  276. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  277. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  278. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  279. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  280. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  281. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  282. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  283. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  284. /package/dist/{src/integrations → integrations}/github.js +0 -0
  285. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  286. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  287. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  288. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  289. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  290. /package/dist/{src/output → output}/context.js +0 -0
  291. /package/dist/{src/output → output}/renderers.js +0 -0
  292. /package/dist/{src/output → output}/shapes.js +0 -0
  293. /package/dist/{src/output → output}/text.js +0 -0
  294. /package/dist/{src/registry → registry}/build-index.js +0 -0
  295. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  296. /package/dist/{src/registry → registry}/factory.js +0 -0
  297. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  298. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  299. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  300. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  301. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  302. /package/dist/{src/registry → registry}/resolve.js +0 -0
  303. /package/dist/{src/registry → registry}/types.js +0 -0
  304. /package/dist/{src/setup → setup}/detect.js +0 -0
  305. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  306. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  307. /package/dist/{src/setup → setup}/steps.js +0 -0
  308. /package/dist/{src/sources → sources}/include.js +0 -0
  309. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  310. /package/dist/{src/sources → sources}/provider.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  314. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  315. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  316. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  317. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  318. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  319. /package/dist/{src/sources → sources}/resolve.js +0 -0
  320. /package/dist/{src/sources → sources}/types.js +0 -0
  321. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  322. /package/dist/{src/version.js → version.js} +0 -0
  323. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  324. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  325. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  326. /package/dist/{src/workflows → workflows}/db.js +0 -0
  327. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  328. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  329. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  330. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  331. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  332. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,1150 +0,0 @@
1
- /**
2
- * Unit tests for outcome / per-task / corpus / trajectory aggregation.
3
- */
4
- import { describe, expect, test } from "bun:test";
5
- import fs from "node:fs";
6
- import path from "node:path";
7
- import { aggregateAkmOverhead, aggregateByMemoryAbility, aggregateByTaskFamily, aggregateCorpus, aggregatePerTask, aggregateTrajectory, computeAkmOverhead, computeAssetRegressionCandidates, computeCorpusCoverage, computeCorpusDelta, computeDomainAggregates, computeLearningCurve, computeNegativeTransfer, computeOutcomeAggregate, computePerTaskDelta, computeWorkflowReliability, domainOfTaskId, isPathContained, LEARNING_IMPROVEMENT_THRESHOLD, materialiseMaskedStash, } from "./metrics";
8
- import { benchMkdtemp } from "./tmp";
9
- function ptm(overrides = {}) {
10
- return {
11
- passRate: 0,
12
- passAt1: 0,
13
- tokensPerPass: null,
14
- tokensPerRun: null,
15
- wallclockMs: 0,
16
- passRateStdev: 0,
17
- budgetExceededCount: 0,
18
- harnessErrorCount: 0,
19
- count: 1,
20
- runsWithMeasuredTokens: 0,
21
- ...overrides,
22
- };
23
- }
24
- function fakeResult(overrides) {
25
- return {
26
- schemaVersion: 1,
27
- taskId: "t",
28
- arm: "akm",
29
- seed: 0,
30
- model: "m",
31
- outcome: "pass",
32
- tokens: { input: 0, output: 0 },
33
- wallclockMs: 0,
34
- trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
35
- events: [],
36
- verifierStdout: "",
37
- verifierExitCode: 0,
38
- assetsLoaded: [],
39
- ...overrides,
40
- };
41
- }
42
- describe("computeOutcomeAggregate", () => {
43
- test("returns zeros on empty input", () => {
44
- expect(computeOutcomeAggregate([])).toEqual({
45
- passRate: 0,
46
- tokensPerPass: 0,
47
- wallclockMs: 0,
48
- budgetExceeded: 0,
49
- runsWithMeasuredTokens: 0,
50
- });
51
- });
52
- test("computes passRate, tokensPerPass, wallclockMs across mixed outcomes", () => {
53
- const results = [
54
- fakeResult({ outcome: "pass", tokens: { input: 1000, output: 500 }, wallclockMs: 1000 }),
55
- fakeResult({ outcome: "pass", tokens: { input: 2000, output: 1000 }, wallclockMs: 2000 }),
56
- fakeResult({ outcome: "fail", tokens: { input: 500, output: 200 }, wallclockMs: 1500 }),
57
- fakeResult({ outcome: "budget_exceeded", tokens: { input: 100, output: 50 }, wallclockMs: 500 }),
58
- ];
59
- const agg = computeOutcomeAggregate(results);
60
- expect(agg.passRate).toBeCloseTo(0.5);
61
- expect(agg.tokensPerPass).toBeCloseTo((1500 + 3000) / 2);
62
- expect(agg.wallclockMs).toBeCloseTo((1000 + 2000 + 1500 + 500) / 4);
63
- expect(agg.budgetExceeded).toBe(1);
64
- });
65
- test("tokensPerPass is 0 (not NaN) when no runs passed", () => {
66
- const results = [fakeResult({ outcome: "fail", wallclockMs: 100 })];
67
- const agg = computeOutcomeAggregate(results);
68
- expect(agg.passRate).toBe(0);
69
- expect(agg.tokensPerPass).toBe(0);
70
- });
71
- test("missing token measurement is NOT silently treated as zero (issue #252)", () => {
72
- // Two passes: one parsed at 1000, one missing measurement. The mean must
73
- // be 1000 (the measured pass), not (1000+0)/2 = 500.
74
- const results = [
75
- fakeResult({
76
- outcome: "pass",
77
- tokens: { input: 700, output: 300 },
78
- tokenMeasurement: "parsed",
79
- }),
80
- fakeResult({
81
- outcome: "pass",
82
- tokens: { input: 0, output: 0 },
83
- tokenMeasurement: "missing",
84
- }),
85
- ];
86
- const agg = computeOutcomeAggregate(results);
87
- expect(agg.passRate).toBeCloseTo(1);
88
- expect(agg.tokensPerPass).toBeCloseTo(1000);
89
- expect(agg.runsWithMeasuredTokens).toBe(1);
90
- });
91
- test("unsupported token measurement is also skipped from token aggregation", () => {
92
- const results = [
93
- fakeResult({
94
- outcome: "pass",
95
- tokens: { input: 0, output: 0 },
96
- tokenMeasurement: "unsupported",
97
- }),
98
- ];
99
- const agg = computeOutcomeAggregate(results);
100
- // No measured passes → tokensPerPass collapses to 0, but runsWithMeasuredTokens=0
101
- // signals that the 0 is "unknown", not "free".
102
- expect(agg.tokensPerPass).toBe(0);
103
- expect(agg.runsWithMeasuredTokens).toBe(0);
104
- });
105
- });
106
- describe("aggregatePerTask", () => {
107
- test("0 of K passes — tokensPerPass is null, passRate is 0", () => {
108
- const runs = [
109
- fakeResult({ seed: 0, outcome: "fail", wallclockMs: 1000 }),
110
- fakeResult({ seed: 1, outcome: "fail", wallclockMs: 2000 }),
111
- fakeResult({ seed: 2, outcome: "harness_error", wallclockMs: 3000 }),
112
- ];
113
- const m = aggregatePerTask(runs);
114
- expect(m.passRate).toBe(0);
115
- expect(m.passAt1).toBe(0);
116
- expect(m.tokensPerPass).toBeNull();
117
- expect(m.wallclockMs).toBe(2000);
118
- expect(m.harnessErrorCount).toBe(1);
119
- expect(m.budgetExceededCount).toBe(0);
120
- expect(m.count).toBe(3);
121
- });
122
- test("K of K passes — passRate is 1, stdev is 0", () => {
123
- const runs = Array.from({ length: 5 }, (_, i) => fakeResult({ seed: i, outcome: "pass", tokens: { input: 1000, output: 0 }, wallclockMs: 1000 }));
124
- const m = aggregatePerTask(runs);
125
- expect(m.passRate).toBe(1);
126
- expect(m.passAt1).toBe(1);
127
- expect(m.tokensPerPass).toBe(1000);
128
- expect(m.passRateStdev).toBe(0);
129
- });
130
- test("partial passes — passRate, stdev, and budget_exceeded count are computed", () => {
131
- const runs = [
132
- fakeResult({ seed: 0, outcome: "pass", tokens: { input: 800, output: 200 }, wallclockMs: 1000 }),
133
- fakeResult({ seed: 1, outcome: "pass", tokens: { input: 1200, output: 300 }, wallclockMs: 1500 }),
134
- fakeResult({ seed: 2, outcome: "fail", wallclockMs: 2000 }),
135
- fakeResult({ seed: 3, outcome: "budget_exceeded", wallclockMs: 3000 }),
136
- ];
137
- const m = aggregatePerTask(runs);
138
- expect(m.passRate).toBeCloseTo(0.5);
139
- expect(m.passAt1).toBe(1);
140
- expect(m.tokensPerPass).toBeCloseTo((1000 + 1500) / 2);
141
- expect(m.budgetExceededCount).toBe(1);
142
- // Sample stdev of [1, 1, 0, 0] over 4 samples = sqrt(4/3 * 0.25) — non-zero.
143
- expect(m.passRateStdev).toBeGreaterThan(0);
144
- });
145
- test("passAt1 honours seed=0 specifically when present", () => {
146
- const runs = [
147
- fakeResult({ seed: 1, outcome: "pass" }),
148
- fakeResult({ seed: 0, outcome: "fail" }),
149
- fakeResult({ seed: 2, outcome: "pass" }),
150
- ];
151
- const m = aggregatePerTask(runs);
152
- expect(m.passAt1).toBe(0);
153
- });
154
- test("empty input returns a zeroed envelope", () => {
155
- const m = aggregatePerTask([]);
156
- expect(m.count).toBe(0);
157
- expect(m.passRate).toBe(0);
158
- expect(m.tokensPerPass).toBeNull();
159
- expect(m.runsWithMeasuredTokens).toBe(0);
160
- });
161
- test("aggregatePerTask: passes with missing measurement do NOT pull tokensPerPass to zero", () => {
162
- const runs = [
163
- fakeResult({
164
- seed: 0,
165
- outcome: "pass",
166
- tokens: { input: 800, output: 200 },
167
- tokenMeasurement: "parsed",
168
- wallclockMs: 1000,
169
- }),
170
- fakeResult({
171
- seed: 1,
172
- outcome: "pass",
173
- tokens: { input: 0, output: 0 },
174
- tokenMeasurement: "missing",
175
- wallclockMs: 1000,
176
- }),
177
- ];
178
- const m = aggregatePerTask(runs);
179
- expect(m.passRate).toBe(1);
180
- // Mean is over the single measured pass, not (1000 + 0) / 2.
181
- expect(m.tokensPerPass).toBeCloseTo(1000);
182
- expect(m.runsWithMeasuredTokens).toBe(1);
183
- expect(m.count).toBe(2);
184
- });
185
- test("aggregatePerTask: tokensPerPass is null when every pass has missing measurement", () => {
186
- const runs = [
187
- fakeResult({
188
- seed: 0,
189
- outcome: "pass",
190
- tokens: { input: 0, output: 0 },
191
- tokenMeasurement: "missing",
192
- }),
193
- fakeResult({
194
- seed: 1,
195
- outcome: "pass",
196
- tokens: { input: 0, output: 0 },
197
- tokenMeasurement: "unsupported",
198
- }),
199
- ];
200
- const m = aggregatePerTask(runs);
201
- expect(m.passRate).toBe(1);
202
- expect(m.tokensPerPass).toBeNull();
203
- expect(m.runsWithMeasuredTokens).toBe(0);
204
- });
205
- });
206
- describe("aggregateCorpus", () => {
207
- test("weights every task equally regardless of seed count", () => {
208
- const perTask = {
209
- a: {
210
- passRate: 1,
211
- passAt1: 1,
212
- tokensPerPass: 1000,
213
- tokensPerRun: 1000,
214
- wallclockMs: 1000,
215
- passRateStdev: 0,
216
- budgetExceededCount: 0,
217
- harnessErrorCount: 0,
218
- count: 5,
219
- runsWithMeasuredTokens: 5,
220
- },
221
- b: {
222
- passRate: 0,
223
- passAt1: 0,
224
- tokensPerPass: null,
225
- tokensPerRun: null,
226
- wallclockMs: 2000,
227
- passRateStdev: 0,
228
- budgetExceededCount: 0,
229
- harnessErrorCount: 0,
230
- count: 1,
231
- runsWithMeasuredTokens: 0,
232
- },
233
- };
234
- const corpus = aggregateCorpus(perTask);
235
- expect(corpus.passRate).toBeCloseTo(0.5);
236
- expect(corpus.wallclockMs).toBeCloseTo(1500);
237
- expect(corpus.tokensPerPass).toBeCloseTo(1000); // null is dropped
238
- });
239
- test("tokensPerPass is null when every task has null tokensPerPass", () => {
240
- const perTask = {
241
- a: {
242
- passRate: 0,
243
- passAt1: 0,
244
- tokensPerPass: null,
245
- tokensPerRun: null,
246
- wallclockMs: 1000,
247
- passRateStdev: 0,
248
- budgetExceededCount: 0,
249
- harnessErrorCount: 0,
250
- count: 1,
251
- runsWithMeasuredTokens: 0,
252
- },
253
- };
254
- const corpus = aggregateCorpus(perTask);
255
- expect(corpus.tokensPerPass).toBeNull();
256
- });
257
- test("empty input returns zeros + null tokens", () => {
258
- const corpus = aggregateCorpus({});
259
- expect(corpus.passRate).toBe(0);
260
- expect(corpus.tokensPerPass).toBeNull();
261
- });
262
- });
263
- describe("delta helpers", () => {
264
- test("computeCorpusDelta — akm − noakm", () => {
265
- const noakm = { passRate: 0.3, tokensPerPass: 18000, tokensPerRun: null, wallclockMs: 4000 };
266
- const akm = { passRate: 0.7, tokensPerPass: 14000, tokensPerRun: null, wallclockMs: 3000 };
267
- const d = computeCorpusDelta(noakm, akm);
268
- expect(d.passRate).toBeCloseTo(0.4);
269
- expect(d.tokensPerPass).toBeCloseTo(-4000);
270
- expect(d.wallclockMs).toBeCloseTo(-1000);
271
- });
272
- test("computeCorpusDelta — null tokensPerPass propagates", () => {
273
- const noakm = { passRate: 0, tokensPerPass: null, tokensPerRun: null, wallclockMs: 1 };
274
- const akm = { passRate: 1, tokensPerPass: 5, tokensPerRun: null, wallclockMs: 2 };
275
- expect(computeCorpusDelta(noakm, akm).tokensPerPass).toBeNull();
276
- });
277
- test("computePerTaskDelta — same null-safety rule", () => {
278
- const noakm = {
279
- passRate: 0,
280
- passAt1: 0,
281
- tokensPerPass: null,
282
- tokensPerRun: null,
283
- wallclockMs: 0,
284
- passRateStdev: 0,
285
- budgetExceededCount: 0,
286
- harnessErrorCount: 0,
287
- count: 1,
288
- runsWithMeasuredTokens: 0,
289
- };
290
- const akm = {
291
- passRate: 1,
292
- passAt1: 1,
293
- tokensPerPass: 1000,
294
- tokensPerRun: null,
295
- wallclockMs: 100,
296
- passRateStdev: 0,
297
- budgetExceededCount: 0,
298
- harnessErrorCount: 0,
299
- count: 1,
300
- runsWithMeasuredTokens: 1,
301
- };
302
- expect(computePerTaskDelta(noakm, akm).tokensPerPass).toBeNull();
303
- });
304
- });
305
- describe("aggregateTrajectory", () => {
306
- test("returns null/0 on empty input", () => {
307
- const t = aggregateTrajectory([]);
308
- expect(t.correctAssetLoaded).toBeNull();
309
- expect(t.feedbackRecorded).toBe(0);
310
- });
311
- test("correctAssetLoaded is null when no run had a known goldRef", () => {
312
- const runs = [
313
- fakeResult({ trajectory: { correctAssetLoaded: null, feedbackRecorded: false } }),
314
- fakeResult({ trajectory: { correctAssetLoaded: null, feedbackRecorded: true } }),
315
- ];
316
- const t = aggregateTrajectory(runs);
317
- expect(t.correctAssetLoaded).toBeNull();
318
- expect(t.feedbackRecorded).toBeCloseTo(0.5);
319
- });
320
- test("correctAssetLoaded is fraction over runs with goldRef", () => {
321
- const runs = [
322
- fakeResult({ trajectory: { correctAssetLoaded: true, feedbackRecorded: false } }),
323
- fakeResult({ trajectory: { correctAssetLoaded: false, feedbackRecorded: false } }),
324
- fakeResult({ trajectory: { correctAssetLoaded: null, feedbackRecorded: false } }),
325
- ];
326
- const t = aggregateTrajectory(runs);
327
- expect(t.correctAssetLoaded).toBeCloseTo(0.5);
328
- expect(t.feedbackRecorded).toBe(0);
329
- });
330
- });
331
- describe("domainOfTaskId", () => {
332
- test("returns the segment before the first slash", () => {
333
- expect(domainOfTaskId("docker-homelab/redis-healthcheck")).toBe("docker-homelab");
334
- });
335
- test("falls back to 'unknown' when there is no slash", () => {
336
- expect(domainOfTaskId("noslash")).toBe("unknown");
337
- });
338
- test("falls back to 'unknown' when the slash is at index 0", () => {
339
- expect(domainOfTaskId("/leading")).toBe("unknown");
340
- });
341
- });
342
- describe("computeNegativeTransfer", () => {
343
- test("returns zero count and severity when no regressions are present", () => {
344
- const tasks = [
345
- { id: "d/a", noakm: ptm({ passRate: 0.4 }), akm: ptm({ passRate: 0.8 }) },
346
- { id: "d/b", noakm: ptm({ passRate: 0.5 }), akm: ptm({ passRate: 0.5 }) },
347
- ];
348
- const out = computeNegativeTransfer(tasks);
349
- expect(out.count).toBe(0);
350
- expect(out.severity).toBe(0);
351
- expect(out.topRegressedTasks).toEqual([]);
352
- });
353
- test("captures a single regression with correct delta and severity", () => {
354
- const tasks = [
355
- { id: "d/a", noakm: ptm({ passRate: 0.4 }), akm: ptm({ passRate: 0.8 }) },
356
- { id: "d/regressed", noakm: ptm({ passRate: 0.6 }), akm: ptm({ passRate: 0.2 }) },
357
- ];
358
- const out = computeNegativeTransfer(tasks);
359
- expect(out.count).toBe(1);
360
- expect(out.severity).toBeCloseTo(0.4);
361
- expect(out.topRegressedTasks).toHaveLength(1);
362
- const row = out.topRegressedTasks[0];
363
- if (!row)
364
- throw new Error("expected row");
365
- expect(row.taskId).toBe("d/regressed");
366
- expect(row.domain).toBe("d");
367
- expect(row.delta).toBeCloseTo(-0.4);
368
- expect(row.severity).toBeCloseTo(0.4);
369
- });
370
- test("multiple regressions are sorted by severity desc with deterministic tiebreak", () => {
371
- const tasks = [
372
- // Mild regression -0.1.
373
- { id: "alpha/x", noakm: ptm({ passRate: 0.6 }), akm: ptm({ passRate: 0.5 }) },
374
- // Tied severity -0.3 (first tiebreaks by taskId asc).
375
- { id: "beta/y", noakm: ptm({ passRate: 0.8 }), akm: ptm({ passRate: 0.5 }) },
376
- { id: "alpha/z", noakm: ptm({ passRate: 0.8 }), akm: ptm({ passRate: 0.5 }) },
377
- // Improvement (no regression).
378
- { id: "alpha/w", noakm: ptm({ passRate: 0.1 }), akm: ptm({ passRate: 0.9 }) },
379
- ];
380
- const out = computeNegativeTransfer(tasks);
381
- expect(out.count).toBe(3);
382
- expect(out.severity).toBeCloseTo(0.7);
383
- expect(out.topRegressedTasks.map((r) => r.taskId)).toEqual(["alpha/z", "beta/y", "alpha/x"]);
384
- });
385
- test("a task with equal pass rate is not counted as regressed", () => {
386
- const tasks = [{ id: "d/eq", noakm: ptm({ passRate: 0.5 }), akm: ptm({ passRate: 0.5 }) }];
387
- expect(computeNegativeTransfer(tasks).count).toBe(0);
388
- });
389
- });
390
- describe("computeDomainAggregates", () => {
391
- test("groups tasks by domain prefix", () => {
392
- const tasks = [
393
- {
394
- id: "alpha/a",
395
- noakm: ptm({ passRate: 0.4, tokensPerPass: 10000, wallclockMs: 1000 }),
396
- akm: ptm({ passRate: 0.8, tokensPerPass: 8000, wallclockMs: 900 }),
397
- },
398
- {
399
- id: "alpha/b",
400
- noakm: ptm({ passRate: 0.6, tokensPerPass: 12000, wallclockMs: 2000 }),
401
- akm: ptm({ passRate: 0.4, tokensPerPass: 9000, wallclockMs: 1500 }),
402
- },
403
- {
404
- id: "beta/c",
405
- noakm: ptm({ passRate: 0.2, tokensPerPass: null, wallclockMs: 500 }),
406
- akm: ptm({ passRate: 0.5, tokensPerPass: 5000, wallclockMs: 600 }),
407
- },
408
- ];
409
- const rows = computeDomainAggregates(tasks);
410
- expect(rows.map((r) => r.domain)).toEqual(["alpha", "beta"]);
411
- const alpha = rows.find((r) => r.domain === "alpha");
412
- if (!alpha)
413
- throw new Error("alpha missing");
414
- expect(alpha.taskCount).toBe(2);
415
- expect(alpha.regressionCount).toBe(1);
416
- expect(alpha.passRateNoakm).toBeCloseTo(0.5);
417
- expect(alpha.passRateAkm).toBeCloseTo(0.6);
418
- expect(alpha.passRateDelta).toBeCloseTo(0.1);
419
- expect(alpha.tokensPerPassDelta).toBeCloseTo(8500 - 11000);
420
- expect(alpha.wallclockMsDelta).toBeCloseTo(1200 - 1500);
421
- const beta = rows.find((r) => r.domain === "beta");
422
- if (!beta)
423
- throw new Error("beta missing");
424
- expect(beta.regressionCount).toBe(0);
425
- // Single-side null tokensPerPass yields null delta.
426
- expect(beta.tokensPerPassDelta).toBeNull();
427
- });
428
- test("emits an empty array on no tasks", () => {
429
- expect(computeDomainAggregates([])).toEqual([]);
430
- });
431
- });
432
- describe("computeAssetRegressionCandidates", () => {
433
- function fakeRun(taskId, assets) {
434
- return {
435
- schemaVersion: 1,
436
- taskId,
437
- arm: "akm",
438
- seed: 0,
439
- model: "m",
440
- outcome: "pass",
441
- tokens: { input: 0, output: 0 },
442
- wallclockMs: 0,
443
- trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
444
- events: [],
445
- verifierStdout: "",
446
- verifierExitCode: 0,
447
- assetsLoaded: assets,
448
- };
449
- }
450
- test("returns empty when no regressed tasks were provided", () => {
451
- expect(computeAssetRegressionCandidates([], [fakeRun("d/a", ["skill:x"])])).toEqual([]);
452
- });
453
- test("counts distinct regressed tasks per asset and totals raw load volume", () => {
454
- const akmRuns = [
455
- // task d/r1 across two seeds, same asset.
456
- fakeRun("d/r1", ["skill:foo", "skill:bar"]),
457
- fakeRun("d/r1", ["skill:foo"]),
458
- // task d/r2 loads skill:foo (again) plus skill:baz.
459
- fakeRun("d/r2", ["skill:foo", "skill:baz"]),
460
- // Non-regressed task is ignored entirely.
461
- fakeRun("d/clean", ["skill:foo", "skill:bar", "skill:baz"]),
462
- ];
463
- const rows = computeAssetRegressionCandidates(["d/r1", "d/r2"], akmRuns);
464
- expect(rows.map((r) => r.assetRef)).toEqual(["skill:foo", "skill:bar", "skill:baz"]);
465
- const foo = rows[0];
466
- if (!foo)
467
- throw new Error("foo missing");
468
- expect(foo.regressedTaskCount).toBe(2);
469
- expect(foo.regressedTaskIds).toEqual(["d/r1", "d/r2"]);
470
- expect(foo.totalLoadCount).toBe(3);
471
- const bar = rows[1];
472
- if (!bar)
473
- throw new Error("bar missing");
474
- expect(bar.regressedTaskCount).toBe(1);
475
- expect(bar.totalLoadCount).toBe(1);
476
- });
477
- });
478
- // ── Memory-operation aggregations (#262) ───────────────────────────────────
479
- describe("aggregateByMemoryAbility / aggregateByTaskFamily (#262)", () => {
480
- function entry(id, noakmPass, akmPass, extras = {}) {
481
- return {
482
- id,
483
- noakm: ptm({ passRate: noakmPass }),
484
- akm: ptm({ passRate: akmPass }),
485
- ...extras,
486
- };
487
- }
488
- test("returns empty when no entries carry the keying tag", () => {
489
- const entries = [entry("d/a", 0.4, 0.6), entry("d/b", 0.5, 0.7)];
490
- expect(aggregateByMemoryAbility(entries)).toEqual([]);
491
- expect(aggregateByTaskFamily(entries)).toEqual([]);
492
- });
493
- test("aggregateByMemoryAbility groups tasks, computes deltas + negative transfer", () => {
494
- const entries = [
495
- entry("d/lookup-1", 0.4, 0.8, { memoryAbility: "procedural_lookup" }),
496
- entry("d/lookup-2", 0.6, 0.4, { memoryAbility: "procedural_lookup" }),
497
- entry("d/compose-1", 0.0, 1.0, { memoryAbility: "multi_asset_composition" }),
498
- entry("d/no-tag", 0.5, 0.7),
499
- ];
500
- const rows = aggregateByMemoryAbility(entries);
501
- expect(rows.map((r) => r.category)).toEqual(["multi_asset_composition", "procedural_lookup"]);
502
- const lookup = rows.find((r) => r.category === "procedural_lookup");
503
- expect(lookup?.taskCount).toBe(2);
504
- expect(lookup?.passRateNoakm).toBeCloseTo(0.5);
505
- expect(lookup?.passRateAkm).toBeCloseTo(0.6);
506
- expect(lookup?.passRateDelta).toBeCloseTo(0.1);
507
- // d/lookup-2 regressed (akm < noakm).
508
- expect(lookup?.negativeTransferCount).toBe(1);
509
- expect(lookup?.workflowCompliance).toBeNull();
510
- });
511
- test("aggregateByMemoryAbility folds workflow_compliance when at least one task supplies it", () => {
512
- const entries = [
513
- entry("d/a", 0.5, 0.7, { memoryAbility: "procedural_lookup", workflowCompliance: 0.8 }),
514
- entry("d/b", 0.5, 0.7, { memoryAbility: "procedural_lookup" }),
515
- entry("d/c", 0.5, 0.7, { memoryAbility: "procedural_lookup", workflowCompliance: 0.6 }),
516
- ];
517
- const [row] = aggregateByMemoryAbility(entries);
518
- expect(row?.workflowCompliance).toBeCloseTo(0.7);
519
- });
520
- test("aggregateByTaskFamily groups by family", () => {
521
- const entries = [
522
- entry("d/a", 0.4, 0.6, { taskFamily: "d/group-1" }),
523
- entry("d/b", 0.4, 0.4, { taskFamily: "d/group-1" }),
524
- entry("d/c", 0.0, 1.0, { taskFamily: "d/group-2" }),
525
- ];
526
- const rows = aggregateByTaskFamily(entries);
527
- expect(rows.map((r) => r.category)).toEqual(["d/group-1", "d/group-2"]);
528
- const g1 = rows.find((r) => r.category === "d/group-1");
529
- expect(g1?.taskCount).toBe(2);
530
- expect(g1?.passRateDelta).toBeCloseTo(0.1);
531
- });
532
- test("computeCorpusCoverage counts every closed-set ability + an untagged bucket", () => {
533
- const cov = computeCorpusCoverage([
534
- { memoryAbility: "procedural_lookup", taskFamily: "d/family-a" },
535
- { memoryAbility: "procedural_lookup", taskFamily: "d/family-a" },
536
- { memoryAbility: "abstention", taskFamily: "d/family-b" },
537
- { taskFamily: "d/family-c" },
538
- {},
539
- ]);
540
- expect(cov.totalTasks).toBe(5);
541
- expect(cov.memoryAbilityCounts.procedural_lookup).toBe(2);
542
- expect(cov.memoryAbilityCounts.abstention).toBe(1);
543
- expect(cov.memoryAbilityCounts.conflict_resolution).toBe(0);
544
- expect(cov.memoryAbilityCounts.untagged).toBe(2);
545
- expect(cov.taskFamilyCounts["d/family-a"]).toBe(2);
546
- expect(cov.taskFamilyCounts.untagged).toBe(1);
547
- });
548
- });
549
- // ── AKM overhead (#263) ────────────────────────────────────────────────────
550
- function akmEvent(eventType, ts, ref, metadata) {
551
- return {
552
- schemaVersion: 1,
553
- id: 0,
554
- ts,
555
- eventType,
556
- ...(ref ? { ref } : {}),
557
- ...(metadata ? { metadata } : {}),
558
- };
559
- }
560
- function metaMap(entries) {
561
- const m = new Map();
562
- for (const e of entries)
563
- m.set(e.id, { goldRef: e.goldRef, expectedTransferFrom: e.expectedTransferFrom });
564
- return m;
565
- }
566
- describe("computeAkmOverhead — no AKM calls", () => {
567
- test("zero counts and null timings when run had no AKM events", () => {
568
- const run = fakeResult({ taskId: "demo/none", events: [] });
569
- const rows = computeAkmOverhead([run]);
570
- expect(rows).toHaveLength(1);
571
- const r = rows[0];
572
- expect(r.searchCount).toBe(0);
573
- expect(r.showCount).toBe(0);
574
- expect(r.feedbackCount).toBe(0);
575
- expect(r.totalToolCalls).toBe(0);
576
- expect(r.assetsLoadedCount).toBe(0);
577
- expect(r.timeToFirstSearchMs).toBeNull();
578
- expect(r.timeToFirstCorrectAssetMs).toBeNull();
579
- expect(r.contextBytesLoaded).toBeNull();
580
- expect(r.assetBytesLoaded).toBeNull();
581
- // Without metadata, irrelevance is unjudgeable -> null.
582
- expect(r.irrelevantAssetsLoadedCount).toBeNull();
583
- });
584
- test("aggregate over empty array is the zero envelope", () => {
585
- const agg = aggregateAkmOverhead([]);
586
- expect(agg.totalRuns).toBe(0);
587
- expect(agg.passingRuns).toBe(0);
588
- expect(agg.toolCallsPerSuccess).toBeNull();
589
- expect(agg.costPerSuccess).toBeNull();
590
- expect(agg.meanTimeToFirstSearchMs).toBeNull();
591
- });
592
- });
593
- describe("computeAkmOverhead — successful AKM use", () => {
594
- test("counts search/show/feedback, computes timings and relevance", () => {
595
- const run = fakeResult({
596
- taskId: "demo/ok",
597
- outcome: "pass",
598
- tokenMeasurement: "parsed",
599
- tokens: { input: 100, output: 50 },
600
- events: [
601
- akmEvent("search", "2026-04-27T10:00:00.000Z", undefined, { query: "deploy" }),
602
- akmEvent("show", "2026-04-27T10:00:00.500Z", "skill:deploy"),
603
- akmEvent("feedback", "2026-04-27T10:00:01.000Z", "skill:deploy"),
604
- ],
605
- });
606
- const tasks = metaMap([{ id: "demo/ok", goldRef: "skill:deploy", expectedTransferFrom: [] }]);
607
- const rows = computeAkmOverhead([run], { taskMetadata: tasks });
608
- const r = rows[0];
609
- expect(r.searchCount).toBe(1);
610
- expect(r.showCount).toBe(1);
611
- expect(r.feedbackCount).toBe(1);
612
- expect(r.totalToolCalls).toBe(3);
613
- expect(r.assetsLoadedCount).toBe(1);
614
- expect(r.irrelevantAssetsLoadedCount).toBe(0);
615
- expect(r.timeToFirstSearchMs).toBe(0); // first search IS the run-start anchor
616
- expect(r.timeToFirstCorrectAssetMs).toBe(500);
617
- const agg = aggregateAkmOverhead(rows, [run]);
618
- expect(agg.passingRuns).toBe(1);
619
- expect(agg.toolCallsPerSuccess).toBe(3);
620
- expect(agg.costPerSuccess).toBe(150);
621
- });
622
- test("expected_transfer_from refs are not counted as irrelevant", () => {
623
- const run = fakeResult({
624
- taskId: "demo/transfer",
625
- events: [
626
- akmEvent("show", "2026-04-27T10:00:00.000Z", "skill:foo"),
627
- akmEvent("show", "2026-04-27T10:00:01.000Z", "skill:helper"),
628
- ],
629
- });
630
- const tasks = metaMap([{ id: "demo/transfer", goldRef: "skill:foo", expectedTransferFrom: ["skill:helper"] }]);
631
- const rows = computeAkmOverhead([run], { taskMetadata: tasks });
632
- expect(rows[0].assetsLoadedCount).toBe(2);
633
- expect(rows[0].irrelevantAssetsLoadedCount).toBe(0);
634
- });
635
- });
636
- describe("computeAkmOverhead — excessive AKM calls", () => {
637
- test("high counts and low calls-per-success are surfaced", () => {
638
- const goldRef = "skill:gold";
639
- const noisyRun = fakeResult({
640
- taskId: "demo/noisy",
641
- outcome: "fail",
642
- events: [
643
- akmEvent("search", "2026-04-27T10:00:00.000Z"),
644
- akmEvent("search", "2026-04-27T10:00:00.100Z"),
645
- akmEvent("search", "2026-04-27T10:00:00.200Z"),
646
- akmEvent("show", "2026-04-27T10:00:00.300Z", "skill:other"),
647
- akmEvent("show", "2026-04-27T10:00:00.400Z", "skill:other2"),
648
- akmEvent("show", "2026-04-27T10:00:00.500Z", "skill:other3"),
649
- akmEvent("show", "2026-04-27T10:00:00.600Z", goldRef),
650
- ],
651
- });
652
- const passingRun = fakeResult({
653
- taskId: "demo/easy",
654
- outcome: "pass",
655
- tokenMeasurement: "parsed",
656
- tokens: { input: 10, output: 10 },
657
- events: [akmEvent("search", "2026-04-27T10:00:00.000Z"), akmEvent("show", "2026-04-27T10:00:00.100Z", goldRef)],
658
- });
659
- const tasks = metaMap([
660
- { id: "demo/noisy", goldRef, expectedTransferFrom: [] },
661
- { id: "demo/easy", goldRef, expectedTransferFrom: [] },
662
- ]);
663
- const rows = computeAkmOverhead([noisyRun, passingRun], { taskMetadata: tasks });
664
- expect(rows[0].totalToolCalls).toBe(7);
665
- expect(rows[0].irrelevantAssetsLoadedCount).toBe(3);
666
- expect(rows[0].timeToFirstCorrectAssetMs).toBe(600);
667
- expect(rows[1].totalToolCalls).toBe(2);
668
- const agg = aggregateAkmOverhead(rows, [noisyRun, passingRun]);
669
- expect(agg.totalToolCalls).toBe(9);
670
- expect(agg.passingRuns).toBe(1);
671
- // 9 tool calls for one passing run = high overhead per success.
672
- expect(agg.toolCallsPerSuccess).toBe(9);
673
- });
674
- });
675
- describe("computeAkmOverhead — missing timing/byte data", () => {
676
- test("event without ts -> null first-search timing (NOT zero)", () => {
677
- const run = fakeResult({
678
- taskId: "demo/notime",
679
- events: [
680
- // No ts on event — workflow-trace assigns a synthetic order hint but
681
- // ts stays undefined, so we cannot anchor a real time-offset.
682
- { schemaVersion: 1, id: 0, eventType: "search" },
683
- ],
684
- });
685
- const rows = computeAkmOverhead([run]);
686
- expect(rows[0].searchCount).toBe(1);
687
- expect(rows[0].timeToFirstSearchMs).toBeNull();
688
- expect(rows[0].timeToFirstCorrectAssetMs).toBeNull();
689
- });
690
- test("byte sizes are always null for now (NOT zero)", () => {
691
- const run = fakeResult({
692
- events: [akmEvent("show", "2026-04-27T10:00:00.000Z", "skill:foo")],
693
- });
694
- const rows = computeAkmOverhead([run]);
695
- expect(rows[0].contextBytesLoaded).toBeNull();
696
- expect(rows[0].assetBytesLoaded).toBeNull();
697
- const agg = aggregateAkmOverhead(rows, [run]);
698
- expect(agg.meanContextBytesLoaded).toBeNull();
699
- expect(agg.meanAssetBytesLoaded).toBeNull();
700
- });
701
- test("cost_per_success is null when any passing run lacks parsed token measurement", () => {
702
- const passParsed = fakeResult({
703
- taskId: "t1",
704
- outcome: "pass",
705
- tokenMeasurement: "parsed",
706
- tokens: { input: 10, output: 5 },
707
- events: [akmEvent("search", "2026-04-27T10:00:00.000Z")],
708
- });
709
- const passMissing = fakeResult({
710
- taskId: "t2",
711
- outcome: "pass",
712
- tokenMeasurement: "missing",
713
- tokens: { input: 0, output: 0 },
714
- events: [akmEvent("search", "2026-04-27T10:00:00.000Z")],
715
- });
716
- const rows = computeAkmOverhead([passParsed, passMissing]);
717
- const agg = aggregateAkmOverhead(rows, [passParsed, passMissing]);
718
- expect(agg.passingRuns).toBe(2);
719
- expect(agg.costPerSuccess).toBeNull();
720
- });
721
- test("missing task metadata -> irrelevantAssetsLoadedCount is null (not 0)", () => {
722
- const run = fakeResult({
723
- taskId: "demo/unknown",
724
- events: [akmEvent("show", "2026-04-27T10:00:00.000Z", "skill:foo")],
725
- });
726
- // No metadata supplied for this task.
727
- const rows = computeAkmOverhead([run]);
728
- expect(rows[0].assetsLoadedCount).toBe(1);
729
- expect(rows[0].irrelevantAssetsLoadedCount).toBeNull();
730
- });
731
- test("aggregate skips null timings rather than zero-filling", () => {
732
- const noTime = fakeResult({
733
- taskId: "t1",
734
- outcome: "fail",
735
- events: [{ schemaVersion: 1, id: 0, eventType: "search" }],
736
- });
737
- const withTime = fakeResult({
738
- taskId: "t2",
739
- outcome: "fail",
740
- events: [akmEvent("search", "2026-04-27T10:00:01.000Z")],
741
- });
742
- const rows = computeAkmOverhead([noTime, withTime]);
743
- // First run: search event has no ts -> no run-start anchor, timing null.
744
- // Second run: search event IS the only event with ts, so it's both the
745
- // anchor and the first search -> offset 0.
746
- expect(rows[0].timeToFirstSearchMs).toBeNull();
747
- expect(rows[1].timeToFirstSearchMs).toBe(0);
748
- const agg = aggregateAkmOverhead(rows, [noTime, withTime]);
749
- // Mean honours only the parseable observation; the null is skipped, NOT
750
- // treated as zero in the numerator.
751
- expect(agg.meanTimeToFirstSearchMs).toBe(0);
752
- // tool_calls_per_success is null because no run passed.
753
- expect(agg.toolCallsPerSuccess).toBeNull();
754
- });
755
- });
756
- // ── computeWorkflowReliability (#258) ──────────────────────────────────────
757
- function wfCheck(overrides = {}) {
758
- return {
759
- schemaVersion: 1,
760
- workflowId: "wf-1",
761
- taskId: "t1",
762
- arm: "akm",
763
- seed: 0,
764
- status: "pass",
765
- score: 1,
766
- requiredPassed: 1,
767
- requiredTotal: 1,
768
- violations: [],
769
- evidence: {
770
- matchedEvents: 1,
771
- feedbackRecorded: false,
772
- goldAssetLoaded: false,
773
- traceTruncated: false,
774
- },
775
- ...overrides,
776
- };
777
- }
778
- // ── Learning curve across episodes (issue #265) ────────────────────────────
779
- function ep(overrides) {
780
- return {
781
- delta_from_previous_episode: 0,
782
- cumulative_feedback_events: 0,
783
- cumulative_proposals_created: 0,
784
- cumulative_proposals_accepted: 0,
785
- cumulative_lessons_created: 0,
786
- lesson_reuse_rate: null,
787
- ...overrides,
788
- };
789
- }
790
- function statuses(workflowId, taskId, statusList) {
791
- return statusList.map((status, seed) => wfCheck({ workflowId, taskId, seed, status }));
792
- }
793
- describe("computeWorkflowReliability (#258)", () => {
794
- test("empty input yields zeroed corpus + empty by_workflow", () => {
795
- const result = computeWorkflowReliability([]);
796
- expect(result.byWorkflow).toEqual({});
797
- expect(result.corpus).toEqual({ pass_at_k: 0, pass_all_k: 0, groups: 0, tasks: 0 });
798
- });
799
- test("all-pass: every (task, seed) is pass → pass@k=1, pass^k=1", () => {
800
- const checks = [
801
- ...statuses("wf-1", "t1", ["pass", "pass", "pass"]),
802
- ...statuses("wf-1", "t2", ["pass", "pass", "pass"]),
803
- ];
804
- const result = computeWorkflowReliability(checks);
805
- expect(result.byWorkflow["wf-1"].pass_at_k).toBe(1);
806
- expect(result.byWorkflow["wf-1"].pass_all_k).toBe(1);
807
- expect(result.byWorkflow["wf-1"].tasks).toBe(2);
808
- expect(result.byWorkflow["wf-1"].k).toBe(3);
809
- expect(result.corpus.pass_at_k).toBe(1);
810
- expect(result.corpus.pass_all_k).toBe(1);
811
- expect(result.corpus.groups).toBe(2);
812
- expect(result.corpus.tasks).toBe(2);
813
- });
814
- test("none-pass: no seed is pass → pass@k=0, pass^k=0", () => {
815
- const checks = [
816
- ...statuses("wf-1", "t1", ["fail", "fail", "fail"]),
817
- ...statuses("wf-1", "t2", ["partial", "fail", "harness_error"]),
818
- ];
819
- const result = computeWorkflowReliability(checks);
820
- expect(result.byWorkflow["wf-1"].pass_at_k).toBe(0);
821
- expect(result.byWorkflow["wf-1"].pass_all_k).toBe(0);
822
- expect(result.corpus.pass_at_k).toBe(0);
823
- expect(result.corpus.pass_all_k).toBe(0);
824
- });
825
- test("some-pass: pass@k > 0, pass^k < pass@k when seeds disagree per task", () => {
826
- // t1: 1 pass, 2 fail → counts toward pass@k (anyPass) but NOT pass^k
827
- // t2: 3 pass → counts toward both
828
- const checks = [
829
- ...statuses("wf-1", "t1", ["pass", "fail", "fail"]),
830
- ...statuses("wf-1", "t2", ["pass", "pass", "pass"]),
831
- ];
832
- const result = computeWorkflowReliability(checks);
833
- expect(result.byWorkflow["wf-1"].pass_at_k).toBeCloseTo(1); // both tasks have at least one pass
834
- expect(result.byWorkflow["wf-1"].pass_all_k).toBeCloseTo(0.5); // only t2 is all-pass
835
- expect(result.corpus.pass_at_k).toBeCloseTo(1);
836
- expect(result.corpus.pass_all_k).toBeCloseTo(0.5);
837
- });
838
- test("mixed partial/fail: partial does NOT count as pass for reliability", () => {
839
- // partial is non-pass per the strict reliability bucketing.
840
- const checks = [...statuses("wf-1", "t1", ["partial", "partial", "partial"])];
841
- const result = computeWorkflowReliability(checks);
842
- expect(result.byWorkflow["wf-1"].pass_at_k).toBe(0);
843
- expect(result.byWorkflow["wf-1"].pass_all_k).toBe(0);
844
- });
845
- test("not_applicable seeds are excluded from numerator and denominator", () => {
846
- // Only the 2 applicable seeds matter; both are pass → 1 task all-pass.
847
- const checks = [...statuses("wf-1", "t1", ["not_applicable", "pass", "pass", "not_applicable"])];
848
- const result = computeWorkflowReliability(checks);
849
- expect(result.byWorkflow["wf-1"].pass_at_k).toBe(1);
850
- expect(result.byWorkflow["wf-1"].pass_all_k).toBe(1);
851
- expect(result.byWorkflow["wf-1"].tasks).toBe(1);
852
- expect(result.byWorkflow["wf-1"].k).toBe(2);
853
- });
854
- test("workflow with every check not_applicable is omitted (no group counted)", () => {
855
- const checks = [...statuses("wf-skips", "t1", ["not_applicable", "not_applicable"])];
856
- const result = computeWorkflowReliability(checks);
857
- expect(result.byWorkflow["wf-skips"]).toBeUndefined();
858
- expect(result.corpus.groups).toBe(0);
859
- expect(result.corpus.tasks).toBe(0);
860
- });
861
- test("multiple workflows compute independently; corpus weights groups equally", () => {
862
- // wf-a: t1 all pass (1/1 = 1, 1/1 = 1)
863
- // wf-b: t2 mixed (anyPass=1, allPass=0); t3 none (0, 0)
864
- // per-workflow: pass@k=0.5, pass^k=0
865
- // corpus: 3 groups → pass@k = (1+1+0)/3 = 2/3; pass^k = (1+0+0)/3 = 1/3
866
- const checks = [
867
- ...statuses("wf-a", "t1", ["pass", "pass"]),
868
- ...statuses("wf-b", "t2", ["pass", "fail"]),
869
- ...statuses("wf-b", "t3", ["fail", "fail"]),
870
- ];
871
- const result = computeWorkflowReliability(checks);
872
- expect(result.byWorkflow["wf-a"].pass_at_k).toBe(1);
873
- expect(result.byWorkflow["wf-a"].pass_all_k).toBe(1);
874
- expect(result.byWorkflow["wf-b"].pass_at_k).toBeCloseTo(0.5);
875
- expect(result.byWorkflow["wf-b"].pass_all_k).toBe(0);
876
- expect(result.corpus.pass_at_k).toBeCloseTo(2 / 3);
877
- expect(result.corpus.pass_all_k).toBeCloseTo(1 / 3);
878
- expect(result.corpus.groups).toBe(3);
879
- expect(result.corpus.tasks).toBe(3);
880
- });
881
- test("harness_error is treated as non-pass (consistent with #257 bucketing)", () => {
882
- const checks = [...statuses("wf-1", "t1", ["pass", "harness_error", "pass"])];
883
- const result = computeWorkflowReliability(checks);
884
- expect(result.byWorkflow["wf-1"].pass_at_k).toBe(1);
885
- expect(result.byWorkflow["wf-1"].pass_all_k).toBe(0);
886
- });
887
- });
888
- describe("computeLearningCurve", () => {
889
- test("monotonic improvement: positive slope, time_to_improvement at first crossing", () => {
890
- const episodes = [
891
- ep({ episode_index: 0, pass_rate: 0.4, cumulative_feedback_events: 10 }),
892
- ep({ episode_index: 1, pass_rate: 0.5, cumulative_feedback_events: 22 }),
893
- ep({ episode_index: 2, pass_rate: 0.6, cumulative_feedback_events: 35 }),
894
- ep({ episode_index: 3, pass_rate: 0.7, cumulative_feedback_events: 48 }),
895
- ];
896
- const curve = computeLearningCurve(episodes);
897
- expect(curve.pass_rate_by_episode).toEqual([0.4, 0.5, 0.6, 0.7]);
898
- // Slope is exactly 0.1 per episode for evenly spaced 0.1 increments.
899
- expect(curve.learning_slope).toBeCloseTo(0.1, 6);
900
- // Episode 1 first exceeds 0.4 + 0.05 = 0.45 (0.5 > 0.45).
901
- expect(curve.time_to_improvement).toBe(1);
902
- // Deltas: 0, 0.1, 0.1, 0.1
903
- expect(curve.episodes[0].delta_from_previous_episode).toBe(0);
904
- expect(curve.episodes[1].delta_from_previous_episode).toBeCloseTo(0.1);
905
- expect(curve.episodes[3].delta_from_previous_episode).toBeCloseTo(0.1);
906
- });
907
- test("no improvement: flat pass rate yields zero slope and null time_to_improvement", () => {
908
- const episodes = [
909
- ep({ episode_index: 0, pass_rate: 0.5 }),
910
- ep({ episode_index: 1, pass_rate: 0.5 }),
911
- ep({ episode_index: 2, pass_rate: 0.51 }),
912
- ep({ episode_index: 3, pass_rate: 0.52 }),
913
- ];
914
- const curve = computeLearningCurve(episodes);
915
- expect(curve.learning_slope).toBeCloseTo(0.0073, 3);
916
- // Never crosses 0.5 + 0.05 = 0.55.
917
- expect(curve.time_to_improvement).toBeNull();
918
- });
919
- test("regression mid-episode: slope still computed, time_to_improvement honours first qualifying episode", () => {
920
- // Pass rate climbs, then regresses below baseline+threshold, then recovers.
921
- const episodes = [
922
- ep({ episode_index: 0, pass_rate: 0.4 }),
923
- ep({ episode_index: 1, pass_rate: 0.6 }), // > 0.45 → first crossing
924
- ep({ episode_index: 2, pass_rate: 0.42 }), // mid-episode regression
925
- ep({ episode_index: 3, pass_rate: 0.55 }),
926
- ];
927
- const curve = computeLearningCurve(episodes);
928
- // First crossing wins, even though episode 2 regresses below threshold.
929
- expect(curve.time_to_improvement).toBe(1);
930
- expect(curve.episodes[2].delta_from_previous_episode).toBeCloseTo(-0.18);
931
- // Slope is computed across all four points; should be positive overall.
932
- expect(curve.learning_slope).toBeGreaterThan(0);
933
- });
934
- test("single-episode degenerate input: slope is 0, time_to_improvement is null", () => {
935
- const curve = computeLearningCurve([ep({ episode_index: 0, pass_rate: 0.7 })]);
936
- expect(curve.pass_rate_by_episode).toEqual([0.7]);
937
- expect(curve.learning_slope).toBe(0);
938
- expect(curve.time_to_improvement).toBeNull();
939
- // Episode 0's delta is always 0 by definition.
940
- expect(curve.episodes[0].delta_from_previous_episode).toBe(0);
941
- });
942
- test("empty input is degenerate: empty arrays, zero slope, null time", () => {
943
- const curve = computeLearningCurve([]);
944
- expect(curve.episodes).toEqual([]);
945
- expect(curve.pass_rate_by_episode).toEqual([]);
946
- expect(curve.learning_slope).toBe(0);
947
- expect(curve.time_to_improvement).toBeNull();
948
- });
949
- test("unsorted input is sorted by episode_index before processing", () => {
950
- const episodes = [
951
- ep({ episode_index: 2, pass_rate: 0.6 }),
952
- ep({ episode_index: 0, pass_rate: 0.4 }),
953
- ep({ episode_index: 1, pass_rate: 0.5 }),
954
- ];
955
- const curve = computeLearningCurve(episodes);
956
- expect(curve.episodes.map((e) => e.episode_index)).toEqual([0, 1, 2]);
957
- expect(curve.pass_rate_by_episode).toEqual([0.4, 0.5, 0.6]);
958
- expect(curve.time_to_improvement).toBe(1);
959
- });
960
- test("delta_from_previous_episode is recomputed defensively from sorted pass_rates", () => {
961
- // Caller stamps wrong deltas — function recomputes.
962
- const episodes = [
963
- ep({ episode_index: 0, pass_rate: 0.4, delta_from_previous_episode: 99 }),
964
- ep({ episode_index: 1, pass_rate: 0.6, delta_from_previous_episode: -42 }),
965
- ];
966
- const curve = computeLearningCurve(episodes);
967
- expect(curve.episodes[0].delta_from_previous_episode).toBe(0);
968
- expect(curve.episodes[1].delta_from_previous_episode).toBeCloseTo(0.2);
969
- });
970
- test("threshold is strictly greater-than (exact baseline+threshold does not count)", () => {
971
- // baseline = 0.5; threshold = 0.05 → must exceed 0.55.
972
- const episodes = [
973
- ep({ episode_index: 0, pass_rate: 0.5 }),
974
- ep({ episode_index: 1, pass_rate: 0.5 + LEARNING_IMPROVEMENT_THRESHOLD }), // 0.55 exactly
975
- ep({ episode_index: 2, pass_rate: 0.5 + LEARNING_IMPROVEMENT_THRESHOLD + 0.001 }),
976
- ];
977
- const curve = computeLearningCurve(episodes);
978
- expect(curve.time_to_improvement).toBe(2);
979
- });
980
- test("cumulative counters are echoed verbatim (caller-provided)", () => {
981
- const episodes = [
982
- ep({
983
- episode_index: 0,
984
- pass_rate: 0.4,
985
- cumulative_feedback_events: 10,
986
- cumulative_proposals_created: 0,
987
- cumulative_proposals_accepted: 0,
988
- cumulative_lessons_created: 0,
989
- lesson_reuse_rate: null,
990
- }),
991
- ep({
992
- episode_index: 1,
993
- pass_rate: 0.55,
994
- cumulative_feedback_events: 25,
995
- cumulative_proposals_created: 4,
996
- cumulative_proposals_accepted: 3,
997
- cumulative_lessons_created: 3,
998
- lesson_reuse_rate: 0.42,
999
- }),
1000
- ];
1001
- const curve = computeLearningCurve(episodes);
1002
- expect(curve.episodes[1].cumulative_feedback_events).toBe(25);
1003
- expect(curve.episodes[1].cumulative_proposals_accepted).toBe(3);
1004
- expect(curve.episodes[1].cumulative_lessons_created).toBe(3);
1005
- expect(curve.episodes[1].lesson_reuse_rate).toBeCloseTo(0.42);
1006
- });
1007
- });
1008
- // ── #271: masked-stash path-traversal hardening ─────────────────────────────
1009
- describe("materialiseMaskedStash stashName containment (#271)", () => {
1010
- function makeFixturesRoot() {
1011
- const fixturesRoot = benchMkdtemp("akm-bench-fixtures-");
1012
- // Plant a sibling outside fixturesRoot that a traversal-shaped stashName
1013
- // could otherwise reach. The MANIFEST.json existence check would pass
1014
- // there if containment were not enforced.
1015
- const sibling = path.join(path.dirname(fixturesRoot), `sibling-${path.basename(fixturesRoot)}`);
1016
- fs.mkdirSync(sibling, { recursive: true });
1017
- fs.writeFileSync(path.join(sibling, "MANIFEST.json"), "{}");
1018
- return {
1019
- fixturesRoot,
1020
- cleanup: () => {
1021
- fs.rmSync(fixturesRoot, { recursive: true, force: true });
1022
- fs.rmSync(sibling, { recursive: true, force: true });
1023
- },
1024
- };
1025
- }
1026
- test("rejects stashName starting with '..' (relative traversal)", () => {
1027
- const { fixturesRoot, cleanup } = makeFixturesRoot();
1028
- try {
1029
- const sibling = `../sibling-${path.basename(fixturesRoot)}`;
1030
- const result = materialiseMaskedStash(fixturesRoot, sibling, "skill:foo");
1031
- expect(result).toBeNull();
1032
- }
1033
- finally {
1034
- cleanup();
1035
- }
1036
- });
1037
- test("rejects absolute stashName", () => {
1038
- const { fixturesRoot, cleanup } = makeFixturesRoot();
1039
- try {
1040
- const result = materialiseMaskedStash(fixturesRoot, "/etc", "skill:foo");
1041
- expect(result).toBeNull();
1042
- }
1043
- finally {
1044
- cleanup();
1045
- }
1046
- });
1047
- test("rejects nested traversal that would escape fixturesRoot", () => {
1048
- const { fixturesRoot, cleanup } = makeFixturesRoot();
1049
- try {
1050
- // path.resolve(fixturesRoot, "a/../../sibling-xyz") would land on
1051
- // the sibling directory if containment is not enforced.
1052
- const escapePath = `a/../../sibling-${path.basename(fixturesRoot)}`;
1053
- const result = materialiseMaskedStash(fixturesRoot, escapePath, "skill:foo");
1054
- expect(result).toBeNull();
1055
- }
1056
- finally {
1057
- cleanup();
1058
- }
1059
- });
1060
- test("returns null (not a crash) for a contained stashName with no MANIFEST", () => {
1061
- const { fixturesRoot, cleanup } = makeFixturesRoot();
1062
- try {
1063
- // 'inner' is inside fixturesRoot but has no MANIFEST.json. Containment
1064
- // passes; the existing MANIFEST gate returns null. Sanity check that
1065
- // the new containment check did not accidentally reject the happy path.
1066
- fs.mkdirSync(path.join(fixturesRoot, "inner"));
1067
- const result = materialiseMaskedStash(fixturesRoot, "inner", "skill:foo");
1068
- expect(result).toBeNull();
1069
- }
1070
- finally {
1071
- cleanup();
1072
- }
1073
- });
1074
- });
1075
- describe("isPathContained symlink resolution (#271)", () => {
1076
- test("rejects a symlink inside root that points outside (alignment with isWithin)", () => {
1077
- const tmpRoot = benchMkdtemp("akm-bench-contain-root-");
1078
- const outside = benchMkdtemp("akm-bench-contain-outside-");
1079
- try {
1080
- // The actual file lives outside `tmpRoot`. Without realpath alignment,
1081
- // path.resolve(tmpRoot, "escape") looks contained ('escape' is just a
1082
- // basename) and the masking heuristic would happily rmSync it.
1083
- const escapeTarget = path.join(outside, "victim");
1084
- fs.writeFileSync(escapeTarget, "do-not-delete");
1085
- const symlinkPath = path.join(tmpRoot, "escape");
1086
- try {
1087
- fs.symlinkSync(escapeTarget, symlinkPath);
1088
- }
1089
- catch (err) {
1090
- // Some sandboxes (e.g. Windows w/o dev mode) deny symlink creation —
1091
- // skip rather than fail in those environments.
1092
- if (process.platform === "win32")
1093
- return;
1094
- throw err;
1095
- }
1096
- // Without #271 alignment: rel === "escape" (contained). With
1097
- // safeRealpath: target resolves to `outside/victim` → rel starts with
1098
- // "..".
1099
- expect(isPathContained(tmpRoot, symlinkPath)).toBe(false);
1100
- // The victim file must still exist after the rejection (sanity check
1101
- // for the test fixture, not the function under test).
1102
- expect(fs.existsSync(escapeTarget)).toBe(true);
1103
- }
1104
- finally {
1105
- fs.rmSync(tmpRoot, { recursive: true, force: true });
1106
- fs.rmSync(outside, { recursive: true, force: true });
1107
- }
1108
- });
1109
- test("accepts a symlink inside root that points back inside root", () => {
1110
- const tmpRoot = benchMkdtemp("akm-bench-contain-inside-");
1111
- try {
1112
- const realFile = path.join(tmpRoot, "real");
1113
- fs.writeFileSync(realFile, "ok");
1114
- const linkPath = path.join(tmpRoot, "link");
1115
- try {
1116
- fs.symlinkSync(realFile, linkPath);
1117
- }
1118
- catch (err) {
1119
- if (process.platform === "win32")
1120
- return;
1121
- throw err;
1122
- }
1123
- expect(isPathContained(tmpRoot, linkPath)).toBe(true);
1124
- }
1125
- finally {
1126
- fs.rmSync(tmpRoot, { recursive: true, force: true });
1127
- }
1128
- });
1129
- test("accepts a non-existent child path under root (covers safeRealpath ancestor walk)", () => {
1130
- const tmpRoot = benchMkdtemp("akm-bench-contain-pending-");
1131
- try {
1132
- const pending = path.join(tmpRoot, "not-yet-created", "child.md");
1133
- expect(isPathContained(tmpRoot, pending)).toBe(true);
1134
- }
1135
- finally {
1136
- fs.rmSync(tmpRoot, { recursive: true, force: true });
1137
- }
1138
- });
1139
- test("rejects an absolute target outside root", () => {
1140
- const tmpRoot = benchMkdtemp("akm-bench-contain-abs-");
1141
- const outside = benchMkdtemp("akm-bench-contain-abs-outside-");
1142
- try {
1143
- expect(isPathContained(tmpRoot, path.join(outside, "x"))).toBe(false);
1144
- }
1145
- finally {
1146
- fs.rmSync(tmpRoot, { recursive: true, force: true });
1147
- fs.rmSync(outside, { recursive: true, force: true });
1148
- }
1149
- });
1150
- });