akm-cli 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (332) hide show
  1. package/CHANGELOG.md +8 -0
  2. package/dist/{src/cli.js → cli.js} +22 -8
  3. package/dist/{src/commands → commands}/installed-stashes.js +1 -1
  4. package/dist/{src/commands → commands}/source-add.js +1 -1
  5. package/dist/{src/core → core}/common.js +16 -1
  6. package/dist/{src/core → core}/config.js +5 -2
  7. package/dist/{src/indexer → indexer}/db-search.js +16 -1
  8. package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
  9. package/dist/{src/indexer → indexer}/indexer.js +27 -11
  10. package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
  11. package/dist/{src/indexer → indexer}/search-source.js +1 -1
  12. package/dist/{src/llm → llm}/client.js +61 -1
  13. package/dist/{src/llm → llm}/embedder.js +8 -5
  14. package/dist/{src/llm → llm}/embedders/local.js +8 -2
  15. package/dist/{src/llm → llm}/embedders/remote.js +4 -2
  16. package/dist/{src/llm → llm}/graph-extract.js +4 -4
  17. package/dist/llm/memory-infer.js +114 -0
  18. package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
  19. package/dist/{src/output → output}/cli-hints.js +2 -0
  20. package/dist/{src/setup → setup}/setup.js +30 -20
  21. package/dist/sources/providers/website.js +27 -0
  22. package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
  23. package/docs/README.md +7 -0
  24. package/docs/migration/release-notes/0.7.0.md +14 -0
  25. package/package.json +11 -8
  26. package/dist/src/llm/memory-infer.js +0 -86
  27. package/dist/tests/add-website-source.test.js +0 -119
  28. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  29. package/dist/tests/agent/agent-config.test.js +0 -221
  30. package/dist/tests/agent/agent-detect.test.js +0 -100
  31. package/dist/tests/agent/agent-spawn.test.js +0 -234
  32. package/dist/tests/agent-output.test.js +0 -186
  33. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  34. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  35. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  36. package/dist/tests/asset-ref.test.js +0 -192
  37. package/dist/tests/asset-registry.test.js +0 -103
  38. package/dist/tests/asset-spec.test.js +0 -241
  39. package/dist/tests/bench/attribution.test.js +0 -996
  40. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  41. package/dist/tests/bench/cleanup.js +0 -234
  42. package/dist/tests/bench/cleanup.test.js +0 -166
  43. package/dist/tests/bench/cli.js +0 -1018
  44. package/dist/tests/bench/cli.test.js +0 -445
  45. package/dist/tests/bench/compare.test.js +0 -556
  46. package/dist/tests/bench/corpus.js +0 -317
  47. package/dist/tests/bench/corpus.test.js +0 -258
  48. package/dist/tests/bench/doctor.js +0 -525
  49. package/dist/tests/bench/driver.js +0 -401
  50. package/dist/tests/bench/driver.test.js +0 -584
  51. package/dist/tests/bench/environment.js +0 -233
  52. package/dist/tests/bench/environment.test.js +0 -199
  53. package/dist/tests/bench/evolve-metrics.js +0 -179
  54. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  55. package/dist/tests/bench/evolve.js +0 -647
  56. package/dist/tests/bench/evolve.test.js +0 -624
  57. package/dist/tests/bench/failure-modes.test.js +0 -349
  58. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  59. package/dist/tests/bench/leakage.test.js +0 -228
  60. package/dist/tests/bench/learning-curve.test.js +0 -134
  61. package/dist/tests/bench/metrics.js +0 -2395
  62. package/dist/tests/bench/metrics.test.js +0 -1150
  63. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  64. package/dist/tests/bench/opencode-config.js +0 -194
  65. package/dist/tests/bench/opencode-config.test.js +0 -370
  66. package/dist/tests/bench/report.js +0 -1885
  67. package/dist/tests/bench/report.test.js +0 -1038
  68. package/dist/tests/bench/run-config.js +0 -355
  69. package/dist/tests/bench/run-config.test.js +0 -298
  70. package/dist/tests/bench/run-curate-test.js +0 -32
  71. package/dist/tests/bench/run-failing-tasks.js +0 -56
  72. package/dist/tests/bench/run-full-bench.js +0 -51
  73. package/dist/tests/bench/run-items36-targeted.js +0 -69
  74. package/dist/tests/bench/run-nano-quick.js +0 -42
  75. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  76. package/dist/tests/bench/runner.js +0 -699
  77. package/dist/tests/bench/runner.test.js +0 -958
  78. package/dist/tests/bench/search-bridge.test.js +0 -331
  79. package/dist/tests/bench/tmp.js +0 -131
  80. package/dist/tests/bench/trajectory.js +0 -116
  81. package/dist/tests/bench/trajectory.test.js +0 -127
  82. package/dist/tests/bench/verifier.js +0 -114
  83. package/dist/tests/bench/verifier.test.js +0 -118
  84. package/dist/tests/bench/workflow-evaluator.js +0 -557
  85. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  86. package/dist/tests/bench/workflow-spec.js +0 -345
  87. package/dist/tests/bench/workflow-spec.test.js +0 -363
  88. package/dist/tests/bench/workflow-trace.js +0 -472
  89. package/dist/tests/bench/workflow-trace.test.js +0 -254
  90. package/dist/tests/benchmark-search-quality.js +0 -536
  91. package/dist/tests/benchmark-suite.js +0 -1441
  92. package/dist/tests/capture-cli.test.js +0 -112
  93. package/dist/tests/cli-errors.test.js +0 -204
  94. package/dist/tests/commands/events.test.js +0 -370
  95. package/dist/tests/commands/history.test.js +0 -418
  96. package/dist/tests/commands/import.test.js +0 -103
  97. package/dist/tests/commands/proposal-cli.test.js +0 -209
  98. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  99. package/dist/tests/commands/remember.test.js +0 -97
  100. package/dist/tests/commands/scope-flags.test.js +0 -300
  101. package/dist/tests/commands/search.test.js +0 -537
  102. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  103. package/dist/tests/commands/show.test.js +0 -294
  104. package/dist/tests/common.test.js +0 -266
  105. package/dist/tests/completions.test.js +0 -142
  106. package/dist/tests/config-cli.test.js +0 -193
  107. package/dist/tests/config-llm-features.test.js +0 -139
  108. package/dist/tests/config.test.js +0 -569
  109. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  110. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  111. package/dist/tests/contracts/spec-helpers.js +0 -46
  112. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  113. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  114. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  115. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  116. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  117. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  118. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  119. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  120. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  121. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  122. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  123. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  124. package/dist/tests/core/write-source.test.js +0 -366
  125. package/dist/tests/curate-command.test.js +0 -87
  126. package/dist/tests/db-scoring.test.js +0 -201
  127. package/dist/tests/db.test.js +0 -654
  128. package/dist/tests/distill-cli-flag.test.js +0 -208
  129. package/dist/tests/distill.test.js +0 -515
  130. package/dist/tests/docker-install.test.js +0 -120
  131. package/dist/tests/e2e.test.js +0 -1419
  132. package/dist/tests/embedder.test.js +0 -340
  133. package/dist/tests/embedding-model-config.test.js +0 -379
  134. package/dist/tests/feedback-command.test.js +0 -172
  135. package/dist/tests/file-context.test.js +0 -552
  136. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  137. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  138. package/dist/tests/fixtures/stashes/load.js +0 -166
  139. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  140. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  141. package/dist/tests/frontmatter.test.js +0 -190
  142. package/dist/tests/fts-field-weighting.test.js +0 -254
  143. package/dist/tests/fuzzy-search.test.js +0 -230
  144. package/dist/tests/git-provider-clone.test.js +0 -45
  145. package/dist/tests/github.test.js +0 -161
  146. package/dist/tests/graph-boost-ranking.test.js +0 -305
  147. package/dist/tests/graph-extraction.test.js +0 -282
  148. package/dist/tests/helpers/usage-events.js +0 -8
  149. package/dist/tests/index-pass-llm.test.js +0 -161
  150. package/dist/tests/indexer.test.js +0 -570
  151. package/dist/tests/info-command.test.js +0 -166
  152. package/dist/tests/init.test.js +0 -69
  153. package/dist/tests/install-script.test.js +0 -246
  154. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  155. package/dist/tests/issue-36-repro.test.js +0 -304
  156. package/dist/tests/issues-191-194.test.js +0 -160
  157. package/dist/tests/lesson-lint.test.js +0 -111
  158. package/dist/tests/llm-client.test.js +0 -115
  159. package/dist/tests/llm-feature-gate.test.js +0 -151
  160. package/dist/tests/llm.test.js +0 -139
  161. package/dist/tests/lockfile.test.js +0 -216
  162. package/dist/tests/manifest.test.js +0 -205
  163. package/dist/tests/markdown.test.js +0 -126
  164. package/dist/tests/matchers-unit.test.js +0 -189
  165. package/dist/tests/memory-inference.test.js +0 -299
  166. package/dist/tests/merge-scoring.test.js +0 -136
  167. package/dist/tests/metadata.test.js +0 -313
  168. package/dist/tests/migration-help.test.js +0 -89
  169. package/dist/tests/origin-resolve.test.js +0 -124
  170. package/dist/tests/output-baseline.test.js +0 -218
  171. package/dist/tests/output-shapes-unit.test.js +0 -478
  172. package/dist/tests/parallel-search.test.js +0 -272
  173. package/dist/tests/parameter-metadata.test.js +0 -365
  174. package/dist/tests/paths.test.js +0 -177
  175. package/dist/tests/progressive-disclosure.test.js +0 -280
  176. package/dist/tests/proposals.test.js +0 -279
  177. package/dist/tests/proposed-quality.test.js +0 -271
  178. package/dist/tests/provider-registry.test.js +0 -32
  179. package/dist/tests/ranking-regression.test.js +0 -548
  180. package/dist/tests/reflect-propose.test.js +0 -455
  181. package/dist/tests/registry-build-index.test.js +0 -394
  182. package/dist/tests/registry-cli.test.js +0 -290
  183. package/dist/tests/registry-index-v2.test.js +0 -430
  184. package/dist/tests/registry-install.test.js +0 -728
  185. package/dist/tests/registry-providers/parity.test.js +0 -189
  186. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  187. package/dist/tests/registry-providers/static-index.test.js +0 -238
  188. package/dist/tests/registry-resolve.test.js +0 -126
  189. package/dist/tests/registry-search.test.js +0 -923
  190. package/dist/tests/remember-frontmatter.test.js +0 -378
  191. package/dist/tests/remember-unit.test.js +0 -123
  192. package/dist/tests/ripgrep-install.test.js +0 -251
  193. package/dist/tests/ripgrep-resolve.test.js +0 -108
  194. package/dist/tests/ripgrep.test.js +0 -163
  195. package/dist/tests/save-command.test.js +0 -94
  196. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  197. package/dist/tests/scoring-pipeline.test.js +0 -648
  198. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  199. package/dist/tests/self-update.test.js +0 -442
  200. package/dist/tests/semantic-search-e2e.test.js +0 -512
  201. package/dist/tests/semantic-status.test.js +0 -471
  202. package/dist/tests/setup-run.integration.js +0 -877
  203. package/dist/tests/setup-wizard.test.js +0 -198
  204. package/dist/tests/setup.test.js +0 -131
  205. package/dist/tests/source-add.test.js +0 -11
  206. package/dist/tests/source-clone.test.js +0 -254
  207. package/dist/tests/source-manage.test.js +0 -366
  208. package/dist/tests/source-providers/filesystem.test.js +0 -82
  209. package/dist/tests/source-providers/git.test.js +0 -252
  210. package/dist/tests/source-providers/website.test.js +0 -128
  211. package/dist/tests/source-qa-fixes.test.js +0 -286
  212. package/dist/tests/source-registry.test.js +0 -350
  213. package/dist/tests/source-resolve.test.js +0 -100
  214. package/dist/tests/source-source.test.js +0 -281
  215. package/dist/tests/source.test.js +0 -533
  216. package/dist/tests/tar-utils-scan.test.js +0 -73
  217. package/dist/tests/toggle-components.test.js +0 -73
  218. package/dist/tests/usage-telemetry.test.js +0 -265
  219. package/dist/tests/utility-scoring.test.js +0 -558
  220. package/dist/tests/vault-load-error.test.js +0 -78
  221. package/dist/tests/vault-qa-fixes.test.js +0 -194
  222. package/dist/tests/vault.test.js +0 -429
  223. package/dist/tests/vector-search.test.js +0 -608
  224. package/dist/tests/walker.test.js +0 -252
  225. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  226. package/dist/tests/wave2-cluster-d.test.js +0 -180
  227. package/dist/tests/wave2-cluster-e.test.js +0 -179
  228. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  229. package/dist/tests/wiki.test.js +0 -529
  230. package/dist/tests/workflow-cli.test.js +0 -271
  231. package/dist/tests/workflow-markdown.test.js +0 -171
  232. package/dist/tests/workflow-path-escape.test.js +0 -132
  233. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  234. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  235. /package/dist/{src/commands → commands}/completions.js +0 -0
  236. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  237. /package/dist/{src/commands → commands}/curate.js +0 -0
  238. /package/dist/{src/commands → commands}/distill.js +0 -0
  239. /package/dist/{src/commands → commands}/events.js +0 -0
  240. /package/dist/{src/commands → commands}/history.js +0 -0
  241. /package/dist/{src/commands → commands}/info.js +0 -0
  242. /package/dist/{src/commands → commands}/init.js +0 -0
  243. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  244. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  245. /package/dist/{src/commands → commands}/proposal.js +0 -0
  246. /package/dist/{src/commands → commands}/propose.js +0 -0
  247. /package/dist/{src/commands → commands}/reflect.js +0 -0
  248. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  249. /package/dist/{src/commands → commands}/remember.js +0 -0
  250. /package/dist/{src/commands → commands}/search.js +0 -0
  251. /package/dist/{src/commands → commands}/self-update.js +0 -0
  252. /package/dist/{src/commands → commands}/show.js +0 -0
  253. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  254. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  255. /package/dist/{src/commands → commands}/vault.js +0 -0
  256. /package/dist/{src/core → core}/asset-ref.js +0 -0
  257. /package/dist/{src/core → core}/asset-registry.js +0 -0
  258. /package/dist/{src/core → core}/asset-spec.js +0 -0
  259. /package/dist/{src/core → core}/errors.js +0 -0
  260. /package/dist/{src/core → core}/events.js +0 -0
  261. /package/dist/{src/core → core}/frontmatter.js +0 -0
  262. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  263. /package/dist/{src/core → core}/markdown.js +0 -0
  264. /package/dist/{src/core → core}/paths.js +0 -0
  265. /package/dist/{src/core → core}/proposals.js +0 -0
  266. /package/dist/{src/core → core}/warn.js +0 -0
  267. /package/dist/{src/core → core}/write-source.js +0 -0
  268. /package/dist/{src/indexer → indexer}/db.js +0 -0
  269. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  270. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  271. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  272. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  273. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  274. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  275. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  276. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  277. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  278. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  279. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  280. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  281. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  282. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  283. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  284. /package/dist/{src/integrations → integrations}/github.js +0 -0
  285. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  286. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  287. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  288. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  289. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  290. /package/dist/{src/output → output}/context.js +0 -0
  291. /package/dist/{src/output → output}/renderers.js +0 -0
  292. /package/dist/{src/output → output}/shapes.js +0 -0
  293. /package/dist/{src/output → output}/text.js +0 -0
  294. /package/dist/{src/registry → registry}/build-index.js +0 -0
  295. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  296. /package/dist/{src/registry → registry}/factory.js +0 -0
  297. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  298. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  299. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  300. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  301. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  302. /package/dist/{src/registry → registry}/resolve.js +0 -0
  303. /package/dist/{src/registry → registry}/types.js +0 -0
  304. /package/dist/{src/setup → setup}/detect.js +0 -0
  305. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  306. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  307. /package/dist/{src/setup → setup}/steps.js +0 -0
  308. /package/dist/{src/sources → sources}/include.js +0 -0
  309. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  310. /package/dist/{src/sources → sources}/provider.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  314. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  315. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  316. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  317. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  318. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  319. /package/dist/{src/sources → sources}/resolve.js +0 -0
  320. /package/dist/{src/sources → sources}/types.js +0 -0
  321. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  322. /package/dist/{src/version.js → version.js} +0 -0
  323. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  324. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  325. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  326. /package/dist/{src/workflows → workflows}/db.js +0 -0
  327. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  328. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  329. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  330. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  331. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  332. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,331 +0,0 @@
1
- /**
2
- * Unit tests for the §6.7 search-pipeline bridge.
3
- *
4
- * Covers:
5
- * • `extractGoldRanks` — pure-function rank extraction from synthetic
6
- * verifier-stdout traces, including JSON tool-call form, plain-text
7
- * `ref:` lines, multiple searches per run, and gold-not-in-top-10
8
- * (the "missing" bucket).
9
- * • `computeSearchBridge` — histogram, p50/p90, gold_at_rank_1,
10
- * gold_missing, and the keystone `pass_rate_by_rank` slice.
11
- * • Empty-corpus path — no records → renderer emits the N/A sentence.
12
- *
13
- * No real opencode is invoked; every fixture is a hand-crafted `RunResult`.
14
- */
15
- import { describe, expect, test } from "bun:test";
16
- import { computeSearchBridge, extractGoldRanks } from "./metrics";
17
- import { renderSearchBridgeTable } from "./report";
18
- function fakeResult(stdout, overrides = {}) {
19
- return {
20
- schemaVersion: 1,
21
- taskId: "t",
22
- arm: "akm",
23
- seed: 0,
24
- model: "m",
25
- outcome: "pass",
26
- tokens: { input: 0, output: 0 },
27
- wallclockMs: 0,
28
- trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
29
- events: [],
30
- verifierStdout: stdout,
31
- verifierExitCode: 0,
32
- assetsLoaded: [],
33
- ...overrides,
34
- };
35
- }
36
- describe("extractGoldRanks", () => {
37
- test("returns [] when goldRef is undefined", () => {
38
- const r = fakeResult('akm search "foo"\nref: skill:foo');
39
- expect(extractGoldRanks(r, undefined)).toEqual([]);
40
- });
41
- test("returns [] when verifierStdout is empty", () => {
42
- const r = fakeResult("");
43
- expect(extractGoldRanks(r, "skill:foo")).toEqual([]);
44
- });
45
- test("extracts a single search with text-mode ref output, gold at rank 1", () => {
46
- const stdout = [
47
- `> akm search "redis healthcheck"`,
48
- `skill: docker-homelab`,
49
- ` ref: skill:docker-homelab`,
50
- ` score: 0.92`,
51
- `skill: nginx-tls`,
52
- ` ref: skill:nginx-tls`,
53
- ` score: 0.81`,
54
- ].join("\n");
55
- const events = extractGoldRanks(fakeResult(stdout), "skill:docker-homelab");
56
- expect(events).toHaveLength(1);
57
- expect(events[0].query).toBe("redis healthcheck");
58
- expect(events[0].results).toEqual(["skill:docker-homelab", "skill:nginx-tls"]);
59
- expect(events[0].rankOfGold).toBe(1);
60
- });
61
- test("extracts JSON tool-call form, gold at rank 3", () => {
62
- const stdout = [
63
- 'tool: akm search "kubernetes pod restart" --output json',
64
- '{"hits":[{"ref":"skill:k8s-debug"},{"ref":"skill:k8s-monitoring"},{"ref":"skill:k8s-restart"},{"ref":"skill:k8s-deploy"}]}',
65
- ].join("\n");
66
- const events = extractGoldRanks(fakeResult(stdout), "skill:k8s-restart");
67
- expect(events).toHaveLength(1);
68
- expect(events[0].results.slice(0, 4)).toEqual([
69
- "skill:k8s-debug",
70
- "skill:k8s-monitoring",
71
- "skill:k8s-restart",
72
- "skill:k8s-deploy",
73
- ]);
74
- expect(events[0].rankOfGold).toBe(3);
75
- });
76
- test("returns null rank when gold is missing from top 10", () => {
77
- const refs = Array.from({ length: 12 }, (_, i) => ` ref: skill:other-${i}`).join("\n");
78
- const stdout = `akm search "missing-target"\n${refs}`;
79
- const events = extractGoldRanks(fakeResult(stdout), "skill:gold");
80
- expect(events).toHaveLength(1);
81
- expect(events[0].rankOfGold).toBeNull();
82
- // Top-10 cap: only 10 results retained.
83
- expect(events[0].results.length).toBeLessThanOrEqual(10);
84
- });
85
- test("multiple searches per run are each emitted in order", () => {
86
- const stdout = [
87
- 'akm search "first query"',
88
- " ref: skill:a",
89
- " ref: skill:b",
90
- 'akm search "second query"',
91
- " ref: skill:gold",
92
- " ref: skill:c",
93
- ].join("\n");
94
- const events = extractGoldRanks(fakeResult(stdout), "skill:gold");
95
- expect(events).toHaveLength(2);
96
- expect(events[0].query).toBe("first query");
97
- expect(events[0].rankOfGold).toBeNull();
98
- expect(events[1].query).toBe("second query");
99
- expect(events[1].rankOfGold).toBe(1);
100
- });
101
- test("non-search akm invocation closes the active search block", () => {
102
- const stdout = [
103
- 'akm search "q"',
104
- " ref: skill:a",
105
- " ref: skill:gold",
106
- "akm show skill:gold",
107
- " ref: skill:gold (this should NOT extend the previous search)",
108
- " ref: skill:other",
109
- ].join("\n");
110
- const events = extractGoldRanks(fakeResult(stdout), "skill:gold");
111
- // Only the search block contributes to results; the show block is closed.
112
- expect(events).toHaveLength(1);
113
- expect(events[0].results).toEqual(["skill:a", "skill:gold"]);
114
- expect(events[0].rankOfGold).toBe(2);
115
- });
116
- test("origin-prefixed ref counts as gold (team//skill:foo matches skill:foo)", () => {
117
- const stdout = ['akm search "q"', " ref: team//skill:foo", " ref: skill:bar"].join("\n");
118
- const events = extractGoldRanks(fakeResult(stdout), "skill:foo");
119
- expect(events[0].rankOfGold).toBe(1);
120
- });
121
- });
122
- describe("computeSearchBridge — histogram + percentiles", () => {
123
- function fakeRecord(seed, outcome, rankOrNullPerSearch) {
124
- return {
125
- taskId: `t${seed}`,
126
- arm: "akm",
127
- seed,
128
- outcome,
129
- goldRef: "skill:gold",
130
- searches: rankOrNullPerSearch.map((rank, i) => ({
131
- query: `q${i}`,
132
- // Reconstruct a plausible result list: gold at the requested rank,
133
- // others as fillers. The aggregator only looks at rankOfGold.
134
- results: rank === null ? Array.from({ length: 10 }, (_, j) => `skill:other-${j}`) : [],
135
- rankOfGold: rank,
136
- })),
137
- };
138
- }
139
- test("empty corpus produces zero envelope", () => {
140
- const m = computeSearchBridge({ goldRankRecords: [] });
141
- expect(m.runsObserved).toBe(0);
142
- expect(m.searchesObserved).toBe(0);
143
- expect(m.goldRankP50).toBeNull();
144
- expect(m.goldRankP90).toBeNull();
145
- expect(m.goldAtRank1).toBe(0);
146
- expect(m.goldMissing).toBe(0);
147
- expect(m.passRateByRank).toEqual([]);
148
- // Histogram is fully zeroed for every key.
149
- for (const k of ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "missing"]) {
150
- expect(m.goldRankDistribution[k]).toBe(0);
151
- }
152
- });
153
- test("histogram counts ranks across all searches", () => {
154
- const records = [
155
- fakeRecord(0, "pass", [1, 2]),
156
- fakeRecord(1, "fail", [1, null]),
157
- fakeRecord(2, "pass", [3]),
158
- ];
159
- const m = computeSearchBridge({ goldRankRecords: records });
160
- expect(m.searchesObserved).toBe(5);
161
- expect(m.runsObserved).toBe(3);
162
- expect(m.goldRankDistribution["1"]).toBe(2);
163
- expect(m.goldRankDistribution["2"]).toBe(1);
164
- expect(m.goldRankDistribution["3"]).toBe(1);
165
- expect(m.goldRankDistribution.missing).toBe(1);
166
- expect(m.goldAtRank1).toBeCloseTo(2 / 5);
167
- expect(m.goldMissing).toBeCloseTo(1 / 5);
168
- });
169
- test("p50/p90 use nearest-rank with missing treated as Infinity", () => {
170
- // Ranks: [1,1,2,3,5,5,7,9,null,null] across one record.
171
- const records = [fakeRecord(0, "pass", [1, 1, 2, 3, 5, 5, 7, 9, null, null])];
172
- const m = computeSearchBridge({ goldRankRecords: records });
173
- // Sorted: [1,1,2,3,5,5,7,9,Inf,Inf]
174
- // p50 = idx ceil(0.5*10)-1 = 4 → 5
175
- expect(m.goldRankP50).toBe(5);
176
- // p90 = idx ceil(0.9*10)-1 = 8 → Infinity
177
- expect(m.goldRankP90).toBe(Number.POSITIVE_INFINITY);
178
- });
179
- });
180
- describe("computeSearchBridge — pass_rate_by_rank uses the agent's chosen search", () => {
181
- test("attributes pass/fail to the rank in the LAST search, not the highest-ranked", () => {
182
- // Run A passed; first search had gold at rank 1 (great rank!), but the
183
- // *chosen* (last) search had gold at rank 5. The bridge must attribute
184
- // run A to rank 5, not rank 1, otherwise it overstates the value of
185
- // having gold at rank 1.
186
- const records = [
187
- {
188
- taskId: "ta",
189
- arm: "akm",
190
- seed: 0,
191
- outcome: "pass",
192
- goldRef: "skill:gold",
193
- searches: [
194
- { query: "first", results: [], rankOfGold: 1 },
195
- { query: "last", results: [], rankOfGold: 5 },
196
- ],
197
- },
198
- {
199
- taskId: "tb",
200
- arm: "akm",
201
- seed: 0,
202
- outcome: "fail",
203
- goldRef: "skill:gold",
204
- searches: [{ query: "only", results: [], rankOfGold: 5 }],
205
- },
206
- {
207
- taskId: "tc",
208
- arm: "akm",
209
- seed: 0,
210
- outcome: "pass",
211
- goldRef: "skill:gold",
212
- searches: [{ query: "only", results: [], rankOfGold: 1 }],
213
- },
214
- ];
215
- const m = computeSearchBridge({ goldRankRecords: records });
216
- // Buckets: rank 1 → {1 pass / 1 total}, rank 5 → {1 pass / 2 total}.
217
- const rank1 = m.passRateByRank.find((e) => e.rank === "1");
218
- const rank5 = m.passRateByRank.find((e) => e.rank === "5");
219
- expect(rank1).toBeDefined();
220
- expect(rank1?.passRate).toBe(1);
221
- expect(rank1?.runCount).toBe(1);
222
- expect(rank5).toBeDefined();
223
- expect(rank5?.passRate).toBe(0.5);
224
- expect(rank5?.runCount).toBe(2);
225
- });
226
- test("missing bucket gets its own pass-rate row instead of being dropped", () => {
227
- const records = [
228
- {
229
- taskId: "tm1",
230
- arm: "akm",
231
- seed: 0,
232
- outcome: "pass",
233
- goldRef: "skill:gold",
234
- searches: [{ query: "q", results: [], rankOfGold: null }],
235
- },
236
- {
237
- taskId: "tm2",
238
- arm: "akm",
239
- seed: 0,
240
- outcome: "fail",
241
- goldRef: "skill:gold",
242
- searches: [{ query: "q", results: [], rankOfGold: null }],
243
- },
244
- ];
245
- const m = computeSearchBridge({ goldRankRecords: records });
246
- const missing = m.passRateByRank.find((e) => e.rank === "missing");
247
- expect(missing).toBeDefined();
248
- expect(missing?.runCount).toBe(2);
249
- expect(missing?.passRate).toBe(0.5);
250
- });
251
- test("runs without any akm search invocation are excluded from pass_rate_by_rank", () => {
252
- const records = [
253
- {
254
- taskId: "no-search",
255
- arm: "akm",
256
- seed: 0,
257
- outcome: "fail",
258
- goldRef: "skill:gold",
259
- searches: [],
260
- },
261
- ];
262
- const m = computeSearchBridge({ goldRankRecords: records });
263
- expect(m.runsObserved).toBe(1);
264
- expect(m.searchesObserved).toBe(0);
265
- expect(m.passRateByRank).toEqual([]);
266
- });
267
- });
268
- describe("renderSearchBridgeTable", () => {
269
- test("empty corpus renders the N/A sentence", () => {
270
- const md = renderSearchBridgeTable({
271
- goldRankDistribution: {
272
- "1": 0,
273
- "2": 0,
274
- "3": 0,
275
- "4": 0,
276
- "5": 0,
277
- "6": 0,
278
- "7": 0,
279
- "8": 0,
280
- "9": 0,
281
- "10": 0,
282
- missing: 0,
283
- },
284
- goldRankP50: null,
285
- goldRankP90: null,
286
- goldAtRank1: 0,
287
- goldMissing: 0,
288
- passRateByRank: [],
289
- runsObserved: 0,
290
- searchesObserved: 0,
291
- });
292
- expect(md).toContain("Search → outcome bridge");
293
- expect(md).toContain("(no gold-ref tasks in corpus; bridge metrics N/A)");
294
- });
295
- test("populated corpus surfaces histogram, p50/p90, and pass-rate-by-rank table", () => {
296
- const md = renderSearchBridgeTable({
297
- goldRankDistribution: {
298
- "1": 3,
299
- "2": 1,
300
- "3": 0,
301
- "4": 0,
302
- "5": 1,
303
- "6": 0,
304
- "7": 0,
305
- "8": 0,
306
- "9": 0,
307
- "10": 0,
308
- missing: 1,
309
- },
310
- goldRankP50: 1,
311
- goldRankP90: 5,
312
- goldAtRank1: 0.5,
313
- goldMissing: 1 / 6,
314
- passRateByRank: [
315
- { rank: "1", passRate: 0.67, runCount: 3 },
316
- { rank: "5", passRate: 0, runCount: 1 },
317
- { rank: "missing", passRate: 0, runCount: 1 },
318
- ],
319
- runsObserved: 5,
320
- searchesObserved: 6,
321
- });
322
- expect(md).toContain("| 1 | 3 |");
323
- expect(md).toContain("| missing | 1 |");
324
- expect(md).toContain("p50=1.0");
325
- expect(md).toContain("p90=5.0");
326
- expect(md).toContain("gold_at_rank_1=50.0%");
327
- expect(md).toContain("| rank | pass_rate | run_count |");
328
- expect(md).toContain("| 1 | 0.67 | 3 |");
329
- expect(md).toContain("| missing | 0.00 | 1 |");
330
- });
331
- });
@@ -1,131 +0,0 @@
1
- /**
2
- * Bench tmp-root redirection (#276).
3
- *
4
- * Every bench tmp directory — per-(task, arm, seed) workspace, per-task
5
- * fixture stash, per-fixture evolveStash + preStash, plus the scratch dirs
6
- * spun up inside unit tests — lives under `${AKM_CACHE_DIR}/bench/`, NOT
7
- * `os.tmpdir()`.
8
- *
9
- * Why: during long bench/workflow runs, orphan tmp dirs from crashed agents
10
- * accumulate. When they pile up under `/tmp` the OS-level partition fills,
11
- * which breaks shells, browsers, npm caches, and the rest of the system.
12
- * Pinning bench tmp to the akm cache dir means a single
13
- * `rm -rf "$(akm config get cache.dir)/bench"` purges all bench scratch
14
- * without disturbing anything else.
15
- *
16
- * The bench cleanup machinery (`tests/bench/cleanup.ts`) also reaps
17
- * `${AKM_CACHE_DIR}/bench/*` entries older than 6 hours on the first
18
- * `registerCleanup` call to catch orphans from prior crashed runs.
19
- *
20
- * NOTE: this helper deliberately does NOT import `os.tmpdir()`. The
21
- * invariant test (`tests/bench/no-os-tmpdir-invariant.test.ts`) asserts
22
- * zero `os.tmpdir` references across `tests/bench/*.ts`.
23
- */
24
- import * as fs from "node:fs";
25
- import * as path from "node:path";
26
- import { getCacheDir } from "../../src/core/paths";
27
/**
 * Resolve the bench tmp root — the `bench` directory inside the akm cache
 * dir — creating it on demand.
 *
 * @returns {string} Path of the bench tmp root.
 */
export function benchTmpRoot() {
    const benchDir = path.join(getCacheDir(), "bench");
    // `recursive: true` creates missing parents and is a no-op when the
    // directory already exists, so repeated calls are safe.
    fs.mkdirSync(benchDir, { recursive: true });
    return benchDir;
}
33
/**
 * Make a new, uniquely named scratch directory inside `benchTmpRoot()`.
 *
 * Intended as a stand-in for `fs.mkdtempSync(path.join(os.tmpdir(), prefix))`
 * that keeps bench scratch out of the OS tmp dir.
 *
 * @param {string} prefix Leading part of the directory name; `fs.mkdtempSync`
 *   appends random characters to make it unique.
 * @returns {string} Path of the directory that was created.
 */
export function benchMkdtemp(prefix) {
    const template = path.join(benchTmpRoot(), prefix);
    return fs.mkdtempSync(template);
}
42
- // ── PID file ────────────────────────────────────────────────────────────────
43
/**
 * Location of the bench PID file: `bench.pid` inside `benchTmpRoot()`.
 *
 * @returns {string} Path of the PID file (the file itself may not exist).
 */
export function benchPidPath() {
    const root = benchTmpRoot();
    return path.join(root, "bench.pid");
}
47
/**
 * Record `process.pid` in the bench PID file.
 *
 * When a PID file is already present and names a process that is no longer
 * running, a warning is written to stderr and the stale file is removed
 * before the new one is written. A file that cannot be read or parsed is
 * simply overwritten. All filesystem failures are swallowed: the PID file is
 * a diagnostic aid, not a lock.
 *
 * @returns {() => void} Cleanup callback that deletes the PID file, but only
 *   while it still holds this process's PID. Call it from a `finally` block
 *   so it runs on both normal and exceptional exit.
 */
export function writeBenchPid() {
    const pidFile = benchPidPath();
    // Best-effort delete shared by the stale-file path and the cleanup callback.
    const removeQuietly = () => {
        try {
            fs.rmSync(pidFile, { force: true });
        }
        catch {
            /* best-effort */
        }
    };
    if (fs.existsSync(pidFile)) {
        // NaN doubles as "nothing usable recorded": it fails the finite check
        // below, so unreadable and unparseable files skip the stale handling.
        let previousPid = Number.NaN;
        try {
            previousPid = Number.parseInt(fs.readFileSync(pidFile, "utf8").trim(), 10);
        }
        catch {
            // Unreadable — fall through and overwrite it below.
        }
        const isStale = Number.isFinite(previousPid) && !isPidRunning(previousPid);
        if (isStale) {
            process.stderr.write(`bench: removing stale PID file for PID ${previousPid} (process not running)\n`);
            removeQuietly();
        }
    }
    try {
        fs.writeFileSync(pidFile, String(process.pid), "utf8");
    }
    catch {
        /* best-effort — PID file is diagnostic, not critical */
    }
    return function removeOwnPidFile() {
        let recorded;
        try {
            recorded = fs.readFileSync(pidFile, "utf8").trim();
        }
        catch {
            // Already gone or unreadable — nothing to clean up.
            return;
        }
        // Leave the file alone if another process has since claimed it.
        if (recorded === String(process.pid)) {
            removeQuietly();
        }
    };
}
98
/**
 * Load the PID recorded in the bench PID file.
 *
 * @returns {number | undefined} The recorded PID, or `undefined` when the
 *   file is missing, unreadable, or does not start with a positive integer.
 */
export function readBenchPid() {
    const pidFile = benchPidPath();
    if (fs.existsSync(pidFile)) {
        try {
            const pid = Number.parseInt(fs.readFileSync(pidFile, "utf8").trim(), 10);
            if (Number.isFinite(pid) && pid > 0) {
                return pid;
            }
        }
        catch {
            // Unreadable file — reported the same way as a missing one.
        }
    }
    return undefined;
}
115
/**
 * Report whether a process with the given PID is running on this host.
 *
 * Probes with `process.kill(pid, 0)`: signal 0 delivers nothing and only
 * checks that the target can be signalled. The probe throws `ESRCH` when no
 * such process exists and `EPERM` when the process exists but belongs to
 * another user — which still means it is running.
 *
 * Only positive integer PIDs identify a single process. Zero and negative
 * values address process groups at the OS level, so the probe would succeed
 * for them and a corrupt PID file containing e.g. `0` would look like a live
 * process. Such values are reported as not running without probing.
 *
 * @param {number} pid PID to probe (numeric strings are tolerated).
 * @returns {boolean} `true` when the process exists, `false` otherwise.
 */
export function isPidRunning(pid) {
    const numericPid = Number(pid);
    if (!Number.isInteger(numericPid) || numericPid <= 0) {
        return false;
    }
    try {
        process.kill(numericPid, 0);
        return true;
    }
    catch (err) {
        // EPERM means the process exists but we may not signal it; anything
        // else (ESRCH, argument errors) counts as "not running".
        return err.code === "EPERM";
    }
}
@@ -1,116 +0,0 @@
1
- /**
2
- * akm-bench trajectory parser (spec §6.2).
3
- *
4
- * Trajectory metrics describe the *path* the agent took through the run, not
5
- * just the terminal outcome. For #238 we score two booleans per run:
6
- *
7
- * • `correctAssetLoaded` — did the agent invoke `akm show <goldRef>` (or
8
- * a sufficient prefix thereof) at any point during the run? `null` when
9
- * the task carries no `goldRef` (and so the metric is undefined).
10
- * • `feedbackRecorded` — did the agent emit any `feedback` event into
11
- * `events.jsonl` during the run? Always `false` for the `noakm` arm
12
- * because that arm runs without a stash.
13
- *
14
- * The driver hands us a `RunResult` after the run has finished. We never
15
- * mutate it; we return a fresh `TrajectoryRecord` and let the runner splice
16
- * it back in. This keeps `runOne`'s signature stable and lets `#239`/`#240`
17
- * extend the trajectory shape without touching the driver.
18
- */
19
/**
 * Upper bound on how much of `verifierStdout` is substring-scanned for the
 * `akm show <ref>` heuristic, measured in string length units (16 × 2^20).
 *
 * Captured stdout can be arbitrarily large; bounding the scan keeps the
 * trajectory computation cheap. Only the leading portion up to this cap is
 * inspected when deciding whether the gold ref was shown.
 */
export const VERIFIER_STDOUT_SCAN_CAP = 16 * 2 ** 20;
26
/**
 * Build the trajectory record for one finished run.
 *
 * The record carries two fields:
 *   - `correctAssetLoaded` — `null` when the task has no `goldRef`; otherwise
 *     whether the run shows evidence of `akm show <goldRef>`. Evidence is
 *     looked for first in `runResult.events` (an event whose `ref` or
 *     `metadata.ref` matches) and then in `runResult.verifierStdout`.
 *     A gold ref such as `skill:docker-homelab` also matches the more
 *     specific `skill:docker-homelab/anything`. Matching is case-sensitive.
 *   - `feedbackRecorded` — whether any event has `eventType === "feedback"`.
 *
 * `runResult` is only read, never modified; a fresh object is returned.
 *
 * @param task Task definition; only `goldRef` is consulted.
 * @param runResult Finished run; `events` and `verifierStdout` are consulted.
 * @param opts Optional; when `opts.warnings` is present, a warning is pushed
 *   onto it if the stdout scan had to be truncated.
 * @returns {{ correctAssetLoaded: boolean | null, feedbackRecorded: boolean }}
 */
export function computeTrajectory(task, runResult, opts) {
    return {
        correctAssetLoaded: computeCorrectAssetLoaded(task, runResult, opts),
        feedbackRecorded: computeFeedbackRecorded(runResult),
    };
}
46
/**
 * Decide whether the run loaded the task's gold asset.
 *
 * @returns `null` when the task has no `goldRef`; `true` when an event or the
 *   captured stdout shows the gold ref being shown; `false` otherwise.
 */
function computeCorrectAssetLoaded(task, runResult, opts) {
    const goldRef = task.goldRef;
    if (!goldRef) {
        return null;
    }
    // Pass 1: structured events. An event counts when its own `ref`, or the
    // `ref` nested under its `metadata`, matches the gold ref.
    const refMatches = (value) => typeof value === "string" && matchesRef(value, goldRef);
    for (const event of runResult.events) {
        if (refMatches(event.ref)) {
            return true;
        }
        const metadata = event.metadata;
        if (metadata && typeof metadata === "object" && refMatches(metadata.ref)) {
            return true;
        }
    }
    // Pass 2: text scan of the captured stdout for `akm show <ref>` or its
    // tool-call JSON form.
    const stdout = runResult.verifierStdout;
    if (!stdout) {
        return false;
    }
    let scanned = stdout;
    if (stdout.length > VERIFIER_STDOUT_SCAN_CAP) {
        // Scan only the leading portion so oversized output stays cheap, and
        // record the truncation so the report can surface it.
        scanned = stdout.slice(0, VERIFIER_STDOUT_SCAN_CAP);
        if (opts?.warnings) {
            opts.warnings.push(`verifierStdout truncated for trajectory scan: ${stdout.length} chars exceeds ${VERIFIER_STDOUT_SCAN_CAP}-char cap; correct_asset_loaded computed from the prefix.`);
        }
    }
    return containsAkmShow(scanned, goldRef);
}
84
/**
 * Test whether `candidate` names the gold asset.
 *
 * Accepted forms, all anchored on ref separators:
 *   - the gold ref itself;
 *   - the gold ref behind a `//` separator (e.g. `team//skill:foo` for gold
 *     `skill:foo`);
 *   - a more specific ref below the gold ref (`skill:foo/...`).
 */
function matchesRef(candidate, gold) {
    return (candidate === gold ||
        candidate.endsWith(`//${gold}`) ||
        candidate.startsWith(`${gold}/`));
}
95
/**
 * Heuristically detect an `akm show <ref>` invocation inside captured text.
 *
 * Two shapes are recognised:
 *   1. The CLI form `akm show <ref>`, with flexible whitespace and an
 *      optional opening quote before the ref.
 *   2. The tool-call JSON form, e.g. `"args":["show","<ref>"]`: the text
 *      contains the literal `"show"` and the ref appears somewhere in it.
 *
 * In both shapes the ref must end at a ref boundary: it may not be followed
 * by a word character or a hyphen. Without that rule the gold ref
 * `skill:docker` would be credited for `akm show skill:docker-homelab`, a
 * different asset. A `/` continuation (`skill:foo/sub`) is still accepted,
 * as are closing quotes, whitespace, punctuation, and end of text.
 *
 * @param {string} text Text to scan.
 * @param {string} ref Gold asset ref; matched literally and case-sensitively.
 * @returns {boolean} `true` when either shape is found.
 */
function containsAkmShow(text, ref) {
    // Escape regex metacharacters so the ref is matched as a literal.
    const escaped = ref.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The ref must not run on into a longer identifier.
    const refEnd = "(?![\\w-])";
    const cliPattern = new RegExp(`akm\\s+show\\s+["']?${escaped}${refEnd}`);
    if (cliPattern.test(text)) {
        return true;
    }
    // Tool-call JSON form: `"args":["show","<ref>"]` or similar. Cheap heuristic.
    if (!text.includes(`"show"`)) {
        return false;
    }
    return new RegExp(`${escaped}${refEnd}`).test(text);
}
107
/**
 * Report whether the run's event stream contains at least one event whose
 * `eventType` is exactly `"feedback"`.
 *
 * The check is the same for every arm: a run with no events simply yields
 * `false`.
 */
function computeFeedbackRecorded(runResult) {
    return runResult.events.some((event) => event.eventType === "feedback");
}