akm-cli 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (327)
  1. package/package.json +8 -8
  2. package/dist/tests/add-website-source.test.js +0 -119
  3. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  4. package/dist/tests/agent/agent-config.test.js +0 -221
  5. package/dist/tests/agent/agent-detect.test.js +0 -100
  6. package/dist/tests/agent/agent-spawn.test.js +0 -234
  7. package/dist/tests/agent-output.test.js +0 -186
  8. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  9. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  10. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  11. package/dist/tests/asset-ref.test.js +0 -192
  12. package/dist/tests/asset-registry.test.js +0 -103
  13. package/dist/tests/asset-spec.test.js +0 -241
  14. package/dist/tests/bench/attribution.test.js +0 -996
  15. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  16. package/dist/tests/bench/cleanup.js +0 -234
  17. package/dist/tests/bench/cleanup.test.js +0 -166
  18. package/dist/tests/bench/cli.js +0 -1018
  19. package/dist/tests/bench/cli.test.js +0 -445
  20. package/dist/tests/bench/compare.test.js +0 -556
  21. package/dist/tests/bench/corpus.js +0 -317
  22. package/dist/tests/bench/corpus.test.js +0 -258
  23. package/dist/tests/bench/doctor.js +0 -525
  24. package/dist/tests/bench/driver.js +0 -401
  25. package/dist/tests/bench/driver.test.js +0 -584
  26. package/dist/tests/bench/environment.js +0 -233
  27. package/dist/tests/bench/environment.test.js +0 -199
  28. package/dist/tests/bench/evolve-metrics.js +0 -179
  29. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  30. package/dist/tests/bench/evolve.js +0 -647
  31. package/dist/tests/bench/evolve.test.js +0 -624
  32. package/dist/tests/bench/failure-modes.test.js +0 -349
  33. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  34. package/dist/tests/bench/leakage.test.js +0 -228
  35. package/dist/tests/bench/learning-curve.test.js +0 -134
  36. package/dist/tests/bench/metrics.js +0 -2395
  37. package/dist/tests/bench/metrics.test.js +0 -1150
  38. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  39. package/dist/tests/bench/opencode-config.js +0 -194
  40. package/dist/tests/bench/opencode-config.test.js +0 -370
  41. package/dist/tests/bench/report.js +0 -1885
  42. package/dist/tests/bench/report.test.js +0 -1038
  43. package/dist/tests/bench/run-config.js +0 -355
  44. package/dist/tests/bench/run-config.test.js +0 -298
  45. package/dist/tests/bench/run-curate-test.js +0 -32
  46. package/dist/tests/bench/run-failing-tasks.js +0 -56
  47. package/dist/tests/bench/run-full-bench.js +0 -51
  48. package/dist/tests/bench/run-items36-targeted.js +0 -69
  49. package/dist/tests/bench/run-nano-quick.js +0 -42
  50. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  51. package/dist/tests/bench/runner.js +0 -699
  52. package/dist/tests/bench/runner.test.js +0 -958
  53. package/dist/tests/bench/search-bridge.test.js +0 -331
  54. package/dist/tests/bench/tmp.js +0 -131
  55. package/dist/tests/bench/trajectory.js +0 -116
  56. package/dist/tests/bench/trajectory.test.js +0 -127
  57. package/dist/tests/bench/verifier.js +0 -114
  58. package/dist/tests/bench/verifier.test.js +0 -118
  59. package/dist/tests/bench/workflow-evaluator.js +0 -557
  60. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  61. package/dist/tests/bench/workflow-spec.js +0 -345
  62. package/dist/tests/bench/workflow-spec.test.js +0 -363
  63. package/dist/tests/bench/workflow-trace.js +0 -472
  64. package/dist/tests/bench/workflow-trace.test.js +0 -254
  65. package/dist/tests/benchmark-search-quality.js +0 -536
  66. package/dist/tests/benchmark-suite.js +0 -1441
  67. package/dist/tests/capture-cli.test.js +0 -112
  68. package/dist/tests/cli-errors.test.js +0 -204
  69. package/dist/tests/commands/events.test.js +0 -370
  70. package/dist/tests/commands/history.test.js +0 -418
  71. package/dist/tests/commands/import.test.js +0 -103
  72. package/dist/tests/commands/proposal-cli.test.js +0 -209
  73. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  74. package/dist/tests/commands/remember.test.js +0 -97
  75. package/dist/tests/commands/scope-flags.test.js +0 -300
  76. package/dist/tests/commands/search.test.js +0 -537
  77. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  78. package/dist/tests/commands/show.test.js +0 -294
  79. package/dist/tests/common.test.js +0 -266
  80. package/dist/tests/completions.test.js +0 -142
  81. package/dist/tests/config-cli.test.js +0 -193
  82. package/dist/tests/config-llm-features.test.js +0 -139
  83. package/dist/tests/config.test.js +0 -569
  84. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  85. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  86. package/dist/tests/contracts/spec-helpers.js +0 -46
  87. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  88. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  89. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  90. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  91. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  92. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  93. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  94. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  95. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  96. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  97. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  98. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  99. package/dist/tests/core/write-source.test.js +0 -366
  100. package/dist/tests/curate-command.test.js +0 -87
  101. package/dist/tests/db-scoring.test.js +0 -201
  102. package/dist/tests/db.test.js +0 -654
  103. package/dist/tests/distill-cli-flag.test.js +0 -208
  104. package/dist/tests/distill.test.js +0 -515
  105. package/dist/tests/docker-install.test.js +0 -120
  106. package/dist/tests/e2e.test.js +0 -1419
  107. package/dist/tests/embedder.test.js +0 -340
  108. package/dist/tests/embedding-model-config.test.js +0 -379
  109. package/dist/tests/feedback-command.test.js +0 -172
  110. package/dist/tests/file-context.test.js +0 -552
  111. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  112. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  113. package/dist/tests/fixtures/stashes/load.js +0 -166
  114. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  115. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  116. package/dist/tests/frontmatter.test.js +0 -190
  117. package/dist/tests/fts-field-weighting.test.js +0 -254
  118. package/dist/tests/fuzzy-search.test.js +0 -230
  119. package/dist/tests/git-provider-clone.test.js +0 -45
  120. package/dist/tests/github.test.js +0 -161
  121. package/dist/tests/graph-boost-ranking.test.js +0 -305
  122. package/dist/tests/graph-extraction.test.js +0 -282
  123. package/dist/tests/helpers/usage-events.js +0 -8
  124. package/dist/tests/index-pass-llm.test.js +0 -161
  125. package/dist/tests/indexer.test.js +0 -570
  126. package/dist/tests/info-command.test.js +0 -166
  127. package/dist/tests/init.test.js +0 -69
  128. package/dist/tests/install-script.test.js +0 -246
  129. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  130. package/dist/tests/issue-36-repro.test.js +0 -304
  131. package/dist/tests/issues-191-194.test.js +0 -160
  132. package/dist/tests/lesson-lint.test.js +0 -111
  133. package/dist/tests/llm-client.test.js +0 -115
  134. package/dist/tests/llm-feature-gate.test.js +0 -151
  135. package/dist/tests/llm.test.js +0 -139
  136. package/dist/tests/lockfile.test.js +0 -216
  137. package/dist/tests/manifest.test.js +0 -205
  138. package/dist/tests/markdown.test.js +0 -126
  139. package/dist/tests/matchers-unit.test.js +0 -189
  140. package/dist/tests/memory-inference.test.js +0 -299
  141. package/dist/tests/merge-scoring.test.js +0 -136
  142. package/dist/tests/metadata.test.js +0 -313
  143. package/dist/tests/migration-help.test.js +0 -89
  144. package/dist/tests/origin-resolve.test.js +0 -124
  145. package/dist/tests/output-baseline.test.js +0 -218
  146. package/dist/tests/output-shapes-unit.test.js +0 -478
  147. package/dist/tests/parallel-search.test.js +0 -272
  148. package/dist/tests/parameter-metadata.test.js +0 -365
  149. package/dist/tests/paths.test.js +0 -177
  150. package/dist/tests/progressive-disclosure.test.js +0 -280
  151. package/dist/tests/proposals.test.js +0 -279
  152. package/dist/tests/proposed-quality.test.js +0 -271
  153. package/dist/tests/provider-registry.test.js +0 -32
  154. package/dist/tests/ranking-regression.test.js +0 -548
  155. package/dist/tests/reflect-propose.test.js +0 -455
  156. package/dist/tests/registry-build-index.test.js +0 -394
  157. package/dist/tests/registry-cli.test.js +0 -290
  158. package/dist/tests/registry-index-v2.test.js +0 -430
  159. package/dist/tests/registry-install.test.js +0 -728
  160. package/dist/tests/registry-providers/parity.test.js +0 -189
  161. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  162. package/dist/tests/registry-providers/static-index.test.js +0 -238
  163. package/dist/tests/registry-resolve.test.js +0 -126
  164. package/dist/tests/registry-search.test.js +0 -923
  165. package/dist/tests/remember-frontmatter.test.js +0 -378
  166. package/dist/tests/remember-unit.test.js +0 -123
  167. package/dist/tests/ripgrep-install.test.js +0 -251
  168. package/dist/tests/ripgrep-resolve.test.js +0 -108
  169. package/dist/tests/ripgrep.test.js +0 -163
  170. package/dist/tests/save-command.test.js +0 -94
  171. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  172. package/dist/tests/scoring-pipeline.test.js +0 -648
  173. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  174. package/dist/tests/self-update.test.js +0 -442
  175. package/dist/tests/semantic-search-e2e.test.js +0 -512
  176. package/dist/tests/semantic-status.test.js +0 -471
  177. package/dist/tests/setup-run.integration.js +0 -877
  178. package/dist/tests/setup-wizard.test.js +0 -198
  179. package/dist/tests/setup.test.js +0 -131
  180. package/dist/tests/source-add.test.js +0 -11
  181. package/dist/tests/source-clone.test.js +0 -254
  182. package/dist/tests/source-manage.test.js +0 -366
  183. package/dist/tests/source-providers/filesystem.test.js +0 -82
  184. package/dist/tests/source-providers/git.test.js +0 -252
  185. package/dist/tests/source-providers/website.test.js +0 -128
  186. package/dist/tests/source-qa-fixes.test.js +0 -286
  187. package/dist/tests/source-registry.test.js +0 -350
  188. package/dist/tests/source-resolve.test.js +0 -100
  189. package/dist/tests/source-source.test.js +0 -281
  190. package/dist/tests/source.test.js +0 -533
  191. package/dist/tests/tar-utils-scan.test.js +0 -73
  192. package/dist/tests/toggle-components.test.js +0 -73
  193. package/dist/tests/usage-telemetry.test.js +0 -265
  194. package/dist/tests/utility-scoring.test.js +0 -558
  195. package/dist/tests/vault-load-error.test.js +0 -78
  196. package/dist/tests/vault-qa-fixes.test.js +0 -194
  197. package/dist/tests/vault.test.js +0 -429
  198. package/dist/tests/vector-search.test.js +0 -608
  199. package/dist/tests/walker.test.js +0 -252
  200. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  201. package/dist/tests/wave2-cluster-d.test.js +0 -180
  202. package/dist/tests/wave2-cluster-e.test.js +0 -179
  203. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  204. package/dist/tests/wiki.test.js +0 -529
  205. package/dist/tests/workflow-cli.test.js +0 -271
  206. package/dist/tests/workflow-markdown.test.js +0 -171
  207. package/dist/tests/workflow-path-escape.test.js +0 -132
  208. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  209. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  210. /package/dist/{src/cli.js → cli.js} +0 -0
  211. /package/dist/{src/commands → commands}/completions.js +0 -0
  212. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  213. /package/dist/{src/commands → commands}/curate.js +0 -0
  214. /package/dist/{src/commands → commands}/distill.js +0 -0
  215. /package/dist/{src/commands → commands}/events.js +0 -0
  216. /package/dist/{src/commands → commands}/history.js +0 -0
  217. /package/dist/{src/commands → commands}/info.js +0 -0
  218. /package/dist/{src/commands → commands}/init.js +0 -0
  219. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  220. /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
  221. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  222. /package/dist/{src/commands → commands}/proposal.js +0 -0
  223. /package/dist/{src/commands → commands}/propose.js +0 -0
  224. /package/dist/{src/commands → commands}/reflect.js +0 -0
  225. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  226. /package/dist/{src/commands → commands}/remember.js +0 -0
  227. /package/dist/{src/commands → commands}/search.js +0 -0
  228. /package/dist/{src/commands → commands}/self-update.js +0 -0
  229. /package/dist/{src/commands → commands}/show.js +0 -0
  230. /package/dist/{src/commands → commands}/source-add.js +0 -0
  231. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  232. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  233. /package/dist/{src/commands → commands}/vault.js +0 -0
  234. /package/dist/{src/core → core}/asset-ref.js +0 -0
  235. /package/dist/{src/core → core}/asset-registry.js +0 -0
  236. /package/dist/{src/core → core}/asset-spec.js +0 -0
  237. /package/dist/{src/core → core}/common.js +0 -0
  238. /package/dist/{src/core → core}/config.js +0 -0
  239. /package/dist/{src/core → core}/errors.js +0 -0
  240. /package/dist/{src/core → core}/events.js +0 -0
  241. /package/dist/{src/core → core}/frontmatter.js +0 -0
  242. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  243. /package/dist/{src/core → core}/markdown.js +0 -0
  244. /package/dist/{src/core → core}/paths.js +0 -0
  245. /package/dist/{src/core → core}/proposals.js +0 -0
  246. /package/dist/{src/core → core}/warn.js +0 -0
  247. /package/dist/{src/core → core}/write-source.js +0 -0
  248. /package/dist/{src/indexer → indexer}/db-search.js +0 -0
  249. /package/dist/{src/indexer → indexer}/db.js +0 -0
  250. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  251. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  252. /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
  253. /package/dist/{src/indexer → indexer}/indexer.js +0 -0
  254. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  255. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  256. /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
  257. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  258. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  259. /package/dist/{src/indexer → indexer}/search-source.js +0 -0
  260. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  261. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  262. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  263. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  264. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  265. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  266. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  267. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  268. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  269. /package/dist/{src/integrations → integrations}/github.js +0 -0
  270. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  271. /package/dist/{src/llm → llm}/client.js +0 -0
  272. /package/dist/{src/llm → llm}/embedder.js +0 -0
  273. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  274. /package/dist/{src/llm → llm}/embedders/local.js +0 -0
  275. /package/dist/{src/llm → llm}/embedders/remote.js +0 -0
  276. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  277. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  278. /package/dist/{src/llm → llm}/graph-extract.js +0 -0
  279. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  280. /package/dist/{src/llm → llm}/memory-infer.js +0 -0
  281. /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
  282. /package/dist/{src/output → output}/cli-hints.js +0 -0
  283. /package/dist/{src/output → output}/context.js +0 -0
  284. /package/dist/{src/output → output}/renderers.js +0 -0
  285. /package/dist/{src/output → output}/shapes.js +0 -0
  286. /package/dist/{src/output → output}/text.js +0 -0
  287. /package/dist/{src/registry → registry}/build-index.js +0 -0
  288. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  289. /package/dist/{src/registry → registry}/factory.js +0 -0
  290. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  291. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  292. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  293. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  294. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  295. /package/dist/{src/registry → registry}/resolve.js +0 -0
  296. /package/dist/{src/registry → registry}/types.js +0 -0
  297. /package/dist/{src/setup → setup}/detect.js +0 -0
  298. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  299. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  300. /package/dist/{src/setup → setup}/setup.js +0 -0
  301. /package/dist/{src/setup → setup}/steps.js +0 -0
  302. /package/dist/{src/sources → sources}/include.js +0 -0
  303. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  304. /package/dist/{src/sources → sources}/provider.js +0 -0
  305. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  306. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  307. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  308. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  309. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  310. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/website.js +0 -0
  314. /package/dist/{src/sources → sources}/resolve.js +0 -0
  315. /package/dist/{src/sources → sources}/types.js +0 -0
  316. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  317. /package/dist/{src/version.js → version.js} +0 -0
  318. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  319. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  320. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  321. /package/dist/{src/workflows → workflows}/db.js +0 -0
  322. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  323. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  324. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  325. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  326. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  327. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,331 +0,0 @@
1
- /**
2
- * Unit tests for the §6.7 search-pipeline bridge.
3
- *
4
- * Covers:
5
- * • `extractGoldRanks` — pure-function rank extraction from synthetic
6
- * verifier-stdout traces, including JSON tool-call form, plain-text
7
- * `ref:` lines, multiple searches per run, and gold-not-in-top-10
8
- * (the "missing" bucket).
9
- * • `computeSearchBridge` — histogram, p50/p90, gold_at_rank_1,
10
- * gold_missing, and the keystone `pass_rate_by_rank` slice.
11
- * • Empty-corpus path — no records → renderer emits the N/A sentence.
12
- *
13
- * No real opencode is invoked; every fixture is a hand-crafted `RunResult`.
14
- */
15
- import { describe, expect, test } from "bun:test";
16
- import { computeSearchBridge, extractGoldRanks } from "./metrics";
17
- import { renderSearchBridgeTable } from "./report";
18
- function fakeResult(stdout, overrides = {}) {
19
- return {
20
- schemaVersion: 1,
21
- taskId: "t",
22
- arm: "akm",
23
- seed: 0,
24
- model: "m",
25
- outcome: "pass",
26
- tokens: { input: 0, output: 0 },
27
- wallclockMs: 0,
28
- trajectory: { correctAssetLoaded: null, feedbackRecorded: null },
29
- events: [],
30
- verifierStdout: stdout,
31
- verifierExitCode: 0,
32
- assetsLoaded: [],
33
- ...overrides,
34
- };
35
- }
36
- describe("extractGoldRanks", () => {
37
- test("returns [] when goldRef is undefined", () => {
38
- const r = fakeResult('akm search "foo"\nref: skill:foo');
39
- expect(extractGoldRanks(r, undefined)).toEqual([]);
40
- });
41
- test("returns [] when verifierStdout is empty", () => {
42
- const r = fakeResult("");
43
- expect(extractGoldRanks(r, "skill:foo")).toEqual([]);
44
- });
45
- test("extracts a single search with text-mode ref output, gold at rank 1", () => {
46
- const stdout = [
47
- `> akm search "redis healthcheck"`,
48
- `skill: docker-homelab`,
49
- ` ref: skill:docker-homelab`,
50
- ` score: 0.92`,
51
- `skill: nginx-tls`,
52
- ` ref: skill:nginx-tls`,
53
- ` score: 0.81`,
54
- ].join("\n");
55
- const events = extractGoldRanks(fakeResult(stdout), "skill:docker-homelab");
56
- expect(events).toHaveLength(1);
57
- expect(events[0].query).toBe("redis healthcheck");
58
- expect(events[0].results).toEqual(["skill:docker-homelab", "skill:nginx-tls"]);
59
- expect(events[0].rankOfGold).toBe(1);
60
- });
61
- test("extracts JSON tool-call form, gold at rank 3", () => {
62
- const stdout = [
63
- 'tool: akm search "kubernetes pod restart" --output json',
64
- '{"hits":[{"ref":"skill:k8s-debug"},{"ref":"skill:k8s-monitoring"},{"ref":"skill:k8s-restart"},{"ref":"skill:k8s-deploy"}]}',
65
- ].join("\n");
66
- const events = extractGoldRanks(fakeResult(stdout), "skill:k8s-restart");
67
- expect(events).toHaveLength(1);
68
- expect(events[0].results.slice(0, 4)).toEqual([
69
- "skill:k8s-debug",
70
- "skill:k8s-monitoring",
71
- "skill:k8s-restart",
72
- "skill:k8s-deploy",
73
- ]);
74
- expect(events[0].rankOfGold).toBe(3);
75
- });
76
- test("returns null rank when gold is missing from top 10", () => {
77
- const refs = Array.from({ length: 12 }, (_, i) => ` ref: skill:other-${i}`).join("\n");
78
- const stdout = `akm search "missing-target"\n${refs}`;
79
- const events = extractGoldRanks(fakeResult(stdout), "skill:gold");
80
- expect(events).toHaveLength(1);
81
- expect(events[0].rankOfGold).toBeNull();
82
- // Top-10 cap: only 10 results retained.
83
- expect(events[0].results.length).toBeLessThanOrEqual(10);
84
- });
85
- test("multiple searches per run are each emitted in order", () => {
86
- const stdout = [
87
- 'akm search "first query"',
88
- " ref: skill:a",
89
- " ref: skill:b",
90
- 'akm search "second query"',
91
- " ref: skill:gold",
92
- " ref: skill:c",
93
- ].join("\n");
94
- const events = extractGoldRanks(fakeResult(stdout), "skill:gold");
95
- expect(events).toHaveLength(2);
96
- expect(events[0].query).toBe("first query");
97
- expect(events[0].rankOfGold).toBeNull();
98
- expect(events[1].query).toBe("second query");
99
- expect(events[1].rankOfGold).toBe(1);
100
- });
101
- test("non-search akm invocation closes the active search block", () => {
102
- const stdout = [
103
- 'akm search "q"',
104
- " ref: skill:a",
105
- " ref: skill:gold",
106
- "akm show skill:gold",
107
- " ref: skill:gold (this should NOT extend the previous search)",
108
- " ref: skill:other",
109
- ].join("\n");
110
- const events = extractGoldRanks(fakeResult(stdout), "skill:gold");
111
- // Only the search block contributes to results; the show block is closed.
112
- expect(events).toHaveLength(1);
113
- expect(events[0].results).toEqual(["skill:a", "skill:gold"]);
114
- expect(events[0].rankOfGold).toBe(2);
115
- });
116
- test("origin-prefixed ref counts as gold (team//skill:foo matches skill:foo)", () => {
117
- const stdout = ['akm search "q"', " ref: team//skill:foo", " ref: skill:bar"].join("\n");
118
- const events = extractGoldRanks(fakeResult(stdout), "skill:foo");
119
- expect(events[0].rankOfGold).toBe(1);
120
- });
121
- });
122
- describe("computeSearchBridge — histogram + percentiles", () => {
123
- function fakeRecord(seed, outcome, rankOrNullPerSearch) {
124
- return {
125
- taskId: `t${seed}`,
126
- arm: "akm",
127
- seed,
128
- outcome,
129
- goldRef: "skill:gold",
130
- searches: rankOrNullPerSearch.map((rank, i) => ({
131
- query: `q${i}`,
132
- // Reconstruct a plausible result list: gold at the requested rank,
133
- // others as fillers. The aggregator only looks at rankOfGold.
134
- results: rank === null ? Array.from({ length: 10 }, (_, j) => `skill:other-${j}`) : [],
135
- rankOfGold: rank,
136
- })),
137
- };
138
- }
139
- test("empty corpus produces zero envelope", () => {
140
- const m = computeSearchBridge({ goldRankRecords: [] });
141
- expect(m.runsObserved).toBe(0);
142
- expect(m.searchesObserved).toBe(0);
143
- expect(m.goldRankP50).toBeNull();
144
- expect(m.goldRankP90).toBeNull();
145
- expect(m.goldAtRank1).toBe(0);
146
- expect(m.goldMissing).toBe(0);
147
- expect(m.passRateByRank).toEqual([]);
148
- // Histogram is fully zeroed for every key.
149
- for (const k of ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "missing"]) {
150
- expect(m.goldRankDistribution[k]).toBe(0);
151
- }
152
- });
153
- test("histogram counts ranks across all searches", () => {
154
- const records = [
155
- fakeRecord(0, "pass", [1, 2]),
156
- fakeRecord(1, "fail", [1, null]),
157
- fakeRecord(2, "pass", [3]),
158
- ];
159
- const m = computeSearchBridge({ goldRankRecords: records });
160
- expect(m.searchesObserved).toBe(5);
161
- expect(m.runsObserved).toBe(3);
162
- expect(m.goldRankDistribution["1"]).toBe(2);
163
- expect(m.goldRankDistribution["2"]).toBe(1);
164
- expect(m.goldRankDistribution["3"]).toBe(1);
165
- expect(m.goldRankDistribution.missing).toBe(1);
166
- expect(m.goldAtRank1).toBeCloseTo(2 / 5);
167
- expect(m.goldMissing).toBeCloseTo(1 / 5);
168
- });
169
- test("p50/p90 use nearest-rank with missing treated as Infinity", () => {
170
- // Ranks: [1,1,2,3,5,5,7,9,null,null] across one record.
171
- const records = [fakeRecord(0, "pass", [1, 1, 2, 3, 5, 5, 7, 9, null, null])];
172
- const m = computeSearchBridge({ goldRankRecords: records });
173
- // Sorted: [1,1,2,3,5,5,7,9,Inf,Inf]
174
- // p50 = idx ceil(0.5*10)-1 = 4 → 5
175
- expect(m.goldRankP50).toBe(5);
176
- // p90 = idx ceil(0.9*10)-1 = 8 → Infinity
177
- expect(m.goldRankP90).toBe(Number.POSITIVE_INFINITY);
178
- });
179
- });
180
- describe("computeSearchBridge — pass_rate_by_rank uses the agent's chosen search", () => {
181
- test("attributes pass/fail to the rank in the LAST search, not the highest-ranked", () => {
182
- // Run A passed; first search had gold at rank 1 (great rank!), but the
183
- // *chosen* (last) search had gold at rank 5. The bridge must attribute
184
- // run A to rank 5, not rank 1, otherwise it overstates the value of
185
- // having gold at rank 1.
186
- const records = [
187
- {
188
- taskId: "ta",
189
- arm: "akm",
190
- seed: 0,
191
- outcome: "pass",
192
- goldRef: "skill:gold",
193
- searches: [
194
- { query: "first", results: [], rankOfGold: 1 },
195
- { query: "last", results: [], rankOfGold: 5 },
196
- ],
197
- },
198
- {
199
- taskId: "tb",
200
- arm: "akm",
201
- seed: 0,
202
- outcome: "fail",
203
- goldRef: "skill:gold",
204
- searches: [{ query: "only", results: [], rankOfGold: 5 }],
205
- },
206
- {
207
- taskId: "tc",
208
- arm: "akm",
209
- seed: 0,
210
- outcome: "pass",
211
- goldRef: "skill:gold",
212
- searches: [{ query: "only", results: [], rankOfGold: 1 }],
213
- },
214
- ];
215
- const m = computeSearchBridge({ goldRankRecords: records });
216
- // Buckets: rank 1 → {1 pass / 1 total}, rank 5 → {1 pass / 2 total}.
217
- const rank1 = m.passRateByRank.find((e) => e.rank === "1");
218
- const rank5 = m.passRateByRank.find((e) => e.rank === "5");
219
- expect(rank1).toBeDefined();
220
- expect(rank1?.passRate).toBe(1);
221
- expect(rank1?.runCount).toBe(1);
222
- expect(rank5).toBeDefined();
223
- expect(rank5?.passRate).toBe(0.5);
224
- expect(rank5?.runCount).toBe(2);
225
- });
226
- test("missing bucket gets its own pass-rate row instead of being dropped", () => {
227
- const records = [
228
- {
229
- taskId: "tm1",
230
- arm: "akm",
231
- seed: 0,
232
- outcome: "pass",
233
- goldRef: "skill:gold",
234
- searches: [{ query: "q", results: [], rankOfGold: null }],
235
- },
236
- {
237
- taskId: "tm2",
238
- arm: "akm",
239
- seed: 0,
240
- outcome: "fail",
241
- goldRef: "skill:gold",
242
- searches: [{ query: "q", results: [], rankOfGold: null }],
243
- },
244
- ];
245
- const m = computeSearchBridge({ goldRankRecords: records });
246
- const missing = m.passRateByRank.find((e) => e.rank === "missing");
247
- expect(missing).toBeDefined();
248
- expect(missing?.runCount).toBe(2);
249
- expect(missing?.passRate).toBe(0.5);
250
- });
251
- test("runs without any akm search invocation are excluded from pass_rate_by_rank", () => {
252
- const records = [
253
- {
254
- taskId: "no-search",
255
- arm: "akm",
256
- seed: 0,
257
- outcome: "fail",
258
- goldRef: "skill:gold",
259
- searches: [],
260
- },
261
- ];
262
- const m = computeSearchBridge({ goldRankRecords: records });
263
- expect(m.runsObserved).toBe(1);
264
- expect(m.searchesObserved).toBe(0);
265
- expect(m.passRateByRank).toEqual([]);
266
- });
267
- });
268
- describe("renderSearchBridgeTable", () => {
269
- test("empty corpus renders the N/A sentence", () => {
270
- const md = renderSearchBridgeTable({
271
- goldRankDistribution: {
272
- "1": 0,
273
- "2": 0,
274
- "3": 0,
275
- "4": 0,
276
- "5": 0,
277
- "6": 0,
278
- "7": 0,
279
- "8": 0,
280
- "9": 0,
281
- "10": 0,
282
- missing: 0,
283
- },
284
- goldRankP50: null,
285
- goldRankP90: null,
286
- goldAtRank1: 0,
287
- goldMissing: 0,
288
- passRateByRank: [],
289
- runsObserved: 0,
290
- searchesObserved: 0,
291
- });
292
- expect(md).toContain("Search → outcome bridge");
293
- expect(md).toContain("(no gold-ref tasks in corpus; bridge metrics N/A)");
294
- });
295
- test("populated corpus surfaces histogram, p50/p90, and pass-rate-by-rank table", () => {
296
- const md = renderSearchBridgeTable({
297
- goldRankDistribution: {
298
- "1": 3,
299
- "2": 1,
300
- "3": 0,
301
- "4": 0,
302
- "5": 1,
303
- "6": 0,
304
- "7": 0,
305
- "8": 0,
306
- "9": 0,
307
- "10": 0,
308
- missing: 1,
309
- },
310
- goldRankP50: 1,
311
- goldRankP90: 5,
312
- goldAtRank1: 0.5,
313
- goldMissing: 1 / 6,
314
- passRateByRank: [
315
- { rank: "1", passRate: 0.67, runCount: 3 },
316
- { rank: "5", passRate: 0, runCount: 1 },
317
- { rank: "missing", passRate: 0, runCount: 1 },
318
- ],
319
- runsObserved: 5,
320
- searchesObserved: 6,
321
- });
322
- expect(md).toContain("| 1 | 3 |");
323
- expect(md).toContain("| missing | 1 |");
324
- expect(md).toContain("p50=1.0");
325
- expect(md).toContain("p90=5.0");
326
- expect(md).toContain("gold_at_rank_1=50.0%");
327
- expect(md).toContain("| rank | pass_rate | run_count |");
328
- expect(md).toContain("| 1 | 0.67 | 3 |");
329
- expect(md).toContain("| missing | 0.00 | 1 |");
330
- });
331
- });
@@ -1,131 +0,0 @@
1
- /**
2
- * Bench tmp-root redirection (#276).
3
- *
4
- * Every bench tmp directory — per-(task, arm, seed) workspace, per-task
5
- * fixture stash, per-fixture evolveStash + preStash, plus the scratch dirs
6
- * spun up inside unit tests — lives under `${AKM_CACHE_DIR}/bench/`, NOT
7
- * `os.tmpdir()`.
8
- *
9
- * Why: during long bench/workflow runs, orphan tmp dirs from crashed agents
10
- * accumulate. When they pile up under `/tmp` the OS-level partition fills,
11
- * which breaks shells, browsers, npm caches, and the rest of the system.
12
- * Pinning bench tmp to the akm cache dir means a single
13
- * `rm -rf "$(akm config get cache.dir)/bench"` purges all bench scratch
14
- * without disturbing anything else.
15
- *
16
- * The bench cleanup machinery (`tests/bench/cleanup.ts`) also reaps
17
- * `${AKM_CACHE_DIR}/bench/*` entries older than 6 hours on the first
18
- * `registerCleanup` call to catch orphans from prior crashed runs.
19
- *
20
- * NOTE: this helper deliberately does NOT import `os.tmpdir()`. The
21
- * invariant test (`tests/bench/no-os-tmpdir-invariant.test.ts`) asserts
22
- * zero `os.tmpdir` references across `tests/bench/*.ts`.
23
- */
24
- import * as fs from "node:fs";
25
- import * as path from "node:path";
26
- import { getCacheDir } from "../../src/core/paths";
27
/**
 * Resolve the bench scratch root, i.e. the `bench` directory inside the akm
 * cache dir returned by `getCacheDir()`.
 *
 * The directory is created (recursively) on every call before the path is
 * returned, so callers can use the result immediately.
 *
 * @returns {string} Path of the bench scratch root.
 */
export function benchTmpRoot() {
    const benchDir = path.join(getCacheDir(), "bench");
    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(benchDir, { recursive: true });
    return benchDir;
}
33
/**
 * Make a new, uniquely named scratch directory inside `benchTmpRoot()`.
 *
 * Intended as a drop-in for `fs.mkdtempSync(path.join(os.tmpdir(), prefix))`,
 * with the bench root standing in for the OS temp dir.
 *
 * @param {string} prefix Leading part of the directory name; `fs.mkdtempSync`
 *   appends random characters to it.
 * @returns {string} Path of the directory that was created.
 */
export function benchMkdtemp(prefix) {
    const template = path.join(benchTmpRoot(), prefix);
    return fs.mkdtempSync(template);
}
42
- // ── PID file ────────────────────────────────────────────────────────────────
43
/**
 * Location of the bench PID file: `bench.pid` directly under `benchTmpRoot()`.
 *
 * Because it goes through `benchTmpRoot()`, calling this also ensures the
 * bench root directory exists.
 *
 * @returns {string} Path of the PID file (the file itself may not exist).
 */
export function benchPidPath() {
    const root = benchTmpRoot();
    return path.join(root, "bench.pid");
}
47
/**
 * Record `process.pid` in the bench PID file and return a function that
 * removes the file again.
 *
 * Before writing, an existing PID file is inspected: when it parses to a
 * finite number and `isPidRunning` reports that process as not running, a
 * warning is written to stderr and the stale file is deleted. Whatever the
 * outcome of that inspection, the file is then (over)written with the current
 * PID. Every filesystem step is best-effort — failures are swallowed because
 * the PID file is diagnostic only.
 *
 * @returns {() => void} Cleanup callback. It deletes the PID file only when
 *   the file still holds this process's PID; call it from a `finally` block.
 */
export function writeBenchPid() {
    const pidFile = benchPidPath();
    // Inspect whatever an earlier run left behind before overwriting it.
    if (fs.existsSync(pidFile)) {
        let previousPid;
        try {
            previousPid = Number.parseInt(fs.readFileSync(pidFile, "utf8").trim(), 10);
        }
        catch {
            // Unreadable file: `previousPid` stays undefined, so no stale
            // warning is issued; the write below still replaces the file.
        }
        const isStale = previousPid !== undefined &&
            Number.isFinite(previousPid) &&
            !isPidRunning(previousPid);
        if (isStale) {
            process.stderr.write(`bench: removing stale PID file for PID ${previousPid} (process not running)\n`);
            try {
                fs.rmSync(pidFile, { force: true });
            }
            catch {
                /* best-effort */
            }
        }
    }
    try {
        fs.writeFileSync(pidFile, String(process.pid), "utf8");
    }
    catch {
        /* best-effort — PID file is diagnostic, not critical */
    }
    return () => {
        let recorded;
        try {
            recorded = fs.readFileSync(pidFile, "utf8").trim();
        }
        catch {
            // Nothing readable to compare against — leave the file alone.
            return;
        }
        // Another process may have replaced the file since we wrote it; only
        // delete it while it still names this process.
        if (recorded !== String(process.pid)) {
            return;
        }
        try {
            fs.rmSync(pidFile, { force: true });
        }
        catch {
            /* best-effort */
        }
    };
}
98
/**
 * Load the PID stored in the bench PID file.
 *
 * @returns {number | undefined} The stored PID when the file exists, can be
 *   read, and parses (base 10) to a finite number greater than zero;
 *   otherwise `undefined`.
 */
export function readBenchPid() {
    const pidFile = benchPidPath();
    if (!fs.existsSync(pidFile)) {
        return undefined;
    }
    let contents;
    try {
        contents = fs.readFileSync(pidFile, "utf8");
    }
    catch {
        // The file vanished or is unreadable — same answer as "no PID file".
        return undefined;
    }
    const pid = Number.parseInt(contents.trim(), 10);
    // Rejects NaN (unparsable text) as well as zero and negative values.
    if (!Number.isFinite(pid) || pid <= 0) {
        return undefined;
    }
    return pid;
}
115
/**
 * Report whether a process with the given PID is running on this host.
 *
 * Probes with `process.kill(pid, 0)`: signal 0 delivers nothing and only
 * performs the existence/permission check. A throw with code `EPERM` means the
 * process exists but may not be signalled by us, so it counts as running; any
 * other failure (e.g. `ESRCH`) counts as not running.
 *
 * Values that cannot identify a single process — zero, negatives, fractions,
 * NaN, non-numeric input — return `false` without probing. At the OS level
 * pid 0 and negative pids address process groups, so probing them would
 * answer a different question and could wrongly report "running".
 *
 * @param {number} pid Process id to probe.
 * @returns {boolean} `true` when the process exists, `false` otherwise.
 */
export function isPidRunning(pid) {
    const numericPid = Number(pid);
    if (!Number.isInteger(numericPid) || numericPid <= 0) {
        return false;
    }
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (err) {
        // EPERM means the process exists but we don't have permission to signal it.
        return err?.code === "EPERM";
    }
}
@@ -1,116 +0,0 @@
1
- /**
2
- * akm-bench trajectory parser (spec §6.2).
3
- *
4
- * Trajectory metrics describe the *path* the agent took through the run, not
5
- * just the terminal outcome. For #238 we score two booleans per run:
6
- *
7
- * • `correctAssetLoaded` — did the agent invoke `akm show <goldRef>` (or
8
- * a sufficient prefix thereof) at any point during the run? `null` when
9
- * the task carries no `goldRef` (and so the metric is undefined).
10
- * • `feedbackRecorded` — did the agent emit any `feedback` event into
11
- * `events.jsonl` during the run? Always `false` for the `noakm` arm
12
- * because that arm runs without a stash.
13
- *
14
- * The driver hands us a `RunResult` after the run has finished. We never
15
- * mutate it; we return a fresh `TrajectoryRecord` and let the runner splice
16
- * it back in. This keeps `runOne`'s signature stable and lets `#239`/`#240`
17
- * extend the trajectory shape without touching the driver.
18
- */
19
/**
 * Upper bound on how much of `verifierStdout` is scanned for the
 * `akm show <ref>` heuristic.
 *
 * The value (16 * 2^20 = 16,777,216) is compared against the captured
 * output's `.length` and used as the `slice` end index, so the unit is string
 * length rather than bytes. Output beyond the cap is ignored by the scan; the
 * bound exists so that a runaway agent's output cannot make the scan
 * arbitrarily expensive.
 */
export const VERIFIER_STDOUT_SCAN_CAP = 16 * 2 ** 20;
26
/**
 * Build the trajectory record for one finished run.
 *
 * Neither `task` nor `runResult` is modified; the result is a new object with
 * two fields:
 *   - `correctAssetLoaded` — from `computeCorrectAssetLoaded`: `null` when
 *     the task has no `goldRef`, otherwise whether the run's events or its
 *     captured stdout show the gold ref being loaded.
 *   - `feedbackRecorded` — from `computeFeedbackRecorded`: whether any event
 *     in the run has `eventType === "feedback"`.
 *
 * @param task Task descriptor; only `goldRef` is consulted.
 * @param runResult Finished run; supplies `events` and `verifierStdout`.
 * @param opts Optional. When `opts.warnings` is provided, a message is pushed
 *   onto it if the stdout scan had to be truncated.
 * @returns {{ correctAssetLoaded: boolean | null, feedbackRecorded: boolean }}
 */
export function computeTrajectory(task, runResult, opts) {
    return {
        correctAssetLoaded: computeCorrectAssetLoaded(task, runResult, opts),
        feedbackRecorded: computeFeedbackRecorded(runResult),
    };
}
46
/**
 * Decide whether the run loaded the task's gold asset.
 *
 * Detection runs in two stages and stops at the first hit:
 *   1. Structured events — any event whose `ref`, or whose `metadata.ref`,
 *      is a string accepted by `matchesRef` against the gold ref.
 *   2. Captured stdout — `containsAkmShow` over `runResult.verifierStdout`,
 *      limited to the first `VERIFIER_STDOUT_SCAN_CAP` characters.
 *
 * @param task Task descriptor; `goldRef` is the ref to look for.
 * @param runResult Finished run; supplies `events` and `verifierStdout`.
 * @param opts Optional; `opts.warnings` receives a message when the stdout
 *   had to be truncated before scanning.
 * @returns {boolean | null} `null` when the task has no `goldRef`, otherwise
 *   whether either stage found the gold ref.
 */
function computeCorrectAssetLoaded(task, runResult, opts) {
    const goldRef = task.goldRef;
    if (!goldRef) {
        return null;
    }
    // True when the event names the gold ref, either directly or via metadata.
    const eventNamesGoldRef = (event) => {
        if (typeof event.ref === "string" && matchesRef(event.ref, goldRef)) {
            return true;
        }
        const metadata = event.metadata;
        if (!metadata || typeof metadata !== "object") {
            return false;
        }
        return typeof metadata.ref === "string" && matchesRef(metadata.ref, goldRef);
    };
    // Stage 1: structured event stream.
    for (const event of runResult.events) {
        if (eventNamesGoldRef(event)) {
            return true;
        }
    }
    // Stage 2: substring heuristics over the captured stdout.
    const capturedStdout = runResult.verifierStdout;
    if (!capturedStdout) {
        return false;
    }
    let scanWindow = capturedStdout;
    if (capturedStdout.length > VERIFIER_STDOUT_SCAN_CAP) {
        // Only the prefix is scanned; surface that in the report's warnings
        // when the caller supplied a sink for them.
        scanWindow = capturedStdout.slice(0, VERIFIER_STDOUT_SCAN_CAP);
        if (opts?.warnings) {
            opts.warnings.push(`verifierStdout truncated for trajectory scan: ${capturedStdout.length} chars exceeds ${VERIFIER_STDOUT_SCAN_CAP}-char cap; correct_asset_loaded computed from the prefix.`);
        }
    }
    return Boolean(scanWindow && containsAkmShow(scanWindow, goldRef));
}
84
/**
 * Decide whether `candidate` refers to the gold asset `gold`, or to something
 * nested under it.
 *
 * Accepted forms (all anchored to whole ref segments, so `skill:foobar` never
 * matches `skill:foo`):
 *   - exact:                      `skill:foo`
 *   - origin-qualified:           `team//skill:foo`
 *   - sub-path:                   `skill:foo/bar`
 *   - origin-qualified sub-path:  `team//skill:foo/bar`
 *
 * The last form was previously rejected even though each half of it was
 * accepted on its own.
 *
 * @param {string} candidate Ref observed in the run.
 * @param {string} gold Ref the task expects.
 * @returns {boolean} `true` when `candidate` matches one of the forms above.
 */
function matchesRef(candidate, gold) {
    if (candidate === gold) {
        return true;
    }
    // Gold ref used as a prefix of a more specific ref.
    if (candidate.startsWith(`${gold}/`)) {
        return true;
    }
    // Origin-qualified: the gold ref follows an `origin//` prefix, either as
    // the whole remainder or followed by a sub-path.
    return candidate.endsWith(`//${gold}`) || candidate.includes(`//${gold}/`);
}
95
/**
 * Heuristically detect an `akm show <ref>` invocation inside captured output.
 *
 * Two shapes are recognised:
 *   1. CLI form — `akm show <ref>`, with flexible whitespace and an optional
 *      opening quote before the ref.
 *   2. Tool-call JSON form — the text contains the token `"show"` and the ref
 *      appears somewhere as a delimited token (e.g. `"args":["show","<ref>"]`).
 *
 * In both shapes the ref must not be continued by a word character or a
 * hyphen, so gold ref `skill:foo` is not satisfied by `skill:foo-bar` or
 * `skill:foobar`. A following `/`, quote, whitespace, punctuation, or end of
 * text is accepted, which keeps `skill:foo/sub` matching. The match is
 * case-sensitive.
 *
 * @param {string} text Output to scan.
 * @param {string} ref Gold asset ref; regex metacharacters are escaped.
 * @returns {boolean} `true` when either shape is found.
 */
function containsAkmShow(text, ref) {
    // Refs contain `:` and may contain `/` or `.`, so escape regex metas.
    const escaped = ref.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The ref ends here: the next character may not extend the asset name.
    const refEnd = "(?![\\w-])";
    const cliPattern = new RegExp(`akm\\s+show\\s+["']?${escaped}${refEnd}`);
    if (cliPattern.test(text)) {
        return true;
    }
    // Tool-call JSON form. Cheap heuristic: a `"show"` token anywhere plus the
    // ref as a stand-alone token (not embedded in a longer name).
    if (!text.includes(`"show"`)) {
        return false;
    }
    const delimitedRef = new RegExp(`(?<![\\w-])${escaped}${refEnd}`);
    return delimitedRef.test(text);
}
107
/**
 * Report whether the run produced at least one feedback event.
 *
 * The check looks only at `runResult.events` and applies to every arm alike;
 * a run with no events simply yields `false`.
 *
 * @param runResult Finished run; `events` must be iterable.
 * @returns {boolean} `true` when some event has `eventType === "feedback"`.
 */
function computeFeedbackRecorded(runResult) {
    let sawFeedback = false;
    for (const { eventType } of runResult.events) {
        if (eventType === "feedback") {
            sawFeedback = true;
            break;
        }
    }
    return sawFeedback;
}