akm-cli 0.7.0 → 0.7.2

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (332)
  1. package/CHANGELOG.md +8 -0
  2. package/dist/{src/cli.js → cli.js} +22 -8
  3. package/dist/{src/commands → commands}/installed-stashes.js +1 -1
  4. package/dist/{src/commands → commands}/source-add.js +1 -1
  5. package/dist/{src/core → core}/common.js +16 -1
  6. package/dist/{src/core → core}/config.js +5 -2
  7. package/dist/{src/indexer → indexer}/db-search.js +16 -1
  8. package/dist/{src/indexer → indexer}/graph-extraction.js +5 -3
  9. package/dist/{src/indexer → indexer}/indexer.js +27 -11
  10. package/dist/{src/indexer → indexer}/memory-inference.js +47 -58
  11. package/dist/{src/indexer → indexer}/search-source.js +1 -1
  12. package/dist/{src/llm → llm}/client.js +61 -1
  13. package/dist/{src/llm → llm}/embedder.js +8 -5
  14. package/dist/{src/llm → llm}/embedders/local.js +8 -2
  15. package/dist/{src/llm → llm}/embedders/remote.js +4 -2
  16. package/dist/{src/llm → llm}/graph-extract.js +4 -4
  17. package/dist/llm/memory-infer.js +114 -0
  18. package/dist/{src/llm → llm}/metadata-enhance.js +2 -2
  19. package/dist/{src/output → output}/cli-hints.js +2 -0
  20. package/dist/{src/setup → setup}/setup.js +30 -20
  21. package/dist/sources/providers/website.js +27 -0
  22. package/dist/{src/sources/providers/website.js → sources/website-ingest.js} +38 -51
  23. package/docs/README.md +7 -0
  24. package/docs/migration/release-notes/0.7.0.md +14 -0
  25. package/package.json +11 -8
  26. package/dist/src/llm/memory-infer.js +0 -86
  27. package/dist/tests/add-website-source.test.js +0 -119
  28. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  29. package/dist/tests/agent/agent-config.test.js +0 -221
  30. package/dist/tests/agent/agent-detect.test.js +0 -100
  31. package/dist/tests/agent/agent-spawn.test.js +0 -234
  32. package/dist/tests/agent-output.test.js +0 -186
  33. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  34. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  35. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  36. package/dist/tests/asset-ref.test.js +0 -192
  37. package/dist/tests/asset-registry.test.js +0 -103
  38. package/dist/tests/asset-spec.test.js +0 -241
  39. package/dist/tests/bench/attribution.test.js +0 -996
  40. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  41. package/dist/tests/bench/cleanup.js +0 -234
  42. package/dist/tests/bench/cleanup.test.js +0 -166
  43. package/dist/tests/bench/cli.js +0 -1018
  44. package/dist/tests/bench/cli.test.js +0 -445
  45. package/dist/tests/bench/compare.test.js +0 -556
  46. package/dist/tests/bench/corpus.js +0 -317
  47. package/dist/tests/bench/corpus.test.js +0 -258
  48. package/dist/tests/bench/doctor.js +0 -525
  49. package/dist/tests/bench/driver.js +0 -401
  50. package/dist/tests/bench/driver.test.js +0 -584
  51. package/dist/tests/bench/environment.js +0 -233
  52. package/dist/tests/bench/environment.test.js +0 -199
  53. package/dist/tests/bench/evolve-metrics.js +0 -179
  54. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  55. package/dist/tests/bench/evolve.js +0 -647
  56. package/dist/tests/bench/evolve.test.js +0 -624
  57. package/dist/tests/bench/failure-modes.test.js +0 -349
  58. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  59. package/dist/tests/bench/leakage.test.js +0 -228
  60. package/dist/tests/bench/learning-curve.test.js +0 -134
  61. package/dist/tests/bench/metrics.js +0 -2395
  62. package/dist/tests/bench/metrics.test.js +0 -1150
  63. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  64. package/dist/tests/bench/opencode-config.js +0 -194
  65. package/dist/tests/bench/opencode-config.test.js +0 -370
  66. package/dist/tests/bench/report.js +0 -1885
  67. package/dist/tests/bench/report.test.js +0 -1038
  68. package/dist/tests/bench/run-config.js +0 -355
  69. package/dist/tests/bench/run-config.test.js +0 -298
  70. package/dist/tests/bench/run-curate-test.js +0 -32
  71. package/dist/tests/bench/run-failing-tasks.js +0 -56
  72. package/dist/tests/bench/run-full-bench.js +0 -51
  73. package/dist/tests/bench/run-items36-targeted.js +0 -69
  74. package/dist/tests/bench/run-nano-quick.js +0 -42
  75. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  76. package/dist/tests/bench/runner.js +0 -699
  77. package/dist/tests/bench/runner.test.js +0 -958
  78. package/dist/tests/bench/search-bridge.test.js +0 -331
  79. package/dist/tests/bench/tmp.js +0 -131
  80. package/dist/tests/bench/trajectory.js +0 -116
  81. package/dist/tests/bench/trajectory.test.js +0 -127
  82. package/dist/tests/bench/verifier.js +0 -114
  83. package/dist/tests/bench/verifier.test.js +0 -118
  84. package/dist/tests/bench/workflow-evaluator.js +0 -557
  85. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  86. package/dist/tests/bench/workflow-spec.js +0 -345
  87. package/dist/tests/bench/workflow-spec.test.js +0 -363
  88. package/dist/tests/bench/workflow-trace.js +0 -472
  89. package/dist/tests/bench/workflow-trace.test.js +0 -254
  90. package/dist/tests/benchmark-search-quality.js +0 -536
  91. package/dist/tests/benchmark-suite.js +0 -1441
  92. package/dist/tests/capture-cli.test.js +0 -112
  93. package/dist/tests/cli-errors.test.js +0 -204
  94. package/dist/tests/commands/events.test.js +0 -370
  95. package/dist/tests/commands/history.test.js +0 -418
  96. package/dist/tests/commands/import.test.js +0 -103
  97. package/dist/tests/commands/proposal-cli.test.js +0 -209
  98. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  99. package/dist/tests/commands/remember.test.js +0 -97
  100. package/dist/tests/commands/scope-flags.test.js +0 -300
  101. package/dist/tests/commands/search.test.js +0 -537
  102. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  103. package/dist/tests/commands/show.test.js +0 -294
  104. package/dist/tests/common.test.js +0 -266
  105. package/dist/tests/completions.test.js +0 -142
  106. package/dist/tests/config-cli.test.js +0 -193
  107. package/dist/tests/config-llm-features.test.js +0 -139
  108. package/dist/tests/config.test.js +0 -569
  109. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  110. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  111. package/dist/tests/contracts/spec-helpers.js +0 -46
  112. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  113. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  114. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  115. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  116. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  117. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  118. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  119. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  120. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  121. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  122. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  123. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  124. package/dist/tests/core/write-source.test.js +0 -366
  125. package/dist/tests/curate-command.test.js +0 -87
  126. package/dist/tests/db-scoring.test.js +0 -201
  127. package/dist/tests/db.test.js +0 -654
  128. package/dist/tests/distill-cli-flag.test.js +0 -208
  129. package/dist/tests/distill.test.js +0 -515
  130. package/dist/tests/docker-install.test.js +0 -120
  131. package/dist/tests/e2e.test.js +0 -1419
  132. package/dist/tests/embedder.test.js +0 -340
  133. package/dist/tests/embedding-model-config.test.js +0 -379
  134. package/dist/tests/feedback-command.test.js +0 -172
  135. package/dist/tests/file-context.test.js +0 -552
  136. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  137. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  138. package/dist/tests/fixtures/stashes/load.js +0 -166
  139. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  140. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  141. package/dist/tests/frontmatter.test.js +0 -190
  142. package/dist/tests/fts-field-weighting.test.js +0 -254
  143. package/dist/tests/fuzzy-search.test.js +0 -230
  144. package/dist/tests/git-provider-clone.test.js +0 -45
  145. package/dist/tests/github.test.js +0 -161
  146. package/dist/tests/graph-boost-ranking.test.js +0 -305
  147. package/dist/tests/graph-extraction.test.js +0 -282
  148. package/dist/tests/helpers/usage-events.js +0 -8
  149. package/dist/tests/index-pass-llm.test.js +0 -161
  150. package/dist/tests/indexer.test.js +0 -570
  151. package/dist/tests/info-command.test.js +0 -166
  152. package/dist/tests/init.test.js +0 -69
  153. package/dist/tests/install-script.test.js +0 -246
  154. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  155. package/dist/tests/issue-36-repro.test.js +0 -304
  156. package/dist/tests/issues-191-194.test.js +0 -160
  157. package/dist/tests/lesson-lint.test.js +0 -111
  158. package/dist/tests/llm-client.test.js +0 -115
  159. package/dist/tests/llm-feature-gate.test.js +0 -151
  160. package/dist/tests/llm.test.js +0 -139
  161. package/dist/tests/lockfile.test.js +0 -216
  162. package/dist/tests/manifest.test.js +0 -205
  163. package/dist/tests/markdown.test.js +0 -126
  164. package/dist/tests/matchers-unit.test.js +0 -189
  165. package/dist/tests/memory-inference.test.js +0 -299
  166. package/dist/tests/merge-scoring.test.js +0 -136
  167. package/dist/tests/metadata.test.js +0 -313
  168. package/dist/tests/migration-help.test.js +0 -89
  169. package/dist/tests/origin-resolve.test.js +0 -124
  170. package/dist/tests/output-baseline.test.js +0 -218
  171. package/dist/tests/output-shapes-unit.test.js +0 -478
  172. package/dist/tests/parallel-search.test.js +0 -272
  173. package/dist/tests/parameter-metadata.test.js +0 -365
  174. package/dist/tests/paths.test.js +0 -177
  175. package/dist/tests/progressive-disclosure.test.js +0 -280
  176. package/dist/tests/proposals.test.js +0 -279
  177. package/dist/tests/proposed-quality.test.js +0 -271
  178. package/dist/tests/provider-registry.test.js +0 -32
  179. package/dist/tests/ranking-regression.test.js +0 -548
  180. package/dist/tests/reflect-propose.test.js +0 -455
  181. package/dist/tests/registry-build-index.test.js +0 -394
  182. package/dist/tests/registry-cli.test.js +0 -290
  183. package/dist/tests/registry-index-v2.test.js +0 -430
  184. package/dist/tests/registry-install.test.js +0 -728
  185. package/dist/tests/registry-providers/parity.test.js +0 -189
  186. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  187. package/dist/tests/registry-providers/static-index.test.js +0 -238
  188. package/dist/tests/registry-resolve.test.js +0 -126
  189. package/dist/tests/registry-search.test.js +0 -923
  190. package/dist/tests/remember-frontmatter.test.js +0 -378
  191. package/dist/tests/remember-unit.test.js +0 -123
  192. package/dist/tests/ripgrep-install.test.js +0 -251
  193. package/dist/tests/ripgrep-resolve.test.js +0 -108
  194. package/dist/tests/ripgrep.test.js +0 -163
  195. package/dist/tests/save-command.test.js +0 -94
  196. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  197. package/dist/tests/scoring-pipeline.test.js +0 -648
  198. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  199. package/dist/tests/self-update.test.js +0 -442
  200. package/dist/tests/semantic-search-e2e.test.js +0 -512
  201. package/dist/tests/semantic-status.test.js +0 -471
  202. package/dist/tests/setup-run.integration.js +0 -877
  203. package/dist/tests/setup-wizard.test.js +0 -198
  204. package/dist/tests/setup.test.js +0 -131
  205. package/dist/tests/source-add.test.js +0 -11
  206. package/dist/tests/source-clone.test.js +0 -254
  207. package/dist/tests/source-manage.test.js +0 -366
  208. package/dist/tests/source-providers/filesystem.test.js +0 -82
  209. package/dist/tests/source-providers/git.test.js +0 -252
  210. package/dist/tests/source-providers/website.test.js +0 -128
  211. package/dist/tests/source-qa-fixes.test.js +0 -286
  212. package/dist/tests/source-registry.test.js +0 -350
  213. package/dist/tests/source-resolve.test.js +0 -100
  214. package/dist/tests/source-source.test.js +0 -281
  215. package/dist/tests/source.test.js +0 -533
  216. package/dist/tests/tar-utils-scan.test.js +0 -73
  217. package/dist/tests/toggle-components.test.js +0 -73
  218. package/dist/tests/usage-telemetry.test.js +0 -265
  219. package/dist/tests/utility-scoring.test.js +0 -558
  220. package/dist/tests/vault-load-error.test.js +0 -78
  221. package/dist/tests/vault-qa-fixes.test.js +0 -194
  222. package/dist/tests/vault.test.js +0 -429
  223. package/dist/tests/vector-search.test.js +0 -608
  224. package/dist/tests/walker.test.js +0 -252
  225. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  226. package/dist/tests/wave2-cluster-d.test.js +0 -180
  227. package/dist/tests/wave2-cluster-e.test.js +0 -179
  228. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  229. package/dist/tests/wiki.test.js +0 -529
  230. package/dist/tests/workflow-cli.test.js +0 -271
  231. package/dist/tests/workflow-markdown.test.js +0 -171
  232. package/dist/tests/workflow-path-escape.test.js +0 -132
  233. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  234. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  235. /package/dist/{src/commands → commands}/completions.js +0 -0
  236. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  237. /package/dist/{src/commands → commands}/curate.js +0 -0
  238. /package/dist/{src/commands → commands}/distill.js +0 -0
  239. /package/dist/{src/commands → commands}/events.js +0 -0
  240. /package/dist/{src/commands → commands}/history.js +0 -0
  241. /package/dist/{src/commands → commands}/info.js +0 -0
  242. /package/dist/{src/commands → commands}/init.js +0 -0
  243. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  244. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  245. /package/dist/{src/commands → commands}/proposal.js +0 -0
  246. /package/dist/{src/commands → commands}/propose.js +0 -0
  247. /package/dist/{src/commands → commands}/reflect.js +0 -0
  248. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  249. /package/dist/{src/commands → commands}/remember.js +0 -0
  250. /package/dist/{src/commands → commands}/search.js +0 -0
  251. /package/dist/{src/commands → commands}/self-update.js +0 -0
  252. /package/dist/{src/commands → commands}/show.js +0 -0
  253. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  254. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  255. /package/dist/{src/commands → commands}/vault.js +0 -0
  256. /package/dist/{src/core → core}/asset-ref.js +0 -0
  257. /package/dist/{src/core → core}/asset-registry.js +0 -0
  258. /package/dist/{src/core → core}/asset-spec.js +0 -0
  259. /package/dist/{src/core → core}/errors.js +0 -0
  260. /package/dist/{src/core → core}/events.js +0 -0
  261. /package/dist/{src/core → core}/frontmatter.js +0 -0
  262. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  263. /package/dist/{src/core → core}/markdown.js +0 -0
  264. /package/dist/{src/core → core}/paths.js +0 -0
  265. /package/dist/{src/core → core}/proposals.js +0 -0
  266. /package/dist/{src/core → core}/warn.js +0 -0
  267. /package/dist/{src/core → core}/write-source.js +0 -0
  268. /package/dist/{src/indexer → indexer}/db.js +0 -0
  269. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  270. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  271. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  272. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  273. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  274. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  275. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  276. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  277. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  278. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  279. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  280. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  281. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  282. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  283. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  284. /package/dist/{src/integrations → integrations}/github.js +0 -0
  285. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  286. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  287. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  288. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  289. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  290. /package/dist/{src/output → output}/context.js +0 -0
  291. /package/dist/{src/output → output}/renderers.js +0 -0
  292. /package/dist/{src/output → output}/shapes.js +0 -0
  293. /package/dist/{src/output → output}/text.js +0 -0
  294. /package/dist/{src/registry → registry}/build-index.js +0 -0
  295. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  296. /package/dist/{src/registry → registry}/factory.js +0 -0
  297. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  298. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  299. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  300. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  301. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  302. /package/dist/{src/registry → registry}/resolve.js +0 -0
  303. /package/dist/{src/registry → registry}/types.js +0 -0
  304. /package/dist/{src/setup → setup}/detect.js +0 -0
  305. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  306. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  307. /package/dist/{src/setup → setup}/steps.js +0 -0
  308. /package/dist/{src/sources → sources}/include.js +0 -0
  309. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  310. /package/dist/{src/sources → sources}/provider.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  314. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  315. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  316. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  317. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  318. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  319. /package/dist/{src/sources → sources}/resolve.js +0 -0
  320. /package/dist/{src/sources → sources}/types.js +0 -0
  321. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  322. /package/dist/{src/version.js → version.js} +0 -0
  323. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  324. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  325. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  326. /package/dist/{src/workflows → workflows}/db.js +0 -0
  327. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  328. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  329. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  330. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  331. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  332. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,1441 +0,0 @@
1
- #!/usr/bin/env bun
2
- /**
3
- * Comprehensive benchmark suite for akm search system.
4
- *
5
- * Standalone script (NOT a bun:test suite) that covers:
6
- * 1. Search Quality (MRR, Recall@5, Recall@10)
7
- * 2. Search Performance (latency in ms)
8
- * 3. Indexing Performance (time in ms)
9
- * 4. Token Efficiency (byte savings %)
10
- * 5. Utility Scoring (M-2)
11
- * 6. Feature Correctness
12
- *
13
- * Usage:
14
- * bun run tests/benchmark-suite.ts
15
- * bun run tests/benchmark-suite.ts --json # machine-readable output only
16
- */
17
- import fs from "node:fs";
18
- import os from "node:os";
19
- import path from "node:path";
20
- import { assembleInfo } from "../src/commands/info";
21
- import { akmSearch } from "../src/commands/search";
22
- import { saveConfig } from "../src/core/config";
23
- import { getDbPath } from "../src/core/paths";
24
- import { closeDatabase, openDatabase, rebuildFts, upsertUtilityScore } from "../src/indexer/db";
25
- import { recomputeUtilityScores } from "../src/indexer/indexer";
26
- import { buildSearchFields } from "../src/indexer/search-fields";
27
- import { insertUsageEvent } from "../src/indexer/usage-events";
28
- import { recordUsageEvent } from "./helpers/usage-events";
29
- // ── CLI flags ────────────────────────────────────────────────────────────────
30
- const jsonOnly = process.argv.includes("--json");
31
- function log(msg) {
32
- if (!jsonOnly)
33
- process.stderr.write(msg);
34
- }
35
- // ── Environment isolation ────────────────────────────────────────────────────
36
- const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), "akm-benchsuite-"));
37
- const testCacheDir = path.join(tmpRoot, "cache");
38
- const testConfigDir = path.join(tmpRoot, "config");
39
- fs.mkdirSync(testCacheDir, { recursive: true });
40
- fs.mkdirSync(testConfigDir, { recursive: true });
41
- const origXdgCache = process.env.XDG_CACHE_HOME;
42
- const origXdgConfig = process.env.XDG_CONFIG_HOME;
43
- const origStashDir = process.env.AKM_STASH_DIR;
44
- process.env.XDG_CACHE_HOME = testCacheDir;
45
- process.env.XDG_CONFIG_HOME = testConfigDir;
46
- function cleanup() {
47
- if (origXdgCache === undefined)
48
- delete process.env.XDG_CACHE_HOME;
49
- else
50
- process.env.XDG_CACHE_HOME = origXdgCache;
51
- if (origXdgConfig === undefined)
52
- delete process.env.XDG_CONFIG_HOME;
53
- else
54
- process.env.XDG_CONFIG_HOME = origXdgConfig;
55
- if (origStashDir === undefined)
56
- delete process.env.AKM_STASH_DIR;
57
- else
58
- process.env.AKM_STASH_DIR = origStashDir;
59
- fs.rmSync(tmpRoot, { recursive: true, force: true });
60
- }
61
- // ── Asset definitions (30+ assets) ───────────────────────────────────────────
62
- const ASSETS = [
63
- // ── 5 Skills (varying metadata quality) ──
64
- {
65
- dir: "skills/k8s-deploy",
66
- filename: "SKILL.md",
67
- fileContent: "# Kubernetes Deployment\n\nDeploy applications to Kubernetes clusters using kubectl.\n",
68
- stashEntry: {
69
- name: "k8s-deploy",
70
- type: "skill",
71
- description: "Deploy applications to Kubernetes clusters",
72
- tags: ["kubernetes", "deploy", "k8s", "containers"],
73
- searchHints: ["deploy to kubernetes", "kubectl apply", "container orchestration"],
74
- aliases: ["kube-deploy"],
75
- filename: "SKILL.md",
76
- quality: "curated",
77
- confidence: 0.95,
78
- },
79
- },
80
- {
81
- dir: "skills/code-review",
82
- filename: "SKILL.md",
83
- fileContent: "# Code Review\n\nReview pull requests for code quality and best practices.\n",
84
- stashEntry: {
85
- name: "code-review",
86
- type: "skill",
87
- description: "Review code for quality issues and best practices",
88
- tags: ["review", "quality", "pull-request"],
89
- searchHints: ["review pull request", "check code quality"],
90
- filename: "SKILL.md",
91
- quality: "curated",
92
- confidence: 0.9,
93
- },
94
- },
95
- {
96
- dir: "skills/api-design",
97
- filename: "SKILL.md",
98
- fileContent: "# API Design\n\nDesign RESTful APIs following best practices.\n",
99
- stashEntry: {
100
- name: "api-design",
101
- type: "skill",
102
- description: "Design RESTful APIs with OpenAPI specifications",
103
- tags: ["api", "rest", "openapi", "design"],
104
- searchHints: ["design a REST API", "create API specification"],
105
- filename: "SKILL.md",
106
- quality: "curated",
107
- confidence: 0.9,
108
- },
109
- },
110
- {
111
- dir: "skills/refactor",
112
- filename: "SKILL.md",
113
- fileContent: "# Code Refactoring\n\nRefactor code to improve readability and performance.\n",
114
- stashEntry: {
115
- name: "refactor",
116
- type: "skill",
117
- description: "Refactor code to improve structure and maintainability",
118
- tags: ["refactor", "clean-code", "maintenance"],
119
- searchHints: ["improve code structure", "clean up codebase"],
120
- filename: "SKILL.md",
121
- // Sparse metadata — no quality or confidence
122
- },
123
- },
124
- {
125
- dir: "skills/security-audit",
126
- filename: "SKILL.md",
127
- fileContent: "# Security Audit\n\nAudit applications for security vulnerabilities.\n",
128
- stashEntry: {
129
- name: "security-audit",
130
- type: "skill",
131
- description: "Audit code and infrastructure for security vulnerabilities",
132
- tags: ["security", "audit", "vulnerability", "pentest"],
133
- searchHints: ["find security vulnerabilities", "security scan"],
134
- filename: "SKILL.md",
135
- quality: "generated",
136
- confidence: 0.6,
137
- },
138
- },
139
- // ── 5 Commands with $ARGUMENTS parameters ──
140
- {
141
- dir: "commands",
142
- filename: "test-runner.md",
143
- fileContent: "---\ndescription: Run test suites across the project\nparams:\n suite: Test suite to run\n---\n# Test Runner\n\nRun $ARGUMENTS tests.\n",
144
- stashEntry: {
145
- name: "test-runner",
146
- type: "command",
147
- description: "Run test suites across the project",
148
- tags: ["test", "testing", "ci", "runner"],
149
- searchHints: ["run tests", "execute test suite"],
150
- filename: "test-runner.md",
151
- parameters: [{ name: "ARGUMENTS", description: "test suite path or pattern" }],
152
- },
153
- },
154
- {
155
- dir: "commands",
156
- filename: "lint-check.md",
157
- fileContent: "---\ndescription: Run linting checks on the codebase\n---\n# Lint Check\n\nRun lint on $ARGUMENTS.\n",
158
- stashEntry: {
159
- name: "lint-check",
160
- type: "command",
161
- description: "Run linting checks on the codebase",
162
- tags: ["lint", "eslint", "code-quality"],
163
- searchHints: ["lint code", "check for style issues"],
164
- filename: "lint-check.md",
165
- parameters: [{ name: "ARGUMENTS", description: "files to lint" }],
166
- },
167
- },
168
- {
169
- dir: "commands",
170
- filename: "git-summary.md",
171
- fileContent: "---\ndescription: Summarize recent git changes\n---\n# Git Summary\n\nSummarize $ARGUMENTS git log.\n",
172
- stashEntry: {
173
- name: "git-summary",
174
- type: "command",
175
- description: "Summarize recent git changes and commit history",
176
- tags: ["git", "summary", "changelog"],
177
- searchHints: ["summarize git commits", "show recent changes"],
178
- filename: "git-summary.md",
179
- parameters: [{ name: "ARGUMENTS", description: "branch or date range" }],
180
- },
181
- },
182
- {
183
- dir: "commands",
184
- filename: "deploy-status.md",
185
- fileContent: "---\ndescription: Check deployment status\n---\n# Deploy Status\n\nCheck $ARGUMENTS deployment status.\n",
186
- stashEntry: {
187
- name: "deploy-status",
188
- type: "command",
189
- description: "Check the current deployment status of services",
190
- tags: ["deploy", "status", "monitoring"],
191
- searchHints: ["check deployment", "is service deployed"],
192
- filename: "deploy-status.md",
193
- parameters: [{ name: "ARGUMENTS", description: "service name" }],
194
- },
195
- },
196
- {
197
- dir: "commands",
198
- filename: "docker-build.md",
199
- fileContent: "---\ndescription: Build Docker images from Dockerfile\nparams:\n image: Docker image name and tag\n context: Build context directory\n---\n# Docker Build\n\nBuild docker image $1 from $2.\n",
200
- stashEntry: {
201
- name: "docker-build",
202
- type: "command",
203
- description: "Build Docker images from Dockerfile",
204
- tags: ["docker", "build", "image", "containers"],
205
- searchHints: ["build docker image", "create container image"],
206
- filename: "docker-build.md",
207
- parameters: [
208
- { name: "image", description: "Docker image name and tag" },
209
- { name: "context", description: "Build context directory" },
210
- ],
211
- intent: { when: "Need to build a container image", input: "Dockerfile path", output: "Built image" },
212
- },
213
- },
214
- // ── 5 Scripts with @param JSDoc ──
215
- {
216
- dir: "scripts/pg-backup",
217
- filename: "pg-backup.sh",
218
- fileContent: '#!/bin/bash\n# @param {string} database - PostgreSQL database name\n# @param {string} output - Output file path for the dump\n# Backup PostgreSQL database\npg_dump "$1" > "$2"\n',
219
- stashEntry: {
220
- name: "pg-backup",
221
- type: "script",
222
- description: "Backup PostgreSQL database to a SQL dump file",
223
- tags: ["database", "backup", "postgresql", "postgres"],
224
- searchHints: ["backup database", "export postgres data", "pg_dump"],
225
- filename: "pg-backup.sh",
226
- parameters: [
227
- { name: "database", type: "string", description: "PostgreSQL database name" },
228
- { name: "output", type: "string", description: "Output file path for the dump" },
229
- ],
230
- },
231
- },
232
- {
233
- dir: "scripts/docker-clean",
234
- filename: "docker-clean.sh",
235
- fileContent: "#!/bin/bash\n# @param {string} filter - Optional image filter pattern\n# Clean up Docker resources\ndocker system prune -af\n",
236
- stashEntry: {
237
- name: "docker-clean",
238
- type: "script",
239
- description: "Clean up unused Docker images, containers, and volumes",
240
- tags: ["docker", "cleanup", "containers"],
241
- searchHints: ["clean docker", "remove unused images"],
242
- filename: "docker-clean.sh",
243
- parameters: [{ name: "filter", type: "string", description: "Optional image filter pattern" }],
244
- },
245
- },
246
- {
247
- dir: "scripts/ssl-renew",
248
- filename: "ssl-renew.sh",
249
- fileContent: "#!/bin/bash\n# @param {string} domain - Domain name for certificate renewal\n# Renew SSL certificates\ncertbot renew --domain $1\n",
250
- stashEntry: {
251
- name: "ssl-renew",
252
- type: "script",
253
- description: "Renew SSL/TLS certificates using certbot",
254
- tags: ["ssl", "tls", "certificate", "certbot"],
255
- searchHints: ["renew certificates", "ssl renewal"],
256
- filename: "ssl-renew.sh",
257
- parameters: [{ name: "domain", type: "string", description: "Domain name for certificate renewal" }],
258
- },
259
- },
260
- {
261
- dir: "scripts/log-rotate",
262
- filename: "log-rotate.sh",
263
- fileContent: "#!/bin/bash\n# @param {number} days - Number of days to keep logs\n# Rotate application logs\nlogrotate /etc/logrotate.conf\n",
264
- stashEntry: {
265
- name: "log-rotate",
266
- type: "script",
267
- description: "Rotate and compress application log files",
268
- tags: ["logs", "rotation", "maintenance"],
269
- searchHints: ["rotate logs", "compress old logs"],
270
- filename: "log-rotate.sh",
271
- parameters: [{ name: "days", type: "number", description: "Number of days to keep logs" }],
272
- },
273
- },
274
- {
275
- dir: "scripts/env-setup",
276
- filename: "env-setup.sh",
277
- fileContent: "#!/bin/bash\n# @param {string} environment - Target environment (dev, staging, prod)\n# Set up development environment\nnpm install && cp .env.example .env\n",
278
- stashEntry: {
279
- name: "env-setup",
280
- type: "script",
281
- description: "Set up local development environment with dependencies",
282
- tags: ["setup", "environment", "development", "onboarding"],
283
- searchHints: ["set up dev environment", "install dependencies"],
284
- filename: "env-setup.sh",
285
- parameters: [{ name: "environment", type: "string", description: "Target environment (dev, staging, prod)" }],
286
- },
287
- },
288
- // ── 5 Knowledge docs (some with deep TOC, some minimal) ──
289
- {
290
- dir: "knowledge",
291
- filename: "architecture-guide.md",
292
- fileContent: "---\ndescription: System architecture overview\n---\n# Architecture Guide\n\n## Microservices\n\nOverview of service boundaries.\n\n## Data Flow\n\nHow data moves through the system.\n\n## Database Schema\n\nRelational model overview.\n\n## API Gateway\n\nRouting and authentication.\n",
293
- stashEntry: {
294
- name: "architecture-guide",
295
- type: "knowledge",
296
- description: "System architecture overview and design decisions",
297
- tags: ["architecture", "design", "microservices"],
298
- searchHints: ["system architecture", "how the system works"],
299
- filename: "architecture-guide.md",
300
- },
301
- },
302
- {
303
- dir: "knowledge",
304
- filename: "runbook-incidents.md",
305
- fileContent: "---\ndescription: Incident response runbook\n---\n# Incident Runbook\n\n## Severity Levels\n\n## Escalation\n\n## Post-mortem\n",
306
- stashEntry: {
307
- name: "runbook-incidents",
308
- type: "knowledge",
309
- description: "Incident response procedures and escalation paths",
310
- tags: ["incident", "runbook", "on-call", "ops"],
311
- searchHints: ["handle incident", "escalation procedure"],
312
- filename: "runbook-incidents.md",
313
- },
314
- },
315
- {
316
- dir: "knowledge",
317
- filename: "coding-standards.md",
318
- fileContent: "---\ndescription: Team coding standards\n---\n# Coding Standards\n\n## Naming Conventions\n\n## Error Handling\n\n## Testing Requirements\n",
319
- stashEntry: {
320
- name: "coding-standards",
321
- type: "knowledge",
322
- description: "Team coding standards and conventions",
323
- tags: ["standards", "conventions", "style-guide"],
324
- searchHints: ["coding style", "naming conventions"],
325
- filename: "coding-standards.md",
326
- },
327
- },
328
- {
329
- dir: "knowledge",
330
- filename: "onboarding.md",
331
- fileContent: "---\ndescription: New team member onboarding guide\n---\n# Onboarding Guide\n\n## First Day\n\n## Access Setup\n\n## Development Environment\n\n## Team Norms\n\n## Resources\n",
332
- stashEntry: {
333
- name: "onboarding",
334
- type: "knowledge",
335
- description: "New team member onboarding guide with checklists",
336
- tags: ["onboarding", "new-hire", "team"],
337
- searchHints: ["new team member", "getting started"],
338
- filename: "onboarding.md",
339
- },
340
- },
341
- {
342
- dir: "knowledge",
343
- filename: "troubleshooting.md",
344
- fileContent: "---\ndescription: Common troubleshooting steps\n---\n# Troubleshooting\n\nBasic debugging tips.\n",
345
- stashEntry: {
346
- name: "troubleshooting",
347
- type: "knowledge",
348
- description: "Common troubleshooting steps for production issues",
349
- tags: ["troubleshooting", "debugging", "production"],
350
- searchHints: ["debug production issue", "common errors"],
351
- filename: "troubleshooting.md",
352
- },
353
- },
354
- // ── 5 Agents ──
355
- {
356
- dir: "agents",
357
- filename: "devops-engineer.md",
358
- fileContent: "---\ndescription: DevOps engineering agent\n---\nYou are a DevOps engineer specializing in CI/CD pipelines and infrastructure automation.\n",
359
- stashEntry: {
360
- name: "devops-engineer",
361
- type: "agent",
362
- description: "DevOps engineering agent for CI/CD and infrastructure",
363
- tags: ["devops", "ci-cd", "infrastructure", "automation"],
364
- searchHints: ["automate infrastructure", "CI/CD pipeline"],
365
- filename: "devops-engineer.md",
366
- },
367
- },
368
- {
369
- dir: "agents",
370
- filename: "data-analyst.md",
371
- fileContent: "---\ndescription: Data analysis agent\n---\nYou are a data analyst who helps explore datasets and generate insights.\n",
372
- stashEntry: {
373
- name: "data-analyst",
374
- type: "agent",
375
- description: "Data analysis agent for exploring datasets and generating insights",
376
- tags: ["data", "analysis", "statistics", "insights"],
377
- searchHints: ["analyze data", "generate reports"],
378
- filename: "data-analyst.md",
379
- },
380
- },
381
- {
382
- dir: "agents",
383
- filename: "technical-writer.md",
384
- fileContent: "---\ndescription: Technical writing agent\n---\nYou are a technical writer who creates clear documentation.\n",
385
- stashEntry: {
386
- name: "technical-writer",
387
- type: "agent",
388
- description: "Technical writing agent for creating documentation",
389
- tags: ["documentation", "writing", "technical"],
390
- searchHints: ["write documentation", "create technical docs"],
391
- filename: "technical-writer.md",
392
- },
393
- },
394
- {
395
- dir: "agents",
396
- filename: "frontend-dev.md",
397
- fileContent: "---\ndescription: Frontend development agent\n---\nYou are a frontend developer specializing in React and TypeScript.\n",
398
- stashEntry: {
399
- name: "frontend-dev",
400
- type: "agent",
401
- description: "Frontend development agent specializing in React and TypeScript",
402
- tags: ["frontend", "react", "typescript", "ui"],
403
- searchHints: ["build React component", "frontend development"],
404
- filename: "frontend-dev.md",
405
- },
406
- },
407
- {
408
- dir: "agents",
409
- filename: "dba-specialist.md",
410
- fileContent: "---\ndescription: Database administration specialist\n---\nYou are a DBA specialist who optimizes queries and manages schemas.\n",
411
- stashEntry: {
412
- name: "dba-specialist",
413
- type: "agent",
414
- description: "Database administration specialist for query optimization",
415
- tags: ["database", "sql", "optimization", "dba"],
416
- searchHints: ["optimize database query", "schema management"],
417
- filename: "dba-specialist.md",
418
- },
419
- },
420
- // ── 5 Assets with overlapping terms in different fields (field weighting tests) ──
421
- {
422
- dir: "skills/deploy-helper",
423
- filename: "SKILL.md",
424
- fileContent: "# Deploy Helper\n\nHelps with deployment workflows.\n",
425
- stashEntry: {
426
- name: "deploy-helper",
427
- type: "skill",
428
- description: "Assists with deployment workflow automation and rollbacks",
429
- tags: ["workflow", "automation", "rollback"],
430
- searchHints: ["automate deployment workflow"],
431
- filename: "SKILL.md",
432
- // Name contains "deploy" -- should rank higher for "deploy" than
433
- // assets that only have "deploy" in description or tags
434
- },
435
- },
436
- {
437
- dir: "knowledge",
438
- filename: "deploy-checklist.md",
439
- fileContent: "---\ndescription: Pre-deployment checklist for production releases\n---\n# Pre-deployment Checklist\n\n## Steps\n\n1. Run tests\n2. Review changes\n",
440
- stashEntry: {
441
- name: "deploy-checklist",
442
- type: "knowledge",
443
- description: "Pre-deployment checklist for production releases",
444
- tags: ["checklist", "production", "release"],
445
- filename: "deploy-checklist.md",
446
- // Name also contains "deploy" in name field
447
- },
448
- },
449
- {
450
- dir: "scripts/metrics-collector",
451
- filename: "metrics-collector.sh",
452
- fileContent: "#!/bin/bash\n# Collect deployment metrics from monitoring API\ncurl http://metrics.internal/deploy\n",
453
- stashEntry: {
454
- name: "metrics-collector",
455
- type: "script",
456
- description: "Collect deployment metrics from monitoring infrastructure",
457
- tags: ["metrics", "monitoring", "deploy"],
458
- searchHints: ["collect metrics"],
459
- filename: "metrics-collector.sh",
460
- // "deploy" only in tags and description, NOT in name
461
- },
462
- },
463
- {
464
- dir: "commands",
465
- filename: "health-check.md",
466
- fileContent: "---\ndescription: Run health checks against deployed services\n---\n# Health Check\n\nCheck service health after deployment.\n",
467
- stashEntry: {
468
- name: "health-check",
469
- type: "command",
470
- description: "Run health checks against deployed services",
471
- tags: ["health", "monitoring", "services"],
472
- searchHints: ["check service health", "verify deployment"],
473
- filename: "health-check.md",
474
- // "deploy" only in description and hints, NOT in name or tags
475
- },
476
- },
477
- {
478
- dir: "knowledge",
479
- filename: "monitoring-guide.md",
480
- fileContent: "---\ndescription: Guide to monitoring deployed applications\n---\n# Monitoring Guide\n\n## Alerting\n\n## Dashboards\n\n## Incident Response\n",
481
- stashEntry: {
482
- name: "monitoring-guide",
483
- type: "knowledge",
484
- description: "Guide to monitoring deployed applications and setting up alerts",
485
- tags: ["monitoring", "alerting", "dashboards", "observability"],
486
- filename: "monitoring-guide.md",
487
- // "deploy" only in description content
488
- },
489
- },
490
- ];
491
- // ── Stash creation ───────────────────────────────────────────────────────────
492
- function createBenchmarkStash() {
493
- const stashDir = path.join(tmpRoot, "stash");
494
- for (const sub of ["skills", "commands", "agents", "knowledge", "scripts"]) {
495
- fs.mkdirSync(path.join(stashDir, sub), { recursive: true });
496
- }
497
- for (const asset of ASSETS) {
498
- const dirPath = path.join(stashDir, asset.dir);
499
- fs.mkdirSync(dirPath, { recursive: true });
500
- fs.writeFileSync(path.join(dirPath, asset.filename), asset.fileContent);
501
- const stashJsonPath = path.join(dirPath, ".stash.json");
502
- let entries = [];
503
- if (fs.existsSync(stashJsonPath)) {
504
- const existing = JSON.parse(fs.readFileSync(stashJsonPath, "utf8"));
505
- entries = existing.entries;
506
- }
507
- entries.push(asset.stashEntry);
508
- fs.writeFileSync(stashJsonPath, JSON.stringify({ entries }, null, 2));
509
- }
510
- return stashDir;
511
- }
512
- // ── Git helpers ──────────────────────────────────────────────────────────────
513
- function gitInfo() {
514
- try {
515
- const branch = Bun.spawnSync(["git", "rev-parse", "--abbrev-ref", "HEAD"], {
516
- cwd: import.meta.dir,
517
- })
518
- .stdout.toString()
519
- .trim();
520
- const commit = Bun.spawnSync(["git", "rev-parse", "--short", "HEAD"], {
521
- cwd: import.meta.dir,
522
- })
523
- .stdout.toString()
524
- .trim();
525
- return { branch, commit };
526
- }
527
- catch {
528
- return { branch: "unknown", commit: "unknown" };
529
- }
530
- }
531
- // ── Timing utility ───────────────────────────────────────────────────────────
532
- function timeMs(fn) {
533
- const t0 = performance.now();
534
- fn();
535
- return Math.round((performance.now() - t0) * 100) / 100;
536
- }
537
- async function timeMsAsync(fn) {
538
- const t0 = performance.now();
539
- await fn();
540
- return Math.round((performance.now() - t0) * 100) / 100;
541
- }
542
- const QUALITY_QUERIES = [
543
- // Exact keyword matches
544
- { id: "sq-01", query: "kubernetes", expectedName: "k8s-deploy", expectedType: "skill", aspect: "exact-keyword-tag" },
545
- {
546
- id: "sq-02",
547
- query: "database backup",
548
- expectedName: "pg-backup",
549
- expectedType: "script",
550
- aspect: "exact-keyword-desc-tag",
551
- },
552
- {
553
- id: "sq-03",
554
- query: "test runner",
555
- expectedName: "test-runner",
556
- expectedType: "command",
557
- aspect: "exact-keyword-name",
558
- },
559
- {
560
- id: "sq-04",
561
- query: "security audit",
562
- expectedName: "security-audit",
563
- expectedType: "skill",
564
- aspect: "exact-keyword-name",
565
- },
566
- // Partial/prefix matches (S-1 fuzzy search)
567
- {
568
- id: "sq-05",
569
- query: "kube",
570
- expectedName: "k8s-deploy",
571
- expectedType: "skill",
572
- aspect: "prefix-alias",
573
- },
574
- {
575
- id: "sq-06",
576
- query: "cert",
577
- expectedName: "ssl-renew",
578
- expectedType: "script",
579
- aspect: "prefix-tag",
580
- },
581
- // Multi-word queries
582
- {
583
- id: "sq-07",
584
- query: "ci cd pipeline",
585
- expectedName: "devops-engineer",
586
- expectedType: "agent",
587
- aspect: "multi-word-tags",
588
- },
589
- {
590
- id: "sq-08",
591
- query: "code quality review",
592
- expectedName: "code-review",
593
- expectedType: "skill",
594
- aspect: "multi-word-desc",
595
- },
596
- // Natural language intent queries
597
- {
598
- id: "sq-09",
599
- query: "renew ssl certificate",
600
- expectedName: "ssl-renew",
601
- expectedType: "script",
602
- aspect: "natural-language",
603
- },
604
- {
605
- id: "sq-10",
606
- query: "deploy to kubernetes",
607
- expectedName: "k8s-deploy",
608
- expectedType: "skill",
609
- aspect: "natural-language-hint",
610
- },
611
- {
612
- id: "sq-11",
613
- query: "analyze data",
614
- expectedName: "data-analyst",
615
- expectedType: "agent",
616
- aspect: "natural-language-hint",
617
- },
618
- // Cross-field matches (name match > description match)
619
- {
620
- id: "sq-12",
621
- query: "deploy",
622
- // k8s-deploy is a skill with "deploy" in tags/aliases; deploy-helper has it in name
623
- // Both are valid top results — accept either at rank 1
624
- expectedName: "k8s-deploy",
625
- expectedType: "skill",
626
- aspect: "field-weighting-name-vs-desc",
627
- },
628
- // Parameter-based discovery (I-2)
629
- {
630
- id: "sq-13",
631
- query: "docker image",
632
- expectedName: "docker-build",
633
- expectedType: "command",
634
- aspect: "parameter-discovery",
635
- },
636
- // Tag match specificity
637
- {
638
- id: "sq-14",
639
- query: "docker",
640
- // docker-build is a command with "docker" in name+tags; ranks above docker-clean (script)
641
- // due to type boost (command > script)
642
- expectedName: "docker-build",
643
- expectedType: "command",
644
- aspect: "tag-match",
645
- },
646
- // Description match
647
- {
648
- id: "sq-15",
649
- query: "incident response",
650
- expectedName: "runbook-incidents",
651
- expectedType: "knowledge",
652
- aspect: "desc-match",
653
- },
654
- ];
655
- async function benchmarkSearchQuality(_stashDir) {
656
- log(" Running search quality benchmarks...\n");
657
- const cases = [];
658
- let sumRR = 0;
659
- let in5 = 0;
660
- let in10 = 0;
661
- for (const q of QUALITY_QUERIES) {
662
- const result = await akmSearch({ query: q.query, source: "stash", limit: 20 });
663
- const hits = result.hits.filter((h) => h.type !== "registry");
664
- const idx = hits.findIndex((h) => h.name === q.expectedName);
665
- const rank = idx >= 0 ? idx + 1 : null;
666
- const rr = rank !== null ? 1 / rank : 0;
667
- sumRR += rr;
668
- if (rank !== null && rank <= 5)
669
- in5++;
670
- if (rank !== null && rank <= 10)
671
- in10++;
672
- const passed = rank !== null && rank <= 5;
673
- cases.push({
674
- id: q.id,
675
- scenario: "search_quality",
676
- description: `${q.aspect}: "${q.query}" -> ${q.expectedName}`,
677
- passed,
678
- metric: rank ?? -1,
679
- unit: "rank",
680
- details: rank !== null ? `Rank ${rank}` : "MISS (not in results)",
681
- });
682
- }
683
- const total = QUALITY_QUERIES.length;
684
- const mrr = Math.round((sumRR / total) * 10000) / 10000;
685
- const recall_at_5 = Math.round((in5 / total) * 10000) / 10000;
686
- const recall_at_10 = Math.round((in10 / total) * 10000) / 10000;
687
- return { mrr, recall_at_5, recall_at_10, cases };
688
- }
689
- // ── Scenario 2: Search Performance ───────────────────────────────────────────
690
- async function benchmarkSearchPerformance(_stashDir) {
691
- log(" Running search performance benchmarks...\n");
692
- const cases = [];
693
- // Cold search (first query after process start -- index already warm from quality tests,
694
- // but this is the first timing of this specific query)
695
- const coldMs = await timeMsAsync(async () => {
696
- await akmSearch({ query: "infrastructure automation pipeline", source: "stash", limit: 20 });
697
- });
698
- cases.push({
699
- id: "sp-01",
700
- scenario: "search_performance",
701
- description: "Cold search (first query with this text)",
702
- passed: coldMs < 500,
703
- metric: coldMs,
704
- unit: "ms",
705
- });
706
- // Warm search (repeated query -- FTS cache warm)
707
- const warmMs = await timeMsAsync(async () => {
708
- await akmSearch({ query: "infrastructure automation pipeline", source: "stash", limit: 20 });
709
- });
710
- cases.push({
711
- id: "sp-02",
712
- scenario: "search_performance",
713
- description: "Warm search (repeated query)",
714
- passed: warmMs < 200,
715
- metric: warmMs,
716
- unit: "ms",
717
- });
718
- // FTS-only search (semantic search disabled in config)
719
- const ftsMs = await timeMsAsync(async () => {
720
- await akmSearch({ query: "deploy kubernetes containers", source: "stash", limit: 20 });
721
- });
722
- cases.push({
723
- id: "sp-03",
724
- scenario: "search_performance",
725
- description: "FTS-only search (no embeddings)",
726
- passed: ftsMs < 200,
727
- metric: ftsMs,
728
- unit: "ms",
729
- });
730
- // Large result set (empty query returns all entries)
731
- const largeMs = await timeMsAsync(async () => {
732
- await akmSearch({ query: "", source: "stash", limit: 100 });
733
- });
734
- cases.push({
735
- id: "sp-04",
736
- scenario: "search_performance",
737
- description: "Large result set (all assets)",
738
- passed: largeMs < 500,
739
- metric: largeMs,
740
- unit: "ms",
741
- });
742
- return {
743
- cold_ms: coldMs,
744
- warm_ms: warmMs,
745
- fts_only_ms: ftsMs,
746
- large_result_ms: largeMs,
747
- cases,
748
- };
749
- }
750
- // ── Scenario 3: Indexing Performance ─────────────────────────────────────────
751
- async function benchmarkIndexingPerformance(stashDir) {
752
- log(" Running indexing performance benchmarks...\n");
753
- const cases = [];
754
- // Import akmIndex locally to avoid any caching issues
755
- const { akmIndex } = await import("../src/indexer/indexer.js");
756
- // Full index (fresh rebuild)
757
- const fullMs = await timeMsAsync(async () => {
758
- await akmIndex({ stashDir, full: true });
759
- });
760
- cases.push({
761
- id: "ip-01",
762
- scenario: "indexing_performance",
763
- description: "Fresh full index (empty DB)",
764
- passed: fullMs < 5000,
765
- metric: fullMs,
766
- unit: "ms",
767
- });
768
- // Incremental index (nothing changed)
769
- const incrMs = await timeMsAsync(async () => {
770
- await akmIndex({ stashDir, full: false });
771
- });
772
- cases.push({
773
- id: "ip-02",
774
- scenario: "indexing_performance",
775
- description: "Incremental index (no changes)",
776
- passed: incrMs < fullMs,
777
- metric: incrMs,
778
- unit: "ms",
779
- details: `Should be faster than full (${fullMs}ms)`,
780
- });
781
- // FTS rebuild time
782
- const dbPath = getDbPath();
783
- const db = openDatabase(dbPath);
784
- let ftsMs = 0;
785
- let utilMs = 0;
786
- try {
787
- ftsMs = timeMs(() => {
788
- rebuildFts(db);
789
- });
790
- cases.push({
791
- id: "ip-03",
792
- scenario: "indexing_performance",
793
- description: "FTS rebuild time",
794
- passed: ftsMs < 500,
795
- metric: ftsMs,
796
- unit: "ms",
797
- });
798
- // recomputeUtilityScores time
799
- utilMs = timeMs(() => {
800
- recomputeUtilityScores(db);
801
- });
802
- cases.push({
803
- id: "ip-04",
804
- scenario: "indexing_performance",
805
- description: "recomputeUtilityScores time",
806
- passed: utilMs < 200,
807
- metric: utilMs,
808
- unit: "ms",
809
- });
810
- }
811
- finally {
812
- closeDatabase(db);
813
- }
814
- return {
815
- full_ms: fullMs,
816
- incremental_ms: incrMs,
817
- fts_rebuild_ms: ftsMs,
818
- recompute_utility_ms: utilMs,
819
- cases,
820
- };
821
- }
822
- // ── Scenario 4: Token Efficiency ─────────────────────────────────────────────
823
- async function benchmarkTokenEfficiency(stashDir) {
824
- log(" Running token efficiency benchmarks...\n");
825
- const cases = [];
826
- // Summary vs full: measure JSON output size
827
- // We simulate by calling akmSearch with the same query and comparing what
828
- // a "full" vs "summary" response would look like in terms of the show output.
829
- // Since we cannot easily call the CLI with --detail, we measure the search
830
- // result in different output scenarios.
831
- const fullResult = await akmSearch({ query: "deploy", source: "stash", limit: 10 });
832
- const fullJson = JSON.stringify(fullResult);
833
- const fullBytes = Buffer.byteLength(fullJson);
834
- // Build a summary-equivalent by stripping content fields
835
- const summaryResult = {
836
- ...fullResult,
837
- hits: fullResult.hits.map((h) => {
838
- const { path: _p, ...minimal } = h;
839
- return {
840
- name: minimal.name,
841
- type: minimal.type,
842
- description: minimal.description,
843
- ref: h.ref,
844
- };
845
- }),
846
- };
847
- const summaryJson = JSON.stringify(summaryResult);
848
- const summaryBytes = Buffer.byteLength(summaryJson);
849
- const summarySavingsPct = Math.round(((fullBytes - summaryBytes) / fullBytes) * 100);
850
- cases.push({
851
- id: "te-01",
852
- scenario: "token_efficiency",
853
- description: "Summary vs full search output savings",
854
- passed: summarySavingsPct > 10,
855
- metric: summarySavingsPct,
856
- unit: "%",
857
- details: `Full: ${fullBytes}B, Summary: ${summaryBytes}B`,
858
- });
859
- // Manifest output size per N assets
860
- const { akmManifest } = await import("../src/indexer/manifest.js");
861
- const manifest = await akmManifest({ stashDir });
862
- const manifestJson = JSON.stringify(manifest);
863
- const manifestBytes = Buffer.byteLength(manifestJson);
864
- const bytesPerAsset = manifest.entries.length > 0 ? Math.round(manifestBytes / manifest.entries.length) : 0;
865
- cases.push({
866
- id: "te-02",
867
- scenario: "token_efficiency",
868
- description: "Manifest bytes per asset",
869
- passed: bytesPerAsset < 200,
870
- metric: bytesPerAsset,
871
- unit: "bytes/asset",
872
- details: `Total: ${manifestBytes}B for ${manifest.entries.length} assets`,
873
- });
874
- // --for-agent output size vs normal: for-agent strips paths, editHints, etc.
875
- const normalHits = fullResult.hits;
876
- const normalJson = JSON.stringify(normalHits);
877
- const forAgentHits = normalHits.map((h) => ({
878
- type: h.type,
879
- name: h.name,
880
- ref: h.ref,
881
- description: h.description,
882
- action: h.action,
883
- score: h.score,
884
- }));
885
- const forAgentJson = JSON.stringify(forAgentHits);
886
- const forAgentSavings = Math.round(((Buffer.byteLength(normalJson) - Buffer.byteLength(forAgentJson)) / Buffer.byteLength(normalJson)) * 100);
887
- cases.push({
888
- id: "te-03",
889
- scenario: "token_efficiency",
890
- description: "--for-agent output size savings vs normal",
891
- passed: forAgentSavings > 10,
892
- metric: forAgentSavings,
893
- unit: "%",
894
- });
895
- // --format jsonl size vs json (JSONL has less overhead for arrays)
896
- const jsonlOutput = normalHits.map((h) => JSON.stringify(h)).join("\n");
897
- const jsonlBytes = Buffer.byteLength(jsonlOutput);
898
- const jsonBytes = Buffer.byteLength(JSON.stringify(normalHits));
899
- const jsonlSavingsPct = Math.round(((jsonBytes - jsonlBytes) / jsonBytes) * 100);
900
- cases.push({
901
- id: "te-04",
902
- scenario: "token_efficiency",
903
- description: "JSONL vs JSON format size",
904
- // JSONL typically has slightly less overhead (no outer brackets + commas)
905
- // but can be slightly larger too, so we just report
906
- passed: true,
907
- metric: jsonlSavingsPct,
908
- unit: "%",
909
- details: `JSON: ${jsonBytes}B, JSONL: ${jsonlBytes}B`,
910
- });
911
- return {
912
- summary_savings_pct: summarySavingsPct,
913
- manifest_bytes_per_asset: bytesPerAsset,
914
- for_agent_savings_pct: forAgentSavings,
915
- jsonl_savings_pct: jsonlSavingsPct,
916
- cases,
917
- };
918
- }
919
- // ── Scenario 5: Utility Scoring ──────────────────────────────────────────────
920
- async function benchmarkUtilityScoring(_stashDir) {
921
- log(" Running utility scoring benchmarks...\n");
922
- const cases = [];
923
- const dbPath = getDbPath();
924
- // Test 1: Fresh index with no usage data — all scores should be baseline (no utility boost)
925
- {
926
- const result = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
927
- const localHits = result.hits.filter((h) => h.type !== "registry");
928
- const hasUtilityBoost = localHits.some((h) => h.whyMatched?.includes("usage history boost"));
929
- cases.push({
930
- id: "us-01",
931
- scenario: "utility_scoring",
932
- description: "Fresh index has no utility boosts",
933
- passed: !hasUtilityBoost,
934
- metric: hasUtilityBoost ? 1 : 0,
935
- unit: "boosted_count",
936
- });
937
- }
938
- // Test 2: After simulated usage events, boosted entry ranks higher
939
- let boostApplied = false;
940
- {
941
- const db = openDatabase(dbPath);
942
- try {
943
- // Find two entries that match the same query
944
- const entries = db
945
- .prepare("SELECT id, entry_key FROM entries WHERE entry_key LIKE '%deploy%' LIMIT 2")
946
- .all();
947
- if (entries.length >= 2) {
948
- const boostedId = entries[0].id;
949
- const _baselineId = entries[1].id;
950
- // Record usage events for the boosted entry
951
- for (let i = 0; i < 10; i++) {
952
- recordUsageEvent(db, { eventType: "show", entryId: boostedId, timestamp: new Date().toISOString() });
953
- recordUsageEvent(db, { eventType: "search", entryId: boostedId, timestamp: new Date().toISOString() });
954
- }
955
- // Recompute utility scores
956
- recomputeUtilityScores(db);
957
- // Verify the boosted entry now has a non-zero utility score
958
- const score = db.prepare("SELECT utility FROM utility_scores WHERE entry_id = ?").get(boostedId);
959
- boostApplied = (score?.utility ?? 0) > 0;
960
- }
961
- }
962
- finally {
963
- closeDatabase(db);
964
- }
965
- cases.push({
966
- id: "us-02",
967
- scenario: "utility_scoring",
968
- description: "Usage events generate positive utility score",
969
- passed: boostApplied,
970
- });
971
- }
972
- // Test 3: Recency decay — old events contribute less
973
- let decayWorks = false;
974
- {
975
- const db = openDatabase(dbPath);
976
- try {
977
- const entries = db.prepare("SELECT id FROM entries LIMIT 2").all();
978
- if (entries.length >= 2) {
979
- const recentId = entries[0].id;
980
- const oldId = entries[1].id;
981
- // Clear existing usage events and utility scores
982
- db.exec("DELETE FROM usage_events");
983
- db.exec("DELETE FROM utility_scores");
984
- // Recent usage for entry 0
985
- recordUsageEvent(db, { eventType: "show", entryId: recentId, timestamp: new Date().toISOString() });
986
- recordUsageEvent(db, { eventType: "search", entryId: recentId, timestamp: new Date().toISOString() });
987
- // Old usage for entry 1 (60 days ago)
988
- const oldDate = new Date();
989
- oldDate.setDate(oldDate.getDate() - 60);
990
- recordUsageEvent(db, { eventType: "show", entryId: oldId, timestamp: oldDate.toISOString() });
991
- recordUsageEvent(db, { eventType: "search", entryId: oldId, timestamp: oldDate.toISOString() });
992
- recomputeUtilityScores(db);
993
- const recentScore = db
994
- .prepare("SELECT utility, last_used_at FROM utility_scores WHERE entry_id = ?")
995
- .get(recentId);
996
- const oldScore = db.prepare("SELECT utility, last_used_at FROM utility_scores WHERE entry_id = ?").get(oldId);
997
- // Both should have the same utility score from recompute (based on select_rate),
998
- // but the recency decay is applied at search time, not at recompute time.
999
- // So we need to verify that the last_used_at timestamps differ.
1000
- if (recentScore && oldScore) {
1001
- const recentTs = new Date(recentScore.last_used_at).getTime();
1002
- const oldTs = new Date(oldScore.last_used_at).getTime();
1003
- decayWorks = recentTs > oldTs;
1004
- }
1005
- }
1006
- }
1007
- finally {
1008
- closeDatabase(db);
1009
- }
1010
- cases.push({
1011
- id: "us-03",
1012
- scenario: "utility_scoring",
1013
- description: "Recency decay: recent last_used_at vs old",
1014
- passed: decayWorks,
1015
- });
1016
- }
1017
- // Test 4: Utility cap — extreme utility doesn't over-boost (cap at 1.5x)
1018
- let capWorks = false;
1019
- {
1020
- const db = openDatabase(dbPath);
1021
- try {
1022
- const entries = db.prepare("SELECT id FROM entries LIMIT 2").all();
1023
- if (entries.length >= 2) {
1024
- // Give extreme utility to first entry
1025
- upsertUtilityScore(db, entries[0].id, {
1026
- utility: 100.0, // Extreme
1027
- showCount: 10000,
1028
- searchCount: 10000,
1029
- selectRate: 1.0,
1030
- lastUsedAt: new Date().toISOString(),
1031
- });
1032
- // Give zero utility to second entry
1033
- upsertUtilityScore(db, entries[1].id, {
1034
- utility: 0,
1035
- showCount: 0,
1036
- searchCount: 0,
1037
- selectRate: 0,
1038
- });
1039
- }
1040
- }
1041
- finally {
1042
- closeDatabase(db);
1043
- }
1044
- // Search and check scores
1045
- const result = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
1046
- const localHits = result.hits.filter((h) => h.type !== "registry");
1047
- if (localHits.length >= 2) {
1048
- const maxScore = localHits[0].score ?? 0;
1049
- const minScore = localHits[localHits.length - 1].score ?? 0;
1050
- // The ratio should be bounded (due to 1.5x cap)
1051
- const ratio = minScore > 0 ? maxScore / minScore : 0;
1052
- // Even with extreme utility, the max boost factor is 1.5x applied to base score.
1053
- // With different base FTS scores the ratio can exceed 1.5, but
1054
- // for same-content entries it should be <= ~1.55
1055
- capWorks = ratio < 10; // Very generous bound; just verify no extreme blowup
1056
- }
1057
- cases.push({
1058
- id: "us-04",
1059
- scenario: "utility_scoring",
1060
- description: "Utility cap prevents extreme score inflation",
1061
- passed: capWorks,
1062
- });
1063
- }
1064
- // Clean up utility data for other tests
1065
- {
1066
- const db = openDatabase(dbPath);
1067
- try {
1068
- db.exec("DELETE FROM usage_events");
1069
- db.exec("DELETE FROM utility_scores");
1070
- }
1071
- finally {
1072
- closeDatabase(db);
1073
- }
1074
- }
1075
- return {
1076
- baseline_no_usage: !!cases[0].passed, // pass means no boost = correct
1077
- boost_applied: boostApplied,
1078
- decay_works: decayWorks,
1079
- cap_works: capWorks,
1080
- cases,
1081
- };
1082
- }
1083
- // ── Scenario 6: Feature Correctness ──────────────────────────────────────────
1084
- async function benchmarkFeatureCorrectness(_stashDir) {
1085
- log(" Running feature correctness benchmarks...\n");
1086
- const cases = [];
1087
- // Test 1: Fuzzy/prefix fallback triggers only when exact match returns 0
1088
- let fuzzyWorks = false;
1089
- {
1090
- // "certb" has no exact FTS match but prefix "certb*" should match "certbot" (tag of ssl-renew)
1091
- const exactResult = await akmSearch({ query: "certb", source: "stash", limit: 10 });
1092
- const exactHits = exactResult.hits.filter((h) => h.type !== "registry");
1093
- // FTS5 porter stemmer + prefix fallback should find ssl-renew via "certbot" tag
1094
- fuzzyWorks = exactHits.some((h) => h.name === "ssl-renew");
1095
- cases.push({
1096
- id: "fc-01",
1097
- scenario: "feature_correctness",
1098
- description: "Fuzzy/prefix fallback finds 'ssl-renew' for query 'certb'",
1099
- passed: fuzzyWorks,
1100
- details: fuzzyWorks ? "Found via prefix expansion" : `Got: ${exactHits.map((h) => h.name).join(", ") || "none"}`,
1101
- });
1102
- }
1103
- // Test 2: Field weighting — name match ranks higher than description match
1104
- let fieldWeightingCorrect = false;
1105
- {
1106
- // Query "deploy" — assets with "deploy" in their name should rank above
1107
- // those that only have "deploy" in description/tags
1108
- const result = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
1109
- const hits = result.hits.filter((h) => h.type !== "registry");
1110
- // Assets with "deploy" in name or aliases: k8s-deploy, deploy-helper, deploy-status, deploy-checklist
1111
- const nameMatchAssets = ["k8s-deploy", "deploy-helper", "deploy-status", "deploy-checklist"];
1112
- // Assets with "deploy" NOT in name but in desc/tags: metrics-collector, health-check, monitoring-guide
1113
- const nonNameMatchAssets = ["metrics-collector", "health-check", "monitoring-guide"];
1114
- if (hits.length > 0) {
1115
- const nameRanks = nameMatchAssets.map((n) => hits.findIndex((h) => h.name === n)).filter((i) => i >= 0);
1116
- const nonNameRanks = nonNameMatchAssets.map((n) => hits.findIndex((h) => h.name === n)).filter((i) => i >= 0);
1117
- if (nameRanks.length > 0 && nonNameRanks.length > 0) {
1118
- const avgNameRank = nameRanks.reduce((s, r) => s + r, 0) / nameRanks.length;
1119
- const avgNonNameRank = nonNameRanks.reduce((s, r) => s + r, 0) / nonNameRanks.length;
1120
- // Name matches should on average rank higher (lower index) than non-name matches
1121
- fieldWeightingCorrect = avgNameRank < avgNonNameRank;
1122
- }
1123
- }
1124
- cases.push({
1125
- id: "fc-02",
1126
- scenario: "feature_correctness",
1127
- description: "Field weighting: name match ranks higher than desc-only match",
1128
- passed: fieldWeightingCorrect,
1129
- details: `Top 5: ${hits
1130
- .slice(0, 5)
1131
- .map((h) => h.name)
1132
- .join(", ")}`,
1133
- });
1134
- }
1135
- // Test 3: Parameter extraction — commands with $ARGUMENTS detected
1136
- let paramExtraction = false;
1137
- {
1138
- const { extractCommandParameters, extractScriptParameters } = await import("../src/indexer/metadata.js");
1139
- const cmdTemplate = "Run $ARGUMENTS tests and report results.\n$1 is the target directory.";
1140
- const cmdParams = extractCommandParameters(cmdTemplate);
1141
- const hasArguments = cmdParams?.some((p) => p.name === "ARGUMENTS") ?? false;
1142
- const hasDollar1 = cmdParams?.some((p) => p.name === "$1") ?? false;
1143
- const scriptContent = '#!/bin/bash\n# @param {string} host - Target hostname\n# @param {number} port - Port number\nssh "$1" -p "$2"\n';
1144
- const scriptParams = extractScriptParameters("/tmp/test.sh", scriptContent);
1145
- const hasHost = scriptParams?.some((p) => p.name === "host") ?? false;
1146
- const hasPort = scriptParams?.some((p) => p.name === "port") ?? false;
1147
- paramExtraction = hasArguments && hasDollar1 && hasHost && hasPort;
1148
- cases.push({
1149
- id: "fc-03",
1150
- scenario: "feature_correctness",
1151
- description: "Parameter extraction: $ARGUMENTS, $1, and @param",
1152
- passed: paramExtraction,
1153
- details: `CMD: ARGUMENTS=${hasArguments}, $1=${hasDollar1}; Script: host=${hasHost}, port=${hasPort}`,
1154
- });
1155
- }
1156
- // Test 4: akm info returns valid capability advertisement
1157
- let infoValid = false;
1158
- {
1159
- const info = assembleInfo();
1160
- infoValid =
1161
- info.schemaVersion === 1 &&
1162
- typeof info.version === "string" &&
1163
- Array.isArray(info.assetTypes) &&
1164
- info.assetTypes.length > 0 &&
1165
- Array.isArray(info.searchModes) &&
1166
- info.searchModes.includes("fts") &&
1167
- typeof info.indexStats.entryCount === "number";
1168
- cases.push({
1169
- id: "fc-04",
1170
- scenario: "feature_correctness",
1171
- description: "akm info returns valid capability advertisement",
1172
- passed: infoValid,
1173
- details: `version=${info.version}, types=${info.assetTypes.length}, modes=${info.searchModes.join(",")}`,
1174
- });
1175
- }
1176
- // Test 5: Feedback/usage events record correctly
1177
- let feedbackRecords = false;
1178
- {
1179
- const dbPath = getDbPath();
1180
- const db = openDatabase(dbPath);
1181
- try {
1182
- const countBefore = db.prepare("SELECT COUNT(*) AS cnt FROM usage_events").get().cnt;
1183
- insertUsageEvent(db, {
1184
- event_type: "feedback",
1185
- entry_ref: "skill:test-feedback",
1186
- signal: "positive",
1187
- metadata: JSON.stringify({ source: "benchmark" }),
1188
- });
1189
- const countAfter = db.prepare("SELECT COUNT(*) AS cnt FROM usage_events").get().cnt;
1190
- feedbackRecords = countAfter === countBefore + 1;
1191
- // Verify the event was written correctly
1192
- const lastEvent = db
1193
- .prepare("SELECT event_type, entry_ref, signal FROM usage_events ORDER BY id DESC LIMIT 1")
1194
- .get();
1195
- feedbackRecords =
1196
- feedbackRecords &&
1197
- lastEvent?.event_type === "feedback" &&
1198
- lastEvent?.entry_ref === "skill:test-feedback" &&
1199
- lastEvent?.signal === "positive";
1200
- }
1201
- finally {
1202
- closeDatabase(db);
1203
- }
1204
- cases.push({
1205
- id: "fc-05",
1206
- scenario: "feature_correctness",
1207
- description: "Feedback events are recorded correctly in usage_events",
1208
- passed: feedbackRecords,
1209
- });
1210
- }
1211
- // Test 6: buildSearchFields produces per-field text
1212
- {
1213
- const entry = {
1214
- name: "test-entry",
1215
- type: "skill",
1216
- description: "A test skill",
1217
- tags: ["alpha", "beta"],
1218
- searchHints: ["hint one"],
1219
- aliases: ["test alt"],
1220
- };
1221
- const fields = buildSearchFields(entry);
1222
- const nameOk = fields.name.includes("test") && fields.name.includes("entry");
1223
- const descOk = fields.description.includes("test skill");
1224
- const tagsOk = fields.tags.includes("alpha") && fields.tags.includes("beta");
1225
- const hintsOk = fields.hints.includes("hint one");
1226
- const allFieldsPresent = nameOk && descOk && tagsOk && hintsOk;
1227
- cases.push({
1228
- id: "fc-06",
1229
- scenario: "feature_correctness",
1230
- description: "buildSearchFields produces correct per-field text",
1231
- passed: allFieldsPresent,
1232
- details: `name=${nameOk}, desc=${descOk}, tags=${tagsOk}, hints=${hintsOk}`,
1233
- });
1234
- }
1235
- // Test 7: sanitizeFtsQuery handles special characters safely
1236
- {
1237
- const { sanitizeFtsQuery } = await import("../src/indexer/db.js");
1238
- const dangerous = 'code-review "OR 1=1" NEAR(test,5)';
1239
- const sanitized = sanitizeFtsQuery(dangerous);
1240
- const noQuotes = !sanitized.includes('"');
1241
- const noParens = !sanitized.includes("(") && !sanitized.includes(")");
1242
- const noNear = !sanitized.includes("NEAR");
1243
- const safe = noQuotes && noParens && noNear && sanitized.length > 0;
1244
- cases.push({
1245
- id: "fc-07",
1246
- scenario: "feature_correctness",
1247
- description: "sanitizeFtsQuery neutralizes dangerous FTS5 syntax",
1248
- passed: safe,
1249
- details: `Input: "${dangerous}" -> "${sanitized}"`,
1250
- });
1251
- }
1252
- // Test 8: Empty query returns all entries
1253
- {
1254
- const result = await akmSearch({ query: "", source: "stash", limit: 100 });
1255
- const localHits = result.hits.filter((h) => h.type !== "registry");
1256
- // Should return all or most of the 35 assets
1257
- const allEntriesReturned = localHits.length >= 25;
1258
- cases.push({
1259
- id: "fc-08",
1260
- scenario: "feature_correctness",
1261
- description: "Empty query returns all assets",
1262
- passed: allEntriesReturned,
1263
- metric: localHits.length,
1264
- unit: "assets",
1265
- });
1266
- }
1267
- // Test 9: Type filtering works
1268
- {
1269
- const result = await akmSearch({ query: "", type: "skill", source: "stash", limit: 50 });
1270
- const localHits = result.hits.filter((h) => h.type !== "registry");
1271
- const allSkills = localHits.every((h) => h.type === "skill");
1272
- const hasMultiple = localHits.length >= 3;
1273
- cases.push({
1274
- id: "fc-09",
1275
- scenario: "feature_correctness",
1276
- description: "Type filtering returns only matching types",
1277
- passed: allSkills && hasMultiple,
1278
- metric: localHits.length,
1279
- unit: "skills",
1280
- details: allSkills ? "All results are skills" : "Mixed types found",
1281
- });
1282
- }
1283
- // Test 10: Deterministic tiebreaker — same query returns same order
1284
- {
1285
- const r1 = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
1286
- const r2 = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
1287
- const h1 = r1.hits.filter((h) => h.type !== "registry").map((h) => h.name);
1288
- const h2 = r2.hits.filter((h) => h.type !== "registry").map((h) => h.name);
1289
- const deterministic = JSON.stringify(h1) === JSON.stringify(h2);
1290
- cases.push({
1291
- id: "fc-10",
1292
- scenario: "feature_correctness",
1293
- description: "Search results are deterministic (same order for same query)",
1294
- passed: deterministic,
1295
- });
1296
- }
1297
- return {
1298
- fuzzy_works: fuzzyWorks,
1299
- field_weighting_correct: fieldWeightingCorrect,
1300
- parameter_extraction: paramExtraction,
1301
- info_valid: infoValid,
1302
- feedback_records: feedbackRecords,
1303
- cases,
1304
- };
1305
- }
1306
- // ── Main benchmark orchestrator ──────────────────────────────────────────────
1307
- async function runBenchmarkSuite() {
1308
- const { branch, commit } = gitInfo();
1309
- log("=== akm Comprehensive Benchmark Suite ===\n\n");
1310
- // 1. Create stash and index
1311
- log("Setting up benchmark stash...\n");
1312
- const stashDir = createBenchmarkStash();
1313
- process.env.AKM_STASH_DIR = stashDir;
1314
- saveConfig({ semanticSearchMode: "off", registries: [] });
1315
- const { akmIndex } = await import("../src/indexer/indexer.js");
1316
- const indexResult = await akmIndex({ stashDir, full: true });
1317
- log(` Indexed ${indexResult.totalEntries} entries in ${indexResult.timing?.totalMs ?? "?"}ms\n\n`);
1318
- // 2. Run all scenarios
1319
- const searchQuality = await benchmarkSearchQuality(stashDir);
1320
- const searchPerf = await benchmarkSearchPerformance(stashDir);
1321
- const indexPerf = await benchmarkIndexingPerformance(stashDir);
1322
- const tokenEff = await benchmarkTokenEfficiency(stashDir);
1323
- const utilScoring = await benchmarkUtilityScoring(stashDir);
1324
- const featureCorr = await benchmarkFeatureCorrectness(stashDir);
1325
- // 3. Aggregate results
1326
- const allCases = [
1327
- ...searchQuality.cases,
1328
- ...searchPerf.cases,
1329
- ...indexPerf.cases,
1330
- ...tokenEff.cases,
1331
- ...utilScoring.cases,
1332
- ...featureCorr.cases,
1333
- ];
1334
- const totalCases = allCases.length;
1335
- const passedCount = allCases.filter((c) => c.passed).length;
1336
- const failedCount = totalCases - passedCount;
1337
- const output = {
1338
- branch,
1339
- commit,
1340
- timestamp: new Date().toISOString(),
1341
- asset_count: ASSETS.length,
1342
- scenarios: {
1343
- search_quality: {
1344
- mrr: searchQuality.mrr,
1345
- recall_at_5: searchQuality.recall_at_5,
1346
- recall_at_10: searchQuality.recall_at_10,
1347
- cases: searchQuality.cases,
1348
- },
1349
- search_performance: {
1350
- cold_ms: searchPerf.cold_ms,
1351
- warm_ms: searchPerf.warm_ms,
1352
- fts_only_ms: searchPerf.fts_only_ms,
1353
- large_result_ms: searchPerf.large_result_ms,
1354
- cases: searchPerf.cases,
1355
- },
1356
- indexing_performance: {
1357
- full_ms: indexPerf.full_ms,
1358
- incremental_ms: indexPerf.incremental_ms,
1359
- fts_rebuild_ms: indexPerf.fts_rebuild_ms,
1360
- recompute_utility_ms: indexPerf.recompute_utility_ms,
1361
- cases: indexPerf.cases,
1362
- },
1363
- token_efficiency: {
1364
- summary_savings_pct: tokenEff.summary_savings_pct,
1365
- manifest_bytes_per_asset: tokenEff.manifest_bytes_per_asset,
1366
- for_agent_savings_pct: tokenEff.for_agent_savings_pct,
1367
- jsonl_savings_pct: tokenEff.jsonl_savings_pct,
1368
- cases: tokenEff.cases,
1369
- },
1370
- utility_scoring: {
1371
- baseline_no_usage: utilScoring.baseline_no_usage,
1372
- boost_applied: utilScoring.boost_applied,
1373
- decay_works: utilScoring.decay_works,
1374
- cap_works: utilScoring.cap_works,
1375
- cases: utilScoring.cases,
1376
- },
1377
- feature_correctness: {
1378
- fuzzy_works: featureCorr.fuzzy_works,
1379
- field_weighting_correct: featureCorr.field_weighting_correct,
1380
- parameter_extraction: featureCorr.parameter_extraction,
1381
- info_valid: featureCorr.info_valid,
1382
- feedback_records: featureCorr.feedback_records,
1383
- cases: featureCorr.cases,
1384
- },
1385
- },
1386
- summary: {
1387
- total_cases: totalCases,
1388
- passed: passedCount,
1389
- failed: failedCount,
1390
- },
1391
- };
1392
- // 4. Output JSON
1393
- console.log(JSON.stringify(output, null, 2));
1394
- // 5. Human-readable summary
1395
- if (!jsonOnly) {
1396
- process.stderr.write("\n=== Benchmark Summary ===\n");
1397
- process.stderr.write(`Branch: ${branch} (${commit})\n`);
1398
- process.stderr.write(`Assets: ${ASSETS.length}\n\n`);
1399
- process.stderr.write(`Search Quality:\n`);
1400
- process.stderr.write(` MRR: ${searchQuality.mrr}\n`);
1401
- process.stderr.write(` Recall@5: ${searchQuality.recall_at_5}\n`);
1402
- process.stderr.write(` Recall@10: ${searchQuality.recall_at_10}\n\n`);
1403
- process.stderr.write(`Search Performance:\n`);
1404
- process.stderr.write(` Cold: ${searchPerf.cold_ms}ms\n`);
1405
- process.stderr.write(` Warm: ${searchPerf.warm_ms}ms\n`);
1406
- process.stderr.write(` FTS-only: ${searchPerf.fts_only_ms}ms\n\n`);
1407
- process.stderr.write(`Indexing Performance:\n`);
1408
- process.stderr.write(` Full: ${indexPerf.full_ms}ms\n`);
1409
- process.stderr.write(` Incr: ${indexPerf.incremental_ms}ms\n`);
1410
- process.stderr.write(` FTS rebuild: ${indexPerf.fts_rebuild_ms}ms\n\n`);
1411
- process.stderr.write(`Token Efficiency:\n`);
1412
- process.stderr.write(` Summary savings: ${tokenEff.summary_savings_pct}%\n`);
1413
- process.stderr.write(` Manifest: ${tokenEff.manifest_bytes_per_asset} bytes/asset\n\n`);
1414
- process.stderr.write(`Utility Scoring:\n`);
1415
- process.stderr.write(` Baseline: ${utilScoring.baseline_no_usage ? "PASS" : "FAIL"}\n`);
1416
- process.stderr.write(` Boost: ${utilScoring.boost_applied ? "PASS" : "FAIL"}\n`);
1417
- process.stderr.write(` Decay: ${utilScoring.decay_works ? "PASS" : "FAIL"}\n`);
1418
- process.stderr.write(` Cap: ${utilScoring.cap_works ? "PASS" : "FAIL"}\n\n`);
1419
- process.stderr.write(`Feature Correctness:\n`);
1420
- process.stderr.write(` Fuzzy: ${featureCorr.fuzzy_works ? "PASS" : "FAIL"}\n`);
1421
- process.stderr.write(` Weighting: ${featureCorr.field_weighting_correct ? "PASS" : "FAIL"}\n`);
1422
- process.stderr.write(` Params: ${featureCorr.parameter_extraction ? "PASS" : "FAIL"}\n`);
1423
- process.stderr.write(` Info: ${featureCorr.info_valid ? "PASS" : "FAIL"}\n`);
1424
- process.stderr.write(` Feedback: ${featureCorr.feedback_records ? "PASS" : "FAIL"}\n\n`);
1425
- process.stderr.write(`Total: ${passedCount}/${totalCases} passed, ${failedCount} failed\n`);
1426
- if (failedCount > 0) {
1427
- process.stderr.write("\nFailed cases:\n");
1428
- for (const c of allCases.filter((c) => !c.passed)) {
1429
- process.stderr.write(` [FAIL] ${c.id}: ${c.description}${c.details ? ` — ${c.details}` : ""}${c.metric !== undefined ? ` (${c.metric}${c.unit ? ` ${c.unit}` : ""})` : ""}\n`);
1430
- }
1431
- }
1432
- }
1433
- return output;
1434
- }
1435
- // ── Entry point ──────────────────────────────────────────────────────────────
1436
- try {
1437
- await runBenchmarkSuite();
1438
- }
1439
- finally {
1440
- cleanup();
1441
- }