akm-cli 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (327) hide show
  1. package/package.json +8 -8
  2. package/dist/tests/add-website-source.test.js +0 -119
  3. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  4. package/dist/tests/agent/agent-config.test.js +0 -221
  5. package/dist/tests/agent/agent-detect.test.js +0 -100
  6. package/dist/tests/agent/agent-spawn.test.js +0 -234
  7. package/dist/tests/agent-output.test.js +0 -186
  8. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  9. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  10. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  11. package/dist/tests/asset-ref.test.js +0 -192
  12. package/dist/tests/asset-registry.test.js +0 -103
  13. package/dist/tests/asset-spec.test.js +0 -241
  14. package/dist/tests/bench/attribution.test.js +0 -996
  15. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  16. package/dist/tests/bench/cleanup.js +0 -234
  17. package/dist/tests/bench/cleanup.test.js +0 -166
  18. package/dist/tests/bench/cli.js +0 -1018
  19. package/dist/tests/bench/cli.test.js +0 -445
  20. package/dist/tests/bench/compare.test.js +0 -556
  21. package/dist/tests/bench/corpus.js +0 -317
  22. package/dist/tests/bench/corpus.test.js +0 -258
  23. package/dist/tests/bench/doctor.js +0 -525
  24. package/dist/tests/bench/driver.js +0 -401
  25. package/dist/tests/bench/driver.test.js +0 -584
  26. package/dist/tests/bench/environment.js +0 -233
  27. package/dist/tests/bench/environment.test.js +0 -199
  28. package/dist/tests/bench/evolve-metrics.js +0 -179
  29. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  30. package/dist/tests/bench/evolve.js +0 -647
  31. package/dist/tests/bench/evolve.test.js +0 -624
  32. package/dist/tests/bench/failure-modes.test.js +0 -349
  33. package/dist/tests/bench/feedback-integrity.test.js +0 -457
  34. package/dist/tests/bench/leakage.test.js +0 -228
  35. package/dist/tests/bench/learning-curve.test.js +0 -134
  36. package/dist/tests/bench/metrics.js +0 -2395
  37. package/dist/tests/bench/metrics.test.js +0 -1150
  38. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  39. package/dist/tests/bench/opencode-config.js +0 -194
  40. package/dist/tests/bench/opencode-config.test.js +0 -370
  41. package/dist/tests/bench/report.js +0 -1885
  42. package/dist/tests/bench/report.test.js +0 -1038
  43. package/dist/tests/bench/run-config.js +0 -355
  44. package/dist/tests/bench/run-config.test.js +0 -298
  45. package/dist/tests/bench/run-curate-test.js +0 -32
  46. package/dist/tests/bench/run-failing-tasks.js +0 -56
  47. package/dist/tests/bench/run-full-bench.js +0 -51
  48. package/dist/tests/bench/run-items36-targeted.js +0 -69
  49. package/dist/tests/bench/run-nano-quick.js +0 -42
  50. package/dist/tests/bench/run-waveg-targeted.js +0 -62
  51. package/dist/tests/bench/runner.js +0 -699
  52. package/dist/tests/bench/runner.test.js +0 -958
  53. package/dist/tests/bench/search-bridge.test.js +0 -331
  54. package/dist/tests/bench/tmp.js +0 -131
  55. package/dist/tests/bench/trajectory.js +0 -116
  56. package/dist/tests/bench/trajectory.test.js +0 -127
  57. package/dist/tests/bench/verifier.js +0 -114
  58. package/dist/tests/bench/verifier.test.js +0 -118
  59. package/dist/tests/bench/workflow-evaluator.js +0 -557
  60. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  61. package/dist/tests/bench/workflow-spec.js +0 -345
  62. package/dist/tests/bench/workflow-spec.test.js +0 -363
  63. package/dist/tests/bench/workflow-trace.js +0 -472
  64. package/dist/tests/bench/workflow-trace.test.js +0 -254
  65. package/dist/tests/benchmark-search-quality.js +0 -536
  66. package/dist/tests/benchmark-suite.js +0 -1441
  67. package/dist/tests/capture-cli.test.js +0 -112
  68. package/dist/tests/cli-errors.test.js +0 -204
  69. package/dist/tests/commands/events.test.js +0 -370
  70. package/dist/tests/commands/history.test.js +0 -418
  71. package/dist/tests/commands/import.test.js +0 -103
  72. package/dist/tests/commands/proposal-cli.test.js +0 -209
  73. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  74. package/dist/tests/commands/remember.test.js +0 -97
  75. package/dist/tests/commands/scope-flags.test.js +0 -300
  76. package/dist/tests/commands/search.test.js +0 -537
  77. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  78. package/dist/tests/commands/show.test.js +0 -294
  79. package/dist/tests/common.test.js +0 -266
  80. package/dist/tests/completions.test.js +0 -142
  81. package/dist/tests/config-cli.test.js +0 -193
  82. package/dist/tests/config-llm-features.test.js +0 -139
  83. package/dist/tests/config.test.js +0 -569
  84. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  85. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  86. package/dist/tests/contracts/spec-helpers.js +0 -46
  87. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  88. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  89. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  90. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  91. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  92. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  93. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  94. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  95. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  96. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  97. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  98. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  99. package/dist/tests/core/write-source.test.js +0 -366
  100. package/dist/tests/curate-command.test.js +0 -87
  101. package/dist/tests/db-scoring.test.js +0 -201
  102. package/dist/tests/db.test.js +0 -654
  103. package/dist/tests/distill-cli-flag.test.js +0 -208
  104. package/dist/tests/distill.test.js +0 -515
  105. package/dist/tests/docker-install.test.js +0 -120
  106. package/dist/tests/e2e.test.js +0 -1419
  107. package/dist/tests/embedder.test.js +0 -340
  108. package/dist/tests/embedding-model-config.test.js +0 -379
  109. package/dist/tests/feedback-command.test.js +0 -172
  110. package/dist/tests/file-context.test.js +0 -552
  111. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  112. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  113. package/dist/tests/fixtures/stashes/load.js +0 -166
  114. package/dist/tests/fixtures/stashes/load.test.js +0 -97
  115. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  116. package/dist/tests/frontmatter.test.js +0 -190
  117. package/dist/tests/fts-field-weighting.test.js +0 -254
  118. package/dist/tests/fuzzy-search.test.js +0 -230
  119. package/dist/tests/git-provider-clone.test.js +0 -45
  120. package/dist/tests/github.test.js +0 -161
  121. package/dist/tests/graph-boost-ranking.test.js +0 -305
  122. package/dist/tests/graph-extraction.test.js +0 -282
  123. package/dist/tests/helpers/usage-events.js +0 -8
  124. package/dist/tests/index-pass-llm.test.js +0 -161
  125. package/dist/tests/indexer.test.js +0 -570
  126. package/dist/tests/info-command.test.js +0 -166
  127. package/dist/tests/init.test.js +0 -69
  128. package/dist/tests/install-script.test.js +0 -246
  129. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  130. package/dist/tests/issue-36-repro.test.js +0 -304
  131. package/dist/tests/issues-191-194.test.js +0 -160
  132. package/dist/tests/lesson-lint.test.js +0 -111
  133. package/dist/tests/llm-client.test.js +0 -115
  134. package/dist/tests/llm-feature-gate.test.js +0 -151
  135. package/dist/tests/llm.test.js +0 -139
  136. package/dist/tests/lockfile.test.js +0 -216
  137. package/dist/tests/manifest.test.js +0 -205
  138. package/dist/tests/markdown.test.js +0 -126
  139. package/dist/tests/matchers-unit.test.js +0 -189
  140. package/dist/tests/memory-inference.test.js +0 -299
  141. package/dist/tests/merge-scoring.test.js +0 -136
  142. package/dist/tests/metadata.test.js +0 -313
  143. package/dist/tests/migration-help.test.js +0 -89
  144. package/dist/tests/origin-resolve.test.js +0 -124
  145. package/dist/tests/output-baseline.test.js +0 -218
  146. package/dist/tests/output-shapes-unit.test.js +0 -478
  147. package/dist/tests/parallel-search.test.js +0 -272
  148. package/dist/tests/parameter-metadata.test.js +0 -365
  149. package/dist/tests/paths.test.js +0 -177
  150. package/dist/tests/progressive-disclosure.test.js +0 -280
  151. package/dist/tests/proposals.test.js +0 -279
  152. package/dist/tests/proposed-quality.test.js +0 -271
  153. package/dist/tests/provider-registry.test.js +0 -32
  154. package/dist/tests/ranking-regression.test.js +0 -548
  155. package/dist/tests/reflect-propose.test.js +0 -455
  156. package/dist/tests/registry-build-index.test.js +0 -394
  157. package/dist/tests/registry-cli.test.js +0 -290
  158. package/dist/tests/registry-index-v2.test.js +0 -430
  159. package/dist/tests/registry-install.test.js +0 -728
  160. package/dist/tests/registry-providers/parity.test.js +0 -189
  161. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  162. package/dist/tests/registry-providers/static-index.test.js +0 -238
  163. package/dist/tests/registry-resolve.test.js +0 -126
  164. package/dist/tests/registry-search.test.js +0 -923
  165. package/dist/tests/remember-frontmatter.test.js +0 -378
  166. package/dist/tests/remember-unit.test.js +0 -123
  167. package/dist/tests/ripgrep-install.test.js +0 -251
  168. package/dist/tests/ripgrep-resolve.test.js +0 -108
  169. package/dist/tests/ripgrep.test.js +0 -163
  170. package/dist/tests/save-command.test.js +0 -94
  171. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  172. package/dist/tests/scoring-pipeline.test.js +0 -648
  173. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  174. package/dist/tests/self-update.test.js +0 -442
  175. package/dist/tests/semantic-search-e2e.test.js +0 -512
  176. package/dist/tests/semantic-status.test.js +0 -471
  177. package/dist/tests/setup-run.integration.js +0 -877
  178. package/dist/tests/setup-wizard.test.js +0 -198
  179. package/dist/tests/setup.test.js +0 -131
  180. package/dist/tests/source-add.test.js +0 -11
  181. package/dist/tests/source-clone.test.js +0 -254
  182. package/dist/tests/source-manage.test.js +0 -366
  183. package/dist/tests/source-providers/filesystem.test.js +0 -82
  184. package/dist/tests/source-providers/git.test.js +0 -252
  185. package/dist/tests/source-providers/website.test.js +0 -128
  186. package/dist/tests/source-qa-fixes.test.js +0 -286
  187. package/dist/tests/source-registry.test.js +0 -350
  188. package/dist/tests/source-resolve.test.js +0 -100
  189. package/dist/tests/source-source.test.js +0 -281
  190. package/dist/tests/source.test.js +0 -533
  191. package/dist/tests/tar-utils-scan.test.js +0 -73
  192. package/dist/tests/toggle-components.test.js +0 -73
  193. package/dist/tests/usage-telemetry.test.js +0 -265
  194. package/dist/tests/utility-scoring.test.js +0 -558
  195. package/dist/tests/vault-load-error.test.js +0 -78
  196. package/dist/tests/vault-qa-fixes.test.js +0 -194
  197. package/dist/tests/vault.test.js +0 -429
  198. package/dist/tests/vector-search.test.js +0 -608
  199. package/dist/tests/walker.test.js +0 -252
  200. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  201. package/dist/tests/wave2-cluster-d.test.js +0 -180
  202. package/dist/tests/wave2-cluster-e.test.js +0 -179
  203. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  204. package/dist/tests/wiki.test.js +0 -529
  205. package/dist/tests/workflow-cli.test.js +0 -271
  206. package/dist/tests/workflow-markdown.test.js +0 -171
  207. package/dist/tests/workflow-path-escape.test.js +0 -132
  208. package/dist/tests/workflow-qa-fixes.test.js +0 -395
  209. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  210. /package/dist/{src/cli.js → cli.js} +0 -0
  211. /package/dist/{src/commands → commands}/completions.js +0 -0
  212. /package/dist/{src/commands → commands}/config-cli.js +0 -0
  213. /package/dist/{src/commands → commands}/curate.js +0 -0
  214. /package/dist/{src/commands → commands}/distill.js +0 -0
  215. /package/dist/{src/commands → commands}/events.js +0 -0
  216. /package/dist/{src/commands → commands}/history.js +0 -0
  217. /package/dist/{src/commands → commands}/info.js +0 -0
  218. /package/dist/{src/commands → commands}/init.js +0 -0
  219. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  220. /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
  221. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  222. /package/dist/{src/commands → commands}/proposal.js +0 -0
  223. /package/dist/{src/commands → commands}/propose.js +0 -0
  224. /package/dist/{src/commands → commands}/reflect.js +0 -0
  225. /package/dist/{src/commands → commands}/registry-search.js +0 -0
  226. /package/dist/{src/commands → commands}/remember.js +0 -0
  227. /package/dist/{src/commands → commands}/search.js +0 -0
  228. /package/dist/{src/commands → commands}/self-update.js +0 -0
  229. /package/dist/{src/commands → commands}/show.js +0 -0
  230. /package/dist/{src/commands → commands}/source-add.js +0 -0
  231. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  232. /package/dist/{src/commands → commands}/source-manage.js +0 -0
  233. /package/dist/{src/commands → commands}/vault.js +0 -0
  234. /package/dist/{src/core → core}/asset-ref.js +0 -0
  235. /package/dist/{src/core → core}/asset-registry.js +0 -0
  236. /package/dist/{src/core → core}/asset-spec.js +0 -0
  237. /package/dist/{src/core → core}/common.js +0 -0
  238. /package/dist/{src/core → core}/config.js +0 -0
  239. /package/dist/{src/core → core}/errors.js +0 -0
  240. /package/dist/{src/core → core}/events.js +0 -0
  241. /package/dist/{src/core → core}/frontmatter.js +0 -0
  242. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  243. /package/dist/{src/core → core}/markdown.js +0 -0
  244. /package/dist/{src/core → core}/paths.js +0 -0
  245. /package/dist/{src/core → core}/proposals.js +0 -0
  246. /package/dist/{src/core → core}/warn.js +0 -0
  247. /package/dist/{src/core → core}/write-source.js +0 -0
  248. /package/dist/{src/indexer → indexer}/db-search.js +0 -0
  249. /package/dist/{src/indexer → indexer}/db.js +0 -0
  250. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  251. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  252. /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
  253. /package/dist/{src/indexer → indexer}/indexer.js +0 -0
  254. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  255. /package/dist/{src/indexer → indexer}/matchers.js +0 -0
  256. /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
  257. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  258. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  259. /package/dist/{src/indexer → indexer}/search-source.js +0 -0
  260. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  261. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  262. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  263. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  264. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  265. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  266. /package/dist/{src/integrations → integrations}/agent/profiles.js +0 -0
  267. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  268. /package/dist/{src/integrations → integrations}/agent/spawn.js +0 -0
  269. /package/dist/{src/integrations → integrations}/github.js +0 -0
  270. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  271. /package/dist/{src/llm → llm}/client.js +0 -0
  272. /package/dist/{src/llm → llm}/embedder.js +0 -0
  273. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  274. /package/dist/{src/llm → llm}/embedders/local.js +0 -0
  275. /package/dist/{src/llm → llm}/embedders/remote.js +0 -0
  276. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  277. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  278. /package/dist/{src/llm → llm}/graph-extract.js +0 -0
  279. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  280. /package/dist/{src/llm → llm}/memory-infer.js +0 -0
  281. /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
  282. /package/dist/{src/output → output}/cli-hints.js +0 -0
  283. /package/dist/{src/output → output}/context.js +0 -0
  284. /package/dist/{src/output → output}/renderers.js +0 -0
  285. /package/dist/{src/output → output}/shapes.js +0 -0
  286. /package/dist/{src/output → output}/text.js +0 -0
  287. /package/dist/{src/registry → registry}/build-index.js +0 -0
  288. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  289. /package/dist/{src/registry → registry}/factory.js +0 -0
  290. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  291. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  292. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  293. /package/dist/{src/registry → registry}/providers/static-index.js +0 -0
  294. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  295. /package/dist/{src/registry → registry}/resolve.js +0 -0
  296. /package/dist/{src/registry → registry}/types.js +0 -0
  297. /package/dist/{src/setup → setup}/detect.js +0 -0
  298. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  299. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  300. /package/dist/{src/setup → setup}/setup.js +0 -0
  301. /package/dist/{src/setup → setup}/steps.js +0 -0
  302. /package/dist/{src/sources → sources}/include.js +0 -0
  303. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  304. /package/dist/{src/sources → sources}/provider.js +0 -0
  305. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  306. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  307. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  308. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  309. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  310. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  311. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  312. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  313. /package/dist/{src/sources → sources}/providers/website.js +0 -0
  314. /package/dist/{src/sources → sources}/resolve.js +0 -0
  315. /package/dist/{src/sources → sources}/types.js +0 -0
  316. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  317. /package/dist/{src/version.js → version.js} +0 -0
  318. /package/dist/{src/wiki → wiki}/wiki.js +0 -0
  319. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  320. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  321. /package/dist/{src/workflows → workflows}/db.js +0 -0
  322. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  323. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  324. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  325. /package/dist/{src/workflows → workflows}/runs.js +0 -0
  326. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  327. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,1441 +0,0 @@
1
- #!/usr/bin/env bun
2
- /**
3
- * Comprehensive benchmark suite for akm search system.
4
- *
5
- * Standalone script (NOT a bun:test suite) that covers:
6
- * 1. Search Quality (MRR, Recall@5, Recall@10)
7
- * 2. Search Performance (latency in ms)
8
- * 3. Indexing Performance (time in ms)
9
- * 4. Token Efficiency (byte savings %)
10
- * 5. Utility Scoring (M-2)
11
- * 6. Feature Correctness
12
- *
13
- * Usage:
14
- * bun run tests/benchmark-suite.ts
15
- * bun run tests/benchmark-suite.ts --json # machine-readable output only
16
- */
17
- import fs from "node:fs";
18
- import os from "node:os";
19
- import path from "node:path";
20
- import { assembleInfo } from "../src/commands/info";
21
- import { akmSearch } from "../src/commands/search";
22
- import { saveConfig } from "../src/core/config";
23
- import { getDbPath } from "../src/core/paths";
24
- import { closeDatabase, openDatabase, rebuildFts, upsertUtilityScore } from "../src/indexer/db";
25
- import { recomputeUtilityScores } from "../src/indexer/indexer";
26
- import { buildSearchFields } from "../src/indexer/search-fields";
27
- import { insertUsageEvent } from "../src/indexer/usage-events";
28
- import { recordUsageEvent } from "./helpers/usage-events";
29
- // ── CLI flags ────────────────────────────────────────────────────────────────
30
const jsonOnly = process.argv.includes("--json");
/**
 * Write a progress message to stderr.
 *
 * Nothing is written when the script was invoked with `--json`
 * (i.e. when `jsonOnly` is true).
 *
 * @param {string} msg - Passed unchanged to `process.stderr.write`; no newline is appended.
 */
function log(msg) {
    if (jsonOnly) {
        return;
    }
    process.stderr.write(msg);
}
35
- // ── Environment isolation ────────────────────────────────────────────────────
36
// Create a fresh temporary root with "cache" and "config" subdirectories.
const tmpRoot = fs.mkdtempSync(path.join(os.tmpdir(), "akm-benchsuite-"));
const testCacheDir = path.join(tmpRoot, "cache");
const testConfigDir = path.join(tmpRoot, "config");
for (const dir of [testCacheDir, testConfigDir]) {
    fs.mkdirSync(dir, { recursive: true });
}
// Snapshot the current values (undefined when unset) so cleanup() can restore them.
const {
    XDG_CACHE_HOME: origXdgCache,
    XDG_CONFIG_HOME: origXdgConfig,
    AKM_STASH_DIR: origStashDir,
} = process.env;
// Point the XDG cache/config locations at the temporary directories.
process.env.XDG_CACHE_HOME = testCacheDir;
process.env.XDG_CONFIG_HOME = testConfigDir;
46
/**
 * Restore the saved XDG_CACHE_HOME, XDG_CONFIG_HOME and AKM_STASH_DIR
 * environment variables, then recursively delete the temporary root.
 *
 * A variable whose saved value is `undefined` is removed from
 * `process.env`; otherwise the saved value is written back.
 */
function cleanup() {
    const savedEnv = [
        ["XDG_CACHE_HOME", origXdgCache],
        ["XDG_CONFIG_HOME", origXdgConfig],
        ["AKM_STASH_DIR", origStashDir],
    ];
    for (const [name, value] of savedEnv) {
        if (value === undefined) {
            // Delete rather than assign: process.env coerces assigned values to strings.
            delete process.env[name];
        } else {
            process.env[name] = value;
        }
    }
    // force: true keeps this from throwing if the directory is already gone.
    fs.rmSync(tmpRoot, { recursive: true, force: true });
}
61
- // ── Asset definitions (30+ assets) ───────────────────────────────────────────
62
- const ASSETS = [
63
- // ── 5 Skills (varying metadata quality) ──
64
- {
65
- dir: "skills/k8s-deploy",
66
- filename: "SKILL.md",
67
- fileContent: "# Kubernetes Deployment\n\nDeploy applications to Kubernetes clusters using kubectl.\n",
68
- stashEntry: {
69
- name: "k8s-deploy",
70
- type: "skill",
71
- description: "Deploy applications to Kubernetes clusters",
72
- tags: ["kubernetes", "deploy", "k8s", "containers"],
73
- searchHints: ["deploy to kubernetes", "kubectl apply", "container orchestration"],
74
- aliases: ["kube-deploy"],
75
- filename: "SKILL.md",
76
- quality: "curated",
77
- confidence: 0.95,
78
- },
79
- },
80
- {
81
- dir: "skills/code-review",
82
- filename: "SKILL.md",
83
- fileContent: "# Code Review\n\nReview pull requests for code quality and best practices.\n",
84
- stashEntry: {
85
- name: "code-review",
86
- type: "skill",
87
- description: "Review code for quality issues and best practices",
88
- tags: ["review", "quality", "pull-request"],
89
- searchHints: ["review pull request", "check code quality"],
90
- filename: "SKILL.md",
91
- quality: "curated",
92
- confidence: 0.9,
93
- },
94
- },
95
- {
96
- dir: "skills/api-design",
97
- filename: "SKILL.md",
98
- fileContent: "# API Design\n\nDesign RESTful APIs following best practices.\n",
99
- stashEntry: {
100
- name: "api-design",
101
- type: "skill",
102
- description: "Design RESTful APIs with OpenAPI specifications",
103
- tags: ["api", "rest", "openapi", "design"],
104
- searchHints: ["design a REST API", "create API specification"],
105
- filename: "SKILL.md",
106
- quality: "curated",
107
- confidence: 0.9,
108
- },
109
- },
110
- {
111
- dir: "skills/refactor",
112
- filename: "SKILL.md",
113
- fileContent: "# Code Refactoring\n\nRefactor code to improve readability and performance.\n",
114
- stashEntry: {
115
- name: "refactor",
116
- type: "skill",
117
- description: "Refactor code to improve structure and maintainability",
118
- tags: ["refactor", "clean-code", "maintenance"],
119
- searchHints: ["improve code structure", "clean up codebase"],
120
- filename: "SKILL.md",
121
- // Sparse metadata — no quality or confidence
122
- },
123
- },
124
- {
125
- dir: "skills/security-audit",
126
- filename: "SKILL.md",
127
- fileContent: "# Security Audit\n\nAudit applications for security vulnerabilities.\n",
128
- stashEntry: {
129
- name: "security-audit",
130
- type: "skill",
131
- description: "Audit code and infrastructure for security vulnerabilities",
132
- tags: ["security", "audit", "vulnerability", "pentest"],
133
- searchHints: ["find security vulnerabilities", "security scan"],
134
- filename: "SKILL.md",
135
- quality: "generated",
136
- confidence: 0.6,
137
- },
138
- },
139
- // ── 5 Commands with $ARGUMENTS parameters ──
140
- {
141
- dir: "commands",
142
- filename: "test-runner.md",
143
- fileContent: "---\ndescription: Run test suites across the project\nparams:\n suite: Test suite to run\n---\n# Test Runner\n\nRun $ARGUMENTS tests.\n",
144
- stashEntry: {
145
- name: "test-runner",
146
- type: "command",
147
- description: "Run test suites across the project",
148
- tags: ["test", "testing", "ci", "runner"],
149
- searchHints: ["run tests", "execute test suite"],
150
- filename: "test-runner.md",
151
- parameters: [{ name: "ARGUMENTS", description: "test suite path or pattern" }],
152
- },
153
- },
154
- {
155
- dir: "commands",
156
- filename: "lint-check.md",
157
- fileContent: "---\ndescription: Run linting checks on the codebase\n---\n# Lint Check\n\nRun lint on $ARGUMENTS.\n",
158
- stashEntry: {
159
- name: "lint-check",
160
- type: "command",
161
- description: "Run linting checks on the codebase",
162
- tags: ["lint", "eslint", "code-quality"],
163
- searchHints: ["lint code", "check for style issues"],
164
- filename: "lint-check.md",
165
- parameters: [{ name: "ARGUMENTS", description: "files to lint" }],
166
- },
167
- },
168
- {
169
- dir: "commands",
170
- filename: "git-summary.md",
171
- fileContent: "---\ndescription: Summarize recent git changes\n---\n# Git Summary\n\nSummarize $ARGUMENTS git log.\n",
172
- stashEntry: {
173
- name: "git-summary",
174
- type: "command",
175
- description: "Summarize recent git changes and commit history",
176
- tags: ["git", "summary", "changelog"],
177
- searchHints: ["summarize git commits", "show recent changes"],
178
- filename: "git-summary.md",
179
- parameters: [{ name: "ARGUMENTS", description: "branch or date range" }],
180
- },
181
- },
182
- {
183
- dir: "commands",
184
- filename: "deploy-status.md",
185
- fileContent: "---\ndescription: Check deployment status\n---\n# Deploy Status\n\nCheck $ARGUMENTS deployment status.\n",
186
- stashEntry: {
187
- name: "deploy-status",
188
- type: "command",
189
- description: "Check the current deployment status of services",
190
- tags: ["deploy", "status", "monitoring"],
191
- searchHints: ["check deployment", "is service deployed"],
192
- filename: "deploy-status.md",
193
- parameters: [{ name: "ARGUMENTS", description: "service name" }],
194
- },
195
- },
196
- {
197
- dir: "commands",
198
- filename: "docker-build.md",
199
- fileContent: "---\ndescription: Build Docker images from Dockerfile\nparams:\n image: Docker image name and tag\n context: Build context directory\n---\n# Docker Build\n\nBuild docker image $1 from $2.\n",
200
- stashEntry: {
201
- name: "docker-build",
202
- type: "command",
203
- description: "Build Docker images from Dockerfile",
204
- tags: ["docker", "build", "image", "containers"],
205
- searchHints: ["build docker image", "create container image"],
206
- filename: "docker-build.md",
207
- parameters: [
208
- { name: "image", description: "Docker image name and tag" },
209
- { name: "context", description: "Build context directory" },
210
- ],
211
- intent: { when: "Need to build a container image", input: "Dockerfile path", output: "Built image" },
212
- },
213
- },
214
- // ── 5 Scripts with @param JSDoc ──
215
- {
216
- dir: "scripts/pg-backup",
217
- filename: "pg-backup.sh",
218
- fileContent: '#!/bin/bash\n# @param {string} database - PostgreSQL database name\n# @param {string} output - Output file path for the dump\n# Backup PostgreSQL database\npg_dump "$1" > "$2"\n',
219
- stashEntry: {
220
- name: "pg-backup",
221
- type: "script",
222
- description: "Backup PostgreSQL database to a SQL dump file",
223
- tags: ["database", "backup", "postgresql", "postgres"],
224
- searchHints: ["backup database", "export postgres data", "pg_dump"],
225
- filename: "pg-backup.sh",
226
- parameters: [
227
- { name: "database", type: "string", description: "PostgreSQL database name" },
228
- { name: "output", type: "string", description: "Output file path for the dump" },
229
- ],
230
- },
231
- },
232
- {
233
- dir: "scripts/docker-clean",
234
- filename: "docker-clean.sh",
235
- fileContent: "#!/bin/bash\n# @param {string} filter - Optional image filter pattern\n# Clean up Docker resources\ndocker system prune -af\n",
236
- stashEntry: {
237
- name: "docker-clean",
238
- type: "script",
239
- description: "Clean up unused Docker images, containers, and volumes",
240
- tags: ["docker", "cleanup", "containers"],
241
- searchHints: ["clean docker", "remove unused images"],
242
- filename: "docker-clean.sh",
243
- parameters: [{ name: "filter", type: "string", description: "Optional image filter pattern" }],
244
- },
245
- },
246
- {
247
- dir: "scripts/ssl-renew",
248
- filename: "ssl-renew.sh",
249
- fileContent: "#!/bin/bash\n# @param {string} domain - Domain name for certificate renewal\n# Renew SSL certificates\ncertbot renew --domain $1\n",
250
- stashEntry: {
251
- name: "ssl-renew",
252
- type: "script",
253
- description: "Renew SSL/TLS certificates using certbot",
254
- tags: ["ssl", "tls", "certificate", "certbot"],
255
- searchHints: ["renew certificates", "ssl renewal"],
256
- filename: "ssl-renew.sh",
257
- parameters: [{ name: "domain", type: "string", description: "Domain name for certificate renewal" }],
258
- },
259
- },
260
- {
261
- dir: "scripts/log-rotate",
262
- filename: "log-rotate.sh",
263
- fileContent: "#!/bin/bash\n# @param {number} days - Number of days to keep logs\n# Rotate application logs\nlogrotate /etc/logrotate.conf\n",
264
- stashEntry: {
265
- name: "log-rotate",
266
- type: "script",
267
- description: "Rotate and compress application log files",
268
- tags: ["logs", "rotation", "maintenance"],
269
- searchHints: ["rotate logs", "compress old logs"],
270
- filename: "log-rotate.sh",
271
- parameters: [{ name: "days", type: "number", description: "Number of days to keep logs" }],
272
- },
273
- },
274
- {
275
- dir: "scripts/env-setup",
276
- filename: "env-setup.sh",
277
- fileContent: "#!/bin/bash\n# @param {string} environment - Target environment (dev, staging, prod)\n# Set up development environment\nnpm install && cp .env.example .env\n",
278
- stashEntry: {
279
- name: "env-setup",
280
- type: "script",
281
- description: "Set up local development environment with dependencies",
282
- tags: ["setup", "environment", "development", "onboarding"],
283
- searchHints: ["set up dev environment", "install dependencies"],
284
- filename: "env-setup.sh",
285
- parameters: [{ name: "environment", type: "string", description: "Target environment (dev, staging, prod)" }],
286
- },
287
- },
288
- // ── 5 Knowledge docs (some with deep TOC, some minimal) ──
289
- {
290
- dir: "knowledge",
291
- filename: "architecture-guide.md",
292
- fileContent: "---\ndescription: System architecture overview\n---\n# Architecture Guide\n\n## Microservices\n\nOverview of service boundaries.\n\n## Data Flow\n\nHow data moves through the system.\n\n## Database Schema\n\nRelational model overview.\n\n## API Gateway\n\nRouting and authentication.\n",
293
- stashEntry: {
294
- name: "architecture-guide",
295
- type: "knowledge",
296
- description: "System architecture overview and design decisions",
297
- tags: ["architecture", "design", "microservices"],
298
- searchHints: ["system architecture", "how the system works"],
299
- filename: "architecture-guide.md",
300
- },
301
- },
302
- {
303
- dir: "knowledge",
304
- filename: "runbook-incidents.md",
305
- fileContent: "---\ndescription: Incident response runbook\n---\n# Incident Runbook\n\n## Severity Levels\n\n## Escalation\n\n## Post-mortem\n",
306
- stashEntry: {
307
- name: "runbook-incidents",
308
- type: "knowledge",
309
- description: "Incident response procedures and escalation paths",
310
- tags: ["incident", "runbook", "on-call", "ops"],
311
- searchHints: ["handle incident", "escalation procedure"],
312
- filename: "runbook-incidents.md",
313
- },
314
- },
315
- {
316
- dir: "knowledge",
317
- filename: "coding-standards.md",
318
- fileContent: "---\ndescription: Team coding standards\n---\n# Coding Standards\n\n## Naming Conventions\n\n## Error Handling\n\n## Testing Requirements\n",
319
- stashEntry: {
320
- name: "coding-standards",
321
- type: "knowledge",
322
- description: "Team coding standards and conventions",
323
- tags: ["standards", "conventions", "style-guide"],
324
- searchHints: ["coding style", "naming conventions"],
325
- filename: "coding-standards.md",
326
- },
327
- },
328
- {
329
- dir: "knowledge",
330
- filename: "onboarding.md",
331
- fileContent: "---\ndescription: New team member onboarding guide\n---\n# Onboarding Guide\n\n## First Day\n\n## Access Setup\n\n## Development Environment\n\n## Team Norms\n\n## Resources\n",
332
- stashEntry: {
333
- name: "onboarding",
334
- type: "knowledge",
335
- description: "New team member onboarding guide with checklists",
336
- tags: ["onboarding", "new-hire", "team"],
337
- searchHints: ["new team member", "getting started"],
338
- filename: "onboarding.md",
339
- },
340
- },
341
- {
342
- dir: "knowledge",
343
- filename: "troubleshooting.md",
344
- fileContent: "---\ndescription: Common troubleshooting steps\n---\n# Troubleshooting\n\nBasic debugging tips.\n",
345
- stashEntry: {
346
- name: "troubleshooting",
347
- type: "knowledge",
348
- description: "Common troubleshooting steps for production issues",
349
- tags: ["troubleshooting", "debugging", "production"],
350
- searchHints: ["debug production issue", "common errors"],
351
- filename: "troubleshooting.md",
352
- },
353
- },
354
- // ── 5 Agents ──
355
- {
356
- dir: "agents",
357
- filename: "devops-engineer.md",
358
- fileContent: "---\ndescription: DevOps engineering agent\n---\nYou are a DevOps engineer specializing in CI/CD pipelines and infrastructure automation.\n",
359
- stashEntry: {
360
- name: "devops-engineer",
361
- type: "agent",
362
- description: "DevOps engineering agent for CI/CD and infrastructure",
363
- tags: ["devops", "ci-cd", "infrastructure", "automation"],
364
- searchHints: ["automate infrastructure", "CI/CD pipeline"],
365
- filename: "devops-engineer.md",
366
- },
367
- },
368
- {
369
- dir: "agents",
370
- filename: "data-analyst.md",
371
- fileContent: "---\ndescription: Data analysis agent\n---\nYou are a data analyst who helps explore datasets and generate insights.\n",
372
- stashEntry: {
373
- name: "data-analyst",
374
- type: "agent",
375
- description: "Data analysis agent for exploring datasets and generating insights",
376
- tags: ["data", "analysis", "statistics", "insights"],
377
- searchHints: ["analyze data", "generate reports"],
378
- filename: "data-analyst.md",
379
- },
380
- },
381
- {
382
- dir: "agents",
383
- filename: "technical-writer.md",
384
- fileContent: "---\ndescription: Technical writing agent\n---\nYou are a technical writer who creates clear documentation.\n",
385
- stashEntry: {
386
- name: "technical-writer",
387
- type: "agent",
388
- description: "Technical writing agent for creating documentation",
389
- tags: ["documentation", "writing", "technical"],
390
- searchHints: ["write documentation", "create technical docs"],
391
- filename: "technical-writer.md",
392
- },
393
- },
394
- {
395
- dir: "agents",
396
- filename: "frontend-dev.md",
397
- fileContent: "---\ndescription: Frontend development agent\n---\nYou are a frontend developer specializing in React and TypeScript.\n",
398
- stashEntry: {
399
- name: "frontend-dev",
400
- type: "agent",
401
- description: "Frontend development agent specializing in React and TypeScript",
402
- tags: ["frontend", "react", "typescript", "ui"],
403
- searchHints: ["build React component", "frontend development"],
404
- filename: "frontend-dev.md",
405
- },
406
- },
407
- {
408
- dir: "agents",
409
- filename: "dba-specialist.md",
410
- fileContent: "---\ndescription: Database administration specialist\n---\nYou are a DBA specialist who optimizes queries and manages schemas.\n",
411
- stashEntry: {
412
- name: "dba-specialist",
413
- type: "agent",
414
- description: "Database administration specialist for query optimization",
415
- tags: ["database", "sql", "optimization", "dba"],
416
- searchHints: ["optimize database query", "schema management"],
417
- filename: "dba-specialist.md",
418
- },
419
- },
420
- // ── 5 Assets with overlapping terms in different fields (field weighting tests) ──
421
- {
422
- dir: "skills/deploy-helper",
423
- filename: "SKILL.md",
424
- fileContent: "# Deploy Helper\n\nHelps with deployment workflows.\n",
425
- stashEntry: {
426
- name: "deploy-helper",
427
- type: "skill",
428
- description: "Assists with deployment workflow automation and rollbacks",
429
- tags: ["workflow", "automation", "rollback"],
430
- searchHints: ["automate deployment workflow"],
431
- filename: "SKILL.md",
432
- // Name contains "deploy" -- should rank higher for "deploy" than
433
- // assets that only have "deploy" in description or tags
434
- },
435
- },
436
- {
437
- dir: "knowledge",
438
- filename: "deploy-checklist.md",
439
- fileContent: "---\ndescription: Pre-deployment checklist for production releases\n---\n# Pre-deployment Checklist\n\n## Steps\n\n1. Run tests\n2. Review changes\n",
440
- stashEntry: {
441
- name: "deploy-checklist",
442
- type: "knowledge",
443
- description: "Pre-deployment checklist for production releases",
444
- tags: ["checklist", "production", "release"],
445
- filename: "deploy-checklist.md",
446
- // Name also contains "deploy" in name field
447
- },
448
- },
449
- {
450
- dir: "scripts/metrics-collector",
451
- filename: "metrics-collector.sh",
452
- fileContent: "#!/bin/bash\n# Collect deployment metrics from monitoring API\ncurl http://metrics.internal/deploy\n",
453
- stashEntry: {
454
- name: "metrics-collector",
455
- type: "script",
456
- description: "Collect deployment metrics from monitoring infrastructure",
457
- tags: ["metrics", "monitoring", "deploy"],
458
- searchHints: ["collect metrics"],
459
- filename: "metrics-collector.sh",
460
- // "deploy" only in tags and description, NOT in name
461
- },
462
- },
463
- {
464
- dir: "commands",
465
- filename: "health-check.md",
466
- fileContent: "---\ndescription: Run health checks against deployed services\n---\n# Health Check\n\nCheck service health after deployment.\n",
467
- stashEntry: {
468
- name: "health-check",
469
- type: "command",
470
- description: "Run health checks against deployed services",
471
- tags: ["health", "monitoring", "services"],
472
- searchHints: ["check service health", "verify deployment"],
473
- filename: "health-check.md",
474
- // "deploy" only in description and hints, NOT in name or tags
475
- },
476
- },
477
- {
478
- dir: "knowledge",
479
- filename: "monitoring-guide.md",
480
- fileContent: "---\ndescription: Guide to monitoring deployed applications\n---\n# Monitoring Guide\n\n## Alerting\n\n## Dashboards\n\n## Incident Response\n",
481
- stashEntry: {
482
- name: "monitoring-guide",
483
- type: "knowledge",
484
- description: "Guide to monitoring deployed applications and setting up alerts",
485
- tags: ["monitoring", "alerting", "dashboards", "observability"],
486
- filename: "monitoring-guide.md",
487
- // "deploy" only in description content
488
- },
489
- },
490
- ];
491
- // ── Stash creation ───────────────────────────────────────────────────────────
492
/**
 * Materialize the benchmark fixture stash under `<tmpRoot>/stash`.
 *
 * Each asset in ASSETS is written to `<stash>/<asset.dir>/<asset.filename>`
 * and its `stashEntry` is appended to the `.stash.json` of that directory.
 * If a `.stash.json` already exists in a directory, its entries are kept and
 * the new ones are appended after them.
 *
 * @returns {string} Path of the populated stash directory.
 * @throws {SyntaxError} If a pre-existing `.stash.json` is not valid JSON.
 */
function createBenchmarkStash() {
    const stashDir = path.join(tmpRoot, "stash");
    for (const sub of ["skills", "commands", "agents", "knowledge", "scripts"]) {
        fs.mkdirSync(path.join(stashDir, sub), { recursive: true });
    }
    // Load the entries already recorded in a directory's .stash.json, if any.
    // A file without an `entries` array is treated as having no entries
    // instead of crashing on `.push` further down.
    const readExistingEntries = (stashJsonPath) => {
        if (!fs.existsSync(stashJsonPath)) {
            return [];
        }
        const existing = JSON.parse(fs.readFileSync(stashJsonPath, "utf8"));
        return Array.isArray(existing?.entries) ? existing.entries : [];
    };
    // Several assets share a directory (e.g. "knowledge"), so entries are
    // accumulated per directory and each .stash.json is read and written once
    // rather than once per asset.
    const entriesByDir = new Map();
    for (const asset of ASSETS) {
        const dirPath = path.join(stashDir, asset.dir);
        fs.mkdirSync(dirPath, { recursive: true });
        fs.writeFileSync(path.join(dirPath, asset.filename), asset.fileContent);
        let entries = entriesByDir.get(dirPath);
        if (entries === undefined) {
            entries = readExistingEntries(path.join(dirPath, ".stash.json"));
            entriesByDir.set(dirPath, entries);
        }
        entries.push(asset.stashEntry);
    }
    for (const [dirPath, entries] of entriesByDir) {
        fs.writeFileSync(path.join(dirPath, ".stash.json"), JSON.stringify({ entries }, null, 2));
    }
    return stashDir;
}
512
- // ── Git helpers ──────────────────────────────────────────────────────────────
513
/**
 * Describe the git checkout this benchmark file lives in.
 *
 * Runs `git rev-parse` through `Bun.spawnSync` with `import.meta.dir` as the
 * working directory. Each field falls back to "unknown" when the process
 * cannot be spawned, exits with a non-zero status (for example when the
 * directory is not inside a git repository), or prints nothing.
 *
 * @returns {{ branch: string, commit: string }} Current branch name and
 *   abbreviated commit hash, or "unknown" for any field that could not be read.
 */
function gitInfo() {
    const UNKNOWN = "unknown";
    const revParse = (flag) => {
        try {
            const proc = Bun.spawnSync(["git", "rev-parse", flag, "HEAD"], {
                cwd: import.meta.dir,
            });
            // A failing git does not throw; it only reports a non-zero exit
            // code and leaves stdout empty, so both must be checked.
            if (proc.exitCode !== 0) {
                return UNKNOWN;
            }
            const text = proc.stdout.toString().trim();
            return text === "" ? UNKNOWN : text;
        }
        catch {
            // Deliberate best-effort: git metadata is informational only.
            return UNKNOWN;
        }
    };
    return { branch: revParse("--abbrev-ref"), commit: revParse("--short") };
}
531
- // ── Timing utility ───────────────────────────────────────────────────────────
532
/**
 * Run a synchronous callback once and measure how long it took.
 *
 * @param {() => void} fn Work to time; its return value is ignored.
 * @returns {number} Elapsed wall-clock time in milliseconds, rounded to two
 *   decimal places.
 */
function timeMs(fn) {
    const startedAt = performance.now();
    fn();
    const elapsed = performance.now() - startedAt;
    return Math.round(elapsed * 100) / 100;
}
537
/**
 * Run an asynchronous callback once, wait for it to settle, and measure how
 * long that took.
 *
 * @param {() => Promise<unknown>} fn Work to time; its resolved value is ignored.
 * @returns {Promise<number>} Elapsed wall-clock time in milliseconds, rounded
 *   to two decimal places.
 */
async function timeMsAsync(fn) {
    const startedAt = performance.now();
    await fn();
    const elapsed = performance.now() - startedAt;
    return Math.round(elapsed * 100) / 100;
}
542
// Search-quality probes. Each row is
//   [id, query, expectedName, expectedType, aspect]
// and is expanded into an object with those keys.
const QUALITY_QUERIES = [
    // Exact keyword matches
    ["sq-01", "kubernetes", "k8s-deploy", "skill", "exact-keyword-tag"],
    ["sq-02", "database backup", "pg-backup", "script", "exact-keyword-desc-tag"],
    ["sq-03", "test runner", "test-runner", "command", "exact-keyword-name"],
    ["sq-04", "security audit", "security-audit", "skill", "exact-keyword-name"],
    // Partial/prefix matches (S-1 fuzzy search)
    ["sq-05", "kube", "k8s-deploy", "skill", "prefix-alias"],
    ["sq-06", "cert", "ssl-renew", "script", "prefix-tag"],
    // Multi-word queries
    ["sq-07", "ci cd pipeline", "devops-engineer", "agent", "multi-word-tags"],
    ["sq-08", "code quality review", "code-review", "skill", "multi-word-desc"],
    // Natural language intent queries
    ["sq-09", "renew ssl certificate", "ssl-renew", "script", "natural-language"],
    ["sq-10", "deploy to kubernetes", "k8s-deploy", "skill", "natural-language-hint"],
    ["sq-11", "analyze data", "data-analyst", "agent", "natural-language-hint"],
    // Cross-field matches (name match > description match).
    // k8s-deploy is described as having "deploy" in tags/aliases, while
    // deploy-helper has it in its name. Only k8s-deploy is the expected hit here.
    ["sq-12", "deploy", "k8s-deploy", "skill", "field-weighting-name-vs-desc"],
    // Parameter-based discovery (I-2)
    ["sq-13", "docker image", "docker-build", "command", "parameter-discovery"],
    // Tag match specificity.
    // docker-build (command) is expected ahead of docker-clean (script);
    // the original note attributes this to a type boost (command > script).
    ["sq-14", "docker", "docker-build", "command", "tag-match"],
    // Description match
    ["sq-15", "incident response", "runbook-incidents", "knowledge", "desc-match"],
].map(([id, query, expectedName, expectedType, aspect]) => ({
    id,
    query,
    expectedName,
    expectedType,
    aspect,
}));
655
/**
 * Scenario 1: run every QUALITY_QUERIES probe through `akmSearch` and score
 * where the expected asset lands among the non-registry hits.
 *
 * A probe passes when the expected asset is ranked in the top 5.
 *
 * @param {string} _stashDir Unused; accepted so all scenarios share a signature.
 * @returns {Promise<{ mrr: number, recall_at_5: number, recall_at_10: number, cases: object[] }>}
 *   Mean reciprocal rank, recall@5 and recall@10 (each rounded to four
 *   decimals) plus one case record per probe.
 */
async function benchmarkSearchQuality(_stashDir) {
    log(" Running search quality benchmarks...\n");
    const roundedShare = (amount, outOf) => Math.round((amount / outOf) * 10000) / 10000;
    const cases = [];
    let reciprocalRankSum = 0;
    let foundInTop5 = 0;
    let foundInTop10 = 0;
    for (const { id, query, expectedName, aspect } of QUALITY_QUERIES) {
        const { hits } = await akmSearch({ query, source: "stash", limit: 20 });
        const position = hits
            .filter((hit) => hit.type !== "registry")
            .findIndex((hit) => hit.name === expectedName);
        // Ranks are 1-based; null marks a miss.
        const rank = position >= 0 ? position + 1 : null;
        const found = rank !== null;
        reciprocalRankSum += found ? 1 / rank : 0;
        const withinTop5 = found && rank <= 5;
        if (withinTop5) {
            foundInTop5++;
        }
        if (found && rank <= 10) {
            foundInTop10++;
        }
        cases.push({
            id,
            scenario: "search_quality",
            description: `${aspect}: "${query}" -> ${expectedName}`,
            passed: withinTop5,
            metric: rank ?? -1,
            unit: "rank",
            details: found ? `Rank ${rank}` : "MISS (not in results)",
        });
    }
    const total = QUALITY_QUERIES.length;
    return {
        mrr: roundedShare(reciprocalRankSum, total),
        recall_at_5: roundedShare(foundInTop5, total),
        recall_at_10: roundedShare(foundInTop10, total),
        cases,
    };
}
689
- // ── Scenario 2: Search Performance ───────────────────────────────────────────
690
/**
 * Scenario 2: time four `akmSearch` calls against the stash and compare each
 * against a fixed latency budget.
 *
 * @param {string} _stashDir Unused; accepted so all scenarios share a signature.
 * @returns {Promise<{ cold_ms: number, warm_ms: number, fts_only_ms: number, large_result_ms: number, cases: object[] }>}
 *   The four timings in milliseconds plus one case record per timing.
 */
async function benchmarkSearchPerformance(_stashDir) {
    log(" Running search performance benchmarks...\n");
    const cases = [];
    // Time one search, record its case (pass = strictly under budget) and
    // hand the timing back to the caller.
    const measureSearch = async ({ id, description, budgetMs, query, limit }) => {
        const elapsedMs = await timeMsAsync(async () => {
            await akmSearch({ query, source: "stash", limit });
        });
        cases.push({
            id,
            scenario: "search_performance",
            description,
            passed: elapsedMs < budgetMs,
            metric: elapsedMs,
            unit: "ms",
        });
        return elapsedMs;
    };
    const repeatedQuery = "infrastructure automation pipeline";
    // "Cold" here only means the first timing of this query text; the index
    // is already warm from the quality scenario.
    const cold_ms = await measureSearch({
        id: "sp-01",
        description: "Cold search (first query with this text)",
        budgetMs: 500,
        query: repeatedQuery,
        limit: 20,
    });
    // Same query again, so whatever caching exists has been exercised.
    const warm_ms = await measureSearch({
        id: "sp-02",
        description: "Warm search (repeated query)",
        budgetMs: 200,
        query: repeatedQuery,
        limit: 20,
    });
    // Reported as FTS-only; the original note says semantic search is
    // disabled in config, which is not visible from this function.
    const fts_only_ms = await measureSearch({
        id: "sp-03",
        description: "FTS-only search (no embeddings)",
        budgetMs: 200,
        query: "deploy kubernetes containers",
        limit: 20,
    });
    // An empty query is used to pull back a large result set.
    const large_result_ms = await measureSearch({
        id: "sp-04",
        description: "Large result set (all assets)",
        budgetMs: 500,
        query: "",
        limit: 100,
    });
    return { cold_ms, warm_ms, fts_only_ms, large_result_ms, cases };
}
750
- // ── Scenario 3: Indexing Performance ─────────────────────────────────────────
751
/**
 * Scenario 3: time a full index, an incremental index, an FTS rebuild and a
 * utility-score recompute for the given stash.
 *
 * @param {string} stashDir Stash directory passed to `akmIndex`.
 * @returns {Promise<{ full_ms: number, incremental_ms: number, fts_rebuild_ms: number, recompute_utility_ms: number, cases: object[] }>}
 *   The four timings in milliseconds plus one case record per timing.
 */
async function benchmarkIndexingPerformance(stashDir) {
    log(" Running indexing performance benchmarks...\n");
    const cases = [];
    const indexingCase = (id, description, metric, passed) => ({
        id,
        scenario: "indexing_performance",
        description,
        passed,
        metric,
        unit: "ms",
    });
    // Import akmIndex locally to avoid any caching issues
    const { akmIndex } = await import("../src/indexer/indexer.js");
    // Full index (fresh rebuild)
    const full_ms = await timeMsAsync(async () => {
        await akmIndex({ stashDir, full: true });
    });
    cases.push(indexingCase("ip-01", "Fresh full index (empty DB)", full_ms, full_ms < 5000));
    // Incremental index (nothing changed): judged relative to the full run
    // rather than against a fixed budget.
    const incremental_ms = await timeMsAsync(async () => {
        await akmIndex({ stashDir, full: false });
    });
    cases.push({
        ...indexingCase("ip-02", "Incremental index (no changes)", incremental_ms, incremental_ms < full_ms),
        details: `Should be faster than full (${full_ms}ms)`,
    });
    // The remaining two measurements work directly on the index database;
    // it is closed again even if one of them throws.
    const db = openDatabase(getDbPath());
    let fts_rebuild_ms = 0;
    let recompute_utility_ms = 0;
    try {
        fts_rebuild_ms = timeMs(() => {
            rebuildFts(db);
        });
        cases.push(indexingCase("ip-03", "FTS rebuild time", fts_rebuild_ms, fts_rebuild_ms < 500));
        recompute_utility_ms = timeMs(() => {
            recomputeUtilityScores(db);
        });
        cases.push(indexingCase("ip-04", "recomputeUtilityScores time", recompute_utility_ms, recompute_utility_ms < 200));
    }
    finally {
        closeDatabase(db);
    }
    return { full_ms, incremental_ms, fts_rebuild_ms, recompute_utility_ms, cases };
}
822
- // ── Scenario 4: Token Efficiency ─────────────────────────────────────────────
823
/**
 * Scenario 4: compare the serialized size of search and manifest output in
 * several shapes as a proxy for token cost.
 *
 * The CLI flags are not invoked; the reduced outputs are simulated by
 * projecting the hits of one "deploy" search onto smaller field sets.
 *
 * @param {string} stashDir Stash directory passed to `akmManifest`.
 * @returns {Promise<{ summary_savings_pct: number, manifest_bytes_per_asset: number, for_agent_savings_pct: number, jsonl_savings_pct: number, cases: object[] }>}
 *   The four measurements plus one case record per measurement.
 */
async function benchmarkTokenEfficiency(stashDir) {
    log(" Running token efficiency benchmarks...\n");
    const cases = [];
    const jsonByteLength = (value) => Buffer.byteLength(JSON.stringify(value));
    const percentSaved = (baselineBytes, reducedBytes) => Math.round(((baselineBytes - reducedBytes) / baselineBytes) * 100);
    const tokenCase = (fields) => ({ id: fields.id, scenario: "token_efficiency", ...fields });

    // Summary vs full: the summary keeps only name/type/description/ref per hit.
    const fullResult = await akmSearch({ query: "deploy", source: "stash", limit: 10 });
    const fullBytes = jsonByteLength(fullResult);
    const summaryBytes = jsonByteLength({
        ...fullResult,
        hits: fullResult.hits.map(({ name, type, description, ref }) => ({ name, type, description, ref })),
    });
    const summarySavingsPct = percentSaved(fullBytes, summaryBytes);
    cases.push(tokenCase({
        id: "te-01",
        description: "Summary vs full search output savings",
        passed: summarySavingsPct > 10,
        metric: summarySavingsPct,
        unit: "%",
        details: `Full: ${fullBytes}B, Summary: ${summaryBytes}B`,
    }));

    // Manifest output size per N assets
    const { akmManifest } = await import("../src/indexer/manifest.js");
    const manifest = await akmManifest({ stashDir });
    const manifestBytes = jsonByteLength(manifest);
    const assetCount = manifest.entries.length;
    // Guard against dividing by zero for an empty manifest.
    const bytesPerAsset = assetCount > 0 ? Math.round(manifestBytes / assetCount) : 0;
    cases.push(tokenCase({
        id: "te-02",
        description: "Manifest bytes per asset",
        passed: bytesPerAsset < 200,
        metric: bytesPerAsset,
        unit: "bytes/asset",
        details: `Total: ${manifestBytes}B for ${assetCount} assets`,
    }));

    // --for-agent vs normal: the agent projection keeps six fields per hit.
    const normalHits = fullResult.hits;
    const jsonBytes = jsonByteLength(normalHits);
    const forAgentBytes = jsonByteLength(normalHits.map((hit) => ({
        type: hit.type,
        name: hit.name,
        ref: hit.ref,
        description: hit.description,
        action: hit.action,
        score: hit.score,
    })));
    const forAgentSavings = percentSaved(jsonBytes, forAgentBytes);
    cases.push(tokenCase({
        id: "te-03",
        description: "--for-agent output size savings vs normal",
        passed: forAgentSavings > 10,
        metric: forAgentSavings,
        unit: "%",
    }));

    // JSONL (one hit per line) vs a single JSON array of the same hits.
    const jsonlBytes = Buffer.byteLength(normalHits.map((hit) => JSON.stringify(hit)).join("\n"));
    const jsonlSavingsPct = percentSaved(jsonBytes, jsonlBytes);
    cases.push(tokenCase({
        id: "te-04",
        description: "JSONL vs JSON format size",
        // Informational only: the difference can go either way, so this
        // case always passes and just reports the number.
        passed: true,
        metric: jsonlSavingsPct,
        unit: "%",
        details: `JSON: ${jsonBytes}B, JSONL: ${jsonlBytes}B`,
    }));

    return {
        summary_savings_pct: summarySavingsPct,
        manifest_bytes_per_asset: bytesPerAsset,
        for_agent_savings_pct: forAgentSavings,
        jsonl_savings_pct: jsonlSavingsPct,
        cases,
    };
}
919
- // ── Scenario 5: Utility Scoring ──────────────────────────────────────────────
920
/**
 * Scenario 5: exercise the usage-based utility scoring in four steps
 * (fresh baseline, boost after usage, recency timestamps, score cap) and
 * then wipe the usage data again so later scenarios start clean.
 *
 * The steps run in order against the same index database and each one
 * depends on the state left by the previous step.
 *
 * @param {string} _stashDir Unused; accepted so all scenarios share a signature.
 * @returns {Promise<{ baseline_no_usage: boolean, boost_applied: boolean, decay_works: boolean, cap_works: boolean, cases: object[] }>}
 *   One flag per step plus the four case records.
 */
async function benchmarkUtilityScoring(_stashDir) {
    log(" Running utility scoring benchmarks...\n");
    const cases = [];
    const dbPath = getDbPath();
    // Open the index database for the duration of `work` and always close it.
    const withDb = (work) => {
        const db = openDatabase(dbPath);
        try {
            return work(db);
        }
        finally {
            closeDatabase(db);
        }
    };
    const isoNow = () => new Date().toISOString();
    const searchLocalDeployHits = async () => {
        const { hits } = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
        return hits.filter((hit) => hit.type !== "registry");
    };
    const wipeUsageData = (db) => {
        db.exec("DELETE FROM usage_events");
        db.exec("DELETE FROM utility_scores");
    };

    // Test 1: Fresh index with no usage data — no hit should report a usage boost.
    const freshHits = await searchLocalDeployHits();
    const hasUtilityBoost = freshHits.some((hit) => hit.whyMatched?.includes("usage history boost"));
    const baselineClean = !hasUtilityBoost;
    cases.push({
        id: "us-01",
        scenario: "utility_scoring",
        description: "Fresh index has no utility boosts",
        passed: baselineClean,
        metric: hasUtilityBoost ? 1 : 0,
        unit: "boosted_count",
    });

    // Test 2: After simulated usage events the entry gets a positive utility score.
    const boostApplied = withDb((db) => {
        const deployEntries = db
            .prepare("SELECT id, entry_key FROM entries WHERE entry_key LIKE '%deploy%' LIMIT 2")
            .all();
        if (deployEntries.length < 2) {
            return false;
        }
        const boostedId = deployEntries[0].id;
        for (let round = 0; round < 10; round++) {
            recordUsageEvent(db, { eventType: "show", entryId: boostedId, timestamp: isoNow() });
            recordUsageEvent(db, { eventType: "search", entryId: boostedId, timestamp: isoNow() });
        }
        recomputeUtilityScores(db);
        const row = db.prepare("SELECT utility FROM utility_scores WHERE entry_id = ?").get(boostedId);
        return (row?.utility ?? 0) > 0;
    });
    cases.push({
        id: "us-02",
        scenario: "utility_scoring",
        description: "Usage events generate positive utility score",
        passed: boostApplied,
    });

    // Test 3: Recency — one entry is used now, another 60 days ago.
    // Per the original note, decay itself is applied at search time rather
    // than at recompute time, so this step only verifies that the stored
    // last_used_at timestamps are ordered as expected.
    const decayWorks = withDb((db) => {
        const firstTwo = db.prepare("SELECT id FROM entries LIMIT 2").all();
        if (firstTwo.length < 2) {
            return false;
        }
        const recentId = firstTwo[0].id;
        const oldId = firstTwo[1].id;
        wipeUsageData(db);
        recordUsageEvent(db, { eventType: "show", entryId: recentId, timestamp: isoNow() });
        recordUsageEvent(db, { eventType: "search", entryId: recentId, timestamp: isoNow() });
        const sixtyDaysAgo = new Date();
        sixtyDaysAgo.setDate(sixtyDaysAgo.getDate() - 60);
        recordUsageEvent(db, { eventType: "show", entryId: oldId, timestamp: sixtyDaysAgo.toISOString() });
        recordUsageEvent(db, { eventType: "search", entryId: oldId, timestamp: sixtyDaysAgo.toISOString() });
        recomputeUtilityScores(db);
        const recentScore = db
            .prepare("SELECT utility, last_used_at FROM utility_scores WHERE entry_id = ?")
            .get(recentId);
        const oldScore = db
            .prepare("SELECT utility, last_used_at FROM utility_scores WHERE entry_id = ?")
            .get(oldId);
        if (!recentScore || !oldScore) {
            return false;
        }
        return new Date(recentScore.last_used_at).getTime() > new Date(oldScore.last_used_at).getTime();
    });
    cases.push({
        id: "us-03",
        scenario: "utility_scoring",
        description: "Recency decay: recent last_used_at vs old",
        passed: decayWorks,
    });

    // Test 4: Utility cap — an extreme stored utility must not blow up search scores.
    withDb((db) => {
        const firstTwo = db.prepare("SELECT id FROM entries LIMIT 2").all();
        if (firstTwo.length < 2) {
            return;
        }
        // Extreme utility for the first entry.
        upsertUtilityScore(db, firstTwo[0].id, {
            utility: 100.0,
            showCount: 10000,
            searchCount: 10000,
            selectRate: 1.0,
            lastUsedAt: isoNow(),
        });
        // Zero utility for the second entry.
        upsertUtilityScore(db, firstTwo[1].id, {
            utility: 0,
            showCount: 0,
            searchCount: 0,
            selectRate: 0,
        });
    });
    let capWorks = false;
    const cappedHits = await searchLocalDeployHits();
    if (cappedHits.length >= 2) {
        const maxScore = cappedHits[0].score ?? 0;
        const minScore = cappedHits[cappedHits.length - 1].score ?? 0;
        const ratio = minScore > 0 ? maxScore / minScore : 0;
        // Base scores differ between hits, so the top/bottom ratio can exceed
        // the intended 1.5x boost cap; this is a generous sanity bound only.
        capWorks = ratio < 10;
    }
    cases.push({
        id: "us-04",
        scenario: "utility_scoring",
        description: "Utility cap prevents extreme score inflation",
        passed: capWorks,
    });

    // Clean up utility data for other tests
    withDb(wipeUsageData);

    return {
        baseline_no_usage: baselineClean, // true means no boost was reported, which is correct
        boost_applied: boostApplied,
        decay_works: decayWorks,
        cap_works: capWorks,
        cases,
    };
}
1083
- // ── Scenario 6: Feature Correctness ──────────────────────────────────────────
1084
- async function benchmarkFeatureCorrectness(_stashDir) {
1085
- log(" Running feature correctness benchmarks...\n");
1086
- const cases = [];
1087
- // Test 1: Fuzzy/prefix fallback triggers only when exact match returns 0
1088
- let fuzzyWorks = false;
1089
- {
1090
- // "certb" has no exact FTS match but prefix "certb*" should match "certbot" (tag of ssl-renew)
1091
- const exactResult = await akmSearch({ query: "certb", source: "stash", limit: 10 });
1092
- const exactHits = exactResult.hits.filter((h) => h.type !== "registry");
1093
- // FTS5 porter stemmer + prefix fallback should find ssl-renew via "certbot" tag
1094
- fuzzyWorks = exactHits.some((h) => h.name === "ssl-renew");
1095
- cases.push({
1096
- id: "fc-01",
1097
- scenario: "feature_correctness",
1098
- description: "Fuzzy/prefix fallback finds 'ssl-renew' for query 'certb'",
1099
- passed: fuzzyWorks,
1100
- details: fuzzyWorks ? "Found via prefix expansion" : `Got: ${exactHits.map((h) => h.name).join(", ") || "none"}`,
1101
- });
1102
- }
1103
- // Test 2: Field weighting — name match ranks higher than description match
1104
- let fieldWeightingCorrect = false;
1105
- {
1106
- // Query "deploy" — assets with "deploy" in their name should rank above
1107
- // those that only have "deploy" in description/tags
1108
- const result = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
1109
- const hits = result.hits.filter((h) => h.type !== "registry");
1110
- // Assets with "deploy" in name or aliases: k8s-deploy, deploy-helper, deploy-status, deploy-checklist
1111
- const nameMatchAssets = ["k8s-deploy", "deploy-helper", "deploy-status", "deploy-checklist"];
1112
- // Assets with "deploy" NOT in name but in desc/tags: metrics-collector, health-check, monitoring-guide
1113
- const nonNameMatchAssets = ["metrics-collector", "health-check", "monitoring-guide"];
1114
- if (hits.length > 0) {
1115
- const nameRanks = nameMatchAssets.map((n) => hits.findIndex((h) => h.name === n)).filter((i) => i >= 0);
1116
- const nonNameRanks = nonNameMatchAssets.map((n) => hits.findIndex((h) => h.name === n)).filter((i) => i >= 0);
1117
- if (nameRanks.length > 0 && nonNameRanks.length > 0) {
1118
- const avgNameRank = nameRanks.reduce((s, r) => s + r, 0) / nameRanks.length;
1119
- const avgNonNameRank = nonNameRanks.reduce((s, r) => s + r, 0) / nonNameRanks.length;
1120
- // Name matches should on average rank higher (lower index) than non-name matches
1121
- fieldWeightingCorrect = avgNameRank < avgNonNameRank;
1122
- }
1123
- }
1124
- cases.push({
1125
- id: "fc-02",
1126
- scenario: "feature_correctness",
1127
- description: "Field weighting: name match ranks higher than desc-only match",
1128
- passed: fieldWeightingCorrect,
1129
- details: `Top 5: ${hits
1130
- .slice(0, 5)
1131
- .map((h) => h.name)
1132
- .join(", ")}`,
1133
- });
1134
- }
1135
- // Test 3: Parameter extraction — commands with $ARGUMENTS detected
1136
- let paramExtraction = false;
1137
- {
1138
- const { extractCommandParameters, extractScriptParameters } = await import("../src/indexer/metadata.js");
1139
- const cmdTemplate = "Run $ARGUMENTS tests and report results.\n$1 is the target directory.";
1140
- const cmdParams = extractCommandParameters(cmdTemplate);
1141
- const hasArguments = cmdParams?.some((p) => p.name === "ARGUMENTS") ?? false;
1142
- const hasDollar1 = cmdParams?.some((p) => p.name === "$1") ?? false;
1143
- const scriptContent = '#!/bin/bash\n# @param {string} host - Target hostname\n# @param {number} port - Port number\nssh "$1" -p "$2"\n';
1144
- const scriptParams = extractScriptParameters("/tmp/test.sh", scriptContent);
1145
- const hasHost = scriptParams?.some((p) => p.name === "host") ?? false;
1146
- const hasPort = scriptParams?.some((p) => p.name === "port") ?? false;
1147
- paramExtraction = hasArguments && hasDollar1 && hasHost && hasPort;
1148
- cases.push({
1149
- id: "fc-03",
1150
- scenario: "feature_correctness",
1151
- description: "Parameter extraction: $ARGUMENTS, $1, and @param",
1152
- passed: paramExtraction,
1153
- details: `CMD: ARGUMENTS=${hasArguments}, $1=${hasDollar1}; Script: host=${hasHost}, port=${hasPort}`,
1154
- });
1155
- }
1156
- // Test 4: akm info returns valid capability advertisement
1157
- let infoValid = false;
1158
- {
1159
- const info = assembleInfo();
1160
- infoValid =
1161
- info.schemaVersion === 1 &&
1162
- typeof info.version === "string" &&
1163
- Array.isArray(info.assetTypes) &&
1164
- info.assetTypes.length > 0 &&
1165
- Array.isArray(info.searchModes) &&
1166
- info.searchModes.includes("fts") &&
1167
- typeof info.indexStats.entryCount === "number";
1168
- cases.push({
1169
- id: "fc-04",
1170
- scenario: "feature_correctness",
1171
- description: "akm info returns valid capability advertisement",
1172
- passed: infoValid,
1173
- details: `version=${info.version}, types=${info.assetTypes.length}, modes=${info.searchModes.join(",")}`,
1174
- });
1175
- }
1176
- // Test 5: Feedback/usage events record correctly
1177
- let feedbackRecords = false;
1178
- {
1179
- const dbPath = getDbPath();
1180
- const db = openDatabase(dbPath);
1181
- try {
1182
- const countBefore = db.prepare("SELECT COUNT(*) AS cnt FROM usage_events").get().cnt;
1183
- insertUsageEvent(db, {
1184
- event_type: "feedback",
1185
- entry_ref: "skill:test-feedback",
1186
- signal: "positive",
1187
- metadata: JSON.stringify({ source: "benchmark" }),
1188
- });
1189
- const countAfter = db.prepare("SELECT COUNT(*) AS cnt FROM usage_events").get().cnt;
1190
- feedbackRecords = countAfter === countBefore + 1;
1191
- // Verify the event was written correctly
1192
- const lastEvent = db
1193
- .prepare("SELECT event_type, entry_ref, signal FROM usage_events ORDER BY id DESC LIMIT 1")
1194
- .get();
1195
- feedbackRecords =
1196
- feedbackRecords &&
1197
- lastEvent?.event_type === "feedback" &&
1198
- lastEvent?.entry_ref === "skill:test-feedback" &&
1199
- lastEvent?.signal === "positive";
1200
- }
1201
- finally {
1202
- closeDatabase(db);
1203
- }
1204
- cases.push({
1205
- id: "fc-05",
1206
- scenario: "feature_correctness",
1207
- description: "Feedback events are recorded correctly in usage_events",
1208
- passed: feedbackRecords,
1209
- });
1210
- }
1211
- // Test 6: buildSearchFields produces per-field text
1212
- {
1213
- const entry = {
1214
- name: "test-entry",
1215
- type: "skill",
1216
- description: "A test skill",
1217
- tags: ["alpha", "beta"],
1218
- searchHints: ["hint one"],
1219
- aliases: ["test alt"],
1220
- };
1221
- const fields = buildSearchFields(entry);
1222
- const nameOk = fields.name.includes("test") && fields.name.includes("entry");
1223
- const descOk = fields.description.includes("test skill");
1224
- const tagsOk = fields.tags.includes("alpha") && fields.tags.includes("beta");
1225
- const hintsOk = fields.hints.includes("hint one");
1226
- const allFieldsPresent = nameOk && descOk && tagsOk && hintsOk;
1227
- cases.push({
1228
- id: "fc-06",
1229
- scenario: "feature_correctness",
1230
- description: "buildSearchFields produces correct per-field text",
1231
- passed: allFieldsPresent,
1232
- details: `name=${nameOk}, desc=${descOk}, tags=${tagsOk}, hints=${hintsOk}`,
1233
- });
1234
- }
1235
- // Test 7: sanitizeFtsQuery handles special characters safely
1236
- {
1237
- const { sanitizeFtsQuery } = await import("../src/indexer/db.js");
1238
- const dangerous = 'code-review "OR 1=1" NEAR(test,5)';
1239
- const sanitized = sanitizeFtsQuery(dangerous);
1240
- const noQuotes = !sanitized.includes('"');
1241
- const noParens = !sanitized.includes("(") && !sanitized.includes(")");
1242
- const noNear = !sanitized.includes("NEAR");
1243
- const safe = noQuotes && noParens && noNear && sanitized.length > 0;
1244
- cases.push({
1245
- id: "fc-07",
1246
- scenario: "feature_correctness",
1247
- description: "sanitizeFtsQuery neutralizes dangerous FTS5 syntax",
1248
- passed: safe,
1249
- details: `Input: "${dangerous}" -> "${sanitized}"`,
1250
- });
1251
- }
1252
- // Test 8: Empty query returns all entries
1253
- {
1254
- const result = await akmSearch({ query: "", source: "stash", limit: 100 });
1255
- const localHits = result.hits.filter((h) => h.type !== "registry");
1256
- // Should return all or most of the 35 assets
1257
- const allEntriesReturned = localHits.length >= 25;
1258
- cases.push({
1259
- id: "fc-08",
1260
- scenario: "feature_correctness",
1261
- description: "Empty query returns all assets",
1262
- passed: allEntriesReturned,
1263
- metric: localHits.length,
1264
- unit: "assets",
1265
- });
1266
- }
1267
- // Test 9: Type filtering works
1268
- {
1269
- const result = await akmSearch({ query: "", type: "skill", source: "stash", limit: 50 });
1270
- const localHits = result.hits.filter((h) => h.type !== "registry");
1271
- const allSkills = localHits.every((h) => h.type === "skill");
1272
- const hasMultiple = localHits.length >= 3;
1273
- cases.push({
1274
- id: "fc-09",
1275
- scenario: "feature_correctness",
1276
- description: "Type filtering returns only matching types",
1277
- passed: allSkills && hasMultiple,
1278
- metric: localHits.length,
1279
- unit: "skills",
1280
- details: allSkills ? "All results are skills" : "Mixed types found",
1281
- });
1282
- }
1283
- // Test 10: Deterministic tiebreaker — same query returns same order
1284
- {
1285
- const r1 = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
1286
- const r2 = await akmSearch({ query: "deploy", source: "stash", limit: 20 });
1287
- const h1 = r1.hits.filter((h) => h.type !== "registry").map((h) => h.name);
1288
- const h2 = r2.hits.filter((h) => h.type !== "registry").map((h) => h.name);
1289
- const deterministic = JSON.stringify(h1) === JSON.stringify(h2);
1290
- cases.push({
1291
- id: "fc-10",
1292
- scenario: "feature_correctness",
1293
- description: "Search results are deterministic (same order for same query)",
1294
- passed: deterministic,
1295
- });
1296
- }
1297
- return {
1298
- fuzzy_works: fuzzyWorks,
1299
- field_weighting_correct: fieldWeightingCorrect,
1300
- parameter_extraction: paramExtraction,
1301
- info_valid: infoValid,
1302
- feedback_records: feedbackRecords,
1303
- cases,
1304
- };
1305
- }
1306
- // ── Main benchmark orchestrator ──────────────────────────────────────────────
1307
- async function runBenchmarkSuite() {
1308
- const { branch, commit } = gitInfo();
1309
- log("=== akm Comprehensive Benchmark Suite ===\n\n");
1310
- // 1. Create stash and index
1311
- log("Setting up benchmark stash...\n");
1312
- const stashDir = createBenchmarkStash();
1313
- process.env.AKM_STASH_DIR = stashDir;
1314
- saveConfig({ semanticSearchMode: "off", registries: [] });
1315
- const { akmIndex } = await import("../src/indexer/indexer.js");
1316
- const indexResult = await akmIndex({ stashDir, full: true });
1317
- log(` Indexed ${indexResult.totalEntries} entries in ${indexResult.timing?.totalMs ?? "?"}ms\n\n`);
1318
- // 2. Run all scenarios
1319
- const searchQuality = await benchmarkSearchQuality(stashDir);
1320
- const searchPerf = await benchmarkSearchPerformance(stashDir);
1321
- const indexPerf = await benchmarkIndexingPerformance(stashDir);
1322
- const tokenEff = await benchmarkTokenEfficiency(stashDir);
1323
- const utilScoring = await benchmarkUtilityScoring(stashDir);
1324
- const featureCorr = await benchmarkFeatureCorrectness(stashDir);
1325
- // 3. Aggregate results
1326
- const allCases = [
1327
- ...searchQuality.cases,
1328
- ...searchPerf.cases,
1329
- ...indexPerf.cases,
1330
- ...tokenEff.cases,
1331
- ...utilScoring.cases,
1332
- ...featureCorr.cases,
1333
- ];
1334
- const totalCases = allCases.length;
1335
- const passedCount = allCases.filter((c) => c.passed).length;
1336
- const failedCount = totalCases - passedCount;
1337
- const output = {
1338
- branch,
1339
- commit,
1340
- timestamp: new Date().toISOString(),
1341
- asset_count: ASSETS.length,
1342
- scenarios: {
1343
- search_quality: {
1344
- mrr: searchQuality.mrr,
1345
- recall_at_5: searchQuality.recall_at_5,
1346
- recall_at_10: searchQuality.recall_at_10,
1347
- cases: searchQuality.cases,
1348
- },
1349
- search_performance: {
1350
- cold_ms: searchPerf.cold_ms,
1351
- warm_ms: searchPerf.warm_ms,
1352
- fts_only_ms: searchPerf.fts_only_ms,
1353
- large_result_ms: searchPerf.large_result_ms,
1354
- cases: searchPerf.cases,
1355
- },
1356
- indexing_performance: {
1357
- full_ms: indexPerf.full_ms,
1358
- incremental_ms: indexPerf.incremental_ms,
1359
- fts_rebuild_ms: indexPerf.fts_rebuild_ms,
1360
- recompute_utility_ms: indexPerf.recompute_utility_ms,
1361
- cases: indexPerf.cases,
1362
- },
1363
- token_efficiency: {
1364
- summary_savings_pct: tokenEff.summary_savings_pct,
1365
- manifest_bytes_per_asset: tokenEff.manifest_bytes_per_asset,
1366
- for_agent_savings_pct: tokenEff.for_agent_savings_pct,
1367
- jsonl_savings_pct: tokenEff.jsonl_savings_pct,
1368
- cases: tokenEff.cases,
1369
- },
1370
- utility_scoring: {
1371
- baseline_no_usage: utilScoring.baseline_no_usage,
1372
- boost_applied: utilScoring.boost_applied,
1373
- decay_works: utilScoring.decay_works,
1374
- cap_works: utilScoring.cap_works,
1375
- cases: utilScoring.cases,
1376
- },
1377
- feature_correctness: {
1378
- fuzzy_works: featureCorr.fuzzy_works,
1379
- field_weighting_correct: featureCorr.field_weighting_correct,
1380
- parameter_extraction: featureCorr.parameter_extraction,
1381
- info_valid: featureCorr.info_valid,
1382
- feedback_records: featureCorr.feedback_records,
1383
- cases: featureCorr.cases,
1384
- },
1385
- },
1386
- summary: {
1387
- total_cases: totalCases,
1388
- passed: passedCount,
1389
- failed: failedCount,
1390
- },
1391
- };
1392
- // 4. Output JSON
1393
- console.log(JSON.stringify(output, null, 2));
1394
- // 5. Human-readable summary
1395
- if (!jsonOnly) {
1396
- process.stderr.write("\n=== Benchmark Summary ===\n");
1397
- process.stderr.write(`Branch: ${branch} (${commit})\n`);
1398
- process.stderr.write(`Assets: ${ASSETS.length}\n\n`);
1399
- process.stderr.write(`Search Quality:\n`);
1400
- process.stderr.write(` MRR: ${searchQuality.mrr}\n`);
1401
- process.stderr.write(` Recall@5: ${searchQuality.recall_at_5}\n`);
1402
- process.stderr.write(` Recall@10: ${searchQuality.recall_at_10}\n\n`);
1403
- process.stderr.write(`Search Performance:\n`);
1404
- process.stderr.write(` Cold: ${searchPerf.cold_ms}ms\n`);
1405
- process.stderr.write(` Warm: ${searchPerf.warm_ms}ms\n`);
1406
- process.stderr.write(` FTS-only: ${searchPerf.fts_only_ms}ms\n\n`);
1407
- process.stderr.write(`Indexing Performance:\n`);
1408
- process.stderr.write(` Full: ${indexPerf.full_ms}ms\n`);
1409
- process.stderr.write(` Incr: ${indexPerf.incremental_ms}ms\n`);
1410
- process.stderr.write(` FTS rebuild: ${indexPerf.fts_rebuild_ms}ms\n\n`);
1411
- process.stderr.write(`Token Efficiency:\n`);
1412
- process.stderr.write(` Summary savings: ${tokenEff.summary_savings_pct}%\n`);
1413
- process.stderr.write(` Manifest: ${tokenEff.manifest_bytes_per_asset} bytes/asset\n\n`);
1414
- process.stderr.write(`Utility Scoring:\n`);
1415
- process.stderr.write(` Baseline: ${utilScoring.baseline_no_usage ? "PASS" : "FAIL"}\n`);
1416
- process.stderr.write(` Boost: ${utilScoring.boost_applied ? "PASS" : "FAIL"}\n`);
1417
- process.stderr.write(` Decay: ${utilScoring.decay_works ? "PASS" : "FAIL"}\n`);
1418
- process.stderr.write(` Cap: ${utilScoring.cap_works ? "PASS" : "FAIL"}\n\n`);
1419
- process.stderr.write(`Feature Correctness:\n`);
1420
- process.stderr.write(` Fuzzy: ${featureCorr.fuzzy_works ? "PASS" : "FAIL"}\n`);
1421
- process.stderr.write(` Weighting: ${featureCorr.field_weighting_correct ? "PASS" : "FAIL"}\n`);
1422
- process.stderr.write(` Params: ${featureCorr.parameter_extraction ? "PASS" : "FAIL"}\n`);
1423
- process.stderr.write(` Info: ${featureCorr.info_valid ? "PASS" : "FAIL"}\n`);
1424
- process.stderr.write(` Feedback: ${featureCorr.feedback_records ? "PASS" : "FAIL"}\n\n`);
1425
- process.stderr.write(`Total: ${passedCount}/${totalCases} passed, ${failedCount} failed\n`);
1426
- if (failedCount > 0) {
1427
- process.stderr.write("\nFailed cases:\n");
1428
- for (const c of allCases.filter((c) => !c.passed)) {
1429
- process.stderr.write(` [FAIL] ${c.id}: ${c.description}${c.details ? ` — ${c.details}` : ""}${c.metric !== undefined ? ` (${c.metric}${c.unit ? ` ${c.unit}` : ""})` : ""}\n`);
1430
- }
1431
- }
1432
- }
1433
- return output;
1434
- }
1435
- // ── Entry point ──────────────────────────────────────────────────────────────
1436
- try {
1437
- await runBenchmarkSuite();
1438
- }
1439
- finally {
1440
- cleanup();
1441
- }