akm-cli 0.7.0-rc1 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314) hide show
  1. package/dist/{src/cli.js → cli.js} +100 -16
  2. package/dist/{src/commands → commands}/config-cli.js +42 -0
  3. package/dist/{src/commands → commands}/history.js +78 -7
  4. package/dist/{src/commands → commands}/registry-search.js +69 -6
  5. package/dist/{src/commands → commands}/search.js +30 -3
  6. package/dist/{src/commands → commands}/show.js +29 -0
  7. package/dist/{src/commands → commands}/source-add.js +5 -1
  8. package/dist/{src/commands → commands}/source-manage.js +7 -1
  9. package/dist/{src/core → core}/config.js +28 -0
  10. package/dist/{src/indexer → indexer}/db-search.js +1 -0
  11. package/dist/{src/indexer → indexer}/indexer.js +16 -2
  12. package/dist/{src/indexer → indexer}/matchers.js +1 -1
  13. package/dist/{src/indexer → indexer}/search-source.js +4 -2
  14. package/dist/{src/integrations → integrations}/agent/profiles.js +1 -1
  15. package/dist/{src/integrations → integrations}/agent/spawn.js +67 -16
  16. package/dist/{src/integrations → integrations}/github.js +9 -3
  17. package/dist/{src/llm → llm}/embedders/remote.js +37 -3
  18. package/dist/{src/output → output}/cli-hints.js +15 -2
  19. package/dist/{src/output → output}/renderers.js +3 -1
  20. package/dist/{src/output → output}/shapes.js +8 -1
  21. package/dist/{src/output → output}/text.js +156 -3
  22. package/dist/{src/registry → registry}/build-index.js +5 -4
  23. package/dist/{src/registry → registry}/providers/static-index.js +3 -1
  24. package/dist/{src/setup → setup}/setup.js +9 -0
  25. package/dist/{src/wiki → wiki}/wiki.js +54 -6
  26. package/dist/{src/workflows → workflows}/runs.js +37 -3
  27. package/package.json +8 -8
  28. package/dist/tests/add-website-source.test.js +0 -119
  29. package/dist/tests/agent/agent-config-loader.test.js +0 -70
  30. package/dist/tests/agent/agent-config.test.js +0 -221
  31. package/dist/tests/agent/agent-detect.test.js +0 -100
  32. package/dist/tests/agent/agent-spawn.test.js +0 -234
  33. package/dist/tests/agent-output.test.js +0 -186
  34. package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +0 -103
  35. package/dist/tests/architecture/agent-spawn-seam.test.js +0 -193
  36. package/dist/tests/architecture/llm-stateless-seam.test.js +0 -112
  37. package/dist/tests/asset-ref.test.js +0 -192
  38. package/dist/tests/asset-registry.test.js +0 -103
  39. package/dist/tests/asset-spec.test.js +0 -241
  40. package/dist/tests/bench/attribution.test.js +0 -995
  41. package/dist/tests/bench/cleanup-sigint.test.js +0 -83
  42. package/dist/tests/bench/cleanup.js +0 -203
  43. package/dist/tests/bench/cleanup.test.js +0 -166
  44. package/dist/tests/bench/cli.js +0 -683
  45. package/dist/tests/bench/cli.test.js +0 -177
  46. package/dist/tests/bench/compare.test.js +0 -556
  47. package/dist/tests/bench/corpus.js +0 -314
  48. package/dist/tests/bench/corpus.test.js +0 -258
  49. package/dist/tests/bench/driver.js +0 -346
  50. package/dist/tests/bench/driver.test.js +0 -443
  51. package/dist/tests/bench/evolve-metrics.js +0 -179
  52. package/dist/tests/bench/evolve-metrics.test.js +0 -187
  53. package/dist/tests/bench/evolve.js +0 -580
  54. package/dist/tests/bench/evolve.test.js +0 -616
  55. package/dist/tests/bench/failure-modes.test.js +0 -300
  56. package/dist/tests/bench/feedback-integrity.test.js +0 -456
  57. package/dist/tests/bench/leakage.test.js +0 -125
  58. package/dist/tests/bench/learning-curve.test.js +0 -133
  59. package/dist/tests/bench/metrics.js +0 -2319
  60. package/dist/tests/bench/metrics.test.js +0 -1144
  61. package/dist/tests/bench/no-os-tmpdir-invariant.test.js +0 -43
  62. package/dist/tests/bench/report.js +0 -1821
  63. package/dist/tests/bench/report.test.js +0 -989
  64. package/dist/tests/bench/runner.js +0 -536
  65. package/dist/tests/bench/runner.test.js +0 -958
  66. package/dist/tests/bench/search-bridge.test.js +0 -331
  67. package/dist/tests/bench/tmp.js +0 -41
  68. package/dist/tests/bench/trajectory.js +0 -116
  69. package/dist/tests/bench/trajectory.test.js +0 -127
  70. package/dist/tests/bench/verifier.js +0 -109
  71. package/dist/tests/bench/verifier.test.js +0 -118
  72. package/dist/tests/bench/workflow-evaluator.js +0 -557
  73. package/dist/tests/bench/workflow-evaluator.test.js +0 -421
  74. package/dist/tests/bench/workflow-spec.js +0 -358
  75. package/dist/tests/bench/workflow-spec.test.js +0 -363
  76. package/dist/tests/bench/workflow-trace.js +0 -438
  77. package/dist/tests/bench/workflow-trace.test.js +0 -254
  78. package/dist/tests/benchmark-search-quality.js +0 -536
  79. package/dist/tests/benchmark-suite.js +0 -1441
  80. package/dist/tests/capture-cli.test.js +0 -112
  81. package/dist/tests/cli-errors.test.js +0 -203
  82. package/dist/tests/commands/events.test.js +0 -370
  83. package/dist/tests/commands/history.test.js +0 -223
  84. package/dist/tests/commands/import.test.js +0 -103
  85. package/dist/tests/commands/proposal-cli.test.js +0 -209
  86. package/dist/tests/commands/reflect-propose-cli.test.js +0 -333
  87. package/dist/tests/commands/remember.test.js +0 -97
  88. package/dist/tests/commands/scope-flags.test.js +0 -300
  89. package/dist/tests/commands/search.test.js +0 -537
  90. package/dist/tests/commands/show-indexer-parity.test.js +0 -117
  91. package/dist/tests/commands/show.test.js +0 -294
  92. package/dist/tests/common.test.js +0 -266
  93. package/dist/tests/completions.test.js +0 -142
  94. package/dist/tests/config-cli.test.js +0 -193
  95. package/dist/tests/config-llm-features.test.js +0 -139
  96. package/dist/tests/config.test.js +0 -544
  97. package/dist/tests/contracts/migration-baseline.test.js +0 -43
  98. package/dist/tests/contracts/reflect-propose-envelope.test.js +0 -139
  99. package/dist/tests/contracts/spec-helpers.js +0 -46
  100. package/dist/tests/contracts/v1-spec-section-11-proposal-queue.test.js +0 -228
  101. package/dist/tests/contracts/v1-spec-section-12-agent-config.test.js +0 -56
  102. package/dist/tests/contracts/v1-spec-section-13-lesson-type.test.js +0 -34
  103. package/dist/tests/contracts/v1-spec-section-14-llm-features.test.js +0 -94
  104. package/dist/tests/contracts/v1-spec-section-4-1-asset-types.test.js +0 -39
  105. package/dist/tests/contracts/v1-spec-section-4-2-quality-rules.test.js +0 -44
  106. package/dist/tests/contracts/v1-spec-section-5-configuration.test.js +0 -47
  107. package/dist/tests/contracts/v1-spec-section-6-orchestration.test.js +0 -40
  108. package/dist/tests/contracts/v1-spec-section-7-module-layout.test.js +0 -58
  109. package/dist/tests/contracts/v1-spec-section-8-extension-points.test.js +0 -34
  110. package/dist/tests/contracts/v1-spec-section-9-4-cli-surface.test.js +0 -75
  111. package/dist/tests/contracts/v1-spec-section-9-7-llm-agent-boundary.test.js +0 -36
  112. package/dist/tests/core/write-source.test.js +0 -366
  113. package/dist/tests/curate-command.test.js +0 -87
  114. package/dist/tests/db-scoring.test.js +0 -201
  115. package/dist/tests/db.test.js +0 -654
  116. package/dist/tests/distill-cli-flag.test.js +0 -208
  117. package/dist/tests/distill.test.js +0 -515
  118. package/dist/tests/docker-install.test.js +0 -120
  119. package/dist/tests/e2e.test.js +0 -1398
  120. package/dist/tests/embedder.test.js +0 -340
  121. package/dist/tests/embedding-model-config.test.js +0 -379
  122. package/dist/tests/feedback-command.test.js +0 -172
  123. package/dist/tests/file-context.test.js +0 -552
  124. package/dist/tests/fixtures/scripts/git/summarize-diff.js +0 -9
  125. package/dist/tests/fixtures/scripts/lint/eslint-check.js +0 -7
  126. package/dist/tests/fixtures/stashes/load.js +0 -166
  127. package/dist/tests/fixtures/stashes/load.test.js +0 -88
  128. package/dist/tests/fixtures/stashes/ranking-baseline/scripts/mem0-search.js +0 -12
  129. package/dist/tests/frontmatter.test.js +0 -190
  130. package/dist/tests/fts-field-weighting.test.js +0 -254
  131. package/dist/tests/fuzzy-search.test.js +0 -230
  132. package/dist/tests/git-provider-clone.test.js +0 -45
  133. package/dist/tests/github.test.js +0 -161
  134. package/dist/tests/graph-boost-ranking.test.js +0 -305
  135. package/dist/tests/graph-extraction.test.js +0 -282
  136. package/dist/tests/helpers/usage-events.js +0 -8
  137. package/dist/tests/index-pass-llm.test.js +0 -161
  138. package/dist/tests/indexer.test.js +0 -559
  139. package/dist/tests/info-command.test.js +0 -166
  140. package/dist/tests/init.test.js +0 -69
  141. package/dist/tests/install-script.test.js +0 -246
  142. package/dist/tests/integration/agent-real-profile.test.js +0 -94
  143. package/dist/tests/issue-36-repro.test.js +0 -304
  144. package/dist/tests/issues-191-194.test.js +0 -160
  145. package/dist/tests/lesson-lint.test.js +0 -111
  146. package/dist/tests/llm-client.test.js +0 -115
  147. package/dist/tests/llm-feature-gate.test.js +0 -151
  148. package/dist/tests/llm.test.js +0 -139
  149. package/dist/tests/lockfile.test.js +0 -216
  150. package/dist/tests/manifest.test.js +0 -205
  151. package/dist/tests/markdown.test.js +0 -126
  152. package/dist/tests/matchers-unit.test.js +0 -189
  153. package/dist/tests/memory-inference.test.js +0 -299
  154. package/dist/tests/merge-scoring.test.js +0 -136
  155. package/dist/tests/metadata.test.js +0 -313
  156. package/dist/tests/migration-help.test.js +0 -89
  157. package/dist/tests/origin-resolve.test.js +0 -124
  158. package/dist/tests/output-baseline.test.js +0 -217
  159. package/dist/tests/output-shapes-unit.test.js +0 -476
  160. package/dist/tests/parallel-search.test.js +0 -272
  161. package/dist/tests/parameter-metadata.test.js +0 -365
  162. package/dist/tests/paths.test.js +0 -177
  163. package/dist/tests/progressive-disclosure.test.js +0 -280
  164. package/dist/tests/proposals.test.js +0 -279
  165. package/dist/tests/proposed-quality.test.js +0 -271
  166. package/dist/tests/provider-registry.test.js +0 -32
  167. package/dist/tests/ranking-regression.test.js +0 -548
  168. package/dist/tests/reflect-propose.test.js +0 -455
  169. package/dist/tests/registry-build-index.test.js +0 -378
  170. package/dist/tests/registry-cli.test.js +0 -290
  171. package/dist/tests/registry-index-v2.test.js +0 -430
  172. package/dist/tests/registry-install.test.js +0 -728
  173. package/dist/tests/registry-providers/parity.test.js +0 -189
  174. package/dist/tests/registry-providers/skills-sh.test.js +0 -309
  175. package/dist/tests/registry-providers/static-index.test.js +0 -204
  176. package/dist/tests/registry-resolve.test.js +0 -126
  177. package/dist/tests/registry-search.test.js +0 -723
  178. package/dist/tests/remember-frontmatter.test.js +0 -380
  179. package/dist/tests/remember-unit.test.js +0 -123
  180. package/dist/tests/ripgrep-install.test.js +0 -251
  181. package/dist/tests/ripgrep-resolve.test.js +0 -108
  182. package/dist/tests/ripgrep.test.js +0 -163
  183. package/dist/tests/save-command.test.js +0 -94
  184. package/dist/tests/save-trust-qa-fixes.test.js +0 -270
  185. package/dist/tests/scoring-pipeline.test.js +0 -648
  186. package/dist/tests/search-include-proposed-cli.test.js +0 -118
  187. package/dist/tests/self-update.test.js +0 -442
  188. package/dist/tests/semantic-search-e2e.test.js +0 -512
  189. package/dist/tests/semantic-status.test.js +0 -471
  190. package/dist/tests/setup-run.integration.js +0 -877
  191. package/dist/tests/setup-wizard.test.js +0 -198
  192. package/dist/tests/setup.test.js +0 -131
  193. package/dist/tests/source-add.test.js +0 -11
  194. package/dist/tests/source-clone.test.js +0 -254
  195. package/dist/tests/source-manage.test.js +0 -366
  196. package/dist/tests/source-providers/filesystem.test.js +0 -82
  197. package/dist/tests/source-providers/git.test.js +0 -252
  198. package/dist/tests/source-providers/website.test.js +0 -128
  199. package/dist/tests/source-qa-fixes.test.js +0 -268
  200. package/dist/tests/source-registry.test.js +0 -350
  201. package/dist/tests/source-resolve.test.js +0 -100
  202. package/dist/tests/source-source.test.js +0 -221
  203. package/dist/tests/source.test.js +0 -533
  204. package/dist/tests/tar-utils-scan.test.js +0 -73
  205. package/dist/tests/toggle-components.test.js +0 -73
  206. package/dist/tests/usage-telemetry.test.js +0 -265
  207. package/dist/tests/utility-scoring.test.js +0 -558
  208. package/dist/tests/vault-load-error.test.js +0 -78
  209. package/dist/tests/vault-qa-fixes.test.js +0 -194
  210. package/dist/tests/vault.test.js +0 -429
  211. package/dist/tests/vector-search.test.js +0 -608
  212. package/dist/tests/walker.test.js +0 -252
  213. package/dist/tests/wave2-cluster-bc.test.js +0 -228
  214. package/dist/tests/wave2-cluster-d.test.js +0 -180
  215. package/dist/tests/wave2-cluster-e.test.js +0 -179
  216. package/dist/tests/wiki-qa-fixes.test.js +0 -270
  217. package/dist/tests/wiki.test.js +0 -529
  218. package/dist/tests/workflow-cli.test.js +0 -271
  219. package/dist/tests/workflow-markdown.test.js +0 -171
  220. package/dist/tests/workflow-path-escape.test.js +0 -132
  221. package/dist/tests/workflow-qa-fixes.test.js +0 -377
  222. package/dist/tests/workflows/indexer-rejection.test.js +0 -213
  223. /package/dist/{src/commands → commands}/completions.js +0 -0
  224. /package/dist/{src/commands → commands}/curate.js +0 -0
  225. /package/dist/{src/commands → commands}/distill.js +0 -0
  226. /package/dist/{src/commands → commands}/events.js +0 -0
  227. /package/dist/{src/commands → commands}/info.js +0 -0
  228. /package/dist/{src/commands → commands}/init.js +0 -0
  229. /package/dist/{src/commands → commands}/install-audit.js +0 -0
  230. /package/dist/{src/commands → commands}/installed-stashes.js +0 -0
  231. /package/dist/{src/commands → commands}/migration-help.js +0 -0
  232. /package/dist/{src/commands → commands}/proposal.js +0 -0
  233. /package/dist/{src/commands → commands}/propose.js +0 -0
  234. /package/dist/{src/commands → commands}/reflect.js +0 -0
  235. /package/dist/{src/commands → commands}/remember.js +0 -0
  236. /package/dist/{src/commands → commands}/self-update.js +0 -0
  237. /package/dist/{src/commands → commands}/source-clone.js +0 -0
  238. /package/dist/{src/commands → commands}/vault.js +0 -0
  239. /package/dist/{src/core → core}/asset-ref.js +0 -0
  240. /package/dist/{src/core → core}/asset-registry.js +0 -0
  241. /package/dist/{src/core → core}/asset-spec.js +0 -0
  242. /package/dist/{src/core → core}/common.js +0 -0
  243. /package/dist/{src/core → core}/errors.js +0 -0
  244. /package/dist/{src/core → core}/events.js +0 -0
  245. /package/dist/{src/core → core}/frontmatter.js +0 -0
  246. /package/dist/{src/core → core}/lesson-lint.js +0 -0
  247. /package/dist/{src/core → core}/markdown.js +0 -0
  248. /package/dist/{src/core → core}/paths.js +0 -0
  249. /package/dist/{src/core → core}/proposals.js +0 -0
  250. /package/dist/{src/core → core}/warn.js +0 -0
  251. /package/dist/{src/core → core}/write-source.js +0 -0
  252. /package/dist/{src/indexer → indexer}/db.js +0 -0
  253. /package/dist/{src/indexer → indexer}/file-context.js +0 -0
  254. /package/dist/{src/indexer → indexer}/graph-boost.js +0 -0
  255. /package/dist/{src/indexer → indexer}/graph-extraction.js +0 -0
  256. /package/dist/{src/indexer → indexer}/manifest.js +0 -0
  257. /package/dist/{src/indexer → indexer}/memory-inference.js +0 -0
  258. /package/dist/{src/indexer → indexer}/metadata.js +0 -0
  259. /package/dist/{src/indexer → indexer}/search-fields.js +0 -0
  260. /package/dist/{src/indexer → indexer}/semantic-status.js +0 -0
  261. /package/dist/{src/indexer → indexer}/usage-events.js +0 -0
  262. /package/dist/{src/indexer → indexer}/walker.js +0 -0
  263. /package/dist/{src/integrations → integrations}/agent/config.js +0 -0
  264. /package/dist/{src/integrations → integrations}/agent/detect.js +0 -0
  265. /package/dist/{src/integrations → integrations}/agent/index.js +0 -0
  266. /package/dist/{src/integrations → integrations}/agent/prompts.js +0 -0
  267. /package/dist/{src/integrations → integrations}/lockfile.js +0 -0
  268. /package/dist/{src/llm → llm}/client.js +0 -0
  269. /package/dist/{src/llm → llm}/embedder.js +0 -0
  270. /package/dist/{src/llm → llm}/embedders/cache.js +0 -0
  271. /package/dist/{src/llm → llm}/embedders/local.js +0 -0
  272. /package/dist/{src/llm → llm}/embedders/types.js +0 -0
  273. /package/dist/{src/llm → llm}/feature-gate.js +0 -0
  274. /package/dist/{src/llm → llm}/graph-extract.js +0 -0
  275. /package/dist/{src/llm → llm}/index-passes.js +0 -0
  276. /package/dist/{src/llm → llm}/memory-infer.js +0 -0
  277. /package/dist/{src/llm → llm}/metadata-enhance.js +0 -0
  278. /package/dist/{src/output → output}/context.js +0 -0
  279. /package/dist/{src/registry → registry}/create-provider-registry.js +0 -0
  280. /package/dist/{src/registry → registry}/factory.js +0 -0
  281. /package/dist/{src/registry → registry}/origin-resolve.js +0 -0
  282. /package/dist/{src/registry → registry}/providers/index.js +0 -0
  283. /package/dist/{src/registry → registry}/providers/skills-sh.js +0 -0
  284. /package/dist/{src/registry → registry}/providers/types.js +0 -0
  285. /package/dist/{src/registry → registry}/resolve.js +0 -0
  286. /package/dist/{src/registry → registry}/types.js +0 -0
  287. /package/dist/{src/setup → setup}/detect.js +0 -0
  288. /package/dist/{src/setup → setup}/ripgrep-install.js +0 -0
  289. /package/dist/{src/setup → setup}/ripgrep-resolve.js +0 -0
  290. /package/dist/{src/setup → setup}/steps.js +0 -0
  291. /package/dist/{src/sources → sources}/include.js +0 -0
  292. /package/dist/{src/sources → sources}/provider-factory.js +0 -0
  293. /package/dist/{src/sources → sources}/provider.js +0 -0
  294. /package/dist/{src/sources → sources}/providers/filesystem.js +0 -0
  295. /package/dist/{src/sources → sources}/providers/git.js +0 -0
  296. /package/dist/{src/sources → sources}/providers/index.js +0 -0
  297. /package/dist/{src/sources → sources}/providers/install-types.js +0 -0
  298. /package/dist/{src/sources → sources}/providers/npm.js +0 -0
  299. /package/dist/{src/sources → sources}/providers/provider-utils.js +0 -0
  300. /package/dist/{src/sources → sources}/providers/sync-from-ref.js +0 -0
  301. /package/dist/{src/sources → sources}/providers/tar-utils.js +0 -0
  302. /package/dist/{src/sources → sources}/providers/website.js +0 -0
  303. /package/dist/{src/sources → sources}/resolve.js +0 -0
  304. /package/dist/{src/sources → sources}/types.js +0 -0
  305. /package/dist/{src/templates → templates}/wiki-templates.js +0 -0
  306. /package/dist/{src/version.js → version.js} +0 -0
  307. /package/dist/{src/workflows → workflows}/authoring.js +0 -0
  308. /package/dist/{src/workflows → workflows}/cli.js +0 -0
  309. /package/dist/{src/workflows → workflows}/db.js +0 -0
  310. /package/dist/{src/workflows → workflows}/document-cache.js +0 -0
  311. /package/dist/{src/workflows → workflows}/parser.js +0 -0
  312. /package/dist/{src/workflows → workflows}/renderer.js +0 -0
  313. /package/dist/{src/workflows → workflows}/schema.js +0 -0
  314. /package/dist/{src/workflows → workflows}/validator.js +0 -0
@@ -1,443 +0,0 @@
1
- /**
2
- * Unit tests for the bench driver — exercises every RunResult outcome
3
- * (`pass`, `fail`, `budget_exceeded`, `harness_error`) via an injected fake
4
- * spawn. Real opencode is never invoked.
5
- */
6
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
7
- import fs from "node:fs";
8
- import path from "node:path";
9
- import { _ISOLATED_ENV_NAMES, _SCRUBBED_OPERATOR_ENV_NAMES, buildIsolatedEnv, buildSanitizedEnvSource, createIsolationDirs, EVENTS_READ_CAP_BYTES, parseTokenUsage, readRunEvents, runOne, stripAkmStashDir, } from "./driver";
10
- import { benchMkdtemp } from "./tmp";
11
- function asReadableStream(text) {
12
- const bytes = new TextEncoder().encode(text);
13
- return new ReadableStream({
14
- start(controller) {
15
- controller.enqueue(bytes);
16
- controller.close();
17
- },
18
- });
19
- }
20
- /**
21
- * Build a spawn fn that scripts the agent run first, then any subsequent
22
- * verifier run. Distinguishes by command: opencode is the configured `bin`
23
- * for the built-in opencode profile (i.e. cmd[0] === "opencode"); anything
24
- * else is a verifier.
25
- */
26
- function scriptedSpawn(agent, verifier) {
27
- const invocations = [];
28
- const spawn = (cmd, options) => {
29
- invocations.push({ cmd, env: options.env });
30
- const isAgent = cmd[0] === "opencode";
31
- const config = isAgent ? agent : (verifier ?? { exitCode: 0, stdout: "" });
32
- if (isAgent && agent.throwSync)
33
- throw agent.throwSync;
34
- let resolveExit = () => { };
35
- const exited = new Promise((resolve) => {
36
- resolveExit = resolve;
37
- if (!(isAgent && agent.hangsUntilKilled))
38
- resolve(config.exitCode);
39
- });
40
- const proc = {
41
- exitCode: isAgent && agent.hangsUntilKilled ? null : config.exitCode,
42
- exited,
43
- stdout: asReadableStream(config.stdout ?? ""),
44
- stderr: asReadableStream(config.stderr ?? ""),
45
- stdin: null,
46
- kill() {
47
- // Honour kill so timeout path resolves cleanly.
48
- resolveExit(143);
49
- },
50
- };
51
- return proc;
52
- };
53
- return { spawn, invocations };
54
- }
55
- const baseOptions = {
56
- track: "utility",
57
- arm: "noakm",
58
- taskId: "_example/example-task",
59
- workspace: "",
60
- model: "anthropic/claude-opus-4-7",
61
- seed: 0,
62
- budgetTokens: 100000,
63
- budgetWallMs: 60_000,
64
- verifier: "regex",
65
- taskDir: "",
66
- expectedMatch: "ok",
67
- };
68
- describe("runOne", () => {
69
- let workspace;
70
- beforeAll(() => {
71
- workspace = benchMkdtemp("bench-driver-test-");
72
- });
73
- afterAll(() => {
74
- fs.rmSync(workspace, { recursive: true, force: true });
75
- });
76
- test("pass: agent exits 0, verifier exits 0", async () => {
77
- const { spawn, invocations } = scriptedSpawn({ exitCode: 0, stdout: "ok" });
78
- const result = await runOne({ ...baseOptions, workspace, spawn });
79
- expect(result.outcome).toBe("pass");
80
- expect(result.verifierExitCode).toBe(0);
81
- expect(result.taskId).toBe("_example/example-task");
82
- expect(result.model).toBe("anthropic/claude-opus-4-7");
83
- expect(result.seed).toBe(0);
84
- expect(result.schemaVersion).toBe(1);
85
- expect(invocations[0]?.cmd[0]).toBe("opencode");
86
- });
87
- test("fail: agent exits 0 but verifier rejects output", async () => {
88
- const { spawn } = scriptedSpawn({ exitCode: 0, stdout: "nope" });
89
- const result = await runOne({ ...baseOptions, workspace, spawn });
90
- expect(result.outcome).toBe("fail");
91
- expect(result.verifierExitCode).toBe(1);
92
- });
93
- test("budget_exceeded: agent times out (runAgent reason: timeout)", async () => {
94
- const { spawn } = scriptedSpawn({ exitCode: 0, hangsUntilKilled: true });
95
- const result = await runOne({
96
- ...baseOptions,
97
- workspace,
98
- spawn,
99
- // Tiny budget so the timer fires before the fake agent ever exits.
100
- budgetWallMs: 50,
101
- });
102
- expect(result.outcome).toBe("budget_exceeded");
103
- });
104
- test("harness_error: agent spawn throws synchronously", async () => {
105
- const { spawn } = scriptedSpawn({ exitCode: 0, throwSync: new Error("ENOENT") });
106
- const result = await runOne({ ...baseOptions, workspace, spawn });
107
- expect(result.outcome).toBe("harness_error");
108
- });
109
- test("budget_exceeded: parsed token usage exceeds budgetTokens", async () => {
110
- // Agent reports 70k input + 50k output = 120k tokens, budget is 100k.
111
- // Verifier should NOT run; outcome must be budget_exceeded.
112
- const { spawn } = scriptedSpawn({
113
- exitCode: 0,
114
- stdout: "input_tokens: 70000 output_tokens: 50000",
115
- });
116
- const result = await runOne({
117
- ...baseOptions,
118
- workspace,
119
- spawn,
120
- budgetTokens: 100_000,
121
- });
122
- expect(result.outcome).toBe("budget_exceeded");
123
- expect(result.tokens.input + result.tokens.output).toBeGreaterThan(100_000);
124
- expect(result.tokens.input).toBe(70_000);
125
- expect(result.tokens.output).toBe(50_000);
126
- expect(result.tokenMeasurement).toBe("parsed");
127
- });
128
- test("tokenMeasurement: parsed when stdout reports tokens", async () => {
129
- const { spawn } = scriptedSpawn({
130
- exitCode: 0,
131
- stdout: "ok\ninput_tokens: 10 output_tokens: 5",
132
- });
133
- const result = await runOne({ ...baseOptions, workspace, spawn });
134
- expect(result.outcome).toBe("pass");
135
- expect(result.tokenMeasurement).toBe("parsed");
136
- expect(result.tokens.input).toBe(10);
137
- expect(result.tokens.output).toBe(5);
138
- });
139
- test("tokenMeasurement: missing when stdout has no token line — and budget is NOT enforced", async () => {
140
- // Agent never reports tokens. budgetTokens is 1, but the harness must not
141
- // mark this as budget_exceeded (issue #252) — measurement is missing.
142
- const { spawn } = scriptedSpawn({ exitCode: 0, stdout: "ok" });
143
- const result = await runOne({ ...baseOptions, workspace, spawn, budgetTokens: 1 });
144
- expect(result.tokenMeasurement).toBe("missing");
145
- expect(result.tokens).toEqual({ input: 0, output: 0 });
146
- expect(result.outcome).not.toBe("budget_exceeded");
147
- });
148
- test("tokenMeasurement: harness_error path leaves measurement as 'missing'", async () => {
149
- const { spawn } = scriptedSpawn({ exitCode: 0, throwSync: new Error("ENOENT") });
150
- const result = await runOne({ ...baseOptions, workspace, spawn });
151
- expect(result.outcome).toBe("harness_error");
152
- // No agent stdout was ever observed → measurement stays at the default.
153
- expect(result.tokenMeasurement).toBe("missing");
154
- });
155
- test("isolation: child env carries pinned XDG/OPENCODE/AKM dirs and not operator values", async () => {
156
- const sentinel = "/tmp/operator-config-must-not-leak";
157
- const priors = {};
158
- for (const name of _ISOLATED_ENV_NAMES) {
159
- priors[name] = process.env[name];
160
- process.env[name] = sentinel;
161
- }
162
- try {
163
- const { spawn, invocations } = scriptedSpawn({ exitCode: 0, stdout: "ok" });
164
- await runOne({
165
- ...baseOptions,
166
- workspace,
167
- stashDir: "/tmp/some-stash",
168
- arm: "akm",
169
- spawn,
170
- });
171
- const childEnv = invocations[0]?.env ?? {};
172
- // Each isolated key MUST be present and MUST NOT equal the operator sentinel.
173
- for (const name of _ISOLATED_ENV_NAMES) {
174
- expect(childEnv[name]).toBeDefined();
175
- expect(childEnv[name]).not.toBe(sentinel);
176
- }
177
- expect(childEnv.AKM_STASH_DIR).toBe("/tmp/some-stash");
178
- expect(childEnv.BENCH_OPENCODE_MODEL).toBe("anthropic/claude-opus-4-7");
179
- }
180
- finally {
181
- for (const name of _ISOLATED_ENV_NAMES) {
182
- if (priors[name] === undefined)
183
- delete process.env[name];
184
- else
185
- process.env[name] = priors[name];
186
- }
187
- }
188
- });
189
- // ── #271: operator-env isolation (OPENCODE_API_KEY/ANTHROPIC_API_KEY/AKM_CONFIG_DIR)
190
- test("operator env isolation: bench child never inherits OPENCODE_API_KEY/ANTHROPIC_API_KEY/AKM_CONFIG_DIR (#271)", async () => {
191
- // Even though `OPENCODE_API_KEY` is in the opencode profile's
192
- // `envPassthrough` list, the bench driver MUST scrub these operator-env
193
- // names before profile.envPassthrough copies them into the child. This
194
- // is the regression guard the #271 review identified — without it,
195
- // operator credentials and the operator's `AKM_CONFIG_DIR` would leak
196
- // into every (task × arm × seed) child.
197
- const sentinels = {
198
- OPENCODE_API_KEY: "sentinel-A-must-not-leak",
199
- ANTHROPIC_API_KEY: "sentinel-B-must-not-leak",
200
- AKM_CONFIG_DIR: "sentinel-C-must-not-leak",
201
- };
202
- const priors = {};
203
- for (const [name, value] of Object.entries(sentinels)) {
204
- priors[name] = process.env[name];
205
- process.env[name] = value;
206
- }
207
- try {
208
- const { spawn, invocations } = scriptedSpawn({ exitCode: 0, stdout: "ok" });
209
- await runOne({
210
- ...baseOptions,
211
- workspace,
212
- arm: "akm",
213
- stashDir: "/tmp/some-stash",
214
- spawn,
215
- });
216
- const childEnv = invocations[0]?.env ?? {};
217
- // None of the operator sentinels reach the child env that runAgent
218
- // hands to spawn — neither as a key:value pair nor as a substring
219
- // match (paranoid: confirm the literal sentinel strings are absent
220
- // even from values like `OPENCODE_CONFIG`).
221
- for (const name of _SCRUBBED_OPERATOR_ENV_NAMES) {
222
- expect(childEnv[name]).toBeUndefined();
223
- }
224
- for (const sentinel of Object.values(sentinels)) {
225
- for (const value of Object.values(childEnv)) {
226
- expect(value).not.toContain(sentinel);
227
- }
228
- }
229
- // The explicit bench keys ARE present and pinned to the per-run
230
- // tmpdirs (sanity: the scrubbing didn't accidentally drop them).
231
- expect(childEnv.XDG_CACHE_HOME).toBeDefined();
232
- expect(childEnv.XDG_CONFIG_HOME).toBeDefined();
233
- expect(childEnv.OPENCODE_CONFIG).toBeDefined();
234
- expect(childEnv.AKM_STASH_DIR).toBe("/tmp/some-stash");
235
- expect(childEnv.BENCH_OPENCODE_MODEL).toBe("anthropic/claude-opus-4-7");
236
- }
237
- finally {
238
- for (const [name, prior] of Object.entries(priors)) {
239
- if (prior === undefined)
240
- delete process.env[name];
241
- else
242
- process.env[name] = prior;
243
- }
244
- }
245
- });
246
- // ── #261: synthetic-arm AKM_STASH_DIR isolation ─────────────────────────────
247
- test("synthetic arm: child env never carries AKM_STASH_DIR (recurrence guard for #243 fixup)", async () => {
248
- // CRITICAL: synthetic-arm runs MUST NOT carry AKM_STASH_DIR. Without
249
- // this guard the operator's real AKM_STASH_DIR leaks in via parent-env
250
- // inheritance — exactly the failure mode the #243 fixup chased. We
251
- // exercise both the explicit-stashDir case (bad caller passes one
252
- // anyway) and the no-stashDir case.
253
- const operatorStash = "/tmp/operator-stash-must-never-leak-into-synthetic";
254
- const prior = process.env.AKM_STASH_DIR;
255
- process.env.AKM_STASH_DIR = operatorStash;
256
- try {
257
- // 1) Synthetic arm with NO stashDir option: AKM_STASH_DIR must be
258
- // absent in the child env.
259
- const { spawn, invocations } = scriptedSpawn({ exitCode: 0, stdout: "ok" });
260
- await runOne({
261
- ...baseOptions,
262
- workspace,
263
- arm: "synthetic",
264
- spawn,
265
- });
266
- const childEnv1 = invocations[0]?.env ?? {};
267
- expect(childEnv1.AKM_STASH_DIR).toBeUndefined();
268
- expect(childEnv1.AKM_STASH_DIR).not.toBe(operatorStash);
269
- // 2) Even when a buggy caller forwards a stashDir to the synthetic
270
- // arm, the driver MUST refuse to wire it into the child env.
271
- const { spawn: spawn2, invocations: invocations2 } = scriptedSpawn({ exitCode: 0, stdout: "ok" });
272
- await runOne({
273
- ...baseOptions,
274
- workspace,
275
- arm: "synthetic",
276
- stashDir: "/tmp/buggy-caller-stash",
277
- spawn: spawn2,
278
- });
279
- const childEnv2 = invocations2[0]?.env ?? {};
280
- expect(childEnv2.AKM_STASH_DIR).toBeUndefined();
281
- }
282
- finally {
283
- if (prior === undefined)
284
- delete process.env.AKM_STASH_DIR;
285
- else
286
- process.env.AKM_STASH_DIR = prior;
287
- }
288
- });
289
- });
290
describe("driver helpers", () => {
    // A single well-formed event record reused by the readRunEvents tests.
    const feedbackRecord = { schemaVersion: 1, ts: "2026-04-27T00:00:00Z", eventType: "feedback" };
    const removeTree = (target) => fs.rmSync(target, { recursive: true, force: true });
    test("createIsolationDirs creates four dirs under a single root", () => {
        const isolation = createIsolationDirs();
        try {
            const expectedDirs = [isolation.cacheHome, isolation.configHome, isolation.opencodeConfig];
            for (const dir of expectedDirs) {
                expect(fs.existsSync(dir)).toBe(true);
            }
            expect(isolation.cacheHome.startsWith(isolation.root)).toBe(true);
        }
        finally {
            removeTree(isolation.root);
        }
    });
    test("stripAkmStashDir deletes AKM_STASH_DIR in place (#261 synthetic-arm guard)", () => {
        const operatorEnv = {
            AKM_STASH_DIR: "/tmp/operator-stash",
            XDG_CACHE_HOME: "/tmp/cache",
        };
        const returned = stripAkmStashDir(operatorEnv);
        // The helper hands back the very object it was given, minus the key.
        expect(returned).toBe(operatorEnv);
        expect(operatorEnv.AKM_STASH_DIR).toBeUndefined();
        // Neighbouring keys are left alone.
        expect(operatorEnv.XDG_CACHE_HOME).toBe("/tmp/cache");
        // An env that never had the key comes out unchanged.
        const cleanEnv = { XDG_CACHE_HOME: "/tmp/cache" };
        stripAkmStashDir(cleanEnv);
        expect(cleanEnv).toEqual({ XDG_CACHE_HOME: "/tmp/cache" });
    });
    test("buildSanitizedEnvSource strips OPENCODE_API_KEY/ANTHROPIC_API_KEY/AKM_CONFIG_DIR (#271)", () => {
        const operatorEnv = {
            OPENCODE_API_KEY: "leak-A",
            ANTHROPIC_API_KEY: "leak-B",
            AKM_CONFIG_DIR: "/operator/akm",
            PATH: "/usr/bin",
            HOME: "/home/op",
            OPENCODE_CONFIG: "/operator/opencode",
            UNRELATED: "kept",
        };
        const sanitized = buildSanitizedEnvSource(operatorEnv);
        // Operator-specific names must be gone.
        for (const name of ["OPENCODE_API_KEY", "ANTHROPIC_API_KEY", "AKM_CONFIG_DIR"]) {
            expect(sanitized[name]).toBeUndefined();
        }
        // All remaining entries pass through with their original values.
        const preserved = [
            ["PATH", "/usr/bin"],
            ["HOME", "/home/op"],
            ["OPENCODE_CONFIG", "/operator/opencode"],
            ["UNRELATED", "kept"],
        ];
        for (const [name, value] of preserved) {
            expect(sanitized[name]).toBe(value);
        }
        // The result is a fresh object and the input is not modified.
        expect(sanitized).not.toBe(operatorEnv);
        expect(operatorEnv.OPENCODE_API_KEY).toBe("leak-A");
    });
    test("buildSanitizedEnvSource defaults to process.env when no source given", () => {
        const savedKey = process.env.OPENCODE_API_KEY;
        process.env.OPENCODE_API_KEY = "default-source-leak";
        try {
            const sanitized = buildSanitizedEnvSource();
            expect(sanitized.OPENCODE_API_KEY).toBeUndefined();
        }
        finally {
            // Put the process environment back exactly as it was found.
            if (savedKey !== undefined) {
                process.env.OPENCODE_API_KEY = savedKey;
            }
            else {
                delete process.env.OPENCODE_API_KEY;
            }
        }
    });
    test("buildIsolatedEnv pins the four isolation keys plus model", () => {
        const isolation = createIsolationDirs("/tmp/stash");
        try {
            const childEnv = buildIsolatedEnv(isolation, "model-x");
            expect(childEnv.XDG_CACHE_HOME).toBe(isolation.cacheHome);
            expect(childEnv.XDG_CONFIG_HOME).toBe(isolation.configHome);
            expect(childEnv.OPENCODE_CONFIG).toBe(isolation.opencodeConfig);
            expect(childEnv.AKM_STASH_DIR).toBe("/tmp/stash");
            expect(childEnv.BENCH_OPENCODE_MODEL).toBe("model-x");
        }
        finally {
            removeTree(isolation.root);
        }
    });
    test("parseTokenUsage extracts numbers when present, missing otherwise", () => {
        const cases = [
            // Nothing matchable → "missing" rather than a genuine zero (issue #252).
            ["", { input: 0, output: 0, measurement: "missing" }],
            ["noise", { input: 0, output: 0, measurement: "missing" }],
            // Both counters present → "parsed" with the reported values.
            ["input_tokens: 123 output_tokens: 456", { input: 123, output: 456, measurement: "parsed" }],
            // A single counter still counts as "parsed"; the absent one is 0.
            ["input_tokens: 99", { input: 99, output: 0, measurement: "parsed" }],
            ["output_tokens: 55", { input: 0, output: 55, measurement: "parsed" }],
        ];
        for (const [text, expected] of cases) {
            expect(parseTokenUsage(text)).toEqual(expected);
        }
    });
    test("readRunEvents returns [] when events.jsonl is missing and parses lines when present", () => {
        const runDir = benchMkdtemp("bench-events-");
        try {
            // No akm/events.jsonl yet.
            expect(readRunEvents(runDir)).toEqual([]);
            const akmDir = path.join(runDir, "akm");
            fs.mkdirSync(akmDir, { recursive: true });
            fs.writeFileSync(path.join(akmDir, "events.jsonl"), `${JSON.stringify(feedbackRecord)}\n`);
            const parsed = readRunEvents(runDir);
            expect(parsed.length).toBe(1);
            expect(parsed[0]?.eventType).toBe("feedback");
        }
        finally {
            removeTree(runDir);
        }
    });
    test("readRunEvents caps reads at EVENTS_READ_CAP_BYTES and records a warning when exceeded", () => {
        const runDir = benchMkdtemp("bench-events-cap-");
        try {
            const akmDir = path.join(runDir, "akm");
            fs.mkdirSync(akmDir, { recursive: true });
            const eventsPath = path.join(akmDir, "events.jsonl");
            // Layout: one parseable record first, then one enormous filler
            // line that takes the file past the read cap.
            const leadingLine = `${JSON.stringify(feedbackRecord)}\n`;
            const handle = fs.openSync(eventsPath, "w");
            try {
                fs.writeSync(handle, leadingLine);
                // The filler alone is one MiB larger than the cap, written in
                // 64 KiB pieces of 'x' so no giant buffer is allocated.
                const fillerBytes = EVENTS_READ_CAP_BYTES + 1024 * 1024;
                const block = Buffer.alloc(64 * 1024, "x");
                let outstanding = fillerBytes;
                while (outstanding > 0) {
                    const piece = outstanding < block.length ? block.subarray(0, outstanding) : block;
                    fs.writeSync(handle, piece);
                    outstanding -= piece.length;
                }
                fs.writeSync(handle, "\n");
            }
            finally {
                fs.closeSync(handle);
            }
            const onDiskSize = fs.statSync(eventsPath).size;
            expect(onDiskSize).toBeGreaterThan(EVENTS_READ_CAP_BYTES);
            const warnings = [];
            const parsed = readRunEvents(runDir, { warnings });
            // The record inside the capped prefix is still surfaced.
            expect(parsed.length).toBe(1);
            expect(parsed[0]?.eventType).toBe("feedback");
            // Exactly one warning, naming the truncation and the cap value.
            expect(warnings.length).toBe(1);
            expect(warnings[0]).toContain("events.jsonl truncated");
            expect(warnings[0]).toContain(String(EVENTS_READ_CAP_BYTES));
        }
        finally {
            removeTree(runDir);
        }
    });
});
@@ -1,179 +0,0 @@
1
- /**
2
- * Track B lesson quality + reuse metrics (issue #264, spec §6.3 follow-up).
3
- *
4
- * `computeLessonMetrics` walks the evolve runner's proposal log and the
5
- * Phase 3 pre/post arm `RunResult[]`s and emits one `LessonRecord` per
6
- * lesson-kind proposal. The record captures:
7
- *
8
- * - `source_failures` — eval/train tasks whose negative feedback events
9
- * targeted this asset ref (joined via the supplied `feedbackLog`).
10
- * - `lint_pass` / `accepted` — verbatim from the proposal log entry.
11
- * - `first_reused_on` / `reuse_count` / `reuse_pass_rate` — how often the
12
- * accepted lesson's ref appeared in post-arm runs' `assetsLoaded`, and
13
- * the pass-rate among those reuses.
14
- * - `negative_transfer_count` — count of distinct tasks where, for the same
15
- * (taskId, seed), the run PASSED in pre but FAILED (or exceeded budget) in
16
- * post AND the post run loaded this lesson's ref. Spec §6.4 attribution.
17
- * - `leakage_risk` — `"high"` when any verbatim 4-token-or-longer phrase
18
- * in the supplied verifier source(s) appears verbatim in the lesson
19
- * body; `"medium"` for 3-token leakage; `"low"` otherwise. Mirrors the
20
- * Wave 3 `leakage.test.ts` philosophy: structural fragments are red
21
- * flags, lone tokens are not.
22
- *
23
- * The function is pure: no disk I/O, no subprocess. Callers (the evolve
24
- * runner) thread lesson bodies + verifier sources through optional maps so
25
- * the leakage check is fully deterministic and testable with mock inputs.
26
- */
27
/**
 * Build per-lesson quality/reuse records and their aggregate rates.
 *
 * Only `input.proposalLog` entries whose `kind` is `"lesson"` are turned into
 * records; every other kind is ignored here. The function works purely on the
 * values passed in — it performs no disk or subprocess access itself.
 *
 * @param input - `proposalLog` is required. `feedbackLog`, `preRuns`,
 *   `postRuns`, `lessonBodies` and `verifierSources` are optional and are
 *   treated as empty when null/undefined.
 * @returns An object holding `lessons` (records sorted by `ref`) together with
 *   the created/accepted counts, lint-pass and acceptance rates, reuse rates
 *   and the summed negative-transfer count. Every rate is 0 when its
 *   denominator is 0.
 */
export function computeLessonMetrics(input) {
    const lessonProposals = input.proposalLog.filter((entry) => entry.kind === "lesson");
    const feedbackEvents = input.feedbackLog ?? [];
    const preArmRuns = input.preRuns ?? [];
    const postArmRuns = input.postRuns ?? [];
    const bodiesByRef = input.lessonBodies ?? {};
    const verifiersByRef = input.verifierSources ?? {};
    // taskId → (seed → pre-arm outcome), so each post run can look up its
    // pre-arm counterpart directly. A later duplicate (task, seed) overwrites.
    const preOutcomeBySeed = new Map();
    for (const run of preArmRuns) {
        if (!preOutcomeBySeed.has(run.taskId)) {
            preOutcomeBySeed.set(run.taskId, new Map());
        }
        preOutcomeBySeed.get(run.taskId).set(run.seed, run.outcome);
    }
    // goldRef → set of task ids that logged negative feedback against it.
    const failedTasksByRef = new Map();
    for (const event of feedbackEvents) {
        if (event.signal === "negative") {
            const tasks = failedTasksByRef.get(event.goldRef) ?? new Set();
            tasks.add(event.taskId);
            failedTasksByRef.set(event.goldRef, tasks);
        }
    }
    const records = [];
    for (const proposal of lessonProposals) {
        const ref = proposal.assetRef;
        const isAccepted = proposal.decision === "accept";
        const failedTasks = failedTasksByRef.get(ref);
        const sourceFailures = failedTasks ? Array.from(failedTasks).sort() : [];
        let firstReusedOn = null;
        let reuseCount = 0;
        let reusePassCount = 0;
        // Tasks that passed pre-arm but failed post-arm (same seed) while
        // this lesson was loaded. A Set keeps a task that regresses on
        // several seeds from being counted more than once.
        const regressedTasks = new Set();
        // Reuse is only tallied for accepted lessons.
        const candidateRuns = isAccepted ? postArmRuns : [];
        for (const run of candidateRuns) {
            const loadedThisLesson = run.assetsLoaded?.includes(ref);
            if (!loadedThisLesson) {
                continue;
            }
            if (firstReusedOn === null) {
                firstReusedOn = run.taskId;
            }
            reuseCount += 1;
            if (run.outcome === "pass") {
                reusePassCount += 1;
                continue;
            }
            const failedPost = run.outcome === "fail" || run.outcome === "budget_exceeded";
            if (failedPost && preOutcomeBySeed.get(run.taskId)?.get(run.seed) === "pass") {
                regressedTasks.add(run.taskId);
            }
        }
        const leakageRisk = classifyLeakageRisk(bodiesByRef[ref], verifiersByRef[ref]);
        records.push({
            ref,
            source_failures: sourceFailures,
            lint_pass: proposal.lintPass,
            accepted: isAccepted,
            first_reused_on: firstReusedOn,
            reuse_count: reuseCount,
            reuse_pass_rate: reuseCount === 0 ? 0 : reusePassCount / reuseCount,
            negative_transfer_count: regressedTasks.size,
            leakage_risk: leakageRisk,
        });
    }
    records.sort((left, right) => left.ref.localeCompare(right.ref));
    // One pass over the sorted records gathers every aggregate numerator.
    let acceptedCount = 0;
    let lintPassCount = 0;
    let reusedAcceptedCount = 0;
    let reusePassRateSum = 0;
    let negativeTransferTotal = 0;
    for (const record of records) {
        if (record.lint_pass) {
            lintPassCount += 1;
        }
        negativeTransferTotal += record.negative_transfer_count;
        if (!record.accepted) {
            continue;
        }
        acceptedCount += 1;
        if (record.reuse_count > 0) {
            reusedAcceptedCount += 1;
            reusePassRateSum += record.reuse_pass_rate;
        }
    }
    const ratio = (numerator, denominator) => (denominator === 0 ? 0 : numerator / denominator);
    const createdCount = records.length;
    return {
        lessons: records,
        lessons_created_count: createdCount,
        lessons_accepted_count: acceptedCount,
        proposal_lint_pass_rate: ratio(lintPassCount, createdCount),
        proposal_acceptance_rate: ratio(acceptedCount, createdCount),
        lesson_reuse_rate: ratio(reusedAcceptedCount, acceptedCount),
        lesson_reuse_success_rate: ratio(reusePassRateSum, reusedAcceptedCount),
        lesson_negative_transfer_count: negativeTransferTotal,
    };
}
126
/**
 * Grade how much of the verifier text shows up word-for-word in a lesson body.
 *
 * Both sides are tokenised with `tokenize`, then each verifier source is
 * scanned with `containsNGram` for runs of consecutive tokens that also occur
 * in the body.
 *
 * @param body - Lesson body text; a falsy value yields `"low"`.
 * @param verifierSources - Collection of verifier source strings; a falsy or
 *   zero-length value yields `"low"`.
 * @returns `"high"` as soon as any source shares a 4-token run with the body,
 *   `"medium"` if no source does but at least one shares a 3-token run, and
 *   `"low"` otherwise. Sources with fewer than 3 tokens are skipped.
 */
export function classifyLeakageRisk(body, verifierSources) {
    const nothingToCompare = !body || !verifierSources || verifierSources.length === 0;
    if (nothingToCompare) {
        return "low";
    }
    const bodyWords = tokenize(body);
    if (bodyWords.length === 0) {
        return "low";
    }
    // Space-padded on both ends so a phrase can only match on word boundaries.
    const paddedBody = ` ${bodyWords.join(" ")} `;
    let risk = "low";
    for (const verifierText of verifierSources) {
        const verifierWords = tokenize(verifierText);
        if (verifierWords.length < 3) {
            continue;
        }
        if (containsNGram(paddedBody, verifierWords, 4)) {
            // Nothing ranks above "high", so stop scanning.
            return "high";
        }
        if (risk === "low" && containsNGram(paddedBody, verifierWords, 3)) {
            risk = "medium";
        }
    }
    return risk;
}
157
/**
 * Report whether any run of `n` consecutive entries of `tokens`, joined by
 * single spaces and wrapped in one leading and one trailing space, occurs
 * inside `bodyJoined`.
 *
 * @param bodyJoined - Haystack string; callers pad it with a space on each
 *   side so matches line up with word boundaries.
 * @param tokens - Token list to take windows from.
 * @param n - Window length in tokens.
 * @returns `true` on the first matching window; `false` when none matches or
 *   when `tokens` holds fewer than `n` entries.
 */
function containsNGram(bodyJoined, tokens, n) {
    // Highest index at which a full window still fits; negative when the
    // token list is shorter than the window, so the loop never runs.
    const lastStart = tokens.length - n;
    let start = 0;
    while (start <= lastStart) {
        const window = tokens.slice(start, start + n);
        if (bodyJoined.includes(` ${window.join(" ")} `)) {
            return true;
        }
        start += 1;
    }
    return false;
}
173
/**
 * Lowercase `text` and return its maximal runs of `a-z`, `0-9` and `_`, in
 * order. Every other character acts as a separator; an input with no such
 * run yields an empty array.
 */
function tokenize(text) {
    const words = text.toLowerCase().match(/[a-z0-9_]+/g);
    return words ?? [];
}