@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499) hide show
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,315 @@
1
+ /**
2
+ * agent-behavior-report.ts
3
+ *
4
+ * Standalone script that reads Promptfoo evaluation results containing
5
+ * agent behavior observation data and generates a detailed report.
6
+ *
7
+ * This provides deeper analysis than the summary included in the main
8
+ * calculate-scores report, including:
9
+ *
10
+ * - Per-task behavior breakdown (which specific pages each task visited)
11
+ * - Canonical doc coverage (did the agent find the "right" docs?)
12
+ * - Request timeline and latency analysis
13
+ * - Search strategy analysis
14
+ * - Cross-task navigation pattern detection
15
+ *
16
+ * Usage:
17
+ * tsx src/scripts/agent-behavior-report.ts [results-path]
18
+ */
19
// oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
import "dotenv/config";
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";
23
/**
 * Canonical documentation slugs, keyed by feature area.
 *
 * Each entry lists the doc slug fragments a well-informed agent is expected
 * to visit for tasks in that feature area. Feature areas without an entry
 * are treated as having no canonical docs.
 */
const CANONICAL_DOC_MAP = {
  frameworks: ["remix", "nuxt", "svelte", "astro", "gatsby", "client-libraries"],
  functions: ["functions", "webhooks", "groq-powered-webhooks", "event-driven", "automations"],
  "nextjs-live": ["next-js", "live-content-api", "content-source-maps", "app-router", "groq", "client-libraries"],
  "studio-setup": ["studio", "schema-types", "structure-builder", "configuration", "plugins"],
  "visual-editing": ["visual-editing", "presentation", "preview", "overlays", "loaders"],
};
64
/**
 * Load a Promptfoo results file and aggregate agent behavior per feature area.
 *
 * Only results that carry `metadata.agentBehaviorSummary` are analyzed; all
 * other results are skipped. Tasks are grouped by the feature area detected
 * from their description, and each group is summarized (unique slugs,
 * queries and domains, averages, and canonical doc coverage).
 *
 * @param {string} resultsPath - Path to the JSON results file.
 * @returns {{features: object[], hasData: boolean, tasks: object[]}}
 *   `hasData` is false (with empty arrays) when no result carries a
 *   behavior summary; otherwise `features` is sorted by feature name.
 * @throws {Error} When the file contains neither `results: []` nor
 *   `results: { results: [] }`.
 * @throws {SyntaxError} When the file is not valid JSON.
 */
function analyzeResults(resultsPath) {
  const json = JSON.parse(readFileSync(resultsPath, "utf-8"));
  // Support both the flat shape ({ results: TestResult[] }) and the full
  // Promptfoo envelope shape ({ results: { results: TestResult[] } }).
  const results = Array.isArray(json?.results)
    ? json.results
    : json?.results?.results;
  if (!Array.isArray(results)) {
    throw new Error(
      `Unrecognized results format in ${resultsPath}: expected "results" or "results.results" to be an array`,
    );
  }

  const tasks = [];
  for (const result of results) {
    const behavior = result?.metadata?.agentBehaviorSummary;
    if (!behavior) {
      continue;
    }
    // `vars` and `description` are optional in result entries; never let a
    // missing field abort the whole report.
    const docs = result.vars?.docs;
    const descriptionText =
      typeof result.description === "string" ? result.description : "";
    tasks.push({
      behavior,
      description: result.description,
      feature: detectFeatureArea(descriptionText),
      hasDocs: typeof docs === "string" && docs.trim().length > 0,
    });
  }
  if (tasks.length === 0) {
    return { features: [], hasData: false, tasks: [] };
  }

  // Group by feature
  const byFeature = new Map();
  for (const task of tasks) {
    const bucket = byFeature.get(task.feature);
    if (bucket) {
      bucket.push(task);
    } else {
      byFeature.set(task.feature, [task]);
    }
  }

  // Missing list/number fields in a summary count as empty / zero so one
  // partial summary cannot poison the aggregates.
  const uniqueValues = (featureTasks, key) => [
    ...new Set(featureTasks.flatMap((t) => t.behavior[key] ?? [])),
  ];
  const average = (featureTasks, key) =>
    featureTasks.reduce((sum, t) => sum + (t.behavior[key] ?? 0), 0) /
    featureTasks.length;

  const features = [...byFeature]
    .map(([feature, featureTasks]) => {
      const allDocSlugs = uniqueValues(featureTasks, "docSlugsVisited");
      const canonicalSlugs = CANONICAL_DOC_MAP[feature] || [];
      // A canonical slug counts as covered when it appears as a substring
      // of any visited slug.
      const matchedCanonical = canonicalSlugs.filter((slug) =>
        allDocSlugs.some((visited) => visited.includes(slug)),
      );
      const canonicalCoverage =
        canonicalSlugs.length > 0
          ? matchedCanonical.length / canonicalSlugs.length
          : 0;
      return {
        allDocSlugs,
        allExternalDomains: uniqueValues(featureTasks, "externalDomains"),
        allSearchQueries: uniqueValues(featureTasks, "uniqueSearchQueries"),
        avgDocPages: average(featureTasks, "docPagesVisited"),
        avgNetworkMs: average(featureTasks, "totalNetworkMs"),
        avgSearches: average(featureTasks, "searchesPerformed"),
        canonicalCoverage,
        canonicalSlugs,
        feature,
        tasks: featureTasks,
      };
    })
    .sort((a, b) => a.feature.localeCompare(b.feature));
  return { features, hasData: true, tasks };
}
130
/**
 * Classify a task description into a feature area by keyword matching.
 *
 * Matching is case-insensitive and substring-based. Rules are checked in
 * order and the first match wins, so a description mentioning both "studio"
 * and "webhook" is classified as "studio-setup".
 *
 * @param {string} [description] - Free-text task description.
 * @returns {string} One of "studio-setup", "visual-editing", "functions",
 *   "nextjs-live", "frameworks", or "other" when nothing matches or the
 *   description is missing / not a string.
 */
function detectFeatureArea(description) {
  // Descriptions are optional on result entries; treat anything that is not
  // a string as unclassifiable instead of throwing on `.toLowerCase()`.
  if (typeof description !== "string") {
    return "other";
  }
  const desc = description.toLowerCase();
  // Order matters: earlier rules take precedence over later ones.
  const rules = [
    ["studio-setup", ["studio"]],
    ["visual-editing", ["visual", "presentation", "live preview"]],
    ["functions", ["function", "webhook"]],
    ["nextjs-live", ["next", "app router"]],
    ["frameworks", ["remix", "nuxt", "svelte"]],
  ];
  for (const [feature, keywords] of rules) {
    if (keywords.some((keyword) => desc.includes(keyword))) {
      return feature;
    }
  }
  return "other";
}
148
+ // ---------------------------------------------------------------------------
149
+ // Main
150
+ // ---------------------------------------------------------------------------
151
/**
 * CLI entry point.
 *
 * Reads the results file given as the first CLI argument (defaulting to
 * `results/latest/eval-results.json` under the package root), prints the
 * behavior report to stdout, and writes the same data as JSON to
 * `results/latest/agent-behavior-report.json`.
 *
 * Exits with code 1 when the results file does not exist, and with code 0
 * (after printing a hint) when the file holds no agent behavior data.
 */
function main() {
  // fileURLToPath() decodes percent-escapes (e.g. "%20" for spaces) and
  // yields a valid path on Windows; URL#pathname does neither.
  const ROOT = join(dirname(fileURLToPath(import.meta.url)), "..", "..");
  const resultsPath =
    process.argv[2] || join(ROOT, "results", "latest", "eval-results.json");
  if (!existsSync(resultsPath)) {
    console.error(`Results file not found: ${resultsPath}`);
    console.error("Run an evaluation first: pnpm eval:observed");
    process.exit(1);
  }
  console.log(`Reading results from: ${resultsPath}`);
  console.log();

  const analysis = analyzeResults(resultsPath);
  if (!analysis.hasData) {
    console.log("No agent behavior data found in the results.");
    console.log("Make sure you ran the evaluation with the observed config:");
    console.log(" pnpm eval:observed");
    process.exit(0);
  }
  printReport(analysis);

  // Persist detailed report as JSON
  const outDir = join(ROOT, "results", "latest");
  mkdirSync(outDir, { recursive: true });
  const reportData = {
    features: analysis.features.map((f) => ({
      avgDocPages: f.avgDocPages,
      avgNetworkMs: f.avgNetworkMs,
      avgSearches: f.avgSearches,
      canonicalCoverage: f.canonicalCoverage,
      canonicalSlugs: f.canonicalSlugs,
      docSlugsVisited: f.allDocSlugs,
      externalDomains: f.allExternalDomains,
      feature: f.feature,
      searchQueries: f.allSearchQueries,
      taskCount: f.tasks.length,
    })),
    tasks: analysis.tasks.map((t) => ({
      behavior: t.behavior,
      description: t.description,
      feature: t.feature,
      hasDocs: t.hasDocs,
    })),
    timestamp: new Date().toISOString(),
    totalTasks: analysis.tasks.length,
  };
  writeFileSync(
    join(outDir, "agent-behavior-report.json"),
    JSON.stringify(reportData, null, 2),
  );
  console.log("Agent behavior report written to results/latest/agent-behavior-report.json");
}
197
+ // ---------------------------------------------------------------------------
198
+ // Report output
199
+ // ---------------------------------------------------------------------------
200
/**
 * Print the human-readable behavior report to stdout.
 *
 * Sections, in order: overview table, canonical documentation coverage,
 * search strategy (only when any query exists), per-task detail, external
 * domains (only when any exist), and overall statistics.
 *
 * @param {{features: object[], tasks: object[]}} analysis - Aggregated data
 *   in the shape produced by analyzeResults().
 * @returns {void}
 */
function printReport(analysis) {
  const { features, tasks } = analysis;
  const blank = () => console.log();
  const section = (title) => {
    console.log(title);
    console.log("-".repeat(80));
  };
  const percent = (ratio, digits) => (ratio * 100).toFixed(digits);

  const banner = "=".repeat(80);
  console.log(banner);
  console.log(" AGENT BEHAVIOR OBSERVATION REPORT");
  console.log(banner);
  blank();

  // ---- Overview table ----
  section("OVERVIEW BY FEATURE AREA");
  console.log("| Feature Area | Tasks | Avg Docs | Avg Search | Avg Net(ms) | Canon% |");
  console.log("|---------------------|-------|----------|------------|-------------|--------|");
  for (const f of features) {
    const cells = [
      f.feature.padEnd(19),
      f.tasks.length.toString().padStart(5),
      f.avgDocPages.toFixed(1).padStart(8),
      f.avgSearches.toFixed(1).padStart(10),
      Math.round(f.avgNetworkMs).toString().padStart(11),
      `${percent(f.canonicalCoverage, 0).padStart(5)}%`,
    ];
    console.log(`| ${cells.join(" | ")} |`);
  }
  blank();

  // ---- Canonical coverage breakdown ----
  section("CANONICAL DOCUMENTATION COVERAGE");
  blank();
  for (const f of features) {
    console.log(` ${f.feature} (${percent(f.canonicalCoverage, 0)}% canonical coverage):`);
    if (f.canonicalSlugs.length === 0) {
      console.log(" (no canonical docs defined)");
    }
    for (const slug of f.canonicalSlugs) {
      const wasVisited = f.allDocSlugs.some((visited) => visited.includes(slug));
      console.log(` ${wasVisited ? "[x]" : "[ ]"} ${slug}`);
    }
    // Visited slugs that match none of the canonical fragments.
    const extras = f.allDocSlugs.filter(
      (slug) => !f.canonicalSlugs.some((canonical) => slug.includes(canonical)),
    );
    if (extras.length > 0) {
      console.log(" Additional docs visited:");
      extras.forEach((slug) => console.log(` + ${slug}`));
    }
    blank();
  }

  // ---- Search strategy ----
  const featuresWithQueries = features.filter((f) => f.allSearchQueries.length > 0);
  if (featuresWithQueries.length > 0) {
    section("SEARCH STRATEGY");
    blank();
    for (const f of featuresWithQueries) {
      console.log(` ${f.feature}:`);
      f.allSearchQueries.forEach((query) => console.log(` -> "${query}"`));
    }
    blank();
  }

  // ---- Per-task detail ----
  section("PER-TASK DETAIL");
  blank();
  for (const f of features) {
    console.log(` ## ${f.feature}`);
    blank();
    for (const { behavior, description, hasDocs } of f.tasks) {
      console.log(` ${hasDocs ? "[gold]" : "[baseline]"} ${description}`);
      const counters = [
        `Requests: ${behavior.totalRequests}`,
        `Doc pages: ${behavior.docPagesVisited}`,
        `Searches: ${behavior.searchesPerformed}`,
        `External: ${behavior.externalRequestCount}`,
      ];
      console.log(` ${counters.join(" | ")}`);
      if (behavior.docSlugsVisited.length > 0) {
        console.log(` Docs: ${behavior.docSlugsVisited.join(", ")}`);
      }
      if (behavior.uniqueSearchQueries.length > 0) {
        const quoted = behavior.uniqueSearchQueries.map((query) => `"${query}"`);
        console.log(` Queries: ${quoted.join(", ")}`);
      }
      blank();
    }
  }

  // ---- External domains ----
  const domains = new Set(features.flatMap((f) => f.allExternalDomains));
  if (domains.size > 0) {
    section("EXTERNAL DOMAINS");
    blank();
    for (const domain of domains) {
      console.log(` - ${domain}`);
    }
    blank();
  }

  // ---- Summary stats ----
  section("OVERALL STATISTICS");
  blank();
  const totalTasks = tasks.length;
  const countWhere = (flag) => tasks.filter((t) => t.behavior[flag]).length;
  const withDocs = countWhere("usedDocs");
  const withSearch = countWhere("usedSearch");
  let coverageSum = 0;
  for (const f of features) {
    coverageSum += f.canonicalCoverage;
  }
  const avgCanonical = coverageSum / (features.length || 1);
  console.log(` Total tasks observed: ${totalTasks}`);
  console.log(` Tasks that used docs: ${withDocs}/${totalTasks} (${percent(withDocs / totalTasks, 0)}%)`);
  console.log(` Tasks that used search: ${withSearch}/${totalTasks} (${percent(withSearch / totalTasks, 0)}%)`);
  console.log(` Avg canonical coverage: ${percent(avgCanonical, 1)}%`);
  blank();
}
315
+ main();
@@ -0,0 +1,43 @@
1
+ /**
2
+ * baseline.ts
3
+ *
4
+ * Manages historical baseline snapshots of evaluation scores.
5
+ * Allows saving, comparing, and listing score baselines over time.
6
+ *
7
+ * Usage:
8
+ * pnpm baseline:save # save current scores as baseline
9
+ * pnpm baseline:save --tag "pre-groq" # save with a descriptive tag
10
+ * pnpm baseline:compare # compare current vs latest baseline
11
+ * pnpm baseline:history # list all saved baselines
12
+ */
13
+ interface BaselineMetadata { // Summary of one saved baseline file; element type returned by listBaselines().
14
+ areaCount: number; // Number of entries in the baseline's `scores` array.
15
+ avgScore: number; // Overall average score, rounded to an integer.
16
+ filename: string; // File name inside the baselines directory (no directory part).
17
+ graderCost?: number; // `overall.cost.graderTotal` from the file, when present.
18
+ tag?: string; // Tag stored in `baselineMeta.tag` when the baseline was saved with one.
19
+ timestamp: string; // `timestamp` field copied from the baseline file.
20
+ totalCost?: number; // `overall.cost.total` from the file, when present.
21
+ }
22
+ interface CompareResult { // Outcome of compareBaseline().
23
+ comparisons?: ScoreComparison[]; // Per-feature rows, sorted by delta descending; set when `success` is true.
24
+ message: string; // Human-readable status or failure reason.
25
+ overallDelta?: number; // Rounded current average minus rounded baseline average; set when `success` is true.
26
+ success: boolean; // False when the current summary or the baseline could not be found.
27
+ }
28
+ interface ScoreComparison { // One feature area's current-vs-baseline comparison.
29
+ baseline: number; // Baseline score; 0 when the feature is absent from the baseline.
30
+ costBaseline?: number; // Baseline cost; omitted unless greater than 0.
31
+ costCurrent?: number; // Current cost; omitted unless greater than 0.
32
+ costDelta?: number; // Current cost minus baseline cost; omitted when neither cost is greater than 0.
33
+ current: number; // Current score; 0 when the feature exists only in the baseline.
34
+ delta: number; // `current - baseline`.
35
+ feature: string; // Feature area name.
36
+ }
37
+ export declare function compareBaseline(baselineFile?: string): CompareResult; // Compares current scores with `baselineFile`, or the newest baseline when omitted.
38
+ export declare function listBaselines(): BaselineMetadata[]; // Lists saved baselines, newest first by file name.
39
+ export declare function saveBaseline(tag?: string): { // Saves the current score summary as a new baseline file.
40
+ success: boolean;
41
+ message: string;
42
+ };
43
+ export {};
@@ -0,0 +1,267 @@
1
+ /**
2
+ * baseline.ts
3
+ *
4
+ * Manages historical baseline snapshots of evaluation scores.
5
+ * Allows saving, comparing, and listing score baselines over time.
6
+ *
7
+ * Usage:
8
+ * pnpm baseline:save # save current scores as baseline
9
+ * pnpm baseline:save --tag "pre-groq" # save with a descriptive tag
10
+ * pnpm baseline:compare # compare current vs latest baseline
11
+ * pnpm baseline:history # list all saved baselines
12
+ */
13
+ import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from "fs";
14
+ import { dirname, join, resolve } from "path";
15
+ import { fileURLToPath } from "url";
16
// Paths are anchored at the directory two levels above this module's own
// directory, so they do not depend on the process working directory.
const __dirname = dirname(fileURLToPath(import.meta.url));
const ROOT = resolve(__dirname, "..", "..");
// Directory holding saved baseline snapshots.
const BASELINES_DIR = join(ROOT, "results", "baselines");
// Score summary of the most recent run.
const SCORE_SUMMARY_PATH = join(
  ROOT,
  "results",
  "latest",
  "score-summary.json",
);
20
+ // ---------------------------------------------------------------------------
21
+ // Compare
22
+ // ---------------------------------------------------------------------------
23
/**
 * Compare the current score summary against a saved baseline.
 *
 * @param {string} [baselineFile] - File name inside the baselines directory.
 *   When omitted, the first entry returned by listBaselines() is used.
 * @returns {{comparisons?: object[], message: string, overallDelta?: number, success: boolean}}
 *   On failure only `message` and `success: false` are set. On success,
 *   `comparisons` holds one row per feature (sorted by delta, largest gain
 *   first) and `overallDelta` is the difference of the rounded averages.
 */
export function compareBaseline(baselineFile) {
  const failure = (message) => ({ message, success: false });
  const loadJson = (path) => JSON.parse(readFileSync(path, "utf-8"));

  if (!existsSync(SCORE_SUMMARY_PATH)) {
    return failure("No current score-summary.json found.");
  }

  // Find baseline to compare against
  const savedBaselines = listBaselines();
  if (savedBaselines.length === 0) {
    return failure("No baselines saved yet. Run 'pnpm baseline:save' first.");
  }
  const targetFile = baselineFile ?? savedBaselines[0].filename;
  const baselinePath = join(BASELINES_DIR, targetFile);
  if (!existsSync(baselinePath)) {
    return failure(`Baseline file not found: ${targetFile}`);
  }

  const current = loadJson(SCORE_SUMMARY_PATH);
  const baseline = loadJson(baselinePath);

  // Index baseline scores and costs by feature name.
  const scoreByFeature = new Map();
  const costByFeature = new Map();
  for (const entry of baseline.scores) {
    scoreByFeature.set(entry.feature, entry.totalScore);
    costByFeature.set(entry.feature, entry.totalCost ?? 0);
  }

  const comparisons = current.scores.map((entry) => {
    const previousScore = scoreByFeature.get(entry.feature) ?? 0;
    const costNow = entry.totalCost ?? 0;
    const costThen = costByFeature.get(entry.feature) ?? 0;
    // Cost fields stay undefined when there is no positive cost to report.
    const hasAnyCost = costNow > 0 || costThen > 0;
    return {
      baseline: previousScore,
      costBaseline: costThen > 0 ? costThen : undefined,
      costCurrent: costNow > 0 ? costNow : undefined,
      costDelta: hasAnyCost ? costNow - costThen : undefined,
      current: entry.totalScore,
      delta: entry.totalScore - previousScore,
      feature: entry.feature,
    };
  });

  // Check for areas in baseline but not in current
  const currentFeatures = new Set(comparisons.map((row) => row.feature));
  for (const [feature, score] of scoreByFeature) {
    if (currentFeatures.has(feature)) {
      continue;
    }
    comparisons.push({
      baseline: score,
      current: 0,
      delta: -score,
      feature,
    });
  }

  comparisons.sort((left, right) => right.delta - left.delta);
  const roundedCurrent = Math.round(current.overall.avgScore);
  const roundedBaseline = Math.round(baseline.overall.avgScore);
  return {
    comparisons,
    message: `Compared against ${targetFile}`,
    overallDelta: roundedCurrent - roundedBaseline,
    success: true,
  };
}
84
/**
 * List every saved baseline with its headline numbers.
 *
 * Reads and parses each `.json` file in the baselines directory. Files are
 * ordered by file name, descending, so the newest baseline comes first.
 *
 * @returns {object[]} One metadata record per baseline file; an empty array
 *   when the baselines directory does not exist.
 */
export function listBaselines() {
  if (!existsSync(BASELINES_DIR)) {
    return [];
  }
  const newestFirst = readdirSync(BASELINES_DIR)
    .filter((name) => name.endsWith(".json"))
    .sort()
    .reverse(); // Newest first

  const summaries = [];
  for (const filename of newestFirst) {
    const data = JSON.parse(
      readFileSync(join(BASELINES_DIR, filename), "utf-8"),
    );
    summaries.push({
      areaCount: data.scores.length,
      avgScore: Math.round(data.overall.avgScore),
      filename,
      graderCost: data.overall.cost?.graderTotal,
      tag: data.baselineMeta?.tag,
      timestamp: data.timestamp,
      totalCost: data.overall.cost?.total,
    });
  }
  return summaries;
}
106
/**
 * Snapshot the current score summary as a new baseline file.
 *
 * The snapshot is the summary plus a `baselineMeta` object holding the save
 * time and the optional tag.
 *
 * @param {string} [tag] - Optional label. It is appended to the file name
 *   (lower-cased, with characters other than letters, digits and "-"
 *   replaced by "-") and stored verbatim in `baselineMeta.tag`.
 * @returns {{message: string, success: boolean}} `success` is false when no
 *   score summary exists yet.
 */
export function saveBaseline(tag) {
  if (!existsSync(SCORE_SUMMARY_PATH)) {
    return {
      message: "No score-summary.json found. Run 'pnpm calculate-scores' first.",
      success: false,
    };
  }
  const summary = JSON.parse(readFileSync(SCORE_SUMMARY_PATH, "utf-8"));
  mkdirSync(BASELINES_DIR, { recursive: true });

  // File name: YYYYMMDD_HH_mm_ss[_tag].json, built from the UTC ISO timestamp.
  const savedAt = new Date();
  const isoSeconds = savedAt.toISOString().slice(0, 19);
  const stamp = isoSeconds.replace(/[T:]/g, "_").replace(/-/g, "");
  let filename = stamp;
  if (tag) {
    filename += `_${tag.replace(/[^a-z0-9-]/gi, "-").toLowerCase()}`;
  }
  filename += ".json";

  const snapshot = {
    ...summary,
    baselineMeta: {
      savedAt: savedAt.toISOString(),
      // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string tag should be treated as no tag
      tag: tag || undefined,
    },
  };
  writeFileSync(
    join(BASELINES_DIR, filename),
    JSON.stringify(snapshot, null, 2),
  );

  const roundedAvg = Math.round(summary.overall.avgScore);
  return {
    message: `Saved baseline to results/baselines/${filename} (avg: ${roundedAvg}, ${summary.scores.length} areas)`,
    success: true,
  };
}
141
+ // ---------------------------------------------------------------------------
142
+ // CLI
143
+ // ---------------------------------------------------------------------------
144
// Run the CLI only when this module is the script being executed.
if (
  ["baseline.ts", "baseline.js"].some((suffix) =>
    process.argv[1]?.endsWith(suffix),
  )
) {
  const args = process.argv.slice(2);
  const command = args[0] || "save";

  /** Value following `--<name>` on the command line, or undefined. */
  const getArg = (name) => {
    const flagIndex = args.indexOf(`--${name}`);
    if (flagIndex === -1 || flagIndex + 1 >= args.length) {
      return undefined;
    }
    return args[flagIndex + 1];
  };

  /** Trend emoji for a numeric delta. */
  const trendIcon = (delta) => (delta > 0 ? "📈" : delta < 0 ? "📉" : "➡️");

  /** Score delta as "+n", "=" or the plain (negative) number. */
  const formatPoints = (delta) => {
    if (delta > 0) {
      return `+${delta}`;
    }
    return delta === 0 ? "=" : String(delta);
  };

  /** Cost delta as "+$n", "-$n" or "=" with four decimals. */
  const formatDollars = (delta) => {
    if (delta > 0) {
      return `+$${delta.toFixed(4)}`;
    }
    if (delta < 0) {
      return `-$${Math.abs(delta).toFixed(4)}`;
    }
    return "=";
  };

  /** Column header shared by the score and cost tables. */
  const printComparisonHeader = () => {
    const header = [
      " ",
      "Feature Area".padEnd(18),
      "Current".padEnd(10),
      "Baseline".padEnd(10),
      "Delta",
    ];
    console.log(header.join(""));
    console.log(" " + "-".repeat(50));
  };

  const runCompare = () => {
    const file = getArg("file");
    console.log("=== Baseline Comparison ===\n");
    const result = compareBaseline(file);
    if (!result.success) {
      console.error(` ❌ ${result.message}`);
      process.exit(1);
    }
    console.log(` ${result.message}\n`);

    printComparisonHeader();
    for (const row of result.comparisons) {
      const cells = [
        " ",
        row.feature.padEnd(18),
        String(row.current).padEnd(10),
        String(row.baseline).padEnd(10),
        `${trendIcon(row.delta)} ${formatPoints(row.delta)}`,
      ];
      console.log(cells.join(""));
    }

    // Cost comparison (only if cost data exists)
    const lacksCost = (row) =>
      row.costCurrent === undefined && row.costBaseline === undefined;
    if (!result.comparisons.every(lacksCost)) {
      console.log();
      console.log(" Cost Comparison:");
      printComparisonHeader();
      for (const row of result.comparisons) {
        if (lacksCost(row)) {
          continue;
        }
        const costDelta = row.costDelta ?? 0;
        const cells = [
          " ",
          row.feature.padEnd(18),
          `$${(row.costCurrent ?? 0).toFixed(4)}`.padEnd(10),
          `$${(row.costBaseline ?? 0).toFixed(4)}`.padEnd(10),
          `${trendIcon(costDelta)} ${formatDollars(costDelta)}`,
        ];
        console.log(cells.join(""));
      }
    }

    console.log();
    const { overallDelta } = result;
    console.log(
      ` Overall: ${trendIcon(overallDelta)} ${formatPoints(overallDelta)} points`,
    );
  };

  const runHistory = () => {
    console.log("=== Baseline History ===\n");
    const baselines = listBaselines();
    if (baselines.length === 0) {
      console.log(" No baselines saved yet.");
      return;
    }
    // The cost column is shown only when at least one baseline has cost data.
    const hasCosts = baselines.some(
      (entry) => entry.totalCost !== undefined || entry.graderCost !== undefined,
    );
    const header = [
      " ",
      "Date".padEnd(22),
      "Avg".padEnd(6),
      "Areas".padEnd(7),
      hasCosts ? "Cost".padEnd(10) : "",
      "Tag",
    ];
    console.log(header.join(""));
    console.log(" " + "-".repeat(hasCosts ? 60 : 50));
    for (const entry of baselines) {
      const combinedCost = (entry.totalCost ?? 0) + (entry.graderCost ?? 0);
      let costCell = "";
      if (hasCosts) {
        const shown = combinedCost > 0 ? `$${combinedCost.toFixed(2)}` : "-";
        costCell = shown.padEnd(10);
      }
      const cells = [
        " ",
        new Date(entry.timestamp).toLocaleString().padEnd(22),
        String(entry.avgScore).padEnd(6),
        String(entry.areaCount).padEnd(7),
        costCell,
        entry.tag ?? "",
      ];
      console.log(cells.join(""));
    }
  };

  const runSave = () => {
    const tag = getArg("tag");
    console.log("=== Saving baseline snapshot ===\n");
    const result = saveBaseline(tag);
    if (!result.success) {
      console.error(` ❌ ${result.message}`);
      process.exit(1);
    }
    console.log(` ✅ ${result.message}`);
  };

  const commands = new Map([
    ["compare", runCompare],
    ["history", runHistory],
    ["save", runSave],
  ]);
  const run = commands.get(command);
  if (run) {
    run();
  } else {
    console.error(`Unknown command: "${command}". Use: save, history, compare`);
    process.exit(1);
  }
}