@sanity/ailf 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (442)
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +51 -31
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/chronic-failures.d.ts +55 -0
  191. package/dist/pipeline/chronic-failures.js +110 -0
  192. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  193. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  194. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  195. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  196. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  197. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  198. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  199. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  200. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  201. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  202. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  203. package/dist/pipeline/compiler/config-loader.js +42 -2
  204. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  205. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  206. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  207. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  208. package/dist/pipeline/compiler/index.d.ts +2 -5
  209. package/dist/pipeline/compiler/index.js +2 -5
  210. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  211. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  212. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  213. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  217. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  218. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  219. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  220. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  224. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  225. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  226. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  227. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  230. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  231. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  232. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  233. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  250. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  251. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  252. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  253. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  254. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  255. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  256. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  257. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  269. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  270. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  271. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  272. package/dist/pipeline/compiler/task-bridge.js +92 -0
  273. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  274. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  275. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  276. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  277. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  278. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  279. package/dist/pipeline/coverage-audit.d.ts +1 -1
  280. package/dist/pipeline/coverage-audit.js +1 -1
  281. package/dist/pipeline/degradations.d.ts +1 -1
  282. package/dist/pipeline/degradations.js +1 -1
  283. package/dist/pipeline/failure-modes.d.ts +1 -1
  284. package/dist/pipeline/failure-modes.js +13 -1
  285. package/dist/pipeline/gap-analysis.d.ts +1 -1
  286. package/dist/pipeline/gap-analysis.js +3 -1
  287. package/dist/pipeline/generate-configs.d.ts +2 -2
  288. package/dist/pipeline/generate-configs.js +15 -8
  289. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  290. package/dist/pipeline/grader-compare-runner.js +7 -1
  291. package/dist/pipeline/grader-comparison.d.ts +1 -1
  292. package/dist/pipeline/grader-comparison.js +1 -1
  293. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  294. package/dist/pipeline/grader-consistency-runner.js +7 -1
  295. package/dist/pipeline/grader-consistency.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency.js +1 -1
  297. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  298. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  299. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity.js +1 -1
  301. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  302. package/dist/pipeline/grader-validate-runner.js +2 -2
  303. package/dist/pipeline/grader-validation.d.ts +1 -1
  304. package/dist/pipeline/grader-validation.js +1 -1
  305. package/dist/pipeline/map-request-to-config.js +15 -2
  306. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  307. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  308. package/dist/pipeline/plan-format.d.ts +1 -1
  309. package/dist/pipeline/plan-format.js +1 -1
  310. package/dist/pipeline/plan.d.ts +1 -1
  311. package/dist/pipeline/plan.js +67 -29
  312. package/dist/pipeline/probe.d.ts +1 -1
  313. package/dist/pipeline/probe.js +1 -1
  314. package/dist/pipeline/readiness-report.d.ts +2 -2
  315. package/dist/pipeline/readiness-report.js +2 -2
  316. package/dist/pipeline/release-classification.d.ts +1 -1
  317. package/dist/pipeline/release-classification.js +1 -1
  318. package/dist/pipeline/release-report.d.ts +1 -1
  319. package/dist/pipeline/release-report.js +1 -1
  320. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  321. package/dist/pipeline/repo-eval-comment.js +1 -1
  322. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  323. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  324. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  325. package/dist/pipeline/resolve-mappings.js +44 -44
  326. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  327. package/dist/pipeline/retrieval-metrics.js +28 -20
  328. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  329. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  330. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  331. package/dist/pipeline/steps/compare-step.js +90 -0
  332. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  333. package/dist/pipeline/steps/eval-step.js +347 -0
  334. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  335. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  336. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  338. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  339. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  340. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  341. package/dist/pipeline/steps/publish-report-step.js +243 -0
  342. package/dist/pipeline/steps/report-step.d.ts +13 -0
  343. package/dist/pipeline/steps/report-step.js +56 -0
  344. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  345. package/dist/pipeline/steps/update-scores-step.js +42 -0
  346. package/dist/pipeline/targeted-loo.d.ts +1 -1
  347. package/dist/pipeline/targeted-loo.js +1 -1
  348. package/dist/pipeline/thresholds.d.ts +1 -1
  349. package/dist/pipeline/thresholds.js +1 -1
  350. package/dist/pipeline/validate.js +13 -0
  351. package/dist/report-store.d.ts +17 -0
  352. package/dist/report-store.js +24 -0
  353. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  354. package/dist/scripts/agent-behavior-report.js +315 -0
  355. package/dist/scripts/baseline.d.ts +43 -0
  356. package/dist/scripts/baseline.js +267 -0
  357. package/dist/scripts/calculate-scores.d.ts +166 -0
  358. package/dist/scripts/calculate-scores.js +1296 -0
  359. package/dist/scripts/compare.d.ts +22 -0
  360. package/dist/scripts/compare.js +334 -0
  361. package/dist/scripts/coverage-audit.d.ts +44 -0
  362. package/dist/scripts/coverage-audit.js +209 -0
  363. package/dist/scripts/debug-eval.d.ts +19 -0
  364. package/dist/scripts/debug-eval.js +73 -0
  365. package/dist/scripts/discovery-report.d.ts +58 -0
  366. package/dist/scripts/discovery-report.js +250 -0
  367. package/dist/scripts/fetch-docs.d.ts +35 -0
  368. package/dist/scripts/fetch-docs.js +472 -0
  369. package/dist/scripts/generate-configs.d.ts +66 -0
  370. package/dist/scripts/generate-configs.js +459 -0
  371. package/dist/scripts/grader-api.d.ts +27 -0
  372. package/dist/scripts/grader-api.js +206 -0
  373. package/dist/scripts/grader-compare.d.ts +22 -0
  374. package/dist/scripts/grader-compare.js +368 -0
  375. package/dist/scripts/grader-consistency.d.ts +20 -0
  376. package/dist/scripts/grader-consistency.js +313 -0
  377. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  378. package/dist/scripts/grader-sensitivity.js +354 -0
  379. package/dist/scripts/grader-validate.d.ts +19 -0
  380. package/dist/scripts/grader-validate.js +267 -0
  381. package/dist/scripts/measure-retrieval.d.ts +10 -0
  382. package/dist/scripts/measure-retrieval.js +145 -0
  383. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  384. package/dist/scripts/migrate-task-mode.js +1 -1
  385. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  386. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  387. package/dist/scripts/pipeline.d.ts +76 -0
  388. package/dist/scripts/pipeline.js +1031 -0
  389. package/dist/scripts/pr-comment.d.ts +10 -0
  390. package/dist/scripts/pr-comment.js +510 -0
  391. package/dist/scripts/readiness-report.d.ts +88 -0
  392. package/dist/scripts/readiness-report.js +342 -0
  393. package/dist/scripts/update-quality-scores.d.ts +15 -0
  394. package/dist/scripts/update-quality-scores.js +184 -0
  395. package/dist/scripts/validate-task-sources.d.ts +1 -1
  396. package/dist/scripts/validate-task-sources.js +1 -1
  397. package/dist/scripts/validate.d.ts +13 -0
  398. package/dist/scripts/validate.js +79 -0
  399. package/dist/scripts/webhook-server.d.ts +26 -0
  400. package/dist/scripts/webhook-server.js +147 -0
  401. package/dist/scripts/weekly-digest.d.ts +24 -0
  402. package/dist/scripts/weekly-digest.js +144 -0
  403. package/dist/sinks/format-slack.d.ts +64 -0
  404. package/dist/sinks/format-slack.js +306 -0
  405. package/dist/sinks/slack-sink.d.ts +27 -0
  406. package/dist/sinks/slack-sink.js +78 -0
  407. package/dist/sinks/types.d.ts +1 -1
  408. package/dist/sinks/types.js +1 -1
  409. package/dist/sinks/webhook-sink.d.ts +19 -0
  410. package/dist/sinks/webhook-sink.js +50 -0
  411. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  412. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  413. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  414. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  415. package/dist/tasks/literacy/functions.task.ts +70 -0
  416. package/dist/tasks/literacy/groq.task.ts +259 -0
  417. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  418. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  419. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  420. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  421. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  422. package/package.json +24 -24
  423. package/tasks/.expanded.agentic.yaml +280 -0
  424. package/tasks/.expanded.yaml +565 -0
  425. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  426. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  427. package/tasks/literacy/content-lake.task.ts +181 -0
  428. package/tasks/literacy/frameworks.task.ts +1 -0
  429. package/tasks/literacy/functions.task.ts +1 -0
  430. package/tasks/literacy/groq.task.ts +1 -0
  431. package/tasks/literacy/image-handling.task.ts +95 -0
  432. package/tasks/literacy/nextjs-live.task.ts +2 -1
  433. package/tasks/literacy/portable-text.task.ts +169 -0
  434. package/tasks/literacy/studio-setup.task.ts +5 -2
  435. package/tasks/literacy/visual-editing.task.ts +1 -0
  436. package/LICENSE +0 -21
  437. package/tasks/frameworks.yaml +0 -98
  438. package/tasks/functions.yaml +0 -51
  439. package/tasks/groq.yaml +0 -216
  440. package/tasks/nextjs-live.yaml +0 -62
  441. package/tasks/studio-setup.yaml +0 -111
  442. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,1296 @@
1
+ /**
2
+ * Calculate-scores.ts
3
+ *
4
+ * Reads Promptfoo evaluation output and computes the AI Literacy Score
5
+ * for each feature area. Each dimension is scored on a uniform 0–100 scale:
6
+ *
7
+ * Task Completion (0–100) — Can the LLM implement the feature?
8
+ * Code Correctness (0–100) — Is the code idiomatic and correct?
9
+ * Doc Coverage (0–100) — Did docs provide the needed info?
10
+ *
11
+ * Dimensions are combined into a weighted composite (0–100) using weights
12
+ * from config/rubrics.yaml (default: Task×0.50 + Code×0.25 + Docs×0.25).
13
+ *
14
+ * Additionally compares with-docs vs without-docs scores to calculate
15
+ * the "Doc Lift" — how much documentation helps vs parametric knowledge.
16
+ *
17
+ * When tests are run with the InstrumentedProvider (agent-observer),
18
+ * this script also aggregates and reports agent behavior data: which
19
+ * documentation pages were visited, what searches were performed, and
20
+ * overall network activity patterns.
21
+ */
22
+ // oxlint-disable-next-line import/no-unassigned-import -- side-effect: loads .env into process.env
23
+ import "dotenv/config";
24
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
25
+ import { dirname, join } from "path";
26
+ import { calculateCost } from "../agent-observer/pricing.js";
27
+ import { checkResultsExist } from "../pipeline/checks.js";
28
+ import { loadRubricTemplates } from "../pipeline/expand-tasks.js";
29
+ import { loadSource } from "../sources.js";
30
+ import { analyzeSourceIsolation, } from "../assertions/source-isolation.js";
31
/**
 * Score the evaluation results separately for every model (provider).
 *
 * Results are bucketed by `providerId`, falling back to `providerLabel` and
 * finally to the literal "unknown". The display label uses the opposite
 * preference (`providerLabel` first) and is taken from the first result seen
 * for each model.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @param weights Dimension weights, forwarded unchanged to scoreResults
 * @returns An array with one `{ label, modelId, overall, scores }` entry per
 * model, or null when at most one model is present (the per-model breakdown
 * would be identical to the overall scores).
 */
export function calculateScoresPerModel(resultsPath, weights) {
    // Bucket the normalized results by model identifier.
    const groups = {};
    for (const entry of readAndNormalizeResults(resultsPath)) {
        const modelId = entry.providerId ?? entry.providerLabel ?? "unknown";
        if (!groups[modelId]) {
            groups[modelId] = {
                label: entry.providerLabel ?? entry.providerId ?? "unknown",
                results: [],
            };
        }
        groups[modelId].results.push(entry);
    }
    // A single model adds no information beyond the overall scores.
    if (Object.keys(groups).length <= 1) {
        return null;
    }
    const sumOf = (items, pick) => items.reduce((acc, item) => acc + pick(item), 0);
    const meanOf = (items, pick) => items.length > 0 ? sumOf(items, pick) / items.length : 0;
    return Object.entries(groups).map(([modelId, group]) => {
        const scores = scoreResults(group.results, weights, modelId);
        const cost = sumOf(scores, (sc) => sc.totalCost);
        return {
            label: group.label,
            modelId,
            overall: {
                avgDocLift: meanOf(scores, (sc) => sc.docLift),
                avgScore: meanOf(scores, (sc) => sc.totalScore),
                // A zero cost is reported as "no cost data".
                cost: cost > 0 ? cost : undefined,
                testCount: sumOf(scores, (sc) => sc.testCount),
            },
            scores,
        };
    });
}
84
/**
 * Map a graded assertion component to the scoring dimension it measures.
 *
 * Structured metadata (`assertion.metadata.dimension`) wins when present; an
 * unrecognized dimension value yields null without falling back to the
 * heuristic. Otherwise the rubric text (`assertion.value`) is matched
 * case-insensitively against known phrases for backward compatibility.
 *
 * @param component A component result carrying an optional `assertion`
 * @returns "taskCompletion", "codeCorrectness", "docCoverage", or null when
 * the component cannot be classified
 */
export function classifyRubric(component) {
    // Prefer structured metadata (Approach 5) over heuristic matching
    const metadata = component.assertion?.metadata;
    if (metadata?.dimension) {
        switch (metadata.dimension) {
            case "code-correctness":
                return "codeCorrectness";
            case "doc-coverage":
                return "docCoverage";
            case "task-completion":
                return "taskCompletion";
            default:
                return null;
        }
    }
    // Fallback: heuristic name matching (for backward compatibility).
    // Assertion values are not guaranteed to be strings (they may be arrays,
    // objects or functions); only text can be matched, anything else is
    // unclassifiable rather than a crash.
    const rawValue = component.assertion?.value;
    if (typeof rawValue !== "string") {
        return null;
    }
    const value = rawValue.toLowerCase();
    if (value.includes("task completion")) {
        return "taskCompletion";
    }
    if (value.includes("code correctness")) {
        return "codeCorrectness";
    }
    if (value.includes("documentation coverage") ||
        value.includes("hallucinate")) {
        return "docCoverage";
    }
    return null;
}
113
/**
 * Derive the feature area of a test from its free-text description.
 *
 * The description is lower-cased and checked against an ordered list of
 * keyword rules; the first rule that applies wins, so rule order matters
 * (e.g. a description mentioning both "studio" and "next" is "studio-setup").
 *
 * @param description Test description text
 * @returns The feature area slug, or "other" when no rule applies
 */
export function detectFeatureArea(description) {
    const text = description.toLowerCase();
    const has = (needle) => text.includes(needle);
    // Ordered: earlier rules take precedence over later ones.
    const rules = [
        ["studio-setup", () => has("studio")],
        ["visual-editing", () => has("visual") || has("presentation") || has("live preview")],
        ["functions", () => has("function") || has("webhook")],
        // GROQ is only recognized as a prefix, not anywhere in the text.
        ["groq", () => text.startsWith("groq")],
        ["nextjs-live", () => has("next") || has("app router")],
        ["frameworks", () => has("remix") || has("nuxt") || has("svelte")],
    ];
    const hit = rules.find(([, applies]) => applies());
    return hit ? hit[0] : "other";
}
139
+ // ---------------------------------------------------------------------------
140
+ // Grader judgment and URL extraction from assertion metadata
141
+ // ---------------------------------------------------------------------------
142
/**
 * Collect the grader's judgments (reasoning text plus score) from a results file.
 *
 * One judgment is produced for every `llm-rubric` component that
 * classifyRubric can assign to a dimension; unclassifiable components are
 * dropped. The reasoning is kept so downstream analysis (failure mode
 * classification, gap analysis) can work from the grader's own words.
 *
 * Phase 3a prerequisite: structured judgment data for failure mode extraction.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns Array of `{ dimension, modelId, reason, score, taskId }` where
 * `dimension` is the hyphenated dimension name and `taskId` is the test description
 */
export function extractGraderJudgments(resultsPath) {
    // Internal dimension names -> hyphenated form used in the output.
    const hyphenatedNames = {
        codeCorrectness: "code-correctness",
        docCoverage: "doc-coverage",
        taskCompletion: "task-completion",
    };
    // The grader may return its reasoning wrapped in JSON ({ "reason": "..." }).
    // Unwrap it when possible; otherwise keep the raw text.
    const readReason = (rawReason) => {
        const fallback = rawReason ?? "";
        if (!fallback) {
            return fallback;
        }
        try {
            const decoded = JSON.parse(fallback);
            return typeof decoded.reason === "string" ? decoded.reason : fallback;
        }
        catch {
            // Not JSON (or not an object) — use the raw reason string
            return fallback;
        }
    };
    const collected = [];
    for (const entry of readAndNormalizeResults(resultsPath)) {
        const modelId = entry.providerId ?? entry.providerLabel ?? "unknown";
        for (const component of entry.gradingResult.componentResults) {
            const kind = component.assertion?.type === "llm-rubric"
                ? classifyRubric(component)
                : null;
            if (kind) {
                collected.push({
                    dimension: hyphenatedNames[kind] ?? kind,
                    modelId,
                    reason: readReason(component.reason),
                    score: parseRubricScore(component),
                    taskId: entry.description,
                });
            }
        }
    }
    return collected;
}
199
/**
 * Finds the URL-extraction assertion result in a test's componentResults
 * and parses the structured JSON from its `reason` field.
 *
 * A component qualifies when it is a `javascript` assertion whose `reason`
 * is JSON containing an array `sanityUrls` and a numeric `totalUrlCount`.
 * The first qualifying component wins.
 *
 * @param test A test result; `gradingResult.componentResults` may be absent
 * @returns The parsed metadata object, or null when no component qualifies
 */
export function extractUrlMetadata(test) {
    // componentResults is optional on a grading result; treat a missing or
    // malformed list as "no components" instead of throwing.
    const candidates = test.gradingResult?.componentResults;
    const components = Array.isArray(candidates) ? candidates : [];
    for (const comp of components) {
        if (comp.assertion?.type !== "javascript" || !comp.reason) {
            continue;
        }
        try {
            const parsed = JSON.parse(comp.reason);
            if (Array.isArray(parsed?.sanityUrls) &&
                typeof parsed.totalUrlCount === "number") {
                return parsed;
            }
        }
        catch {
            // Not the URL-extraction assertion - skip
        }
    }
    return null;
}
225
+ // ---------------------------------------------------------------------------
226
+ // Per-model scoring (Approach 3 from evaluation roadmap)
227
+ // ---------------------------------------------------------------------------
228
/**
 * Extract the numeric score from a graded rubric component.
 *
 * Resolution order:
 *   1. `component.score` when it is a number.
 *   2. `component.reason` parsed as JSON: either a bare number ("85") or an
 *      object with a numeric `score` field.
 *   3. When `reason` is not valid JSON, the first run of digits in the text.
 *
 * @param component A component result with optional `score` and `reason`
 * @returns The extracted score, or 0 when none can be found
 */
export function parseRubricScore(component) {
    // Direct score field
    if (typeof component.score === "number") {
        return component.score;
    }
    const { reason } = component;
    if (!reason) {
        return 0;
    }
    // Try to extract from reason (LLM rubric returns JSON)
    try {
        const parsed = JSON.parse(reason);
        // A bare number is valid JSON, so it never reaches the catch branch;
        // it has to be handled here.
        if (typeof parsed === "number") {
            return parsed;
        }
        if (typeof parsed?.score === "number") {
            return parsed.score;
        }
    }
    catch {
        // Not JSON — fall back to the first integer in the text. String()
        // guards against a non-string reason, which has no .match().
        const match = /(\d+)/.exec(String(reason));
        if (match) {
            return Number.parseInt(match[1], 10);
        }
    }
    return 0;
}
252
/**
 * Summarize agent behavior per feature area across every test result.
 *
 * Only results that carry a behavior summary (see extractAgentBehavior)
 * contribute. Numeric fields are averaged over the contributing tests; slug,
 * domain and search-query lists are merged and de-duplicated.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns Per-feature summaries sorted by feature name, or null when no
 * result contains behavior data
 */
function aggregateAgentBehavior(resultsPath) {
    const grouped = {};
    for (const entry of readAndNormalizeResults(resultsPath)) {
        const area = detectFeatureArea(entry.description);
        const summary = extractAgentBehavior(entry);
        if (summary) {
            if (!grouped[area]) {
                grouped[area] = [];
            }
            grouped[area].push(summary);
        }
    }
    const groups = Object.entries(grouped);
    // No group means no result carried behavior data.
    if (groups.length === 0) {
        return null;
    }
    const mean = (list, pick) => list.reduce((acc, item) => acc + pick(item), 0) / (list.length || 1);
    const distinct = (list, pick) => [...new Set(list.flatMap(pick))];
    const summaries = groups.map(([feature, behaviors]) => ({
        avgDocPagesVisited: mean(behaviors, (b) => b.docPagesVisited),
        avgNetworkTimeMs: mean(behaviors, (b) => b.totalNetworkMs),
        avgSearchesPerformed: mean(behaviors, (b) => b.searchesPerformed),
        docSlugsVisited: distinct(behaviors, (b) => b.docSlugsVisited),
        externalDomains: distinct(behaviors, (b) => b.externalDomains),
        feature,
        searchQueries: distinct(behaviors, (b) => b.uniqueSearchQueries),
        tasksWithBehaviorData: behaviors.length,
    }));
    return summaries.sort((left, right) => left.feature.localeCompare(right.feature));
}
297
/**
 * Compute aggregate source isolation metrics for a results file.
 *
 * The allowed origins come from the comma-separated DOC_ALLOWED_ORIGINS
 * environment variable (set by pipeline.ts). Every doc page visit recorded
 * under `metadata.agentBehavior.docPageVisits` is collected and handed to
 * analyzeSourceIsolation together with those origins.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns The analyzeSourceIsolation result, or null when no origins are
 * configured or no doc page visits were recorded
 */
function aggregateSourceIsolation(resultsPath) {
    // An unset or blank variable yields an empty list.
    const allowedOrigins = (process.env.DOC_ALLOWED_ORIGINS || "")
        .split(",")
        .map((origin) => origin.trim())
        .filter(Boolean);
    if (allowedOrigins.length === 0) {
        return null;
    }
    const visits = [];
    for (const entry of readAndNormalizeResults(resultsPath)) {
        const pageVisits = entry.metadata?.agentBehavior?.docPageVisits;
        if (pageVisits) {
            for (const { url } of pageVisits) {
                // Only the URL is forwarded to the isolation analysis.
                visits.push({ url });
            }
        }
    }
    return visits.length === 0
        ? null
        : analyzeSourceIsolation(visits, allowedOrigins);
}
335
+ // ---------------------------------------------------------------------------
336
+ // URL reference aggregation
337
+ // ---------------------------------------------------------------------------
338
/**
 * Count referenced Sanity URLs per feature area, split into the "gold"
 * (docs injected) and "baseline" (no docs) variants.
 *
 * A test belongs to the gold variant when `vars.docs` is non-blank. Tests
 * without URL-extraction metadata still create the feature entry but do not
 * add to any count.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns Array of `{ feature, baseline, gold }` sorted by feature name;
 * each variant holds `testCount` and a `urls` map of URL -> occurrence count
 */
function aggregateUrlReferences(resultsPath) {
    const perFeature = {};
    for (const entry of readAndNormalizeResults(resultsPath)) {
        const area = detectFeatureArea(entry.description);
        if (!perFeature[area]) {
            perFeature[area] = {
                baseline: { testCount: 0, urls: {} },
                gold: { testCount: 0, urls: {} },
            };
        }
        const docsProvided = entry.vars.docs && entry.vars.docs.trim().length > 0;
        const meta = extractUrlMetadata(entry);
        if (meta) {
            const target = docsProvided
                ? perFeature[area].gold
                : perFeature[area].baseline;
            target.testCount += 1;
            for (const url of meta.sanityUrls) {
                target.urls[url] = (target.urls[url] || 0) + 1;
            }
        }
    }
    return Object.entries(perFeature)
        .map(([feature, variants]) => ({ feature, ...variants }))
        .sort((left, right) => left.feature.localeCompare(right.feature));
}
370
+ // ---------------------------------------------------------------------------
371
+ // Source verification
372
+ // ---------------------------------------------------------------------------
373
/**
 * Build source verification data for the score summary.
 *
 * Combines pipeline configuration read from the environment (EVAL_MODE,
 * DOC_BASE_URL, EVAL_SEARCH_MODE, DOC_ALLOWED_ORIGINS) with the URL fetch
 * metadata in `<root>/contexts/url-fetch.json` (written by fetch-docs.ts),
 * when that file exists and has the expected shape.
 *
 * @param root Project root directory containing the `contexts` folder
 * @param source Optional source config; its `baseUrl` takes precedence over DOC_BASE_URL
 * @returns Object with `mode` and `source`, plus `allowedOrigins`,
 * `searchMode` and `urlFetch` when the corresponding data is available
 */
function buildSourceVerification(root, source) {
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
    const mode = process.env.EVAL_MODE || "baseline";
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
    const sourceUrl = source?.baseUrl || process.env.DOC_BASE_URL || "default";
    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
    const searchMode = process.env.EVAL_SEARCH_MODE || undefined;
    const allowedOriginsEnv = process.env.DOC_ALLOWED_ORIGINS;
    const allowedOrigins = allowedOriginsEnv
        ? allowedOriginsEnv
            .split(",")
            .map((o) => o.trim())
            .filter(Boolean)
        : undefined;
    // Read URL fetch metadata if it exists (written by fetch-docs.ts)
    let urlFetch;
    const urlFetchPath = join(root, "contexts", "url-fetch.json");
    if (existsSync(urlFetchPath)) {
        try {
            const parsed = JSON.parse(readFileSync(urlFetchPath, "utf-8"));
            // Valid JSON of the wrong shape (no fetchedUrls array) is treated
            // like malformed JSON: skipped rather than crashing at .map() below.
            if (parsed !== null &&
                typeof parsed === "object" &&
                Array.isArray(parsed.fetchedUrls)) {
                urlFetch = parsed;
            }
        }
        catch {
            // Unreadable file or malformed JSON — skip
        }
    }
    return {
        ...(allowedOrigins && { allowedOrigins }),
        mode,
        ...(searchMode && { searchMode }),
        source: sourceUrl,
        ...(urlFetch && {
            urlFetch: {
                failures: urlFetch.failures,
                // Keep only the fields relevant to verification.
                fetchedUrls: urlFetch.fetchedUrls.map((f) => ({
                    method: f.method,
                    status: f.status,
                    url: f.url,
                })),
                totalFailed: urlFetch.totalFailed,
                totalFetched: urlFetch.totalFetched,
            },
        }),
    };
}
424
+ // ---------------------------------------------------------------------------
425
+ // Agent behavior aggregation
426
+ // ---------------------------------------------------------------------------
427
/**
 * Score every result in the file together, without splitting by model.
 * This is the original scoring path and stays backward compatible.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @param weights Dimension weights, forwarded unchanged to scoreResults
 * @returns The per-feature scores produced by scoreResults
 */
function calculateScores(resultsPath, weights) {
    return scoreResults(readAndNormalizeResults(resultsPath), weights);
}
435
/**
 * Read the agent behavior summary stored in a test result's metadata.
 *
 * @param test A test result with optional `metadata`
 * @returns `metadata.agentBehaviorSummary`, or null when the metadata or the
 * summary is missing (the test was not run with the instrumented provider)
 */
function extractAgentBehavior(test) {
    // `||` (not `??`) so that any falsy summary is reported as null.
    return test.metadata?.agentBehaviorSummary || null;
}
446
/**
 * Extracts grader (assertion) cost from the raw Promptfoo results file.
 *
 * Token usage is read from `results.stats.tokenUsage.assertions`. The grader
 * model is read from `config.defaultTest.options.rubricProvider`, falling
 * back to `config.defaultTest.options.provider`; either may be a provider
 * string ("openai:gpt-5-2025-08-07") or an object carrying that string in `id`.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns `{ completionTokens, cost, model, promptTokens, totalTokens }`, or
 * null when no assertion tokens were recorded. `cost` is 0 and `model` is
 * undefined when the grader model cannot be determined.
 */
function extractGraderCost(resultsPath) {
    const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
    const assertions = file.results?.stats?.tokenUsage?.assertions;
    if (!assertions || assertions.total === 0) {
        return null;
    }
    // Extract grader model from config
    const options = file.config?.defaultTest?.options;
    const configured = options?.rubricProvider ?? options?.provider;
    // Providers can be configured as objects; calling .replace() on one
    // would throw, so only a string (or an object's string `id`) is accepted.
    let graderModel;
    if (typeof configured === "string") {
        graderModel = configured;
    }
    else if (typeof configured?.id === "string") {
        graderModel = configured.id;
    }
    // Extract just the model name from "openai:gpt-5-2025-08-07" format
    const modelName = graderModel?.replace(/^[^:]+:/, "");
    const cost = modelName
        ? calculateCost(modelName, assertions.prompt, assertions.completion)
        : 0;
    return {
        completionTokens: assertions.completion,
        cost,
        model: graderModel,
        promptTokens: assertions.prompt,
        totalTokens: assertions.total,
    };
}
474
/**
 * Prints a formatted report of agent behavior observations to stdout.
 *
 * The report has a summary table (one row per feature area) followed by the
 * doc pages visited, the search queries (only when at least one feature has
 * any) and the external domains contacted (only when there are any).
 *
 * @param agentBehavior Per-feature summaries as produced by aggregateAgentBehavior
 */
function printAgentBehaviorReport(agentBehavior) {
    const rule = "-".repeat(80);
    console.log(rule);
    console.log("AGENT BEHAVIOR OBSERVATION");
    console.log(rule);
    console.log();
    // Summary table. The header is padded with the same widths as the data
    // rows so the columns line up with the separator.
    const h = `| ${"Feature Area".padEnd(19)} | ` +
        `${"Tests".padStart(5)} | ` +
        `${"Doc Pages".padStart(9)} | ` +
        `${"Searches".padStart(8)} | ` +
        `${"Net (ms)".padStart(8)} |`;
    const sep = "|---------------------|-------|-----------|----------|----------|";
    console.log(h);
    console.log(sep);
    for (const ab of agentBehavior) {
        console.log(`| ${ab.feature.padEnd(19)} | ` +
            `${ab.tasksWithBehaviorData.toString().padStart(5)} | ` +
            `${ab.avgDocPagesVisited.toFixed(1).padStart(9)} | ` +
            `${ab.avgSearchesPerformed.toFixed(1).padStart(8)} | ` +
            `${Math.round(ab.avgNetworkTimeMs).toString().padStart(8)} |`);
    }
    console.log();
    // Doc pages visited
    console.log(" Doc pages visited:");
    for (const ab of agentBehavior) {
        if (ab.docSlugsVisited.length === 0) {
            console.log(` ${ab.feature}: (none)`);
        }
        else {
            console.log(` ${ab.feature}:`);
            for (const slug of ab.docSlugsVisited) {
                console.log(` - /docs/${slug}`);
            }
        }
    }
    console.log();
    // Search queries
    const hasSearches = agentBehavior.some((ab) => ab.searchQueries.length > 0);
    if (hasSearches) {
        console.log(" Search queries:");
        for (const ab of agentBehavior) {
            if (ab.searchQueries.length === 0) {
                continue;
            }
            console.log(` ${ab.feature}:`);
            for (const q of ab.searchQueries) {
                console.log(` - "${q}"`);
            }
        }
        console.log();
    }
    // External domains (de-duplicated across all features)
    const allExternalDomains = [
        ...new Set(agentBehavior.flatMap((ab) => ab.externalDomains)),
    ];
    if (allExternalDomains.length > 0) {
        console.log(" External domains contacted:");
        for (const d of allExternalDomains) {
            console.log(` - ${d}`);
        }
        console.log();
    }
}
536
+ // ---------------------------------------------------------------------------
537
+ // Report
538
+ // ---------------------------------------------------------------------------
539
+ // ---------------------------------------------------------------------------
540
+ // Grader cost extraction
541
+ // ---------------------------------------------------------------------------
542
/**
 * Reads the raw Promptfoo output file and normalizes each result into a flat
 * record: `description` (taken from `testCase.description`, "unknown" when
 * absent), `cost`, `error`, `gradingResult`, `metadata`, `provider`,
 * `providerId`, `providerLabel`, `response` and `vars`.
 *
 * Accepts both the nested layout (`{ results: { results: [...] } }`) and a
 * flat one (`{ results: [...] }`). Results without a grading result (errored
 * or timed-out tests) are dropped, with a warning listing each one.
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @returns Normalized results that have a grading result
 */
function readAndNormalizeResults(resultsPath) {
    const file = JSON.parse(readFileSync(resultsPath, "utf-8"));
    // The result rows are either file.results.results (nested) or
    // file.results itself (flat).
    const rows = Array.isArray(file.results)
        ? file.results
        : file.results?.results;
    if (!Array.isArray(rows)) {
        throw new TypeError(`No results array found in ${resultsPath}`);
    }
    const all = rows.map((r) => ({
        cost: r.cost ?? 0,
        description: r.testCase?.description ?? "unknown",
        error: r.error,
        gradingResult: r.gradingResult,
        metadata: r.metadata,
        provider: r.provider?.label ?? r.provider?.id,
        providerId: r.provider?.id,
        providerLabel: r.provider?.label,
        response: r.response,
        vars: r.vars ?? r.testCase?.vars ?? {},
    }));
    // Filter out results without a gradingResult (errored/timed-out tests).
    // Promptfoo sets gradingResult to null when a test errors before grading;
    // a missing (undefined) field is treated the same way, because every
    // consumer dereferences gradingResult.componentResults.
    const isGraded = (r) => r.gradingResult != null;
    const valid = all.filter(isGraded);
    const skipped = all.length - valid.length;
    if (skipped > 0) {
        console.warn(` ⚠ Skipping ${skipped} of ${all.length} result(s) with null gradingResult (errored tests):`);
        for (const r of all) {
            if (!isGraded(r)) {
                const providerLabel = r.provider ? `[${r.provider}] ` : "";
                // String() keeps the warning from throwing on a non-string error.
                const errorMsg = r.error
                    ? String(r.error).slice(0, 150)
                    : "unknown error (no error field in result)";
                console.warn(` ✗ ${providerLabel}"${r.description}" — ${errorMsg}`);
            }
        }
    }
    return valid;
}
579
/**
 * Core scoring logic shared by the overall and per-model scoring paths:
 * turns a list of valid test results into one FeatureScore per feature area.
 *
 * For each feature area the results are split by whether `vars.docs` is
 * non-blank:
 *   - With docs: `llm-rubric` scores are summed per dimension and divided by
 *     the number of with-docs tests, then combined into the weighted
 *     composite (the ceiling / total score).
 *   - Without docs: the floor is the plain (unweighted) mean of every
 *     `llm-rubric` score across those tests.
 * Doc Lift is the rounded ceiling minus the rounded floor. `testCount`
 * counts with-docs tests only; `totalCost` covers both groups.
 *
 * @param results Pre-filtered (valid) test results
 * @param weights Dimension weights from rubrics.yaml (defaults 0.5 / 0.25 / 0.25)
 * @param modelId Optional model identifier to tag each FeatureScore
 * @returns FeatureScore[] sorted by feature name
 */
function scoreResults(results, weights, modelId) {
    const taskWeight = weights["task-completion"] ?? 0.5;
    const codeWeight = weights["code-correctness"] ?? 0.25;
    const docWeight = weights["doc-coverage"] ?? 0.25;
    const isRubric = (component) => component.assertion?.type === "llm-rubric";
    // Bucket results by feature area and docs / no-docs variant.
    const grouped = {};
    for (const entry of results) {
        const area = detectFeatureArea(entry.description);
        if (!grouped[area]) {
            grouped[area] = { withDocs: [], withoutDocs: [] };
        }
        const docsProvided = entry.vars.docs && entry.vars.docs.trim().length > 0;
        const bucket = docsProvided
            ? grouped[area].withDocs
            : grouped[area].withoutDocs;
        bucket.push(entry);
    }
    const featureScores = Object.entries(grouped).map(([feature, { withDocs, withoutDocs }]) => {
        let totalCost = 0;
        // --- With docs: per-dimension totals ---
        const totals = { codeCorrectness: 0, docCoverage: 0, taskCompletion: 0 };
        for (const run of withDocs) {
            totalCost += run.cost;
            for (const component of run.gradingResult.componentResults) {
                if (!isRubric(component)) {
                    continue;
                }
                const points = parseRubricScore(component);
                const dimension = classifyRubric(component);
                if (dimension !== null) {
                    totals[dimension] += points;
                }
            }
        }
        // Divide by at least 1 so a feature without with-docs tests scores 0.
        const divisor = withDocs.length || 1;
        const taskAvg = totals.taskCompletion / divisor;
        const codeAvg = totals.codeCorrectness / divisor;
        const docAvg = totals.docCoverage / divisor;
        const composite = taskAvg * taskWeight + codeAvg * codeWeight + docAvg * docWeight;
        // --- Without docs (baseline): mean of all rubric scores ---
        const baselinePoints = [];
        for (const run of withoutDocs) {
            totalCost += run.cost;
            for (const component of run.gradingResult.componentResults) {
                if (isRubric(component)) {
                    baselinePoints.push(parseRubricScore(component));
                }
            }
        }
        const baselineMean = baselinePoints.length > 0
            ? baselinePoints.reduce((acc, points) => acc + points, 0) / baselinePoints.length
            : 0;
        const ceilingScore = Math.round(composite);
        const floorScore = Math.round(baselineMean);
        const docLift = ceilingScore - floorScore;
        return {
            ceilingScore,
            codeCorrectness: Math.round(codeAvg),
            docCoverage: Math.round(docAvg),
            docLift,
            docQualityGap: 100 - ceilingScore,
            feature,
            floorScore,
            ...(modelId && { modelId }),
            negativeDocLift: docLift < 0,
            taskCompletion: Math.round(taskAvg),
            testCount: withDocs.length,
            totalCost,
            totalScore: ceilingScore,
        };
    });
    return featureScores.sort((left, right) => left.feature.localeCompare(right.feature));
}
675
/**
 * Score agentic evaluation results per feature area.
 *
 * Every result in the file counts (there is no docs / no-docs split). For
 * each feature, `llm-rubric` scores are summed per dimension, divided by the
 * number of tests in that feature, and combined into a rounded weighted
 * composite (`actualScore`).
 *
 * @param resultsPath Path to the Promptfoo results JSON file
 * @param weights Dimension weights (defaults 0.5 / 0.25 / 0.25)
 * @returns Object keyed by feature area with `actualScore`, the rounded
 * per-dimension averages, `testCount` and `totalCost`
 */
export function scoreAgenticResults(resultsPath, weights) {
    const taskWeight = weights["task-completion"] ?? 0.5;
    const codeWeight = weights["code-correctness"] ?? 0.25;
    const docWeight = weights["doc-coverage"] ?? 0.25;
    // Bucket results by feature area.
    const grouped = {};
    for (const entry of readAndNormalizeResults(resultsPath)) {
        const area = detectFeatureArea(entry.description);
        if (!grouped[area]) {
            grouped[area] = [];
        }
        grouped[area].push(entry);
    }
    const scoreFeature = (runs) => {
        const totals = { codeCorrectness: 0, docCoverage: 0, taskCompletion: 0 };
        let totalCost = 0;
        for (const run of runs) {
            totalCost += run.cost;
            for (const component of run.gradingResult.componentResults) {
                if (component.assertion?.type !== "llm-rubric") {
                    continue;
                }
                const points = parseRubricScore(component);
                const dimension = classifyRubric(component);
                if (dimension !== null) {
                    totals[dimension] += points;
                }
            }
        }
        const divisor = runs.length || 1;
        const taskAvg = totals.taskCompletion / divisor;
        const codeAvg = totals.codeCorrectness / divisor;
        const docAvg = totals.docCoverage / divisor;
        return {
            actualScore: Math.round(taskAvg * taskWeight + codeAvg * codeWeight + docAvg * docWeight),
            codeCorrectness: Math.round(codeAvg),
            docCoverage: Math.round(docAvg),
            taskCompletion: Math.round(taskAvg),
            testCount: runs.length,
            totalCost,
        };
    };
    const entries = {};
    for (const [feature, runs] of Object.entries(grouped)) {
        entries[feature] = scoreFeature(runs);
    }
    return entries;
}
726
+ // ---------------------------------------------------------------------------
727
+ // Score merging — combine baseline floor/ceiling with agentic actual
728
+ // ---------------------------------------------------------------------------
729
/**
 * Combine baseline FeatureScore[] with agentic actual scores into the full
 * three-layer decomposition, one entry per feature area.
 *
 * Per feature area:
 *   - Baseline only: the baseline score is copied unchanged.
 *   - Agentic only: floor, ceiling and docLift are 0, docQualityGap is 100,
 *     and the total score is the agentic actual score.
 *   - Both: the baseline fields are kept, `actualScore`, `retrievalGap`
 *     (ceiling − actual) and `infrastructureEfficiency` (actual / ceiling)
 *     are added, and the costs are summed.
 *
 * @param baselineScores Floor/ceiling scores from baseline evaluation (may be empty)
 * @param agenticScores Actual scores from agentic evaluation, keyed by feature (may be empty)
 * @returns Merged scores sorted by feature name
 */
export function mergeScores(baselineScores, agenticScores) {
    const baselineByFeature = new Map(baselineScores.map((score) => [score.feature, score]));
    // Baseline features first, then any feature only the agentic run knows.
    const featureNames = new Set([
        ...baselineByFeature.keys(),
        ...Object.keys(agenticScores),
    ]);
    const mergeFeature = (feature) => {
        const baseline = baselineByFeature.get(feature);
        const agentic = agenticScores[feature];
        if (!baseline && !agentic) {
            return [];
        }
        if (!agentic) {
            // Baseline only — no agentic data (partial summary)
            return [{ ...baseline }];
        }
        if (!baseline) {
            // Agentic only — no baseline data (partial summary)
            return [{
                    actualScore: agentic.actualScore,
                    ceilingScore: 0,
                    codeCorrectness: agentic.codeCorrectness,
                    docCoverage: agentic.docCoverage,
                    docLift: 0,
                    docQualityGap: 100,
                    feature,
                    floorScore: 0,
                    negativeDocLift: false,
                    taskCompletion: agentic.taskCompletion,
                    testCount: agentic.testCount,
                    totalCost: agentic.totalCost,
                    totalScore: agentic.actualScore,
                }];
        }
        // Both data sources — full decomposition
        const docsHurt = baseline.docLift < 0;
        const retrievalGap = baseline.ceilingScore - agentic.actualScore;
        // Efficiency is undefined (null) when the ceiling is ≤ 0 or when docs
        // hurt performance — the ratio is meaningless in those cases.
        const efficiencyIsMeaningful = !docsHurt && baseline.ceilingScore > 0;
        // Inverted gap: docs hurt AND the agent beat the injected-docs ceiling,
        // i.e. retrieval failure is masking a doc quality problem.
        const inverted = docsHurt && retrievalGap < 0;
        return [{
                ...baseline,
                actualScore: agentic.actualScore,
                infrastructureEfficiency: efficiencyIsMeaningful
                    ? agentic.actualScore / baseline.ceilingScore
                    : null,
                invertedRetrievalGap: inverted || undefined,
                retrievalGap,
                totalCost: baseline.totalCost + agentic.totalCost,
            }];
    };
    return [...featureNames]
        .flatMap(mergeFeature)
        .sort((left, right) => left.feature.localeCompare(right.feature));
}
805
+ const CRITICAL_THRESHOLD = 40;
806
+ function main() {
807
+ const ROOT = join(dirname(new URL(import.meta.url).pathname), "..", "..");
808
+ const args = process.argv.slice(2);
809
+ // Parse --source <name> argument
810
+ const sourceIdx = args.indexOf("--source");
811
+ const sourceName = sourceIdx !== -1 ? args[sourceIdx + 1] : undefined;
812
+ // Always load source config so environment info is included in score summary.
813
+ // When no --source is specified, defaults to production.
814
+ let source;
815
+ try {
816
+ source = loadSource(sourceName);
817
+ }
818
+ catch {
819
+ console.warn(` [warn] Could not load source "${sourceName}", proceeding without source metadata`);
820
+ }
821
+ // Determine mode — controls which result files are read
822
+ // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means unset
823
+ const mode = process.env.EVAL_MODE || "baseline";
824
+ // First positional arg (not a flag) is the results path (baseline results)
825
+ const baselineResultsPath = args.find((a) => !a.startsWith("--") && args[args.indexOf(a) - 1] !== "--source") ?? join(ROOT, "results", "latest", "eval-results.json");
826
+ // Agentic results path (only used in full mode)
827
+ const agenticResultsPath = join(ROOT, "results", "latest", "eval-results-agentic.json");
828
+ // Validate baseline results file
829
+ const resultsIssues = checkResultsExist(ROOT, baselineResultsPath);
830
+ const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
831
+ if (resultsErrors.length > 0) {
832
+ console.error("❌ Results validation failed:");
833
+ for (const e of resultsErrors) {
834
+ console.error(` ERROR: ${e.message}`);
835
+ if (e.path) {
836
+ console.error(` at ${e.path}`);
837
+ }
838
+ }
839
+ console.error("\nRun 'pnpm eval' first to generate results, then 'pnpm calculate-scores'.");
840
+ process.exit(1);
841
+ }
842
+ console.log(`Reading results from: ${baselineResultsPath}`);
843
+ if (source) {
844
+ console.log(`Source: ${sourceName} (${source.baseUrl})`);
845
+ }
846
+ // Load dimension weights from rubrics.yaml
847
+ const rubricConfig = loadRubricTemplates(ROOT);
848
+ const baselineScores = calculateScores(baselineResultsPath, rubricConfig.weights);
849
+ const perModel = calculateScoresPerModel(baselineResultsPath, rubricConfig.weights);
850
+ const urlRefs = aggregateUrlReferences(baselineResultsPath);
851
+ const sourceVerification = buildSourceVerification(ROOT, source);
852
+ const graderCost = extractGraderCost(baselineResultsPath);
853
+ // Full mode: merge baseline floor/ceiling with agentic actual scores
854
+ let scores;
855
+ let agentBehavior = null;
856
+ let sourceIsolation = null;
857
+ let evaluationMode;
858
+ if (mode === "full" && existsSync(agenticResultsPath)) {
859
+ console.log(`\nReading agentic results from: ${agenticResultsPath}`);
860
+ const agenticScores = scoreAgenticResults(agenticResultsPath, rubricConfig.weights);
861
+ scores = mergeScores(baselineScores, agenticScores);
862
+ evaluationMode = "full";
863
+ // Aggregate agent behavior and source isolation from agentic results
864
+ agentBehavior = aggregateAgentBehavior(agenticResultsPath);
865
+ sourceIsolation = aggregateSourceIsolation(agenticResultsPath);
866
+ // Merge grader costs from both files
867
+ const agenticGraderCost = extractGraderCost(agenticResultsPath);
868
+ if (graderCost && agenticGraderCost) {
869
+ graderCost.cost += agenticGraderCost.cost;
870
+ graderCost.totalTokens += agenticGraderCost.totalTokens;
871
+ graderCost.promptTokens += agenticGraderCost.promptTokens;
872
+ graderCost.completionTokens += agenticGraderCost.completionTokens;
873
+ }
874
+ }
875
+ else if (mode === "agentic") {
876
+ scores = baselineScores;
877
+ agentBehavior = aggregateAgentBehavior(baselineResultsPath);
878
+ sourceIsolation = aggregateSourceIsolation(baselineResultsPath);
879
+ evaluationMode = "agentic";
880
+ }
881
+ else {
882
+ scores = baselineScores;
883
+ agentBehavior = aggregateAgentBehavior(baselineResultsPath);
884
+ sourceIsolation = aggregateSourceIsolation(baselineResultsPath);
885
+ evaluationMode = mode === "observed" ? "observed" : "baseline";
886
+ }
887
+ const summary = printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode);
888
+ // Persist
889
+ const outDir = join(ROOT, "results", "latest");
890
+ mkdirSync(outDir, { recursive: true });
891
+ writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
892
+ console.log("Score summary written to results/latest/score-summary.json");
893
+ // Extract and persist grader judgments (Phase 3a: failure mode extraction)
894
+ const judgments = extractGraderJudgments(baselineResultsPath);
895
+ // In full mode, also extract judgments from agentic results
896
+ if (mode === "full" && existsSync(agenticResultsPath)) {
897
+ const agenticJudgments = extractGraderJudgments(agenticResultsPath);
898
+ judgments.push(...agenticJudgments);
899
+ }
900
+ if (judgments.length > 0) {
901
+ writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
902
+ console.log(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
903
+ }
904
+ // Exit with non-zero if any area below critical threshold
905
+ if (summary.belowCritical.length > 0) {
906
+ process.exit(1);
907
+ }
908
+ }
909
/**
 * Prints the per-model section of the report to stdout: a summary table with
 * one row per model (best average score first), a per-feature-area table for
 * each model, and a cost-per-quality-point list for models with a non-zero cost.
 *
 * @param {Array<object>} perModel - Per-model results. Each entry is read for
 *   `modelId`, optional `label`, `overall` ({ avgScore, avgDocLift, testCount,
 *   cost? }) and `scores` (rows with feature, taskCompletion, codeCorrectness,
 *   docCoverage, totalScore, docLift). The input array is not mutated.
 * @returns {void}
 */
function printPerModelReport(perModel) {
    const rule = "-".repeat(80);
    // Empty-string labels are treated as "no label", hence `||` rather than `??`.
    // Shared by every section so a model is always shown under the same name.
    const nameOf = (entry) => entry.label || entry.modelId;
    const signed = (value, text) => (value >= 0 ? `+${text}` : text);

    console.log(rule);
    console.log("PER-MODEL BREAKDOWN");
    console.log(rule);
    console.log();

    // Model summary table
    const h = "| Model | Avg Score | Avg Lift | Tests | Cost |";
    const sep = "|--------------------------------|-----------|----------|-------|----------|";
    console.log(h);
    console.log(sep);
    // Sort a copy: the caller's array also ends up in the persisted summary.
    const sorted = [...perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
    for (const entry of sorted) {
        const { overall } = entry;
        // A missing or zero cost is rendered as a dash.
        const costStr = overall.cost ? `$${overall.cost.toFixed(4)}` : "—";
        const liftStr = signed(overall.avgDocLift, overall.avgDocLift.toFixed(1));
        console.log(`| ${nameOf(entry).padEnd(30)} | ` +
            `${overall.avgScore.toFixed(1).padStart(9)} | ` +
            `${liftStr.padStart(8)} | ` +
            `${overall.testCount.toString().padStart(5)} | ` +
            `${costStr.padStart(8)} |`);
    }
    console.log();

    // Per-model × per-area breakdown
    for (const entry of sorted) {
        console.log(` ${nameOf(entry)} (${entry.modelId}):`);
        const areaH = " | Feature Area | Task | Code | Docs | Total | Lift |";
        const areaSep = " |---------------------|------|------|------|-------|------|";
        console.log(areaH);
        console.log(areaSep);
        for (const s of entry.scores) {
            const lift = signed(s.docLift, `${s.docLift}`);
            console.log(` | ${s.feature.padEnd(19)} | ` +
                `${s.taskCompletion.toString().padStart(4)} | ` +
                `${s.codeCorrectness.toString().padStart(4)} | ` +
                `${s.docCoverage.toString().padStart(4)} | ` +
                `${s.totalScore.toString().padStart(5)} | ` +
                `${lift.padStart(4)} |`);
        }
        console.log();
    }

    // Cost-per-quality-point
    const modelsWithCost = sorted.filter((e) => e.overall.cost && e.overall.cost > 0);
    if (modelsWithCost.length > 0) {
        console.log(" Cost per quality point:");
        for (const entry of modelsWithCost) {
            const { avgScore, cost } = entry.overall;
            // Guard against division by zero for models that scored nothing.
            const costPerPoint = avgScore > 0 ? cost / avgScore : 0;
            console.log(` ${nameOf(entry)}: $${costPerPoint.toFixed(6)}/point (score: ${avgScore.toFixed(1)}, cost: $${cost.toFixed(4)})`);
        }
        console.log();
    }
}
968
+ // ---------------------------------------------------------------------------
969
+ // Main
970
+ // ---------------------------------------------------------------------------
971
/**
 * Prints the full score report to stdout and returns the summary object that
 * the caller persists as JSON.
 *
 * Sections printed, in order: per-area score table, OKR status, ceiling
 * decomposition, three-layer decomposition (only if any score has
 * `actualScore`), cost summary (only if any cost is non-zero), per-model
 * breakdown, URL references, agent behavior, and source verification.
 *
 * @param {Array<object>} scores - Per-feature-area scores; must be non-empty.
 *   Fields read here: feature, taskCompletion, codeCorrectness, docCoverage,
 *   totalScore, floorScore, ceilingScore, docLift, docQualityGap,
 *   negativeDocLift, totalCost, testCount, and the optional actualScore,
 *   retrievalGap, infrastructureEfficiency, invertedRetrievalGap.
 * @param {Array<object>} urlRefs - Passed to `printUrlReport` and copied into the summary.
 * @param {object} [source] - Source config; baseUrl, dataset, name, perspective
 *   and projectId are copied into the summary when present.
 * @param {Array<object>|null} agentBehavior - Per-area agent behavior aggregates, or null.
 * @param {object|null} graderCost - Grader cost info ({ cost, model, totalTokens }), or null.
 * @param {Array<object>|null} perModel - Per-model results for `printPerModelReport`, or null.
 * @param {object|null} sourceIsolation - Isolation stats (isolationScore, onOrigin,
 *   offOrigin, total, offOriginUrls, originBreakdown), or null.
 * @param {object|null} sourceVerification - Verification info (source, mode,
 *   allowedOrigins?, searchMode?, urlFetch?), or null.
 * @param {string} [evaluationMode] - Included in the summary when truthy.
 * @returns {object} The score summary (averages, lowest area, areas below the
 *   critical threshold, cost, source metadata, timestamp, ...).
 * @throws {Error} If `scores` is empty, since averages and the lowest area
 *   would be undefined.
 */
function printReport(scores, urlRefs, source, agentBehavior, graderCost, perModel, sourceIsolation, sourceVerification, evaluationMode) {
    // Fail up front with a clear message instead of a bare "reduce of empty
    // array" TypeError after half of the report has already been printed.
    if (scores.length === 0) {
        throw new Error("printReport: no feature-area scores to report");
    }
    const rule = "-".repeat(80);
    // One sign convention for every table: zero and positive values get "+".
    const signed = (value) => (value >= 0 ? `+${value}` : `${value}`);
    const averageOf = (pick) => scores.reduce((sum, s) => sum + pick(s), 0) / scores.length;

    console.log("\n" + "=".repeat(80));
    console.log(" SANITY AI LITERACY SCORE REPORT");
    console.log("=".repeat(80));
    console.log();

    // Table header
    const h = "| Feature Area | Task | Code | Docs | Total | w/o Docs | Doc Lift |";
    const sep = "|---------------------|------|------|------|-------|----------|----------|";
    console.log(h);
    console.log(sep);
    for (const s of scores) {
        const status = s.totalScore < CRITICAL_THRESHOLD ? "!!" : "ok";
        console.log(`| ${status} ${s.feature.padEnd(17)} | ` +
            `${s.taskCompletion.toString().padStart(4)} | ` +
            `${s.codeCorrectness.toString().padStart(4)} | ` +
            `${s.docCoverage.toString().padStart(4)} | ` +
            `${s.totalScore.toString().padStart(5)} | ` +
            `${s.floorScore.toString().padStart(8)} | ` +
            `${signed(s.docLift).padStart(8)} |`);
    }
    console.log();

    // OKR status
    const belowCritical = scores.filter((s) => s.totalScore < CRITICAL_THRESHOLD);
    const lowestScore = scores.reduce((min, s) => (s.totalScore < min.totalScore ? s : min), scores[0]);
    const avgScore = averageOf((s) => s.totalScore);
    const avgLift = averageOf((s) => s.docLift);
    const avgCeilingScore = averageOf((s) => s.ceilingScore);
    const avgFloorScore = averageOf((s) => s.floorScore);
    const avgDocQualityGap = averageOf((s) => s.docQualityGap);
    const negativeDocLiftScores = scores.filter((s) => s.negativeDocLift);
    const negativeDocLiftAreas = negativeDocLiftScores.map((s) => ({
        area: s.feature,
        docLift: s.docLift,
    }));
    console.log(rule);
    console.log("OKR STATUS");
    console.log(rule);
    console.log();
    if (belowCritical.length === 0) {
        // Print the configured threshold so the message cannot drift from the check.
        console.log(` KR1: PASS -- All areas above critical threshold (>=${CRITICAL_THRESHOLD})`);
    }
    else {
        console.log(" KR1: FAIL -- Areas below critical threshold:");
        belowCritical.forEach((s) => console.log(` - ${s.feature}: ${s.totalScore}`));
    }
    console.log();
    console.log(` Lowest area: ${lowestScore.feature} (${lowestScore.totalScore})`);
    console.log(` Target: +15 points improvement`);
    console.log();
    console.log(` Avg score: ${avgScore.toFixed(1)}`);
    // Only prefix "+" for non-negative averages; a negative one carries its own sign.
    const avgLiftStr = avgLift >= 0 ? `+${avgLift.toFixed(1)}` : avgLift.toFixed(1);
    console.log(` Avg doc lift: ${avgLiftStr} points`);
    console.log(` (Doc lift = how much docs help vs parametric knowledge alone)`);
    console.log();

    // Ceiling decomposition
    console.log(rule);
    console.log("CEILING DECOMPOSITION");
    console.log(rule);
    console.log();
    const ceilH = "| Feature Area | Floor | Ceiling | Doc Lift | Quality Gap |";
    const ceilSep = "|---------------------|-------|---------|----------|-------------|";
    console.log(ceilH);
    console.log(ceilSep);
    for (const s of scores) {
        const liftFlag = s.negativeDocLift ? " 🚨" : "";
        console.log(`| ${s.feature.padEnd(19)} | ` +
            `${s.floorScore.toString().padStart(5)} | ` +
            `${s.ceilingScore.toString().padStart(7)} | ` +
            `${signed(s.docLift).padStart(8)}${liftFlag} | ` +
            `${s.docQualityGap.toString().padStart(11)} |`);
    }
    console.log();
    if (negativeDocLiftScores.length > 0) {
        console.log(" 🚨 NEGATIVE DOC LIFT DETECTED:");
        // Iterate the already-filtered score objects; no need to look them up again.
        for (const s of negativeDocLiftScores) {
            console.log(` ${s.feature}: Doc Lift = ${s.docLift} (floor: ${s.floorScore}, ceiling: ${s.ceilingScore})`);
        }
        console.log(" Documentation is HURTING model performance for these areas.");
        console.log(" See docs/design-docs/scenario-matrix/evaluation-ceiling.md");
        console.log();
    }
    else {
        console.log(" ✅ No areas with negative Doc Lift detected.");
        console.log();
    }

    // Three-layer decomposition (only when actual scores are present)
    const hasActualScores = scores.some((s) => s.actualScore !== undefined);
    if (hasActualScores) {
        console.log(rule);
        console.log("THREE-LAYER DECOMPOSITION (floor → ceiling → actual)");
        console.log(rule);
        console.log();
        const decompH = "| Feature Area | Floor | Ceiling | Actual | Doc Lift | Ret. Gap | Infra % |";
        const decompSep = "|---------------------|-------|---------|--------|----------|----------|---------|";
        console.log(decompH);
        console.log(decompSep);
        for (const s of scores) {
            // Areas without agentic data show a dash in the actual/gap/infra columns.
            const actualStr = s.actualScore !== undefined ? s.actualScore.toString() : "—";
            const gapStr = s.retrievalGap !== undefined ? signed(s.retrievalGap) : "—";
            const infraStr = s.infrastructureEfficiency != null
                ? `${Math.round(s.infrastructureEfficiency * 100)}%`
                : "—";
            const flag = s.invertedRetrievalGap ? " 🔄" : "";
            console.log(`| ${s.feature.padEnd(19)} | ` +
                `${s.floorScore.toString().padStart(5)} | ` +
                `${s.ceilingScore.toString().padStart(7)} | ` +
                `${actualStr.padStart(6)} | ` +
                `${signed(s.docLift).padStart(8)} | ` +
                `${(gapStr + flag).padStart(8)} | ` +
                `${infraStr.padStart(7)} |`);
        }
        console.log();
        console.log(" Doc Lift = ceiling − floor | Ret. Gap = ceiling − actual | Infra = actual / ceiling");
        console.log(" 🔄 = inverted retrieval gap (agents avoid bad docs → higher actual than ceiling)");
        console.log();
    }

    // Cost summary
    const totalCost = scores.reduce((sum, s) => sum + s.totalCost, 0);
    const totalTests = scores.reduce((sum, s) => sum + s.testCount, 0);
    const graderCostTotal = graderCost?.cost ?? 0;
    const combinedCost = totalCost + graderCostTotal;
    const hasCost = totalCost > 0 || graderCostTotal > 0;
    // `|| 1` avoids dividing by zero when no tests were counted.
    const costPerTest = combinedCost / (totalTests || 1);
    if (hasCost) {
        console.log(rule);
        console.log("COST SUMMARY");
        console.log(rule);
        console.log();
        console.log(` Provider cost: $${totalCost.toFixed(4)}`);
        if (graderCostTotal > 0) {
            const graderLabel = graderCost?.model ?? "unknown";
            console.log(` Grader cost: $${graderCostTotal.toFixed(4)} (${graderLabel}, ${(graderCost?.totalTokens ?? 0).toLocaleString()} tokens)`);
        }
        console.log(` Total cost: $${combinedCost.toFixed(4)}`);
        console.log(` Avg cost per test: $${costPerTest.toFixed(4)}`);
        console.log();
        const costHeader = "| Feature Area | Tests | Cost | Avg/Test |";
        const costSep = "|---------------------|-------|----------|----------|";
        console.log(costHeader);
        console.log(costSep);
        for (const s of scores) {
            const avgCost = s.testCount > 0 ? s.totalCost / s.testCount : 0;
            console.log(`| ${s.feature.padEnd(19)} | ` +
                `${s.testCount.toString().padStart(5)} | ` +
                `$${s.totalCost.toFixed(4).padStart(7)} | ` +
                `$${avgCost.toFixed(4).padStart(7)} |`);
        }
        console.log();
    }

    // Per-model breakdown
    if (perModel) {
        printPerModelReport(perModel);
    }
    // URL References
    printUrlReport(urlRefs);
    // Agent Behavior (only present when run with instrumented provider)
    const hasAgentBehavior = Boolean(agentBehavior && agentBehavior.length > 0);
    if (hasAgentBehavior) {
        printAgentBehaviorReport(agentBehavior);
    }

    // Source verification (unified report for all modes)
    if (sourceVerification || sourceIsolation) {
        console.log(rule);
        console.log("📋 SOURCE VERIFICATION");
        console.log(rule);
        if (sourceVerification) {
            console.log(` Source: ${sourceVerification.source}`);
            console.log(` Mode: ${sourceVerification.mode}`);
            if (sourceVerification.allowedOrigins) {
                console.log(` Sandbox: ${sourceVerification.allowedOrigins.join(", ")}`);
            }
            if (sourceVerification.searchMode) {
                console.log(` Search: ${sourceVerification.searchMode}`);
            }
            // URL fetch results (baseline mode with direct URLs)
            if (sourceVerification.urlFetch) {
                const uf = sourceVerification.urlFetch;
                console.log();
                console.log(` URL fetch: ${uf.totalFetched} fetched, ${uf.totalFailed} failed`);
                for (const f of uf.fetchedUrls) {
                    console.log(` ✅ ${f.url} (via ${f.method})`);
                }
                for (const f of uf.failures) {
                    // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty string means no error info
                    console.log(` ⚠️ ${f.url}: ${f.error || "unknown error"}`);
                }
            }
        }
        // Agentic isolation score
        if (sourceIsolation) {
            const pct = Math.round(sourceIsolation.isolationScore * 100);
            const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
            console.log();
            console.log(` Agent isolation: ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin)`);
            if (sourceIsolation.offOrigin > 0) {
                console.log(` Off-origin fetches: ${sourceIsolation.offOrigin}`);
                // Cap the listing at ten URLs to keep the report readable.
                for (const url of sourceIsolation.offOriginUrls.slice(0, 10)) {
                    console.log(` • ${url}`);
                }
            }
            const originCounts = Object.entries(sourceIsolation.originBreakdown);
            if (originCounts.length > 0) {
                console.log(" Origin breakdown:");
                for (const [origin, count] of originCounts.sort((a, b) => b[1] - a[1])) {
                    console.log(` ${origin}: ${count}`);
                }
            }
        }
        console.log();
    }

    // Build overall agent behavior stats for summary
    const meanBehavior = (pick) => agentBehavior.reduce((sum, ab) => sum + pick(ab), 0) / agentBehavior.length;
    const overallAgentBehavior = hasAgentBehavior
        ? {
            avgDocPagesVisited: meanBehavior((ab) => ab.avgDocPagesVisited),
            avgNetworkTimeMs: meanBehavior((ab) => ab.avgNetworkTimeMs),
            avgSearchesPerformed: meanBehavior((ab) => ab.avgSearchesPerformed),
            testsWithBehaviorData: agentBehavior.reduce((sum, ab) => sum + ab.tasksWithBehaviorData, 0),
            totalUniqueDocSlugs: new Set(agentBehavior.flatMap((ab) => ab.docSlugsVisited)).size,
            totalUniqueSearchQueries: new Set(agentBehavior.flatMap((ab) => ab.searchQueries)).size,
        }
        : undefined;

    // Compute aggregate metrics from actual scores (when agentic data present).
    // Each average only covers the areas that actually carry the metric.
    const meanWhere = (has, pick) => {
        const subset = scores.filter(has);
        return subset.length > 0
            ? subset.reduce((sum, s) => sum + (pick(s) ?? 0), 0) / subset.length
            : undefined;
    };
    const avgActualScore = meanWhere((s) => s.actualScore !== undefined, (s) => s.actualScore);
    const avgRetrievalGap = meanWhere((s) => s.retrievalGap !== undefined, (s) => s.retrievalGap);
    const avgInfrastructureEfficiency = meanWhere((s) => s.infrastructureEfficiency != null, (s) => s.infrastructureEfficiency);

    // Key order is kept stable because this object is serialized to JSON.
    return {
        agentBehavior: agentBehavior ?? undefined,
        belowCritical: belowCritical.map((s) => s.feature),
        ...(evaluationMode && { evaluationMode }),
        lowestArea: lowestScore.feature,
        lowestScore: lowestScore.totalScore,
        ...(negativeDocLiftAreas.length > 0 && { negativeDocLiftAreas }),
        overall: {
            agentBehavior: overallAgentBehavior,
            ...(avgActualScore !== undefined && { avgActualScore }),
            avgCeilingScore: avgCeilingScore,
            avgDocLift: avgLift,
            avgDocQualityGap: avgDocQualityGap,
            avgFloorScore: avgFloorScore,
            ...(avgInfrastructureEfficiency !== undefined && {
                avgInfrastructureEfficiency,
            }),
            ...(avgRetrievalGap !== undefined && { avgRetrievalGap }),
            avgScore,
            cost: hasCost
                ? {
                    graderModel: graderCost?.model,
                    graderTotal: graderCostTotal,
                    perTest: costPerTest,
                    total: combinedCost,
                    totalTokens: graderCost?.totalTokens ?? 0,
                }
                : undefined,
            negativeDocLiftCount: negativeDocLiftAreas.length,
        },
        scores,
        source: source
            ? {
                baseUrl: source.baseUrl,
                dataset: source.dataset,
                name: source.name ?? "default",
                perspective: source.perspective,
                projectId: source.projectId,
            }
            : undefined,
        ...(perModel && { perModel }),
        ...(sourceIsolation && { sourceIsolation }),
        ...(sourceVerification && { sourceVerification }),
        timestamp: new Date().toISOString(),
        urlReferences: urlRefs,
    };
}
1264
/**
 * Prints the "URL REFERENCES" section: for every feature area, the URLs seen
 * in the gold and baseline results, most frequently referenced first.
 *
 * @param {Array<object>} urlRefs - One entry per feature area, each with
 *   `feature`, `gold.urls` and `baseline.urls` (objects mapping URL -> count).
 * @returns {void}
 */
function printUrlReport(urlRefs) {
    const divider = "-".repeat(80);
    const byCountDesc = (left, right) => right[1] - left[1];
    // Counts above one are annotated with the number of tests; single hits are not.
    const describe = (url, count) => `${url}${count > 1 ? ` (${count} tests)` : ""}`;

    console.log(divider);
    console.log("URL REFERENCES");
    console.log(divider);
    console.log();

    urlRefs.forEach((ref) => {
        const gold = Object.entries(ref.gold.urls).sort(byCountDesc);
        const baseline = Object.entries(ref.baseline.urls).sort(byCountDesc);

        if (gold.length === 0 && baseline.length === 0) {
            console.log(` ${ref.feature}: no URLs referenced`);
            console.log();
            return;
        }
        if (gold.length > 0) {
            console.log(` ${ref.feature} (gold):`);
            gold.forEach(([url, count]) => console.log(` ${describe(url, count)}`));
        }
        if (baseline.length > 0) {
            console.log(` ${ref.feature} (baseline):`);
            // Baseline URLs are tagged "[parametric]" to set them apart from gold ones.
            baseline.forEach(([url, count]) => console.log(` ${describe(url, count)} [parametric]`));
        }
        console.log();
    });
}
1292
// Entry-point guard: call main() only when the script path Node was started
// with (process.argv[1]) is this file, in either its .ts or .js form, so that
// importing the module (e.g. from tests) does not trigger a run.
if (["calculate-scores.ts", "calculate-scores.js"].some((suffix) => process.argv[1]?.endsWith(suffix))) {
    main();
}