@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499) hide show
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Generates a markdown PR comment from eval score-summary.json.
3
+ *
4
+ * Usage:
5
+ * tsx src/scripts/pr-comment.ts [--output <path>] [--promptfoo-url <url>]
6
+ *
7
+ * Reads: results/latest/score-summary.json
8
+ * Writes: markdown to stdout or --output file
9
+ */
10
+ export {};
@@ -0,0 +1,510 @@
1
+ /**
2
+ * Generates a markdown PR comment from eval score-summary.json.
3
+ *
4
+ * Usage:
5
+ * tsx src/scripts/pr-comment.ts [--output <path>] [--promptfoo-url <url>]
6
+ *
7
+ * Reads: results/latest/score-summary.json
8
+ * Writes: markdown to stdout or --output file
9
+ */
10
+ import { existsSync, readFileSync, writeFileSync } from "node:fs";
11
+ import { resolve } from "node:path";
12
/**
 * Format a dollar amount for display in the PR comment.
 *
 * - Zero renders as "$0.00".
 * - Amounts below one cent render with four decimals so that small
 *   per-test costs do not collapse to "$0.00".
 * - Everything else renders with two decimals.
 *
 * A missing or NaN cost is treated as zero instead of throwing on
 * `.toFixed` or printing "$NaN"; callers pass optional cost fields.
 *
 * @param cost - Cost in dollars; may be undefined/null when no cost data exists.
 * @returns The formatted string, always prefixed with "$".
 */
function formatCost(cost) {
    if (cost == null || Number.isNaN(cost) || cost === 0) {
        return "$0.00";
    }
    if (cost < 0.01) {
        return `$${cost.toFixed(4)}`;
    }
    return `$${cost.toFixed(2)}`;
}
21
/**
 * Render the eval score summary as a markdown PR comment.
 *
 * Sections are emitted in this order: header with overall numbers,
 * critical-threshold warning, environment and source-verification details,
 * per-feature score table with legend, negative doc-lift warnings,
 * floor/ceiling(/actual) decomposition, cost breakdown, per-model breakdown,
 * baseline comparison, recommendations, footer. Optional sections are
 * skipped when their data is absent.
 *
 * Also reads `process.env.GITHUB_RUN_URL` to add a "view run" footer link.
 *
 * @param summary - Parsed score summary. Each entry of `scores` is read for
 *   `ceilingScore`, `floorScore`, `docLift` and `docQualityGap`.
 * @param options - Optional extras: `comparisonReport` adds the comparison
 *   section, `promptfooUrl` adds a "view detailed results" footer link.
 * @returns The markdown document, lines joined with "\n" (no trailing newline).
 */
function generateComment(summary, options = {}) {
    const { belowCritical, overall, scores, source, timestamp } = summary;
    const sorted = [...scores].sort((a, b) => b.totalScore - a.totalScore);
    const lines = [];
    // Rounded delta with an explicit "+" for positive values (comparison tables).
    const signed = (n) => (n > 0 ? `+${Math.round(n)}` : String(Math.round(n)));
    // Trend icon for a per-area comparison verdict.
    const changeIcon = (change) => change === "improved" ? "📈" : change === "regressed" ? "📉" : "➡️";
    // Header
    lines.push(`## ${overallEmoji(overall.avgScore)} AI Literacy Score Report`);
    lines.push("");
    const totalCost = scores.reduce((sum, s) => sum + (s.totalCost ?? 0), 0);
    const graderCost = overall.cost?.graderTotal ?? 0;
    const combinedCost = totalCost + graderCost;
    const costStr = combinedCost > 0 ? ` · Cost: ${formatCost(combinedCost)}` : "";
    const actualStr = overall.avgActualScore !== undefined
        ? ` · Actual: ${Math.round(overall.avgActualScore)}/100`
        : "";
    const retGapStr = overall.avgRetrievalGap !== undefined
        ? ` · Ret. Gap: ${Math.round(overall.avgRetrievalGap)}`
        : "";
    // Only prefix "+" when the rounded lift is not negative; a hard-coded
    // "+" would render a negative average as "+-3".
    const roundedLift = Math.round(overall.avgDocLift);
    const overallLiftStr = roundedLift < 0 ? String(roundedLift) : `+${roundedLift}`;
    const totalTests = scores.reduce((sum, s) => sum + s.testCount, 0);
    lines.push(`**Overall: ${Math.round(overall.avgScore)}/100** · Doc Lift: ${overallLiftStr}${actualStr}${retGapStr} · ${totalTests} tests across ${scores.length} areas${costStr}`);
    lines.push("");
    // Critical warnings
    if (belowCritical.length > 0) {
        lines.push(`> ⚠️ **Below critical threshold:** ${belowCritical.map((a) => `\`${a}\``).join(", ")}`);
        lines.push("");
    }
    // Environment info
    if (source) {
        lines.push("<details>");
        lines.push("<summary>🔧 Environment</summary>");
        lines.push("");
        lines.push("| Setting | Value |");
        lines.push("|---------|-------|");
        lines.push(`| **Source** | ${source.name} |`);
        lines.push(`| **Docs URL** | ${source.baseUrl} |`);
        if (source.dataset) {
            lines.push(`| **Dataset** | ${source.dataset} |`);
        }
        lines.push(`| **Project** | ${source.projectId} |`);
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    // Source verification
    const { sourceIsolation, sourceVerification } = summary;
    if (sourceVerification || sourceIsolation) {
        lines.push("<details>");
        lines.push("<summary>🔍 Source verification</summary>");
        lines.push("");
        lines.push("| Setting | Value |");
        lines.push("|---------|-------|");
        if (sourceVerification) {
            lines.push(`| **Source** | ${sourceVerification.source} |`);
            lines.push(`| **Mode** | ${sourceVerification.mode} |`);
            if (sourceVerification.allowedOrigins) {
                lines.push(`| **Sandbox** | ${sourceVerification.allowedOrigins.join(", ")} |`);
            }
            if (sourceVerification.searchMode) {
                lines.push(`| **Search** | ${sourceVerification.searchMode} |`);
            }
        }
        if (sourceIsolation) {
            const pct = Math.round(sourceIsolation.isolationScore * 100);
            const icon = sourceIsolation.offOrigin === 0 ? "✅" : "⚠️";
            lines.push(`| **Agent isolation** | ${icon} ${pct}% (${sourceIsolation.onOrigin}/${sourceIsolation.total} on-origin) |`);
            if (sourceIsolation.offOrigin > 0) {
                // Cap the list at five URLs to keep the table cell readable.
                lines.push(`| **Off-origin fetches** | ${sourceIsolation.offOriginUrls.slice(0, 5).join(", ")} |`);
            }
        }
        if (sourceVerification?.urlFetch) {
            const uf = sourceVerification.urlFetch;
            lines.push(`| **URL fetch** | ${uf.totalFetched} fetched, ${uf.totalFailed} failed |`);
            for (const f of uf.fetchedUrls) {
                lines.push(`| | ✅ ${f.url} (via ${f.method}) |`);
            }
            for (const f of uf.failures) {
                // oxlint-disable-next-line @typescript-eslint/prefer-nullish-coalescing -- empty error string should show "unknown"
                lines.push(`| | ⚠️ ${f.url}: ${f.error || "unknown"} |`);
            }
        }
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    // Score table
    lines.push("### Scores by Feature Area");
    lines.push("");
    lines.push("| Feature | Score | Grade | Task | Code | Docs | Doc Lift | Tests |");
    lines.push("|---------|-------|-------|------|------|------|----------|-------|");
    for (const s of sorted) {
        lines.push(`| ${s.feature} | **${Math.round(s.totalScore)}** | ${gradeEmoji(s.totalScore)} ${gradeLetter(s.totalScore)} | ${Math.round(s.taskCompletion)} | ${Math.round(s.codeCorrectness)} | ${Math.round(s.docCoverage)} | ${liftArrow(s.docLift)} | ${s.testCount} |`);
    }
    lines.push("");
    // Breakdown legend
    lines.push("<details>");
    lines.push("<summary>📊 Score breakdown legend</summary>");
    lines.push("");
    lines.push("| Column | Description |");
    lines.push("|--------|-------------|");
    lines.push("| **Score** | Overall AI literacy score (0–100) — weighted sum of Task + Code + Docs |");
    lines.push("| **Task** | Task completion — does the LLM understand what to build? |");
    lines.push("| **Code** | Code correctness — does the generated code actually work? |");
    lines.push("| **Docs** | Documentation coverage — are the right APIs/patterns referenced? |");
    lines.push("| **Doc Lift** | Score improvement when docs are provided vs baseline (no docs) |");
    lines.push("| **Grade** | ✅ A (≥80) · 🟡 B (≥70) · 🟠 C (≥50) · 🔴 D (<50) |");
    lines.push("");
    lines.push("</details>");
    lines.push("");
    // Negative Doc Lift warning (always visible, not collapsible)
    const negDocLiftAreas = sorted.filter((s) => s.negativeDocLift);
    if (negDocLiftAreas.length > 0) {
        for (const s of negDocLiftAreas) {
            lines.push(`> 🚨 **Negative Doc Lift:** \`${s.feature}\` (${s.docLift}) — docs hurt performance. Floor: ${s.floorScore}, Ceiling: ${s.ceilingScore}`);
        }
        lines.push("");
    }
    // Three-layer decomposition (when actual scores are present)
    const hasActualScores = sorted.some((s) => s.actualScore !== undefined);
    if (hasActualScores) {
        // Full decomposition — show all three layers prominently
        lines.push("### 🔬 Three-Layer Decomposition");
        lines.push("");
        lines.push("| Feature | Floor | Ceiling | Actual | Doc Lift | Retr. Gap | Infra % |");
        lines.push("|---------|-------|---------|--------|----------|-----------|---------|");
        for (const s of sorted) {
            const actualCell = s.actualScore !== undefined ? String(s.actualScore) : "—";
            const gapCell = s.retrievalGap !== undefined
                ? s.retrievalGap >= 0
                    ? `+${s.retrievalGap}`
                    : String(s.retrievalGap)
                : "—";
            const infraCell = s.infrastructureEfficiency != null
                ? `${Math.round(s.infrastructureEfficiency * 100)}%`
                : "—";
            const flag = s.invertedRetrievalGap ? " 🔄" : "";
            lines.push(`| ${s.feature} | ${s.floorScore} | ${s.ceilingScore} | ${actualCell} | ${liftArrow(s.docLift)} | ${gapCell}${flag} | ${infraCell} |`);
        }
        lines.push("");
        // Decomposition guide
        lines.push("<details>");
        lines.push("<summary>📖 What do these numbers mean?</summary>");
        lines.push("");
        lines.push("- **Floor:** How well models do without any documentation (just training data)");
        lines.push("- **Ceiling:** How well models do with perfect documentation (hand-picked, injected)");
        lines.push("- **Actual:** How well models do when they have to find docs themselves (like real users)");
        lines.push("- **Doc Lift:** How much docs help (Ceiling − Floor). Negative = docs hurt");
        lines.push("- **Retr. Gap:** How much quality is lost in discovery (Ceiling − Actual)");
        lines.push("- **Infra %:** What fraction of doc quality reaches agents (Actual ÷ Ceiling)");
        if (sorted.some((s) => s.invertedRetrievalGap)) {
            lines.push("- **🔄:** Inverted retrieval gap — agents avoid bad docs, scoring higher than ceiling");
        }
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    else {
        // Baseline-only — show the existing ceiling decomposition
        lines.push("<details>");
        lines.push("<summary>📊 Ceiling decomposition</summary>");
        lines.push("");
        lines.push("| Feature | Floor | Ceiling | Doc Lift | Quality Gap |");
        lines.push("|---------|-------|---------|----------|-------------|");
        for (const s of sorted) {
            lines.push(`| ${s.feature} | ${s.floorScore} | ${s.ceilingScore} | ${liftArrow(s.docLift)} | ${s.docQualityGap} |`);
        }
        lines.push("");
        lines.push("</details>");
        lines.push("");
        // With vs without docs comparison (uses ceiling/floor model)
        lines.push("<details>");
        lines.push("<summary>📄 With docs vs without docs</summary>");
        lines.push("");
        lines.push("| Feature | With Docs | Without Docs | Lift |");
        lines.push("|---------|-----------|--------------|------|");
        for (const s of sorted) {
            lines.push(`| ${s.feature} | ${s.ceilingScore} | ${s.floorScore} | ${liftArrow(s.docLift)} |`);
        }
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    // Cost breakdown (only when cost data is available)
    if (totalCost > 0 || overall.cost) {
        lines.push("<details>");
        lines.push("<summary>💰 Eval cost breakdown</summary>");
        lines.push("");
        lines.push("| Category | Cost |");
        lines.push("|----------|------|");
        lines.push(`| Provider (model inference) | ${formatCost(totalCost)} |`);
        if (graderCost > 0) {
            const graderLabel = overall.cost?.graderModel ?? "unknown";
            lines.push(`| Grader (${graderLabel}) | ${formatCost(graderCost)} |`);
        }
        lines.push(`| **Total** | **${formatCost(combinedCost)}** |`);
        lines.push("");
        // Per-feature provider cost breakdown
        lines.push("**Provider cost by feature area:**");
        lines.push("");
        lines.push("| Feature | Tests | Cost | Avg/Test |");
        lines.push("|---------|-------|------|----------|");
        for (const s of sorted) {
            // totalCost is optional per feature (the header total defaults it
            // to 0 as well); default here so rows never render NaN or throw.
            const featureCost = s.totalCost ?? 0;
            const avgCost = s.testCount > 0 ? featureCost / s.testCount : 0;
            lines.push(`| ${s.feature} | ${s.testCount} | ${formatCost(featureCost)} | ${formatCost(avgCost)} |`);
        }
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    // Per-model breakdown (when multiple models were evaluated)
    if (summary.perModel && summary.perModel.length > 1) {
        const modelsByScore = [...summary.perModel].sort((a, b) => b.overall.avgScore - a.overall.avgScore);
        lines.push("<details>");
        lines.push("<summary>🤖 Per-model scores</summary>");
        lines.push("");
        lines.push("| Model | Score | Doc Lift | Tests | Cost |");
        lines.push("|-------|-------|----------|-------|------|");
        for (const entry of modelsByScore) {
            const displayName = entry.label || entry.modelId;
            const modelCostStr = entry.overall.cost ? formatCost(entry.overall.cost) : "—";
            const liftStr = entry.overall.avgDocLift >= 0
                ? `+${Math.round(entry.overall.avgDocLift)}`
                : String(Math.round(entry.overall.avgDocLift));
            lines.push(`| ${displayName} | **${Math.round(entry.overall.avgScore)}** | ${liftStr} | ${entry.overall.testCount} | ${modelCostStr} |`);
        }
        lines.push("");
        // Per-model × per-area table
        for (const entry of modelsByScore) {
            const displayName = entry.label || entry.modelId;
            lines.push(`**${displayName}** (${entry.modelId}):`);
            lines.push("");
            lines.push("| Feature | Score | Task | Code | Docs | Lift |");
            lines.push("|---------|-------|------|------|------|------|");
            for (const s of entry.scores) {
                const lift = s.docLift >= 0 ? `+${s.docLift}` : String(s.docLift);
                lines.push(`| ${s.feature} | **${s.totalScore}** | ${s.taskCompletion} | ${s.codeCorrectness} | ${s.docCoverage} | ${lift} |`);
            }
            lines.push("");
        }
        // Cost per quality point
        const withCost = modelsByScore.filter((e) => e.overall.cost && e.overall.cost > 0);
        if (withCost.length > 0) {
            lines.push("**Cost efficiency:**");
            lines.push("");
            lines.push("| Model | $/point | Score | Cost |");
            lines.push("|-------|---------|-------|------|");
            for (const entry of withCost) {
                // Same label fallback as the tables above, so an unlabeled
                // model shows its id instead of an empty/undefined cell.
                const displayName = entry.label || entry.modelId;
                const costPerPoint = entry.overall.avgScore > 0
                    ? (entry.overall.cost ?? 0) / entry.overall.avgScore
                    : 0;
                lines.push(`| ${displayName} | ${formatCost(costPerPoint)} | ${Math.round(entry.overall.avgScore)} | ${formatCost(entry.overall.cost ?? 0)} |`);
            }
            lines.push("");
        }
        lines.push("</details>");
        lines.push("");
    }
    // Comparison section (when --compare was used)
    if (options.comparisonReport) {
        const report = options.comparisonReport;
        const overallDelta = report.deltas.overall;
        // Movement within ±noiseThreshold is reported as unchanged.
        const overallChangeIcon = overallDelta > report.noiseThreshold
            ? "📈"
            : overallDelta < -report.noiseThreshold
                ? "📉"
                : "➡️";
        lines.push("### 📊 Score Comparison");
        lines.push("");
        lines.push(`**Overall: ${Math.round(report.baseline.overall.avgScore)} → ${Math.round(report.experiment.overall.avgScore)}** (${overallChangeIcon} ${signed(overallDelta)})`);
        lines.push("");
        // Check if comparison data includes actual/retrieval gap deltas
        const hasActualDeltas = report.areas.some((a) => a.actualDelta !== undefined);
        if (hasActualDeltas) {
            lines.push("| Feature | Baseline | Current | Delta | Actual Δ | Ret. Gap Δ | Infra Δ |");
            lines.push("|---------|----------|---------|-------|----------|------------|---------|");
            for (const a of report.areas) {
                const actualDeltaCell = a.actualDelta !== undefined ? signed(a.actualDelta) : "—";
                const retGapDeltaCell = a.retrievalGapDelta !== undefined ? signed(a.retrievalGapDelta) : "—";
                // Efficiency is a 0–1 fraction, so its delta is shown in percentage points.
                const infraDeltaCell = a.infrastructureEfficiencyDelta !== undefined
                    ? `${a.infrastructureEfficiencyDelta > 0 ? "+" : ""}${Math.round(a.infrastructureEfficiencyDelta * 100)}pp`
                    : "—";
                lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${changeIcon(a.change)} ${signed(a.delta)} | ${actualDeltaCell} | ${retGapDeltaCell} | ${infraDeltaCell} |`);
            }
        }
        else {
            lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
            lines.push("|---------|----------|---------|-------|------|------|------|");
            for (const a of report.areas) {
                lines.push(`| ${a.area} | ${a.baseline} | ${a.experiment} | ${changeIcon(a.change)} ${signed(a.delta)} | ${signed(a.dimensions.taskCompletion.delta)} | ${signed(a.dimensions.codeCorrectness.delta)} | ${signed(a.dimensions.docCoverage.delta)} |`);
            }
        }
        lines.push("");
        const parts = [];
        if (report.improved.length > 0) {
            parts.push(`📈 ${report.improved.length} improved`);
        }
        if (report.regressed.length > 0) {
            parts.push(`📉 ${report.regressed.length} regressed`);
        }
        if (report.unchanged.length > 0) {
            parts.push(`➡️ ${report.unchanged.length} unchanged`);
        }
        if (parts.length > 0) {
            const isEmpirical = "noiseThresholdEmpirical" in report &&
                report.noiseThresholdEmpirical === true;
            const thresholdNote = isEmpirical
                ? ` (empirical threshold: ±${report.noiseThreshold.toFixed(1)})`
                : ` (threshold: ±${report.noiseThreshold})`;
            lines.push(parts.join(" · ") + thresholdNote);
            lines.push("");
        }
        // Dimension averages in collapsible
        lines.push("<details>");
        lines.push("<summary>Dimension averages</summary>");
        lines.push("");
        const dim = report.deltas.perDimension;
        lines.push("| Dimension | Delta |");
        lines.push("|-----------|-------|");
        lines.push(`| Task Completion | ${signed(dim.taskCompletion)} |`);
        lines.push(`| Code Correctness | ${signed(dim.codeCorrectness)} |`);
        lines.push(`| Doc Coverage | ${signed(dim.docCoverage)} |`);
        lines.push(`| Doc Lift | ${signed(report.deltas.docLift)} |`);
        lines.push("");
        lines.push("</details>");
        lines.push("");
    }
    // Recommendations
    if (belowCritical.length > 0 || sorted.some((s) => s.totalScore < 70)) {
        lines.push("### 💡 Recommendations");
        lines.push("");
        for (const s of sorted) {
            if (s.totalScore < 50) {
                lines.push(`- 🔴 **${s.feature}** (score: ${Math.round(s.totalScore)}) — needs significant doc improvements. `);
                if (s.codeCorrectness < 10) {
                    lines.push(` Code correctness is very low (${Math.round(s.codeCorrectness)}) — add more complete code examples.`);
                }
                if (s.docCoverage < 10) {
                    lines.push(` Doc coverage is very low (${Math.round(s.docCoverage)}) — key APIs/patterns may be missing from docs.`);
                }
            }
            else if (s.totalScore < 70) {
                lines.push(`- 🟠 **${s.feature}** (score: ${Math.round(s.totalScore)}) — has room for improvement.`);
                if (s.codeCorrectness < 15) {
                    lines.push(` Code correctness (${Math.round(s.codeCorrectness)}) could improve with better code examples.`);
                }
            }
        }
        lines.push("");
    }
    // Footer
    const date = new Date(timestamp).toLocaleString("en-US", {
        day: "numeric",
        hour: "numeric",
        minute: "2-digit",
        month: "short",
        timeZone: "UTC",
        timeZoneName: "short",
        year: "numeric",
    });
    lines.push("---");
    const runUrl = process.env.GITHUB_RUN_URL ?? "";
    const runLink = runUrl ? ` · <a href="${runUrl}">view run</a>` : "";
    const promptfooLink = options.promptfooUrl
        ? ` · <a href="${options.promptfooUrl}">view detailed results</a>`
        : "";
    lines.push(`<sub>🤖 Generated by AI Literacy Framework · ${date}${runLink}${promptfooLink} · <a href="https://github.com/sanity-labs/ai-literacy-framework">docs</a></sub>`);
    return lines.join("\n");
}
404
/**
 * Pick the grade emoji for a feature score.
 *
 * Cutoffs: ✅ at 80 and above, 🟡 at 70 and above, 🟠 at 50 and above,
 * 🔴 for anything lower.
 */
function gradeEmoji(score) {
    const tiers = [
        [80, "✅"],
        [70, "🟡"],
        [50, "🟠"],
    ];
    // Tiers are ordered highest first, so the first match wins.
    for (const [minimum, emoji] of tiers) {
        if (score >= minimum) {
            return emoji;
        }
    }
    return "🔴";
}
416
/**
 * Pick the letter grade for a feature score.
 *
 * Cutoffs: A at 80 and above, B at 70 and above, C at 50 and above,
 * D for anything lower.
 */
function gradeLetter(score) {
    return score >= 80 ? "A"
        : score >= 70 ? "B"
            : score >= 50 ? "C"
                : "D";
}
428
+ // ── Score display helpers ──────────────────────────────────────────────
429
/**
 * Render a doc-lift value as a trend icon plus the rounded number.
 *
 * The value is rounded first, so anything that rounds to zero is shown
 * as flat ("➡️ 0"); positive values get an explicit "+" sign.
 */
function liftArrow(lift) {
    const whole = Math.round(lift);
    return whole > 0
        ? `📈 +${whole}`
        : whole < 0
            ? `📉 ${whole}`
            : "➡️ 0";
}
439
/**
 * Pick the header status emoji for the overall average score.
 *
 * Cutoffs: 🟢 at 75 and above, 🟡 at 60 and above, 🟠 at 45 and above,
 * 🔴 for anything lower. These differ from the per-feature grade cutoffs.
 */
function overallEmoji(avg) {
    const bands = [
        { floor: 75, emoji: "🟢" },
        { floor: 60, emoji: "🟡" },
        { floor: 45, emoji: "🟠" },
    ];
    // Bands are ordered highest first; fall back to red when none match.
    const band = bands.find((candidate) => avg >= candidate.floor);
    return band ? band.emoji : "🔴";
}
451
+ // ── CLI ────────────────────────────────────────────────────────────────
452
// Command-line flags: `--output <path>` and `--promptfoo-url <url>`; each
// flag's value is the argument that immediately follows it.
const args = process.argv.slice(2);
const outputIdx = args.indexOf("--output");
const promptfooUrlIdx = args.indexOf("--promptfoo-url");
// Absent flags resolve to null (output) and undefined (promptfoo URL).
const outputPath = outputIdx === -1 ? null : args[outputIdx + 1];
const promptfooUrl = promptfooUrlIdx === -1 ? undefined : args[promptfooUrlIdx + 1];
// Input files live in results/latest, two levels above this script's directory.
const resultsDir = resolve(import.meta.dirname ?? ".", "../../results/latest");
const summaryPath = resolve(resultsDir, "score-summary.json");
const comparisonPath = resolve(resultsDir, "comparison-report.json");
459
+ /** Normalize legacy field names in a score object */
460
/**
 * Fill in the current score field names from their legacy equivalents.
 *
 * Fallback chains (first defined value wins):
 * - ceilingScore: ceilingScore, withDocsScore, totalScore, then 0
 * - floorScore: floorScore, withoutDocsScore, then 0
 * - docLift: docLift, liftFromDocs, then ceiling minus floor
 * - docQualityGap: existing value, else 100 minus ceiling
 * - negativeDocLift: existing value, else whether the lift is below zero
 *
 * @param s - Raw score object as read from score-summary.json.
 * @returns A copy of `s` with the five fields above always present; all
 *   other properties are carried over unchanged.
 */
function normalizeScore(s) {
    const ceilingScore = s.ceilingScore ?? s.withDocsScore ?? s.totalScore ?? 0;
    const floorScore = s.floorScore ?? s.withoutDocsScore ?? 0;
    const docLift = s.docLift ?? s.liftFromDocs ?? (ceilingScore - floorScore);
    const docQualityGap = s.docQualityGap ?? (100 - ceilingScore);
    const negativeDocLift = s.negativeDocLift ?? (docLift < 0);
    return { ...s, ceilingScore, docLift, docQualityGap, floorScore, negativeDocLift };
}
480
// Entry point: read the score summary, normalize legacy fields, optionally
// attach the comparison report, then write the markdown comment to the
// --output file or to stdout. Any failure is logged and exits with code 1.
try {
    const raw = readFileSync(summaryPath, "utf-8");
    const parsed = JSON.parse(raw);
    // Fail with a descriptive message instead of an opaque
    // "cannot read properties of undefined" when the file has the wrong shape.
    if (!Array.isArray(parsed?.scores)) {
        throw new Error(`${summaryPath} does not contain a "scores" array`);
    }
    // Normalize legacy field names in scores
    const summary = {
        ...parsed,
        scores: parsed.scores.map((s) => normalizeScore(s)),
    };
    // Load comparison report if it exists (produced by --compare flag)
    let comparisonReport;
    if (existsSync(comparisonPath)) {
        try {
            const compRaw = readFileSync(comparisonPath, "utf-8");
            comparisonReport = JSON.parse(compRaw);
        }
        catch (error) {
            // Non-fatal — comparison report is optional. Warn on stderr so a
            // corrupt report does not silently drop the comparison section.
            console.warn(`Skipping comparison section; could not load ${comparisonPath}:`, error);
        }
    }
    const comment = generateComment(summary, { comparisonReport, promptfooUrl });
    if (outputPath) {
        writeFileSync(outputPath, comment);
    }
    else {
        process.stdout.write(comment);
    }
}
catch (error) {
    console.error("Failed to generate PR comment:", error);
    process.exit(1);
}
@@ -0,0 +1,88 @@
1
+ /**
2
+ * readiness-report.ts
3
+ *
4
+ * Launch readiness report generator — Phase 5b of the Scenario Matrix
5
+ * implementation. Combines threshold evaluation, ceiling decomposition,
6
+ * and gap analysis into a single actionable readiness checklist for a
7
+ * given feature area.
8
+ *
9
+ * Usage:
10
+ * pnpm readiness-report --area visual-editing
11
+ * pnpm readiness-report --area groq --history
12
+ * pnpm readiness-report --area groq --output readiness.md
13
+ *
14
+ * Exports pure functions for unit testing:
15
+ * - generateReadinessReport() — builds the structured report
16
+ * - formatReadinessMarkdown() — renders the report as markdown
17
+ *
18
+ * @see docs/exec-plans/completed/scenario-matrix-implementation/phase-5-readiness-thresholds.md
19
+ */
20
+ import { type ThresholdConfig } from "../pipeline/schemas.js";
21
+ import type { GapAnalysisReport, GapEstimate, ScoreSummary, ThresholdEvaluation, ThresholdViolation } from "../pipeline/types.js";
22
+ /** Readiness check for one scoring dimension: its score compared against a threshold */
23
+ export interface DimensionCheck {
24
+ /** Dimension display name */
25
+ dimension: string;
26
+ /** Whether the dimension meets its threshold (presumably `score >= threshold` — confirm against the implementation) */
27
+ pass: boolean;
28
+ /** Score recorded for this dimension */
29
+ score: number;
30
+ /** Threshold this dimension's score is checked against */
31
+ threshold: number;
32
+ }
33
+ /** One past baseline of an area's score, used for trend tracking */
34
+ export interface HistoryEntry {
35
+ /** The area's composite score at this point in time */
36
+ score: number;
37
+ /** Optional free-form label for the baseline (e.g., "pre-groq-rewrite") */
38
+ tag?: string;
39
+ /** When this baseline was captured (string format not specified here — presumably ISO 8601, confirm) */
40
+ timestamp: string;
41
+ }
42
+ /** Structured readiness report for a single feature area — the output of generateReadinessReport() */
43
+ export interface ReadinessReport {
44
+ /** The feature area being evaluated */
45
+ area: string;
46
+ /** Ceiling decomposition for this area: floor and ceiling scores, doc lift, and doc quality gap */
47
+ ceiling: {
48
+ ceilingScore: number;
49
+ docLift: number;
50
+ docQualityGap: number;
51
+ floorScore: number;
52
+ };
53
+ /** Per-dimension breakdown with threshold comparison */
54
+ dimensions: DimensionCheck[];
55
+ /** Gap analysis entries for this area (empty if no gap data) */
56
+ gaps: GapEstimate[];
57
+ /** Historical scores (empty unless --history was used) */
58
+ history: HistoryEntry[];
59
+ /** Whether the area passes all thresholds */
60
+ pass: boolean;
61
+ /** The area's composite score */
62
+ score: number;
63
+ /** The composite threshold for this area */
64
+ threshold: number;
65
+ /** Full threshold evaluation result (see `violations` for the entries specific to this area) */
66
+ thresholdEvaluation: ThresholdEvaluation;
67
+ /** Violations specific to this area */
68
+ violations: ThresholdViolation[];
69
+ }
70
+ /**
71
+ * Render a readiness report (as built by generateReadinessReport) as markdown.
72
+ *
73
+ * Pure function — takes the structured report and returns the markdown string.
74
+ */
75
+ export declare function formatReadinessMarkdown(report: ReadinessReport): string;
76
+ /**
77
+ * Generate a structured readiness report for a given feature area.
78
+ *
79
+ * This is a pure function — all data arrives through `opts` (`gapAnalysis` and
80
+ * `history` are optional) and a structured report is returned. No I/O.
81
+ */
82
+ export declare function generateReadinessReport(opts: {
83
+ area: string;
84
+ gapAnalysis?: GapAnalysisReport;
85
+ history?: HistoryEntry[];
86
+ scoreSummary: ScoreSummary;
87
+ thresholdConfig: ThresholdConfig;
88
+ }): ReadinessReport;