@sanity/ailf 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (444) hide show
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +52 -32
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/checks.d.ts +8 -3
  191. package/dist/pipeline/checks.js +23 -3
  192. package/dist/pipeline/chronic-failures.d.ts +55 -0
  193. package/dist/pipeline/chronic-failures.js +110 -0
  194. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  195. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  196. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  197. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  198. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  199. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  200. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  201. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  202. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  203. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  204. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  205. package/dist/pipeline/compiler/config-loader.js +42 -2
  206. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  207. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  208. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  209. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  210. package/dist/pipeline/compiler/index.d.ts +2 -5
  211. package/dist/pipeline/compiler/index.js +2 -5
  212. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  213. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  217. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  218. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  219. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  220. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  224. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  225. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  226. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  227. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  230. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  231. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  232. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  233. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  234. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  235. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  251. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  252. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  253. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  254. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  255. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  256. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  257. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  269. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  270. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  271. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  272. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  273. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  274. package/dist/pipeline/compiler/task-bridge.js +92 -0
  275. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  276. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  277. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  278. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  279. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  280. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  281. package/dist/pipeline/coverage-audit.d.ts +1 -1
  282. package/dist/pipeline/coverage-audit.js +1 -1
  283. package/dist/pipeline/degradations.d.ts +1 -1
  284. package/dist/pipeline/degradations.js +1 -1
  285. package/dist/pipeline/failure-modes.d.ts +1 -1
  286. package/dist/pipeline/failure-modes.js +13 -1
  287. package/dist/pipeline/gap-analysis.d.ts +1 -1
  288. package/dist/pipeline/gap-analysis.js +3 -1
  289. package/dist/pipeline/generate-configs.d.ts +2 -2
  290. package/dist/pipeline/generate-configs.js +15 -8
  291. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  292. package/dist/pipeline/grader-compare-runner.js +7 -1
  293. package/dist/pipeline/grader-comparison.d.ts +1 -1
  294. package/dist/pipeline/grader-comparison.js +1 -1
  295. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency-runner.js +7 -1
  297. package/dist/pipeline/grader-consistency.d.ts +1 -1
  298. package/dist/pipeline/grader-consistency.js +1 -1
  299. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  301. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  302. package/dist/pipeline/grader-sensitivity.js +1 -1
  303. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  304. package/dist/pipeline/grader-validate-runner.js +2 -2
  305. package/dist/pipeline/grader-validation.d.ts +1 -1
  306. package/dist/pipeline/grader-validation.js +1 -1
  307. package/dist/pipeline/map-request-to-config.js +15 -2
  308. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  309. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  310. package/dist/pipeline/plan-format.d.ts +1 -1
  311. package/dist/pipeline/plan-format.js +1 -1
  312. package/dist/pipeline/plan.d.ts +1 -1
  313. package/dist/pipeline/plan.js +67 -29
  314. package/dist/pipeline/probe.d.ts +1 -1
  315. package/dist/pipeline/probe.js +1 -1
  316. package/dist/pipeline/readiness-report.d.ts +2 -2
  317. package/dist/pipeline/readiness-report.js +2 -2
  318. package/dist/pipeline/release-classification.d.ts +1 -1
  319. package/dist/pipeline/release-classification.js +1 -1
  320. package/dist/pipeline/release-report.d.ts +1 -1
  321. package/dist/pipeline/release-report.js +1 -1
  322. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  323. package/dist/pipeline/repo-eval-comment.js +1 -1
  324. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  325. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  326. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  327. package/dist/pipeline/resolve-mappings.js +44 -44
  328. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  329. package/dist/pipeline/retrieval-metrics.js +28 -20
  330. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  331. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  332. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  333. package/dist/pipeline/steps/compare-step.js +90 -0
  334. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  335. package/dist/pipeline/steps/eval-step.js +347 -0
  336. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  338. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  339. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  340. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  341. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  342. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  343. package/dist/pipeline/steps/publish-report-step.js +243 -0
  344. package/dist/pipeline/steps/report-step.d.ts +13 -0
  345. package/dist/pipeline/steps/report-step.js +56 -0
  346. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  347. package/dist/pipeline/steps/update-scores-step.js +42 -0
  348. package/dist/pipeline/targeted-loo.d.ts +1 -1
  349. package/dist/pipeline/targeted-loo.js +1 -1
  350. package/dist/pipeline/thresholds.d.ts +1 -1
  351. package/dist/pipeline/thresholds.js +1 -1
  352. package/dist/pipeline/validate.js +13 -0
  353. package/dist/report-store.d.ts +17 -0
  354. package/dist/report-store.js +24 -0
  355. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  356. package/dist/scripts/agent-behavior-report.js +315 -0
  357. package/dist/scripts/baseline.d.ts +43 -0
  358. package/dist/scripts/baseline.js +267 -0
  359. package/dist/scripts/calculate-scores.d.ts +166 -0
  360. package/dist/scripts/calculate-scores.js +1296 -0
  361. package/dist/scripts/compare.d.ts +22 -0
  362. package/dist/scripts/compare.js +334 -0
  363. package/dist/scripts/coverage-audit.d.ts +44 -0
  364. package/dist/scripts/coverage-audit.js +209 -0
  365. package/dist/scripts/debug-eval.d.ts +19 -0
  366. package/dist/scripts/debug-eval.js +73 -0
  367. package/dist/scripts/discovery-report.d.ts +58 -0
  368. package/dist/scripts/discovery-report.js +250 -0
  369. package/dist/scripts/fetch-docs.d.ts +35 -0
  370. package/dist/scripts/fetch-docs.js +472 -0
  371. package/dist/scripts/generate-configs.d.ts +66 -0
  372. package/dist/scripts/generate-configs.js +459 -0
  373. package/dist/scripts/grader-api.d.ts +27 -0
  374. package/dist/scripts/grader-api.js +206 -0
  375. package/dist/scripts/grader-compare.d.ts +22 -0
  376. package/dist/scripts/grader-compare.js +368 -0
  377. package/dist/scripts/grader-consistency.d.ts +20 -0
  378. package/dist/scripts/grader-consistency.js +313 -0
  379. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  380. package/dist/scripts/grader-sensitivity.js +354 -0
  381. package/dist/scripts/grader-validate.d.ts +19 -0
  382. package/dist/scripts/grader-validate.js +267 -0
  383. package/dist/scripts/measure-retrieval.d.ts +10 -0
  384. package/dist/scripts/measure-retrieval.js +145 -0
  385. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  386. package/dist/scripts/migrate-task-mode.js +1 -1
  387. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  388. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  389. package/dist/scripts/pipeline.d.ts +76 -0
  390. package/dist/scripts/pipeline.js +1031 -0
  391. package/dist/scripts/pr-comment.d.ts +10 -0
  392. package/dist/scripts/pr-comment.js +510 -0
  393. package/dist/scripts/readiness-report.d.ts +88 -0
  394. package/dist/scripts/readiness-report.js +342 -0
  395. package/dist/scripts/update-quality-scores.d.ts +15 -0
  396. package/dist/scripts/update-quality-scores.js +184 -0
  397. package/dist/scripts/validate-task-sources.d.ts +1 -1
  398. package/dist/scripts/validate-task-sources.js +1 -1
  399. package/dist/scripts/validate.d.ts +13 -0
  400. package/dist/scripts/validate.js +79 -0
  401. package/dist/scripts/webhook-server.d.ts +26 -0
  402. package/dist/scripts/webhook-server.js +147 -0
  403. package/dist/scripts/weekly-digest.d.ts +24 -0
  404. package/dist/scripts/weekly-digest.js +144 -0
  405. package/dist/sinks/format-slack.d.ts +64 -0
  406. package/dist/sinks/format-slack.js +306 -0
  407. package/dist/sinks/slack-sink.d.ts +27 -0
  408. package/dist/sinks/slack-sink.js +78 -0
  409. package/dist/sinks/types.d.ts +1 -1
  410. package/dist/sinks/types.js +1 -1
  411. package/dist/sinks/webhook-sink.d.ts +19 -0
  412. package/dist/sinks/webhook-sink.js +50 -0
  413. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  414. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  415. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  416. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  417. package/dist/tasks/literacy/functions.task.ts +70 -0
  418. package/dist/tasks/literacy/groq.task.ts +259 -0
  419. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  420. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  421. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  422. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  423. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  424. package/package.json +25 -25
  425. package/tasks/.expanded.agentic.yaml +280 -0
  426. package/tasks/.expanded.yaml +565 -0
  427. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  428. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  429. package/tasks/literacy/content-lake.task.ts +181 -0
  430. package/tasks/literacy/frameworks.task.ts +1 -0
  431. package/tasks/literacy/functions.task.ts +1 -0
  432. package/tasks/literacy/groq.task.ts +1 -0
  433. package/tasks/literacy/image-handling.task.ts +95 -0
  434. package/tasks/literacy/nextjs-live.task.ts +2 -1
  435. package/tasks/literacy/portable-text.task.ts +169 -0
  436. package/tasks/literacy/studio-setup.task.ts +5 -2
  437. package/tasks/literacy/visual-editing.task.ts +1 -0
  438. package/LICENSE +0 -21
  439. package/tasks/frameworks.yaml +0 -98
  440. package/tasks/functions.yaml +0 -51
  441. package/tasks/groq.yaml +0 -216
  442. package/tasks/nextjs-live.yaml +0 -62
  443. package/tasks/studio-setup.yaml +0 -111
  444. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,347 @@
1
+ /**
2
+ * Pipeline step: Run promptfoo evaluation.
3
+ *
4
+ * Preconditions: config files and context files exist
5
+ * Postconditions: eval-results.json exists and is valid
6
+ *
7
+ * Cache key: promptfooconfig*.yaml + contexts/*.md + tasks/*.yaml +
8
+ * canonical contexts + reference solutions + config/models.yaml
9
+ * Cache outputs: results/latest/eval-results*.json
10
+ *
11
+ * Remote cache: When local cache misses and a Sanity token is available,
12
+ * the step queries the Content Lake for a report with a matching eval
13
+ * fingerprint. On a hit, the cached score-summary.json is written to disk
14
+ * and the eval + calculate-scores steps are skipped entirely.
15
+ *
16
+ * @see docs/design-docs/content-lake-eval-caching.md
17
+ */
18
+ import { execSync } from "child_process";
19
+ import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
20
+ import { dirname, resolve } from "path";
21
+ import { fileURLToPath } from "url";
22
+ import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
23
+ import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../checks.js";
24
+ import { computeEvalFingerprint } from "../eval-fingerprint.js";
25
+ import { resolveMappings } from "../resolve-mappings.js";
26
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Three directory levels above the directory that contains this module.
const ROOT = resolve(__dirname, "../../..");
// Promptfoo config file name for each eval mode.
const CONFIG_FILES = {
    agentic: "promptfooconfig.agentic.yaml",
    baseline: "promptfooconfig.yaml",
    observed: "promptfooconfig.observed.yaml",
};
/**
 * Results file for each eval mode. Every mode writes to its own file
 * (the location is set through `outputPath` in that mode's config).
 */
export const RESULTS_FILES = {
    agentic: "results/latest/eval-results-agentic.json",
    baseline: "results/latest/eval-results.json",
    observed: "results/latest/eval-results-observed.json",
};
39
/**
 * Build the `--filter-*` command-line flags for a debug run.
 *
 * @param {object} [debug] Debug options. Fields read: `enabled`, `pattern`,
 *   `sample`, `firstN`.
 * @returns {string} An empty string when `debug` is missing or not enabled.
 *   Otherwise the flags joined by spaces with one leading space, so the
 *   result can be appended directly to a command string.
 */
export function buildFilterFlags(debug) {
    if (!debug?.enabled) {
        return "";
    }
    const flags = [];
    if (debug.pattern) {
        // The pattern is wrapped in single quotes, so an embedded single quote
        // would end the quoting early. Escape it POSIX-style: close the quote,
        // emit an escaped quote, reopen ('\'').
        // NOTE(review): presumably this string ends up in a shell command
        // (the module imports execSync) — confirm against the caller.
        const quotedPattern = String(debug.pattern).replace(/'/g, () => "'\\''");
        flags.push(`--filter-pattern '${quotedPattern}'`);
    }
    if (debug.sample) {
        flags.push(`--filter-sample ${debug.sample}`);
    }
    if (debug.firstN) {
        flags.push(`--filter-first-n ${debug.firstN}`);
    }
    // Default: first 2 tests when no other filters specified
    if (flags.length === 0) {
        flags.push("--filter-first-n 2");
    }
    return " " + flags.join(" ");
}
58
/**
 * Look up the Promptfoo share URL recorded in a mode's eval results file.
 *
 * Promptfoo writes a `shareableUrl` field into the results file when
 * `PROMPTFOO_API_KEY` is set. Reading it from the JSON replaces the earlier
 * approach of scraping the URL from a captured log file, which required
 * piping through `tee` and broke TTY progress reporting.
 *
 * @param mode Key into `RESULTS_FILES` selecting which results file to read.
 * @returns The `shareableUrl` value, or `undefined` when the file does not
 *   exist, cannot be read or parsed, or has no such field.
 */
export function extractShareUrl(mode) {
    const resultsFile = resolve(ROOT, RESULTS_FILES[mode]);
    if (existsSync(resultsFile)) {
        try {
            const { shareableUrl } = JSON.parse(readFileSync(resultsFile, "utf-8"));
            return shareableUrl ?? undefined;
        }
        catch {
            // Best-effort lookup: an unreadable or malformed results file is
            // treated the same as "no share URL".
        }
    }
    return undefined;
}
79
+ // ---------------------------------------------------------------------------
80
+ // Post-eval error scanning
81
+ // ---------------------------------------------------------------------------
82
/**
 * Pipeline step: run `promptfoo eval` for a single mode.
 *
 * Order of operations: verify preconditions (generated configs and canonical
 * context files), compute the eval fingerprint, consult the local and then
 * the remote cache, execute promptfoo, verify the results file, print a
 * summary of errored results, and record the local cache entry.
 *
 * @param mode Key into CONFIG_FILES / RESULTS_FILES selecting the config and results file.
 * @param debug Optional debug options; `debug.enabled` disables fingerprinting and
 *   cache recording, and the object is passed to buildFilterFlags.
 * @param concurrency When truthy, forwarded to promptfoo as `--max-concurrency`.
 * @param noCache When true, skips both cache lookups and cache recording and
 *   passes `--no-cache` to promptfoo.
 * @param remoteCacheOpts Optional `{ filter, graderModel, noRemoteCache, sanityToken }`.
 * @returns `{ stepResult }`, plus `evalFingerprint` once it has been computed and
 *   `remoteCacheHit: true` when a remote report was reused.
 */
export async function runEval(mode, debug, concurrency, noCache = false, remoteCacheOpts) {
    const start = Date.now();
    // Precondition: config file exists
    const configIssues = checkGeneratedConfigsExist(ROOT);
    const configErrors = configIssues.filter((i) => i.severity === "error");
    if (configErrors.length > 0) {
        return {
            stepResult: {
                durationMs: Date.now() - start,
                error: `Config files missing: ${configErrors.map((e) => e.message).join("; ")}`,
                status: "failed",
            },
        };
    }
    // Precondition: canonical context files exist for all mapped tasks
    const mappings = resolveMappings(ROOT);
    const taskIds = Object.values(mappings.feature_areas).flatMap((area) => area.tasks.map((t) => t.id));
    const contextIssues = checkCanonicalContextsExist(ROOT, taskIds);
    const contextErrors = contextIssues.filter((i) => i.severity === "error");
    if (contextErrors.length > 0) {
        return {
            stepResult: {
                durationMs: Date.now() - start,
                error: `Context files missing. Run 'pnpm fetch-docs' first. ${contextErrors.map((e) => e.message).join("; ")}`,
                status: "failed",
            },
        };
    }
    // -----------------------------------------------------------------------
    // Compute eval fingerprint (used for both remote cache + provenance)
    // Only for non-debug runs — debug runs use test subsets.
    // -----------------------------------------------------------------------
    let evalFingerprint;
    if (!debug?.enabled && remoteCacheOpts?.graderModel) {
        try {
            evalFingerprint = computeEvalFingerprint({
                filter: remoteCacheOpts.filter,
                graderModel: remoteCacheOpts.graderModel,
                mode,
                rootDir: ROOT,
            });
        }
        catch (err) {
            // Best-effort: a missing fingerprint only disables the remote cache lookup.
            console.warn(` ⚠️ Could not compute eval fingerprint: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // -----------------------------------------------------------------------
    // Cache check — local first, then remote
    // -----------------------------------------------------------------------
    // Local cache check — skip eval if inputs unchanged (biggest cost saver).
    // Each mode gets its own cache key so that in `full` mode, a fresh agentic
    // cache doesn't force baseline to re-run (or vice versa).
    const cacheKey = `eval-${mode}`;
    if (!noCache) {
        const cacheResult = lookupCache(ROOT, cacheKey);
        if (cacheResult.hit) {
            return {
                evalFingerprint,
                stepResult: {
                    durationMs: Date.now() - start,
                    status: "success",
                    summary: `Skipped (cached) — ${cacheResult.entry.summary}`,
                },
            };
        }
    }
    // Remote cache check — query Content Lake for matching fingerprint
    if (evalFingerprint &&
        !noCache &&
        !remoteCacheOpts?.noRemoteCache &&
        remoteCacheOpts?.sanityToken) {
        const remoteCacheResult = await checkRemoteCache(evalFingerprint, remoteCacheOpts.sanityToken);
        if (remoteCacheResult) {
            return {
                evalFingerprint,
                remoteCacheHit: true,
                stepResult: {
                    durationMs: Date.now() - start,
                    status: "success",
                    summary: `Skipped (remote cache hit) — reusing report ${remoteCacheResult.reportId} from ${remoteCacheResult.completedAt}`,
                },
            };
        }
    }
    // Execute — run promptfoo directly with inherited stdio so the TTY
    // progress bar works in interactive terminals and the CI progress
    // reporter works in CI environments. Previously this was piped through
    // `tee` to capture a log file for share-URL extraction, but `tee`
    // destroyed TTY detection, disabling all progress output. The share URL
    // is now read from the eval results JSON (`shareableUrl` field) instead.
    //
    // Sharing is enabled by default (via PROMPTFOO_API_KEY / cloud config).
    // We set PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST=1 to prevent promptfoo's
    // interactive email prompt from blocking the terminal in local TTY
    // environments. In CI, isCI() already guards against the prompt, but
    // the env var provides defense-in-depth for all execution contexts.
    const configFile = CONFIG_FILES[mode];
    const filterFlags = buildFilterFlags(debug);
    const concurrencyFlag = concurrency ? ` --max-concurrency ${concurrency}` : "";
    const noCacheFlag = noCache ? " --no-cache" : "";
    const evalCmd = `dotenv -e ../../.env -o -- promptfoo eval --config ${configFile}${filterFlags}${concurrencyFlag}${noCacheFlag}`;
    let exitCode = 0;
    try {
        execSync(evalCmd, {
            cwd: ROOT,
            env: {
                ...process.env,
                PROMPTFOO_DISABLE_SHARE_EMAIL_REQUEST: "1",
            },
            stdio: "inherit",
        });
    }
    catch (err) {
        // promptfoo exits 100 when assertions fail — that's expected, not an error.
        // When the child is killed by a signal, `status` is null and `signal`
        // names the signal; normalise the code so the message never reads
        // "exit code null".
        const failure = err !== null && typeof err === "object" ? err : {};
        exitCode = typeof failure.status === "number" ? failure.status : 1;
        if (exitCode !== 100) {
            const killedBySignal = typeof failure.status !== "number" && typeof failure.signal === "string";
            return {
                evalFingerprint,
                stepResult: {
                    durationMs: Date.now() - start,
                    error: killedBySignal
                        ? `promptfoo eval was terminated by signal ${failure.signal}`
                        : `promptfoo eval failed with exit code ${exitCode}`,
                    status: "failed",
                },
            };
        }
    }
    // Postcondition: results file exists and is valid
    const resultsIssues = checkResultsExist(ROOT, RESULTS_FILES[mode]);
    const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
    if (resultsErrors.length > 0) {
        return {
            evalFingerprint,
            stepResult: {
                durationMs: Date.now() - start,
                error: `Postcondition failed: ${resultsErrors.map((e) => e.message).join("; ")}`,
                status: "failed",
            },
        };
    }
    // Scan results for errors and surface them clearly
    const errorSummary = scanResultsForErrors(resolve(ROOT, RESULTS_FILES[mode]));
    if (errorSummary) {
        console.log();
        console.log(errorSummary);
    }
    const durationMs = Date.now() - start;
    const summary = `Evaluation complete (mode: ${mode}${debug?.enabled ? ", debug" : ""})`;
    // Record cache — only for non-debug runs (debug uses a subset of tests).
    // Uses per-mode cache key so baseline and agentic are independently cached.
    if (!noCache && !debug?.enabled) {
        const inputPaths = getStepInputPaths(ROOT, cacheKey);
        const inputHash = hashFiles(inputPaths);
        recordCache(ROOT, cacheKey, inputHash, summary, durationMs, [
            RESULTS_FILES[mode],
        ]);
    }
    return {
        evalFingerprint,
        stepResult: { durationMs, status: "success", summary },
    };
}
246
+ // ---------------------------------------------------------------------------
247
+ // Remote cache helpers
248
+ // ---------------------------------------------------------------------------
249
/**
 * Look up a stored report whose eval fingerprint matches `fingerprint`.
 *
 * When a report is found, its summary is written to
 * results/latest/score-summary.json so that later pipeline steps can run
 * as though the eval had just completed.
 *
 * Any failure (import, query, or file write) is logged as a warning and
 * reported as a miss.
 *
 * @returns `{ completedAt, reportId }` on a hit, null on a miss or error
 */
async function checkRemoteCache(fingerprint, sanityToken) {
    try {
        const reportStoreModule = await import("../../report-store.js");
        const { AILF_REPORT_DATASET: dataset, AILF_REPORT_PROJECT_ID: projectId } = process.env;
        const store = new reportStoreModule.ReportStore({
            dataset,
            projectId,
            token: sanityToken,
        });
        // Time the lookup so the log line can report query latency.
        const queryStartedAt = Date.now();
        const match = await store.findByFingerprint(fingerprint);
        const elapsedMs = Date.now() - queryStartedAt;
        if (match) {
            // Persist the cached score summary where downstream steps expect it.
            const latestDir = resolve(ROOT, "results", "latest");
            if (!existsSync(latestDir)) {
                mkdirSync(latestDir, { recursive: true });
            }
            const summaryJson = JSON.stringify(match.summary, null, 2);
            writeFileSync(resolve(latestDir, "score-summary.json"), summaryJson);
            console.log(` ✅ Remote cache hit — reusing report ${match.id} from ${match.completedAt}`);
            console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${elapsedMs}ms)`);
            console.log(" ⚠️ Cached scores are statistically equivalent, not identical");
            return {
                completedAt: match.completedAt,
                reportId: match.id,
            };
        }
        console.log(` ℹ️ Remote cache miss — no report matches fingerprint (${elapsedMs}ms)`);
        return null;
    }
    catch (err) {
        const detail = err instanceof Error ? err.message : String(err);
        console.warn(` ⚠️ Remote cache check failed: ${detail}`);
        return null;
    }
}
292
/**
 * Read the eval results JSON and produce a human-readable summary of any
 * results that have no grading result (`gradingResult === null`). This
 * surfaces API errors, timeouts, and other issues that would otherwise be
 * buried in the Promptfoo table output.
 *
 * The scan is best-effort: a missing file, unparseable JSON, or an
 * unexpected top-level shape all yield null, and malformed (null or
 * non-object) entries inside the results array are skipped rather than
 * throwing.
 *
 * @param resultsPath Path to the eval results JSON file.
 * @returns A multi-line boxed summary string, or null if there is nothing to report.
 */
function scanResultsForErrors(resultsPath) {
    if (!existsSync(resultsPath))
        return null;
    let file;
    try {
        const raw = readFileSync(resultsPath, "utf-8");
        file = JSON.parse(raw);
    }
    catch {
        return null;
    }
    const results = file?.results?.results;
    if (!Array.isArray(results))
        return null;
    const errored = [];
    for (const r of results) {
        // Skip malformed entries: dereferencing a null/primitive entry would
        // throw and abort the whole step after an otherwise successful eval.
        if (r === null || typeof r !== "object")
            continue;
        if (r.gradingResult !== null)
            continue;
        const desc = r.testCase?.description ?? r.description ?? "unknown";
        const provider = r.provider?.label ?? r.provider?.id ?? "unknown";
        // No grading result = the provider errored before producing a response.
        // This is the only case we surface — API 500s, timeouts, rate limits.
        // Note: r.error may also be set for assertion failures, but those have
        // a non-null gradingResult and are normal pass/fail outcomes.
        const errorMsg = r.error
            ? (typeof r.error === "string" ? r.error : JSON.stringify(r.error)).slice(0, 200)
            : "Provider returned no scorable result";
        errored.push({ description: desc, error: errorMsg, provider });
    }
    if (errored.length === 0)
        return null;
    // The denominator counts every entry in the results array.
    const total = results.length;
    const lines = [];
    lines.push(` ┌─────────────────────────────────────────────────────────────`);
    lines.push(` │ ⚠️ ${errored.length} of ${total} eval result(s) errored (no gradingResult)`);
    lines.push(` │`);
    for (const e of errored) {
        lines.push(` │ ✗ [${e.provider}] ${e.description}`);
        lines.push(` │ → ${e.error}`);
    }
    const errorRate = Math.round((errored.length / total) * 100);
    if (errorRate >= 25) {
        lines.push(` │`);
        lines.push(` │ 🔥 High error rate (${errorRate}%) — check API keys, rate limits,`);
        lines.push(` │ or model availability. Errored results are excluded from scoring.`);
    }
    lines.push(` └─────────────────────────────────────────────────────────────`);
    return lines.join("\n");
}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Pipeline step: Fetch documentation from Sanity CMS.
3
+ *
4
+ * Preconditions: tasks have inline canonical_docs
5
+ * Postconditions: canonical context files exist for all mapped tasks
6
+ *
7
+ * Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
8
+ * Cache outputs: contexts/canonical/*.md files + contexts/document-manifest.json
9
+ */
10
+ import type { StepResult } from "../types.js";
11
+ export declare function runFetchDocs(source?: string, noCache?: boolean): Promise<StepResult>;
@@ -0,0 +1,84 @@
1
+ /**
2
+ * Pipeline step: Fetch documentation from Sanity CMS.
3
+ *
4
+ * Preconditions: tasks have inline canonical_docs
5
+ * Postconditions: canonical context files exist for all mapped tasks
6
+ *
7
+ * Cache key: tasks/*.yaml + config/sources.yaml + config/models.yaml
8
+ * Cache outputs: contexts/canonical/*.md files + contexts/document-manifest.json
9
+ */
10
+ import { execSync } from "child_process";
11
+ import { dirname, resolve } from "path";
12
+ import { fileURLToPath } from "url";
13
+ import { lookupCache, recordCache } from "../cache.js";
14
+ import { checkCanonicalContextsExist } from "../checks.js";
15
+ import { resolveMappings } from "../resolve-mappings.js";
16
+ const __dirname = dirname(fileURLToPath(import.meta.url));
17
+ const ROOT = resolve(__dirname, "..", "..", "..");
18
/**
 * Run the fetch-docs step: shell out to `pnpm fetch-docs`, then confirm that
 * a canonical context file exists for every mapped task.
 *
 * @param source Optional value forwarded as `--source`.
 * @param noCache When true, neither consults nor records the step cache.
 * @returns A step result with `status` "success" or "failed".
 */
export async function runFetchDocs(source, noCache = false) {
    const start = Date.now();
    const useCache = !noCache;
    // Precondition: at least one task has inline canonical mappings
    let taskCount = 0;
    for (const area of Object.values(resolveMappings(ROOT).feature_areas)) {
        taskCount += area.tasks.length;
    }
    if (taskCount === 0) {
        return {
            durationMs: Date.now() - start,
            error: "No tasks with canonical_docs found in task files. Add canonical_docs to your task definitions.",
            status: "failed",
        };
    }
    // Cache check
    if (useCache) {
        const cached = lookupCache(ROOT, "fetch-docs");
        if (cached.hit) {
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Skipped (cached) — ${cached.entry.summary}`,
            };
        }
    }
    // Execute
    let command = "pnpm fetch-docs";
    if (source) {
        command += ` --source ${source}`;
    }
    try {
        execSync(command, { cwd: ROOT, env: process.env, stdio: "inherit" });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return {
            durationMs: Date.now() - start,
            error: `fetch-docs failed: ${reason}`,
            status: "failed",
        };
    }
    // Postcondition: canonical context files exist for all mapped tasks.
    // Mappings are resolved again in case fetch-docs changed them.
    const taskIds = [];
    for (const area of Object.values(resolveMappings(ROOT).feature_areas)) {
        for (const task of area.tasks) {
            taskIds.push(task.id);
        }
    }
    const missing = checkCanonicalContextsExist(ROOT, taskIds).filter((issue) => issue.severity === "error");
    if (missing.length > 0) {
        const details = missing.map((issue) => issue.message).join("; ");
        return {
            durationMs: Date.now() - start,
            error: `Postcondition failed: ${details}`,
            status: "failed",
        };
    }
    const durationMs = Date.now() - start;
    const summary = `Fetched canonical contexts for ${taskIds.length} tasks`;
    // Record cache
    if (useCache) {
        const cacheModule = await import("../cache.js");
        const inputHash = cacheModule.hashFiles(cacheModule.getStepInputPaths(ROOT, "fetch-docs"));
        const outputPaths = taskIds.map((id) => `contexts/canonical/${id}.md`);
        outputPaths.push("contexts/document-manifest.json");
        recordCache(ROOT, "fetch-docs", inputHash, summary, durationMs, outputPaths);
    }
    return { durationMs, status: "success", summary };
}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * Pipeline step: Generate promptfoo config files from config/models.yaml.
3
+ *
4
+ * Preconditions: config/models.yaml is valid
5
+ * Postconditions: promptfooconfig*.yaml files exist
6
+ *
7
+ * Cache key: config/models.yaml + config/sources.yaml + tasks/*.yaml
8
+ * Cache outputs: promptfooconfig*.yaml files
9
+ */
10
+ import type { StepResult } from "../types.js";
11
+ export declare function runGenerateConfigs(source?: string, noCache?: boolean): StepResult;
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Pipeline step: Generate promptfoo config files from config/models.yaml.
3
+ *
4
+ * Preconditions: config/models.yaml is valid
5
+ * Postconditions: promptfooconfig*.yaml files exist
6
+ *
7
+ * Cache key: config/models.yaml + config/sources.yaml + tasks/*.yaml
8
+ * Cache outputs: promptfooconfig*.yaml files
9
+ */
10
+ import { execSync } from "child_process";
11
+ import { dirname, resolve } from "path";
12
+ import { fileURLToPath } from "url";
13
+ import { getStepInputPaths, hashFiles, lookupCache, recordCache, } from "../cache.js";
14
+ import { checkGeneratedConfigsExist } from "../checks.js";
15
+ import { validateModelsYaml } from "../validate.js";
16
+ const __dirname = dirname(fileURLToPath(import.meta.url));
17
+ const ROOT = resolve(__dirname, "..", "..", "..");
18
/**
 * Run the generate-configs step: validate config/models.yaml, shell out to
 * `pnpm generate-configs`, then confirm the generated config files exist.
 *
 * @param source Optional value forwarded as `--source`.
 * @param noCache When true, neither consults nor records the step cache.
 * @returns A step result with `status` "success" or "failed".
 */
export function runGenerateConfigs(source, noCache = false) {
    const start = Date.now();
    const useCache = !noCache;
    // Precondition: config/models.yaml must be valid
    const validationErrors = validateModelsYaml(ROOT).filter((issue) => issue.severity === "error");
    if (validationErrors.length > 0) {
        const details = validationErrors.map((issue) => issue.message).join("; ");
        return {
            durationMs: Date.now() - start,
            error: `config/models.yaml validation failed: ${details}`,
            status: "failed",
        };
    }
    // Cache check — the filter env vars are part of the lookup so a scoped
    // run never reuses an entry from an unscoped or differently-scoped run.
    const filterContext = buildFilterContext();
    if (useCache) {
        const cached = lookupCache(ROOT, "generate-configs", filterContext);
        if (cached.hit) {
            return {
                durationMs: Date.now() - start,
                status: "success",
                summary: `Skipped (cached) — ${cached.entry.summary}`,
            };
        }
    }
    // Execute
    let command = "pnpm generate-configs";
    if (source) {
        command += ` --source ${source}`;
    }
    try {
        execSync(command, { cwd: ROOT, env: process.env, stdio: "inherit" });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return {
            durationMs: Date.now() - start,
            error: `generate-configs failed: ${reason}`,
            status: "failed",
        };
    }
    // Postcondition: config files exist
    const missingConfigs = checkGeneratedConfigsExist(ROOT).filter((issue) => issue.severity === "error");
    if (missingConfigs.length > 0) {
        const details = missingConfigs.map((issue) => issue.message).join("; ");
        return {
            durationMs: Date.now() - start,
            error: `Postcondition failed: ${details}`,
            status: "failed",
        };
    }
    const durationMs = Date.now() - start;
    const summary = "Generated promptfoo config files";
    // Record cache
    if (useCache) {
        const inputHash = hashFiles(getStepInputPaths(ROOT, "generate-configs"), filterContext);
        recordCache(ROOT, "generate-configs", inputHash, summary, durationMs, [
            "promptfooconfig.yaml",
            "promptfooconfig.observed.yaml",
            "promptfooconfig.agentic.yaml",
        ]);
    }
    return { durationMs, status: "success", summary };
}
84
/**
 * Derive cache-context strings from the filter environment variables.
 *
 * A non-empty EVAL_FILTER_AREAS contributes `areas:<value>` and a non-empty
 * EVAL_FILTER_TASKS contributes `tasks:<value>`, in that order, so that
 * differently-scoped runs do not share cache entries.
 *
 * @returns An array of zero, one, or two context strings.
 */
function buildFilterContext() {
    const { EVAL_FILTER_AREAS: areas, EVAL_FILTER_TASKS: tasks } = process.env;
    return [
        ...(areas ? [`areas:${areas}`] : []),
        ...(tasks ? [`tasks:${tasks}`] : []),
    ];
}
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Pipeline step: Measure grader consistency via replicated grading.
3
+ *
4
+ * This step is OPTIONAL — it only runs when --grader-replications N is passed.
5
+ * It re-runs grading assertions N additional times on the same model responses
6
+ * and measures score variance across replications.
7
+ *
8
+ * Preconditions: eval-results.json exists (model responses to re-grade)
9
+ * Postconditions: grader-consistency.json written to results/latest/
10
+ *
11
+ * Not cached: Each run involves fresh API calls to the grader model.
12
+ * The whole point is to measure variance, so caching would defeat the purpose.
13
+ */
14
+ import type { EvalMode, StepResult } from "../types.js";
15
+ /**
16
+ * Run grader consistency analysis.
17
+ *
18
+ * @param replications Number of additional grading replications (default: 5)
19
+ * @param mode Eval mode — determines which results file to read
20
+ */
21
+ export declare function runGraderConsistency(replications?: number, mode?: EvalMode): StepResult;
@@ -0,0 +1,74 @@
1
+ /**
2
+ * Pipeline step: Measure grader consistency via replicated grading.
3
+ *
4
+ * This step is OPTIONAL — it only runs when --grader-replications N is passed.
5
+ * It re-runs grading assertions N additional times on the same model responses
6
+ * and measures score variance across replications.
7
+ *
8
+ * Preconditions: eval-results.json exists (model responses to re-grade)
9
+ * Postconditions: grader-consistency.json written to results/latest/
10
+ *
11
+ * Not cached: Each run involves fresh API calls to the grader model.
12
+ * The whole point is to measure variance, so caching would defeat the purpose.
13
+ */
14
+ import { execSync } from "child_process";
15
+ import { existsSync } from "fs";
16
+ import { dirname, resolve } from "path";
17
+ import { fileURLToPath } from "url";
18
+ import { checkResultsExist } from "../checks.js";
19
+ import { RESULTS_FILES } from "./eval-step.js";
20
+ const __dirname = dirname(fileURLToPath(import.meta.url));
21
+ const ROOT = resolve(__dirname, "..", "..", "..");
22
/**
 * Run grader consistency analysis.
 *
 * Verifies that the eval results file exists, shells out to the
 * grader-consistency script, and then checks that
 * results/latest/grader-consistency.json was produced.
 *
 * @param replications Number of additional grading replications (default: 5)
 * @param mode Eval mode — determines which results file to read; "full" is
 *   mapped to the baseline results.
 * @returns A step result with `status` "success" or "failed".
 */
export function runGraderConsistency(replications = 5, mode = "baseline") {
    const start = Date.now();
    // For full mode, use baseline results for grader consistency analysis
    const concreteMode = mode === "full" ? "baseline" : mode;
    const resultsFile = RESULTS_FILES[concreteMode];
    const resultsIssues = checkResultsExist(ROOT, resultsFile);
    const resultsErrors = resultsIssues.filter((i) => i.severity === "error");
    if (resultsErrors.length > 0) {
        return {
            durationMs: Date.now() - start,
            error: `Results missing: ${resultsErrors.map((e) => e.message).join("; ")}. Run eval first.`,
            status: "failed",
        };
    }
    // Execute
    try {
        execSync(`tsx src/lib/grader-consistency.ts --replications ${replications} --results ${resultsFile}`, {
            cwd: ROOT,
            env: process.env,
            stdio: "inherit",
        });
    }
    catch (err) {
        // When the child is killed by a signal, `status` is null and `signal`
        // names the signal; report that instead of "exit code null".
        const failure = err !== null && typeof err === "object" ? err : {};
        const hasExitCode = typeof failure.status === "number";
        const code = hasExitCode ? failure.status : 1;
        const killedBySignal = !hasExitCode && typeof failure.signal === "string";
        return {
            durationMs: Date.now() - start,
            error: killedBySignal
                ? `grader-consistency was terminated by signal ${failure.signal}`
                : `grader-consistency failed with exit code ${code}`,
            status: "failed",
        };
    }
    // Postcondition: output file exists
    const outputPath = resolve(ROOT, "results", "latest", "grader-consistency.json");
    if (!existsSync(outputPath)) {
        return {
            durationMs: Date.now() - start,
            error: "grader-consistency.json was not created",
            status: "failed",
        };
    }
    return {
        durationMs: Date.now() - start,
        status: "success",
        summary: `Grader consistency analysis complete (${replications} replications)`,
    };
}