@sanity/ailf 1.0.0 → 2.0.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (499)
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,100 @@
1
+ /**
2
+ * MCP server task compilation — core compiler logic.
3
+ *
4
+ * Produces Promptfoo configuration from MCP server task definitions:
5
+ * 1. A provider config pointing to the MCP server
6
+ * 2. Test cases with tool-call assertions
7
+ * 3. Appropriate prompts for the evaluation
8
+ */
9
+ import { buildMCPAssertions } from "./assertions.js";
10
+ import { buildMCPProvider } from "./provider-config.js";
11
+ import { validateMCPTask } from "./validation.js";
12
+ // ---------------------------------------------------------------------------
13
+ // Public API
14
+ // ---------------------------------------------------------------------------
/**
 * Turn one MCP server task definition into the pieces of a Promptfoo config.
 *
 * Validation problems never abort compilation: each one is reported as a
 * warning string prefixed with the task id, and compilation continues.
 *
 * @param task - MCP server task definition to compile.
 * @param options - Optional compile settings; `options.models` is forwarded to
 *   the provider builder (an empty list is used when it is missing).
 * @returns `{ providers, tests, prompts, warnings }` for this task.
 */
export function compileMCPTask(task, options) {
    // Seed the warning list with one entry per validation error.
    const warnings = validateMCPTask(task).map((issue) => `MCP task "${task.id}": ${issue.field} — ${issue.message}`);
    // Providers first: the builder may append its own warnings to the list.
    const registryModels = options?.models ?? [];
    const providers = buildMCPProvider(task, registryModels, warnings);
    const prompts = buildMCPPrompts(task);
    // Test-case assembly may also append assertion warnings.
    const tests = buildMCPTestCases(task, options, warnings);
    return { providers, tests, prompts, warnings };
}
40
+ // ---------------------------------------------------------------------------
41
+ // Prompt assembly
42
+ // ---------------------------------------------------------------------------
/**
 * Build the single prompt entry used for an MCP evaluation.
 *
 * The prompt text is the first non-nullish value among `task.prompt.text`,
 * `task.prompt.vars.task` and `task.description`; when none is set a generic
 * "Test MCP server: <title>" sentence is used instead.
 *
 * @param task - MCP server task definition.
 * @returns A one-element array holding the `mcp-test` prompt.
 */
function buildMCPPrompts(task) {
    const candidates = [
        task.prompt?.text,
        task.prompt?.vars?.task,
        task.description,
    ];
    // `!= null` skips both null and undefined, mirroring a `??` chain.
    const chosen = candidates.find((candidate) => candidate != null) ??
        `Test MCP server: ${task.title}`;
    const entry = {
        id: "mcp-test",
        label: `MCP: ${task.title}`,
        raw: String(chosen),
    };
    return [entry];
}
57
+ // ---------------------------------------------------------------------------
58
+ // Test case assembly
59
+ // ---------------------------------------------------------------------------
/**
 * Assemble the Promptfoo test cases for one MCP task.
 *
 * Always emits a primary test case; emits a second "[multi-turn]" case when
 * `task.multiTurn.turns` is non-empty. Both cases carry the same compiled
 * assertions (the `assert` key is omitted when there are none).
 *
 * @param task - MCP server task definition.
 * @param options - Optional compile settings; only `graderProvider` is read here.
 * @param warnings - Mutable list that receives assertion-compilation warnings.
 * @returns The list of test cases, primary case first.
 */
function buildMCPTestCases(task, options, warnings) {
    const compiledAssertions = [];
    if (task.assertions) {
        // Context handed to the assertion builder alongside the raw definitions.
        const context = {
            capabilities: task.capabilities ?? [],
            graderProvider: options?.graderProvider,
            taskId: task.id,
        };
        const outcome = buildMCPAssertions(task.assertions, context);
        compiledAssertions.push(...outcome.assertions);
        warnings.push(...outcome.warnings);
    }
    // Only attach `assert` when at least one assertion was compiled.
    const assertPart = compiledAssertions.length > 0 ? { assert: compiledAssertions } : {};
    // Explicit prompt vars are spread last, so they win over the default `task` var.
    const promptVars = task.prompt?.vars;
    const vars = {
        task: promptVars?.task ?? task.description ?? `Test: ${task.title}`,
        ...(promptVars ?? {}),
    };
    const tests = [
        {
            description: `${task.id} — ${task.title}`,
            vars,
            ...assertPart,
        },
    ];
    const turnCount = task.multiTurn?.turns?.length ?? 0;
    if (turnCount > 0) {
        tests.push({
            description: `${task.id} — ${task.title} [multi-turn]`,
            vars: { ...vars, __multiTurn: task.multiTurn.turns },
            ...assertPart,
        });
    }
    return tests;
}
@@ -0,0 +1,27 @@
1
+ /**
2
+ * MCP Server mode handler — directory barrel.
3
+ *
4
+ * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
5
+ *
6
+ * This is the first non-literacy mode handler, proving the compiler
7
+ * architecture works end-to-end. It translates MCP server task definitions
8
+ * into Promptfoo configuration with:
9
+ *
10
+ * - An MCP provider that wraps the server under test
11
+ * - Tool-call assertions compiled to Promptfoo `javascript` assertions
12
+ * - Server lifecycle management via Promptfoo provider hooks
13
+ * - Multi-turn conversation support via Promptfoo's `steps` syntax
14
+ *
15
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
16
+ * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
17
+ * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
18
+ */
19
+ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
20
+ /** ModeHandler-conformant export for the mcp-server evaluation mode. */
21
+ export declare const handler: ModeHandler;
22
+ export type { MCPAssertionContext, MCPCompileOptions, MCPCompileResult, MCPValidationError, } from "./types.js";
23
+ export { buildMCPAssertions } from "./assertions.js";
24
+ export { compileMCPTask } from "./compiler.js";
25
+ export { validateMCPTask } from "./validation.js";
26
+ export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
27
+ export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
@@ -0,0 +1,54 @@
1
+ /**
2
+ * MCP Server mode handler — directory barrel.
3
+ *
4
+ * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
5
+ *
6
+ * This is the first non-literacy mode handler, proving the compiler
7
+ * architecture works end-to-end. It translates MCP server task definitions
8
+ * into Promptfoo configuration with:
9
+ *
10
+ * - An MCP provider that wraps the server under test
11
+ * - Tool-call assertions compiled to Promptfoo `javascript` assertions
12
+ * - Server lifecycle management via Promptfoo provider hooks
13
+ * - Multi-turn conversation support via Promptfoo's `steps` syntax
14
+ *
15
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
16
+ * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
17
+ * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
18
+ */
19
+ import { compileMCPTask } from "./compiler.js";
20
+ import { MCP_PROMPT_TEMPLATES } from "./prompts.js";
21
+ // ---------------------------------------------------------------------------
22
+ // ModeHandler adapter
23
+ // ---------------------------------------------------------------------------
/**
 * ModeHandler-conformant export for the mcp-server evaluation mode.
 *
 * - `getPrompts()` returns the MCP prompt template map.
 * - `compileTask(task, ctx)` rejects any task whose `mode` is not
 *   "mcp-server", then delegates to `compileMCPTask`, forwarding
 *   `ctx.graderProvider` and `ctx.models`.
 */
export const handler = {
    getPrompts() {
        return MCP_PROMPT_TEMPLATES;
    },
    compileTask(task, ctx) {
        const isMcpTask = "mode" in task && task.mode === "mcp-server";
        if (!isMcpTask) {
            throw new Error(`MCP server handler received task with mode "${task.mode ?? "undefined"}" — expected "mcp-server"`);
        }
        const { graderProvider, models } = ctx;
        const { providers, tests, prompts, warnings } = compileMCPTask(task, { graderProvider, models });
        // Return a fresh object exposing only the four result fields.
        return { providers, tests, prompts, warnings };
    },
};
45
+ // Assertions
46
+ export { buildMCPAssertions } from "./assertions.js";
47
+ // Compilation
48
+ export { compileMCPTask } from "./compiler.js";
49
+ // Validation
50
+ export { validateMCPTask } from "./validation.js";
51
+ // Prompts
52
+ export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
53
+ // Provider config
54
+ export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Canonical MCP server prompt templates.
3
+ *
4
+ * Handler-owned prompts for MCP server evaluations. Instructs the model to
5
+ * interact with MCP tools rather than writing standalone code.
6
+ */
7
+ import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
8
+ export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Canonical MCP server prompt templates.
3
+ *
4
+ * Handler-owned prompts for MCP server evaluations. Instructs the model to
5
+ * interact with MCP tools rather than writing standalone code.
6
+ */
// Prompt templates for the mcp-server mode, keyed by template id.
// The only entry, "mcp-server", tells the model to complete the task through
// the MCP tools it is given.
export const MCP_PROMPT_TEMPLATES = {
    "mcp-server": {
        id: "mcp-server",
        label: "MCP Server Tool Use",
        // The template text is runtime data: whitespace and wording are significant.
        // `{{task}}` is the single placeholder; how it is substituted is decided by
        // whatever consumes this template (not visible here).
        template: `You are an AI assistant with access to an MCP (Model Context Protocol) server that provides tools for interacting with a Sanity content backend.

## Task
{{task}}

## Instructions

1. Use the available MCP tools to complete the task
2. Call tools with the correct parameters as described in their schemas
3. Interpret tool responses and use the results to accomplish the goal
4. If a tool returns an error, explain the issue clearly
5. Prefer using specific tools over broad queries when possible

Complete the task using the MCP tools provided:
`,
        // Names of the placeholders that appear in `template`.
        variables: ["task"],
    },
};
@@ -0,0 +1,28 @@
1
+ /**
2
+ * MCP server provider assembly — builds Promptfoo provider configs.
3
+ */
4
+ import type { MCPServerTaskDefinition, ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { PromptfooProvider } from "../../promptfoo-compiler.js";
6
+ /** Default max tool rounds for MCP multi-turn execution */
7
+ export declare const DEFAULT_MAX_TOOL_ROUNDS = 5;
8
+ /** Provider path relative to eval package dist */
9
+ export declare const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
10
+ /**
11
+ * Build custom MCP tool provider configs — one per model.
12
+ *
13
+ * Each provider uses the custom mcp-tool-provider.ts which implements a
14
+ * multi-turn tool execution loop. The LLM receives a prompt, discovers
15
+ * MCP tools, calls them, gets results, and continues until it produces
16
+ * a final text answer or exhausts maxToolRounds.
17
+ *
18
+ * Config shape passed to the custom provider:
19
+ * { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
20
+ */
21
+ export declare function buildMCPProvider(task: MCPServerTaskDefinition, models: ModeProviderEntry[], warnings: string[]): PromptfooProvider[];
22
+ /**
23
+ * Build the MCP server connection config for the custom provider.
24
+ *
25
+ * Shape: { url?, command?, name?, auth? }
26
+ * The custom mcp-tool-provider.ts uses this to connect to the MCP server.
27
+ */
28
+ export declare function buildMCPServerConfig(task: MCPServerTaskDefinition, warnings: string[]): Record<string, unknown>;
@@ -0,0 +1,108 @@
1
+ /**
2
+ * MCP server provider assembly — builds Promptfoo provider configs.
3
+ */
4
+ // ---------------------------------------------------------------------------
5
+ // Constants
6
+ // ---------------------------------------------------------------------------
7
+ /** Default max tool rounds for MCP multi-turn execution */
8
+ export const DEFAULT_MAX_TOOL_ROUNDS = 5;
9
+ /** Provider path relative to eval package dist */
10
+ export const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
11
+ // ---------------------------------------------------------------------------
12
+ // Provider assembly
13
+ // ---------------------------------------------------------------------------
/**
 * Build the custom MCP tool provider entries — one per model.
 *
 * Every entry points at `MCP_PROVIDER_PATH` and carries a config of the form
 *   { model, mcpServer, mcpTools?, maxToolRounds, ...modelConfig }
 * where model-specific config is spread last and therefore overrides the
 * shared keys on collision.
 *
 * Model selection, in order of precedence:
 *   1. `task.models` (ids are reused as labels) when non-empty,
 *   2. the `models` argument when non-empty,
 *   3. otherwise a warning is recorded and a single built-in default model is used.
 *
 * @param task - MCP server task definition.
 * @param models - Registry models; each entry supplies `id`, `label`, `config`.
 * @param warnings - Mutable list that receives warning strings.
 * @returns One provider entry per selected model.
 */
export function buildMCPProvider(task, models, warnings) {
    const mcpServer = buildMCPServerConfig(task, warnings);
    const mcpTools = task.capabilities ?? undefined;
    const maxToolRounds = task.maxToolRounds ?? DEFAULT_MAX_TOOL_ROUNDS;
    // Keys shared by every provider; `mcpTools` is present only when truthy.
    const sharedConfig = mcpTools
        ? { mcpServer, mcpTools, maxToolRounds }
        : { mcpServer, maxToolRounds };
    const toProvider = (modelId, label, modelConfig) => ({
        id: MCP_PROVIDER_PATH,
        label: `${label} + MCP`,
        config: {
            model: modelId,
            ...sharedConfig,
            ...(modelConfig ?? {}),
        },
    });
    // A task-level model list overrides the registry entirely.
    const overrides = task.models;
    if (overrides && overrides.length > 0) {
        return overrides.map((modelId) => toProvider(modelId, modelId));
    }
    if (models.length > 0) {
        return models.map((model) => toProvider(model.id, model.label, model.config));
    }
    // Nothing configured anywhere: warn and fall back to the default model.
    warnings.push(`MCP task "${task.id}": no models available. Add "mcp-server" to a ` +
        "model's modes array in config/models.ts, or set models on the task.");
    return [
        toProvider("anthropic:messages:claude-sonnet-4-20250514", "Claude Sonnet 4"),
    ];
}
/**
 * Build the MCP server connection config for the custom provider.
 *
 * Resulting shape: { name, command? | url?, headers?, auth? }
 *   - `name` is always the task id.
 *   - `command` is copied for the "stdio" transport; every other transport
 *     gets `url` instead.
 *   - `headers` and `auth` are copied through when present on `serverConfig`.
 *   - When `auth` is absent but `env` is present, the first env key matching
 *     /token|auth|key/i is turned into bearer auth using an `{{env.NAME}}`
 *     template. A value of the form `$env(NAME)` is unwrapped to `NAME`;
 *     any other value is used as the variable name as-is.
 *
 * Nothing here throws for malformed env entries: a token-like entry whose
 * value is not a string, or whose resolved name is not a valid identifier,
 * produces a warning and no auth block.
 *
 * @param task - MCP server task definition (`id`, optional `serverConfig`).
 * @param warnings - Mutable list that receives warning strings.
 * @returns The connection config object described above.
 */
export function buildMCPServerConfig(task, warnings) {
    const config = task.serverConfig;
    if (!config) {
        warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder. ` +
            "Set serverConfig.command or serverConfig.url to point to your MCP server.");
        return { name: task.id };
    }
    const serverConfig = { name: task.id };
    if (config.transport === "stdio") {
        serverConfig.command = config.command;
    }
    else {
        serverConfig.url = config.url;
    }
    // Explicit headers for HTTP transports
    if (config.headers) {
        serverConfig.headers = config.headers;
    }
    // Explicit auth wins; env-derived bearer auth is only a fallback.
    if (config.auth) {
        serverConfig.auth = config.auth;
        return serverConfig;
    }
    if (!config.env) {
        return serverConfig;
    }
    const tokenKey = Object.keys(config.env).find((k) => /token|auth|key/i.test(k));
    if (!tokenKey) {
        return serverConfig;
    }
    const val = config.env[tokenKey];
    // Guard against non-string values (numbers, null, undefined, ...): calling
    // string methods on them would crash the whole compilation.
    if (typeof val !== "string") {
        warnings.push(`MCP task "${task.id}": env value for "${tokenKey}" is not a string ` +
            "— skipping auth config");
        return serverConfig;
    }
    // $env(VAR_NAME) → VAR_NAME; anything else is taken as the name itself.
    const envVar = val.startsWith("$env(") && val.endsWith(")") ? val.slice(5, -1) : val;
    if (!envVar || !/^[A-Za-z_][A-Za-z0-9_]*$/.test(envVar)) {
        // Include the task id so the warning can be traced back to its task.
        warnings.push(`MCP task "${task.id}": env var name "${envVar}" from "${val}" is not a valid ` +
            "identifier — skipping auth config");
        return serverConfig;
    }
    serverConfig.auth = {
        type: "bearer",
        token: `{{env.${envVar}}}`,
    };
    return serverConfig;
}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Shared types for the MCP server mode handler.
3
+ */
4
+ import type { ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
6
+ /** Options for compiling an MCP server task */
7
+ export interface MCPCompileOptions {
8
+ /** Grader provider for LLM-graded assertions */
9
+ graderProvider?: string;
10
+ /** Model providers to evaluate with (from registry, filtered by mcp-server mode) */
11
+ models?: ModeProviderEntry[];
12
+ }
13
+ /** Result of compiling a single MCP task */
14
+ export interface MCPCompileResult {
15
+ /** Promptfoo provider config for the MCP server */
16
+ providers: PromptfooProvider[];
17
+ /** Compiled test cases */
18
+ tests: PromptfooTestCase[];
19
+ /** Prompts for MCP evaluation */
20
+ prompts: PromptfooPrompt[];
21
+ /** Warnings generated during compilation */
22
+ warnings: string[];
23
+ }
24
+ /** Validation errors for MCP task definitions */
25
+ export interface MCPValidationError {
26
+ field: string;
27
+ message: string;
28
+ }
29
+ /** Context for building MCP assertions */
30
+ export interface MCPAssertionContext {
31
+ /** Task ID (for error messages) */
32
+ taskId: string;
33
+ /** Expected server capabilities */
34
+ capabilities: string[];
35
+ /** Grader provider for LLM-graded assertions */
36
+ graderProvider?: string;
37
+ }
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Shared types for the MCP server mode handler.
3
+ */
4
+ export {};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Validation for MCP server task definitions.
3
+ */
4
+ import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { MCPValidationError } from "./types.js";
6
+ /**
7
+ * Validate that an MCP task definition has all required fields.
8
+ */
9
+ export declare function validateMCPTask(task: MCPServerTaskDefinition): MCPValidationError[];
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Validation for MCP server task definitions.
3
+ */
/**
 * Check an MCP task definition for missing required fields.
 *
 * Checks performed, in reporting order:
 *   - `id` and `title` must be truthy;
 *   - with a `serverConfig`: "stdio" needs `command`, "sse" and
 *     "streamable-http" need `url` (other transports are not checked);
 *   - every `tool-called` assertion needs a truthy `value`.
 *
 * @param task - MCP server task definition to inspect.
 * @returns A list of `{ field, message }` problems; empty when none were found.
 */
export function validateMCPTask(task) {
    const errors = [];
    const report = (field, message) => {
        errors.push({ field, message });
    };
    if (!task.id) {
        report("id", "Task ID is required");
    }
    if (!task.title) {
        report("title", "Task title is required");
    }
    const server = task.serverConfig;
    if (server) {
        const usesUrl = server.transport === "sse" || server.transport === "streamable-http";
        if (server.transport === "stdio" && !server.command) {
            report("serverConfig.command", "Server command is required for stdio transport (e.g., 'node dist/server.js')");
        }
        if (usesUrl && !server.url) {
            report("serverConfig.url", `Server URL is required for ${server.transport} transport`);
        }
    }
    // Assertions should reference MCP-compatible types
    for (const assertion of task.assertions || []) {
        if (assertion.type !== "tool-called") {
            continue;
        }
        if ("value" in assertion && assertion.value) {
            continue;
        }
        report("assertions", 'tool-called assertion requires a "value" specifying the tool name');
    }
    return errors;
}
@@ -26,13 +26,15 @@
26
26
  * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
27
27
  * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
28
28
  */
29
- import type { MCPServerTaskDefinition, ModeHandler, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
29
+ import type { MCPServerTaskDefinition, ModeHandler, ModeProviderEntry, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
30
30
  import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
31
31
  export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
32
32
  /** Options for compiling an MCP server task */
33
33
  export interface MCPCompileOptions {
34
34
  /** Grader provider for LLM-graded assertions */
35
35
  graderProvider?: string;
36
+ /** Model providers to evaluate with (from registry, filtered by mcp-server mode) */
37
+ models?: ModeProviderEntry[];
36
38
  }
37
39
  /** Result of compiling a single MCP task */
38
40
  export interface MCPCompileResult {
@@ -114,8 +114,8 @@ export function compileMCPTask(task, options) {
114
114
  warnings.push(`MCP task "${task.id}": ${err.field} — ${err.message}`);
115
115
  }
116
116
  }
117
- // Build provider
118
- const providers = buildMCPProvider(task, warnings);
117
+ // Build providers (one LLM provider per model, each with MCP config)
118
+ const providers = buildMCPProvider(task, options?.models ?? [], warnings);
119
119
  // Build prompts
120
120
  const prompts = buildMCPPrompts(task);
121
121
  // Build test cases
@@ -125,103 +125,100 @@ export function compileMCPTask(task, options) {
125
125
  // ---------------------------------------------------------------------------
126
126
  // Provider assembly
127
127
  // ---------------------------------------------------------------------------
128
+ /** Default max tool rounds for MCP multi-turn execution */
129
+ const DEFAULT_MAX_TOOL_ROUNDS = 5;
130
+ /** Provider path relative to eval package dist */
131
+ const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
128
132
  /**
129
- * Build a Promptfoo-native MCP provider config.
133
+ * Build custom MCP tool provider configs — one per model.
130
134
  *
131
- * Promptfoo supports MCP servers natively via `id: "mcp"` with a
132
- * structured config. See: https://www.promptfoo.dev/docs/providers/mcp/
135
+ * Each provider uses the custom mcp-tool-provider.ts which implements a
136
+ * multi-turn tool execution loop. The LLM receives a prompt, discovers
137
+ * MCP tools, calls them, gets results, and continues until it produces
138
+ * a final text answer or exhausts maxToolRounds.
133
139
  *
134
- * Key config shape:
135
- * { enabled: true, server: { url?, command?, args?, name?, auth?, headers? },
136
- * tools?, exclude_tools?, timeout?, debug? }
140
+ * Config shape passed to the custom provider:
141
+ * { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
137
142
  */
138
- function buildMCPProvider(task, warnings) {
143
+ function buildMCPProvider(task, models, warnings) {
144
+ // Build the MCP server config
145
+ const mcpServer = buildMCPServerConfig(task, warnings);
146
+ const mcpTools = task.capabilities ?? undefined;
147
+ const maxToolRounds = task.maxToolRounds ?? DEFAULT_MAX_TOOL_ROUNDS;
148
+ // Helper to build a provider entry for a given model
149
+ function makeProvider(modelId, label, modelConfig) {
150
+ return {
151
+ id: MCP_PROVIDER_PATH,
152
+ label: `${label} + MCP`,
153
+ config: {
154
+ model: modelId,
155
+ mcpServer,
156
+ ...(mcpTools ? { mcpTools } : {}),
157
+ maxToolRounds,
158
+ ...(modelConfig ?? {}),
159
+ },
160
+ };
161
+ }
162
+ // Task-level model override takes precedence over registry models
163
+ const taskModels = task.models;
164
+ if (taskModels && taskModels.length > 0) {
165
+ return taskModels.map((modelId) => makeProvider(modelId, modelId));
166
+ }
167
+ // Use registry models (already filtered to mcp-server mode)
168
+ if (models.length === 0) {
169
+ warnings.push(`MCP task "${task.id}": no models available. Add "mcp-server" to a ` +
170
+ "model's modes array in config/models.ts, or set models on the task.");
171
+ return [
172
+ makeProvider("anthropic:messages:claude-sonnet-4-20250514", "Claude Sonnet 4"),
173
+ ];
174
+ }
175
+ return models.map((model) => makeProvider(model.id, model.label, model.config));
176
+ }
177
+ /**
178
+ * Build the MCP server connection config for the custom provider.
179
+ *
180
+ * Shape: { url?, command?, name?, auth? }
181
+ * The custom mcp-tool-provider.ts uses this to connect to the MCP server.
182
+ */
183
+ function buildMCPServerConfig(task, warnings) {
139
184
  const config = task.serverConfig;
140
185
  if (!config) {
141
- warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder provider. ` +
186
+ warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder. ` +
142
187
  "Set serverConfig.command or serverConfig.url to point to your MCP server.");
143
- return [
144
- {
145
- id: "mcp",
146
- label: `MCP Server: ${task.title}`,
147
- config: { enabled: true, server: { name: task.id } },
148
- },
149
- ];
188
+ return { name: task.id };
150
189
  }
151
- // Build the server sub-config (Promptfoo's native format)
152
- const server = { name: task.id };
190
+ const serverConfig = { name: task.id };
153
191
  if (config.transport === "stdio") {
154
- // Promptfoo expects command + args as separate fields
155
- const parts = config.command?.split(/\s+/) ?? [];
156
- server.command = parts[0] ?? "node";
157
- if (parts.length > 1) {
158
- server.args = parts.slice(1);
159
- }
192
+ serverConfig.command = config.command;
160
193
  }
161
194
  else {
162
- // sse or streamable-http — use URL-based connection
163
- server.url = config.url;
195
+ serverConfig.url = config.url;
164
196
  }
165
- // Auth config (Promptfoo supports bearer, basic, api_key, oauth)
197
+ // Auth config
166
198
  if (config.auth) {
167
- server.auth = config.auth;
199
+ serverConfig.auth = config.auth;
168
200
  }
169
201
  else if (config.env) {
170
- // Backward compat: if env has a token-like variable, convert to
171
- // bearer auth using Promptfoo's {{env.VAR}} template syntax
172
202
  const tokenKey = Object.keys(config.env).find((k) => /token|auth|key/i.test(k));
173
203
  if (tokenKey) {
174
204
  const val = config.env[tokenKey];
175
- // Convert $env(VAR) syntax to Promptfoo's {{env.VAR}} syntax
176
205
  let envVar = val;
177
206
  if (val.startsWith("$env(") && val.endsWith(")")) {
178
- envVar = val.slice(5, -1); // $env(VAR_NAME) → VAR_NAME
207
+ envVar = val.slice(5, -1);
179
208
  }
180
- // Validate extracted env var name is non-empty and valid
181
209
  if (!envVar || !/^[A-Za-z_][A-Za-z0-9_]*$/.test(envVar)) {
182
210
  warnings.push(`MCP task: env var name "${envVar}" from "${val}" is not a valid ` +
183
211
  "identifier — skipping auth config");
184
212
  }
185
213
  else {
186
- server.auth = {
214
+ serverConfig.auth = {
187
215
  type: "bearer",
188
216
  token: `{{env.${envVar}}}`,
189
217
  };
190
218
  }
191
219
  }
192
220
  }
193
- // Custom headers (if any non-auth env vars remain)
194
- if (config.env) {
195
- const headers = {};
196
- for (const [key, val] of Object.entries(config.env)) {
197
- if (/header[_.]?/i.test(key)) {
198
- headers[key.replace(/^header[_.]?/i, "")] = val;
199
- }
200
- }
201
- if (Object.keys(headers).length > 0) {
202
- server.headers = headers;
203
- }
204
- }
205
- // Build top-level provider config
206
- const providerConfig = {
207
- enabled: true,
208
- server,
209
- };
210
- // Tool filtering — map AILF capabilities to Promptfoo tools
211
- if (task.capabilities && task.capabilities.length > 0) {
212
- providerConfig.tools = task.capabilities;
213
- }
214
- // Timeout
215
- if (config.startupTimeoutMs) {
216
- providerConfig.timeout = config.startupTimeoutMs;
217
- }
218
- return [
219
- {
220
- id: "mcp",
221
- label: `MCP Server: ${task.title}`,
222
- config: providerConfig,
223
- },
224
- ];
221
+ return serverConfig;
225
222
  }
226
223
  // ---------------------------------------------------------------------------
227
224
  // Prompt assembly
@@ -298,6 +295,7 @@ export const handler = {
298
295
  }
299
296
  const result = compileMCPTask(task, {
300
297
  graderProvider: ctx.graderProvider,
298
+ models: ctx.models,
301
299
  });
302
300
  return {
303
301
  providers: result.providers,