@sanity/ailf 1.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (499) hide show
  1. package/README.md +0 -1
  2. package/canonical/grader-references/README.md +2 -2
  3. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  4. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  5. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  6. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  7. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  8. package/config/features.ts +1 -1
  9. package/config/models.ts +29 -12
  10. package/config/sources.ts +1 -1
  11. package/config/thresholds.ts +1 -1
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  13. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  17. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  18. package/dist/_vendor/ailf-core/config-helpers.d.ts +20 -17
  19. package/dist/_vendor/ailf-core/config-helpers.js +51 -2
  20. package/dist/_vendor/ailf-core/examples/index.d.ts +166 -80
  21. package/dist/_vendor/ailf-core/examples/index.js +213 -94
  22. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  23. package/dist/_vendor/ailf-core/index.js +2 -1
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  25. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  27. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  28. package/dist/_vendor/ailf-core/ports/context.d.ts +22 -1
  29. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  30. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  31. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  32. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  33. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  34. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  35. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  36. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +7 -1
  37. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +16 -2
  38. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  39. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  40. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  41. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  42. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  43. package/dist/_vendor/ailf-core/services/index.js +1 -1
  44. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +25 -1
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  47. package/dist/_vendor/ailf-core/types/index.d.ts +48 -7
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +105 -23
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  50. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  51. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  52. package/dist/adapters/api-client/remediation.js +2 -2
  53. package/dist/adapters/config-sources/file-config-adapter.js +7 -1
  54. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  55. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  56. package/dist/adapters/index.d.ts +0 -1
  57. package/dist/adapters/index.js +0 -1
  58. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  59. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  60. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  61. package/dist/adapters/task-sources/content-lake-task-source.js +21 -26
  62. package/dist/adapters/task-sources/index.d.ts +3 -4
  63. package/dist/adapters/task-sources/index.js +3 -4
  64. package/dist/adapters/task-sources/repo-schemas.d.ts +219 -17
  65. package/dist/adapters/task-sources/repo-schemas.js +228 -20
  66. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  67. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  68. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  69. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  70. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  71. package/dist/adapters/task-sources/repo-validation.js +126 -5
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +10 -7
  73. package/dist/adapters/task-sources/task-file-loader.js +21 -7
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/coverage-audit.js +3 -1
  95. package/dist/commands/explain-handler.d.ts +1 -1
  96. package/dist/commands/explain-handler.js +37 -8
  97. package/dist/commands/fetch-docs.js +1 -0
  98. package/dist/commands/generate-configs.d.ts +3 -3
  99. package/dist/commands/generate-configs.js +20 -8
  100. package/dist/commands/init.d.ts +5 -4
  101. package/dist/commands/init.js +190 -25
  102. package/dist/commands/pipeline-action.d.ts +7 -1
  103. package/dist/commands/pipeline-action.js +43 -19
  104. package/dist/commands/pipeline.d.ts +6 -1
  105. package/dist/commands/pipeline.js +7 -2
  106. package/dist/commands/pr-comment.js +1 -0
  107. package/dist/commands/publish.js +1 -0
  108. package/dist/commands/shared/help.js +2 -2
  109. package/dist/commands/update-quality-scores.d.ts +5 -0
  110. package/dist/commands/update-quality-scores.js +20 -0
  111. package/dist/commands/validate-tasks.d.ts +2 -2
  112. package/dist/commands/validate-tasks.js +26 -15
  113. package/dist/composition-root.d.ts +15 -4
  114. package/dist/composition-root.js +100 -55
  115. package/dist/config/features.ts +23 -0
  116. package/dist/config/models.ts +100 -0
  117. package/dist/config/prompts.ts +16 -0
  118. package/dist/config/rubrics.ts +225 -0
  119. package/dist/config/schedules.ts +47 -0
  120. package/dist/config/sinks.ts +37 -0
  121. package/dist/config/sources.ts +21 -0
  122. package/dist/config/thresholds.ts +61 -0
  123. package/dist/index.d.ts +41 -0
  124. package/dist/index.js +48 -0
  125. package/dist/lib/agent-behavior-report.d.ts +8 -0
  126. package/dist/lib/agent-behavior-report.js +185 -0
  127. package/dist/lib/baseline.d.ts +19 -0
  128. package/dist/lib/baseline.js +153 -0
  129. package/dist/lib/calculate-scores.d.ts +23 -0
  130. package/dist/lib/calculate-scores.js +42 -0
  131. package/dist/lib/compare.d.ts +18 -0
  132. package/dist/lib/compare.js +170 -0
  133. package/dist/lib/coverage-audit.d.ts +4 -0
  134. package/dist/lib/coverage-audit.js +42 -0
  135. package/dist/lib/discovery-report.d.ts +13 -0
  136. package/dist/lib/discovery-report.js +57 -0
  137. package/dist/lib/fetch-docs.d.ts +30 -0
  138. package/dist/lib/fetch-docs.js +171 -0
  139. package/dist/lib/generate-configs.d.ts +25 -0
  140. package/dist/lib/generate-configs.js +42 -0
  141. package/dist/lib/grader-api.d.ts +21 -0
  142. package/dist/lib/grader-api.js +34 -0
  143. package/dist/lib/grader-compare.d.ts +19 -0
  144. package/dist/lib/grader-compare.js +91 -0
  145. package/dist/lib/grader-consistency.d.ts +27 -0
  146. package/dist/lib/grader-consistency.js +79 -0
  147. package/dist/lib/grader-sensitivity.d.ts +19 -0
  148. package/dist/lib/grader-sensitivity.js +75 -0
  149. package/dist/lib/grader-validate.d.ts +19 -0
  150. package/dist/lib/grader-validate.js +78 -0
  151. package/dist/lib/measure-retrieval.d.ts +14 -0
  152. package/dist/lib/measure-retrieval.js +71 -0
  153. package/dist/lib/pr-comment.d.ts +16 -0
  154. package/dist/lib/pr-comment.js +28 -0
  155. package/dist/lib/readiness-report.d.ts +13 -0
  156. package/dist/lib/readiness-report.js +108 -0
  157. package/dist/lib/webhook-server.d.ts +11 -0
  158. package/dist/lib/webhook-server.js +24 -0
  159. package/dist/lib/weekly-digest.d.ts +24 -0
  160. package/dist/lib/weekly-digest.js +148 -0
  161. package/dist/orchestration/build-app-context.js +13 -0
  162. package/dist/orchestration/build-step-sequence.js +4 -2
  163. package/dist/orchestration/cache-context.d.ts +23 -0
  164. package/dist/orchestration/cache-context.js +43 -0
  165. package/dist/orchestration/env-bridge.d.ts +21 -0
  166. package/dist/orchestration/env-bridge.js +66 -0
  167. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  168. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  169. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  170. package/dist/orchestration/step-runner.js +5 -1
  171. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  172. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  173. package/dist/orchestration/steps/callback-step.js +10 -1
  174. package/dist/orchestration/steps/compare-step.js +6 -3
  175. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  176. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  177. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  178. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  179. package/dist/orchestration/steps/fetch-docs-step.js +32 -19
  180. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  181. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  182. package/dist/orchestration/steps/generate-configs-step.js +77 -26
  183. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  184. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  185. package/dist/orchestration/steps/publish-report-step.js +19 -0
  186. package/dist/orchestration/steps/readiness-step.js +8 -3
  187. package/dist/orchestration/steps/report-step.js +17 -4
  188. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  189. package/dist/orchestration/steps/run-eval-step.js +51 -31
  190. package/dist/pipeline/agent-behavior-report.js +6 -0
  191. package/dist/pipeline/attribution.d.ts +1 -1
  192. package/dist/pipeline/attribution.js +1 -1
  193. package/dist/pipeline/cache.js +29 -15
  194. package/dist/pipeline/calculate-scores.d.ts +2 -0
  195. package/dist/pipeline/calculate-scores.js +70 -33
  196. package/dist/pipeline/chronic-failures.d.ts +55 -0
  197. package/dist/pipeline/chronic-failures.js +110 -0
  198. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  199. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  200. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  201. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +132 -62
  202. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  203. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  204. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  205. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  206. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  207. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  208. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  209. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  210. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  211. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  212. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  213. package/dist/pipeline/compiler/config-loader.js +42 -2
  214. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  215. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  216. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  217. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  218. package/dist/pipeline/compiler/index.d.ts +2 -5
  219. package/dist/pipeline/compiler/index.js +2 -5
  220. package/dist/pipeline/compiler/literacy-bridge.d.ts +2 -2
  221. package/dist/pipeline/compiler/literacy-bridge.js +2 -2
  222. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  223. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  224. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  225. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  226. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  227. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  228. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +23 -0
  229. package/dist/pipeline/compiler/mode-bases/literacy.js +132 -0
  230. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  231. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  232. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  233. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  234. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  235. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  236. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  237. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  238. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  239. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  240. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  241. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  242. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  243. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  245. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  246. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  247. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  248. package/dist/pipeline/compiler/mode-handlers/index.d.ts +6 -7
  249. package/dist/pipeline/compiler/mode-handlers/index.js +6 -8
  250. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  251. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  252. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  253. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  254. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  255. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  256. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  257. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  258. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  259. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  260. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  261. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  262. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  263. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  264. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  265. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  266. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  267. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  268. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  269. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  270. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  271. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  272. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  273. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  274. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +63 -6
  275. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  276. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  277. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  278. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  279. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  280. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  281. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  282. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  283. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  284. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +108 -0
  285. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  286. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  287. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  288. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  289. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +3 -1
  290. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +65 -67
  291. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  292. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +191 -0
  293. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  294. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +101 -0
  295. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  296. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  297. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +19 -0
  298. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +323 -0
  299. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +103 -0
  300. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  301. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  302. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  303. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  304. package/dist/pipeline/compiler/preset-loader.js +99 -0
  305. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +7 -10
  306. package/dist/pipeline/compiler/presets/sanity-literacy.js +11 -157
  307. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  308. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  309. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  310. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  311. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  312. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  313. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  314. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  315. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  316. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  317. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  318. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  319. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  320. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  321. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  322. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  323. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  324. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  325. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  326. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  327. package/dist/pipeline/compiler/task-bridge.js +92 -0
  328. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  329. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  330. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  331. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  332. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  333. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  334. package/dist/pipeline/coverage-audit.d.ts +1 -1
  335. package/dist/pipeline/coverage-audit.js +1 -1
  336. package/dist/pipeline/degradations.d.ts +1 -1
  337. package/dist/pipeline/degradations.js +1 -1
  338. package/dist/pipeline/expand-tasks.d.ts +2 -2
  339. package/dist/pipeline/expand-tasks.js +2 -2
  340. package/dist/pipeline/failure-modes.d.ts +1 -1
  341. package/dist/pipeline/failure-modes.js +13 -1
  342. package/dist/pipeline/gap-analysis.d.ts +1 -1
  343. package/dist/pipeline/gap-analysis.js +3 -1
  344. package/dist/pipeline/generate-configs.d.ts +2 -2
  345. package/dist/pipeline/generate-configs.js +16 -9
  346. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  347. package/dist/pipeline/grader-compare-runner.js +7 -1
  348. package/dist/pipeline/grader-comparison.d.ts +1 -1
  349. package/dist/pipeline/grader-comparison.js +1 -1
  350. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  351. package/dist/pipeline/grader-consistency-runner.js +7 -1
  352. package/dist/pipeline/grader-consistency.d.ts +1 -1
  353. package/dist/pipeline/grader-consistency.js +1 -1
  354. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  355. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  356. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  357. package/dist/pipeline/grader-sensitivity.js +1 -1
  358. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  359. package/dist/pipeline/grader-validate-runner.js +2 -2
  360. package/dist/pipeline/grader-validation.d.ts +1 -1
  361. package/dist/pipeline/grader-validation.js +1 -1
  362. package/dist/pipeline/map-request-to-config.js +16 -2
  363. package/dist/pipeline/mirror-repo-tasks.d.ts +8 -8
  364. package/dist/pipeline/mirror-repo-tasks.js +10 -10
  365. package/dist/pipeline/plan-format.d.ts +1 -1
  366. package/dist/pipeline/plan-format.js +1 -1
  367. package/dist/pipeline/plan.d.ts +1 -1
  368. package/dist/pipeline/plan.js +68 -30
  369. package/dist/pipeline/probe.d.ts +1 -1
  370. package/dist/pipeline/probe.js +1 -1
  371. package/dist/pipeline/readiness-report.d.ts +2 -2
  372. package/dist/pipeline/readiness-report.js +2 -2
  373. package/dist/pipeline/release-classification.d.ts +1 -1
  374. package/dist/pipeline/release-classification.js +1 -1
  375. package/dist/pipeline/release-report.d.ts +1 -1
  376. package/dist/pipeline/release-report.js +1 -1
  377. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  378. package/dist/pipeline/repo-eval-comment.js +1 -1
  379. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  380. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  381. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  382. package/dist/pipeline/resolve-mappings.js +44 -44
  383. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  384. package/dist/pipeline/retrieval-metrics.js +28 -20
  385. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  386. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  387. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  388. package/dist/pipeline/steps/compare-step.js +90 -0
  389. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  390. package/dist/pipeline/steps/eval-step.js +347 -0
  391. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  392. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  393. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  394. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  395. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  396. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  397. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  398. package/dist/pipeline/steps/publish-report-step.js +243 -0
  399. package/dist/pipeline/steps/report-step.d.ts +13 -0
  400. package/dist/pipeline/steps/report-step.js +56 -0
  401. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  402. package/dist/pipeline/steps/update-scores-step.js +42 -0
  403. package/dist/pipeline/targeted-loo.d.ts +1 -1
  404. package/dist/pipeline/targeted-loo.js +1 -1
  405. package/dist/pipeline/thresholds.d.ts +1 -1
  406. package/dist/pipeline/thresholds.js +1 -1
  407. package/dist/pipeline/validate.js +13 -0
  408. package/dist/report-store.d.ts +17 -0
  409. package/dist/report-store.js +24 -0
  410. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  411. package/dist/scripts/agent-behavior-report.js +315 -0
  412. package/dist/scripts/baseline.d.ts +43 -0
  413. package/dist/scripts/baseline.js +267 -0
  414. package/dist/scripts/calculate-scores.d.ts +166 -0
  415. package/dist/scripts/calculate-scores.js +1296 -0
  416. package/dist/scripts/compare.d.ts +22 -0
  417. package/dist/scripts/compare.js +334 -0
  418. package/dist/scripts/coverage-audit.d.ts +44 -0
  419. package/dist/scripts/coverage-audit.js +209 -0
  420. package/dist/scripts/debug-eval.d.ts +19 -0
  421. package/dist/scripts/debug-eval.js +73 -0
  422. package/dist/scripts/discovery-report.d.ts +58 -0
  423. package/dist/scripts/discovery-report.js +250 -0
  424. package/dist/scripts/fetch-docs.d.ts +35 -0
  425. package/dist/scripts/fetch-docs.js +472 -0
  426. package/dist/scripts/generate-configs.d.ts +66 -0
  427. package/dist/scripts/generate-configs.js +459 -0
  428. package/dist/scripts/grader-api.d.ts +27 -0
  429. package/dist/scripts/grader-api.js +206 -0
  430. package/dist/scripts/grader-compare.d.ts +22 -0
  431. package/dist/scripts/grader-compare.js +368 -0
  432. package/dist/scripts/grader-consistency.d.ts +20 -0
  433. package/dist/scripts/grader-consistency.js +313 -0
  434. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  435. package/dist/scripts/grader-sensitivity.js +354 -0
  436. package/dist/scripts/grader-validate.d.ts +19 -0
  437. package/dist/scripts/grader-validate.js +267 -0
  438. package/dist/scripts/measure-retrieval.d.ts +10 -0
  439. package/dist/scripts/measure-retrieval.js +145 -0
  440. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  441. package/dist/scripts/migrate-task-mode.js +1 -1
  442. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  443. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  444. package/dist/scripts/pipeline.d.ts +76 -0
  445. package/dist/scripts/pipeline.js +1031 -0
  446. package/dist/scripts/pr-comment.d.ts +10 -0
  447. package/dist/scripts/pr-comment.js +510 -0
  448. package/dist/scripts/readiness-report.d.ts +88 -0
  449. package/dist/scripts/readiness-report.js +342 -0
  450. package/dist/scripts/update-quality-scores.d.ts +15 -0
  451. package/dist/scripts/update-quality-scores.js +184 -0
  452. package/dist/scripts/validate-task-sources.d.ts +1 -1
  453. package/dist/scripts/validate-task-sources.js +1 -1
  454. package/dist/scripts/validate.d.ts +13 -0
  455. package/dist/scripts/validate.js +79 -0
  456. package/dist/scripts/webhook-server.d.ts +26 -0
  457. package/dist/scripts/webhook-server.js +147 -0
  458. package/dist/scripts/weekly-digest.d.ts +24 -0
  459. package/dist/scripts/weekly-digest.js +144 -0
  460. package/dist/sinks/format-slack.d.ts +64 -0
  461. package/dist/sinks/format-slack.js +306 -0
  462. package/dist/sinks/slack-sink.d.ts +27 -0
  463. package/dist/sinks/slack-sink.js +78 -0
  464. package/dist/sinks/types.d.ts +1 -1
  465. package/dist/sinks/types.js +1 -1
  466. package/dist/sinks/webhook-sink.d.ts +19 -0
  467. package/dist/sinks/webhook-sink.js +50 -0
  468. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  469. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  470. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  471. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  472. package/dist/tasks/literacy/functions.task.ts +70 -0
  473. package/dist/tasks/literacy/groq.task.ts +259 -0
  474. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  475. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  476. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  477. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  478. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  479. package/package.json +32 -24
  480. package/tasks/.expanded.agentic.yaml +280 -0
  481. package/tasks/.expanded.yaml +565 -0
  482. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  483. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  484. package/tasks/literacy/content-lake.task.ts +181 -0
  485. package/tasks/literacy/frameworks.task.ts +1 -0
  486. package/tasks/literacy/functions.task.ts +1 -0
  487. package/tasks/literacy/groq.task.ts +1 -0
  488. package/tasks/literacy/image-handling.task.ts +95 -0
  489. package/tasks/literacy/nextjs-live.task.ts +2 -1
  490. package/tasks/literacy/portable-text.task.ts +169 -0
  491. package/tasks/literacy/studio-setup.task.ts +5 -2
  492. package/tasks/literacy/visual-editing.task.ts +1 -0
  493. package/LICENSE +0 -21
  494. package/tasks/frameworks.yaml +0 -98
  495. package/tasks/functions.yaml +0 -51
  496. package/tasks/groq.yaml +0 -216
  497. package/tasks/nextjs-live.yaml +0 -62
  498. package/tasks/studio-setup.yaml +0 -111
  499. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,74 @@
1
/**
 * Canonical prompt templates for literacy-mode evaluations.
 *
 * These are the source-of-truth templates. They used to live in
 * config/prompts.ts as global templates; they are handler-owned now so
 * that non-literacy modes can define their own prompts without collision.
 *
 * Each template uses `{{name}}` placeholders; the placeholders a template
 * uses are listed in the matching entry's `variables` array.
 */

/** Prompt that injects documentation ahead of the task. */
const WITH_DOCS_TEMPLATE = `You are an expert Sanity.io developer. Use the following documentation to help implement the task.

## Sanity Documentation
{{docs}}

## Task
{{task}}

## Requirements

1. Use ONLY the APIs and patterns shown in the documentation
2. Provide a complete, working implementation
3. Include all necessary imports
4. Follow Sanity best practices as documented

Provide your implementation:
`;

/** Baseline prompt: the task only, no documentation. */
const WITHOUT_DOCS_TEMPLATE = `You are an expert Sanity.io developer.

## Task
{{task}}

## Requirements

1. Provide a complete, working implementation
2. Include all necessary imports
3. Follow Sanity best practices

Provide your implementation:
`;

/** Agentic prompt: the model is told to look up the documentation itself. */
const AGENTIC_TEMPLATE = `You are an expert developer helping implement a Sanity.io feature.
You have access to web search and page fetching tools.

IMPORTANT: Before writing any code, search for and read the relevant
Sanity.io documentation to ensure you are using the latest APIs and
best practices. Do not rely on memory alone.

## Task
{{task}}

## Requirements

1. Search for relevant Sanity documentation before implementing
2. Use ONLY the APIs and patterns from the current official docs
3. Provide a complete, working implementation
4. Include all necessary imports
5. Follow Sanity best practices as documented

Provide your implementation:
`;

export const LITERACY_PROMPT_TEMPLATES = {
    "with-docs": {
        id: "with-docs",
        label: "With Documentation",
        template: WITH_DOCS_TEMPLATE,
        variables: ["docs", "task"],
    },
    "without-docs": {
        id: "without-docs",
        label: "Baseline (No Docs)",
        template: WITHOUT_DOCS_TEMPLATE,
        variables: ["task"],
    },
    agentic: {
        id: "agentic",
        label: "Agentic (self-retrieval)",
        template: AGENTIC_TEMPLATE,
        variables: ["task"],
    },
};
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Shared types for the literacy mode handler.
3
+ */
4
+ import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
/**
 * Options accepted when compiling a literacy task. Every field is optional.
 */
export interface LiteracyCompileOptions {
    /** Provider used for LLM-graded assertions. */
    graderProvider?: string;
    /** Root directory used to resolve file:// documentation paths. */
    rootDir?: string;
    /** Evaluation sub-mode; controls which entries are generated. */
    evalMode?: import("../../../normalize-mode.js").LiteracyEvalSubMode;
    /** Model providers to include in the compiled output. */
    models?: Array<{
        id: string;
        label: string;
        config?: Record<string, unknown>;
    }>;
    /** Rubric configuration (templates, weights, profiles), loaded from the rubrics config. */
    rubricConfig?: RubricConfig;
}
22
/**
 * The minimal slice of the rubric configuration that the literacy handler
 * reads: a map from template name to that template's definition.
 */
export interface RubricConfig {
    templates: Record<string, {
        /** Optional dimension associated with the template. */
        dimension?: string;
        /** Header text of the rubric. */
        header: string;
        /** Scale entries of the rubric. */
        scale: string[];
        /** Optional label used for the criteria section. */
        criteria_label?: string;
    }>;
}
31
/**
 * Output produced by compiling one literacy task.
 */
export interface LiteracyCompileResult {
    /** Promptfoo provider configurations. */
    providers: PromptfooProvider[];
    /** Compiled test cases (gold, plus an optional baseline). */
    tests: PromptfooTestCase[];
    /** Prompts used for the evaluation. */
    prompts: PromptfooPrompt[];
    /** Warnings collected while compiling. */
    warnings: string[];
}
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Shared types for the literacy mode handler.
3
+ */
4
+ export {};
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Validation for literacy task definitions.
3
+ */
4
+ import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
5
/**
 * One problem found while validating a literacy task definition.
 */
export interface LiteracyValidationError {
    /** Name of the task field the problem relates to (e.g. "id"). */
    field: string;
    /** Human-readable description of the problem. */
    message: string;
}
9
/**
 * Validate a literacy task definition.
 *
 * @param task - The task definition to check.
 * @returns The list of validation errors found; empty when the task is valid.
 */
export declare function validateLiteracyTask(task: LiteracyTaskDefinition): LiteracyValidationError[];
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Validation for literacy task definitions.
3
+ */
4
+ /**
5
+ * Validate a literacy task definition.
6
+ */
7
+ export function validateLiteracyTask(task) {
8
+ const errors = [];
9
+ if (!task.id)
10
+ errors.push({ field: "id", message: "Task ID is required" });
11
+ if (!task.title) {
12
+ errors.push({
13
+ field: "title",
14
+ message: "Task title is required",
15
+ });
16
+ }
17
+ const promptText = task.prompt?.text ??
18
+ task.prompt?.template ??
19
+ task.prompt?.vars?.task ??
20
+ "";
21
+ if (!promptText) {
22
+ errors.push({
23
+ field: "prompt",
24
+ message: "Task prompt text is required",
25
+ });
26
+ }
27
+ return errors;
28
+ }
@@ -83,18 +83,72 @@ function mapMCPAssertion(assertion, context, warnings) {
83
83
  // ---------------------------------------------------------------------------
84
84
  function buildToolCalledAssertion(assertion, _context) {
85
85
  const toolName = String(assertion.value ?? "");
86
+ // Strategy: check multiple sources for tool call evidence.
87
+ // 1. context.vars.__toolCalls (structured, if Promptfoo populates it)
88
+ // 2. Response metadata toolCallLog (from custom mcp-tool-provider)
89
+ // 3. Response output text (LLM+MCP providers embed tool_use JSON blocks)
86
90
  return {
87
91
  type: "javascript",
88
92
  value: buildJsAssertion(`tool-called: ${toolName}`, `
89
- const toolCalls = context.vars.__toolCalls || [];
90
- const called = toolCalls.some(tc => tc.name === ${JSON.stringify(toolName)});
93
+ var toolName = ${JSON.stringify(toolName)};
94
+
95
+ // Strategy 1: structured tool calls from Promptfoo
96
+ var toolCalls = context.vars.__toolCalls || [];
97
+ if (Array.isArray(toolCalls) && toolCalls.length > 0) {
98
+ var called = toolCalls.some(function(tc) { return tc.name === toolName; });
99
+ return {
100
+ pass: called,
101
+ score: called ? 1 : 0,
102
+ reason: called
103
+ ? 'Tool "' + toolName + '" was called (via __toolCalls)'
104
+ : 'Expected tool "' + toolName + '" but found: ' + toolCalls.map(function(tc) { return tc.name; }).join(', '),
105
+ };
106
+ }
107
+
108
+ // Strategy 2: MCP_TOOLS_CALLED summary appended by custom mcp-tool-provider
109
+ var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
110
+ var summaryMatch = outputStr.match(/<!-- MCP_TOOLS_CALLED: (\\[.*?\\]) -->/);
111
+ if (summaryMatch) {
112
+ try {
113
+ var calledTools = JSON.parse(summaryMatch[1]);
114
+ var called = calledTools.includes(toolName);
115
+ var count = calledTools.filter(function(n) { return n === toolName; }).length;
116
+ return {
117
+ pass: called,
118
+ score: called ? 1 : 0,
119
+ reason: called
120
+ ? 'Tool "' + toolName + '" was called ' + count + ' time(s)'
121
+ : 'Expected tool "' + toolName + '" but found: ' + calledTools.join(', '),
122
+ };
123
+ } catch (e) { /* fall through to Strategy 3 */ }
124
+ }
125
+
126
+ // Strategy 3: parse output for tool_use blocks (built-in provider fallback)
127
+ var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
128
+ var toolUsePattern = /"type"\\s*:\\s*"tool_use"[^}]*"name"\\s*:\\s*"([^"]+)"/g;
129
+ var foundTools = [];
130
+ var match;
131
+ while ((match = toolUsePattern.exec(outputStr)) !== null) {
132
+ foundTools.push(match[1]);
133
+ }
134
+ var fnCallPattern = /"function"\\s*:\\s*\\{[^}]*"name"\\s*:\\s*"([^"]+)"/g;
135
+ while ((match = fnCallPattern.exec(outputStr)) !== null) {
136
+ foundTools.push(match[1]);
137
+ }
138
+ if (foundTools.length === 0 && outputStr.includes(toolName) && outputStr.includes('tool_use')) {
139
+ foundTools.push(toolName);
140
+ }
141
+
142
+ var called = foundTools.includes(toolName);
91
143
  return {
92
144
  pass: called,
93
145
  score: called ? 1 : 0,
94
146
  reason: called
95
- ? 'Tool ' + ${JSON.stringify(JSON.stringify(toolName))} + ' was called as expected'
96
- : 'Expected tool ' + ${JSON.stringify(JSON.stringify(toolName))} + ' to be called, but it was not. ' +
97
- 'Tools called: ' + (toolCalls.map(tc => tc.name).join(', ') || 'none'),
147
+ ? 'Tool "' + toolName + '" was called (detected in output)'
148
+ : 'Expected tool "' + toolName + '" to be called. ' +
149
+ (foundTools.length > 0
150
+ ? 'Tools found in output: ' + foundTools.join(', ')
151
+ : 'No tool calls detected in output'),
98
152
  };`),
99
153
  ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
100
154
  };
@@ -273,5 +327,8 @@ function buildCapabilityAssertion(assertion, _context) {
273
327
  * from Promptfoo's assertion runner.
274
328
  */
275
329
  function buildJsAssertion(label, body) {
276
- return `// MCP assertion: ${label}\n(function() {\n${body.trim()}\n})()`;
330
+ // No IIFE wrapper — Promptfoo wraps the assertion in its own function via
331
+ // new Function('output', 'context', ...). The body must use `return` at
332
+ // the top level for the result to reach Promptfoo's validator.
333
+ return `// MCP assertion: ${label}\n${body.trim()}`;
277
334
  }
@@ -0,0 +1,42 @@
1
+ /**
2
+ * MCP-specific assertion types — ergonomic assertions for MCP server testing.
3
+ *
4
+ * Each assertion type compiles down to a Promptfoo `javascript` assertion
5
+ * with the appropriate validation logic. The developer writes:
6
+ *
7
+ * ```typescript
8
+ * assertions: [
9
+ * { type: "tool-called", value: "getDocument" },
10
+ * { type: "tool-input-matches", value: { documentId: "doc-123" } },
11
+ * { type: "tool-output-matches", value: { title: "Hello" } },
12
+ * { type: "error-returned", value: { code: -32602 } },
13
+ * ]
14
+ * ```
15
+ *
16
+ * The compiler transforms these into Promptfoo-compatible `javascript`
17
+ * assertions that inspect the tool call trace in the evaluation output.
18
+ *
19
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
20
+ */
21
+ import type { PromptfooAssertion } from "../../assertion-mapper.js";
22
+ import type { MCPAssertionContext } from "./types.js";
23
/**
 * Shape of an AILF assertion definition as accepted by the MCP assertion
 * builder. Covers both core and generalized assertion types.
 */
interface AssertionInput {
    /** Assertion type name (e.g. "tool-called", "contains"). */
    type: string;
    /** Assertion payload; its meaning depends on `type`. */
    value?: unknown;
    /** Optional weight copied onto the compiled assertion. */
    weight?: number;
    /** Generalized assertions may carry additional properties. */
    [key: string]: unknown;
}
31
/**
 * Build MCP-specific assertions from task assertion definitions.
 *
 * MCP-specific types (tool-called, tool-input-matches, etc.) are compiled;
 * standard assertion types (contains, llm-rubric, etc.) are passed through.
 *
 * @param assertions - Assertion definitions taken from the task.
 * @param context - MCP assertion context shared by all assertions.
 * @returns The compiled assertions together with any warnings raised.
 */
export declare function buildMCPAssertions(assertions: AssertionInput[], context: MCPAssertionContext): {
    assertions: PromptfooAssertion[];
    warnings: string[];
};
42
+ export {};
@@ -0,0 +1,334 @@
1
+ /**
2
+ * MCP-specific assertion types — ergonomic assertions for MCP server testing.
3
+ *
4
+ * Each assertion type compiles down to a Promptfoo `javascript` assertion
5
+ * with the appropriate validation logic. The developer writes:
6
+ *
7
+ * ```typescript
8
+ * assertions: [
9
+ * { type: "tool-called", value: "getDocument" },
10
+ * { type: "tool-input-matches", value: { documentId: "doc-123" } },
11
+ * { type: "tool-output-matches", value: { title: "Hello" } },
12
+ * { type: "error-returned", value: { code: -32602 } },
13
+ * ]
14
+ * ```
15
+ *
16
+ * The compiler transforms these into Promptfoo-compatible `javascript`
17
+ * assertions that inspect the tool call trace in the evaluation output.
18
+ *
19
+ * @see docs/archive/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
20
+ */
21
+ // ---------------------------------------------------------------------------
22
+ // Public API
23
+ // ---------------------------------------------------------------------------
24
+ /**
25
+ * Build MCP-specific assertions from task assertion definitions.
26
+ *
27
+ * Handles both MCP-specific types (tool-called, tool-input-matches, etc.)
28
+ * and standard assertion types (contains, llm-rubric, etc.) which are
29
+ * passed through unchanged.
30
+ */
31
+ export function buildMCPAssertions(assertions, context) {
32
+ const result = [];
33
+ const warnings = [];
34
+ for (const assertion of assertions) {
35
+ const mapped = mapMCPAssertion(assertion, context, warnings);
36
+ if (mapped) {
37
+ result.push(mapped);
38
+ }
39
+ }
40
+ return { assertions: result, warnings };
41
+ }
42
+ // ---------------------------------------------------------------------------
43
+ // Assertion mapping
44
+ // ---------------------------------------------------------------------------
45
+ function mapMCPAssertion(assertion, context, warnings) {
46
+ switch (assertion.type) {
47
+ case "tool-called":
48
+ return buildToolCalledAssertion(assertion, context);
49
+ case "tool-input-matches":
50
+ return buildToolInputMatchesAssertion(assertion, context);
51
+ case "tool-output-matches":
52
+ return buildToolOutputMatchesAssertion(assertion, context);
53
+ case "error-returned":
54
+ return buildErrorReturnedAssertion(assertion, context);
55
+ case "capability-available":
56
+ return buildCapabilityAssertion(assertion, context);
57
+ // Standard assertions — pass through
58
+ case "contains":
59
+ case "equals":
60
+ case "regex":
61
+ case "is-json":
62
+ case "llm-rubric":
63
+ case "javascript":
64
+ case "python":
65
+ return {
66
+ type: assertion.type,
67
+ ...("value" in assertion ? { value: assertion.value } : {}),
68
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
69
+ ...(assertion.type === "llm-rubric" && context.graderProvider
70
+ ? { provider: context.graderProvider }
71
+ : {}),
72
+ };
73
+ default:
74
+ warnings.push(`MCP task "${context.taskId}": unknown assertion type "${assertion.type}" — passed through`);
75
+ return {
76
+ type: assertion.type,
77
+ ...("value" in assertion ? { value: assertion.value } : {}),
78
+ };
79
+ }
80
+ }
81
+ // ---------------------------------------------------------------------------
82
+ // tool-called — asserts the model called a specific tool by name
83
+ // ---------------------------------------------------------------------------
84
+ function buildToolCalledAssertion(assertion, _context) {
85
+ const toolName = String(assertion.value ?? "");
86
+ // Strategy: check multiple sources for tool call evidence.
87
+ // 1. context.vars.__toolCalls (structured, if Promptfoo populates it)
88
+ // 2. Response metadata toolCallLog (from custom mcp-tool-provider)
89
+ // 3. Response output text (LLM+MCP providers embed tool_use JSON blocks)
90
+ return {
91
+ type: "javascript",
92
+ value: buildJsAssertion(`tool-called: ${toolName}`, `
93
+ var toolName = ${JSON.stringify(toolName)};
94
+
95
+ // Strategy 1: structured tool calls from Promptfoo
96
+ var toolCalls = context.vars.__toolCalls || [];
97
+ if (Array.isArray(toolCalls) && toolCalls.length > 0) {
98
+ var called = toolCalls.some(function(tc) { return tc.name === toolName; });
99
+ return {
100
+ pass: called,
101
+ score: called ? 1 : 0,
102
+ reason: called
103
+ ? 'Tool "' + toolName + '" was called (via __toolCalls)'
104
+ : 'Expected tool "' + toolName + '" but found: ' + toolCalls.map(function(tc) { return tc.name; }).join(', '),
105
+ };
106
+ }
107
+
108
+ // Strategy 2: MCP_TOOLS_CALLED summary appended by custom mcp-tool-provider
109
+ var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
110
+ var summaryMatch = outputStr.match(/<!-- MCP_TOOLS_CALLED: (\\[.*?\\]) -->/);
111
+ if (summaryMatch) {
112
+ try {
113
+ var calledTools = JSON.parse(summaryMatch[1]);
114
+ var called = calledTools.includes(toolName);
115
+ var count = calledTools.filter(function(n) { return n === toolName; }).length;
116
+ return {
117
+ pass: called,
118
+ score: called ? 1 : 0,
119
+ reason: called
120
+ ? 'Tool "' + toolName + '" was called ' + count + ' time(s)'
121
+ : 'Expected tool "' + toolName + '" but found: ' + calledTools.join(', '),
122
+ };
123
+ } catch (e) { /* fall through to Strategy 3 */ }
124
+ }
125
+
126
+ // Strategy 3: parse output for tool_use blocks (built-in provider fallback)
127
+ var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
128
+ var toolUsePattern = /"type"\\s*:\\s*"tool_use"[^}]*"name"\\s*:\\s*"([^"]+)"/g;
129
+ var foundTools = [];
130
+ var match;
131
+ while ((match = toolUsePattern.exec(outputStr)) !== null) {
132
+ foundTools.push(match[1]);
133
+ }
134
+ var fnCallPattern = /"function"\\s*:\\s*\\{[^}]*"name"\\s*:\\s*"([^"]+)"/g;
135
+ while ((match = fnCallPattern.exec(outputStr)) !== null) {
136
+ foundTools.push(match[1]);
137
+ }
138
+ if (foundTools.length === 0 && outputStr.includes(toolName) && outputStr.includes('tool_use')) {
139
+ foundTools.push(toolName);
140
+ }
141
+
142
+ var called = foundTools.includes(toolName);
143
+ return {
144
+ pass: called,
145
+ score: called ? 1 : 0,
146
+ reason: called
147
+ ? 'Tool "' + toolName + '" was called (detected in output)'
148
+ : 'Expected tool "' + toolName + '" to be called. ' +
149
+ (foundTools.length > 0
150
+ ? 'Tools found in output: ' + foundTools.join(', ')
151
+ : 'No tool calls detected in output'),
152
+ };`),
153
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
154
+ };
155
+ }
156
+ // ---------------------------------------------------------------------------
157
+ // tool-input-matches — asserts tool call inputs match a schema/value
158
+ // ---------------------------------------------------------------------------
159
+ function buildToolInputMatchesAssertion(assertion, _context) {
160
+ const expected = assertion.value;
161
+ const toolName = assertion.toolName ?? assertion.tool;
162
+ return {
163
+ type: "javascript",
164
+ value: buildJsAssertion(`tool-input-matches${toolName ? `: ${toolName}` : ""}`, `
165
+ const toolCalls = context.vars.__toolCalls || [];
166
+ const expected = ${JSON.stringify(expected)};
167
+ const toolFilter = ${JSON.stringify(toolName ?? null)};
168
+
169
+ const targetCalls = toolFilter
170
+ ? toolCalls.filter(tc => tc.name === toolFilter)
171
+ : toolCalls;
172
+
173
+ if (targetCalls.length === 0) {
174
+ return {
175
+ pass: false,
176
+ score: 0,
177
+ reason: toolFilter
178
+ ? 'No calls to tool "' + toolFilter + '" found'
179
+ : 'No tool calls found',
180
+ };
181
+ }
182
+
183
+ // Check if any call's input matches the expected value
184
+ const match = targetCalls.some(tc => {
185
+ const input = tc.input || tc.arguments || {};
186
+ return Object.entries(expected).every(([k, v]) =>
187
+ JSON.stringify(input[k]) === JSON.stringify(v)
188
+ );
189
+ });
190
+
191
+ return {
192
+ pass: match,
193
+ score: match ? 1 : 0,
194
+ reason: match
195
+ ? 'Tool input matches expected values'
196
+ : 'Tool input does not match. Expected: ' + JSON.stringify(expected) +
197
+ ', Got: ' + JSON.stringify(targetCalls.map(tc => tc.input || tc.arguments)),
198
+ };`),
199
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
200
+ };
201
+ }
202
+ // ---------------------------------------------------------------------------
203
+ // tool-output-matches — asserts tool outputs match expected shape/values
204
+ // ---------------------------------------------------------------------------
205
+ function buildToolOutputMatchesAssertion(assertion, _context) {
206
+ const expected = assertion.value;
207
+ const toolName = assertion.toolName ?? assertion.tool;
208
+ return {
209
+ type: "javascript",
210
+ value: buildJsAssertion(`tool-output-matches${toolName ? `: ${toolName}` : ""}`, `
211
+ const toolCalls = context.vars.__toolCalls || [];
212
+ const expected = ${JSON.stringify(expected)};
213
+ const toolFilter = ${JSON.stringify(toolName ?? null)};
214
+
215
+ const targetCalls = toolFilter
216
+ ? toolCalls.filter(tc => tc.name === toolFilter)
217
+ : toolCalls;
218
+
219
+ if (targetCalls.length === 0) {
220
+ return {
221
+ pass: false,
222
+ score: 0,
223
+ reason: toolFilter
224
+ ? 'No calls to tool "' + toolFilter + '" found'
225
+ : 'No tool calls found',
226
+ };
227
+ }
228
+
229
+ const match = targetCalls.some(tc => {
230
+ const output = tc.output || tc.result || {};
231
+ return Object.entries(expected).every(([k, v]) =>
232
+ JSON.stringify(output[k]) === JSON.stringify(v)
233
+ );
234
+ });
235
+
236
+ return {
237
+ pass: match,
238
+ score: match ? 1 : 0,
239
+ reason: match
240
+ ? 'Tool output matches expected values'
241
+ : 'Tool output does not match. Expected: ' + JSON.stringify(expected),
242
+ };`),
243
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
244
+ };
245
+ }
246
+ // ---------------------------------------------------------------------------
247
+ // error-returned — asserts the server returned a specific error
248
+ // ---------------------------------------------------------------------------
249
+ function buildErrorReturnedAssertion(assertion, _context) {
250
+ const expected = assertion.value;
251
+ return {
252
+ type: "javascript",
253
+ value: buildJsAssertion("error-returned", `
254
+ const toolCalls = context.vars.__toolCalls || [];
255
+ const expected = ${JSON.stringify(expected ?? {})};
256
+
257
+ const errorCall = toolCalls.find(tc => tc.error);
258
+ if (!errorCall) {
259
+ return {
260
+ pass: false,
261
+ score: 0,
262
+ reason: 'Expected an error response but no errors were returned',
263
+ };
264
+ }
265
+
266
+ const error = errorCall.error;
267
+ let pass = true;
268
+ const reasons = [];
269
+
270
+ if (expected.code !== undefined && error.code !== expected.code) {
271
+ pass = false;
272
+ reasons.push('Expected error code ' + expected.code + ', got ' + error.code);
273
+ }
274
+
275
+ if (expected.message !== undefined) {
276
+ const msgMatch = typeof error.message === 'string' &&
277
+ error.message.includes(expected.message);
278
+ if (!msgMatch) {
279
+ pass = false;
280
+ reasons.push('Expected error message containing "' + expected.message +
281
+ '", got "' + (error.message || '') + '"');
282
+ }
283
+ }
284
+
285
+ if (pass) {
286
+ reasons.push('Error matches expected pattern');
287
+ }
288
+
289
+ return {
290
+ pass,
291
+ score: pass ? 1 : 0,
292
+ reason: reasons.join('; '),
293
+ };`),
294
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
295
+ };
296
+ }
297
+ // ---------------------------------------------------------------------------
298
+ // capability-available — asserts the server advertises a capability
299
+ // ---------------------------------------------------------------------------
300
+ function buildCapabilityAssertion(assertion, _context) {
301
+ const capability = String(assertion.value ?? "");
302
+ return {
303
+ type: "javascript",
304
+ value: buildJsAssertion(`capability-available: ${capability}`, `
305
+ const capabilities = context.vars.__serverCapabilities || [];
306
+ const expected = ${JSON.stringify(capability)};
307
+ const available = capabilities.includes(expected);
308
+
309
+ return {
310
+ pass: available,
311
+ score: available ? 1 : 0,
312
+ reason: available
313
+ ? 'Server advertises capability "' + expected + '"'
314
+ : 'Server does not advertise capability "' + expected + '". ' +
315
+ 'Available: ' + (capabilities.join(', ') || 'none'),
316
+ };`),
317
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
318
+ };
319
+ }
320
+ // ---------------------------------------------------------------------------
321
+ // Helpers
322
+ // ---------------------------------------------------------------------------
323
/**
 * Build a Promptfoo-compatible JavaScript assertion string.
 *
 * The result is a one-line `// MCP assertion: <label>` comment followed by
 * the trimmed body. The body is NOT wrapped in a function or IIFE:
 * Promptfoo wraps the assertion in its own function via
 * `new Function('output', 'context', ...)`, so the body must use `return`
 * at the top level for the result to reach Promptfoo's validator.
 *
 * @param label - Short description placed in the leading comment.
 * @param body - JavaScript statements forming the assertion body.
 * @returns The assertion source text.
 */
function buildJsAssertion(label, body) {
    // The label ends up inside a single-line comment. Collapse any line
    // terminators so that text after a newline in the label (e.g. from a
    // tool name) cannot spill out of the comment into executable code.
    const safeLabel = String(label).replace(/[\r\n\u2028\u2029]+/g, " ");
    return `// MCP assertion: ${safeLabel}\n${body.trim()}`;
}
@@ -0,0 +1,19 @@
1
+ /**
2
+ * MCP server task compilation — core compiler logic.
3
+ *
4
+ * Produces Promptfoo configuration from MCP server task definitions:
5
+ * 1. A provider config pointing to the MCP server
6
+ * 2. Test cases with tool-call assertions
7
+ * 3. Appropriate prompts for the evaluation
8
+ */
9
+ import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
10
+ import type { MCPCompileOptions, MCPCompileResult } from "./types.js";
11
/**
 * Compile an MCP server task definition into Promptfoo configuration.
 *
 * This is the core of the MCP mode handler. The result contains:
 *   1. a provider config pointing to the MCP server,
 *   2. test cases with tool-call assertions,
 *   3. appropriate prompts for the evaluation.
 *
 * @param task - The MCP server task definition to compile.
 * @param options - Optional compile options.
 * @returns The compiled Promptfoo configuration pieces.
 */
export declare function compileMCPTask(task: MCPServerTaskDefinition, options?: MCPCompileOptions): MCPCompileResult;