@sanity/ailf 0.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. package/README.md +0 -1
  2. package/config/features.ts +23 -0
  3. package/config/models.ts +95 -0
  4. package/config/prompts.ts +16 -0
  5. package/config/rubrics.ts +225 -0
  6. package/config/schedules.ts +47 -0
  7. package/config/sinks.ts +37 -0
  8. package/config/sources.ts +21 -0
  9. package/config/thresholds.ts +61 -0
  10. package/dist/_vendor/ailf-core/config-helpers.d.ts +171 -0
  11. package/dist/_vendor/ailf-core/config-helpers.js +170 -0
  12. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  13. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  14. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  15. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  16. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  17. package/dist/_vendor/ailf-core/index.js +5 -0
  18. package/dist/_vendor/ailf-core/ports/context.d.ts +17 -2
  19. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  20. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  21. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  22. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  23. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  24. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  25. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +8 -2
  26. package/dist/_vendor/ailf-core/schemas/eval-config.js +17 -2
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +9 -3
  28. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +8 -1
  29. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -31
  30. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -9
  31. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  32. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  33. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  34. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  35. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  36. package/dist/_vendor/ailf-core/services/index.js +2 -1
  37. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  38. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  39. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  40. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  41. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  42. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  44. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +332 -0
  46. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  47. package/dist/_vendor/ailf-core/types/index.d.ts +45 -83
  48. package/dist/_vendor/ailf-core/types/index.js +8 -1
  49. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +257 -0
  50. package/dist/_vendor/ailf-core/types/plugin-registry.js +185 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  52. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  54. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  55. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  56. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  58. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  59. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  60. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  61. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  62. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  63. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  64. package/dist/_vendor/ailf-shared/index.js +0 -1
  65. package/dist/adapters/api-client/build-request.js +14 -13
  66. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  67. package/dist/adapters/config-sources/file-config-adapter.js +39 -12
  68. package/dist/adapters/config-sources/index.d.ts +2 -0
  69. package/dist/adapters/config-sources/index.js +1 -0
  70. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  71. package/dist/adapters/config-sources/ts-config-loader.js +141 -0
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  73. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  74. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  75. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  76. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  77. package/dist/adapters/task-sources/content-lake-task-source.js +35 -39
  78. package/dist/adapters/task-sources/index.d.ts +3 -2
  79. package/dist/adapters/task-sources/index.js +3 -2
  80. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  81. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  82. package/dist/adapters/task-sources/repo-task-source.d.ts +16 -12
  83. package/dist/adapters/task-sources/repo-task-source.js +92 -80
  84. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  85. package/dist/adapters/task-sources/repo-validation.js +126 -5
  86. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  87. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  88. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  89. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  90. package/dist/cli.js +0 -2
  91. package/dist/commands/baseline.js +4 -1
  92. package/dist/commands/calculate-scores.js +1 -1
  93. package/dist/commands/coverage-audit.js +9 -1
  94. package/dist/commands/explain-handler.js +25 -23
  95. package/dist/commands/fetch-docs.js +3 -2
  96. package/dist/commands/generate-configs.js +1 -1
  97. package/dist/commands/init.d.ts +6 -4
  98. package/dist/commands/init.js +302 -23
  99. package/dist/commands/interactive.js +11 -7
  100. package/dist/commands/pipeline-action.d.ts +2 -0
  101. package/dist/commands/pipeline-action.js +16 -6
  102. package/dist/commands/pipeline.d.ts +1 -0
  103. package/dist/commands/pipeline.js +4 -2
  104. package/dist/commands/pr-comment.js +1 -1
  105. package/dist/commands/publish.js +2 -2
  106. package/dist/commands/readiness-report.js +13 -6
  107. package/dist/commands/validate-tasks.d.ts +2 -2
  108. package/dist/commands/validate-tasks.js +26 -15
  109. package/dist/composition-root.d.ts +13 -1
  110. package/dist/composition-root.js +99 -4
  111. package/dist/index.d.ts +41 -0
  112. package/dist/index.js +48 -0
  113. package/dist/orchestration/build-app-context.js +1 -0
  114. package/dist/orchestration/build-step-sequence.js +28 -8
  115. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  116. package/dist/orchestration/steps/fetch-docs-step.js +8 -7
  117. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  118. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  119. package/dist/orchestration/steps/generate-configs-step.js +261 -51
  120. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  121. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  122. package/dist/orchestration/steps/readiness-step.js +5 -6
  123. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  124. package/dist/orchestration/steps/run-eval-step.js +8 -7
  125. package/dist/pipeline/cache.d.ts +1 -1
  126. package/dist/pipeline/cache.js +36 -8
  127. package/dist/pipeline/calculate-scores.d.ts +2 -4
  128. package/dist/pipeline/calculate-scores.js +43 -113
  129. package/dist/pipeline/checks.js +2 -2
  130. package/dist/pipeline/compare.js +8 -8
  131. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  132. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  133. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  134. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  135. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  136. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  137. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  138. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  139. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  140. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +392 -0
  141. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  142. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  143. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  144. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  145. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  146. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +404 -0
  147. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  148. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  149. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  150. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  151. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  152. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  153. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  154. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  155. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  156. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  157. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  158. package/dist/pipeline/compiler/config-loader.js +111 -0
  159. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  160. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  161. package/dist/pipeline/compiler/hash.d.ts +11 -0
  162. package/dist/pipeline/compiler/hash.js +18 -0
  163. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  164. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  165. package/dist/pipeline/compiler/index.d.ts +29 -0
  166. package/dist/pipeline/compiler/index.js +45 -0
  167. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  168. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  169. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  170. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  171. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  172. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  173. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  174. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  175. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  176. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  177. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  178. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  179. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  180. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  181. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  182. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  183. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  184. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  185. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  186. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  187. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  188. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  189. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  190. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  191. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  192. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  193. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  194. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  195. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  196. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  197. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +49 -0
  198. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  199. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  200. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  201. package/dist/pipeline/compiler/mode-handlers/index.d.ts +15 -0
  202. package/dist/pipeline/compiler/mode-handlers/index.js +19 -0
  203. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  204. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  205. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  206. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  207. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  208. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  209. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  210. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  211. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  212. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  213. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  214. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  215. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  216. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  217. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  218. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  219. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  220. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  221. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  222. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  223. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  224. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  225. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  226. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  227. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +42 -0
  228. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +334 -0
  229. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  230. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  231. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  232. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  233. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  250. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  251. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  252. package/dist/pipeline/compiler/preset-loader.js +99 -0
  253. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  254. package/dist/pipeline/compiler/presets/index.js +8 -0
  255. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +42 -0
  256. package/dist/pipeline/compiler/presets/sanity-literacy.js +208 -0
  257. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  258. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  259. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  260. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  261. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  262. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  263. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  264. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  265. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  266. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  267. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  268. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  269. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  270. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  271. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  272. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  273. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  274. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  275. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  276. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  277. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  278. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  279. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  280. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  281. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  282. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  283. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  284. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  285. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  286. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  287. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  288. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  289. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  290. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  291. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  292. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  293. package/dist/pipeline/coverage-audit.d.ts +15 -5
  294. package/dist/pipeline/coverage-audit.js +41 -22
  295. package/dist/pipeline/eval-constants.d.ts +16 -6
  296. package/dist/pipeline/eval-constants.js +25 -4
  297. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  298. package/dist/pipeline/eval-fingerprint.js +8 -9
  299. package/dist/pipeline/expand-tasks.d.ts +19 -10
  300. package/dist/pipeline/expand-tasks.js +34 -28
  301. package/dist/pipeline/gap-analysis.d.ts +1 -1
  302. package/dist/pipeline/gap-analysis.js +2 -2
  303. package/dist/pipeline/generate-configs.d.ts +22 -4
  304. package/dist/pipeline/generate-configs.js +53 -24
  305. package/dist/pipeline/grader-api.d.ts +3 -3
  306. package/dist/pipeline/grader-api.js +5 -12
  307. package/dist/pipeline/grader-compare-runner.js +20 -27
  308. package/dist/pipeline/grader-comparison.d.ts +4 -8
  309. package/dist/pipeline/grader-comparison.js +11 -17
  310. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  311. package/dist/pipeline/grader-consistency-runner.js +16 -20
  312. package/dist/pipeline/grader-consistency.d.ts +6 -10
  313. package/dist/pipeline/grader-consistency.js +13 -32
  314. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  315. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  316. package/dist/pipeline/grader-sensitivity.js +10 -10
  317. package/dist/pipeline/grader-validate-runner.js +7 -5
  318. package/dist/pipeline/grader-validation.d.ts +2 -6
  319. package/dist/pipeline/grader-validation.js +14 -22
  320. package/dist/pipeline/map-request-to-config.js +7 -1
  321. package/dist/pipeline/mirror-repo-tasks.d.ts +13 -13
  322. package/dist/pipeline/mirror-repo-tasks.js +22 -21
  323. package/dist/pipeline/normalize-mode.d.ts +49 -0
  324. package/dist/pipeline/normalize-mode.js +64 -0
  325. package/dist/pipeline/plan.d.ts +5 -2
  326. package/dist/pipeline/plan.js +134 -78
  327. package/dist/pipeline/pr-comment.js +2 -0
  328. package/dist/pipeline/profile-resolution.d.ts +22 -14
  329. package/dist/pipeline/profile-resolution.js +41 -19
  330. package/dist/pipeline/provenance.d.ts +2 -2
  331. package/dist/pipeline/provenance.js +12 -17
  332. package/dist/pipeline/release-report.js +4 -4
  333. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  334. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  335. package/dist/pipeline/rubric-loader.d.ts +20 -0
  336. package/dist/pipeline/rubric-loader.js +37 -0
  337. package/dist/pipeline/validate.d.ts +4 -4
  338. package/dist/pipeline/validate.js +64 -53
  339. package/dist/schedules/loader.js +18 -8
  340. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  341. package/dist/scripts/migrate-task-mode.js +85 -0
  342. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  343. package/dist/scripts/validate-task-sources.d.ts +1 -1
  344. package/dist/scripts/validate-task-sources.js +15 -15
  345. package/dist/sinks/loader.js +5 -7
  346. package/dist/sources.d.ts +7 -7
  347. package/dist/sources.js +22 -24
  348. package/dist/webhook/dispatch.js +2 -1
  349. package/package.json +15 -4
  350. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  351. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  352. package/tasks/literacy/frameworks.task.ts +128 -0
  353. package/tasks/literacy/functions.task.ts +69 -0
  354. package/tasks/literacy/groq.task.ts +258 -0
  355. package/tasks/literacy/nextjs-live.task.ts +75 -0
  356. package/tasks/literacy/studio-setup.task.ts +131 -0
  357. package/tasks/literacy/visual-editing.task.ts +146 -0
  358. package/config/features.yaml +0 -116
  359. package/config/models.yaml +0 -116
  360. package/config/prompts.yaml +0 -75
  361. package/config/rubrics.yaml +0 -81
  362. package/config/schedules.yaml +0 -43
  363. package/config/sinks.yaml +0 -54
  364. package/config/sources.yaml +0 -51
  365. package/config/thresholds.yaml +0 -49
  366. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  367. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  368. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  369. package/dist/_vendor/ailf-tasks/index.js +0 -16
  370. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  371. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  372. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  373. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  374. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  375. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  376. package/dist/agent-observer/test-imports.d.ts +0 -7
  377. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,334 @@
1
+ /**
2
+ * MCP-specific assertion types — ergonomic assertions for MCP server testing.
3
+ *
4
+ * Each assertion type compiles down to a Promptfoo `javascript` assertion
5
+ * with the appropriate validation logic. The developer writes:
6
+ *
7
+ * ```typescript
8
+ * assertions: [
9
+ * { type: "tool-called", value: "getDocument" },
10
+ * { type: "tool-input-matches", value: { documentId: "doc-123" } },
11
+ * { type: "tool-output-matches", value: { title: "Hello" } },
12
+ * { type: "error-returned", value: { code: -32602 } },
13
+ * ]
14
+ * ```
15
+ *
16
+ * The compiler transforms these into Promptfoo-compatible `javascript`
17
+ * assertions that inspect the tool call trace in the evaluation output.
18
+ *
19
+ * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
20
+ */
21
+ // ---------------------------------------------------------------------------
22
+ // Public API
23
+ // ---------------------------------------------------------------------------
/**
 * Compile a task's assertion definitions into Promptfoo assertions.
 *
 * Every definition is handed to `mapMCPAssertion`, which expands the
 * MCP-specific types and forwards the standard ones. Falsy results from the
 * mapper are discarded, so the output may be shorter than the input.
 *
 * @param assertions - Iterable of assertion definitions from the task.
 * @param context - Compilation context forwarded to the mapper (the mapper
 *   reads `taskId` and `graderProvider` from it).
 * @returns `{ assertions, warnings }`: the compiled assertions in input
 *   order, plus any warning messages collected while mapping.
 */
export function buildMCPAssertions(assertions, context) {
    const warnings = [];
    const compiled = [...assertions]
        .map((definition) => mapMCPAssertion(definition, context, warnings))
        .filter(Boolean);
    return { assertions: compiled, warnings };
}
42
+ // ---------------------------------------------------------------------------
43
+ // Assertion mapping
44
+ // ---------------------------------------------------------------------------
/**
 * Map one task assertion definition to its Promptfoo representation.
 *
 * - MCP-specific types are delegated to their dedicated builders.
 * - Standard Promptfoo types are copied through with `value` and `weight`
 *   when present; `llm-rubric` additionally receives the grader provider
 *   from `context.graderProvider` when one is configured.
 * - Unknown types are passed through with `value` and `weight` preserved,
 *   and a warning is appended to `warnings`.
 *
 * @param assertion - The assertion definition (`type`, optional `value`,
 *   optional `weight`).
 * @param context - Compilation context; `taskId` is used in warnings and
 *   `graderProvider` for `llm-rubric`.
 * @param warnings - Mutable array that receives warning messages.
 * @returns The compiled assertion object.
 */
function mapMCPAssertion(assertion, context, warnings) {
    // Optional fields shared by every pass-through shape. `weight` is kept for
    // unknown types too, so "passed through" does not silently alter scoring.
    const optionalFields = {
        ...("value" in assertion ? { value: assertion.value } : {}),
        ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
    };
    switch (assertion.type) {
        case "tool-called":
            return buildToolCalledAssertion(assertion, context);
        case "tool-input-matches":
            return buildToolInputMatchesAssertion(assertion, context);
        case "tool-output-matches":
            return buildToolOutputMatchesAssertion(assertion, context);
        case "error-returned":
            return buildErrorReturnedAssertion(assertion, context);
        case "capability-available":
            return buildCapabilityAssertion(assertion, context);
        // Standard assertions — pass through
        case "contains":
        case "equals":
        case "regex":
        case "is-json":
        case "llm-rubric":
        case "javascript":
        case "python":
            return {
                type: assertion.type,
                ...optionalFields,
                ...(assertion.type === "llm-rubric" && context.graderProvider
                    ? { provider: context.graderProvider }
                    : {}),
            };
        default:
            warnings.push(`MCP task "${context.taskId}": unknown assertion type "${assertion.type}" — passed through`);
            return {
                type: assertion.type,
                ...optionalFields,
            };
    }
}
81
+ // ---------------------------------------------------------------------------
82
+ // tool-called — asserts the model called a specific tool by name
83
+ // ---------------------------------------------------------------------------
/**
 * Compile a `tool-called` assertion into a Promptfoo `javascript` assertion
 * that checks whether the model called the tool named by `assertion.value`.
 *
 * The generated script looks for evidence of the call in three places, in
 * order, and returns at the first source that yields data:
 *   1. `context.vars.__toolCalls` (structured tool-call records),
 *   2. an `<!-- MCP_TOOLS_CALLED: [...] -->` summary embedded in the output,
 *   3. `tool_use` / `function` JSON fragments found in the output text.
 *
 * An empty or missing tool name always yields a failing result; otherwise
 * the substring heuristic in step 3 would match every output containing
 * `tool_use`.
 *
 * @param assertion - Definition with `value` (tool name) and optional `weight`.
 * @param _context - Compilation context (unused).
 * @returns A `javascript` assertion object, carrying `weight` when provided.
 */
function buildToolCalledAssertion(assertion, _context) {
    const toolName = String(assertion.value ?? "");
    return {
        type: "javascript",
        value: buildJsAssertion(`tool-called: ${toolName}`, `
      var toolName = ${JSON.stringify(toolName)};
      var called;

      // Guard: without a tool name nothing can be asserted.
      if (!toolName) {
        return {
          pass: false,
          score: 0,
          reason: 'tool-called assertion is missing a tool name',
        };
      }

      // Strategy 1: structured tool calls from Promptfoo
      var rawCalls = context && context.vars ? context.vars.__toolCalls : null;
      var toolCalls = Array.isArray(rawCalls)
        ? rawCalls.filter(function(tc) { return tc; })
        : [];
      if (toolCalls.length > 0) {
        called = toolCalls.some(function(tc) { return tc.name === toolName; });
        return {
          pass: called,
          score: called ? 1 : 0,
          reason: called
            ? 'Tool "' + toolName + '" was called (via __toolCalls)'
            : 'Expected tool "' + toolName + '" but found: ' + toolCalls.map(function(tc) { return tc.name; }).join(', '),
        };
      }

      // Strategy 2: MCP_TOOLS_CALLED summary appended by custom mcp-tool-provider
      var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
      var summaryMatch = outputStr.match(/<!-- MCP_TOOLS_CALLED: (\\[.*?\\]) -->/);
      if (summaryMatch) {
        try {
          var calledTools = JSON.parse(summaryMatch[1]);
          if (Array.isArray(calledTools)) {
            called = calledTools.includes(toolName);
            var count = calledTools.filter(function(n) { return n === toolName; }).length;
            return {
              pass: called,
              score: called ? 1 : 0,
              reason: called
                ? 'Tool "' + toolName + '" was called ' + count + ' time(s)'
                : 'Expected tool "' + toolName + '" but found: ' + calledTools.join(', '),
            };
          }
        } catch (e) { /* fall through to Strategy 3 */ }
      }

      // Strategy 3: parse output for tool_use blocks (built-in provider fallback)
      var toolUsePattern = /"type"\\s*:\\s*"tool_use"[^}]*"name"\\s*:\\s*"([^"]+)"/g;
      var foundTools = [];
      var match;
      while ((match = toolUsePattern.exec(outputStr)) !== null) {
        foundTools.push(match[1]);
      }
      var fnCallPattern = /"function"\\s*:\\s*\\{[^}]*"name"\\s*:\\s*"([^"]+)"/g;
      while ((match = fnCallPattern.exec(outputStr)) !== null) {
        foundTools.push(match[1]);
      }
      if (foundTools.length === 0 && outputStr.includes(toolName) && outputStr.includes('tool_use')) {
        foundTools.push(toolName);
      }

      called = foundTools.includes(toolName);
      return {
        pass: called,
        score: called ? 1 : 0,
        reason: called
          ? 'Tool "' + toolName + '" was called (detected in output)'
          : 'Expected tool "' + toolName + '" to be called. ' +
            (foundTools.length > 0
              ? 'Tools found in output: ' + foundTools.join(', ')
              : 'No tool calls detected in output'),
      };`),
        ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
    };
}
156
+ // ---------------------------------------------------------------------------
157
+ // tool-input-matches — asserts tool call inputs match a schema/value
158
+ // ---------------------------------------------------------------------------
159
+ function buildToolInputMatchesAssertion(assertion, _context) {
160
+ const expected = assertion.value;
161
+ const toolName = assertion.toolName ?? assertion.tool;
162
+ return {
163
+ type: "javascript",
164
+ value: buildJsAssertion(`tool-input-matches${toolName ? `: ${toolName}` : ""}`, `
165
+ const toolCalls = context.vars.__toolCalls || [];
166
+ const expected = ${JSON.stringify(expected)};
167
+ const toolFilter = ${JSON.stringify(toolName ?? null)};
168
+
169
+ const targetCalls = toolFilter
170
+ ? toolCalls.filter(tc => tc.name === toolFilter)
171
+ : toolCalls;
172
+
173
+ if (targetCalls.length === 0) {
174
+ return {
175
+ pass: false,
176
+ score: 0,
177
+ reason: toolFilter
178
+ ? 'No calls to tool "' + toolFilter + '" found'
179
+ : 'No tool calls found',
180
+ };
181
+ }
182
+
183
+ // Check if any call's input matches the expected value
184
+ const match = targetCalls.some(tc => {
185
+ const input = tc.input || tc.arguments || {};
186
+ return Object.entries(expected).every(([k, v]) =>
187
+ JSON.stringify(input[k]) === JSON.stringify(v)
188
+ );
189
+ });
190
+
191
+ return {
192
+ pass: match,
193
+ score: match ? 1 : 0,
194
+ reason: match
195
+ ? 'Tool input matches expected values'
196
+ : 'Tool input does not match. Expected: ' + JSON.stringify(expected) +
197
+ ', Got: ' + JSON.stringify(targetCalls.map(tc => tc.input || tc.arguments)),
198
+ };`),
199
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
200
+ };
201
+ }
202
+ // ---------------------------------------------------------------------------
203
+ // tool-output-matches — asserts tool outputs match expected shape/values
204
+ // ---------------------------------------------------------------------------
205
+ function buildToolOutputMatchesAssertion(assertion, _context) {
206
+ const expected = assertion.value;
207
+ const toolName = assertion.toolName ?? assertion.tool;
208
+ return {
209
+ type: "javascript",
210
+ value: buildJsAssertion(`tool-output-matches${toolName ? `: ${toolName}` : ""}`, `
211
+ const toolCalls = context.vars.__toolCalls || [];
212
+ const expected = ${JSON.stringify(expected)};
213
+ const toolFilter = ${JSON.stringify(toolName ?? null)};
214
+
215
+ const targetCalls = toolFilter
216
+ ? toolCalls.filter(tc => tc.name === toolFilter)
217
+ : toolCalls;
218
+
219
+ if (targetCalls.length === 0) {
220
+ return {
221
+ pass: false,
222
+ score: 0,
223
+ reason: toolFilter
224
+ ? 'No calls to tool "' + toolFilter + '" found'
225
+ : 'No tool calls found',
226
+ };
227
+ }
228
+
229
+ const match = targetCalls.some(tc => {
230
+ const output = tc.output || tc.result || {};
231
+ return Object.entries(expected).every(([k, v]) =>
232
+ JSON.stringify(output[k]) === JSON.stringify(v)
233
+ );
234
+ });
235
+
236
+ return {
237
+ pass: match,
238
+ score: match ? 1 : 0,
239
+ reason: match
240
+ ? 'Tool output matches expected values'
241
+ : 'Tool output does not match. Expected: ' + JSON.stringify(expected),
242
+ };`),
243
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
244
+ };
245
+ }
246
+ // ---------------------------------------------------------------------------
247
+ // error-returned — asserts the server returned a specific error
248
+ // ---------------------------------------------------------------------------
249
+ function buildErrorReturnedAssertion(assertion, _context) {
250
+ const expected = assertion.value;
251
+ return {
252
+ type: "javascript",
253
+ value: buildJsAssertion("error-returned", `
254
+ const toolCalls = context.vars.__toolCalls || [];
255
+ const expected = ${JSON.stringify(expected ?? {})};
256
+
257
+ const errorCall = toolCalls.find(tc => tc.error);
258
+ if (!errorCall) {
259
+ return {
260
+ pass: false,
261
+ score: 0,
262
+ reason: 'Expected an error response but no errors were returned',
263
+ };
264
+ }
265
+
266
+ const error = errorCall.error;
267
+ let pass = true;
268
+ const reasons = [];
269
+
270
+ if (expected.code !== undefined && error.code !== expected.code) {
271
+ pass = false;
272
+ reasons.push('Expected error code ' + expected.code + ', got ' + error.code);
273
+ }
274
+
275
+ if (expected.message !== undefined) {
276
+ const msgMatch = typeof error.message === 'string' &&
277
+ error.message.includes(expected.message);
278
+ if (!msgMatch) {
279
+ pass = false;
280
+ reasons.push('Expected error message containing "' + expected.message +
281
+ '", got "' + (error.message || '') + '"');
282
+ }
283
+ }
284
+
285
+ if (pass) {
286
+ reasons.push('Error matches expected pattern');
287
+ }
288
+
289
+ return {
290
+ pass,
291
+ score: pass ? 1 : 0,
292
+ reason: reasons.join('; '),
293
+ };`),
294
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
295
+ };
296
+ }
297
+ // ---------------------------------------------------------------------------
298
+ // capability-available — asserts the server advertises a capability
299
+ // ---------------------------------------------------------------------------
300
+ function buildCapabilityAssertion(assertion, _context) {
301
+ const capability = String(assertion.value ?? "");
302
+ return {
303
+ type: "javascript",
304
+ value: buildJsAssertion(`capability-available: ${capability}`, `
305
+ const capabilities = context.vars.__serverCapabilities || [];
306
+ const expected = ${JSON.stringify(capability)};
307
+ const available = capabilities.includes(expected);
308
+
309
+ return {
310
+ pass: available,
311
+ score: available ? 1 : 0,
312
+ reason: available
313
+ ? 'Server advertises capability "' + expected + '"'
314
+ : 'Server does not advertise capability "' + expected + '". ' +
315
+ 'Available: ' + (capabilities.join(', ') || 'none'),
316
+ };`),
317
+ ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
318
+ };
319
+ }
320
+ // ---------------------------------------------------------------------------
321
+ // Helpers
322
+ // ---------------------------------------------------------------------------
323
+ /**
324
+ * Build a Promptfoo-compatible JavaScript assertion string.
325
+ *
326
+ * Wraps the assertion body in a function that receives `output` and `context`
327
+ * from Promptfoo's assertion runner.
328
+ */
329
+ function buildJsAssertion(label, body) {
330
+ // No IIFE wrapper — Promptfoo wraps the assertion in its own function via
331
+ // new Function('output', 'context', ...). The body must use `return` at
332
+ // the top level for the result to reach Promptfoo's validator.
333
+ return `// MCP assertion: ${label}\n${body.trim()}`;
334
+ }
@@ -0,0 +1,19 @@
1
+ /**
2
+ * MCP server task compilation — core compiler logic.
3
+ *
4
+ * Produces Promptfoo configuration from MCP server task definitions:
5
+ * 1. A provider config pointing to the MCP server
6
+ * 2. Test cases with tool-call assertions
7
+ * 3. Appropriate prompts for the evaluation
8
+ */
9
+ import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
10
+ import type { MCPCompileOptions, MCPCompileResult } from "./types.js";
11
+ /**
12
+ * Compile an MCP server task definition into Promptfoo configuration.
13
+ *
14
+ * This is the core of the MCP mode handler. It produces:
15
+ * 1. A provider config pointing to the MCP server
16
+ * 2. Test cases with tool-call assertions
17
+ * 3. Appropriate prompts for the evaluation
18
+ */
19
+ export declare function compileMCPTask(task: MCPServerTaskDefinition, options?: MCPCompileOptions): MCPCompileResult;
@@ -0,0 +1,100 @@
1
+ /**
2
+ * MCP server task compilation — core compiler logic.
3
+ *
4
+ * Produces Promptfoo configuration from MCP server task definitions:
5
+ * 1. A provider config pointing to the MCP server
6
+ * 2. Test cases with tool-call assertions
7
+ * 3. Appropriate prompts for the evaluation
8
+ */
9
+ import { buildMCPAssertions } from "./assertions.js";
10
+ import { buildMCPProvider } from "./provider-config.js";
11
+ import { validateMCPTask } from "./validation.js";
12
+ // ---------------------------------------------------------------------------
13
+ // Public API
14
+ // ---------------------------------------------------------------------------
15
+ /**
16
+ * Compile an MCP server task definition into Promptfoo configuration.
17
+ *
18
+ * This is the core of the MCP mode handler. It produces:
19
+ * 1. A provider config pointing to the MCP server
20
+ * 2. Test cases with tool-call assertions
21
+ * 3. Appropriate prompts for the evaluation
22
+ */
23
+ export function compileMCPTask(task, options) {
24
+ const warnings = [];
25
+ // Validate
26
+ const validationErrors = validateMCPTask(task);
27
+ if (validationErrors.length > 0) {
28
+ for (const err of validationErrors) {
29
+ warnings.push(`MCP task "${task.id}": ${err.field} — ${err.message}`);
30
+ }
31
+ }
32
+ // Build providers (one LLM provider per model, each with MCP config)
33
+ const providers = buildMCPProvider(task, options?.models ?? [], warnings);
34
+ // Build prompts
35
+ const prompts = buildMCPPrompts(task);
36
+ // Build test cases
37
+ const tests = buildMCPTestCases(task, options, warnings);
38
+ return { providers, tests, prompts, warnings };
39
+ }
40
+ // ---------------------------------------------------------------------------
41
+ // Prompt assembly
42
+ // ---------------------------------------------------------------------------
43
+ function buildMCPPrompts(task) {
44
+ // MCP mode uses a single prompt — the task description
45
+ const promptText = task.prompt?.text ??
46
+ task.prompt?.vars?.task ??
47
+ task.description ??
48
+ `Test MCP server: ${task.title}`;
49
+ return [
50
+ {
51
+ id: "mcp-test",
52
+ label: `MCP: ${task.title}`,
53
+ raw: String(promptText),
54
+ },
55
+ ];
56
+ }
57
+ // ---------------------------------------------------------------------------
58
+ // Test case assembly
59
+ // ---------------------------------------------------------------------------
60
+ function buildMCPTestCases(task, options, warnings) {
61
+ const tests = [];
62
+ // Build assertion context
63
+ const assertionContext = {
64
+ capabilities: task.capabilities ?? [],
65
+ graderProvider: options?.graderProvider,
66
+ taskId: task.id,
67
+ };
68
+ // Compile assertions
69
+ // Cast GeneralizedAssertionDefinition[] → AssertionInput[] (structurally compatible)
70
+ const assertions = [];
71
+ if (task.assertions) {
72
+ const rawAssertions = task.assertions;
73
+ const { assertions: mapped, warnings: assertionWarnings } = buildMCPAssertions(rawAssertions, assertionContext);
74
+ assertions.push(...mapped);
75
+ warnings.push(...assertionWarnings);
76
+ }
77
+ // Build test case vars
78
+ const vars = {
79
+ task: task.prompt?.vars?.task ?? task.description ?? `Test: ${task.title}`,
80
+ ...(task.prompt?.vars ?? {}),
81
+ };
82
+ // Primary test case
83
+ tests.push({
84
+ description: `${task.id} — ${task.title}`,
85
+ vars,
86
+ ...(assertions.length > 0 ? { assert: assertions } : {}),
87
+ });
88
+ // Multi-turn test cases
89
+ if (task.multiTurn?.turns && task.multiTurn.turns.length > 0) {
90
+ tests.push({
91
+ description: `${task.id} — ${task.title} [multi-turn]`,
92
+ vars: {
93
+ ...vars,
94
+ __multiTurn: task.multiTurn.turns,
95
+ },
96
+ ...(assertions.length > 0 ? { assert: assertions } : {}),
97
+ });
98
+ }
99
+ return tests;
100
+ }
@@ -0,0 +1,27 @@
1
+ /**
2
+ * MCP Server mode handler — directory barrel.
3
+ *
4
+ * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
5
+ *
6
+ * This is the first non-literacy mode handler, proving the compiler
7
+ * architecture works end-to-end. It translates MCP server task definitions
8
+ * into Promptfoo configuration with:
9
+ *
10
+ * - An MCP provider that wraps the server under test
11
+ * - Tool-call assertions compiled to Promptfoo `javascript` assertions
12
+ * - Server lifecycle management via Promptfoo provider hooks
13
+ * - Multi-turn conversation support (turns are attached to the test case as the `__multiTurn` variable)
14
+ *
15
+ * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
16
+ * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
17
+ * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
18
+ */
19
+ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
20
+ /** ModeHandler-conformant export for the mcp-server evaluation mode. */
21
+ export declare const handler: ModeHandler;
22
+ export type { MCPAssertionContext, MCPCompileOptions, MCPCompileResult, MCPValidationError, } from "./types.js";
23
+ export { buildMCPAssertions } from "./assertions.js";
24
+ export { compileMCPTask } from "./compiler.js";
25
+ export { validateMCPTask } from "./validation.js";
26
+ export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
27
+ export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
@@ -0,0 +1,54 @@
1
+ /**
2
+ * MCP Server mode handler — directory barrel.
3
+ *
4
+ * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
5
+ *
6
+ * This is the first non-literacy mode handler, proving the compiler
7
+ * architecture works end-to-end. It translates MCP server task definitions
8
+ * into Promptfoo configuration with:
9
+ *
10
+ * - An MCP provider that wraps the server under test
11
+ * - Tool-call assertions compiled to Promptfoo `javascript` assertions
12
+ * - Server lifecycle management via Promptfoo provider hooks
13
+ * - Multi-turn conversation support (turns are attached to the test case as the `__multiTurn` variable)
14
+ *
15
+ * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
16
+ * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
17
+ * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
18
+ */
19
+ import { compileMCPTask } from "./compiler.js";
20
+ import { MCP_PROMPT_TEMPLATES } from "./prompts.js";
21
+ // ---------------------------------------------------------------------------
22
+ // ModeHandler adapter
23
+ // ---------------------------------------------------------------------------
24
+ /** ModeHandler-conformant export for the mcp-server evaluation mode. */
25
+ export const handler = {
26
+ getPrompts() {
27
+ return MCP_PROMPT_TEMPLATES;
28
+ },
29
+ compileTask(task, ctx) {
30
+ if (!("mode" in task) || task.mode !== "mcp-server") {
31
+ throw new Error(`MCP server handler received task with mode "${task.mode ?? "undefined"}" — expected "mcp-server"`);
32
+ }
33
+ const result = compileMCPTask(task, {
34
+ graderProvider: ctx.graderProvider,
35
+ models: ctx.models,
36
+ });
37
+ return {
38
+ providers: result.providers,
39
+ tests: result.tests,
40
+ prompts: result.prompts,
41
+ warnings: result.warnings,
42
+ };
43
+ },
44
+ };
45
+ // Assertions
46
+ export { buildMCPAssertions } from "./assertions.js";
47
+ // Compilation
48
+ export { compileMCPTask } from "./compiler.js";
49
+ // Validation
50
+ export { validateMCPTask } from "./validation.js";
51
+ // Prompts
52
+ export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
53
+ // Provider config
54
+ export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Canonical MCP server prompt templates.
3
+ *
4
+ * Handler-owned prompts for MCP server evaluations. Instructs the model to
5
+ * interact with MCP tools rather than writing standalone code.
6
+ */
7
+ import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
8
+ export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Canonical MCP server prompt templates.
3
+ *
4
+ * Handler-owned prompts for MCP server evaluations. Instructs the model to
5
+ * interact with MCP tools rather than writing standalone code.
6
+ */
7
+ export const MCP_PROMPT_TEMPLATES = {
8
+ "mcp-server": {
9
+ id: "mcp-server",
10
+ label: "MCP Server Tool Use",
11
+ template: `You are an AI assistant with access to an MCP (Model Context Protocol) server that provides tools for interacting with a Sanity content backend.
12
+
13
+ ## Task
14
+ {{task}}
15
+
16
+ ## Instructions
17
+
18
+ 1. Use the available MCP tools to complete the task
19
+ 2. Call tools with the correct parameters as described in their schemas
20
+ 3. Interpret tool responses and use the results to accomplish the goal
21
+ 4. If a tool returns an error, explain the issue clearly
22
+ 5. Prefer using specific tools over broad queries when possible
23
+
24
+ Complete the task using the MCP tools provided:
25
+ `,
26
+ variables: ["task"],
27
+ },
28
+ };
@@ -0,0 +1,28 @@
1
+ /**
2
+ * MCP server provider assembly — builds Promptfoo provider configs.
3
+ */
4
+ import type { MCPServerTaskDefinition, ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { PromptfooProvider } from "../../promptfoo-compiler.js";
6
+ /** Default max tool rounds for MCP multi-turn execution */
7
+ export declare const DEFAULT_MAX_TOOL_ROUNDS = 5;
8
+ /** Provider path relative to eval package dist */
9
+ export declare const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
10
+ /**
11
+ * Build custom MCP tool provider configs — one per model.
12
+ *
13
+ * Each provider uses the custom mcp-tool-provider.ts which implements a
14
+ * multi-turn tool execution loop. The LLM receives a prompt, discovers
15
+ * MCP tools, calls them, gets results, and continues until it produces
16
+ * a final text answer or exhausts maxToolRounds.
17
+ *
18
+ * Config shape passed to the custom provider:
19
+ * { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
20
+ */
21
+ export declare function buildMCPProvider(task: MCPServerTaskDefinition, models: ModeProviderEntry[], warnings: string[]): PromptfooProvider[];
22
+ /**
23
+ * Build the MCP server connection config for the custom provider.
24
+ *
25
+ * Shape: { url?, command?, name?, auth? }
26
+ * The custom mcp-tool-provider.ts uses this to connect to the MCP server.
27
+ */
28
+ export declare function buildMCPServerConfig(task: MCPServerTaskDefinition, warnings: string[]): Record<string, unknown>;