@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,307 +0,0 @@
1
- /**
2
- * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
3
- *
4
- * This is the first non-literacy mode handler, proving the compiler
5
- * architecture works end-to-end. It translates MCP server task definitions
6
- * into Promptfoo configuration with:
7
- *
8
- * - An MCP provider that wraps the server under test
9
- * - Tool-call assertions compiled to Promptfoo `javascript` assertions
10
- * - Server lifecycle management via Promptfoo provider hooks
11
- * - Multi-turn conversation support via Promptfoo's `steps` syntax
12
- *
13
- * Promptfoo supports MCP servers as providers natively:
14
- * ```yaml
15
- * providers:
16
- * - id: mcp:./my-server
17
- * config:
18
- * command: node
19
- * args: [./dist/server.js]
20
- * env: { API_KEY: "..." }
21
- * ```
22
- *
23
- * This handler assembles that config from AILF's `MCPServerTaskDefinition`.
24
- *
25
- * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
26
- * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
27
- * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
28
- */
29
- import { buildMCPAssertions, } from "./mcp-assertions.js";
30
- // ---------------------------------------------------------------------------
31
- // Canonical MCP server prompt templates
32
- // ---------------------------------------------------------------------------
33
- // Handler-owned prompts for MCP server evaluations. Instructs the model to
34
- // interact with MCP tools rather than writing standalone code.
35
- export const MCP_PROMPT_TEMPLATES = {
36
- "mcp-server": {
37
- id: "mcp-server",
38
- label: "MCP Server Tool Use",
39
- template: `You are an AI assistant with access to an MCP (Model Context Protocol) server that provides tools for interacting with a Sanity content backend.
40
-
41
- ## Task
42
- {{task}}
43
-
44
- ## Instructions
45
-
46
- 1. Use the available MCP tools to complete the task
47
- 2. Call tools with the correct parameters as described in their schemas
48
- 3. Interpret tool responses and use the results to accomplish the goal
49
- 4. If a tool returns an error, explain the issue clearly
50
- 5. Prefer using specific tools over broad queries when possible
51
-
52
- Complete the task using the MCP tools provided:
53
- `,
54
- variables: ["task"],
55
- },
56
- };
57
- /**
58
- * Validate that an MCP task definition has all required fields.
59
- */
60
- export function validateMCPTask(task) {
61
- const errors = [];
62
- if (!task.id) {
63
- errors.push({ field: "id", message: "Task ID is required" });
64
- }
65
- if (!task.title) {
66
- errors.push({ field: "title", message: "Task title is required" });
67
- }
68
- if (task.serverConfig) {
69
- const { transport, command, url } = task.serverConfig;
70
- if (transport === "stdio" && !command) {
71
- errors.push({
72
- field: "serverConfig.command",
73
- message: "Server command is required for stdio transport (e.g., 'node dist/server.js')",
74
- });
75
- }
76
- if ((transport === "sse" || transport === "streamable-http") && !url) {
77
- errors.push({
78
- field: "serverConfig.url",
79
- message: `Server URL is required for ${transport} transport`,
80
- });
81
- }
82
- }
83
- // Assertions should reference MCP-compatible types
84
- if (task.assertions) {
85
- for (const assertion of task.assertions) {
86
- if (assertion.type === "tool-called" &&
87
- !("value" in assertion && assertion.value)) {
88
- errors.push({
89
- field: "assertions",
90
- message: 'tool-called assertion requires a "value" specifying the tool name',
91
- });
92
- }
93
- }
94
- }
95
- return errors;
96
- }
97
- // ---------------------------------------------------------------------------
98
- // Compilation
99
- // ---------------------------------------------------------------------------
100
- /**
101
- * Compile an MCP server task definition into Promptfoo configuration.
102
- *
103
- * This is the core of the MCP mode handler. It produces:
104
- * 1. A provider config pointing to the MCP server
105
- * 2. Test cases with tool-call assertions
106
- * 3. Appropriate prompts for the evaluation
107
- */
108
- export function compileMCPTask(task, options) {
109
- const warnings = [];
110
- // Validate
111
- const validationErrors = validateMCPTask(task);
112
- if (validationErrors.length > 0) {
113
- for (const err of validationErrors) {
114
- warnings.push(`MCP task "${task.id}": ${err.field} — ${err.message}`);
115
- }
116
- }
117
- // Build providers (one LLM provider per model, each with MCP config)
118
- const providers = buildMCPProvider(task, options?.models ?? [], warnings);
119
- // Build prompts
120
- const prompts = buildMCPPrompts(task);
121
- // Build test cases
122
- const tests = buildMCPTestCases(task, options, warnings);
123
- return { providers, tests, prompts, warnings };
124
- }
125
- // ---------------------------------------------------------------------------
126
- // Provider assembly
127
- // ---------------------------------------------------------------------------
128
- /** Default max tool rounds for MCP multi-turn execution */
129
- const DEFAULT_MAX_TOOL_ROUNDS = 5;
130
- /** Provider path relative to eval package dist */
131
- const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
132
- /**
133
- * Build custom MCP tool provider configs — one per model.
134
- *
135
- * Each provider uses the custom mcp-tool-provider.ts which implements a
136
- * multi-turn tool execution loop. The LLM receives a prompt, discovers
137
- * MCP tools, calls them, gets results, and continues until it produces
138
- * a final text answer or exhausts maxToolRounds.
139
- *
140
- * Config shape passed to the custom provider:
141
- * { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
142
- */
143
- function buildMCPProvider(task, models, warnings) {
144
- // Build the MCP server config
145
- const mcpServer = buildMCPServerConfig(task, warnings);
146
- const mcpTools = task.capabilities ?? undefined;
147
- const maxToolRounds = task.maxToolRounds ?? DEFAULT_MAX_TOOL_ROUNDS;
148
- // Helper to build a provider entry for a given model
149
- function makeProvider(modelId, label, modelConfig) {
150
- return {
151
- id: MCP_PROVIDER_PATH,
152
- label: `${label} + MCP`,
153
- config: {
154
- model: modelId,
155
- mcpServer,
156
- ...(mcpTools ? { mcpTools } : {}),
157
- maxToolRounds,
158
- ...(modelConfig ?? {}),
159
- },
160
- };
161
- }
162
- // Task-level model override takes precedence over registry models
163
- const taskModels = task.models;
164
- if (taskModels && taskModels.length > 0) {
165
- return taskModels.map((modelId) => makeProvider(modelId, modelId));
166
- }
167
- // Use registry models (already filtered to mcp-server mode)
168
- if (models.length === 0) {
169
- warnings.push(`MCP task "${task.id}": no models available. Add "mcp-server" to a ` +
170
- "model's modes array in config/models.ts, or set models on the task.");
171
- return [
172
- makeProvider("anthropic:messages:claude-sonnet-4-20250514", "Claude Sonnet 4"),
173
- ];
174
- }
175
- return models.map((model) => makeProvider(model.id, model.label, model.config));
176
- }
177
- /**
178
- * Build the MCP server connection config for the custom provider.
179
- *
180
- * Shape: { url?, command?, name?, auth? }
181
- * The custom mcp-tool-provider.ts uses this to connect to the MCP server.
182
- */
183
- function buildMCPServerConfig(task, warnings) {
184
- const config = task.serverConfig;
185
- if (!config) {
186
- warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder. ` +
187
- "Set serverConfig.command or serverConfig.url to point to your MCP server.");
188
- return { name: task.id };
189
- }
190
- const serverConfig = { name: task.id };
191
- if (config.transport === "stdio") {
192
- serverConfig.command = config.command;
193
- }
194
- else {
195
- serverConfig.url = config.url;
196
- }
197
- // Auth config
198
- if (config.auth) {
199
- serverConfig.auth = config.auth;
200
- }
201
- else if (config.env) {
202
- const tokenKey = Object.keys(config.env).find((k) => /token|auth|key/i.test(k));
203
- if (tokenKey) {
204
- const val = config.env[tokenKey];
205
- let envVar = val;
206
- if (val.startsWith("$env(") && val.endsWith(")")) {
207
- envVar = val.slice(5, -1);
208
- }
209
- if (!envVar || !/^[A-Za-z_][A-Za-z0-9_]*$/.test(envVar)) {
210
- warnings.push(`MCP task: env var name "${envVar}" from "${val}" is not a valid ` +
211
- "identifier — skipping auth config");
212
- }
213
- else {
214
- serverConfig.auth = {
215
- type: "bearer",
216
- token: `{{env.${envVar}}}`,
217
- };
218
- }
219
- }
220
- }
221
- return serverConfig;
222
- }
223
- // ---------------------------------------------------------------------------
224
- // Prompt assembly
225
- // ---------------------------------------------------------------------------
226
- function buildMCPPrompts(task) {
227
- // MCP mode uses a single prompt — the task description
228
- const promptText = task.prompt?.text ??
229
- task.prompt?.vars?.task ??
230
- task.description ??
231
- `Test MCP server: ${task.title}`;
232
- return [
233
- {
234
- id: "mcp-test",
235
- label: `MCP: ${task.title}`,
236
- raw: String(promptText),
237
- },
238
- ];
239
- }
240
- // ---------------------------------------------------------------------------
241
- // Test case assembly
242
- // ---------------------------------------------------------------------------
243
- function buildMCPTestCases(task, options, warnings) {
244
- const tests = [];
245
- // Build assertion context
246
- const assertionContext = {
247
- capabilities: task.capabilities ?? [],
248
- graderProvider: options?.graderProvider,
249
- taskId: task.id,
250
- };
251
- // Compile assertions
252
- // Cast GeneralizedAssertionDefinition[] → AssertionInput[] (structurally compatible)
253
- const assertions = [];
254
- if (task.assertions) {
255
- const rawAssertions = task.assertions;
256
- const { assertions: mapped, warnings: assertionWarnings } = buildMCPAssertions(rawAssertions, assertionContext);
257
- assertions.push(...mapped);
258
- warnings.push(...assertionWarnings);
259
- }
260
- // Build test case vars
261
- const vars = {
262
- task: task.prompt?.vars?.task ?? task.description ?? `Test: ${task.title}`,
263
- ...(task.prompt?.vars ?? {}),
264
- };
265
- // Primary test case
266
- tests.push({
267
- description: `${task.id} — ${task.title}`,
268
- vars,
269
- ...(assertions.length > 0 ? { assert: assertions } : {}),
270
- });
271
- // Multi-turn test cases
272
- if (task.multiTurn?.turns && task.multiTurn.turns.length > 0) {
273
- tests.push({
274
- description: `${task.id} — ${task.title} [multi-turn]`,
275
- vars: {
276
- ...vars,
277
- __multiTurn: task.multiTurn.turns,
278
- },
279
- ...(assertions.length > 0 ? { assert: assertions } : {}),
280
- });
281
- }
282
- return tests;
283
- }
284
- // ---------------------------------------------------------------------------
285
- // ModeHandler adapter
286
- // ---------------------------------------------------------------------------
287
- /** ModeHandler-conformant export for the mcp-server evaluation mode. */
288
- export const handler = {
289
- getPrompts() {
290
- return MCP_PROMPT_TEMPLATES;
291
- },
292
- compileTask(task, ctx) {
293
- if (!("mode" in task) || task.mode !== "mcp-server") {
294
- throw new Error(`MCP server handler received task with mode "${task.mode ?? "undefined"}" — expected "mcp-server"`);
295
- }
296
- const result = compileMCPTask(task, {
297
- graderProvider: ctx.graderProvider,
298
- models: ctx.models,
299
- });
300
- return {
301
- providers: result.providers,
302
- tests: result.tests,
303
- prompts: result.prompts,
304
- warnings: result.warnings,
305
- };
306
- },
307
- };
@@ -1,65 +0,0 @@
1
- /**
2
- * mcp-tool-provider.ts — Custom Promptfoo provider for MCP tool-use evaluation.
3
- *
4
- * Implements a multi-turn tool execution loop: the LLM receives a prompt,
5
- * discovers MCP tools, calls them, gets results, and continues until it
6
- * produces a final text answer or exhausts maxToolRounds.
7
- *
8
- * Promptfoo's built-in Anthropic/OpenAI providers with config.mcp only do
9
- * single-turn tool calls. This provider fills that gap by managing the
10
- * full conversation loop, similar to the agentic-provider.ts pattern.
11
- *
12
- * Promptfoo config usage:
13
- *
14
- * providers:
15
- * - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js
16
- * label: "Claude Opus 4.6 + MCP"
17
- * config:
18
- * model: anthropic:messages:claude-opus-4-6
19
- * maxToolRounds: 5
20
- * temperature: 0.2
21
- * max_tokens: 4096
22
- * mcpServer:
23
- * url: https://mcp.sanity.io
24
- * auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
25
- * name: mcp-live-query-documents
26
- * mcpTools: [query_documents, get_schema]
27
- */
28
- interface CallApiContextParams {
29
- prompt?: {
30
- raw: string;
31
- label?: string;
32
- };
33
- vars?: Record<string, object | string>;
34
- }
35
- interface ProviderOptions {
36
- config?: Record<string, unknown>;
37
- id?: string;
38
- }
39
- interface ProviderResponse {
40
- cached?: boolean;
41
- cost?: number;
42
- error?: string;
43
- metadata?: Record<string, unknown>;
44
- output?: object | string;
45
- tokenUsage?: {
46
- total?: number;
47
- prompt?: number;
48
- completion?: number;
49
- cached?: number;
50
- };
51
- }
52
- export default class MCPToolProvider {
53
- config: Record<string, unknown>;
54
- private providerId;
55
- constructor(options?: ProviderOptions);
56
- id(): string;
57
- callApi(prompt: string, _context?: CallApiContextParams): Promise<ProviderResponse>;
58
- private runAnthropicLoop;
59
- private connectMCP;
60
- /**
61
- * Resolve {{env.VAR}} templates in config values.
62
- */
63
- private resolveEnvTemplates;
64
- }
65
- export {};