@sanity/ailf 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/README.md +0 -1
  2. package/config/models.ts +15 -3
  3. package/dist/_vendor/ailf-core/config-helpers.d.ts +14 -17
  4. package/dist/_vendor/ailf-core/config-helpers.js +22 -2
  5. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  6. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +2 -2
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
  10. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  11. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  12. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  13. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +2 -0
  14. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  15. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  16. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +13 -0
  17. package/dist/_vendor/ailf-core/types/index.d.ts +1 -3
  18. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +78 -23
  19. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  20. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  21. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  22. package/dist/adapters/task-sources/content-lake-task-source.js +17 -20
  23. package/dist/adapters/task-sources/index.d.ts +2 -2
  24. package/dist/adapters/task-sources/index.js +2 -2
  25. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  26. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  27. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  28. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  29. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  30. package/dist/adapters/task-sources/repo-validation.js +126 -5
  31. package/dist/adapters/task-sources/task-file-loader.d.ts +2 -2
  32. package/dist/adapters/task-sources/task-file-loader.js +2 -2
  33. package/dist/commands/coverage-audit.js +3 -1
  34. package/dist/commands/init.d.ts +6 -4
  35. package/dist/commands/init.js +302 -23
  36. package/dist/commands/validate-tasks.d.ts +2 -2
  37. package/dist/commands/validate-tasks.js +26 -15
  38. package/dist/composition-root.d.ts +13 -1
  39. package/dist/composition-root.js +73 -41
  40. package/dist/index.d.ts +41 -0
  41. package/dist/index.js +48 -0
  42. package/dist/orchestration/build-step-sequence.js +4 -2
  43. package/dist/orchestration/steps/fetch-docs-step.js +2 -3
  44. package/dist/orchestration/steps/generate-configs-step.js +28 -12
  45. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  46. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  47. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  48. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +105 -68
  49. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  50. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  51. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  52. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  53. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  54. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  55. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  56. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  57. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  58. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  59. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  60. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  61. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  62. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  63. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  64. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  65. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  68. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  71. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  72. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  73. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  74. package/dist/pipeline/compiler/mode-handlers/{agent-harness-handler.d.ts → agent-harness/types.d.ts} +3 -24
  75. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  76. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  77. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  78. package/dist/pipeline/compiler/mode-handlers/index.d.ts +4 -5
  79. package/dist/pipeline/compiler/mode-handlers/index.js +4 -6
  80. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  81. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  82. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  83. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  86. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  87. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  88. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  89. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  90. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  91. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  92. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  93. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  94. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  95. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  96. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  97. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  98. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  99. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  100. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  101. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  102. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  103. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  104. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts} +2 -10
  105. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.js → mcp-server/assertions.js} +63 -6
  106. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  107. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  108. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  109. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  110. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  111. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  112. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  113. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  114. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  115. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  116. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  117. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  118. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  119. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  120. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  121. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  122. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  123. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  124. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  125. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  126. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  127. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  128. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  129. package/dist/pipeline/compiler/preset-loader.js +99 -0
  130. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +6 -9
  131. package/dist/pipeline/compiler/presets/sanity-literacy.js +10 -156
  132. package/dist/pipeline/expand-tasks.d.ts +2 -2
  133. package/dist/pipeline/expand-tasks.js +2 -2
  134. package/dist/pipeline/generate-configs.js +1 -1
  135. package/dist/pipeline/map-request-to-config.js +1 -0
  136. package/dist/pipeline/mirror-repo-tasks.d.ts +7 -7
  137. package/dist/pipeline/mirror-repo-tasks.js +9 -9
  138. package/dist/pipeline/plan.js +1 -1
  139. package/package.json +11 -3
  140. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  141. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  142. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  143. package/dist/_vendor/ailf-tasks/index.js +0 -16
  144. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  145. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  146. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  147. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  148. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  149. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  150. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  151. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  152. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  153. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  154. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  155. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -67
  156. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -309
@@ -0,0 +1,100 @@
1
+ /**
2
+ * MCP server task compilation — core compiler logic.
3
+ *
4
+ * Produces Promptfoo configuration from MCP server task definitions:
5
+ * 1. A provider config pointing to the MCP server
6
+ * 2. Test cases with tool-call assertions
7
+ * 3. Appropriate prompts for the evaluation
8
+ */
9
+ import { buildMCPAssertions } from "./assertions.js";
10
+ import { buildMCPProvider } from "./provider-config.js";
11
+ import { validateMCPTask } from "./validation.js";
12
+ // ---------------------------------------------------------------------------
13
+ // Public API
14
+ // ---------------------------------------------------------------------------
15
/**
 * Compile one MCP server task definition into Promptfoo configuration.
 *
 * Validation problems do not abort compilation: each one is reported as a
 * warning string prefixed with the task id, and compilation continues.
 *
 * @param task - MCP server task definition to compile.
 * @param options - Optional compile options; `models` and `graderProvider`
 *   are read from it when present.
 * @returns `{ providers, tests, prompts, warnings }` where `warnings`
 *   collects validation, provider and assertion warnings in that order.
 */
export function compileMCPTask(task, options) {
    // Validation findings become the first entries of the warnings list.
    const warnings = validateMCPTask(task).map(({ field, message }) => `MCP task "${task.id}": ${field} — ${message}`);
    // One provider entry per model, each carrying the MCP server config.
    const registryModels = options?.models ?? [];
    const providers = buildMCPProvider(task, registryModels, warnings);
    const prompts = buildMCPPrompts(task);
    const tests = buildMCPTestCases(task, options, warnings);
    return { providers, tests, prompts, warnings };
}
40
+ // ---------------------------------------------------------------------------
41
+ // Prompt assembly
42
+ // ---------------------------------------------------------------------------
43
/**
 * Build the single prompt used for an MCP evaluation.
 *
 * The prompt text is the first non-nullish value among: `task.prompt.text`,
 * `task.prompt.vars.task`, `task.description`, and finally a generic
 * sentence built from the task title.
 *
 * @param task - MCP server task definition.
 * @returns A one-element array holding the `mcp-test` prompt.
 */
function buildMCPPrompts(task) {
    const authored = task.prompt?.text ?? task.prompt?.vars?.task;
    const resolved = authored ?? task.description ?? `Test MCP server: ${task.title}`;
    const prompt = {
        id: "mcp-test",
        label: `MCP: ${task.title}`,
        // Coerce so a non-string var still yields a string prompt.
        raw: String(resolved),
    };
    return [prompt];
}
57
+ // ---------------------------------------------------------------------------
58
+ // Test case assembly
59
+ // ---------------------------------------------------------------------------
60
/**
 * Build the Promptfoo test cases for an MCP server task.
 *
 * Always emits one primary test case. When `task.multiTurn.turns` is a
 * non-empty array, a second "[multi-turn]" case is emitted that carries the
 * turns in the `__multiTurn` var. Both cases share the same compiled
 * assertions; the `assert` key is omitted when there are none.
 *
 * @param task - MCP server task definition.
 * @param options - Optional compile options; only `graderProvider` is read.
 * @param warnings - Mutable list that assertion warnings are appended to.
 * @returns The compiled test cases (one or two entries).
 */
function buildMCPTestCases(task, options, warnings) {
    const assertionContext = {
        capabilities: task.capabilities ?? [],
        graderProvider: options?.graderProvider,
        taskId: task.id,
    };
    // Task assertions are passed straight to the MCP assertion compiler.
    const assertions = [];
    if (task.assertions) {
        const { assertions: mapped, warnings: assertionWarnings } = buildMCPAssertions(task.assertions, assertionContext);
        assertions.push(...mapped);
        warnings.push(...assertionWarnings);
    }
    const promptVars = task.prompt?.vars ?? {};
    const taskText = promptVars.task ?? task.description ?? `Test: ${task.title}`;
    // `task` is listed first to keep it as the leading key, then re-assigned
    // after the spread: a null/undefined `task` inside prompt.vars must not
    // clobber the fallback resolved above.
    const vars = { task: taskText, ...promptVars };
    vars.task = taskText;
    const assertPart = assertions.length > 0 ? { assert: assertions } : {};
    const tests = [
        {
            description: `${task.id} — ${task.title}`,
            vars,
            ...assertPart,
        },
    ];
    const turns = task.multiTurn?.turns;
    if (turns && turns.length > 0) {
        tests.push({
            description: `${task.id} — ${task.title} [multi-turn]`,
            vars: { ...vars, __multiTurn: turns },
            ...assertPart,
        });
    }
    return tests;
}
@@ -0,0 +1,27 @@
1
+ /**
2
+ * MCP Server mode handler — directory barrel.
3
+ *
4
+ * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
5
+ *
6
+ * This is the first non-literacy mode handler, proving the compiler
7
+ * architecture works end-to-end. It translates MCP server task definitions
8
+ * into Promptfoo configuration with:
9
+ *
10
+ * - An MCP provider that wraps the server under test
11
+ * - Tool-call assertions compiled to Promptfoo `javascript` assertions
12
+ * - Server lifecycle management via Promptfoo provider hooks
13
+ * - Multi-turn conversation support via Promptfoo's `steps` syntax
14
+ *
15
+ * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
16
+ * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
17
+ * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
18
+ */
19
+ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
20
+ /** ModeHandler-conformant export for the mcp-server evaluation mode. */
21
+ export declare const handler: ModeHandler;
22
+ export type { MCPAssertionContext, MCPCompileOptions, MCPCompileResult, MCPValidationError, } from "./types.js";
23
+ export { buildMCPAssertions } from "./assertions.js";
24
+ export { compileMCPTask } from "./compiler.js";
25
+ export { validateMCPTask } from "./validation.js";
26
+ export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
27
+ export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
@@ -0,0 +1,54 @@
1
+ /**
2
+ * MCP Server mode handler — directory barrel.
3
+ *
4
+ * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
5
+ *
6
+ * This is the first non-literacy mode handler, proving the compiler
7
+ * architecture works end-to-end. It translates MCP server task definitions
8
+ * into Promptfoo configuration with:
9
+ *
10
+ * - An MCP provider that wraps the server under test
11
+ * - Tool-call assertions compiled to Promptfoo `javascript` assertions
12
+ * - Server lifecycle management via Promptfoo provider hooks
13
+ * - Multi-turn conversation support via Promptfoo's `steps` syntax
14
+ *
15
+ * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
16
+ * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
17
+ * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
18
+ */
19
+ import { compileMCPTask } from "./compiler.js";
20
+ import { MCP_PROMPT_TEMPLATES } from "./prompts.js";
21
+ // ---------------------------------------------------------------------------
22
+ // ModeHandler adapter
23
+ // ---------------------------------------------------------------------------
24
/**
 * ModeHandler-conformant export for the mcp-server evaluation mode.
 *
 * - `getPrompts()` returns the handler-owned MCP prompt templates.
 * - `compileTask(task, ctx)` rejects any task whose `mode` is not
 *   `"mcp-server"` (throws `Error`), then delegates to `compileMCPTask`
 *   using `ctx.graderProvider` and `ctx.models`.
 */
export const handler = {
    getPrompts() {
        return MCP_PROMPT_TEMPLATES;
    },
    compileTask(task, ctx) {
        const isMCPTask = "mode" in task && task.mode === "mcp-server";
        if (!isMCPTask) {
            const received = task.mode ?? "undefined";
            throw new Error(`MCP server handler received task with mode "${received}" — expected "mcp-server"`);
        }
        const compileOptions = {
            graderProvider: ctx.graderProvider,
            models: ctx.models,
        };
        // Copy out only the ModeHandler result fields.
        const { providers, tests, prompts, warnings } = compileMCPTask(task, compileOptions);
        return { providers, tests, prompts, warnings };
    },
};
45
+ // Assertions
46
+ export { buildMCPAssertions } from "./assertions.js";
47
+ // Compilation
48
+ export { compileMCPTask } from "./compiler.js";
49
+ // Validation
50
+ export { validateMCPTask } from "./validation.js";
51
+ // Prompts
52
+ export { MCP_PROMPT_TEMPLATES } from "./prompts.js";
53
+ // Provider config
54
+ export { DEFAULT_MAX_TOOL_ROUNDS, MCP_PROVIDER_PATH, } from "./provider-config.js";
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Canonical MCP server prompt templates.
3
+ *
4
+ * Handler-owned prompts for MCP server evaluations. Instructs the model to
5
+ * interact with MCP tools rather than writing standalone code.
6
+ */
7
+ import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
8
+ export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Canonical MCP server prompt templates.
3
+ *
4
+ * Handler-owned prompts for MCP server evaluations. Instructs the model to
5
+ * interact with MCP tools rather than writing standalone code.
6
+ */
7
+ export const MCP_PROMPT_TEMPLATES = {
8
+ "mcp-server": {
9
+ id: "mcp-server",
10
+ label: "MCP Server Tool Use",
11
+ template: `You are an AI assistant with access to an MCP (Model Context Protocol) server that provides tools for interacting with a Sanity content backend.
12
+
13
+ ## Task
14
+ {{task}}
15
+
16
+ ## Instructions
17
+
18
+ 1. Use the available MCP tools to complete the task
19
+ 2. Call tools with the correct parameters as described in their schemas
20
+ 3. Interpret tool responses and use the results to accomplish the goal
21
+ 4. If a tool returns an error, explain the issue clearly
22
+ 5. Prefer using specific tools over broad queries when possible
23
+
24
+ Complete the task using the MCP tools provided:
25
+ `,
26
+ variables: ["task"],
27
+ },
28
+ };
@@ -0,0 +1,28 @@
1
+ /**
2
+ * MCP server provider assembly — builds Promptfoo provider configs.
3
+ */
4
+ import type { MCPServerTaskDefinition, ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { PromptfooProvider } from "../../promptfoo-compiler.js";
6
+ /** Default max tool rounds for MCP multi-turn execution */
7
+ export declare const DEFAULT_MAX_TOOL_ROUNDS = 5;
8
+ /** Provider path relative to eval package dist */
9
+ export declare const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
10
+ /**
11
+ * Build custom MCP tool provider configs — one per model.
12
+ *
13
+ * Each provider uses the custom mcp-tool-provider.ts which implements a
14
+ * multi-turn tool execution loop. The LLM receives a prompt, discovers
15
+ * MCP tools, calls them, gets results, and continues until it produces
16
+ * a final text answer or exhausts maxToolRounds.
17
+ *
18
+ * Config shape passed to the custom provider:
19
+ * { model, mcpServer: { url, auth, name }, mcpTools, maxToolRounds, temperature, ... }
20
+ */
21
+ export declare function buildMCPProvider(task: MCPServerTaskDefinition, models: ModeProviderEntry[], warnings: string[]): PromptfooProvider[];
22
+ /**
23
+ * Build the MCP server connection config for the custom provider.
24
+ *
25
+ * Shape: { url?, command?, name?, auth? }
26
+ * The custom mcp-tool-provider.ts uses this to connect to the MCP server.
27
+ */
28
+ export declare function buildMCPServerConfig(task: MCPServerTaskDefinition, warnings: string[]): Record<string, unknown>;
@@ -0,0 +1,104 @@
1
+ /**
2
+ * MCP server provider assembly — builds Promptfoo provider configs.
3
+ */
4
+ // ---------------------------------------------------------------------------
5
+ // Constants
6
+ // ---------------------------------------------------------------------------
7
+ /** Default max tool rounds for MCP multi-turn execution */
8
+ export const DEFAULT_MAX_TOOL_ROUNDS = 5;
9
+ /** Provider path relative to eval package dist */
10
+ export const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
11
+ // ---------------------------------------------------------------------------
12
+ // Provider assembly
13
+ // ---------------------------------------------------------------------------
14
/**
 * Build the custom MCP tool provider entries, one per model.
 *
 * Every entry uses `MCP_PROVIDER_PATH` as its id and a config of the shape
 * `{ model, mcpServer, mcpTools?, maxToolRounds, ...modelConfig }`.
 * Because the per-model config is spread last, its keys take precedence
 * over the ones assembled here.
 *
 * Model selection, in order:
 *   1. `task.models` when it is a non-empty list (ids double as labels);
 *   2. the `models` argument (registry entries with id/label/config);
 *   3. a single Claude Sonnet 4 entry, with a warning, when both are empty.
 *
 * @param task - MCP server task definition.
 * @param models - Registry model entries for this mode.
 * @param warnings - Mutable list that warnings are appended to.
 * @returns One provider entry per selected model.
 */
export function buildMCPProvider(task, models, warnings) {
    const mcpServer = buildMCPServerConfig(task, warnings);
    const mcpTools = task.capabilities ?? undefined;
    const maxToolRounds = task.maxToolRounds ?? DEFAULT_MAX_TOOL_ROUNDS;
    const toProvider = (modelId, label, modelConfig) => ({
        id: MCP_PROVIDER_PATH,
        label: `${label} + MCP`,
        config: {
            model: modelId,
            mcpServer,
            ...(mcpTools ? { mcpTools } : {}),
            maxToolRounds,
            ...(modelConfig ?? {}),
        },
    });
    // A task-level model list overrides whatever the registry supplied.
    if (task.models?.length > 0) {
        return task.models.map((modelId) => toProvider(modelId, modelId));
    }
    if (models.length > 0) {
        return models.map(({ id, label, config }) => toProvider(id, label, config));
    }
    // Nothing configured anywhere: warn and fall back to one default model.
    warnings.push(`MCP task "${task.id}": no models available. Add "mcp-server" to a ` +
        "model's modes array in config/models.ts, or set models on the task.");
    return [toProvider("anthropic:messages:claude-sonnet-4-20250514", "Claude Sonnet 4")];
}
59
/**
 * Build the MCP server connection config handed to the custom provider.
 *
 * Result shape: `{ name, command? | url?, auth? }`.
 *
 * - Without `task.serverConfig`, a warning is recorded and a placeholder
 *   `{ name }` is returned.
 * - `transport === "stdio"` copies `command`; any other transport copies `url`.
 * - An explicit `serverConfig.auth` is passed through unchanged. Otherwise
 *   the first `serverConfig.env` key matching /token|auth|key/i is used to
 *   derive bearer auth: its value must be an environment variable name,
 *   either bare (`NAME`) or wrapped (`$env(NAME)`), and is emitted as the
 *   template `{{env.NAME}}`.
 *
 * The raw env value is never copied into a warning, since it may be a
 * literal secret rather than a variable name.
 *
 * @param task - MCP server task definition.
 * @param warnings - Mutable list that warnings are appended to.
 * @returns The connection config object.
 */
export function buildMCPServerConfig(task, warnings) {
    const config = task.serverConfig;
    if (!config) {
        warnings.push(`MCP task "${task.id}": no serverConfig — using placeholder. ` +
            "Set serverConfig.command or serverConfig.url to point to your MCP server.");
        return { name: task.id };
    }
    const serverConfig = { name: task.id };
    if (config.transport === "stdio") {
        serverConfig.command = config.command;
    }
    else {
        serverConfig.url = config.url;
    }
    if (config.auth) {
        serverConfig.auth = config.auth;
        return serverConfig;
    }
    if (!config.env) {
        return serverConfig;
    }
    const tokenKey = Object.keys(config.env).find((k) => /token|auth|key/i.test(k));
    if (!tokenKey) {
        return serverConfig;
    }
    const val = config.env[tokenKey];
    // Non-string values cannot name an env var; treat them as invalid
    // instead of throwing on `startsWith`.
    let envVar = "";
    if (typeof val === "string") {
        const wrapped = val.startsWith("$env(") && val.endsWith(")");
        envVar = wrapped ? val.slice(5, -1) : val;
    }
    if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(envVar)) {
        // Name the offending key only; echoing the value could leak a token.
        warnings.push(`MCP task "${task.id}": serverConfig.env.${tokenKey} does not name a valid ` +
            "environment variable (expected NAME or $env(NAME)) — skipping auth config");
        return serverConfig;
    }
    serverConfig.auth = {
        type: "bearer",
        token: `{{env.${envVar}}}`,
    };
    return serverConfig;
}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Shared types for the MCP server mode handler.
3
+ */
4
+ import type { ModeProviderEntry } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
6
+ /** Options for compiling an MCP server task */
7
+ export interface MCPCompileOptions {
8
+ /** Grader provider for LLM-graded assertions */
9
+ graderProvider?: string;
10
+ /** Model providers to evaluate with (from registry, filtered by mcp-server mode) */
11
+ models?: ModeProviderEntry[];
12
+ }
13
+ /** Result of compiling a single MCP task */
14
+ export interface MCPCompileResult {
15
+ /** Promptfoo provider config for the MCP server */
16
+ providers: PromptfooProvider[];
17
+ /** Compiled test cases */
18
+ tests: PromptfooTestCase[];
19
+ /** Prompts for MCP evaluation */
20
+ prompts: PromptfooPrompt[];
21
+ /** Warnings generated during compilation */
22
+ warnings: string[];
23
+ }
24
+ /** Validation errors for MCP task definitions */
25
+ export interface MCPValidationError {
26
+ field: string;
27
+ message: string;
28
+ }
29
+ /** Context for building MCP assertions */
30
+ export interface MCPAssertionContext {
31
+ /** Task ID (for error messages) */
32
+ taskId: string;
33
+ /** Expected server capabilities */
34
+ capabilities: string[];
35
+ /** Grader provider for LLM-graded assertions */
36
+ graderProvider?: string;
37
+ }
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Shared types for the MCP server mode handler.
3
+ */
4
+ export {};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Validation for MCP server task definitions.
3
+ */
4
+ import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { MCPValidationError } from "./types.js";
6
+ /**
7
+ * Validate that an MCP task definition has all required fields.
8
+ */
9
+ export declare function validateMCPTask(task: MCPServerTaskDefinition): MCPValidationError[];
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Validation for MCP server task definitions.
3
+ */
4
/**
 * Check an MCP task definition for missing required fields.
 *
 * Checks performed:
 *   - `id` and `title` must be truthy;
 *   - when `serverConfig` is present, `stdio` transport needs `command`,
 *     while `sse` / `streamable-http` transports need `url`;
 *   - every `tool-called` assertion needs a truthy `value`.
 *
 * @param task - MCP server task definition to validate.
 * @returns A list of `{ field, message }` errors; empty when none are found.
 */
export function validateMCPTask(task) {
    const errors = [];
    const report = (field, message) => {
        errors.push({ field, message });
    };
    if (!task.id) {
        report("id", "Task ID is required");
    }
    if (!task.title) {
        report("title", "Task title is required");
    }
    if (task.serverConfig) {
        const { transport, command, url } = task.serverConfig;
        const usesUrl = transport === "sse" || transport === "streamable-http";
        if (transport === "stdio" && !command) {
            report("serverConfig.command", "Server command is required for stdio transport (e.g., 'node dist/server.js')");
        }
        if (usesUrl && !url) {
            report("serverConfig.url", `Server URL is required for ${transport} transport`);
        }
    }
    if (task.assertions) {
        for (const assertion of task.assertions) {
            if (assertion.type !== "tool-called") {
                continue;
            }
            // The tool name lives in `value`; absent or falsy means unusable.
            const hasToolName = "value" in assertion && assertion.value;
            if (!hasToolName) {
                report("assertions", 'tool-called assertion requires a "value" specifying the tool name');
            }
        }
    }
    return errors;
}
@@ -0,0 +1,33 @@
1
+ /**
2
+ * MCPToolProvider — Custom Promptfoo provider for MCP tool-use evaluation.
3
+ *
4
+ * Orchestrates the MCP evaluation flow:
5
+ * 1. Connects to the MCP server and discovers available tools
6
+ * 2. Selects the appropriate LLM backend based on model ID prefix
7
+ * 3. Delegates the multi-turn tool loop to the backend
8
+ * 4. Formats the result for Promptfoo (including tool call summary)
9
+ *
10
+ * Promptfoo config usage:
11
+ *
12
+ * providers:
13
+ * - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js
14
+ * label: "Claude Opus 4.6 + MCP"
15
+ * config:
16
+ * model: anthropic:messages:claude-opus-4-6
17
+ * maxToolRounds: 5
18
+ * temperature: 0.2
19
+ * max_tokens: 4096
20
+ * mcpServer:
21
+ * url: https://mcp.sanity.io
22
+ * auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
23
+ * name: mcp-live-query-documents
24
+ * mcpTools: [query_documents, get_schema]
25
+ */
26
+ import type { CallApiContextParams, ProviderOptions, ProviderResponse } from "./types.js";
27
+ export default class MCPToolProvider {
28
+ config: Record<string, unknown>;
29
+ private providerId;
30
+ constructor(options?: ProviderOptions);
31
+ id(): string;
32
+ callApi(prompt: string, _context?: CallApiContextParams): Promise<ProviderResponse>;
33
+ }
@@ -0,0 +1,174 @@
1
+ /**
2
+ * MCPToolProvider — Custom Promptfoo provider for MCP tool-use evaluation.
3
+ *
4
+ * Orchestrates the MCP evaluation flow:
5
+ * 1. Connects to the MCP server and discovers available tools
6
+ * 2. Selects the appropriate LLM backend based on model ID prefix
7
+ * 3. Delegates the multi-turn tool loop to the backend
8
+ * 4. Formats the result for Promptfoo (including tool call summary)
9
+ *
10
+ * Promptfoo config usage:
11
+ *
12
+ * providers:
13
+ * - id: file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js
14
+ * label: "Claude Opus 4.6 + MCP"
15
+ * config:
16
+ * model: anthropic:messages:claude-opus-4-6
17
+ * maxToolRounds: 5
18
+ * temperature: 0.2
19
+ * max_tokens: 4096
20
+ * mcpServer:
21
+ * url: https://mcp.sanity.io
22
+ * auth: { type: bearer, token: "{{env.SANITY_API_TOKEN}}" }
23
+ * name: mcp-live-query-documents
24
+ * mcpTools: [query_documents, get_schema]
25
+ */
26
import { fileURLToPath } from "node:url";
import { config as loadDotenv } from "dotenv";
import { connectMCP } from "./mcp-connection.js";
import { runAnthropicToolLoop } from "./tool-loop-anthropic.js";
import { runOpenAIToolLoop } from "./tool-loop-openai.js";
30
// Load the .env file located five directories above this module. Values from
// the file take precedence over variables already set in the process
// environment (`override: true`).
loadDotenv({
    override: true,
    // fileURLToPath() decodes percent-escapes (e.g. "%20") and produces a valid
    // native path on Windows; URL#pathname does neither.
    path: fileURLToPath(new URL("../../../../../.env", import.meta.url)),
});
34
+ // ---------------------------------------------------------------------------
35
+ // Backend registry — maps model ID prefixes to tool loop implementations
36
+ // ---------------------------------------------------------------------------
37
// Null-prototype dictionary: only the prefixes listed here resolve. A plain
// object literal would also answer to inherited keys such as "constructor"
// or "toString", letting an unsupported model prefix look like a backend.
const BACKENDS = Object.assign(Object.create(null), {
    anthropic: runAnthropicToolLoop,
    openai: runOpenAIToolLoop,
});
41
/**
 * Resolve the LLM backend from a model ID.
 *
 * Model IDs follow the pattern `provider:type:model-name` (e.g.,
 * `anthropic:messages:claude-opus-4-6`). The first segment determines
 * which backend handles the tool loop. Everything after the second colon is
 * returned as the model name; for IDs with fewer than three segments the last
 * segment is used instead.
 *
 * @param {string} modelId - Full model ID.
 * @returns {{ backend: Function, modelName: string }} The tool-loop
 *   implementation and the bare model name to pass to it.
 * @throws {Error} When the prefix does not match a registered backend.
 */
function resolveBackend(modelId) {
    const parts = modelId.split(":");
    const prefix = parts[0];
    // Own-property check: a bare `BACKENDS[prefix]` lookup would treat inherited
    // keys such as "constructor" or "toString" as registered backends.
    const backend = Object.hasOwn(BACKENDS, prefix) ? BACKENDS[prefix] : undefined;
    if (!backend) {
        const supported = Object.keys(BACKENDS).join(", ");
        throw new Error(`No backend for model "${modelId}". Supported prefixes: ${supported}`);
    }
    // Extract the model name for the API (e.g., "claude-opus-4-6" from
    // "anthropic:messages:claude-opus-4-6"). Re-joining keeps model names that
    // themselves contain ":" intact.
    const modelName = parts.length > 2 ? parts.slice(2).join(":") : parts[parts.length - 1];
    return { backend, modelName };
}
60
+ // ---------------------------------------------------------------------------
61
+ // Helpers
62
+ // ---------------------------------------------------------------------------
63
/**
 * Append a machine-readable tool call summary for assertion detection.
 *
 * The summary is an HTML comment of the form
 * `<!-- MCP_TOOLS_CALLED: ["tool_a","tool_b"] -->` listing the `name` of every
 * entry in the log, in order.
 *
 * @param {string} text - Model output to annotate.
 * @param {Array<{ name: string }>} [log] - Tool calls made during the run.
 * @returns {string} `text` unchanged when no tool calls were logged, otherwise
 *   `text` followed by a blank line and the summary comment.
 */
function appendToolSummary(text, log) {
    // A missing or non-array log is treated as "no tools called" so that a
    // backend omitting the log cannot fail an otherwise successful run.
    if (!Array.isArray(log) || log.length === 0) {
        return text;
    }
    const names = JSON.stringify(log.map((tc) => tc.name));
    return `${text}\n\n<!-- MCP_TOOLS_CALLED: ${names} -->`;
}
70
/**
 * Resolve the API key for a given model prefix.
 *
 * An explicit `config.apiKey` wins and is returned as a string. Otherwise the
 * key is read from the environment variable associated with the prefix.
 *
 * @param {string} prefix - First segment of the model ID (e.g. "anthropic").
 * @param {{ apiKey?: unknown }} config - Provider configuration.
 * @returns {string | undefined} The key, or `undefined` when the prefix has no
 *   associated environment variable or that variable is unset.
 */
function resolveApiKey(prefix, config) {
    if (config.apiKey) {
        return String(config.apiKey);
    }
    let variableName;
    switch (prefix) {
        case "anthropic":
            variableName = "ANTHROPIC_API_KEY";
            break;
        case "openai":
            variableName = "OPENAI_API_KEY";
            break;
        default:
            return undefined;
    }
    return process.env[variableName];
}
81
+ // ---------------------------------------------------------------------------
82
+ // Provider class
83
+ // ---------------------------------------------------------------------------
84
/**
 * Promptfoo provider that answers a prompt by running an LLM tool loop against
 * the tools exposed by an MCP server.
 */
export default class MCPToolProvider {
    /** Provider configuration (`model`, `mcpServer`, `mcpTools`, ...). */
    config;
    /** Identifier returned by {@link MCPToolProvider#id}. */
    providerId;
    /**
     * @param {{ config?: object, id?: string }} [options] - Provider options.
     *   `config` defaults to `{}` and `id` to "mcp-tool-provider".
     */
    constructor(options = {}) {
        this.config = options.config || {};
        this.providerId = options.id || "mcp-tool-provider";
    }
    /** @returns {string} The provider identifier. */
    id() {
        return this.providerId;
    }
    /**
     * Run one evaluation: connect to the MCP server, select the tools, and
     * delegate the multi-turn tool loop to the backend chosen by the model ID.
     *
     * Setup problems (missing `mcpServer` config, unsupported model prefix,
     * missing API key, connection failure, no tools left after filtering) are
     * reported as `{ error, output: undefined }` rather than thrown. Errors
     * raised by the backend tool loop itself propagate to the caller after the
     * MCP connection has been cleaned up.
     *
     * @param {string} prompt - Prompt passed to the backend.
     * @param {object} [_context] - Call context; not used.
     * @returns {Promise<object>} Response with `output`, `tokenUsage`,
     *   `metadata` and `cost` (always 0), or an error response.
     */
    async callApi(prompt, _context) {
        const mcpServerConfig = this.config.mcpServer;
        if (!mcpServerConfig) {
            return { error: "mcpServer config is required", output: undefined };
        }
        // Resolve model and backend
        const modelId = this.config.model || "anthropic:messages:claude-opus-4-6";
        let backend;
        let modelName;
        try {
            const resolved = resolveBackend(modelId);
            backend = resolved.backend;
            modelName = resolved.modelName;
        }
        catch (err) {
            return {
                error: err instanceof Error ? err.message : String(err),
                output: undefined,
            };
        }
        // Resolve API key
        const prefix = modelId.split(":")[0];
        const apiKey = resolveApiKey(prefix, this.config);
        if (!apiKey) {
            return {
                error: `API key not found for ${prefix}. Set ${prefix.toUpperCase()}_API_KEY in env or config.apiKey.`,
                output: undefined,
            };
        }
        // Connect to MCP server
        let mcpClient;
        try {
            mcpClient = await connectMCP(mcpServerConfig);
        }
        catch (err) {
            return {
                error: `Failed to connect to MCP server: ${err instanceof Error ? err.message : String(err)}`,
                output: undefined,
            };
        }
        try {
            // Restrict the server's tools to the configured allow-list, if any
            const allTools = mcpClient.getAllTools();
            const toolFilter = this.config.mcpTools;
            const tools = toolFilter
                ? allTools.filter((t) => toolFilter.includes(t.name))
                : allTools;
            if (tools.length === 0) {
                return {
                    error: "No MCP tools available after filtering. Check mcpTools config and server capabilities.",
                    output: undefined,
                };
            }
            // Run the tool loop
            const result = await backend({
                prompt,
                tools,
                // Wrap instead of passing the method reference so `this` inside
                // callTool stays bound to the MCP client.
                callTool: (...args) => mcpClient.callTool(...args),
                maxToolRounds: this.config.maxToolRounds || 5,
                model: modelName,
                temperature: this.config.temperature ?? 0.2,
                maxTokens: this.config.max_tokens || 4096,
                apiKey,
            });
            return {
                cost: 0,
                metadata: {
                    toolRounds: result.toolRounds,
                    toolCallLog: result.toolCallLog,
                    exhaustedRounds: result.exhaustedRounds,
                    latencyMs: result.latencyMs,
                },
                output: appendToolSummary(result.output, result.toolCallLog),
                tokenUsage: result.tokenUsage,
            };
        }
        finally {
            // Best-effort teardown: a cleanup failure (thrown synchronously or
            // via a rejected promise) must not replace the result or mask an
            // error from the tool loop.
            try {
                await mcpClient.cleanup();
            }
            catch {
                // Intentionally ignored.
            }
        }
    }
}