@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -1,334 +0,0 @@
1
- /**
2
- * MCP-specific assertion types — ergonomic assertions for MCP server testing.
3
- *
4
- * Each assertion type compiles down to a Promptfoo `javascript` assertion
5
- * with the appropriate validation logic. The developer writes:
6
- *
7
- * ```typescript
8
- * assertions: [
9
- * { type: "tool-called", value: "getDocument" },
10
- * { type: "tool-input-matches", value: { documentId: "doc-123" } },
11
- * { type: "tool-output-matches", value: { title: "Hello" } },
12
- * { type: "error-returned", value: { code: -32602 } },
13
- * ]
14
- * ```
15
- *
16
- * The compiler transforms these into Promptfoo-compatible `javascript`
17
- * assertions that inspect the tool call trace in the evaluation output.
18
- *
19
- * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
20
- */
21
- // ---------------------------------------------------------------------------
22
- // Public API
23
- // ---------------------------------------------------------------------------
24
- /**
25
- * Build MCP-specific assertions from task assertion definitions.
26
- *
27
- * Handles both MCP-specific types (tool-called, tool-input-matches, etc.)
28
- * and standard assertion types (contains, llm-rubric, etc.) which are
29
- * passed through unchanged.
30
- */
31
- export function buildMCPAssertions(assertions, context) {
32
- const result = [];
33
- const warnings = [];
34
- for (const assertion of assertions) {
35
- const mapped = mapMCPAssertion(assertion, context, warnings);
36
- if (mapped) {
37
- result.push(mapped);
38
- }
39
- }
40
- return { assertions: result, warnings };
41
- }
42
- // ---------------------------------------------------------------------------
43
- // Assertion mapping
44
- // ---------------------------------------------------------------------------
45
- function mapMCPAssertion(assertion, context, warnings) {
46
- switch (assertion.type) {
47
- case "tool-called":
48
- return buildToolCalledAssertion(assertion, context);
49
- case "tool-input-matches":
50
- return buildToolInputMatchesAssertion(assertion, context);
51
- case "tool-output-matches":
52
- return buildToolOutputMatchesAssertion(assertion, context);
53
- case "error-returned":
54
- return buildErrorReturnedAssertion(assertion, context);
55
- case "capability-available":
56
- return buildCapabilityAssertion(assertion, context);
57
- // Standard assertions — pass through
58
- case "contains":
59
- case "equals":
60
- case "regex":
61
- case "is-json":
62
- case "llm-rubric":
63
- case "javascript":
64
- case "python":
65
- return {
66
- type: assertion.type,
67
- ...("value" in assertion ? { value: assertion.value } : {}),
68
- ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
69
- ...(assertion.type === "llm-rubric" && context.graderProvider
70
- ? { provider: context.graderProvider }
71
- : {}),
72
- };
73
- default:
74
- warnings.push(`MCP task "${context.taskId}": unknown assertion type "${assertion.type}" — passed through`);
75
- return {
76
- type: assertion.type,
77
- ...("value" in assertion ? { value: assertion.value } : {}),
78
- };
79
- }
80
- }
81
- // ---------------------------------------------------------------------------
82
- // tool-called — asserts the model called a specific tool by name
83
- // ---------------------------------------------------------------------------
84
- function buildToolCalledAssertion(assertion, _context) {
85
- const toolName = String(assertion.value ?? "");
86
- // Strategy: check multiple sources for tool call evidence.
87
- // 1. context.vars.__toolCalls (structured, if Promptfoo populates it)
88
- // 2. Response metadata toolCallLog (from custom mcp-tool-provider)
89
- // 3. Response output text (LLM+MCP providers embed tool_use JSON blocks)
90
- return {
91
- type: "javascript",
92
- value: buildJsAssertion(`tool-called: ${toolName}`, `
93
- var toolName = ${JSON.stringify(toolName)};
94
-
95
- // Strategy 1: structured tool calls from Promptfoo
96
- var toolCalls = context.vars.__toolCalls || [];
97
- if (Array.isArray(toolCalls) && toolCalls.length > 0) {
98
- var called = toolCalls.some(function(tc) { return tc.name === toolName; });
99
- return {
100
- pass: called,
101
- score: called ? 1 : 0,
102
- reason: called
103
- ? 'Tool "' + toolName + '" was called (via __toolCalls)'
104
- : 'Expected tool "' + toolName + '" but found: ' + toolCalls.map(function(tc) { return tc.name; }).join(', '),
105
- };
106
- }
107
-
108
- // Strategy 2: MCP_TOOLS_CALLED summary appended by custom mcp-tool-provider
109
- var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
110
- var summaryMatch = outputStr.match(/<!-- MCP_TOOLS_CALLED: (\\[.*?\\]) -->/);
111
- if (summaryMatch) {
112
- try {
113
- var calledTools = JSON.parse(summaryMatch[1]);
114
- var called = calledTools.includes(toolName);
115
- var count = calledTools.filter(function(n) { return n === toolName; }).length;
116
- return {
117
- pass: called,
118
- score: called ? 1 : 0,
119
- reason: called
120
- ? 'Tool "' + toolName + '" was called ' + count + ' time(s)'
121
- : 'Expected tool "' + toolName + '" but found: ' + calledTools.join(', '),
122
- };
123
- } catch (e) { /* fall through to Strategy 3 */ }
124
- }
125
-
126
- // Strategy 3: parse output for tool_use blocks (built-in provider fallback)
127
- var outputStr = typeof output === 'string' ? output : JSON.stringify(output || '');
128
- var toolUsePattern = /"type"\\s*:\\s*"tool_use"[^}]*"name"\\s*:\\s*"([^"]+)"/g;
129
- var foundTools = [];
130
- var match;
131
- while ((match = toolUsePattern.exec(outputStr)) !== null) {
132
- foundTools.push(match[1]);
133
- }
134
- var fnCallPattern = /"function"\\s*:\\s*\\{[^}]*"name"\\s*:\\s*"([^"]+)"/g;
135
- while ((match = fnCallPattern.exec(outputStr)) !== null) {
136
- foundTools.push(match[1]);
137
- }
138
- if (foundTools.length === 0 && outputStr.includes(toolName) && outputStr.includes('tool_use')) {
139
- foundTools.push(toolName);
140
- }
141
-
142
- var called = foundTools.includes(toolName);
143
- return {
144
- pass: called,
145
- score: called ? 1 : 0,
146
- reason: called
147
- ? 'Tool "' + toolName + '" was called (detected in output)'
148
- : 'Expected tool "' + toolName + '" to be called. ' +
149
- (foundTools.length > 0
150
- ? 'Tools found in output: ' + foundTools.join(', ')
151
- : 'No tool calls detected in output'),
152
- };`),
153
- ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
154
- };
155
- }
156
- // ---------------------------------------------------------------------------
157
- // tool-input-matches — asserts tool call inputs match a schema/value
158
- // ---------------------------------------------------------------------------
159
- function buildToolInputMatchesAssertion(assertion, _context) {
160
- const expected = assertion.value;
161
- const toolName = assertion.toolName ?? assertion.tool;
162
- return {
163
- type: "javascript",
164
- value: buildJsAssertion(`tool-input-matches${toolName ? `: ${toolName}` : ""}`, `
165
- const toolCalls = context.vars.__toolCalls || [];
166
- const expected = ${JSON.stringify(expected)};
167
- const toolFilter = ${JSON.stringify(toolName ?? null)};
168
-
169
- const targetCalls = toolFilter
170
- ? toolCalls.filter(tc => tc.name === toolFilter)
171
- : toolCalls;
172
-
173
- if (targetCalls.length === 0) {
174
- return {
175
- pass: false,
176
- score: 0,
177
- reason: toolFilter
178
- ? 'No calls to tool "' + toolFilter + '" found'
179
- : 'No tool calls found',
180
- };
181
- }
182
-
183
- // Check if any call's input matches the expected value
184
- const match = targetCalls.some(tc => {
185
- const input = tc.input || tc.arguments || {};
186
- return Object.entries(expected).every(([k, v]) =>
187
- JSON.stringify(input[k]) === JSON.stringify(v)
188
- );
189
- });
190
-
191
- return {
192
- pass: match,
193
- score: match ? 1 : 0,
194
- reason: match
195
- ? 'Tool input matches expected values'
196
- : 'Tool input does not match. Expected: ' + JSON.stringify(expected) +
197
- ', Got: ' + JSON.stringify(targetCalls.map(tc => tc.input || tc.arguments)),
198
- };`),
199
- ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
200
- };
201
- }
202
- // ---------------------------------------------------------------------------
203
- // tool-output-matches — asserts tool outputs match expected shape/values
204
- // ---------------------------------------------------------------------------
205
- function buildToolOutputMatchesAssertion(assertion, _context) {
206
- const expected = assertion.value;
207
- const toolName = assertion.toolName ?? assertion.tool;
208
- return {
209
- type: "javascript",
210
- value: buildJsAssertion(`tool-output-matches${toolName ? `: ${toolName}` : ""}`, `
211
- const toolCalls = context.vars.__toolCalls || [];
212
- const expected = ${JSON.stringify(expected)};
213
- const toolFilter = ${JSON.stringify(toolName ?? null)};
214
-
215
- const targetCalls = toolFilter
216
- ? toolCalls.filter(tc => tc.name === toolFilter)
217
- : toolCalls;
218
-
219
- if (targetCalls.length === 0) {
220
- return {
221
- pass: false,
222
- score: 0,
223
- reason: toolFilter
224
- ? 'No calls to tool "' + toolFilter + '" found'
225
- : 'No tool calls found',
226
- };
227
- }
228
-
229
- const match = targetCalls.some(tc => {
230
- const output = tc.output || tc.result || {};
231
- return Object.entries(expected).every(([k, v]) =>
232
- JSON.stringify(output[k]) === JSON.stringify(v)
233
- );
234
- });
235
-
236
- return {
237
- pass: match,
238
- score: match ? 1 : 0,
239
- reason: match
240
- ? 'Tool output matches expected values'
241
- : 'Tool output does not match. Expected: ' + JSON.stringify(expected),
242
- };`),
243
- ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
244
- };
245
- }
246
- // ---------------------------------------------------------------------------
247
- // error-returned — asserts the server returned a specific error
248
- // ---------------------------------------------------------------------------
249
- function buildErrorReturnedAssertion(assertion, _context) {
250
- const expected = assertion.value;
251
- return {
252
- type: "javascript",
253
- value: buildJsAssertion("error-returned", `
254
- const toolCalls = context.vars.__toolCalls || [];
255
- const expected = ${JSON.stringify(expected ?? {})};
256
-
257
- const errorCall = toolCalls.find(tc => tc.error);
258
- if (!errorCall) {
259
- return {
260
- pass: false,
261
- score: 0,
262
- reason: 'Expected an error response but no errors were returned',
263
- };
264
- }
265
-
266
- const error = errorCall.error;
267
- let pass = true;
268
- const reasons = [];
269
-
270
- if (expected.code !== undefined && error.code !== expected.code) {
271
- pass = false;
272
- reasons.push('Expected error code ' + expected.code + ', got ' + error.code);
273
- }
274
-
275
- if (expected.message !== undefined) {
276
- const msgMatch = typeof error.message === 'string' &&
277
- error.message.includes(expected.message);
278
- if (!msgMatch) {
279
- pass = false;
280
- reasons.push('Expected error message containing "' + expected.message +
281
- '", got "' + (error.message || '') + '"');
282
- }
283
- }
284
-
285
- if (pass) {
286
- reasons.push('Error matches expected pattern');
287
- }
288
-
289
- return {
290
- pass,
291
- score: pass ? 1 : 0,
292
- reason: reasons.join('; '),
293
- };`),
294
- ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
295
- };
296
- }
297
- // ---------------------------------------------------------------------------
298
- // capability-available — asserts the server advertises a capability
299
- // ---------------------------------------------------------------------------
300
- function buildCapabilityAssertion(assertion, _context) {
301
- const capability = String(assertion.value ?? "");
302
- return {
303
- type: "javascript",
304
- value: buildJsAssertion(`capability-available: ${capability}`, `
305
- const capabilities = context.vars.__serverCapabilities || [];
306
- const expected = ${JSON.stringify(capability)};
307
- const available = capabilities.includes(expected);
308
-
309
- return {
310
- pass: available,
311
- score: available ? 1 : 0,
312
- reason: available
313
- ? 'Server advertises capability "' + expected + '"'
314
- : 'Server does not advertise capability "' + expected + '". ' +
315
- 'Available: ' + (capabilities.join(', ') || 'none'),
316
- };`),
317
- ...(assertion.weight !== undefined ? { weight: assertion.weight } : {}),
318
- };
319
- }
320
- // ---------------------------------------------------------------------------
321
- // Helpers
322
- // ---------------------------------------------------------------------------
323
- /**
324
- * Build a Promptfoo-compatible JavaScript assertion string.
325
- *
326
- * Wraps the assertion body in a function that receives `output` and `context`
327
- * from Promptfoo's assertion runner.
328
- */
329
- function buildJsAssertion(label, body) {
330
- // No IIFE wrapper — Promptfoo wraps the assertion in its own function via
331
- // new Function('output', 'context', ...). The body must use `return` at
332
- // the top level for the result to reach Promptfoo's validator.
333
- return `// MCP assertion: ${label}\n${body.trim()}`;
334
- }
@@ -1,69 +0,0 @@
1
- /**
2
- * MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
3
- *
4
- * This is the first non-literacy mode handler, proving the compiler
5
- * architecture works end-to-end. It translates MCP server task definitions
6
- * into Promptfoo configuration with:
7
- *
8
- * - An MCP provider that wraps the server under test
9
- * - Tool-call assertions compiled to Promptfoo `javascript` assertions
10
- * - Server lifecycle management via Promptfoo provider hooks
11
- * - Multi-turn conversation support via Promptfoo's `steps` syntax
12
- *
13
- * Promptfoo supports MCP servers as providers natively:
14
- * ```yaml
15
- * providers:
16
- * - id: mcp:./my-server
17
- * config:
18
- * command: node
19
- * args: [./dist/server.js]
20
- * env: { API_KEY: "..." }
21
- * ```
22
- *
23
- * This handler assembles that config from AILF's `MCPServerTaskDefinition`.
24
- *
25
- * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
26
- * @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
27
- * @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
28
- */
29
- import type { MCPServerTaskDefinition, ModeHandler, ModeProviderEntry, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
30
- import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
31
- export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
32
- /** Options for compiling an MCP server task */
33
- export interface MCPCompileOptions {
34
- /** Grader provider for LLM-graded assertions */
35
- graderProvider?: string;
36
- /** Model providers to evaluate with (from registry, filtered by mcp-server mode) */
37
- models?: ModeProviderEntry[];
38
- }
39
- /** Result of compiling a single MCP task */
40
- export interface MCPCompileResult {
41
- /** Promptfoo provider config for the MCP server */
42
- providers: PromptfooProvider[];
43
- /** Compiled test cases */
44
- tests: PromptfooTestCase[];
45
- /** Prompts for MCP evaluation */
46
- prompts: PromptfooPrompt[];
47
- /** Warnings generated during compilation */
48
- warnings: string[];
49
- }
50
- /** Validation errors for MCP task definitions */
51
- export interface MCPValidationError {
52
- field: string;
53
- message: string;
54
- }
55
- /**
56
- * Validate that an MCP task definition has all required fields.
57
- */
58
- export declare function validateMCPTask(task: MCPServerTaskDefinition): MCPValidationError[];
59
- /**
60
- * Compile an MCP server task definition into Promptfoo configuration.
61
- *
62
- * This is the core of the MCP mode handler. It produces:
63
- * 1. A provider config pointing to the MCP server
64
- * 2. Test cases with tool-call assertions
65
- * 3. Appropriate prompts for the evaluation
66
- */
67
- export declare function compileMCPTask(task: MCPServerTaskDefinition, options?: MCPCompileOptions): MCPCompileResult;
68
- /** ModeHandler-conformant export for the mcp-server evaluation mode. */
69
- export declare const handler: ModeHandler;