@sanity/ailf 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -1
- package/config/models.ts +15 -3
- package/dist/_vendor/ailf-core/config-helpers.d.ts +14 -17
- package/dist/_vendor/ailf-core/config-helpers.js +22 -2
- package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
- package/dist/_vendor/ailf-core/examples/index.js +25 -0
- package/dist/_vendor/ailf-core/index.d.ts +2 -2
- package/dist/_vendor/ailf-core/index.js +1 -1
- package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +2 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +13 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +1 -3
- package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +78 -23
- package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/config-sources/ts-config-loader.js +21 -13
- package/dist/adapters/task-sources/content-lake-task-source.js +17 -20
- package/dist/adapters/task-sources/index.d.ts +2 -2
- package/dist/adapters/task-sources/index.js +2 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
- package/dist/adapters/task-sources/repo-schemas.js +227 -19
- package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
- package/dist/adapters/task-sources/repo-task-source.js +81 -122
- package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
- package/dist/adapters/task-sources/repo-validation.js +126 -5
- package/dist/adapters/task-sources/task-file-loader.d.ts +2 -2
- package/dist/adapters/task-sources/task-file-loader.js +2 -2
- package/dist/commands/coverage-audit.js +3 -1
- package/dist/commands/init.d.ts +6 -4
- package/dist/commands/init.js +302 -23
- package/dist/commands/validate-tasks.d.ts +2 -2
- package/dist/commands/validate-tasks.js +26 -15
- package/dist/composition-root.d.ts +13 -1
- package/dist/composition-root.js +73 -41
- package/dist/index.d.ts +41 -0
- package/dist/index.js +48 -0
- package/dist/orchestration/build-step-sequence.js +4 -2
- package/dist/orchestration/steps/fetch-docs-step.js +2 -3
- package/dist/orchestration/steps/generate-configs-step.js +28 -12
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
- package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +105 -68
- package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +1 -1
- package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
- package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
- package/dist/pipeline/compiler/mode-bases/index.js +4 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
- package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
- package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
- package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
- package/dist/pipeline/compiler/mode-handlers/{agent-harness-handler.d.ts → agent-harness/types.d.ts} +3 -24
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/index.d.ts +4 -5
- package/dist/pipeline/compiler/mode-handlers/index.js +4 -6
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts} +2 -10
- package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.js → mcp-server/assertions.js} +63 -6
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
- package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
- package/dist/pipeline/compiler/preset-loader.js +99 -0
- package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +6 -9
- package/dist/pipeline/compiler/presets/sanity-literacy.js +10 -156
- package/dist/pipeline/expand-tasks.d.ts +2 -2
- package/dist/pipeline/expand-tasks.js +2 -2
- package/dist/pipeline/generate-configs.js +1 -1
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/dist/pipeline/mirror-repo-tasks.d.ts +7 -7
- package/dist/pipeline/mirror-repo-tasks.js +9 -9
- package/dist/pipeline/plan.js +1 -1
- package/package.json +11 -3
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -67
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -309
|
@@ -1,379 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* LiteracyModeHandler — compilation rules for `literacy` mode.
|
|
3
|
-
*
|
|
4
|
-
* This handler replaces the existing `generate-configs.ts` + `expand-tasks.ts`
|
|
5
|
-
* code path for literacy (documentation) evaluation. It compiles
|
|
6
|
-
* LiteracyTaskDefinition objects into Promptfoo structure:
|
|
7
|
-
*
|
|
8
|
-
* - Gold entry (with-docs prompt, canonical docs injected)
|
|
9
|
-
* - Baseline entry (without-docs prompt, empty docs)
|
|
10
|
-
* - Rubric template resolution from config/rubrics
|
|
11
|
-
* - Doc-coverage auto-generation when opted in
|
|
12
|
-
* - Structured dimension metadata on rubric assertions
|
|
13
|
-
*
|
|
14
|
-
* The handler accepts GeneralizedTaskDefinition, narrows to
|
|
15
|
-
* LiteracyTaskDefinition, and produces Promptfoo output.
|
|
16
|
-
*
|
|
17
|
-
* @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
|
|
18
|
-
* @see packages/eval/src/pipeline/expand-tasks.ts — the legacy code path
|
|
19
|
-
*/
|
|
20
|
-
import { LiteracyVariant, } from "../../normalize-mode.js";
|
|
21
|
-
// ---------------------------------------------------------------------------
|
|
22
|
-
// Canonical literacy prompt templates
|
|
23
|
-
// ---------------------------------------------------------------------------
|
|
24
|
-
// These are the source-of-truth templates for literacy-mode evaluations.
|
|
25
|
-
// Previously lived in config/prompts.ts as global templates; now handler-owned
|
|
26
|
-
// so non-literacy modes can define their own prompts without collision.
|
|
27
|
-
export const LITERACY_PROMPT_TEMPLATES = {
|
|
28
|
-
"with-docs": {
|
|
29
|
-
id: "with-docs",
|
|
30
|
-
label: "With Documentation",
|
|
31
|
-
template: `You are an expert Sanity.io developer. Use the following documentation to help implement the task.
|
|
32
|
-
|
|
33
|
-
## Sanity Documentation
|
|
34
|
-
{{docs}}
|
|
35
|
-
|
|
36
|
-
## Task
|
|
37
|
-
{{task}}
|
|
38
|
-
|
|
39
|
-
## Requirements
|
|
40
|
-
|
|
41
|
-
1. Use ONLY the APIs and patterns shown in the documentation
|
|
42
|
-
2. Provide a complete, working implementation
|
|
43
|
-
3. Include all necessary imports
|
|
44
|
-
4. Follow Sanity best practices as documented
|
|
45
|
-
|
|
46
|
-
Provide your implementation:
|
|
47
|
-
`,
|
|
48
|
-
variables: ["docs", "task"],
|
|
49
|
-
},
|
|
50
|
-
"without-docs": {
|
|
51
|
-
id: "without-docs",
|
|
52
|
-
label: "Baseline (No Docs)",
|
|
53
|
-
template: `You are an expert Sanity.io developer.
|
|
54
|
-
|
|
55
|
-
## Task
|
|
56
|
-
{{task}}
|
|
57
|
-
|
|
58
|
-
## Requirements
|
|
59
|
-
|
|
60
|
-
1. Provide a complete, working implementation
|
|
61
|
-
2. Include all necessary imports
|
|
62
|
-
3. Follow Sanity best practices
|
|
63
|
-
|
|
64
|
-
Provide your implementation:
|
|
65
|
-
`,
|
|
66
|
-
variables: ["task"],
|
|
67
|
-
},
|
|
68
|
-
agentic: {
|
|
69
|
-
id: "agentic",
|
|
70
|
-
label: "Agentic (self-retrieval)",
|
|
71
|
-
template: `You are an expert developer helping implement a Sanity.io feature.
|
|
72
|
-
You have access to web search and page fetching tools.
|
|
73
|
-
|
|
74
|
-
IMPORTANT: Before writing any code, search for and read the relevant
|
|
75
|
-
Sanity.io documentation to ensure you are using the latest APIs and
|
|
76
|
-
best practices. Do not rely on memory alone.
|
|
77
|
-
|
|
78
|
-
## Task
|
|
79
|
-
{{task}}
|
|
80
|
-
|
|
81
|
-
## Requirements
|
|
82
|
-
|
|
83
|
-
1. Search for relevant Sanity documentation before implementing
|
|
84
|
-
2. Use ONLY the APIs and patterns from the current official docs
|
|
85
|
-
3. Provide a complete, working implementation
|
|
86
|
-
4. Include all necessary imports
|
|
87
|
-
5. Follow Sanity best practices as documented
|
|
88
|
-
|
|
89
|
-
Provide your implementation:
|
|
90
|
-
`,
|
|
91
|
-
variables: ["task"],
|
|
92
|
-
},
|
|
93
|
-
};
|
|
94
|
-
/**
|
|
95
|
-
* Validate a literacy task definition.
|
|
96
|
-
*/
|
|
97
|
-
export function validateLiteracyTask(task) {
|
|
98
|
-
const errors = [];
|
|
99
|
-
if (!task.id)
|
|
100
|
-
errors.push({ field: "id", message: "Task ID is required" });
|
|
101
|
-
if (!task.title) {
|
|
102
|
-
errors.push({
|
|
103
|
-
field: "title",
|
|
104
|
-
message: "Task title is required",
|
|
105
|
-
});
|
|
106
|
-
}
|
|
107
|
-
const promptText = task.prompt?.text ??
|
|
108
|
-
task.prompt?.template ??
|
|
109
|
-
task.prompt?.vars?.task ??
|
|
110
|
-
"";
|
|
111
|
-
if (!promptText) {
|
|
112
|
-
errors.push({
|
|
113
|
-
field: "prompt",
|
|
114
|
-
message: "Task prompt text is required",
|
|
115
|
-
});
|
|
116
|
-
}
|
|
117
|
-
return errors;
|
|
118
|
-
}
|
|
119
|
-
// ---------------------------------------------------------------------------
|
|
120
|
-
// Compilation
|
|
121
|
-
// ---------------------------------------------------------------------------
|
|
122
|
-
/**
|
|
123
|
-
* Compile a literacy task into Promptfoo configuration.
|
|
124
|
-
*
|
|
125
|
-
* Produces the same structure as the legacy expand-tasks.ts path:
|
|
126
|
-
* - Gold entry with with-docs prompt and canonical doc context
|
|
127
|
-
* - Baseline entry with without-docs prompt and empty docs
|
|
128
|
-
* - Rubric assertions with structured dimension metadata
|
|
129
|
-
*/
|
|
130
|
-
export function compileLiteracyTask(task, options) {
|
|
131
|
-
const warnings = [];
|
|
132
|
-
const evalMode = options?.evalMode ?? LiteracyVariant.STANDARD;
|
|
133
|
-
// Validation
|
|
134
|
-
for (const err of validateLiteracyTask(task)) {
|
|
135
|
-
warnings.push(`Literacy task "${task.id}": ${err.field} — ${err.message}`);
|
|
136
|
-
}
|
|
137
|
-
// Build providers from model list
|
|
138
|
-
const providers = buildProviders(options);
|
|
139
|
-
// Build prompts
|
|
140
|
-
const prompts = buildPrompts(evalMode);
|
|
141
|
-
// Build test cases (gold + baseline)
|
|
142
|
-
const tests = buildTestCases(task, evalMode, options, warnings);
|
|
143
|
-
return { providers, tests, prompts, warnings };
|
|
144
|
-
}
|
|
145
|
-
// ---------------------------------------------------------------------------
|
|
146
|
-
// Provider assembly
|
|
147
|
-
// ---------------------------------------------------------------------------
|
|
148
|
-
function buildProviders(options) {
|
|
149
|
-
if (options?.models && options.models.length > 0) {
|
|
150
|
-
return options.models.map((m) => ({
|
|
151
|
-
id: m.id,
|
|
152
|
-
label: m.label,
|
|
153
|
-
config: m.config,
|
|
154
|
-
}));
|
|
155
|
-
}
|
|
156
|
-
return [];
|
|
157
|
-
}
|
|
158
|
-
// ---------------------------------------------------------------------------
|
|
159
|
-
// Prompt assembly
|
|
160
|
-
// ---------------------------------------------------------------------------
|
|
161
|
-
/** Convert a PromptTemplate to the PromptfooPrompt shape used by compile results */
|
|
162
|
-
function templateToPromptfoo(pt) {
|
|
163
|
-
return { id: pt.id, label: pt.label, raw: pt.template };
|
|
164
|
-
}
|
|
165
|
-
function buildPrompts(evalMode) {
|
|
166
|
-
if (evalMode === "agentic") {
|
|
167
|
-
return [templateToPromptfoo(LITERACY_PROMPT_TEMPLATES["agentic"])];
|
|
168
|
-
}
|
|
169
|
-
// Baseline mode: with-docs + without-docs prompts
|
|
170
|
-
return [
|
|
171
|
-
templateToPromptfoo(LITERACY_PROMPT_TEMPLATES["with-docs"]),
|
|
172
|
-
templateToPromptfoo(LITERACY_PROMPT_TEMPLATES["without-docs"]),
|
|
173
|
-
];
|
|
174
|
-
}
|
|
175
|
-
// ---------------------------------------------------------------------------
|
|
176
|
-
// Test case assembly
|
|
177
|
-
// ---------------------------------------------------------------------------
|
|
178
|
-
function buildTestCases(task, evalMode, options, warnings) {
|
|
179
|
-
const tests = [];
|
|
180
|
-
// Extract fields from the LiteracyTaskDefinition shape
|
|
181
|
-
const promptText = task.prompt?.text ?? task.prompt?.template ?? "";
|
|
182
|
-
const contextDocs = task.context?.docs ?? [];
|
|
183
|
-
const taskArea = task.area ?? "";
|
|
184
|
-
const taskTitle = task.title;
|
|
185
|
-
const promptVars = task.prompt?.vars ?? {};
|
|
186
|
-
// Resolve doc path
|
|
187
|
-
const hasDocs = contextDocs.length > 0;
|
|
188
|
-
const docsVar = hasDocs ? `file://contexts/canonical/${task.id}.md` : "";
|
|
189
|
-
// Resolve assertions
|
|
190
|
-
const assertions = resolveAssertions(task, options, warnings);
|
|
191
|
-
// Gold entry — canonical docs injected
|
|
192
|
-
const goldVars = {
|
|
193
|
-
task: promptText,
|
|
194
|
-
docs: docsVar,
|
|
195
|
-
__featureArea: taskArea,
|
|
196
|
-
...promptVars,
|
|
197
|
-
};
|
|
198
|
-
tests.push({
|
|
199
|
-
description: `${taskTitle} (gold)`,
|
|
200
|
-
vars: goldVars,
|
|
201
|
-
...(evalMode === LiteracyVariant.STANDARD
|
|
202
|
-
? { prompts: ["with-docs"] }
|
|
203
|
-
: {}),
|
|
204
|
-
...(assertions.length > 0 ? { assert: assertions } : {}),
|
|
205
|
-
});
|
|
206
|
-
// Baseline entry — no docs (floor measurement)
|
|
207
|
-
// Skipped in agentic mode (the prompt doesn't use {{docs}})
|
|
208
|
-
if (evalMode !== "agentic") {
|
|
209
|
-
const baselineEnabled = task.baseline?.enabled !== false;
|
|
210
|
-
if (baselineEnabled) {
|
|
211
|
-
const baselineAssertions = buildBaselineAssertions(assertions, task.baseline?.rubric);
|
|
212
|
-
tests.push({
|
|
213
|
-
description: `${taskTitle} (baseline)`,
|
|
214
|
-
vars: {
|
|
215
|
-
task: promptText,
|
|
216
|
-
docs: "",
|
|
217
|
-
__featureArea: taskArea,
|
|
218
|
-
...promptVars,
|
|
219
|
-
},
|
|
220
|
-
prompts: ["without-docs"],
|
|
221
|
-
...(baselineAssertions.length > 0
|
|
222
|
-
? { assert: baselineAssertions }
|
|
223
|
-
: {}),
|
|
224
|
-
});
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
return tests;
|
|
228
|
-
}
|
|
229
|
-
// ---------------------------------------------------------------------------
|
|
230
|
-
// Assertion resolution
|
|
231
|
-
// ---------------------------------------------------------------------------
|
|
232
|
-
function resolveAssertions(task, options, warnings) {
|
|
233
|
-
const assertions = [];
|
|
234
|
-
for (const a of task.assertions ?? []) {
|
|
235
|
-
if (a.type === "llm-rubric" && "template" in a) {
|
|
236
|
-
// Templated assertion — resolve from rubric config
|
|
237
|
-
const resolved = resolveTemplatedAssertion(a, options?.rubricConfig, options?.graderProvider, warnings);
|
|
238
|
-
if (resolved)
|
|
239
|
-
assertions.push(resolved);
|
|
240
|
-
}
|
|
241
|
-
else {
|
|
242
|
-
// Value assertion — pass through with optional grader
|
|
243
|
-
assertions.push({
|
|
244
|
-
type: a.type,
|
|
245
|
-
...("value" in a ? { value: a.value } : {}),
|
|
246
|
-
...(typeof a.weight === "number"
|
|
247
|
-
? { weight: a.weight }
|
|
248
|
-
: {}),
|
|
249
|
-
...(a.type === "llm-rubric" && options?.graderProvider
|
|
250
|
-
? { provider: options.graderProvider }
|
|
251
|
-
: {}),
|
|
252
|
-
});
|
|
253
|
-
}
|
|
254
|
-
}
|
|
255
|
-
// Doc-coverage auto-generation
|
|
256
|
-
if (task.docCoverage) {
|
|
257
|
-
const docCoverageAssertion = buildDocCoverageAssertion(options?.rubricConfig, options?.graderProvider);
|
|
258
|
-
if (docCoverageAssertion)
|
|
259
|
-
assertions.push(docCoverageAssertion);
|
|
260
|
-
}
|
|
261
|
-
return assertions;
|
|
262
|
-
}
|
|
263
|
-
function resolveTemplatedAssertion(a, rubricConfig, graderProvider, warnings) {
|
|
264
|
-
if (!rubricConfig) {
|
|
265
|
-
warnings.push(`No rubric config — template "${a.template}" cannot be resolved`);
|
|
266
|
-
return null;
|
|
267
|
-
}
|
|
268
|
-
const template = rubricConfig.templates[a.template];
|
|
269
|
-
if (!template) {
|
|
270
|
-
warnings.push(`Unknown rubric template: "${a.template}"`);
|
|
271
|
-
return null;
|
|
272
|
-
}
|
|
273
|
-
// Assemble the rubric text
|
|
274
|
-
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
275
|
-
const criteriaText = a.criteria.map((c) => `- ${c}`).join("\n");
|
|
276
|
-
// Match legacy rubric assembly format:
|
|
277
|
-
// header\n- scale...\n\ncriteria_label\n- criteria...\n\nfooter
|
|
278
|
-
const rubricValue = `${template.header}\n${scaleText}\n\n` +
|
|
279
|
-
`${template.criteria_label ?? "Check for:"}\n${criteriaText}\n\n` +
|
|
280
|
-
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
281
|
-
return {
|
|
282
|
-
type: "llm-rubric",
|
|
283
|
-
value: rubricValue,
|
|
284
|
-
...(graderProvider ? { provider: graderProvider } : {}),
|
|
285
|
-
...(template.dimension
|
|
286
|
-
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
287
|
-
: {}),
|
|
288
|
-
};
|
|
289
|
-
}
|
|
290
|
-
function buildDocCoverageAssertion(rubricConfig, graderProvider) {
|
|
291
|
-
if (!rubricConfig?.templates["doc-coverage"])
|
|
292
|
-
return null;
|
|
293
|
-
const template = rubricConfig.templates["doc-coverage"];
|
|
294
|
-
const scaleText = template.scale.map((s) => `- ${s}`).join("\n");
|
|
295
|
-
const rubricValue = `${template.header}\n${scaleText}\n\n` +
|
|
296
|
-
`Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}`;
|
|
297
|
-
return {
|
|
298
|
-
type: "llm-rubric",
|
|
299
|
-
value: rubricValue,
|
|
300
|
-
...(graderProvider ? { provider: graderProvider } : {}),
|
|
301
|
-
...(template.dimension
|
|
302
|
-
? { metadata: { dimension: template.dimension, maxScore: 100 } }
|
|
303
|
-
: {}),
|
|
304
|
-
};
|
|
305
|
-
}
|
|
306
|
-
// ---------------------------------------------------------------------------
|
|
307
|
-
// Baseline assertion filtering
|
|
308
|
-
// ---------------------------------------------------------------------------
|
|
309
|
-
/**
|
|
310
|
-
* Build baseline assertions matching the legacy expand-tasks behavior.
|
|
311
|
-
*
|
|
312
|
-
* - "full": all assertions carried over
|
|
313
|
-
* - "abbreviated": only first llm-rubric with shortened prompt
|
|
314
|
-
* - "none": no assertions
|
|
315
|
-
*/
|
|
316
|
-
function buildBaselineAssertions(goldAssertions, rubricMode) {
|
|
317
|
-
const mode = rubricMode ?? "full";
|
|
318
|
-
if (mode === "none")
|
|
319
|
-
return [];
|
|
320
|
-
if (mode === "full")
|
|
321
|
-
return [...goldAssertions];
|
|
322
|
-
// Abbreviated: keep first llm-rubric as summary, skip rest
|
|
323
|
-
const abbreviated = [];
|
|
324
|
-
let foundFirst = false;
|
|
325
|
-
for (const a of goldAssertions) {
|
|
326
|
-
if (a.type === "llm-rubric") {
|
|
327
|
-
if (!foundFirst) {
|
|
328
|
-
foundFirst = true;
|
|
329
|
-
abbreviated.push({
|
|
330
|
-
type: "llm-rubric",
|
|
331
|
-
value: "Score task completion from 0 to 100 (same criteria as above).\n" +
|
|
332
|
-
'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
|
|
333
|
-
...(a.provider ? { provider: a.provider } : {}),
|
|
334
|
-
});
|
|
335
|
-
}
|
|
336
|
-
}
|
|
337
|
-
// Non-rubric assertions are excluded (matching legacy behavior)
|
|
338
|
-
}
|
|
339
|
-
return abbreviated;
|
|
340
|
-
}
|
|
341
|
-
// ---------------------------------------------------------------------------
|
|
342
|
-
// ModeHandler adapter — wraps compileLiteracyTask for registry dispatch
|
|
343
|
-
// ---------------------------------------------------------------------------
|
|
344
|
-
/**
|
|
345
|
-
* ModeHandler-conformant export for the literacy evaluation mode.
|
|
346
|
-
*
|
|
347
|
-
* The pipeline looks up this handler via `registry.getMode("literacy")`
|
|
348
|
-
* and calls `handler.compileTask()`. The handler narrows the union to
|
|
349
|
-
* LiteracyTaskDefinition and delegates to `compileLiteracyTask()`.
|
|
350
|
-
*
|
|
351
|
-
* Note: The literacy handler's `evalMode` variant ("baseline" vs "agentic")
|
|
352
|
-
* is passed via `ctx.evalMode` — a literacy-specific extension of
|
|
353
|
-
* CompilationContext. The pipeline sets this when compiling literacy tasks.
|
|
354
|
-
*/
|
|
355
|
-
export const handler = {
|
|
356
|
-
getPrompts() {
|
|
357
|
-
return LITERACY_PROMPT_TEMPLATES;
|
|
358
|
-
},
|
|
359
|
-
compileTask(task, ctx) {
|
|
360
|
-
// Type-narrow the union — literacy handler only accepts literacy tasks
|
|
361
|
-
if (task.mode !== "literacy") {
|
|
362
|
-
throw new Error(`Literacy handler received task with mode "${task.mode}" — expected "literacy"`);
|
|
363
|
-
}
|
|
364
|
-
const result = compileLiteracyTask(task, {
|
|
365
|
-
graderProvider: ctx.graderProvider,
|
|
366
|
-
rootDir: ctx.rootDir,
|
|
367
|
-
models: ctx.models,
|
|
368
|
-
rubricConfig: ctx.rubricConfig,
|
|
369
|
-
evalMode: ctx
|
|
370
|
-
.evalMode,
|
|
371
|
-
});
|
|
372
|
-
return {
|
|
373
|
-
providers: result.providers,
|
|
374
|
-
tests: result.tests,
|
|
375
|
-
prompts: result.prompts,
|
|
376
|
-
warnings: result.warnings,
|
|
377
|
-
};
|
|
378
|
-
},
|
|
379
|
-
};
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* MCPServerModeHandler — compilation rules for `mcp-server` evaluation mode.
|
|
3
|
-
*
|
|
4
|
-
* This is the first non-literacy mode handler, proving the compiler
|
|
5
|
-
* architecture works end-to-end. It translates MCP server task definitions
|
|
6
|
-
* into Promptfoo configuration with:
|
|
7
|
-
*
|
|
8
|
-
* - An MCP provider that wraps the server under test
|
|
9
|
-
* - Tool-call assertions compiled to Promptfoo `javascript` assertions
|
|
10
|
-
* - Server lifecycle management via Promptfoo provider hooks
|
|
11
|
-
* - Multi-turn conversation support via Promptfoo's `steps` syntax
|
|
12
|
-
*
|
|
13
|
-
* Promptfoo supports MCP servers as providers natively:
|
|
14
|
-
* ```yaml
|
|
15
|
-
* providers:
|
|
16
|
-
* - id: mcp:./my-server
|
|
17
|
-
* config:
|
|
18
|
-
* command: node
|
|
19
|
-
* args: [./dist/server.js]
|
|
20
|
-
* env: { API_KEY: "..." }
|
|
21
|
-
* ```
|
|
22
|
-
*
|
|
23
|
-
* This handler assembles that config from AILF's `MCPServerTaskDefinition`.
|
|
24
|
-
*
|
|
25
|
-
* @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
|
|
26
|
-
* @see packages/core/src/types/eval-mode-config.ts — MCPServerModeConfig
|
|
27
|
-
* @see packages/core/src/types/generalized-task.ts — MCPServerTaskDefinition
|
|
28
|
-
*/
|
|
29
|
-
import type { MCPServerTaskDefinition, ModeHandler, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
|
|
30
|
-
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
|
|
31
|
-
export declare const MCP_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
32
|
-
/** Options for compiling an MCP server task */
|
|
33
|
-
export interface MCPCompileOptions {
|
|
34
|
-
/** Grader provider for LLM-graded assertions */
|
|
35
|
-
graderProvider?: string;
|
|
36
|
-
}
|
|
37
|
-
/** Result of compiling a single MCP task */
|
|
38
|
-
export interface MCPCompileResult {
|
|
39
|
-
/** Promptfoo provider config for the MCP server */
|
|
40
|
-
providers: PromptfooProvider[];
|
|
41
|
-
/** Compiled test cases */
|
|
42
|
-
tests: PromptfooTestCase[];
|
|
43
|
-
/** Prompts for MCP evaluation */
|
|
44
|
-
prompts: PromptfooPrompt[];
|
|
45
|
-
/** Warnings generated during compilation */
|
|
46
|
-
warnings: string[];
|
|
47
|
-
}
|
|
48
|
-
/** Validation errors for MCP task definitions */
|
|
49
|
-
export interface MCPValidationError {
|
|
50
|
-
field: string;
|
|
51
|
-
message: string;
|
|
52
|
-
}
|
|
53
|
-
/**
|
|
54
|
-
* Validate that an MCP task definition has all required fields.
|
|
55
|
-
*/
|
|
56
|
-
export declare function validateMCPTask(task: MCPServerTaskDefinition): MCPValidationError[];
|
|
57
|
-
/**
|
|
58
|
-
* Compile an MCP server task definition into Promptfoo configuration.
|
|
59
|
-
*
|
|
60
|
-
* This is the core of the MCP mode handler. It produces:
|
|
61
|
-
* 1. A provider config pointing to the MCP server
|
|
62
|
-
* 2. Test cases with tool-call assertions
|
|
63
|
-
* 3. Appropriate prompts for the evaluation
|
|
64
|
-
*/
|
|
65
|
-
export declare function compileMCPTask(task: MCPServerTaskDefinition, options?: MCPCompileOptions): MCPCompileResult;
|
|
66
|
-
/** ModeHandler-conformant export for the mcp-server evaluation mode. */
|
|
67
|
-
export declare const handler: ModeHandler;
|