@sanity/ailf 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/cli.js +0 -0
- package/package.json +24 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -1,485 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* AgentHarnessModeHandler — compilation rules for `agent-harness` mode.
|
|
3
|
-
*
|
|
4
|
-
* Maps agent harness task definitions to Promptfoo configuration with:
|
|
5
|
-
* - Claude Agent SDK / OpenAI Codex SDK providers
|
|
6
|
-
* - Tool permission configuration (preset/allowed/disallowed)
|
|
7
|
-
* - Sandbox setup/teardown via Promptfoo extensions
|
|
8
|
-
* - Fixture provisioning into sandbox working directory
|
|
9
|
-
*
|
|
10
|
-
* @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
|
|
11
|
-
* @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
|
|
12
|
-
*/
|
|
13
|
-
// ---------------------------------------------------------------------------
|
|
14
|
-
// Canonical agent harness prompt templates
|
|
15
|
-
// ---------------------------------------------------------------------------
|
|
16
|
-
// Handler-owned prompts for agent harness evaluations. Describes the task
|
|
17
|
-
// for autonomous agent execution within a sandboxed environment with file
|
|
18
|
-
// system and tool access.
|
|
19
|
-
export const AGENT_HARNESS_PROMPT_TEMPLATES = {
|
|
20
|
-
"agent-harness": {
|
|
21
|
-
id: "agent-harness",
|
|
22
|
-
label: "Agent Harness Task",
|
|
23
|
-
template: `You are a coding agent working in a sandboxed environment. You have access to file system tools (read, write, edit) and a shell to complete the following task.
|
|
24
|
-
|
|
25
|
-
## Task
|
|
26
|
-
{{task}}
|
|
27
|
-
|
|
28
|
-
## Instructions
|
|
29
|
-
|
|
30
|
-
1. Read existing files to understand the project structure before making changes
|
|
31
|
-
2. Implement a complete, working solution — no placeholders or TODOs
|
|
32
|
-
3. Ensure all necessary imports and dependencies are included
|
|
33
|
-
4. Verify your implementation compiles and passes any provided test commands
|
|
34
|
-
5. Keep changes minimal and focused on the task
|
|
35
|
-
|
|
36
|
-
Complete the implementation:
|
|
37
|
-
`,
|
|
38
|
-
variables: ["task"],
|
|
39
|
-
},
|
|
40
|
-
};
|
|
41
|
-
// ---------------------------------------------------------------------------
|
|
42
|
-
// Tool permission presets
|
|
43
|
-
// ---------------------------------------------------------------------------
|
|
44
|
-
// Tools shared by the "coding" preset and the front of "full-access".
const CODING_TOOL_NAMES = ["Bash", "Read", "Write", "Edit", "Glob", "Grep"];
/** Named tool permission bundles; each preset name maps to the tool names it expands to. */
const TOOL_PRESETS = {
    coding: [...CODING_TOOL_NAMES],
    "full-access": [...CODING_TOOL_NAMES, "WebSearch", "WebFetch", "TodoRead", "TodoWrite"],
    "read-only": ["Read", "Glob", "Grep", "WebSearch"],
};
|
|
61
|
-
/**
 * Check an agent harness task definition for its mandatory fields.
 *
 * @param task - Task definition to inspect; only `id` and `title` are read.
 * @returns One `{ field, message }` entry per missing (falsy) field, in the
 *   order `id`, `title`. An empty array means nothing was flagged.
 */
export function validateAgentHarnessTask(task) {
    // [field name, message reported when that field is falsy]
    const requiredFields = [
        ["id", "Task ID is required"],
        ["title", "Task title is required"],
    ];
    return requiredFields
        .filter(([field]) => !task[field])
        .map(([field, message]) => ({ field, message }));
}
|
|
74
|
-
// ---------------------------------------------------------------------------
|
|
75
|
-
// Compilation
|
|
76
|
-
// ---------------------------------------------------------------------------
|
|
77
|
-
/**
 * Compile an agent harness task definition into Promptfoo configuration.
 *
 * Validation problems do not abort compilation; they are surfaced as
 * entries in the returned `warnings` array.
 *
 * @param task - Agent harness task definition.
 * @param options - Compile options, forwarded to test-case assembly.
 * @returns `{ providers, tests, prompts, extensions, sandboxConfig, warnings }`.
 */
export function compileAgentHarnessTask(task, options) {
    // Validation errors become the first warnings; later stages append to the same array.
    const warnings = validateAgentHarnessTask(task).map(({ field, message }) => `Agent harness task "${task.id}": ${field} — ${message}`);
    const providers = buildAgentProvider(task, warnings);
    const prompts = buildAgentPrompts(task);
    const tests = buildAgentTestCases(task, options, warnings);
    // The sandbox description feeds the lifecycle hooks and is also returned as-is.
    const sandboxConfig = buildSandboxConfig(task);
    return {
        providers,
        tests,
        prompts,
        extensions: buildLifecycleExtensions(task, sandboxConfig),
        sandboxConfig,
        warnings,
    };
}
|
|
98
|
-
// ---------------------------------------------------------------------------
|
|
99
|
-
// Provider assembly
|
|
100
|
-
// ---------------------------------------------------------------------------
|
|
101
|
-
/**
 * Assemble the single provider entry for an agent harness task.
 *
 * @param task - Task definition; reads `id`, `title`, `tools` and `sandbox`.
 * @param _warnings - Accepted for signature parity; not used here.
 * @returns A one-element array holding `{ id, label, config }`. `config`
 *   carries `allowedTools` only when at least one tool resolved, and
 *   `sandbox` only when the task declares one.
 */
function buildAgentProvider(task, _warnings) {
    const allowedTools = resolveToolPermissions(task.tools);
    const sandbox = task.sandbox;
    const config = {
        ...(allowedTools.length > 0 ? { allowedTools } : {}),
        ...(sandbox
            ? {
                sandbox: {
                    type: sandbox.type,
                    // `image` is copied only when truthy.
                    ...(sandbox.image ? { image: sandbox.image } : {}),
                },
            }
            : {}),
    };
    return [
        {
            id: `agent:${task.id}`,
            label: `Agent Harness: ${task.title}`,
            config,
        },
    ];
}
|
|
123
|
-
/**
|
|
124
|
-
* Resolve tool permissions from task config.
|
|
125
|
-
*
|
|
126
|
-
* Handles:
|
|
127
|
-
* - Preset names ("coding", "read-only", "full-access")
|
|
128
|
-
* - Explicit tool names ("Bash", "Read", "Write")
|
|
129
|
-
* - Mixed arrays ["coding", "WebSearch"] → preset expansion + extras
|
|
130
|
-
*/
|
|
131
|
-
function resolveToolPermissions(tools) {
|
|
132
|
-
if (!tools || tools.length === 0)
|
|
133
|
-
return [];
|
|
134
|
-
const resolved = new Set();
|
|
135
|
-
for (const tool of tools) {
|
|
136
|
-
const preset = TOOL_PRESETS[tool];
|
|
137
|
-
if (preset) {
|
|
138
|
-
for (const t of preset)
|
|
139
|
-
resolved.add(t);
|
|
140
|
-
}
|
|
141
|
-
else {
|
|
142
|
-
resolved.add(tool);
|
|
143
|
-
}
|
|
144
|
-
}
|
|
145
|
-
return [...resolved];
|
|
146
|
-
}
|
|
147
|
-
// ---------------------------------------------------------------------------
|
|
148
|
-
// Prompt assembly
|
|
149
|
-
// ---------------------------------------------------------------------------
|
|
150
|
-
function buildAgentPrompts(task) {
|
|
151
|
-
const promptText = task.prompt?.text ??
|
|
152
|
-
task.prompt?.vars?.task ??
|
|
153
|
-
task.description ??
|
|
154
|
-
`Agent task: ${task.title}`;
|
|
155
|
-
return [
|
|
156
|
-
{
|
|
157
|
-
id: "agent-harness",
|
|
158
|
-
label: `Agent: ${task.title}`,
|
|
159
|
-
raw: String(promptText),
|
|
160
|
-
},
|
|
161
|
-
];
|
|
162
|
-
}
|
|
163
|
-
// ---------------------------------------------------------------------------
|
|
164
|
-
// Test case assembly
|
|
165
|
-
// ---------------------------------------------------------------------------
|
|
166
|
-
function buildAgentTestCases(task, options, warnings) {
|
|
167
|
-
const assertions = [];
|
|
168
|
-
if (task.assertions) {
|
|
169
|
-
for (const assertion of task.assertions) {
|
|
170
|
-
const mapped = mapAgentAssertion(assertion, options, warnings);
|
|
171
|
-
if (mapped)
|
|
172
|
-
assertions.push(mapped);
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
const vars = {
|
|
176
|
-
task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
|
|
177
|
-
...(task.prompt?.vars ?? {}),
|
|
178
|
-
// Internal metadata for sandbox lifecycle hooks
|
|
179
|
-
__sandboxType: task.sandbox?.type ?? "tempdir",
|
|
180
|
-
__fixtures: task.fixtures ?? [],
|
|
181
|
-
};
|
|
182
|
-
const tests = [
|
|
183
|
-
{
|
|
184
|
-
description: `${task.id} — ${task.title}`,
|
|
185
|
-
vars,
|
|
186
|
-
...(assertions.length > 0 ? { assert: assertions } : {}),
|
|
187
|
-
},
|
|
188
|
-
];
|
|
189
|
-
// Multi-turn support
|
|
190
|
-
if (task.multiTurn?.turns && task.multiTurn.turns.length > 0) {
|
|
191
|
-
tests.push({
|
|
192
|
-
description: `${task.id} — ${task.title} [multi-turn]`,
|
|
193
|
-
vars: {
|
|
194
|
-
...vars,
|
|
195
|
-
__multiTurn: task.multiTurn.turns,
|
|
196
|
-
},
|
|
197
|
-
...(assertions.length > 0 ? { assert: assertions } : {}),
|
|
198
|
-
});
|
|
199
|
-
}
|
|
200
|
-
return tests;
|
|
201
|
-
}
|
|
202
|
-
// ---------------------------------------------------------------------------
|
|
203
|
-
// Assertion mapping
|
|
204
|
-
// ---------------------------------------------------------------------------
|
|
205
|
-
function mapAgentAssertion(assertion, options, warnings) {
|
|
206
|
-
switch (assertion.type) {
|
|
207
|
-
case "file-exists":
|
|
208
|
-
return buildFileExistsAssertion(assertion);
|
|
209
|
-
case "file-contains":
|
|
210
|
-
return buildFileContainsAssertion(assertion);
|
|
211
|
-
case "command-succeeds":
|
|
212
|
-
return buildCommandSucceedsAssertion(assertion);
|
|
213
|
-
case "diff-matches":
|
|
214
|
-
return buildDiffMatchesAssertion(assertion);
|
|
215
|
-
// Standard assertions pass through
|
|
216
|
-
case "contains":
|
|
217
|
-
case "equals":
|
|
218
|
-
case "regex":
|
|
219
|
-
case "is-json":
|
|
220
|
-
case "javascript":
|
|
221
|
-
case "python":
|
|
222
|
-
return {
|
|
223
|
-
type: assertion.type,
|
|
224
|
-
...("value" in assertion ? { value: assertion.value } : {}),
|
|
225
|
-
...(typeof assertion.weight === "number"
|
|
226
|
-
? { weight: assertion.weight }
|
|
227
|
-
: {}),
|
|
228
|
-
};
|
|
229
|
-
case "llm-rubric":
|
|
230
|
-
return {
|
|
231
|
-
type: "llm-rubric",
|
|
232
|
-
...("value" in assertion ? { value: assertion.value } : {}),
|
|
233
|
-
...(typeof assertion.weight === "number"
|
|
234
|
-
? { weight: assertion.weight }
|
|
235
|
-
: {}),
|
|
236
|
-
...(options?.graderProvider
|
|
237
|
-
? { provider: options.graderProvider }
|
|
238
|
-
: {}),
|
|
239
|
-
};
|
|
240
|
-
default:
|
|
241
|
-
warnings.push(`Agent task: unknown assertion type "${assertion.type}" — passed through`);
|
|
242
|
-
return {
|
|
243
|
-
type: assertion.type,
|
|
244
|
-
...("value" in assertion ? { value: assertion.value } : {}),
|
|
245
|
-
};
|
|
246
|
-
}
|
|
247
|
-
}
|
|
248
|
-
// ---------------------------------------------------------------------------
|
|
249
|
-
// Agent-specific assertion builders
|
|
250
|
-
// ---------------------------------------------------------------------------
|
|
251
|
-
function buildFileExistsAssertion(assertion) {
|
|
252
|
-
const filePath = String(assertion.value ?? "");
|
|
253
|
-
// Use JSON.stringify for all interpolated values in generated JS to
|
|
254
|
-
// prevent broken strings from filePaths containing quotes/backslashes
|
|
255
|
-
const safeFilePath = JSON.stringify(filePath);
|
|
256
|
-
return {
|
|
257
|
-
type: "javascript",
|
|
258
|
-
value: `// file-exists: ${filePath}\n` +
|
|
259
|
-
`(function() {\n` +
|
|
260
|
-
` const fs = require('fs');\n` +
|
|
261
|
-
` const path = require('path');\n` +
|
|
262
|
-
` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
|
|
263
|
-
` const target = path.resolve(workDir, ${safeFilePath});\n` +
|
|
264
|
-
` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
|
|
265
|
-
` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
|
|
266
|
-
` }\n` +
|
|
267
|
-
` const exists = fs.existsSync(target);\n` +
|
|
268
|
-
` return {\n` +
|
|
269
|
-
` pass: exists,\n` +
|
|
270
|
-
` score: exists ? 1 : 0,\n` +
|
|
271
|
-
` reason: exists\n` +
|
|
272
|
-
` ? 'File exists: ' + ${safeFilePath}\n` +
|
|
273
|
-
` : 'Expected file not found: ' + ${safeFilePath},\n` +
|
|
274
|
-
` };\n` +
|
|
275
|
-
`})()`,
|
|
276
|
-
...(typeof assertion.weight === "number"
|
|
277
|
-
? { weight: assertion.weight }
|
|
278
|
-
: {}),
|
|
279
|
-
};
|
|
280
|
-
}
|
|
281
|
-
function buildFileContainsAssertion(assertion) {
|
|
282
|
-
const config = assertion.value;
|
|
283
|
-
const filePath = config?.path ?? "";
|
|
284
|
-
const expectedContent = config?.content ?? "";
|
|
285
|
-
const safeFilePath = JSON.stringify(filePath);
|
|
286
|
-
return {
|
|
287
|
-
type: "javascript",
|
|
288
|
-
value: `// file-contains: ${filePath}\n` +
|
|
289
|
-
`(function() {\n` +
|
|
290
|
-
` const fs = require('fs');\n` +
|
|
291
|
-
` const path = require('path');\n` +
|
|
292
|
-
` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
|
|
293
|
-
` const target = path.resolve(workDir, ${safeFilePath});\n` +
|
|
294
|
-
` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
|
|
295
|
-
` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
|
|
296
|
-
` }\n` +
|
|
297
|
-
` if (!fs.existsSync(target)) {\n` +
|
|
298
|
-
` return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
|
|
299
|
-
` }\n` +
|
|
300
|
-
` const content = fs.readFileSync(target, 'utf-8');\n` +
|
|
301
|
-
` const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
|
|
302
|
-
` return {\n` +
|
|
303
|
-
` pass: contains,\n` +
|
|
304
|
-
` score: contains ? 1 : 0,\n` +
|
|
305
|
-
` reason: contains\n` +
|
|
306
|
-
` ? 'File contains expected content'\n` +
|
|
307
|
-
` : 'File does not contain expected content',\n` +
|
|
308
|
-
` };\n` +
|
|
309
|
-
`})()`,
|
|
310
|
-
...(typeof assertion.weight === "number"
|
|
311
|
-
? { weight: assertion.weight }
|
|
312
|
-
: {}),
|
|
313
|
-
};
|
|
314
|
-
}
|
|
315
|
-
/**
|
|
316
|
-
* SECURITY: Trusted-input boundary.
|
|
317
|
-
*
|
|
318
|
-
* The `command-succeeds` assertion executes an arbitrary shell command
|
|
319
|
-
* inside the sandbox's working directory. The command string comes from
|
|
320
|
-
* task definitions (YAML or TypeScript config files), which are authored
|
|
321
|
-
* by developers — not from user input or LLM output.
|
|
322
|
-
*
|
|
323
|
-
* This is intentional: the assertion is designed to verify agent output
|
|
324
|
-
* by running build/test commands (e.g., "npm test", "tsc --noEmit").
|
|
325
|
-
*
|
|
326
|
-
* Task definitions are the trust boundary. If you accept task definitions
|
|
327
|
-
* from untrusted sources, validate commands against an allowlist first.
|
|
328
|
-
*/
|
|
329
|
-
function buildCommandSucceedsAssertion(assertion) {
|
|
330
|
-
const command = String(assertion.value ?? "");
|
|
331
|
-
return {
|
|
332
|
-
type: "javascript",
|
|
333
|
-
value: `// command-succeeds: ${command}\n` +
|
|
334
|
-
`(function() {\n` +
|
|
335
|
-
` const { execSync } = require('child_process');\n` +
|
|
336
|
-
` const workDir = context.vars.__workingDir || '.';\n` +
|
|
337
|
-
` try {\n` +
|
|
338
|
-
` execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
|
|
339
|
-
` return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
|
|
340
|
-
` } catch (err) {\n` +
|
|
341
|
-
` return {\n` +
|
|
342
|
-
` pass: false,\n` +
|
|
343
|
-
` score: 0,\n` +
|
|
344
|
-
` reason: 'Command failed: ' + (err.message || err),\n` +
|
|
345
|
-
` };\n` +
|
|
346
|
-
` }\n` +
|
|
347
|
-
`})()`,
|
|
348
|
-
...(typeof assertion.weight === "number"
|
|
349
|
-
? { weight: assertion.weight }
|
|
350
|
-
: {}),
|
|
351
|
-
};
|
|
352
|
-
}
|
|
353
|
-
function buildDiffMatchesAssertion(assertion) {
|
|
354
|
-
const expected = assertion.value;
|
|
355
|
-
return {
|
|
356
|
-
type: "javascript",
|
|
357
|
-
value: `// diff-matches\n` +
|
|
358
|
-
`(function() {\n` +
|
|
359
|
-
` const { execSync } = require('child_process');\n` +
|
|
360
|
-
` const workDir = context.vars.__workingDir || '.';\n` +
|
|
361
|
-
` try {\n` +
|
|
362
|
-
` const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
|
|
363
|
-
` const expected = ${JSON.stringify(expected)};\n` +
|
|
364
|
-
` if (typeof expected === 'string') {\n` +
|
|
365
|
-
` const contains = diff.includes(expected);\n` +
|
|
366
|
-
` return {\n` +
|
|
367
|
-
` pass: contains,\n` +
|
|
368
|
-
` score: contains ? 1 : 0,\n` +
|
|
369
|
-
` reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
|
|
370
|
-
` };\n` +
|
|
371
|
-
` }\n` +
|
|
372
|
-
` return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
|
|
373
|
-
` } catch (err) {\n` +
|
|
374
|
-
` return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
|
|
375
|
-
` }\n` +
|
|
376
|
-
`})()`,
|
|
377
|
-
...(typeof assertion.weight === "number"
|
|
378
|
-
? { weight: assertion.weight }
|
|
379
|
-
: {}),
|
|
380
|
-
};
|
|
381
|
-
}
|
|
382
|
-
// ---------------------------------------------------------------------------
|
|
383
|
-
// Sandbox configuration
|
|
384
|
-
// ---------------------------------------------------------------------------
|
|
385
|
-
/**
 * Derive the sandbox description for a task.
 *
 * @param task - Task definition; reads `sandbox` and `fixtures`.
 * @returns `{ type, image, fixtures, limits }`. `type` defaults to
 *   `"tempdir"` and `fixtures` to `[]`. `limits` is `undefined` unless the
 *   task declares `sandbox.limits`, in which case only `cpus`,
 *   `memoryBytes` and `networkAccess` are copied.
 */
function buildSandboxConfig(task) {
    const sandbox = task.sandbox;
    const declaredLimits = sandbox?.limits;
    let limits = undefined;
    if (declaredLimits) {
        const { cpus, memoryBytes, networkAccess } = declaredLimits;
        limits = { cpus, memoryBytes, networkAccess };
    }
    return {
        type: sandbox?.type ?? "tempdir",
        image: sandbox?.image,
        fixtures: task.fixtures ?? [],
        limits,
    };
}
|
|
399
|
-
// ---------------------------------------------------------------------------
|
|
400
|
-
// Lifecycle extensions
|
|
401
|
-
// ---------------------------------------------------------------------------
|
|
402
|
-
/**
 * Build the lifecycle extension list for a task: a `beforeEach` entry
 * followed by an `afterEach` entry, each carrying generated hook source in
 * its `code` field.
 *
 * @param task - Task definition; only `id` is read.
 * @param sandboxConfig - Sandbox description passed to the `beforeEach` hook builder.
 * @returns `[{ type: "beforeEach", code }, { type: "afterEach", code }]`.
 */
function buildLifecycleExtensions(task, sandboxConfig) {
    return [
        // beforeEach: provision sandbox + inject fixtures
        { type: "beforeEach", code: buildBeforeEachHook(task.id, sandboxConfig) },
        // afterEach: collect artifacts + teardown
        { type: "afterEach", code: buildAfterEachHook(task.id) },
    ];
}
|
|
416
|
-
/**
 * Generate the source of the `beforeEach` hook that provisions a sandbox
 * directory for a task.
 *
 * The generated function creates `<os tmpdir>/ailf-<task id>-<8 random chars>`
 * and records it in `vars.__workingDir` / `vars.__sandboxId`. The fixture
 * list is only written into the generated source as a comment.
 *
 * The task id is embedded in a single-quoted string literal, a directory
 * name and a line comment of the generated source, so it is sanitized first:
 * - for the directory name, every character outside `[A-Za-z0-9._-]` becomes
 *   `_`, so quotes cannot end the literal and separators cannot add path
 *   segments;
 * - for comments, line terminators are collapsed to a space so the text
 *   cannot escape the comment.
 * Ids that already use only safe characters produce the same output as before.
 *
 * @param taskId - Task identifier.
 * @param config - Sandbox config; only `fixtures` is read.
 * @returns JavaScript source text for the hook.
 */
function buildBeforeEachHook(taskId, config) {
    const rawId = String(taskId);
    const commentId = rawId.replace(/[\r\n\u2028\u2029]+/g, " ");
    const dirSafeId = rawId.replace(/[^A-Za-z0-9._-]/g, "_");
    // JSON.stringify escapes \n and \r but not U+2028/U+2029, which also end a line comment.
    const fixtureList = String(JSON.stringify(config.fixtures)).replace(/[\u2028\u2029]/g, " ");
    return (`// beforeEach: provision sandbox for ${commentId}\n` +
        `async function({ vars }) {\n` +
        ` const { mkdirSync, writeFileSync } = require('fs');\n` +
        ` const { tmpdir } = require('os');\n` +
        ` const { resolve } = require('path');\n` +
        ` const id = 'ailf-${dirSafeId}-' + require('crypto').randomUUID().slice(0, 8);\n` +
        ` const workDir = resolve(tmpdir(), id);\n` +
        ` mkdirSync(workDir, { recursive: true });\n` +
        ` vars.__workingDir = workDir;\n` +
        ` vars.__sandboxId = id;\n` +
        ` // Fixture list: ${fixtureList}\n` +
        `}`);
}
|
|
430
|
-
/**
 * Generate the source of the `afterEach` hook that collects artifacts and
 * tears down a task's sandbox directory.
 *
 * The generated function walks `vars.__workingDir` recursively, stores the
 * relative file list in `vars.__artifacts.modifiedFiles`, and then removes
 * the directory, but only when it lies strictly inside `os.tmpdir()`.
 *
 * Two fixes over the previous output:
 * - The teardown guard compares against `resolve(tmpdir) + path.sep`. A bare
 *   prefix check also matched sibling paths such as `/tmpfoo` and the tmp
 *   root itself, either of which would then be deleted recursively.
 * - Line terminators in the task id are collapsed for the leading comment so
 *   the id cannot leak out of the comment into the generated code.
 *
 * @param taskId - Task identifier; used only in the leading comment.
 * @returns JavaScript source text for the hook.
 */
function buildAfterEachHook(taskId) {
    const commentId = String(taskId).replace(/[\r\n\u2028\u2029]+/g, " ");
    return (`// afterEach: collect artifacts + teardown for ${commentId}\n` +
        `async function({ vars }) {\n` +
        ` const { rmSync, readdirSync, existsSync } = require('fs');\n` +
        ` const workDir = vars.__workingDir;\n` +
        ` if (workDir && existsSync(workDir)) {\n` +
        ` try {\n` +
        ` // Collect modified files list\n` +
        ` const files = [];\n` +
        ` function collect(dir, prefix) {\n` +
        ` for (const e of readdirSync(dir, { withFileTypes: true })) {\n` +
        ` const rel = prefix ? prefix + '/' + e.name : e.name;\n` +
        ` if (e.isDirectory()) collect(require('path').resolve(dir, e.name), rel);\n` +
        ` else files.push(rel);\n` +
        ` }\n` +
        ` }\n` +
        ` collect(workDir, '');\n` +
        ` vars.__artifacts = { modifiedFiles: files };\n` +
        ` } finally {\n` +
        ` // Guard: only delete directories strictly under os.tmpdir()\n` +
        ` const tmp = require('os').tmpdir();\n` +
        ` const tmpRoot = require('path').resolve(tmp) + require('path').sep;\n` +
        ` if (require('path').resolve(workDir).startsWith(tmpRoot)) {\n` +
        ` rmSync(workDir, { recursive: true, force: true });\n` +
        ` }\n` +
        ` }\n` +
        ` }\n` +
        `}`);
}
|
|
458
|
-
// ---------------------------------------------------------------------------
|
|
459
|
-
// ModeHandler adapter
|
|
460
|
-
// ---------------------------------------------------------------------------
|
|
461
|
-
/** ModeHandler-conformant export for the agent-harness evaluation mode. */
|
|
462
|
-
export const handler = {
|
|
463
|
-
getPrompts() {
|
|
464
|
-
return AGENT_HARNESS_PROMPT_TEMPLATES;
|
|
465
|
-
},
|
|
466
|
-
compileTask(task, ctx) {
|
|
467
|
-
if (!("mode" in task) || task.mode !== "agent-harness") {
|
|
468
|
-
throw new Error(`Agent harness handler received task with mode "${task.mode ?? "undefined"}" — expected "agent-harness"`);
|
|
469
|
-
}
|
|
470
|
-
const result = compileAgentHarnessTask(task, {
|
|
471
|
-
graderProvider: ctx.graderProvider,
|
|
472
|
-
rootDir: ctx.rootDir,
|
|
473
|
-
});
|
|
474
|
-
return {
|
|
475
|
-
providers: result.providers,
|
|
476
|
-
tests: result.tests,
|
|
477
|
-
prompts: result.prompts,
|
|
478
|
-
warnings: result.warnings,
|
|
479
|
-
extras: {
|
|
480
|
-
extensions: result.extensions,
|
|
481
|
-
sandboxConfig: result.sandboxConfig,
|
|
482
|
-
},
|
|
483
|
-
};
|
|
484
|
-
},
|
|
485
|
-
};
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* KnowledgeProbeModeHandler — compilation rules for `knowledge-probe` mode.
|
|
3
|
-
*
|
|
4
|
-
* The simplest mode handler. Knowledge probes measure raw model knowledge
|
|
5
|
-
* without documentation context, tool calling, or sandboxed execution.
|
|
6
|
-
* They answer: "What does this model know about X without any help?"
|
|
7
|
-
*
|
|
8
|
-
* Key properties:
|
|
9
|
-
* - No doc vars injected (intentionally empty)
|
|
10
|
-
* - Uses the without-docs prompt template (or custom prompt)
|
|
11
|
-
* - Standard LLM providers only (no agent SDKs, no MCP)
|
|
12
|
-
* - No retrieval metrics (precision/recall/F1 not applicable)
|
|
13
|
-
* - Results feed into the standard cross-model comparison pipeline
|
|
14
|
-
*
|
|
15
|
-
* This handler is the reference implementation for the mode handler pattern.
|
|
16
|
-
*
|
|
17
|
-
* @see docs/exec-plans/architecture-overhaul/phase-5-knowledge-probe.md
|
|
18
|
-
* @see packages/core/src/types/generalized-task.ts — KnowledgeProbeTaskDefinition
|
|
19
|
-
*/
|
|
20
|
-
import type { KnowledgeProbeTaskDefinition, ModeHandler, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
|
|
21
|
-
import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
|
|
22
|
-
/** Prompt templates exported for `knowledge-probe` mode, keyed by string (presumably the template id — confirm against the implementation). */
export declare const KNOWLEDGE_PROBE_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
|
|
23
|
-
/** Optional settings for compiling a knowledge probe task. */
export interface KnowledgeProbeCompileOptions {
    /** Provider used for LLM-graded assertions. */
    graderProvider?: string;
    /** Model registry; knowledge probes run across all configured models. */
    models?: Array<{
        id: string;
        label: string;
        config?: Record<string, unknown>;
    }>;
}
|
|
34
|
-
/** Output of compiling one knowledge probe task. */
export interface KnowledgeProbeCompileResult {
    /** Promptfoo provider configs, one per model. */
    providers: Array<PromptfooProvider>;
    /** Compiled Promptfoo test cases. */
    tests: Array<PromptfooTestCase>;
    /** Prompts used for the evaluation. */
    prompts: Array<PromptfooPrompt>;
    /** Mode metadata consumed by cross-model comparison. */
    metadata: KnowledgeProbeMetadata;
    /** Warnings raised while compiling. */
    warnings: Array<string>;
}
|
|
47
|
-
/** Comparison metadata attached to knowledge probe results. */
export interface KnowledgeProbeMetadata {
    /** Evaluation mode identifier; always the literal `"knowledge-probe"`. */
    mode: "knowledge-probe";
    /** Name of the probe strategy that was used. */
    probeStrategy: string;
    /** Marks that doc context was intentionally excluded; always `true`. */
    noDocContext: true;
    /** Marks that retrieval metrics do not apply; always `false`. */
    retrievalMetrics: false;
}
|
|
58
|
-
/** One validation problem found in a knowledge probe task definition. */
export interface KnowledgeProbeValidationError {
    /** Name of the offending field. */
    field: string;
    /** Human-readable description of the problem. */
    message: string;
}
|
|
63
|
-
/**
 * Validate that a knowledge probe task definition has all required fields.
 *
 * @param task - Task definition to check.
 * @returns The validation errors found.
 */
export declare function validateKnowledgeProbeTask(
    task: KnowledgeProbeTaskDefinition,
): KnowledgeProbeValidationError[];
/**
 * Compile a knowledge probe task definition into Promptfoo configuration.
 *
 * Intentionally minimal: knowledge probes map almost 1:1 to basic Promptfoo
 * test cases. The AILF value-add is type-safe authoring, cross-model
 * comparison, and score normalization.
 *
 * @param task - Task definition to compile.
 * @param options - Optional compile settings.
 * @returns Providers, tests, prompts, metadata and warnings for the task.
 */
export declare function compileKnowledgeProbeTask(
    task: KnowledgeProbeTaskDefinition,
    options?: KnowledgeProbeCompileOptions,
): KnowledgeProbeCompileResult;
/** ModeHandler-conformant export for the knowledge-probe evaluation mode. */
export declare const handler: ModeHandler;
|