@sanity/ailf 2.0.2 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
- package/dist/_vendor/ailf-core/examples/index.js +66 -1
- package/dist/agent-harness/assertions-runtime.d.ts +49 -0
- package/dist/agent-harness/assertions-runtime.js +138 -0
- package/dist/agent-harness/provider.d.ts +58 -0
- package/dist/agent-harness/provider.js +104 -0
- package/dist/cli.js +0 -0
- package/dist/commands/init.js +3 -0
- package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
- package/dist/orchestration/steps/generate-configs-step.js +35 -2
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
- package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
- package/package.json +25 -24
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
- package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
- package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
- package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
- package/dist/_vendor/ailf-tasks/cli.js +0 -61
- package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
- package/dist/_vendor/ailf-tasks/index.js +0 -16
- package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
- package/dist/_vendor/ailf-tasks/parser.js +0 -73
- package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
- package/dist/_vendor/ailf-tasks/schemas.js +0 -180
- package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
- package/dist/_vendor/ailf-tasks/validation.js +0 -162
- package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
- package/dist/adapters/task-sources/yaml-task-source.js +0 -139
- package/dist/agent-observer/test-imports.d.ts +0 -7
- package/dist/agent-observer/test-imports.js +0 -185
- package/dist/commands/update-quality-scores.d.ts +0 -5
- package/dist/commands/update-quality-scores.js +0 -20
- package/dist/lib/agent-behavior-report.d.ts +0 -8
- package/dist/lib/agent-behavior-report.js +0 -185
- package/dist/lib/baseline.d.ts +0 -19
- package/dist/lib/baseline.js +0 -153
- package/dist/lib/calculate-scores.d.ts +0 -23
- package/dist/lib/calculate-scores.js +0 -42
- package/dist/lib/compare.d.ts +0 -18
- package/dist/lib/compare.js +0 -170
- package/dist/lib/coverage-audit.d.ts +0 -4
- package/dist/lib/coverage-audit.js +0 -42
- package/dist/lib/discovery-report.d.ts +0 -13
- package/dist/lib/discovery-report.js +0 -57
- package/dist/lib/fetch-docs.d.ts +0 -30
- package/dist/lib/fetch-docs.js +0 -171
- package/dist/lib/generate-configs.d.ts +0 -25
- package/dist/lib/generate-configs.js +0 -42
- package/dist/lib/grader-api.d.ts +0 -21
- package/dist/lib/grader-api.js +0 -34
- package/dist/lib/grader-compare.d.ts +0 -19
- package/dist/lib/grader-compare.js +0 -91
- package/dist/lib/grader-consistency.d.ts +0 -27
- package/dist/lib/grader-consistency.js +0 -79
- package/dist/lib/grader-sensitivity.d.ts +0 -19
- package/dist/lib/grader-sensitivity.js +0 -75
- package/dist/lib/grader-validate.d.ts +0 -19
- package/dist/lib/grader-validate.js +0 -78
- package/dist/lib/measure-retrieval.d.ts +0 -14
- package/dist/lib/measure-retrieval.js +0 -71
- package/dist/lib/pr-comment.d.ts +0 -16
- package/dist/lib/pr-comment.js +0 -28
- package/dist/lib/readiness-report.d.ts +0 -13
- package/dist/lib/readiness-report.js +0 -108
- package/dist/lib/webhook-server.d.ts +0 -11
- package/dist/lib/webhook-server.js +0 -24
- package/dist/lib/weekly-digest.d.ts +0 -24
- package/dist/lib/weekly-digest.js +0 -148
- package/dist/orchestration/env-bridge.d.ts +0 -21
- package/dist/orchestration/env-bridge.js +0 -66
- package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
- package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
- package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
- package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
- package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
- package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
- package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
- package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
- package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
- package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
- package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
- package/dist/pipeline/compiler/task-bridge.js +0 -92
- package/dist/pipeline/expand-tasks.d.ts +0 -232
- package/dist/pipeline/expand-tasks.js +0 -467
- package/dist/pipeline/generate-configs.d.ts +0 -92
- package/dist/pipeline/generate-configs.js +0 -445
- package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/calculate-scores-step.js +0 -89
- package/dist/pipeline/steps/compare-step.d.ts +0 -18
- package/dist/pipeline/steps/compare-step.js +0 -90
- package/dist/pipeline/steps/eval-step.d.ts +0 -53
- package/dist/pipeline/steps/eval-step.js +0 -347
- package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
- package/dist/pipeline/steps/fetch-docs-step.js +0 -84
- package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
- package/dist/pipeline/steps/generate-configs-step.js +0 -98
- package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
- package/dist/pipeline/steps/grader-consistency-step.js +0 -74
- package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
- package/dist/pipeline/steps/publish-report-step.js +0 -243
- package/dist/pipeline/steps/report-step.d.ts +0 -13
- package/dist/pipeline/steps/report-step.js +0 -56
- package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
- package/dist/pipeline/steps/update-scores-step.js +0 -42
- package/dist/scripts/agent-behavior-report.d.ts +0 -19
- package/dist/scripts/agent-behavior-report.js +0 -315
- package/dist/scripts/baseline.d.ts +0 -43
- package/dist/scripts/baseline.js +0 -267
- package/dist/scripts/calculate-scores.d.ts +0 -166
- package/dist/scripts/calculate-scores.js +0 -1296
- package/dist/scripts/compare.d.ts +0 -22
- package/dist/scripts/compare.js +0 -334
- package/dist/scripts/coverage-audit.d.ts +0 -44
- package/dist/scripts/coverage-audit.js +0 -209
- package/dist/scripts/debug-eval.d.ts +0 -19
- package/dist/scripts/debug-eval.js +0 -73
- package/dist/scripts/discovery-report.d.ts +0 -58
- package/dist/scripts/discovery-report.js +0 -250
- package/dist/scripts/fetch-docs.d.ts +0 -35
- package/dist/scripts/fetch-docs.js +0 -472
- package/dist/scripts/generate-configs.d.ts +0 -66
- package/dist/scripts/generate-configs.js +0 -459
- package/dist/scripts/grader-api.d.ts +0 -27
- package/dist/scripts/grader-api.js +0 -206
- package/dist/scripts/grader-compare.d.ts +0 -22
- package/dist/scripts/grader-compare.js +0 -368
- package/dist/scripts/grader-consistency.d.ts +0 -20
- package/dist/scripts/grader-consistency.js +0 -313
- package/dist/scripts/grader-sensitivity.d.ts +0 -22
- package/dist/scripts/grader-sensitivity.js +0 -354
- package/dist/scripts/grader-validate.d.ts +0 -19
- package/dist/scripts/grader-validate.js +0 -267
- package/dist/scripts/measure-retrieval.d.ts +0 -10
- package/dist/scripts/measure-retrieval.js +0 -145
- package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
- package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
- package/dist/scripts/pipeline.d.ts +0 -76
- package/dist/scripts/pipeline.js +0 -1031
- package/dist/scripts/pr-comment.d.ts +0 -10
- package/dist/scripts/pr-comment.js +0 -510
- package/dist/scripts/readiness-report.d.ts +0 -88
- package/dist/scripts/readiness-report.js +0 -342
- package/dist/scripts/update-quality-scores.d.ts +0 -15
- package/dist/scripts/update-quality-scores.js +0 -184
- package/dist/scripts/validate-task-sources.d.ts +0 -21
- package/dist/scripts/validate-task-sources.js +0 -210
- package/dist/scripts/validate.d.ts +0 -13
- package/dist/scripts/validate.js +0 -79
- package/dist/scripts/webhook-server.d.ts +0 -26
- package/dist/scripts/webhook-server.js +0 -147
- package/dist/scripts/weekly-digest.d.ts +0 -24
- package/dist/scripts/weekly-digest.js +0 -144
- package/dist/sinks/format-slack.d.ts +0 -64
- package/dist/sinks/format-slack.js +0 -306
- package/dist/sinks/slack-sink.d.ts +0 -27
- package/dist/sinks/slack-sink.js +0 -78
- package/dist/sinks/webhook-sink.d.ts +0 -19
- package/dist/sinks/webhook-sink.js +0 -50
- package/tasks/.expanded.agentic.yaml +0 -280
- package/tasks/.expanded.yaml +0 -565
|
@@ -20,6 +20,8 @@ import { loadSource } from "../../sources.js";
|
|
|
20
20
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
21
21
|
export class GenerateConfigsStep {
|
|
22
22
|
name = "generate-configs";
|
|
23
|
+
/** Task IDs from the last loadTasks call (pre-filter), for error messages. */
|
|
24
|
+
lastLoadedTaskIds = [];
|
|
23
25
|
check(ctx) {
|
|
24
26
|
const issues = validateModelsYaml(ctx.config.rootDir);
|
|
25
27
|
return issues.filter((i) => i.severity === "error");
|
|
@@ -54,10 +56,10 @@ export class GenerateConfigsStep {
|
|
|
54
56
|
// Load tasks
|
|
55
57
|
const tasks = await this.loadTasks(ctx, mode, state);
|
|
56
58
|
if (tasks.length === 0) {
|
|
59
|
+
const error = this.buildNoTasksError(ctx, mode);
|
|
57
60
|
return {
|
|
58
61
|
durationMs: Date.now() - start,
|
|
59
|
-
error
|
|
60
|
-
`packages/eval/tasks/${mode}/`,
|
|
62
|
+
error,
|
|
61
63
|
status: "failed",
|
|
62
64
|
};
|
|
63
65
|
}
|
|
@@ -249,6 +251,10 @@ export class GenerateConfigsStep {
|
|
|
249
251
|
return filtered;
|
|
250
252
|
}
|
|
251
253
|
applyFilters(ctx, tasks) {
|
|
254
|
+
// Capture pre-filter IDs for diagnostic messages
|
|
255
|
+
this.lastLoadedTaskIds = tasks
|
|
256
|
+
.map((t) => t.id)
|
|
257
|
+
.filter((id) => !!id);
|
|
252
258
|
let result = tasks;
|
|
253
259
|
if (ctx.config.areas?.length) {
|
|
254
260
|
const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
|
|
@@ -273,6 +279,33 @@ export class GenerateConfigsStep {
|
|
|
273
279
|
}
|
|
274
280
|
return result;
|
|
275
281
|
}
|
|
282
|
+
/**
|
|
283
|
+
* Build a descriptive error message when no tasks match the current filters.
|
|
284
|
+
* Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
|
|
285
|
+
*/
|
|
286
|
+
buildNoTasksError(ctx, mode) {
|
|
287
|
+
const filters = [];
|
|
288
|
+
if (ctx.config.tasks?.length) {
|
|
289
|
+
filters.push(`--task ${ctx.config.tasks.join(", ")}`);
|
|
290
|
+
}
|
|
291
|
+
if (ctx.config.areas?.length) {
|
|
292
|
+
filters.push(`--area ${ctx.config.areas.join(", ")}`);
|
|
293
|
+
}
|
|
294
|
+
if (ctx.config.tags?.length) {
|
|
295
|
+
filters.push(`--tag ${ctx.config.tags.join(", ")}`);
|
|
296
|
+
}
|
|
297
|
+
if (filters.length > 0) {
|
|
298
|
+
// Collect available task IDs for the hint
|
|
299
|
+
const availableIds = this.lastLoadedTaskIds ?? [];
|
|
300
|
+
const hint = availableIds.length > 0
|
|
301
|
+
? `\n Available ${mode} task IDs: ${availableIds.join(", ")}`
|
|
302
|
+
: "";
|
|
303
|
+
return (`No ${mode} tasks match the current filters (${filters.join("; ")}).` +
|
|
304
|
+
hint);
|
|
305
|
+
}
|
|
306
|
+
return (`No ${mode} tasks found. Create *.task.ts files in ` +
|
|
307
|
+
`packages/eval/tasks/${mode}/`);
|
|
308
|
+
}
|
|
276
309
|
// ---------------------------------------------------------------------------
|
|
277
310
|
// Compilation helpers
|
|
278
311
|
// ---------------------------------------------------------------------------
|
|
@@ -87,15 +87,23 @@ describe("validateAgentHarnessTask", () => {
|
|
|
87
87
|
// compileAgentHarnessTask — provider assembly
|
|
88
88
|
// ---------------------------------------------------------------------------
|
|
89
89
|
describe("compileAgentHarnessTask — providers", () => {
|
|
90
|
-
it("produces a provider", () => {
|
|
90
|
+
it("produces a Claude Agent SDK provider", () => {
|
|
91
91
|
const result = compileAgentHarnessTask(makeTask());
|
|
92
92
|
assert.ok(result.providers.length > 0);
|
|
93
|
-
assert.
|
|
93
|
+
assert.equal(result.providers[0].id, "anthropic:claude-agent-sdk");
|
|
94
94
|
});
|
|
95
|
-
it("
|
|
95
|
+
it("sets default agent config", () => {
|
|
96
|
+
const result = compileAgentHarnessTask(makeTask());
|
|
97
|
+
const config = result.providers[0].config;
|
|
98
|
+
assert.ok(config.model, "should set a model");
|
|
99
|
+
assert.ok(config.max_turns, "should set max_turns");
|
|
100
|
+
assert.ok(config.max_budget_usd, "should set budget cap");
|
|
101
|
+
assert.equal(config.permission_mode, "bypassPermissions");
|
|
102
|
+
});
|
|
103
|
+
it("resolves coding tool preset into custom_allowed_tools", () => {
|
|
96
104
|
const result = compileAgentHarnessTask(makeTask({ tools: ["coding"] }));
|
|
97
105
|
const config = result.providers[0].config;
|
|
98
|
-
const tools = config.
|
|
106
|
+
const tools = config.custom_allowed_tools;
|
|
99
107
|
assert.ok(tools.includes("Bash"));
|
|
100
108
|
assert.ok(tools.includes("Read"));
|
|
101
109
|
assert.ok(tools.includes("Write"));
|
|
@@ -104,7 +112,7 @@ describe("compileAgentHarnessTask — providers", () => {
|
|
|
104
112
|
it("resolves read-only tool preset", () => {
|
|
105
113
|
const result = compileAgentHarnessTask(makeTask({ tools: ["read-only"] }));
|
|
106
114
|
const config = result.providers[0].config;
|
|
107
|
-
const tools = config.
|
|
115
|
+
const tools = config.custom_allowed_tools;
|
|
108
116
|
assert.ok(tools.includes("Read"));
|
|
109
117
|
assert.ok(tools.includes("Grep"));
|
|
110
118
|
assert.ok(!tools.includes("Write"), "read-only should not include Write");
|
|
@@ -112,19 +120,10 @@ describe("compileAgentHarnessTask — providers", () => {
|
|
|
112
120
|
it("mixes preset and explicit tools", () => {
|
|
113
121
|
const result = compileAgentHarnessTask(makeTask({ tools: ["read-only", "WebFetch"] }));
|
|
114
122
|
const config = result.providers[0].config;
|
|
115
|
-
const tools = config.
|
|
123
|
+
const tools = config.custom_allowed_tools;
|
|
116
124
|
assert.ok(tools.includes("Read"));
|
|
117
125
|
assert.ok(tools.includes("WebFetch"));
|
|
118
126
|
});
|
|
119
|
-
it("includes sandbox config in provider", () => {
|
|
120
|
-
const result = compileAgentHarnessTask(makeTask({
|
|
121
|
-
sandbox: { type: "docker", image: "node:22-slim" },
|
|
122
|
-
}));
|
|
123
|
-
const config = result.providers[0].config;
|
|
124
|
-
const sandbox = config.sandbox;
|
|
125
|
-
assert.equal(sandbox.type, "docker");
|
|
126
|
-
assert.equal(sandbox.image, "node:22-slim");
|
|
127
|
-
});
|
|
128
127
|
});
|
|
129
128
|
// ---------------------------------------------------------------------------
|
|
130
129
|
// compileAgentHarnessTask — test cases
|
|
@@ -166,16 +165,20 @@ describe("compileAgentHarnessTask — test cases", () => {
|
|
|
166
165
|
// compileAgentHarnessTask — assertions
|
|
167
166
|
// ---------------------------------------------------------------------------
|
|
168
167
|
describe("compileAgentHarnessTask — assertions", () => {
|
|
169
|
-
|
|
168
|
+
const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
|
|
169
|
+
it("maps file-exists to file-based javascript assertion", () => {
|
|
170
170
|
const result = compileAgentHarnessTask(makeTask({
|
|
171
171
|
assertions: [{ type: "file-exists", value: "sanity.config.ts" }],
|
|
172
172
|
}));
|
|
173
173
|
const assertion = result.tests[0].assert?.[0];
|
|
174
174
|
assert.ok(assertion);
|
|
175
175
|
assert.equal(assertion.type, "javascript");
|
|
176
|
-
assert.
|
|
176
|
+
assert.equal(assertion.value, `${RUNTIME}:fileExists`);
|
|
177
|
+
assert.deepEqual(assertion.config, {
|
|
178
|
+
filePath: "sanity.config.ts",
|
|
179
|
+
});
|
|
177
180
|
});
|
|
178
|
-
it("maps file-contains to javascript assertion", () => {
|
|
181
|
+
it("maps file-contains to file-based javascript assertion", () => {
|
|
179
182
|
const result = compileAgentHarnessTask(makeTask({
|
|
180
183
|
assertions: [
|
|
181
184
|
{
|
|
@@ -187,25 +190,35 @@ describe("compileAgentHarnessTask — assertions", () => {
|
|
|
187
190
|
const assertion = result.tests[0].assert?.[0];
|
|
188
191
|
assert.ok(assertion);
|
|
189
192
|
assert.equal(assertion.type, "javascript");
|
|
190
|
-
assert.
|
|
193
|
+
assert.equal(assertion.value, `${RUNTIME}:fileContains`);
|
|
194
|
+
assert.deepEqual(assertion.config, {
|
|
195
|
+
filePath: "config.ts",
|
|
196
|
+
content: "projectId",
|
|
197
|
+
});
|
|
191
198
|
});
|
|
192
|
-
it("maps command-succeeds to javascript assertion", () => {
|
|
199
|
+
it("maps command-succeeds to file-based javascript assertion", () => {
|
|
193
200
|
const result = compileAgentHarnessTask(makeTask({
|
|
194
201
|
assertions: [{ type: "command-succeeds", value: "npx tsc --noEmit" }],
|
|
195
202
|
}));
|
|
196
203
|
const assertion = result.tests[0].assert?.[0];
|
|
197
204
|
assert.ok(assertion);
|
|
198
205
|
assert.equal(assertion.type, "javascript");
|
|
199
|
-
assert.
|
|
206
|
+
assert.equal(assertion.value, `${RUNTIME}:commandSucceeds`);
|
|
207
|
+
assert.deepEqual(assertion.config, {
|
|
208
|
+
command: "npx tsc --noEmit",
|
|
209
|
+
});
|
|
200
210
|
});
|
|
201
|
-
it("maps diff-matches to javascript assertion", () => {
|
|
211
|
+
it("maps diff-matches to file-based javascript assertion", () => {
|
|
202
212
|
const result = compileAgentHarnessTask(makeTask({
|
|
203
213
|
assertions: [{ type: "diff-matches", value: "createClient" }],
|
|
204
214
|
}));
|
|
205
215
|
const assertion = result.tests[0].assert?.[0];
|
|
206
216
|
assert.ok(assertion);
|
|
207
217
|
assert.equal(assertion.type, "javascript");
|
|
208
|
-
assert.
|
|
218
|
+
assert.equal(assertion.value, `${RUNTIME}:diffMatches`);
|
|
219
|
+
assert.deepEqual(assertion.config, {
|
|
220
|
+
expected: "createClient",
|
|
221
|
+
});
|
|
209
222
|
});
|
|
210
223
|
it("passes through standard assertions", () => {
|
|
211
224
|
const result = compileAgentHarnessTask(makeTask({
|
|
@@ -250,7 +263,7 @@ describe("compileAgentHarnessTask — lifecycle", () => {
|
|
|
250
263
|
}));
|
|
251
264
|
assert.equal(result.sandboxConfig.type, "docker");
|
|
252
265
|
assert.equal(result.sandboxConfig.image, "node:22");
|
|
253
|
-
assert.deepEqual(result.sandboxConfig.fixtures, ["
|
|
266
|
+
assert.deepEqual(result.sandboxConfig.fixtures, ["schema.ts"]);
|
|
254
267
|
assert.equal(result.sandboxConfig.limits?.cpus, 2);
|
|
255
268
|
assert.equal(result.sandboxConfig.limits?.networkAccess, false);
|
|
256
269
|
});
|
|
@@ -278,7 +291,8 @@ describe("example agent harness tasks — end-to-end", () => {
|
|
|
278
291
|
const result = compileAgentHarnessTask(modifyCodeTask);
|
|
279
292
|
assert.ok(result.tests[0].assert);
|
|
280
293
|
assert.ok(result.tests[0].assert.some((a) => a.type === "javascript" &&
|
|
281
|
-
a.value.includes("
|
|
294
|
+
a.value.includes("fileContains") &&
|
|
295
|
+
a.config != null));
|
|
282
296
|
});
|
|
283
297
|
it("refactor task has docker sandbox config", () => {
|
|
284
298
|
const result = compileAgentHarnessTask(multiFileRefactorTask);
|
|
@@ -65,12 +65,38 @@ export function writeCompiledModeConfig(result, mode, options) {
|
|
|
65
65
|
if (options.graderProvider) {
|
|
66
66
|
graderOpts.provider = options.graderProvider;
|
|
67
67
|
}
|
|
68
|
-
//
|
|
68
|
+
// For agent-harness mode, create sandbox directories and inject working_dir
|
|
69
|
+
// into provider configs. The sandbox must exist before the provider initializes
|
|
70
|
+
// (the Claude Agent SDK reads working_dir at construction time).
|
|
71
|
+
// Both working_dir and __workingDir use absolute paths to avoid ambiguity.
|
|
72
|
+
// @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
|
|
73
|
+
const sandboxAbsPath = result.extras?.sandboxConfig
|
|
74
|
+
? resolve(options.rootDir, `results/latest/sandbox-${mode}`)
|
|
75
|
+
: undefined;
|
|
76
|
+
if (sandboxAbsPath) {
|
|
77
|
+
mkdirSync(sandboxAbsPath, { recursive: true });
|
|
78
|
+
}
|
|
79
|
+
// Build provider entries, injecting working_dir for agent-harness providers
|
|
69
80
|
const providerEntries = result.providers.map((p) => {
|
|
70
|
-
if (p.config)
|
|
71
|
-
return { id: p.id, label: p.label
|
|
72
|
-
|
|
81
|
+
if (!p.config)
|
|
82
|
+
return p.label ? { id: p.id, label: p.label } : p.id;
|
|
83
|
+
const config = { ...p.config };
|
|
84
|
+
if (sandboxAbsPath && p.id === "anthropic:claude-agent-sdk") {
|
|
85
|
+
config.working_dir = sandboxAbsPath;
|
|
86
|
+
}
|
|
87
|
+
return { id: p.id, label: p.label, config };
|
|
73
88
|
});
|
|
89
|
+
// Inject __workingDir into test vars so assertions can find the sandbox
|
|
90
|
+
if (sandboxAbsPath) {
|
|
91
|
+
for (const test of expandedTests) {
|
|
92
|
+
if (test.vars) {
|
|
93
|
+
;
|
|
94
|
+
test.vars.__workingDir = sandboxAbsPath;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
// Re-write the tests file with the injected paths
|
|
98
|
+
writeFileSync(testsPath, JSON.stringify(expandedTests, null, 2), "utf-8");
|
|
99
|
+
}
|
|
74
100
|
// Build prompt entries
|
|
75
101
|
const prompts = result.prompts.map((p) => ({
|
|
76
102
|
id: p.id,
|
|
@@ -88,10 +114,11 @@ export function writeCompiledModeConfig(result, mode, options) {
|
|
|
88
114
|
tests: [testsFilename],
|
|
89
115
|
});
|
|
90
116
|
// Include extensions if present (agent-harness mode)
|
|
117
|
+
// Promptfoo expects extensions as string[] (file paths to JS modules),
|
|
118
|
+
// so we materialize the { type, code } objects as a .cjs file on disk.
|
|
91
119
|
if (result.extras?.extensions) {
|
|
92
|
-
;
|
|
93
|
-
config.extensions =
|
|
94
|
-
result.extras.extensions;
|
|
120
|
+
const extPaths = writeExtensionFile(options.rootDir, mode, result.extras.extensions);
|
|
121
|
+
config.extensions = extPaths;
|
|
95
122
|
}
|
|
96
123
|
writeConfig(options.rootDir, filename, config, options.logger);
|
|
97
124
|
}
|
|
@@ -215,3 +242,47 @@ function writeYaml(path, data, header) {
|
|
|
215
242
|
});
|
|
216
243
|
writeFileSync(path, `${header}\n${yamlStr}`, "utf-8");
|
|
217
244
|
}
|
|
245
|
+
/**
|
|
246
|
+
* Materialize Promptfoo lifecycle extensions as a .cjs file on disk.
|
|
247
|
+
*
|
|
248
|
+
* Promptfoo extensions use a single-function dispatch pattern:
|
|
249
|
+
* module.exports = async function(hookName, context) { ... }
|
|
250
|
+
*
|
|
251
|
+
* Each extension entry in the YAML references:
|
|
252
|
+
* file://path/to/file.cjs:exportedFunctionName
|
|
253
|
+
*
|
|
254
|
+
* @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
|
|
255
|
+
*/
|
|
256
|
+
function writeExtensionFile(rootDir, mode, extensions) {
|
|
257
|
+
// Build a dispatch map: hookName → handler code
|
|
258
|
+
const hookMap = {};
|
|
259
|
+
for (const ext of extensions) {
|
|
260
|
+
hookMap[ext.type] = ext.code;
|
|
261
|
+
}
|
|
262
|
+
// Generate the single dispatch function that promptfoo expects
|
|
263
|
+
const hookCases = Object.entries(hookMap)
|
|
264
|
+
.map(([hookName, code]) => ` if (hookName === '${hookName}') {\n` +
|
|
265
|
+
` const handler = ${code};\n` +
|
|
266
|
+
` return handler(context);\n` +
|
|
267
|
+
` }`)
|
|
268
|
+
.join("\n");
|
|
269
|
+
const fileContent = [
|
|
270
|
+
"// AUTO-GENERATED by compiler pipeline — do not edit directly.",
|
|
271
|
+
"// Run: npx @sanity/ailf generate-configs",
|
|
272
|
+
"//",
|
|
273
|
+
"// Promptfoo extension dispatch function.",
|
|
274
|
+
`// @see https://www.promptfoo.dev/docs/configuration/reference/`,
|
|
275
|
+
"",
|
|
276
|
+
"async function extensionHook(hookName, context) {",
|
|
277
|
+
hookCases,
|
|
278
|
+
"}",
|
|
279
|
+
"",
|
|
280
|
+
"module.exports = extensionHook;",
|
|
281
|
+
"",
|
|
282
|
+
].join("\n");
|
|
283
|
+
const filename = `results/latest/${mode}-extensions.cjs`;
|
|
284
|
+
const outPath = resolve(rootDir, filename);
|
|
285
|
+
writeFileSync(outPath, fileContent, "utf-8");
|
|
286
|
+
// Single entry pointing to the dispatch function
|
|
287
|
+
return [`file://${filename}:extensionHook`];
|
|
288
|
+
}
|
|
@@ -4,6 +4,15 @@
|
|
|
4
4
|
* Handles agent-specific assertion types (file-exists, file-contains,
|
|
5
5
|
* command-succeeds, diff-matches) as well as standard pass-through
|
|
6
6
|
* assertion types.
|
|
7
|
+
*
|
|
8
|
+
* Agent-specific assertions use file-based references to the assertions
|
|
9
|
+
* runtime module (dist/agent-harness/assertions-runtime.js) because
|
|
10
|
+
* promptfoo's inline `type: javascript` assertions run in a restricted
|
|
11
|
+
* eval() sandbox where require() is unavailable. File-based assertions
|
|
12
|
+
* run in a full Node.js context.
|
|
13
|
+
*
|
|
14
|
+
* @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
|
|
15
|
+
* @see src/agent-harness/assertions-runtime.ts — runtime implementations
|
|
7
16
|
*/
|
|
8
17
|
import type { PromptfooAssertion } from "../../assertion-mapper.js";
|
|
9
18
|
import type { AgentHarnessCompileOptions } from "./types.js";
|
|
@@ -4,7 +4,18 @@
|
|
|
4
4
|
* Handles agent-specific assertion types (file-exists, file-contains,
|
|
5
5
|
* command-succeeds, diff-matches) as well as standard pass-through
|
|
6
6
|
* assertion types.
|
|
7
|
+
*
|
|
8
|
+
* Agent-specific assertions use file-based references to the assertions
|
|
9
|
+
* runtime module (dist/agent-harness/assertions-runtime.js) because
|
|
10
|
+
* promptfoo's inline `type: javascript` assertions run in a restricted
|
|
11
|
+
* eval() sandbox where require() is unavailable. File-based assertions
|
|
12
|
+
* run in a full Node.js context.
|
|
13
|
+
*
|
|
14
|
+
* @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
|
|
15
|
+
* @see src/agent-harness/assertions-runtime.ts — runtime implementations
|
|
7
16
|
*/
|
|
17
|
+
/** Base path for the file-based assertion runtime module */
|
|
18
|
+
const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
|
|
8
19
|
// ---------------------------------------------------------------------------
|
|
9
20
|
// Assertion mapping
|
|
10
21
|
// ---------------------------------------------------------------------------
|
|
@@ -53,66 +64,29 @@ export function mapAgentAssertion(assertion, options, warnings) {
|
|
|
53
64
|
}
|
|
54
65
|
// ---------------------------------------------------------------------------
|
|
55
66
|
// Agent-specific assertion builders
|
|
67
|
+
//
|
|
68
|
+
// Each builder returns a file-based assertion referencing the runtime
|
|
69
|
+
// module with parameters passed via the `config` field.
|
|
56
70
|
// ---------------------------------------------------------------------------
|
|
57
71
|
export function buildFileExistsAssertion(assertion) {
|
|
58
|
-
const filePath = String(assertion.value ?? "");
|
|
59
|
-
// Use JSON.stringify for all interpolated values in generated JS to
|
|
60
|
-
// prevent broken strings from filePaths containing quotes/backslashes
|
|
61
|
-
const safeFilePath = JSON.stringify(filePath);
|
|
62
72
|
return {
|
|
63
73
|
type: "javascript",
|
|
64
|
-
value:
|
|
65
|
-
|
|
66
|
-
` const fs = require('fs');\n` +
|
|
67
|
-
` const path = require('path');\n` +
|
|
68
|
-
` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
|
|
69
|
-
` const target = path.resolve(workDir, ${safeFilePath});\n` +
|
|
70
|
-
` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
|
|
71
|
-
` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
|
|
72
|
-
` }\n` +
|
|
73
|
-
` const exists = fs.existsSync(target);\n` +
|
|
74
|
-
` return {\n` +
|
|
75
|
-
` pass: exists,\n` +
|
|
76
|
-
` score: exists ? 1 : 0,\n` +
|
|
77
|
-
` reason: exists\n` +
|
|
78
|
-
` ? 'File exists: ' + ${safeFilePath}\n` +
|
|
79
|
-
` : 'Expected file not found: ' + ${safeFilePath},\n` +
|
|
80
|
-
` };\n` +
|
|
81
|
-
`})()`,
|
|
74
|
+
value: `${RUNTIME}:fileExists`,
|
|
75
|
+
config: { filePath: String(assertion.value ?? "") },
|
|
82
76
|
...(typeof assertion.weight === "number"
|
|
83
77
|
? { weight: assertion.weight }
|
|
84
78
|
: {}),
|
|
85
79
|
};
|
|
86
80
|
}
|
|
87
81
|
export function buildFileContainsAssertion(assertion) {
|
|
88
|
-
const
|
|
89
|
-
const filePath = config?.path ?? "";
|
|
90
|
-
const expectedContent = config?.content ?? "";
|
|
91
|
-
const safeFilePath = JSON.stringify(filePath);
|
|
82
|
+
const val = assertion.value;
|
|
92
83
|
return {
|
|
93
84
|
type: "javascript",
|
|
94
|
-
value:
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
` const target = path.resolve(workDir, ${safeFilePath});\n` +
|
|
100
|
-
` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
|
|
101
|
-
` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
|
|
102
|
-
` }\n` +
|
|
103
|
-
` if (!fs.existsSync(target)) {\n` +
|
|
104
|
-
` return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
|
|
105
|
-
` }\n` +
|
|
106
|
-
` const content = fs.readFileSync(target, 'utf-8');\n` +
|
|
107
|
-
` const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
|
|
108
|
-
` return {\n` +
|
|
109
|
-
` pass: contains,\n` +
|
|
110
|
-
` score: contains ? 1 : 0,\n` +
|
|
111
|
-
` reason: contains\n` +
|
|
112
|
-
` ? 'File contains expected content'\n` +
|
|
113
|
-
` : 'File does not contain expected content',\n` +
|
|
114
|
-
` };\n` +
|
|
115
|
-
`})()`,
|
|
85
|
+
value: `${RUNTIME}:fileContains`,
|
|
86
|
+
config: {
|
|
87
|
+
filePath: val?.path ?? "",
|
|
88
|
+
content: val?.content ?? "",
|
|
89
|
+
},
|
|
116
90
|
...(typeof assertion.weight === "number"
|
|
117
91
|
? { weight: assertion.weight }
|
|
118
92
|
: {}),
|
|
@@ -133,53 +107,22 @@ export function buildFileContainsAssertion(assertion) {
|
|
|
133
107
|
* from untrusted sources, validate commands against an allowlist first.
|
|
134
108
|
*/
|
|
135
109
|
export function buildCommandSucceedsAssertion(assertion) {
|
|
136
|
-
const command = String(assertion.value ?? "");
|
|
137
110
|
return {
|
|
138
111
|
type: "javascript",
|
|
139
|
-
value:
|
|
140
|
-
|
|
141
|
-
` const { execSync } = require('child_process');\n` +
|
|
142
|
-
` const workDir = context.vars.__workingDir || '.';\n` +
|
|
143
|
-
` try {\n` +
|
|
144
|
-
` execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
|
|
145
|
-
` return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
|
|
146
|
-
` } catch (err) {\n` +
|
|
147
|
-
` return {\n` +
|
|
148
|
-
` pass: false,\n` +
|
|
149
|
-
` score: 0,\n` +
|
|
150
|
-
` reason: 'Command failed: ' + (err.message || err),\n` +
|
|
151
|
-
` };\n` +
|
|
152
|
-
` }\n` +
|
|
153
|
-
`})()`,
|
|
112
|
+
value: `${RUNTIME}:commandSucceeds`,
|
|
113
|
+
config: { command: String(assertion.value ?? "") },
|
|
154
114
|
...(typeof assertion.weight === "number"
|
|
155
115
|
? { weight: assertion.weight }
|
|
156
116
|
: {}),
|
|
157
117
|
};
|
|
158
118
|
}
|
|
159
119
|
export function buildDiffMatchesAssertion(assertion) {
|
|
160
|
-
const expected = assertion.value;
|
|
161
120
|
return {
|
|
162
121
|
type: "javascript",
|
|
163
|
-
value:
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
` try {\n` +
|
|
168
|
-
` const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
|
|
169
|
-
` const expected = ${JSON.stringify(expected)};\n` +
|
|
170
|
-
` if (typeof expected === 'string') {\n` +
|
|
171
|
-
` const contains = diff.includes(expected);\n` +
|
|
172
|
-
` return {\n` +
|
|
173
|
-
` pass: contains,\n` +
|
|
174
|
-
` score: contains ? 1 : 0,\n` +
|
|
175
|
-
` reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
|
|
176
|
-
` };\n` +
|
|
177
|
-
` }\n` +
|
|
178
|
-
` return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
|
|
179
|
-
` } catch (err) {\n` +
|
|
180
|
-
` return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
|
|
181
|
-
` }\n` +
|
|
182
|
-
`})()`,
|
|
122
|
+
value: `${RUNTIME}:diffMatches`,
|
|
123
|
+
config: {
|
|
124
|
+
...(assertion.value != null ? { expected: assertion.value } : {}),
|
|
125
|
+
},
|
|
183
126
|
...(typeof assertion.weight === "number"
|
|
184
127
|
? { weight: assertion.weight }
|
|
185
128
|
: {}),
|
|
@@ -27,8 +27,10 @@ export function compileAgentHarnessTask(task, options) {
|
|
|
27
27
|
const prompts = buildAgentPrompts(task);
|
|
28
28
|
// Build test cases
|
|
29
29
|
const tests = buildAgentTestCases(task, options, warnings);
|
|
30
|
-
// Build sandbox extensions
|
|
31
|
-
|
|
30
|
+
// Build sandbox extensions — resolve fixture paths at compile time using
|
|
31
|
+
// the caller's cwd (monorepo root), not the eval package rootDir.
|
|
32
|
+
const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
|
|
33
|
+
const sandboxConfig = buildSandboxConfig(task, callerCwd);
|
|
32
34
|
const extensions = buildLifecycleExtensions(task, sandboxConfig);
|
|
33
35
|
return { providers, tests, prompts, extensions, sandboxConfig, warnings };
|
|
34
36
|
}
|
|
@@ -36,22 +38,25 @@ export function compileAgentHarnessTask(task, options) {
|
|
|
36
38
|
// Provider assembly
|
|
37
39
|
// ---------------------------------------------------------------------------
|
|
38
40
|
export function buildAgentProvider(task, _warnings) {
|
|
39
|
-
// Resolve tool permissions
|
|
40
41
|
const tools = resolveToolPermissions(task.tools);
|
|
41
|
-
|
|
42
|
+
// Claude Agent SDK config.
|
|
43
|
+
// working_dir is set by the YAML writer to the sandbox path it creates.
|
|
44
|
+
// @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
|
|
45
|
+
const config = {
|
|
46
|
+
model: "claude-sonnet-4-20250514",
|
|
47
|
+
max_turns: 25,
|
|
48
|
+
max_budget_usd: 1.0,
|
|
49
|
+
permission_mode: "bypassPermissions",
|
|
50
|
+
allow_dangerously_skip_permissions: true,
|
|
51
|
+
};
|
|
52
|
+
// Map AILF tool names to Claude Agent SDK tool config.
|
|
53
|
+
// Claude SDK uses custom_allowed_tools to replace defaults.
|
|
42
54
|
if (tools.length > 0) {
|
|
43
|
-
config.
|
|
44
|
-
}
|
|
45
|
-
if (task.sandbox) {
|
|
46
|
-
config.sandbox = {
|
|
47
|
-
type: task.sandbox.type,
|
|
48
|
-
...(task.sandbox.image ? { image: task.sandbox.image } : {}),
|
|
49
|
-
};
|
|
55
|
+
config.custom_allowed_tools = tools;
|
|
50
56
|
}
|
|
51
|
-
// Default to Claude Agent SDK provider
|
|
52
57
|
return [
|
|
53
58
|
{
|
|
54
|
-
id:
|
|
59
|
+
id: "anthropic:claude-agent-sdk",
|
|
55
60
|
label: `Agent Harness: ${task.title}`,
|
|
56
61
|
config,
|
|
57
62
|
},
|
|
@@ -112,9 +117,11 @@ export function buildAgentTestCases(task, options, warnings) {
|
|
|
112
117
|
const vars = {
|
|
113
118
|
task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
|
|
114
119
|
...(task.prompt?.vars ?? {}),
|
|
115
|
-
// Internal metadata for sandbox lifecycle hooks
|
|
120
|
+
// Internal metadata for sandbox lifecycle hooks.
|
|
121
|
+
// Fixture paths are plain strings (no file:// prefix) because
|
|
122
|
+
// promptfoo auto-resolves file:// in vars by reading file content.
|
|
116
123
|
__sandboxType: task.sandbox?.type ?? "tempdir",
|
|
117
|
-
__fixtures: task.fixtures ?? [],
|
|
124
|
+
__fixtures: (task.fixtures ?? []).map((f) => f.startsWith("file://") ? f.slice(7) : f),
|
|
118
125
|
};
|
|
119
126
|
const tests = [
|
|
120
127
|
{
|
|
@@ -6,7 +6,14 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
|
|
8
8
|
import type { PromptfooExtension, SandboxConfigMeta } from "./types.js";
|
|
9
|
-
|
|
9
|
+
/**
|
|
10
|
+
* Build sandbox configuration from a task definition.
|
|
11
|
+
*
|
|
12
|
+
* Fixture paths are resolved to absolute at compile time using callerCwd
|
|
13
|
+
* (the directory the pipeline was invoked from), because promptfoo runs
|
|
14
|
+
* with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
|
|
15
|
+
*/
|
|
16
|
+
export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition, callerCwd?: string): SandboxConfigMeta;
|
|
10
17
|
export declare function buildLifecycleExtensions(task: AgentHarnessTaskDefinition, sandboxConfig: SandboxConfigMeta): PromptfooExtension[];
|
|
11
18
|
export declare function buildBeforeEachHook(taskId: string, config: SandboxConfigMeta): string;
|
|
12
19
|
export declare function buildAfterEachHook(taskId: string): string;
|
|
@@ -4,14 +4,26 @@
|
|
|
4
4
|
* Builds Promptfoo beforeEach/afterEach hooks for provisioning and
|
|
5
5
|
* tearing down sandbox working directories.
|
|
6
6
|
*/
|
|
7
|
+
import { resolve } from "path";
|
|
7
8
|
// ---------------------------------------------------------------------------
|
|
8
9
|
// Sandbox configuration
|
|
9
10
|
// ---------------------------------------------------------------------------
|
|
10
|
-
|
|
11
|
+
/**
|
|
12
|
+
* Build sandbox configuration from a task definition.
|
|
13
|
+
*
|
|
14
|
+
* Fixture paths are resolved to absolute at compile time using callerCwd
|
|
15
|
+
* (the directory the pipeline was invoked from), because promptfoo runs
|
|
16
|
+
* with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
|
|
17
|
+
*/
|
|
18
|
+
export function buildSandboxConfig(task, callerCwd) {
|
|
19
|
+
const cwd = callerCwd ?? process.cwd();
|
|
11
20
|
return {
|
|
12
21
|
type: task.sandbox?.type ?? "tempdir",
|
|
13
22
|
image: task.sandbox?.image,
|
|
14
|
-
fixtures: task.fixtures ?? []
|
|
23
|
+
fixtures: (task.fixtures ?? []).map((f) => {
|
|
24
|
+
const stripped = f.startsWith("file://") ? f.slice(7) : f;
|
|
25
|
+
return resolve(cwd, stripped);
|
|
26
|
+
}),
|
|
15
27
|
limits: task.sandbox?.limits
|
|
16
28
|
? {
|
|
17
29
|
cpus: task.sandbox.limits.cpus,
|
|
@@ -39,23 +51,41 @@ export function buildLifecycleExtensions(task, sandboxConfig) {
|
|
|
39
51
|
return extensions;
|
|
40
52
|
}
|
|
41
53
|
export function buildBeforeEachHook(taskId, config) {
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
// Promptfoo extension hooks receive (hookName, context).
|
|
55
|
+
// beforeEach context is { test } — vars live at context.test.vars.
|
|
56
|
+
// Must return context for mutations to persist.
|
|
57
|
+
//
|
|
58
|
+
// The sandbox directory is created by the YAML writer at config-gen time
|
|
59
|
+
// (deterministic path in results/latest/sandbox-{taskId}/) so it exists
|
|
60
|
+
// before the provider is initialized. This hook copies fixtures into it.
|
|
61
|
+
//
|
|
62
|
+
// @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
|
|
63
|
+
return (`// beforeEach: copy fixtures into sandbox for ${taskId}\n` +
|
|
64
|
+
`async function(context) {\n` +
|
|
65
|
+
` const { cpSync, existsSync, mkdirSync } = require('fs');\n` +
|
|
46
66
|
` const { resolve } = require('path');\n` +
|
|
47
|
-
` const
|
|
48
|
-
`
|
|
67
|
+
` const workDir = context.test.vars?.__workingDir;\n` +
|
|
68
|
+
` if (!workDir) return context;\n` +
|
|
49
69
|
` mkdirSync(workDir, { recursive: true });\n` +
|
|
50
|
-
`
|
|
51
|
-
`
|
|
52
|
-
`
|
|
70
|
+
` // Copy fixtures into sandbox\n` +
|
|
71
|
+
` const fixtures = ${JSON.stringify(config.fixtures)};\n` +
|
|
72
|
+
` for (const fixture of fixtures) {\n` +
|
|
73
|
+
` const src = resolve(process.cwd(), fixture);\n` +
|
|
74
|
+
` if (existsSync(src)) {\n` +
|
|
75
|
+
` cpSync(src, workDir, { recursive: true });\n` +
|
|
76
|
+
` }\n` +
|
|
77
|
+
` }\n` +
|
|
78
|
+
` return context;\n` +
|
|
53
79
|
`}`);
|
|
54
80
|
}
|
|
55
81
|
export function buildAfterEachHook(taskId) {
|
|
82
|
+
// Promptfoo extension hooks receive (hookName, context).
|
|
83
|
+
// afterEach context is { test, result } — vars live at context.test.vars.
|
|
84
|
+
// @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
|
|
56
85
|
return (`// afterEach: collect artifacts + teardown for ${taskId}\n` +
|
|
57
|
-
`async function(
|
|
86
|
+
`async function(context) {\n` +
|
|
58
87
|
` const { rmSync, readdirSync, existsSync } = require('fs');\n` +
|
|
88
|
+
` const vars = context.test.vars || {};\n` +
|
|
59
89
|
` const workDir = vars.__workingDir;\n` +
|
|
60
90
|
` if (workDir && existsSync(workDir)) {\n` +
|
|
61
91
|
` try {\n` +
|