@sanity/ailf 2.1.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/config/rubrics.ts +3 -3
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
  5. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  6. package/dist/agent-harness/assertions-runtime.js +138 -0
  7. package/dist/agent-harness/provider.d.ts +58 -0
  8. package/dist/agent-harness/provider.js +104 -0
  9. package/dist/commands/calculate-scores.js +7 -2
  10. package/dist/commands/capture-list.d.ts +1 -1
  11. package/dist/commands/capture-list.js +6 -3
  12. package/dist/commands/compare.js +11 -7
  13. package/dist/commands/explain-handler.js +22 -24
  14. package/dist/commands/fetch-docs.js +4 -2
  15. package/dist/commands/generate-configs.js +6 -2
  16. package/dist/commands/init.js +3 -0
  17. package/dist/commands/pipeline-action.js +8 -24
  18. package/dist/commands/pipeline.js +1 -1
  19. package/dist/commands/pr-comment.js +6 -2
  20. package/dist/commands/publish.d.ts +1 -0
  21. package/dist/commands/publish.js +12 -8
  22. package/dist/commands/remote-pipeline.js +1 -1
  23. package/dist/commands/remote-results.d.ts +8 -8
  24. package/dist/commands/remote-results.js +7 -7
  25. package/dist/commands/shared/options.d.ts +8 -0
  26. package/dist/commands/shared/options.js +10 -0
  27. package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
  28. package/dist/commands/shared/resolve-output-dir.js +36 -0
  29. package/dist/composition-root.js +1 -1
  30. package/dist/config/rubrics.ts +3 -3
  31. package/dist/orchestration/build-app-context.js +1 -1
  32. package/dist/orchestration/steps/gap-analysis-step.js +86 -75
  33. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  34. package/dist/orchestration/steps/generate-configs-step.js +47 -2
  35. package/dist/pipeline/calculate-scores.js +113 -2
  36. package/dist/pipeline/compare.js +50 -19
  37. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +103 -25
  38. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  39. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +15 -0
  40. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +42 -85
  41. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  42. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
  43. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  44. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  45. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
  46. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
  47. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
  48. package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
  49. package/dist/pipeline/compiler/rubric-resolution.js +52 -0
  50. package/dist/pipeline/compiler/scoring-bridge.js +59 -7
  51. package/dist/pipeline/provenance.js +7 -1
  52. package/dist/pipeline/validate.d.ts +5 -4
  53. package/dist/pipeline/validate.js +34 -113
  54. package/package.json +2 -1
@@ -87,15 +87,23 @@ describe("validateAgentHarnessTask", () => {
87
87
  // compileAgentHarnessTask — provider assembly
88
88
  // ---------------------------------------------------------------------------
89
89
  describe("compileAgentHarnessTask — providers", () => {
90
- it("produces a provider", () => {
90
+ it("produces a Claude Agent SDK provider", () => {
91
91
  const result = compileAgentHarnessTask(makeTask());
92
92
  assert.ok(result.providers.length > 0);
93
- assert.ok(result.providers[0].id.startsWith("agent:"));
93
+ assert.equal(result.providers[0].id, "anthropic:claude-agent-sdk");
94
94
  });
95
- it("resolves coding tool preset", () => {
95
+ it("sets default agent config", () => {
96
+ const result = compileAgentHarnessTask(makeTask());
97
+ const config = result.providers[0].config;
98
+ assert.ok(config.model, "should set a model");
99
+ assert.ok(config.max_turns, "should set max_turns");
100
+ assert.ok(config.max_budget_usd, "should set budget cap");
101
+ assert.equal(config.permission_mode, "bypassPermissions");
102
+ });
103
+ it("resolves coding tool preset into custom_allowed_tools", () => {
96
104
  const result = compileAgentHarnessTask(makeTask({ tools: ["coding"] }));
97
105
  const config = result.providers[0].config;
98
- const tools = config.allowedTools;
106
+ const tools = config.custom_allowed_tools;
99
107
  assert.ok(tools.includes("Bash"));
100
108
  assert.ok(tools.includes("Read"));
101
109
  assert.ok(tools.includes("Write"));
@@ -104,7 +112,7 @@ describe("compileAgentHarnessTask — providers", () => {
104
112
  it("resolves read-only tool preset", () => {
105
113
  const result = compileAgentHarnessTask(makeTask({ tools: ["read-only"] }));
106
114
  const config = result.providers[0].config;
107
- const tools = config.allowedTools;
115
+ const tools = config.custom_allowed_tools;
108
116
  assert.ok(tools.includes("Read"));
109
117
  assert.ok(tools.includes("Grep"));
110
118
  assert.ok(!tools.includes("Write"), "read-only should not include Write");
@@ -112,19 +120,10 @@ describe("compileAgentHarnessTask — providers", () => {
112
120
  it("mixes preset and explicit tools", () => {
113
121
  const result = compileAgentHarnessTask(makeTask({ tools: ["read-only", "WebFetch"] }));
114
122
  const config = result.providers[0].config;
115
- const tools = config.allowedTools;
123
+ const tools = config.custom_allowed_tools;
116
124
  assert.ok(tools.includes("Read"));
117
125
  assert.ok(tools.includes("WebFetch"));
118
126
  });
119
- it("includes sandbox config in provider", () => {
120
- const result = compileAgentHarnessTask(makeTask({
121
- sandbox: { type: "docker", image: "node:22-slim" },
122
- }));
123
- const config = result.providers[0].config;
124
- const sandbox = config.sandbox;
125
- assert.equal(sandbox.type, "docker");
126
- assert.equal(sandbox.image, "node:22-slim");
127
- });
128
127
  });
129
128
  // ---------------------------------------------------------------------------
130
129
  // compileAgentHarnessTask — test cases
@@ -166,16 +165,20 @@ describe("compileAgentHarnessTask — test cases", () => {
166
165
  // compileAgentHarnessTask — assertions
167
166
  // ---------------------------------------------------------------------------
168
167
  describe("compileAgentHarnessTask — assertions", () => {
169
- it("maps file-exists to javascript assertion", () => {
168
+ const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
169
+ it("maps file-exists to file-based javascript assertion", () => {
170
170
  const result = compileAgentHarnessTask(makeTask({
171
171
  assertions: [{ type: "file-exists", value: "sanity.config.ts" }],
172
172
  }));
173
173
  const assertion = result.tests[0].assert?.[0];
174
174
  assert.ok(assertion);
175
175
  assert.equal(assertion.type, "javascript");
176
- assert.ok(assertion.value.includes("sanity.config.ts"));
176
+ assert.equal(assertion.value, `${RUNTIME}:fileExists`);
177
+ assert.deepEqual(assertion.config, {
178
+ filePath: "sanity.config.ts",
179
+ });
177
180
  });
178
- it("maps file-contains to javascript assertion", () => {
181
+ it("maps file-contains to file-based javascript assertion", () => {
179
182
  const result = compileAgentHarnessTask(makeTask({
180
183
  assertions: [
181
184
  {
@@ -187,25 +190,35 @@ describe("compileAgentHarnessTask — assertions", () => {
187
190
  const assertion = result.tests[0].assert?.[0];
188
191
  assert.ok(assertion);
189
192
  assert.equal(assertion.type, "javascript");
190
- assert.ok(assertion.value.includes("projectId"));
193
+ assert.equal(assertion.value, `${RUNTIME}:fileContains`);
194
+ assert.deepEqual(assertion.config, {
195
+ filePath: "config.ts",
196
+ content: "projectId",
197
+ });
191
198
  });
192
- it("maps command-succeeds to javascript assertion", () => {
199
+ it("maps command-succeeds to file-based javascript assertion", () => {
193
200
  const result = compileAgentHarnessTask(makeTask({
194
201
  assertions: [{ type: "command-succeeds", value: "npx tsc --noEmit" }],
195
202
  }));
196
203
  const assertion = result.tests[0].assert?.[0];
197
204
  assert.ok(assertion);
198
205
  assert.equal(assertion.type, "javascript");
199
- assert.ok(assertion.value.includes("tsc"));
206
+ assert.equal(assertion.value, `${RUNTIME}:commandSucceeds`);
207
+ assert.deepEqual(assertion.config, {
208
+ command: "npx tsc --noEmit",
209
+ });
200
210
  });
201
- it("maps diff-matches to javascript assertion", () => {
211
+ it("maps diff-matches to file-based javascript assertion", () => {
202
212
  const result = compileAgentHarnessTask(makeTask({
203
213
  assertions: [{ type: "diff-matches", value: "createClient" }],
204
214
  }));
205
215
  const assertion = result.tests[0].assert?.[0];
206
216
  assert.ok(assertion);
207
217
  assert.equal(assertion.type, "javascript");
208
- assert.ok(assertion.value.includes("git diff"));
218
+ assert.equal(assertion.value, `${RUNTIME}:diffMatches`);
219
+ assert.deepEqual(assertion.config, {
220
+ expected: "createClient",
221
+ });
209
222
  });
210
223
  it("passes through standard assertions", () => {
211
224
  const result = compileAgentHarnessTask(makeTask({
@@ -219,6 +232,70 @@ describe("compileAgentHarnessTask — assertions", () => {
219
232
  }), { graderProvider: "openai:chat:gpt-5" });
220
233
  assert.equal(result.tests[0].assert?.[0]?.provider, "openai:chat:gpt-5");
221
234
  });
235
+ it("resolves templated llm-rubric with rubric text and dimension metadata", () => {
236
+ const rubricConfig = {
237
+ templates: {
238
+ "agent-output": {
239
+ dimension: "agent-output",
240
+ header: "Score the agent's final output from 0 to 100:",
241
+ scale: ["0: Failed", "50: Partial", "100: Complete"],
242
+ criteria_label: "Check for:",
243
+ },
244
+ },
245
+ };
246
+ const result = compileAgentHarnessTask(makeTask({
247
+ assertions: [
248
+ {
249
+ type: "llm-rubric",
250
+ template: "agent-output",
251
+ criteria: ["File created", "Correct content"],
252
+ },
253
+ ],
254
+ }), { rubricConfig, graderProvider: "anthropic:messages:claude-opus-4-5" });
255
+ const assertion = result.tests[0].assert?.[0];
256
+ assert.ok(assertion, "should produce an assertion");
257
+ assert.equal(assertion.type, "llm-rubric");
258
+ // Rubric text should be fully rendered (not empty)
259
+ assert.ok(assertion.value.includes("Score the agent"), "should contain rendered rubric header");
260
+ assert.ok(assertion.value.includes("File created"), "should contain task-specific criteria");
261
+ // Dimension metadata should be attached
262
+ const metadata = assertion.metadata;
263
+ assert.ok(metadata, "should have metadata");
264
+ assert.equal(metadata.dimension, "agent-output");
265
+ assert.equal(metadata.maxScore, 100);
266
+ // Grader provider should be set
267
+ assert.equal(assertion.provider, "anthropic:messages:claude-opus-4-5");
268
+ });
269
+ it("warns when rubric template is unknown", () => {
270
+ const rubricConfig = { templates: {} };
271
+ const result = compileAgentHarnessTask(makeTask({
272
+ assertions: [
273
+ {
274
+ type: "llm-rubric",
275
+ template: "nonexistent-template",
276
+ criteria: ["Something"],
277
+ },
278
+ ],
279
+ }), { rubricConfig });
280
+ // Unknown template produces a warning and no assertion
281
+ assert.ok(result.warnings.some((w) => w.includes("nonexistent-template")), "should warn about unknown template");
282
+ // The assertion should be null (filtered out)
283
+ assert.equal(result.tests[0].assert?.length ?? 0, 0, "should not produce an assertion for unknown template");
284
+ });
285
+ it("warns when rubricConfig is not provided for templated assertion", () => {
286
+ const result = compileAgentHarnessTask(makeTask({
287
+ assertions: [
288
+ {
289
+ type: "llm-rubric",
290
+ template: "agent-output",
291
+ criteria: ["Something"],
292
+ },
293
+ ],
294
+ })
295
+ // No rubricConfig in options
296
+ );
297
+ assert.ok(result.warnings.some((w) => w.includes("No rubric config")), "should warn about missing rubric config");
298
+ });
222
299
  });
223
300
  // ---------------------------------------------------------------------------
224
301
  // compileAgentHarnessTask — lifecycle extensions
@@ -250,7 +327,7 @@ describe("compileAgentHarnessTask — lifecycle", () => {
250
327
  }));
251
328
  assert.equal(result.sandboxConfig.type, "docker");
252
329
  assert.equal(result.sandboxConfig.image, "node:22");
253
- assert.deepEqual(result.sandboxConfig.fixtures, ["file://schema.ts"]);
330
+ assert.deepEqual(result.sandboxConfig.fixtures, ["schema.ts"]);
254
331
  assert.equal(result.sandboxConfig.limits?.cpus, 2);
255
332
  assert.equal(result.sandboxConfig.limits?.networkAccess, false);
256
333
  });
@@ -278,7 +355,8 @@ describe("example agent harness tasks — end-to-end", () => {
278
355
  const result = compileAgentHarnessTask(modifyCodeTask);
279
356
  assert.ok(result.tests[0].assert);
280
357
  assert.ok(result.tests[0].assert.some((a) => a.type === "javascript" &&
281
- a.value.includes("useDocumentOperation")));
358
+ a.value.includes("fileContains") &&
359
+ a.config != null));
282
360
  });
283
361
  it("refactor task has docker sandbox config", () => {
284
362
  const result = compileAgentHarnessTask(multiFileRefactorTask);
@@ -65,12 +65,38 @@ export function writeCompiledModeConfig(result, mode, options) {
65
65
  if (options.graderProvider) {
66
66
  graderOpts.provider = options.graderProvider;
67
67
  }
68
- // Build provider entries
68
+ // For agent-harness mode, create sandbox directories and inject working_dir
69
+ // into provider configs. The sandbox must exist before the provider initializes
70
+ // (the Claude Agent SDK reads working_dir at construction time).
71
+ // Both working_dir and __workingDir use absolute paths to avoid ambiguity.
72
+ // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
73
+ const sandboxAbsPath = result.extras?.sandboxConfig
74
+ ? resolve(options.rootDir, `results/latest/sandbox-${mode}`)
75
+ : undefined;
76
+ if (sandboxAbsPath) {
77
+ mkdirSync(sandboxAbsPath, { recursive: true });
78
+ }
79
+ // Build provider entries, injecting working_dir for agent-harness providers
69
80
  const providerEntries = result.providers.map((p) => {
70
- if (p.config)
71
- return { id: p.id, label: p.label, config: p.config };
72
- return p.label ? { id: p.id, label: p.label } : p.id;
81
+ if (!p.config)
82
+ return p.label ? { id: p.id, label: p.label } : p.id;
83
+ const config = { ...p.config };
84
+ if (sandboxAbsPath && p.id === "anthropic:claude-agent-sdk") {
85
+ config.working_dir = sandboxAbsPath;
86
+ }
87
+ return { id: p.id, label: p.label, config };
73
88
  });
89
+ // Inject __workingDir into test vars so assertions can find the sandbox
90
+ if (sandboxAbsPath) {
91
+ for (const test of expandedTests) {
92
+ if (test.vars) {
93
+ ;
94
+ test.vars.__workingDir = sandboxAbsPath;
95
+ }
96
+ }
97
+ // Re-write the tests file with the injected paths
98
+ writeFileSync(testsPath, JSON.stringify(expandedTests, null, 2), "utf-8");
99
+ }
74
100
  // Build prompt entries
75
101
  const prompts = result.prompts.map((p) => ({
76
102
  id: p.id,
@@ -88,10 +114,11 @@ export function writeCompiledModeConfig(result, mode, options) {
88
114
  tests: [testsFilename],
89
115
  });
90
116
  // Include extensions if present (agent-harness mode)
117
+ // Promptfoo expects extensions as string[] (file paths to JS modules),
118
+ // so we materialize the { type, code } objects as a .cjs file on disk.
91
119
  if (result.extras?.extensions) {
92
- ;
93
- config.extensions =
94
- result.extras.extensions;
120
+ const extPaths = writeExtensionFile(options.rootDir, mode, result.extras.extensions);
121
+ config.extensions = extPaths;
95
122
  }
96
123
  writeConfig(options.rootDir, filename, config, options.logger);
97
124
  }
@@ -215,3 +242,47 @@ function writeYaml(path, data, header) {
215
242
  });
216
243
  writeFileSync(path, `${header}\n${yamlStr}`, "utf-8");
217
244
  }
245
+ /**
246
+ * Materialize Promptfoo lifecycle extensions as a .cjs file on disk.
247
+ *
248
+ * Promptfoo extensions use a single-function dispatch pattern:
249
+ * module.exports = async function(hookName, context) { ... }
250
+ *
251
+ * Each extension entry in the YAML references:
252
+ * file://path/to/file.cjs:exportedFunctionName
253
+ *
254
+ * @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
255
+ */
256
+ function writeExtensionFile(rootDir, mode, extensions) {
257
+ // Build a dispatch map: hookName → handler code
258
+ const hookMap = {};
259
+ for (const ext of extensions) {
260
+ hookMap[ext.type] = ext.code;
261
+ }
262
+ // Generate the single dispatch function that promptfoo expects
263
+ const hookCases = Object.entries(hookMap)
264
+ .map(([hookName, code]) => ` if (hookName === '${hookName}') {\n` +
265
+ ` const handler = ${code};\n` +
266
+ ` return handler(context);\n` +
267
+ ` }`)
268
+ .join("\n");
269
+ const fileContent = [
270
+ "// AUTO-GENERATED by compiler pipeline — do not edit directly.",
271
+ "// Run: npx @sanity/ailf generate-configs",
272
+ "//",
273
+ "// Promptfoo extension dispatch function.",
274
+ `// @see https://www.promptfoo.dev/docs/configuration/reference/`,
275
+ "",
276
+ "async function extensionHook(hookName, context) {",
277
+ hookCases,
278
+ "}",
279
+ "",
280
+ "module.exports = extensionHook;",
281
+ "",
282
+ ].join("\n");
283
+ const filename = `results/latest/${mode}-extensions.cjs`;
284
+ const outPath = resolve(rootDir, filename);
285
+ writeFileSync(outPath, fileContent, "utf-8");
286
+ // Single entry pointing to the dispatch function
287
+ return [`file://${filename}:extensionHook`];
288
+ }
@@ -4,6 +4,21 @@
4
4
  * Handles agent-specific assertion types (file-exists, file-contains,
5
5
  * command-succeeds, diff-matches) as well as standard pass-through
6
6
  * assertion types.
7
+ *
8
+ * Templated LLM-rubric assertions (those with `template` + `criteria`)
9
+ * are resolved via the shared rubric-resolution module, producing fully
10
+ * assembled rubric text and dimension metadata. This is critical for
11
+ * scoring — without it, the grader receives empty rubrics and the
12
+ * scoring pipeline has no dimension data to work with (DOC-2029).
13
+ *
14
+ * Agent-specific assertions use file-based references to the assertions
15
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
16
+ * promptfoo's inline `type: javascript` assertions run in a restricted
17
+ * eval() sandbox where require() is unavailable. File-based assertions
18
+ * run in a full Node.js context.
19
+ *
20
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
21
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
7
22
  */
8
23
  import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
24
  import type { AgentHarnessCompileOptions } from "./types.js";
@@ -4,7 +4,25 @@
4
4
  * Handles agent-specific assertion types (file-exists, file-contains,
5
5
  * command-succeeds, diff-matches) as well as standard pass-through
6
6
  * assertion types.
7
+ *
8
+ * Templated LLM-rubric assertions (those with `template` + `criteria`)
9
+ * are resolved via the shared rubric-resolution module, producing fully
10
+ * assembled rubric text and dimension metadata. This is critical for
11
+ * scoring — without it, the grader receives empty rubrics and the
12
+ * scoring pipeline has no dimension data to work with (DOC-2029).
13
+ *
14
+ * Agent-specific assertions use file-based references to the assertions
15
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
16
+ * promptfoo's inline `type: javascript` assertions run in a restricted
17
+ * eval() sandbox where require() is unavailable. File-based assertions
18
+ * run in a full Node.js context.
19
+ *
20
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
21
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
7
22
  */
23
+ import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
24
+ /** Base path for the file-based assertion runtime module */
25
+ const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
8
26
  // ---------------------------------------------------------------------------
9
27
  // Assertion mapping
10
28
  // ---------------------------------------------------------------------------
@@ -33,6 +51,13 @@ export function mapAgentAssertion(assertion, options, warnings) {
33
51
  : {}),
34
52
  };
35
53
  case "llm-rubric":
54
+ // Templated assertions (template + criteria) need full resolution
55
+ // to produce rubric text and dimension metadata for scoring.
56
+ if ("template" in assertion && "criteria" in assertion) {
57
+ const resolved = resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
58
+ return resolved;
59
+ }
60
+ // Non-templated llm-rubric (inline value) — pass through
36
61
  return {
37
62
  type: "llm-rubric",
38
63
  ...("value" in assertion ? { value: assertion.value } : {}),
@@ -53,66 +78,29 @@ export function mapAgentAssertion(assertion, options, warnings) {
53
78
  }
54
79
  // ---------------------------------------------------------------------------
55
80
  // Agent-specific assertion builders
81
+ //
82
+ // Each builder returns a file-based assertion referencing the runtime
83
+ // module with parameters passed via the `config` field.
56
84
  // ---------------------------------------------------------------------------
57
85
  export function buildFileExistsAssertion(assertion) {
58
- const filePath = String(assertion.value ?? "");
59
- // Use JSON.stringify for all interpolated values in generated JS to
60
- // prevent broken strings from filePaths containing quotes/backslashes
61
- const safeFilePath = JSON.stringify(filePath);
62
86
  return {
63
87
  type: "javascript",
64
- value: `// file-exists: ${filePath}\n` +
65
- `(function() {\n` +
66
- ` const fs = require('fs');\n` +
67
- ` const path = require('path');\n` +
68
- ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
69
- ` const target = path.resolve(workDir, ${safeFilePath});\n` +
70
- ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
71
- ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
72
- ` }\n` +
73
- ` const exists = fs.existsSync(target);\n` +
74
- ` return {\n` +
75
- ` pass: exists,\n` +
76
- ` score: exists ? 1 : 0,\n` +
77
- ` reason: exists\n` +
78
- ` ? 'File exists: ' + ${safeFilePath}\n` +
79
- ` : 'Expected file not found: ' + ${safeFilePath},\n` +
80
- ` };\n` +
81
- `})()`,
88
+ value: `${RUNTIME}:fileExists`,
89
+ config: { filePath: String(assertion.value ?? "") },
82
90
  ...(typeof assertion.weight === "number"
83
91
  ? { weight: assertion.weight }
84
92
  : {}),
85
93
  };
86
94
  }
87
95
  export function buildFileContainsAssertion(assertion) {
88
- const config = assertion.value;
89
- const filePath = config?.path ?? "";
90
- const expectedContent = config?.content ?? "";
91
- const safeFilePath = JSON.stringify(filePath);
96
+ const val = assertion.value;
92
97
  return {
93
98
  type: "javascript",
94
- value: `// file-contains: ${filePath}\n` +
95
- `(function() {\n` +
96
- ` const fs = require('fs');\n` +
97
- ` const path = require('path');\n` +
98
- ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
99
- ` const target = path.resolve(workDir, ${safeFilePath});\n` +
100
- ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
101
- ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
102
- ` }\n` +
103
- ` if (!fs.existsSync(target)) {\n` +
104
- ` return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
105
- ` }\n` +
106
- ` const content = fs.readFileSync(target, 'utf-8');\n` +
107
- ` const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
108
- ` return {\n` +
109
- ` pass: contains,\n` +
110
- ` score: contains ? 1 : 0,\n` +
111
- ` reason: contains\n` +
112
- ` ? 'File contains expected content'\n` +
113
- ` : 'File does not contain expected content',\n` +
114
- ` };\n` +
115
- `})()`,
99
+ value: `${RUNTIME}:fileContains`,
100
+ config: {
101
+ filePath: val?.path ?? "",
102
+ content: val?.content ?? "",
103
+ },
116
104
  ...(typeof assertion.weight === "number"
117
105
  ? { weight: assertion.weight }
118
106
  : {}),
@@ -133,53 +121,22 @@ export function buildFileContainsAssertion(assertion) {
133
121
  * from untrusted sources, validate commands against an allowlist first.
134
122
  */
135
123
  export function buildCommandSucceedsAssertion(assertion) {
136
- const command = String(assertion.value ?? "");
137
124
  return {
138
125
  type: "javascript",
139
- value: `// command-succeeds: ${command}\n` +
140
- `(function() {\n` +
141
- ` const { execSync } = require('child_process');\n` +
142
- ` const workDir = context.vars.__workingDir || '.';\n` +
143
- ` try {\n` +
144
- ` execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
145
- ` return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
146
- ` } catch (err) {\n` +
147
- ` return {\n` +
148
- ` pass: false,\n` +
149
- ` score: 0,\n` +
150
- ` reason: 'Command failed: ' + (err.message || err),\n` +
151
- ` };\n` +
152
- ` }\n` +
153
- `})()`,
126
+ value: `${RUNTIME}:commandSucceeds`,
127
+ config: { command: String(assertion.value ?? "") },
154
128
  ...(typeof assertion.weight === "number"
155
129
  ? { weight: assertion.weight }
156
130
  : {}),
157
131
  };
158
132
  }
159
133
  export function buildDiffMatchesAssertion(assertion) {
160
- const expected = assertion.value;
161
134
  return {
162
135
  type: "javascript",
163
- value: `// diff-matches\n` +
164
- `(function() {\n` +
165
- ` const { execSync } = require('child_process');\n` +
166
- ` const workDir = context.vars.__workingDir || '.';\n` +
167
- ` try {\n` +
168
- ` const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
169
- ` const expected = ${JSON.stringify(expected)};\n` +
170
- ` if (typeof expected === 'string') {\n` +
171
- ` const contains = diff.includes(expected);\n` +
172
- ` return {\n` +
173
- ` pass: contains,\n` +
174
- ` score: contains ? 1 : 0,\n` +
175
- ` reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
176
- ` };\n` +
177
- ` }\n` +
178
- ` return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
179
- ` } catch (err) {\n` +
180
- ` return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
181
- ` }\n` +
182
- `})()`,
136
+ value: `${RUNTIME}:diffMatches`,
137
+ config: {
138
+ ...(assertion.value != null ? { expected: assertion.value } : {}),
139
+ },
183
140
  ...(typeof assertion.weight === "number"
184
141
  ? { weight: assertion.weight }
185
142
  : {}),
@@ -27,8 +27,10 @@ export function compileAgentHarnessTask(task, options) {
27
27
  const prompts = buildAgentPrompts(task);
28
28
  // Build test cases
29
29
  const tests = buildAgentTestCases(task, options, warnings);
30
- // Build sandbox extensions
31
- const sandboxConfig = buildSandboxConfig(task);
30
+ // Build sandbox extensions — resolve fixture paths at compile time using
31
+ // the caller's cwd (monorepo root), not the eval package rootDir.
32
+ const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
33
+ const sandboxConfig = buildSandboxConfig(task, callerCwd);
32
34
  const extensions = buildLifecycleExtensions(task, sandboxConfig);
33
35
  return { providers, tests, prompts, extensions, sandboxConfig, warnings };
34
36
  }
@@ -36,22 +38,25 @@ export function compileAgentHarnessTask(task, options) {
36
38
  // Provider assembly
37
39
  // ---------------------------------------------------------------------------
38
40
  export function buildAgentProvider(task, _warnings) {
39
- // Resolve tool permissions
40
41
  const tools = resolveToolPermissions(task.tools);
41
- const config = {};
42
+ // Claude Agent SDK config.
43
+ // working_dir is set by the YAML writer to the sandbox path it creates.
44
+ // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
45
+ const config = {
46
+ model: "claude-sonnet-4-20250514",
47
+ max_turns: 25,
48
+ max_budget_usd: 1.0,
49
+ permission_mode: "bypassPermissions",
50
+ allow_dangerously_skip_permissions: true,
51
+ };
52
+ // Map AILF tool names to Claude Agent SDK tool config.
53
+ // Claude SDK uses custom_allowed_tools to replace defaults.
42
54
  if (tools.length > 0) {
43
- config.allowedTools = tools;
44
- }
45
- if (task.sandbox) {
46
- config.sandbox = {
47
- type: task.sandbox.type,
48
- ...(task.sandbox.image ? { image: task.sandbox.image } : {}),
49
- };
55
+ config.custom_allowed_tools = tools;
50
56
  }
51
- // Default to Claude Agent SDK provider
52
57
  return [
53
58
  {
54
- id: `agent:${task.id}`,
59
+ id: "anthropic:claude-agent-sdk",
55
60
  label: `Agent Harness: ${task.title}`,
56
61
  config,
57
62
  },
@@ -112,9 +117,11 @@ export function buildAgentTestCases(task, options, warnings) {
112
117
  const vars = {
113
118
  task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
114
119
  ...(task.prompt?.vars ?? {}),
115
- // Internal metadata for sandbox lifecycle hooks
120
+ // Internal metadata for sandbox lifecycle hooks.
121
+ // Fixture paths are plain strings (no file:// prefix) because
122
+ // promptfoo auto-resolves file:// in vars by reading file content.
116
123
  __sandboxType: task.sandbox?.type ?? "tempdir",
117
- __fixtures: task.fixtures ?? [],
124
+ __fixtures: (task.fixtures ?? []).map((f) => f.startsWith("file://") ? f.slice(7) : f),
118
125
  };
119
126
  const tests = [
120
127
  {
@@ -28,6 +28,7 @@ export const handler = {
28
28
  const result = compileAgentHarnessTask(task, {
29
29
  graderProvider: ctx.graderProvider,
30
30
  rootDir: ctx.rootDir,
31
+ rubricConfig: ctx.rubricConfig,
31
32
  });
32
33
  return {
33
34
  providers: result.providers,
@@ -6,7 +6,14 @@
6
6
  */
7
7
  import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
8
  import type { PromptfooExtension, SandboxConfigMeta } from "./types.js";
9
- export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition): SandboxConfigMeta;
9
+ /**
10
+ * Build sandbox configuration from a task definition.
11
+ *
12
+ * Fixture paths are resolved to absolute at compile time using callerCwd
13
+ * (the directory the pipeline was invoked from), because promptfoo runs
14
+ * with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
15
+ */
16
+ export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition, callerCwd?: string): SandboxConfigMeta;
10
17
  export declare function buildLifecycleExtensions(task: AgentHarnessTaskDefinition, sandboxConfig: SandboxConfigMeta): PromptfooExtension[];
11
18
  export declare function buildBeforeEachHook(taskId: string, config: SandboxConfigMeta): string;
12
19
  export declare function buildAfterEachHook(taskId: string): string;