npm - @sanity/ailf - Versions diffs - 2.1.0 → 2.3.0 - Mend

@sanity/ailf 2.1.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54)

package/config/rubrics.ts +3 -3
package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
package/dist/_vendor/ailf-core/examples/index.js +66 -1
package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
package/dist/agent-harness/assertions-runtime.d.ts +49 -0
package/dist/agent-harness/assertions-runtime.js +138 -0
package/dist/agent-harness/provider.d.ts +58 -0
package/dist/agent-harness/provider.js +104 -0
package/dist/commands/calculate-scores.js +7 -2
package/dist/commands/capture-list.d.ts +1 -1
package/dist/commands/capture-list.js +6 -3
package/dist/commands/compare.js +11 -7
package/dist/commands/explain-handler.js +22 -24
package/dist/commands/fetch-docs.js +4 -2
package/dist/commands/generate-configs.js +6 -2
package/dist/commands/init.js +3 -0
package/dist/commands/pipeline-action.js +8 -24
package/dist/commands/pipeline.js +1 -1
package/dist/commands/pr-comment.js +6 -2
package/dist/commands/publish.d.ts +1 -0
package/dist/commands/publish.js +12 -8
package/dist/commands/remote-pipeline.js +1 -1
package/dist/commands/remote-results.d.ts +8 -8
package/dist/commands/remote-results.js +7 -7
package/dist/commands/shared/options.d.ts +8 -0
package/dist/commands/shared/options.js +10 -0
package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
package/dist/commands/shared/resolve-output-dir.js +36 -0
package/dist/composition-root.js +1 -1
package/dist/config/rubrics.ts +3 -3
package/dist/orchestration/build-app-context.js +1 -1
package/dist/orchestration/steps/gap-analysis-step.js +86 -75
package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
package/dist/orchestration/steps/generate-configs-step.js +47 -2
package/dist/pipeline/calculate-scores.js +113 -2
package/dist/pipeline/compare.js +50 -19
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +103 -25
package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +15 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +42 -85
package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
package/dist/pipeline/compiler/rubric-resolution.js +52 -0
package/dist/pipeline/compiler/scoring-bridge.js +59 -7
package/dist/pipeline/provenance.js +7 -1
package/dist/pipeline/validate.d.ts +5 -4
package/dist/pipeline/validate.js +34 -113
package/package.json +2 -1

package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js CHANGED

@@ -87,15 +87,23 @@ describe("validateAgentHarnessTask", () => {
 // compileAgentHarnessTask — provider assembly
 // ---------------------------------------------------------------------------
 describe("compileAgentHarnessTask — providers", () => {
-    it("produces a provider", () => {
+    it("produces a Claude Agent SDK provider", () => {
         const result = compileAgentHarnessTask(makeTask());
         assert.ok(result.providers.length > 0);
-        assert.ok(result.providers[0].id.startsWith("agent:"));
+        assert.equal(result.providers[0].id, "anthropic:claude-agent-sdk");
     });
-    it("resolves coding tool preset", () => {
+    it("sets default agent config", () => {
+        const result = compileAgentHarnessTask(makeTask());
+        const config = result.providers[0].config;
+        assert.ok(config.model, "should set a model");
+        assert.ok(config.max_turns, "should set max_turns");
+        assert.ok(config.max_budget_usd, "should set budget cap");
+        assert.equal(config.permission_mode, "bypassPermissions");
+    });
+    it("resolves coding tool preset into custom_allowed_tools", () => {
         const result = compileAgentHarnessTask(makeTask({ tools: ["coding"] }));
         const config = result.providers[0].config;
-        const tools = config.allowedTools;
+        const tools = config.custom_allowed_tools;
         assert.ok(tools.includes("Bash"));
         assert.ok(tools.includes("Read"));
         assert.ok(tools.includes("Write"));
@@ -104,7 +112,7 @@ describe("compileAgentHarnessTask — providers", () => {
     it("resolves read-only tool preset", () => {
         const result = compileAgentHarnessTask(makeTask({ tools: ["read-only"] }));
         const config = result.providers[0].config;
-        const tools = config.allowedTools;
+        const tools = config.custom_allowed_tools;
         assert.ok(tools.includes("Read"));
         assert.ok(tools.includes("Grep"));
         assert.ok(!tools.includes("Write"), "read-only should not include Write");
@@ -112,19 +120,10 @@ describe("compileAgentHarnessTask — providers", () => {
     it("mixes preset and explicit tools", () => {
         const result = compileAgentHarnessTask(makeTask({ tools: ["read-only", "WebFetch"] }));
         const config = result.providers[0].config;
-        const tools = config.allowedTools;
+        const tools = config.custom_allowed_tools;
         assert.ok(tools.includes("Read"));
         assert.ok(tools.includes("WebFetch"));
     });
-    it("includes sandbox config in provider", () => {
-        const result = compileAgentHarnessTask(makeTask({
-            sandbox: { type: "docker", image: "node:22-slim" },
-        }));
-        const config = result.providers[0].config;
-        const sandbox = config.sandbox;
-        assert.equal(sandbox.type, "docker");
-        assert.equal(sandbox.image, "node:22-slim");
-    });
 });
 // ---------------------------------------------------------------------------
 // compileAgentHarnessTask — test cases
@@ -166,16 +165,20 @@ describe("compileAgentHarnessTask — test cases", () => {
 // compileAgentHarnessTask — assertions
 // ---------------------------------------------------------------------------
 describe("compileAgentHarnessTask — assertions", () => {
-    it("maps file-exists to javascript assertion", () => {
+    const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
+    it("maps file-exists to file-based javascript assertion", () => {
         const result = compileAgentHarnessTask(makeTask({
             assertions: [{ type: "file-exists", value: "sanity.config.ts" }],
         }));
         const assertion = result.tests[0].assert?.[0];
         assert.ok(assertion);
         assert.equal(assertion.type, "javascript");
-        assert.ok(assertion.value.includes("sanity.config.ts"));
+        assert.equal(assertion.value, `${RUNTIME}:fileExists`);
+        assert.deepEqual(assertion.config, {
+            filePath: "sanity.config.ts",
+        });
     });
-    it("maps file-contains to javascript assertion", () => {
+    it("maps file-contains to file-based javascript assertion", () => {
         const result = compileAgentHarnessTask(makeTask({
             assertions: [
                 {
@@ -187,25 +190,35 @@ describe("compileAgentHarnessTask — assertions", () => {
         const assertion = result.tests[0].assert?.[0];
         assert.ok(assertion);
         assert.equal(assertion.type, "javascript");
-        assert.ok(assertion.value.includes("projectId"));
+        assert.equal(assertion.value, `${RUNTIME}:fileContains`);
+        assert.deepEqual(assertion.config, {
+            filePath: "config.ts",
+            content: "projectId",
+        });
     });
-    it("maps command-succeeds to javascript assertion", () => {
+    it("maps command-succeeds to file-based javascript assertion", () => {
         const result = compileAgentHarnessTask(makeTask({
             assertions: [{ type: "command-succeeds", value: "npx tsc --noEmit" }],
         }));
         const assertion = result.tests[0].assert?.[0];
         assert.ok(assertion);
         assert.equal(assertion.type, "javascript");
-        assert.ok(assertion.value.includes("tsc"));
+        assert.equal(assertion.value, `${RUNTIME}:commandSucceeds`);
+        assert.deepEqual(assertion.config, {
+            command: "npx tsc --noEmit",
+        });
     });
-    it("maps diff-matches to javascript assertion", () => {
+    it("maps diff-matches to file-based javascript assertion", () => {
         const result = compileAgentHarnessTask(makeTask({
             assertions: [{ type: "diff-matches", value: "createClient" }],
         }));
         const assertion = result.tests[0].assert?.[0];
         assert.ok(assertion);
         assert.equal(assertion.type, "javascript");
-        assert.ok(assertion.value.includes("git diff"));
+        assert.equal(assertion.value, `${RUNTIME}:diffMatches`);
+        assert.deepEqual(assertion.config, {
+            expected: "createClient",
+        });
     });
     it("passes through standard assertions", () => {
         const result = compileAgentHarnessTask(makeTask({
@@ -219,6 +232,70 @@ describe("compileAgentHarnessTask — assertions", () => {
         }), { graderProvider: "openai:chat:gpt-5" });
         assert.equal(result.tests[0].assert?.[0]?.provider, "openai:chat:gpt-5");
     });
+    it("resolves templated llm-rubric with rubric text and dimension metadata", () => {
+        const rubricConfig = {
+            templates: {
+                "agent-output": {
+                    dimension: "agent-output",
+                    header: "Score the agent's final output from 0 to 100:",
+                    scale: ["0: Failed", "50: Partial", "100: Complete"],
+                    criteria_label: "Check for:",
+                },
+            },
+        };
+        const result = compileAgentHarnessTask(makeTask({
+            assertions: [
+                {
+                    type: "llm-rubric",
+                    template: "agent-output",
+                    criteria: ["File created", "Correct content"],
+                },
+            ],
+        }), { rubricConfig, graderProvider: "anthropic:messages:claude-opus-4-5" });
+        const assertion = result.tests[0].assert?.[0];
+        assert.ok(assertion, "should produce an assertion");
+        assert.equal(assertion.type, "llm-rubric");
+        // Rubric text should be fully rendered (not empty)
+        assert.ok(assertion.value.includes("Score the agent"), "should contain rendered rubric header");
+        assert.ok(assertion.value.includes("File created"), "should contain task-specific criteria");
+        // Dimension metadata should be attached
+        const metadata = assertion.metadata;
+        assert.ok(metadata, "should have metadata");
+        assert.equal(metadata.dimension, "agent-output");
+        assert.equal(metadata.maxScore, 100);
+        // Grader provider should be set
+        assert.equal(assertion.provider, "anthropic:messages:claude-opus-4-5");
+    });
+    it("warns when rubric template is unknown", () => {
+        const rubricConfig = { templates: {} };
+        const result = compileAgentHarnessTask(makeTask({
+            assertions: [
+                {
+                    type: "llm-rubric",
+                    template: "nonexistent-template",
+                    criteria: ["Something"],
+                },
+            ],
+        }), { rubricConfig });
+        // Unknown template produces a warning and no assertion
+        assert.ok(result.warnings.some((w) => w.includes("nonexistent-template")), "should warn about unknown template");
+        // The assertion should be null (filtered out)
+        assert.equal(result.tests[0].assert?.length ?? 0, 0, "should not produce an assertion for unknown template");
+    });
+    it("warns when rubricConfig is not provided for templated assertion", () => {
+        const result = compileAgentHarnessTask(makeTask({
+            assertions: [
+                {
+                    type: "llm-rubric",
+                    template: "agent-output",
+                    criteria: ["Something"],
+                },
+            ],
+        })
+        // No rubricConfig in options
+        );
+        assert.ok(result.warnings.some((w) => w.includes("No rubric config")), "should warn about missing rubric config");
+    });
 });
 // ---------------------------------------------------------------------------
 // compileAgentHarnessTask — lifecycle extensions
@@ -250,7 +327,7 @@ describe("compileAgentHarnessTask — lifecycle", () => {
         }));
         assert.equal(result.sandboxConfig.type, "docker");
         assert.equal(result.sandboxConfig.image, "node:22");
-        assert.deepEqual(result.sandboxConfig.fixtures, ["file://schema.ts"]);
+        assert.deepEqual(result.sandboxConfig.fixtures, ["schema.ts"]);
         assert.equal(result.sandboxConfig.limits?.cpus, 2);
         assert.equal(result.sandboxConfig.limits?.networkAccess, false);
     });
@@ -278,7 +355,8 @@ describe("example agent harness tasks — end-to-end", () => {
         const result = compileAgentHarnessTask(modifyCodeTask);
         assert.ok(result.tests[0].assert);
         assert.ok(result.tests[0].assert.some((a) => a.type === "javascript" &&
-            a.value.includes("useDocumentOperation")));
+            a.value.includes("fileContains") &&
+            a.config != null));
     });
     it("refactor task has docker sandbox config", () => {
         const result = compileAgentHarnessTask(multiFileRefactorTask);

package/dist/pipeline/compiler/compiler-to-yaml.js CHANGED

@@ -65,12 +65,38 @@ export function writeCompiledModeConfig(result, mode, options) {
     if (options.graderProvider) {
         graderOpts.provider = options.graderProvider;
     }
-    // Build provider entries
+    // For agent-harness mode, create sandbox directories and inject working_dir
+    // into provider configs. The sandbox must exist before the provider initializes
+    // (the Claude Agent SDK reads working_dir at construction time).
+    // Both working_dir and __workingDir use absolute paths to avoid ambiguity.
+    // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
+    const sandboxAbsPath = result.extras?.sandboxConfig
+        ? resolve(options.rootDir, `results/latest/sandbox-${mode}`)
+        : undefined;
+    if (sandboxAbsPath) {
+        mkdirSync(sandboxAbsPath, { recursive: true });
+    }
+    // Build provider entries, injecting working_dir for agent-harness providers
     const providerEntries = result.providers.map((p) => {
-        if (p.config)
-            return { id: p.id, label: p.label, config: p.config };
-        return p.label ? { id: p.id, label: p.label } : p.id;
+        if (!p.config)
+            return p.label ? { id: p.id, label: p.label } : p.id;
+        const config = { ...p.config };
+        if (sandboxAbsPath && p.id === "anthropic:claude-agent-sdk") {
+            config.working_dir = sandboxAbsPath;
+        }
+        return { id: p.id, label: p.label, config };
     });
+    // Inject __workingDir into test vars so assertions can find the sandbox
+    if (sandboxAbsPath) {
+        for (const test of expandedTests) {
+            if (test.vars) {
+                ;
+                test.vars.__workingDir = sandboxAbsPath;
+            }
+        }
+        // Re-write the tests file with the injected paths
+        writeFileSync(testsPath, JSON.stringify(expandedTests, null, 2), "utf-8");
+    }
     // Build prompt entries
     const prompts = result.prompts.map((p) => ({
         id: p.id,
@@ -88,10 +114,11 @@ export function writeCompiledModeConfig(result, mode, options) {
         tests: [testsFilename],
     });
     // Include extensions if present (agent-harness mode)
+    // Promptfoo expects extensions as string[] (file paths to JS modules),
+    // so we materialize the { type, code } objects as a .cjs file on disk.
     if (result.extras?.extensions) {
-        ;
-        config.extensions =
-            result.extras.extensions;
+        const extPaths = writeExtensionFile(options.rootDir, mode, result.extras.extensions);
+        config.extensions = extPaths;
     }
     writeConfig(options.rootDir, filename, config, options.logger);
 }
@@ -215,3 +242,47 @@ function writeYaml(path, data, header) {
     });
     writeFileSync(path, `${header}\n${yamlStr}`, "utf-8");
 }
+/**
+ * Materialize Promptfoo lifecycle extensions as a .cjs file on disk.
+ *
+ * Promptfoo extensions use a single-function dispatch pattern:
+ *   module.exports = async function(hookName, context) { ... }
+ *
+ * Each extension entry in the YAML references:
+ *   file://path/to/file.cjs:exportedFunctionName
+ *
+ * @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
+ */
+function writeExtensionFile(rootDir, mode, extensions) {
+    // Build a dispatch map: hookName → handler code
+    const hookMap = {};
+    for (const ext of extensions) {
+        hookMap[ext.type] = ext.code;
+    }
+    // Generate the single dispatch function that promptfoo expects
+    const hookCases = Object.entries(hookMap)
+        .map(([hookName, code]) => `  if (hookName === '${hookName}') {\n` +
+        `    const handler = ${code};\n` +
+        `    return handler(context);\n` +
+        `  }`)
+        .join("\n");
+    const fileContent = [
+        "// AUTO-GENERATED by compiler pipeline — do not edit directly.",
+        "// Run: npx @sanity/ailf generate-configs",
+        "//",
+        "// Promptfoo extension dispatch function.",
+        `// @see https://www.promptfoo.dev/docs/configuration/reference/`,
+        "",
+        "async function extensionHook(hookName, context) {",
+        hookCases,
+        "}",
+        "",
+        "module.exports = extensionHook;",
+        "",
+    ].join("\n");
+    const filename = `results/latest/${mode}-extensions.cjs`;
+    const outPath = resolve(rootDir, filename);
+    writeFileSync(outPath, fileContent, "utf-8");
+    // Single entry pointing to the dispatch function
+    return [`file://${filename}:extensionHook`];
+}

package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts CHANGED

@@ -4,6 +4,21 @@
  * Handles agent-specific assertion types (file-exists, file-contains,
  * command-succeeds, diff-matches) as well as standard pass-through
  * assertion types.
+ *
+ * Templated LLM-rubric assertions (those with `template` + `criteria`)
+ * are resolved via the shared rubric-resolution module, producing fully
+ * assembled rubric text and dimension metadata. This is critical for
+ * scoring — without it, the grader receives empty rubrics and the
+ * scoring pipeline has no dimension data to work with (DOC-2029).
+ *
+ * Agent-specific assertions use file-based references to the assertions
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
+ * promptfoo's inline `type: javascript` assertions run in a restricted
+ * eval() sandbox where require() is unavailable. File-based assertions
+ * run in a full Node.js context.
+ *
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
  */
 import type { PromptfooAssertion } from "../../assertion-mapper.js";
 import type { AgentHarnessCompileOptions } from "./types.js";

package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js CHANGED

@@ -4,7 +4,25 @@
  * Handles agent-specific assertion types (file-exists, file-contains,
  * command-succeeds, diff-matches) as well as standard pass-through
  * assertion types.
+ *
+ * Templated LLM-rubric assertions (those with `template` + `criteria`)
+ * are resolved via the shared rubric-resolution module, producing fully
+ * assembled rubric text and dimension metadata. This is critical for
+ * scoring — without it, the grader receives empty rubrics and the
+ * scoring pipeline has no dimension data to work with (DOC-2029).
+ *
+ * Agent-specific assertions use file-based references to the assertions
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
+ * promptfoo's inline `type: javascript` assertions run in a restricted
+ * eval() sandbox where require() is unavailable. File-based assertions
+ * run in a full Node.js context.
+ *
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
  */
+import { resolveTemplatedAssertion } from "../../rubric-resolution.js";
+/** Base path for the file-based assertion runtime module */
+const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
 // ---------------------------------------------------------------------------
 // Assertion mapping
 // ---------------------------------------------------------------------------
@@ -33,6 +51,13 @@ export function mapAgentAssertion(assertion, options, warnings) {
                     : {}),
             };
         case "llm-rubric":
+            // Templated assertions (template + criteria) need full resolution
+            // to produce rubric text and dimension metadata for scoring.
+            if ("template" in assertion && "criteria" in assertion) {
+                const resolved = resolveTemplatedAssertion(assertion, options?.rubricConfig, options?.graderProvider, warnings);
+                return resolved;
+            }
+            // Non-templated llm-rubric (inline value) — pass through
             return {
                 type: "llm-rubric",
                 ...("value" in assertion ? { value: assertion.value } : {}),
@@ -53,66 +78,29 @@ export function mapAgentAssertion(assertion, options, warnings) {
 }
 // ---------------------------------------------------------------------------
 // Agent-specific assertion builders
+//
+// Each builder returns a file-based assertion referencing the runtime
+// module with parameters passed via the `config` field.
 // ---------------------------------------------------------------------------
 export function buildFileExistsAssertion(assertion) {
-    const filePath = String(assertion.value ?? "");
-    // Use JSON.stringify for all interpolated values in generated JS to
-    // prevent broken strings from filePaths containing quotes/backslashes
-    const safeFilePath = JSON.stringify(filePath);
     return {
         type: "javascript",
-        value: `// file-exists: ${filePath}\n` +
-            `(function() {\n` +
-            `  const fs = require('fs');\n` +
-            `  const path = require('path');\n` +
-            `  const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
-            `  const target = path.resolve(workDir, ${safeFilePath});\n` +
-            `  if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
-            `    return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
-            `  }\n` +
-            `  const exists = fs.existsSync(target);\n` +
-            `  return {\n` +
-            `    pass: exists,\n` +
-            `    score: exists ? 1 : 0,\n` +
-            `    reason: exists\n` +
-            `      ? 'File exists: ' + ${safeFilePath}\n` +
-            `      : 'Expected file not found: ' + ${safeFilePath},\n` +
-            `  };\n` +
-            `})()`,
+        value: `${RUNTIME}:fileExists`,
+        config: { filePath: String(assertion.value ?? "") },
         ...(typeof assertion.weight === "number"
             ? { weight: assertion.weight }
             : {}),
     };
 }
 export function buildFileContainsAssertion(assertion) {
-    const config = assertion.value;
-    const filePath = config?.path ?? "";
-    const expectedContent = config?.content ?? "";
-    const safeFilePath = JSON.stringify(filePath);
+    const val = assertion.value;
     return {
         type: "javascript",
-        value: `// file-contains: ${filePath}\n` +
-            `(function() {\n` +
-            `  const fs = require('fs');\n` +
-            `  const path = require('path');\n` +
-            `  const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
-            `  const target = path.resolve(workDir, ${safeFilePath});\n` +
-            `  if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
-            `    return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
-            `  }\n` +
-            `  if (!fs.existsSync(target)) {\n` +
-            `    return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
-            `  }\n` +
-            `  const content = fs.readFileSync(target, 'utf-8');\n` +
-            `  const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
-            `  return {\n` +
-            `    pass: contains,\n` +
-            `    score: contains ? 1 : 0,\n` +
-            `    reason: contains\n` +
-            `      ? 'File contains expected content'\n` +
-            `      : 'File does not contain expected content',\n` +
-            `  };\n` +
-            `})()`,
+        value: `${RUNTIME}:fileContains`,
+        config: {
+            filePath: val?.path ?? "",
+            content: val?.content ?? "",
+        },
         ...(typeof assertion.weight === "number"
             ? { weight: assertion.weight }
             : {}),
@@ -133,53 +121,22 @@ export function buildFileContainsAssertion(assertion) {
  * from untrusted sources, validate commands against an allowlist first.
  */
 export function buildCommandSucceedsAssertion(assertion) {
-    const command = String(assertion.value ?? "");
     return {
         type: "javascript",
-        value: `// command-succeeds: ${command}\n` +
-            `(function() {\n` +
-            `  const { execSync } = require('child_process');\n` +
-            `  const workDir = context.vars.__workingDir || '.';\n` +
-            `  try {\n` +
-            `    execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
-            `    return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
-            `  } catch (err) {\n` +
-            `    return {\n` +
-            `      pass: false,\n` +
-            `      score: 0,\n` +
-            `      reason: 'Command failed: ' + (err.message || err),\n` +
-            `    };\n` +
-            `  }\n` +
-            `})()`,
+        value: `${RUNTIME}:commandSucceeds`,
+        config: { command: String(assertion.value ?? "") },
         ...(typeof assertion.weight === "number"
             ? { weight: assertion.weight }
             : {}),
     };
 }
 export function buildDiffMatchesAssertion(assertion) {
-    const expected = assertion.value;
     return {
         type: "javascript",
-        value: `// diff-matches\n` +
-            `(function() {\n` +
-            `  const { execSync } = require('child_process');\n` +
-            `  const workDir = context.vars.__workingDir || '.';\n` +
-            `  try {\n` +
-            `    const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
-            `    const expected = ${JSON.stringify(expected)};\n` +
-            `    if (typeof expected === 'string') {\n` +
-            `      const contains = diff.includes(expected);\n` +
-            `      return {\n` +
-            `        pass: contains,\n` +
-            `        score: contains ? 1 : 0,\n` +
-            `        reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
-            `      };\n` +
-            `    }\n` +
-            `    return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
-            `  } catch (err) {\n` +
-            `    return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
-            `  }\n` +
-            `})()`,
+        value: `${RUNTIME}:diffMatches`,
+        config: {
+            ...(assertion.value != null ? { expected: assertion.value } : {}),
+        },
         ...(typeof assertion.weight === "number"
             ? { weight: assertion.weight }
             : {}),

package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js CHANGED

@@ -27,8 +27,10 @@ export function compileAgentHarnessTask(task, options) {
     const prompts = buildAgentPrompts(task);
     // Build test cases
     const tests = buildAgentTestCases(task, options, warnings);
-    // Build sandbox extensions
-    const sandboxConfig = buildSandboxConfig(task);
+    // Build sandbox extensions — resolve fixture paths at compile time using
+    // the caller's cwd (monorepo root), not the eval package rootDir.
+    const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
+    const sandboxConfig = buildSandboxConfig(task, callerCwd);
     const extensions = buildLifecycleExtensions(task, sandboxConfig);
     return { providers, tests, prompts, extensions, sandboxConfig, warnings };
 }
@@ -36,22 +38,25 @@ export function compileAgentHarnessTask(task, options) {
 // Provider assembly
 // ---------------------------------------------------------------------------
 export function buildAgentProvider(task, _warnings) {
-    // Resolve tool permissions
     const tools = resolveToolPermissions(task.tools);
-    const config = {};
+    // Claude Agent SDK config.
+    // working_dir is set by the YAML writer to the sandbox path it creates.
+    // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
+    const config = {
+        model: "claude-sonnet-4-20250514",
+        max_turns: 25,
+        max_budget_usd: 1.0,
+        permission_mode: "bypassPermissions",
+        allow_dangerously_skip_permissions: true,
+    };
+    // Map AILF tool names to Claude Agent SDK tool config.
+    // Claude SDK uses custom_allowed_tools to replace defaults.
     if (tools.length > 0) {
-        config.allowedTools = tools;
-    }
-    if (task.sandbox) {
-        config.sandbox = {
-            type: task.sandbox.type,
-            ...(task.sandbox.image ? { image: task.sandbox.image } : {}),
-        };
+        config.custom_allowed_tools = tools;
     }
-    // Default to Claude Agent SDK provider
     return [
         {
-            id: `agent:${task.id}`,
+            id: "anthropic:claude-agent-sdk",
             label: `Agent Harness: ${task.title}`,
             config,
         },
@@ -112,9 +117,11 @@ export function buildAgentTestCases(task, options, warnings) {
     const vars = {
         task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
         ...(task.prompt?.vars ?? {}),
-        // Internal metadata for sandbox lifecycle hooks
+        // Internal metadata for sandbox lifecycle hooks.
+        // Fixture paths are plain strings (no file:// prefix) because
+        // promptfoo auto-resolves file:// in vars by reading file content.
         __sandboxType: task.sandbox?.type ?? "tempdir",
-        __fixtures: task.fixtures ?? [],
+        __fixtures: (task.fixtures ?? []).map((f) => f.startsWith("file://") ? f.slice(7) : f),
     };
     const tests = [
         {

package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js CHANGED

@@ -28,6 +28,7 @@ export const handler = {
         const result = compileAgentHarnessTask(task, {
             graderProvider: ctx.graderProvider,
             rootDir: ctx.rootDir,
+            rubricConfig: ctx.rubricConfig,
         });
         return {
             providers: result.providers,

package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts CHANGED

@@ -6,7 +6,14 @@
  */
 import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
 import type { PromptfooExtension, SandboxConfigMeta } from "./types.js";
-export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition): SandboxConfigMeta;
+/**
+ * Build sandbox configuration from a task definition.
+ *
+ * Fixture paths are resolved to absolute at compile time using callerCwd
+ * (the directory the pipeline was invoked from), because promptfoo runs
+ * with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
+ */
+export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition, callerCwd?: string): SandboxConfigMeta;
 export declare function buildLifecycleExtensions(task: AgentHarnessTaskDefinition, sandboxConfig: SandboxConfigMeta): PromptfooExtension[];
 export declare function buildBeforeEachHook(taskId: string, config: SandboxConfigMeta): string;
 export declare function buildAfterEachHook(taskId: string): string;