@sanity/ailf 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,6 +112,55 @@ export declare const ailfConfigData: {
112
112
  };
113
113
  /** Raw YAML string for ailf-config example (preserves comments) */
114
114
  export declare const ailfConfigYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# .ailf/config.yaml \u2014 AI Literacy Framework project configuration\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n# The API handles LLM calls, doc fetching, grading, and report\n# publishing. Your repo only needs one secret: AILF_API_KEY.\n#\n# Docs: https://github.com/sanity-labs/ai-literacy-framework\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Documentation source \u2014 which docs are being evaluated.\n#\n# This tells the pipeline which Sanity project and dataset contain\n# the documentation under test. 
For most users, this is Sanity's own\n# docs project.\n#\n# projectId \u2014 Sanity project ID (find yours at sanity.io/manage)\n# dataset \u2014 the dataset to query (e.g., \"production\", \"next\")\n# baseUrl \u2014 the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"3do82whm\"\n dataset: next\n baseUrl: \"https://www.sanity.io/docs\"\n\n# Trigger configuration \u2014 when evaluations run automatically.\n#\n# Each key is a trigger context. The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only \u2014 check that task YAML parses correctly (fast, no LLM calls)\n# eval \u2014 run the full evaluation pipeline\n#\n# paths \u2014 only trigger when files matching these globs change\n# blocking \u2014 if true, a failing eval blocks the PR merge\n# notify \u2014 if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
115
+ /** Parsed task data for example-agent-add-schema (JSON-safe) */
116
+ export declare const exampleAgentAddSchemaData: readonly [{
117
+ readonly mode: "agent-harness";
118
+ readonly id: "example-agent-add-schema";
119
+ readonly title: "Add a document schema to a Sanity Studio project";
120
+ readonly description: "Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project";
121
+ readonly area: "studio";
122
+ readonly sandbox: {
123
+ readonly type: "tempdir";
124
+ };
125
+ readonly tools: readonly ["coding"];
126
+ readonly fixtures: readonly ["file://apps/studio-basic"];
127
+ readonly prompt: {
128
+ readonly text: "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.";
129
+ };
130
+ readonly assertions: readonly [{
131
+ readonly type: "file-exists";
132
+ readonly value: "schemas/post.ts";
133
+ }, {
134
+ readonly type: "file-contains";
135
+ readonly value: {
136
+ readonly path: "schemas/post.ts";
137
+ readonly content: "defineType";
138
+ };
139
+ }, {
140
+ readonly type: "file-contains";
141
+ readonly value: {
142
+ readonly path: "schemas/post.ts";
143
+ readonly content: "defineField";
144
+ };
145
+ }, {
146
+ readonly type: "file-contains";
147
+ readonly value: {
148
+ readonly path: "schemas/index.ts";
149
+ readonly content: "post";
150
+ };
151
+ }, {
152
+ readonly type: "file-contains";
153
+ readonly value: {
154
+ readonly path: "sanity.config.ts";
155
+ readonly content: "defineConfig";
156
+ };
157
+ }];
158
+ readonly status: "draft";
159
+ }];
160
+ /** TypeScript task template for example-agent-add-schema */
161
+ export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. 
The agent can read, write,\n // edit files and run shell commands \u2014 then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example \u2014 tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" \u2192 Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" \u2192 coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" \u2192 Read, 
Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/studio-basic\" copies the\n // entire studio-basic app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/studio-basic\"],\n\n // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The task instructions sent to the agent. Be specific about expected\n // file paths and patterns \u2014 the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists \u2014 check a file was created\n // file-contains \u2014 check a file contains a substring\n // command-succeeds \u2014 run a shell command (exit 0 = pass)\n // diff-matches \u2014 check git diff contains a pattern\n // llm-rubric \u2014 LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
162
+ /** Generated YAML for example-agent-add-schema (from parsed TS data) */
163
+ export declare const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example \u2014 tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/studio-basic\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
115
164
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
116
165
  export declare const exampleGroqBlogListingData: readonly [{
117
166
  readonly mode: "literacy";
@@ -367,7 +416,7 @@ export declare const taskTsFiles: Record<string, string>;
367
416
  /** Map of task ID (filename stem) → generated YAML string */
368
417
  export declare const taskYamlFiles: Record<string, string>;
369
418
  /** List of task file stems, in alphabetical order */
370
- export declare const TASK_FILE_NAMES: readonly ["example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
419
+ export declare const TASK_FILE_NAMES: readonly ["example-agent-add-schema", "example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
371
420
  /** Task metadata for mode-based filtering in init and other consumers */
372
421
  export interface TaskExampleMeta {
373
422
  stem: string;
@@ -143,6 +143,67 @@ export const ailfConfigData = {
143
143
  };
144
144
  /** Raw YAML string for ailf-config example (preserves comments) */
145
145
  export const ailfConfigYaml = "# ──────────────────────────────────────────────────────────────────────\n# .ailf/config.yaml — AI Literacy Framework project configuration\n# ──────────────────────────────────────────────────────────────────────\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n# The API handles LLM calls, doc fetching, grading, and report\n# publishing. Your repo only needs one secret: AILF_API_KEY.\n#\n# Docs: https://github.com/sanity-labs/ai-literacy-framework\n# ──────────────────────────────────────────────────────────────────────\n\n# Documentation source — which docs are being evaluated.\n#\n# This tells the pipeline which Sanity project and dataset contain\n# the documentation under test. For most users, this is Sanity's own\n# docs project.\n#\n# projectId — Sanity project ID (find yours at sanity.io/manage)\n# dataset — the dataset to query (e.g., \"production\", \"next\")\n# baseUrl — the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"3do82whm\"\n dataset: next\n baseUrl: \"https://www.sanity.io/docs\"\n\n# Trigger configuration — when evaluations run automatically.\n#\n# Each key is a trigger context. 
The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only — check that task YAML parses correctly (fast, no LLM calls)\n# eval — run the full evaluation pipeline\n#\n# paths — only trigger when files matching these globs change\n# blocking — if true, a failing eval blocks the PR merge\n# notify — if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
146
+ /** Parsed task data for example-agent-add-schema (JSON-safe) */
147
+ export const exampleAgentAddSchemaData = [
148
+ {
149
+ "mode": "agent-harness",
150
+ "id": "example-agent-add-schema",
151
+ "title": "Add a document schema to a Sanity Studio project",
152
+ "description": "Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project",
153
+ "area": "studio",
154
+ "sandbox": {
155
+ "type": "tempdir"
156
+ },
157
+ "tools": [
158
+ "coding"
159
+ ],
160
+ "fixtures": [
161
+ "file://apps/studio-basic"
162
+ ],
163
+ "prompt": {
164
+ "text": "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema."
165
+ },
166
+ "assertions": [
167
+ {
168
+ "type": "file-exists",
169
+ "value": "schemas/post.ts"
170
+ },
171
+ {
172
+ "type": "file-contains",
173
+ "value": {
174
+ "path": "schemas/post.ts",
175
+ "content": "defineType"
176
+ }
177
+ },
178
+ {
179
+ "type": "file-contains",
180
+ "value": {
181
+ "path": "schemas/post.ts",
182
+ "content": "defineField"
183
+ }
184
+ },
185
+ {
186
+ "type": "file-contains",
187
+ "value": {
188
+ "path": "schemas/index.ts",
189
+ "content": "post"
190
+ }
191
+ },
192
+ {
193
+ "type": "file-contains",
194
+ "value": {
195
+ "path": "sanity.config.ts",
196
+ "content": "defineConfig"
197
+ }
198
+ }
199
+ ],
200
+ "status": "draft"
201
+ }
202
+ ];
203
+ /** TypeScript task template for example-agent-add-schema */
204
+ export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // ── Mode ──────────────────────────────────────────────────────\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. The agent can read, write,\n // edit files and run shell commands — then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example — tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. 
All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // ── Sandbox ───────────────────────────────────────────────────\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // ── Tools ─────────────────────────────────────────────────────\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" → Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" → Read, Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // ── Fixtures ──────────────────────────────────────────────────\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/studio-basic\" copies the\n // entire studio-basic app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/studio-basic\"],\n\n // ── Prompt ────────────────────────────────────────────────────\n // The task instructions sent to the agent. 
Be specific about expected\n // file paths and patterns — the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // ── Assertions ────────────────────────────────────────────────\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists — check a file was created\n // file-contains — check a file contains a substring\n // command-succeeds — run a shell command (exit 0 = pass)\n // diff-matches — check git diff contains a pattern\n // llm-rubric — LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
205
+ /** Generated YAML for example-agent-add-schema (from parsed TS data) */
206
+ export const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/studio-basic\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
146
207
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
147
208
  export const exampleGroqBlogListingData = [
148
209
  {
@@ -489,6 +550,7 @@ export const exampleStudioCustomInputYaml = "- mode: literacy\n id: example-stu
489
550
  // ---------------------------------------------------------------------------
490
551
  /** All task example data as a flat array (JSON-safe) */
491
552
  export const allTaskData = [
553
+ ...exampleAgentAddSchemaData,
492
554
  ...exampleGroqBlogListingData,
493
555
  ...exampleIdBasedRefData,
494
556
  ...exampleKnowledgeProbeData,
@@ -499,6 +561,7 @@ export const allTaskData = [
499
561
  ];
500
562
  /** Map of task ID (filename stem) → raw TypeScript source */
501
563
  export const taskTsFiles = {
564
+ "example-agent-add-schema": exampleAgentAddSchemaTs,
502
565
  "example-groq-blog-listing": exampleGroqBlogListingTs,
503
566
  "example-id-based-ref": exampleIdBasedRefTs,
504
567
  "example-knowledge-probe": exampleKnowledgeProbeTs,
@@ -509,6 +572,7 @@ export const taskTsFiles = {
509
572
  };
510
573
  /** Map of task ID (filename stem) → generated YAML string */
511
574
  export const taskYamlFiles = {
575
+ "example-agent-add-schema": exampleAgentAddSchemaYaml,
512
576
  "example-groq-blog-listing": exampleGroqBlogListingYaml,
513
577
  "example-id-based-ref": exampleIdBasedRefYaml,
514
578
  "example-knowledge-probe": exampleKnowledgeProbeYaml,
@@ -518,8 +582,9 @@ export const taskYamlFiles = {
518
582
  "example-studio-custom-input": exampleStudioCustomInputYaml,
519
583
  };
520
584
  /** List of task file stems, in alphabetical order */
521
- export const TASK_FILE_NAMES = ["example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
585
+ export const TASK_FILE_NAMES = ["example-agent-add-schema", "example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
522
586
  export const TASK_EXAMPLES = [
587
+ { stem: "example-agent-add-schema", mode: "agent-harness", status: "draft" },
523
588
  { stem: "example-groq-blog-listing", mode: "literacy", status: "draft" },
524
589
  { stem: "example-id-based-ref", mode: "literacy", status: "draft" },
525
590
  { stem: "example-knowledge-probe", mode: "knowledge-probe", status: "draft" },
@@ -0,0 +1,49 @@
1
+ /**
2
+ * assertions-runtime.ts — Runtime assertion functions for agent harness tasks.
3
+ *
4
+ * These run as file-based promptfoo JavaScript assertions, which execute
5
+ * in a full Node.js context (unlike inline `type: javascript` assertions
6
+ * which run in a restricted eval() sandbox without require()).
7
+ *
8
+ * Each export is a named function with the signature:
9
+ * (output: string, context: { vars, config, ... }) => GradingResult
10
+ *
11
+ * Referenced in promptfoo config as:
12
+ * value: file://dist/agent-harness/assertions-runtime.js:functionName
13
+ * config: { filePath: "...", ... }
14
+ *
15
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
16
+ */
17
+ interface GradingResult {
18
+ pass: boolean;
19
+ score: number;
20
+ reason: string;
21
+ }
22
+ interface AssertionContext {
23
+ vars: Record<string, unknown>;
24
+ config?: Record<string, unknown>;
25
+ }
26
+ /**
27
+ * Assert that a file exists in the sandbox working directory.
28
+ * Config: { filePath: string }
29
+ */
30
+ export declare function fileExists(_output: string, context: AssertionContext): GradingResult;
31
+ /**
32
+ * Assert that a file contains expected content.
33
+ * Config: { filePath: string, content: string }
34
+ */
35
+ export declare function fileContains(_output: string, context: AssertionContext): GradingResult;
36
+ /**
37
+ * Assert that a shell command succeeds (exit code 0) in the sandbox.
38
+ * Config: { command: string, timeoutMs?: number }
39
+ *
40
+ * SECURITY: Commands come from developer-authored task definitions,
41
+ * not from user input or LLM output.
42
+ */
43
+ export declare function commandSucceeds(_output: string, context: AssertionContext): GradingResult;
44
+ /**
45
+ * Assert that the git diff in the sandbox contains expected content.
46
+ * Config: { expected?: string }
47
+ */
48
+ export declare function diffMatches(_output: string, context: AssertionContext): GradingResult;
49
+ export {};
@@ -0,0 +1,138 @@
1
+ /**
2
+ * assertions-runtime.ts — Runtime assertion functions for agent harness tasks.
3
+ *
4
+ * These run as file-based promptfoo JavaScript assertions, which execute
5
+ * in a full Node.js context (unlike inline `type: javascript` assertions
6
+ * which run in a restricted eval() sandbox without require()).
7
+ *
8
+ * Each export is a named function with the signature:
9
+ * (output: string, context: { vars, config, ... }) => GradingResult
10
+ *
11
+ * Referenced in promptfoo config as:
12
+ * value: file://dist/agent-harness/assertions-runtime.js:functionName
13
+ * config: { filePath: "...", ... }
14
+ *
15
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
16
+ */
17
+ import { existsSync, readFileSync } from "fs";
18
+ import { execSync } from "child_process";
19
+ import { resolve, sep } from "path";
20
/**
 * Resolve the sandbox working directory for an assertion to an absolute path.
 *
 * Reads `context.vars.__workingDir`. When the context has no `vars`, or the
 * value is missing, empty, or not a string, the current directory (".") is
 * used instead.
 *
 * @param {{ vars?: Record<string, unknown> }} context - Assertion context.
 * @returns {string} Absolute path of the working directory.
 */
function resolveWorkDir(context) {
  const configured = context?.vars?.__workingDir;
  // path.resolve() throws a TypeError on non-string input; fall back to "."
  // rather than crashing the assertion on a malformed variable.
  const workDir =
    typeof configured === "string" && configured !== "" ? configured : ".";
  return resolve(workDir);
}
23
/**
 * Check that `target` lies inside (or is exactly) `workDir`.
 *
 * The comparison is purely lexical on the two path strings; symlinks are not
 * resolved here.
 *
 * @param {string} workDir - Absolute sandbox directory.
 * @param {string} target - Absolute path to validate.
 * @param {string} label - Path as written in the task config, used in the message.
 * @returns {{ pass: boolean, score: number, reason: string } | null}
 *   A failing grading result when the target escapes the sandbox, otherwise null.
 */
function guardTraversal(workDir, target, label) {
  if (target === workDir) {
    return null;
  }
  // A workDir that already ends in a separator (e.g. the filesystem root "/")
  // must not get a second one appended, or no child path would ever match.
  const prefix = workDir.endsWith(sep) ? workDir : workDir + sep;
  if (target.startsWith(prefix)) {
    return null;
  }
  return {
    pass: false,
    score: 0,
    reason: `Path traversal: ${label} escapes sandbox`,
  };
}
33
/**
 * Assert that a file exists in the sandbox working directory.
 * Config: { filePath: string }
 *
 * A missing, empty, or non-string `filePath` fails the assertion. Otherwise an
 * empty path would resolve to the working directory itself and pass vacuously.
 *
 * @param {string} _output - Provider output (unused).
 * @param {{ vars: Record<string, unknown>, config?: Record<string, unknown> }} context
 * @returns {{ pass: boolean, score: number, reason: string }}
 */
export function fileExists(_output, context) {
  const filePath = context.config?.filePath;
  if (typeof filePath !== "string" || filePath.trim() === "") {
    return {
      pass: false,
      score: 0,
      reason: "file-exists assertion requires a non-empty `filePath` in config",
    };
  }
  const workDir = resolveWorkDir(context);
  const target = resolve(workDir, filePath);
  // Reject paths that resolve outside the sandbox before touching the disk.
  const traversal = guardTraversal(workDir, target, filePath);
  if (traversal) {
    return traversal;
  }
  const exists = existsSync(target);
  return {
    pass: exists,
    score: exists ? 1 : 0,
    reason: exists
      ? `File exists: ${filePath}`
      : `Expected file not found: ${filePath}`,
  };
}
53
/**
 * Assert that a file contains expected content.
 * Config: { filePath: string, content: string }
 *
 * A missing, empty, or non-string `filePath` fails the assertion instead of
 * resolving to the working directory and throwing when it is read. Read errors
 * are reported as a failing result rather than thrown.
 *
 * @param {string} _output - Provider output (unused).
 * @param {{ vars: Record<string, unknown>, config?: Record<string, unknown> }} context
 * @returns {{ pass: boolean, score: number, reason: string }}
 */
export function fileContains(_output, context) {
  const filePath = context.config?.filePath;
  // NOTE(review): an absent `content` defaults to "", which every file
  // contains, so the assertion then passes for any readable file.
  const expected = context.config?.content ?? "";
  if (typeof filePath !== "string" || filePath.trim() === "") {
    return {
      pass: false,
      score: 0,
      reason: "file-contains assertion requires a non-empty `filePath` in config",
    };
  }
  const workDir = resolveWorkDir(context);
  const target = resolve(workDir, filePath);
  const traversal = guardTraversal(workDir, target, filePath);
  if (traversal) {
    return traversal;
  }
  let content;
  try {
    // Read directly instead of checking existence first: avoids a
    // check-then-read race and catches directories / permission errors.
    content = readFileSync(target, "utf-8");
  } catch (err) {
    if (err?.code === "ENOENT" || err?.code === "ENOTDIR") {
      return { pass: false, score: 0, reason: `File not found: ${filePath}` };
    }
    return {
      pass: false,
      score: 0,
      reason: `Failed to read ${filePath}: ${err?.message || String(err)}`,
    };
  }
  const contains = content.includes(expected);
  return {
    pass: contains,
    score: contains ? 1 : 0,
    reason: contains
      ? "File contains expected content"
      : "File does not contain expected content",
  };
}
78
/**
 * Assert that a shell command succeeds (exit code 0) in the sandbox.
 * Config: { command: string, timeoutMs?: number }
 *
 * SECURITY: Commands come from developer-authored task definitions,
 * not from user input or LLM output.
 *
 * A missing, empty, or non-string `command` fails the assertion; an empty
 * command would otherwise be handed to the shell and trivially exit 0.
 * `timeoutMs` falls back to 30000 unless it is a finite, non-negative number.
 *
 * @param {string} _output - Provider output (unused).
 * @param {{ vars: Record<string, unknown>, config?: Record<string, unknown> }} context
 * @returns {{ pass: boolean, score: number, reason: string }}
 */
export function commandSucceeds(_output, context) {
  const command = context.config?.command;
  if (typeof command !== "string" || command.trim() === "") {
    return {
      pass: false,
      score: 0,
      reason: "command-succeeds assertion requires a non-empty `command` in config",
    };
  }
  const requestedTimeout = context.config?.timeoutMs;
  // Validate up front so a bad timeout is not misreported as a command failure.
  const timeoutMs =
    Number.isFinite(requestedTimeout) && requestedTimeout >= 0
      ? requestedTimeout
      : 30000;
  const workDir = context.vars.__workingDir || ".";
  try {
    execSync(command, { cwd: workDir, timeout: timeoutMs });
    return { pass: true, score: 1, reason: `Command succeeded: ${command}` };
  } catch (err) {
    return {
      pass: false,
      score: 0,
      reason: `Command failed: ${err?.message || String(err)}`,
    };
  }
}
102
/**
 * Assert that the git diff in the sandbox contains expected content.
 * Config: { expected?: string }
 *
 * Runs `git diff` with no arguments in the working directory. When `expected`
 * is a string, the assertion passes if the diff text includes it; otherwise it
 * passes when the diff is non-empty.
 *
 * NOTE(review): plain `git diff` reports unstaged changes to tracked files, so
 * newly created untracked files would not appear — confirm this is intended.
 *
 * @param {string} _output - Provider output (unused).
 * @param {{ vars: Record<string, unknown>, config?: Record<string, unknown> }} context
 * @returns {{ pass: boolean, score: number, reason: string }}
 */
export function diffMatches(_output, context) {
  const expected = context.config?.expected;
  const workDir = context.vars.__workingDir || ".";
  let diff;
  try {
    diff = execSync("git diff", {
      cwd: workDir,
      encoding: "utf-8",
    });
  } catch (err) {
    return {
      pass: false,
      score: 0,
      reason: `Failed to get diff: ${err?.message || String(err)}`,
    };
  }
  if (typeof expected === "string") {
    const contains = diff.includes(expected);
    return {
      pass: contains,
      score: contains ? 1 : 0,
      reason: contains
        ? "Diff matches expected pattern"
        : "Diff does not match",
    };
  }
  const hasChanges = diff.length > 0;
  return {
    pass: hasChanges,
    score: hasChanges ? 1 : 0,
    // The reason must reflect the outcome: an empty diff is a failure.
    reason: hasChanges ? "Diff exists" : "Diff is empty: no changes detected",
  };
}
@@ -0,0 +1,58 @@
1
+ /**
2
+ * provider.ts — Agent harness Promptfoo custom provider.
3
+ *
4
+ * Handles agent-harness evaluation tasks by calling the Anthropic API
5
+ * with the task prompt and returning the response. Sandbox provisioning
6
+ * and teardown are handled by lifecycle extensions (beforeEach/afterEach),
7
+ * not by the provider itself.
8
+ *
9
+ * Promptfoo config usage:
10
+ *
11
+ * providers:
12
+ * - id: file://dist/agent-harness/provider.js
13
+ * label: "Agent Harness: My Task"
14
+ * config:
15
+ * allowedTools: ["Bash", "Read", "Write"]
16
+ * sandbox: { type: "tempdir" }
17
+ *
18
+ * Promptfoo loads this file and instantiates the default export class.
19
+ */
20
+ interface CallApiContextParams {
21
+ prompt?: {
22
+ raw: string;
23
+ label?: string;
24
+ };
25
+ vars?: Record<string, object | string>;
26
+ }
27
+ interface ProviderOptions {
28
+ config?: Record<string, unknown>;
29
+ id?: string;
30
+ }
31
+ interface ProviderResponse {
32
+ cached?: boolean;
33
+ cost?: number;
34
+ error?: string;
35
+ metadata?: Record<string, unknown>;
36
+ output?: object | string;
37
+ tokenUsage?: {
38
+ total?: number;
39
+ prompt?: number;
40
+ completion?: number;
41
+ cached?: number;
42
+ };
43
+ }
44
+ export default class AgentHarnessProvider {
45
+ config: Record<string, unknown>;
46
+ private providerId;
47
+ constructor(options: ProviderOptions);
48
+ id(): string;
49
+ /**
50
+ * Main Promptfoo provider entry point. Called for each test case.
51
+ *
52
+ * Sends the task prompt to the Anthropic API and returns the response.
53
+ * The sandbox working directory is available in context.vars.__workingDir
54
+ * (set by the beforeEach extension).
55
+ */
56
+ callApi(prompt: string, context?: CallApiContextParams): Promise<ProviderResponse>;
57
+ }
58
+ export {};
@@ -0,0 +1,104 @@
1
+ /**
2
+ * provider.ts — Agent harness Promptfoo custom provider.
3
+ *
4
+ * Handles agent-harness evaluation tasks by calling the Anthropic API
5
+ * with the task prompt and returning the response. Sandbox provisioning
6
+ * and teardown are handled by lifecycle extensions (beforeEach/afterEach),
7
+ * not by the provider itself.
8
+ *
9
+ * Promptfoo config usage:
10
+ *
11
+ * providers:
12
+ * - id: file://dist/agent-harness/provider.js
13
+ * label: "Agent Harness: My Task"
14
+ * config:
15
+ * allowedTools: ["Bash", "Read", "Write"]
16
+ * sandbox: { type: "tempdir" }
17
+ *
18
+ * Promptfoo loads this file and instantiates the default export class.
19
+ */
20
import { fileURLToPath } from "node:url";
import { config as loadDotenv } from "dotenv";
21
// Load environment variables from the .env file two directories above this
// module, located relative to the module URL rather than the process CWD.
//
// fileURLToPath() is used instead of URL#pathname: pathname stays
// percent-encoded (a space becomes "%20") and on Windows keeps a leading "/"
// before the drive letter, so the resulting string is not a usable file path.
//
// NOTE(review): `override: true` is passed through to dotenv — presumably so
// values from .env win over variables already in process.env; confirm intended.
loadDotenv({
  override: true,
  path: fileURLToPath(new URL("../../.env", import.meta.url)),
});
25
+ // ---------------------------------------------------------------------------
26
+ // Provider implementation
27
+ // ---------------------------------------------------------------------------
28
/**
 * Promptfoo custom provider that sends a task prompt to the Anthropic
 * Messages API and returns the concatenated text blocks of the reply.
 *
 * Recognised `config` keys: `model`, `maxTokens`, `allowedTools`, `sandbox`.
 * `allowedTools` and `sandbox.type` are only echoed into response metadata.
 */
export default class AgentHarnessProvider {
  config;
  providerId;

  /**
   * @param {{ id?: string, config?: Record<string, unknown> }} options
   */
  constructor(options) {
    this.providerId = options?.id ?? "agent-harness";
    this.config = options?.config ?? {};
  }

  /** @returns {string} The provider id ("agent-harness" unless overridden). */
  id() {
    return this.providerId;
  }

  /**
   * Main Promptfoo provider entry point. Called for each test case.
   *
   * Sends the task prompt to the Anthropic API and returns the response.
   * When `context.vars.__workingDir` is set it is mentioned in the system
   * prompt and echoed in the response metadata.
   *
   * Never throws: every failure is reported through the `error` field.
   *
   * @param {string} prompt - Rendered task prompt.
   * @param {{ vars?: Record<string, unknown> }} [context]
   * @returns {Promise<object>} `{ output, tokenUsage, metadata }` on success,
   *   `{ error }` on failure.
   */
  async callApi(prompt, context) {
    const apiKey = process.env.ANTHROPIC_API_KEY;
    if (!apiKey) {
      return {
        error: "ANTHROPIC_API_KEY not set. Required for agent-harness evaluation.",
      };
    }
    const model = this.config.model ?? "claude-sonnet-4-20250514";
    const maxTokens = this.config.maxTokens ?? 4096;
    const workingDir = context?.vars?.__workingDir;
    const systemPrompt = workingDir
      ? `You are an AI coding agent. Your working directory is: ${workingDir}\nComplete the following task.`
      : "You are an AI coding agent. Complete the following task.";
    try {
      const response = await fetch("https://api.anthropic.com/v1/messages", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "x-api-key": apiKey,
          "anthropic-version": "2023-06-01",
        },
        body: JSON.stringify({
          model,
          max_tokens: maxTokens,
          system: systemPrompt,
          messages: [{ role: "user", content: prompt }],
        }),
      });
      if (!response.ok) {
        const errorBody = await response.text();
        return {
          error: `Anthropic API error (${response.status}): ${errorBody}`,
        };
      }
      const data = await response.json();
      // Report a malformed body explicitly instead of letting `.filter` on
      // undefined surface as an opaque TypeError message.
      if (!Array.isArray(data?.content)) {
        return {
          error: "Anthropic API returned an unexpected response: missing content array",
        };
      }
      // Keep only text blocks; other block types carry no text output.
      const output = data.content
        .filter((block) => block.type === "text")
        .map((block) => block.text)
        .join("\n");
      const inputTokens = data.usage?.input_tokens;
      const outputTokens = data.usage?.output_tokens;
      return {
        output,
        tokenUsage: {
          prompt: inputTokens,
          completion: outputTokens,
          total: (inputTokens ?? 0) + (outputTokens ?? 0),
        },
        metadata: {
          model,
          workingDir,
          allowedTools: this.config.allowedTools,
          sandboxType: this.config.sandbox?.type ?? "none",
        },
      };
    } catch (err) {
      // Thrown values are not guaranteed to be Error instances.
      const message = err instanceof Error ? err.message : String(err);
      return { error: `Agent harness provider error: ${message}` };
    }
  }
}
@@ -138,6 +138,9 @@ async function runInit(opts) {
138
138
  else if (modeFilter === "knowledge-probe") {
139
139
  stemsToWrite = taskStemsForMode("knowledge-probe");
140
140
  }
141
+ else if (modeFilter === "agent-harness") {
142
+ stemsToWrite = taskStemsForMode("agent-harness");
143
+ }
141
144
  else {
142
145
  // Default (no --mode): write all tasks
143
146
  stemsToWrite = [...TASK_FILE_NAMES];
@@ -11,12 +11,19 @@
11
11
  import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
12
12
  export declare class GenerateConfigsStep implements PipelineStep {
13
13
  readonly name = "generate-configs";
14
+ /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
15
+ private lastLoadedTaskIds;
14
16
  check(ctx: AppContext): ValidationIssue[];
15
17
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
16
18
  private compileLiteracyVariants;
17
19
  private compileSingleMode;
18
20
  private loadTasks;
19
21
  private applyFilters;
22
+ /**
23
+ * Build a descriptive error message when no tasks match the current filters.
24
+ * Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
25
+ */
26
+ private buildNoTasksError;
20
27
  /**
21
28
  * Compile all tasks through a handler, merging results.
22
29
  * For literacy mode, ctx can carry evalMode as an extension.
@@ -20,6 +20,8 @@ import { loadSource } from "../../sources.js";
20
20
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
21
21
  export class GenerateConfigsStep {
22
22
  name = "generate-configs";
23
+ /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
24
+ lastLoadedTaskIds = [];
23
25
  check(ctx) {
24
26
  const issues = validateModelsYaml(ctx.config.rootDir);
25
27
  return issues.filter((i) => i.severity === "error");
@@ -54,10 +56,10 @@ export class GenerateConfigsStep {
54
56
  // Load tasks
55
57
  const tasks = await this.loadTasks(ctx, mode, state);
56
58
  if (tasks.length === 0) {
59
+ const error = this.buildNoTasksError(ctx, mode);
57
60
  return {
58
61
  durationMs: Date.now() - start,
59
- error: `No ${mode} tasks found. Create *.task.ts files in ` +
60
- `packages/eval/tasks/${mode}/`,
62
+ error,
61
63
  status: "failed",
62
64
  };
63
65
  }
@@ -249,6 +251,10 @@ export class GenerateConfigsStep {
249
251
  return filtered;
250
252
  }
251
253
  applyFilters(ctx, tasks) {
254
+ // Capture pre-filter IDs for diagnostic messages
255
+ this.lastLoadedTaskIds = tasks
256
+ .map((t) => t.id)
257
+ .filter((id) => !!id);
252
258
  let result = tasks;
253
259
  if (ctx.config.areas?.length) {
254
260
  const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
@@ -273,6 +279,33 @@ export class GenerateConfigsStep {
273
279
  }
274
280
  return result;
275
281
  }
282
+ /**
283
+ * Build a descriptive error message when no tasks match the current filters.
284
+ * Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
285
+ */
286
+ buildNoTasksError(ctx, mode) {
287
+ const filters = [];
288
+ if (ctx.config.tasks?.length) {
289
+ filters.push(`--task ${ctx.config.tasks.join(", ")}`);
290
+ }
291
+ if (ctx.config.areas?.length) {
292
+ filters.push(`--area ${ctx.config.areas.join(", ")}`);
293
+ }
294
+ if (ctx.config.tags?.length) {
295
+ filters.push(`--tag ${ctx.config.tags.join(", ")}`);
296
+ }
297
+ if (filters.length > 0) {
298
+ // Collect available task IDs for the hint
299
+ const availableIds = this.lastLoadedTaskIds ?? [];
300
+ const hint = availableIds.length > 0
301
+ ? `\n Available ${mode} task IDs: ${availableIds.join(", ")}`
302
+ : "";
303
+ return (`No ${mode} tasks match the current filters (${filters.join("; ")}).` +
304
+ hint);
305
+ }
306
+ return (`No ${mode} tasks found. Create *.task.ts files in ` +
307
+ `packages/eval/tasks/${mode}/`);
308
+ }
276
309
  // ---------------------------------------------------------------------------
277
310
  // Compilation helpers
278
311
  // ---------------------------------------------------------------------------
@@ -87,15 +87,23 @@ describe("validateAgentHarnessTask", () => {
87
87
  // compileAgentHarnessTask — provider assembly
88
88
  // ---------------------------------------------------------------------------
89
89
  describe("compileAgentHarnessTask — providers", () => {
90
- it("produces a provider", () => {
90
+ it("produces a Claude Agent SDK provider", () => {
91
91
  const result = compileAgentHarnessTask(makeTask());
92
92
  assert.ok(result.providers.length > 0);
93
- assert.ok(result.providers[0].id.startsWith("agent:"));
93
+ assert.equal(result.providers[0].id, "anthropic:claude-agent-sdk");
94
94
  });
95
- it("resolves coding tool preset", () => {
95
+ it("sets default agent config", () => {
96
+ const result = compileAgentHarnessTask(makeTask());
97
+ const config = result.providers[0].config;
98
+ assert.ok(config.model, "should set a model");
99
+ assert.ok(config.max_turns, "should set max_turns");
100
+ assert.ok(config.max_budget_usd, "should set budget cap");
101
+ assert.equal(config.permission_mode, "bypassPermissions");
102
+ });
103
+ it("resolves coding tool preset into custom_allowed_tools", () => {
96
104
  const result = compileAgentHarnessTask(makeTask({ tools: ["coding"] }));
97
105
  const config = result.providers[0].config;
98
- const tools = config.allowedTools;
106
+ const tools = config.custom_allowed_tools;
99
107
  assert.ok(tools.includes("Bash"));
100
108
  assert.ok(tools.includes("Read"));
101
109
  assert.ok(tools.includes("Write"));
@@ -104,7 +112,7 @@ describe("compileAgentHarnessTask — providers", () => {
104
112
  it("resolves read-only tool preset", () => {
105
113
  const result = compileAgentHarnessTask(makeTask({ tools: ["read-only"] }));
106
114
  const config = result.providers[0].config;
107
- const tools = config.allowedTools;
115
+ const tools = config.custom_allowed_tools;
108
116
  assert.ok(tools.includes("Read"));
109
117
  assert.ok(tools.includes("Grep"));
110
118
  assert.ok(!tools.includes("Write"), "read-only should not include Write");
@@ -112,19 +120,10 @@ describe("compileAgentHarnessTask — providers", () => {
112
120
  it("mixes preset and explicit tools", () => {
113
121
  const result = compileAgentHarnessTask(makeTask({ tools: ["read-only", "WebFetch"] }));
114
122
  const config = result.providers[0].config;
115
- const tools = config.allowedTools;
123
+ const tools = config.custom_allowed_tools;
116
124
  assert.ok(tools.includes("Read"));
117
125
  assert.ok(tools.includes("WebFetch"));
118
126
  });
119
- it("includes sandbox config in provider", () => {
120
- const result = compileAgentHarnessTask(makeTask({
121
- sandbox: { type: "docker", image: "node:22-slim" },
122
- }));
123
- const config = result.providers[0].config;
124
- const sandbox = config.sandbox;
125
- assert.equal(sandbox.type, "docker");
126
- assert.equal(sandbox.image, "node:22-slim");
127
- });
128
127
  });
129
128
  // ---------------------------------------------------------------------------
130
129
  // compileAgentHarnessTask — test cases
@@ -166,16 +165,20 @@ describe("compileAgentHarnessTask — test cases", () => {
166
165
  // compileAgentHarnessTask — assertions
167
166
  // ---------------------------------------------------------------------------
168
167
  describe("compileAgentHarnessTask — assertions", () => {
169
- it("maps file-exists to javascript assertion", () => {
168
+ const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
169
+ it("maps file-exists to file-based javascript assertion", () => {
170
170
  const result = compileAgentHarnessTask(makeTask({
171
171
  assertions: [{ type: "file-exists", value: "sanity.config.ts" }],
172
172
  }));
173
173
  const assertion = result.tests[0].assert?.[0];
174
174
  assert.ok(assertion);
175
175
  assert.equal(assertion.type, "javascript");
176
- assert.ok(assertion.value.includes("sanity.config.ts"));
176
+ assert.equal(assertion.value, `${RUNTIME}:fileExists`);
177
+ assert.deepEqual(assertion.config, {
178
+ filePath: "sanity.config.ts",
179
+ });
177
180
  });
178
- it("maps file-contains to javascript assertion", () => {
181
+ it("maps file-contains to file-based javascript assertion", () => {
179
182
  const result = compileAgentHarnessTask(makeTask({
180
183
  assertions: [
181
184
  {
@@ -187,25 +190,35 @@ describe("compileAgentHarnessTask — assertions", () => {
187
190
  const assertion = result.tests[0].assert?.[0];
188
191
  assert.ok(assertion);
189
192
  assert.equal(assertion.type, "javascript");
190
- assert.ok(assertion.value.includes("projectId"));
193
+ assert.equal(assertion.value, `${RUNTIME}:fileContains`);
194
+ assert.deepEqual(assertion.config, {
195
+ filePath: "config.ts",
196
+ content: "projectId",
197
+ });
191
198
  });
192
- it("maps command-succeeds to javascript assertion", () => {
199
+ it("maps command-succeeds to file-based javascript assertion", () => {
193
200
  const result = compileAgentHarnessTask(makeTask({
194
201
  assertions: [{ type: "command-succeeds", value: "npx tsc --noEmit" }],
195
202
  }));
196
203
  const assertion = result.tests[0].assert?.[0];
197
204
  assert.ok(assertion);
198
205
  assert.equal(assertion.type, "javascript");
199
- assert.ok(assertion.value.includes("tsc"));
206
+ assert.equal(assertion.value, `${RUNTIME}:commandSucceeds`);
207
+ assert.deepEqual(assertion.config, {
208
+ command: "npx tsc --noEmit",
209
+ });
200
210
  });
201
- it("maps diff-matches to javascript assertion", () => {
211
+ it("maps diff-matches to file-based javascript assertion", () => {
202
212
  const result = compileAgentHarnessTask(makeTask({
203
213
  assertions: [{ type: "diff-matches", value: "createClient" }],
204
214
  }));
205
215
  const assertion = result.tests[0].assert?.[0];
206
216
  assert.ok(assertion);
207
217
  assert.equal(assertion.type, "javascript");
208
- assert.ok(assertion.value.includes("git diff"));
218
+ assert.equal(assertion.value, `${RUNTIME}:diffMatches`);
219
+ assert.deepEqual(assertion.config, {
220
+ expected: "createClient",
221
+ });
209
222
  });
210
223
  it("passes through standard assertions", () => {
211
224
  const result = compileAgentHarnessTask(makeTask({
@@ -250,7 +263,7 @@ describe("compileAgentHarnessTask — lifecycle", () => {
250
263
  }));
251
264
  assert.equal(result.sandboxConfig.type, "docker");
252
265
  assert.equal(result.sandboxConfig.image, "node:22");
253
- assert.deepEqual(result.sandboxConfig.fixtures, ["file://schema.ts"]);
266
+ assert.deepEqual(result.sandboxConfig.fixtures, ["schema.ts"]);
254
267
  assert.equal(result.sandboxConfig.limits?.cpus, 2);
255
268
  assert.equal(result.sandboxConfig.limits?.networkAccess, false);
256
269
  });
@@ -278,7 +291,8 @@ describe("example agent harness tasks — end-to-end", () => {
278
291
  const result = compileAgentHarnessTask(modifyCodeTask);
279
292
  assert.ok(result.tests[0].assert);
280
293
  assert.ok(result.tests[0].assert.some((a) => a.type === "javascript" &&
281
- a.value.includes("useDocumentOperation")));
294
+ a.value.includes("fileContains") &&
295
+ a.config != null));
282
296
  });
283
297
  it("refactor task has docker sandbox config", () => {
284
298
  const result = compileAgentHarnessTask(multiFileRefactorTask);
@@ -65,12 +65,38 @@ export function writeCompiledModeConfig(result, mode, options) {
65
65
  if (options.graderProvider) {
66
66
  graderOpts.provider = options.graderProvider;
67
67
  }
68
- // Build provider entries
68
+ // For agent-harness mode, create sandbox directories and inject working_dir
69
+ // into provider configs. The sandbox must exist before the provider initializes
70
+ // (the Claude Agent SDK reads working_dir at construction time).
71
+ // Both working_dir and __workingDir use absolute paths to avoid ambiguity.
72
+ // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
73
+ const sandboxAbsPath = result.extras?.sandboxConfig
74
+ ? resolve(options.rootDir, `results/latest/sandbox-${mode}`)
75
+ : undefined;
76
+ if (sandboxAbsPath) {
77
+ mkdirSync(sandboxAbsPath, { recursive: true });
78
+ }
79
+ // Build provider entries, injecting working_dir for agent-harness providers
69
80
  const providerEntries = result.providers.map((p) => {
70
- if (p.config)
71
- return { id: p.id, label: p.label, config: p.config };
72
- return p.label ? { id: p.id, label: p.label } : p.id;
81
+ if (!p.config)
82
+ return p.label ? { id: p.id, label: p.label } : p.id;
83
+ const config = { ...p.config };
84
+ if (sandboxAbsPath && p.id === "anthropic:claude-agent-sdk") {
85
+ config.working_dir = sandboxAbsPath;
86
+ }
87
+ return { id: p.id, label: p.label, config };
73
88
  });
89
+ // Inject __workingDir into test vars so assertions can find the sandbox
90
+ if (sandboxAbsPath) {
91
+ for (const test of expandedTests) {
92
+ if (test.vars) {
93
+ ;
94
+ test.vars.__workingDir = sandboxAbsPath;
95
+ }
96
+ }
97
+ // Re-write the tests file with the injected paths
98
+ writeFileSync(testsPath, JSON.stringify(expandedTests, null, 2), "utf-8");
99
+ }
74
100
  // Build prompt entries
75
101
  const prompts = result.prompts.map((p) => ({
76
102
  id: p.id,
@@ -88,10 +114,11 @@ export function writeCompiledModeConfig(result, mode, options) {
88
114
  tests: [testsFilename],
89
115
  });
90
116
  // Include extensions if present (agent-harness mode)
117
+ // Promptfoo expects extensions as string[] (file paths to JS modules),
118
+ // so we materialize the { type, code } objects as a .cjs file on disk.
91
119
  if (result.extras?.extensions) {
92
- ;
93
- config.extensions =
94
- result.extras.extensions;
120
+ const extPaths = writeExtensionFile(options.rootDir, mode, result.extras.extensions);
121
+ config.extensions = extPaths;
95
122
  }
96
123
  writeConfig(options.rootDir, filename, config, options.logger);
97
124
  }
@@ -215,3 +242,47 @@ function writeYaml(path, data, header) {
215
242
  });
216
243
  writeFileSync(path, `${header}\n${yamlStr}`, "utf-8");
217
244
  }
245
/**
 * Materialize Promptfoo lifecycle extensions as a .cjs file on disk.
 *
 * Promptfoo extensions use a single-function dispatch pattern:
 *   module.exports = async function(hookName, context) { ... }
 *
 * Each extension entry in the YAML references:
 *   file://path/to/file.cjs:exportedFunctionName
 *
 * The file is written to `results/latest/<mode>-extensions.cjs` under
 * `rootDir`; the directory is created if it does not exist. When several
 * extensions share a `type`, the last one wins.
 *
 * @param {string} rootDir - Directory the output path is resolved against.
 * @param {string} mode - Mode name used in the output filename.
 * @param {Array<{ type: string, code: string }>} extensions - Hook name plus
 *   the source text of a function expression taking the hook context.
 * @returns {string[]} Single `file://…:extensionHook` reference.
 *
 * @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
 */
function writeExtensionFile(rootDir, mode, extensions) {
  // Build a dispatch map: hookName → handler code
  const handlersByHook = new Map();
  for (const ext of extensions) {
    handlersByHook.set(ext.type, ext.code);
  }
  // Generate the single dispatch function that promptfoo expects.
  // JSON.stringify the hook name so quotes or backslashes in it cannot break
  // (or inject into) the generated source.
  const hookCases = [...handlersByHook]
    .map(
      ([hookName, code]) =>
        `  if (hookName === ${JSON.stringify(hookName)}) {\n` +
        `    const handler = ${code};\n` +
        `    return handler(context);\n` +
        `  }`,
    )
    .join("\n");
  const fileContent = [
    "// AUTO-GENERATED by compiler pipeline — do not edit directly.",
    "// Run: npx @sanity/ailf generate-configs",
    "//",
    "// Promptfoo extension dispatch function.",
    `// @see https://www.promptfoo.dev/docs/configuration/reference/`,
    "",
    "async function extensionHook(hookName, context) {",
    hookCases,
    "}",
    "",
    "module.exports = extensionHook;",
    "",
  ].join("\n");
  const filename = `results/latest/${mode}-extensions.cjs`;
  const outPath = resolve(rootDir, filename);
  // The output directory is not guaranteed to exist yet.
  mkdirSync(resolve(rootDir, "results/latest"), { recursive: true });
  writeFileSync(outPath, fileContent, "utf-8");
  // Single entry pointing to the dispatch function
  return [`file://${filename}:extensionHook`];
}
@@ -4,6 +4,15 @@
4
4
  * Handles agent-specific assertion types (file-exists, file-contains,
5
5
  * command-succeeds, diff-matches) as well as standard pass-through
6
6
  * assertion types.
7
+ *
8
+ * Agent-specific assertions use file-based references to the assertions
9
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
10
+ * promptfoo's inline `type: javascript` assertions run in a restricted
11
+ * eval() sandbox where require() is unavailable. File-based assertions
12
+ * run in a full Node.js context.
13
+ *
14
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
15
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
7
16
  */
8
17
  import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
18
  import type { AgentHarnessCompileOptions } from "./types.js";
@@ -4,7 +4,18 @@
4
4
  * Handles agent-specific assertion types (file-exists, file-contains,
5
5
  * command-succeeds, diff-matches) as well as standard pass-through
6
6
  * assertion types.
7
+ *
8
+ * Agent-specific assertions use file-based references to the assertions
9
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
10
+ * promptfoo's inline `type: javascript` assertions run in a restricted
11
+ * eval() sandbox where require() is unavailable. File-based assertions
12
+ * run in a full Node.js context.
13
+ *
14
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
15
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
7
16
  */
17
+ /** Base path for the file-based assertion runtime module */
18
+ const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
8
19
  // ---------------------------------------------------------------------------
9
20
  // Assertion mapping
10
21
  // ---------------------------------------------------------------------------
@@ -53,66 +64,29 @@ export function mapAgentAssertion(assertion, options, warnings) {
53
64
  }
54
65
  // ---------------------------------------------------------------------------
55
66
  // Agent-specific assertion builders
67
+ //
68
+ // Each builder returns a file-based assertion referencing the runtime
69
+ // module with parameters passed via the `config` field.
56
70
  // ---------------------------------------------------------------------------
57
71
  export function buildFileExistsAssertion(assertion) {
58
- const filePath = String(assertion.value ?? "");
59
- // Use JSON.stringify for all interpolated values in generated JS to
60
- // prevent broken strings from filePaths containing quotes/backslashes
61
- const safeFilePath = JSON.stringify(filePath);
62
72
  return {
63
73
  type: "javascript",
64
- value: `// file-exists: ${filePath}\n` +
65
- `(function() {\n` +
66
- ` const fs = require('fs');\n` +
67
- ` const path = require('path');\n` +
68
- ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
69
- ` const target = path.resolve(workDir, ${safeFilePath});\n` +
70
- ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
71
- ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
72
- ` }\n` +
73
- ` const exists = fs.existsSync(target);\n` +
74
- ` return {\n` +
75
- ` pass: exists,\n` +
76
- ` score: exists ? 1 : 0,\n` +
77
- ` reason: exists\n` +
78
- ` ? 'File exists: ' + ${safeFilePath}\n` +
79
- ` : 'Expected file not found: ' + ${safeFilePath},\n` +
80
- ` };\n` +
81
- `})()`,
74
+ value: `${RUNTIME}:fileExists`,
75
+ config: { filePath: String(assertion.value ?? "") },
82
76
  ...(typeof assertion.weight === "number"
83
77
  ? { weight: assertion.weight }
84
78
  : {}),
85
79
  };
86
80
  }
87
81
  export function buildFileContainsAssertion(assertion) {
88
- const config = assertion.value;
89
- const filePath = config?.path ?? "";
90
- const expectedContent = config?.content ?? "";
91
- const safeFilePath = JSON.stringify(filePath);
82
+ const val = assertion.value;
92
83
  return {
93
84
  type: "javascript",
94
- value: `// file-contains: ${filePath}\n` +
95
- `(function() {\n` +
96
- ` const fs = require('fs');\n` +
97
- ` const path = require('path');\n` +
98
- ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
99
- ` const target = path.resolve(workDir, ${safeFilePath});\n` +
100
- ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
101
- ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
102
- ` }\n` +
103
- ` if (!fs.existsSync(target)) {\n` +
104
- ` return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
105
- ` }\n` +
106
- ` const content = fs.readFileSync(target, 'utf-8');\n` +
107
- ` const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
108
- ` return {\n` +
109
- ` pass: contains,\n` +
110
- ` score: contains ? 1 : 0,\n` +
111
- ` reason: contains\n` +
112
- ` ? 'File contains expected content'\n` +
113
- ` : 'File does not contain expected content',\n` +
114
- ` };\n` +
115
- `})()`,
85
+ value: `${RUNTIME}:fileContains`,
86
+ config: {
87
+ filePath: val?.path ?? "",
88
+ content: val?.content ?? "",
89
+ },
116
90
  ...(typeof assertion.weight === "number"
117
91
  ? { weight: assertion.weight }
118
92
  : {}),
@@ -133,53 +107,22 @@ export function buildFileContainsAssertion(assertion) {
133
107
  * from untrusted sources, validate commands against an allowlist first.
134
108
  */
135
109
  export function buildCommandSucceedsAssertion(assertion) {
136
- const command = String(assertion.value ?? "");
137
110
  return {
138
111
  type: "javascript",
139
- value: `// command-succeeds: ${command}\n` +
140
- `(function() {\n` +
141
- ` const { execSync } = require('child_process');\n` +
142
- ` const workDir = context.vars.__workingDir || '.';\n` +
143
- ` try {\n` +
144
- ` execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
145
- ` return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
146
- ` } catch (err) {\n` +
147
- ` return {\n` +
148
- ` pass: false,\n` +
149
- ` score: 0,\n` +
150
- ` reason: 'Command failed: ' + (err.message || err),\n` +
151
- ` };\n` +
152
- ` }\n` +
153
- `})()`,
112
+ value: `${RUNTIME}:commandSucceeds`,
113
+ config: { command: String(assertion.value ?? "") },
154
114
  ...(typeof assertion.weight === "number"
155
115
  ? { weight: assertion.weight }
156
116
  : {}),
157
117
  };
158
118
  }
159
119
  export function buildDiffMatchesAssertion(assertion) {
160
- const expected = assertion.value;
161
120
  return {
162
121
  type: "javascript",
163
- value: `// diff-matches\n` +
164
- `(function() {\n` +
165
- ` const { execSync } = require('child_process');\n` +
166
- ` const workDir = context.vars.__workingDir || '.';\n` +
167
- ` try {\n` +
168
- ` const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
169
- ` const expected = ${JSON.stringify(expected)};\n` +
170
- ` if (typeof expected === 'string') {\n` +
171
- ` const contains = diff.includes(expected);\n` +
172
- ` return {\n` +
173
- ` pass: contains,\n` +
174
- ` score: contains ? 1 : 0,\n` +
175
- ` reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
176
- ` };\n` +
177
- ` }\n` +
178
- ` return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
179
- ` } catch (err) {\n` +
180
- ` return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
181
- ` }\n` +
182
- `})()`,
122
+ value: `${RUNTIME}:diffMatches`,
123
+ config: {
124
+ ...(assertion.value != null ? { expected: assertion.value } : {}),
125
+ },
183
126
  ...(typeof assertion.weight === "number"
184
127
  ? { weight: assertion.weight }
185
128
  : {}),
@@ -27,8 +27,10 @@ export function compileAgentHarnessTask(task, options) {
27
27
  const prompts = buildAgentPrompts(task);
28
28
  // Build test cases
29
29
  const tests = buildAgentTestCases(task, options, warnings);
30
- // Build sandbox extensions
31
- const sandboxConfig = buildSandboxConfig(task);
30
+ // Build sandbox extensions — resolve fixture paths at compile time using
31
+ // the caller's cwd (monorepo root), not the eval package rootDir.
32
+ const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
33
+ const sandboxConfig = buildSandboxConfig(task, callerCwd);
32
34
  const extensions = buildLifecycleExtensions(task, sandboxConfig);
33
35
  return { providers, tests, prompts, extensions, sandboxConfig, warnings };
34
36
  }
@@ -36,22 +38,25 @@ export function compileAgentHarnessTask(task, options) {
36
38
  // Provider assembly
37
39
  // ---------------------------------------------------------------------------
38
40
  export function buildAgentProvider(task, _warnings) {
39
- // Resolve tool permissions
40
41
  const tools = resolveToolPermissions(task.tools);
41
- const config = {};
42
+ // Claude Agent SDK config.
43
+ // working_dir is set by the YAML writer to the sandbox path it creates.
44
+ // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
45
+ const config = {
46
+ model: "claude-sonnet-4-20250514",
47
+ max_turns: 25,
48
+ max_budget_usd: 1.0,
49
+ permission_mode: "bypassPermissions",
50
+ allow_dangerously_skip_permissions: true,
51
+ };
52
+ // Map AILF tool names to Claude Agent SDK tool config.
53
+ // Claude SDK uses custom_allowed_tools to replace defaults.
42
54
  if (tools.length > 0) {
43
- config.allowedTools = tools;
44
- }
45
- if (task.sandbox) {
46
- config.sandbox = {
47
- type: task.sandbox.type,
48
- ...(task.sandbox.image ? { image: task.sandbox.image } : {}),
49
- };
55
+ config.custom_allowed_tools = tools;
50
56
  }
51
- // Default to Claude Agent SDK provider
52
57
  return [
53
58
  {
54
- id: `agent:${task.id}`,
59
+ id: "anthropic:claude-agent-sdk",
55
60
  label: `Agent Harness: ${task.title}`,
56
61
  config,
57
62
  },
@@ -112,9 +117,11 @@ export function buildAgentTestCases(task, options, warnings) {
112
117
  const vars = {
113
118
  task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
114
119
  ...(task.prompt?.vars ?? {}),
115
- // Internal metadata for sandbox lifecycle hooks
120
+ // Internal metadata for sandbox lifecycle hooks.
121
+ // Fixture paths are plain strings (no file:// prefix) because
122
+ // promptfoo auto-resolves file:// in vars by reading file content.
116
123
  __sandboxType: task.sandbox?.type ?? "tempdir",
117
- __fixtures: task.fixtures ?? [],
124
+ __fixtures: (task.fixtures ?? []).map((f) => f.startsWith("file://") ? f.slice(7) : f),
118
125
  };
119
126
  const tests = [
120
127
  {
@@ -6,7 +6,14 @@
6
6
  */
7
7
  import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
8
  import type { PromptfooExtension, SandboxConfigMeta } from "./types.js";
9
- export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition): SandboxConfigMeta;
9
+ /**
10
+ * Build sandbox configuration from a task definition.
11
+ *
12
+ * Fixture paths are resolved to absolute at compile time using callerCwd
13
+ * (the directory the pipeline was invoked from), because promptfoo runs
14
+ * with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
15
+ */
16
+ export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition, callerCwd?: string): SandboxConfigMeta;
10
17
  export declare function buildLifecycleExtensions(task: AgentHarnessTaskDefinition, sandboxConfig: SandboxConfigMeta): PromptfooExtension[];
11
18
  export declare function buildBeforeEachHook(taskId: string, config: SandboxConfigMeta): string;
12
19
  export declare function buildAfterEachHook(taskId: string): string;
@@ -4,14 +4,26 @@
4
4
  * Builds Promptfoo beforeEach/afterEach hooks for provisioning and
5
5
  * tearing down sandbox working directories.
6
6
  */
7
+ import { resolve } from "path";
7
8
  // ---------------------------------------------------------------------------
8
9
  // Sandbox configuration
9
10
  // ---------------------------------------------------------------------------
10
- export function buildSandboxConfig(task) {
11
+ /**
12
+ * Build sandbox configuration from a task definition.
13
+ *
14
+ * Fixture paths are resolved to absolute at compile time using callerCwd
15
+ * (the directory the pipeline was invoked from), because promptfoo runs
16
+ * with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
17
+ */
18
+ export function buildSandboxConfig(task, callerCwd) {
19
+ const cwd = callerCwd ?? process.cwd();
11
20
  return {
12
21
  type: task.sandbox?.type ?? "tempdir",
13
22
  image: task.sandbox?.image,
14
- fixtures: task.fixtures ?? [],
23
+ fixtures: (task.fixtures ?? []).map((f) => {
24
+ const stripped = f.startsWith("file://") ? f.slice(7) : f;
25
+ return resolve(cwd, stripped);
26
+ }),
15
27
  limits: task.sandbox?.limits
16
28
  ? {
17
29
  cpus: task.sandbox.limits.cpus,
@@ -39,23 +51,41 @@ export function buildLifecycleExtensions(task, sandboxConfig) {
39
51
  return extensions;
40
52
  }
41
53
  export function buildBeforeEachHook(taskId, config) {
42
- return (`// beforeEach: provision sandbox for ${taskId}\n` +
43
- `async function({ vars }) {\n` +
44
- ` const { mkdirSync, writeFileSync } = require('fs');\n` +
45
- ` const { tmpdir } = require('os');\n` +
54
+ // Promptfoo extension hooks receive (hookName, context).
55
+ // beforeEach context is { test } — vars live at context.test.vars.
56
+ // Must return context for mutations to persist.
57
+ //
58
+ // The sandbox directory is created by the YAML writer at config-gen time
59
+ // (deterministic path in results/latest/sandbox-{taskId}/) so it exists
60
+ // before the provider is initialized. This hook copies fixtures into it.
61
+ //
62
+ // @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
63
+ return (`// beforeEach: copy fixtures into sandbox for ${taskId}\n` +
64
+ `async function(context) {\n` +
65
+ ` const { cpSync, existsSync, mkdirSync } = require('fs');\n` +
46
66
  ` const { resolve } = require('path');\n` +
47
- ` const id = 'ailf-${taskId}-' + require('crypto').randomUUID().slice(0, 8);\n` +
48
- ` const workDir = resolve(tmpdir(), id);\n` +
67
+ ` const workDir = context.test.vars?.__workingDir;\n` +
68
+ ` if (!workDir) return context;\n` +
49
69
  ` mkdirSync(workDir, { recursive: true });\n` +
50
- ` vars.__workingDir = workDir;\n` +
51
- ` vars.__sandboxId = id;\n` +
52
- ` // Fixture list: ${JSON.stringify(config.fixtures)}\n` +
70
+ ` // Copy fixtures into sandbox\n` +
71
+ ` const fixtures = ${JSON.stringify(config.fixtures)};\n` +
72
+ ` for (const fixture of fixtures) {\n` +
73
+ ` const src = resolve(process.cwd(), fixture);\n` +
74
+ ` if (existsSync(src)) {\n` +
75
+ ` cpSync(src, workDir, { recursive: true });\n` +
76
+ ` }\n` +
77
+ ` }\n` +
78
+ ` return context;\n` +
53
79
  `}`);
54
80
  }
55
81
  export function buildAfterEachHook(taskId) {
82
+ // Promptfoo extension hooks receive (hookName, context).
83
+ // afterEach context is { test, result } — vars live at context.test.vars.
84
+ // @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
56
85
  return (`// afterEach: collect artifacts + teardown for ${taskId}\n` +
57
- `async function({ vars }) {\n` +
86
+ `async function(context) {\n` +
58
87
  ` const { rmSync, readdirSync, existsSync } = require('fs');\n` +
88
+ ` const vars = context.test.vars || {};\n` +
59
89
  ` const workDir = vars.__workingDir;\n` +
60
90
  ` if (workDir && existsSync(workDir)) {\n` +
61
91
  ` try {\n` +
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "2.1.0",
3
+ "version": "2.2.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -46,6 +46,7 @@
46
46
  "zod": "^4.3.6"
47
47
  },
48
48
  "devDependencies": {
49
+ "@anthropic-ai/claude-agent-sdk": "^0.2.105",
49
50
  "@types/js-yaml": "^4.0.9",
50
51
  "@types/node": "^22.13.1",
51
52
  "tsx": "^4.19.2",