@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
@@ -20,6 +20,8 @@ import { loadSource } from "../../sources.js";
20
20
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
21
21
  export class GenerateConfigsStep {
22
22
  name = "generate-configs";
23
+ /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
24
+ lastLoadedTaskIds = [];
23
25
  check(ctx) {
24
26
  const issues = validateModelsYaml(ctx.config.rootDir);
25
27
  return issues.filter((i) => i.severity === "error");
@@ -54,10 +56,10 @@ export class GenerateConfigsStep {
54
56
  // Load tasks
55
57
  const tasks = await this.loadTasks(ctx, mode, state);
56
58
  if (tasks.length === 0) {
59
+ const error = this.buildNoTasksError(ctx, mode);
57
60
  return {
58
61
  durationMs: Date.now() - start,
59
- error: `No ${mode} tasks found. Create *.task.ts files in ` +
60
- `packages/eval/tasks/${mode}/`,
62
+ error,
61
63
  status: "failed",
62
64
  };
63
65
  }
@@ -249,6 +251,10 @@ export class GenerateConfigsStep {
249
251
  return filtered;
250
252
  }
251
253
  applyFilters(ctx, tasks) {
254
+ // Capture pre-filter IDs for diagnostic messages
255
+ this.lastLoadedTaskIds = tasks
256
+ .map((t) => t.id)
257
+ .filter((id) => !!id);
252
258
  let result = tasks;
253
259
  if (ctx.config.areas?.length) {
254
260
  const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
@@ -273,6 +279,33 @@ export class GenerateConfigsStep {
273
279
  }
274
280
  return result;
275
281
  }
282
+ /**
283
+ * Build a descriptive error message when no tasks match the current filters.
284
+ * Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
285
+ */
286
+ buildNoTasksError(ctx, mode) {
287
+ const filters = [];
288
+ if (ctx.config.tasks?.length) {
289
+ filters.push(`--task ${ctx.config.tasks.join(", ")}`);
290
+ }
291
+ if (ctx.config.areas?.length) {
292
+ filters.push(`--area ${ctx.config.areas.join(", ")}`);
293
+ }
294
+ if (ctx.config.tags?.length) {
295
+ filters.push(`--tag ${ctx.config.tags.join(", ")}`);
296
+ }
297
+ if (filters.length > 0) {
298
+ // Collect available task IDs for the hint
299
+ const availableIds = this.lastLoadedTaskIds ?? [];
300
+ const hint = availableIds.length > 0
301
+ ? `\n Available ${mode} task IDs: ${availableIds.join(", ")}`
302
+ : "";
303
+ return (`No ${mode} tasks match the current filters (${filters.join("; ")}).` +
304
+ hint);
305
+ }
306
+ return (`No ${mode} tasks found. Create *.task.ts files in ` +
307
+ `packages/eval/tasks/${mode}/`);
308
+ }
276
309
  // ---------------------------------------------------------------------------
277
310
  // Compilation helpers
278
311
  // ---------------------------------------------------------------------------
@@ -87,15 +87,23 @@ describe("validateAgentHarnessTask", () => {
87
87
  // compileAgentHarnessTask — provider assembly
88
88
  // ---------------------------------------------------------------------------
89
89
  describe("compileAgentHarnessTask — providers", () => {
90
- it("produces a provider", () => {
90
+ it("produces a Claude Agent SDK provider", () => {
91
91
  const result = compileAgentHarnessTask(makeTask());
92
92
  assert.ok(result.providers.length > 0);
93
- assert.ok(result.providers[0].id.startsWith("agent:"));
93
+ assert.equal(result.providers[0].id, "anthropic:claude-agent-sdk");
94
94
  });
95
- it("resolves coding tool preset", () => {
95
+ it("sets default agent config", () => {
96
+ const result = compileAgentHarnessTask(makeTask());
97
+ const config = result.providers[0].config;
98
+ assert.ok(config.model, "should set a model");
99
+ assert.ok(config.max_turns, "should set max_turns");
100
+ assert.ok(config.max_budget_usd, "should set budget cap");
101
+ assert.equal(config.permission_mode, "bypassPermissions");
102
+ });
103
+ it("resolves coding tool preset into custom_allowed_tools", () => {
96
104
  const result = compileAgentHarnessTask(makeTask({ tools: ["coding"] }));
97
105
  const config = result.providers[0].config;
98
- const tools = config.allowedTools;
106
+ const tools = config.custom_allowed_tools;
99
107
  assert.ok(tools.includes("Bash"));
100
108
  assert.ok(tools.includes("Read"));
101
109
  assert.ok(tools.includes("Write"));
@@ -104,7 +112,7 @@ describe("compileAgentHarnessTask — providers", () => {
104
112
  it("resolves read-only tool preset", () => {
105
113
  const result = compileAgentHarnessTask(makeTask({ tools: ["read-only"] }));
106
114
  const config = result.providers[0].config;
107
- const tools = config.allowedTools;
115
+ const tools = config.custom_allowed_tools;
108
116
  assert.ok(tools.includes("Read"));
109
117
  assert.ok(tools.includes("Grep"));
110
118
  assert.ok(!tools.includes("Write"), "read-only should not include Write");
@@ -112,19 +120,10 @@ describe("compileAgentHarnessTask — providers", () => {
112
120
  it("mixes preset and explicit tools", () => {
113
121
  const result = compileAgentHarnessTask(makeTask({ tools: ["read-only", "WebFetch"] }));
114
122
  const config = result.providers[0].config;
115
- const tools = config.allowedTools;
123
+ const tools = config.custom_allowed_tools;
116
124
  assert.ok(tools.includes("Read"));
117
125
  assert.ok(tools.includes("WebFetch"));
118
126
  });
119
- it("includes sandbox config in provider", () => {
120
- const result = compileAgentHarnessTask(makeTask({
121
- sandbox: { type: "docker", image: "node:22-slim" },
122
- }));
123
- const config = result.providers[0].config;
124
- const sandbox = config.sandbox;
125
- assert.equal(sandbox.type, "docker");
126
- assert.equal(sandbox.image, "node:22-slim");
127
- });
128
127
  });
129
128
  // ---------------------------------------------------------------------------
130
129
  // compileAgentHarnessTask — test cases
@@ -166,16 +165,20 @@ describe("compileAgentHarnessTask — test cases", () => {
166
165
  // compileAgentHarnessTask — assertions
167
166
  // ---------------------------------------------------------------------------
168
167
  describe("compileAgentHarnessTask — assertions", () => {
169
- it("maps file-exists to javascript assertion", () => {
168
+ const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
169
+ it("maps file-exists to file-based javascript assertion", () => {
170
170
  const result = compileAgentHarnessTask(makeTask({
171
171
  assertions: [{ type: "file-exists", value: "sanity.config.ts" }],
172
172
  }));
173
173
  const assertion = result.tests[0].assert?.[0];
174
174
  assert.ok(assertion);
175
175
  assert.equal(assertion.type, "javascript");
176
- assert.ok(assertion.value.includes("sanity.config.ts"));
176
+ assert.equal(assertion.value, `${RUNTIME}:fileExists`);
177
+ assert.deepEqual(assertion.config, {
178
+ filePath: "sanity.config.ts",
179
+ });
177
180
  });
178
- it("maps file-contains to javascript assertion", () => {
181
+ it("maps file-contains to file-based javascript assertion", () => {
179
182
  const result = compileAgentHarnessTask(makeTask({
180
183
  assertions: [
181
184
  {
@@ -187,25 +190,35 @@ describe("compileAgentHarnessTask — assertions", () => {
187
190
  const assertion = result.tests[0].assert?.[0];
188
191
  assert.ok(assertion);
189
192
  assert.equal(assertion.type, "javascript");
190
- assert.ok(assertion.value.includes("projectId"));
193
+ assert.equal(assertion.value, `${RUNTIME}:fileContains`);
194
+ assert.deepEqual(assertion.config, {
195
+ filePath: "config.ts",
196
+ content: "projectId",
197
+ });
191
198
  });
192
- it("maps command-succeeds to javascript assertion", () => {
199
+ it("maps command-succeeds to file-based javascript assertion", () => {
193
200
  const result = compileAgentHarnessTask(makeTask({
194
201
  assertions: [{ type: "command-succeeds", value: "npx tsc --noEmit" }],
195
202
  }));
196
203
  const assertion = result.tests[0].assert?.[0];
197
204
  assert.ok(assertion);
198
205
  assert.equal(assertion.type, "javascript");
199
- assert.ok(assertion.value.includes("tsc"));
206
+ assert.equal(assertion.value, `${RUNTIME}:commandSucceeds`);
207
+ assert.deepEqual(assertion.config, {
208
+ command: "npx tsc --noEmit",
209
+ });
200
210
  });
201
- it("maps diff-matches to javascript assertion", () => {
211
+ it("maps diff-matches to file-based javascript assertion", () => {
202
212
  const result = compileAgentHarnessTask(makeTask({
203
213
  assertions: [{ type: "diff-matches", value: "createClient" }],
204
214
  }));
205
215
  const assertion = result.tests[0].assert?.[0];
206
216
  assert.ok(assertion);
207
217
  assert.equal(assertion.type, "javascript");
208
- assert.ok(assertion.value.includes("git diff"));
218
+ assert.equal(assertion.value, `${RUNTIME}:diffMatches`);
219
+ assert.deepEqual(assertion.config, {
220
+ expected: "createClient",
221
+ });
209
222
  });
210
223
  it("passes through standard assertions", () => {
211
224
  const result = compileAgentHarnessTask(makeTask({
@@ -250,7 +263,7 @@ describe("compileAgentHarnessTask — lifecycle", () => {
250
263
  }));
251
264
  assert.equal(result.sandboxConfig.type, "docker");
252
265
  assert.equal(result.sandboxConfig.image, "node:22");
253
- assert.deepEqual(result.sandboxConfig.fixtures, ["file://schema.ts"]);
266
+ assert.deepEqual(result.sandboxConfig.fixtures, ["schema.ts"]);
254
267
  assert.equal(result.sandboxConfig.limits?.cpus, 2);
255
268
  assert.equal(result.sandboxConfig.limits?.networkAccess, false);
256
269
  });
@@ -278,7 +291,8 @@ describe("example agent harness tasks — end-to-end", () => {
278
291
  const result = compileAgentHarnessTask(modifyCodeTask);
279
292
  assert.ok(result.tests[0].assert);
280
293
  assert.ok(result.tests[0].assert.some((a) => a.type === "javascript" &&
281
- a.value.includes("useDocumentOperation")));
294
+ a.value.includes("fileContains") &&
295
+ a.config != null));
282
296
  });
283
297
  it("refactor task has docker sandbox config", () => {
284
298
  const result = compileAgentHarnessTask(multiFileRefactorTask);
@@ -65,12 +65,38 @@ export function writeCompiledModeConfig(result, mode, options) {
65
65
  if (options.graderProvider) {
66
66
  graderOpts.provider = options.graderProvider;
67
67
  }
68
- // Build provider entries
68
+ // For agent-harness mode, create sandbox directories and inject working_dir
69
+ // into provider configs. The sandbox must exist before the provider initializes
70
+ // (the Claude Agent SDK reads working_dir at construction time).
71
+ // Both working_dir and __workingDir use absolute paths to avoid ambiguity.
72
+ // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
73
+ const sandboxAbsPath = result.extras?.sandboxConfig
74
+ ? resolve(options.rootDir, `results/latest/sandbox-${mode}`)
75
+ : undefined;
76
+ if (sandboxAbsPath) {
77
+ mkdirSync(sandboxAbsPath, { recursive: true });
78
+ }
79
+ // Build provider entries, injecting working_dir for agent-harness providers
69
80
  const providerEntries = result.providers.map((p) => {
70
- if (p.config)
71
- return { id: p.id, label: p.label, config: p.config };
72
- return p.label ? { id: p.id, label: p.label } : p.id;
81
+ if (!p.config)
82
+ return p.label ? { id: p.id, label: p.label } : p.id;
83
+ const config = { ...p.config };
84
+ if (sandboxAbsPath && p.id === "anthropic:claude-agent-sdk") {
85
+ config.working_dir = sandboxAbsPath;
86
+ }
87
+ return { id: p.id, label: p.label, config };
73
88
  });
89
+ // Inject __workingDir into test vars so assertions can find the sandbox
90
+ if (sandboxAbsPath) {
91
+ for (const test of expandedTests) {
92
+ if (test.vars) {
93
+ ;
94
+ test.vars.__workingDir = sandboxAbsPath;
95
+ }
96
+ }
97
+ // Re-write the tests file with the injected paths
98
+ writeFileSync(testsPath, JSON.stringify(expandedTests, null, 2), "utf-8");
99
+ }
74
100
  // Build prompt entries
75
101
  const prompts = result.prompts.map((p) => ({
76
102
  id: p.id,
@@ -88,10 +114,11 @@ export function writeCompiledModeConfig(result, mode, options) {
88
114
  tests: [testsFilename],
89
115
  });
90
116
  // Include extensions if present (agent-harness mode)
117
+ // Promptfoo expects extensions as string[] (file paths to JS modules),
118
+ // so we materialize the { type, code } objects as a .cjs file on disk.
91
119
  if (result.extras?.extensions) {
92
- ;
93
- config.extensions =
94
- result.extras.extensions;
120
+ const extPaths = writeExtensionFile(options.rootDir, mode, result.extras.extensions);
121
+ config.extensions = extPaths;
95
122
  }
96
123
  writeConfig(options.rootDir, filename, config, options.logger);
97
124
  }
@@ -215,3 +242,47 @@ function writeYaml(path, data, header) {
215
242
  });
216
243
  writeFileSync(path, `${header}\n${yamlStr}`, "utf-8");
217
244
  }
/**
 * Materialize Promptfoo lifecycle extensions as a .cjs file on disk.
 *
 * Promptfoo extensions use a single-function dispatch pattern:
 *   module.exports = async function(hookName, context) { ... }
 *
 * Each extension entry in the YAML references:
 *   file://path/to/file.cjs:exportedFunctionName
 *
 * @param rootDir    Directory the output path is resolved against.
 * @param mode       Mode name; used as the generated file's name prefix.
 * @param extensions Iterable of `{ type, code }` entries. `type` is the hook
 *                   name to dispatch on; `code` is JavaScript source for a
 *                   function expression and is emitted verbatim, so it must
 *                   come from the compiler, never from untrusted input.
 *                   When several entries share a `type`, the last one wins.
 * @returns A one-element array holding the `file://…:extensionHook` reference
 *          (relative to `rootDir`) for the Promptfoo config.
 *
 * @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
 */
function writeExtensionFile(rootDir, mode, extensions) {
  // Build a dispatch map: hookName → handler code. A Map (rather than a plain
  // object) keeps hook names such as "__proto__" from being swallowed.
  const handlersByHook = new Map();
  for (const ext of extensions) {
    handlersByHook.set(String(ext.type), ext.code);
  }
  // Generate the single dispatch function that promptfoo expects. The hook
  // name is serialized with JSON.stringify so quotes or backslashes in it
  // cannot break out of the string literal in the generated module.
  const hookCases = [...handlersByHook]
    .map(([hookName, code]) => ` if (hookName === ${JSON.stringify(hookName)}) {\n` +
      ` const handler = ${code};\n` +
      ` return handler(context);\n` +
      ` }`)
    .join("\n");
  const fileContent = [
    "// AUTO-GENERATED by compiler pipeline — do not edit directly.",
    "// Run: npx @sanity/ailf generate-configs",
    "//",
    "// Promptfoo extension dispatch function.",
    `// @see https://www.promptfoo.dev/docs/configuration/reference/`,
    "",
    "async function extensionHook(hookName, context) {",
    hookCases,
    "}",
    "",
    "module.exports = extensionHook;",
    "",
  ].join("\n");
  const filename = `results/latest/${mode}-extensions.cjs`;
  const outPath = resolve(rootDir, filename);
  // The output directory is not guaranteed to exist yet (e.g. a fresh
  // checkout), and writeFileSync does not create parent directories.
  mkdirSync(resolve(rootDir, "results/latest"), { recursive: true });
  writeFileSync(outPath, fileContent, "utf-8");
  // Single entry pointing to the dispatch function
  return [`file://${filename}:extensionHook`];
}
@@ -4,6 +4,15 @@
4
4
  * Handles agent-specific assertion types (file-exists, file-contains,
5
5
  * command-succeeds, diff-matches) as well as standard pass-through
6
6
  * assertion types.
7
+ *
8
+ * Agent-specific assertions use file-based references to the assertions
9
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
10
+ * promptfoo's inline `type: javascript` assertions run in a restricted
11
+ * eval() sandbox where require() is unavailable. File-based assertions
12
+ * run in a full Node.js context.
13
+ *
14
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
15
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
7
16
  */
8
17
  import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
18
  import type { AgentHarnessCompileOptions } from "./types.js";
@@ -4,7 +4,18 @@
4
4
  * Handles agent-specific assertion types (file-exists, file-contains,
5
5
  * command-succeeds, diff-matches) as well as standard pass-through
6
6
  * assertion types.
7
+ *
8
+ * Agent-specific assertions use file-based references to the assertions
9
+ * runtime module (dist/agent-harness/assertions-runtime.js) because
10
+ * promptfoo's inline `type: javascript` assertions run in a restricted
11
+ * eval() sandbox where require() is unavailable. File-based assertions
12
+ * run in a full Node.js context.
13
+ *
14
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
15
+ * @see src/agent-harness/assertions-runtime.ts — runtime implementations
7
16
  */
17
+ /** Base path for the file-based assertion runtime module */
18
+ const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
8
19
  // ---------------------------------------------------------------------------
9
20
  // Assertion mapping
10
21
  // ---------------------------------------------------------------------------
@@ -53,66 +64,29 @@ export function mapAgentAssertion(assertion, options, warnings) {
53
64
  }
54
65
  // ---------------------------------------------------------------------------
55
66
  // Agent-specific assertion builders
67
+ //
68
+ // Each builder returns a file-based assertion referencing the runtime
69
+ // module with parameters passed via the `config` field.
56
70
  // ---------------------------------------------------------------------------
57
71
  export function buildFileExistsAssertion(assertion) {
58
- const filePath = String(assertion.value ?? "");
59
- // Use JSON.stringify for all interpolated values in generated JS to
60
- // prevent broken strings from filePaths containing quotes/backslashes
61
- const safeFilePath = JSON.stringify(filePath);
62
72
  return {
63
73
  type: "javascript",
64
- value: `// file-exists: ${filePath}\n` +
65
- `(function() {\n` +
66
- ` const fs = require('fs');\n` +
67
- ` const path = require('path');\n` +
68
- ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
69
- ` const target = path.resolve(workDir, ${safeFilePath});\n` +
70
- ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
71
- ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
72
- ` }\n` +
73
- ` const exists = fs.existsSync(target);\n` +
74
- ` return {\n` +
75
- ` pass: exists,\n` +
76
- ` score: exists ? 1 : 0,\n` +
77
- ` reason: exists\n` +
78
- ` ? 'File exists: ' + ${safeFilePath}\n` +
79
- ` : 'Expected file not found: ' + ${safeFilePath},\n` +
80
- ` };\n` +
81
- `})()`,
74
+ value: `${RUNTIME}:fileExists`,
75
+ config: { filePath: String(assertion.value ?? "") },
82
76
  ...(typeof assertion.weight === "number"
83
77
  ? { weight: assertion.weight }
84
78
  : {}),
85
79
  };
86
80
  }
87
81
  export function buildFileContainsAssertion(assertion) {
88
- const config = assertion.value;
89
- const filePath = config?.path ?? "";
90
- const expectedContent = config?.content ?? "";
91
- const safeFilePath = JSON.stringify(filePath);
82
+ const val = assertion.value;
92
83
  return {
93
84
  type: "javascript",
94
- value: `// file-contains: ${filePath}\n` +
95
- `(function() {\n` +
96
- ` const fs = require('fs');\n` +
97
- ` const path = require('path');\n` +
98
- ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
99
- ` const target = path.resolve(workDir, ${safeFilePath});\n` +
100
- ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
101
- ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
102
- ` }\n` +
103
- ` if (!fs.existsSync(target)) {\n` +
104
- ` return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
105
- ` }\n` +
106
- ` const content = fs.readFileSync(target, 'utf-8');\n` +
107
- ` const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
108
- ` return {\n` +
109
- ` pass: contains,\n` +
110
- ` score: contains ? 1 : 0,\n` +
111
- ` reason: contains\n` +
112
- ` ? 'File contains expected content'\n` +
113
- ` : 'File does not contain expected content',\n` +
114
- ` };\n` +
115
- `})()`,
85
+ value: `${RUNTIME}:fileContains`,
86
+ config: {
87
+ filePath: val?.path ?? "",
88
+ content: val?.content ?? "",
89
+ },
116
90
  ...(typeof assertion.weight === "number"
117
91
  ? { weight: assertion.weight }
118
92
  : {}),
@@ -133,53 +107,22 @@ export function buildFileContainsAssertion(assertion) {
133
107
  * from untrusted sources, validate commands against an allowlist first.
134
108
  */
135
109
  export function buildCommandSucceedsAssertion(assertion) {
136
- const command = String(assertion.value ?? "");
137
110
  return {
138
111
  type: "javascript",
139
- value: `// command-succeeds: ${command}\n` +
140
- `(function() {\n` +
141
- ` const { execSync } = require('child_process');\n` +
142
- ` const workDir = context.vars.__workingDir || '.';\n` +
143
- ` try {\n` +
144
- ` execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
145
- ` return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
146
- ` } catch (err) {\n` +
147
- ` return {\n` +
148
- ` pass: false,\n` +
149
- ` score: 0,\n` +
150
- ` reason: 'Command failed: ' + (err.message || err),\n` +
151
- ` };\n` +
152
- ` }\n` +
153
- `})()`,
112
+ value: `${RUNTIME}:commandSucceeds`,
113
+ config: { command: String(assertion.value ?? "") },
154
114
  ...(typeof assertion.weight === "number"
155
115
  ? { weight: assertion.weight }
156
116
  : {}),
157
117
  };
158
118
  }
159
119
  export function buildDiffMatchesAssertion(assertion) {
160
- const expected = assertion.value;
161
120
  return {
162
121
  type: "javascript",
163
- value: `// diff-matches\n` +
164
- `(function() {\n` +
165
- ` const { execSync } = require('child_process');\n` +
166
- ` const workDir = context.vars.__workingDir || '.';\n` +
167
- ` try {\n` +
168
- ` const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
169
- ` const expected = ${JSON.stringify(expected)};\n` +
170
- ` if (typeof expected === 'string') {\n` +
171
- ` const contains = diff.includes(expected);\n` +
172
- ` return {\n` +
173
- ` pass: contains,\n` +
174
- ` score: contains ? 1 : 0,\n` +
175
- ` reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
176
- ` };\n` +
177
- ` }\n` +
178
- ` return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
179
- ` } catch (err) {\n` +
180
- ` return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
181
- ` }\n` +
182
- `})()`,
122
+ value: `${RUNTIME}:diffMatches`,
123
+ config: {
124
+ ...(assertion.value != null ? { expected: assertion.value } : {}),
125
+ },
183
126
  ...(typeof assertion.weight === "number"
184
127
  ? { weight: assertion.weight }
185
128
  : {}),
@@ -27,8 +27,10 @@ export function compileAgentHarnessTask(task, options) {
27
27
  const prompts = buildAgentPrompts(task);
28
28
  // Build test cases
29
29
  const tests = buildAgentTestCases(task, options, warnings);
30
- // Build sandbox extensions
31
- const sandboxConfig = buildSandboxConfig(task);
30
+ // Build sandbox extensions — resolve fixture paths at compile time using
31
+ // the caller's cwd (monorepo root), not the eval package rootDir.
32
+ const callerCwd = process.env.AILF_CALLER_CWD ?? process.cwd();
33
+ const sandboxConfig = buildSandboxConfig(task, callerCwd);
32
34
  const extensions = buildLifecycleExtensions(task, sandboxConfig);
33
35
  return { providers, tests, prompts, extensions, sandboxConfig, warnings };
34
36
  }
@@ -36,22 +38,25 @@ export function compileAgentHarnessTask(task, options) {
36
38
  // Provider assembly
37
39
  // ---------------------------------------------------------------------------
38
40
  export function buildAgentProvider(task, _warnings) {
39
- // Resolve tool permissions
40
41
  const tools = resolveToolPermissions(task.tools);
41
- const config = {};
42
+ // Claude Agent SDK config.
43
+ // working_dir is set by the YAML writer to the sandbox path it creates.
44
+ // @see https://www.promptfoo.dev/docs/providers/claude-agent-sdk/
45
+ const config = {
46
+ model: "claude-sonnet-4-20250514",
47
+ max_turns: 25,
48
+ max_budget_usd: 1.0,
49
+ permission_mode: "bypassPermissions",
50
+ allow_dangerously_skip_permissions: true,
51
+ };
52
+ // Map AILF tool names to Claude Agent SDK tool config.
53
+ // Claude SDK uses custom_allowed_tools to replace defaults.
42
54
  if (tools.length > 0) {
43
- config.allowedTools = tools;
44
- }
45
- if (task.sandbox) {
46
- config.sandbox = {
47
- type: task.sandbox.type,
48
- ...(task.sandbox.image ? { image: task.sandbox.image } : {}),
49
- };
55
+ config.custom_allowed_tools = tools;
50
56
  }
51
- // Default to Claude Agent SDK provider
52
57
  return [
53
58
  {
54
- id: `agent:${task.id}`,
59
+ id: "anthropic:claude-agent-sdk",
55
60
  label: `Agent Harness: ${task.title}`,
56
61
  config,
57
62
  },
@@ -112,9 +117,11 @@ export function buildAgentTestCases(task, options, warnings) {
112
117
  const vars = {
113
118
  task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
114
119
  ...(task.prompt?.vars ?? {}),
115
- // Internal metadata for sandbox lifecycle hooks
120
+ // Internal metadata for sandbox lifecycle hooks.
121
+ // Fixture paths are plain strings (no file:// prefix) because
122
+ // promptfoo auto-resolves file:// in vars by reading file content.
116
123
  __sandboxType: task.sandbox?.type ?? "tempdir",
117
- __fixtures: task.fixtures ?? [],
124
+ __fixtures: (task.fixtures ?? []).map((f) => f.startsWith("file://") ? f.slice(7) : f),
118
125
  };
119
126
  const tests = [
120
127
  {
@@ -6,7 +6,14 @@
6
6
  */
7
7
  import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
8
  import type { PromptfooExtension, SandboxConfigMeta } from "./types.js";
9
- export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition): SandboxConfigMeta;
9
+ /**
10
+ * Build sandbox configuration from a task definition.
11
+ *
12
+ * Fixture paths are resolved to absolute at compile time using callerCwd
13
+ * (the directory the pipeline was invoked from), because promptfoo runs
14
+ * with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
15
+ */
16
+ export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition, callerCwd?: string): SandboxConfigMeta;
10
17
  export declare function buildLifecycleExtensions(task: AgentHarnessTaskDefinition, sandboxConfig: SandboxConfigMeta): PromptfooExtension[];
11
18
  export declare function buildBeforeEachHook(taskId: string, config: SandboxConfigMeta): string;
12
19
  export declare function buildAfterEachHook(taskId: string): string;
@@ -4,14 +4,26 @@
4
4
  * Builds Promptfoo beforeEach/afterEach hooks for provisioning and
5
5
  * tearing down sandbox working directories.
6
6
  */
7
+ import { resolve } from "path";
7
8
  // ---------------------------------------------------------------------------
8
9
  // Sandbox configuration
9
10
  // ---------------------------------------------------------------------------
10
- export function buildSandboxConfig(task) {
11
+ /**
12
+ * Build sandbox configuration from a task definition.
13
+ *
14
+ * Fixture paths are resolved to absolute at compile time using callerCwd
15
+ * (the directory the pipeline was invoked from), because promptfoo runs
16
+ * with cwd set to packages/eval/ — not the monorepo root where apps/ lives.
17
+ */
18
+ export function buildSandboxConfig(task, callerCwd) {
19
+ const cwd = callerCwd ?? process.cwd();
11
20
  return {
12
21
  type: task.sandbox?.type ?? "tempdir",
13
22
  image: task.sandbox?.image,
14
- fixtures: task.fixtures ?? [],
23
+ fixtures: (task.fixtures ?? []).map((f) => {
24
+ const stripped = f.startsWith("file://") ? f.slice(7) : f;
25
+ return resolve(cwd, stripped);
26
+ }),
15
27
  limits: task.sandbox?.limits
16
28
  ? {
17
29
  cpus: task.sandbox.limits.cpus,
@@ -39,23 +51,41 @@ export function buildLifecycleExtensions(task, sandboxConfig) {
39
51
  return extensions;
40
52
  }
41
53
  export function buildBeforeEachHook(taskId, config) {
42
- return (`// beforeEach: provision sandbox for ${taskId}\n` +
43
- `async function({ vars }) {\n` +
44
- ` const { mkdirSync, writeFileSync } = require('fs');\n` +
45
- ` const { tmpdir } = require('os');\n` +
54
+ // Promptfoo extension hooks receive (hookName, context).
55
+ // beforeEach context is { test } vars live at context.test.vars.
56
+ // Must return context for mutations to persist.
57
+ //
58
+ // The sandbox directory is created by the YAML writer at config-gen time
59
+ // (deterministic path in results/latest/sandbox-{taskId}/) so it exists
60
+ // before the provider is initialized. This hook copies fixtures into it.
61
+ //
62
+ // @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
63
+ return (`// beforeEach: copy fixtures into sandbox for ${taskId}\n` +
64
+ `async function(context) {\n` +
65
+ ` const { cpSync, existsSync, mkdirSync } = require('fs');\n` +
46
66
  ` const { resolve } = require('path');\n` +
47
- ` const id = 'ailf-${taskId}-' + require('crypto').randomUUID().slice(0, 8);\n` +
48
- ` const workDir = resolve(tmpdir(), id);\n` +
67
+ ` const workDir = context.test.vars?.__workingDir;\n` +
68
+ ` if (!workDir) return context;\n` +
49
69
  ` mkdirSync(workDir, { recursive: true });\n` +
50
- ` vars.__workingDir = workDir;\n` +
51
- ` vars.__sandboxId = id;\n` +
52
- ` // Fixture list: ${JSON.stringify(config.fixtures)}\n` +
70
+ ` // Copy fixtures into sandbox\n` +
71
+ ` const fixtures = ${JSON.stringify(config.fixtures)};\n` +
72
+ ` for (const fixture of fixtures) {\n` +
73
+ ` const src = resolve(process.cwd(), fixture);\n` +
74
+ ` if (existsSync(src)) {\n` +
75
+ ` cpSync(src, workDir, { recursive: true });\n` +
76
+ ` }\n` +
77
+ ` }\n` +
78
+ ` return context;\n` +
53
79
  `}`);
54
80
  }
55
81
  export function buildAfterEachHook(taskId) {
82
+ // Promptfoo extension hooks receive (hookName, context).
83
+ // afterEach context is { test, result } — vars live at context.test.vars.
84
+ // @see https://www.promptfoo.dev/docs/configuration/reference/ — extensions
56
85
  return (`// afterEach: collect artifacts + teardown for ${taskId}\n` +
57
- `async function({ vars }) {\n` +
86
+ `async function(context) {\n` +
58
87
  ` const { rmSync, readdirSync, existsSync } = require('fs');\n` +
88
+ ` const vars = context.test.vars || {};\n` +
59
89
  ` const workDir = vars.__workingDir;\n` +
60
90
  ` if (workDir && existsSync(workDir)) {\n` +
61
91
  ` try {\n` +