@sanity/ailf 3.8.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/config/canary-tasks.ts +64 -0
  2. package/config/test-budgets.ts +24 -0
  3. package/dist/_vendor/ailf-core/config-helpers.d.ts +19 -0
  4. package/dist/_vendor/ailf-core/config-helpers.js +27 -0
  5. package/dist/_vendor/ailf-core/index.d.ts +1 -1
  6. package/dist/_vendor/ailf-core/index.js +1 -1
  7. package/dist/_vendor/ailf-core/schemas/canary-tasks.d.ts +52 -0
  8. package/dist/_vendor/ailf-core/schemas/canary-tasks.js +46 -0
  9. package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
  10. package/dist/_vendor/ailf-core/schemas/index.js +2 -0
  11. package/dist/_vendor/ailf-core/schemas/test-budgets.d.ts +19 -0
  12. package/dist/_vendor/ailf-core/schemas/test-budgets.js +34 -0
  13. package/dist/_vendor/ailf-shared/canary-drift.d.ts +84 -0
  14. package/dist/_vendor/ailf-shared/canary-drift.js +86 -0
  15. package/dist/_vendor/ailf-shared/index.d.ts +1 -0
  16. package/dist/_vendor/ailf-shared/index.js +1 -0
  17. package/dist/adapters/config-sources/file-config-adapter.js +4 -5
  18. package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
  19. package/dist/cli-program.d.ts +39 -0
  20. package/dist/cli-program.js +137 -0
  21. package/dist/cli.d.ts +6 -0
  22. package/dist/cli.js +12 -122
  23. package/dist/config/canary-tasks.ts +64 -0
  24. package/dist/config/test-budgets.ts +24 -0
  25. package/dist/pipeline/calculate-scores.d.ts +17 -2
  26. package/dist/pipeline/calculate-scores.js +99 -0
  27. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +5 -0
  28. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +25 -2
  29. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +5 -1
  30. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +4 -0
  31. package/dist/pipeline/compiler/promptfoo-compiler.js +23 -0
  32. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  33. package/dist/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  34. package/package.json +6 -3
  35. package/tasks/knowledge-probe/groq-projections.task.ts +29 -11
  36. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +0 -10
  37. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +0 -366
  38. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +0 -9
  39. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +0 -145
  40. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +0 -10
  41. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +0 -314
  42. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +0 -10
  43. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +0 -486
  44. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +0 -10
  45. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +0 -425
  46. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +0 -9
  47. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +0 -332
  48. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +0 -12
  49. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +0 -210
  50. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +0 -7
  51. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +0 -404
  52. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +0 -10
  53. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +0 -184
  54. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +0 -8
  55. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +0 -301
  56. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +0 -9
  57. package/dist/pipeline/compiler/__tests__/telemetry.test.js +0 -503
  58. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +0 -10
  59. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +0 -509
@@ -11,10 +11,20 @@
11
11
  *
12
12
  * @see docs/archive/exec-plans/architecture-overhaul/phase-2-config-compiler.md
13
13
  */
14
+ import { dirname, resolve as resolvePath } from "node:path";
15
+ import { fileURLToPath } from "node:url";
14
16
  import { mapAssertions } from "./assertion-mapper.js";
15
17
  import { resolveTaskFixtures } from "./fixture-resolver.js";
16
18
  import { LiteracyVariant } from "../normalize-mode.js";
17
19
  import { resolveVariables } from "./variable-resolver.js";
20
+ /**
21
+ * Absolute filesystem path to the AILF mock Promptfoo provider. Resolved
22
+ * once at module load relative to this file. Promptfoo's `file://` provider
23
+ * loader requires an absolute path. See buildProviders for the env-var
24
+ * gate that swaps real providers for this mock.
25
+ */
26
+ const __dirname = dirname(fileURLToPath(import.meta.url));
27
+ const MOCK_PROVIDER_ABSPATH = resolvePath(__dirname, "..", "..", "promptfoo-providers", "mock-provider.cjs");
18
28
  // ---------------------------------------------------------------------------
19
29
  // Public API
20
30
  // ---------------------------------------------------------------------------
@@ -143,6 +153,19 @@ function buildProviders(models, mode) {
143
153
  },
144
154
  });
145
155
  }
156
+ // Replay swap — when AILF_REPLAY_LLMS=1 is set, rewrite every provider's
157
+ // `id` to the file-based AILF mock provider so the Promptfoo subprocess
158
+ // never makes a live LLM call. We preserve `label` and stash the
159
+ // original `id` in `config.originalId` so the mock provider can surface
160
+ // model identity in its output and reports remain interpretable.
161
+ // See W0110 (M5.1) and docs/design-docs/testing-strategy.md.
162
+ if (process.env.AILF_REPLAY_LLMS === "1") {
163
+ return providers.map((p) => ({
164
+ id: `file://${MOCK_PROVIDER_ABSPATH}`,
165
+ label: p.label,
166
+ config: { ...p.config, originalId: p.id },
167
+ }));
168
+ }
146
169
  return providers;
147
170
  }
148
171
  /**
@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
107
107
  slugToDocId: Map<string, string>;
108
108
  }): {
109
109
  baseline?: {
110
- rubric?: "full" | "abbreviated" | "none" | undefined;
110
+ rubric?: "abbreviated" | "full" | "none" | undefined;
111
111
  enabled?: boolean | undefined;
112
112
  } | undefined;
113
113
  _id: string;
@@ -41,22 +41,40 @@ export default defineTask({
41
41
  assertions: [
42
42
  { type: "contains", value: "->" },
43
43
  { type: "contains", value: "select(" },
44
+ // Templated rubrics so the compiled assertions carry `metadata.dimension`
45
+ // and the scoring engine can populate per-dimension scores from the KP
46
+ // profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
44
47
  {
45
48
  type: "llm-rubric",
46
- value:
47
- "The response should demonstrate accurate knowledge of GROQ " +
48
- "projection syntax with working code examples. Check that the " +
49
- "dereference operator, spread syntax, and select() are correctly " +
50
- "explained with valid GROQ code.",
51
- weight: 0.6,
49
+ template: "factual-correctness",
50
+ criteria: [
51
+ "The dereference operator `->` is correctly explained for following references",
52
+ "The spread operator `...` is shown in a valid projection example",
53
+ "`select()` is used with valid syntax for conditional projections",
54
+ 'Computed field names (e.g., `"label": title`) are demonstrated correctly',
55
+ "Code examples use valid GROQ — no fabricated operators or deprecated syntax",
56
+ ],
52
57
  },
53
58
  {
54
59
  type: "llm-rubric",
55
- value:
56
- "Evaluate whether the response reflects current GROQ syntax " +
57
- "(post-2023). Check for deprecated patterns or outdated " +
58
- "recommendations.",
59
- weight: 0.4,
60
+ template: "completeness",
61
+ criteria: [
62
+ "Basic object projection with `{}` is covered",
63
+ "Nested projections and the spread operator are both addressed",
64
+ "Computed/aliased field names are demonstrated",
65
+ "The dereference operator `->` is included with a worked example",
66
+ "Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
67
+ "Conditional projections via `select()` are covered",
68
+ ],
69
+ },
70
+ {
71
+ type: "llm-rubric",
72
+ template: "currency",
73
+ criteria: [
74
+ "Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
75
+ "Recommendations don't reference removed or legacy query forms",
76
+ "Modern projection idioms are used (e.g., spread + override)",
77
+ ],
60
78
  },
61
79
  ],
62
80
  })
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.8.0",
3
+ "version": "3.9.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -50,6 +50,7 @@
50
50
  "@anthropic-ai/claude-agent-sdk": "^0.2.105",
51
51
  "@types/js-yaml": "^4.0.9",
52
52
  "@types/node": "^22.13.1",
53
+ "nock": "^14.0.13",
53
54
  "tsx": "^4.19.2",
54
55
  "typescript": "^5.7.3",
55
56
  "@sanity/ailf-core": "0.1.0",
@@ -73,9 +74,11 @@
73
74
  "cli": "tsx src/cli.ts",
74
75
  "pipeline": "tsx src/cli.ts pipeline",
75
76
  "validate": "tsx src/cli.ts validate config",
76
- "test": "tsx --test src/__tests__/*.test.ts",
77
+ "test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
77
78
  "test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
78
- "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts",
79
+ "test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
80
+ "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/run-remote-tier2.test.ts",
81
+ "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
79
82
  "pr-comment": "tsx src/cli.ts pr-comment",
80
83
  "coverage-audit": "tsx src/cli.ts report coverage",
81
84
  "readiness-report": "tsx src/cli.ts report readiness",
@@ -41,22 +41,40 @@ export default defineTask({
41
41
  assertions: [
42
42
  { type: "contains", value: "->" },
43
43
  { type: "contains", value: "select(" },
44
+ // Templated rubrics so the compiled assertions carry `metadata.dimension`
45
+ // and the scoring engine can populate per-dimension scores from the KP
46
+ // profile (factual-correctness 0.45 / completeness 0.35 / currency 0.20).
44
47
  {
45
48
  type: "llm-rubric",
46
- value:
47
- "The response should demonstrate accurate knowledge of GROQ " +
48
- "projection syntax with working code examples. Check that the " +
49
- "dereference operator, spread syntax, and select() are correctly " +
50
- "explained with valid GROQ code.",
51
- weight: 0.6,
49
+ template: "factual-correctness",
50
+ criteria: [
51
+ "The dereference operator `->` is correctly explained for following references",
52
+ "The spread operator `...` is shown in a valid projection example",
53
+ "`select()` is used with valid syntax for conditional projections",
54
+ 'Computed field names (e.g., `"label": title`) are demonstrated correctly',
55
+ "Code examples use valid GROQ — no fabricated operators or deprecated syntax",
56
+ ],
52
57
  },
53
58
  {
54
59
  type: "llm-rubric",
55
- value:
56
- "Evaluate whether the response reflects current GROQ syntax " +
57
- "(post-2023). Check for deprecated patterns or outdated " +
58
- "recommendations.",
59
- weight: 0.4,
60
+ template: "completeness",
61
+ criteria: [
62
+ "Basic object projection with `{}` is covered",
63
+ "Nested projections and the spread operator are both addressed",
64
+ "Computed/aliased field names are demonstrated",
65
+ "The dereference operator `->` is included with a worked example",
66
+ "Both inclusive (`[0..5]`) and exclusive (`[0...5]`) array slicing are explained",
67
+ "Conditional projections via `select()` are covered",
68
+ ],
69
+ },
70
+ {
71
+ type: "llm-rubric",
72
+ template: "currency",
73
+ criteria: [
74
+ "Examples reflect current GROQ syntax (post-2023) — no deprecated patterns",
75
+ "Recommendations don't reference removed or legacy query forms",
76
+ "Modern projection idioms are used (e.g., spread + override)",
77
+ ],
60
78
  },
61
79
  ],
62
80
  })
@@ -1,10 +0,0 @@
1
- /**
2
- * agent-harness-handler.test.ts — Tests for agent harness mode compilation.
3
- *
4
- * Tests validation, provider assembly, tool permission resolution,
5
- * assertion mapping, sandbox config, lifecycle extensions, and
6
- * end-to-end compilation of example tasks.
7
- *
8
- * Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
9
- */
10
- export {};
@@ -1,366 +0,0 @@
1
- /**
2
- * agent-harness-handler.test.ts — Tests for agent harness mode compilation.
3
- *
4
- * Tests validation, provider assembly, tool permission resolution,
5
- * assertion mapping, sandbox config, lifecycle extensions, and
6
- * end-to-end compilation of example tasks.
7
- *
8
- * Run: npx tsx --test src/pipeline/compiler/__tests__/agent-harness-handler.test.ts
9
- */
10
- import assert from "node:assert/strict";
11
- import { describe, it } from "node:test";
12
- import { LiteracyVariant } from "../../normalize-mode.js";
13
- import { compileAgentHarnessTask, AGENT_HARNESS_PROMPT_TEMPLATES, handler as agentHandler, validateAgentHarnessTask, } from "../mode-handlers/agent-harness/index.js";
14
- import { allAgentHarnessExampleTasks, scaffoldProjectTask, modifyCodeTask, multiFileRefactorTask, } from "../mode-handlers/__fixtures__/agent-harness-example-tasks.js";
15
- // ---------------------------------------------------------------------------
16
- // Helpers
17
- // ---------------------------------------------------------------------------
18
- function makeTask(overrides) {
19
- return {
20
- mode: "agent-harness",
21
- id: "test-agent-task",
22
- title: "Test Agent Task",
23
- description: "A test agent harness task",
24
- area: "studio",
25
- ...overrides,
26
- };
27
- }
28
- // ---------------------------------------------------------------------------
29
- // handler.getPrompts() — prompt template ownership
30
- // ---------------------------------------------------------------------------
31
- describe("AgentHarnessHandler.getPrompts", () => {
32
- it("returns prompt templates", () => {
33
- const prompts = agentHandler.getPrompts();
34
- assert.ok(prompts, "getPrompts() should return a record");
35
- assert.ok(Object.keys(prompts).length > 0, "should return at least one template");
36
- });
37
- it("returns templates keyed by agent-specific IDs (not literacy names)", () => {
38
- const prompts = agentHandler.getPrompts();
39
- const keys = Object.keys(prompts);
40
- // Must not use literacy template names
41
- assert.ok(!keys.includes("with-docs"), "should not use literacy key 'with-docs'");
42
- assert.ok(!keys.includes("without-docs"), "should not use literacy key 'without-docs'");
43
- assert.ok(!keys.includes(LiteracyVariant.AGENTIC), "should not use literacy key 'agentic'");
44
- // Must have agent-appropriate key(s)
45
- assert.ok(keys.includes("agent-harness"), "should include 'agent-harness' template");
46
- });
47
- it("agent-harness template describes task for agent execution", () => {
48
- const prompts = agentHandler.getPrompts();
49
- const template = prompts["agent-harness"];
50
- assert.ok(template, "agent-harness template should exist");
51
- assert.ok(template.template.includes("{{task}}"), "should include {{task}} placeholder");
52
- // Should reference agent / sandbox / tool execution context
53
- assert.ok(/sandbox|file|tool|implement|code/i.test(template.template), "template should reference agent execution concepts");
54
- });
55
- it("template has correct PromptTemplate shape", () => {
56
- const prompts = agentHandler.getPrompts();
57
- const template = prompts["agent-harness"];
58
- assert.equal(template.id, "agent-harness");
59
- assert.ok(template.label, "should have a human-readable label");
60
- assert.ok(template.template, "should have a template string");
61
- assert.ok(Array.isArray(template.variables), "should declare variables");
62
- assert.ok(template.variables.includes("task"), "variables should include 'task'");
63
- });
64
- it("exported AGENT_HARNESS_PROMPT_TEMPLATES matches handler.getPrompts()", () => {
65
- const fromHandler = agentHandler.getPrompts();
66
- assert.deepEqual(fromHandler, AGENT_HARNESS_PROMPT_TEMPLATES);
67
- });
68
- });
69
- // ---------------------------------------------------------------------------
70
- // validateAgentHarnessTask
71
- // ---------------------------------------------------------------------------
72
- describe("validateAgentHarnessTask", () => {
73
- it("passes for a valid minimal task", () => {
74
- const errors = validateAgentHarnessTask(makeTask());
75
- assert.equal(errors.length, 0);
76
- });
77
- it("errors on missing ID", () => {
78
- const errors = validateAgentHarnessTask(makeTask({ id: "" }));
79
- assert.ok(errors.some((e) => e.field === "id"));
80
- });
81
- it("errors on missing title", () => {
82
- const errors = validateAgentHarnessTask(makeTask({ title: "" }));
83
- assert.ok(errors.some((e) => e.field === "title"));
84
- });
85
- });
86
- // ---------------------------------------------------------------------------
87
- // compileAgentHarnessTask — provider assembly
88
- // ---------------------------------------------------------------------------
89
- describe("compileAgentHarnessTask — providers", () => {
90
- it("produces a Claude Agent SDK provider", () => {
91
- const result = compileAgentHarnessTask(makeTask());
92
- assert.ok(result.providers.length > 0);
93
- assert.equal(result.providers[0].id, "anthropic:claude-agent-sdk");
94
- });
95
- it("sets default agent config", () => {
96
- const result = compileAgentHarnessTask(makeTask());
97
- const config = result.providers[0].config;
98
- assert.ok(config.model, "should set a model");
99
- assert.ok(config.max_turns, "should set max_turns");
100
- assert.ok(config.max_budget_usd, "should set budget cap");
101
- assert.equal(config.permission_mode, "bypassPermissions");
102
- });
103
- it("resolves coding tool preset into custom_allowed_tools", () => {
104
- const result = compileAgentHarnessTask(makeTask({ tools: ["coding"] }));
105
- const config = result.providers[0].config;
106
- const tools = config.custom_allowed_tools;
107
- assert.ok(tools.includes("Bash"));
108
- assert.ok(tools.includes("Read"));
109
- assert.ok(tools.includes("Write"));
110
- assert.ok(tools.includes("Edit"));
111
- });
112
- it("resolves read-only tool preset", () => {
113
- const result = compileAgentHarnessTask(makeTask({ tools: ["read-only"] }));
114
- const config = result.providers[0].config;
115
- const tools = config.custom_allowed_tools;
116
- assert.ok(tools.includes("Read"));
117
- assert.ok(tools.includes("Grep"));
118
- assert.ok(!tools.includes("Write"), "read-only should not include Write");
119
- });
120
- it("mixes preset and explicit tools", () => {
121
- const result = compileAgentHarnessTask(makeTask({ tools: ["read-only", "WebFetch"] }));
122
- const config = result.providers[0].config;
123
- const tools = config.custom_allowed_tools;
124
- assert.ok(tools.includes("Read"));
125
- assert.ok(tools.includes("WebFetch"));
126
- });
127
- });
128
- // ---------------------------------------------------------------------------
129
- // compileAgentHarnessTask — test cases
130
- // ---------------------------------------------------------------------------
131
- describe("compileAgentHarnessTask — test cases", () => {
132
- it("produces at least one test case", () => {
133
- const result = compileAgentHarnessTask(makeTask());
134
- assert.ok(result.tests.length > 0);
135
- });
136
- it("includes task description in vars", () => {
137
- const result = compileAgentHarnessTask(makeTask({ description: "Do the thing" }));
138
- assert.equal(result.tests[0].vars.task, "Do the thing");
139
- });
140
- it("prefers prompt.vars.task over description", () => {
141
- const result = compileAgentHarnessTask(makeTask({
142
- description: "Description",
143
- prompt: { vars: { task: "Custom prompt" } },
144
- }));
145
- assert.equal(result.tests[0].vars.task, "Custom prompt");
146
- });
147
- it("creates multi-turn test case", () => {
148
- const result = compileAgentHarnessTask(makeTask({
149
- multiTurn: {
150
- turns: [
151
- { role: "user", content: "Hello" },
152
- { role: "assistant", content: "Hi" },
153
- ],
154
- },
155
- }));
156
- assert.equal(result.tests.length, 2);
157
- assert.ok(result.tests[1].description.includes("[multi-turn]"));
158
- });
159
- it("sets sandbox metadata in vars", () => {
160
- const result = compileAgentHarnessTask(makeTask({ sandbox: { type: "docker" } }));
161
- assert.equal(result.tests[0].vars.__sandboxType, "docker");
162
- });
163
- });
164
- // ---------------------------------------------------------------------------
165
- // compileAgentHarnessTask — assertions
166
- // ---------------------------------------------------------------------------
167
- describe("compileAgentHarnessTask — assertions", () => {
168
- const RUNTIME = "file://dist/agent-harness/assertions-runtime.js";
169
- it("maps file-exists to file-based javascript assertion", () => {
170
- const result = compileAgentHarnessTask(makeTask({
171
- assertions: [{ type: "file-exists", value: "sanity.config.ts" }],
172
- }));
173
- const assertion = result.tests[0].assert?.[0];
174
- assert.ok(assertion);
175
- assert.equal(assertion.type, "javascript");
176
- assert.equal(assertion.value, `${RUNTIME}:fileExists`);
177
- assert.deepEqual(assertion.config, {
178
- filePath: "sanity.config.ts",
179
- });
180
- });
181
- it("maps file-contains to file-based javascript assertion", () => {
182
- const result = compileAgentHarnessTask(makeTask({
183
- assertions: [
184
- {
185
- type: "file-contains",
186
- value: { path: "config.ts", content: "projectId" },
187
- },
188
- ],
189
- }));
190
- const assertion = result.tests[0].assert?.[0];
191
- assert.ok(assertion);
192
- assert.equal(assertion.type, "javascript");
193
- assert.equal(assertion.value, `${RUNTIME}:fileContains`);
194
- assert.deepEqual(assertion.config, {
195
- filePath: "config.ts",
196
- content: "projectId",
197
- });
198
- });
199
- it("maps command-succeeds to file-based javascript assertion", () => {
200
- const result = compileAgentHarnessTask(makeTask({
201
- assertions: [{ type: "command-succeeds", value: "npx tsc --noEmit" }],
202
- }));
203
- const assertion = result.tests[0].assert?.[0];
204
- assert.ok(assertion);
205
- assert.equal(assertion.type, "javascript");
206
- assert.equal(assertion.value, `${RUNTIME}:commandSucceeds`);
207
- assert.deepEqual(assertion.config, {
208
- command: "npx tsc --noEmit",
209
- });
210
- });
211
- it("maps diff-matches to file-based javascript assertion", () => {
212
- const result = compileAgentHarnessTask(makeTask({
213
- assertions: [{ type: "diff-matches", value: "createClient" }],
214
- }));
215
- const assertion = result.tests[0].assert?.[0];
216
- assert.ok(assertion);
217
- assert.equal(assertion.type, "javascript");
218
- assert.equal(assertion.value, `${RUNTIME}:diffMatches`);
219
- assert.deepEqual(assertion.config, {
220
- expected: "createClient",
221
- });
222
- });
223
- it("passes through standard assertions", () => {
224
- const result = compileAgentHarnessTask(makeTask({
225
- assertions: [{ type: "contains", value: "sanity" }],
226
- }));
227
- assert.equal(result.tests[0].assert?.[0]?.type, "contains");
228
- });
229
- it("sets grader provider on llm-rubric", () => {
230
- const result = compileAgentHarnessTask(makeTask({
231
- assertions: [{ type: "llm-rubric", value: "Check quality" }],
232
- }), { graderProvider: "openai:chat:gpt-5" });
233
- assert.equal(result.tests[0].assert?.[0]?.provider, "openai:chat:gpt-5");
234
- });
235
- it("resolves templated llm-rubric with rubric text and dimension metadata", () => {
236
- const rubricConfig = {
237
- templates: {
238
- "agent-output": {
239
- dimension: "agent-output",
240
- header: "Score the agent's final output from 0 to 100:",
241
- scale: ["0: Failed", "50: Partial", "100: Complete"],
242
- criteria_label: "Check for:",
243
- },
244
- },
245
- };
246
- const result = compileAgentHarnessTask(makeTask({
247
- assertions: [
248
- {
249
- type: "llm-rubric",
250
- template: "agent-output",
251
- criteria: ["File created", "Correct content"],
252
- },
253
- ],
254
- }), { rubricConfig, graderProvider: "anthropic:messages:claude-opus-4-5" });
255
- const assertion = result.tests[0].assert?.[0];
256
- assert.ok(assertion, "should produce an assertion");
257
- assert.equal(assertion.type, "llm-rubric");
258
- // Rubric text should be fully rendered (not empty)
259
- assert.ok(assertion.value.includes("Score the agent"), "should contain rendered rubric header");
260
- assert.ok(assertion.value.includes("File created"), "should contain task-specific criteria");
261
- // Dimension metadata should be attached
262
- const metadata = assertion.metadata;
263
- assert.ok(metadata, "should have metadata");
264
- assert.equal(metadata.dimension, "agent-output");
265
- assert.equal(metadata.maxScore, 100);
266
- // Grader provider should be set
267
- assert.equal(assertion.provider, "anthropic:messages:claude-opus-4-5");
268
- });
269
- it("warns when rubric template is unknown", () => {
270
- const rubricConfig = { templates: {} };
271
- const result = compileAgentHarnessTask(makeTask({
272
- assertions: [
273
- {
274
- type: "llm-rubric",
275
- template: "nonexistent-template",
276
- criteria: ["Something"],
277
- },
278
- ],
279
- }), { rubricConfig });
280
- // Unknown template produces a warning and no assertion
281
- assert.ok(result.warnings.some((w) => w.includes("nonexistent-template")), "should warn about unknown template");
282
- // The assertion should be null (filtered out)
283
- assert.equal(result.tests[0].assert?.length ?? 0, 0, "should not produce an assertion for unknown template");
284
- });
285
- it("warns when rubricConfig is not provided for templated assertion", () => {
286
- const result = compileAgentHarnessTask(makeTask({
287
- assertions: [
288
- {
289
- type: "llm-rubric",
290
- template: "agent-output",
291
- criteria: ["Something"],
292
- },
293
- ],
294
- })
295
- // No rubricConfig in options
296
- );
297
- assert.ok(result.warnings.some((w) => w.includes("No rubric config")), "should warn about missing rubric config");
298
- });
299
- });
300
- // ---------------------------------------------------------------------------
301
- // compileAgentHarnessTask — lifecycle extensions
302
- // ---------------------------------------------------------------------------
303
- describe("compileAgentHarnessTask — lifecycle", () => {
304
- it("produces beforeEach and afterEach extensions", () => {
305
- const result = compileAgentHarnessTask(makeTask());
306
- assert.equal(result.extensions.length, 2);
307
- assert.equal(result.extensions[0].type, "beforeEach");
308
- assert.equal(result.extensions[1].type, "afterEach");
309
- });
310
- it("beforeEach hook creates working directory", () => {
311
- const result = compileAgentHarnessTask(makeTask());
312
- assert.ok(result.extensions[0].code.includes("mkdirSync"));
313
- assert.ok(result.extensions[0].code.includes("__workingDir"));
314
- });
315
- it("afterEach hook cleans up", () => {
316
- const result = compileAgentHarnessTask(makeTask());
317
- assert.ok(result.extensions[1].code.includes("rmSync"));
318
- });
319
- it("sandbox config captures task settings", () => {
320
- const result = compileAgentHarnessTask(makeTask({
321
- sandbox: {
322
- type: "docker",
323
- image: "node:22",
324
- limits: { cpus: 2, networkAccess: false },
325
- },
326
- fixtures: ["file://schema.ts"],
327
- }));
328
- assert.equal(result.sandboxConfig.type, "docker");
329
- assert.equal(result.sandboxConfig.image, "node:22");
330
- assert.deepEqual(result.sandboxConfig.fixtures, ["schema.ts"]);
331
- assert.equal(result.sandboxConfig.limits?.cpus, 2);
332
- assert.equal(result.sandboxConfig.limits?.networkAccess, false);
333
- });
334
- });
335
- // ---------------------------------------------------------------------------
336
- // Example task compilation (end-to-end)
337
- // ---------------------------------------------------------------------------
338
- describe("example agent harness tasks — end-to-end", () => {
339
- it("compiles all example tasks without errors", () => {
340
- for (const task of allAgentHarnessExampleTasks) {
341
- const result = compileAgentHarnessTask(task);
342
- assert.ok(result.providers.length > 0, `${task.id}: should produce providers`);
343
- assert.ok(result.tests.length > 0, `${task.id}: should produce test cases`);
344
- assert.ok(result.extensions.length > 0, `${task.id}: should produce lifecycle extensions`);
345
- }
346
- });
347
- it("scaffold task has file-exists assertions", () => {
348
- const result = compileAgentHarnessTask(scaffoldProjectTask);
349
- assert.ok(result.tests[0].assert);
350
- assert.ok(result.tests[0].assert.length >= 3);
351
- // First two are file-exists (javascript), third is file-contains, fourth is command-succeeds
352
- assert.equal(result.tests[0].assert[0].type, "javascript");
353
- });
354
- it("modify task has file-contains assertions", () => {
355
- const result = compileAgentHarnessTask(modifyCodeTask);
356
- assert.ok(result.tests[0].assert);
357
- assert.ok(result.tests[0].assert.some((a) => a.type === "javascript" &&
358
- a.value.includes("fileContains") &&
359
- a.config != null));
360
- });
361
- it("refactor task has docker sandbox config", () => {
362
- const result = compileAgentHarnessTask(multiFileRefactorTask);
363
- assert.equal(result.sandboxConfig.type, "docker");
364
- assert.equal(result.sandboxConfig.image, "node:22-slim");
365
- });
366
- });
@@ -1,9 +0,0 @@
1
- /**
2
- * assertion-mapper.test.ts — Unit tests for the assertion type mapper.
3
- *
4
- * Tests mapping of AILF assertion types to Promptfoo assertion types,
5
- * mode compatibility checking, negation support, and templated assertions.
6
- *
7
- * Run: npx tsx --test src/pipeline/compiler/__tests__/assertion-mapper.test.ts
8
- */
9
- export {};