@sanity/ailf 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/README.md +0 -1
  2. package/config/models.ts +15 -3
  3. package/dist/_vendor/ailf-core/config-helpers.d.ts +14 -17
  4. package/dist/_vendor/ailf-core/config-helpers.js +22 -2
  5. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  6. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +2 -2
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
  10. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  11. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  12. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  13. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +2 -0
  14. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  15. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  16. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +13 -0
  17. package/dist/_vendor/ailf-core/types/index.d.ts +1 -3
  18. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +78 -23
  19. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  20. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  21. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  22. package/dist/adapters/task-sources/content-lake-task-source.js +17 -20
  23. package/dist/adapters/task-sources/index.d.ts +2 -2
  24. package/dist/adapters/task-sources/index.js +2 -2
  25. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  26. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  27. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  28. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  29. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  30. package/dist/adapters/task-sources/repo-validation.js +126 -5
  31. package/dist/adapters/task-sources/task-file-loader.d.ts +2 -2
  32. package/dist/adapters/task-sources/task-file-loader.js +2 -2
  33. package/dist/commands/coverage-audit.js +3 -1
  34. package/dist/commands/init.d.ts +6 -4
  35. package/dist/commands/init.js +302 -23
  36. package/dist/commands/validate-tasks.d.ts +2 -2
  37. package/dist/commands/validate-tasks.js +26 -15
  38. package/dist/composition-root.d.ts +13 -1
  39. package/dist/composition-root.js +73 -41
  40. package/dist/index.d.ts +41 -0
  41. package/dist/index.js +48 -0
  42. package/dist/orchestration/build-step-sequence.js +4 -2
  43. package/dist/orchestration/steps/fetch-docs-step.js +2 -3
  44. package/dist/orchestration/steps/generate-configs-step.js +28 -12
  45. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  46. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  47. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  48. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +105 -68
  49. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  50. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  51. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  52. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  53. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  54. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  55. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  56. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  57. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  58. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  59. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  60. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  61. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  62. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  63. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  64. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  65. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  68. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  71. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  72. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  73. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  74. package/dist/pipeline/compiler/mode-handlers/{agent-harness-handler.d.ts → agent-harness/types.d.ts} +3 -24
  75. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  76. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  77. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  78. package/dist/pipeline/compiler/mode-handlers/index.d.ts +4 -5
  79. package/dist/pipeline/compiler/mode-handlers/index.js +4 -6
  80. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  81. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  82. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  83. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  86. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  87. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  88. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  89. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  90. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  91. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  92. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  93. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  94. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  95. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  96. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  97. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  98. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  99. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  100. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  101. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  102. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  103. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  104. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts} +2 -10
  105. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.js → mcp-server/assertions.js} +63 -6
  106. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  107. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  108. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  109. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  110. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  111. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  112. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  113. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  114. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  115. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  116. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  117. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  118. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  119. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  120. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  121. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  122. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  123. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  124. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  125. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  126. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  127. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  128. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  129. package/dist/pipeline/compiler/preset-loader.js +99 -0
  130. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +6 -9
  131. package/dist/pipeline/compiler/presets/sanity-literacy.js +10 -156
  132. package/dist/pipeline/expand-tasks.d.ts +2 -2
  133. package/dist/pipeline/expand-tasks.js +2 -2
  134. package/dist/pipeline/generate-configs.js +1 -1
  135. package/dist/pipeline/map-request-to-config.js +1 -0
  136. package/dist/pipeline/mirror-repo-tasks.d.ts +7 -7
  137. package/dist/pipeline/mirror-repo-tasks.js +9 -9
  138. package/dist/pipeline/plan.js +1 -1
  139. package/package.json +11 -3
  140. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  141. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  142. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  143. package/dist/_vendor/ailf-tasks/index.js +0 -16
  144. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  145. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  146. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  147. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  148. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  149. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  150. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  151. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  152. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  153. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  154. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  155. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -67
  156. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -309
@@ -10,8 +10,7 @@
10
10
  import assert from "node:assert/strict";
11
11
  import { describe, it } from "node:test";
12
12
  import { LiteracyVariant } from "../../normalize-mode.js";
13
- import { compileMCPTask, handler as mcpHandler, MCP_PROMPT_TEMPLATES, validateMCPTask, } from "../mode-handlers/mcp-server-handler.js";
14
- import { buildMCPAssertions } from "../mode-handlers/mcp-assertions.js";
13
+ import { buildMCPAssertions, compileMCPTask, handler as mcpHandler, MCP_PROMPT_TEMPLATES, validateMCPTask, } from "../mode-handlers/mcp-server/index.js";
15
14
  import { allMCPExampleTasks, createAndPublishTask, inspectSchemaTask, queryDocumentsTask, semanticSearchTask, stdioServerTask, } from "../mode-handlers/__fixtures__/mcp-example-tasks.js";
16
15
  // ---------------------------------------------------------------------------
17
16
  // Helpers
@@ -26,6 +25,29 @@ function makeMinimalMCPTask(overrides) {
26
25
  ...overrides,
27
26
  };
28
27
  }
28
+ /** Test models for compilation — simulates models from the registry */
29
+ const TEST_MODELS = [
30
+ {
31
+ id: "anthropic:messages:claude-opus-4-6",
32
+ label: "Claude Opus 4.6",
33
+ config: { temperature: 0.2 },
34
+ },
35
+ {
36
+ id: "openai:responses:gpt-5.4",
37
+ label: "GPT 5.4",
38
+ config: { reasoning_effort: "medium" },
39
+ },
40
+ ];
41
+ /** The custom MCP provider file:// path */
42
+ const MCP_PROVIDER_PATH = "file://dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js";
43
+ /** Helper to get provider config */
44
+ function cfg(provider) {
45
+ return provider.config;
46
+ }
47
+ /** Helper to get mcpServer sub-config from provider */
48
+ function serverCfg(provider) {
49
+ return cfg(provider)?.mcpServer;
50
+ }
29
51
  // ---------------------------------------------------------------------------
30
52
  // handler.getPrompts() — prompt template ownership
31
53
  // ---------------------------------------------------------------------------
@@ -38,11 +60,9 @@ describe("MCPServerHandler.getPrompts", () => {
38
60
  it("returns templates keyed by MCP-specific IDs (not literacy names)", () => {
39
61
  const prompts = mcpHandler.getPrompts();
40
62
  const keys = Object.keys(prompts);
41
- // Must not use literacy template names
42
63
  assert.ok(!keys.includes("with-docs"), "should not use literacy key 'with-docs'");
43
64
  assert.ok(!keys.includes("without-docs"), "should not use literacy key 'without-docs'");
44
65
  assert.ok(!keys.includes(LiteracyVariant.AGENTIC), "should not use literacy key 'agentic'");
45
- // Must have MCP-appropriate key(s)
46
66
  assert.ok(keys.includes("mcp-server"), "should include 'mcp-server' template");
47
67
  });
48
68
  it("mcp-server template instructs model to use MCP tools", () => {
@@ -50,7 +70,6 @@ describe("MCPServerHandler.getPrompts", () => {
50
70
  const template = prompts["mcp-server"];
51
71
  assert.ok(template, "mcp-server template should exist");
52
72
  assert.ok(template.template.includes("{{task}}"), "should include {{task}} placeholder");
53
- // Should reference MCP tools / tool usage
54
73
  assert.ok(/tool/i.test(template.template), "template should mention tools (MCP-appropriate content)");
55
74
  });
56
75
  it("template has correct PromptTemplate shape", () => {
@@ -121,70 +140,91 @@ describe("validateMCPTask", () => {
121
140
  });
122
141
  });
123
142
  // ---------------------------------------------------------------------------
124
- // compileMCPTask
143
+ // compileMCPTask — provider assembly
125
144
  // ---------------------------------------------------------------------------
126
145
  describe("compileMCPTask", () => {
127
146
  it("produces provider, tests, and prompts", () => {
128
- const result = compileMCPTask(makeMinimalMCPTask());
147
+ const result = compileMCPTask(makeMinimalMCPTask(), { models: TEST_MODELS });
129
148
  assert.ok(result.providers.length > 0, "Should produce providers");
130
149
  assert.ok(result.tests.length > 0, "Should produce test cases");
131
150
  assert.ok(result.prompts.length > 0, "Should produce prompts");
132
151
  });
133
- it("builds Promptfoo-native MCP provider for stdio", () => {
152
+ it("emits file:// providers using the custom MCP tool provider", () => {
134
153
  const result = compileMCPTask(makeMinimalMCPTask({
135
154
  serverConfig: {
136
155
  transport: "stdio",
137
156
  command: "node dist/server.js --flag",
138
157
  },
139
- }));
140
- assert.equal(result.providers.length, 1);
141
- assert.equal(result.providers[0].id, "mcp");
142
- const config = result.providers[0].config;
143
- assert.equal(config.enabled, true);
144
- const server = config.server;
145
- assert.equal(server.command, "node");
146
- assert.deepEqual(server.args, ["dist/server.js", "--flag"]);
147
- });
148
- it("builds Promptfoo-native MCP provider for URL-based transport", () => {
158
+ }), { models: TEST_MODELS });
159
+ assert.equal(result.providers.length, 2, "One provider per model");
160
+ // All providers use the custom MCP tool provider path
161
+ assert.equal(result.providers[0].id, MCP_PROVIDER_PATH);
162
+ assert.equal(result.providers[1].id, MCP_PROVIDER_PATH);
163
+ // Model ID is passed in config
164
+ assert.equal(cfg(result.providers[0]).model, "anthropic:messages:claude-opus-4-6");
165
+ assert.equal(cfg(result.providers[1]).model, "openai:responses:gpt-5.4");
166
+ // MCP server config is in config.mcpServer
167
+ const server = serverCfg(result.providers[0]);
168
+ assert.equal(server.command, "node dist/server.js --flag");
169
+ });
170
+ it("preserves model config in provider config", () => {
149
171
  const result = compileMCPTask(makeMinimalMCPTask({
150
- serverConfig: {
151
- transport: "sse",
152
- url: "http://localhost:3000/sse",
153
- },
154
- }));
155
- assert.equal(result.providers[0].id, "mcp");
156
- const config = result.providers[0].config;
157
- const server = config.server;
172
+ serverConfig: { transport: "sse", url: "http://localhost:3000/sse" },
173
+ }), { models: TEST_MODELS });
174
+ const c = cfg(result.providers[0]);
175
+ assert.equal(c.temperature, 0.2, "Model config preserved");
176
+ assert.ok(c.mcpServer, "MCP server config present");
177
+ assert.equal(c.maxToolRounds, 5, "Default maxToolRounds");
178
+ });
179
+ it("builds MCP server config for URL-based transport", () => {
180
+ const result = compileMCPTask(makeMinimalMCPTask({
181
+ serverConfig: { transport: "sse", url: "http://localhost:3000/sse" },
182
+ }), { models: TEST_MODELS });
183
+ const server = serverCfg(result.providers[0]);
158
184
  assert.equal(server.url, "http://localhost:3000/sse");
159
185
  });
160
- it("maps auth config to Promptfoo provider", () => {
186
+ it("maps auth config to mcpServer config", () => {
161
187
  const result = compileMCPTask(makeMinimalMCPTask({
162
188
  serverConfig: {
163
189
  transport: "streamable-http",
164
190
  url: "https://mcp.example.com",
165
- auth: {
166
- type: "bearer",
167
- token: "{{env.MY_TOKEN}}",
168
- },
191
+ auth: { type: "bearer", token: "{{env.MY_TOKEN}}" },
169
192
  },
170
- }));
171
- const config = result.providers[0].config;
172
- const server = config.server;
173
- assert.deepEqual(server.auth, {
174
- type: "bearer",
175
- token: "{{env.MY_TOKEN}}",
176
- });
193
+ }), { models: TEST_MODELS });
194
+ const server = serverCfg(result.providers[0]);
195
+ assert.deepEqual(server.auth, { type: "bearer", token: "{{env.MY_TOKEN}}" });
177
196
  });
178
- it("maps capabilities to Promptfoo tools filter", () => {
197
+ it("maps capabilities to mcpTools config", () => {
179
198
  const result = compileMCPTask(makeMinimalMCPTask({
180
199
  capabilities: ["query_documents", "get_schema"],
181
200
  serverConfig: {
182
201
  transport: "streamable-http",
183
202
  url: "https://mcp.example.com",
184
203
  },
185
- }));
186
- const config = result.providers[0].config;
187
- assert.deepEqual(config.tools, ["query_documents", "get_schema"]);
204
+ }), { models: TEST_MODELS });
205
+ assert.deepEqual(cfg(result.providers[0]).mcpTools, [
206
+ "query_documents",
207
+ "get_schema",
208
+ ]);
209
+ });
210
+ it("uses task-level models override when specified", () => {
211
+ const result = compileMCPTask(makeMinimalMCPTask({
212
+ models: ["anthropic:messages:claude-sonnet-4-20250514"],
213
+ serverConfig: { transport: "sse", url: "http://localhost:3000" },
214
+ }), { models: TEST_MODELS });
215
+ assert.equal(result.providers.length, 1);
216
+ assert.equal(cfg(result.providers[0]).model, "anthropic:messages:claude-sonnet-4-20250514");
217
+ });
218
+ it("respects task-level maxToolRounds", () => {
219
+ const result = compileMCPTask(makeMinimalMCPTask({ maxToolRounds: 10 }), {
220
+ models: TEST_MODELS,
221
+ });
222
+ assert.equal(cfg(result.providers[0]).maxToolRounds, 10);
223
+ });
224
+ it("falls back to default model when no models provided", () => {
225
+ const result = compileMCPTask(makeMinimalMCPTask());
226
+ assert.ok(result.providers.length > 0, "Should have a fallback provider");
227
+ assert.ok(result.warnings.some((w) => w.includes("no models")));
188
228
  });
189
229
  it("uses task description as prompt text", () => {
190
230
  const result = compileMCPTask(makeMinimalMCPTask({
@@ -217,7 +257,6 @@ describe("compileMCPTask", () => {
217
257
  ],
218
258
  },
219
259
  }));
220
- // Primary + multi-turn test cases
221
260
  assert.equal(result.tests.length, 2);
222
261
  assert.ok(result.tests[1].description.includes("[multi-turn]"));
223
262
  });
@@ -293,63 +332,61 @@ describe("buildMCPAssertions", () => {
293
332
  // Example task compilation (end-to-end)
294
333
  // ---------------------------------------------------------------------------
295
334
  describe("example MCP tasks — end-to-end compilation", () => {
335
+ const opts = { models: TEST_MODELS };
296
336
  it("compiles all example tasks without errors", () => {
297
337
  for (const task of allMCPExampleTasks) {
298
- const result = compileMCPTask(task);
338
+ const result = compileMCPTask(task, opts);
299
339
  assert.ok(result.providers.length > 0, `${task.id}: should produce providers`);
300
340
  assert.ok(result.tests.length > 0, `${task.id}: should produce test cases`);
301
341
  assert.ok(result.prompts.length > 0, `${task.id}: should produce prompts`);
302
342
  }
303
343
  });
304
344
  it("query task has tool-called + contains + llm-rubric assertions", () => {
305
- const result = compileMCPTask(queryDocumentsTask);
345
+ const result = compileMCPTask(queryDocumentsTask, opts);
306
346
  const asserts = result.tests[0].assert;
307
- // tool-called (→ javascript), contains × 2, llm-rubric
308
347
  assert.equal(asserts.length, 4);
309
- assert.equal(asserts[0].type, "javascript"); // tool-called → javascript
348
+ assert.equal(asserts[0].type, "javascript");
310
349
  assert.equal(asserts[1].type, "contains");
311
350
  assert.equal(asserts[2].type, "contains");
312
351
  assert.equal(asserts[3].type, "llm-rubric");
313
352
  });
314
353
  it("schema task uses get_schema tool", () => {
315
- const result = compileMCPTask(inspectSchemaTask);
354
+ const result = compileMCPTask(inspectSchemaTask, opts);
316
355
  const asserts = result.tests[0].assert;
317
356
  assert.ok(asserts.some((a) => a.type === "javascript" && a.value.includes("get_schema")), "Should have tool-called assertion for get_schema");
318
357
  });
319
358
  it("create-publish task produces multi-turn test case", () => {
320
- const result = compileMCPTask(createAndPublishTask);
321
- // Primary + multi-turn
359
+ const result = compileMCPTask(createAndPublishTask, opts);
322
360
  assert.equal(result.tests.length, 2);
323
361
  assert.ok(result.tests[1].description?.includes("[multi-turn]"));
324
362
  });
325
- it("stdio task has Promptfoo-native MCP provider with command", () => {
326
- const result = compileMCPTask(stdioServerTask);
327
- assert.equal(result.providers[0].id, "mcp");
328
- const config = result.providers[0].config;
329
- assert.equal(config.enabled, true);
330
- const server = config.server;
331
- assert.equal(server.command, "node");
332
- assert.deepEqual(server.args, ["dist/sanity-mcp-server.js"]);
363
+ it("stdio task uses custom provider with command config", () => {
364
+ const result = compileMCPTask(stdioServerTask, opts);
365
+ assert.equal(result.providers[0].id, MCP_PROVIDER_PATH);
366
+ assert.equal(cfg(result.providers[0]).model, "anthropic:messages:claude-opus-4-6");
367
+ const server = serverCfg(result.providers[0]);
368
+ assert.equal(server.command, "node dist/sanity-mcp-server.js");
333
369
  });
334
370
  it("semantic search task has two tool-called + one llm-rubric assertion", () => {
335
- const result = compileMCPTask(semanticSearchTask);
371
+ const result = compileMCPTask(semanticSearchTask, opts);
336
372
  const asserts = result.tests[0].assert;
337
- // tool-called × 2 (→ javascript) + llm-rubric
338
373
  assert.equal(asserts.length, 3);
339
- assert.equal(asserts[0].type, "javascript"); // tool-called → javascript
340
- assert.ok(asserts[0].value.includes("list_embeddings_indices"), "Should have tool-called assertion for list_embeddings_indices");
341
- assert.equal(asserts[1].type, "javascript"); // tool-called → javascript
342
- assert.ok(asserts[1].value.includes("semantic_search"), "Should have tool-called assertion for semantic_search");
374
+ assert.equal(asserts[0].type, "javascript");
375
+ assert.ok(asserts[0].value.includes("list_embeddings_indices"));
376
+ assert.equal(asserts[1].type, "javascript");
377
+ assert.ok(asserts[1].value.includes("semantic_search"));
343
378
  assert.equal(asserts[2].type, "llm-rubric");
344
379
  });
345
380
  it("remote task has bearer auth and tools filter", () => {
346
- const result = compileMCPTask(queryDocumentsTask);
347
- const config = result.providers[0].config;
348
- const server = config.server;
381
+ const result = compileMCPTask(queryDocumentsTask, opts);
382
+ const server = serverCfg(result.providers[0]);
349
383
  assert.deepEqual(server.auth, {
350
384
  type: "bearer",
351
385
  token: "{{env.SANITY_MCP_AUTH_TOKEN}}",
352
386
  });
353
- assert.deepEqual(config.tools, ["query_documents", "get_schema"]);
387
+ assert.deepEqual(cfg(result.providers[0]).mcpTools, [
388
+ "query_documents",
389
+ "get_schema",
390
+ ]);
354
391
  });
355
392
  });
@@ -262,12 +262,14 @@ describe("InMemoryPluginRegistry", () => {
262
262
  });
263
263
  assert.equal(registry.getAssertions().length, 1);
264
264
  });
265
- it("registers a complete preset", () => {
265
+ it("registers a complete preset with mode base", () => {
266
266
  const registry = new InMemoryPluginRegistry();
267
+ // Must register mode base first
268
+ const { createLiteracyModeBase } = require("../mode-bases/literacy.js");
269
+ registry.registerModeBase(createLiteracyModeBase());
267
270
  registry.registerPreset(sanityLiteracyPreset);
268
- // Preset should register its modes, assertions, rubric templates
271
+ // Mode + rubrics from mode base, domain config from preset
269
272
  assert.ok(registry.getMode("literacy"));
270
- assert.ok(registry.getAssertions().length > 0);
271
273
  assert.ok(registry.getRubricTemplates().length > 0);
272
274
  assert.ok(registry.getPresets().length === 1);
273
275
  });
@@ -280,83 +282,21 @@ describe("sanityLiteracyPreset", () => {
280
282
  assert.equal(sanityLiteracyPreset.name, "sanity-literacy");
281
283
  assert.equal(sanityLiteracyPreset.manifest.pluginApiVersion, 1);
282
284
  });
283
- it("registers literacy mode", () => {
284
- assert.equal(sanityLiteracyPreset.modes?.length, 1);
285
- assert.equal(sanityLiteracyPreset.modes[0].id, "literacy");
286
- });
287
- it("includes core assertion types", () => {
288
- const types = sanityLiteracyPreset.assertions.map((a) => a.type);
289
- assert.ok(types.includes("contains"));
290
- assert.ok(types.includes("llm-rubric"));
291
- assert.ok(types.includes("javascript"));
292
- });
293
- it("includes 3 rubric templates", () => {
294
- assert.equal(sanityLiteracyPreset.rubricTemplates?.length, 3);
295
- const ids = sanityLiteracyPreset.rubricTemplates.map((t) => t.id);
296
- assert.ok(ids.includes("task-completion"));
297
- assert.ok(ids.includes("code-correctness"));
298
- assert.ok(ids.includes("doc-coverage"));
299
- });
300
- it("rubric template scales match config/rubrics.ts authoritative source", () => {
301
- const templates = sanityLiteracyPreset.rubricTemplates;
302
- const tc = templates.find((t) => t.id === "task-completion");
303
- assert.deepEqual(tc.scale, [
304
- "0: Couldn't attempt — missing critical information",
305
- "20: Attempted but fundamentally wrong approach",
306
- "50: Partial implementation — major functional gaps",
307
- "80: Mostly complete — minor issues or missing edge cases",
308
- "100: Fully functional code — works as expected",
309
- ]);
310
- assert.equal(tc.criteriaLabel, "Must demonstrate:");
311
- const cc = templates.find((t) => t.id === "code-correctness");
312
- assert.deepEqual(cc.scale, [
313
- "0: Broken code, syntax errors, or deprecated APIs",
314
- "30: Works but uses anti-patterns or inefficient approaches",
315
- "50: Works but not idiomatic",
316
- "80: Follows most best practices",
317
- "100: Follows all best practices, idiomatic implementation",
318
- ]);
319
- assert.equal(cc.criteriaLabel, "Check for:");
320
- const dc = templates.find((t) => t.id === "doc-coverage");
321
- assert.deepEqual(dc.scale, [
322
- "0: Had to hallucinate/guess most implementation details",
323
- "30: Significant gaps — filled with assumptions",
324
- "50: Some gaps — inferred from partial information",
325
- "80: Minor gaps — almost everything was documented",
326
- "100: Complete coverage — all necessary info was in docs",
327
- ]);
285
+ it("targets literacy mode base", () => {
286
+ assert.equal(sanityLiteracyPreset.mode, "literacy");
287
+ });
288
+ it("does not bundle assertions (now framework built-ins)", () => {
289
+ assert.equal(sanityLiteracyPreset.assertions, undefined);
290
+ });
291
+ it("does not bundle rubrics/scoring/prompts (now in literacy mode base)", () => {
292
+ // Evaluation methodology moved to mode-bases/literacy.ts
293
+ assert.equal(sanityLiteracyPreset.rubricTemplates, undefined);
294
+ assert.equal(sanityLiteracyPreset.scoringProfiles, undefined);
295
+ assert.equal(sanityLiteracyPreset.promptTemplates, undefined);
328
296
  });
329
297
  it("includes sanity:// fixture resolver", () => {
330
298
  assert.ok(sanityLiteracyPreset.fixtureResolvers?.some((r) => r.scheme === "sanity://"));
331
299
  });
332
- it("includes 3 prompt templates", () => {
333
- const templates = sanityLiteracyPreset.promptTemplates;
334
- assert.ok(templates);
335
- assert.ok(templates["with-docs"]);
336
- assert.ok(templates["without-docs"]);
337
- assert.ok(templates["agentic"]);
338
- assert.equal(Object.keys(templates).length, 3);
339
- });
340
- it("prompt template content matches literacy handler", () => {
341
- const templates = sanityLiteracyPreset.promptTemplates;
342
- assert.ok(templates["with-docs"].template.includes("{{docs}}"));
343
- assert.ok(templates["with-docs"].template.includes("{{task}}"));
344
- assert.ok(templates["without-docs"].template.includes("{{task}}"));
345
- assert.ok(templates["agentic"].template.includes("{{task}}"));
346
- });
347
- it("includes default and output-only scoring profiles", () => {
348
- const profiles = sanityLiteracyPreset.scoringProfiles;
349
- assert.ok(profiles);
350
- assert.deepEqual(profiles["default"], {
351
- "task-completion": 0.5,
352
- "code-correctness": 0.25,
353
- "doc-coverage": 0.25,
354
- });
355
- assert.deepEqual(profiles["output-only"], {
356
- "task-completion": 0.6,
357
- "code-correctness": 0.4,
358
- });
359
- });
360
300
  it("includes 3 source definitions", () => {
361
301
  const sources = sanityLiteracyPreset.sourceDefs;
362
302
  assert.ok(sources);
@@ -376,26 +316,13 @@ describe("sanityLiteracyPreset", () => {
376
316
  assert.ok(features);
377
317
  assert.equal(features.features.length, 14);
378
318
  const ids = features.features.map((f) => f.id);
379
- // Covered features
380
319
  assert.ok(ids.includes("groq"));
381
320
  assert.ok(ids.includes("visual-editing"));
382
- assert.ok(ids.includes("nextjs-live"));
383
- assert.ok(ids.includes("functions"));
384
- assert.ok(ids.includes("studio-setup"));
385
- assert.ok(ids.includes("frameworks"));
386
- // Uncovered features
387
321
  assert.ok(ids.includes("portable-text"));
388
- assert.ok(ids.includes("image-assets"));
389
- assert.ok(ids.includes("mutations"));
390
- assert.ok(ids.includes("schemas"));
391
- assert.ok(ids.includes("authentication"));
392
- assert.ok(ids.includes("webhooks"));
393
- assert.ok(ids.includes("realtime"));
394
322
  assert.ok(ids.includes("ai-assist"));
395
323
  });
396
324
  it("includes a docFetcher factory", () => {
397
325
  assert.equal(typeof sanityLiteracyPreset.docFetcher, "function");
398
- // The factory should return a SanityDocFetcher instance
399
326
  const fetcher = sanityLiteracyPreset.docFetcher();
400
327
  assert.ok(fetcher);
401
328
  assert.equal(typeof fetcher.fetch, "function");
@@ -405,28 +332,34 @@ describe("sanityLiteracyPreset", () => {
405
332
  // createSanityLiteracyPreset factory
406
333
  // ---------------------------------------------------------------------------
407
334
  describe("createSanityLiteracyPreset", () => {
408
- it("returns a preset with all extension points populated", () => {
335
+ it("returns a domain-only preset targeting literacy mode", () => {
409
336
  const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
410
337
  assert.equal(preset.name, "sanity-literacy");
411
- assert.ok(preset.modes);
412
- assert.ok(preset.assertions);
413
- assert.ok(preset.rubricTemplates);
338
+ assert.equal(preset.mode, "literacy");
339
+ // Domain config present
414
340
  assert.ok(preset.fixtureResolvers);
415
- assert.ok(preset.promptTemplates);
416
- assert.ok(preset.scoringProfiles);
417
341
  assert.ok(preset.docFetcher);
418
342
  assert.ok(preset.sourceDefs);
419
343
  assert.ok(preset.featureDefs);
344
+ // Methodology inherited from mode base, not on preset
345
+ assert.equal(preset.rubricTemplates, undefined);
346
+ assert.equal(preset.scoringProfiles, undefined);
347
+ assert.equal(preset.promptTemplates, undefined);
420
348
  });
421
- it("registers all extension points into the registry", () => {
349
+ it("registers all extension points via mode base + domain config", () => {
422
350
  const registry = new InMemoryPluginRegistry();
351
+ // Must register mode base first (composition root does this)
352
+ const { createLiteracyModeBase } = require("../mode-bases/literacy.js");
353
+ registry.registerModeBase(createLiteracyModeBase());
423
354
  const preset = createSanityLiteracyPreset({ rootDir: "/tmp/test" });
424
355
  registry.registerPreset(preset);
356
+ // Mode from mode base
425
357
  assert.ok(registry.getMode("literacy"));
426
- assert.ok(registry.getAssertions().length > 0);
427
- assert.ok(registry.getRubricTemplates().length === 3);
428
- assert.ok(Object.keys(registry.getPromptTemplates()).length === 3);
429
- assert.ok(Object.keys(registry.getScoringProfiles()).length === 2);
358
+ // Rubrics, scoring, prompts inherited from mode base
359
+ assert.equal(registry.getRubricTemplates().length, 3);
360
+ assert.equal(Object.keys(registry.getPromptTemplates()).length, 3);
361
+ assert.equal(Object.keys(registry.getScoringProfiles()).length, 2);
362
+ // Domain config from preset
430
363
  assert.ok(registry.getDocFetcherFactory());
431
364
  assert.equal(registry.getSourceDefs().length, 3);
432
365
  assert.ok(registry.getFeatureDefs());
@@ -19,7 +19,7 @@
19
19
  * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
20
20
  */
21
21
  import type { LiteracyTaskDefinition } from "../../_vendor/ailf-core/index.d.ts";
22
- import { type LiteracyCompileResult } from "./mode-handlers/literacy-handler.js";
22
+ import { type LiteracyCompileResult } from "./mode-handlers/literacy/index.js";
23
23
  import { type LiteracyEvalSubMode } from "../normalize-mode.js";
24
24
  /** Options for compiling all literacy tasks via the new compiler */
25
25
  export interface LiteracyBridgeOptions {
@@ -18,7 +18,7 @@
18
18
  *
19
19
  * @see docs/exec-plans/architecture-overhaul/phase-7-migrate-literacy.md
20
20
  */
21
- import { compileLiteracyTask, } from "./mode-handlers/literacy-handler.js";
21
+ import { compileLiteracyTask, } from "./mode-handlers/literacy/index.js";
22
22
  import { tryLoadConfigFile } from "./config-loader.js";
23
23
  import { buildTaskGraph } from "./task-graph-builder.js";
24
24
  // ---------------------------------------------------------------------------
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Agent Harness mode base — evaluation methodology for autonomous agent testing.
3
+ *
4
+ * Tests whether an autonomous agent can complete implementation tasks
5
+ * end-to-end, including tool use, file creation, and code generation.
6
+ *
7
+ * @see docs/MODES.md
8
+ */
9
+ import type { ModeBase } from "../../../_vendor/ailf-core/index.d.ts";
10
+ export declare function createAgentHarnessBase(): ModeBase;
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Agent Harness mode base — evaluation methodology for autonomous agent testing.
3
+ *
4
+ * Tests whether an autonomous agent can complete implementation tasks
5
+ * end-to-end, including tool use, file creation, and code generation.
6
+ *
7
+ * @see docs/MODES.md
8
+ */
9
+ export function createAgentHarnessBase() {
10
+ return {
11
+ mode: {
12
+ id: "agent-harness",
13
+ label: "Agent Harness",
14
+ validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
15
+ rubricTemplateIds: [],
16
+ handlerModule: "./mode-handlers/agent-harness/index.js",
17
+ },
18
+ // Agent harness rubric templates and scoring profiles will be defined
19
+ // as the mode matures. The structural registration is in place.
20
+ };
21
+ }
@@ -0,0 +1,4 @@
1
+ export { createAgentHarnessBase } from "./agent-harness.js";
2
+ export { createKnowledgeProbeBase } from "./knowledge-probe.js";
3
+ export { createLiteracyModeBase } from "./literacy.js";
4
+ export { createMcpServerModeBase } from "./mcp-server.js";
@@ -0,0 +1,4 @@
1
+ export { createAgentHarnessBase } from "./agent-harness.js";
2
+ export { createKnowledgeProbeBase } from "./knowledge-probe.js";
3
+ export { createLiteracyModeBase } from "./literacy.js";
4
+ export { createMcpServerModeBase } from "./mcp-server.js";
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Knowledge Probe mode base — evaluation methodology for testing model knowledge.
3
+ *
4
+ * Tests what the model knows about a topic without providing documentation,
5
+ * establishing a baseline of model knowledge.
6
+ *
7
+ * @see docs/MODES.md
8
+ */
9
+ import type { ModeBase } from "../../../_vendor/ailf-core/index.d.ts";
10
+ export declare function createKnowledgeProbeBase(): ModeBase;
@@ -0,0 +1,22 @@
1
+ /**
2
+ * Knowledge Probe mode base — evaluation methodology for testing model knowledge.
3
+ *
4
+ * Tests what the model knows about a topic without providing documentation,
5
+ * establishing a baseline of model knowledge.
6
+ *
7
+ * @see docs/MODES.md
8
+ */
9
+ export function createKnowledgeProbeBase() {
10
+ return {
11
+ mode: {
12
+ id: "knowledge-probe",
13
+ label: "Knowledge Probe",
14
+ validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
15
+ rubricTemplateIds: [],
16
+ handlerModule: "./mode-handlers/knowledge-probe/index.js",
17
+ },
18
+ // Knowledge probe uses the same rubric dimensions as literacy
19
+ // but without doc-coverage (since no docs are provided).
20
+ // Rubric templates will be inherited or defined as the mode matures.
21
+ };
22
+ }
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Literacy mode base — shared evaluation methodology for documentation literacy.
3
+ *
4
+ * Defines HOW literacy evaluations are scored (rubrics, weights, prompts),
5
+ * independently of WHAT documentation is being evaluated. Domain presets
6
+ * like `sanity-literacy` target this mode base and add their own sources,
7
+ * features, and doc fetcher.
8
+ *
9
+ * @see docs/MODES.md
10
+ */
11
+ import type { ModeBase } from "../../../_vendor/ailf-core/index.d.ts";
12
+ export declare function createLiteracyModeBase(): ModeBase;