@sanity/ailf 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/README.md +0 -1
  2. package/config/models.ts +15 -3
  3. package/dist/_vendor/ailf-core/config-helpers.d.ts +14 -17
  4. package/dist/_vendor/ailf-core/config-helpers.js +22 -2
  5. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  6. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +2 -2
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
  10. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  11. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  12. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  13. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +2 -0
  14. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  15. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  16. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +13 -0
  17. package/dist/_vendor/ailf-core/types/index.d.ts +1 -3
  18. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +78 -23
  19. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  20. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  21. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  22. package/dist/adapters/task-sources/content-lake-task-source.js +17 -20
  23. package/dist/adapters/task-sources/index.d.ts +2 -2
  24. package/dist/adapters/task-sources/index.js +2 -2
  25. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  26. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  27. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  28. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  29. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  30. package/dist/adapters/task-sources/repo-validation.js +126 -5
  31. package/dist/adapters/task-sources/task-file-loader.d.ts +2 -2
  32. package/dist/adapters/task-sources/task-file-loader.js +2 -2
  33. package/dist/commands/coverage-audit.js +3 -1
  34. package/dist/commands/init.d.ts +6 -4
  35. package/dist/commands/init.js +302 -23
  36. package/dist/commands/validate-tasks.d.ts +2 -2
  37. package/dist/commands/validate-tasks.js +26 -15
  38. package/dist/composition-root.d.ts +13 -1
  39. package/dist/composition-root.js +73 -41
  40. package/dist/index.d.ts +41 -0
  41. package/dist/index.js +48 -0
  42. package/dist/orchestration/build-step-sequence.js +4 -2
  43. package/dist/orchestration/steps/fetch-docs-step.js +2 -3
  44. package/dist/orchestration/steps/generate-configs-step.js +28 -12
  45. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  46. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  47. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  48. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +105 -68
  49. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  50. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  51. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  52. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  53. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  54. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  55. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  56. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  57. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  58. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  59. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  60. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  61. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  62. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  63. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  64. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  65. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  68. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  71. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  72. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  73. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  74. package/dist/pipeline/compiler/mode-handlers/{agent-harness-handler.d.ts → agent-harness/types.d.ts} +3 -24
  75. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  76. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  77. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  78. package/dist/pipeline/compiler/mode-handlers/index.d.ts +4 -5
  79. package/dist/pipeline/compiler/mode-handlers/index.js +4 -6
  80. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  81. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  82. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  83. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  86. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  87. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  88. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  89. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  90. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  91. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  92. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  93. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  94. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  95. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  96. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  97. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  98. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  99. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  100. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  101. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  102. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  103. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  104. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts} +2 -10
  105. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.js → mcp-server/assertions.js} +63 -6
  106. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  107. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  108. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  109. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  110. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  111. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  112. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  113. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  114. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  115. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  116. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  117. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  118. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  119. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  120. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  121. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  122. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  123. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  124. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  125. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  126. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  127. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  128. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  129. package/dist/pipeline/compiler/preset-loader.js +99 -0
  130. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +6 -9
  131. package/dist/pipeline/compiler/presets/sanity-literacy.js +10 -156
  132. package/dist/pipeline/expand-tasks.d.ts +2 -2
  133. package/dist/pipeline/expand-tasks.js +2 -2
  134. package/dist/pipeline/generate-configs.js +1 -1
  135. package/dist/pipeline/map-request-to-config.js +1 -0
  136. package/dist/pipeline/mirror-repo-tasks.d.ts +7 -7
  137. package/dist/pipeline/mirror-repo-tasks.js +9 -9
  138. package/dist/pipeline/plan.js +1 -1
  139. package/package.json +11 -3
  140. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  141. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  142. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  143. package/dist/_vendor/ailf-tasks/index.js +0 -16
  144. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  145. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  146. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  147. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  148. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  149. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  150. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  151. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  152. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  153. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  154. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  155. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -67
  156. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -309
@@ -0,0 +1,82 @@
1
+ /**
2
+ * Sandbox configuration and lifecycle extensions for agent harness tasks.
3
+ *
4
+ * Builds Promptfoo beforeEach/afterEach hooks for provisioning and
5
+ * tearing down sandbox working directories.
6
+ */
7
+ // ---------------------------------------------------------------------------
8
+ // Sandbox configuration
9
+ // ---------------------------------------------------------------------------
10
/**
 * Derive the sandbox configuration for an agent harness task.
 *
 * The sandbox type falls back to "tempdir" and the fixture list to an empty
 * array when the task does not specify them. Resource limits are copied
 * field-by-field (cpus, memoryBytes, networkAccess) only when the task
 * declares a limits object; otherwise `limits` is undefined.
 *
 * @param task - Agent harness task definition.
 * @returns An object with `type`, `image`, `fixtures` and `limits`.
 */
export function buildSandboxConfig(task) {
    const sandbox = task.sandbox;
    const declaredLimits = sandbox?.limits;
    let limits;
    if (declaredLimits) {
        // Copy only the known limit fields so nothing else leaks through.
        const { cpus, memoryBytes, networkAccess } = declaredLimits;
        limits = { cpus, memoryBytes, networkAccess };
    }
    return {
        type: sandbox?.type ?? "tempdir",
        image: sandbox?.image,
        fixtures: task.fixtures ?? [],
        limits,
    };
}
24
+ // ---------------------------------------------------------------------------
25
+ // Lifecycle extensions
26
+ // ---------------------------------------------------------------------------
27
/**
 * Build the lifecycle extension list for an agent harness task.
 *
 * Always returns two entries, in this order:
 *  1. a "beforeEach" hook that provisions the sandbox working directory, and
 *  2. an "afterEach" hook that collects artifacts and tears the sandbox down.
 *
 * @param task - Agent harness task definition; only `task.id` is read here.
 * @param sandboxConfig - Sandbox configuration forwarded to the beforeEach hook builder.
 * @returns Array of `{ type, code }` extension descriptors.
 */
export function buildLifecycleExtensions(task, sandboxConfig) {
    return [
        { type: "beforeEach", code: buildBeforeEachHook(task.id, sandboxConfig) },
        { type: "afterEach", code: buildAfterEachHook(task.id) },
    ];
}
41
/**
 * Generate the source text of the beforeEach hook that provisions a sandbox
 * working directory under the OS temp directory.
 *
 * The generated function creates `<tmpdir>/ailf-<taskId>-<8 random chars>`
 * and records it on `vars.__workingDir` / `vars.__sandboxId`.
 *
 * The task ID is embedded in generated source (a line comment, a
 * single-quoted string literal and a directory name), so every character
 * outside `[A-Za-z0-9._-]` is replaced with `_`. This keeps quotes, line
 * terminators and path separators from breaking out of those contexts.
 * IDs made only of allowed characters produce the same output as before.
 *
 * @param taskId - Task identifier used to label the sandbox.
 * @param config - Sandbox configuration; only `config.fixtures` is read.
 * @returns JavaScript source for the hook, as a string.
 */
export function buildBeforeEachHook(taskId, config) {
    const safeId = String(taskId).replace(/[^A-Za-z0-9._-]/g, "_");
    // JSON.stringify leaves U+2028/U+2029 unescaped, and both terminate a
    // `//` comment in JavaScript, so escape them explicitly.
    const fixtureList = String(JSON.stringify(config.fixtures)).replace(/[\u2028\u2029]/g, (ch) => "\\u" + ch.charCodeAt(0).toString(16));
    const lines = [
        `// beforeEach: provision sandbox for ${safeId}`,
        "async function({ vars }) {",
        " const { mkdirSync, writeFileSync } = require('fs');",
        " const { tmpdir } = require('os');",
        " const { resolve } = require('path');",
        ` const id = 'ailf-${safeId}-' + require('crypto').randomUUID().slice(0, 8);`,
        " const workDir = resolve(tmpdir(), id);",
        " mkdirSync(workDir, { recursive: true });",
        " vars.__workingDir = workDir;",
        " vars.__sandboxId = id;",
        ` // Fixture list: ${fixtureList}`,
        "}",
    ];
    return lines.join("\n");
}
55
/**
 * Generate the source text of the afterEach hook that collects artifacts
 * from the sandbox working directory and then removes it.
 *
 * The generated function reads `vars.__workingDir`, records every file
 * found beneath it on `vars.__artifacts.modifiedFiles` (relative paths
 * joined with '/'), and finally deletes the directory.
 *
 * Two safety measures apply:
 *  - The task ID is only used in a line comment; characters outside
 *    `[A-Za-z0-9._-]` are replaced with `_` so a line terminator in the ID
 *    cannot end the comment and inject code.
 *  - Deletion is allowed only when the working directory is strictly inside
 *    os.tmpdir(). The check uses path.relative rather than a string prefix
 *    test, so a sibling such as "/tmpfoo" or the temp root itself is never
 *    removed.
 *
 * @param taskId - Task identifier, used only to label the generated hook.
 * @returns JavaScript source for the hook, as a string.
 */
export function buildAfterEachHook(taskId) {
    const safeId = String(taskId).replace(/[^A-Za-z0-9._-]/g, "_");
    const lines = [
        `// afterEach: collect artifacts + teardown for ${safeId}`,
        "async function({ vars }) {",
        " const { rmSync, readdirSync, existsSync } = require('fs');",
        " const workDir = vars.__workingDir;",
        " if (workDir && existsSync(workDir)) {",
        " try {",
        " // Collect modified files list",
        " const files = [];",
        " function collect(dir, prefix) {",
        " for (const e of readdirSync(dir, { withFileTypes: true })) {",
        " const rel = prefix ? prefix + '/' + e.name : e.name;",
        " if (e.isDirectory()) collect(require('path').resolve(dir, e.name), rel);",
        " else files.push(rel);",
        " }",
        " }",
        " collect(workDir, '');",
        " vars.__artifacts = { modifiedFiles: files };",
        " } finally {",
        " // Guard: only delete directories strictly inside os.tmpdir()",
        " const path = require('path');",
        " const fromTmp = path.relative(path.resolve(require('os').tmpdir()), path.resolve(workDir));",
        " if (fromTmp && fromTmp.split(path.sep)[0] !== '..' && !path.isAbsolute(fromTmp)) {",
        " rmSync(workDir, { recursive: true, force: true });",
        " }",
        " }",
        " }",
        "}",
    ];
    return lines.join("\n");
}
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Predefined tool permission sets for agent harness evaluations.
3
+ */
4
+ export declare const TOOL_PRESETS: Record<string, string[]>;
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Predefined tool permission sets for agent harness evaluations.
3
+ */
4
// Tools shared by the "coding" and "full-access" presets. Each preset gets
// its own copy so the exported arrays stay independent of one another.
const CODING_TOOLS = ["Bash", "Read", "Write", "Edit", "Glob", "Grep"];
/**
 * Named tool permission sets, keyed by preset name. Each value is the list
 * of tool names granted by that preset.
 */
export const TOOL_PRESETS = {
    coding: [...CODING_TOOLS],
    "full-access": [...CODING_TOOLS, "WebSearch", "WebFetch", "TodoRead", "TodoWrite"],
    "read-only": ["Read", "Glob", "Grep", "WebSearch"],
};
@@ -1,19 +1,8 @@
1
1
  /**
2
- * AgentHarnessModeHandler compilation rules for `agent-harness` mode.
3
- *
4
- * Maps agent harness task definitions to Promptfoo configuration with:
5
- * - Claude Agent SDK / OpenAI Codex SDK providers
6
- * - Tool permission configuration (preset/allowed/disallowed)
7
- * - Sandbox setup/teardown via Promptfoo extensions
8
- * - Fixture provisioning into sandbox working directory
9
- *
10
- * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
- * @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
2
+ * Shared types for the agent harness mode handler.
12
3
  */
13
- import type { AgentHarnessTaskDefinition, ModeHandler, PromptTemplate } from "../../../_vendor/ailf-core/index.d.ts";
14
- import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../promptfoo-compiler.js";
15
- import type { SandboxType } from "../sandbox/sandbox-strategy.js";
16
- export declare const AGENT_HARNESS_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
4
+ import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
+ import type { SandboxType } from "../../sandbox/sandbox-strategy.js";
17
6
  /** Options for compiling an agent harness task */
18
7
  export interface AgentHarnessCompileOptions {
19
8
  /** Grader provider for LLM-graded assertions */
@@ -58,13 +47,3 @@ export interface AgentHarnessValidationError {
58
47
  field: string;
59
48
  message: string;
60
49
  }
61
- /**
62
- * Validate that an agent harness task definition has all required fields.
63
- */
64
- export declare function validateAgentHarnessTask(task: AgentHarnessTaskDefinition): AgentHarnessValidationError[];
65
- /**
66
- * Compile an agent harness task definition into Promptfoo configuration.
67
- */
68
- export declare function compileAgentHarnessTask(task: AgentHarnessTaskDefinition, options?: AgentHarnessCompileOptions): AgentHarnessCompileResult;
69
- /** ModeHandler-conformant export for the agent-harness evaluation mode. */
70
- export declare const handler: ModeHandler;
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Shared types for the agent harness mode handler.
3
+ */
4
+ export {};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Validation for agent harness task definitions.
3
+ */
4
+ import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { AgentHarnessValidationError } from "./types.js";
6
+ /**
7
+ * Validate that an agent harness task definition has all required fields.
8
+ */
9
+ export declare function validateAgentHarnessTask(task: AgentHarnessTaskDefinition): AgentHarnessValidationError[];
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Validation for agent harness task definitions.
3
+ */
4
/**
 * Check an agent harness task definition for missing required fields.
 *
 * A field counts as missing when its value is falsy. `id` is checked before
 * `title`, and the errors are returned in that order.
 *
 * @param task - Agent harness task definition to inspect.
 * @returns One `{ field, message }` entry per missing field; empty when valid.
 */
export function validateAgentHarnessTask(task) {
    const requiredFields = [
        ["id", "Task ID is required"],
        ["title", "Task title is required"],
    ];
    return requiredFields
        .filter(([field]) => !task[field])
        .map(([field, message]) => ({ field, message }));
}
@@ -9,8 +9,7 @@
9
9
  * @see docs/exec-plans/architecture-overhaul/phase-3-mcp-server-mode.md
10
10
  * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
11
  */
12
- export { compileMCPTask, handler as mcpServerHandler, validateMCPTask, type MCPCompileOptions, type MCPCompileResult, type MCPValidationError, } from "./mcp-server-handler.js";
13
- export { buildMCPAssertions, type MCPAssertionContext, } from "./mcp-assertions.js";
14
- export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricConfig, } from "./literacy-handler.js";
15
- export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, type KnowledgeProbeCompileOptions, type KnowledgeProbeCompileResult, type KnowledgeProbeMetadata, type KnowledgeProbeValidationError, } from "./knowledge-probe-handler.js";
16
- export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, type AgentHarnessCompileOptions, type AgentHarnessCompileResult, type AgentHarnessValidationError, type PromptfooExtension, type SandboxConfigMeta, } from "./agent-harness-handler.js";
12
+ export { buildMCPAssertions, compileMCPTask, handler as mcpServerHandler, validateMCPTask, type MCPAssertionContext, type MCPCompileOptions, type MCPCompileResult, type MCPValidationError, } from "./mcp-server/index.js";
13
+ export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, type LiteracyCompileOptions, type LiteracyCompileResult, type LiteracyValidationError, type RubricConfig, } from "./literacy/index.js";
14
+ export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, type KnowledgeProbeCompileOptions, type KnowledgeProbeCompileResult, type KnowledgeProbeMetadata, type KnowledgeProbeValidationError, } from "./knowledge-probe/index.js";
15
+ export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, type AgentHarnessCompileOptions, type AgentHarnessCompileResult, type AgentHarnessValidationError, type PromptfooExtension, type SandboxConfigMeta, } from "./agent-harness/index.js";
@@ -10,12 +10,10 @@
10
10
  * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
11
11
  */
12
12
  // MCP Server mode
13
- export { compileMCPTask, handler as mcpServerHandler, validateMCPTask, } from "./mcp-server-handler.js";
14
- // MCP assertion builders
15
- export { buildMCPAssertions, } from "./mcp-assertions.js";
13
+ export { buildMCPAssertions, compileMCPTask, handler as mcpServerHandler, validateMCPTask, } from "./mcp-server/index.js";
16
14
  // Literacy mode
17
- export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, } from "./literacy-handler.js";
15
+ export { compileLiteracyTask, handler as literacyHandler, validateLiteracyTask, } from "./literacy/index.js";
18
16
  // Knowledge Probe mode
19
- export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, } from "./knowledge-probe-handler.js";
17
+ export { compileKnowledgeProbeTask, handler as knowledgeProbeHandler, validateKnowledgeProbeTask, } from "./knowledge-probe/index.js";
20
18
  // Agent Harness mode
21
- export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, } from "./agent-harness-handler.js";
19
+ export { compileAgentHarnessTask, handler as agentHarnessHandler, validateAgentHarnessTask, } from "./agent-harness/index.js";
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Assertion mapping for knowledge probe evaluations.
3
+ */
4
+ import type { PromptfooAssertion } from "../../assertion-mapper.js";
5
+ import type { KnowledgeProbeCompileOptions } from "./types.js";
6
+ /**
7
+ * Map a raw knowledge probe assertion to a Promptfoo assertion.
8
+ *
9
+ * Tool-use assertions are rejected (knowledge probes don't use tools).
10
+ * LLM-graded assertions receive the configured grader provider.
11
+ * All other assertions are passed through.
12
+ */
13
+ export declare function mapKnowledgeProbeAssertion(assertion: {
14
+ type: string;
15
+ [k: string]: unknown;
16
+ }, options: KnowledgeProbeCompileOptions | undefined, warnings: string[]): PromptfooAssertion | null;
@@ -0,0 +1,61 @@
1
+ /**
2
+ * Assertion mapping for knowledge probe evaluations.
3
+ */
4
+ /**
5
+ * Map a raw knowledge probe assertion to a Promptfoo assertion.
6
+ *
7
+ * Tool-use assertions are rejected (knowledge probes don't use tools).
8
+ * LLM-graded assertions receive the configured grader provider.
9
+ * All other assertions are passed through.
10
+ */
11
+ export function mapKnowledgeProbeAssertion(assertion, options, warnings) {
12
+ switch (assertion.type) {
13
+ // Standard assertions — pass through
14
+ case "contains":
15
+ case "contains-all":
16
+ case "contains-any":
17
+ case "equals":
18
+ case "is-json":
19
+ case "javascript":
20
+ case "python":
21
+ case "regex":
22
+ case "similar":
23
+ return {
24
+ type: assertion.type,
25
+ ...("value" in assertion ? { value: assertion.value } : {}),
26
+ ...(typeof assertion.weight === "number"
27
+ ? { weight: assertion.weight }
28
+ : {}),
29
+ };
30
+ // LLM-graded assertions — add grader provider
31
+ case "g-eval":
32
+ case "llm-rubric":
33
+ case "model-graded-closedqa":
34
+ case "model-graded-factuality":
35
+ return {
36
+ type: assertion.type,
37
+ ...("value" in assertion ? { value: assertion.value } : {}),
38
+ ...(typeof assertion.weight === "number"
39
+ ? { weight: assertion.weight }
40
+ : {}),
41
+ ...(options?.graderProvider
42
+ ? { provider: options.graderProvider }
43
+ : {}),
44
+ };
45
+ // Tool-use assertions are NOT valid for knowledge probes
46
+ case "skill-used":
47
+ case "tool-call-f1":
48
+ case "tool-called":
49
+ case "tool-input-matches":
50
+ case "tool-output-matches":
51
+ warnings.push(`Knowledge probe "${assertion.type}" assertion is not applicable — ` +
52
+ "knowledge probes don't use tools. Assertion skipped.");
53
+ return null;
54
+ default:
55
+ warnings.push(`Knowledge probe: unknown assertion type "${assertion.type}" — passed through`);
56
+ return {
57
+ type: assertion.type,
58
+ ...("value" in assertion ? { value: assertion.value } : {}),
59
+ };
60
+ }
61
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Core compilation logic for knowledge probe tasks.
3
+ *
4
+ * Compiles a knowledge probe task definition into Promptfoo configuration.
5
+ * This is intentionally minimal — knowledge probes map almost 1:1 to
6
+ * basic Promptfoo test cases. The AILF value-add is type-safe authoring,
7
+ * cross-model comparison, and score normalization.
8
+ */
9
+ import type { KnowledgeProbeTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
10
+ import type { KnowledgeProbeCompileOptions, KnowledgeProbeCompileResult } from "./types.js";
11
+ /**
12
+ * Compile a knowledge probe task definition into Promptfoo configuration.
13
+ *
14
+ * This is intentionally minimal — knowledge probes map almost 1:1 to
15
+ * basic Promptfoo test cases. The AILF value-add is type-safe authoring,
16
+ * cross-model comparison, and score normalization.
17
+ */
18
+ export declare function compileKnowledgeProbeTask(task: KnowledgeProbeTaskDefinition, options?: KnowledgeProbeCompileOptions): KnowledgeProbeCompileResult;
@@ -0,0 +1,112 @@
1
+ /**
2
+ * Core compilation logic for knowledge probe tasks.
3
+ *
4
+ * Compiles a knowledge probe task definition into Promptfoo configuration.
5
+ * This is intentionally minimal — knowledge probes map almost 1:1 to
6
+ * basic Promptfoo test cases. The AILF value-add is type-safe authoring,
7
+ * cross-model comparison, and score normalization.
8
+ */
9
+ import { mapKnowledgeProbeAssertion } from "./assertions.js";
10
+ import { validateKnowledgeProbeTask } from "./validation.js";
11
+ // ---------------------------------------------------------------------------
12
+ // Public compilation entry point
13
+ // ---------------------------------------------------------------------------
14
/**
 * Turn a knowledge probe task definition into Promptfoo configuration.
 *
 * Validation problems do not abort compilation; each one is reported as a
 * warning string. Providers come from `options.models`, the task yields a
 * single prompt and a single test case, and the metadata marks the result
 * as a doc-free probe with retrieval metrics disabled.
 *
 * @param task - Knowledge probe task definition.
 * @param options - Optional compile options (models, grader provider).
 * @returns `{ providers, tests, prompts, metadata, warnings }`.
 */
export function compileKnowledgeProbeTask(task, options) {
    // Validation issues become warnings; test-case assembly appends to the same list.
    const warnings = validateKnowledgeProbeTask(task).map((issue) => `Knowledge probe "${task.id}": ${issue.field} — ${issue.message}`);
    const providers = buildProviders(options);
    const prompts = buildPrompts(task);
    const tests = buildTestCases(task, options, warnings);
    return {
        providers,
        tests,
        prompts,
        metadata: {
            mode: "knowledge-probe",
            probeStrategy: task.probeStrategy ?? "breadth-first",
            noDocContext: true,
            retrievalMetrics: false,
        },
        warnings,
    };
}
43
+ // ---------------------------------------------------------------------------
44
+ // Provider assembly
45
+ // ---------------------------------------------------------------------------
46
/**
 * Build the provider list from the configured models.
 *
 * @param options - Optional compile options; only `options.models` is read.
 * @returns One `{ id, label, config }` entry per model, or an empty array
 *   when no models are configured (the caller is expected to supply them).
 */
function buildProviders(options) {
    const models = options?.models;
    if (!models || !(models.length > 0)) {
        return [];
    }
    return models.map(({ id, label, config }) => ({ id, label, config }));
}
57
+ // ---------------------------------------------------------------------------
58
+ // Prompt assembly
59
+ // ---------------------------------------------------------------------------
60
/**
 * Build the single prompt used by a knowledge probe.
 *
 * The probe question is the first defined value among `task.prompt.text`,
 * `task.prompt.vars.task`, `task.description`, and a title-based fallback.
 * When the task supplies a system message, the raw prompt is formatted as a
 * `[system]` section followed by a `[user]` section.
 *
 * @param task - Knowledge probe task definition.
 * @returns A one-element array holding the `{ id, label, raw }` prompt.
 */
function buildPrompts(task) {
    const prompt = task.prompt;
    const question = prompt?.text ??
        prompt?.vars?.task ??
        task.description ??
        `Knowledge probe: ${task.title}`;
    const system = prompt?.systemMessage;
    const label = `Probe: ${task.title}`;
    let raw;
    if (system) {
        raw = `[system]\n${system}\n\n[user]\n${String(question)}`;
    }
    else {
        raw = String(question);
    }
    return [{ id: "knowledge-probe", label, raw }];
}
78
+ // ---------------------------------------------------------------------------
79
+ // Test case assembly
80
+ // ---------------------------------------------------------------------------
81
/**
 * Build the single Promptfoo test case for a knowledge probe.
 *
 * Each task assertion is mapped through `mapKnowledgeProbeAssertion`;
 * assertions it rejects (returns a falsy value for) are dropped. The test
 * vars start from a default `task` value, are overlaid with the task's own
 * prompt vars, and then receive the `__mode` / `__probeStrategy` markers.
 * Any `docs` var is removed afterwards: omitting documentation is the
 * defining characteristic of knowledge-probe mode.
 *
 * @param task - Knowledge probe task definition.
 * @param options - Optional compile options forwarded to the assertion mapper.
 * @param warnings - Mutable list that receives assertion-mapping warnings.
 * @returns A one-element array holding the test case; `assert` is present
 *   only when at least one assertion survived mapping.
 */
function buildTestCases(task, options, warnings) {
    const assertions = Array.from(task.assertions || [])
        .map((raw) => mapKnowledgeProbeAssertion(raw, options, warnings))
        .filter(Boolean);
    const promptVars = task.prompt?.vars;
    const vars = {
        task: promptVars?.task ??
            task.description ??
            `Knowledge probe: ${task.title}`,
        ...promptVars,
        // Metadata for scoring pipeline
        __mode: "knowledge-probe",
        __probeStrategy: task.probeStrategy ?? "breadth-first",
    };
    // Docs are intentionally never part of a knowledge probe.
    delete vars.docs;
    const description = `${task.id} — ${task.title}`;
    return [
        assertions.length > 0
            ? { description, vars, assert: assertions }
            : { description, vars },
    ];
}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * KnowledgeProbeModeHandler — compilation rules for `knowledge-probe` mode.
3
+ *
4
+ * The simplest mode handler. Knowledge probes measure raw model knowledge
5
+ * without documentation context, tool calling, or sandboxed execution.
6
+ * They answer: "What does this model know about X without any help?"
7
+ *
8
+ * Key properties:
9
+ * - No doc vars injected (intentionally empty)
10
+ * - Uses the without-docs prompt template (or custom prompt)
11
+ * - Standard LLM providers only (no agent SDKs, no MCP)
12
+ * - No retrieval metrics (precision/recall/F1 not applicable)
13
+ * - Results feed into the standard cross-model comparison pipeline
14
+ *
15
+ * This handler is the reference implementation for the mode handler pattern.
16
+ *
17
+ * @see docs/exec-plans/architecture-overhaul/phase-5-knowledge-probe.md
18
+ * @see packages/core/src/types/generalized-task.ts — KnowledgeProbeTaskDefinition
19
+ */
20
+ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
21
+ export { compileKnowledgeProbeTask } from "./compiler.js";
22
+ export { KNOWLEDGE_PROBE_PROMPT_TEMPLATES } from "./prompts.js";
23
+ export type { KnowledgeProbeCompileOptions, KnowledgeProbeCompileResult, KnowledgeProbeMetadata, KnowledgeProbeValidationError, } from "./types.js";
24
+ export { validateKnowledgeProbeTask } from "./validation.js";
25
+ /** ModeHandler-conformant export for the knowledge-probe evaluation mode. */
26
+ export declare const handler: ModeHandler;
@@ -0,0 +1,49 @@
1
+ /**
2
+ * KnowledgeProbeModeHandler — compilation rules for `knowledge-probe` mode.
3
+ *
4
+ * The simplest mode handler. Knowledge probes measure raw model knowledge
5
+ * without documentation context, tool calling, or sandboxed execution.
6
+ * They answer: "What does this model know about X without any help?"
7
+ *
8
+ * Key properties:
9
+ * - No doc vars injected (intentionally empty)
10
+ * - Uses the without-docs prompt template (or custom prompt)
11
+ * - Standard LLM providers only (no agent SDKs, no MCP)
12
+ * - No retrieval metrics (precision/recall/F1 not applicable)
13
+ * - Results feed into the standard cross-model comparison pipeline
14
+ *
15
+ * This handler is the reference implementation for the mode handler pattern.
16
+ *
17
+ * @see docs/exec-plans/architecture-overhaul/phase-5-knowledge-probe.md
18
+ * @see packages/core/src/types/generalized-task.ts — KnowledgeProbeTaskDefinition
19
+ */
20
+ import { compileKnowledgeProbeTask } from "./compiler.js";
21
+ import { KNOWLEDGE_PROBE_PROMPT_TEMPLATES } from "./prompts.js";
22
+ // ---------------------------------------------------------------------------
23
+ // Re-exports
24
+ // ---------------------------------------------------------------------------
25
+ export { compileKnowledgeProbeTask } from "./compiler.js";
26
+ export { KNOWLEDGE_PROBE_PROMPT_TEMPLATES } from "./prompts.js";
27
+ export { validateKnowledgeProbeTask } from "./validation.js";
28
+ // ---------------------------------------------------------------------------
29
+ // ModeHandler adapter
30
+ // ---------------------------------------------------------------------------
31
/**
 * ModeHandler-conformant export for the knowledge-probe evaluation mode.
 *
 * - `getPrompts()` returns the handler-owned prompt template map.
 * - `compileTask(task, ctx)` verifies the task's mode, delegates to
 *   `compileKnowledgeProbeTask`, and reshapes the result so that the compile
 *   metadata is nested under `extras.metadata`.
 */
export const handler = {
    /** Returns the shared `KNOWLEDGE_PROBE_PROMPT_TEMPLATES` object (not a copy). */
    getPrompts() {
        return KNOWLEDGE_PROBE_PROMPT_TEMPLATES;
    },
    /**
     * Compile a single knowledge-probe task.
     *
     * @param task - Task definition; must be an object whose `mode` is "knowledge-probe".
     * @param ctx - Compile context; only `graderProvider` and `models` are read.
     *   A missing context is tolerated and both options are passed as `undefined`.
     * @returns `{ providers, tests, prompts, warnings, extras: { metadata } }`
     * @throws {Error} when `task` is not a knowledge-probe task (including
     *   null, undefined, or primitive values).
     */
    compileTask(task, ctx) {
        // Guard before reading `mode`: the `in` operator throws a raw TypeError
        // on null/primitive values, which would hide the descriptive error below.
        const isObjectLike = task !== null && (typeof task === "object" || typeof task === "function");
        const mode = isObjectLike ? task.mode : undefined;
        if (mode !== "knowledge-probe") {
            throw new Error(`Knowledge probe handler received task with mode "${mode ?? "undefined"}" — expected "knowledge-probe"`);
        }
        const result = compileKnowledgeProbeTask(task, {
            graderProvider: ctx?.graderProvider,
            models: ctx?.models,
        });
        return {
            providers: result.providers,
            tests: result.tests,
            prompts: result.prompts,
            warnings: result.warnings,
            extras: { metadata: result.metadata },
        };
    },
};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Canonical knowledge probe prompt templates.
3
+ *
4
+ * Handler-owned prompts for knowledge probe evaluations. These ask factual
5
+ * questions without injecting documentation context — measuring raw model
6
+ * knowledge about Sanity concepts.
7
+ */
8
+ import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
9
+ export declare const KNOWLEDGE_PROBE_PROMPT_TEMPLATES: Record<string, PromptTemplate>; // string-keyed map of prompt templates; NOTE(review): keys presumably mirror each template's `id` — confirm
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Canonical knowledge probe prompt templates.
3
+ *
4
+ * Handler-owned prompts for knowledge probe evaluations. The single
5
+ * "knowledge-probe" entry asks a question about Sanity.io via the {{task}}
6
+ * placeholder and tells the model to answer from existing knowledge only.
7
+ */
8
+ export const KNOWLEDGE_PROBE_PROMPT_TEMPLATES = {
9
+ "knowledge-probe": {
10
+ id: "knowledge-probe",
11
+ label: "Knowledge Probe (No Docs)",
12
+ template: `Answer the following question about Sanity.io based on your existing knowledge. Do not search for or reference external documentation.
13
+
14
+ ## Question
15
+ {{task}}
16
+
17
+ ## Instructions
18
+
19
+ 1. Answer based solely on what you already know
20
+ 2. Be specific — include API names, function signatures, and code examples where relevant
21
+ 3. If you are unsure about a detail, say so rather than guessing
22
+ 4. Provide a complete, accurate answer
23
+
24
+ Your answer:
25
+ `,
26
+ variables: ["task"], // the only placeholder referenced by the template text ({{task}})
27
+ },
28
+ };
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Public types for the knowledge-probe mode handler.
3
+ */
4
+ import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
5
+ /** Options for compiling a knowledge probe task. Every field is optional. */
6
+ export interface KnowledgeProbeCompileOptions {
7
+ /** Grader provider for LLM-graded assertions (optional) */
8
+ graderProvider?: string;
9
+ /** Model registry — knowledge probes run across all configured models (optional list of model entries) */
10
+ models?: {
11
+ id: string;
12
+ label: string;
13
+ config?: Record<string, unknown>; // optional free-form per-model settings; values are untyped here
14
+ }[];
15
+ }
16
+ /** Result of compiling a single knowledge probe task. All five fields are required. */
17
+ export interface KnowledgeProbeCompileResult {
18
+ /** Promptfoo provider configs (one per model) */
19
+ providers: PromptfooProvider[];
20
+ /** Compiled Promptfoo test cases */
21
+ tests: PromptfooTestCase[];
22
+ /** Promptfoo prompts for evaluation */
23
+ prompts: PromptfooPrompt[];
24
+ /** Mode metadata for cross-model comparison (see KnowledgeProbeMetadata) */
25
+ metadata: KnowledgeProbeMetadata;
26
+ /** Warnings generated during compilation, as plain strings */
27
+ warnings: string[];
28
+ }
29
+ /** Metadata attached to knowledge probe results for comparison. Three of the four fields are fixed literal types. */
30
+ export interface KnowledgeProbeMetadata {
31
+ /** Evaluation mode identifier (always the literal "knowledge-probe") */
32
+ mode: "knowledge-probe";
33
+ /** Probe strategy used (free-form string at the type level) */
34
+ probeStrategy: string;
35
+ /** Whether doc context was intentionally excluded (typed as the literal `true`) */
36
+ noDocContext: true;
37
+ /** Whether retrieval metrics are applicable (typed as the literal `false`) */
38
+ retrievalMetrics: false;
39
+ }
40
+ /** A single validation error for a knowledge probe task definition */
41
+ export interface KnowledgeProbeValidationError {
42
+ field: string; // name of the task field the error refers to
43
+ message: string; // human-readable description of the problem
44
+ }
@@ -0,0 +1,4 @@
1
+ /**
2
+ * Public types for the knowledge-probe mode handler.
3
+ */
4
+ export {};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Validation logic for knowledge probe task definitions.
3
+ */
4
+ import type { KnowledgeProbeTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
5
+ import type { KnowledgeProbeValidationError } from "./types.js";
6
+ /**
7
+ * Validate that a knowledge probe task definition has all required fields. Returns a list of field-level errors (KnowledgeProbeValidationError[]).
8
+ */
9
+ export declare function validateKnowledgeProbeTask(task: KnowledgeProbeTaskDefinition): KnowledgeProbeValidationError[];
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Validation logic for knowledge probe task definitions.
3
+ */
4
/**
 * Validate that a knowledge probe task definition has all required fields.
 *
 * Required: `id`, `title`, and a question supplied through one of
 * `prompt.text`, `prompt.vars.task`, or `description`. A value counts as
 * missing when it is falsy or a whitespace-only string, so a blank question
 * cannot slip through and produce an empty prompt.
 *
 * @param task - Task definition to check. A null/undefined task is reported
 *   as a single error on the pseudo-field "task" instead of throwing.
 * @returns One `{ field, message }` entry per problem; an empty array when
 *   the task is valid.
 */
export function validateKnowledgeProbeTask(task) {
    if (task == null) {
        return [{ field: "task", message: "Task definition is required" }];
    }
    // Falsy values and strings containing only whitespace are both "missing".
    const isMissing = (value) => !value || (typeof value === "string" && value.trim() === "");
    const errors = [];
    if (isMissing(task.id)) {
        errors.push({ field: "id", message: "Task ID is required" });
    }
    if (isMissing(task.title)) {
        errors.push({ field: "title", message: "Task title is required" });
    }
    // Knowledge probes must have either a prompt or a description
    const questionSources = [task.prompt?.text, task.prompt?.vars?.task, task.description];
    if (questionSources.every(isMissing)) {
        errors.push({
            field: "prompt",
            message: "Knowledge probe tasks require either prompt.text, prompt.vars.task, " +
                "or description — the question to ask the model",
        });
    }
    return errors;
}
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Assertion resolution for literacy tasks.
3
+ *
4
+ * Handles rubric template resolution, doc-coverage auto-generation,
5
+ * and baseline assertion filtering.
6
+ */
7
+ import type { LiteracyTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
+ import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
+ import type { LiteracyCompileOptions } from "./types.js";
10
+ export declare function resolveAssertions(task: LiteracyTaskDefinition, options: LiteracyCompileOptions | undefined, warnings: string[]): PromptfooAssertion[]; // returns the task's Promptfoo assertions; NOTE(review): `warnings` is presumably appended to in place — confirm against the implementation
11
+ /**
12
+ * Build baseline assertions matching the legacy expand-tasks behavior.
13
+ * The optional `rubricMode` selects how `goldAssertions` are carried over (its default is not visible in this declaration):
14
+ * - "full": all assertions carried over
15
+ * - "abbreviated": only first llm-rubric with shortened prompt
16
+ * - "none": no assertions
17
+ */
18
+ export declare function buildBaselineAssertions(goldAssertions: PromptfooAssertion[], rubricMode?: "abbreviated" | "full" | "none"): PromptfooAssertion[];