@sanity/ailf 1.0.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/README.md +0 -1
  2. package/config/models.ts +15 -3
  3. package/dist/_vendor/ailf-core/config-helpers.d.ts +14 -17
  4. package/dist/_vendor/ailf-core/config-helpers.js +22 -2
  5. package/dist/_vendor/ailf-core/examples/index.d.ts +16 -0
  6. package/dist/_vendor/ailf-core/examples/index.js +25 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +2 -2
  8. package/dist/_vendor/ailf-core/index.js +1 -1
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +2 -0
  10. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
  11. package/dist/_vendor/ailf-core/schemas/eval-config.js +10 -0
  12. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
  13. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +2 -0
  14. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +0 -2
  15. package/dist/_vendor/ailf-core/schemas/pipeline.js +0 -1
  16. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +13 -0
  17. package/dist/_vendor/ailf-core/types/index.d.ts +1 -3
  18. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +78 -23
  19. package/dist/_vendor/ailf-core/types/plugin-registry.js +73 -20
  20. package/dist/adapters/config-sources/file-config-adapter.js +1 -0
  21. package/dist/adapters/config-sources/ts-config-loader.js +21 -13
  22. package/dist/adapters/task-sources/content-lake-task-source.js +17 -20
  23. package/dist/adapters/task-sources/index.d.ts +2 -2
  24. package/dist/adapters/task-sources/index.js +2 -2
  25. package/dist/adapters/task-sources/repo-schemas.d.ts +218 -16
  26. package/dist/adapters/task-sources/repo-schemas.js +227 -19
  27. package/dist/adapters/task-sources/repo-task-source.d.ts +14 -10
  28. package/dist/adapters/task-sources/repo-task-source.js +81 -122
  29. package/dist/adapters/task-sources/repo-validation.d.ts +36 -5
  30. package/dist/adapters/task-sources/repo-validation.js +126 -5
  31. package/dist/adapters/task-sources/task-file-loader.d.ts +2 -2
  32. package/dist/adapters/task-sources/task-file-loader.js +2 -2
  33. package/dist/commands/coverage-audit.js +3 -1
  34. package/dist/commands/init.d.ts +6 -4
  35. package/dist/commands/init.js +302 -23
  36. package/dist/commands/validate-tasks.d.ts +2 -2
  37. package/dist/commands/validate-tasks.js +26 -15
  38. package/dist/composition-root.d.ts +13 -1
  39. package/dist/composition-root.js +73 -41
  40. package/dist/index.d.ts +41 -0
  41. package/dist/index.js +48 -0
  42. package/dist/orchestration/build-step-sequence.js +4 -2
  43. package/dist/orchestration/steps/fetch-docs-step.js +2 -3
  44. package/dist/orchestration/steps/generate-configs-step.js +28 -12
  45. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +1 -1
  46. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +1 -1
  47. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +1 -1
  48. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +105 -68
  49. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +33 -100
  50. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  51. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  52. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +10 -0
  53. package/dist/pipeline/compiler/mode-bases/agent-harness.js +21 -0
  54. package/dist/pipeline/compiler/mode-bases/index.d.ts +4 -0
  55. package/dist/pipeline/compiler/mode-bases/index.js +4 -0
  56. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +10 -0
  57. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +22 -0
  58. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +12 -0
  59. package/dist/pipeline/compiler/mode-bases/literacy.js +78 -0
  60. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +10 -0
  61. package/dist/pipeline/compiler/mode-bases/mcp-server.js +70 -0
  62. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +43 -0
  63. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +187 -0
  64. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.d.ts +19 -0
  65. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +138 -0
  66. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +16 -0
  67. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +43 -0
  68. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.d.ts +9 -0
  69. package/dist/pipeline/compiler/mode-handlers/agent-harness/prompts.js +29 -0
  70. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +12 -0
  71. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +82 -0
  72. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.d.ts +4 -0
  73. package/dist/pipeline/compiler/mode-handlers/agent-harness/tool-presets.js +19 -0
  74. package/dist/pipeline/compiler/mode-handlers/{agent-harness-handler.d.ts → agent-harness/types.d.ts} +3 -24
  75. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.js +4 -0
  76. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.d.ts +9 -0
  77. package/dist/pipeline/compiler/mode-handlers/agent-harness/validation.js +16 -0
  78. package/dist/pipeline/compiler/mode-handlers/index.d.ts +4 -5
  79. package/dist/pipeline/compiler/mode-handlers/index.js +4 -6
  80. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.d.ts +16 -0
  81. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/assertions.js +61 -0
  82. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.d.ts +18 -0
  83. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/compiler.js +112 -0
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +26 -0
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +49 -0
  86. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.d.ts +9 -0
  87. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/prompts.js +28 -0
  88. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.d.ts +44 -0
  89. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/types.js +4 -0
  90. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.d.ts +9 -0
  91. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/validation.js +24 -0
  92. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +18 -0
  93. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +118 -0
  94. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.d.ts +14 -0
  95. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +105 -0
  96. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +11 -0
  97. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +38 -0
  98. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.d.ts +9 -0
  99. package/dist/pipeline/compiler/mode-handlers/literacy/prompts.js +74 -0
  100. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +41 -0
  101. package/dist/pipeline/compiler/mode-handlers/literacy/types.js +4 -0
  102. package/dist/pipeline/compiler/mode-handlers/literacy/validation.d.ts +12 -0
  103. package/dist/pipeline/compiler/mode-handlers/literacy/validation.js +28 -0
  104. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.d.ts → mcp-server/assertions.d.ts} +2 -10
  105. package/dist/pipeline/compiler/mode-handlers/{mcp-assertions.js → mcp-server/assertions.js} +63 -6
  106. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.d.ts +19 -0
  107. package/dist/pipeline/compiler/mode-handlers/mcp-server/compiler.js +100 -0
  108. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +27 -0
  109. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +54 -0
  110. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.d.ts +8 -0
  111. package/dist/pipeline/compiler/mode-handlers/mcp-server/prompts.js +28 -0
  112. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.d.ts +28 -0
  113. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +104 -0
  114. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.d.ts +37 -0
  115. package/dist/pipeline/compiler/mode-handlers/mcp-server/types.js +4 -0
  116. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.d.ts +9 -0
  117. package/dist/pipeline/compiler/mode-handlers/mcp-server/validation.js +43 -0
  118. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.d.ts +33 -0
  119. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +174 -0
  120. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.d.ts +19 -0
  121. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +95 -0
  122. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.d.ts +19 -0
  123. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-anthropic.js +172 -0
  124. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +14 -0
  125. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +16 -0
  126. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +93 -0
  127. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.js +4 -0
  128. package/dist/pipeline/compiler/preset-loader.d.ts +22 -0
  129. package/dist/pipeline/compiler/preset-loader.js +99 -0
  130. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +6 -9
  131. package/dist/pipeline/compiler/presets/sanity-literacy.js +10 -156
  132. package/dist/pipeline/expand-tasks.d.ts +2 -2
  133. package/dist/pipeline/expand-tasks.js +2 -2
  134. package/dist/pipeline/generate-configs.js +1 -1
  135. package/dist/pipeline/map-request-to-config.js +1 -0
  136. package/dist/pipeline/mirror-repo-tasks.d.ts +7 -7
  137. package/dist/pipeline/mirror-repo-tasks.js +9 -9
  138. package/dist/pipeline/plan.js +1 -1
  139. package/package.json +11 -3
  140. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  141. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  142. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  143. package/dist/_vendor/ailf-tasks/index.js +0 -16
  144. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  145. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  146. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  147. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  148. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  149. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  150. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  151. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  152. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  153. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  154. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  155. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -67
  156. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -309
@@ -0,0 +1,78 @@
1
+ /**
2
+ * Literacy mode base — shared evaluation methodology for documentation literacy.
3
+ *
4
+ * Defines HOW literacy evaluations are scored (rubrics, weights, prompts),
5
+ * independently of WHAT documentation is being evaluated. Domain presets
6
+ * like `sanity-literacy` target this mode base and add their own sources,
7
+ * features, and doc fetcher.
8
+ *
9
+ * @see docs/MODES.md
10
+ */
11
+ import { LITERACY_PROMPT_TEMPLATES } from "../mode-handlers/literacy/index.js";
12
+ export function createLiteracyModeBase() {
13
+ return {
14
+ mode: {
15
+ id: "literacy",
16
+ label: "Documentation Literacy",
17
+ validProviderPatterns: ["^openai:", "^anthropic:", "^file://"],
18
+ rubricTemplateIds: [
19
+ "task-completion",
20
+ "code-correctness",
21
+ "doc-coverage",
22
+ ],
23
+ handlerModule: "./mode-handlers/literacy/index.js",
24
+ },
25
+ rubricTemplates: [
26
+ {
27
+ id: "task-completion",
28
+ dimension: "task-completion",
29
+ header: "Score task completion from 0 to 100:",
30
+ scale: [
31
+ "0: Couldn't attempt — missing critical information",
32
+ "20: Attempted but fundamentally wrong approach",
33
+ "50: Partial implementation — major functional gaps",
34
+ "80: Mostly complete — minor issues or missing edge cases",
35
+ "100: Fully functional code — works as expected",
36
+ ],
37
+ criteriaLabel: "Must demonstrate:",
38
+ },
39
+ {
40
+ id: "code-correctness",
41
+ dimension: "code-correctness",
42
+ header: "Score code correctness from 0 to 100:",
43
+ scale: [
44
+ "0: Broken code, syntax errors, or deprecated APIs",
45
+ "30: Works but uses anti-patterns or inefficient approaches",
46
+ "50: Works but not idiomatic",
47
+ "80: Follows most best practices",
48
+ "100: Follows all best practices, idiomatic implementation",
49
+ ],
50
+ criteriaLabel: "Check for:",
51
+ },
52
+ {
53
+ id: "doc-coverage",
54
+ dimension: "doc-coverage",
55
+ header: "Score documentation coverage from 0 to 100:",
56
+ scale: [
57
+ "0: Had to hallucinate/guess most implementation details",
58
+ "30: Significant gaps — filled with assumptions",
59
+ "50: Some gaps — inferred from partial information",
60
+ "80: Minor gaps — almost everything was documented",
61
+ "100: Complete coverage — all necessary info was in docs",
62
+ ],
63
+ },
64
+ ],
65
+ scoringProfiles: {
66
+ default: {
67
+ "task-completion": 0.5,
68
+ "code-correctness": 0.25,
69
+ "doc-coverage": 0.25,
70
+ },
71
+ "output-only": {
72
+ "task-completion": 0.6,
73
+ "code-correctness": 0.4,
74
+ },
75
+ },
76
+ promptTemplates: LITERACY_PROMPT_TEMPLATES,
77
+ };
78
+ }
@@ -0,0 +1,10 @@
1
+ /**
2
+ * MCP Server mode base — evaluation methodology for MCP server tool-use testing.
3
+ *
4
+ * Defines rubric templates and scoring for evaluating how well an LLM can
5
+ * discover and use MCP server tools correctly.
6
+ *
7
+ * @see docs/MODES.md
8
+ */
9
+ import type { ModeBase } from "../../../_vendor/ailf-core/index.d.ts";
10
+ export declare function createMcpServerModeBase(): ModeBase;
@@ -0,0 +1,70 @@
1
+ /**
2
+ * MCP Server mode base — evaluation methodology for MCP server tool-use testing.
3
+ *
4
+ * Defines rubric templates and scoring for evaluating how well an LLM can
5
+ * discover and use MCP server tools correctly.
6
+ *
7
+ * @see docs/MODES.md
8
+ */
9
+ export function createMcpServerModeBase() {
10
+ return {
11
+ mode: {
12
+ id: "mcp-server",
13
+ label: "MCP Server Testing",
14
+ validProviderPatterns: ["^mcp:", "^file://"],
15
+ rubricTemplateIds: [
16
+ "mcp-input-validation",
17
+ "mcp-output-correctness",
18
+ "mcp-error-handling",
19
+ ],
20
+ handlerModule: "./mode-handlers/mcp-server/index.js",
21
+ },
22
+ rubricTemplates: [
23
+ {
24
+ id: "mcp-input-validation",
25
+ dimension: "mcp-input-validation",
26
+ header: "Score MCP input validation from 0 to 100:",
27
+ scale: [
28
+ "0: Tool called with completely wrong parameters",
29
+ "30: Some parameters correct but critical ones missing or wrong",
30
+ "50: Parameters mostly correct but types or formats are off",
31
+ "80: All required parameters correct, minor optional parameter issues",
32
+ "100: Perfect tool invocation with all parameters correctly formed",
33
+ ],
34
+ criteriaLabel: "Check for:",
35
+ },
36
+ {
37
+ id: "mcp-output-correctness",
38
+ dimension: "mcp-output-correctness",
39
+ header: "Score MCP output correctness from 0 to 100:",
40
+ scale: [
41
+ "0: Output is completely wrong or tool returned an error",
42
+ "30: Output partially correct but missing key data",
43
+ "50: Output correct but incomplete or improperly formatted",
44
+ "80: Output correct and well-formatted, minor issues",
45
+ "100: Output is correct, complete, and properly formatted",
46
+ ],
47
+ criteriaLabel: "Check for:",
48
+ },
49
+ {
50
+ id: "mcp-error-handling",
51
+ dimension: "mcp-error-handling",
52
+ header: "Score MCP error handling from 0 to 100:",
53
+ scale: [
54
+ "0: No error handling — crashes or returns garbage on bad input",
55
+ "30: Basic error detection but poor recovery or messaging",
56
+ "50: Errors detected and reported but not gracefully handled",
57
+ "80: Good error handling with clear messages and appropriate fallbacks",
58
+ "100: Excellent error handling — validates input, provides actionable errors, degrades gracefully",
59
+ ],
60
+ },
61
+ ],
62
+ scoringProfiles: {
63
+ default: {
64
+ "mcp-input-validation": 0.4,
65
+ "mcp-output-correctness": 0.4,
66
+ "mcp-error-handling": 0.2,
67
+ },
68
+ },
69
+ };
70
+ }
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Assertion mapping and builders for agent harness tasks.
3
+ *
4
+ * Handles agent-specific assertion types (file-exists, file-contains,
5
+ * command-succeeds, diff-matches) as well as standard pass-through
6
+ * assertion types.
7
+ */
8
+ import type { PromptfooAssertion } from "../../assertion-mapper.js";
9
+ import type { AgentHarnessCompileOptions } from "./types.js";
10
+ export declare function mapAgentAssertion(assertion: {
11
+ type: string;
12
+ [k: string]: unknown;
13
+ }, options: AgentHarnessCompileOptions | undefined, warnings: string[]): PromptfooAssertion | null;
14
+ export declare function buildFileExistsAssertion(assertion: {
15
+ type: string;
16
+ [k: string]: unknown;
17
+ }): PromptfooAssertion;
18
+ export declare function buildFileContainsAssertion(assertion: {
19
+ type: string;
20
+ [k: string]: unknown;
21
+ }): PromptfooAssertion;
22
+ /**
23
+ * SECURITY: Trusted-input boundary.
24
+ *
25
+ * The `command-succeeds` assertion executes an arbitrary shell command
26
+ * inside the sandbox's working directory. The command string comes from
27
+ * task definitions (YAML or TypeScript config files), which are authored
28
+ * by developers — not from user input or LLM output.
29
+ *
30
+ * This is intentional: the assertion is designed to verify agent output
31
+ * by running build/test commands (e.g., "npm test", "tsc --noEmit").
32
+ *
33
+ * Task definitions are the trust boundary. If you accept task definitions
34
+ * from untrusted sources, validate commands against an allowlist first.
35
+ */
36
+ export declare function buildCommandSucceedsAssertion(assertion: {
37
+ type: string;
38
+ [k: string]: unknown;
39
+ }): PromptfooAssertion;
40
+ export declare function buildDiffMatchesAssertion(assertion: {
41
+ type: string;
42
+ [k: string]: unknown;
43
+ }): PromptfooAssertion;
@@ -0,0 +1,187 @@
1
+ /**
2
+ * Assertion mapping and builders for agent harness tasks.
3
+ *
4
+ * Handles agent-specific assertion types (file-exists, file-contains,
5
+ * command-succeeds, diff-matches) as well as standard pass-through
6
+ * assertion types.
7
+ */
8
+ // ---------------------------------------------------------------------------
9
+ // Assertion mapping
10
+ // ---------------------------------------------------------------------------
11
+ export function mapAgentAssertion(assertion, options, warnings) {
12
+ switch (assertion.type) {
13
+ case "file-exists":
14
+ return buildFileExistsAssertion(assertion);
15
+ case "file-contains":
16
+ return buildFileContainsAssertion(assertion);
17
+ case "command-succeeds":
18
+ return buildCommandSucceedsAssertion(assertion);
19
+ case "diff-matches":
20
+ return buildDiffMatchesAssertion(assertion);
21
+ // Standard assertions pass through
22
+ case "contains":
23
+ case "equals":
24
+ case "regex":
25
+ case "is-json":
26
+ case "javascript":
27
+ case "python":
28
+ return {
29
+ type: assertion.type,
30
+ ...("value" in assertion ? { value: assertion.value } : {}),
31
+ ...(typeof assertion.weight === "number"
32
+ ? { weight: assertion.weight }
33
+ : {}),
34
+ };
35
+ case "llm-rubric":
36
+ return {
37
+ type: "llm-rubric",
38
+ ...("value" in assertion ? { value: assertion.value } : {}),
39
+ ...(typeof assertion.weight === "number"
40
+ ? { weight: assertion.weight }
41
+ : {}),
42
+ ...(options?.graderProvider
43
+ ? { provider: options.graderProvider }
44
+ : {}),
45
+ };
46
+ default:
47
+ warnings.push(`Agent task: unknown assertion type "${assertion.type}" — passed through`);
48
+ return {
49
+ type: assertion.type,
50
+ ...("value" in assertion ? { value: assertion.value } : {}),
51
+ };
52
+ }
53
+ }
54
+ // ---------------------------------------------------------------------------
55
+ // Agent-specific assertion builders
56
+ // ---------------------------------------------------------------------------
57
+ export function buildFileExistsAssertion(assertion) {
58
+ const filePath = String(assertion.value ?? "");
59
+ // Use JSON.stringify for all interpolated values in generated JS to
60
+ // prevent broken strings from filePaths containing quotes/backslashes
61
+ const safeFilePath = JSON.stringify(filePath);
62
+ return {
63
+ type: "javascript",
64
+ value: `// file-exists: ${filePath}\n` +
65
+ `(function() {\n` +
66
+ ` const fs = require('fs');\n` +
67
+ ` const path = require('path');\n` +
68
+ ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
69
+ ` const target = path.resolve(workDir, ${safeFilePath});\n` +
70
+ ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
71
+ ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
72
+ ` }\n` +
73
+ ` const exists = fs.existsSync(target);\n` +
74
+ ` return {\n` +
75
+ ` pass: exists,\n` +
76
+ ` score: exists ? 1 : 0,\n` +
77
+ ` reason: exists\n` +
78
+ ` ? 'File exists: ' + ${safeFilePath}\n` +
79
+ ` : 'Expected file not found: ' + ${safeFilePath},\n` +
80
+ ` };\n` +
81
+ `})()`,
82
+ ...(typeof assertion.weight === "number"
83
+ ? { weight: assertion.weight }
84
+ : {}),
85
+ };
86
+ }
87
+ export function buildFileContainsAssertion(assertion) {
88
+ const config = assertion.value;
89
+ const filePath = config?.path ?? "";
90
+ const expectedContent = config?.content ?? "";
91
+ const safeFilePath = JSON.stringify(filePath);
92
+ return {
93
+ type: "javascript",
94
+ value: `// file-contains: ${filePath}\n` +
95
+ `(function() {\n` +
96
+ ` const fs = require('fs');\n` +
97
+ ` const path = require('path');\n` +
98
+ ` const workDir = path.resolve(context.vars.__workingDir || '.');\n` +
99
+ ` const target = path.resolve(workDir, ${safeFilePath});\n` +
100
+ ` if (!target.startsWith(workDir + path.sep) && target !== workDir) {\n` +
101
+ ` return { pass: false, score: 0, reason: 'Path traversal: ' + ${safeFilePath} + ' escapes sandbox' };\n` +
102
+ ` }\n` +
103
+ ` if (!fs.existsSync(target)) {\n` +
104
+ ` return { pass: false, score: 0, reason: 'File not found: ' + ${safeFilePath} };\n` +
105
+ ` }\n` +
106
+ ` const content = fs.readFileSync(target, 'utf-8');\n` +
107
+ ` const contains = content.includes(${JSON.stringify(expectedContent)});\n` +
108
+ ` return {\n` +
109
+ ` pass: contains,\n` +
110
+ ` score: contains ? 1 : 0,\n` +
111
+ ` reason: contains\n` +
112
+ ` ? 'File contains expected content'\n` +
113
+ ` : 'File does not contain expected content',\n` +
114
+ ` };\n` +
115
+ `})()`,
116
+ ...(typeof assertion.weight === "number"
117
+ ? { weight: assertion.weight }
118
+ : {}),
119
+ };
120
+ }
121
+ /**
122
+ * SECURITY: Trusted-input boundary.
123
+ *
124
+ * The `command-succeeds` assertion executes an arbitrary shell command
125
+ * inside the sandbox's working directory. The command string comes from
126
+ * task definitions (YAML or TypeScript config files), which are authored
127
+ * by developers — not from user input or LLM output.
128
+ *
129
+ * This is intentional: the assertion is designed to verify agent output
130
+ * by running build/test commands (e.g., "npm test", "tsc --noEmit").
131
+ *
132
+ * Task definitions are the trust boundary. If you accept task definitions
133
+ * from untrusted sources, validate commands against an allowlist first.
134
+ */
135
+ export function buildCommandSucceedsAssertion(assertion) {
136
+ const command = String(assertion.value ?? "");
137
+ return {
138
+ type: "javascript",
139
+ value: `// command-succeeds: ${command}\n` +
140
+ `(function() {\n` +
141
+ ` const { execSync } = require('child_process');\n` +
142
+ ` const workDir = context.vars.__workingDir || '.';\n` +
143
+ ` try {\n` +
144
+ ` execSync(${JSON.stringify(command)}, { cwd: workDir, timeout: 30000 });\n` +
145
+ ` return { pass: true, score: 1, reason: 'Command succeeded: ' + ${JSON.stringify(command)} };\n` +
146
+ ` } catch (err) {\n` +
147
+ ` return {\n` +
148
+ ` pass: false,\n` +
149
+ ` score: 0,\n` +
150
+ ` reason: 'Command failed: ' + (err.message || err),\n` +
151
+ ` };\n` +
152
+ ` }\n` +
153
+ `})()`,
154
+ ...(typeof assertion.weight === "number"
155
+ ? { weight: assertion.weight }
156
+ : {}),
157
+ };
158
+ }
159
+ export function buildDiffMatchesAssertion(assertion) {
160
+ const expected = assertion.value;
161
+ return {
162
+ type: "javascript",
163
+ value: `// diff-matches\n` +
164
+ `(function() {\n` +
165
+ ` const { execSync } = require('child_process');\n` +
166
+ ` const workDir = context.vars.__workingDir || '.';\n` +
167
+ ` try {\n` +
168
+ ` const diff = execSync('git diff', { cwd: workDir, encoding: 'utf-8' });\n` +
169
+ ` const expected = ${JSON.stringify(expected)};\n` +
170
+ ` if (typeof expected === 'string') {\n` +
171
+ ` const contains = diff.includes(expected);\n` +
172
+ ` return {\n` +
173
+ ` pass: contains,\n` +
174
+ ` score: contains ? 1 : 0,\n` +
175
+ ` reason: contains ? 'Diff matches expected pattern' : 'Diff does not match',\n` +
176
+ ` };\n` +
177
+ ` }\n` +
178
+ ` return { pass: diff.length > 0, score: diff.length > 0 ? 1 : 0, reason: 'Diff exists' };\n` +
179
+ ` } catch (err) {\n` +
180
+ ` return { pass: false, score: 0, reason: 'Failed to get diff: ' + err.message };\n` +
181
+ ` }\n` +
182
+ `})()`,
183
+ ...(typeof assertion.weight === "number"
184
+ ? { weight: assertion.weight }
185
+ : {}),
186
+ };
187
+ }
@@ -0,0 +1,19 @@
1
+ /**
2
+ * Agent harness task compilation — core compilation logic.
3
+ *
4
+ * Maps agent harness task definitions to Promptfoo configuration with:
5
+ * - Claude Agent SDK / OpenAI Codex SDK providers
6
+ * - Tool permission configuration (preset/allowed/disallowed)
7
+ * - Sandbox setup/teardown via Promptfoo extensions
8
+ * - Fixture provisioning into sandbox working directory
9
+ */
10
+ import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
11
+ import type { PromptfooPrompt, PromptfooProvider, PromptfooTestCase } from "../../promptfoo-compiler.js";
12
+ import type { AgentHarnessCompileOptions, AgentHarnessCompileResult } from "./types.js";
13
+ /**
14
+ * Compile an agent harness task definition into Promptfoo configuration.
15
+ */
16
+ export declare function compileAgentHarnessTask(task: AgentHarnessTaskDefinition, options?: AgentHarnessCompileOptions): AgentHarnessCompileResult;
17
+ export declare function buildAgentProvider(task: AgentHarnessTaskDefinition, _warnings: string[]): PromptfooProvider[];
18
+ export declare function buildAgentPrompts(task: AgentHarnessTaskDefinition): PromptfooPrompt[];
19
+ export declare function buildAgentTestCases(task: AgentHarnessTaskDefinition, options: AgentHarnessCompileOptions | undefined, warnings: string[]): PromptfooTestCase[];
@@ -0,0 +1,138 @@
1
+ /**
2
+ * Agent harness task compilation — core compilation logic.
3
+ *
4
+ * Maps agent harness task definitions to Promptfoo configuration with:
5
+ * - Claude Agent SDK / OpenAI Codex SDK providers
6
+ * - Tool permission configuration (preset/allowed/disallowed)
7
+ * - Sandbox setup/teardown via Promptfoo extensions
8
+ * - Fixture provisioning into sandbox working directory
9
+ */
10
+ import { mapAgentAssertion } from "./assertions.js";
11
+ import { buildLifecycleExtensions, buildSandboxConfig } from "./sandbox.js";
12
+ import { TOOL_PRESETS } from "./tool-presets.js";
13
+ import { validateAgentHarnessTask } from "./validation.js";
14
+ /**
15
+ * Compile an agent harness task definition into Promptfoo configuration.
16
+ */
17
+ export function compileAgentHarnessTask(task, options) {
18
+ const warnings = [];
19
+ // Validate
20
+ const validationErrors = validateAgentHarnessTask(task);
21
+ for (const err of validationErrors) {
22
+ warnings.push(`Agent harness task "${task.id}": ${err.field} — ${err.message}`);
23
+ }
24
+ // Build provider
25
+ const providers = buildAgentProvider(task, warnings);
26
+ // Build prompts
27
+ const prompts = buildAgentPrompts(task);
28
+ // Build test cases
29
+ const tests = buildAgentTestCases(task, options, warnings);
30
+ // Build sandbox extensions
31
+ const sandboxConfig = buildSandboxConfig(task);
32
+ const extensions = buildLifecycleExtensions(task, sandboxConfig);
33
+ return { providers, tests, prompts, extensions, sandboxConfig, warnings };
34
+ }
35
+ // ---------------------------------------------------------------------------
36
+ // Provider assembly
37
+ // ---------------------------------------------------------------------------
38
+ export function buildAgentProvider(task, _warnings) {
39
+ // Resolve tool permissions
40
+ const tools = resolveToolPermissions(task.tools);
41
+ const config = {};
42
+ if (tools.length > 0) {
43
+ config.allowedTools = tools;
44
+ }
45
+ if (task.sandbox) {
46
+ config.sandbox = {
47
+ type: task.sandbox.type,
48
+ ...(task.sandbox.image ? { image: task.sandbox.image } : {}),
49
+ };
50
+ }
51
+ // Default to Claude Agent SDK provider
52
+ return [
53
+ {
54
+ id: `agent:${task.id}`,
55
+ label: `Agent Harness: ${task.title}`,
56
+ config,
57
+ },
58
+ ];
59
+ }
60
+ /**
61
+ * Resolve tool permissions from task config.
62
+ *
63
+ * Handles:
64
+ * - Preset names ("coding", "read-only", "full-access")
65
+ * - Explicit tool names ("Bash", "Read", "Write")
66
+ * - Mixed arrays ["coding", "WebSearch"] → preset expansion + extras
67
+ */
68
+ function resolveToolPermissions(tools) {
69
+ if (!tools || tools.length === 0)
70
+ return [];
71
+ const resolved = new Set();
72
+ for (const tool of tools) {
73
+ const preset = TOOL_PRESETS[tool];
74
+ if (preset) {
75
+ for (const t of preset)
76
+ resolved.add(t);
77
+ }
78
+ else {
79
+ resolved.add(tool);
80
+ }
81
+ }
82
+ return [...resolved];
83
+ }
84
+ // ---------------------------------------------------------------------------
85
+ // Prompt assembly
86
+ // ---------------------------------------------------------------------------
87
/**
 * Build the single Promptfoo prompt for an agent harness task.
 *
 * The prompt text is the first non-nullish value among, in order:
 * `task.prompt.text`, `task.prompt.vars.task`, `task.description`, and a
 * generated `Agent task: <title>` fallback. It is stringified into `raw`.
 *
 * @param task Agent harness task definition.
 * @returns A one-element array of prompt descriptors.
 */
export function buildAgentPrompts(task) {
  // Walk the fallback chain explicitly; only null/undefined fall through,
  // so an empty string is kept as-is.
  let promptText = task.prompt?.text;
  if (promptText === null || promptText === undefined) {
    promptText = task.prompt?.vars?.task;
  }
  if (promptText === null || promptText === undefined) {
    promptText = task.description;
  }
  if (promptText === null || promptText === undefined) {
    promptText = `Agent task: ${task.title}`;
  }

  const prompt = {
    id: "agent-harness",
    label: `Agent: ${task.title}`,
    raw: String(promptText),
  };
  return [prompt];
}
100
+ // ---------------------------------------------------------------------------
101
+ // Test case assembly
102
+ // ---------------------------------------------------------------------------
103
+ export function buildAgentTestCases(task, options, warnings) {
104
+ const assertions = [];
105
+ if (task.assertions) {
106
+ for (const assertion of task.assertions) {
107
+ const mapped = mapAgentAssertion(assertion, options, warnings);
108
+ if (mapped)
109
+ assertions.push(mapped);
110
+ }
111
+ }
112
+ const vars = {
113
+ task: task.prompt?.vars?.task ?? task.description ?? `Complete: ${task.title}`,
114
+ ...(task.prompt?.vars ?? {}),
115
+ // Internal metadata for sandbox lifecycle hooks
116
+ __sandboxType: task.sandbox?.type ?? "tempdir",
117
+ __fixtures: task.fixtures ?? [],
118
+ };
119
+ const tests = [
120
+ {
121
+ description: `${task.id} — ${task.title}`,
122
+ vars,
123
+ ...(assertions.length > 0 ? { assert: assertions } : {}),
124
+ },
125
+ ];
126
+ // Multi-turn support
127
+ if (task.multiTurn?.turns && task.multiTurn.turns.length > 0) {
128
+ tests.push({
129
+ description: `${task.id} — ${task.title} [multi-turn]`,
130
+ vars: {
131
+ ...vars,
132
+ __multiTurn: task.multiTurn.turns,
133
+ },
134
+ ...(assertions.length > 0 ? { assert: assertions } : {}),
135
+ });
136
+ }
137
+ return tests;
138
+ }
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Agent harness mode handler — compiles AgentHarnessTaskDefinition into Promptfoo config.
3
+ *
4
+ * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
5
+ * @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
6
+ */
7
+ import type { ModeHandler } from "../../../../_vendor/ailf-core/index.d.ts";
8
+ export { AGENT_HARNESS_PROMPT_TEMPLATES } from "./prompts.js";
9
+ export { TOOL_PRESETS } from "./tool-presets.js";
10
+ export { validateAgentHarnessTask } from "./validation.js";
11
+ export { mapAgentAssertion, buildFileExistsAssertion, buildFileContainsAssertion, buildCommandSucceedsAssertion, buildDiffMatchesAssertion, } from "./assertions.js";
12
+ export { buildLifecycleExtensions, buildBeforeEachHook, buildAfterEachHook, buildSandboxConfig, } from "./sandbox.js";
13
+ export { compileAgentHarnessTask, buildAgentProvider, buildAgentPrompts, buildAgentTestCases, } from "./compiler.js";
14
+ export type { AgentHarnessCompileOptions, AgentHarnessCompileResult, AgentHarnessValidationError, PromptfooExtension, SandboxConfigMeta, } from "./types.js";
15
+ /**
+  * ModeHandler-conformant export for the agent-harness evaluation mode.
+  *
+  * The implementation shipped alongside this declaration exposes
+  * `getPrompts()` (returns AGENT_HARNESS_PROMPT_TEMPLATES) and
+  * `compileTask(task, ctx)`, which throws when `task.mode` is not
+  * "agent-harness" and otherwise wraps `compileAgentHarnessTask`.
+  */
16
+ export declare const handler: ModeHandler;
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Agent harness mode handler — compiles AgentHarnessTaskDefinition into Promptfoo config.
3
+ *
4
+ * @see docs/exec-plans/architecture-overhaul/phase-4-agent-harness.md
5
+ * @see packages/core/src/types/generalized-task.ts — AgentHarnessTaskDefinition
6
+ */
7
+ import { compileAgentHarnessTask } from "./compiler.js";
8
+ import { AGENT_HARNESS_PROMPT_TEMPLATES } from "./prompts.js";
9
+ // Re-export public API
10
+ export { AGENT_HARNESS_PROMPT_TEMPLATES } from "./prompts.js";
11
+ export { TOOL_PRESETS } from "./tool-presets.js";
12
+ export { validateAgentHarnessTask } from "./validation.js";
13
+ export { mapAgentAssertion, buildFileExistsAssertion, buildFileContainsAssertion, buildCommandSucceedsAssertion, buildDiffMatchesAssertion, } from "./assertions.js";
14
+ export { buildLifecycleExtensions, buildBeforeEachHook, buildAfterEachHook, buildSandboxConfig, } from "./sandbox.js";
15
+ export { compileAgentHarnessTask, buildAgentProvider, buildAgentPrompts, buildAgentTestCases, } from "./compiler.js";
16
+ // ---------------------------------------------------------------------------
17
+ // ModeHandler adapter — wraps compileAgentHarnessTask for registry dispatch
18
+ // ---------------------------------------------------------------------------
19
/**
 * ModeHandler-conformant export for the agent-harness evaluation mode.
 *
 * Adapts `compileAgentHarnessTask` to the registry-facing handler shape.
 */
export const handler = {
    /** Returns the handler-owned prompt templates. */
    getPrompts() {
        return AGENT_HARNESS_PROMPT_TEMPLATES;
    },
    /**
     * Compiles an agent-harness task.
     *
     * @param task - Task to compile; its `mode` must be "agent-harness".
     * @param ctx - Compile context; `graderProvider` and `rootDir` are forwarded.
     * @returns `providers`, `tests`, `prompts` and `warnings` from the compiler,
     *   with `extensions` and `sandboxConfig` nested under `extras`.
     * @throws Error when the task is missing or its mode is not "agent-harness".
     */
    compileTask(task, ctx) {
        // Optional chaining keeps null/undefined/primitive tasks on the
        // descriptive error path; `"mode" in task` raises a bare TypeError
        // for any non-object operand.
        const mode = task?.mode;
        if (mode !== "agent-harness") {
            throw new Error(`Agent harness handler received task with mode "${mode ?? "undefined"}" — expected "agent-harness"`);
        }
        const { providers, tests, prompts, warnings, extensions, sandboxConfig } = compileAgentHarnessTask(task, {
            graderProvider: ctx.graderProvider,
            rootDir: ctx.rootDir,
        });
        return {
            providers,
            tests,
            prompts,
            warnings,
            extras: {
                extensions,
                sandboxConfig,
            },
        };
    },
};
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Canonical prompt templates for agent-harness-mode evaluations.
3
+ *
4
+ * Handler-owned prompts for agent harness evaluations. They describe the task
5
+ * for autonomous agent execution within a sandboxed environment with file
6
+ * system and tool access.
7
+ */
8
+ import type { PromptTemplate } from "../../../../_vendor/ailf-core/index.d.ts";
9
/**
 * Prompt templates for agent-harness evaluations, keyed by string id.
 * The implementation shipped alongside this declaration defines a single
 * "agent-harness" entry whose only template variable is `task`.
 */
+ export declare const AGENT_HARNESS_PROMPT_TEMPLATES: Record<string, PromptTemplate>;
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Canonical prompt templates for agent-harness-mode evaluations.
3
+ *
4
+ * Handler-owned prompts for agent harness evaluations. They describe the task
5
+ * for autonomous agent execution within a sandboxed environment with file
6
+ * system and tool access.
7
+ */
8
/**
 * The one agent-harness template: frames the agent as a sandboxed coding
 * agent, injects the task through the `{{task}}` placeholder and lists the
 * working instructions.
 */
const agentHarnessTemplate = {
    id: "agent-harness",
    label: "Agent Harness Task",
    template: `You are a coding agent working in a sandboxed environment. You have access to file system tools (read, write, edit) and a shell to complete the following task.

## Task
{{task}}

## Instructions

1. Read existing files to understand the project structure before making changes
2. Implement a complete, working solution — no placeholders or TODOs
3. Ensure all necessary imports and dependencies are included
4. Verify your implementation compiles and passes any provided test commands
5. Keep changes minimal and focused on the task

Complete the implementation:
`,
    variables: ["task"],
};
/** Template registry for the agent-harness mode, keyed by template id. */
export const AGENT_HARNESS_PROMPT_TEMPLATES = {
    "agent-harness": agentHarnessTemplate,
};
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Sandbox configuration and lifecycle extensions for agent harness tasks.
3
+ *
4
+ * Builds Promptfoo beforeEach/afterEach hooks for provisioning and
5
+ * tearing down sandbox working directories.
6
+ */
7
+ import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
8
+ import type { PromptfooExtension, SandboxConfigMeta } from "./types.js";
9
/** Builds the SandboxConfigMeta for an agent harness task. */
+ export declare function buildSandboxConfig(task: AgentHarnessTaskDefinition): SandboxConfigMeta;
10
/**
 * Builds the Promptfoo extensions for a task from its sandbox config.
 * Per the module header these are the beforeEach/afterEach lifecycle hooks.
 */
+ export declare function buildLifecycleExtensions(task: AgentHarnessTaskDefinition, sandboxConfig: SandboxConfigMeta): PromptfooExtension[];
11
/**
 * Returns the beforeEach hook for the given task id and sandbox config as a string.
 * NOTE(review): whether the string is inline hook source or a reference is not visible here — confirm.
 */
+ export declare function buildBeforeEachHook(taskId: string, config: SandboxConfigMeta): string;
12
/**
 * Returns the afterEach hook for the given task id as a string.
 * NOTE(review): presumably the teardown counterpart of buildBeforeEachHook — confirm against the implementation.
 */
+ export declare function buildAfterEachHook(taskId: string): string;