@sanity/ailf 2.0.2 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173) hide show
  1. package/LICENSE +21 -0
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
  3. package/dist/_vendor/ailf-core/examples/index.js +66 -1
  4. package/dist/agent-harness/assertions-runtime.d.ts +49 -0
  5. package/dist/agent-harness/assertions-runtime.js +138 -0
  6. package/dist/agent-harness/provider.d.ts +58 -0
  7. package/dist/agent-harness/provider.js +104 -0
  8. package/dist/cli.js +0 -0
  9. package/dist/commands/init.js +3 -0
  10. package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
  11. package/dist/orchestration/steps/generate-configs-step.js +35 -2
  12. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +39 -25
  13. package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
  14. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +9 -0
  15. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +28 -85
  16. package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
  17. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
  18. package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
  19. package/package.json +25 -24
  20. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +0 -10
  21. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +0 -185
  22. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +0 -6
  23. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +0 -42
  24. package/dist/_vendor/ailf-tasks/cli.d.ts +0 -8
  25. package/dist/_vendor/ailf-tasks/cli.js +0 -61
  26. package/dist/_vendor/ailf-tasks/index.d.ts +0 -13
  27. package/dist/_vendor/ailf-tasks/index.js +0 -16
  28. package/dist/_vendor/ailf-tasks/parser.d.ts +0 -27
  29. package/dist/_vendor/ailf-tasks/parser.js +0 -73
  30. package/dist/_vendor/ailf-tasks/schemas.d.ts +0 -198
  31. package/dist/_vendor/ailf-tasks/schemas.js +0 -180
  32. package/dist/_vendor/ailf-tasks/validation.d.ts +0 -47
  33. package/dist/_vendor/ailf-tasks/validation.js +0 -162
  34. package/dist/adapters/task-sources/yaml-task-source.d.ts +0 -18
  35. package/dist/adapters/task-sources/yaml-task-source.js +0 -139
  36. package/dist/agent-observer/test-imports.d.ts +0 -7
  37. package/dist/agent-observer/test-imports.js +0 -185
  38. package/dist/commands/update-quality-scores.d.ts +0 -5
  39. package/dist/commands/update-quality-scores.js +0 -20
  40. package/dist/lib/agent-behavior-report.d.ts +0 -8
  41. package/dist/lib/agent-behavior-report.js +0 -185
  42. package/dist/lib/baseline.d.ts +0 -19
  43. package/dist/lib/baseline.js +0 -153
  44. package/dist/lib/calculate-scores.d.ts +0 -23
  45. package/dist/lib/calculate-scores.js +0 -42
  46. package/dist/lib/compare.d.ts +0 -18
  47. package/dist/lib/compare.js +0 -170
  48. package/dist/lib/coverage-audit.d.ts +0 -4
  49. package/dist/lib/coverage-audit.js +0 -42
  50. package/dist/lib/discovery-report.d.ts +0 -13
  51. package/dist/lib/discovery-report.js +0 -57
  52. package/dist/lib/fetch-docs.d.ts +0 -30
  53. package/dist/lib/fetch-docs.js +0 -171
  54. package/dist/lib/generate-configs.d.ts +0 -25
  55. package/dist/lib/generate-configs.js +0 -42
  56. package/dist/lib/grader-api.d.ts +0 -21
  57. package/dist/lib/grader-api.js +0 -34
  58. package/dist/lib/grader-compare.d.ts +0 -19
  59. package/dist/lib/grader-compare.js +0 -91
  60. package/dist/lib/grader-consistency.d.ts +0 -27
  61. package/dist/lib/grader-consistency.js +0 -79
  62. package/dist/lib/grader-sensitivity.d.ts +0 -19
  63. package/dist/lib/grader-sensitivity.js +0 -75
  64. package/dist/lib/grader-validate.d.ts +0 -19
  65. package/dist/lib/grader-validate.js +0 -78
  66. package/dist/lib/measure-retrieval.d.ts +0 -14
  67. package/dist/lib/measure-retrieval.js +0 -71
  68. package/dist/lib/pr-comment.d.ts +0 -16
  69. package/dist/lib/pr-comment.js +0 -28
  70. package/dist/lib/readiness-report.d.ts +0 -13
  71. package/dist/lib/readiness-report.js +0 -108
  72. package/dist/lib/webhook-server.d.ts +0 -11
  73. package/dist/lib/webhook-server.js +0 -24
  74. package/dist/lib/weekly-digest.d.ts +0 -24
  75. package/dist/lib/weekly-digest.js +0 -148
  76. package/dist/orchestration/env-bridge.d.ts +0 -21
  77. package/dist/orchestration/env-bridge.js +0 -66
  78. package/dist/orchestration/steps/fetch-docs-shell.d.ts +0 -17
  79. package/dist/orchestration/steps/fetch-docs-shell.js +0 -30
  80. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +0 -9
  81. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +0 -339
  82. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +0 -70
  83. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +0 -485
  84. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +0 -76
  85. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +0 -245
  86. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +0 -89
  87. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +0 -379
  88. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +0 -50
  89. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +0 -334
  90. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +0 -69
  91. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +0 -307
  92. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +0 -65
  93. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +0 -368
  94. package/dist/pipeline/compiler/task-bridge.d.ts +0 -41
  95. package/dist/pipeline/compiler/task-bridge.js +0 -92
  96. package/dist/pipeline/expand-tasks.d.ts +0 -232
  97. package/dist/pipeline/expand-tasks.js +0 -467
  98. package/dist/pipeline/generate-configs.d.ts +0 -92
  99. package/dist/pipeline/generate-configs.js +0 -445
  100. package/dist/pipeline/steps/calculate-scores-step.d.ts +0 -11
  101. package/dist/pipeline/steps/calculate-scores-step.js +0 -89
  102. package/dist/pipeline/steps/compare-step.d.ts +0 -18
  103. package/dist/pipeline/steps/compare-step.js +0 -90
  104. package/dist/pipeline/steps/eval-step.d.ts +0 -53
  105. package/dist/pipeline/steps/eval-step.js +0 -347
  106. package/dist/pipeline/steps/fetch-docs-step.d.ts +0 -11
  107. package/dist/pipeline/steps/fetch-docs-step.js +0 -84
  108. package/dist/pipeline/steps/generate-configs-step.d.ts +0 -11
  109. package/dist/pipeline/steps/generate-configs-step.js +0 -98
  110. package/dist/pipeline/steps/grader-consistency-step.d.ts +0 -21
  111. package/dist/pipeline/steps/grader-consistency-step.js +0 -74
  112. package/dist/pipeline/steps/publish-report-step.d.ts +0 -57
  113. package/dist/pipeline/steps/publish-report-step.js +0 -243
  114. package/dist/pipeline/steps/report-step.d.ts +0 -13
  115. package/dist/pipeline/steps/report-step.js +0 -56
  116. package/dist/pipeline/steps/update-scores-step.d.ts +0 -11
  117. package/dist/pipeline/steps/update-scores-step.js +0 -42
  118. package/dist/scripts/agent-behavior-report.d.ts +0 -19
  119. package/dist/scripts/agent-behavior-report.js +0 -315
  120. package/dist/scripts/baseline.d.ts +0 -43
  121. package/dist/scripts/baseline.js +0 -267
  122. package/dist/scripts/calculate-scores.d.ts +0 -166
  123. package/dist/scripts/calculate-scores.js +0 -1296
  124. package/dist/scripts/compare.d.ts +0 -22
  125. package/dist/scripts/compare.js +0 -334
  126. package/dist/scripts/coverage-audit.d.ts +0 -44
  127. package/dist/scripts/coverage-audit.js +0 -209
  128. package/dist/scripts/debug-eval.d.ts +0 -19
  129. package/dist/scripts/debug-eval.js +0 -73
  130. package/dist/scripts/discovery-report.d.ts +0 -58
  131. package/dist/scripts/discovery-report.js +0 -250
  132. package/dist/scripts/fetch-docs.d.ts +0 -35
  133. package/dist/scripts/fetch-docs.js +0 -472
  134. package/dist/scripts/generate-configs.d.ts +0 -66
  135. package/dist/scripts/generate-configs.js +0 -459
  136. package/dist/scripts/grader-api.d.ts +0 -27
  137. package/dist/scripts/grader-api.js +0 -206
  138. package/dist/scripts/grader-compare.d.ts +0 -22
  139. package/dist/scripts/grader-compare.js +0 -368
  140. package/dist/scripts/grader-consistency.d.ts +0 -20
  141. package/dist/scripts/grader-consistency.js +0 -313
  142. package/dist/scripts/grader-sensitivity.d.ts +0 -22
  143. package/dist/scripts/grader-sensitivity.js +0 -354
  144. package/dist/scripts/grader-validate.d.ts +0 -19
  145. package/dist/scripts/grader-validate.js +0 -267
  146. package/dist/scripts/measure-retrieval.d.ts +0 -10
  147. package/dist/scripts/measure-retrieval.js +0 -145
  148. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +0 -24
  149. package/dist/scripts/migrate-tasks-to-content-lake.js +0 -328
  150. package/dist/scripts/pipeline.d.ts +0 -76
  151. package/dist/scripts/pipeline.js +0 -1031
  152. package/dist/scripts/pr-comment.d.ts +0 -10
  153. package/dist/scripts/pr-comment.js +0 -510
  154. package/dist/scripts/readiness-report.d.ts +0 -88
  155. package/dist/scripts/readiness-report.js +0 -342
  156. package/dist/scripts/update-quality-scores.d.ts +0 -15
  157. package/dist/scripts/update-quality-scores.js +0 -184
  158. package/dist/scripts/validate-task-sources.d.ts +0 -21
  159. package/dist/scripts/validate-task-sources.js +0 -210
  160. package/dist/scripts/validate.d.ts +0 -13
  161. package/dist/scripts/validate.js +0 -79
  162. package/dist/scripts/webhook-server.d.ts +0 -26
  163. package/dist/scripts/webhook-server.js +0 -147
  164. package/dist/scripts/weekly-digest.d.ts +0 -24
  165. package/dist/scripts/weekly-digest.js +0 -144
  166. package/dist/sinks/format-slack.d.ts +0 -64
  167. package/dist/sinks/format-slack.js +0 -306
  168. package/dist/sinks/slack-sink.d.ts +0 -27
  169. package/dist/sinks/slack-sink.js +0 -78
  170. package/dist/sinks/webhook-sink.d.ts +0 -19
  171. package/dist/sinks/webhook-sink.js +0 -50
  172. package/tasks/.expanded.agentic.yaml +0 -280
  173. package/tasks/.expanded.yaml +0 -565
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Sanity.io
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -112,6 +112,55 @@ export declare const ailfConfigData: {
112
112
  };
113
113
  /** Raw YAML string for ailf-config example (preserves comments) */
114
114
  export declare const ailfConfigYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# .ailf/config.yaml \u2014 AI Literacy Framework project configuration\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n# The API handles LLM calls, doc fetching, grading, and report\n# publishing. Your repo only needs one secret: AILF_API_KEY.\n#\n# Docs: https://github.com/sanity-labs/ai-literacy-framework\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Documentation source \u2014 which docs are being evaluated.\n#\n# This tells the pipeline which Sanity project and dataset contain\n# the documentation under test. 
For most users, this is Sanity's own\n# docs project.\n#\n# projectId \u2014 Sanity project ID (find yours at sanity.io/manage)\n# dataset \u2014 the dataset to query (e.g., \"production\", \"next\")\n# baseUrl \u2014 the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"3do82whm\"\n dataset: next\n baseUrl: \"https://www.sanity.io/docs\"\n\n# Trigger configuration \u2014 when evaluations run automatically.\n#\n# Each key is a trigger context. The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only \u2014 check that task YAML parses correctly (fast, no LLM calls)\n# eval \u2014 run the full evaluation pipeline\n#\n# paths \u2014 only trigger when files matching these globs change\n# blocking \u2014 if true, a failing eval blocks the PR merge\n# notify \u2014 if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
115
+ /** Parsed task data for example-agent-add-schema (JSON-safe) */
116
+ export declare const exampleAgentAddSchemaData: readonly [{
117
+ readonly mode: "agent-harness";
118
+ readonly id: "example-agent-add-schema";
119
+ readonly title: "Add a document schema to a Sanity Studio project";
120
+ readonly description: "Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project";
121
+ readonly area: "studio";
122
+ readonly sandbox: {
123
+ readonly type: "tempdir";
124
+ };
125
+ readonly tools: readonly ["coding"];
126
+ readonly fixtures: readonly ["file://apps/studio-basic"];
127
+ readonly prompt: {
128
+ readonly text: "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.";
129
+ };
130
+ readonly assertions: readonly [{
131
+ readonly type: "file-exists";
132
+ readonly value: "schemas/post.ts";
133
+ }, {
134
+ readonly type: "file-contains";
135
+ readonly value: {
136
+ readonly path: "schemas/post.ts";
137
+ readonly content: "defineType";
138
+ };
139
+ }, {
140
+ readonly type: "file-contains";
141
+ readonly value: {
142
+ readonly path: "schemas/post.ts";
143
+ readonly content: "defineField";
144
+ };
145
+ }, {
146
+ readonly type: "file-contains";
147
+ readonly value: {
148
+ readonly path: "schemas/index.ts";
149
+ readonly content: "post";
150
+ };
151
+ }, {
152
+ readonly type: "file-contains";
153
+ readonly value: {
154
+ readonly path: "sanity.config.ts";
155
+ readonly content: "defineConfig";
156
+ };
157
+ }];
158
+ readonly status: "draft";
159
+ }];
160
+ /** TypeScript task template for example-agent-add-schema */
161
+ export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. 
The agent can read, write,\n // edit files and run shell commands \u2014 then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example \u2014 tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" \u2192 Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" \u2192 coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" \u2192 Read, 
Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/studio-basic\" copies the\n // entire studio-basic app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/studio-basic\"],\n\n // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The task instructions sent to the agent. Be specific about expected\n // file paths and patterns \u2014 the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists \u2014 check a file was created\n // file-contains \u2014 check a file contains a substring\n // command-succeeds \u2014 run a shell command (exit 0 = pass)\n // diff-matches \u2014 check git diff contains a pattern\n // llm-rubric \u2014 LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
162
+ /** Generated YAML for example-agent-add-schema (from parsed TS data) */
163
+ export declare const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example \u2014 tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/studio-basic\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
115
164
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
116
165
  export declare const exampleGroqBlogListingData: readonly [{
117
166
  readonly mode: "literacy";
@@ -367,7 +416,7 @@ export declare const taskTsFiles: Record<string, string>;
367
416
  /** Map of task ID (filename stem) → generated YAML string */
368
417
  export declare const taskYamlFiles: Record<string, string>;
369
418
  /** List of task file stems, in alphabetical order */
370
- export declare const TASK_FILE_NAMES: readonly ["example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
419
+ export declare const TASK_FILE_NAMES: readonly ["example-agent-add-schema", "example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
371
420
  /** Task metadata for mode-based filtering in init and other consumers */
372
421
  export interface TaskExampleMeta {
373
422
  stem: string;
@@ -143,6 +143,67 @@ export const ailfConfigData = {
143
143
  };
144
144
  /** Raw YAML string for ailf-config example (preserves comments) */
145
145
  export const ailfConfigYaml = "# ──────────────────────────────────────────────────────────────────────\n# .ailf/config.yaml — AI Literacy Framework project configuration\n# ──────────────────────────────────────────────────────────────────────\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n# The API handles LLM calls, doc fetching, grading, and report\n# publishing. Your repo only needs one secret: AILF_API_KEY.\n#\n# Docs: https://github.com/sanity-labs/ai-literacy-framework\n# ──────────────────────────────────────────────────────────────────────\n\n# Documentation source — which docs are being evaluated.\n#\n# This tells the pipeline which Sanity project and dataset contain\n# the documentation under test. For most users, this is Sanity's own\n# docs project.\n#\n# projectId — Sanity project ID (find yours at sanity.io/manage)\n# dataset — the dataset to query (e.g., \"production\", \"next\")\n# baseUrl — the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"3do82whm\"\n dataset: next\n baseUrl: \"https://www.sanity.io/docs\"\n\n# Trigger configuration — when evaluations run automatically.\n#\n# Each key is a trigger context. 
The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only — check that task YAML parses correctly (fast, no LLM calls)\n# eval — run the full evaluation pipeline\n#\n# paths — only trigger when files matching these globs change\n# blocking — if true, a failing eval blocks the PR merge\n# notify — if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
146
+ /** Parsed task data for example-agent-add-schema (JSON-safe) */
147
+ export const exampleAgentAddSchemaData = [
148
+ {
149
+ "mode": "agent-harness",
150
+ "id": "example-agent-add-schema",
151
+ "title": "Add a document schema to a Sanity Studio project",
152
+ "description": "Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project",
153
+ "area": "studio",
154
+ "sandbox": {
155
+ "type": "tempdir"
156
+ },
157
+ "tools": [
158
+ "coding"
159
+ ],
160
+ "fixtures": [
161
+ "file://apps/studio-basic"
162
+ ],
163
+ "prompt": {
164
+ "text": "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema."
165
+ },
166
+ "assertions": [
167
+ {
168
+ "type": "file-exists",
169
+ "value": "schemas/post.ts"
170
+ },
171
+ {
172
+ "type": "file-contains",
173
+ "value": {
174
+ "path": "schemas/post.ts",
175
+ "content": "defineType"
176
+ }
177
+ },
178
+ {
179
+ "type": "file-contains",
180
+ "value": {
181
+ "path": "schemas/post.ts",
182
+ "content": "defineField"
183
+ }
184
+ },
185
+ {
186
+ "type": "file-contains",
187
+ "value": {
188
+ "path": "schemas/index.ts",
189
+ "content": "post"
190
+ }
191
+ },
192
+ {
193
+ "type": "file-contains",
194
+ "value": {
195
+ "path": "sanity.config.ts",
196
+ "content": "defineConfig"
197
+ }
198
+ }
199
+ ],
200
+ "status": "draft"
201
+ }
202
+ ];
203
+ /** TypeScript task template for example-agent-add-schema */
204
+ export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // ── Mode ──────────────────────────────────────────────────────\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. The agent can read, write,\n // edit files and run shell commands — then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example — tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. 
All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // ── Sandbox ───────────────────────────────────────────────────\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // ── Tools ─────────────────────────────────────────────────────\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" → Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" → Read, Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // ── Fixtures ──────────────────────────────────────────────────\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/studio-basic\" copies the\n // entire studio-basic app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/studio-basic\"],\n\n // ── Prompt ────────────────────────────────────────────────────\n // The task instructions sent to the agent. 
Be specific about expected\n // file paths and patterns — the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // ── Assertions ────────────────────────────────────────────────\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists — check a file was created\n // file-contains — check a file contains a substring\n // command-succeeds — run a shell command (exit 0 = pass)\n // diff-matches — check git diff contains a pattern\n // llm-rubric — LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
205
+ /** Generated YAML for example-agent-add-schema (from parsed TS data) */
206
+ export const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/studio-basic\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
146
207
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
147
208
  export const exampleGroqBlogListingData = [
148
209
  {
@@ -489,6 +550,7 @@ export const exampleStudioCustomInputYaml = "- mode: literacy\n id: example-stu
489
550
  // ---------------------------------------------------------------------------
490
551
  /** All task example data as a flat array (JSON-safe) */
491
552
  export const allTaskData = [
553
+ ...exampleAgentAddSchemaData,
492
554
  ...exampleGroqBlogListingData,
493
555
  ...exampleIdBasedRefData,
494
556
  ...exampleKnowledgeProbeData,
@@ -499,6 +561,7 @@ export const allTaskData = [
499
561
  ];
500
562
  /** Map of task ID (filename stem) → raw TypeScript source */
501
563
  export const taskTsFiles = {
564
+ "example-agent-add-schema": exampleAgentAddSchemaTs,
502
565
  "example-groq-blog-listing": exampleGroqBlogListingTs,
503
566
  "example-id-based-ref": exampleIdBasedRefTs,
504
567
  "example-knowledge-probe": exampleKnowledgeProbeTs,
@@ -509,6 +572,7 @@ export const taskTsFiles = {
509
572
  };
510
573
  /** Map of task ID (filename stem) → generated YAML string */
511
574
  export const taskYamlFiles = {
575
+ "example-agent-add-schema": exampleAgentAddSchemaYaml,
512
576
  "example-groq-blog-listing": exampleGroqBlogListingYaml,
513
577
  "example-id-based-ref": exampleIdBasedRefYaml,
514
578
  "example-knowledge-probe": exampleKnowledgeProbeYaml,
@@ -518,8 +582,9 @@ export const taskYamlFiles = {
518
582
  "example-studio-custom-input": exampleStudioCustomInputYaml,
519
583
  };
520
584
  /** List of task file stems, in alphabetical order */
521
- export const TASK_FILE_NAMES = ["example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
585
+ export const TASK_FILE_NAMES = ["example-agent-add-schema", "example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
522
586
  export const TASK_EXAMPLES = [
587
+ { stem: "example-agent-add-schema", mode: "agent-harness", status: "draft" },
523
588
  { stem: "example-groq-blog-listing", mode: "literacy", status: "draft" },
524
589
  { stem: "example-id-based-ref", mode: "literacy", status: "draft" },
525
590
  { stem: "example-knowledge-probe", mode: "knowledge-probe", status: "draft" },
@@ -0,0 +1,49 @@
1
+ /**
2
+ * assertions-runtime.ts — Runtime assertion functions for agent harness tasks.
3
+ *
4
+ * These run as file-based promptfoo JavaScript assertions, which execute
5
+ * in a full Node.js context (unlike inline `type: javascript` assertions
6
+ * which run in a restricted eval() sandbox without require()).
7
+ *
8
+ * Each export is a named function with the signature:
9
+ * (output: string, context: { vars, config, ... }) => GradingResult
10
+ *
11
+ * Referenced in promptfoo config as:
12
+ * value: file://dist/agent-harness/assertions-runtime.js:functionName
13
+ * config: { filePath: "...", ... }
14
+ *
15
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
16
+ */
17
+ interface GradingResult {
18
+ pass: boolean;
19
+ score: number;
20
+ reason: string;
21
+ }
22
+ interface AssertionContext {
23
+ vars: Record<string, unknown>;
24
+ config?: Record<string, unknown>;
25
+ }
26
+ /**
27
+ * Assert that a file exists in the sandbox working directory.
28
+ * Config: { filePath: string }
29
+ */
30
+ export declare function fileExists(_output: string, context: AssertionContext): GradingResult;
31
+ /**
32
+ * Assert that a file contains expected content.
33
+ * Config: { filePath: string, content: string }
34
+ */
35
+ export declare function fileContains(_output: string, context: AssertionContext): GradingResult;
36
+ /**
37
+ * Assert that a shell command succeeds (exit code 0) in the sandbox.
38
+ * Config: { command: string, timeoutMs?: number }
39
+ *
40
+ * SECURITY: Commands come from developer-authored task definitions,
41
+ * not from user input or LLM output.
42
+ */
43
+ export declare function commandSucceeds(_output: string, context: AssertionContext): GradingResult;
44
+ /**
45
+ * Assert that the git diff in the sandbox contains expected content.
46
+ * Config: { expected?: string } — when `expected` is omitted, the assertion passes if the diff is non-empty.
47
+ */
48
+ export declare function diffMatches(_output: string, context: AssertionContext): GradingResult;
49
+ export {};
@@ -0,0 +1,138 @@
1
+ /**
2
+ * assertions-runtime.ts — Runtime assertion functions for agent harness tasks.
3
+ *
4
+ * These run as file-based promptfoo JavaScript assertions, which execute
5
+ * in a full Node.js context (unlike inline `type: javascript` assertions
6
+ * which run in a restricted eval() sandbox without require()).
7
+ *
8
+ * Each export is a named function with the signature:
9
+ * (output: string, context: { vars, config, ... }) => GradingResult
10
+ *
11
+ * Referenced in promptfoo config as:
12
+ * value: file://dist/agent-harness/assertions-runtime.js:functionName
13
+ * config: { filePath: "...", ... }
14
+ *
15
+ * @see https://www.promptfoo.dev/docs/configuration/expected-outputs/javascript/
16
+ */
17
+ import { existsSync, readFileSync } from "fs";
18
+ import { execSync } from "child_process";
19
+ import { resolve, sep } from "path";
20
+ function resolveWorkDir(context) {
21
+ return resolve(context.vars.__workingDir || ".");
22
+ }
23
+ function guardTraversal(workDir, target, label) {
24
+ if (!target.startsWith(workDir + sep) && target !== workDir) {
25
+ return {
26
+ pass: false,
27
+ score: 0,
28
+ reason: `Path traversal: ${label} escapes sandbox`,
29
+ };
30
+ }
31
+ return null;
32
+ }
33
+ /**
34
+ * Assert that a file exists in the sandbox working directory.
35
+ * Config: { filePath: string }
36
+ */
37
+ export function fileExists(_output, context) {
38
+ const filePath = context.config?.filePath ?? "";
39
+ const workDir = resolveWorkDir(context);
40
+ const target = resolve(workDir, filePath);
41
+ const traversal = guardTraversal(workDir, target, filePath);
42
+ if (traversal)
43
+ return traversal;
44
+ const exists = existsSync(target);
45
+ return {
46
+ pass: exists,
47
+ score: exists ? 1 : 0,
48
+ reason: exists
49
+ ? `File exists: ${filePath}`
50
+ : `Expected file not found: ${filePath}`,
51
+ };
52
+ }
53
+ /**
54
+ * Assert that a file contains expected content.
55
+ * Config: { filePath: string, content: string }
56
+ */
57
+ export function fileContains(_output, context) {
58
+ const filePath = context.config?.filePath ?? "";
59
+ const expected = context.config?.content ?? "";
60
+ const workDir = resolveWorkDir(context);
61
+ const target = resolve(workDir, filePath);
62
+ const traversal = guardTraversal(workDir, target, filePath);
63
+ if (traversal)
64
+ return traversal;
65
+ if (!existsSync(target)) {
66
+ return { pass: false, score: 0, reason: `File not found: ${filePath}` };
67
+ }
68
+ const content = readFileSync(target, "utf-8");
69
+ const contains = content.includes(expected);
70
+ return {
71
+ pass: contains,
72
+ score: contains ? 1 : 0,
73
+ reason: contains
74
+ ? "File contains expected content"
75
+ : "File does not contain expected content",
76
+ };
77
+ }
78
+ /**
79
+ * Assert that a shell command succeeds (exit code 0) in the sandbox.
80
+ * Config: { command: string, timeoutMs?: number }
81
+ *
82
+ * SECURITY: Commands come from developer-authored task definitions,
83
+ * not from user input or LLM output.
84
+ */
85
+ export function commandSucceeds(_output, context) {
86
+ const command = context.config?.command ?? "";
87
+ const timeoutMs = context.config?.timeoutMs ?? 30000;
88
+ const workDir = context.vars.__workingDir || ".";
89
+ try {
90
+ execSync(command, { cwd: workDir, timeout: timeoutMs });
91
+ return { pass: true, score: 1, reason: `Command succeeded: ${command}` };
92
+ }
93
+ catch (err) {
94
+ const error = err;
95
+ return {
96
+ pass: false,
97
+ score: 0,
98
+ reason: `Command failed: ${error.message || String(err)}`,
99
+ };
100
+ }
101
+ }
102
+ /**
103
+ * Assert that the git diff in the sandbox contains expected content.
104
+ * Config: { expected?: string }
105
+ */
106
+ export function diffMatches(_output, context) {
107
+ const expected = context.config?.expected;
108
+ const workDir = context.vars.__workingDir || ".";
109
+ try {
110
+ const diff = execSync("git diff", {
111
+ cwd: workDir,
112
+ encoding: "utf-8",
113
+ });
114
+ if (typeof expected === "string") {
115
+ const contains = diff.includes(expected);
116
+ return {
117
+ pass: contains,
118
+ score: contains ? 1 : 0,
119
+ reason: contains
120
+ ? "Diff matches expected pattern"
121
+ : "Diff does not match",
122
+ };
123
+ }
124
+ return {
125
+ pass: diff.length > 0,
126
+ score: diff.length > 0 ? 1 : 0,
127
+ reason: "Diff exists",
128
+ };
129
+ }
130
+ catch (err) {
131
+ const error = err;
132
+ return {
133
+ pass: false,
134
+ score: 0,
135
+ reason: `Failed to get diff: ${error.message}`,
136
+ };
137
+ }
138
+ }
@@ -0,0 +1,58 @@
1
+ /**
2
+ * provider.ts — Agent harness Promptfoo custom provider.
3
+ *
4
+ * Handles agent-harness evaluation tasks by calling the Anthropic API
5
+ * with the task prompt and returning the response. Sandbox provisioning
6
+ * and teardown are handled by lifecycle extensions (beforeEach/afterEach),
7
+ * not by the provider itself.
8
+ *
9
+ * Promptfoo config usage:
10
+ *
11
+ * providers:
12
+ * - id: file://dist/agent-harness/provider.js
13
+ * label: "Agent Harness: My Task"
14
+ * config:
15
+ * allowedTools: ["Bash", "Read", "Write"]
16
+ * sandbox: { type: "tempdir" }
17
+ *
18
+ * Promptfoo loads this file and instantiates the default export class.
19
+ */
20
+ interface CallApiContextParams {
21
+ prompt?: {
22
+ raw: string;
23
+ label?: string;
24
+ };
25
+ vars?: Record<string, object | string>;
26
+ }
27
+ interface ProviderOptions {
28
+ config?: Record<string, unknown>;
29
+ id?: string;
30
+ }
31
+ interface ProviderResponse {
32
+ cached?: boolean;
33
+ cost?: number;
34
+ error?: string;
35
+ metadata?: Record<string, unknown>;
36
+ output?: object | string;
37
+ tokenUsage?: {
38
+ total?: number;
39
+ prompt?: number;
40
+ completion?: number;
41
+ cached?: number;
42
+ };
43
+ }
44
+ export default class AgentHarnessProvider {
45
+ config: Record<string, unknown>;
46
+ private providerId;
47
+ constructor(options: ProviderOptions);
48
+ id(): string;
49
+ /**
50
+ * Main Promptfoo provider entry point. Called for each test case.
51
+ *
52
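+ * Sends the task prompt to the Anthropic API and returns the response. Failures (missing API key, non-2xx response, network error) are reported through the `error` field of the result rather than thrown.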
53
+ * The sandbox working directory is available in context.vars.__workingDir
54
+ * (set by the beforeEach extension).
55
+ */
56
+ callApi(prompt: string, context?: CallApiContextParams): Promise<ProviderResponse>;
57
+ }
58
+ export {};
@@ -0,0 +1,104 @@
1
+ /**
2
+ * provider.ts — Agent harness Promptfoo custom provider.
3
+ *
4
+ * Handles agent-harness evaluation tasks by calling the Anthropic API
5
+ * with the task prompt and returning the response. Sandbox provisioning
6
+ * and teardown are handled by lifecycle extensions (beforeEach/afterEach),
7
+ * not by the provider itself.
8
+ *
9
+ * Promptfoo config usage:
10
+ *
11
+ * providers:
12
+ * - id: file://dist/agent-harness/provider.js
13
+ * label: "Agent Harness: My Task"
14
+ * config:
15
+ * allowedTools: ["Bash", "Read", "Write"]
16
+ * sandbox: { type: "tempdir" }
17
+ *
18
+ * Promptfoo loads this file and instantiates the default export class.
19
+ */
import { fileURLToPath } from "url";
import { config as loadDotenv } from "dotenv";
// Load the .env file two directories above this module.
// `URL.pathname` is not a filesystem path: it stays percent-encoded (a space
// becomes "%20") and has a leading slash before the drive letter on Windows,
// so the URL is converted with fileURLToPath instead.
// `override: true` lets values from that file replace variables already
// present in process.env.
loadDotenv({
    override: true,
    path: fileURLToPath(new URL("../../.env", import.meta.url)),
});
25
+ // ---------------------------------------------------------------------------
26
+ // Provider implementation
27
+ // ---------------------------------------------------------------------------
28
+ export default class AgentHarnessProvider {
29
+ config;
30
+ providerId;
31
+ constructor(options) {
32
+ this.providerId = options.id ?? "agent-harness";
33
+ this.config = options.config ?? {};
34
+ }
35
+ id() {
36
+ return this.providerId;
37
+ }
38
+ /**
39
+ * Main Promptfoo provider entry point. Called for each test case.
40
+ *
41
+ * Sends the task prompt to the Anthropic API and returns the response.
42
+ * The sandbox working directory is available in context.vars.__workingDir
43
+ * (set by the beforeEach extension).
44
+ */
45
+ async callApi(prompt, context) {
46
+ const apiKey = process.env.ANTHROPIC_API_KEY;
47
+ if (!apiKey) {
48
+ return {
49
+ error: "ANTHROPIC_API_KEY not set. Required for agent-harness evaluation.",
50
+ };
51
+ }
52
+ const model = this.config.model ?? "claude-sonnet-4-20250514";
53
+ const maxTokens = this.config.maxTokens ?? 4096;
54
+ const workingDir = context?.vars?.__workingDir;
55
+ const systemPrompt = workingDir
56
+ ? `You are an AI coding agent. Your working directory is: ${workingDir}\nComplete the following task.`
57
+ : "You are an AI coding agent. Complete the following task.";
58
+ try {
59
+ const response = await fetch("https://api.anthropic.com/v1/messages", {
60
+ method: "POST",
61
+ headers: {
62
+ "Content-Type": "application/json",
63
+ "x-api-key": apiKey,
64
+ "anthropic-version": "2023-06-01",
65
+ },
66
+ body: JSON.stringify({
67
+ model,
68
+ max_tokens: maxTokens,
69
+ system: systemPrompt,
70
+ messages: [{ role: "user", content: prompt }],
71
+ }),
72
+ });
73
+ if (!response.ok) {
74
+ const errorBody = await response.text();
75
+ return {
76
+ error: `Anthropic API error (${response.status}): ${errorBody}`,
77
+ };
78
+ }
79
+ const data = (await response.json());
80
+ const output = data.content
81
+ .filter((block) => block.type === "text")
82
+ .map((block) => block.text)
83
+ .join("\n");
84
+ return {
85
+ output,
86
+ tokenUsage: {
87
+ prompt: data.usage?.input_tokens,
88
+ completion: data.usage?.output_tokens,
89
+ total: (data.usage?.input_tokens ?? 0) + (data.usage?.output_tokens ?? 0),
90
+ },
91
+ metadata: {
92
+ model,
93
+ workingDir,
94
+ allowedTools: this.config.allowedTools,
95
+ sandboxType: this.config.sandbox?.type ?? "none",
96
+ },
97
+ };
98
+ }
99
+ catch (err) {
100
+ const error = err;
101
+ return { error: `Agent harness provider error: ${error.message}` };
102
+ }
103
+ }
104
+ }
package/dist/cli.js CHANGED
File without changes
@@ -138,6 +138,9 @@ async function runInit(opts) {
138
138
  else if (modeFilter === "knowledge-probe") {
139
139
  stemsToWrite = taskStemsForMode("knowledge-probe");
140
140
  }
141
+ else if (modeFilter === "agent-harness") {
142
+ stemsToWrite = taskStemsForMode("agent-harness");
143
+ }
141
144
  else {
142
145
  // Default (no --mode): write all tasks
143
146
  stemsToWrite = [...TASK_FILE_NAMES];
@@ -11,12 +11,19 @@
11
11
  import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
12
12
  export declare class GenerateConfigsStep implements PipelineStep {
13
13
  readonly name = "generate-configs";
14
+ /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
15
+ private lastLoadedTaskIds;
14
16
  check(ctx: AppContext): ValidationIssue[];
15
17
  execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
16
18
  private compileLiteracyVariants;
17
19
  private compileSingleMode;
18
20
  private loadTasks;
19
21
  private applyFilters;
22
+ /**
23
+ * Build a descriptive error message when no tasks match the current filters.
24
+ * Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
25
+ */
26
+ private buildNoTasksError;
20
27
  /**
21
28
  * Compile all tasks through a handler, merging results.
22
29
  * For literacy mode, ctx can carry evalMode as an extension.