@sanity/ailf 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (288)
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  14. package/dist/_vendor/ailf-core/index.js +5 -0
  15. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  16. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  17. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  18. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  19. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  20. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  21. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  22. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  23. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  24. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  25. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  26. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +14 -29
  27. package/dist/_vendor/ailf-core/schemas/pipeline.js +17 -8
  28. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  29. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  30. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  31. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  32. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  33. package/dist/_vendor/ailf-core/services/index.js +2 -1
  34. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  35. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  36. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  37. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  38. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  39. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  40. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  41. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  42. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  43. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  44. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  45. package/dist/_vendor/ailf-core/types/index.js +8 -1
  46. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  47. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  48. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  49. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  50. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  51. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  52. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  53. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  54. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  55. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  56. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  57. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  58. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  59. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  60. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  61. package/dist/_vendor/ailf-shared/index.js +0 -1
  62. package/dist/adapters/api-client/build-request.js +14 -13
  63. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  64. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  65. package/dist/adapters/config-sources/index.d.ts +2 -0
  66. package/dist/adapters/config-sources/index.js +1 -0
  67. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  68. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  69. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  70. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  71. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  72. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  73. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  74. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  75. package/dist/adapters/task-sources/index.d.ts +1 -0
  76. package/dist/adapters/task-sources/index.js +1 -0
  77. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  78. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  79. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  80. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  81. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  82. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  83. package/dist/cli.js +0 -2
  84. package/dist/commands/baseline.js +4 -1
  85. package/dist/commands/calculate-scores.js +1 -1
  86. package/dist/commands/coverage-audit.js +7 -1
  87. package/dist/commands/explain-handler.js +25 -23
  88. package/dist/commands/fetch-docs.js +3 -2
  89. package/dist/commands/generate-configs.js +1 -1
  90. package/dist/commands/interactive.js +11 -7
  91. package/dist/commands/pipeline-action.d.ts +2 -0
  92. package/dist/commands/pipeline-action.js +16 -6
  93. package/dist/commands/pipeline.d.ts +1 -0
  94. package/dist/commands/pipeline.js +4 -2
  95. package/dist/commands/pr-comment.js +1 -1
  96. package/dist/commands/publish.js +2 -2
  97. package/dist/commands/readiness-report.js +13 -6
  98. package/dist/composition-root.d.ts +1 -1
  99. package/dist/composition-root.js +67 -4
  100. package/dist/orchestration/build-app-context.js +1 -0
  101. package/dist/orchestration/build-step-sequence.js +24 -6
  102. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  103. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  104. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  105. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  106. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  107. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  108. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  109. package/dist/orchestration/steps/readiness-step.js +5 -6
  110. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  111. package/dist/orchestration/steps/run-eval-step.js +8 -7
  112. package/dist/pipeline/cache.d.ts +1 -1
  113. package/dist/pipeline/cache.js +36 -8
  114. package/dist/pipeline/calculate-scores.d.ts +2 -4
  115. package/dist/pipeline/calculate-scores.js +43 -113
  116. package/dist/pipeline/checks.js +2 -2
  117. package/dist/pipeline/compare.js +8 -8
  118. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  119. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  120. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  121. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  122. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  123. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  124. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  126. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  128. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  129. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  130. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  131. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  132. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  133. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  136. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  137. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  138. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  139. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  140. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  141. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  142. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  143. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  144. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  145. package/dist/pipeline/compiler/config-loader.js +111 -0
  146. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  147. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  148. package/dist/pipeline/compiler/hash.d.ts +11 -0
  149. package/dist/pipeline/compiler/hash.js +18 -0
  150. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  151. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  152. package/dist/pipeline/compiler/index.d.ts +29 -0
  153. package/dist/pipeline/compiler/index.js +45 -0
  154. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  155. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  156. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  157. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  162. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  163. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  164. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  165. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  166. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  167. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  168. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  169. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  170. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  171. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  174. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  175. package/dist/pipeline/compiler/presets/index.js +8 -0
  176. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  177. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  178. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  179. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  180. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  181. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  182. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  183. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  184. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  185. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  186. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  187. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  188. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  189. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  190. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  191. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  194. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  195. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  196. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  197. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  198. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  199. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  200. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  201. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  202. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  203. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  204. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  205. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  206. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  207. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  208. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  209. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  210. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  211. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  212. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  213. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  214. package/dist/pipeline/coverage-audit.d.ts +15 -5
  215. package/dist/pipeline/coverage-audit.js +41 -22
  216. package/dist/pipeline/eval-constants.d.ts +16 -6
  217. package/dist/pipeline/eval-constants.js +25 -4
  218. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  219. package/dist/pipeline/eval-fingerprint.js +8 -9
  220. package/dist/pipeline/expand-tasks.d.ts +19 -10
  221. package/dist/pipeline/expand-tasks.js +34 -28
  222. package/dist/pipeline/gap-analysis.d.ts +1 -1
  223. package/dist/pipeline/gap-analysis.js +2 -2
  224. package/dist/pipeline/generate-configs.d.ts +22 -4
  225. package/dist/pipeline/generate-configs.js +53 -24
  226. package/dist/pipeline/grader-api.d.ts +3 -3
  227. package/dist/pipeline/grader-api.js +5 -12
  228. package/dist/pipeline/grader-compare-runner.js +20 -27
  229. package/dist/pipeline/grader-comparison.d.ts +4 -8
  230. package/dist/pipeline/grader-comparison.js +11 -17
  231. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  232. package/dist/pipeline/grader-consistency-runner.js +16 -20
  233. package/dist/pipeline/grader-consistency.d.ts +6 -10
  234. package/dist/pipeline/grader-consistency.js +13 -32
  235. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  236. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  237. package/dist/pipeline/grader-sensitivity.js +10 -10
  238. package/dist/pipeline/grader-validate-runner.js +7 -5
  239. package/dist/pipeline/grader-validation.d.ts +2 -6
  240. package/dist/pipeline/grader-validation.js +14 -22
  241. package/dist/pipeline/map-request-to-config.js +6 -1
  242. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  243. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  244. package/dist/pipeline/normalize-mode.d.ts +49 -0
  245. package/dist/pipeline/normalize-mode.js +64 -0
  246. package/dist/pipeline/plan.d.ts +5 -2
  247. package/dist/pipeline/plan.js +134 -78
  248. package/dist/pipeline/pr-comment.js +2 -0
  249. package/dist/pipeline/profile-resolution.d.ts +22 -14
  250. package/dist/pipeline/profile-resolution.js +41 -19
  251. package/dist/pipeline/provenance.d.ts +2 -2
  252. package/dist/pipeline/provenance.js +12 -17
  253. package/dist/pipeline/release-report.js +4 -4
  254. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  255. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  256. package/dist/pipeline/rubric-loader.d.ts +20 -0
  257. package/dist/pipeline/rubric-loader.js +37 -0
  258. package/dist/pipeline/validate.d.ts +4 -4
  259. package/dist/pipeline/validate.js +64 -53
  260. package/dist/schedules/loader.js +18 -8
  261. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  262. package/dist/scripts/migrate-task-mode.js +85 -0
  263. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  264. package/dist/scripts/validate-task-sources.d.ts +1 -1
  265. package/dist/scripts/validate-task-sources.js +15 -15
  266. package/dist/sinks/loader.js +5 -7
  267. package/dist/sources.d.ts +7 -7
  268. package/dist/sources.js +22 -24
  269. package/dist/webhook/dispatch.js +2 -1
  270. package/package.json +6 -3
  271. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  272. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  273. package/tasks/literacy/frameworks.task.ts +128 -0
  274. package/tasks/literacy/functions.task.ts +69 -0
  275. package/tasks/literacy/groq.task.ts +258 -0
  276. package/tasks/literacy/nextjs-live.task.ts +75 -0
  277. package/tasks/literacy/studio-setup.task.ts +131 -0
  278. package/tasks/literacy/visual-editing.task.ts +146 -0
  279. package/config/features.yaml +0 -116
  280. package/config/models.yaml +0 -116
  281. package/config/prompts.yaml +0 -75
  282. package/config/rubrics.yaml +0 -81
  283. package/config/schedules.yaml +0 -43
  284. package/config/sinks.yaml +0 -54
  285. package/config/sources.yaml +0 -51
  286. package/config/thresholds.yaml +0 -49
  287. package/dist/agent-observer/test-imports.d.ts +0 -7
  288. package/dist/agent-observer/test-imports.js +0 -185
@@ -0,0 +1,14 @@ package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts
1
+ /**
2
+ * Example agent harness task definitions — demonstrates the agent-harness mode.
3
+ *
4
+ * Three tasks of increasing complexity:
5
+ * 1. Scaffold a project from template
6
+ * 2. Modify existing code following docs
7
+ * 3. Multi-file refactoring task
8
+ */
9
+ import type { AgentHarnessTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
10
+ export declare const scaffoldProjectTask: AgentHarnessTaskDefinition;
11
+ export declare const modifyCodeTask: AgentHarnessTaskDefinition;
12
+ export declare const multiFileRefactorTask: AgentHarnessTaskDefinition;
13
+ /** All example agent harness tasks */
14
+ export declare const allAgentHarnessExampleTasks: AgentHarnessTaskDefinition[];
@@ -0,0 +1,152 @@ package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js
1
+ /**
2
+ * Example agent harness task definitions — demonstrates the agent-harness mode.
3
+ *
4
+ * Three tasks of increasing complexity:
5
+ * 1. Scaffold a project from template
6
+ * 2. Modify existing code following docs
7
+ * 3. Multi-file refactoring task
8
+ */
9
+ // ---------------------------------------------------------------------------
10
+ // Task 1: Scaffold a project from template
11
+ // ---------------------------------------------------------------------------
12
+ export const scaffoldProjectTask = {
13
+ mode: "agent-harness",
14
+ id: "agent-scaffold-project",
15
+ title: "Scaffold a Sanity Studio Project",
16
+ description: "The agent should create a new Sanity Studio project following the " +
17
+ "official documentation. It should use the CLI to initialize the project, " +
18
+ "configure sanity.config.ts, and set up basic schema types.",
19
+ area: "studio",
20
+ difficulty: "basic",
21
+ tags: ["agent", "scaffold", "studio"],
22
+ sandbox: {
23
+ type: "tempdir",
24
+ },
25
+ tools: ["coding"],
26
+ fixtures: ["file://fixtures/package-scaffold.json"],
27
+ prompt: {
28
+ text: "Create a new Sanity Studio project with the following requirements:\n" +
29
+ "1. Initialize with `npm create sanity@latest`\n" +
30
+ "2. Configure sanity.config.ts with project ID 'test-project' and dataset 'production'\n" +
31
+ "3. Create a 'post' schema type with title, slug, body, and author fields\n" +
32
+ "4. Ensure the project builds without errors",
33
+ vars: {
34
+ task: "Scaffold a Sanity Studio project with a post schema type. " +
35
+ "The project should build cleanly.",
36
+ },
37
+ },
38
+ assertions: [
39
+ { type: "file-exists", value: "sanity.config.ts" },
40
+ { type: "file-exists", value: "schemaTypes/post.ts" },
41
+ {
42
+ type: "file-contains",
43
+ value: { path: "sanity.config.ts", content: "test-project" },
44
+ },
45
+ { type: "command-succeeds", value: "npx tsc --noEmit" },
46
+ ],
47
+ options: {
48
+ timeout: 120_000,
49
+ },
50
+ };
51
+ // ---------------------------------------------------------------------------
52
+ // Task 2: Modify existing code following docs
53
+ // ---------------------------------------------------------------------------
54
+ export const modifyCodeTask = {
55
+ mode: "agent-harness",
56
+ id: "agent-modify-schema",
57
+ title: "Add Document Actions Following Docs",
58
+ description: "Given an existing Studio project, the agent should add custom document " +
59
+ "actions following the Sanity documentation. The starting codebase has a " +
60
+ "basic Studio setup; the agent needs to create a custom publish action.",
61
+ area: "studio",
62
+ difficulty: "intermediate",
63
+ tags: ["agent", "modify", "document-actions"],
64
+ sandbox: {
65
+ type: "tempdir",
66
+ },
67
+ tools: ["coding"],
68
+ fixtures: ["file://fixtures/existing-studio-project/"],
69
+ prompt: {
70
+ text: "In the existing Sanity Studio project, add a custom document action " +
71
+ "that logs a message before publishing. Follow the Sanity docs for " +
72
+ "custom document actions.",
73
+ vars: {
74
+ task: "Add a custom document action that wraps the default publish action " +
75
+ "and logs 'Publishing document: <title>' before executing.",
76
+ },
77
+ },
78
+ assertions: [
79
+ { type: "file-exists", value: "actions/logPublishAction.ts" },
80
+ {
81
+ type: "file-contains",
82
+ value: {
83
+ path: "actions/logPublishAction.ts",
84
+ content: "useDocumentOperation",
85
+ },
86
+ },
87
+ {
88
+ type: "file-contains",
89
+ value: {
90
+ path: "sanity.config.ts",
91
+ content: "document.actions",
92
+ },
93
+ },
94
+ ],
95
+ options: {
96
+ timeout: 90_000,
97
+ },
98
+ };
99
+ // ---------------------------------------------------------------------------
100
+ // Task 3: Multi-file refactoring task
101
+ // ---------------------------------------------------------------------------
102
+ export const multiFileRefactorTask = {
103
+ mode: "agent-harness",
104
+ id: "agent-migrate-api",
105
+ title: "Migrate from Client v5 to v6 API",
106
+ description: "The agent should migrate a multi-file project from the Sanity client v5 " +
107
+ "API to the v6 API, updating import paths, method calls, and configuration " +
108
+ "patterns across all files.",
109
+ area: "mutations",
110
+ difficulty: "advanced",
111
+ tags: ["agent", "migration", "refactoring"],
112
+ sandbox: {
113
+ type: "docker",
114
+ image: "node:22-slim",
115
+ limits: {
116
+ cpus: 2,
117
+ memoryBytes: 2_147_483_648, // 2GB
118
+ networkAccess: false,
119
+ },
120
+ },
121
+ tools: ["coding", "WebSearch"],
122
+ fixtures: ["file://fixtures/v5-project/"],
123
+ prompt: {
124
+ text: "Migrate this project from @sanity/client v5 to v6. Update all:\n" +
125
+ "1. Import statements (createClient instead of SanityClient)\n" +
126
+ "2. Client configuration (new config shape)\n" +
127
+ "3. Query method calls (fetch → client.fetch with new signature)\n" +
128
+ "4. Mutation helpers (create/patch/delete API changes)\n" +
129
+ "Ensure the project compiles after migration.",
130
+ vars: {
131
+ task: "Migrate the codebase from @sanity/client v5 to v6, " +
132
+ "updating all files. Project must compile cleanly after migration.",
133
+ },
134
+ },
135
+ assertions: [
136
+ {
137
+ type: "file-contains",
138
+ value: { path: "lib/sanity.ts", content: "createClient" },
139
+ },
140
+ { type: "command-succeeds", value: "npx tsc --noEmit" },
141
+ { type: "diff-matches", value: "createClient" },
142
+ ],
143
+ options: {
144
+ timeout: 180_000,
145
+ },
146
+ };
147
+ /** All example agent harness tasks */
148
+ export const allAgentHarnessExampleTasks = [
149
+ scaffoldProjectTask,
150
+ modifyCodeTask,
151
+ multiFileRefactorTask,
152
+ ];
@@ -0,0 +1,32 @@ package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts
1
+ /**
2
+ * Example knowledge probe task definitions — demonstrates the knowledge-probe mode.
3
+ *
4
+ * These tasks measure raw model knowledge without documentation context.
5
+ * They answer: "What does this model know about X without any help?"
6
+ *
7
+ * Three tasks of increasing breadth:
8
+ * 1. GROQ projection syntax — deep, specific query language knowledge
9
+ * 2. defineType API — recent API knowledge + currency dimension
10
+ * 3. Sanity vs Contentful — broad ecosystem understanding
11
+ */
12
+ import type { KnowledgeProbeTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
13
+ /**
14
+ * Tests deep knowledge of Sanity's query language.
15
+ * Models should demonstrate understanding of projection syntax,
16
+ * coercion, dereferencing, and the slice operator.
17
+ */
18
+ export declare const groqProjectionTask: KnowledgeProbeTaskDefinition;
19
+ /**
20
+ * Tests knowledge of the schema definition API, including recent changes.
21
+ * The currency dimension is key here — models should know about defineType,
22
+ * defineField, and the typed schema helpers, not the legacy untyped API.
23
+ */
24
+ export declare const defineTypeApiTask: KnowledgeProbeTaskDefinition;
25
+ /**
26
+ * Tests broader ecosystem knowledge and the ability to articulate
27
+ * differences between Sanity and a major competitor. This probes
28
+ * conceptual understanding, not just API syntax.
29
+ */
30
+ export declare const ecosystemComparisonTask: KnowledgeProbeTaskDefinition;
31
+ /** All example knowledge probe tasks */
32
+ export declare const allKnowledgeProbeExampleTasks: KnowledgeProbeTaskDefinition[];
@@ -0,0 +1,176 @@ package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js
1
+ /**
2
+ * Example knowledge probe task definitions — demonstrates the knowledge-probe mode.
3
+ *
4
+ * These tasks measure raw model knowledge without documentation context.
5
+ * They answer: "What does this model know about X without any help?"
6
+ *
7
+ * Three tasks of increasing breadth:
8
+ * 1. GROQ projection syntax — deep, specific query language knowledge
9
+ * 2. defineType API — recent API knowledge + currency dimension
10
+ * 3. Sanity vs Contentful — broad ecosystem understanding
11
+ */
12
+ // ---------------------------------------------------------------------------
13
+ // Task 1: GROQ projection syntax
14
+ // ---------------------------------------------------------------------------
15
+ /**
16
+ * Tests deep knowledge of Sanity's query language.
17
+ * Models should demonstrate understanding of projection syntax,
18
+ * coercion, dereferencing, and the slice operator.
19
+ */
20
+ export const groqProjectionTask = {
21
+ mode: "knowledge-probe",
22
+ id: "kp-groq-projections",
23
+ title: "Explain GROQ Projection Syntax",
24
+ description: "Explain GROQ's projection syntax in detail, including: " +
25
+ "object projections ({}), array projections, spread operator (...), " +
26
+ "computed field names, the dereference operator (->), and " +
27
+ "conditional projections using select().",
28
+ area: "groq",
29
+ difficulty: "intermediate",
30
+ tags: ["knowledge-probe", "groq", "syntax"],
31
+ probeStrategy: "depth-first",
32
+ prompt: {
33
+ text: "Explain GROQ's projection syntax in Sanity. Cover these topics:\n\n" +
34
+ "1. Basic object projections with `{}`\n" +
35
+ "2. Nested projections and the spread operator `...`\n" +
36
+ "3. Computed field names\n" +
37
+ "4. The dereference operator `->` for following references\n" +
38
+ "5. Array slicing with `[0..5]` and `[0...5]`\n" +
39
+ "6. Conditional projections using `select()`\n\n" +
40
+ "Provide working code examples for each.",
41
+ vars: {
42
+ task: "Explain GROQ projection syntax with working code examples " +
43
+ "covering projections, spread, dereference, slicing, and select().",
44
+ },
45
+ },
46
+ assertions: [
47
+ { type: "contains", value: "->" },
48
+ { type: "contains", value: "select(" },
49
+ {
50
+ type: "llm-rubric",
51
+ value: "The response should demonstrate accurate knowledge of GROQ " +
52
+ "projection syntax with working code examples. Check that the " +
53
+ "dereference operator, spread syntax, and select() are correctly " +
54
+ "explained with valid GROQ code.",
55
+ weight: 0.6,
56
+ },
57
+ {
58
+ type: "llm-rubric",
59
+ value: "Evaluate whether the response reflects current GROQ syntax " +
60
+ "(post-2023). Check for deprecated patterns or outdated " +
61
+ "recommendations.",
62
+ weight: 0.4,
63
+ },
64
+ ],
65
+ };
66
+ // ---------------------------------------------------------------------------
67
+ // Task 2: defineType API
68
+ // ---------------------------------------------------------------------------
69
+ /**
70
+ * Tests knowledge of the schema definition API, including recent changes.
71
+ * The currency dimension is key here — models should know about defineType,
72
+ * defineField, and the typed schema helpers, not the legacy untyped API.
73
+ */
74
+ export const defineTypeApiTask = {
75
+ mode: "knowledge-probe",
76
+ id: "kp-define-type-api",
77
+ title: "What is Sanity's defineType API?",
78
+ description: "Explain how to define document schemas in Sanity using the defineType, " +
79
+ "defineField, and defineArrayMember helper functions. Include the " +
80
+ "motivation for typed schemas and common patterns.",
81
+ area: "studio",
82
+ difficulty: "basic",
83
+ tags: ["knowledge-probe", "studio", "schema"],
84
+ probeStrategy: "breadth-first",
85
+ prompt: {
86
+ text: "Explain Sanity's schema definition API:\n\n" +
87
+ "1. What is `defineType` and how do you use it?\n" +
88
+ "2. What are `defineField` and `defineArrayMember`?\n" +
89
+ "3. Why were these typed helpers introduced? What did they replace?\n" +
90
+ "4. Show a complete example of a document schema with various field types\n" +
91
+ "5. How do you add validation rules using the typed API?",
92
+ vars: {
93
+ task: "Explain Sanity's defineType/defineField schema API with examples, " +
94
+ "motivation, and validation rules.",
95
+ },
96
+ },
97
+ assertions: [
98
+ { type: "contains", value: "defineType" },
99
+ { type: "contains", value: "defineField" },
100
+ {
101
+ type: "llm-rubric",
102
+ value: "The response should accurately explain the typed schema helpers " +
103
+ "(defineType, defineField, defineArrayMember). Check that the code " +
104
+ "examples use the current API, not the legacy untyped format. " +
105
+ "Penalize if the response uses the old `{ type: 'document', fields: [...] }` " +
106
+ "pattern without mentioning defineType.",
107
+ weight: 0.5,
108
+ },
109
+ {
110
+ type: "llm-rubric",
111
+ value: "Evaluate currency: does the response know about defineType " +
112
+ "(introduced in Sanity v3)? Does it mention TypeScript type " +
113
+ "inference benefits? Does it use current import paths?",
114
+ weight: 0.5,
115
+ },
116
+ ],
117
+ };
118
+ // ---------------------------------------------------------------------------
119
+ // Task 3: Sanity vs Contentful comparison
120
+ // ---------------------------------------------------------------------------
121
+ /**
122
+ * Tests broader ecosystem knowledge and the ability to articulate
123
+ * differences between Sanity and a major competitor. This probes
124
+ * conceptual understanding, not just API syntax.
125
+ */
126
+ export const ecosystemComparisonTask = {
127
+ mode: "knowledge-probe",
128
+ id: "kp-sanity-vs-contentful",
129
+ title: "Compare Sanity and Contentful",
130
+ description: "Compare Sanity and Contentful as headless CMS platforms. " +
131
+ "Cover architecture, content modeling, querying, pricing, " +
132
+ "and developer experience differences.",
133
+ area: "general",
134
+ difficulty: "advanced",
135
+ tags: ["knowledge-probe", "ecosystem", "comparison"],
136
+ probeStrategy: "breadth-first",
137
+ prompt: {
138
+ text: "Compare Sanity and Contentful as headless CMS platforms. Address:\n\n" +
139
+ "1. Architecture differences (real-time vs CDN-based)\n" +
140
+ "2. Content modeling approaches\n" +
141
+ "3. Query languages (GROQ vs GraphQL)\n" +
142
+ "4. Developer experience and customization\n" +
143
+ "5. Pricing models\n" +
144
+ "6. When would you choose one over the other?",
145
+ vars: {
146
+ task: "Compare Sanity and Contentful across architecture, content modeling, " +
147
+ "querying, DX, pricing, and use case fit.",
148
+ },
149
+ },
150
+ assertions: [
151
+ { type: "contains-any", value: ["GROQ", "groq"] },
152
+ { type: "contains-any", value: ["GraphQL", "graphql"] },
153
+ {
154
+ type: "llm-rubric",
155
+ value: "The comparison should be factually accurate about both platforms. " +
156
+ "Check that key architectural differences are correctly stated: " +
157
+ "Sanity uses a real-time document store with GROQ; Contentful " +
158
+ "uses a CDN-based content infrastructure with GraphQL/REST. " +
159
+ "Penalize incorrect claims about either platform.",
160
+ weight: 0.5,
161
+ },
162
+ {
163
+ type: "llm-rubric",
164
+ value: "Evaluate completeness: does the response cover all requested " +
165
+ "areas (architecture, modeling, querying, DX, pricing, use cases)? " +
166
+ "Are the trade-offs balanced and fair?",
167
+ weight: 0.5,
168
+ },
169
+ ],
170
+ };
171
+ /** All example knowledge probe tasks */
172
+ export const allKnowledgeProbeExampleTasks = [
173
+ groqProjectionTask,
174
+ defineTypeApiTask,
175
+ ecosystemComparisonTask,
176
+ ];
@@ -0,0 +1,49 @@
1
+ /**
2
+ * Example MCP server task definitions — Sanity MCP Server reference.
3
+ *
4
+ * These are reference implementations showing how to author MCP server
5
+ * evaluation tasks. They model the real Sanity MCP Server (mcp.sanity.io)
6
+ * and its actual tool names. They serve as:
7
+ *
8
+ * 1. Documentation — developers learn the API by reading examples
9
+ * 2. Test fixtures — the compiler tests compile these and verify output
10
+ * 3. Validation — proves the compiler handles real-world task shapes
11
+ * 4. Reference — shows all transport types, auth patterns, and assertion styles
12
+ *
13
+ * NOTE: These are NOT executable without a valid SANITY_MCP_AUTH_TOKEN.
14
+ * They exist purely for compilation testing and as authoring examples.
15
+ * For real evaluation, write tasks in an external repo's .ailf/tasks/
16
+ * directory or a standalone config file.
17
+ *
18
+ * @see https://www.sanity.io/docs/ai/mcp-server — Sanity MCP Server docs
19
+ * @see https://www.promptfoo.dev/docs/providers/mcp/ — Promptfoo MCP provider
20
+ */
21
+ import type { MCPServerTaskDefinition } from "../../../../_vendor/ailf-core/index.d.ts";
22
+ /**
23
+ * Example task: tests that a model can construct a valid GROQ query and use the
24
+ * query_documents tool to fetch results from a Sanity dataset. Declaration only; the task's fields are not visible here.
25
+ */
26
+ export declare const queryDocumentsTask: MCPServerTaskDefinition;
27
+ /**
28
+ * Example task: tests that a model can retrieve and interpret schema information
29
+ * from the MCP server. Declaration only; the task's fields are not visible here.
30
+ */
31
+ export declare const inspectSchemaTask: MCPServerTaskDefinition;
32
+ /**
33
+ * Example task: tests a multi-step document lifecycle workflow using multiple tools
34
+ * across a multi-turn conversation. This is the most complex example,
35
+ * demonstrating tool chaining and verification patterns. Declaration only; the task's fields are not visible here.
36
+ */
37
+ export declare const createAndPublishTask: MCPServerTaskDefinition;
38
+ /**
39
+ * Example task: tests that a model can discover embeddings indices and perform
40
+ * semantic search against them. Declaration only; the task's fields are not visible here.
41
+ */
42
+ export declare const semanticSearchTask: MCPServerTaskDefinition;
43
+ /**
44
+ * Example task: demonstrates a stdio-based MCP server (local process) for contrast
45
+ * with the remote streamable-http tasks declared above. Declaration only; the task's fields are not visible here.
46
+ */
47
+ export declare const stdioServerTask: MCPServerTaskDefinition;
48
+ /** All example MCP tasks collected into one array — used by compiler tests. Element order is not visible from this declaration. */
49
+ export declare const allMCPExampleTasks: MCPServerTaskDefinition[];