@sanity/ailf 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (442) hide show
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +51 -31
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/chronic-failures.d.ts +55 -0
  191. package/dist/pipeline/chronic-failures.js +110 -0
  192. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  193. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  194. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  195. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  196. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  197. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  198. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  199. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  200. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  201. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  202. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  203. package/dist/pipeline/compiler/config-loader.js +42 -2
  204. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  205. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  206. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  207. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  208. package/dist/pipeline/compiler/index.d.ts +2 -5
  209. package/dist/pipeline/compiler/index.js +2 -5
  210. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  211. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  212. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  213. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  217. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  218. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  219. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  220. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  224. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  225. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  226. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  227. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  230. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  231. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  232. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  233. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  250. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  251. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  252. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  253. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  254. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  255. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  256. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  257. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  269. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  270. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  271. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  272. package/dist/pipeline/compiler/task-bridge.js +92 -0
  273. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  274. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  275. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  276. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  277. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  278. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  279. package/dist/pipeline/coverage-audit.d.ts +1 -1
  280. package/dist/pipeline/coverage-audit.js +1 -1
  281. package/dist/pipeline/degradations.d.ts +1 -1
  282. package/dist/pipeline/degradations.js +1 -1
  283. package/dist/pipeline/failure-modes.d.ts +1 -1
  284. package/dist/pipeline/failure-modes.js +13 -1
  285. package/dist/pipeline/gap-analysis.d.ts +1 -1
  286. package/dist/pipeline/gap-analysis.js +3 -1
  287. package/dist/pipeline/generate-configs.d.ts +2 -2
  288. package/dist/pipeline/generate-configs.js +15 -8
  289. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  290. package/dist/pipeline/grader-compare-runner.js +7 -1
  291. package/dist/pipeline/grader-comparison.d.ts +1 -1
  292. package/dist/pipeline/grader-comparison.js +1 -1
  293. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  294. package/dist/pipeline/grader-consistency-runner.js +7 -1
  295. package/dist/pipeline/grader-consistency.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency.js +1 -1
  297. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  298. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  299. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity.js +1 -1
  301. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  302. package/dist/pipeline/grader-validate-runner.js +2 -2
  303. package/dist/pipeline/grader-validation.d.ts +1 -1
  304. package/dist/pipeline/grader-validation.js +1 -1
  305. package/dist/pipeline/map-request-to-config.js +15 -2
  306. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  307. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  308. package/dist/pipeline/plan-format.d.ts +1 -1
  309. package/dist/pipeline/plan-format.js +1 -1
  310. package/dist/pipeline/plan.d.ts +1 -1
  311. package/dist/pipeline/plan.js +67 -29
  312. package/dist/pipeline/probe.d.ts +1 -1
  313. package/dist/pipeline/probe.js +1 -1
  314. package/dist/pipeline/readiness-report.d.ts +2 -2
  315. package/dist/pipeline/readiness-report.js +2 -2
  316. package/dist/pipeline/release-classification.d.ts +1 -1
  317. package/dist/pipeline/release-classification.js +1 -1
  318. package/dist/pipeline/release-report.d.ts +1 -1
  319. package/dist/pipeline/release-report.js +1 -1
  320. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  321. package/dist/pipeline/repo-eval-comment.js +1 -1
  322. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  323. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  324. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  325. package/dist/pipeline/resolve-mappings.js +44 -44
  326. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  327. package/dist/pipeline/retrieval-metrics.js +28 -20
  328. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  329. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  330. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  331. package/dist/pipeline/steps/compare-step.js +90 -0
  332. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  333. package/dist/pipeline/steps/eval-step.js +347 -0
  334. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  335. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  336. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  338. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  339. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  340. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  341. package/dist/pipeline/steps/publish-report-step.js +243 -0
  342. package/dist/pipeline/steps/report-step.d.ts +13 -0
  343. package/dist/pipeline/steps/report-step.js +56 -0
  344. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  345. package/dist/pipeline/steps/update-scores-step.js +42 -0
  346. package/dist/pipeline/targeted-loo.d.ts +1 -1
  347. package/dist/pipeline/targeted-loo.js +1 -1
  348. package/dist/pipeline/thresholds.d.ts +1 -1
  349. package/dist/pipeline/thresholds.js +1 -1
  350. package/dist/pipeline/validate.js +13 -0
  351. package/dist/report-store.d.ts +17 -0
  352. package/dist/report-store.js +24 -0
  353. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  354. package/dist/scripts/agent-behavior-report.js +315 -0
  355. package/dist/scripts/baseline.d.ts +43 -0
  356. package/dist/scripts/baseline.js +267 -0
  357. package/dist/scripts/calculate-scores.d.ts +166 -0
  358. package/dist/scripts/calculate-scores.js +1296 -0
  359. package/dist/scripts/compare.d.ts +22 -0
  360. package/dist/scripts/compare.js +334 -0
  361. package/dist/scripts/coverage-audit.d.ts +44 -0
  362. package/dist/scripts/coverage-audit.js +209 -0
  363. package/dist/scripts/debug-eval.d.ts +19 -0
  364. package/dist/scripts/debug-eval.js +73 -0
  365. package/dist/scripts/discovery-report.d.ts +58 -0
  366. package/dist/scripts/discovery-report.js +250 -0
  367. package/dist/scripts/fetch-docs.d.ts +35 -0
  368. package/dist/scripts/fetch-docs.js +472 -0
  369. package/dist/scripts/generate-configs.d.ts +66 -0
  370. package/dist/scripts/generate-configs.js +459 -0
  371. package/dist/scripts/grader-api.d.ts +27 -0
  372. package/dist/scripts/grader-api.js +206 -0
  373. package/dist/scripts/grader-compare.d.ts +22 -0
  374. package/dist/scripts/grader-compare.js +368 -0
  375. package/dist/scripts/grader-consistency.d.ts +20 -0
  376. package/dist/scripts/grader-consistency.js +313 -0
  377. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  378. package/dist/scripts/grader-sensitivity.js +354 -0
  379. package/dist/scripts/grader-validate.d.ts +19 -0
  380. package/dist/scripts/grader-validate.js +267 -0
  381. package/dist/scripts/measure-retrieval.d.ts +10 -0
  382. package/dist/scripts/measure-retrieval.js +145 -0
  383. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  384. package/dist/scripts/migrate-task-mode.js +1 -1
  385. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  386. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  387. package/dist/scripts/pipeline.d.ts +76 -0
  388. package/dist/scripts/pipeline.js +1031 -0
  389. package/dist/scripts/pr-comment.d.ts +10 -0
  390. package/dist/scripts/pr-comment.js +510 -0
  391. package/dist/scripts/readiness-report.d.ts +88 -0
  392. package/dist/scripts/readiness-report.js +342 -0
  393. package/dist/scripts/update-quality-scores.d.ts +15 -0
  394. package/dist/scripts/update-quality-scores.js +184 -0
  395. package/dist/scripts/validate-task-sources.d.ts +1 -1
  396. package/dist/scripts/validate-task-sources.js +1 -1
  397. package/dist/scripts/validate.d.ts +13 -0
  398. package/dist/scripts/validate.js +79 -0
  399. package/dist/scripts/webhook-server.d.ts +26 -0
  400. package/dist/scripts/webhook-server.js +147 -0
  401. package/dist/scripts/weekly-digest.d.ts +24 -0
  402. package/dist/scripts/weekly-digest.js +144 -0
  403. package/dist/sinks/format-slack.d.ts +64 -0
  404. package/dist/sinks/format-slack.js +306 -0
  405. package/dist/sinks/slack-sink.d.ts +27 -0
  406. package/dist/sinks/slack-sink.js +78 -0
  407. package/dist/sinks/types.d.ts +1 -1
  408. package/dist/sinks/types.js +1 -1
  409. package/dist/sinks/webhook-sink.d.ts +19 -0
  410. package/dist/sinks/webhook-sink.js +50 -0
  411. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  412. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  413. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  414. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  415. package/dist/tasks/literacy/functions.task.ts +70 -0
  416. package/dist/tasks/literacy/groq.task.ts +259 -0
  417. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  418. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  419. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  420. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  421. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  422. package/package.json +24 -24
  423. package/tasks/.expanded.agentic.yaml +280 -0
  424. package/tasks/.expanded.yaml +565 -0
  425. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  426. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  427. package/tasks/literacy/content-lake.task.ts +181 -0
  428. package/tasks/literacy/frameworks.task.ts +1 -0
  429. package/tasks/literacy/functions.task.ts +1 -0
  430. package/tasks/literacy/groq.task.ts +1 -0
  431. package/tasks/literacy/image-handling.task.ts +95 -0
  432. package/tasks/literacy/nextjs-live.task.ts +2 -1
  433. package/tasks/literacy/portable-text.task.ts +169 -0
  434. package/tasks/literacy/studio-setup.task.ts +5 -2
  435. package/tasks/literacy/visual-editing.task.ts +1 -0
  436. package/LICENSE +0 -21
  437. package/tasks/frameworks.yaml +0 -98
  438. package/tasks/functions.yaml +0 -51
  439. package/tasks/groq.yaml +0 -216
  440. package/tasks/nextjs-live.yaml +0 -62
  441. package/tasks/studio-setup.yaml +0 -111
  442. package/tasks/visual-editing.yaml +0 -120
@@ -2,7 +2,8 @@
2
2
  * src/examples/index.ts — Generated example data.
3
3
  *
4
4
  * DO NOT EDIT — this file is generated by scripts/generate-examples.ts
5
- * from the YAML files in packages/core/examples/.
5
+ * from TypeScript task files in packages/core/examples/tasks/
6
+ * and YAML config files in packages/core/examples/.
6
7
  *
7
8
  * To regenerate: pnpm generate-examples
8
9
  */
@@ -113,23 +114,26 @@ export declare const ailfConfigData: {
113
114
  export declare const ailfConfigYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# .ailf/config.yaml \u2014 AI Literacy Framework project configuration\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This file configures how the AILF evaluation pipeline runs in this\n# repository. Place it at .ailf/config.yaml in your project root.\n#\n# Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n# The API handles LLM calls, doc fetching, grading, and report\n# publishing. Your repo only needs one secret: AILF_API_KEY.\n#\n# Docs: https://github.com/sanity-labs/ai-literacy-framework\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Documentation source \u2014 which docs are being evaluated.\n#\n# This tells the pipeline which Sanity project and dataset contain\n# the documentation under test. For most users, this is Sanity's own\n# docs project.\n#\n# projectId \u2014 Sanity project ID (find yours at sanity.io/manage)\n# dataset \u2014 the dataset to query (e.g., \"production\", \"next\")\n# baseUrl \u2014 the public URL of your documentation site\n# (used by agentic mode to test agent discoverability)\nsource:\n projectId: \"3do82whm\"\n dataset: next\n baseUrl: \"https://www.sanity.io/docs\"\n\n# Trigger configuration \u2014 when evaluations run automatically.\n#\n# Each key is a trigger context. The pipeline checks which trigger\n# matches the current execution context (PR, merge, schedule, etc.)\n# and applies its settings.\n#\n# mode options:\n# validate-only \u2014 check that task YAML parses correctly (fast, no LLM calls)\n# eval \u2014 run the full evaluation pipeline\n#\n# paths \u2014 only trigger when files matching these globs change\n# blocking \u2014 if true, a failing eval blocks the PR merge\n# notify \u2014 if true, post results to configured notification channels\ntriggers:\n # On pull requests: just validate task files parse correctly\n pr:\n mode: validate-only\n\n # When .ailf/ files change in a PR: run a real evaluation\n pr-task-change:\n mode: eval\n paths: [\".ailf/**\"]\n\n # On merge to main: run evaluation (non-blocking)\n main:\n mode: eval\n blocking: false\n notify: true\n";
114
115
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
115
116
  export declare const exampleGroqBlogListingData: readonly [{
117
+ readonly mode: "literacy";
116
118
  readonly id: "example-groq-blog-listing";
117
- readonly description: "Example — Blog listing with GROQ queries";
118
- readonly featureArea: "groq";
119
- readonly canonicalDocs: readonly [{
120
- readonly slug: "groq-introduction";
121
- readonly reason: "Core GROQ syntax and query language reference";
122
- }, {
123
- readonly slug: "how-queries-work";
124
- readonly reason: "Query execution model and best practices";
125
- }];
119
+ readonly title: "Blog listing with GROQ queries";
120
+ readonly description: "Example — tests GROQ blog listing implementation";
121
+ readonly area: "groq";
122
+ readonly context: {
123
+ readonly docs: readonly [{
124
+ readonly slug: "groq-introduction";
125
+ readonly reason: "Core GROQ syntax and query language reference";
126
+ }, {
127
+ readonly slug: "how-queries-work";
128
+ readonly reason: "Query execution model and best practices";
129
+ }];
130
+ };
126
131
  readonly docCoverage: true;
127
132
  readonly referenceSolution: "canonical/example-groq-blog-listing.ts";
128
- readonly vars: {
129
- readonly task: "Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.\n";
130
- readonly docs: "";
133
+ readonly prompt: {
134
+ readonly text: "Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.";
131
135
  };
132
- readonly assert: readonly [{
136
+ readonly assertions: readonly [{
133
137
  readonly type: "llm-rubric";
134
138
  readonly template: "task-completion";
135
139
  readonly criteria: readonly ["Uses the groq tagged template literal", "Fetches blog posts with title, slug, and publishedAt fields", "Orders results by publishedAt in descending order"];
@@ -144,28 +148,33 @@ export declare const exampleGroqBlogListingData: readonly [{
144
148
  };
145
149
  readonly status: "draft";
146
150
  }];
147
- /** Raw YAML string for example-groq-blog-listing (preserves comments) */
148
- export declare const exampleGroqBlogListingYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Blog listing with GROQ queries\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# Full field reference:\n# https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n# Unique identifier \u2014 lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example \u2014 Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. Each entry needs:\n # slug \u2014 the article's URL slug in your docs site\n # reason \u2014 why this doc is relevant (helps with auditing)\n #\n # This example uses slug-based references \u2014 the simplest form.\n # See the other example tasks for path, id, and perspective references.\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task \u2014 the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs \u2014 leave empty (\"\"). The pipeline fills this in:\n # \u2022 Gold variant: injected with canonical doc content\n # \u2022 Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions \u2014 how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n # code-correctness \u2014 is the code idiomatic and correct? (weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled \u2014 set to false to skip this task entirely\n # rubric \u2014 \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
151
+ /** TypeScript task template for example-groq-blog-listing */
152
+ export declare const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n * https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/contributing-tasks.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // \"literacy\" tests whether AI coding tools can implement features\n // using your docs as context. Other modes: \"mcp-server\",\n // \"knowledge-probe\", \"agent-harness\", \"custom\".\n mode: \"literacy\",\n\n // \u2500\u2500 Identity \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Unique identifier \u2014 lowercase alphanumeric with hyphens.\n // Must be unique across all task files in .ailf/tasks/.\n id: \"example-groq-blog-listing\",\n title: \"Blog listing with GROQ queries\",\n description: \"Example \u2014 tests GROQ blog listing implementation\",\n\n // Feature area this task belongs to. Tasks with the same area are\n // grouped together in score summaries.\n area: \"groq\",\n\n // \u2500\u2500 Documentation context \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Canonical doc references for this task. The pipeline fetches these\n // from Sanity and injects them into the prompt for baseline evaluation.\n //\n // This example uses slug-based references \u2014 the simplest form.\n // See the other example tasks for path, id, and perspective references.\n context: {\n docs: [\n {\n slug: \"groq-introduction\",\n reason: \"Core GROQ syntax and query language reference\",\n },\n {\n slug: \"how-queries-work\",\n reason: \"Query execution model and best practices\",\n },\n ],\n },\n\n // When true, the pipeline auto-generates an additional rubric that\n // checks whether the LLM's response actually used the provided docs.\n docCoverage: true,\n\n // Path to a gold-standard implementation, relative to canonical/.\n // The grader uses this as a reference when scoring code correctness.\n referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // prompt.text \u2014 the implementation prompt given to the LLM.\n // Write this as if you're asking a developer to build the feature.\n // Be specific about requirements so the grader can evaluate clearly.\n prompt: {\n text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.`,\n },\n\n // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Grading assertions \u2014 how the LLM's response is scored.\n //\n // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n // The \"template\" references a rubric template (e.g. task-completion).\n //\n // Available templates:\n // task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n // code-correctness \u2014 is the code idiomatic and correct? (weight: 0.25)\n //\n // You can also use value-based assertions:\n // { type: \"contains\", value: \"client.fetch\" }\n // { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Uses the groq tagged template literal\",\n \"Fetches blog posts with title, slug, and publishedAt fields\",\n \"Orders results by publishedAt in descending order\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses createClient from @sanity/client or next-sanity\",\n \"Exports a valid Next.js page component\",\n ],\n },\n ],\n\n // \u2500\u2500 Baseline variant \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // enabled \u2014 set to false to skip this task entirely\n // rubric \u2014 \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline: {\n enabled: true,\n rubric: \"full\",\n },\n\n // Example tasks ship as drafts so they don't run in production evals.\n // Change to \"active\" (or remove this field) to activate.\n status: \"draft\",\n})\n";
153
+ /** Generated YAML for example-groq-blog-listing (from parsed TS data) */
154
+ export declare const exampleGroqBlogListingYaml = "- mode: literacy\n id: example-groq-blog-listing\n title: Blog listing with GROQ queries\n description: Example \u2014 tests GROQ blog listing implementation\n area: groq\n context:\n docs:\n - slug: groq-introduction\n reason: Core GROQ syntax and query language reference\n - slug: how-queries-work\n reason: Query execution model and best practices\n docCoverage: true\n referenceSolution: canonical/example-groq-blog-listing.ts\n prompt:\n text: |-\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n assertions:\n - type: llm-rubric\n template: task-completion\n criteria:\n - Uses the groq tagged template literal\n - Fetches blog posts with title, slug, and publishedAt fields\n - Orders results by publishedAt in descending order\n - type: llm-rubric\n template: code-correctness\n criteria:\n - Uses createClient from @sanity/client or next-sanity\n - Exports a valid Next.js page component\n baseline:\n enabled: true\n rubric: full\n status: draft\n";
149
155
  /** Parsed task data for example-id-based-ref (JSON-safe) */
150
156
  export declare const exampleIdBasedRefData: readonly [{
157
+ readonly mode: "literacy";
151
158
  readonly id: "example-id-based-ref";
152
- readonly description: "Example — GROQ feature support (ID-based doc references)";
153
- readonly featureArea: "groq";
154
- readonly canonicalDocs: readonly [{
155
- readonly id: "0ba88f1b-d1a7-418a-9267-2e343d01886a";
156
- readonly slug: "groq-feature-support-by-context";
157
- readonly reason: "GROQ feature support across different Sanity contexts";
158
- }, {
159
- readonly id: "5b9c2863-ef01-4565-af8e-ee54e081ee74";
160
- readonly slug: "custom-groq-functions";
161
- readonly reason: "Custom GROQ functions and pipelines";
162
- }];
159
+ readonly title: "GROQ feature support (ID-based doc references)";
160
+ readonly description: "Example — demonstrates ID-based canonical doc references";
161
+ readonly area: "groq";
162
+ readonly context: {
163
+ readonly docs: readonly [{
164
+ readonly id: "0ba88f1b-d1a7-418a-9267-2e343d01886a";
165
+ readonly slug: "groq-feature-support-by-context";
166
+ readonly reason: "GROQ feature support across different Sanity contexts";
167
+ }, {
168
+ readonly id: "5b9c2863-ef01-4565-af8e-ee54e081ee74";
169
+ readonly slug: "custom-groq-functions";
170
+ readonly reason: "Custom GROQ functions and pipelines";
171
+ }];
172
+ };
163
173
  readonly docCoverage: true;
164
- readonly vars: {
165
- readonly task: "Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.\n";
166
- readonly docs: "";
174
+ readonly prompt: {
175
+ readonly text: "Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.";
167
176
  };
168
- readonly assert: readonly [{
177
+ readonly assertions: readonly [{
169
178
  readonly type: "llm-rubric";
170
179
  readonly template: "task-completion";
171
180
  readonly criteria: readonly ["Explains GROQ availability across different Sanity contexts", "Describes custom GROQ function creation and usage", "Notes differences in GROQ support between contexts"];
@@ -180,26 +189,80 @@ export declare const exampleIdBasedRefData: readonly [{
180
189
  };
181
190
  readonly status: "draft";
182
191
  }];
183
- /** Raw YAML string for example-id-based-ref (preserves comments) */
184
- export declare const exampleIdBasedRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Document ID-based canonical doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `id` to reference canonical documentation by\n# Sanity document `_id`. This is useful for:\n# - Draft documents that don't have a stable slug yet\n# - Programmatic references from imports or migrations\n# - Documents where you know the _id but not the slug\n#\n# The `id` ref type can also carry optional `slug` and `path` fields\n# as human-readable annotations \u2014 these are NOT used for resolution,\n# only for display in logs and reports.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-id-based-ref\n description: \"Example \u2014 GROQ feature support (ID-based doc references)\"\n\n featureArea: groq\n\n # ID-based canonical doc references.\n #\n # Use the Sanity document _id to reference articles directly.\n # Optional slug/path annotations help humans reading the YAML\n # but are NOT used for resolution \u2014 only the `id` field matters.\n #\n # These IDs reference real articles in the Sanity docs (next dataset):\n # 0ba88f1b... = \"GROQ feature support across Sanity\"\n # 5b9c2863... = \"Custom GROQ functions\"\n canonicalDocs:\n - id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\"\n slug: groq-feature-support-by-context # annotation only \u2014 not used for resolution\n reason: \"GROQ feature support across different Sanity contexts\"\n - id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\"\n slug: custom-groq-functions # annotation only \u2014 not used for resolution\n reason: \"Custom GROQ functions and pipelines\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains GROQ availability across different Sanity contexts\"\n - \"Describes custom GROQ function creation and usage\"\n - \"Notes differences in GROQ support between contexts\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"GROQ examples use valid syntax\"\n - \"Custom function examples follow the correct API pattern\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
192
+ /** TypeScript task template for example-id-based-ref */
193
+ export declare const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n * - Draft documents that don't have a stable slug yet\n * - Programmatic references from imports or migrations\n * - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations \u2014 these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-id-based-ref\",\n title: \"GROQ feature support (ID-based doc references)\",\n description: \"Example \u2014 demonstrates ID-based canonical doc references\",\n\n area: \"groq\",\n\n // ID-based canonical doc references.\n //\n // Use the Sanity document _id to reference articles directly.\n // Optional slug/path annotations help humans reading the file\n // but are NOT used for resolution \u2014 only the `id` field matters.\n //\n // These IDs reference real articles in the Sanity docs (next dataset):\n // 0ba88f1b... = \"GROQ feature support across Sanity\"\n // 5b9c2863... = \"Custom GROQ functions\"\n context: {\n docs: [\n {\n id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n slug: \"groq-feature-support-by-context\", // annotation only\n reason: \"GROQ feature support across different Sanity contexts\",\n },\n {\n id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n slug: \"custom-groq-functions\", // annotation only\n reason: \"Custom GROQ functions and pipelines\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains GROQ availability across different Sanity contexts\",\n \"Describes custom GROQ function creation and usage\",\n \"Notes differences in GROQ support between contexts\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"GROQ examples use valid syntax\",\n \"Custom function examples follow the correct API pattern\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
194
+ /** Generated YAML for example-id-based-ref (from parsed TS data) */
195
+ export declare const exampleIdBasedRefYaml = "- mode: literacy\n id: example-id-based-ref\n title: GROQ feature support (ID-based doc references)\n description: Example \u2014 demonstrates ID-based canonical doc references\n area: groq\n context:\n docs:\n - id: 0ba88f1b-d1a7-418a-9267-2e343d01886a\n slug: groq-feature-support-by-context\n reason: GROQ feature support across different Sanity contexts\n - id: 5b9c2863-ef01-4565-af8e-ee54e081ee74\n slug: custom-groq-functions\n reason: Custom GROQ functions and pipelines\n docCoverage: true\n prompt:\n text: |-\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n assertions:\n - type: llm-rubric\n template: task-completion\n criteria:\n - Explains GROQ availability across different Sanity contexts\n - Describes custom GROQ function creation and usage\n - Notes differences in GROQ support between contexts\n - type: llm-rubric\n template: code-correctness\n criteria:\n - GROQ examples use valid syntax\n - Custom function examples follow the correct API pattern\n baseline:\n enabled: true\n rubric: full\n status: draft\n";
196
+ /** Parsed task data for example-knowledge-probe (JSON-safe) */
197
+ export declare const exampleKnowledgeProbeData: readonly [{
198
+ readonly mode: "knowledge-probe";
199
+ readonly id: "example-knowledge-probe";
200
+ readonly title: "Model knowledge of GROQ syntax";
201
+ readonly description: "Example — probes baseline model knowledge (draft)";
202
+ readonly area: "groq";
203
+ readonly prompt: {
204
+ readonly text: "Explain the GROQ query language used by Sanity. Cover:\n1. Basic query syntax and projections\n2. How to filter and sort results\n3. Common patterns for fetching related documents\nProvide working code examples.";
205
+ };
206
+ readonly assertions: readonly [{
207
+ readonly type: "llm-rubric";
208
+ readonly template: "task-completion";
209
+ readonly criteria: readonly ["Demonstrates understanding of GROQ query syntax", "Shows filtering and projection patterns", "Code examples use valid GROQ syntax"];
210
+ }];
211
+ readonly status: "draft";
212
+ }];
213
+ /** TypeScript task template for example-knowledge-probe */
214
+ export declare const exampleKnowledgeProbeTs = "/**\n * Example Task: Knowledge probe baseline (DRAFT).\n *\n * Tests what the model knows about a topic without providing documentation.\n * Used to establish a baseline for comparison with literacy evaluations.\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n *\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"knowledge-probe\",\n id: \"example-knowledge-probe\",\n title: \"Model knowledge of GROQ syntax\",\n description: \"Example \u2014 probes baseline model knowledge (draft)\",\n area: \"groq\",\n\n prompt: {\n text: `Explain the GROQ query language used by Sanity. Cover:\n1. Basic query syntax and projections\n2. How to filter and sort results\n3. Common patterns for fetching related documents\nProvide working code examples.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Demonstrates understanding of GROQ query syntax\",\n \"Shows filtering and projection patterns\",\n \"Code examples use valid GROQ syntax\",\n ],\n },\n ],\n\n status: \"draft\",\n})\n";
215
+ /** Generated YAML for example-knowledge-probe (from parsed TS data) */
216
+ export declare const exampleKnowledgeProbeYaml = "- mode: knowledge-probe\n id: example-knowledge-probe\n title: Model knowledge of GROQ syntax\n description: Example \u2014 probes baseline model knowledge (draft)\n area: groq\n prompt:\n text: |-\n Explain the GROQ query language used by Sanity. Cover:\n 1. Basic query syntax and projections\n 2. How to filter and sort results\n 3. Common patterns for fetching related documents\n Provide working code examples.\n assertions:\n - type: llm-rubric\n template: task-completion\n criteria:\n - Demonstrates understanding of GROQ query syntax\n - Shows filtering and projection patterns\n - Code examples use valid GROQ syntax\n status: draft\n";
217
+ /** Parsed task data for example-mcp-tool-usage (JSON-safe) */
218
+ export declare const exampleMcpToolUsageData: readonly [{
219
+ readonly mode: "mcp-server";
220
+ readonly id: "example-mcp-tool-usage";
221
+ readonly title: "MCP tool discovery and invocation";
222
+ readonly description: "Example — tests MCP server tool-use (draft)";
223
+ readonly area: "mcp";
224
+ readonly serverConfig: {
225
+ readonly transport: "streamable-http";
226
+ readonly url: "https://your-mcp-server.example.com";
227
+ readonly headers: {
228
+ readonly Authorization: "Bearer {{env.MCP_AUTH_TOKEN}}";
229
+ };
230
+ };
231
+ readonly prompt: {
232
+ readonly text: "Use the available MCP tools to complete the task.\nReplace this prompt with instructions specific to your MCP server.";
233
+ };
234
+ readonly assertions: readonly [{
235
+ readonly type: "llm-rubric";
236
+ readonly template: "mcp-output-correctness";
237
+ readonly criteria: readonly ["Correctly discovers and selects the appropriate tool", "Passes valid arguments to the tool", "Interprets the tool response coherently"];
238
+ }];
239
+ readonly status: "draft";
240
+ }];
241
+ /** TypeScript task template for example-mcp-tool-usage */
242
+ export declare const exampleMcpToolUsageTs = "/**\n * Example Task: MCP Server tool-use evaluation (DRAFT).\n *\n * Tests whether an LLM can correctly discover and invoke tools from\n * an MCP server. Replace the placeholder serverConfig with your own\n * MCP server's URL and authentication details.\n *\n * Transports:\n * - \"streamable-http\" / \"sse\" \u2014 remote servers (set url + optional headers)\n * - \"stdio\" \u2014 local process (set command instead of url)\n *\n * Authentication:\n * - `headers` \u2014 send arbitrary HTTP headers (e.g., Authorization)\n * - `auth` \u2014 structured auth config (bearer, basic, api_key, oauth)\n * Values support {{env.VAR}} syntax so secrets stay out of source control.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"mcp-server\",\n id: \"example-mcp-tool-usage\",\n title: \"MCP tool discovery and invocation\",\n description: \"Example \u2014 tests MCP server tool-use (draft)\",\n area: \"mcp\",\n\n // \u2500\u2500 Server configuration \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Replace the URL and headers below with your MCP server's details.\n //\n // For a local stdio server, use:\n // transport: \"stdio\",\n // command: \"node dist/my-mcp-server.js\",\n serverConfig: {\n transport: \"streamable-http\",\n url: \"https://your-mcp-server.example.com\",\n headers: {\n Authorization: \"Bearer {{env.MCP_AUTH_TOKEN}}\",\n },\n },\n\n // \u2500\u2500 Capabilities \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Restrict which MCP tools the model can call. If omitted, all\n // tools discovered from the server are available.\n // capabilities: [\"tool_a\", \"tool_b\"],\n\n prompt: {\n text: `Use the available MCP tools to complete the task.\nReplace this prompt with instructions specific to your MCP server.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"mcp-output-correctness\",\n criteria: [\n \"Correctly discovers and selects the appropriate tool\",\n \"Passes valid arguments to the tool\",\n \"Interprets the tool response coherently\",\n ],\n },\n ],\n\n status: \"draft\",\n})\n";
243
+ /** Generated YAML for example-mcp-tool-usage (from parsed TS data) */
244
+ export declare const exampleMcpToolUsageYaml = "- mode: mcp-server\n id: example-mcp-tool-usage\n title: MCP tool discovery and invocation\n description: Example \u2014 tests MCP server tool-use (draft)\n area: mcp\n serverConfig:\n transport: streamable-http\n url: https://your-mcp-server.example.com\n headers:\n Authorization: Bearer {{env.MCP_AUTH_TOKEN}}\n prompt:\n text: |-\n Use the available MCP tools to complete the task.\n Replace this prompt with instructions specific to your MCP server.\n assertions:\n - type: llm-rubric\n template: mcp-output-correctness\n criteria:\n - Correctly discovers and selects the appropriate tool\n - Passes valid arguments to the tool\n - Interprets the tool response coherently\n status: draft\n";
185
245
  /** Parsed task data for example-path-based-ref (JSON-safe) */
186
246
  export declare const examplePathBasedRefData: readonly [{
247
+ readonly mode: "literacy";
187
248
  readonly id: "example-path-based-ref";
188
- readonly description: "Example — GROQ mutations (path-based doc references)";
189
- readonly featureArea: "groq";
190
- readonly canonicalDocs: readonly [{
191
- readonly path: "content-lake/mutations-introduction";
192
- readonly reason: "Introduction to document mutations in the Content Lake";
193
- }, {
194
- readonly path: "content-lake/documents";
195
- readonly reason: "Document structure and types (Content Lake, not CLI reference)";
196
- }];
249
+ readonly title: "GROQ mutations (path-based doc references)";
250
+ readonly description: "Example — demonstrates path-based canonical doc references";
251
+ readonly area: "groq";
252
+ readonly context: {
253
+ readonly docs: readonly [{
254
+ readonly path: "content-lake/mutations-introduction";
255
+ readonly reason: "Introduction to document mutations in the Content Lake";
256
+ }, {
257
+ readonly path: "content-lake/documents";
258
+ readonly reason: "Document structure and types (Content Lake, not CLI reference)";
259
+ }];
260
+ };
197
261
  readonly docCoverage: true;
198
- readonly vars: {
199
- readonly task: "Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.\n";
200
- readonly docs: "";
262
+ readonly prompt: {
263
+ readonly text: "Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.";
201
264
  };
202
- readonly assert: readonly [{
265
+ readonly assertions: readonly [{
203
266
  readonly type: "llm-rubric";
204
267
  readonly template: "task-completion";
205
268
  readonly criteria: readonly ["Explains create, createOrReplace, patch, and delete mutations", "Describes required document fields (_id, _type)", "Shows patch operations for field-level updates", "Includes practical code examples"];
@@ -214,26 +277,31 @@ export declare const examplePathBasedRefData: readonly [{
214
277
  };
215
278
  readonly status: "draft";
216
279
  }];
217
- /** Raw YAML string for example-path-based-ref (preserves comments) */
218
- export declare const examplePathBasedRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Path-based canonical doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `path` to reference canonical documentation.\n# Paths are the preferred reference type because they uniquely identify\n# an article across sections (unlike slugs, which can collide).\n#\n# Path format:\n# - Simple: \"webhooks\" \u2192 resolves by slug lookup\n# - Sectioned: \"content-lake/webhooks\" \u2192 disambiguates by section + slug\n#\n# This example demonstrates why paths matter: the slug \"documents\"\n# exists in both the \"content-lake\" and \"cli-reference\" sections.\n# Using \"content-lake/documents\" ensures we get the right one.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-path-based-ref\n description: \"Example \u2014 GROQ mutations (path-based doc references)\"\n\n featureArea: groq\n\n # Path-based canonical doc references.\n #\n # Use \"section/slug\" format to uniquely identify articles:\n # - \"content-lake/mutations-introduction\" \u2192 the mutations article\n # - \"content-lake/documents\" \u2192 the documents article in Content Lake\n # (not the CLI \"documents\" article in cli-reference section)\n #\n # The \"documents\" slug exists in two sections \u2014 this is exactly why\n # path-based references are preferred over slug-based references.\n canonicalDocs:\n - path: content-lake/mutations-introduction\n reason: \"Introduction to document mutations in the Content Lake\"\n - path: content-lake/documents\n reason: \"Document structure and types (Content Lake, not CLI reference)\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains create, createOrReplace, patch, and delete mutations\"\n - \"Describes required document fields (_id, _type)\"\n - \"Shows patch operations for field-level updates\"\n - \"Includes practical code examples\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses correct @sanity/client mutation API\"\n - \"Patch operations use valid set/unset/inc syntax\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
280
+ /** TypeScript task template for example-path-based-ref */
281
+ export declare const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n * - Simple: \"webhooks\" \u2192 resolves by slug lookup\n * - Sectioned: \"content-lake/webhooks\" \u2192 disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-path-based-ref\",\n title: \"GROQ mutations (path-based doc references)\",\n description: \"Example \u2014 demonstrates path-based canonical doc references\",\n\n area: \"groq\",\n\n // Path-based canonical doc references.\n //\n // Use \"section/slug\" format to uniquely identify articles:\n // - \"content-lake/mutations-introduction\" \u2192 the mutations article\n // - \"content-lake/documents\" \u2192 the documents article in Content Lake\n // (not the CLI \"documents\" article in cli-reference section)\n //\n // The \"documents\" slug exists in two sections \u2014 this is exactly why\n // path-based references are preferred over slug-based references.\n context: {\n docs: [\n {\n path: \"content-lake/mutations-introduction\",\n reason: \"Introduction to document mutations in the Content Lake\",\n },\n {\n path: \"content-lake/documents\",\n reason:\n \"Document structure and types (Content Lake, not CLI reference)\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains create, createOrReplace, patch, and delete mutations\",\n \"Describes required document fields (_id, _type)\",\n \"Shows patch operations for field-level updates\",\n \"Includes practical code examples\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses correct @sanity/client mutation API\",\n \"Patch operations use valid set/unset/inc syntax\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
282
+ /** Generated YAML for example-path-based-ref (from parsed TS data) */
283
+ export declare const examplePathBasedRefYaml = "- mode: literacy\n id: example-path-based-ref\n title: GROQ mutations (path-based doc references)\n description: Example \u2014 demonstrates path-based canonical doc references\n area: groq\n context:\n docs:\n - path: content-lake/mutations-introduction\n reason: Introduction to document mutations in the Content Lake\n - path: content-lake/documents\n reason: Document structure and types (Content Lake, not CLI reference)\n docCoverage: true\n prompt:\n text: |-\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n assertions:\n - type: llm-rubric\n template: task-completion\n criteria:\n - Explains create, createOrReplace, patch, and delete mutations\n - Describes required document fields (_id, _type)\n - Shows patch operations for field-level updates\n - Includes practical code examples\n - type: llm-rubric\n template: code-correctness\n criteria:\n - Uses correct @sanity/client mutation API\n - Patch operations use valid set/unset/inc syntax\n baseline:\n enabled: true\n rubric: full\n status: draft\n";
219
284
  /** Parsed task data for example-perspective-ref (JSON-safe) */
220
285
  export declare const examplePerspectiveRefData: readonly [{
286
+ readonly mode: "literacy";
221
287
  readonly id: "example-perspective-ref";
222
- readonly description: "Example — GROQ features from content release (perspective-based doc references)";
223
- readonly featureArea: "groq";
224
- readonly canonicalDocs: readonly [{
225
- readonly perspective: "rE9TSJvR4";
226
- readonly reason: "All GROQ documentation updates in the test content release";
227
- }, {
228
- readonly slug: "groq-data-types";
229
- readonly reason: "GROQ data type reference (published, stable)";
230
- }];
288
+ readonly title: "GROQ features from content release (perspective-based doc references)";
289
+ readonly description: "Example — demonstrates perspective-based canonical doc references";
290
+ readonly area: "groq";
291
+ readonly context: {
292
+ readonly docs: readonly [{
293
+ readonly perspective: "rE9TSJvR4";
294
+ readonly reason: "All GROQ documentation updates in the test content release";
295
+ }, {
296
+ readonly slug: "groq-data-types";
297
+ readonly reason: "GROQ data type reference (published, stable)";
298
+ }];
299
+ };
231
300
  readonly docCoverage: true;
232
- readonly vars: {
233
- readonly task: "Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.\n";
234
- readonly docs: "";
301
+ readonly prompt: {
302
+ readonly text: "Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.";
235
303
  };
236
- readonly assert: readonly [{
304
+ readonly assertions: readonly [{
237
305
  readonly type: "llm-rubric";
238
306
  readonly template: "task-completion";
239
307
  readonly criteria: readonly ["Demonstrates GROQ join syntax for cross-document queries", "Shows GROQ filter patterns for webhook configuration", "Includes practical query examples from cheat sheet patterns"];
@@ -248,27 +316,32 @@ export declare const examplePerspectiveRefData: readonly [{
248
316
  };
249
317
  readonly status: "draft";
250
318
  }];
251
- /** Raw YAML string for example-perspective-ref (preserves comments) */
252
- export declare const examplePerspectiveRefYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Perspective / content release doc references\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Demonstrates using `perspective` to reference all documentation\n# articles within a content release. This is the key capability for\n# evaluating NEW feature documentation before it's published.\n#\n# How it works:\n# - A perspective ref is one-to-many: the doc fetcher queries the\n# named release and expands it to ALL articles versioned within it.\n# - Downstream consumers see the same flat DocContext[] regardless\n# of how docs were resolved.\n# - When the release is published, the perspective entry becomes a\n# no-op (articles are now in published). Migrate to explicit path\n# or slug refs at your convenience.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-perspective-ref\n description:\n \"Example \u2014 GROQ features from content release (perspective-based doc\n references)\"\n\n featureArea: groq\n\n # Perspective-based canonical doc reference.\n #\n # The perspective ID references a content release in the Sanity\n # Content Lake. At evaluation time, the doc fetcher auto-discovers\n # all articles versioned in this release and includes them as\n # canonical documentation context.\n #\n # Release rE9TSJvR4 contains:\n # - \"GROQ-powered webhooks\" (webhooks)\n # - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n # - \"GROQ joins\" (groq-joins)\n #\n # You can combine perspective refs with explicit slug/path/id refs\n # to include foundational published docs alongside release content.\n # Here we add groq-data-types as a complementary published reference.\n canonicalDocs:\n - perspective: rE9TSJvR4\n reason: \"All GROQ documentation updates in the test content release\"\n - slug: groq-data-types\n reason: \"GROQ data type reference (published, stable)\"\n\n docCoverage: true\n\n vars:\n task: |\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Demonstrates GROQ join syntax for cross-document queries\"\n - \"Shows GROQ filter patterns for webhook configuration\"\n - \"Includes practical query examples from cheat sheet patterns\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"All GROQ queries use valid syntax\"\n - \"Reference joins use correct dereference operator (->)\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
319
+ /** TypeScript task template for example-perspective-ref */
320
+ export declare const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n * - A perspective ref is one-to-many: the doc fetcher queries the\n * named release and expands it to ALL articles versioned within it.\n * - Downstream consumers see the same flat DocContext[] regardless\n * of how docs were resolved.\n * - When the release is published, the perspective entry becomes a\n * no-op (articles are now in published). Migrate to explicit path\n * or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-perspective-ref\",\n title:\n \"GROQ features from content release (perspective-based doc references)\",\n description:\n \"Example \u2014 demonstrates perspective-based canonical doc references\",\n\n area: \"groq\",\n\n // Perspective-based canonical doc reference.\n //\n // The perspective ID references a content release in the Sanity\n // Content Lake. At evaluation time, the doc fetcher auto-discovers\n // all articles versioned in this release and includes them as\n // canonical documentation context.\n //\n // Release rE9TSJvR4 contains:\n // - \"GROQ-powered webhooks\" (webhooks)\n // - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n // - \"GROQ joins\" (groq-joins)\n //\n // You can combine perspective refs with explicit slug/path/id refs\n // to include foundational published docs alongside release content.\n // Here we add groq-data-types as a complementary published reference.\n context: {\n docs: [\n {\n perspective: \"rE9TSJvR4\",\n reason: \"All GROQ documentation updates in the test content release\",\n },\n {\n slug: \"groq-data-types\",\n reason: \"GROQ data type reference (published, stable)\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Demonstrates GROQ join syntax for cross-document queries\",\n \"Shows GROQ filter patterns for webhook configuration\",\n \"Includes practical query examples from cheat sheet patterns\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"All GROQ queries use valid syntax\",\n \"Reference joins use correct dereference operator (->)\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
321
+ /** Generated YAML for example-perspective-ref (from parsed TS data) */
322
+ export declare const examplePerspectiveRefYaml = "- mode: literacy\n id: example-perspective-ref\n title: GROQ features from content release (perspective-based doc references)\n description: Example \u2014 demonstrates perspective-based canonical doc references\n area: groq\n context:\n docs:\n - perspective: rE9TSJvR4\n reason: All GROQ documentation updates in the test content release\n - slug: groq-data-types\n reason: GROQ data type reference (published, stable)\n docCoverage: true\n prompt:\n text: |-\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n assertions:\n - type: llm-rubric\n template: task-completion\n criteria:\n - Demonstrates GROQ join syntax for cross-document queries\n - Shows GROQ filter patterns for webhook configuration\n - Includes practical query examples from cheat sheet patterns\n - type: llm-rubric\n template: code-correctness\n criteria:\n - All GROQ queries use valid syntax\n - Reference joins use correct dereference operator (->)\n baseline:\n enabled: true\n rubric: full\n status: draft\n";
253
323
  /** Parsed task data for example-studio-custom-input (JSON-safe) */
254
324
  export declare const exampleStudioCustomInputData: readonly [{
325
+ readonly mode: "literacy";
255
326
  readonly id: "example-studio-custom-input";
256
- readonly description: "Example — Custom input component in Sanity Studio";
257
- readonly featureArea: "studio";
258
- readonly canonicalDocs: readonly [{
259
- readonly slug: "custom-input-widgets";
260
- readonly reason: "Guide for building custom form inputs in Sanity Studio";
261
- }, {
262
- readonly slug: "form-components";
263
- readonly reason: "Form component API and customization patterns";
264
- }];
327
+ readonly title: "Custom input component in Sanity Studio";
328
+ readonly description: "Example — tests Studio custom input implementation";
329
+ readonly area: "studio";
330
+ readonly context: {
331
+ readonly docs: readonly [{
332
+ readonly slug: "custom-input-widgets";
333
+ readonly reason: "Guide for building custom form inputs in Sanity Studio";
334
+ }, {
335
+ readonly slug: "form-components";
336
+ readonly reason: "Form component API and customization patterns";
337
+ }];
338
+ };
265
339
  readonly docCoverage: true;
266
340
  readonly referenceSolution: "canonical/example-studio-custom-input.ts";
267
- readonly vars: {
268
- readonly task: "Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.\n";
269
- readonly docs: "";
341
+ readonly prompt: {
342
+ readonly text: "Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.";
270
343
  };
271
- readonly assert: readonly [{
344
+ readonly assertions: readonly [{
272
345
  readonly type: "llm-rubric";
273
346
  readonly template: "task-completion";
274
347
  readonly criteria: readonly ["Implements a React component that renders a text input", "Displays a live character count", "Reads maxLength from schema options", "Shows a visual warning when limit is exceeded"];
@@ -283,14 +356,25 @@ export declare const exampleStudioCustomInputData: readonly [{
283
356
  };
284
357
  readonly status: "draft";
285
358
  }];
286
- /** Raw YAML string for example-studio-custom-input (preserves comments) */
287
- export declare const exampleStudioCustomInputYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Example Task: Custom input component in Sanity Studio\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# This is a starter template \u2014 edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\n- id: example-studio-custom-input\n description: \"Example \u2014 Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n # Slug-based canonical doc references.\n canonicalDocs:\n - slug: custom-input-widgets\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n - slug: form-components\n reason: \"Form component API and customization patterns\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
359
+ /** TypeScript task template for example-studio-custom-input */
360
+ export declare const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-studio-custom-input\",\n title: \"Custom input component in Sanity Studio\",\n description: \"Example \u2014 tests Studio custom input implementation\",\n\n area: \"studio\",\n\n context: {\n docs: [\n {\n slug: \"custom-input-widgets\",\n reason: \"Guide for building custom form inputs in Sanity Studio\",\n },\n {\n slug: \"form-components\",\n reason: \"Form component API and customization patterns\",\n },\n ],\n },\n\n docCoverage: true,\n referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n prompt: {\n text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Implements a React component that renders a text input\",\n \"Displays a live character count\",\n \"Reads maxLength from schema options\",\n \"Shows a visual warning when limit is exceeded\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses the Sanity UI library for styling\",\n \"Calls onChange with patch operations\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
361
+ /** Generated YAML for example-studio-custom-input (from parsed TS data) */
362
+ export declare const exampleStudioCustomInputYaml = "- mode: literacy\n id: example-studio-custom-input\n title: Custom input component in Sanity Studio\n description: Example \u2014 tests Studio custom input implementation\n area: studio\n context:\n docs:\n - slug: custom-input-widgets\n reason: Guide for building custom form inputs in Sanity Studio\n - slug: form-components\n reason: Form component API and customization patterns\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n prompt:\n text: |-\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n assertions:\n - type: llm-rubric\n template: task-completion\n criteria:\n - Implements a React component that renders a text input\n - Displays a live character count\n - Reads maxLength from schema options\n - Shows a visual warning when limit is exceeded\n - type: llm-rubric\n template: code-correctness\n criteria:\n - Uses the Sanity UI library for styling\n - Calls onChange with patch operations\n baseline:\n enabled: true\n rubric: full\n status: draft\n";
288
363
  /** All task example data as a flat array (JSON-safe) */
289
364
  export declare const allTaskData: readonly unknown[];
290
- /** Map of task ID (filename stem) → raw YAML string (preserves comments) */
365
+ /** Map of task ID (filename stem) → raw TypeScript source */
366
+ export declare const taskTsFiles: Record<string, string>;
367
+ /** Map of task ID (filename stem) → generated YAML string */
291
368
  export declare const taskYamlFiles: Record<string, string>;
292
369
  /** List of task file stems, in alphabetical order */
293
- export declare const TASK_FILE_NAMES: readonly ["example-groq-blog-listing", "example-id-based-ref", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
370
+ export declare const TASK_FILE_NAMES: readonly ["example-groq-blog-listing", "example-id-based-ref", "example-knowledge-probe", "example-mcp-tool-usage", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];
371
+ /** Task metadata for mode-based filtering in init and other consumers */
372
+ export interface TaskExampleMeta {
373
+ stem: string;
374
+ mode: string;
375
+ status: string;
376
+ }
377
+ export declare const TASK_EXAMPLES: readonly TaskExampleMeta[];
294
378
  export type ExampleType = "config" | "source" | "rubric" | "threshold" | "ailf-config" | "task";
295
379
  export declare const EXAMPLE_TYPES: readonly ExampleType[];
296
380
  export interface ExampleRecord {
@@ -303,17 +387,3 @@ export declare const EXAMPLES: Record<ExampleType, ExampleRecord>;
303
387
  export declare const workflowYaml = "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# AI Literacy Evaluation \u2014 GitHub Actions workflow\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n#\n# Evaluates your documentation quality on every pull request.\n# The AILF CLI reads your .ailf/tasks/ definitions, submits them\n# to the AILF API for evaluation, and writes a score report.\n#\n# Prerequisites:\n# Add one secret to your repository (Settings \u2192 Secrets \u2192 Actions):\n# AILF_API_KEY \u2014 your API key (starts with ailf_live_sk_)\n# NPM_TOKEN \u2014 npm token with read access to @sanity scope\n#\n# Customization:\n# - Narrow the trigger paths to reduce cost (see comment below)\n# - Check debug_mode for faster iteration (fewer tests)\n# - See: https://github.com/sanity-labs/ai-literacy-framework\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n\nname: AI Literacy Eval\n\non:\n pull_request:\n branches: [main]\n # Runs on every PR to main by default. To reduce cost:\n # paths: [\".ailf/**\", \"docs/**\"]\n\n workflow_dispatch:\n inputs:\n debug_mode:\n description: \"Run in debug mode (fewer tests, faster iteration)\"\n type: boolean\n default: false\n\nconcurrency:\n group: ailf-eval-${{ github.event.pull_request.number || github.ref }}\n cancel-in-progress: true\n\njobs:\n evaluate:\n name: AI Literacy Evaluation\n runs-on: ubuntu-latest\n permissions:\n contents: read\n pull-requests: write\n steps:\n - uses: actions/checkout@v4\n\n - name: Configure npm for @sanity scope\n run:\n echo \"//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}\" >>\n ~/.npmrc\n\n - name: Run evaluation\n id: eval\n env:\n AILF_API_KEY: ${{ secrets.AILF_API_KEY }}\n run: |\n npx @sanity/ailf@latest pipeline --remote \\\n --output /tmp/ailf-report.md \\\n ${{ inputs.debug_mode && '--debug' || '' }}\n\n - name: Post PR comment\n if: always() && github.event_name == 'pull_request'\n uses: actions/github-script@v7\n with:\n script: |\n const fs = require('fs');\n\n // --- Constants ---\n const MARKER = '<!-- ailf-score-report -->';\n const HISTORY_START = '<!-- ailf-score-history -->';\n const HISTORY_END = '<!-- /ailf-score-history -->';\n const MAX_HISTORY = 3; // keep at most 3 prior runs\n\n // --- Read new report ---\n let newReport;\n try {\n newReport = fs.readFileSync('/tmp/ailf-report.md', 'utf-8');\n } catch {\n newReport = `## \u26A0\uFE0F AI Literacy Evaluation\\n\\nNo report generated. Check the [workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}).`;\n }\n\n const prNumber = context.issue?.number || context.payload?.pull_request?.number;\n if (!prNumber) {\n console.log('No PR number found, skipping comment');\n return;\n }\n\n // --- Find existing comment ---\n const { data: comments } = await github.rest.issues.listComments({\n owner: context.repo.owner, repo: context.repo.repo,\n issue_number: prNumber,\n });\n const existing = comments.find(c => c.body?.includes(MARKER));\n\n // --- Build history from previous comment ---\n let historyEntries = [];\n if (existing) {\n const oldBody = existing.body || '';\n\n // Collect existing collapsed history entries\n const histStart = oldBody.indexOf(HISTORY_START);\n const histEnd = oldBody.indexOf(HISTORY_END);\n if (histStart !== -1 && histEnd !== -1) {\n const historyContent = oldBody.slice(histStart + HISTORY_START.length, histEnd).trim();\n // Split on </details> boundaries to get individual entries\n if (historyContent) {\n historyEntries = historyContent\n .split(/<\\/details>\\s*/)\n .map(s => s.trim())\n .filter(s => s.startsWith('<details>'))\n .map(s => s + '\\n</details>');\n }\n }\n\n // Extract the current report (will become the newest history entry)\n let previousReport = '';\n if (histStart !== -1) {\n // Report is between MARKER and the \"Previous runs\" heading (or history section)\n const markerIdx = oldBody.indexOf(MARKER);\n // Find the --- separator before history\n const separatorIdx = oldBody.lastIndexOf('---', histStart);\n const endIdx = separatorIdx > markerIdx ? separatorIdx : histStart;\n previousReport = oldBody.slice(markerIdx + MARKER.length, endIdx).trim();\n } else {\n // No history yet \u2014 everything after MARKER is the report\n const markerIdx = oldBody.indexOf(MARKER);\n if (markerIdx !== -1) {\n previousReport = oldBody.slice(markerIdx + MARKER.length).trim();\n }\n }\n\n // Collapse the previous report into a <details> entry\n if (previousReport) {\n const scoreMatch = previousReport.match(/Overall:\\s*(\\d+)\\/100/);\n const score = scoreMatch ? scoreMatch[1] : '?';\n const dateMatch = previousReport.match(/Generated by.*?\u00B7\\s*([^\u00B7<\\n*]+)/);\n const date = dateMatch\n ? dateMatch[1].trim()\n : new Date().toISOString().slice(0, 16).replace('T', ' ') + ' UTC';\n const entry = `<details>\\n<summary>\uD83D\uDCDC ${date} \u2014 ${score}/100</summary>\\n\\n${previousReport}\\n\\n</details>`;\n historyEntries.unshift(entry); // newest first\n }\n\n // Enforce max history limit\n historyEntries = historyEntries.slice(0, MAX_HISTORY);\n }\n\n // --- Assemble final comment ---\n const historySection = historyEntries.length > 0\n ? `\\n\\n---\\n\\n### \uD83D\uDCDC Previous runs\\n\\n${HISTORY_START}\\n${historyEntries.join('\\n\\n')}\\n${HISTORY_END}`\n : '';\n const finalBody = `${MARKER}\\n${newReport}${historySection}`;\n\n if (existing) {\n await github.rest.issues.updateComment({\n owner: context.repo.owner, repo: context.repo.repo,\n comment_id: existing.id, body: finalBody,\n });\n console.log(`Updated comment (${historyEntries.length} history entries)`);\n } else {\n await github.rest.issues.createComment({\n owner: context.repo.owner, repo: context.repo.repo,\n issue_number: prNumber, body: finalBody,\n });\n console.log('Created new PR comment');\n }\n\n - name: Summary\n if: always()\n run: |\n if [ -f /tmp/ailf-report.md ]; then\n cat /tmp/ailf-report.md >> \"$GITHUB_STEP_SUMMARY\"\n else\n echo \"## \u26A0\uFE0F AI Literacy Evaluation\" >> \"$GITHUB_STEP_SUMMARY\"\n echo \"\" >> \"$GITHUB_STEP_SUMMARY\"\n echo \"No report generated. Check the workflow logs.\" >> \"$GITHUB_STEP_SUMMARY\"\n fi\n";
304
388
  /** TypeScript project configuration template (ailf.config.ts) */
305
389
  export declare const ailfConfigTs = "/**\n * .ailf/ailf.config.ts \u2014 AI Literacy Framework project configuration.\n *\n * This file configures how the AILF evaluation pipeline runs in this\n * repository. Place it at .ailf/ailf.config.ts in your project root.\n *\n * Evaluations are submitted to the AILF API (ailf-api.sanity.build).\n * The API handles LLM calls, doc fetching, grading, and report\n * publishing. Your repo only needs one secret: AILF_API_KEY.\n *\n * Docs: https://github.com/sanity-labs/ai-literacy-framework\n */\n\nexport default {\n /**\n * Documentation source \u2014 which docs are being evaluated.\n *\n * This tells the pipeline which Sanity project and dataset contain\n * the documentation under test. For most users, this is Sanity's own\n * docs project.\n */\n source: {\n /** Sanity project ID (find yours at sanity.io/manage) */\n projectId: \"3do82whm\",\n /** The dataset to query (e.g., \"production\", \"next\") */\n dataset: \"next\",\n /**\n * The public URL of your documentation site.\n * Used by agentic mode to test agent discoverability.\n */\n baseUrl: \"https://www.sanity.io/docs\",\n },\n\n /**\n * Trigger configuration \u2014 when evaluations run automatically.\n *\n * Each key is a trigger context. The pipeline checks which trigger\n * matches the current execution context (PR, merge, schedule, etc.)\n * and applies its settings.\n *\n * Mode options:\n * \"validate-only\" \u2014 check that task files parse correctly (fast, no LLM calls)\n * \"eval\" \u2014 run the full evaluation pipeline\n */\n triggers: {\n /** On pull requests: just validate task files parse correctly. */\n pr: {\n mode: \"validate-only\",\n },\n\n /** When .ailf/ files change in a PR: run a real evaluation. */\n \"pr-task-change\": {\n mode: \"eval\",\n paths: [\".ailf/**\"],\n },\n\n /** On merge to main: run evaluation (non-blocking). */\n main: {\n mode: \"eval\",\n blocking: false,\n notify: true,\n },\n },\n}\n";
306
- /** TypeScript task template for example-groq-blog-listing */
307
- export declare const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n * https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // \"literacy\" tests whether AI coding tools can implement features\n // using your docs as context. Other modes: \"mcp-server\",\n // \"knowledge-probe\", \"agent-harness\", \"custom\".\n mode: \"literacy\",\n\n // \u2500\u2500 Identity \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Unique identifier \u2014 lowercase alphanumeric with hyphens.\n // Must be unique across all task files in .ailf/tasks/.\n id: \"example-groq-blog-listing\",\n title: \"Blog listing with GROQ queries\",\n description: \"Example \u2014 tests GROQ blog listing implementation\",\n\n // Feature area this task belongs to. Tasks with the same area are\n // grouped together in score summaries.\n area: \"groq\",\n\n // \u2500\u2500 Documentation context \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Canonical doc references for this task. The pipeline fetches these\n // from Sanity and injects them into the prompt for baseline evaluation.\n //\n // This example uses slug-based references \u2014 the simplest form.\n // See the other example tasks for path, id, and perspective references.\n context: {\n docs: [\n {\n slug: \"groq-introduction\",\n reason: \"Core GROQ syntax and query language reference\",\n },\n {\n slug: \"how-queries-work\",\n reason: \"Query execution model and best practices\",\n },\n ],\n },\n\n // When true, the pipeline auto-generates an additional rubric that\n // checks whether the LLM's response actually used the provided docs.\n docCoverage: true,\n\n // Path to a gold-standard implementation, relative to canonical/.\n // The grader uses this as a reference when scoring code correctness.\n referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // prompt.text \u2014 the implementation prompt given to the LLM.\n // Write this as if you're asking a developer to build the feature.\n // Be specific about requirements so the grader can evaluate clearly.\n prompt: {\n text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.`,\n },\n\n // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Grading assertions \u2014 how the LLM's response is scored.\n //\n // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n // The \"template\" references a rubric template (e.g. task-completion).\n //\n // Available templates:\n // task-completion \u2014 did the LLM implement the feature? (weight: 0.50)\n // code-correctness \u2014 is the code idiomatic and correct? (weight: 0.25)\n //\n // You can also use value-based assertions:\n // { type: \"contains\", value: \"client.fetch\" }\n // { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Uses the groq tagged template literal\",\n \"Fetches blog posts with title, slug, and publishedAt fields\",\n \"Orders results by publishedAt in descending order\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses createClient from @sanity/client or next-sanity\",\n \"Exports a valid Next.js page component\",\n ],\n },\n ],\n\n // \u2500\u2500 Baseline variant \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // enabled \u2014 set to false to skip this task entirely\n // rubric \u2014 \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline: {\n enabled: true,\n rubric: \"full\",\n },\n\n // Example tasks ship as drafts so they don't run in production evals.\n // Change to \"active\" (or remove this field) to activate.\n status: \"draft\",\n})\n";
308
- /** TypeScript task template for example-id-based-ref */
309
- export declare const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n * - Draft documents that don't have a stable slug yet\n * - Programmatic references from imports or migrations\n * - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations \u2014 these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-id-based-ref\",\n title: \"GROQ feature support (ID-based doc references)\",\n description: \"Example \u2014 demonstrates ID-based canonical doc references\",\n\n area: \"groq\",\n\n // ID-based canonical doc references.\n //\n // Use the Sanity document _id to reference articles directly.\n // Optional slug/path annotations help humans reading the file\n // but are NOT used for resolution \u2014 only the `id` field matters.\n //\n // These IDs reference real articles in the Sanity docs (next dataset):\n // 0ba88f1b... = \"GROQ feature support across Sanity\"\n // 5b9c2863... = \"Custom GROQ functions\"\n context: {\n docs: [\n {\n id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n slug: \"groq-feature-support-by-context\", // annotation only\n reason: \"GROQ feature support across different Sanity contexts\",\n },\n {\n id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n slug: \"custom-groq-functions\", // annotation only\n reason: \"Custom GROQ functions and pipelines\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains GROQ availability across different Sanity contexts\",\n \"Describes custom GROQ function creation and usage\",\n \"Notes differences in GROQ support between contexts\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"GROQ examples use valid syntax\",\n \"Custom function examples follow the correct API pattern\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
310
- /** TypeScript task template for example-path-based-ref */
311
- export declare const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n * - Simple: \"webhooks\" \u2192 resolves by slug lookup\n * - Sectioned: \"content-lake/webhooks\" \u2192 disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-path-based-ref\",\n title: \"GROQ mutations (path-based doc references)\",\n description: \"Example \u2014 demonstrates path-based canonical doc references\",\n\n area: \"groq\",\n\n // Path-based canonical doc references.\n //\n // Use \"section/slug\" format to uniquely identify articles:\n // - \"content-lake/mutations-introduction\" \u2192 the mutations article\n // - \"content-lake/documents\" \u2192 the documents article in Content Lake\n // (not the CLI \"documents\" article in cli-reference section)\n //\n // The \"documents\" slug exists in two sections \u2014 this is exactly why\n // path-based references are preferred over slug-based references.\n context: {\n docs: [\n {\n path: \"content-lake/mutations-introduction\",\n reason: \"Introduction to document mutations in the Content Lake\",\n },\n {\n path: \"content-lake/documents\",\n reason:\n \"Document structure and types (Content Lake, not CLI reference)\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Explains create, createOrReplace, patch, and delete mutations\",\n \"Describes required document fields (_id, _type)\",\n \"Shows patch operations for field-level updates\",\n \"Includes practical code examples\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses correct @sanity/client mutation API\",\n \"Patch operations use valid set/unset/inc syntax\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
312
- /** TypeScript task template for example-perspective-ref */
313
- export declare const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n * - A perspective ref is one-to-many: the doc fetcher queries the\n * named release and expands it to ALL articles versioned within it.\n * - Downstream consumers see the same flat DocContext[] regardless\n * of how docs were resolved.\n * - When the release is published, the perspective entry becomes a\n * no-op (articles are now in published). Migrate to explicit path\n * or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-perspective-ref\",\n title:\n \"GROQ features from content release (perspective-based doc references)\",\n description:\n \"Example \u2014 demonstrates perspective-based canonical doc references\",\n\n area: \"groq\",\n\n // Perspective-based canonical doc reference.\n //\n // The perspective ID references a content release in the Sanity\n // Content Lake. At evaluation time, the doc fetcher auto-discovers\n // all articles versioned in this release and includes them as\n // canonical documentation context.\n //\n // Release rE9TSJvR4 contains:\n // - \"GROQ-powered webhooks\" (webhooks)\n // - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n // - \"GROQ joins\" (groq-joins)\n //\n // You can combine perspective refs with explicit slug/path/id refs\n // to include foundational published docs alongside release content.\n // Here we add groq-data-types as a complementary published reference.\n context: {\n docs: [\n {\n perspective: \"rE9TSJvR4\",\n reason: \"All GROQ documentation updates in the test content release\",\n },\n {\n slug: \"groq-data-types\",\n reason: \"GROQ data type reference (published, stable)\",\n },\n ],\n },\n\n docCoverage: true,\n\n prompt: {\n text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Demonstrates GROQ join syntax for cross-document queries\",\n \"Shows GROQ filter patterns for webhook configuration\",\n \"Includes practical query examples from cheat sheet patterns\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"All GROQ queries use valid syntax\",\n \"Reference joins use correct dereference operator (->)\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
314
- /** TypeScript task template for example-studio-custom-input */
315
- export declare const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n mode: \"literacy\",\n id: \"example-studio-custom-input\",\n title: \"Custom input component in Sanity Studio\",\n description: \"Example \u2014 tests Studio custom input implementation\",\n\n area: \"studio\",\n\n context: {\n docs: [\n {\n slug: \"custom-input-widgets\",\n reason: \"Guide for building custom form inputs in Sanity Studio\",\n },\n {\n slug: \"form-components\",\n reason: \"Form component API and customization patterns\",\n },\n ],\n },\n\n docCoverage: true,\n referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n prompt: {\n text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n },\n\n assertions: [\n {\n type: \"llm-rubric\",\n template: \"task-completion\",\n criteria: [\n \"Implements a React component that renders a text input\",\n \"Displays a live character count\",\n \"Reads maxLength from schema options\",\n \"Shows a visual warning when limit is exceeded\",\n ],\n },\n {\n type: \"llm-rubric\",\n template: \"code-correctness\",\n criteria: [\n \"Uses the Sanity UI library for styling\",\n \"Calls onChange with patch operations\",\n ],\n },\n ],\n\n baseline: { enabled: true, rubric: \"full\" },\n status: \"draft\",\n})\n";
316
- /** Map of task ID (filename stem) → raw TypeScript source */
317
- export declare const taskTsFiles: Record<string, string>;
318
- /** List of TS task file stems, in alphabetical order */
319
- export declare const TASK_TS_FILE_NAMES: readonly ["example-groq-blog-listing", "example-id-based-ref", "example-path-based-ref", "example-perspective-ref", "example-studio-custom-input"];