@sanity/ailf 2.0.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (442)
  1. package/canonical/grader-references/README.md +2 -2
  2. package/canonical/reference-solutions/content-lake/mutations.ts +160 -0
  3. package/canonical/reference-solutions/content-lake/realtime.ts +187 -0
  4. package/canonical/reference-solutions/image-handling/asset-pipeline.tsx +166 -0
  5. package/canonical/reference-solutions/portable-text/custom-blocks.ts +204 -0
  6. package/canonical/reference-solutions/portable-text/rendering.tsx +163 -0
  7. package/config/features.ts +1 -1
  8. package/config/models.ts +28 -23
  9. package/config/sources.ts +1 -1
  10. package/config/thresholds.ts +1 -1
  11. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.d.ts +10 -0
  12. package/dist/_vendor/ailf-core/__tests__/comparison-formatters.test.js +185 -0
  13. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.d.ts +6 -0
  14. package/dist/_vendor/ailf-core/artifact-capture/__tests__/noop-collector.test.js +42 -0
  15. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +14 -0
  16. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +25 -0
  17. package/dist/_vendor/ailf-core/config-helpers.d.ts +6 -0
  18. package/dist/_vendor/ailf-core/config-helpers.js +29 -0
  19. package/dist/_vendor/ailf-core/examples/index.d.ts +164 -94
  20. package/dist/_vendor/ailf-core/examples/index.js +208 -114
  21. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  22. package/dist/_vendor/ailf-core/index.js +1 -0
  23. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +94 -0
  24. package/dist/_vendor/ailf-core/ports/artifact-collector.js +13 -0
  25. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +138 -0
  26. package/dist/_vendor/ailf-core/ports/capture-comparator.js +10 -0
  27. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -1
  28. package/dist/_vendor/ailf-core/ports/eval-runner.d.ts +6 -0
  29. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  30. package/dist/_vendor/ailf-core/ports/pipeline-step.d.ts +11 -0
  31. package/dist/_vendor/ailf-core/ports/task-source.d.ts +3 -3
  32. package/dist/_vendor/ailf-core/ports/task-source.js +3 -3
  33. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +6 -1
  34. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +14 -2
  35. package/dist/_vendor/ailf-core/services/config-helpers.d.ts +16 -1
  36. package/dist/_vendor/ailf-core/services/config-helpers.js +21 -0
  37. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  38. package/dist/_vendor/ailf-core/services/index.js +1 -1
  39. package/dist/_vendor/ailf-core/services/scoring.js +9 -0
  40. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +12 -1
  41. package/dist/_vendor/ailf-core/types/generalized-task.js +1 -1
  42. package/dist/_vendor/ailf-core/types/index.d.ts +47 -4
  43. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +27 -0
  44. package/dist/_vendor/ailf-shared/eval-modes.d.ts +15 -0
  45. package/dist/_vendor/ailf-shared/eval-modes.js +18 -0
  46. package/dist/_vendor/ailf-tasks/cli.d.ts +8 -0
  47. package/dist/_vendor/ailf-tasks/cli.js +61 -0
  48. package/dist/_vendor/ailf-tasks/index.d.ts +13 -0
  49. package/dist/_vendor/ailf-tasks/index.js +16 -0
  50. package/dist/_vendor/ailf-tasks/parser.d.ts +27 -0
  51. package/dist/_vendor/ailf-tasks/parser.js +73 -0
  52. package/dist/_vendor/ailf-tasks/schemas.d.ts +198 -0
  53. package/dist/_vendor/ailf-tasks/schemas.js +180 -0
  54. package/dist/_vendor/ailf-tasks/validation.d.ts +47 -0
  55. package/dist/_vendor/ailf-tasks/validation.js +162 -0
  56. package/dist/adapters/api-client/remediation.js +2 -2
  57. package/dist/adapters/config-sources/file-config-adapter.js +6 -1
  58. package/dist/adapters/eval-runners/promptfoo-eval-adapter.js +8 -2
  59. package/dist/adapters/index.d.ts +0 -1
  60. package/dist/adapters/index.js +0 -1
  61. package/dist/adapters/task-sources/composite-task-source.d.ts +1 -1
  62. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  63. package/dist/adapters/task-sources/content-lake-task-source.d.ts +4 -6
  64. package/dist/adapters/task-sources/content-lake-task-source.js +4 -6
  65. package/dist/adapters/task-sources/index.d.ts +1 -2
  66. package/dist/adapters/task-sources/index.js +1 -2
  67. package/dist/adapters/task-sources/repo-schemas.d.ts +1 -1
  68. package/dist/adapters/task-sources/repo-schemas.js +2 -2
  69. package/dist/adapters/task-sources/repo-task-source.js +1 -1
  70. package/dist/adapters/task-sources/repo-trigger.d.ts +1 -1
  71. package/dist/adapters/task-sources/repo-trigger.js +1 -1
  72. package/dist/adapters/task-sources/task-file-loader.d.ts +9 -6
  73. package/dist/adapters/task-sources/task-file-loader.js +20 -6
  74. package/dist/agent-observer/test-imports.d.ts +7 -0
  75. package/dist/agent-observer/test-imports.js +185 -0
  76. package/dist/artifact-capture/comparator.d.ts +22 -0
  77. package/dist/artifact-capture/comparator.js +493 -0
  78. package/dist/artifact-capture/filesystem-collector.d.ts +42 -0
  79. package/dist/artifact-capture/filesystem-collector.js +237 -0
  80. package/dist/artifact-capture/redact-artifact.d.ts +20 -0
  81. package/dist/artifact-capture/redact-artifact.js +115 -0
  82. package/dist/assertions/source-isolation.d.ts +1 -1
  83. package/dist/assertions/source-isolation.js +1 -1
  84. package/dist/cli.js +4 -0
  85. package/dist/commands/calculate-scores.js +1 -0
  86. package/dist/commands/capture-compare.d.ts +15 -0
  87. package/dist/commands/capture-compare.js +253 -0
  88. package/dist/commands/capture-list.d.ts +12 -0
  89. package/dist/commands/capture-list.js +147 -0
  90. package/dist/commands/capture.d.ts +9 -0
  91. package/dist/commands/capture.js +16 -0
  92. package/dist/commands/chronic-failures.d.ts +8 -0
  93. package/dist/commands/chronic-failures.js +33 -0
  94. package/dist/commands/explain-handler.d.ts +1 -1
  95. package/dist/commands/explain-handler.js +37 -8
  96. package/dist/commands/fetch-docs.js +1 -0
  97. package/dist/commands/generate-configs.d.ts +3 -3
  98. package/dist/commands/generate-configs.js +20 -8
  99. package/dist/commands/init.d.ts +2 -3
  100. package/dist/commands/init.js +56 -170
  101. package/dist/commands/pipeline-action.d.ts +7 -1
  102. package/dist/commands/pipeline-action.js +43 -19
  103. package/dist/commands/pipeline.d.ts +6 -1
  104. package/dist/commands/pipeline.js +7 -2
  105. package/dist/commands/pr-comment.js +1 -0
  106. package/dist/commands/publish.js +1 -0
  107. package/dist/commands/shared/help.js +2 -2
  108. package/dist/commands/update-quality-scores.d.ts +5 -0
  109. package/dist/commands/update-quality-scores.js +20 -0
  110. package/dist/composition-root.d.ts +2 -3
  111. package/dist/composition-root.js +27 -14
  112. package/dist/config/features.ts +23 -0
  113. package/dist/config/models.ts +100 -0
  114. package/dist/config/prompts.ts +16 -0
  115. package/dist/config/rubrics.ts +225 -0
  116. package/dist/config/schedules.ts +47 -0
  117. package/dist/config/sinks.ts +37 -0
  118. package/dist/config/sources.ts +21 -0
  119. package/dist/config/thresholds.ts +61 -0
  120. package/dist/lib/agent-behavior-report.d.ts +8 -0
  121. package/dist/lib/agent-behavior-report.js +185 -0
  122. package/dist/lib/baseline.d.ts +19 -0
  123. package/dist/lib/baseline.js +153 -0
  124. package/dist/lib/calculate-scores.d.ts +23 -0
  125. package/dist/lib/calculate-scores.js +42 -0
  126. package/dist/lib/compare.d.ts +18 -0
  127. package/dist/lib/compare.js +170 -0
  128. package/dist/lib/coverage-audit.d.ts +4 -0
  129. package/dist/lib/coverage-audit.js +42 -0
  130. package/dist/lib/discovery-report.d.ts +13 -0
  131. package/dist/lib/discovery-report.js +57 -0
  132. package/dist/lib/fetch-docs.d.ts +30 -0
  133. package/dist/lib/fetch-docs.js +171 -0
  134. package/dist/lib/generate-configs.d.ts +25 -0
  135. package/dist/lib/generate-configs.js +42 -0
  136. package/dist/lib/grader-api.d.ts +21 -0
  137. package/dist/lib/grader-api.js +34 -0
  138. package/dist/lib/grader-compare.d.ts +19 -0
  139. package/dist/lib/grader-compare.js +91 -0
  140. package/dist/lib/grader-consistency.d.ts +27 -0
  141. package/dist/lib/grader-consistency.js +79 -0
  142. package/dist/lib/grader-sensitivity.d.ts +19 -0
  143. package/dist/lib/grader-sensitivity.js +75 -0
  144. package/dist/lib/grader-validate.d.ts +19 -0
  145. package/dist/lib/grader-validate.js +78 -0
  146. package/dist/lib/measure-retrieval.d.ts +14 -0
  147. package/dist/lib/measure-retrieval.js +71 -0
  148. package/dist/lib/pr-comment.d.ts +16 -0
  149. package/dist/lib/pr-comment.js +28 -0
  150. package/dist/lib/readiness-report.d.ts +13 -0
  151. package/dist/lib/readiness-report.js +108 -0
  152. package/dist/lib/webhook-server.d.ts +11 -0
  153. package/dist/lib/webhook-server.js +24 -0
  154. package/dist/lib/weekly-digest.d.ts +24 -0
  155. package/dist/lib/weekly-digest.js +148 -0
  156. package/dist/orchestration/build-app-context.js +13 -0
  157. package/dist/orchestration/cache-context.d.ts +23 -0
  158. package/dist/orchestration/cache-context.js +43 -0
  159. package/dist/orchestration/env-bridge.d.ts +21 -0
  160. package/dist/orchestration/env-bridge.js +66 -0
  161. package/dist/orchestration/load-pipeline-tasks.d.ts +34 -0
  162. package/dist/orchestration/load-pipeline-tasks.js +52 -0
  163. package/dist/orchestration/pipeline-orchestrator.js +75 -5
  164. package/dist/orchestration/step-runner.js +5 -1
  165. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -0
  166. package/dist/orchestration/steps/calculate-scores-step.js +13 -0
  167. package/dist/orchestration/steps/callback-step.js +10 -1
  168. package/dist/orchestration/steps/compare-step.js +6 -3
  169. package/dist/orchestration/steps/discovery-report-step.js +6 -2
  170. package/dist/orchestration/steps/fetch-docs-shell.d.ts +17 -0
  171. package/dist/orchestration/steps/fetch-docs-shell.js +30 -0
  172. package/dist/orchestration/steps/fetch-docs-step.d.ts +1 -0
  173. package/dist/orchestration/steps/fetch-docs-step.js +30 -16
  174. package/dist/orchestration/steps/gap-analysis-step.js +13 -2
  175. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -0
  176. package/dist/orchestration/steps/generate-configs-step.js +50 -15
  177. package/dist/orchestration/steps/mirror-repo-tasks-step.d.ts +1 -1
  178. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  179. package/dist/orchestration/steps/publish-report-step.js +19 -0
  180. package/dist/orchestration/steps/readiness-step.js +8 -3
  181. package/dist/orchestration/steps/report-step.js +17 -4
  182. package/dist/orchestration/steps/run-eval-step.d.ts +1 -0
  183. package/dist/orchestration/steps/run-eval-step.js +51 -31
  184. package/dist/pipeline/agent-behavior-report.js +6 -0
  185. package/dist/pipeline/attribution.d.ts +1 -1
  186. package/dist/pipeline/attribution.js +1 -1
  187. package/dist/pipeline/cache.js +29 -15
  188. package/dist/pipeline/calculate-scores.d.ts +2 -0
  189. package/dist/pipeline/calculate-scores.js +70 -33
  190. package/dist/pipeline/chronic-failures.d.ts +55 -0
  191. package/dist/pipeline/chronic-failures.js +110 -0
  192. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +33 -0
  193. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +2 -3
  194. package/dist/pipeline/compiler/__tests__/task-bridge.test.d.ts +9 -0
  195. package/dist/pipeline/compiler/__tests__/task-bridge.test.js +339 -0
  196. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.d.ts +10 -0
  197. package/dist/pipeline/compiler/__tests__/tool-loop-openai.test.js +509 -0
  198. package/dist/pipeline/compiler/assertion-mapper.d.ts +1 -1
  199. package/dist/pipeline/compiler/assertion-mapper.js +1 -1
  200. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +2 -7
  201. package/dist/pipeline/compiler/compiler-to-yaml.js +2 -7
  202. package/dist/pipeline/compiler/config-loader.d.ts +14 -0
  203. package/dist/pipeline/compiler/config-loader.js +42 -2
  204. package/dist/pipeline/compiler/fixture-resolver.d.ts +1 -1
  205. package/dist/pipeline/compiler/fixture-resolver.js +1 -1
  206. package/dist/pipeline/compiler/ignore-fields.d.ts +1 -1
  207. package/dist/pipeline/compiler/ignore-fields.js +1 -1
  208. package/dist/pipeline/compiler/index.d.ts +2 -5
  209. package/dist/pipeline/compiler/index.js +2 -5
  210. package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
  211. package/dist/pipeline/compiler/literacy-bridge.js +1 -1
  212. package/dist/pipeline/compiler/mode-bases/agent-harness.d.ts +1 -1
  213. package/dist/pipeline/compiler/mode-bases/agent-harness.js +1 -1
  214. package/dist/pipeline/compiler/mode-bases/knowledge-probe.d.ts +1 -1
  215. package/dist/pipeline/compiler/mode-bases/knowledge-probe.js +1 -1
  216. package/dist/pipeline/compiler/mode-bases/literacy.d.ts +13 -2
  217. package/dist/pipeline/compiler/mode-bases/literacy.js +55 -1
  218. package/dist/pipeline/compiler/mode-bases/mcp-server.d.ts +1 -1
  219. package/dist/pipeline/compiler/mode-bases/mcp-server.js +1 -1
  220. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.d.ts +1 -1
  221. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -1
  222. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  223. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  224. package/dist/pipeline/compiler/mode-handlers/index.d.ts +2 -2
  225. package/dist/pipeline/compiler/mode-handlers/index.js +2 -2
  226. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.d.ts +1 -1
  227. package/dist/pipeline/compiler/mode-handlers/knowledge-probe/index.js +1 -1
  228. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  229. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  230. package/dist/pipeline/compiler/mode-handlers/literacy/index.d.ts +1 -1
  231. package/dist/pipeline/compiler/mode-handlers/literacy/index.js +1 -1
  232. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  233. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  234. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  235. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +334 -0
  236. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.d.ts +1 -1
  237. package/dist/pipeline/compiler/mode-handlers/mcp-server/assertions.js +1 -1
  238. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.d.ts +1 -1
  239. package/dist/pipeline/compiler/mode-handlers/mcp-server/index.js +1 -1
  240. package/dist/pipeline/compiler/mode-handlers/mcp-server/provider-config.js +4 -0
  241. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +69 -0
  242. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +307 -0
  243. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/index.js +22 -5
  244. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/mcp-connection.js +6 -0
  245. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.d.ts +10 -5
  246. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/tool-loop-openai.js +314 -7
  247. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider/types.d.ts +10 -0
  248. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.d.ts +65 -0
  249. package/dist/pipeline/compiler/mode-handlers/mcp-tool-provider.js +368 -0
  250. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +1 -1
  251. package/dist/pipeline/compiler/presets/sanity-literacy.js +1 -1
  252. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +1 -4
  253. package/dist/pipeline/compiler/promptfoo-compiler.js +3 -12
  254. package/dist/pipeline/compiler/provider-assembler.js +13 -7
  255. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +1 -1
  256. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +1 -1
  257. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +1 -1
  258. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +1 -1
  259. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +1 -1
  260. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +1 -1
  261. package/dist/pipeline/compiler/sandbox/index.d.ts +1 -1
  262. package/dist/pipeline/compiler/sandbox/index.js +1 -1
  263. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +1 -1
  264. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +1 -1
  265. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +1 -1
  266. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +1 -1
  267. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +1 -1
  268. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +1 -1
  269. package/dist/pipeline/compiler/scoring-bridge.d.ts +1 -1
  270. package/dist/pipeline/compiler/scoring-bridge.js +1 -1
  271. package/dist/pipeline/compiler/task-bridge.d.ts +41 -0
  272. package/dist/pipeline/compiler/task-bridge.js +92 -0
  273. package/dist/pipeline/compiler/task-graph-builder.d.ts +1 -4
  274. package/dist/pipeline/compiler/task-graph-builder.js +1 -4
  275. package/dist/pipeline/compiler/telemetry/index.d.ts +1 -1
  276. package/dist/pipeline/compiler/telemetry/index.js +1 -1
  277. package/dist/pipeline/compiler/variable-resolver.d.ts +1 -1
  278. package/dist/pipeline/compiler/variable-resolver.js +1 -1
  279. package/dist/pipeline/coverage-audit.d.ts +1 -1
  280. package/dist/pipeline/coverage-audit.js +1 -1
  281. package/dist/pipeline/degradations.d.ts +1 -1
  282. package/dist/pipeline/degradations.js +1 -1
  283. package/dist/pipeline/failure-modes.d.ts +1 -1
  284. package/dist/pipeline/failure-modes.js +13 -1
  285. package/dist/pipeline/gap-analysis.d.ts +1 -1
  286. package/dist/pipeline/gap-analysis.js +3 -1
  287. package/dist/pipeline/generate-configs.d.ts +2 -2
  288. package/dist/pipeline/generate-configs.js +15 -8
  289. package/dist/pipeline/grader-compare-runner.d.ts +1 -1
  290. package/dist/pipeline/grader-compare-runner.js +7 -1
  291. package/dist/pipeline/grader-comparison.d.ts +1 -1
  292. package/dist/pipeline/grader-comparison.js +1 -1
  293. package/dist/pipeline/grader-consistency-runner.d.ts +1 -1
  294. package/dist/pipeline/grader-consistency-runner.js +7 -1
  295. package/dist/pipeline/grader-consistency.d.ts +1 -1
  296. package/dist/pipeline/grader-consistency.js +1 -1
  297. package/dist/pipeline/grader-sensitivity-runner.d.ts +1 -1
  298. package/dist/pipeline/grader-sensitivity-runner.js +1 -1
  299. package/dist/pipeline/grader-sensitivity.d.ts +1 -1
  300. package/dist/pipeline/grader-sensitivity.js +1 -1
  301. package/dist/pipeline/grader-validate-runner.d.ts +1 -1
  302. package/dist/pipeline/grader-validate-runner.js +2 -2
  303. package/dist/pipeline/grader-validation.d.ts +1 -1
  304. package/dist/pipeline/grader-validation.js +1 -1
  305. package/dist/pipeline/map-request-to-config.js +15 -2
  306. package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
  307. package/dist/pipeline/mirror-repo-tasks.js +1 -1
  308. package/dist/pipeline/plan-format.d.ts +1 -1
  309. package/dist/pipeline/plan-format.js +1 -1
  310. package/dist/pipeline/plan.d.ts +1 -1
  311. package/dist/pipeline/plan.js +67 -29
  312. package/dist/pipeline/probe.d.ts +1 -1
  313. package/dist/pipeline/probe.js +1 -1
  314. package/dist/pipeline/readiness-report.d.ts +2 -2
  315. package/dist/pipeline/readiness-report.js +2 -2
  316. package/dist/pipeline/release-classification.d.ts +1 -1
  317. package/dist/pipeline/release-classification.js +1 -1
  318. package/dist/pipeline/release-report.d.ts +1 -1
  319. package/dist/pipeline/release-report.js +1 -1
  320. package/dist/pipeline/repo-eval-comment.d.ts +1 -1
  321. package/dist/pipeline/repo-eval-comment.js +1 -1
  322. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  323. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  324. package/dist/pipeline/resolve-mappings.d.ts +6 -6
  325. package/dist/pipeline/resolve-mappings.js +44 -44
  326. package/dist/pipeline/retrieval-metrics.d.ts +3 -3
  327. package/dist/pipeline/retrieval-metrics.js +28 -20
  328. package/dist/pipeline/steps/calculate-scores-step.d.ts +11 -0
  329. package/dist/pipeline/steps/calculate-scores-step.js +89 -0
  330. package/dist/pipeline/steps/compare-step.d.ts +18 -0
  331. package/dist/pipeline/steps/compare-step.js +90 -0
  332. package/dist/pipeline/steps/eval-step.d.ts +53 -0
  333. package/dist/pipeline/steps/eval-step.js +347 -0
  334. package/dist/pipeline/steps/fetch-docs-step.d.ts +11 -0
  335. package/dist/pipeline/steps/fetch-docs-step.js +84 -0
  336. package/dist/pipeline/steps/generate-configs-step.d.ts +11 -0
  337. package/dist/pipeline/steps/generate-configs-step.js +98 -0
  338. package/dist/pipeline/steps/grader-consistency-step.d.ts +21 -0
  339. package/dist/pipeline/steps/grader-consistency-step.js +74 -0
  340. package/dist/pipeline/steps/publish-report-step.d.ts +57 -0
  341. package/dist/pipeline/steps/publish-report-step.js +243 -0
  342. package/dist/pipeline/steps/report-step.d.ts +13 -0
  343. package/dist/pipeline/steps/report-step.js +56 -0
  344. package/dist/pipeline/steps/update-scores-step.d.ts +11 -0
  345. package/dist/pipeline/steps/update-scores-step.js +42 -0
  346. package/dist/pipeline/targeted-loo.d.ts +1 -1
  347. package/dist/pipeline/targeted-loo.js +1 -1
  348. package/dist/pipeline/thresholds.d.ts +1 -1
  349. package/dist/pipeline/thresholds.js +1 -1
  350. package/dist/pipeline/validate.js +13 -0
  351. package/dist/report-store.d.ts +17 -0
  352. package/dist/report-store.js +24 -0
  353. package/dist/scripts/agent-behavior-report.d.ts +19 -0
  354. package/dist/scripts/agent-behavior-report.js +315 -0
  355. package/dist/scripts/baseline.d.ts +43 -0
  356. package/dist/scripts/baseline.js +267 -0
  357. package/dist/scripts/calculate-scores.d.ts +166 -0
  358. package/dist/scripts/calculate-scores.js +1296 -0
  359. package/dist/scripts/compare.d.ts +22 -0
  360. package/dist/scripts/compare.js +334 -0
  361. package/dist/scripts/coverage-audit.d.ts +44 -0
  362. package/dist/scripts/coverage-audit.js +209 -0
  363. package/dist/scripts/debug-eval.d.ts +19 -0
  364. package/dist/scripts/debug-eval.js +73 -0
  365. package/dist/scripts/discovery-report.d.ts +58 -0
  366. package/dist/scripts/discovery-report.js +250 -0
  367. package/dist/scripts/fetch-docs.d.ts +35 -0
  368. package/dist/scripts/fetch-docs.js +472 -0
  369. package/dist/scripts/generate-configs.d.ts +66 -0
  370. package/dist/scripts/generate-configs.js +459 -0
  371. package/dist/scripts/grader-api.d.ts +27 -0
  372. package/dist/scripts/grader-api.js +206 -0
  373. package/dist/scripts/grader-compare.d.ts +22 -0
  374. package/dist/scripts/grader-compare.js +368 -0
  375. package/dist/scripts/grader-consistency.d.ts +20 -0
  376. package/dist/scripts/grader-consistency.js +313 -0
  377. package/dist/scripts/grader-sensitivity.d.ts +22 -0
  378. package/dist/scripts/grader-sensitivity.js +354 -0
  379. package/dist/scripts/grader-validate.d.ts +19 -0
  380. package/dist/scripts/grader-validate.js +267 -0
  381. package/dist/scripts/measure-retrieval.d.ts +10 -0
  382. package/dist/scripts/measure-retrieval.js +145 -0
  383. package/dist/scripts/migrate-task-mode.d.ts +1 -1
  384. package/dist/scripts/migrate-task-mode.js +1 -1
  385. package/dist/scripts/migrate-tasks-to-content-lake.d.ts +1 -1
  386. package/dist/scripts/migrate-tasks-to-content-lake.js +1 -1
  387. package/dist/scripts/pipeline.d.ts +76 -0
  388. package/dist/scripts/pipeline.js +1031 -0
  389. package/dist/scripts/pr-comment.d.ts +10 -0
  390. package/dist/scripts/pr-comment.js +510 -0
  391. package/dist/scripts/readiness-report.d.ts +88 -0
  392. package/dist/scripts/readiness-report.js +342 -0
  393. package/dist/scripts/update-quality-scores.d.ts +15 -0
  394. package/dist/scripts/update-quality-scores.js +184 -0
  395. package/dist/scripts/validate-task-sources.d.ts +1 -1
  396. package/dist/scripts/validate-task-sources.js +1 -1
  397. package/dist/scripts/validate.d.ts +13 -0
  398. package/dist/scripts/validate.js +79 -0
  399. package/dist/scripts/webhook-server.d.ts +26 -0
  400. package/dist/scripts/webhook-server.js +147 -0
  401. package/dist/scripts/weekly-digest.d.ts +24 -0
  402. package/dist/scripts/weekly-digest.js +144 -0
  403. package/dist/sinks/format-slack.d.ts +64 -0
  404. package/dist/sinks/format-slack.js +306 -0
  405. package/dist/sinks/slack-sink.d.ts +27 -0
  406. package/dist/sinks/slack-sink.js +78 -0
  407. package/dist/sinks/types.d.ts +1 -1
  408. package/dist/sinks/types.js +1 -1
  409. package/dist/sinks/webhook-sink.d.ts +19 -0
  410. package/dist/sinks/webhook-sink.js +50 -0
  411. package/dist/tasks/knowledge-probe/define-type-api.task.ts +66 -0
  412. package/dist/tasks/knowledge-probe/groq-projections.task.ts +62 -0
  413. package/dist/tasks/literacy/content-lake.task.ts +181 -0
  414. package/dist/tasks/literacy/frameworks.task.ts +129 -0
  415. package/dist/tasks/literacy/functions.task.ts +70 -0
  416. package/dist/tasks/literacy/groq.task.ts +259 -0
  417. package/dist/tasks/literacy/image-handling.task.ts +95 -0
  418. package/dist/tasks/literacy/nextjs-live.task.ts +76 -0
  419. package/dist/tasks/literacy/portable-text.task.ts +169 -0
  420. package/dist/tasks/literacy/studio-setup.task.ts +134 -0
  421. package/dist/tasks/literacy/visual-editing.task.ts +147 -0
  422. package/package.json +24 -24
  423. package/tasks/.expanded.agentic.yaml +280 -0
  424. package/tasks/.expanded.yaml +565 -0
  425. package/tasks/knowledge-probe/define-type-api.task.ts +11 -0
  426. package/tasks/knowledge-probe/groq-projections.task.ts +3 -0
  427. package/tasks/literacy/content-lake.task.ts +181 -0
  428. package/tasks/literacy/frameworks.task.ts +1 -0
  429. package/tasks/literacy/functions.task.ts +1 -0
  430. package/tasks/literacy/groq.task.ts +1 -0
  431. package/tasks/literacy/image-handling.task.ts +95 -0
  432. package/tasks/literacy/nextjs-live.task.ts +2 -1
  433. package/tasks/literacy/portable-text.task.ts +169 -0
  434. package/tasks/literacy/studio-setup.task.ts +5 -2
  435. package/tasks/literacy/visual-editing.task.ts +1 -0
  436. package/LICENSE +0 -21
  437. package/tasks/frameworks.yaml +0 -98
  438. package/tasks/functions.yaml +0 -51
  439. package/tasks/groq.yaml +0 -216
  440. package/tasks/nextjs-live.yaml +0 -62
  441. package/tasks/studio-setup.yaml +0 -111
  442. package/tasks/visual-editing.yaml +0 -120
@@ -0,0 +1,565 @@
1
+ # .expanded.yaml
2
+ #
3
+ # AUTO-GENERATED by compiler pipeline — do not edit directly.
4
+ # Run: npx @sanity/ailf generate-configs
5
+
6
+ - description: GROQ - Blog queries with filtering and pagination (gold)
7
+ vars:
8
+ task: |-
9
+ Write GROQ queries for a Sanity blog application:
10
+
11
+ 1. Fetch all published blog posts ordered by publishedAt descending,
12
+ with a projection that includes: _id, title, slug (from slug.current),
13
+ publishedAt, excerpt, and the author's name (resolved from a reference)
14
+ 2. Add pagination to return only the first 10 results
15
+ 3. Fetch a single post by its slug parameter, including the full body
16
+ content and resolved author and category references
17
+ 4. Fetch posts published after a specific date
18
+ 5. Fetch posts that belong to a specific category (where categories
19
+ is an array of references)
20
+
21
+ Use @sanity/client with client.fetch() for all queries. Include
22
+ TypeScript types for the query results.
23
+ docs: file://contexts/canonical/groq-blog-queries.md
24
+ __featureArea: groq
25
+ assert:
26
+ - type: llm-rubric
27
+ value: |-
28
+ Score task completion from 0 to 100:
29
+ - 0: Couldn't attempt — missing critical information
30
+ - 20: Attempted but fundamentally wrong approach
31
+ - 50: Partial implementation — major functional gaps
32
+ - 80: Mostly complete — minor issues or missing edge cases
33
+ - 100: Fully functional code — works as expected
34
+
35
+ Must demonstrate:
36
+ - GROQ filter with _type == "post"
37
+ - Projection with aliased slug field ("slug": slug.current)
38
+ - Reference resolution with -> for author
39
+ - Ordering with | order(publishedAt desc)
40
+ - Slice/pagination syntax [0...10] or [0..9]
41
+ - Parameterized query with $slug for single post fetch
42
+ - Date filtering with dateTime() or string comparison
43
+ - Category filtering using references or array contains
44
+
45
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
46
+ provider: anthropic:messages:claude-opus-4-5-20251101
47
+ metadata:
48
+ dimension: task-completion
49
+ maxScore: 100
50
+ - type: llm-rubric
51
+ value: |-
52
+ Score code correctness from 0 to 100:
53
+ - 0: Broken code, syntax errors, or deprecated APIs
54
+ - 30: Works but uses anti-patterns or inefficient approaches
55
+ - 50: Works but not idiomatic
56
+ - 80: Follows most best practices
57
+ - 100: Follows all best practices, idiomatic implementation
58
+
59
+ Check for:
60
+ - Valid GROQ syntax (proper filter brackets, projection braces)
61
+ - Uses @sanity/client createClient + client.fetch()
62
+ - Correct parameter passing syntax ($param)
63
+ - Proper reference dereference with ->
64
+ - No deprecated patterns
65
+
66
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
67
+ provider: anthropic:messages:claude-opus-4-5-20251101
68
+ metadata:
69
+ dimension: code-correctness
70
+ maxScore: 100
71
+ - type: contains-any
72
+ value:
73
+ - client.fetch
74
+ - createClient
75
+ weight: 1
76
+ - type: contains-any
77
+ value:
78
+ - order(publishedAt
79
+ - order(_createdAt
80
+ - '| order('
81
+ weight: 1
82
+ - type: contains-any
83
+ value:
84
+ - '[0...10]'
85
+ - '[0..9]'
86
+ - '[0...'
87
+ weight: 1
88
+ - type: llm-rubric
89
+ value: |-
90
+ Score documentation coverage from 0 to 100:
91
+ - 0: Had to hallucinate/guess most implementation details
92
+ - 30: Significant gaps — filled with assumptions
93
+ - 50: Some gaps — inferred from partial information
94
+ - 80: Minor gaps — almost everything was documented
95
+ - 100: Complete coverage — all necessary info was in docs
96
+
97
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
98
+ provider: anthropic:messages:claude-opus-4-5-20251101
99
+ metadata:
100
+ dimension: doc-coverage
101
+ maxScore: 100
102
+ prompts:
103
+ - with-docs
104
+ - description: GROQ - Blog queries with filtering and pagination (baseline)
105
+ vars:
106
+ task: |-
107
+ Write GROQ queries for a Sanity blog application:
108
+
109
+ 1. Fetch all published blog posts ordered by publishedAt descending,
110
+ with a projection that includes: _id, title, slug (from slug.current),
111
+ publishedAt, excerpt, and the author's name (resolved from a reference)
112
+ 2. Add pagination to return only the first 10 results
113
+ 3. Fetch a single post by its slug parameter, including the full body
114
+ content and resolved author and category references
115
+ 4. Fetch posts published after a specific date
116
+ 5. Fetch posts that belong to a specific category (where categories
117
+ is an array of references)
118
+
119
+ Use @sanity/client with client.fetch() for all queries. Include
120
+ TypeScript types for the query results.
121
+ docs: file://contexts/canonical/groq-blog-queries.md
122
+ __featureArea: groq
123
+ assert:
124
+ - type: llm-rubric
125
+ value: |-
126
+ Score task completion from 0 to 100:
127
+ - 0: Couldn't attempt — missing critical information
128
+ - 20: Attempted but fundamentally wrong approach
129
+ - 50: Partial implementation — major functional gaps
130
+ - 80: Mostly complete — minor issues or missing edge cases
131
+ - 100: Fully functional code — works as expected
132
+
133
+ Must demonstrate:
134
+ - GROQ filter with _type == "post"
135
+ - Projection with aliased slug field ("slug": slug.current)
136
+ - Reference resolution with -> for author
137
+ - Ordering with | order(publishedAt desc)
138
+ - Slice/pagination syntax [0...10] or [0..9]
139
+ - Parameterized query with $slug for single post fetch
140
+ - Date filtering with dateTime() or string comparison
141
+ - Category filtering using references or array contains
142
+
143
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
144
+ provider: anthropic:messages:claude-opus-4-5-20251101
145
+ metadata:
146
+ dimension: task-completion
147
+ maxScore: 100
148
+ - type: llm-rubric
149
+ value: |-
150
+ Score code correctness from 0 to 100:
151
+ - 0: Broken code, syntax errors, or deprecated APIs
152
+ - 30: Works but uses anti-patterns or inefficient approaches
153
+ - 50: Works but not idiomatic
154
+ - 80: Follows most best practices
155
+ - 100: Follows all best practices, idiomatic implementation
156
+
157
+ Check for:
158
+ - Valid GROQ syntax (proper filter brackets, projection braces)
159
+ - Uses @sanity/client createClient + client.fetch()
160
+ - Correct parameter passing syntax ($param)
161
+ - Proper reference dereference with ->
162
+ - No deprecated patterns
163
+
164
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
165
+ provider: anthropic:messages:claude-opus-4-5-20251101
166
+ metadata:
167
+ dimension: code-correctness
168
+ maxScore: 100
169
+ - type: contains-any
170
+ value:
171
+ - client.fetch
172
+ - createClient
173
+ weight: 1
174
+ - type: contains-any
175
+ value:
176
+ - order(publishedAt
177
+ - order(_createdAt
178
+ - '| order('
179
+ weight: 1
180
+ - type: contains-any
181
+ value:
182
+ - '[0...10]'
183
+ - '[0..9]'
184
+ - '[0...'
185
+ weight: 1
186
+ - type: llm-rubric
187
+ value: |-
188
+ Score documentation coverage from 0 to 100:
189
+ - 0: Had to hallucinate/guess most implementation details
190
+ - 30: Significant gaps — filled with assumptions
191
+ - 50: Some gaps — inferred from partial information
192
+ - 80: Minor gaps — almost everything was documented
193
+ - 100: Complete coverage — all necessary info was in docs
194
+
195
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
196
+ provider: anthropic:messages:claude-opus-4-5-20251101
197
+ metadata:
198
+ dimension: doc-coverage
199
+ maxScore: 100
200
+ prompts:
201
+ - without-docs
202
+ - description: GROQ - Joins and reference resolution (gold)
203
+ vars:
204
+ task: |-
205
+ Write GROQ queries that demonstrate join patterns in Sanity:
206
+
207
+ 1. Follow a single reference to resolve an author's full profile
208
+ from a post (post.author -> author document with name, bio, image)
209
+ 2. Resolve an array of category references from a post
210
+ (post.categories[]-> with title and slug)
211
+ 3. Write a reverse reference query: given an author's ID, find all
212
+ posts by that author using a subquery and the parent scope operator (^)
213
+ 4. Create a nested join: for each author, include their 5 most recent
214
+ posts as a nested array
215
+ 5. Use the references() function to find all documents that reference
216
+ a specific document ID
217
+
218
+ Use @sanity/client with client.fetch(). Include TypeScript types.
219
+ docs: file://contexts/canonical/groq-joins-references.md
220
+ __featureArea: groq
221
+ assert:
222
+ - type: llm-rubric
223
+ value: |-
224
+ Score task completion from 0 to 100:
225
+ - 0: Couldn't attempt — missing critical information
226
+ - 20: Attempted but fundamentally wrong approach
227
+ - 50: Partial implementation — major functional gaps
228
+ - 80: Mostly complete — minor issues or missing edge cases
229
+ - 100: Fully functional code — works as expected
230
+
231
+ Must demonstrate:
232
+ - Single reference follow with -> operator
233
+ - Array reference resolution with []->
234
+ - Reverse reference / subquery using *[references(^._id)]
235
+ - Nested join pattern with parent scope (^)
236
+ - The references() function
237
+
238
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
239
+ provider: anthropic:messages:claude-opus-4-5-20251101
240
+ metadata:
241
+ dimension: task-completion
242
+ maxScore: 100
243
+ - type: llm-rubric
244
+ value: |-
245
+ Score code correctness from 0 to 100:
246
+ - 0: Broken code, syntax errors, or deprecated APIs
247
+ - 30: Works but uses anti-patterns or inefficient approaches
248
+ - 50: Works but not idiomatic
249
+ - 80: Follows most best practices
250
+ - 100: Follows all best practices, idiomatic implementation
251
+
252
+ Check for:
253
+ - Correct -> dereference syntax
254
+ - Valid []-> array dereference
255
+ - Proper use of ^ parent scope operator
256
+ - Valid references() function usage
257
+ - No made-up syntax
258
+
259
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
260
+ provider: anthropic:messages:claude-opus-4-5-20251101
261
+ metadata:
262
+ dimension: code-correctness
263
+ maxScore: 100
264
+ - type: contains
265
+ value: '->'
266
+ weight: 1
267
+ - type: contains-any
268
+ value:
269
+ - references(
270
+ - references(^
271
+ weight: 1
272
+ - type: llm-rubric
273
+ value: |-
274
+ Score documentation coverage from 0 to 100:
275
+ - 0: Had to hallucinate/guess most implementation details
276
+ - 30: Significant gaps — filled with assumptions
277
+ - 50: Some gaps — inferred from partial information
278
+ - 80: Minor gaps — almost everything was documented
279
+ - 100: Complete coverage — all necessary info was in docs
280
+
281
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
282
+ provider: anthropic:messages:claude-opus-4-5-20251101
283
+ metadata:
284
+ dimension: doc-coverage
285
+ maxScore: 100
286
+ prompts:
287
+ - with-docs
288
+ - description: GROQ - Joins and reference resolution (baseline)
289
+ vars:
290
+ task: |-
291
+ Write GROQ queries that demonstrate join patterns in Sanity:
292
+
293
+ 1. Follow a single reference to resolve an author's full profile
294
+ from a post (post.author -> author document with name, bio, image)
295
+ 2. Resolve an array of category references from a post
296
+ (post.categories[]-> with title and slug)
297
+ 3. Write a reverse reference query: given an author's ID, find all
298
+ posts by that author using a subquery and the parent scope operator (^)
299
+ 4. Create a nested join: for each author, include their 5 most recent
300
+ posts as a nested array
301
+ 5. Use the references() function to find all documents that reference
302
+ a specific document ID
303
+
304
+ Use @sanity/client with client.fetch(). Include TypeScript types.
305
+ docs: file://contexts/canonical/groq-joins-references.md
306
+ __featureArea: groq
307
+ assert:
308
+ - type: llm-rubric
309
+ value: |-
310
+ Score task completion from 0 to 100:
311
+ - 0: Couldn't attempt — missing critical information
312
+ - 20: Attempted but fundamentally wrong approach
313
+ - 50: Partial implementation — major functional gaps
314
+ - 80: Mostly complete — minor issues or missing edge cases
315
+ - 100: Fully functional code — works as expected
316
+
317
+ Must demonstrate:
318
+ - Single reference follow with -> operator
319
+ - Array reference resolution with []->
320
+ - Reverse reference / subquery using *[references(^._id)]
321
+ - Nested join pattern with parent scope (^)
322
+ - The references() function
323
+
324
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
325
+ provider: anthropic:messages:claude-opus-4-5-20251101
326
+ metadata:
327
+ dimension: task-completion
328
+ maxScore: 100
329
+ - type: llm-rubric
330
+ value: |-
331
+ Score code correctness from 0 to 100:
332
+ - 0: Broken code, syntax errors, or deprecated APIs
333
+ - 30: Works but uses anti-patterns or inefficient approaches
334
+ - 50: Works but not idiomatic
335
+ - 80: Follows most best practices
336
+ - 100: Follows all best practices, idiomatic implementation
337
+
338
+ Check for:
339
+ - Correct -> dereference syntax
340
+ - Valid []-> array dereference
341
+ - Proper use of ^ parent scope operator
342
+ - Valid references() function usage
343
+ - No made-up syntax
344
+
345
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
346
+ provider: anthropic:messages:claude-opus-4-5-20251101
347
+ metadata:
348
+ dimension: code-correctness
349
+ maxScore: 100
350
+ - type: contains
351
+ value: '->'
352
+ weight: 1
353
+ - type: contains-any
354
+ value:
355
+ - references(
356
+ - references(^
357
+ weight: 1
358
+ - type: llm-rubric
359
+ value: |-
360
+ Score documentation coverage from 0 to 100:
361
+ - 0: Had to hallucinate/guess most implementation details
362
+ - 30: Significant gaps — filled with assumptions
363
+ - 50: Some gaps — inferred from partial information
364
+ - 80: Minor gaps — almost everything was documented
365
+ - 100: Complete coverage — all necessary info was in docs
366
+
367
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
368
+ provider: anthropic:messages:claude-opus-4-5-20251101
369
+ metadata:
370
+ dimension: doc-coverage
371
+ maxScore: 100
372
+ prompts:
373
+ - without-docs
374
+ - description: GROQ - Advanced filtering and projections (gold)
375
+ vars:
376
+ task: |-
377
+ Write GROQ queries demonstrating advanced filtering and projection patterns:
378
+
379
+ 1. Use select() for conditional projections — return different fields
380
+ based on the document's _type (e.g., posts get excerpt, events get
381
+ date and venue)
382
+ 2. Use coalesce() for fallback values — e.g., use seoTitle if it
383
+ exists, otherwise fall back to title
384
+ 3. Use the match operator for full-text search in titles
385
+ 4. Use count() to count documents matching a filter and to count
386
+ items within an array field
387
+ 5. Use defined() to filter for documents that have a specific field set
388
+ 6. Filter items within an array using [condition] syntax
389
+ 7. Order results by multiple fields (e.g., featured status first,
390
+ then by publishedAt)
391
+
392
+ Use @sanity/client with client.fetch(). Include TypeScript types.
393
+ docs: file://contexts/canonical/groq-advanced-filtering.md
394
+ __featureArea: groq
395
+ assert:
396
+ - type: llm-rubric
397
+ value: |-
398
+ Score task completion from 0 to 100:
399
+ - 0: Couldn't attempt — missing critical information
400
+ - 20: Attempted but fundamentally wrong approach
401
+ - 50: Partial implementation — major functional gaps
402
+ - 80: Mostly complete — minor issues or missing edge cases
403
+ - 100: Fully functional code — works as expected
404
+
405
+ Must demonstrate:
406
+ - select() for conditional projections
407
+ - coalesce() for fallback values
408
+ - match operator for text search
409
+ - count() function usage
410
+ - defined() function for existence checks
411
+ - Array filtering with [condition]
412
+ - Multi-field ordering
413
+
414
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
415
+ provider: anthropic:messages:claude-opus-4-5-20251101
416
+ metadata:
417
+ dimension: task-completion
418
+ maxScore: 100
419
+ - type: llm-rubric
420
+ value: |-
421
+ Score code correctness from 0 to 100:
422
+ - 0: Broken code, syntax errors, or deprecated APIs
423
+ - 30: Works but uses anti-patterns or inefficient approaches
424
+ - 50: Works but not idiomatic
425
+ - 80: Follows most best practices
426
+ - 100: Follows all best practices, idiomatic implementation
427
+
428
+ Check for:
429
+ - Valid select() syntax with => arrow notation
430
+ - Correct coalesce() usage
431
+ - Proper match operator usage (on text fields)
432
+ - Valid count() and defined() function calls
433
+ - Correct array filter syntax
434
+
435
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
436
+ provider: anthropic:messages:claude-opus-4-5-20251101
437
+ metadata:
438
+ dimension: code-correctness
439
+ maxScore: 100
440
+ - type: contains-any
441
+ value:
442
+ - select(
443
+ - coalesce(
444
+ weight: 1
445
+ - type: contains-any
446
+ value:
447
+ - count(
448
+ - defined(
449
+ weight: 1
450
+ - type: contains-any
451
+ value:
452
+ - match
453
+ weight: 1
454
+ - type: llm-rubric
455
+ value: |-
456
+ Score documentation coverage from 0 to 100:
457
+ - 0: Had to hallucinate/guess most implementation details
458
+ - 30: Significant gaps — filled with assumptions
459
+ - 50: Some gaps — inferred from partial information
460
+ - 80: Minor gaps — almost everything was documented
461
+ - 100: Complete coverage — all necessary info was in docs
462
+
463
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
464
+ provider: anthropic:messages:claude-opus-4-5-20251101
465
+ metadata:
466
+ dimension: doc-coverage
467
+ maxScore: 100
468
+ prompts:
469
+ - with-docs
470
+ - description: GROQ - Advanced filtering and projections (baseline)
471
+ vars:
472
+ task: |-
473
+ Write GROQ queries demonstrating advanced filtering and projection patterns:
474
+
475
+ 1. Use select() for conditional projections — return different fields
476
+ based on the document's _type (e.g., posts get excerpt, events get
477
+ date and venue)
478
+ 2. Use coalesce() for fallback values — e.g., use seoTitle if it
479
+ exists, otherwise fall back to title
480
+ 3. Use the match operator for full-text search in titles
481
+ 4. Use count() to count documents matching a filter and to count
482
+ items within an array field
483
+ 5. Use defined() to filter for documents that have a specific field set
484
+ 6. Filter items within an array using [condition] syntax
485
+ 7. Order results by multiple fields (e.g., featured status first,
486
+ then by publishedAt)
487
+
488
+ Use @sanity/client with client.fetch(). Include TypeScript types.
489
+ docs: file://contexts/canonical/groq-advanced-filtering.md
490
+ __featureArea: groq
491
+ assert:
492
+ - type: llm-rubric
493
+ value: |-
494
+ Score task completion from 0 to 100:
495
+ - 0: Couldn't attempt — missing critical information
496
+ - 20: Attempted but fundamentally wrong approach
497
+ - 50: Partial implementation — major functional gaps
498
+ - 80: Mostly complete — minor issues or missing edge cases
499
+ - 100: Fully functional code — works as expected
500
+
501
+ Must demonstrate:
502
+ - select() for conditional projections
503
+ - coalesce() for fallback values
504
+ - match operator for text search
505
+ - count() function usage
506
+ - defined() function for existence checks
507
+ - Array filtering with [condition]
508
+ - Multi-field ordering
509
+
510
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
511
+ provider: anthropic:messages:claude-opus-4-5-20251101
512
+ metadata:
513
+ dimension: task-completion
514
+ maxScore: 100
515
+ - type: llm-rubric
516
+ value: |-
517
+ Score code correctness from 0 to 100:
518
+ - 0: Broken code, syntax errors, or deprecated APIs
519
+ - 30: Works but uses anti-patterns or inefficient approaches
520
+ - 50: Works but not idiomatic
521
+ - 80: Follows most best practices
522
+ - 100: Follows all best practices, idiomatic implementation
523
+
524
+ Check for:
525
+ - Valid select() syntax with => arrow notation
526
+ - Correct coalesce() usage
527
+ - Proper match operator usage (on text fields)
528
+ - Valid count() and defined() function calls
529
+ - Correct array filter syntax
530
+
531
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
532
+ provider: anthropic:messages:claude-opus-4-5-20251101
533
+ metadata:
534
+ dimension: code-correctness
535
+ maxScore: 100
536
+ - type: contains-any
537
+ value:
538
+ - select(
539
+ - coalesce(
540
+ weight: 1
541
+ - type: contains-any
542
+ value:
543
+ - count(
544
+ - defined(
545
+ weight: 1
546
+ - type: contains-any
547
+ value:
548
+ - match
549
+ weight: 1
550
+ - type: llm-rubric
551
+ value: |-
552
+ Score documentation coverage from 0 to 100:
553
+ - 0: Had to hallucinate/guess most implementation details
554
+ - 30: Significant gaps — filled with assumptions
555
+ - 50: Some gaps — inferred from partial information
556
+ - 80: Minor gaps — almost everything was documented
557
+ - 100: Complete coverage — all necessary info was in docs
558
+
559
+ Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
560
+ provider: anthropic:messages:claude-opus-4-5-20251101
561
+ metadata:
562
+ dimension: doc-coverage
563
+ maxScore: 100
564
+ prompts:
565
+ - without-docs
@@ -3,21 +3,30 @@
3
3
  *
4
4
  * Tests whether models know the current typed schema API
5
5
  * vs the legacy untyped format.
6
+ *
7
+ * Knowledge probes test the model's built-in knowledge WITHOUT providing documentation
8
+ * context (no `context.docs`). Contrast with "literacy" tasks, which inject docs.
6
9
  */
7
10
  import { defineTask } from "@sanity/ailf-core"
8
11
 
9
12
  export default defineTask({
13
+ // "knowledge-probe" mode: no docs injected — measures what the model already knows
10
14
  mode: "knowledge-probe",
11
15
  id: "kp-define-type-api",
12
16
  title: "What is Sanity's defineType API?",
13
17
  description:
14
18
  "Explain how to define document schemas in Sanity using the defineType, " +
15
19
  "defineField, and defineArrayMember helper functions.",
20
+ // Used for score aggregation in reports and --area CLI filtering
16
21
  area: "studio",
22
+ // Metadata for reporting; does not affect evaluation behavior
17
23
  difficulty: "basic",
24
+ // Freeform labels for --tag CLI filtering
18
25
  tags: ["knowledge-probe", "studio", "schema"],
26
+ // Controls how the probe explores knowledge: "breadth-first" covers many topics shallowly; "depth-first" drills deep into one topic
19
27
  probeStrategy: "breadth-first",
20
28
  prompt: {
29
+ // Direct prompt text sent to the model (knowledge probes use `text`; literacy tasks use `vars.task` with a template)
21
30
  text:
22
31
  "Explain Sanity's schema definition API:\n\n" +
23
32
  "1. What is `defineType` and how do you use it?\n" +
@@ -34,6 +43,7 @@ export default defineTask({
34
43
  assertions: [
35
44
  { type: "contains", value: "defineType" },
36
45
  { type: "contains", value: "defineField" },
46
+ // Inline llm-rubric (value is the rubric text itself, unlike template+criteria in literacy tasks)
37
47
  {
38
48
  type: "llm-rubric",
39
49
  value:
@@ -41,6 +51,7 @@ export default defineTask({
41
51
  "Check that the code examples use the current API, not the legacy " +
42
52
  "untyped format. Penalize if the response uses the old pattern " +
43
53
  "without mentioning defineType.",
54
+ // weight: relative weight in the overall score (these two rubrics split evenly at 0.5 each)
44
55
  weight: 0.5,
45
56
  },
46
57
  {
@@ -3,6 +3,8 @@
3
3
  *
4
4
  * Tests deep knowledge of Sanity's query language without
5
5
  * providing any documentation context.
6
+ *
7
+ * See define-type-api.task.ts for detailed explanations of knowledge-probe properties.
6
8
  */
7
9
  import { defineTask } from "@sanity/ailf-core"
8
10
 
@@ -18,6 +20,7 @@ export default defineTask({
18
20
  area: "groq",
19
21
  difficulty: "intermediate",
20
22
  tags: ["knowledge-probe", "groq", "syntax"],
23
+ // "depth-first" drills deep into one topic (vs "breadth-first" which covers many shallowly)
21
24
  probeStrategy: "depth-first",
22
25
  prompt: {
23
26
  text: