@sanity/ailf 0.4.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290) hide show
  1. package/config/features.ts +23 -0
  2. package/config/models.ts +83 -0
  3. package/config/prompts.ts +16 -0
  4. package/config/rubrics.ts +225 -0
  5. package/config/schedules.ts +47 -0
  6. package/config/sinks.ts +37 -0
  7. package/config/sources.ts +21 -0
  8. package/config/thresholds.ts +61 -0
  9. package/dist/_vendor/ailf-core/config-helpers.d.ts +174 -0
  10. package/dist/_vendor/ailf-core/config-helpers.js +150 -0
  11. package/dist/_vendor/ailf-core/env-helper.d.ts +35 -0
  12. package/dist/_vendor/ailf-core/env-helper.js +45 -0
  13. package/dist/_vendor/ailf-core/examples/index.d.ts +10 -10
  14. package/dist/_vendor/ailf-core/examples/index.js +10 -10
  15. package/dist/_vendor/ailf-core/index.d.ts +3 -0
  16. package/dist/_vendor/ailf-core/index.js +5 -0
  17. package/dist/_vendor/ailf-core/ports/context.d.ts +15 -2
  18. package/dist/_vendor/ailf-core/ports/doc-fetcher.d.ts +2 -2
  19. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -1
  20. package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +129 -0
  21. package/dist/_vendor/ailf-core/ports/mode-handler.js +19 -0
  22. package/dist/_vendor/ailf-core/ports/task-source.d.ts +16 -122
  23. package/dist/_vendor/ailf-core/ports/task-source.js +7 -7
  24. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -2
  25. package/dist/_vendor/ailf-core/schemas/eval-config.js +7 -2
  26. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +8 -3
  27. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +6 -1
  28. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +32 -31
  29. package/dist/_vendor/ailf-core/schemas/pipeline.js +52 -12
  30. package/dist/_vendor/ailf-core/schemas/schedules.d.ts +14 -4
  31. package/dist/_vendor/ailf-core/schemas/schedules.js +6 -2
  32. package/dist/_vendor/ailf-core/schemas/sinks.d.ts +1 -1
  33. package/dist/_vendor/ailf-core/services/comparison-formatters.js +57 -19
  34. package/dist/_vendor/ailf-core/services/index.d.ts +2 -1
  35. package/dist/_vendor/ailf-core/services/index.js +2 -1
  36. package/dist/_vendor/ailf-core/services/scoring-engine.d.ts +153 -0
  37. package/dist/_vendor/ailf-core/services/scoring-engine.js +237 -0
  38. package/dist/_vendor/ailf-core/services/scoring.d.ts +15 -2
  39. package/dist/_vendor/ailf-core/services/scoring.js +25 -15
  40. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +137 -0
  41. package/dist/_vendor/ailf-core/types/branded-ids.js +136 -0
  42. package/dist/_vendor/ailf-core/types/eval-mode-config.d.ts +150 -0
  43. package/dist/_vendor/ailf-core/types/eval-mode-config.js +24 -0
  44. package/dist/_vendor/ailf-core/types/generalized-task.d.ts +319 -0
  45. package/dist/_vendor/ailf-core/types/generalized-task.js +13 -0
  46. package/dist/_vendor/ailf-core/types/index.d.ts +45 -81
  47. package/dist/_vendor/ailf-core/types/index.js +8 -1
  48. package/dist/_vendor/ailf-core/types/plugin-registry.d.ts +202 -0
  49. package/dist/_vendor/ailf-core/types/plugin-registry.js +132 -0
  50. package/dist/_vendor/ailf-core/types/storage-schema.d.ts +199 -0
  51. package/dist/_vendor/ailf-core/types/storage-schema.js +39 -0
  52. package/dist/_vendor/ailf-core/types/task-graph.d.ts +86 -0
  53. package/dist/_vendor/ailf-core/types/task-graph.js +20 -0
  54. package/dist/_vendor/ailf-core/types/trace.d.ts +118 -0
  55. package/dist/_vendor/ailf-core/types/trace.js +18 -0
  56. package/dist/_vendor/ailf-core/types/variable-envelope.d.ts +80 -0
  57. package/dist/_vendor/ailf-core/types/variable-envelope.js +16 -0
  58. package/dist/_vendor/ailf-shared/dimension-names.d.ts +5 -18
  59. package/dist/_vendor/ailf-shared/dimension-names.js +6 -24
  60. package/dist/_vendor/ailf-shared/eval-modes.d.ts +38 -6
  61. package/dist/_vendor/ailf-shared/eval-modes.js +26 -2
  62. package/dist/_vendor/ailf-shared/index.d.ts +0 -1
  63. package/dist/_vendor/ailf-shared/index.js +0 -1
  64. package/dist/adapters/api-client/build-request.js +14 -13
  65. package/dist/adapters/config-sources/file-config-adapter.d.ts +20 -11
  66. package/dist/adapters/config-sources/file-config-adapter.js +38 -12
  67. package/dist/adapters/config-sources/index.d.ts +2 -0
  68. package/dist/adapters/config-sources/index.js +1 -0
  69. package/dist/adapters/config-sources/ts-config-loader.d.ts +59 -0
  70. package/dist/adapters/config-sources/ts-config-loader.js +133 -0
  71. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.d.ts +3 -2
  72. package/dist/adapters/doc-fetchers/sanity-doc-fetcher.js +7 -2
  73. package/dist/adapters/task-sources/composite-task-source.d.ts +3 -3
  74. package/dist/adapters/task-sources/composite-task-source.js +1 -1
  75. package/dist/adapters/task-sources/content-lake-task-source.d.ts +7 -6
  76. package/dist/adapters/task-sources/content-lake-task-source.js +22 -23
  77. package/dist/adapters/task-sources/index.d.ts +1 -0
  78. package/dist/adapters/task-sources/index.js +1 -0
  79. package/dist/adapters/task-sources/repo-task-source.d.ts +4 -4
  80. package/dist/adapters/task-sources/repo-task-source.js +69 -16
  81. package/dist/adapters/task-sources/task-file-loader.d.ts +64 -0
  82. package/dist/adapters/task-sources/task-file-loader.js +83 -0
  83. package/dist/adapters/task-sources/yaml-task-source.d.ts +6 -6
  84. package/dist/adapters/task-sources/yaml-task-source.js +19 -16
  85. package/dist/cli.js +0 -2
  86. package/dist/commands/baseline.js +4 -1
  87. package/dist/commands/calculate-scores.js +1 -1
  88. package/dist/commands/coverage-audit.js +7 -1
  89. package/dist/commands/explain-handler.js +25 -23
  90. package/dist/commands/fetch-docs.js +3 -2
  91. package/dist/commands/generate-configs.js +1 -1
  92. package/dist/commands/interactive.js +11 -7
  93. package/dist/commands/pipeline-action.d.ts +2 -0
  94. package/dist/commands/pipeline-action.js +16 -6
  95. package/dist/commands/pipeline.d.ts +1 -0
  96. package/dist/commands/pipeline.js +4 -2
  97. package/dist/commands/pr-comment.js +1 -1
  98. package/dist/commands/publish.js +2 -2
  99. package/dist/commands/readiness-report.js +13 -6
  100. package/dist/composition-root.d.ts +1 -1
  101. package/dist/composition-root.js +67 -4
  102. package/dist/orchestration/build-app-context.js +1 -0
  103. package/dist/orchestration/build-step-sequence.js +24 -6
  104. package/dist/orchestration/steps/calculate-scores-step.js +24 -11
  105. package/dist/orchestration/steps/fetch-docs-step.js +6 -4
  106. package/dist/orchestration/steps/gap-analysis-step.js +8 -7
  107. package/dist/orchestration/steps/generate-configs-step.d.ts +16 -3
  108. package/dist/orchestration/steps/generate-configs-step.js +245 -51
  109. package/dist/orchestration/steps/grader-consistency-step.js +7 -4
  110. package/dist/orchestration/steps/mirror-repo-tasks-step.js +1 -1
  111. package/dist/orchestration/steps/readiness-step.js +5 -6
  112. package/dist/orchestration/steps/run-eval-step.d.ts +1 -2
  113. package/dist/orchestration/steps/run-eval-step.js +8 -7
  114. package/dist/pipeline/cache.d.ts +1 -1
  115. package/dist/pipeline/cache.js +36 -8
  116. package/dist/pipeline/calculate-scores.d.ts +5 -7
  117. package/dist/pipeline/calculate-scores.js +74 -153
  118. package/dist/pipeline/checks.js +2 -2
  119. package/dist/pipeline/compare.js +8 -8
  120. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.d.ts +10 -0
  121. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +288 -0
  122. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.d.ts +9 -0
  123. package/dist/pipeline/compiler/__tests__/assertion-mapper.test.js +145 -0
  124. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.d.ts +10 -0
  125. package/dist/pipeline/compiler/__tests__/knowledge-probe-handler.test.js +314 -0
  126. package/dist/pipeline/compiler/__tests__/literacy-handler.test.d.ts +10 -0
  127. package/dist/pipeline/compiler/__tests__/literacy-handler.test.js +486 -0
  128. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.d.ts +10 -0
  129. package/dist/pipeline/compiler/__tests__/mcp-server-handler.test.js +355 -0
  130. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.d.ts +9 -0
  131. package/dist/pipeline/compiler/__tests__/promptfoo-compiler.test.js +333 -0
  132. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.d.ts +12 -0
  133. package/dist/pipeline/compiler/__tests__/sandbox-and-fixtures.test.js +210 -0
  134. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.d.ts +7 -0
  135. package/dist/pipeline/compiler/__tests__/scoring-and-presets.test.js +471 -0
  136. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.d.ts +10 -0
  137. package/dist/pipeline/compiler/__tests__/scoring-bridge.test.js +184 -0
  138. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.d.ts +8 -0
  139. package/dist/pipeline/compiler/__tests__/task-graph-builder.test.js +301 -0
  140. package/dist/pipeline/compiler/__tests__/telemetry.test.d.ts +9 -0
  141. package/dist/pipeline/compiler/__tests__/telemetry.test.js +503 -0
  142. package/dist/pipeline/compiler/assertion-mapper.d.ts +58 -0
  143. package/dist/pipeline/compiler/assertion-mapper.js +175 -0
  144. package/dist/pipeline/compiler/compiler-to-yaml.d.ts +51 -0
  145. package/dist/pipeline/compiler/compiler-to-yaml.js +222 -0
  146. package/dist/pipeline/compiler/config-loader.d.ts +56 -0
  147. package/dist/pipeline/compiler/config-loader.js +111 -0
  148. package/dist/pipeline/compiler/fixture-resolver.d.ts +41 -0
  149. package/dist/pipeline/compiler/fixture-resolver.js +113 -0
  150. package/dist/pipeline/compiler/hash.d.ts +11 -0
  151. package/dist/pipeline/compiler/hash.js +18 -0
  152. package/dist/pipeline/compiler/ignore-fields.d.ts +53 -0
  153. package/dist/pipeline/compiler/ignore-fields.js +113 -0
  154. package/dist/pipeline/compiler/index.d.ts +29 -0
  155. package/dist/pipeline/compiler/index.js +45 -0
  156. package/dist/pipeline/compiler/literacy-bridge.d.ts +102 -0
  157. package/dist/pipeline/compiler/literacy-bridge.js +172 -0
  158. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.d.ts +14 -0
  159. package/dist/pipeline/compiler/mode-handlers/__fixtures__/agent-harness-example-tasks.js +152 -0
  160. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.d.ts +32 -0
  161. package/dist/pipeline/compiler/mode-handlers/__fixtures__/knowledge-probe-example-tasks.js +176 -0
  162. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.d.ts +49 -0
  163. package/dist/pipeline/compiler/mode-handlers/__fixtures__/mcp-example-tasks.js +259 -0
  164. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.d.ts +70 -0
  165. package/dist/pipeline/compiler/mode-handlers/agent-harness-handler.js +485 -0
  166. package/dist/pipeline/compiler/mode-handlers/index.d.ts +16 -0
  167. package/dist/pipeline/compiler/mode-handlers/index.js +21 -0
  168. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.d.ts +76 -0
  169. package/dist/pipeline/compiler/mode-handlers/knowledge-probe-handler.js +245 -0
  170. package/dist/pipeline/compiler/mode-handlers/literacy-handler.d.ts +89 -0
  171. package/dist/pipeline/compiler/mode-handlers/literacy-handler.js +379 -0
  172. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.d.ts +50 -0
  173. package/dist/pipeline/compiler/mode-handlers/mcp-assertions.js +277 -0
  174. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.d.ts +67 -0
  175. package/dist/pipeline/compiler/mode-handlers/mcp-server-handler.js +309 -0
  176. package/dist/pipeline/compiler/presets/index.d.ts +9 -0
  177. package/dist/pipeline/compiler/presets/index.js +8 -0
  178. package/dist/pipeline/compiler/presets/sanity-literacy.d.ts +45 -0
  179. package/dist/pipeline/compiler/presets/sanity-literacy.js +354 -0
  180. package/dist/pipeline/compiler/promptfoo-compiler.d.ts +96 -0
  181. package/dist/pipeline/compiler/promptfoo-compiler.js +230 -0
  182. package/dist/pipeline/compiler/provider-assembler.d.ts +39 -0
  183. package/dist/pipeline/compiler/provider-assembler.js +137 -0
  184. package/dist/pipeline/compiler/sandbox/docker-sandbox.d.ts +21 -0
  185. package/dist/pipeline/compiler/sandbox/docker-sandbox.js +136 -0
  186. package/dist/pipeline/compiler/sandbox/fixture-provisioner.d.ts +69 -0
  187. package/dist/pipeline/compiler/sandbox/fixture-provisioner.js +189 -0
  188. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.d.ts +20 -0
  189. package/dist/pipeline/compiler/sandbox/git-worktree-sandbox.js +114 -0
  190. package/dist/pipeline/compiler/sandbox/index.d.ts +10 -0
  191. package/dist/pipeline/compiler/sandbox/index.js +11 -0
  192. package/dist/pipeline/compiler/sandbox/sandbox-selector.d.ts +35 -0
  193. package/dist/pipeline/compiler/sandbox/sandbox-selector.js +86 -0
  194. package/dist/pipeline/compiler/sandbox/sandbox-strategy.d.ts +81 -0
  195. package/dist/pipeline/compiler/sandbox/sandbox-strategy.js +15 -0
  196. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.d.ts +20 -0
  197. package/dist/pipeline/compiler/sandbox/tempdir-sandbox.js +74 -0
  198. package/dist/pipeline/compiler/scoring-bridge.d.ts +49 -0
  199. package/dist/pipeline/compiler/scoring-bridge.js +114 -0
  200. package/dist/pipeline/compiler/task-graph-builder.d.ts +54 -0
  201. package/dist/pipeline/compiler/task-graph-builder.js +291 -0
  202. package/dist/pipeline/compiler/telemetry/cost-tracker.d.ts +90 -0
  203. package/dist/pipeline/compiler/telemetry/cost-tracker.js +146 -0
  204. package/dist/pipeline/compiler/telemetry/index.d.ts +14 -0
  205. package/dist/pipeline/compiler/telemetry/index.js +19 -0
  206. package/dist/pipeline/compiler/telemetry/redactor.d.ts +58 -0
  207. package/dist/pipeline/compiler/telemetry/redactor.js +222 -0
  208. package/dist/pipeline/compiler/telemetry/tool-classifier.d.ts +32 -0
  209. package/dist/pipeline/compiler/telemetry/tool-classifier.js +120 -0
  210. package/dist/pipeline/compiler/telemetry/trace-collector.d.ts +75 -0
  211. package/dist/pipeline/compiler/telemetry/trace-collector.js +297 -0
  212. package/dist/pipeline/compiler/telemetry/trace-store.d.ts +78 -0
  213. package/dist/pipeline/compiler/telemetry/trace-store.js +85 -0
  214. package/dist/pipeline/compiler/variable-resolver.d.ts +46 -0
  215. package/dist/pipeline/compiler/variable-resolver.js +115 -0
  216. package/dist/pipeline/coverage-audit.d.ts +15 -5
  217. package/dist/pipeline/coverage-audit.js +41 -22
  218. package/dist/pipeline/eval-constants.d.ts +16 -6
  219. package/dist/pipeline/eval-constants.js +25 -4
  220. package/dist/pipeline/eval-fingerprint.d.ts +2 -2
  221. package/dist/pipeline/eval-fingerprint.js +8 -9
  222. package/dist/pipeline/expand-tasks.d.ts +23 -14
  223. package/dist/pipeline/expand-tasks.js +37 -31
  224. package/dist/pipeline/gap-analysis.d.ts +1 -1
  225. package/dist/pipeline/gap-analysis.js +2 -2
  226. package/dist/pipeline/generate-configs.d.ts +22 -4
  227. package/dist/pipeline/generate-configs.js +53 -24
  228. package/dist/pipeline/grader-api.d.ts +3 -3
  229. package/dist/pipeline/grader-api.js +5 -12
  230. package/dist/pipeline/grader-compare-runner.js +20 -27
  231. package/dist/pipeline/grader-comparison.d.ts +4 -8
  232. package/dist/pipeline/grader-comparison.js +11 -17
  233. package/dist/pipeline/grader-consistency-runner.d.ts +2 -3
  234. package/dist/pipeline/grader-consistency-runner.js +18 -21
  235. package/dist/pipeline/grader-consistency.d.ts +6 -10
  236. package/dist/pipeline/grader-consistency.js +13 -32
  237. package/dist/pipeline/grader-sensitivity-runner.js +7 -5
  238. package/dist/pipeline/grader-sensitivity.d.ts +2 -6
  239. package/dist/pipeline/grader-sensitivity.js +10 -10
  240. package/dist/pipeline/grader-validate-runner.js +7 -5
  241. package/dist/pipeline/grader-validation.d.ts +2 -6
  242. package/dist/pipeline/grader-validation.js +14 -22
  243. package/dist/pipeline/map-request-to-config.js +6 -1
  244. package/dist/pipeline/mirror-repo-tasks.d.ts +6 -6
  245. package/dist/pipeline/mirror-repo-tasks.js +16 -15
  246. package/dist/pipeline/normalize-mode.d.ts +49 -0
  247. package/dist/pipeline/normalize-mode.js +64 -0
  248. package/dist/pipeline/plan.d.ts +5 -2
  249. package/dist/pipeline/plan.js +134 -78
  250. package/dist/pipeline/pr-comment.js +2 -0
  251. package/dist/pipeline/profile-resolution.d.ts +47 -0
  252. package/dist/pipeline/profile-resolution.js +91 -0
  253. package/dist/pipeline/provenance.d.ts +2 -2
  254. package/dist/pipeline/provenance.js +12 -17
  255. package/dist/pipeline/release-report.js +4 -4
  256. package/dist/pipeline/repo-threshold-evaluator.d.ts +1 -1
  257. package/dist/pipeline/repo-threshold-evaluator.js +1 -1
  258. package/dist/pipeline/rubric-loader.d.ts +20 -0
  259. package/dist/pipeline/rubric-loader.js +37 -0
  260. package/dist/pipeline/validate.d.ts +4 -4
  261. package/dist/pipeline/validate.js +64 -53
  262. package/dist/schedules/loader.js +18 -8
  263. package/dist/scripts/migrate-task-mode.d.ts +24 -0
  264. package/dist/scripts/migrate-task-mode.js +85 -0
  265. package/dist/scripts/migrate-tasks-to-content-lake.js +11 -10
  266. package/dist/scripts/validate-task-sources.d.ts +1 -1
  267. package/dist/scripts/validate-task-sources.js +15 -15
  268. package/dist/sinks/loader.js +5 -7
  269. package/dist/sources.d.ts +7 -7
  270. package/dist/sources.js +22 -24
  271. package/dist/webhook/dispatch.js +2 -1
  272. package/package.json +6 -3
  273. package/tasks/knowledge-probe/define-type-api.task.ts +55 -0
  274. package/tasks/knowledge-probe/groq-projections.task.ts +59 -0
  275. package/tasks/literacy/frameworks.task.ts +128 -0
  276. package/tasks/literacy/functions.task.ts +69 -0
  277. package/tasks/literacy/groq.task.ts +258 -0
  278. package/tasks/literacy/nextjs-live.task.ts +75 -0
  279. package/tasks/literacy/studio-setup.task.ts +131 -0
  280. package/tasks/literacy/visual-editing.task.ts +146 -0
  281. package/config/features.yaml +0 -116
  282. package/config/models.yaml +0 -116
  283. package/config/prompts.yaml +0 -75
  284. package/config/rubrics.yaml +0 -62
  285. package/config/schedules.yaml +0 -43
  286. package/config/sinks.yaml +0 -54
  287. package/config/sources.yaml +0 -51
  288. package/config/thresholds.yaml +0 -49
  289. package/dist/agent-observer/test-imports.d.ts +0 -7
  290. package/dist/agent-observer/test-imports.js +0 -185
@@ -185,13 +185,13 @@ export const exampleGroqBlogListingData = [
185
185
  ],
186
186
  "baseline": {
187
187
  "enabled": true,
188
- "rubric": "abbreviated"
188
+ "rubric": "full"
189
189
  },
190
190
  "status": "draft"
191
191
  }
192
192
  ];
193
193
  /** Raw YAML string for example-groq-blog-listing (preserves comments) */
194
- export const exampleGroqBlogListingYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Blog listing with GROQ queries\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# Full field reference:\n# https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# ──────────────────────────────────────────────────────────────────────\n\n# Unique identifier — lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example — Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug — the article's URL slug in your docs site\n # reason — why this doc is relevant (helps with auditing)\n #\n # This example uses slug-based references — the simplest form.\n # See the other example tasks for path, id, and perspective references.\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task — the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs — leave empty (\"\"). The pipeline fills this in:\n # • Gold variant: injected with canonical doc content\n # • Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions — how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion — did the LLM implement the feature? (weight: 0.50)\n # code-correctness — is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled — set to false to skip this task entirely\n # rubric — \"abbreviated\" (faster, default), \"full\", or \"none\"\n baseline:\n enabled: true\n rubric: abbreviated\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
194
+ export const exampleGroqBlogListingYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Blog listing with GROQ queries\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Each task evaluates whether an AI coding agent can implement a feature\n# using your docs as context. Delete this file or replace it entirely.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# Full field reference:\n# https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/CONTRIBUTING_TASKS.md\n# ──────────────────────────────────────────────────────────────────────\n\n# Unique identifier — lowercase alphanumeric with hyphens.\n# Must be unique across all task files in .ailf/tasks/.\n- id: example-groq-blog-listing\n\n # Short human-readable summary. Shown in score tables and reports.\n description: \"Example — Blog listing with GROQ queries\"\n\n # Feature area this task belongs to. Tasks with the same area are\n # grouped together in score summaries. Use a short kebab-case name.\n featureArea: groq\n\n # Gold-standard documentation articles for this task. The pipeline\n # fetches these from Sanity and injects them into the prompt for\n # baseline evaluation. 
Each entry needs:\n # slug — the article's URL slug in your docs site\n # reason — why this doc is relevant (helps with auditing)\n #\n # This example uses slug-based references — the simplest form.\n # See the other example tasks for path, id, and perspective references.\n canonicalDocs:\n - slug: groq-introduction\n reason: \"Core GROQ syntax and query language reference\"\n - slug: how-queries-work\n reason: \"Query execution model and best practices\"\n\n # When true, the pipeline auto-generates an additional rubric that\n # checks whether the LLM's response actually used the provided docs.\n docCoverage: true\n\n # Path to a gold-standard implementation, relative to canonical/.\n # The grader uses this as a reference when scoring code correctness.\n referenceSolution: canonical/example-groq-blog-listing.ts\n\n # vars.task — the implementation prompt given to the LLM.\n # Write this as if you're asking a developer to build the feature.\n # Be specific about requirements so the grader can evaluate clearly.\n #\n # vars.docs — leave empty (\"\"). The pipeline fills this in:\n # • Gold variant: injected with canonical doc content\n # • Baseline variant: left empty (tests model knowledge alone)\n vars:\n task: |\n Create a Next.js page component that lists blog posts from Sanity\n using GROQ. The page should display the title, slug, and published\n date for each post, sorted by most recent first. Use the Sanity\n client to fetch data.\n docs: \"\"\n\n # Grading assertions — how the LLM's response is scored.\n #\n # \"llm-rubric\" assertions use a grader LLM to score against criteria.\n # The \"template\" references a rubric from config/rubrics.yaml.\n # The \"criteria\" are task-specific bullets injected into the template.\n #\n # Available templates:\n # task-completion — did the LLM implement the feature? (weight: 0.50)\n # code-correctness — is the code idiomatic and correct? 
(weight: 0.25)\n #\n # You can also use value-based assertions:\n # - type: contains\n # value: \"client.fetch\"\n # - type: contains-any\n # value: [\"createClient\", \"sanityClient\"]\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Uses the groq tagged template literal\"\n - \"Fetches blog posts with title, slug, and publishedAt fields\"\n - \"Orders results by publishedAt in descending order\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses createClient from @sanity/client or next-sanity\"\n - \"Exports a valid Next.js page component\"\n\n # Baseline variant configuration.\n # enabled — set to false to skip this task entirely\n # rubric — \"full\" (default), \"abbreviated\" (faster), or \"none\"\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
195
195
  /** Parsed task data for example-id-based-ref (JSON-safe) */
196
196
  export const exampleIdBasedRefData = [
197
197
  {
@@ -236,13 +236,13 @@ export const exampleIdBasedRefData = [
236
236
  ],
237
237
  "baseline": {
238
238
  "enabled": true,
239
- "rubric": "abbreviated"
239
+ "rubric": "full"
240
240
  },
241
241
  "status": "draft"
242
242
  }
243
243
  ];
244
244
  /** Raw YAML string for example-id-based-ref (preserves comments) */
245
- export const exampleIdBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Document ID-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `id` to reference canonical documentation by\n# Sanity document `_id`. This is useful for:\n# - Draft documents that don't have a stable slug yet\n# - Programmatic references from imports or migrations\n# - Documents where you know the _id but not the slug\n#\n# The `id` ref type can also carry optional `slug` and `path` fields\n# as human-readable annotations — these are NOT used for resolution,\n# only for display in logs and reports.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-id-based-ref\n description: \"Example — GROQ feature support (ID-based doc references)\"\n\n featureArea: groq\n\n # ID-based canonical doc references.\n #\n # Use the Sanity document _id to reference articles directly.\n # Optional slug/path annotations help humans reading the YAML\n # but are NOT used for resolution — only the `id` field matters.\n #\n # These IDs reference real articles in the Sanity docs (next dataset):\n # 0ba88f1b... = \"GROQ feature support across Sanity\"\n # 5b9c2863... 
= \"Custom GROQ functions\"\n canonicalDocs:\n - id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\"\n slug: groq-feature-support-by-context # annotation only — not used for resolution\n reason: \"GROQ feature support across different Sanity contexts\"\n - id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\"\n slug: custom-groq-functions # annotation only — not used for resolution\n reason: \"Custom GROQ functions and pipelines\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains GROQ availability across different Sanity contexts\"\n - \"Describes custom GROQ function creation and usage\"\n - \"Notes differences in GROQ support between contexts\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"GROQ examples use valid syntax\"\n - \"Custom function examples follow the correct API pattern\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
245
+ export const exampleIdBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Document ID-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `id` to reference canonical documentation by\n# Sanity document `_id`. This is useful for:\n# - Draft documents that don't have a stable slug yet\n# - Programmatic references from imports or migrations\n# - Documents where you know the _id but not the slug\n#\n# The `id` ref type can also carry optional `slug` and `path` fields\n# as human-readable annotations — these are NOT used for resolution,\n# only for display in logs and reports.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-id-based-ref\n description: \"Example — GROQ feature support (ID-based doc references)\"\n\n featureArea: groq\n\n # ID-based canonical doc references.\n #\n # Use the Sanity document _id to reference articles directly.\n # Optional slug/path annotations help humans reading the YAML\n # but are NOT used for resolution — only the `id` field matters.\n #\n # These IDs reference real articles in the Sanity docs (next dataset):\n # 0ba88f1b... = \"GROQ feature support across Sanity\"\n # 5b9c2863... 
= \"Custom GROQ functions\"\n canonicalDocs:\n - id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\"\n slug: groq-feature-support-by-context # annotation only — not used for resolution\n reason: \"GROQ feature support across different Sanity contexts\"\n - id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\"\n slug: custom-groq-functions # annotation only — not used for resolution\n reason: \"Custom GROQ functions and pipelines\"\n\n docCoverage: true\n\n vars:\n task: |\n Explain how GROQ is used across different Sanity contexts.\n Cover the following:\n 1. Which GROQ features are available in each context (API queries,\n webhooks, custom functions, access control)\n 2. How to create and use custom GROQ functions\n 3. Any differences in GROQ support between contexts\n Provide examples demonstrating context-specific GROQ patterns.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains GROQ availability across different Sanity contexts\"\n - \"Describes custom GROQ function creation and usage\"\n - \"Notes differences in GROQ support between contexts\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"GROQ examples use valid syntax\"\n - \"Custom function examples follow the correct API pattern\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
246
246
  /** Parsed task data for example-path-based-ref (JSON-safe) */
247
247
  export const examplePathBasedRefData = [
248
248
  {
@@ -286,13 +286,13 @@ export const examplePathBasedRefData = [
286
286
  ],
287
287
  "baseline": {
288
288
  "enabled": true,
289
- "rubric": "abbreviated"
289
+ "rubric": "full"
290
290
  },
291
291
  "status": "draft"
292
292
  }
293
293
  ];
294
294
  /** Raw YAML string for example-path-based-ref (preserves comments) */
295
- export const examplePathBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Path-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `path` to reference canonical documentation.\n# Paths are the preferred reference type because they uniquely identify\n# an article across sections (unlike slugs, which can collide).\n#\n# Path format:\n# - Simple: \"webhooks\" → resolves by slug lookup\n# - Sectioned: \"content-lake/webhooks\" → disambiguates by section + slug\n#\n# This example demonstrates why paths matter: the slug \"documents\"\n# exists in both the \"content-lake\" and \"cli-reference\" sections.\n# Using \"content-lake/documents\" ensures we get the right one.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-path-based-ref\n description: \"Example — GROQ mutations (path-based doc references)\"\n\n featureArea: groq\n\n # Path-based canonical doc references.\n #\n # Use \"section/slug\" format to uniquely identify articles:\n # - \"content-lake/mutations-introduction\" → the mutations article\n # - \"content-lake/documents\" → the documents article in Content Lake\n # (not the CLI \"documents\" article in cli-reference section)\n #\n # The \"documents\" slug exists in two sections — this is exactly why\n # path-based references are preferred over slug-based references.\n canonicalDocs:\n - path: content-lake/mutations-introduction\n reason: \"Introduction to document mutations in the Content Lake\"\n - path: content-lake/documents\n reason: \"Document structure and types (Content Lake, not CLI reference)\"\n\n docCoverage: 
true\n\n vars:\n task: |\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains create, createOrReplace, patch, and delete mutations\"\n - \"Describes required document fields (_id, _type)\"\n - \"Shows patch operations for field-level updates\"\n - \"Includes practical code examples\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses correct @sanity/client mutation API\"\n - \"Patch operations use valid set/unset/inc syntax\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
295
+ export const examplePathBasedRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Path-based canonical doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `path` to reference canonical documentation.\n# Paths are the preferred reference type because they uniquely identify\n# an article across sections (unlike slugs, which can collide).\n#\n# Path format:\n# - Simple: \"webhooks\" → resolves by slug lookup\n# - Sectioned: \"content-lake/webhooks\" → disambiguates by section + slug\n#\n# This example demonstrates why paths matter: the slug \"documents\"\n# exists in both the \"content-lake\" and \"cli-reference\" sections.\n# Using \"content-lake/documents\" ensures we get the right one.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-path-based-ref\n description: \"Example — GROQ mutations (path-based doc references)\"\n\n featureArea: groq\n\n # Path-based canonical doc references.\n #\n # Use \"section/slug\" format to uniquely identify articles:\n # - \"content-lake/mutations-introduction\" → the mutations article\n # - \"content-lake/documents\" → the documents article in Content Lake\n # (not the CLI \"documents\" article in cli-reference section)\n #\n # The \"documents\" slug exists in two sections — this is exactly why\n # path-based references are preferred over slug-based references.\n canonicalDocs:\n - path: content-lake/mutations-introduction\n reason: \"Introduction to document mutations in the Content Lake\"\n - path: content-lake/documents\n reason: \"Document structure and types (Content Lake, not CLI reference)\"\n\n docCoverage: 
true\n\n vars:\n task: |\n Explain how to create, update, and delete documents in Sanity's\n Content Lake using mutations. Cover:\n 1. The different mutation types (create, createOrReplace, patch, delete)\n 2. Document structure and required fields (_id, _type)\n 3. How to use patch operations to update specific fields\n 4. Best practices for mutation patterns\n Provide working code examples using @sanity/client.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Explains create, createOrReplace, patch, and delete mutations\"\n - \"Describes required document fields (_id, _type)\"\n - \"Shows patch operations for field-level updates\"\n - \"Includes practical code examples\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses correct @sanity/client mutation API\"\n - \"Patch operations use valid set/unset/inc syntax\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
296
296
  /** Parsed task data for example-perspective-ref (JSON-safe) */
297
297
  export const examplePerspectiveRefData = [
298
298
  {
@@ -335,13 +335,13 @@ export const examplePerspectiveRefData = [
335
335
  ],
336
336
  "baseline": {
337
337
  "enabled": true,
338
- "rubric": "abbreviated"
338
+ "rubric": "full"
339
339
  },
340
340
  "status": "draft"
341
341
  }
342
342
  ];
343
343
  /** Raw YAML string for example-perspective-ref (preserves comments) */
344
- export const examplePerspectiveRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Perspective / content release doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `perspective` to reference all documentation\n# articles within a content release. This is the key capability for\n# evaluating NEW feature documentation before it's published.\n#\n# How it works:\n# - A perspective ref is one-to-many: the doc fetcher queries the\n# named release and expands it to ALL articles versioned within it.\n# - Downstream consumers see the same flat DocContext[] regardless\n# of how docs were resolved.\n# - When the release is published, the perspective entry becomes a\n# no-op (articles are now in published). Migrate to explicit path\n# or slug refs at your convenience.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-perspective-ref\n description:\n \"Example — GROQ features from content release (perspective-based doc\n references)\"\n\n featureArea: groq\n\n # Perspective-based canonical doc reference.\n #\n # The perspective ID references a content release in the Sanity\n # Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n # all articles versioned in this release and includes them as\n # canonical documentation context.\n #\n # Release rE9TSJvR4 contains:\n # - \"GROQ-powered webhooks\" (webhooks)\n # - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n # - \"GROQ joins\" (groq-joins)\n #\n # You can combine perspective refs with explicit slug/path/id refs\n # to include foundational published docs alongside release content.\n # Here we add groq-data-types as a complementary published reference.\n canonicalDocs:\n - perspective: rE9TSJvR4\n reason: \"All GROQ documentation updates in the test content release\"\n - slug: groq-data-types\n reason: \"GROQ data type reference (published, stable)\"\n\n docCoverage: true\n\n vars:\n task: |\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Demonstrates GROQ join syntax for cross-document queries\"\n - \"Shows GROQ filter patterns for webhook configuration\"\n - \"Includes practical query examples from cheat sheet patterns\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"All GROQ queries use valid syntax\"\n - \"Reference joins use correct dereference operator (->)\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
344
+ export const examplePerspectiveRefYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Perspective / content release doc references\n# ──────────────────────────────────────────────────────────────────────\n#\n# Demonstrates using `perspective` to reference all documentation\n# articles within a content release. This is the key capability for\n# evaluating NEW feature documentation before it's published.\n#\n# How it works:\n# - A perspective ref is one-to-many: the doc fetcher queries the\n# named release and expands it to ALL articles versioned within it.\n# - Downstream consumers see the same flat DocContext[] regardless\n# of how docs were resolved.\n# - When the release is published, the perspective entry becomes a\n# no-op (articles are now in published). Migrate to explicit path\n# or slug refs at your convenience.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n#\n# @see docs/design-docs/canonical-doc-resolution.md\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-perspective-ref\n description:\n \"Example — GROQ features from content release (perspective-based doc\n references)\"\n\n featureArea: groq\n\n # Perspective-based canonical doc reference.\n #\n # The perspective ID references a content release in the Sanity\n # Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n # all articles versioned in this release and includes them as\n # canonical documentation context.\n #\n # Release rE9TSJvR4 contains:\n # - \"GROQ-powered webhooks\" (webhooks)\n # - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n # - \"GROQ joins\" (groq-joins)\n #\n # You can combine perspective refs with explicit slug/path/id refs\n # to include foundational published docs alongside release content.\n # Here we add groq-data-types as a complementary published reference.\n canonicalDocs:\n - perspective: rE9TSJvR4\n reason: \"All GROQ documentation updates in the test content release\"\n - slug: groq-data-types\n reason: \"GROQ data type reference (published, stable)\"\n\n docCoverage: true\n\n vars:\n task: |\n Using GROQ, demonstrate advanced query patterns including:\n 1. Joining data across document types using references\n 2. Filtering webhook payloads with GROQ projections\n 3. Using the query cheat sheet patterns for common operations\n 4. Working with different GROQ data types in filters\n Provide working GROQ query examples for each pattern.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Demonstrates GROQ join syntax for cross-document queries\"\n - \"Shows GROQ filter patterns for webhook configuration\"\n - \"Includes practical query examples from cheat sheet patterns\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"All GROQ queries use valid syntax\"\n - \"Reference joins use correct dereference operator (->)\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
345
345
  /** Parsed task data for example-studio-custom-input (JSON-safe) */
346
346
  export const exampleStudioCustomInputData = [
347
347
  {
@@ -386,13 +386,13 @@ export const exampleStudioCustomInputData = [
386
386
  ],
387
387
  "baseline": {
388
388
  "enabled": true,
389
- "rubric": "abbreviated"
389
+ "rubric": "full"
390
390
  },
391
391
  "status": "draft"
392
392
  }
393
393
  ];
394
394
  /** Raw YAML string for example-studio-custom-input (preserves comments) */
395
- export const exampleStudioCustomInputYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Custom input component in Sanity Studio\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-studio-custom-input\n description: \"Example — Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n # Slug-based canonical doc references.\n canonicalDocs:\n - slug: custom-input-widgets\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n - slug: form-components\n reason: \"Form component API and customization patterns\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. 
The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: abbreviated\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
395
+ export const exampleStudioCustomInputYaml = "# ──────────────────────────────────────────────────────────────────────\n# Example Task: Custom input component in Sanity Studio\n# ──────────────────────────────────────────────────────────────────────\n#\n# This is a starter template — edit it for your own documentation.\n# Delete this file or replace it with your own tasks.\n#\n# This example task ships as a DRAFT so it does not run in production\n# evaluations automatically. To activate it, change status to \"active\"\n# or remove the status line entirely (defaults to active).\n# ──────────────────────────────────────────────────────────────────────\n\n- id: example-studio-custom-input\n description: \"Example — Custom input component in Sanity Studio\"\n\n featureArea: studio\n\n # Slug-based canonical doc references.\n canonicalDocs:\n - slug: custom-input-widgets\n reason: \"Guide for building custom form inputs in Sanity Studio\"\n - slug: form-components\n reason: \"Form component API and customization patterns\"\n\n docCoverage: true\n referenceSolution: canonical/example-studio-custom-input.ts\n\n vars:\n task: |\n Build a custom string input component for Sanity Studio that shows\n a character count below the input field. 
The component should accept\n a maxLength option from the field schema and display a warning when\n the text exceeds the limit.\n docs: \"\"\n\n assert:\n - type: llm-rubric\n template: task-completion\n criteria:\n - \"Implements a React component that renders a text input\"\n - \"Displays a live character count\"\n - \"Reads maxLength from schema options\"\n - \"Shows a visual warning when limit is exceeded\"\n\n - type: llm-rubric\n template: code-correctness\n criteria:\n - \"Uses the Sanity UI library for styling\"\n - \"Calls onChange with patch operations\"\n\n baseline:\n enabled: true\n rubric: full\n\n # Example tasks ship as drafts so they don't run in production evals.\n # Change to \"active\" (or remove this line) to activate.\n status: draft\n";
396
396
  // ---------------------------------------------------------------------------
397
397
  // Aggregate task exports
398
398
  // ---------------------------------------------------------------------------
@@ -15,3 +15,6 @@ export * from "./schemas/index.js";
15
15
  export * from "./ports/index.js";
16
16
  export * from "./services/index.js";
17
17
  export * from "./examples/index.js";
18
+ export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
19
+ export type { PresetConfig, PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
20
+ export { env } from "./env-helper.js";
@@ -15,3 +15,8 @@ export * from "./schemas/index.js";
15
15
  export * from "./ports/index.js";
16
16
  export * from "./services/index.js";
17
17
  export * from "./examples/index.js";
18
+ // ---------------------------------------------------------------------------
19
+ // Architecture overhaul — Phase 0 helpers
20
+ // ---------------------------------------------------------------------------
21
+ export { defineConfig, defineFeatures, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
22
+ export { env } from "./env-helper.js";
@@ -11,7 +11,7 @@
11
11
  * Fields marked optional are transitional — they will become required
12
12
  * as downstream consumers are converted to use them.
13
13
  */
14
- import type { DebugOptions, EvalMode } from "../types/index.js";
14
+ import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
15
15
  import type { CacheStore } from "./cache-store.js";
16
16
  import type { DocFetcher } from "./doc-fetcher.js";
17
17
  import type { EvalRunner } from "./eval-runner.js";
@@ -27,8 +27,19 @@ import type { TaskSource } from "./task-source.js";
27
27
  export interface ResolvedConfig {
28
28
  /** Eval package root directory */
29
29
  rootDir: string;
30
- /** Evaluation mode */
30
+ /** Evaluation mode — canonical name (e.g., "literacy", "knowledge-probe") */
31
31
  mode: EvalMode;
32
+ /**
33
+ * Literacy variant — only meaningful when mode is "literacy".
34
+ *
35
+ * When a user passes `--mode baseline`, the CLI normalizes this to
36
+ * `mode: "literacy", variant: "baseline"`. This keeps the pipeline
37
+ * mode-agnostic while preserving literacy's multi-variant behavior.
38
+ *
39
+ * Values: "baseline" | "agentic" | "observed" | "full" | undefined
40
+ * Undefined means "use the default variant for the mode" (baseline for literacy).
41
+ */
42
+ variant?: string;
32
43
  /** Debug options */
33
44
  debug?: DebugOptions;
34
45
  /** Feature area filter */
@@ -153,6 +164,8 @@ export interface AppContext {
153
164
  readonly evalRunner: EvalRunner;
154
165
  /** Structured logger */
155
166
  readonly logger: Logger;
167
+ /** Plugin registry — mode handlers, assertions, rubric templates, etc. */
168
+ readonly registry: PluginRegistry;
156
169
  /**
157
170
  * Persistent report store (Sanity Content Lake).
158
171
  * Optional — not all commands need it. Commands that publish or
@@ -9,7 +9,7 @@
9
9
  * The pipeline orchestrator and all downstream steps work with
10
10
  * FetchResult regardless of where the documentation came from.
11
11
  */
12
- import type { TaskDefinition } from "./task-source.js";
12
+ import type { GeneralizedTaskDefinition } from "../types/generalized-task.js";
13
13
  /**
14
14
  * A fetched documentation context ready for injection into prompts.
15
15
  *
@@ -127,5 +127,5 @@ export interface DocFetcher {
127
127
  * @param source — Where to fetch documentation from
128
128
  * @returns Fetched doc contexts + optional metadata
129
129
  */
130
- fetch(tasks: TaskDefinition[], source?: DocSourceConfig): Promise<FetchResult>;
130
+ fetch(tasks: GeneralizedTaskDefinition[], source?: DocSourceConfig): Promise<FetchResult>;
131
131
  }
@@ -9,7 +9,8 @@ export type { ConfigSource } from "./config-source.js";
9
9
  export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
10
10
  export type { DocContext, DocFetcher, DocSourceConfig, DocumentManifestEntry, DocumentOverlaySummary, FetchMetadata, FetchResult, ReleaseImpact, UrlFetchEntry, UrlFetchSummary, } from "./doc-fetcher.js";
11
11
  export type { EvalRunConfig, EvalRunner } from "./eval-runner.js";
12
+ export type { CompilationContext, CompileResultAssertion, CompileResultPrompt, CompileResultProvider, CompileResultTestCase, ModeCompileResult, ModeHandler, ModeProviderEntry, ModeRubricConfig, PromptTemplate, } from "./mode-handler.js";
12
13
  export type { Logger } from "./logger.js";
13
14
  export type { PipelineStep } from "./pipeline-step.js";
14
- export type { AssertionDefinition, BaselineConfig, CanonicalDocRef, IdDocRef, PathDocRef, PerspectiveDocRef, SlugDocRef, TaskDefinition, TaskSource, TemplatedAssertion, ValueAssertion, } from "./task-source.js";
15
+ export type { TaskSource } from "./task-source.js";
15
16
  export { canonicalDocRefLabel, isIdRef, isPathRef, isPerspectiveRef, isSlugRef, isTemplatedAssertion, } from "./task-source.js";
@@ -0,0 +1,129 @@
1
+ /**
2
+ * ModeHandler — the common interface every evaluation mode implements.
3
+ *
4
+ * The pipeline dispatches to mode handlers through the PluginRegistry:
5
+ * 1. Look up the mode: `ctx.registry.getMode(mode)`
6
+ * 2. Import the handler module: `import(registration.handlerModule)`
7
+ * 3. Call: `module.handler.compileTask(task, ctx)`
8
+ *
9
+ * Each handler file exports a `handler` object conforming to this interface.
10
+ * The handler narrows the GeneralizedTaskDefinition to its mode-specific
11
+ * variant and produces a ModeCompileResult.
12
+ *
13
+ * Types here are minimal structural contracts — the eval package's Promptfoo
14
+ * types satisfy them via TypeScript structural compatibility.
15
+ *
16
+ * @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
17
+ * @see packages/eval/src/pipeline/compiler/mode-handlers/
18
+ */
19
+ import type { GeneralizedTaskDefinition } from "../types/generalized-task.js";
20
+ /**
21
+ * A prompt template owned by a mode handler.
22
+ *
23
+ * Mode handlers can return these via `getPrompts()` to override the global
24
+ * config/prompts.ts templates. This lets non-literacy modes define their
25
+ * own prompt structures without polluting the global config.
26
+ */
27
+ export interface PromptTemplate {
28
+ /** Unique identifier (e.g. "with-docs", "agentic") */
29
+ id: string;
30
+ /** Human-readable label for display */
31
+ label: string;
32
+ /** The prompt template string with {{variable}} placeholders */
33
+ template: string;
34
+ /** Variable names used by this template — for documentation only */
35
+ variables?: string[];
36
+ }
37
+ /** Compilation context — shared state the pipeline provides to every handler */
38
+ export interface CompilationContext {
39
+ /** Eval package root directory (for resolving file paths) */
40
+ rootDir: string;
41
+ /** Grader provider ID for LLM-graded assertions */
42
+ graderProvider?: string;
43
+ /** Model providers to include in the evaluation */
44
+ models?: ModeProviderEntry[];
45
+ /** Rubric config (templates, weights) — loaded from config/rubrics */
46
+ rubricConfig?: ModeRubricConfig;
47
+ }
48
+ /** A model provider entry for compilation */
49
+ export interface ModeProviderEntry {
50
+ id: string;
51
+ label: string;
52
+ config?: Record<string, unknown>;
53
+ }
54
+ /** Minimal rubric config needed by mode handlers */
55
+ export interface ModeRubricConfig {
56
+ templates: Record<string, {
57
+ dimension?: string;
58
+ header: string;
59
+ scale: string[];
60
+ criteria_label?: string;
61
+ }>;
62
+ }
63
+ /** A provider entry in the compile result */
64
+ export interface CompileResultProvider {
65
+ id: string;
66
+ label?: string;
67
+ config?: Record<string, unknown>;
68
+ }
69
+ /** A prompt entry in the compile result */
70
+ export interface CompileResultPrompt {
71
+ id: string;
72
+ label: string;
73
+ raw: string;
74
+ }
75
+ /** A test case entry in the compile result */
76
+ export interface CompileResultTestCase {
77
+ description: string;
78
+ vars: Record<string, unknown>;
79
+ assert?: CompileResultAssertion[];
80
+ prompts?: string[];
81
+ }
82
+ /** An assertion entry in a test case */
83
+ export interface CompileResultAssertion {
84
+ type: string;
85
+ value?: unknown;
86
+ weight?: number;
87
+ provider?: string;
88
+ metadata?: Record<string, unknown>;
89
+ }
90
+ /**
91
+ * ModeCompileResult — the common output every mode handler produces.
92
+ *
93
+ * All four fields (providers, tests, prompts, warnings) are present in
94
+ * every handler's result type. Mode-specific extras (sandbox config,
95
+ * metadata, extensions) go in the `extras` bag.
96
+ */
97
+ export interface ModeCompileResult {
98
+ /** Provider configurations for Promptfoo */
99
+ providers: CompileResultProvider[];
100
+ /** Compiled test cases */
101
+ tests: CompileResultTestCase[];
102
+ /** Prompt templates */
103
+ prompts: CompileResultPrompt[];
104
+ /** Warnings generated during compilation */
105
+ warnings: string[];
106
+ /** Mode-specific extras (extensions, sandboxConfig, metadata, etc.) */
107
+ extras?: Record<string, unknown>;
108
+ }
109
+ /**
110
+ * ModeHandler — the interface every evaluation mode handler exports.
111
+ *
112
+ * Handler modules are referenced by `ModeRegistration.handlerModule` in the
113
+ * plugin registry. The pipeline imports the module and calls `handler.compileTask()`.
114
+ *
115
+ * Each handler file should export:
116
+ * export const handler: ModeHandler = { ... }
117
+ */
118
+ export interface ModeHandler {
119
+ /** Compile a task definition into evaluation configuration */
120
+ compileTask(task: GeneralizedTaskDefinition, ctx: CompilationContext): ModeCompileResult;
121
+ /**
122
+ * Return prompt templates owned by this mode.
123
+ *
124
+ * When defined, the compiler uses these instead of global config/prompts.ts.
125
+ * Keys are prompt IDs (e.g. "with-docs", "agentic"). Omitting the method falls back to
126
+ * global prompts, as does returning undefined — though the declared return type omits undefined.
127
+ */
128
+ getPrompts?(): Record<string, PromptTemplate>;
129
+ }
@@ -0,0 +1,19 @@
1
+ /**
2
+ * ModeHandler — the common interface every evaluation mode implements.
3
+ *
4
+ * The pipeline dispatches to mode handlers through the PluginRegistry:
5
+ * 1. Look up the mode: `ctx.registry.getMode(mode)`
6
+ * 2. Import the handler module: `import(registration.handlerModule)`
7
+ * 3. Call: `module.handler.compileTask(task, ctx)`
8
+ *
9
+ * Each handler file exports a `handler` object conforming to this interface.
10
+ * The handler narrows the GeneralizedTaskDefinition to its mode-specific
11
+ * variant and produces a ModeCompileResult.
12
+ *
13
+ * Types here are minimal structural contracts — the eval package's Promptfoo
14
+ * types satisfy them via TypeScript structural compatibility.
15
+ *
16
+ * @see docs/design-docs/architecture-overhaul/extensibility-plugins.md
17
+ * @see packages/eval/src/pipeline/compiler/mode-handlers/
18
+ */
19
+ export {};
@@ -7,150 +7,44 @@
7
7
  * - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
8
8
  *
9
9
  * The key invariant: the pipeline orchestrator and all downstream steps
10
- * work with TaskDefinition[] regardless of where they came from.
10
+ * work with GeneralizedTaskDefinition[] regardless of where they came from.
11
11
  */
12
+ import type { GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, IdDocRef, PathDocRef, PerspectiveDocRef, SlugDocRef } from "../types/generalized-task.js";
12
13
  import type { FilterOptions } from "../types/index.js";
13
- /** A templated assertion referencing a rubric template from config/rubrics.yaml */
14
- export interface TemplatedAssertion {
15
- type: "llm-rubric";
16
- template: string;
17
- criteria: string[];
18
- weight?: number;
19
- }
20
- /** A value-based assertion (contains, javascript, etc.) */
21
- export interface ValueAssertion {
22
- type: string;
23
- value?: unknown;
24
- weight?: number;
25
- [key: string]: unknown;
26
- }
27
- /** Any assertion definition — either templated or value-based */
28
- export type AssertionDefinition = TemplatedAssertion | ValueAssertion;
29
- /** Baseline variant configuration */
30
- export interface BaselineConfig {
31
- /** Whether to generate a baseline variant. Default: true */
32
- enabled?: boolean;
33
- /** Rubric mode for baseline. Default: "abbreviated" */
34
- rubric?: "abbreviated" | "full" | "none";
35
- }
36
- /**
37
- * A canonical documentation reference. Each entry resolves docs through
38
- * one of four strategies, discriminated by key presence (no explicit
39
- * `type` field). All strategies carry an optional `reason` for context.
40
- *
41
- * Strategies:
42
- * - `slug` — one article by slug field (legacy, may not be unique)
43
- * - `path` — one article by URL path (unique across sections)
44
- * - `id` — one document by Sanity `_id` (drafts, imports)
45
- * - `perspective` — all articles in a content release (one-to-many)
46
- *
47
- * @see docs/design-docs/canonical-doc-resolution.md
48
- */
49
- export type CanonicalDocRef = SlugDocRef | PathDocRef | IdDocRef | PerspectiveDocRef;
50
- /** Resolve by article slug field. Legacy — prefer `path` for uniqueness. */
51
- export interface SlugDocRef {
52
- slug: string;
53
- reason?: string;
54
- }
55
- /** Resolve by URL path (after /docs/). Unique across sections. */
56
- export interface PathDocRef {
57
- path: string;
58
- reason?: string;
59
- }
60
- /** Resolve by Sanity document `_id`. The primary resolution strategy.
61
- *
62
- * Optional `slug` and `path` provide human-readable context — they are
63
- * NOT used for resolution (the `_id` is authoritative) but help YAML
64
- * authors understand which document is being referenced. The Content Lake
65
- * adapter populates them from the dereferenced article.
66
- */
67
- export interface IdDocRef {
68
- id: string;
69
- reason?: string;
70
- /** Human-readable slug (informational only — not used for resolution) */
71
- slug?: string;
72
- /** Human-readable path (informational only — not used for resolution) */
73
- path?: string;
74
- }
75
- /** Resolve all articles in a content release. One-to-many. */
76
- export interface PerspectiveDocRef {
77
- perspective: string;
78
- reason?: string;
79
- }
80
- /**
81
- * A loaded, validated task definition ready for expansion.
82
- *
83
- * This is the canonical intermediate representation — adapters produce
84
- * this from YAML, Content Lake, or .ailf/ files. Downstream consumers
85
- * (expansion, doc fetching, validation) work exclusively with this type.
86
- *
87
- * Design notes:
88
- * - `taskPrompt` is extracted from `vars.task` in YAML format
89
- * - `docsPath` is NOT included — it's an infrastructure detail derived
90
- * from convention (`file://contexts/canonical/${id}.md`)
91
- * - `featureArea` is derived by the adapter (filename stem, document
92
- * field, directory structure — depends on the source)
93
- */
94
- export interface TaskDefinition {
95
- /** Unique task identifier */
96
- id: string;
97
- /** Human-readable description */
98
- description: string;
99
- /** Feature area this task belongs to */
100
- featureArea: string;
101
- /** The implementation task prompt (the user-facing request) */
102
- taskPrompt: string;
103
- /** Canonical doc references with reasons */
104
- canonicalDocs: CanonicalDocRef[];
105
- /** Path to the reference solution (relative to eval package root) */
106
- referenceSolution: string;
107
- /** Whether doc coverage rubric should be auto-generated */
108
- docCoverage: boolean;
109
- /** Assertion definitions (rubric templates + value assertions) */
110
- assertions: AssertionDefinition[];
111
- /** Baseline variant configuration */
112
- baseline?: BaselineConfig;
113
- /** Additional template variables beyond task (e.g., custom vars) */
114
- extraVars?: Record<string, unknown>;
115
- /** Lifecycle status — controls pipeline inclusion. Absent = "active". */
116
- status?: "active" | "archived" | "draft" | "paused";
117
- /** Freeform labels for filtering and organization */
118
- tags?: string[];
119
- }
120
- /** Check if a canonical doc ref resolves by slug.
14
+ /** Check if a doc ref resolves by slug.
121
15
  *
122
16
  * Excludes IdDocRef (which may carry an optional `slug` for display).
123
17
  * When both `id` and `slug` are present, it's an IdDocRef, not a SlugDocRef.
124
18
  */
125
- export declare function isSlugRef(ref: CanonicalDocRef): ref is SlugDocRef;
126
- /** Check if a canonical doc ref resolves by path.
19
+ export declare function isSlugRef(ref: GeneralizedDocRef): ref is SlugDocRef;
20
+ /** Check if a doc ref resolves by path.
127
21
  *
128
22
  * Excludes IdDocRef (which may carry an optional `path` for display).
129
23
  * When both `id` and `path` are present, it's an IdDocRef, not a PathDocRef.
130
24
  */
131
- export declare function isPathRef(ref: CanonicalDocRef): ref is PathDocRef;
132
- /** Check if a canonical doc ref resolves by document ID.
25
+ export declare function isPathRef(ref: GeneralizedDocRef): ref is PathDocRef;
26
+ /** Check if a doc ref resolves by document ID.
133
27
  *
134
28
  * Uses `"id" in ref` as the primary discriminator. IdDocRef may also carry
135
29
  * optional `slug` and `path` for display purposes, so we cannot exclude
136
30
  * on those keys. When both `id` and `slug` are present, `id` wins.
137
31
  */
138
- export declare function isIdRef(ref: CanonicalDocRef): ref is IdDocRef;
139
- /** Check if a canonical doc ref resolves by content release perspective */
140
- export declare function isPerspectiveRef(ref: CanonicalDocRef): ref is PerspectiveDocRef;
32
+ export declare function isIdRef(ref: GeneralizedDocRef): ref is IdDocRef;
33
+ /** Check if a doc ref resolves by content release perspective */
34
+ export declare function isPerspectiveRef(ref: GeneralizedDocRef): ref is PerspectiveDocRef;
141
35
  /**
142
- * Extract a display identifier from any canonical doc ref.
36
+ * Extract a display identifier from any doc ref.
143
37
  * Useful for logging, error messages, and retrieval metrics.
144
38
  */
145
- export declare function canonicalDocRefLabel(ref: CanonicalDocRef): string;
39
+ export declare function canonicalDocRefLabel(ref: GeneralizedDocRef): string;
146
40
  /** Check if an assertion uses the templated format (template + criteria) */
147
- export declare function isTemplatedAssertion(entry: AssertionDefinition): entry is TemplatedAssertion;
41
+ export declare function isTemplatedAssertion(entry: GeneralizedAssertionDefinition): entry is GeneralizedTemplatedAssertion;
148
42
  /**
149
43
  * Port: Where task definitions come from.
150
44
  *
151
45
  * The pipeline never knows HOW tasks are loaded — it only sees
152
- * TaskDefinition[]. The adapter handles YAML parsing, GROQ queries,
153
- * filesystem scanning, etc.
46
+ * GeneralizedTaskDefinition[]. The adapter handles YAML parsing, GROQ
47
+ * queries, filesystem scanning, etc.
154
48
  */
155
49
  export interface TaskSource {
156
50
  /**
@@ -159,5 +53,5 @@ export interface TaskSource {
159
53
  * @param filter — Area, task ID, or changed-doc filters
160
54
  * @returns Validated task definitions ready for expansion
161
55
  */
162
- loadTasks(filter?: FilterOptions): Promise<TaskDefinition[]>;
56
+ loadTasks(filter?: FilterOptions): Promise<GeneralizedTaskDefinition[]>;
163
57
  }
@@ -7,12 +7,12 @@
7
7
  * - RepoTaskSource (tasks-as-content Phase 4) — reads .ailf/tasks/
8
8
  *
9
9
  * The key invariant: the pipeline orchestrator and all downstream steps
10
- * work with TaskDefinition[] regardless of where they came from.
10
+ * work with GeneralizedTaskDefinition[] regardless of where they came from.
11
11
  */
12
12
  // ---------------------------------------------------------------------------
13
- // Type guards — canonical doc refs
13
+ // Type guards — doc refs
14
14
  // ---------------------------------------------------------------------------
15
- /** Check if a canonical doc ref resolves by slug.
15
+ /** Check if a doc ref resolves by slug.
16
16
  *
17
17
  * Excludes IdDocRef (which may carry an optional `slug` for display).
18
18
  * When both `id` and `slug` are present, it's an IdDocRef, not a SlugDocRef.
@@ -20,7 +20,7 @@
20
20
  export function isSlugRef(ref) {
21
21
  return "slug" in ref && !("id" in ref);
22
22
  }
23
- /** Check if a canonical doc ref resolves by path.
23
+ /** Check if a doc ref resolves by path.
24
24
  *
25
25
  * Excludes IdDocRef (which may carry an optional `path` for display).
26
26
  * When both `id` and `path` are present, it's an IdDocRef, not a PathDocRef.
@@ -28,7 +28,7 @@ export function isSlugRef(ref) {
28
28
  export function isPathRef(ref) {
29
29
  return "path" in ref && !("id" in ref);
30
30
  }
31
- /** Check if a canonical doc ref resolves by document ID.
31
+ /** Check if a doc ref resolves by document ID.
32
32
  *
33
33
  * Uses `"id" in ref` as the primary discriminator. IdDocRef may also carry
34
34
  * optional `slug` and `path` for display purposes, so we cannot exclude
@@ -37,12 +37,12 @@ export function isPathRef(ref) {
37
37
  export function isIdRef(ref) {
38
38
  return "id" in ref;
39
39
  }
40
- /** Check if a canonical doc ref resolves by content release perspective */
40
+ /** Check if a doc ref resolves by content release perspective */
41
41
  export function isPerspectiveRef(ref) {
42
42
  return "perspective" in ref;
43
43
  }
44
44
  /**
45
- * Extract a display identifier from any canonical doc ref.
45
+ * Extract a display identifier from any doc ref.
46
46
  * Useful for logging, error messages, and retrieval metrics.
47
47
  */
48
48
  export function canonicalDocRefLabel(ref) {
@@ -29,10 +29,15 @@ export declare const EvalConfigSchema: z.ZodObject<{
29
29
  graderReplications: z.ZodOptional<z.ZodNumber>;
30
30
  headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
31
31
  mode: z.ZodOptional<z.ZodEnum<{
32
- agentic: "agentic";
32
+ custom: "custom";
33
+ literacy: "literacy";
34
+ "mcp-server": "mcp-server";
35
+ "agent-harness": "agent-harness";
36
+ "knowledge-probe": "knowledge-probe";
33
37
  baseline: "baseline";
34
- full: "full";
38
+ agentic: "agentic";
35
39
  observed: "observed";
40
+ full: "full";
36
41
  }>>;
37
42
  noAutoScope: z.ZodOptional<z.ZodBoolean>;
38
43
  noCache: z.ZodOptional<z.ZodBoolean>;