npm - @sanity/ailf - Versions diffs - 3.2.0 → 3.3.1 - Mend

@sanity/ailf 3.2.0 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/dist/_vendor/ailf-core/examples/index.d.ts +8 -8
package/dist/_vendor/ailf-core/examples/index.js +8 -8
package/dist/_vendor/ailf-shared/feature-flags.d.ts +59 -0
package/dist/_vendor/ailf-shared/feature-flags.js +44 -0
package/dist/_vendor/ailf-shared/index.d.ts +1 -0
package/dist/_vendor/ailf-shared/index.js +1 -0
package/dist/adapters/config-sources/ailf-resolver.d.ts +55 -0
package/dist/adapters/config-sources/ailf-resolver.js +147 -0
package/dist/adapters/config-sources/ts-config-loader.js +7 -0
package/dist/adapters/task-sources/repo-schemas.d.ts +35 -5
package/dist/adapters/task-sources/repo-schemas.js +25 -3
package/dist/adapters/task-sources/task-file-loader.js +3 -0
package/dist/commands/init.d.ts +1 -1
package/dist/commands/init.js +19 -5
package/dist/commands/pipeline-action.js +51 -6
package/dist/commands/pipeline.js +1 -1
package/dist/commands/validate-tasks.d.ts +14 -3
package/dist/commands/validate-tasks.js +125 -81
package/dist/index.d.ts +2 -0
package/dist/index.js +4 -0
package/dist/pipeline/compiler/config-loader.js +6 -1
package/dist/pipeline/compiler/preset-loader.js +3 -0
package/package.json +1 -1

package/dist/_vendor/ailf-core/examples/index.d.ts CHANGED Viewed

@@ -158,7 +158,7 @@ export declare const exampleAgentAddSchemaData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-agent-add-schema */
-export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. 
The agent can read, write,\n  // edit files and run shell commands \u2014 then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example \u2014 tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      \u2192 Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" \u2192 coding + WebSearch, WebFetch, 
TodoRead, TodoWrite\n  //   \"read-only\"   \u2192 Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/studio-basic\" copies the\n  // entire studio-basic app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/studio-basic\"],\n\n  // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The task instructions sent to the agent. Be specific about expected\n  // file paths and patterns \u2014 the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      \u2014 check a file was created\n  //   file-contains    \u2014 check a file contains a substring\n  //   command-succeeds \u2014 run a shell command (exit 0 = pass)\n  //   diff-matches     \u2014 check git diff contains a pattern\n  //   llm-rubric       \u2014 LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: \"draft\",\n})\n";
+export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. 
The agent can read, write,\n  // edit files and run shell commands \u2014 then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example \u2014 tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      \u2192 Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" \u2192 coding + WebSearch, WebFetch, 
TodoRead, TodoWrite\n  //   \"read-only\"   \u2192 Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/studio-basic\" copies the\n  // entire studio-basic app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/studio-basic\"],\n\n  // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The task instructions sent to the agent. Be specific about expected\n  // file paths and patterns \u2014 the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      \u2014 check a file was created\n  //   file-contains    \u2014 check a file contains a substring\n  //   command-succeeds \u2014 run a shell command (exit 0 = pass)\n  //   diff-matches     \u2014 check git diff contains a pattern\n  //   llm-rubric       \u2014 LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-agent-add-schema (from parsed TS data) */
 export declare const exampleAgentAddSchemaYaml = "- mode: agent-harness\n  id: example-agent-add-schema\n  title: Add a document schema to a Sanity Studio project\n  description: Example \u2014 tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n  area: studio\n  sandbox:\n    type: tempdir\n  tools:\n    - coding\n  fixtures:\n    - file://apps/studio-basic\n  prompt:\n    text: |-\n      You have a Sanity Studio project with an existing article schema.\n      Add a new \"post\" document type with the following fields:\n      1. title (string, required)\n      2. slug (slug, sourced from title, required)\n      3. author (string)\n      4. publishedAt (datetime)\n      5. body (array of block content)\n\n      Create the schema file at schemas/post.ts using defineType() and defineField().\n      Register it in schemas/index.ts alongside the existing article schema.\n  assertions:\n    - type: file-exists\n      value: schemas/post.ts\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineType\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineField\n    - type: file-contains\n      value:\n        path: schemas/index.ts\n        content: post\n    - type: file-contains\n      value:\n        path: sanity.config.ts\n        content: defineConfig\n  status: draft\n";
 /** Parsed task data for example-groq-blog-listing (JSON-safe) */
@@ -198,7 +198,7 @@ export declare const exampleGroqBlogListingData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-groq-blog-listing */
-export declare const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n *   https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/contributing-tasks.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // \"literacy\" tests whether AI coding tools can implement features\n  // using your docs as context. 
Other modes: \"mcp-server\",\n  // \"knowledge-probe\", \"agent-harness\", \"custom\".\n  mode: \"literacy\",\n\n  // \u2500\u2500 Identity \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Unique identifier \u2014 lowercase alphanumeric with hyphens.\n  // Must be unique across all task files in .ailf/tasks/.\n  id: \"example-groq-blog-listing\",\n  title: \"Blog listing with GROQ queries\",\n  description: \"Example \u2014 tests GROQ blog listing implementation\",\n\n  // Feature area this task belongs to. Tasks with the same area are\n  // grouped together in score summaries.\n  area: \"groq\",\n\n  // \u2500\u2500 Documentation context \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Canonical doc references for this task. 
The pipeline fetches these\n  // from Sanity and injects them into the prompt for baseline evaluation.\n  //\n  // This example uses slug-based references \u2014 the simplest form.\n  // See the other example tasks for path, id, and perspective references.\n  context: {\n    docs: [\n      {\n        slug: \"groq-introduction\",\n        reason: \"Core GROQ syntax and query language reference\",\n      },\n      {\n        slug: \"how-queries-work\",\n        reason: \"Query execution model and best practices\",\n      },\n    ],\n  },\n\n  // When true, the pipeline auto-generates an additional rubric that\n  // checks whether the LLM's response actually used the provided docs.\n  docCoverage: true,\n\n  // Path to a gold-standard implementation, relative to canonical/.\n  // The grader uses this as a reference when scoring code correctness.\n  referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n  // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // prompt.text \u2014 the implementation prompt given to the LLM.\n  // Write this as if you're asking a developer to build the feature.\n  // Be specific about requirements so the grader can evaluate clearly.\n  prompt: {\n    text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. 
Use the Sanity\nclient to fetch data.`,\n  },\n\n  // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Grading assertions \u2014 how the LLM's response is scored.\n  //\n  // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n  // The \"template\" references a rubric template (e.g. task-completion).\n  //\n  // Available templates:\n  //   task-completion   \u2014 did the LLM implement the feature? (weight: 0.50)\n  //   code-correctness  \u2014 is the code idiomatic and correct? (weight: 0.25)\n  //\n  // You can also use value-based assertions:\n  //   { type: \"contains\", value: \"client.fetch\" }\n  //   { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Uses the groq tagged template literal\",\n        \"Fetches blog posts with title, slug, and publishedAt fields\",\n        \"Orders results by publishedAt in descending order\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses createClient from @sanity/client or next-sanity\",\n        \"Exports a valid Next.js page component\",\n      ],\n    },\n  ],\n\n  // \u2500\u2500 Baseline variant \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  //   enabled \u2014 set to false to skip this task entirely\n  //   rubric  \u2014 \"full\" 
(default), \"abbreviated\" (faster), or \"none\"\n  baseline: {\n    enabled: true,\n    rubric: \"full\",\n  },\n\n  // Example tasks ship as drafts so they don't run in production evals.\n  // Change to \"active\" (or remove this field) to activate.\n  status: \"draft\",\n})\n";
+export declare const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n *   https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/contributing-tasks.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // \"literacy\" tests whether AI coding tools can implement features\n  // using your docs as context. 
Other modes: \"mcp-server\",\n  // \"knowledge-probe\", \"agent-harness\", \"custom\".\n  mode: \"literacy\",\n\n  // \u2500\u2500 Identity \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Unique identifier \u2014 lowercase alphanumeric with hyphens.\n  // Must be unique across all task files in .ailf/tasks/.\n  id: \"example-groq-blog-listing\",\n  title: \"Blog listing with GROQ queries\",\n  description: \"Example \u2014 tests GROQ blog listing implementation\",\n\n  // Feature area this task belongs to. Tasks with the same area are\n  // grouped together in score summaries.\n  area: \"groq\",\n\n  // \u2500\u2500 Documentation context \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Canonical doc references for this task. 
The pipeline fetches these\n  // from Sanity and injects them into the prompt for baseline evaluation.\n  //\n  // This example uses slug-based references \u2014 the simplest form.\n  // See the other example tasks for path, id, and perspective references.\n  context: {\n    docs: [\n      {\n        slug: \"groq-introduction\",\n        reason: \"Core GROQ syntax and query language reference\",\n      },\n      {\n        slug: \"how-queries-work\",\n        reason: \"Query execution model and best practices\",\n      },\n    ],\n  },\n\n  // When true, the pipeline auto-generates an additional rubric that\n  // checks whether the LLM's response actually used the provided docs.\n  docCoverage: true,\n\n  // Path to a gold-standard implementation, relative to canonical/.\n  // The grader uses this as a reference when scoring code correctness.\n  referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n  // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // prompt.text \u2014 the implementation prompt given to the LLM.\n  // Write this as if you're asking a developer to build the feature.\n  // Be specific about requirements so the grader can evaluate clearly.\n  prompt: {\n    text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. 
Use the Sanity\nclient to fetch data.`,\n  },\n\n  // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Grading assertions \u2014 how the LLM's response is scored.\n  //\n  // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n  // The \"template\" references a rubric template (e.g. task-completion).\n  //\n  // Available templates:\n  //   task-completion   \u2014 did the LLM implement the feature? (weight: 0.50)\n  //   code-correctness  \u2014 is the code idiomatic and correct? (weight: 0.25)\n  //\n  // You can also use value-based assertions:\n  //   { type: \"contains\", value: \"client.fetch\" }\n  //   { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Uses the groq tagged template literal\",\n        \"Fetches blog posts with title, slug, and publishedAt fields\",\n        \"Orders results by publishedAt in descending order\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses createClient from @sanity/client or next-sanity\",\n        \"Exports a valid Next.js page component\",\n      ],\n    },\n  ],\n\n  // \u2500\u2500 Baseline variant \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  //   enabled \u2014 set to false to skip this task entirely\n  //   rubric  \u2014 \"full\" 
(default), \"abbreviated\" (faster), or \"none\"\n  baseline: {\n    enabled: true,\n    rubric: \"full\",\n  },\n\n  // Example tasks ship as drafts so they don't run in production evals.\n  // Change to \"active\" (or remove this field) to activate.\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-groq-blog-listing (from parsed TS data) */
 export declare const exampleGroqBlogListingYaml = "- mode: literacy\n  id: example-groq-blog-listing\n  title: Blog listing with GROQ queries\n  description: Example \u2014 tests GROQ blog listing implementation\n  area: groq\n  context:\n    docs:\n      - slug: groq-introduction\n        reason: Core GROQ syntax and query language reference\n      - slug: how-queries-work\n        reason: Query execution model and best practices\n  docCoverage: true\n  referenceSolution: canonical/example-groq-blog-listing.ts\n  prompt:\n    text: |-\n      Create a Next.js page component that lists blog posts from Sanity\n      using GROQ. The page should display the title, slug, and published\n      date for each post, sorted by most recent first. Use the Sanity\n      client to fetch data.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Uses the groq tagged template literal\n        - Fetches blog posts with title, slug, and publishedAt fields\n        - Orders results by publishedAt in descending order\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - Uses createClient from @sanity/client or next-sanity\n        - Exports a valid Next.js page component\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-id-based-ref (JSON-safe) */
@@ -239,7 +239,7 @@ export declare const exampleIdBasedRefData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-id-based-ref */
-export declare const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n *   - Draft documents that don't have a stable slug yet\n *   - Programmatic references from imports or migrations\n *   - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations \u2014 these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-id-based-ref\",\n  title: \"GROQ feature support (ID-based doc references)\",\n  description: \"Example \u2014 demonstrates ID-based canonical doc references\",\n\n  area: \"groq\",\n\n  // ID-based canonical doc references.\n  //\n  // Use the Sanity document _id to reference articles directly.\n  // Optional slug/path annotations help humans reading the file\n  // but are NOT used for resolution \u2014 only the `id` field matters.\n  //\n  // These IDs reference real articles in the Sanity docs (next dataset):\n  //   0ba88f1b... = \"GROQ feature support across Sanity\"\n  //   5b9c2863... 
= \"Custom GROQ functions\"\n  context: {\n    docs: [\n      {\n        id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n        slug: \"groq-feature-support-by-context\", // annotation only\n        reason: \"GROQ feature support across different Sanity contexts\",\n      },\n      {\n        id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n        slug: \"custom-groq-functions\", // annotation only\n        reason: \"Custom GROQ functions and pipelines\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n   webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains GROQ availability across different Sanity contexts\",\n        \"Describes custom GROQ function creation and usage\",\n        \"Notes differences in GROQ support between contexts\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"GROQ examples use valid syntax\",\n        \"Custom function examples follow the correct API pattern\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export declare const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n *   - Draft documents that don't have a stable slug yet\n *   - Programmatic references from imports or migrations\n *   - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations \u2014 these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-id-based-ref\",\n  title: \"GROQ feature support (ID-based doc references)\",\n  description: \"Example \u2014 demonstrates ID-based canonical doc references\",\n\n  area: \"groq\",\n\n  // ID-based canonical doc references.\n  //\n  // Use the Sanity document _id to reference articles directly.\n  // Optional slug/path annotations help humans reading the file\n  // but are NOT used for resolution \u2014 only the `id` field matters.\n  //\n  // These IDs reference real articles in the Sanity docs (next dataset):\n  //   0ba88f1b... = \"GROQ feature support across Sanity\"\n  //   5b9c2863... 
= \"Custom GROQ functions\"\n  context: {\n    docs: [\n      {\n        id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n        slug: \"groq-feature-support-by-context\", // annotation only\n        reason: \"GROQ feature support across different Sanity contexts\",\n      },\n      {\n        id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n        slug: \"custom-groq-functions\", // annotation only\n        reason: \"Custom GROQ functions and pipelines\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n   webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains GROQ availability across different Sanity contexts\",\n        \"Describes custom GROQ function creation and usage\",\n        \"Notes differences in GROQ support between contexts\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"GROQ examples use valid syntax\",\n        \"Custom function examples follow the correct API pattern\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-id-based-ref (from parsed TS data) */
 export declare const exampleIdBasedRefYaml = "- mode: literacy\n  id: example-id-based-ref\n  title: GROQ feature support (ID-based doc references)\n  description: Example \u2014 demonstrates ID-based canonical doc references\n  area: groq\n  context:\n    docs:\n      - id: 0ba88f1b-d1a7-418a-9267-2e343d01886a\n        slug: groq-feature-support-by-context\n        reason: GROQ feature support across different Sanity contexts\n      - id: 5b9c2863-ef01-4565-af8e-ee54e081ee74\n        slug: custom-groq-functions\n        reason: Custom GROQ functions and pipelines\n  docCoverage: true\n  prompt:\n    text: |-\n      Explain how GROQ is used across different Sanity contexts.\n      Cover the following:\n      1. Which GROQ features are available in each context (API queries,\n         webhooks, custom functions, access control)\n      2. How to create and use custom GROQ functions\n      3. Any differences in GROQ support between contexts\n      Provide examples demonstrating context-specific GROQ patterns.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Explains GROQ availability across different Sanity contexts\n        - Describes custom GROQ function creation and usage\n        - Notes differences in GROQ support between contexts\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - GROQ examples use valid syntax\n        - Custom function examples follow the correct API pattern\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-knowledge-probe (JSON-safe) */
@@ -260,7 +260,7 @@ export declare const exampleKnowledgeProbeData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-knowledge-probe */
-export declare const exampleKnowledgeProbeTs = "/**\n * Example Task: Knowledge probe baseline (DRAFT).\n *\n * Tests what the model knows about a topic without providing documentation.\n * Used to establish a baseline for comparison with literacy evaluations.\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n *\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"knowledge-probe\",\n  id: \"example-knowledge-probe\",\n  title: \"Model knowledge of GROQ syntax\",\n  description: \"Example \u2014 probes baseline model knowledge (draft)\",\n  area: \"groq\",\n\n  prompt: {\n    text: `Explain the GROQ query language used by Sanity. Cover:\n1. Basic query syntax and projections\n2. How to filter and sort results\n3. Common patterns for fetching related documents\nProvide working code examples.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates understanding of GROQ query syntax\",\n        \"Shows filtering and projection patterns\",\n        \"Code examples use valid GROQ syntax\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
+export declare const exampleKnowledgeProbeTs = "/**\n * Example Task: Knowledge probe baseline (DRAFT).\n *\n * Tests what the model knows about a topic without providing documentation.\n * Used to establish a baseline for comparison with literacy evaluations.\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n *\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"knowledge-probe\",\n  id: \"example-knowledge-probe\",\n  title: \"Model knowledge of GROQ syntax\",\n  description: \"Example \u2014 probes baseline model knowledge (draft)\",\n  area: \"groq\",\n\n  prompt: {\n    text: `Explain the GROQ query language used by Sanity. Cover:\n1. Basic query syntax and projections\n2. How to filter and sort results\n3. Common patterns for fetching related documents\nProvide working code examples.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates understanding of GROQ query syntax\",\n        \"Shows filtering and projection patterns\",\n        \"Code examples use valid GROQ syntax\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-knowledge-probe (from parsed TS data) */
 export declare const exampleKnowledgeProbeYaml = "- mode: knowledge-probe\n  id: example-knowledge-probe\n  title: Model knowledge of GROQ syntax\n  description: Example \u2014 probes baseline model knowledge (draft)\n  area: groq\n  prompt:\n    text: |-\n      Explain the GROQ query language used by Sanity. Cover:\n      1. Basic query syntax and projections\n      2. How to filter and sort results\n      3. Common patterns for fetching related documents\n      Provide working code examples.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Demonstrates understanding of GROQ query syntax\n        - Shows filtering and projection patterns\n        - Code examples use valid GROQ syntax\n  status: draft\n";
 /** Parsed task data for example-mcp-tool-usage (JSON-safe) */
@@ -288,7 +288,7 @@ export declare const exampleMcpToolUsageData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-mcp-tool-usage */
-export declare const exampleMcpToolUsageTs = "/**\n * Example Task: MCP Server tool-use evaluation (DRAFT).\n *\n * Tests whether an LLM can correctly discover and invoke tools from\n * an MCP server. Replace the placeholder serverConfig with your own\n * MCP server's URL and authentication details.\n *\n * Transports:\n *   - \"streamable-http\" / \"sse\" \u2014 remote servers (set url + optional headers)\n *   - \"stdio\" \u2014 local process (set command instead of url)\n *\n * Authentication:\n *   - `headers` \u2014 send arbitrary HTTP headers (e.g., Authorization)\n *   - `auth`    \u2014 structured auth config (bearer, basic, api_key, oauth)\n *   Values support {{env.VAR}} syntax so secrets stay out of source control.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"mcp-server\",\n  id: \"example-mcp-tool-usage\",\n  title: \"MCP tool discovery and invocation\",\n  description: \"Example \u2014 tests MCP server tool-use (draft)\",\n  area: \"mcp\",\n\n  // \u2500\u2500 Server configuration \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Replace the URL and headers below with your MCP server's details.\n  //\n  // For a local stdio server, use:\n  //   transport: \"stdio\",\n  //   command: \"node dist/my-mcp-server.js\",\n  serverConfig: {\n    transport: \"streamable-http\",\n    url: \"https://your-mcp-server.example.com\",\n    headers: {\n      Authorization: \"Bearer {{env.MCP_AUTH_TOKEN}}\",\n    },\n  },\n\n  // \u2500\u2500 Capabilities 
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Restrict which MCP tools the model can call. If omitted, all\n  // tools discovered from the server are available.\n  // capabilities: [\"tool_a\", \"tool_b\"],\n\n  prompt: {\n    text: `Use the available MCP tools to complete the task.\nReplace this prompt with instructions specific to your MCP server.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"mcp-output-correctness\",\n      criteria: [\n        \"Correctly discovers and selects the appropriate tool\",\n        \"Passes valid arguments to the tool\",\n        \"Interprets the tool response coherently\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
+export declare const exampleMcpToolUsageTs = "/**\n * Example Task: MCP Server tool-use evaluation (DRAFT).\n *\n * Tests whether an LLM can correctly discover and invoke tools from\n * an MCP server. Replace the placeholder serverConfig with your own\n * MCP server's URL and authentication details.\n *\n * Transports:\n *   - \"streamable-http\" / \"sse\" \u2014 remote servers (set url + optional headers)\n *   - \"stdio\" \u2014 local process (set command instead of url)\n *\n * Authentication:\n *   - `headers` \u2014 send arbitrary HTTP headers (e.g., Authorization)\n *   - `auth`    \u2014 structured auth config (bearer, basic, api_key, oauth)\n *   Values support {{env.VAR}} syntax so secrets stay out of source control.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"mcp-server\",\n  id: \"example-mcp-tool-usage\",\n  title: \"MCP tool discovery and invocation\",\n  description: \"Example \u2014 tests MCP server tool-use (draft)\",\n  area: \"mcp\",\n\n  // \u2500\u2500 Server configuration \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Replace the URL and headers below with your MCP server's details.\n  //\n  // For a local stdio server, use:\n  //   transport: \"stdio\",\n  //   command: \"node dist/my-mcp-server.js\",\n  serverConfig: {\n    transport: \"streamable-http\",\n    url: \"https://your-mcp-server.example.com\",\n    headers: {\n      Authorization: \"Bearer {{env.MCP_AUTH_TOKEN}}\",\n    },\n  },\n\n  // \u2500\u2500 Capabilities 
\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Restrict which MCP tools the model can call. If omitted, all\n  // tools discovered from the server are available.\n  // capabilities: [\"tool_a\", \"tool_b\"],\n\n  prompt: {\n    text: `Use the available MCP tools to complete the task.\nReplace this prompt with instructions specific to your MCP server.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"mcp-output-correctness\",\n      criteria: [\n        \"Correctly discovers and selects the appropriate tool\",\n        \"Passes valid arguments to the tool\",\n        \"Interprets the tool response coherently\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-mcp-tool-usage (from parsed TS data) */
 export declare const exampleMcpToolUsageYaml = "- mode: mcp-server\n  id: example-mcp-tool-usage\n  title: MCP tool discovery and invocation\n  description: Example \u2014 tests MCP server tool-use (draft)\n  area: mcp\n  serverConfig:\n    transport: streamable-http\n    url: https://your-mcp-server.example.com\n    headers:\n      Authorization: Bearer {{env.MCP_AUTH_TOKEN}}\n  prompt:\n    text: |-\n      Use the available MCP tools to complete the task.\n      Replace this prompt with instructions specific to your MCP server.\n  assertions:\n    - type: llm-rubric\n      template: mcp-output-correctness\n      criteria:\n        - Correctly discovers and selects the appropriate tool\n        - Passes valid arguments to the tool\n        - Interprets the tool response coherently\n  status: draft\n";
 /** Parsed task data for example-path-based-ref (JSON-safe) */
@@ -327,7 +327,7 @@ export declare const examplePathBasedRefData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-path-based-ref */
-export declare const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n *   - Simple:    \"webhooks\"                \u2192 resolves by slug lookup\n *   - Sectioned: \"content-lake/webhooks\"   \u2192 disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-path-based-ref\",\n  title: \"GROQ mutations (path-based doc references)\",\n  description: \"Example \u2014 demonstrates path-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Path-based canonical doc references.\n  //\n  // Use \"section/slug\" format to uniquely identify articles:\n  //   - \"content-lake/mutations-introduction\" \u2192 the mutations article\n  //   - \"content-lake/documents\" \u2192 the documents article in Content Lake\n  //     (not the CLI \"documents\" article in cli-reference section)\n  //\n  // The \"documents\" slug exists in two sections \u2014 this is exactly why\n  // path-based references are preferred over slug-based references.\n  context: {\n    docs: [\n      {\n        path: \"content-lake/mutations-introduction\",\n        reason: \"Introduction to document mutations in the Content Lake\",\n      },\n      {\n        
path: \"content-lake/documents\",\n        reason:\n          \"Document structure and types (Content Lake, not CLI reference)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains create, createOrReplace, patch, and delete mutations\",\n        \"Describes required document fields (_id, _type)\",\n        \"Shows patch operations for field-level updates\",\n        \"Includes practical code examples\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses correct @sanity/client mutation API\",\n        \"Patch operations use valid set/unset/inc syntax\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export declare const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n *   - Simple:    \"webhooks\"                \u2192 resolves by slug lookup\n *   - Sectioned: \"content-lake/webhooks\"   \u2192 disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-path-based-ref\",\n  title: \"GROQ mutations (path-based doc references)\",\n  description: \"Example \u2014 demonstrates path-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Path-based canonical doc references.\n  //\n  // Use \"section/slug\" format to uniquely identify articles:\n  //   - \"content-lake/mutations-introduction\" \u2192 the mutations article\n  //   - \"content-lake/documents\" \u2192 the documents article in Content Lake\n  //     (not the CLI \"documents\" article in cli-reference section)\n  //\n  // The \"documents\" slug exists in two sections \u2014 this is exactly why\n  // path-based references are preferred over slug-based references.\n  context: {\n    docs: [\n      {\n        path: \"content-lake/mutations-introduction\",\n        reason: \"Introduction to document mutations in the Content Lake\",\n      },\n      {\n        path: 
\"content-lake/documents\",\n        reason:\n          \"Document structure and types (Content Lake, not CLI reference)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains create, createOrReplace, patch, and delete mutations\",\n        \"Describes required document fields (_id, _type)\",\n        \"Shows patch operations for field-level updates\",\n        \"Includes practical code examples\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses correct @sanity/client mutation API\",\n        \"Patch operations use valid set/unset/inc syntax\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-path-based-ref (from parsed TS data) */
 export declare const examplePathBasedRefYaml = "- mode: literacy\n  id: example-path-based-ref\n  title: GROQ mutations (path-based doc references)\n  description: Example \u2014 demonstrates path-based canonical doc references\n  area: groq\n  context:\n    docs:\n      - path: content-lake/mutations-introduction\n        reason: Introduction to document mutations in the Content Lake\n      - path: content-lake/documents\n        reason: Document structure and types (Content Lake, not CLI reference)\n  docCoverage: true\n  prompt:\n    text: |-\n      Explain how to create, update, and delete documents in Sanity's\n      Content Lake using mutations. Cover:\n      1. The different mutation types (create, createOrReplace, patch, delete)\n      2. Document structure and required fields (_id, _type)\n      3. How to use patch operations to update specific fields\n      4. Best practices for mutation patterns\n      Provide working code examples using @sanity/client.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Explains create, createOrReplace, patch, and delete mutations\n        - Describes required document fields (_id, _type)\n        - Shows patch operations for field-level updates\n        - Includes practical code examples\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - Uses correct @sanity/client mutation API\n        - Patch operations use valid set/unset/inc syntax\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-perspective-ref (JSON-safe) */
@@ -366,7 +366,7 @@ export declare const examplePerspectiveRefData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-perspective-ref */
-export declare const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n *   - A perspective ref is one-to-many: the doc fetcher queries the\n *     named release and expands it to ALL articles versioned within it.\n *   - Downstream consumers see the same flat DocContext[] regardless\n *     of how docs were resolved.\n *   - When the release is published, the perspective entry becomes a\n *     no-op (articles are now in published). Migrate to explicit path\n *     or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-perspective-ref\",\n  title:\n    \"GROQ features from content release (perspective-based doc references)\",\n  description:\n    \"Example \u2014 demonstrates perspective-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Perspective-based canonical doc reference.\n  //\n  // The perspective ID references a content release in the Sanity\n  // Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n  // all articles versioned in this release and includes them as\n  // canonical documentation context.\n  //\n  // Release rE9TSJvR4 contains:\n  //   - \"GROQ-powered webhooks\" (webhooks)\n  //   - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n  //   - \"GROQ joins\" (groq-joins)\n  //\n  // You can combine perspective refs with explicit slug/path/id refs\n  // to include foundational published docs alongside release content.\n  // Here we add groq-data-types as a complementary published reference.\n  context: {\n    docs: [\n      {\n        perspective: \"rE9TSJvR4\",\n        reason: \"All GROQ documentation updates in the test content release\",\n      },\n      {\n        slug: \"groq-data-types\",\n        reason: \"GROQ data type reference (published, stable)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates GROQ join syntax for cross-document queries\",\n        \"Shows GROQ filter patterns for webhook configuration\",\n        \"Includes practical query examples from cheat sheet patterns\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"All GROQ queries use valid syntax\",\n        \"Reference joins use correct dereference operator (->)\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export declare const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n *   - A perspective ref is one-to-many: the doc fetcher queries the\n *     named release and expands it to ALL articles versioned within it.\n *   - Downstream consumers see the same flat DocContext[] regardless\n *     of how docs were resolved.\n *   - When the release is published, the perspective entry becomes a\n *     no-op (articles are now in published). Migrate to explicit path\n *     or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-perspective-ref\",\n  title:\n    \"GROQ features from content release (perspective-based doc references)\",\n  description:\n    \"Example \u2014 demonstrates perspective-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Perspective-based canonical doc reference.\n  //\n  // The perspective ID references a content release in the Sanity\n  // Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n  // all articles versioned in this release and includes them as\n  // canonical documentation context.\n  //\n  // Release rE9TSJvR4 contains:\n  //   - \"GROQ-powered webhooks\" (webhooks)\n  //   - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n  //   - \"GROQ joins\" (groq-joins)\n  //\n  // You can combine perspective refs with explicit slug/path/id refs\n  // to include foundational published docs alongside release content.\n  // Here we add groq-data-types as a complementary published reference.\n  context: {\n    docs: [\n      {\n        perspective: \"rE9TSJvR4\",\n        reason: \"All GROQ documentation updates in the test content release\",\n      },\n      {\n        slug: \"groq-data-types\",\n        reason: \"GROQ data type reference (published, stable)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates GROQ join syntax for cross-document queries\",\n        \"Shows GROQ filter patterns for webhook configuration\",\n        \"Includes practical query examples from cheat sheet patterns\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"All GROQ queries use valid syntax\",\n        \"Reference joins use correct dereference operator (->)\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-perspective-ref (from parsed TS data) */
 export declare const examplePerspectiveRefYaml = "- mode: literacy\n  id: example-perspective-ref\n  title: GROQ features from content release (perspective-based doc references)\n  description: Example \u2014 demonstrates perspective-based canonical doc references\n  area: groq\n  context:\n    docs:\n      - perspective: rE9TSJvR4\n        reason: All GROQ documentation updates in the test content release\n      - slug: groq-data-types\n        reason: GROQ data type reference (published, stable)\n  docCoverage: true\n  prompt:\n    text: |-\n      Using GROQ, demonstrate advanced query patterns including:\n      1. Joining data across document types using references\n      2. Filtering webhook payloads with GROQ projections\n      3. Using the query cheat sheet patterns for common operations\n      4. Working with different GROQ data types in filters\n      Provide working GROQ query examples for each pattern.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Demonstrates GROQ join syntax for cross-document queries\n        - Shows GROQ filter patterns for webhook configuration\n        - Includes practical query examples from cheat sheet patterns\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - All GROQ queries use valid syntax\n        - Reference joins use correct dereference operator (->)\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-studio-custom-input (JSON-safe) */
@@ -406,7 +406,7 @@ export declare const exampleStudioCustomInputData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-studio-custom-input */
-export declare const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-studio-custom-input\",\n  title: \"Custom input component in Sanity Studio\",\n  description: \"Example \u2014 tests Studio custom input implementation\",\n\n  area: \"studio\",\n\n  context: {\n    docs: [\n      {\n        slug: \"custom-input-widgets\",\n        reason: \"Guide for building custom form inputs in Sanity Studio\",\n      },\n      {\n        slug: \"form-components\",\n        reason: \"Form component API and customization patterns\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n  referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n  prompt: {\n    text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. 
The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Implements a React component that renders a text input\",\n        \"Displays a live character count\",\n        \"Reads maxLength from schema options\",\n        \"Shows a visual warning when limit is exceeded\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses the Sanity UI library for styling\",\n        \"Calls onChange with patch operations\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export declare const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template \u2014 edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-studio-custom-input\",\n  title: \"Custom input component in Sanity Studio\",\n  description: \"Example \u2014 tests Studio custom input implementation\",\n\n  area: \"studio\",\n\n  context: {\n    docs: [\n      {\n        slug: \"custom-input-widgets\",\n        reason: \"Guide for building custom form inputs in Sanity Studio\",\n      },\n      {\n        slug: \"form-components\",\n        reason: \"Form component API and customization patterns\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n  referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n  prompt: {\n    text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. 
The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Implements a React component that renders a text input\",\n        \"Displays a live character count\",\n        \"Reads maxLength from schema options\",\n        \"Shows a visual warning when limit is exceeded\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses the Sanity UI library for styling\",\n        \"Calls onChange with patch operations\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-studio-custom-input (from parsed TS data) */
 export declare const exampleStudioCustomInputYaml = "- mode: literacy\n  id: example-studio-custom-input\n  title: Custom input component in Sanity Studio\n  description: Example \u2014 tests Studio custom input implementation\n  area: studio\n  context:\n    docs:\n      - slug: custom-input-widgets\n        reason: Guide for building custom form inputs in Sanity Studio\n      - slug: form-components\n        reason: Form component API and customization patterns\n  docCoverage: true\n  referenceSolution: canonical/example-studio-custom-input.ts\n  prompt:\n    text: |-\n      Build a custom string input component for Sanity Studio that shows\n      a character count below the input field. The component should accept\n      a maxLength option from the field schema and display a warning when\n      the text exceeds the limit.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Implements a React component that renders a text input\n        - Displays a live character count\n        - Reads maxLength from schema options\n        - Shows a visual warning when limit is exceeded\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - Uses the Sanity UI library for styling\n        - Calls onChange with patch operations\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** All task example data as a flat array (JSON-safe) */

package/dist/_vendor/ailf-core/examples/index.js CHANGED Viewed

@@ -201,7 +201,7 @@ export const exampleAgentAddSchemaData = [
     }
 ];
 /** TypeScript task template for example-agent-add-schema */
-export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  // ── Mode ──────────────────────────────────────────────────────\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. The agent can read, write,\n  // edit files and run shell commands — then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example — tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. 
All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // ── Sandbox ───────────────────────────────────────────────────\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // ── Tools ─────────────────────────────────────────────────────\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      → Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n  //   \"read-only\"   → Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // ── Fixtures ──────────────────────────────────────────────────\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/studio-basic\" copies the\n  // entire studio-basic app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/studio-basic\"],\n\n  // ── Prompt ────────────────────────────────────────────────────\n  // The task instructions sent to the agent. 
Be specific about expected\n  // file paths and patterns — the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // ── Assertions ────────────────────────────────────────────────\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      — check a file was created\n  //   file-contains    — check a file contains a substring\n  //   command-succeeds — run a shell command (exit 0 = pass)\n  //   diff-matches     — check git diff contains a pattern\n  //   llm-rubric       — LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: 
\"draft\",\n})\n";
+export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // ── Mode ──────────────────────────────────────────────────────\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. The agent can read, write,\n  // edit files and run shell commands — then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example — tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. 
All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // ── Sandbox ───────────────────────────────────────────────────\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // ── Tools ─────────────────────────────────────────────────────\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      → Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n  //   \"read-only\"   → Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // ── Fixtures ──────────────────────────────────────────────────\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/studio-basic\" copies the\n  // entire studio-basic app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/studio-basic\"],\n\n  // ── Prompt ────────────────────────────────────────────────────\n  // The task instructions sent to the agent. 
Be specific about expected\n  // file paths and patterns — the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // ── Assertions ────────────────────────────────────────────────\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      — check a file was created\n  //   file-contains    — check a file contains a substring\n  //   command-succeeds — run a shell command (exit 0 = pass)\n  //   diff-matches     — check git diff contains a pattern\n  //   llm-rubric       — LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: 
\"draft\",\n})\n";
 /** Generated YAML for example-agent-add-schema (from parsed TS data) */
 export const exampleAgentAddSchemaYaml = "- mode: agent-harness\n  id: example-agent-add-schema\n  title: Add a document schema to a Sanity Studio project\n  description: Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n  area: studio\n  sandbox:\n    type: tempdir\n  tools:\n    - coding\n  fixtures:\n    - file://apps/studio-basic\n  prompt:\n    text: |-\n      You have a Sanity Studio project with an existing article schema.\n      Add a new \"post\" document type with the following fields:\n      1. title (string, required)\n      2. slug (slug, sourced from title, required)\n      3. author (string)\n      4. publishedAt (datetime)\n      5. body (array of block content)\n\n      Create the schema file at schemas/post.ts using defineType() and defineField().\n      Register it in schemas/index.ts alongside the existing article schema.\n  assertions:\n    - type: file-exists\n      value: schemas/post.ts\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineType\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineField\n    - type: file-contains\n      value:\n        path: schemas/index.ts\n        content: post\n    - type: file-contains\n      value:\n        path: sanity.config.ts\n        content: defineConfig\n  status: draft\n";
 /** Parsed task data for example-groq-blog-listing (JSON-safe) */
@@ -256,7 +256,7 @@ export const exampleGroqBlogListingData = [
     }
 ];
 /** TypeScript task template for example-groq-blog-listing */
-export const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template — edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n *   https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/contributing-tasks.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  // ── Mode ────────────────────────────────────────────────────────────\n  // \"literacy\" tests whether AI coding tools can implement features\n  // using your docs as context. Other modes: \"mcp-server\",\n  // \"knowledge-probe\", \"agent-harness\", \"custom\".\n  mode: \"literacy\",\n\n  // ── Identity ────────────────────────────────────────────────────────\n  // Unique identifier — lowercase alphanumeric with hyphens.\n  // Must be unique across all task files in .ailf/tasks/.\n  id: \"example-groq-blog-listing\",\n  title: \"Blog listing with GROQ queries\",\n  description: \"Example — tests GROQ blog listing implementation\",\n\n  // Feature area this task belongs to. Tasks with the same area are\n  // grouped together in score summaries.\n  area: \"groq\",\n\n  // ── Documentation context ───────────────────────────────────────────\n  // Canonical doc references for this task. 
The pipeline fetches these\n  // from Sanity and injects them into the prompt for baseline evaluation.\n  //\n  // This example uses slug-based references — the simplest form.\n  // See the other example tasks for path, id, and perspective references.\n  context: {\n    docs: [\n      {\n        slug: \"groq-introduction\",\n        reason: \"Core GROQ syntax and query language reference\",\n      },\n      {\n        slug: \"how-queries-work\",\n        reason: \"Query execution model and best practices\",\n      },\n    ],\n  },\n\n  // When true, the pipeline auto-generates an additional rubric that\n  // checks whether the LLM's response actually used the provided docs.\n  docCoverage: true,\n\n  // Path to a gold-standard implementation, relative to canonical/.\n  // The grader uses this as a reference when scoring code correctness.\n  referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n  // ── Prompt ──────────────────────────────────────────────────────────\n  // prompt.text — the implementation prompt given to the LLM.\n  // Write this as if you're asking a developer to build the feature.\n  // Be specific about requirements so the grader can evaluate clearly.\n  prompt: {\n    text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.`,\n  },\n\n  // ── Assertions ──────────────────────────────────────────────────────\n  // Grading assertions — how the LLM's response is scored.\n  //\n  // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n  // The \"template\" references a rubric template (e.g. task-completion).\n  //\n  // Available templates:\n  //   task-completion   — did the LLM implement the feature? (weight: 0.50)\n  //   code-correctness  — is the code idiomatic and correct? 
(weight: 0.25)\n  //\n  // You can also use value-based assertions:\n  //   { type: \"contains\", value: \"client.fetch\" }\n  //   { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Uses the groq tagged template literal\",\n        \"Fetches blog posts with title, slug, and publishedAt fields\",\n        \"Orders results by publishedAt in descending order\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses createClient from @sanity/client or next-sanity\",\n        \"Exports a valid Next.js page component\",\n      ],\n    },\n  ],\n\n  // ── Baseline variant ────────────────────────────────────────────────\n  //   enabled — set to false to skip this task entirely\n  //   rubric  — \"full\" (default), \"abbreviated\" (faster), or \"none\"\n  baseline: {\n    enabled: true,\n    rubric: \"full\",\n  },\n\n  // Example tasks ship as drafts so they don't run in production evals.\n  // Change to \"active\" (or remove this field) to activate.\n  status: \"draft\",\n})\n";
+export const exampleGroqBlogListingTs = "/**\n * Example Task: Blog listing with GROQ queries.\n *\n * This is a starter template — edit it for your own documentation.\n * Each task evaluates whether an AI coding agent can implement a feature\n * using your docs as context. Delete this file or replace it entirely.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * Full field reference:\n *   https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/contributing-tasks.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // ── Mode ────────────────────────────────────────────────────────────\n  // \"literacy\" tests whether AI coding tools can implement features\n  // using your docs as context. Other modes: \"mcp-server\",\n  // \"knowledge-probe\", \"agent-harness\", \"custom\".\n  mode: \"literacy\",\n\n  // ── Identity ────────────────────────────────────────────────────────\n  // Unique identifier — lowercase alphanumeric with hyphens.\n  // Must be unique across all task files in .ailf/tasks/.\n  id: \"example-groq-blog-listing\",\n  title: \"Blog listing with GROQ queries\",\n  description: \"Example — tests GROQ blog listing implementation\",\n\n  // Feature area this task belongs to. Tasks with the same area are\n  // grouped together in score summaries.\n  area: \"groq\",\n\n  // ── Documentation context ───────────────────────────────────────────\n  // Canonical doc references for this task. 
The pipeline fetches these\n  // from Sanity and injects them into the prompt for baseline evaluation.\n  //\n  // This example uses slug-based references — the simplest form.\n  // See the other example tasks for path, id, and perspective references.\n  context: {\n    docs: [\n      {\n        slug: \"groq-introduction\",\n        reason: \"Core GROQ syntax and query language reference\",\n      },\n      {\n        slug: \"how-queries-work\",\n        reason: \"Query execution model and best practices\",\n      },\n    ],\n  },\n\n  // When true, the pipeline auto-generates an additional rubric that\n  // checks whether the LLM's response actually used the provided docs.\n  docCoverage: true,\n\n  // Path to a gold-standard implementation, relative to canonical/.\n  // The grader uses this as a reference when scoring code correctness.\n  referenceSolution: \"canonical/example-groq-blog-listing.ts\",\n\n  // ── Prompt ──────────────────────────────────────────────────────────\n  // prompt.text — the implementation prompt given to the LLM.\n  // Write this as if you're asking a developer to build the feature.\n  // Be specific about requirements so the grader can evaluate clearly.\n  prompt: {\n    text: `Create a Next.js page component that lists blog posts from Sanity\nusing GROQ. The page should display the title, slug, and published\ndate for each post, sorted by most recent first. Use the Sanity\nclient to fetch data.`,\n  },\n\n  // ── Assertions ──────────────────────────────────────────────────────\n  // Grading assertions — how the LLM's response is scored.\n  //\n  // \"llm-rubric\" assertions use a grader LLM to score against criteria.\n  // The \"template\" references a rubric template (e.g. task-completion).\n  //\n  // Available templates:\n  //   task-completion   — did the LLM implement the feature? (weight: 0.50)\n  //   code-correctness  — is the code idiomatic and correct? 
(weight: 0.25)\n  //\n  // You can also use value-based assertions:\n  //   { type: \"contains\", value: \"client.fetch\" }\n  //   { type: \"contains-any\", value: [\"createClient\", \"sanityClient\"] }\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Uses the groq tagged template literal\",\n        \"Fetches blog posts with title, slug, and publishedAt fields\",\n        \"Orders results by publishedAt in descending order\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses createClient from @sanity/client or next-sanity\",\n        \"Exports a valid Next.js page component\",\n      ],\n    },\n  ],\n\n  // ── Baseline variant ────────────────────────────────────────────────\n  //   enabled — set to false to skip this task entirely\n  //   rubric  — \"full\" (default), \"abbreviated\" (faster), or \"none\"\n  baseline: {\n    enabled: true,\n    rubric: \"full\",\n  },\n\n  // Example tasks ship as drafts so they don't run in production evals.\n  // Change to \"active\" (or remove this field) to activate.\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-groq-blog-listing (from parsed TS data) */
 export const exampleGroqBlogListingYaml = "- mode: literacy\n  id: example-groq-blog-listing\n  title: Blog listing with GROQ queries\n  description: Example — tests GROQ blog listing implementation\n  area: groq\n  context:\n    docs:\n      - slug: groq-introduction\n        reason: Core GROQ syntax and query language reference\n      - slug: how-queries-work\n        reason: Query execution model and best practices\n  docCoverage: true\n  referenceSolution: canonical/example-groq-blog-listing.ts\n  prompt:\n    text: |-\n      Create a Next.js page component that lists blog posts from Sanity\n      using GROQ. The page should display the title, slug, and published\n      date for each post, sorted by most recent first. Use the Sanity\n      client to fetch data.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Uses the groq tagged template literal\n        - Fetches blog posts with title, slug, and publishedAt fields\n        - Orders results by publishedAt in descending order\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - Uses createClient from @sanity/client or next-sanity\n        - Exports a valid Next.js page component\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-id-based-ref (JSON-safe) */
@@ -312,7 +312,7 @@ export const exampleIdBasedRefData = [
     }
 ];
 /** TypeScript task template for example-id-based-ref */
-export const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n *   - Draft documents that don't have a stable slug yet\n *   - Programmatic references from imports or migrations\n *   - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations — these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-id-based-ref\",\n  title: \"GROQ feature support (ID-based doc references)\",\n  description: \"Example — demonstrates ID-based canonical doc references\",\n\n  area: \"groq\",\n\n  // ID-based canonical doc references.\n  //\n  // Use the Sanity document _id to reference articles directly.\n  // Optional slug/path annotations help humans reading the file\n  // but are NOT used for resolution — only the `id` field matters.\n  //\n  // These IDs reference real articles in the Sanity docs (next dataset):\n  //   0ba88f1b... = \"GROQ feature support across Sanity\"\n  //   5b9c2863... 
= \"Custom GROQ functions\"\n  context: {\n    docs: [\n      {\n        id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n        slug: \"groq-feature-support-by-context\", // annotation only\n        reason: \"GROQ feature support across different Sanity contexts\",\n      },\n      {\n        id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n        slug: \"custom-groq-functions\", // annotation only\n        reason: \"Custom GROQ functions and pipelines\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n   webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains GROQ availability across different Sanity contexts\",\n        \"Describes custom GROQ function creation and usage\",\n        \"Notes differences in GROQ support between contexts\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"GROQ examples use valid syntax\",\n        \"Custom function examples follow the correct API pattern\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export const exampleIdBasedRefTs = "/**\n * Example Task: Document ID-based canonical doc references.\n *\n * Demonstrates using `id` to reference canonical documentation by\n * Sanity document `_id`. This is useful for:\n *   - Draft documents that don't have a stable slug yet\n *   - Programmatic references from imports or migrations\n *   - Documents where you know the _id but not the slug\n *\n * The `id` ref type can also carry optional `slug` and `path` fields\n * as human-readable annotations — these are NOT used for resolution,\n * only for display in logs and reports.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-id-based-ref\",\n  title: \"GROQ feature support (ID-based doc references)\",\n  description: \"Example — demonstrates ID-based canonical doc references\",\n\n  area: \"groq\",\n\n  // ID-based canonical doc references.\n  //\n  // Use the Sanity document _id to reference articles directly.\n  // Optional slug/path annotations help humans reading the file\n  // but are NOT used for resolution — only the `id` field matters.\n  //\n  // These IDs reference real articles in the Sanity docs (next dataset):\n  //   0ba88f1b... = \"GROQ feature support across Sanity\"\n  //   5b9c2863... 
= \"Custom GROQ functions\"\n  context: {\n    docs: [\n      {\n        id: \"0ba88f1b-d1a7-418a-9267-2e343d01886a\",\n        slug: \"groq-feature-support-by-context\", // annotation only\n        reason: \"GROQ feature support across different Sanity contexts\",\n      },\n      {\n        id: \"5b9c2863-ef01-4565-af8e-ee54e081ee74\",\n        slug: \"custom-groq-functions\", // annotation only\n        reason: \"Custom GROQ functions and pipelines\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how GROQ is used across different Sanity contexts.\nCover the following:\n1. Which GROQ features are available in each context (API queries,\n   webhooks, custom functions, access control)\n2. How to create and use custom GROQ functions\n3. Any differences in GROQ support between contexts\nProvide examples demonstrating context-specific GROQ patterns.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains GROQ availability across different Sanity contexts\",\n        \"Describes custom GROQ function creation and usage\",\n        \"Notes differences in GROQ support between contexts\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"GROQ examples use valid syntax\",\n        \"Custom function examples follow the correct API pattern\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-id-based-ref (from parsed TS data) */
 export const exampleIdBasedRefYaml = "- mode: literacy\n  id: example-id-based-ref\n  title: GROQ feature support (ID-based doc references)\n  description: Example — demonstrates ID-based canonical doc references\n  area: groq\n  context:\n    docs:\n      - id: 0ba88f1b-d1a7-418a-9267-2e343d01886a\n        slug: groq-feature-support-by-context\n        reason: GROQ feature support across different Sanity contexts\n      - id: 5b9c2863-ef01-4565-af8e-ee54e081ee74\n        slug: custom-groq-functions\n        reason: Custom GROQ functions and pipelines\n  docCoverage: true\n  prompt:\n    text: |-\n      Explain how GROQ is used across different Sanity contexts.\n      Cover the following:\n      1. Which GROQ features are available in each context (API queries,\n         webhooks, custom functions, access control)\n      2. How to create and use custom GROQ functions\n      3. Any differences in GROQ support between contexts\n      Provide examples demonstrating context-specific GROQ patterns.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Explains GROQ availability across different Sanity contexts\n        - Describes custom GROQ function creation and usage\n        - Notes differences in GROQ support between contexts\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - GROQ examples use valid syntax\n        - Custom function examples follow the correct API pattern\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-knowledge-probe (JSON-safe) */
@@ -341,7 +341,7 @@ export const exampleKnowledgeProbeData = [
     }
 ];
 /** TypeScript task template for example-knowledge-probe */
-export const exampleKnowledgeProbeTs = "/**\n * Example Task: Knowledge probe baseline (DRAFT).\n *\n * Tests what the model knows about a topic without providing documentation.\n * Used to establish a baseline for comparison with literacy evaluations.\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n *\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"knowledge-probe\",\n  id: \"example-knowledge-probe\",\n  title: \"Model knowledge of GROQ syntax\",\n  description: \"Example — probes baseline model knowledge (draft)\",\n  area: \"groq\",\n\n  prompt: {\n    text: `Explain the GROQ query language used by Sanity. Cover:\n1. Basic query syntax and projections\n2. How to filter and sort results\n3. Common patterns for fetching related documents\nProvide working code examples.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates understanding of GROQ query syntax\",\n        \"Shows filtering and projection patterns\",\n        \"Code examples use valid GROQ syntax\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
+export const exampleKnowledgeProbeTs = "/**\n * Example Task: Knowledge probe baseline (DRAFT).\n *\n * Tests what the model knows about a topic without providing documentation.\n * Used to establish a baseline for comparison with literacy evaluations.\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n *\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"knowledge-probe\",\n  id: \"example-knowledge-probe\",\n  title: \"Model knowledge of GROQ syntax\",\n  description: \"Example — probes baseline model knowledge (draft)\",\n  area: \"groq\",\n\n  prompt: {\n    text: `Explain the GROQ query language used by Sanity. Cover:\n1. Basic query syntax and projections\n2. How to filter and sort results\n3. Common patterns for fetching related documents\nProvide working code examples.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates understanding of GROQ query syntax\",\n        \"Shows filtering and projection patterns\",\n        \"Code examples use valid GROQ syntax\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-knowledge-probe (from parsed TS data) */
 export const exampleKnowledgeProbeYaml = "- mode: knowledge-probe\n  id: example-knowledge-probe\n  title: Model knowledge of GROQ syntax\n  description: Example — probes baseline model knowledge (draft)\n  area: groq\n  prompt:\n    text: |-\n      Explain the GROQ query language used by Sanity. Cover:\n      1. Basic query syntax and projections\n      2. How to filter and sort results\n      3. Common patterns for fetching related documents\n      Provide working code examples.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Demonstrates understanding of GROQ query syntax\n        - Shows filtering and projection patterns\n        - Code examples use valid GROQ syntax\n  status: draft\n";
 /** Parsed task data for example-mcp-tool-usage (JSON-safe) */
@@ -377,7 +377,7 @@ export const exampleMcpToolUsageData = [
     }
 ];
 /** TypeScript task template for example-mcp-tool-usage */
-export const exampleMcpToolUsageTs = "/**\n * Example Task: MCP Server tool-use evaluation (DRAFT).\n *\n * Tests whether an LLM can correctly discover and invoke tools from\n * an MCP server. Replace the placeholder serverConfig with your own\n * MCP server's URL and authentication details.\n *\n * Transports:\n *   - \"streamable-http\" / \"sse\" — remote servers (set url + optional headers)\n *   - \"stdio\" — local process (set command instead of url)\n *\n * Authentication:\n *   - `headers` — send arbitrary HTTP headers (e.g., Authorization)\n *   - `auth`    — structured auth config (bearer, basic, api_key, oauth)\n *   Values support {{env.VAR}} syntax so secrets stay out of source control.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"mcp-server\",\n  id: \"example-mcp-tool-usage\",\n  title: \"MCP tool discovery and invocation\",\n  description: \"Example — tests MCP server tool-use (draft)\",\n  area: \"mcp\",\n\n  // ── Server configuration ────────────────────────────────────\n  // Replace the URL and headers below with your MCP server's details.\n  //\n  // For a local stdio server, use:\n  //   transport: \"stdio\",\n  //   command: \"node dist/my-mcp-server.js\",\n  serverConfig: {\n    transport: \"streamable-http\",\n    url: \"https://your-mcp-server.example.com\",\n    headers: {\n      Authorization: \"Bearer {{env.MCP_AUTH_TOKEN}}\",\n    },\n  },\n\n  // ── Capabilities ────────────────────────────────────────────\n  // Restrict which MCP tools the model can call. 
If omitted, all\n  // tools discovered from the server are available.\n  // capabilities: [\"tool_a\", \"tool_b\"],\n\n  prompt: {\n    text: `Use the available MCP tools to complete the task.\nReplace this prompt with instructions specific to your MCP server.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"mcp-output-correctness\",\n      criteria: [\n        \"Correctly discovers and selects the appropriate tool\",\n        \"Passes valid arguments to the tool\",\n        \"Interprets the tool response coherently\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
+export const exampleMcpToolUsageTs = "/**\n * Example Task: MCP Server tool-use evaluation (DRAFT).\n *\n * Tests whether an LLM can correctly discover and invoke tools from\n * an MCP server. Replace the placeholder serverConfig with your own\n * MCP server's URL and authentication details.\n *\n * Transports:\n *   - \"streamable-http\" / \"sse\" — remote servers (set url + optional headers)\n *   - \"stdio\" — local process (set command instead of url)\n *\n * Authentication:\n *   - `headers` — send arbitrary HTTP headers (e.g., Authorization)\n *   - `auth`    — structured auth config (bearer, basic, api_key, oauth)\n *   Values support {{env.VAR}} syntax so secrets stay out of source control.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"mcp-server\",\n  id: \"example-mcp-tool-usage\",\n  title: \"MCP tool discovery and invocation\",\n  description: \"Example — tests MCP server tool-use (draft)\",\n  area: \"mcp\",\n\n  // ── Server configuration ────────────────────────────────────\n  // Replace the URL and headers below with your MCP server's details.\n  //\n  // For a local stdio server, use:\n  //   transport: \"stdio\",\n  //   command: \"node dist/my-mcp-server.js\",\n  serverConfig: {\n    transport: \"streamable-http\",\n    url: \"https://your-mcp-server.example.com\",\n    headers: {\n      Authorization: \"Bearer {{env.MCP_AUTH_TOKEN}}\",\n    },\n  },\n\n  // ── Capabilities ────────────────────────────────────────────\n  // Restrict which MCP tools the model can call. 
If omitted, all\n  // tools discovered from the server are available.\n  // capabilities: [\"tool_a\", \"tool_b\"],\n\n  prompt: {\n    text: `Use the available MCP tools to complete the task.\nReplace this prompt with instructions specific to your MCP server.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"mcp-output-correctness\",\n      criteria: [\n        \"Correctly discovers and selects the appropriate tool\",\n        \"Passes valid arguments to the tool\",\n        \"Interprets the tool response coherently\",\n      ],\n    },\n  ],\n\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-mcp-tool-usage (from parsed TS data) */
 export const exampleMcpToolUsageYaml = "- mode: mcp-server\n  id: example-mcp-tool-usage\n  title: MCP tool discovery and invocation\n  description: Example — tests MCP server tool-use (draft)\n  area: mcp\n  serverConfig:\n    transport: streamable-http\n    url: https://your-mcp-server.example.com\n    headers:\n      Authorization: Bearer {{env.MCP_AUTH_TOKEN}}\n  prompt:\n    text: |-\n      Use the available MCP tools to complete the task.\n      Replace this prompt with instructions specific to your MCP server.\n  assertions:\n    - type: llm-rubric\n      template: mcp-output-correctness\n      criteria:\n        - Correctly discovers and selects the appropriate tool\n        - Passes valid arguments to the tool\n        - Interprets the tool response coherently\n  status: draft\n";
 /** Parsed task data for example-path-based-ref (JSON-safe) */
@@ -432,7 +432,7 @@ export const examplePathBasedRefData = [
     }
 ];
 /** TypeScript task template for example-path-based-ref */
-export const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n *   - Simple:    \"webhooks\"                → resolves by slug lookup\n *   - Sectioned: \"content-lake/webhooks\"   → disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-path-based-ref\",\n  title: \"GROQ mutations (path-based doc references)\",\n  description: \"Example — demonstrates path-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Path-based canonical doc references.\n  //\n  // Use \"section/slug\" format to uniquely identify articles:\n  //   - \"content-lake/mutations-introduction\" → the mutations article\n  //   - \"content-lake/documents\" → the documents article in Content Lake\n  //     (not the CLI \"documents\" article in cli-reference section)\n  //\n  // The \"documents\" slug exists in two sections — this is exactly why\n  // path-based references are preferred over slug-based references.\n  context: {\n    docs: [\n      {\n        path: \"content-lake/mutations-introduction\",\n        reason: \"Introduction to document mutations in the Content Lake\",\n      },\n      {\n        path: \"content-lake/documents\",\n     
   reason:\n          \"Document structure and types (Content Lake, not CLI reference)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains create, createOrReplace, patch, and delete mutations\",\n        \"Describes required document fields (_id, _type)\",\n        \"Shows patch operations for field-level updates\",\n        \"Includes practical code examples\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses correct @sanity/client mutation API\",\n        \"Patch operations use valid set/unset/inc syntax\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export const examplePathBasedRefTs = "/**\n * Example Task: Path-based canonical doc references.\n *\n * Demonstrates using `path` to reference canonical documentation.\n * Paths are the preferred reference type because they uniquely identify\n * an article across sections (unlike slugs, which can collide).\n *\n * Path format:\n *   - Simple:    \"webhooks\"                → resolves by slug lookup\n *   - Sectioned: \"content-lake/webhooks\"   → disambiguates by section + slug\n *\n * This example demonstrates why paths matter: the slug \"documents\"\n * exists in both the \"content-lake\" and \"cli-reference\" sections.\n * Using \"content-lake/documents\" ensures we get the right one.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-path-based-ref\",\n  title: \"GROQ mutations (path-based doc references)\",\n  description: \"Example — demonstrates path-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Path-based canonical doc references.\n  //\n  // Use \"section/slug\" format to uniquely identify articles:\n  //   - \"content-lake/mutations-introduction\" → the mutations article\n  //   - \"content-lake/documents\" → the documents article in Content Lake\n  //     (not the CLI \"documents\" article in cli-reference section)\n  //\n  // The \"documents\" slug exists in two sections — this is exactly why\n  // path-based references are preferred over slug-based references.\n  context: {\n    docs: [\n      {\n        path: \"content-lake/mutations-introduction\",\n        reason: \"Introduction to document mutations in the Content Lake\",\n      },\n      {\n        path: \"content-lake/documents\",\n        
reason:\n          \"Document structure and types (Content Lake, not CLI reference)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Explain how to create, update, and delete documents in Sanity's\nContent Lake using mutations. Cover:\n1. The different mutation types (create, createOrReplace, patch, delete)\n2. Document structure and required fields (_id, _type)\n3. How to use patch operations to update specific fields\n4. Best practices for mutation patterns\nProvide working code examples using @sanity/client.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Explains create, createOrReplace, patch, and delete mutations\",\n        \"Describes required document fields (_id, _type)\",\n        \"Shows patch operations for field-level updates\",\n        \"Includes practical code examples\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses correct @sanity/client mutation API\",\n        \"Patch operations use valid set/unset/inc syntax\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-path-based-ref (from parsed TS data) */
 export const examplePathBasedRefYaml = "- mode: literacy\n  id: example-path-based-ref\n  title: GROQ mutations (path-based doc references)\n  description: Example — demonstrates path-based canonical doc references\n  area: groq\n  context:\n    docs:\n      - path: content-lake/mutations-introduction\n        reason: Introduction to document mutations in the Content Lake\n      - path: content-lake/documents\n        reason: Document structure and types (Content Lake, not CLI reference)\n  docCoverage: true\n  prompt:\n    text: |-\n      Explain how to create, update, and delete documents in Sanity's\n      Content Lake using mutations. Cover:\n      1. The different mutation types (create, createOrReplace, patch, delete)\n      2. Document structure and required fields (_id, _type)\n      3. How to use patch operations to update specific fields\n      4. Best practices for mutation patterns\n      Provide working code examples using @sanity/client.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Explains create, createOrReplace, patch, and delete mutations\n        - Describes required document fields (_id, _type)\n        - Shows patch operations for field-level updates\n        - Includes practical code examples\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - Uses correct @sanity/client mutation API\n        - Patch operations use valid set/unset/inc syntax\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-perspective-ref (JSON-safe) */
@@ -486,7 +486,7 @@ export const examplePerspectiveRefData = [
     }
 ];
 /** TypeScript task template for example-perspective-ref */
-export const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n *   - A perspective ref is one-to-many: the doc fetcher queries the\n *     named release and expands it to ALL articles versioned within it.\n *   - Downstream consumers see the same flat DocContext[] regardless\n *     of how docs were resolved.\n *   - When the release is published, the perspective entry becomes a\n *     no-op (articles are now in published). Migrate to explicit path\n *     or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-perspective-ref\",\n  title:\n    \"GROQ features from content release (perspective-based doc references)\",\n  description:\n    \"Example — demonstrates perspective-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Perspective-based canonical doc reference.\n  //\n  // The perspective ID references a content release in the Sanity\n  // Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n  // all articles versioned in this release and includes them as\n  // canonical documentation context.\n  //\n  // Release rE9TSJvR4 contains:\n  //   - \"GROQ-powered webhooks\" (webhooks)\n  //   - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n  //   - \"GROQ joins\" (groq-joins)\n  //\n  // You can combine perspective refs with explicit slug/path/id refs\n  // to include foundational published docs alongside release content.\n  // Here we add groq-data-types as a complementary published reference.\n  context: {\n    docs: [\n      {\n        perspective: \"rE9TSJvR4\",\n        reason: \"All GROQ documentation updates in the test content release\",\n      },\n      {\n        slug: \"groq-data-types\",\n        reason: \"GROQ data type reference (published, stable)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates GROQ join syntax for cross-document queries\",\n        \"Shows GROQ filter patterns for webhook configuration\",\n        \"Includes practical query examples from cheat sheet patterns\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"All GROQ queries use valid syntax\",\n        \"Reference joins use correct dereference operator (->)\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export const examplePerspectiveRefTs = "/**\n * Example Task: Perspective / content release doc references.\n *\n * Demonstrates using `perspective` to reference all documentation\n * articles within a content release. This is the key capability for\n * evaluating NEW feature documentation before it's published.\n *\n * How it works:\n *   - A perspective ref is one-to-many: the doc fetcher queries the\n *     named release and expands it to ALL articles versioned within it.\n *   - Downstream consumers see the same flat DocContext[] regardless\n *     of how docs were resolved.\n *   - When the release is published, the perspective entry becomes a\n *     no-op (articles are now in published). Migrate to explicit path\n *     or slug refs at your convenience.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n *\n * @see docs/design-docs/canonical-doc-resolution.md\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-perspective-ref\",\n  title:\n    \"GROQ features from content release (perspective-based doc references)\",\n  description:\n    \"Example — demonstrates perspective-based canonical doc references\",\n\n  area: \"groq\",\n\n  // Perspective-based canonical doc reference.\n  //\n  // The perspective ID references a content release in the Sanity\n  // Content Lake. 
At evaluation time, the doc fetcher auto-discovers\n  // all articles versioned in this release and includes them as\n  // canonical documentation context.\n  //\n  // Release rE9TSJvR4 contains:\n  //   - \"GROQ-powered webhooks\" (webhooks)\n  //   - \"Query Cheat Sheet - GROQ\" (query-cheat-sheet)\n  //   - \"GROQ joins\" (groq-joins)\n  //\n  // You can combine perspective refs with explicit slug/path/id refs\n  // to include foundational published docs alongside release content.\n  // Here we add groq-data-types as a complementary published reference.\n  context: {\n    docs: [\n      {\n        perspective: \"rE9TSJvR4\",\n        reason: \"All GROQ documentation updates in the test content release\",\n      },\n      {\n        slug: \"groq-data-types\",\n        reason: \"GROQ data type reference (published, stable)\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n\n  prompt: {\n    text: `Using GROQ, demonstrate advanced query patterns including:\n1. Joining data across document types using references\n2. Filtering webhook payloads with GROQ projections\n3. Using the query cheat sheet patterns for common operations\n4. Working with different GROQ data types in filters\nProvide working GROQ query examples for each pattern.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Demonstrates GROQ join syntax for cross-document queries\",\n        \"Shows GROQ filter patterns for webhook configuration\",\n        \"Includes practical query examples from cheat sheet patterns\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"All GROQ queries use valid syntax\",\n        \"Reference joins use correct dereference operator (->)\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-perspective-ref (from parsed TS data) */
 export const examplePerspectiveRefYaml = "- mode: literacy\n  id: example-perspective-ref\n  title: GROQ features from content release (perspective-based doc references)\n  description: Example — demonstrates perspective-based canonical doc references\n  area: groq\n  context:\n    docs:\n      - perspective: rE9TSJvR4\n        reason: All GROQ documentation updates in the test content release\n      - slug: groq-data-types\n        reason: GROQ data type reference (published, stable)\n  docCoverage: true\n  prompt:\n    text: |-\n      Using GROQ, demonstrate advanced query patterns including:\n      1. Joining data across document types using references\n      2. Filtering webhook payloads with GROQ projections\n      3. Using the query cheat sheet patterns for common operations\n      4. Working with different GROQ data types in filters\n      Provide working GROQ query examples for each pattern.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Demonstrates GROQ join syntax for cross-document queries\n        - Shows GROQ filter patterns for webhook configuration\n        - Includes practical query examples from cheat sheet patterns\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - All GROQ queries use valid syntax\n        - Reference joins use correct dereference operator (->)\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 /** Parsed task data for example-studio-custom-input (JSON-safe) */
@@ -542,7 +542,7 @@ export const exampleStudioCustomInputData = [
     }
 ];
 /** TypeScript task template for example-studio-custom-input */
-export const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template — edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf-core\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-studio-custom-input\",\n  title: \"Custom input component in Sanity Studio\",\n  description: \"Example — tests Studio custom input implementation\",\n\n  area: \"studio\",\n\n  context: {\n    docs: [\n      {\n        slug: \"custom-input-widgets\",\n        reason: \"Guide for building custom form inputs in Sanity Studio\",\n      },\n      {\n        slug: \"form-components\",\n        reason: \"Form component API and customization patterns\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n  referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n  prompt: {\n    text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Implements a React component that renders a text input\",\n        \"Displays a live character count\",\n        \"Reads maxLength from schema options\",\n        \"Shows a visual warning when limit is exceeded\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses the Sanity UI library for styling\",\n        \"Calls onChange with patch operations\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
+export const exampleStudioCustomInputTs = "/**\n * Example Task: Custom input component in Sanity Studio.\n *\n * This is a starter template — edit it for your own documentation.\n * Delete this file or replace it with your own tasks.\n *\n * This example task ships as a DRAFT so it does not run in production\n * evaluations automatically. To activate it, change status to \"active\"\n * or remove the status field entirely (defaults to active).\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  mode: \"literacy\",\n  id: \"example-studio-custom-input\",\n  title: \"Custom input component in Sanity Studio\",\n  description: \"Example — tests Studio custom input implementation\",\n\n  area: \"studio\",\n\n  context: {\n    docs: [\n      {\n        slug: \"custom-input-widgets\",\n        reason: \"Guide for building custom form inputs in Sanity Studio\",\n      },\n      {\n        slug: \"form-components\",\n        reason: \"Form component API and customization patterns\",\n      },\n    ],\n  },\n\n  docCoverage: true,\n  referenceSolution: \"canonical/example-studio-custom-input.ts\",\n\n  prompt: {\n    text: `Build a custom string input component for Sanity Studio that shows\na character count below the input field. The component should accept\na maxLength option from the field schema and display a warning when\nthe text exceeds the limit.`,\n  },\n\n  assertions: [\n    {\n      type: \"llm-rubric\",\n      template: \"task-completion\",\n      criteria: [\n        \"Implements a React component that renders a text input\",\n        \"Displays a live character count\",\n        \"Reads maxLength from schema options\",\n        \"Shows a visual warning when limit is exceeded\",\n      ],\n    },\n    {\n      type: \"llm-rubric\",\n      template: \"code-correctness\",\n      criteria: [\n        \"Uses the Sanity UI library for styling\",\n        \"Calls onChange with patch operations\",\n      ],\n    },\n  ],\n\n  baseline: { enabled: true, rubric: \"full\" },\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-studio-custom-input (from parsed TS data) */
 export const exampleStudioCustomInputYaml = "- mode: literacy\n  id: example-studio-custom-input\n  title: Custom input component in Sanity Studio\n  description: Example — tests Studio custom input implementation\n  area: studio\n  context:\n    docs:\n      - slug: custom-input-widgets\n        reason: Guide for building custom form inputs in Sanity Studio\n      - slug: form-components\n        reason: Form component API and customization patterns\n  docCoverage: true\n  referenceSolution: canonical/example-studio-custom-input.ts\n  prompt:\n    text: |-\n      Build a custom string input component for Sanity Studio that shows\n      a character count below the input field. The component should accept\n      a maxLength option from the field schema and display a warning when\n      the text exceeds the limit.\n  assertions:\n    - type: llm-rubric\n      template: task-completion\n      criteria:\n        - Implements a React component that renders a text input\n        - Displays a live character count\n        - Reads maxLength from schema options\n        - Shows a visual warning when limit is exceeded\n    - type: llm-rubric\n      template: code-correctness\n      criteria:\n        - Uses the Sanity UI library for styling\n        - Calls onChange with patch operations\n  baseline:\n    enabled: true\n    rubric: full\n  status: draft\n";
 // ---------------------------------------------------------------------------

package/dist/_vendor/ailf-shared/feature-flags.d.ts ADDED Viewed

@@ -0,0 +1,59 @@
+/**
+ * Feature flags — compile-time UI/feature visibility toggles.
+ *
+ * Single source of truth for "temporary" flags that hide in-flight features,
+ * gate partially-built panels, or carry a known rollback. Each entry carries
+ * the metadata needed to answer "why is this off and when can it go?" so
+ * flags don't rot into undiscoverable tombstones.
+ *
+ * This is intentionally NOT a runtime feature-flag system — no user
+ * segmentation, no A/B, no env-var overrides. Just a typed map of booleans
+ * with audit metadata. Flipping a flag is a code change.
+ *
+ * Adding a flag:
+ *   1. Add an entry below with every metadata field populated.
+ *   2. Import `FEATURE_FLAGS` at the call site and read `.enabled`.
+ *   3. When the re-enable condition is met, remove the entry and the gate.
+ *
+ * See docs/guides/feature-flags.md for the full lifecycle.
+ */
+/** Shape of a single feature-flag entry. All fields required. */
+export interface FeatureFlag {
+    /** Whether the gated feature is visible / active. */
+    readonly enabled: boolean;
+    /** Why the flag exists. Answers "what problem did turning this off solve?" */
+    readonly rationale: string;
+    /** The condition under which this flag should be re-enabled or removed. */
+    readonly reEnableWhen: string;
+    /** ID of the work item that owns the flag's resolution, or null if none. */
+    readonly relatedWorkItem: `W${string}` | null;
+    /** ISO 8601 date (YYYY-MM-DD) the flag was introduced. Used for staleness audits. */
+    readonly addedAt: string;
+}
+/**
+ * Registry of all active feature flags across AILF packages.
+ *
+ * Consumers read values directly:
+ *   if (FEATURE_FLAGS.showFailureModes.enabled) { ... }
+ *
+ * Adding a key here extends the `FeatureFlagKey` union automatically; typos
+ * at call sites fail at compile time.
+ */
+export declare const FEATURE_FLAGS: {
+    readonly showFailureModes: {
+        readonly enabled: false;
+        readonly rationale: "Current classification is too broad (majority \"Unclassified\") to be actionable in the diagnostics view.";
+        readonly reEnableWhen: "Failure taxonomy is refined so non-Unclassified buckets carry meaningful signal.";
+        readonly relatedWorkItem: "W0037-detect-model-output-failures";
+        readonly addedAt: "2026-04-22";
+    };
+    readonly showRegressedSinceLastRun: {
+        readonly enabled: false;
+        readonly rationale: "Bare list of regressed area names lacks explanatory context for why each regressed.";
+        readonly reEnableWhen: "Per-area regression attribution can be surfaced alongside the list.";
+        readonly relatedWorkItem: null;
+        readonly addedAt: "2026-04-22";
+    };
+};
+/** Union of all registered flag keys. Typos at call sites fail at compile time. */
+export type FeatureFlagKey = keyof typeof FEATURE_FLAGS;

package/dist/_vendor/ailf-shared/feature-flags.js ADDED Viewed

@@ -0,0 +1,44 @@
+/**
+ * Feature flags — compile-time UI/feature visibility toggles.
+ *
+ * Single source of truth for "temporary" flags that hide in-flight features,
+ * gate partially-built panels, or carry a known rollback. Each entry carries
+ * the metadata needed to answer "why is this off and when can it go?" so
+ * flags don't rot into undiscoverable tombstones.
+ *
+ * This is intentionally NOT a runtime feature-flag system — no user
+ * segmentation, no A/B, no env-var overrides. Just a typed map of booleans
+ * with audit metadata. Flipping a flag is a code change.
+ *
+ * Adding a flag:
+ *   1. Add an entry below with every metadata field populated.
+ *   2. Import `FEATURE_FLAGS` at the call site and read `.enabled`.
+ *   3. When the re-enable condition is met, remove the entry and the gate.
+ *
+ * See docs/guides/feature-flags.md for the full lifecycle.
+ */
+/**
+ * Registry of all active feature flags across AILF packages.
+ *
+ * Consumers read values directly:
+ *   if (FEATURE_FLAGS.showFailureModes.enabled) { ... }
+ *
+ * Adding a key here extends the `FeatureFlagKey` union automatically; typos
+ * at call sites fail at compile time.
+ */
+export const FEATURE_FLAGS = {
+    showFailureModes: {
+        enabled: false,
+        rationale: 'Current classification is too broad (majority "Unclassified") to be actionable in the diagnostics view.',
+        reEnableWhen: "Failure taxonomy is refined so non-Unclassified buckets carry meaningful signal.",
+        relatedWorkItem: "W0037-detect-model-output-failures",
+        addedAt: "2026-04-22",
+    },
+    showRegressedSinceLastRun: {
+        enabled: false,
+        rationale: "Bare list of regressed area names lacks explanatory context for why each regressed.",
+        reEnableWhen: "Per-area regression attribution can be surfaced alongside the list.",
+        relatedWorkItem: null,
+        addedAt: "2026-04-22",
+    },
+};

package/dist/_vendor/ailf-shared/index.d.ts CHANGED Viewed

@@ -10,6 +10,7 @@
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
  */
 export * from "./document-ref.js";
+export * from "./feature-flags.js";
 export * from "./score-grades.js";
 export * from "./noise-threshold.js";
 export * from "./eval-modes.js";

package/dist/_vendor/ailf-shared/index.js CHANGED Viewed

@@ -10,6 +10,7 @@
  * @sanity/ailf-studio. It is the leaf of the dependency graph.
  */
 export * from "./document-ref.js";
+export * from "./feature-flags.js";
 export * from "./score-grades.js";
 export * from "./noise-threshold.js";
 export * from "./eval-modes.js";