npm - @sanity/ailf - Versions diffs - 7.0.0 → 7.1.0 - Mend

@sanity/ailf 7.0.0 → 7.1.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (74)

package/config/rubrics.ts +12 -13
package/dist/_vendor/ailf-core/examples/index.d.ts +3 -3
package/dist/_vendor/ailf-core/examples/index.js +3 -3
package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
package/dist/_vendor/ailf-core/schemas/report.js +2 -0
package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
package/dist/_vendor/ailf-core/schemas/team.js +63 -0
package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
package/dist/_vendor/ailf-core/types/team.js +1 -0
package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
package/dist/_vendor/ailf-shared/event-types.js +23 -0
package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
package/dist/_vendor/ailf-shared/index.d.ts +4 -2
package/dist/_vendor/ailf-shared/index.js +4 -2
package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
package/dist/_vendor/ailf-shared/member-roles.js +16 -0
package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
package/dist/adapters/task-sources/repo-task-source.js +2 -1
package/dist/commands/pipeline-action.d.ts +4 -3
package/dist/commands/pipeline-action.js +7 -5
package/dist/commands/run.js +2 -2
package/dist/config/rubrics.ts +12 -13
package/dist/job-store.d.ts +18 -0
package/dist/job-store.js +34 -0
package/dist/orchestration/build-app-context.js +8 -1
package/dist/orchestration/pipeline-orchestrator.js +46 -1
package/dist/orchestration/steps/compare-step.d.ts +7 -0
package/dist/orchestration/steps/compare-step.js +59 -23
package/dist/orchestration/steps/fetch-docs-step.js +3 -0
package/dist/orchestration/steps/finalize-run-step.js +2 -0
package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
package/dist/orchestration/steps/generate-configs-step.js +47 -13
package/dist/orchestration/steps/grader-consistency-step.js +11 -0
package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
package/dist/orchestration/steps/publish-report-step.js +19 -3
package/dist/pipeline/cache-hit-restore.d.ts +30 -5
package/dist/pipeline/cache-hit-restore.js +36 -6
package/dist/pipeline/calculate-scores.js +57 -21
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
package/dist/pipeline/compiler/provider-assembler.js +16 -3
package/dist/pipeline/failure-modes.d.ts +20 -10
package/dist/pipeline/failure-modes.js +84 -15
package/dist/pipeline/map-request-to-config.js +2 -0
package/dist/pipeline/normalize-mode.d.ts +1 -1
package/dist/pipeline/normalize-mode.js +2 -0
package/dist/pipeline/run-context.d.ts +16 -1
package/dist/pipeline/run-context.js +12 -1
package/dist/pipeline/validate.d.ts +8 -4
package/dist/pipeline/validate.js +8 -18
package/dist/report-store.d.ts +14 -1
package/dist/report-store.js +32 -0
package/dist/sanity/client.js +2 -2
package/package.json +3 -3

package/config/rubrics.ts CHANGED Viewed

@@ -15,10 +15,6 @@ import { defineRubrics } from "@sanity/ailf-core"
 // template entry below. Source of truth lives in packages/eval/src/grader/;
 // the helper picks the right list by dimension family.
 import { failureModesForDimension } from "../src/grader/index.js"
-// Single source of truth for the wire-format version stamped into the
-// grader-prompt footer (VER-01 D-02). Interpolated below so the
-// announced version cannot drift from the schema's expected value.
-import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
 export default defineRubrics({
   templates: {
@@ -242,20 +238,23 @@ export default defineRubrics({
     "agent-harness": { gold: "agent-harness" },
   },
-  // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
-  // Documents the target wire format the grader emits. The strict schema's
-  // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
-  // them to required and bumps graderJudgmentsVersion to 1.0.0.
+  // W0273 — the footer documents the wire-format subset of GraderJudgment
+  // that the grader LLM actually controls. The pipeline parses this against
+  // GraderEmittedJudgmentSchema and then synthesizes the four pipeline-owned
+  // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
+  // hallucinationCheckedAgainst) to build the storage GraderJudgment.
+  //
+  // See docs/audits/2026-05-22-empty-gap-analysis-regression.md for the
+  // rationale (Phase 3 GRAD-05 made these fields required + .strict(),
+  // and asking the LLM for pipeline-owned values caused 100% parse
+  // failures starting 2026-05-11).
   footer: `Return ONLY a JSON object with this exact shape:
 {
-  "judgmentId": "<string>",
   "score": <number 0-100>,
   "reason": "<explanation, ≤500 chars>",
+  "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
   "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
   "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
-  "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
-  "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
-  "hallucinationCheckedAgainst": ["<doc id>"],
-  "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
+  "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" }
 }`,
 })

package/dist/_vendor/ailf-core/examples/index.d.ts CHANGED Viewed

@@ -123,7 +123,7 @@ export declare const exampleAgentAddSchemaData: readonly [{
         readonly type: "tempdir";
     };
     readonly tools: readonly ["coding"];
-    readonly fixtures: readonly ["file://apps/studio-basic"];
+    readonly fixtures: readonly ["file://apps/editor"];
     readonly prompt: {
         readonly text: "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.";
     };
@@ -158,9 +158,9 @@ export declare const exampleAgentAddSchemaData: readonly [{
     readonly status: "draft";
 }];
 /** TypeScript task template for example-agent-add-schema */
-export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. 
The agent can read, write,\n  // edit files and run shell commands \u2014 then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example \u2014 tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      \u2192 Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" \u2192 coding + WebSearch, WebFetch, 
TodoRead, TodoWrite\n  //   \"read-only\"   \u2192 Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/studio-basic\" copies the\n  // entire studio-basic app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/studio-basic\"],\n\n  // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The task instructions sent to the agent. Be specific about expected\n  // file paths and patterns \u2014 the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      \u2014 check a file was created\n  //   file-contains    \u2014 check a file contains a substring\n  //   command-succeeds \u2014 run a shell command (exit 0 = pass)\n  //   diff-matches     \u2014 check git diff contains a pattern\n  //   llm-rubric       \u2014 LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: \"draft\",\n})\n";
+export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. 
The agent can read, write,\n  // edit files and run shell commands \u2014 then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example \u2014 tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      \u2192 Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" \u2192 coding + WebSearch, WebFetch, 
TodoRead, TodoWrite\n  //   \"read-only\"   \u2192 Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/editor\" copies the entire\n  // editor Studio app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/editor\"],\n\n  // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // The task instructions sent to the agent. Be specific about expected\n  // file paths and patterns \u2014 the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      \u2014 check a file was created\n  //   file-contains    \u2014 check a file contains a substring\n  //   command-succeeds \u2014 run a shell command (exit 0 = pass)\n  //   diff-matches     \u2014 check git diff contains a pattern\n  //   llm-rubric       \u2014 LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: \"draft\",\n})\n";
 /** Generated YAML for example-agent-add-schema (from parsed TS data) */
-export declare const exampleAgentAddSchemaYaml = "- mode: agent-harness\n  id: example-agent-add-schema\n  title: Add a document schema to a Sanity Studio project\n  description: Example \u2014 tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n  area: studio\n  sandbox:\n    type: tempdir\n  tools:\n    - coding\n  fixtures:\n    - file://apps/studio-basic\n  prompt:\n    text: |-\n      You have a Sanity Studio project with an existing article schema.\n      Add a new \"post\" document type with the following fields:\n      1. title (string, required)\n      2. slug (slug, sourced from title, required)\n      3. author (string)\n      4. publishedAt (datetime)\n      5. body (array of block content)\n\n      Create the schema file at schemas/post.ts using defineType() and defineField().\n      Register it in schemas/index.ts alongside the existing article schema.\n  assertions:\n    - type: file-exists\n      value: schemas/post.ts\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineType\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineField\n    - type: file-contains\n      value:\n        path: schemas/index.ts\n        content: post\n    - type: file-contains\n      value:\n        path: sanity.config.ts\n        content: defineConfig\n  status: draft\n";
+export declare const exampleAgentAddSchemaYaml = "- mode: agent-harness\n  id: example-agent-add-schema\n  title: Add a document schema to a Sanity Studio project\n  description: Example \u2014 tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n  area: studio\n  sandbox:\n    type: tempdir\n  tools:\n    - coding\n  fixtures:\n    - file://apps/editor\n  prompt:\n    text: |-\n      You have a Sanity Studio project with an existing article schema.\n      Add a new \"post\" document type with the following fields:\n      1. title (string, required)\n      2. slug (slug, sourced from title, required)\n      3. author (string)\n      4. publishedAt (datetime)\n      5. body (array of block content)\n\n      Create the schema file at schemas/post.ts using defineType() and defineField().\n      Register it in schemas/index.ts alongside the existing article schema.\n  assertions:\n    - type: file-exists\n      value: schemas/post.ts\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineType\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineField\n    - type: file-contains\n      value:\n        path: schemas/index.ts\n        content: post\n    - type: file-contains\n      value:\n        path: sanity.config.ts\n        content: defineConfig\n  status: draft\n";
 /** Parsed task data for example-groq-blog-listing (JSON-safe) */
 export declare const exampleGroqBlogListingData: readonly [{
     readonly mode: "literacy";

package/dist/_vendor/ailf-core/examples/index.js CHANGED Viewed

@@ -158,7 +158,7 @@ export const exampleAgentAddSchemaData = [
             "coding"
         ],
         "fixtures": [
-            "file://apps/studio-basic"
+            "file://apps/editor"
         ],
         "prompt": {
             "text": "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema."
@@ -201,9 +201,9 @@ export const exampleAgentAddSchemaData = [
     }
 ];
 /** TypeScript task template for example-agent-add-schema */
-export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // ── Mode ──────────────────────────────────────────────────────\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. The agent can read, write,\n  // edit files and run shell commands — then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example — tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. 
All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // ── Sandbox ───────────────────────────────────────────────────\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // ── Tools ─────────────────────────────────────────────────────\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      → Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n  //   \"read-only\"   → Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // ── Fixtures ──────────────────────────────────────────────────\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/studio-basic\" copies the\n  // entire studio-basic app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/studio-basic\"],\n\n  // ── Prompt ────────────────────────────────────────────────────\n  // The task instructions sent to the agent. 
Be specific about expected\n  // file paths and patterns — the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // ── Assertions ────────────────────────────────────────────────\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      — check a file was created\n  //   file-contains    — check a file contains a substring\n  //   command-succeeds — run a shell command (exit 0 = pass)\n  //   diff-matches     — check git diff contains a pattern\n  //   llm-rubric       — LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: 
\"draft\",\n})\n";
+export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n  // ── Mode ──────────────────────────────────────────────────────\n  // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n  // filesystem tools in an isolated sandbox. The agent can read, write,\n  // edit files and run shell commands — then assertions verify the results.\n  //\n  // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n  //              \"knowledge-probe\" (baseline knowledge).\n  mode: \"agent-harness\",\n\n  id: \"example-agent-add-schema\",\n  title: \"Add a document schema to a Sanity Studio project\",\n  description:\n    \"Example — tests whether an agent can create a schema file and \" +\n    \"register it in an existing Sanity Studio project\",\n\n  // Area groups tasks for scoring. 
All agent-harness tasks in the same\n  // area are aggregated together in the score report.\n  area: \"studio\",\n\n  // ── Sandbox ───────────────────────────────────────────────────\n  // The sandbox isolates the agent's file operations from your real repo.\n  // Currently only \"tempdir\" is implemented (creates a temporary directory).\n  // The pipeline creates the sandbox dir at config-generation time, copies\n  // fixtures into it before the agent starts, and the agent's working_dir\n  // is set to the sandbox path.\n  //\n  // See: docs/modes.md#sandbox-lifecycle\n  sandbox: { type: \"tempdir\" },\n\n  // ── Tools ─────────────────────────────────────────────────────\n  // Tool presets control which Claude Agent SDK tools the agent can use:\n  //   \"coding\"      → Bash, Read, Write, Edit, Glob, Grep\n  //   \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n  //   \"read-only\"   → Read, Glob, Grep, WebSearch\n  //\n  // You can also mix presets with explicit tool names:\n  //   tools: [\"coding\", \"WebFetch\"]\n  tools: [\"coding\"],\n\n  // ── Fixtures ──────────────────────────────────────────────────\n  // Fixtures are files or directories copied into the sandbox before the\n  // agent starts. Paths use the file:// prefix and resolve relative to\n  // the directory where you run the pipeline (typically your repo root).\n  //\n  // The agent sees these files in its working directory as if they were\n  // a real project. For example, \"file://apps/editor\" copies the entire\n  // editor Studio app into the sandbox root.\n  //\n  // If you don't have fixture projects, you can omit this field and the\n  // agent starts with an empty directory.\n  fixtures: [\"file://apps/editor\"],\n\n  // ── Prompt ────────────────────────────────────────────────────\n  // The task instructions sent to the agent. 
Be specific about expected\n  // file paths and patterns — the assertions check for exact paths.\n  prompt: {\n    text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n  },\n\n  // ── Assertions ────────────────────────────────────────────────\n  // Agent-harness assertions verify the sandbox state after the agent runs.\n  // They execute in a full Node.js context (not eval()) so they can use\n  // fs, child_process, etc. All file paths resolve relative to the sandbox.\n  //\n  // Available assertion types:\n  //   file-exists      — check a file was created\n  //   file-contains    — check a file contains a substring\n  //   command-succeeds — run a shell command (exit 0 = pass)\n  //   diff-matches     — check git diff contains a pattern\n  //   llm-rubric       — LLM grades the agent's text output\n  assertions: [\n    // Verify the agent created the new schema file\n    { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n    // Verify it uses the modern Sanity schema API\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineType\" },\n    },\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/post.ts\", content: \"defineField\" },\n    },\n\n    // Verify the schema was registered in the barrel export\n    {\n      type: \"file-contains\",\n      value: { path: \"schemas/index.ts\", content: \"post\" },\n    },\n\n    // Verify the existing config is still intact\n    {\n      type: \"file-contains\",\n      value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n    },\n  ],\n\n  status: 
\"draft\",\n})\n";
 /** Generated YAML for example-agent-add-schema (from parsed TS data) */
-export const exampleAgentAddSchemaYaml = "- mode: agent-harness\n  id: example-agent-add-schema\n  title: Add a document schema to a Sanity Studio project\n  description: Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n  area: studio\n  sandbox:\n    type: tempdir\n  tools:\n    - coding\n  fixtures:\n    - file://apps/studio-basic\n  prompt:\n    text: |-\n      You have a Sanity Studio project with an existing article schema.\n      Add a new \"post\" document type with the following fields:\n      1. title (string, required)\n      2. slug (slug, sourced from title, required)\n      3. author (string)\n      4. publishedAt (datetime)\n      5. body (array of block content)\n\n      Create the schema file at schemas/post.ts using defineType() and defineField().\n      Register it in schemas/index.ts alongside the existing article schema.\n  assertions:\n    - type: file-exists\n      value: schemas/post.ts\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineType\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineField\n    - type: file-contains\n      value:\n        path: schemas/index.ts\n        content: post\n    - type: file-contains\n      value:\n        path: sanity.config.ts\n        content: defineConfig\n  status: draft\n";
+export const exampleAgentAddSchemaYaml = "- mode: agent-harness\n  id: example-agent-add-schema\n  title: Add a document schema to a Sanity Studio project\n  description: Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n  area: studio\n  sandbox:\n    type: tempdir\n  tools:\n    - coding\n  fixtures:\n    - file://apps/editor\n  prompt:\n    text: |-\n      You have a Sanity Studio project with an existing article schema.\n      Add a new \"post\" document type with the following fields:\n      1. title (string, required)\n      2. slug (slug, sourced from title, required)\n      3. author (string)\n      4. publishedAt (datetime)\n      5. body (array of block content)\n\n      Create the schema file at schemas/post.ts using defineType() and defineField().\n      Register it in schemas/index.ts alongside the existing article schema.\n  assertions:\n    - type: file-exists\n      value: schemas/post.ts\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineType\n    - type: file-contains\n      value:\n        path: schemas/post.ts\n        content: defineField\n    - type: file-contains\n      value:\n        path: schemas/index.ts\n        content: post\n    - type: file-contains\n      value:\n        path: sanity.config.ts\n        content: defineConfig\n  status: draft\n";
 /** Parsed task data for example-groq-blog-listing (JSON-safe) */
 export const exampleGroqBlogListingData = [
     {

package/dist/_vendor/ailf-core/ports/context.d.ts CHANGED Viewed

@@ -11,7 +11,7 @@
  * Fields marked optional are transitional — they will become required
  * as downstream consumers are converted to use them.
  */
-import type { RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
+import type { LiteracyVariant, RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
 import type { RunId } from "../types/branded-ids.js";
 import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
 import type { ArtifactWriter } from "./artifact-writer.js";
@@ -42,12 +42,20 @@ export interface ResolvedConfig {
      * `mode: "literacy", variant: "baseline"`. This keeps the pipeline
      * mode-agnostic while preserving literacy's multi-variant behavior.
      *
-     * Values: "baseline" | "agentic" | "observed" | "full" | undefined
      * Undefined means "use the default variant for the mode" (baseline for literacy).
      */
-    variant?: string;
+    variant?: LiteracyVariant;
     /** Debug options */
     debug?: DebugOptions;
+    /**
+     * Filter the evaluated cohort to a subset of the configured model IDs.
+     *
+     * Each entry must match the `id` of a model declared in
+     * `config/models.ts`. Unknown IDs are dropped at the runner with a
+     * structured warning AND surfaced on the job's `error` field so callers
+     * can detect typos — silent strips are not acceptable.
+     */
+    models?: string[];
     /** Feature area filter */
     areas?: string[];
     /** Task ID filter */
@@ -68,6 +76,12 @@ export interface ResolvedConfig {
     compareThreshold?: number;
     /** Comparison baseline path */
     compareBaseline?: string;
+    /**
+     * Comparison baseline expressed as a previously-published
+     * `ailf.report` document id. Takes precedence over `compareBaseline`
+     * when both are set.
+     */
+    compareBaselineReportId?: string;
     /** Whether gap analysis is enabled */
     gapAnalysisEnabled: boolean;
     /** Whether publishing is enabled */
@@ -323,6 +337,26 @@ export interface AppContext {
     /** Task definition source (YAML, Content Lake, repo) */
     readonly taskSource: TaskSource;
 }
+/**
+ * Discriminated result for `ReportStorePort.loadBaselineFromReport`.
+ *
+ * Lets the compare step distinguish a genuine 404 (the pinned report
+ * doesn't exist — skip with a clear reason) from a transport failure
+ * (Sanity 5xx, network blew up — fail the step so the user knows the
+ * pinned baseline didn't actually compare). The `baseline` payload is
+ * typed as `unknown` to keep the port surface decoupled from the eval
+ * package's `ComparableSummary` type — concrete implementations return
+ * a more specific shape, which is sound.
+ */
+export type LoadBaselineResult = {
+    kind: "ok";
+    baseline: unknown;
+} | {
+    kind: "not_found";
+} | {
+    kind: "error";
+    message: string;
+};
 /**
  * Minimal report store interface used by AppContext.
  *
@@ -341,6 +375,14 @@ export interface ReportStorePort {
     write(report: unknown): Promise<unknown>;
     /** Read a report by its ID (used by the post-run diagnosis hook). */
     read(id: string): Promise<null | unknown>;
+    /**
+     * Load a previously-published report's score summary as a baseline
+     * for the `compare` step. Returns a discriminated result so callers
+     * can distinguish a genuine 404 (skip with a clear reason) from a
+     * transport failure (fail the step — the user pinned a baseline and
+     * deserves to know it didn't actually compare).
+     */
+    loadBaselineFromReport(reportId: string): Promise<LoadBaselineResult>;
     /** Patch synthesis telemetry onto a published report (Phase 6 / DIAG-06). */
     patchSynthesis(id: string, telemetry: unknown): Promise<void>;
     /**

package/dist/_vendor/ailf-core/ports/index.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@ export type { ArtifactEntry, ArtifactWriter } from "./artifact-writer.js";
 export { NoOpArtifactWriter } from "./artifact-writer.js";
 export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
 export type { ConfigSource } from "./config-source.js";
-export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
+export type { AppContext, LoadBaselineResult, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
 export type { DocContext, DocFetcher, DocSourceConfig, DocumentManifestEntry, DocumentOverlaySummary, FetchMetadata, FetchResult, ReleaseImpact, SymbolIndexManifestEntry, UrlFetchEntry, UrlFetchSummary, } from "./doc-fetcher.js";
 export type { EvalRunConfig, EvalRunner } from "./eval-runner.js";
 export type { LLMCallContext, LLMClient, LLMCompleteArgs, LLMCompleteStructuredArgs, LLMCompletion, LLMStructuredCompletion, LLMUsage, ModelId, ModelProvider, ParsedModelId, } from "./llm-client.js";

package/dist/_vendor/ailf-core/schemas/branded-string.d.ts CHANGED Viewed

@@ -36,5 +36,13 @@ import type { Brand } from "../types/branded-ids.js";
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
  * NOT replicate the cast at their own call sites — call this helper
  * instead so the rule violation stays centralized.
+ *
+ * Pass `regex` to enforce a stricter shape than non-empty. The
+ * runtime validator becomes `z.string().regex(regex)` instead of
+ * `z.string().min(1)`; the brand-cast at the call boundary is
+ * unchanged. Callers passing `regex` are responsible for ensuring
+ * it rejects the empty string (typically anchor with `^` and
+ * require at least one character via `+` or a non-`*` quantifier);
+ * the `.min(1)` floor is dropped when `regex` is supplied.
  */
-export declare function brandedString<TBrand extends string>(): z.ZodType<Brand<string, TBrand>>;
+export declare function brandedString<TBrand extends string>(regex?: RegExp): z.ZodType<Brand<string, TBrand>>;

package/dist/_vendor/ailf-core/schemas/branded-string.js CHANGED Viewed

@@ -35,11 +35,21 @@ import { z } from "zod";
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
  * NOT replicate the cast at their own call sites — call this helper
  * instead so the rule violation stays centralized.
+ *
+ * Pass `regex` to enforce a stricter shape than non-empty. The
+ * runtime validator becomes `z.string().regex(regex)` instead of
+ * `z.string().min(1)`; the brand-cast at the call boundary is
+ * unchanged. Callers passing `regex` are responsible for ensuring
+ * it rejects the empty string (typically anchor with `^` and
+ * require at least one character via `+` or a non-`*` quantifier);
+ * the `.min(1)` floor is dropped when `regex` is supplied.
  */
-export function brandedString() {
-    // The runtime is a plain non-empty string; the brand is a
-    // compile-time-only nominal tag (see `Brand<>` in branded-ids.ts).
-    // Zod 4's `.brand()` uses a different symbol shape, so a direct
-    // composition does not yield the project's `Brand<…>` type.
-    return z.string().min(1);
+export function brandedString(regex) {
+    // The runtime is a plain string (non-empty or regex-validated);
+    // the brand is a compile-time-only nominal tag (see `Brand<>` in
+    // branded-ids.ts). Zod 4's `.brand()` uses a different symbol
+    // shape, so a direct composition does not yield the project's
+    // `Brand<…>` type.
+    const base = regex === undefined ? z.string().min(1) : z.string().regex(regex);
+    return base;
 }

package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts CHANGED Viewed

@@ -33,6 +33,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
     changedDocs: z.ZodOptional<z.ZodArray<z.ZodString>>;
     compare: z.ZodOptional<z.ZodBoolean>;
     compareBaseline: z.ZodOptional<z.ZodString>;
+    compareBaselineReportId: z.ZodOptional<z.ZodString>;
     compareThreshold: z.ZodOptional<z.ZodNumber>;
     concurrency: z.ZodOptional<z.ZodNumber>;
     dataset: z.ZodOptional<z.ZodString>;
@@ -63,6 +64,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
         observed: "observed";
         full: "full";
     }>>;
+    models: z.ZodOptional<z.ZodArray<z.ZodString>>;
     noAutoScope: z.ZodOptional<z.ZodBoolean>;
     noCache: z.ZodOptional<z.ZodBoolean>;
     noRemoteCache: z.ZodOptional<z.ZodBoolean>;

package/dist/_vendor/ailf-core/schemas/pipeline-request.js CHANGED Viewed

@@ -101,6 +101,7 @@ export const PipelineRequestSchema = z.object({
     changedDocs: z.array(z.string()).optional(),
     compare: z.boolean().optional(),
     compareBaseline: z.string().optional(),
+    compareBaselineReportId: z.string().min(1).optional(),
     compareThreshold: z.number().min(0).optional(),
     concurrency: z.number().int().positive().optional(),
     dataset: z.string().optional(),
@@ -123,6 +124,12 @@ export const PipelineRequestSchema = z.object({
      * Legacy names must pass through normalizeMode() before entering typed pipeline code.
      */
     mode: z.enum(RAW_EVAL_MODES).optional(),
+    /**
+     * Filter the evaluation cohort to a subset of the configured model IDs
+     * (W0281). Unknown IDs are dropped at the runner with a structured
+     * warning + job-error patch.
+     */
+    models: z.array(z.string().min(1)).optional(),
     noAutoScope: z.boolean().optional(),
     noCache: z.boolean().optional(),
     noRemoteCache: z.boolean().optional(),

package/dist/_vendor/ailf-core/schemas/report.d.ts CHANGED Viewed

@@ -113,6 +113,12 @@ export declare const ReportProvenanceSchema: z.ZodObject<{
         documentId: z.ZodOptional<z.ZodString>;
         source: z.ZodString;
     }, z.core.$strict>], "type">;
+    variant: z.ZodOptional<z.ZodEnum<{
+        agentic: "agentic";
+        baseline: "baseline";
+        observed: "observed";
+        full: "full";
+    }>>;
     autoScope: z.ZodOptional<z.ZodObject<{
         enabled: z.ZodBoolean;
         affectedTaskIds: z.ZodArray<z.ZodString>;
@@ -222,6 +228,12 @@ export declare const ReportSchema: z.ZodObject<{
             documentId: z.ZodOptional<z.ZodString>;
             source: z.ZodString;
         }, z.core.$strict>], "type">;
+        variant: z.ZodOptional<z.ZodEnum<{
+            agentic: "agentic";
+            baseline: "baseline";
+            observed: "observed";
+            full: "full";
+        }>>;
         autoScope: z.ZodOptional<z.ZodObject<{
             enabled: z.ZodBoolean;
             affectedTaskIds: z.ZodArray<z.ZodString>;

package/dist/_vendor/ailf-core/schemas/report.js CHANGED Viewed

@@ -24,6 +24,7 @@
  * @see docs/work-items/W0191-report-store-schema-gate.json
  */
 import { z } from "zod";
+import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
 // ---------------------------------------------------------------------------
 // RunContext building blocks (mirrors packages/shared/src/run-context.ts)
 // ---------------------------------------------------------------------------
@@ -195,6 +196,7 @@ export const ReportProvenanceSchema = z
     taskIds: z.array(z.string()).optional(),
     tool: RunToolSchema.optional(),
     trigger: RunTriggerSchema,
+    variant: z.enum(LITERACY_VARIANTS).optional(),
     // ReportProvenance additions
     autoScope: ReportAutoScopeSchema.optional(),
     contextHash: z.string().optional(),

package/dist/_vendor/ailf-core/schemas/team.d.ts ADDED Viewed

@@ -0,0 +1,22 @@
+import { z } from "zod";
+import type { NotificationChannel } from "../types/team.js";
+export declare const TeamSchema: z.ZodObject<{
+    id: z.ZodType<import("../index.js").Brand<string, "TeamId">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamId">, unknown>>;
+    slug: z.ZodType<import("../index.js").Brand<string, "TeamSlug">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamSlug">, unknown>>;
+    displayName: z.ZodString;
+    description: z.ZodOptional<z.ZodString>;
+    status: z.ZodEnum<{
+        active: "active";
+        archived: "archived";
+    }>;
+    members: z.ZodArray<z.ZodObject<{
+        email: z.ZodOptional<z.ZodString>;
+        sanityUserId: z.ZodOptional<z.ZodString>;
+        githubUsername: z.ZodOptional<z.ZodString>;
+        displayName: z.ZodOptional<z.ZodString>;
+        role: z.ZodOptional<z.ZodString>;
+        lastVerifiedAt: z.ZodOptional<z.ZodString>;
+    }, z.core.$strip>>;
+    repos: z.ZodOptional<z.ZodArray<z.ZodString>>;
+    notifications: z.ZodOptional<z.ZodArray<z.ZodType<NotificationChannel, unknown, z.core.$ZodTypeInternals<NotificationChannel, unknown>>>>;
+}, z.core.$strip>;

package/dist/_vendor/ailf-core/schemas/team.js ADDED Viewed

@@ -0,0 +1,63 @@
+import { z } from "zod";
+import { brandedString } from "./branded-string.js";
+const SLUG_REGEX = /^[a-z0-9][a-z0-9-]*$/;
+const TEAM_ID_REGEX = /^ailf\.team\.[a-z0-9][a-z0-9-]*$/;
+const TeamMemberSchema = z
+    .object({
+    email: z.string().email().optional(),
+    sanityUserId: z.string().optional(),
+    githubUsername: z.string().optional(),
+    displayName: z.string().optional(),
+    role: z.string().optional(),
+    lastVerifiedAt: z.string().datetime().optional(),
+})
+    .refine((m) => Boolean(m.email || m.sanityUserId || m.githubUsername), {
+    message: "TeamMember requires at least one of email, sanityUserId, githubUsername",
+});
+const ChannelScopeSchema = z.discriminatedUnion("type", [
+    z.object({ type: z.literal("owned") }),
+    z.object({ type: z.literal("all") }),
+    z.object({ type: z.literal("areas"), areas: z.array(z.string()) }),
+    z.object({ type: z.literal("repos"), repos: z.array(z.string()) }),
+    z.object({ type: z.literal("tags"), tags: z.array(z.string()) }),
+]);
+const SlackChannelSchema = z.object({
+    _key: z.string(),
+    type: z.literal("slack"),
+    channelId: z.string().min(1),
+    channelName: z.string().optional(),
+    purpose: z.string().optional(),
+    events: z.array(z.string()).optional(),
+    scope: ChannelScopeSchema.optional(),
+});
+const EmailChannelSchema = z.object({
+    _key: z.string(),
+    type: z.literal("email"),
+    addresses: z.array(z.string().email()).min(1),
+    purpose: z.string().optional(),
+    events: z.array(z.string()).optional(),
+    scope: ChannelScopeSchema.optional(),
+});
+const WebhookChannelSchema = z.object({
+    _key: z.string(),
+    type: z.literal("webhook"),
+    logicalName: z.string().min(1),
+    purpose: z.string().optional(),
+    events: z.array(z.string()).optional(),
+    scope: ChannelScopeSchema.optional(),
+});
+const NotificationChannelSchema = z.discriminatedUnion("type", [
+    SlackChannelSchema,
+    EmailChannelSchema,
+    WebhookChannelSchema,
+]);
+export const TeamSchema = z.object({
+    id: brandedString(TEAM_ID_REGEX),
+    slug: brandedString(SLUG_REGEX),
+    displayName: z.string().min(1),
+    description: z.string().optional(),
+    status: z.enum(["active", "archived"]),
+    members: z.array(TeamMemberSchema).min(1),
+    repos: z.array(z.string()).optional(),
+    notifications: z.array(NotificationChannelSchema).optional(),
+});

package/dist/_vendor/ailf-core/types/grader-judgment.d.ts CHANGED Viewed

@@ -123,3 +123,54 @@ export interface GraderJudgment {
         graderJudgmentsVersion: string;
     };
 }
+/**
+ * Wire-format subset of {@link GraderJudgment} — the fields a grader LLM
+ * is responsible for emitting in its JSON response. The pipeline parses
+ * untrusted grader output against this shape, then synthesizes the
+ * remaining storage fields (`taskId`, `modelId`, `dimension`, `judgmentId`,
+ * `metadata.{graderModel, graderJudgmentsVersion}`, and
+ * `hallucinationCheckedAgainst`) from server-side context.
+ *
+ * The split exists because four of `GraderJudgment`'s required fields are
+ * pipeline-owned semantics the LLM cannot produce correctly:
+ *
+ *   - `judgmentId` — D0052 branded id with `(taskId, modelId, dimension,
+ *     runId)` uniqueness invariant. Minted by `generateJudgmentId`.
+ *   - `metadata.graderJudgmentsVersion` — static constant co-located with
+ *     the schema (`promptfoo-grader-output.ts:48`).
+ *   - `metadata.graderModel` — the grader's deployment alias (pipeline
+ *     knows from provider config; the LLM doesn't reliably know its own).
+ *   - `hallucinationCheckedAgainst` — the resolvable-set union of
+ *     `task.context.docs` and `run.documentManifest`, composed by
+ *     `populateHallucinationFields` (gap-analysis-step.ts).
+ *
+ * Asking the LLM for any of these produces drift; `.strict()` on
+ * `GraderJudgmentSchema` amplifies that drift into 100% parse failures
+ * (the 2026-05-11 empty-gapReport regression — see W0273 and
+ * `docs/audits/2026-05-22-empty-gap-analysis-regression.md`).
+ *
+ * `taskId`, `modelId`, and `dimension` are also pipeline-supplied (from
+ * `result.description`, `result.providerId`, and the rubric-classifier
+ * output in `calculate-scores.ts:475-479`) — kept out of the wire shape
+ * for the same reason.
+ */
+export interface GraderEmittedJudgment {
+    /** Numeric score in [0, 100] (normalized). */
+    score: number;
+    /** The grader's natural-language reasoning. */
+    reason: string;
+    /** Per-dimension failure mode (must match the legal-mode list in the rubric). */
+    failureMode: string;
+    /** Per-criterion sub-judgments. */
+    subJudgments: CriterionSubJudgment[];
+    /** Doc citations with role + hallucinated flag. */
+    docCitations: DocCitation[];
+    /** Grader self-confidence per D0049. */
+    confidence: Confidence;
+    /**
+     * True when the candidate response was empty/whitespace/refused. The
+     * pipeline also independently detects this from
+     * `result.response.output` — both signals are OR'd.
+     */
+    outputFailure?: boolean;
+}

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -39,8 +39,9 @@ export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLake
 export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
 export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
 export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
-export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
+export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
 export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
+export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
 type DocumentRef = _DocumentRef;
 /** Aggregated retrieval metrics for a feature area */
 export interface AreaRetrievalMetrics {
@@ -259,6 +260,12 @@ export interface FilterOptions {
     tags?: string[];
     /** Specific task IDs to include (e.g., ["groq-blog-queries"]) */
     taskIds?: string[];
+    /**
+     * Doc slugs that changed in the calling context. When set, only tasks
+     * whose `context.docs[*].slug` intersects this list are returned.
+     * Empty array is a no-op (treated as undefined).
+     */
+    changedDocs?: readonly string[];
 }
 /** Full gap analysis report */
 export interface GapAnalysisReport {

package/dist/_vendor/ailf-core/types/pipeline-request.d.ts CHANGED Viewed

@@ -79,6 +79,13 @@ export interface PipelineRequest {
     classification?: RunClassification;
     compare?: boolean;
     compareBaseline?: string;
+    /**
+     * Compare against a baseline extracted from a previously-published
+     * `ailf.report` document. Takes precedence over `compareBaseline`
+     * (local FS path). Dashboard-friendly: a report id is something the
+     * user can pick from a list.
+     */
+    compareBaselineReportId?: string;
     compareThreshold?: number;
     concurrency?: number;
     dataset?: string;
@@ -93,6 +100,16 @@ export interface PipelineRequest {
     jobId?: string;
     labels?: string[];
     mode?: RawEvalMode;
+    /**
+     * Filter the evaluation cohort to a subset of the configured model IDs.
+     *
+     * Each entry must match the `id` of a model declared in
+     * `packages/eval/config/models.ts`. IDs that don't match are dropped
+     * with a structured warning AND surfaced on the job's `error` field so
+     * callers can detect typos — silent strips are not acceptable
+     * (W0281 acceptance criterion 5).
+     */
+    models?: string[];
     noAutoScope?: boolean;
     noCache?: boolean;
     noRemoteCache?: boolean;