@sanity/ailf 7.0.0 → 7.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. package/config/rubrics.ts +12 -13
  2. package/dist/_vendor/ailf-core/examples/index.d.ts +3 -3
  3. package/dist/_vendor/ailf-core/examples/index.js +3 -3
  4. package/dist/_vendor/ailf-core/ports/context.d.ts +45 -3
  5. package/dist/_vendor/ailf-core/ports/index.d.ts +1 -1
  6. package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +9 -1
  7. package/dist/_vendor/ailf-core/schemas/branded-string.js +16 -6
  8. package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +2 -0
  9. package/dist/_vendor/ailf-core/schemas/pipeline-request.js +7 -0
  10. package/dist/_vendor/ailf-core/schemas/report.d.ts +12 -0
  11. package/dist/_vendor/ailf-core/schemas/report.js +2 -0
  12. package/dist/_vendor/ailf-core/schemas/team.d.ts +22 -0
  13. package/dist/_vendor/ailf-core/schemas/team.js +63 -0
  14. package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +51 -0
  15. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  16. package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +17 -0
  17. package/dist/_vendor/ailf-core/types/team.d.ts +65 -0
  18. package/dist/_vendor/ailf-core/types/team.js +1 -0
  19. package/dist/_vendor/ailf-shared/eval-modes.d.ts +2 -0
  20. package/dist/_vendor/ailf-shared/eval-modes.js +5 -0
  21. package/dist/_vendor/ailf-shared/event-types.d.ts +15 -0
  22. package/dist/_vendor/ailf-shared/event-types.js +23 -0
  23. package/dist/_vendor/ailf-shared/generated/help-content.js +2 -2
  24. package/dist/_vendor/ailf-shared/index.d.ts +4 -2
  25. package/dist/_vendor/ailf-shared/index.js +4 -2
  26. package/dist/_vendor/ailf-shared/member-roles.d.ts +16 -0
  27. package/dist/_vendor/ailf-shared/member-roles.js +16 -0
  28. package/dist/_vendor/ailf-shared/owner-teams.d.ts +19 -0
  29. package/dist/_vendor/ailf-shared/owner-teams.js +7 -0
  30. package/dist/_vendor/ailf-shared/run-context.d.ts +8 -1
  31. package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +65 -1
  32. package/dist/adapters/grader-outputs/promptfoo-grader-output.js +35 -0
  33. package/dist/adapters/task-sources/changed-docs-filter.d.ts +12 -0
  34. package/dist/adapters/task-sources/changed-docs-filter.js +30 -0
  35. package/dist/adapters/task-sources/content-lake-task-source.js +2 -1
  36. package/dist/adapters/task-sources/repo-task-source.js +2 -1
  37. package/dist/commands/pipeline-action.d.ts +4 -3
  38. package/dist/commands/pipeline-action.js +7 -5
  39. package/dist/commands/run.js +2 -2
  40. package/dist/config/rubrics.ts +12 -13
  41. package/dist/job-store.d.ts +18 -0
  42. package/dist/job-store.js +34 -0
  43. package/dist/orchestration/build-app-context.js +8 -1
  44. package/dist/orchestration/pipeline-orchestrator.js +46 -1
  45. package/dist/orchestration/steps/compare-step.d.ts +7 -0
  46. package/dist/orchestration/steps/compare-step.js +59 -23
  47. package/dist/orchestration/steps/fetch-docs-step.js +3 -0
  48. package/dist/orchestration/steps/finalize-run-step.js +2 -0
  49. package/dist/orchestration/steps/generate-configs-step.d.ts +32 -1
  50. package/dist/orchestration/steps/generate-configs-step.js +47 -13
  51. package/dist/orchestration/steps/grader-consistency-step.js +11 -0
  52. package/dist/orchestration/steps/publish-report-step.d.ts +12 -1
  53. package/dist/orchestration/steps/publish-report-step.js +19 -3
  54. package/dist/pipeline/cache-hit-restore.d.ts +30 -5
  55. package/dist/pipeline/cache-hit-restore.js +36 -6
  56. package/dist/pipeline/calculate-scores.js +57 -21
  57. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.d.ts +7 -2
  58. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +13 -4
  59. package/dist/pipeline/compiler/mode-handlers/literacy/compiler.js +1 -1
  60. package/dist/pipeline/compiler/provider-assembler.d.ts +15 -1
  61. package/dist/pipeline/compiler/provider-assembler.js +16 -3
  62. package/dist/pipeline/failure-modes.d.ts +20 -10
  63. package/dist/pipeline/failure-modes.js +84 -15
  64. package/dist/pipeline/map-request-to-config.js +2 -0
  65. package/dist/pipeline/normalize-mode.d.ts +1 -1
  66. package/dist/pipeline/normalize-mode.js +2 -0
  67. package/dist/pipeline/run-context.d.ts +16 -1
  68. package/dist/pipeline/run-context.js +12 -1
  69. package/dist/pipeline/validate.d.ts +8 -4
  70. package/dist/pipeline/validate.js +8 -18
  71. package/dist/report-store.d.ts +14 -1
  72. package/dist/report-store.js +32 -0
  73. package/dist/sanity/client.js +2 -2
  74. package/package.json +3 -3
package/config/rubrics.ts CHANGED
@@ -15,10 +15,6 @@ import { defineRubrics } from "@sanity/ailf-core"
15
15
  // template entry below. Source of truth lives in packages/eval/src/grader/;
16
16
  // the helper picks the right list by dimension family.
17
17
  import { failureModesForDimension } from "../src/grader/index.js"
18
- // Single source of truth for the wire-format version stamped into the
19
- // grader-prompt footer (VER-01 D-02). Interpolated below so the
20
- // announced version cannot drift from the schema's expected value.
21
- import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
22
18
 
23
19
  export default defineRubrics({
24
20
  templates: {
@@ -242,20 +238,23 @@ export default defineRubrics({
242
238
  "agent-harness": { gold: "agent-harness" },
243
239
  },
244
240
 
245
- // Phase 3 GRAD-05 (Plan 03-01) structured GraderJudgment JSON sketch.
246
- // Documents the target wire format the grader emits. The strict schema's
247
- // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
248
- // them to required and bumps graderJudgmentsVersion to 1.0.0.
241
+ // W0273 the footer documents the wire-format subset of GraderJudgment
242
+ // that the grader LLM actually controls. The pipeline parses this against
243
+ // GraderEmittedJudgmentSchema and then synthesizes the four pipeline-owned
244
+ // fields (judgmentId, metadata.{graderModel,graderJudgmentsVersion},
245
+ // hallucinationCheckedAgainst) to build the storage GraderJudgment.
246
+ //
247
+ // See docs/audits/2026-05-22-empty-gap-analysis-regression.md for the
248
+ // rationale (Phase 3 GRAD-05 made these fields required + .strict(),
249
+ // and asking the LLM for pipeline-owned values caused 100% parse
250
+ // failures starting 2026-05-11).
249
251
  footer: `Return ONLY a JSON object with this exact shape:
250
252
  {
251
- "judgmentId": "<string>",
252
253
  "score": <number 0-100>,
253
254
  "reason": "<explanation, ≤500 chars>",
255
+ "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
254
256
  "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
255
257
  "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
256
- "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
257
- "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
258
- "hallucinationCheckedAgainst": ["<doc id>"],
259
- "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
258
+ "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" }
260
259
  }`,
261
260
  })
@@ -123,7 +123,7 @@ export declare const exampleAgentAddSchemaData: readonly [{
123
123
  readonly type: "tempdir";
124
124
  };
125
125
  readonly tools: readonly ["coding"];
126
- readonly fixtures: readonly ["file://apps/studio-basic"];
126
+ readonly fixtures: readonly ["file://apps/editor"];
127
127
  readonly prompt: {
128
128
  readonly text: "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.";
129
129
  };
@@ -158,9 +158,9 @@ export declare const exampleAgentAddSchemaData: readonly [{
158
158
  readonly status: "draft";
159
159
  }];
160
160
  /** TypeScript task template for example-agent-add-schema */
161
- export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. 
The agent can read, write,\n // edit files and run shell commands \u2014 then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example \u2014 tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" \u2192 Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" \u2192 coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" \u2192 Read, 
Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/studio-basic\" copies the\n // entire studio-basic app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/studio-basic\"],\n\n // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The task instructions sent to the agent. Be specific about expected\n // file paths and patterns \u2014 the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists \u2014 check a file was created\n // file-contains \u2014 check a file contains a substring\n // command-succeeds \u2014 run a shell command (exit 0 = pass)\n // diff-matches \u2014 check git diff contains a pattern\n // llm-rubric \u2014 LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
161
+ export declare const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness \u2014 add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT \u2014 it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n // \u2500\u2500 Mode \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. 
The agent can read, write,\n // edit files and run shell commands \u2014 then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example \u2014 tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // \u2500\u2500 Sandbox \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // \u2500\u2500 Tools \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" \u2192 Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" \u2192 coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" \u2192 Read, 
Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // \u2500\u2500 Fixtures \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/editor\" copies the entire\n // editor Studio app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/editor\"],\n\n // \u2500\u2500 Prompt \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // The task instructions sent to the agent. Be specific about expected\n // file paths and patterns \u2014 the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. 
body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // \u2500\u2500 Assertions \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists \u2014 check a file was created\n // file-contains \u2014 check a file contains a substring\n // command-succeeds \u2014 run a shell command (exit 0 = pass)\n // diff-matches \u2014 check git diff contains a pattern\n // llm-rubric \u2014 LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
162
162
  /** Generated YAML for example-agent-add-schema (from parsed TS data) */
163
- export declare const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example \u2014 tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/studio-basic\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
163
+ export declare const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example \u2014 tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/editor\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
164
164
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
165
165
  export declare const exampleGroqBlogListingData: readonly [{
166
166
  readonly mode: "literacy";
@@ -158,7 +158,7 @@ export const exampleAgentAddSchemaData = [
158
158
  "coding"
159
159
  ],
160
160
  "fixtures": [
161
- "file://apps/studio-basic"
161
+ "file://apps/editor"
162
162
  ],
163
163
  "prompt": {
164
164
  "text": "You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema."
@@ -201,9 +201,9 @@ export const exampleAgentAddSchemaData = [
201
201
  }
202
202
  ];
203
203
  /** TypeScript task template for example-agent-add-schema */
204
- export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n // ── Mode ──────────────────────────────────────────────────────\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. The agent can read, write,\n // edit files and run shell commands — then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example — tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. 
All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // ── Sandbox ───────────────────────────────────────────────────\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // ── Tools ─────────────────────────────────────────────────────\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" → Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" → Read, Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // ── Fixtures ──────────────────────────────────────────────────\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/studio-basic\" copies the\n // entire studio-basic app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/studio-basic\"],\n\n // ── Prompt ────────────────────────────────────────────────────\n // The task instructions sent to the agent. 
Be specific about expected\n // file paths and patterns — the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // ── Assertions ────────────────────────────────────────────────\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists — check a file was created\n // file-contains — check a file contains a substring\n // command-succeeds — run a shell command (exit 0 = pass)\n // diff-matches — check git diff contains a pattern\n // llm-rubric — LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
204
+ export const exampleAgentAddSchemaTs = "/**\n * Example Task: Agent harness — add a schema to a Sanity Studio project.\n *\n * This is a starter template for agent-harness evaluations. It tests whether\n * an autonomous coding agent can modify a real project in a sandboxed\n * environment using file system tools.\n *\n * Unlike literacy or knowledge-probe tasks that evaluate text responses,\n * agent-harness tasks evaluate *side-effects*: files created, code modified,\n * commands executed. The agent gets a working directory with real files and\n * tools (Bash, Read, Write, Edit, etc.) and is graded on what it produces.\n *\n * This task is a DRAFT — it won't run unless activated or explicitly targeted.\n * To activate: change status to \"active\" or remove the status field.\n *\n * @see https://github.com/sanity-labs/ai-literacy-framework/blob/main/docs/modes.md#agent-harness\n */\n\nimport { defineTask } from \"@sanity/ailf\"\n\nexport default defineTask({\n // ── Mode ──────────────────────────────────────────────────────\n // \"agent-harness\" runs a real coding agent (Claude Agent SDK) with\n // filesystem tools in an isolated sandbox. The agent can read, write,\n // edit files and run shell commands — then assertions verify the results.\n //\n // Other modes: \"literacy\" (text Q&A), \"mcp-server\" (tool use),\n // \"knowledge-probe\" (baseline knowledge).\n mode: \"agent-harness\",\n\n id: \"example-agent-add-schema\",\n title: \"Add a document schema to a Sanity Studio project\",\n description:\n \"Example — tests whether an agent can create a schema file and \" +\n \"register it in an existing Sanity Studio project\",\n\n // Area groups tasks for scoring. 
All agent-harness tasks in the same\n // area are aggregated together in the score report.\n area: \"studio\",\n\n // ── Sandbox ───────────────────────────────────────────────────\n // The sandbox isolates the agent's file operations from your real repo.\n // Currently only \"tempdir\" is implemented (creates a temporary directory).\n // The pipeline creates the sandbox dir at config-generation time, copies\n // fixtures into it before the agent starts, and the agent's working_dir\n // is set to the sandbox path.\n //\n // See: docs/modes.md#sandbox-lifecycle\n sandbox: { type: \"tempdir\" },\n\n // ── Tools ─────────────────────────────────────────────────────\n // Tool presets control which Claude Agent SDK tools the agent can use:\n // \"coding\" → Bash, Read, Write, Edit, Glob, Grep\n // \"full-access\" → coding + WebSearch, WebFetch, TodoRead, TodoWrite\n // \"read-only\" → Read, Glob, Grep, WebSearch\n //\n // You can also mix presets with explicit tool names:\n // tools: [\"coding\", \"WebFetch\"]\n tools: [\"coding\"],\n\n // ── Fixtures ──────────────────────────────────────────────────\n // Fixtures are files or directories copied into the sandbox before the\n // agent starts. Paths use the file:// prefix and resolve relative to\n // the directory where you run the pipeline (typically your repo root).\n //\n // The agent sees these files in its working directory as if they were\n // a real project. For example, \"file://apps/editor\" copies the entire\n // editor Studio app into the sandbox root.\n //\n // If you don't have fixture projects, you can omit this field and the\n // agent starts with an empty directory.\n fixtures: [\"file://apps/editor\"],\n\n // ── Prompt ────────────────────────────────────────────────────\n // The task instructions sent to the agent. 
Be specific about expected\n // file paths and patterns — the assertions check for exact paths.\n prompt: {\n text: `You have a Sanity Studio project with an existing article schema.\nAdd a new \"post\" document type with the following fields:\n1. title (string, required)\n2. slug (slug, sourced from title, required)\n3. author (string)\n4. publishedAt (datetime)\n5. body (array of block content)\n\nCreate the schema file at schemas/post.ts using defineType() and defineField().\nRegister it in schemas/index.ts alongside the existing article schema.`,\n },\n\n // ── Assertions ────────────────────────────────────────────────\n // Agent-harness assertions verify the sandbox state after the agent runs.\n // They execute in a full Node.js context (not eval()) so they can use\n // fs, child_process, etc. All file paths resolve relative to the sandbox.\n //\n // Available assertion types:\n // file-exists — check a file was created\n // file-contains — check a file contains a substring\n // command-succeeds — run a shell command (exit 0 = pass)\n // diff-matches — check git diff contains a pattern\n // llm-rubric — LLM grades the agent's text output\n assertions: [\n // Verify the agent created the new schema file\n { type: \"file-exists\", value: \"schemas/post.ts\" },\n\n // Verify it uses the modern Sanity schema API\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineType\" },\n },\n {\n type: \"file-contains\",\n value: { path: \"schemas/post.ts\", content: \"defineField\" },\n },\n\n // Verify the schema was registered in the barrel export\n {\n type: \"file-contains\",\n value: { path: \"schemas/index.ts\", content: \"post\" },\n },\n\n // Verify the existing config is still intact\n {\n type: \"file-contains\",\n value: { path: \"sanity.config.ts\", content: \"defineConfig\" },\n },\n ],\n\n status: \"draft\",\n})\n";
205
205
  /** Generated YAML for example-agent-add-schema (from parsed TS data) */
206
- export const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/studio-basic\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
206
+ export const exampleAgentAddSchemaYaml = "- mode: agent-harness\n id: example-agent-add-schema\n title: Add a document schema to a Sanity Studio project\n description: Example — tests whether an agent can create a schema file and register it in an existing Sanity Studio project\n area: studio\n sandbox:\n type: tempdir\n tools:\n - coding\n fixtures:\n - file://apps/editor\n prompt:\n text: |-\n You have a Sanity Studio project with an existing article schema.\n Add a new \"post\" document type with the following fields:\n 1. title (string, required)\n 2. slug (slug, sourced from title, required)\n 3. author (string)\n 4. publishedAt (datetime)\n 5. body (array of block content)\n\n Create the schema file at schemas/post.ts using defineType() and defineField().\n Register it in schemas/index.ts alongside the existing article schema.\n assertions:\n - type: file-exists\n value: schemas/post.ts\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineType\n - type: file-contains\n value:\n path: schemas/post.ts\n content: defineField\n - type: file-contains\n value:\n path: schemas/index.ts\n content: post\n - type: file-contains\n value:\n path: sanity.config.ts\n content: defineConfig\n status: draft\n";
207
207
  /** Parsed task data for example-groq-blog-listing (JSON-safe) */
208
208
  export const exampleGroqBlogListingData = [
209
209
  {
@@ -11,7 +11,7 @@
11
11
  * Fields marked optional are transitional — they will become required
12
12
  * as downstream consumers are converted to use them.
13
13
  */
14
- import type { RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
14
+ import type { LiteracyVariant, RunClassification, RunExecutorSurface } from "../../ailf-shared/index.d.ts";
15
15
  import type { RunId } from "../types/branded-ids.js";
16
16
  import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
17
17
  import type { ArtifactWriter } from "./artifact-writer.js";
@@ -42,12 +42,20 @@ export interface ResolvedConfig {
42
42
  * `mode: "literacy", variant: "baseline"`. This keeps the pipeline
43
43
  * mode-agnostic while preserving literacy's multi-variant behavior.
44
44
  *
45
- * Values: "baseline" | "agentic" | "observed" | "full" | undefined
46
45
  * Undefined means "use the default variant for the mode" (baseline for literacy).
47
46
  */
48
- variant?: string;
47
+ variant?: LiteracyVariant;
49
48
  /** Debug options */
50
49
  debug?: DebugOptions;
50
+ /**
51
+ * Filter the evaluated cohort to a subset of the configured model IDs.
52
+ *
53
+ * Each entry must match the `id` of a model declared in
54
+ * `config/models.ts`. Unknown IDs are dropped at the runner with a
55
+ * structured warning AND surfaced on the job's `error` field so callers
56
+ * can detect typos — silent strips are not acceptable.
57
+ */
58
+ models?: string[];
51
59
  /** Feature area filter */
52
60
  areas?: string[];
53
61
  /** Task ID filter */
@@ -68,6 +76,12 @@ export interface ResolvedConfig {
68
76
  compareThreshold?: number;
69
77
  /** Comparison baseline path */
70
78
  compareBaseline?: string;
79
+ /**
80
+ * Comparison baseline expressed as a previously-published
81
+ * `ailf.report` document id. Takes precedence over `compareBaseline`
82
+ * when both are set.
83
+ */
84
+ compareBaselineReportId?: string;
71
85
  /** Whether gap analysis is enabled */
72
86
  gapAnalysisEnabled: boolean;
73
87
  /** Whether publishing is enabled */
@@ -323,6 +337,26 @@ export interface AppContext {
323
337
  /** Task definition source (YAML, Content Lake, repo) */
324
338
  readonly taskSource: TaskSource;
325
339
  }
340
+ /**
341
+ * Discriminated result for `ReportStorePort.loadBaselineFromReport`.
342
+ *
343
+ * Lets the compare step distinguish a genuine 404 (the pinned report
344
+ * doesn't exist — skip with a clear reason) from a transport failure
345
+ * (Sanity 5xx, network blew up — fail the step so the user knows the
346
+ * pinned baseline didn't actually compare). The `baseline` payload is
347
+ * typed as `unknown` to keep the port surface decoupled from the eval
348
+ * package's `ComparableSummary` type — concrete implementations return
349
+ * a more specific shape, which is sound.
350
+ */
351
+ export type LoadBaselineResult = {
352
+ kind: "ok";
353
+ baseline: unknown;
354
+ } | {
355
+ kind: "not_found";
356
+ } | {
357
+ kind: "error";
358
+ message: string;
359
+ };
326
360
  /**
327
361
  * Minimal report store interface used by AppContext.
328
362
  *
@@ -341,6 +375,14 @@ export interface ReportStorePort {
341
375
  write(report: unknown): Promise<unknown>;
342
376
  /** Read a report by its ID (used by the post-run diagnosis hook). */
343
377
  read(id: string): Promise<null | unknown>;
378
+ /**
379
+ * Load a previously-published report's score summary as a baseline
380
+ * for the `compare` step. Returns a discriminated result so callers
381
+ * can distinguish a genuine 404 (skip with a clear reason) from a
382
+ * transport failure (fail the step — the user pinned a baseline and
383
+ * deserves to know it didn't actually compare).
384
+ */
385
+ loadBaselineFromReport(reportId: string): Promise<LoadBaselineResult>;
344
386
  /** Patch synthesis telemetry onto a published report (Phase 6 / DIAG-06). */
345
387
  patchSynthesis(id: string, telemetry: unknown): Promise<void>;
346
388
  /**
@@ -8,7 +8,7 @@ export type { ArtifactEntry, ArtifactWriter } from "./artifact-writer.js";
8
8
  export { NoOpArtifactWriter } from "./artifact-writer.js";
9
9
  export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
10
10
  export type { ConfigSource } from "./config-source.js";
11
- export type { AppContext, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
11
+ export type { AppContext, LoadBaselineResult, ReportSinkPort, ReportStorePort, ResolvedConfig, } from "./context.js";
12
12
  export type { DocContext, DocFetcher, DocSourceConfig, DocumentManifestEntry, DocumentOverlaySummary, FetchMetadata, FetchResult, ReleaseImpact, SymbolIndexManifestEntry, UrlFetchEntry, UrlFetchSummary, } from "./doc-fetcher.js";
13
13
  export type { EvalRunConfig, EvalRunner } from "./eval-runner.js";
14
14
  export type { LLMCallContext, LLMClient, LLMCompleteArgs, LLMCompleteStructuredArgs, LLMCompletion, LLMStructuredCompletion, LLMUsage, ModelId, ModelProvider, ParsedModelId, } from "./llm-client.js";
@@ -36,5 +36,13 @@ import type { Brand } from "../types/branded-ids.js";
36
36
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
37
37
  * NOT replicate the cast at their own call sites — call this helper
38
38
  * instead so the rule violation stays centralized.
39
+ *
40
+ * Pass `regex` to enforce a stricter shape than non-empty. The
41
+ * runtime validator becomes `z.string().regex(regex)` instead of
42
+ * `z.string().min(1)`; the brand-cast at the call boundary is
43
+ * unchanged. Callers passing `regex` are responsible for ensuring
44
+ * it rejects the empty string (typically anchor with `^` and
45
+ * require at least one character via `+` or a non-`*` quantifier);
46
+ * the `.min(1)` floor is dropped when `regex` is supplied.
39
47
  */
40
- export declare function brandedString<TBrand extends string>(): z.ZodType<Brand<string, TBrand>>;
48
+ export declare function brandedString<TBrand extends string>(regex?: RegExp): z.ZodType<Brand<string, TBrand>>;
@@ -35,11 +35,21 @@ import { z } from "zod";
35
35
  * exit from the project's no-`as`-on-`unknown` rule. Adapters MUST
36
36
  * NOT replicate the cast at their own call sites — call this helper
37
37
  * instead so the rule violation stays centralized.
38
+ *
39
+ * Pass `regex` to enforce a stricter shape than non-empty. The
40
+ * runtime validator becomes `z.string().regex(regex)` instead of
41
+ * `z.string().min(1)`; the brand-cast at the call boundary is
42
+ * unchanged. Callers passing `regex` are responsible for ensuring
43
+ * it rejects the empty string (typically anchor with `^` and
44
+ * require at least one character via `+` or a non-`*` quantifier);
45
+ * the `.min(1)` floor is dropped when `regex` is supplied.
38
46
  */
39
- export function brandedString() {
40
- // The runtime is a plain non-empty string; the brand is a
41
- // compile-time-only nominal tag (see `Brand<>` in branded-ids.ts).
42
- // Zod 4's `.brand()` uses a different symbol shape, so a direct
43
- // composition does not yield the project's `Brand<…>` type.
44
- return z.string().min(1);
47
+ export function brandedString(regex) {
48
+ // The runtime is a plain string (non-empty or regex-validated);
49
+ // the brand is a compile-time-only nominal tag (see `Brand<>` in
50
+ // branded-ids.ts). Zod 4's `.brand()` uses a different symbol
51
+ // shape, so a direct composition does not yield the project's
52
+ // `Brand<…>` type.
53
+ const base = regex === undefined ? z.string().min(1) : z.string().regex(regex);
54
+ return base;
45
55
  }
@@ -33,6 +33,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
33
33
  changedDocs: z.ZodOptional<z.ZodArray<z.ZodString>>;
34
34
  compare: z.ZodOptional<z.ZodBoolean>;
35
35
  compareBaseline: z.ZodOptional<z.ZodString>;
36
+ compareBaselineReportId: z.ZodOptional<z.ZodString>;
36
37
  compareThreshold: z.ZodOptional<z.ZodNumber>;
37
38
  concurrency: z.ZodOptional<z.ZodNumber>;
38
39
  dataset: z.ZodOptional<z.ZodString>;
@@ -63,6 +64,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
63
64
  observed: "observed";
64
65
  full: "full";
65
66
  }>>;
67
+ models: z.ZodOptional<z.ZodArray<z.ZodString>>;
66
68
  noAutoScope: z.ZodOptional<z.ZodBoolean>;
67
69
  noCache: z.ZodOptional<z.ZodBoolean>;
68
70
  noRemoteCache: z.ZodOptional<z.ZodBoolean>;
@@ -101,6 +101,7 @@ export const PipelineRequestSchema = z.object({
101
101
  changedDocs: z.array(z.string()).optional(),
102
102
  compare: z.boolean().optional(),
103
103
  compareBaseline: z.string().optional(),
104
+ compareBaselineReportId: z.string().min(1).optional(),
104
105
  compareThreshold: z.number().min(0).optional(),
105
106
  concurrency: z.number().int().positive().optional(),
106
107
  dataset: z.string().optional(),
@@ -123,6 +124,12 @@ export const PipelineRequestSchema = z.object({
123
124
  * Legacy names must pass through normalizeMode() before entering typed pipeline code.
124
125
  */
125
126
  mode: z.enum(RAW_EVAL_MODES).optional(),
127
+ /**
128
+ * Filter the evaluation cohort to a subset of the configured model IDs
129
+ * (W0281). Unknown IDs are dropped at the runner with a structured
130
+ * warning + job-error patch.
131
+ */
132
+ models: z.array(z.string().min(1)).optional(),
126
133
  noAutoScope: z.boolean().optional(),
127
134
  noCache: z.boolean().optional(),
128
135
  noRemoteCache: z.boolean().optional(),
@@ -113,6 +113,12 @@ export declare const ReportProvenanceSchema: z.ZodObject<{
113
113
  documentId: z.ZodOptional<z.ZodString>;
114
114
  source: z.ZodString;
115
115
  }, z.core.$strict>], "type">;
116
+ variant: z.ZodOptional<z.ZodEnum<{
117
+ agentic: "agentic";
118
+ baseline: "baseline";
119
+ observed: "observed";
120
+ full: "full";
121
+ }>>;
116
122
  autoScope: z.ZodOptional<z.ZodObject<{
117
123
  enabled: z.ZodBoolean;
118
124
  affectedTaskIds: z.ZodArray<z.ZodString>;
@@ -222,6 +228,12 @@ export declare const ReportSchema: z.ZodObject<{
222
228
  documentId: z.ZodOptional<z.ZodString>;
223
229
  source: z.ZodString;
224
230
  }, z.core.$strict>], "type">;
231
+ variant: z.ZodOptional<z.ZodEnum<{
232
+ agentic: "agentic";
233
+ baseline: "baseline";
234
+ observed: "observed";
235
+ full: "full";
236
+ }>>;
225
237
  autoScope: z.ZodOptional<z.ZodObject<{
226
238
  enabled: z.ZodBoolean;
227
239
  affectedTaskIds: z.ZodArray<z.ZodString>;
@@ -24,6 +24,7 @@
24
24
  * @see docs/work-items/W0191-report-store-schema-gate.json
25
25
  */
26
26
  import { z } from "zod";
27
+ import { LITERACY_VARIANTS } from "../../ailf-shared/index.js";
27
28
  // ---------------------------------------------------------------------------
28
29
  // RunContext building blocks (mirrors packages/shared/src/run-context.ts)
29
30
  // ---------------------------------------------------------------------------
@@ -195,6 +196,7 @@ export const ReportProvenanceSchema = z
195
196
  taskIds: z.array(z.string()).optional(),
196
197
  tool: RunToolSchema.optional(),
197
198
  trigger: RunTriggerSchema,
199
+ variant: z.enum(LITERACY_VARIANTS).optional(),
198
200
  // ReportProvenance additions
199
201
  autoScope: ReportAutoScopeSchema.optional(),
200
202
  contextHash: z.string().optional(),
@@ -0,0 +1,22 @@
1
+ import { z } from "zod";
2
+ import type { NotificationChannel } from "../types/team.js";
3
+ export declare const TeamSchema: z.ZodObject<{
4
+ id: z.ZodType<import("../index.js").Brand<string, "TeamId">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamId">, unknown>>;
5
+ slug: z.ZodType<import("../index.js").Brand<string, "TeamSlug">, unknown, z.core.$ZodTypeInternals<import("../index.js").Brand<string, "TeamSlug">, unknown>>;
6
+ displayName: z.ZodString;
7
+ description: z.ZodOptional<z.ZodString>;
8
+ status: z.ZodEnum<{
9
+ active: "active";
10
+ archived: "archived";
11
+ }>;
12
+ members: z.ZodArray<z.ZodObject<{
13
+ email: z.ZodOptional<z.ZodString>;
14
+ sanityUserId: z.ZodOptional<z.ZodString>;
15
+ githubUsername: z.ZodOptional<z.ZodString>;
16
+ displayName: z.ZodOptional<z.ZodString>;
17
+ role: z.ZodOptional<z.ZodString>;
18
+ lastVerifiedAt: z.ZodOptional<z.ZodString>;
19
+ }, z.core.$strip>>;
20
+ repos: z.ZodOptional<z.ZodArray<z.ZodString>>;
21
+ notifications: z.ZodOptional<z.ZodArray<z.ZodType<NotificationChannel, unknown, z.core.$ZodTypeInternals<NotificationChannel, unknown>>>>;
22
+ }, z.core.$strip>;
@@ -0,0 +1,63 @@
1
+ import { z } from "zod";
2
+ import { brandedString } from "./branded-string.js";
3
+ const SLUG_REGEX = /^[a-z0-9][a-z0-9-]*$/;
4
+ const TEAM_ID_REGEX = /^ailf\.team\.[a-z0-9][a-z0-9-]*$/;
5
+ const TeamMemberSchema = z
6
+ .object({
7
+ email: z.string().email().optional(),
8
+ sanityUserId: z.string().optional(),
9
+ githubUsername: z.string().optional(),
10
+ displayName: z.string().optional(),
11
+ role: z.string().optional(),
12
+ lastVerifiedAt: z.string().datetime().optional(),
13
+ })
14
+ .refine((m) => Boolean(m.email || m.sanityUserId || m.githubUsername), {
15
+ message: "TeamMember requires at least one of email, sanityUserId, githubUsername",
16
+ });
17
+ const ChannelScopeSchema = z.discriminatedUnion("type", [
18
+ z.object({ type: z.literal("owned") }),
19
+ z.object({ type: z.literal("all") }),
20
+ z.object({ type: z.literal("areas"), areas: z.array(z.string()) }),
21
+ z.object({ type: z.literal("repos"), repos: z.array(z.string()) }),
22
+ z.object({ type: z.literal("tags"), tags: z.array(z.string()) }),
23
+ ]);
24
+ const SlackChannelSchema = z.object({
25
+ _key: z.string(),
26
+ type: z.literal("slack"),
27
+ channelId: z.string().min(1),
28
+ channelName: z.string().optional(),
29
+ purpose: z.string().optional(),
30
+ events: z.array(z.string()).optional(),
31
+ scope: ChannelScopeSchema.optional(),
32
+ });
33
+ const EmailChannelSchema = z.object({
34
+ _key: z.string(),
35
+ type: z.literal("email"),
36
+ addresses: z.array(z.string().email()).min(1),
37
+ purpose: z.string().optional(),
38
+ events: z.array(z.string()).optional(),
39
+ scope: ChannelScopeSchema.optional(),
40
+ });
41
+ const WebhookChannelSchema = z.object({
42
+ _key: z.string(),
43
+ type: z.literal("webhook"),
44
+ logicalName: z.string().min(1),
45
+ purpose: z.string().optional(),
46
+ events: z.array(z.string()).optional(),
47
+ scope: ChannelScopeSchema.optional(),
48
+ });
49
+ const NotificationChannelSchema = z.discriminatedUnion("type", [
50
+ SlackChannelSchema,
51
+ EmailChannelSchema,
52
+ WebhookChannelSchema,
53
+ ]);
54
+ export const TeamSchema = z.object({
55
+ id: brandedString(TEAM_ID_REGEX),
56
+ slug: brandedString(SLUG_REGEX),
57
+ displayName: z.string().min(1),
58
+ description: z.string().optional(),
59
+ status: z.enum(["active", "archived"]),
60
+ members: z.array(TeamMemberSchema).min(1),
61
+ repos: z.array(z.string()).optional(),
62
+ notifications: z.array(NotificationChannelSchema).optional(),
63
+ });
@@ -123,3 +123,54 @@ export interface GraderJudgment {
123
123
  graderJudgmentsVersion: string;
124
124
  };
125
125
  }
126
+ /**
127
+ * Wire-format subset of {@link GraderJudgment} — the fields a grader LLM
128
+ * is responsible for emitting in its JSON response. The pipeline parses
129
+ * untrusted grader output against this shape, then synthesizes the
130
+ * remaining storage fields (`taskId`, `modelId`, `dimension`, `judgmentId`,
131
+ * `metadata.{graderModel, graderJudgmentsVersion}`, and
132
+ * `hallucinationCheckedAgainst`) from server-side context.
133
+ *
134
+ * The split exists because four of `GraderJudgment`'s required fields are
135
+ * pipeline-owned semantics the LLM cannot produce correctly:
136
+ *
137
+ * - `judgmentId` — D0052 branded id with `(taskId, modelId, dimension,
138
+ * runId)` uniqueness invariant. Minted by `generateJudgmentId`.
139
+ * - `metadata.graderJudgmentsVersion` — static constant co-located with
140
+ * the schema (`promptfoo-grader-output.ts:48`).
141
+ * - `metadata.graderModel` — the grader's deployment alias (pipeline
142
+ * knows from provider config; the LLM doesn't reliably know its own).
143
+ * - `hallucinationCheckedAgainst` — the resolvable-set union of
144
+ * `task.context.docs` and `run.documentManifest`, composed by
145
+ * `populateHallucinationFields` (gap-analysis-step.ts).
146
+ *
147
+ * Asking the LLM for any of these produces drift; `.strict()` on
148
+ * `GraderJudgmentSchema` amplifies that drift into 100% parse failures
149
+ * (the 2026-05-11 empty-gapReport regression — see W0273 and
150
+ * `docs/audits/2026-05-22-empty-gap-analysis-regression.md`).
151
+ *
152
+ * `taskId`, `modelId`, and `dimension` are also pipeline-supplied (from
153
+ * `result.description`, `result.providerId`, and the rubric-classifier
154
+ * output in `calculate-scores.ts:475-479`) — kept out of the wire shape
155
+ * for the same reason.
156
+ */
157
+ export interface GraderEmittedJudgment {
158
+ /** Numeric score in [0, 100] (normalized). */
159
+ score: number;
160
+ /** The grader's natural-language reasoning. */
161
+ reason: string;
162
+ /** Per-dimension failure mode (must match the legal-mode list in the rubric). */
163
+ failureMode: string;
164
+ /** Per-criterion sub-judgments. */
165
+ subJudgments: CriterionSubJudgment[];
166
+ /** Doc citations with role + hallucinated flag. */
167
+ docCitations: DocCitation[];
168
+ /** Grader self-confidence per D0049. */
169
+ confidence: Confidence;
170
+ /**
171
+ * True when the candidate response was empty/whitespace/refused. The
172
+ * pipeline also independently detects this from
173
+ * `result.response.output` — both signals are OR'd.
174
+ */
175
+ outputFailure?: boolean;
176
+ }
@@ -39,8 +39,9 @@ export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLake
39
39
  export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
40
40
  export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
41
41
  export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
42
- export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
42
+ export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderEmittedJudgment, GraderJudgment, } from "./grader-judgment.js";
43
43
  export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
44
+ export type { BaseChannel, ChannelScope, EmailChannel, EventType, KnownEventType, KnownMemberRole, MemberRole, NotificationChannel, NotificationChannelType, SlackChannel, Team, TeamId, TeamMember, TeamRef, TeamSlug, TeamStatus, WebhookChannel, } from "./team.js";
44
45
  type DocumentRef = _DocumentRef;
45
46
  /** Aggregated retrieval metrics for a feature area */
46
47
  export interface AreaRetrievalMetrics {
@@ -259,6 +260,12 @@ export interface FilterOptions {
259
260
  tags?: string[];
260
261
  /** Specific task IDs to include (e.g., ["groq-blog-queries"]) */
261
262
  taskIds?: string[];
263
+ /**
264
+ * Doc slugs that changed in the calling context. When set, only tasks
265
+ * whose `context.docs[*].slug` intersects this list are returned.
266
+ * An empty array is a no-op (treated as undefined).
267
+ */
268
+ changedDocs?: readonly string[];
262
269
  }
263
270
  /** Full gap analysis report */
264
271
  export interface GapAnalysisReport {
@@ -79,6 +79,13 @@ export interface PipelineRequest {
79
79
  classification?: RunClassification;
80
80
  compare?: boolean;
81
81
  compareBaseline?: string;
82
+ /**
83
+ * Compare against a baseline extracted from a previously-published
84
+ * `ailf.report` document. Takes precedence over `compareBaseline`
85
+ * (local FS path). Dashboard-friendly: a report id is something the
86
+ * user can pick from a list.
87
+ */
88
+ compareBaselineReportId?: string;
82
89
  compareThreshold?: number;
83
90
  concurrency?: number;
84
91
  dataset?: string;
@@ -93,6 +100,16 @@ export interface PipelineRequest {
93
100
  jobId?: string;
94
101
  labels?: string[];
95
102
  mode?: RawEvalMode;
103
+ /**
104
+ * Filter the evaluation cohort to a subset of the configured model IDs.
105
+ *
106
+ * Each entry must match the `id` of a model declared in
107
+ * `packages/eval/config/models.ts`. IDs that don't match are dropped
108
+ * with a structured warning AND surfaced on the job's `error` field so
109
+ * callers can detect typos — silent strips are not acceptable
110
+ * (W0281 acceptance criterion 5).
111
+ */
112
+ models?: string[];
96
113
  noAutoScope?: boolean;
97
114
  noCache?: boolean;
98
115
  noRemoteCache?: boolean;