npm - @sanity/ailf - Versions diffs - 4.6.0 → 6.0.0 - Mend

@sanity/ailf 4.6.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (185) hide show

package/canonical/grader-references/agent-harness-tools.yaml ADDED Viewed

@@ -0,0 +1,42 @@
+# agent-harness-tools.yaml — Human-graded reference sample for agent-harness tasks
+#
+# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
+# `expectedFailureMode` + `dimension` are read by the calibration test in
+# packages/eval/src/grader/__tests__/calibration.test.ts.
+- taskId: agent-multi-step-schema-edit
+  area: agent-harness
+  dimension: agent-harness
+  expectedFailureMode: tool-misuse
+  response: |
+    To add a `coverImage` field to the `post` schema and migrate existing
+    documents, the agent ran the following tool calls:
+    1. `read_file({ path: "schemas/post.ts" })` — read the schema.
+    2. `write_file({ path: "schemas/post.ts", content: "<new schema>" })` — wrote the updated schema.
+    3. `run_command({ command: "sanity migrate add-cover-image" })` — attempted to run a migration.
+    The `sanity migrate` command failed because no migration script exists at
+    that name. The agent then stopped without creating the migration file.
+  rubrics:
+    - dimension: tool-usage
+      rubricText: |
+        Score the agent's tool usage from 0 to 100:
+        - 0: Completely wrong tool usage — called wrong tools or with invalid inputs
+        - 25: Poor tool usage — correct tools but wrong parameters or sequencing
+        - 50: Adequate — correct tools and basic parameters, some inefficiency
+        - 75: Good — efficient tool usage with proper error handling
+        - 100: Excellent — optimal tool selection, correct inputs, minimal redundancy
+        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
+      humanScore: 25
+      notes: |
+        The agent invoked `run_command` with a migration name that did not
+        exist in the project, instead of first listing available migrations
+        with `list_migrations` (or equivalent) or scaffolding a new migration
+        file with the correct tool. The migration step was a tool-misuse:
+        the right tool exists, but the agent supplied invalid arguments
+        (a non-existent migration name) and then failed to recover by
+        scaffolding a new migration. Failure mode: tool-misuse.
+  gradedBy: "senior-engineer"
+  gradedAt: "2026-05-10"

package/canonical/grader-references/knowledge-probe-recall.yaml ADDED Viewed

@@ -0,0 +1,36 @@
+# knowledge-probe-recall.yaml — Human-graded reference sample for KP tasks
+#
+# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
+# `expectedFailureMode` + `dimension` are read by the calibration test in
+# packages/eval/src/grader/__tests__/calibration.test.ts.
+- taskId: kp-default-dataset
+  area: knowledge-probe
+  dimension: knowledge-probe
+  expectedFailureMode: factual-error
+  response: |
+    The default dataset for new Sanity Studio projects is `production`. When
+    you run `npm create sanity@latest` and accept the defaults, the CLI
+    creates a dataset named `production`.
+  rubrics:
+    - dimension: factual-correctness
+      rubricText: |
+        Score factual correctness from 0 to 100:
+        - 0: Completely wrong — fabricated facts, hallucinated APIs, incorrect syntax
+        - 25: Mostly wrong — some real concepts but significant factual errors
+        - 50: Mixed — some correct facts alongside notable inaccuracies
+        - 75: Mostly correct — minor inaccuracies or imprecisions
+        - 100: Fully correct — all statements are accurate and verifiable
+        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
+      humanScore: 30
+      notes: |
+        Per current Sanity CLI docs the default dataset chosen by
+        `npm create sanity@latest` is selectable, with `production` being
+        the conventional default name surfaced in the prompt — but the
+        candidate phrases this as a deterministic default rather than a
+        prompt-driven choice. The factual claim "the default is production"
+        is wrong as stated; the CLI prompts for a dataset name and offers
+        `production` as a suggested value. Failure mode: factual-error.
+  gradedBy: "senior-engineer"
+  gradedAt: "2026-05-10"

package/canonical/grader-references/mcp-server-spec.yaml ADDED Viewed

@@ -0,0 +1,51 @@
+# mcp-server-spec.yaml — Human-graded reference sample for MCP server tasks
+#
+# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
+# `expectedFailureMode` + `dimension` are read by the calibration test in
+# packages/eval/src/grader/__tests__/calibration.test.ts.
+- taskId: mcp-search-tool-pagination
+  area: mcp-server
+  dimension: mcp-behavior
+  expectedFailureMode: spec-mismatch
+  response: |
+    Here's an MCP `search` tool that returns paginated results:
+    ```typescript
+    import { Server } from "@modelcontextprotocol/sdk/server/index.js"
+    server.setRequestHandler({ method: "tools/call" }, async (request) => {
+      if (request.params.name === "search") {
+        const { query, page = 1, pageSize = 10 } = request.params.arguments
+        const results = await searchIndex(query, page, pageSize)
+        return {
+          content: [
+            { type: "text", text: JSON.stringify(results) },
+          ],
+          nextPageToken: page + 1,
+        }
+      }
+    })
+    ```
+  rubrics:
+    - dimension: output-correctness
+      rubricText: |
+        Score MCP tool output handling from 0 to 100:
+        - 0: Completely failed to use tool outputs
+        - 25: Used outputs but misinterpreted the data
+        - 50: Partially correct interpretation of tool outputs
+        - 75: Mostly correct — minor misuse of returned data
+        - 100: Perfect output handling — correctly interpreted all tool responses
+        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
+      humanScore: 30
+      notes: |
+        The candidate adds a `nextPageToken` field at the top level of the
+        tool-call response. The published MCP `tools/call` response spec only
+        defines `content`, `isError`, and `_meta` at the top level; arbitrary
+        top-level fields like `nextPageToken` are not part of the spec and
+        are ignored by spec-compliant clients. Pagination metadata belongs
+        inside the `content` block (e.g., as a structured JSON payload) or
+        in `_meta`. Failure mode: spec-mismatch.
+  gradedBy: "senior-engineer"
+  gradedAt: "2026-05-10"

package/canonical/grader-references/portable-text.yaml ADDED Viewed

@@ -0,0 +1,48 @@
+# portable-text.yaml — Human-graded reference sample for Portable Text tasks
+#
+# Phase 3 GRAD-03 calibration fixture (Plan 03-02). Top-level
+# `expectedFailureMode` + `dimension` are read by the calibration test in
+# packages/eval/src/grader/__tests__/calibration.test.ts.
+- taskId: portable-text-react-marks
+  area: portable-text
+  dimension: doc-coverage
+  expectedFailureMode: outdated-docs
+  response: |
+    To customize the marks renderer in @portabletext/react, you can pass a `serializers` prop:
+    ```tsx
+    import { PortableText } from "@portabletext/react"
+    const serializers = {
+      marks: {
+        link: ({ children, mark }) => (
+          <a href={mark.href} target="_blank" rel="noopener">{children}</a>
+        ),
+      },
+    }
+    export function Body({ value }: { value: any }) {
+      return <PortableText blocks={value} serializers={serializers} />
+    }
+    ```
+  rubrics:
+    - dimension: doc-coverage
+      rubricText: |
+        Score documentation coverage from 0 to 100:
+        - 0: Had to hallucinate/guess most implementation details
+        - 30: Significant gaps — filled with assumptions
+        - 50: Some gaps — inferred from partial information
+        - 80: Minor gaps — almost everything was documented
+        - 100: Complete coverage — all necessary info was in docs
+        Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}
+      humanScore: 25
+      notes: |
+        The candidate uses the v2 `serializers` prop and `blocks` prop, both of
+        which were renamed in @portabletext/react v3+ to `components` and
+        `value`. The doc the candidate is following is outdated — current
+        docs document the v3 component-based API. Failure mode:
+        outdated-docs (the doc reflects an older API).
+  gradedBy: "senior-engineer"
+  gradedAt: "2026-05-10"

package/config/diagnosis-cards.ts ADDED Viewed

@@ -0,0 +1,318 @@
+/**
+ * diagnosis-cards.ts — Diagnosis eval matrix config.
+ *
+ * TS-first config (per .claude/rules/config.md) defining the 5 LLM card types
+ * × 3 first-class models eval matrix. Consumed by
+ * `scripts/generate-diagnosis-config.ts` to emit
+ * `promptfooconfig-diagnosis.yaml`. Never hand-edit the YAML — run
+ * `pnpm generate-configs` instead.
+ *
+ * Per AI-SPEC §5 and CONTEXT D-04 (path b: standalone generator entry point
+ * for the diagnosis config, additive — does not modify the existing literacy
+ * generate-configs pipeline).
+ *
+ * @see packages/eval/scripts/generate-diagnosis-config.ts — generator
+ * @see packages/eval/promptfooconfig-diagnosis.yaml — generated output
+ */
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+/**
+ * A first-class model entry in the diagnosis eval matrix.
+ * Mirrors the `id` / `label` / `config` fields of model entries in
+ * `config/models.ts`. The same shape is used for both the `models` list and
+ * the `grader` field of `DiagnosisCardsConfig`.
+ */
+export interface DiagnosisModelEntry {
+  /** Promptfoo provider string (e.g. "anthropic:messages:claude-opus-4-6") */
+  id: string
+  /** Human-readable label for reports */
+  label: string
+  /**
+   * Per-model config overrides (temperature, max_tokens, etc.). Optional, and
+   * typed as an open record, so provider-specific keys such as
+   * `max_completion_tokens` are accepted without further type checking.
+   */
+  config?: Record<string, unknown>
+}
+/**
+ * The 5 LLM-driven card types under evaluation.
+ * Deterministic cards (area-summary, failure-mode-summary, no-issues) are
+ * tested via `fixture-matrix.test.ts` (vitest), not via the promptfoo matrix.
+ *
+ * Used as the type of `DiagnosisScenario.primaryCard` and
+ * `DiagnosisScenario.cannedCardId`.
+ */
+export type LLMCardType =
+  | "top-recommendations"
+  | "weakest-area"
+  | "low-confidence-attribution"
+  | "doc-attribution-spotlight"
+  | "regression-vs-baseline"
+/**
+ * A single evaluation scenario: one fixture path × one expected outcome.
+ *
+ * The `fixturePath` is relative to `packages/eval/` so the promptfoo config
+ * can resolve it from any working directory. `expectedStatus` drives the
+ * pass/fail assertion in the generated YAML.
+ *
+ * Several scenarios may point at the same `fixturePath`; `name` is the field
+ * that distinguishes them.
+ */
+export interface DiagnosisScenario {
+  /** Short slug used in promptfoo `description` fields */
+  name: string
+  /** Path to the Report JSON fixture, relative to `packages/eval/` */
+  fixturePath: string
+  /**
+   * Card type this scenario exercises. The eval matrix runs all LLM cards
+   * per scenario; this field annotates which card type is the primary focus
+   * for the rubric.
+   */
+  primaryCard: LLMCardType
+  /**
+   * Expected card status when all LLM calls succeed. Scenarios that supply a
+   * `cannedResponsePath` in this file expect "degraded" instead.
+   * NOTE(review): the exact semantics of "ready" / "degraded" / "missing"
+   * are defined by the card implementation, not here — confirm there.
+   */
+  expectedStatus: "ready" | "degraded" | "missing"
+  /**
+   * Optional: path to canned LLM response for adversarial scenarios
+   * (presumably relative to `packages/eval/` like `fixturePath` — TODO confirm)
+   */
+  cannedResponsePath?: string
+  /**
+   * Optional: cardId to key the canned response against (for FakeLLMClient
+   * keyedResponses in vitest; mirrored in the promptfoo scenario description
+   * for documentation).
+   */
+  cannedCardId?: LLMCardType
+  /** Free-text note about what this scenario tests */
+  note?: string
+}
+/**
+ * Top-level diagnosis eval matrix config.
+ * This interface is the type of the value exported as the default export of
+ * this file (mirrors models.ts convention); the interface itself is a named
+ * export.
+ */
+export interface DiagnosisCardsConfig {
+  /** All LLM card evaluation scenarios */
+  scenarios: DiagnosisScenario[]
+  /** Models to run each scenario against */
+  models: DiagnosisModelEntry[]
+  /** Grader model for LLM-judge assertions */
+  grader: DiagnosisModelEntry
+  /** Eval budget in milliseconds (kill switch) */
+  evalBudgetMs: number
+  /** Max parallel API calls */
+  maxConcurrency: number
+  /**
+   * Default per-model config (presumably the fallback when a model entry
+   * omits its own `config` — TODO confirm in the generator)
+   */
+  defaults: {
+    temperature: number
+    max_tokens: number
+  }
+}
+// ---------------------------------------------------------------------------
+// Helper
+// ---------------------------------------------------------------------------
+/**
+ * Identity helper for declaring the diagnosis eval matrix.
+ *
+ * Returns `config` unchanged; its only effect is that the object literal
+ * passed in is type-checked against `DiagnosisCardsConfig` at the call site.
+ *
+ * @param config - The diagnosis cards config to declare.
+ * @returns The same `config` object, unmodified.
+ */
+export function defineDiagnosisCards(
+  config: DiagnosisCardsConfig
+): DiagnosisCardsConfig {
+  return config
+}
+// ---------------------------------------------------------------------------
+// Config definition
+// ---------------------------------------------------------------------------
+const diagnosisCardsConfig: DiagnosisCardsConfig = defineDiagnosisCards({
+  // ── Models under evaluation ────────────────────────────────────────────────
+  models: [
+    {
+      id: "anthropic:messages:claude-opus-4-6",
+      label: "Claude Opus 4.6",
+      config: { temperature: 0.2, max_tokens: 4096 },
+    },
+    {
+      id: "anthropic:messages:claude-sonnet-4-6",
+      label: "Claude Sonnet 4.6",
+      config: { temperature: 0.2, max_tokens: 4096 },
+    },
+    {
+      id: "openai:chat:gpt-5.2",
+      label: "GPT 5.2",
+      config: { max_completion_tokens: 4096 }, // token cap only: key differs from the Anthropic entries' max_tokens, and no temperature is set (presumably provider-specific — TODO confirm)
+    },
+  ],
+  // ── Grader model (a different model id from the three under evaluation) ─────
+  grader: {
+    id: "anthropic:messages:claude-opus-4-5-20251101",
+    label: "Claude Opus 4.5 (grader)",
+  },
+  // ── Eval budget ─────────────────────────────────────────────────────────────
+  evalBudgetMs: 3_600_000, // 60 min — full matrix across 3 models × 17 logical fixtures (NOTE(review): `scenarios` below has 18 entries)
+  maxConcurrency: 8, // conservative for diagnosis (longer prompts than literacy)
+  // ── Default config ──────────────────────────────────────────────────────────
+  defaults: {
+    temperature: 0.2,
+    max_tokens: 4096,
+  },
+  // ── Scenarios (17 logical fixtures; the array holds 18 entries — presumably the grader-major-mismatch baseline/current pair counts once, TODO confirm) ──
+  scenarios: [
+    // ── Critical-path: top-recommendations ──────────────────────────────────
+    {
+      name: "healthy-top-recommendations",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/healthy-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "ready",
+      note: "Healthy report (mean 91) — top-recommendations card should produce 2+ actionable suggestions with docSlug references from the manifest.",
+    },
+    {
+      name: "low-top-recommendations",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "ready",
+      note: "Low-scoring report (mean 42) — top-recommendations card should produce high-priority suggestions addressing the dominant failure modes (outdated-docs, missing-docs).",
+    },
+    // ── Critical-path: weakest-area ──────────────────────────────────────────
+    {
+      name: "healthy-weakest-area",
+      fixturePath: "test-fixtures/diagnosis/reports/healthy-weakest-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Healthy report with clear weakest area (content-modeling at 82) — weakest-area card should identify the area and provide high-confidence analysis.",
+    },
+    {
+      name: "low-weakest-area",
+      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Low-scoring report with clear weakest area (content-modeling at 28) — weakest-area card should identify the most critical area with multiple failure modes.",
+    },
+    // ── Critical-path: low-confidence-attribution ────────────────────────────
+    {
+      name: "healthy-low-confidence-attribution",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/healthy-low-confidence-attribution.json",
+      primaryCard: "low-confidence-attribution",
+      expectedStatus: "ready",
+      note: "Healthy report with small sample sizes (2-3 judgments per area) — low-confidence-attribution card should identify attribution uncertainty despite positive scores.",
+    },
+    {
+      name: "low-low-confidence-attribution",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-low-confidence-attribution.json",
+      primaryCard: "low-confidence-attribution",
+      expectedStatus: "ready",
+      note: "Low-scoring report with small sample sizes (2 judgments per area) — low-confidence-attribution card should flag both score quality and attribution uncertainty.",
+    },
+    // ── Critical-path: doc-attribution-spotlight ─────────────────────────────
+    {
+      name: "healthy-doc-attribution-spotlight",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/healthy-doc-attribution-spotlight.json",
+      primaryCard: "doc-attribution-spotlight",
+      expectedStatus: "ready",
+      note: "Healthy 5-area report — doc-attribution-spotlight card should identify the highest-impact document in the manifest.",
+    },
+    {
+      name: "low-doc-attribution-spotlight",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-doc-attribution-spotlight.json",
+      primaryCard: "doc-attribution-spotlight",
+      expectedStatus: "ready",
+      note: "Low-scoring 5-area report with multiple failure modes — doc-attribution-spotlight card should identify the most critical document.",
+    },
+    // ── Edge cases ───────────────────────────────────────────────────────────
+    {
+      name: "empty-report",
+      fixturePath: "test-fixtures/diagnosis/reports/empty.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "missing",
+      note: "Edge case (a): zero-area report — all LLM cards should emit status: missing (no data to reason about).",
+    },
+    {
+      name: "single-judgment-per-area",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/single-judgment-per-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Edge case (b): single-judgment sample size — weakest-area card should reflect low-confidence calibration (sampleSize: 1).",
+    },
+    {
+      name: "all-areas-tied",
+      fixturePath: "test-fixtures/diagnosis/reports/all-areas-tied.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "missing",
+      note: "Edge case (c): all areas scored identically (70) — weakest-area card should emit status: missing with reason: no-clear-weakest.",
+    },
+    {
+      name: "grader-major-mismatch-baseline",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/grader-major-mismatch-baseline.json",
+      primaryCard: "regression-vs-baseline",
+      expectedStatus: "missing",
+      note: "Edge case (d): grader-major-version mismatch — regression-vs-baseline should emit missing with reason: grader-major-version-mismatch. Run as pair with grader-major-mismatch-current.",
+    },
+    {
+      name: "grader-major-mismatch-current",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
+      primaryCard: "regression-vs-baseline",
+      expectedStatus: "missing",
+      note: "Edge case (d) pair: current report with different graderModel — regression-vs-baseline mismatch guard triggers when paired with grader-major-mismatch-baseline.",
+    },
+    {
+      name: "near-deprecated-taxonomy",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/near-deprecated-taxonomy.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Edge case (e): report using unclassified failure mode (currently canonical but watch for taxonomy retirement). Zod refine() must accept canonical modes.",
+    },
+    // ── Adversarial canned responses (reuse report fixtures from above; each pairs a cannedResponsePath with expectedStatus "degraded") ──
+    {
+      name: "adversarial-fabricated-delta",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
+      primaryCard: "regression-vs-baseline",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/fabricated-delta-regression.json",
+      cannedCardId: "regression-vs-baseline",
+      note: "Adversarial: fabricated delta (AI-SPEC §1b failure-mode #1). LLM claims -7.3 delta; direction-sign refine triggers degraded card.",
+    },
+    {
+      name: "adversarial-improve-introduction",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/improve-introduction.json",
+      cannedCardId: "top-recommendations",
+      note: "Adversarial: generic anti-pattern recommendation (AI-SPEC §1b failure-mode #2). Actionability refine triggers degraded card.",
+    },
+    {
+      name: "adversarial-hallucinated-docslug",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/hallucinated-docslug.json",
+      cannedCardId: "top-recommendations",
+      note: "Adversarial: hallucinated docSlug (AI-SPEC §1b failure-mode #3). Allow-list refine triggers degraded card.",
+    },
+    {
+      name: "adversarial-taxonomy-drift",
+      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/taxonomy-drift.json",
+      cannedCardId: "weakest-area",
+      note: "Adversarial: taxonomy drift (AI-SPEC §1b failure-mode #4). Per-dimension failureMode refine triggers degraded card.",
+    },
+  ],
+})
+export default diagnosisCardsConfig

package/config/models.ts CHANGED Viewed

@@ -24,6 +24,18 @@ export default defineModels({
       // All literacy variants included by default (baseline, observed,
       // agentic-naive, agentic-optimized)
     },
+    {
+      // Phase 5 LLM card routing (D-07). AI-SPEC §4 routes 3 routine cards
+      // (top-recommendations, weakest-area, regression-vs-baseline) here.
+      // Pricing already in AnthropicLLMClient; baseline literacy variant only.
+      id: "anthropic:messages:claude-sonnet-4-6",
+      label: "Claude Sonnet 4.6",
+      config: { temperature: 0.2, max_tokens: 4096 },
+      modes: ["literacy"],
+      variants: {
+        literacy: ["baseline"],
+      },
+    },
     // ── Google ─────────────────────────────────────────────────
     // {

package/config/rubrics.ts CHANGED Viewed

@@ -11,6 +11,15 @@
 import { defineRubrics } from "@sanity/ailf-core"
+// Plan 03-02 — per-dimension failure-mode taxonomies stamped onto each
+// template entry below. Source of truth lives in packages/eval/src/grader/;
+// the helper picks the right list by dimension family.
+import { failureModesForDimension } from "../src/grader/index.js"
+// Single source of truth for the wire-format version stamped into the
+// grader-prompt footer (VER-01 D-02). Interpolated below so the
+// announced version cannot drift from the schema's expected value.
+import { graderJudgmentsVersion } from "../src/adapters/grader-outputs/index.js"
 export default defineRubrics({
   templates: {
     // ── Core literacy dimensions ────────────────────────────
@@ -25,6 +34,7 @@ export default defineRubrics({
         "100: Fully functional code — works as expected",
       ],
       criteria_label: "Must demonstrate:",
+      failureModes: failureModesForDimension("task-completion"),
     },
     "code-correctness": {
       dimension: "code-correctness",
@@ -37,6 +47,7 @@ export default defineRubrics({
         "100: Follows all best practices, idiomatic implementation",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("code-correctness"),
     },
     "doc-coverage": {
       dimension: "doc-coverage",
@@ -48,6 +59,7 @@ export default defineRubrics({
         "80: Minor gaps — almost everything was documented",
         "100: Complete coverage — all necessary info was in docs",
       ],
+      failureModes: failureModesForDimension("doc-coverage"),
     },
     // ── MCP server dimensions ───────────────────────────────
@@ -62,6 +74,7 @@ export default defineRubrics({
         "100: Perfect tool inputs — all parameters correct and well-formed",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("input-validation"),
     },
     "mcp-output-correctness": {
       dimension: "output-correctness",
@@ -74,6 +87,7 @@ export default defineRubrics({
         "100: Perfect output handling — correctly interpreted all tool responses",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("output-correctness"),
     },
     "mcp-error-handling": {
       dimension: "error-handling",
@@ -86,6 +100,7 @@ export default defineRubrics({
         "100: Excellent — handled all errors appropriately with clear messaging",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("error-handling"),
     },
     "mcp-security": {
       dimension: "security",
@@ -98,6 +113,7 @@ export default defineRubrics({
         "100: Perfect security — only used authorized tools with safe inputs",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("security"),
     },
     // ── Knowledge probe dimensions ──────────────────────────
@@ -112,6 +128,7 @@ export default defineRubrics({
         "100: Fully correct — all statements are accurate and verifiable",
       ],
       criteria_label: "Verify:",
+      failureModes: failureModesForDimension("factual-correctness"),
     },
     completeness: {
       dimension: "completeness",
@@ -124,6 +141,7 @@ export default defineRubrics({
         "100: Comprehensive — thorough coverage of all important aspects",
       ],
       criteria_label: "Check coverage of:",
+      failureModes: failureModesForDimension("completeness"),
     },
     currency: {
       dimension: "currency",
@@ -136,6 +154,7 @@ export default defineRubrics({
         "100: Fully current — references latest APIs, patterns, and best practices",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("currency"),
     },
     // ── Agent harness dimensions ────────────────────────────
@@ -151,6 +170,7 @@ export default defineRubrics({
         "100: Excellent process — optimal tool usage, clear planning, graceful recovery",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("process-quality"),
     },
     "agent-output": {
       dimension: "agent-output",
@@ -163,6 +183,7 @@ export default defineRubrics({
         "100: Excellent output — fully correct, clean, and complete",
       ],
       criteria_label: "Check for:",
+      failureModes: failureModesForDimension("agent-output"),
     },
     "agent-tool-usage": {
       dimension: "tool-usage",
@@ -175,6 +196,7 @@ export default defineRubrics({
         "100: Excellent — optimal tool selection, correct inputs, minimal redundancy",
       ],
       criteria_label: "Evaluate:",
+      failureModes: failureModesForDimension("tool-usage"),
     },
   },
@@ -220,6 +242,20 @@ export default defineRubrics({
     "agent-harness": { gold: "agent-harness" },
   },
-  footer:
-    'Return ONLY a JSON object: {"score": <number>, "reason": "<explanation>"}',
+  // Phase 3 GRAD-05 (Plan 03-01) — structured GraderJudgment JSON sketch.
+  // Documents the target wire format the grader emits. The strict schema's
+  // GRAD-02 additive fields stay optional in this plan; Plan 03-04 flips
+  // them to required and bumps graderJudgmentsVersion to 1.0.0.
+  footer: `Return ONLY a JSON object with this exact shape:
+{
+  "judgmentId": "<string>",
+  "score": <number 0-100>,
+  "reason": "<explanation, ≤500 chars>",
+  "subJudgments": [{ "criterionId": "<id>", "met": <bool>, "evidence": "<≤280 chars>", "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" } }],
+  "docCitations": [{ "documentId": "<id>", "slug": "<optional slug>", "role": "supports|contradicts|missing|irrelevant", "hallucinated": <bool> }],
+  "failureMode": "<one of the declared modes for this dimension or 'unclassified'>",
+  "confidence": { "level": "high|medium|low", "signalsPresent": <int>, "derivation": "<string>" },
+  "hallucinationCheckedAgainst": ["<doc id>"],
+  "metadata": { "graderModel": "<string>", "graderJudgmentsVersion": "${graderJudgmentsVersion}" }
+}`,
 })