npm - @sanity/ailf - Versions diffs - 5.0.0 → 6.1.0 - Mend

@sanity/ailf 5.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115) hide show

package/config/airbyte/ai_literacy_framework.connector.yaml CHANGED Viewed

@@ -225,6 +225,134 @@ definitions:
         schema:
           $ref: "#/schemas/area_scores"
+    # ------------------------------------------------------------------
+    # Stream 3: synthesis_summary — one row per report with synthesis telemetry
+    # ------------------------------------------------------------------
+    # GROQ projection emits cost, parse-failure counts, and rate from the
+    # summary.synthesis.diagnosis path written by the Phase-6 post-run hook.
+    # Rows are gated on defined(summary.synthesis.diagnosis) so reports that
+    # predate Phase 6 produce no rows; once backfilled they are picked up only
+    # by a re-sync whose window covers their original _createdAt (the cursor).
+    synthesis_summary:
+      type: DeclarativeStream
+      name: synthesis_summary
+      retriever:
+        type: SimpleRetriever
+        decoder:
+          type: JsonDecoder
+        requester:
+          $ref: "#/definitions/base_requester"
+          path: /v2026-03-12/data/query/{{ config['dataset'] }}
+          http_method: GET
+          request_parameters:
+            query: >-
+              *[_type=="ailf.report" && _createdAt > "{{
+              stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
+              _createdAt <= "{{ stream_interval.end_time }}" &&
+              defined(summary.synthesis.diagnosis)]|order(_createdAt asc){
+                "report_id": reportId,
+                "completed_at": completedAt,
+                "mode": provenance.mode,
+                "source_name": provenance.source.name,
+                "grader_model": provenance.graderModel,
+                "synthesis_cost": summary.synthesis.diagnosis.cost,
+                "parse_failure_count":
+              summary.synthesis.diagnosis.parseFailureCount,
+                "parse_failure_rate":
+              summary.synthesis.diagnosis.parseFailureRate,
+                _createdAt
+              }
+        record_selector:
+          type: RecordSelector
+          extractor:
+            type: DpathExtractor
+            field_path:
+              - result
+      primary_key:
+        - report_id
+      incremental_sync:
+        type: DatetimeBasedCursor
+        cursor_field: _createdAt
+        cursor_datetime_formats:
+          - "%Y-%m-%dT%H:%M:%S.%fZ"
+          - "%Y-%m-%dT%H:%M:%SZ"
+        datetime_format: "%Y-%m-%dT%H:%M:%SZ"
+        start_datetime:
+          type: MinMaxDatetime
+          datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
+          datetime_format: "%Y-%m-%dT%H:%M:%SZ"
+        step: P30D
+        cursor_granularity: PT1S
+      schema_loader:
+        type: InlineSchemaLoader
+        schema:
+          $ref: "#/schemas/synthesis_summary"
+    # ------------------------------------------------------------------
+    # Stream 4: synthesis_per_card — one row per report with per-card array
+    # ------------------------------------------------------------------
+    # GROQ projection emits the nested perCard array. GROQ cannot explode
+    # arrays into flat rows, so the nesting is preserved — BigQuery consumers
+    # should UNNEST(JSON_QUERY_ARRAY(per_card)) to get flat rows per card.
+    # primary_key is report_id only (not compound) for the same reason.
+    synthesis_per_card:
+      type: DeclarativeStream
+      name: synthesis_per_card
+      retriever:
+        type: SimpleRetriever
+        decoder:
+          type: JsonDecoder
+        requester:
+          $ref: "#/definitions/base_requester"
+          path: /v2026-03-12/data/query/{{ config['dataset'] }}
+          http_method: GET
+          request_parameters:
+            query: >-
+              *[_type=="ailf.report" && _createdAt > "{{
+              stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
+              _createdAt <= "{{ stream_interval.end_time }}" &&
+              defined(summary.synthesis.diagnosis.perCard)]|order(_createdAt
+              asc){
+                "report_id": reportId,
+                "completed_at": completedAt,
+                "per_card": summary.synthesis.diagnosis.perCard[]{
+                  "card_type": cardType,
+                  "cost": cost,
+                  "parse_failed": parseFailed,
+                  "latency_ms": latencyMs,
+                  "token_input": tokenInput,
+                  "token_output": tokenOutput,
+                  "card_version": cardVersion,
+                  "generated_at": generatedAt
+                },
+                _createdAt
+              }
+        record_selector:
+          type: RecordSelector
+          extractor:
+            type: DpathExtractor
+            field_path:
+              - result
+      primary_key:
+        - report_id
+      incremental_sync:
+        type: DatetimeBasedCursor
+        cursor_field: _createdAt
+        cursor_datetime_formats:
+          - "%Y-%m-%dT%H:%M:%S.%fZ"
+          - "%Y-%m-%dT%H:%M:%SZ"
+        datetime_format: "%Y-%m-%dT%H:%M:%SZ"
+        start_datetime:
+          type: MinMaxDatetime
+          datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
+          datetime_format: "%Y-%m-%dT%H:%M:%SZ"
+        step: P30D
+        cursor_granularity: PT1S
+      schema_loader:
+        type: InlineSchemaLoader
+        schema:
+          $ref: "#/schemas/synthesis_per_card"
   base_requester:
     type: HttpRequester
     url_base: https://{{ config['project_id'] }}.api.sanity.io
@@ -235,6 +363,8 @@ definitions:
 streams:
   - $ref: "#/definitions/streams/reports"
   - $ref: "#/definitions/streams/area_scores"
+  - $ref: "#/definitions/streams/synthesis_summary"
+  - $ref: "#/definitions/streams/synthesis_per_card"
 spec:
   type: Spec
@@ -299,9 +429,25 @@ metadata:
       primaryKeysAreUnique: true
       primaryKeysArePresent: true
       responsesAreSuccessful: true
+    synthesis_summary:
+      hasRecords: true
+      streamHash: null
+      hasResponse: true
+      primaryKeysAreUnique: true
+      primaryKeysArePresent: true
+      responsesAreSuccessful: true
+    synthesis_per_card:
+      hasRecords: true
+      streamHash: null
+      hasResponse: true
+      primaryKeysAreUnique: true
+      primaryKeysArePresent: true
+      responsesAreSuccessful: true
   autoImportSchema:
     reports: false
     area_scores: false
+    synthesis_summary: false
+    synthesis_per_card: false
 # ======================================================================
 # Inline schemas — manually defined to match the designed BigQuery tables.
@@ -757,3 +903,133 @@ schemas:
           - "null"
         description: Sanity document creation timestamp (incremental cursor)
     additionalProperties: true
+  # ------------------------------------------------------------------
+  # synthesis_summary schema — flat, one row per report with synthesis telemetry
+  # ------------------------------------------------------------------
+  synthesis_summary:
+    type: object
+    $schema: http://json-schema.org/schema#
+    required:
+      - report_id
+    properties:
+      report_id:
+        type: string
+        description: UUID v7 report identifier (primary key)
+      completed_at:
+        type:
+          - string
+          - "null"
+        description: ISO 8601 timestamp when the evaluation completed
+      mode:
+        type:
+          - string
+          - "null"
+        description: "Evaluation mode: baseline, observed, or agentic"
+      source_name:
+        type:
+          - string
+          - "null"
+        description: Documentation source name (e.g., "production")
+      grader_model:
+        type:
+          - string
+          - "null"
+        description: Model used for LLM grading (context for cost comparison)
+      synthesis_cost:
+        type:
+          - number
+          - "null"
+        description:
+          Total USD cost of the Diagnosis synthesis run (sum of all card costs)
+      parse_failure_count:
+        type:
+          - number
+          - "null"
+        description:
+          Number of cards that failed Zod schema parse during synthesis
+      parse_failure_rate:
+        type:
+          - number
+          - "null"
+        description:
+          Fraction of cards that failed parse (0–1); 0.0 = no failures
+      _createdAt:
+        type:
+          - string
+          - "null"
+        description:
+          Sanity document creation timestamp (used as incremental cursor)
+    additionalProperties: true
+  # ------------------------------------------------------------------
+  # synthesis_per_card schema — nested per-card array, one row per report
+  # ------------------------------------------------------------------
+  # BigQuery consumers should UNNEST(JSON_QUERY_ARRAY(per_card)) to get
+  # flat rows per (report × card). See bigquery/views/synthesis_parse_failure_rate_7d.sql
+  synthesis_per_card:
+    type: object
+    $schema: http://json-schema.org/schema#
+    required:
+      - report_id
+    properties:
+      report_id:
+        type: string
+        description: UUID v7 report identifier (primary key)
+      completed_at:
+        type:
+          - string
+          - "null"
+        description: Denormalized timestamp for partitioning
+      per_card:
+        type:
+          - array
+          - "null"
+        description: >-
+          Per-card synthesis metrics array. UNNEST in BigQuery to get one flat
+          row per card. card_type identifies the diagnosis card type (e.g.,
+          "top-recommendations").
+        items:
+          type: object
+          properties:
+            card_type:
+              type: string
+              description: Diagnosis card type identifier (≤25 chars)
+            cost:
+              type:
+                - number
+                - "null"
+              description:
+                USD cost of this card's LLM call (null for deterministic cards)
+            parse_failed:
+              type: boolean
+              description: Whether the card's Zod schema parse failed
+            latency_ms:
+              type:
+                - number
+                - "null"
+              description: LLM call latency in milliseconds
+            token_input:
+              type:
+                - number
+                - "null"
+              description: Input tokens consumed by the LLM call
+            token_output:
+              type:
+                - number
+                - "null"
+              description: Output tokens produced by the LLM call
+            card_version:
+              type: string
+              description:
+                Card implementation version (e.g., "area-summary@0.1.0")
+            generated_at:
+              type: string
+              description: ISO 8601 UTC timestamp when this card was generated
+      _createdAt:
+        type:
+          - string
+          - "null"
+        description:
+          Sanity document creation timestamp (used as incremental cursor)
+    additionalProperties: true

package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql ADDED Viewed

@@ -0,0 +1,42 @@
+-- ailf.synthesis_parse_failure_rate_7d — per-card parse-failure rate over 7 days
+--
+-- Computes the Zod-schema parse-failure rate per Diagnosis card type over the
+-- previous 7 days, sourced from the synthesis_per_card Airbyte stream. Any row
+-- returned by this view represents a card type that breached the 2% threshold
+-- defined in D6-18 and should trigger a manual investigation per the runbook.
+--
+-- Source: ailf_raw.synthesis_per_card (Airbyte stream: "synthesis_per_card")
+-- Target: ailf.synthesis_parse_failure_rate_7d (this view)
+--
+-- Threshold: failure_rate > 0.02 (2%) over INTERVAL 7 DAY  [D6-18]
+-- To change the 2% threshold, edit the HAVING clause below; to change the 7-day
+-- window, edit the WHERE clause. Each is a single edit point per D6-18 (not lifted to config).
+--
+-- Usage:
+--   bq query --use_legacy_sql=false < views/synthesis_parse_failure_rate_7d.sql
+--
+-- @see docs/runbooks/diagnosis-parse-failure-watch.md — operator runbook
+-- @see packages/eval/config/airbyte/ai_literacy_framework.connector.yaml — synthesis_per_card stream
+CREATE OR REPLACE VIEW `data-platform-302218.ailf.synthesis_parse_failure_rate_7d` AS
+SELECT
+  JSON_VALUE(card, '$.card_type')                                        AS card_type,
+  COUNT(*)                                                               AS total_runs,
+  COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL))        AS parse_failures,
+  ROUND(SAFE_DIVIDE(
+    COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL)),
+    COUNT(*)
+  ), 4)                                                                  AS failure_rate
+FROM
+  `data-platform-302218.ailf_raw.synthesis_per_card` AS r,
+  UNNEST(JSON_QUERY_ARRAY(r.per_card)) AS card
+WHERE
+  r.completed_at IS NOT NULL
+  AND TIMESTAMP(r.completed_at) >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
+  AND JSON_VALUE(card, '$.card_type') IS NOT NULL
+GROUP BY
+  card_type
+HAVING
+  failure_rate > 0.02
+ORDER BY
+  failure_rate DESC

package/config/diagnosis-cards.ts ADDED Viewed

@@ -0,0 +1,318 @@
+/**
+ * diagnosis-cards.ts — Diagnosis eval matrix config.
+ *
+ * TS-first config (per .claude/rules/config.md) defining the 5 LLM card types
+ * × 3 first-class models eval matrix. Consumed by
+ * `scripts/generate-diagnosis-config.ts` to emit
+ * `promptfooconfig-diagnosis.yaml`. Never hand-edit the YAML — run
+ * `pnpm generate-configs` instead.
+ *
+ * Per AI-SPEC §5 and CONTEXT D-04 (path b: standalone generator entry point
+ * for the diagnosis config, additive — does not modify the existing literacy
+ * generate-configs pipeline).
+ *
+ * @see packages/eval/scripts/generate-diagnosis-config.ts — generator
+ * @see packages/eval/promptfooconfig-diagnosis.yaml — generated output
+ */
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+/**
+ * A first-class model entry in the diagnosis eval matrix.
+ * Mirrors the shape of model entries in `config/models.ts`.
+ */
+export interface DiagnosisModelEntry {
+  /** Promptfoo provider string (e.g. "anthropic:messages:claude-opus-4-6") */
+  id: string
+  /** Human-readable label for reports */
+  label: string
+  /** Per-model config overrides (temperature, max_tokens, etc.) */
+  config?: Record<string, unknown>
+}
+/**
+ * The 5 LLM-driven card types under evaluation.
+ * Deterministic cards (area-summary, failure-mode-summary, no-issues) are
+ * tested via `fixture-matrix.test.ts` (vitest), not via the promptfoo matrix.
+ */
+export type LLMCardType =
+  | "top-recommendations"
+  | "weakest-area"
+  | "low-confidence-attribution"
+  | "doc-attribution-spotlight"
+  | "regression-vs-baseline"
+/**
+ * A single evaluation scenario: one fixture path × one expected outcome.
+ *
+ * The `fixturePath` is relative to `packages/eval/` so the promptfoo config
+ * can resolve it from any working directory. `expectedStatus` drives the
+ * pass/fail assertion in the generated YAML.
+ */
+export interface DiagnosisScenario {
+  /** Short slug used in promptfoo `description` fields */
+  name: string
+  /** Path to the Report JSON fixture, relative to `packages/eval/` */
+  fixturePath: string
+  /**
+   * Card type this scenario exercises. The eval matrix runs all LLM cards
+   * per scenario; this field annotates which card type is the primary focus
+   * for the rubric.
+   */
+  primaryCard: LLMCardType
+  /** Expected card status when all LLM calls succeed */
+  expectedStatus: "ready" | "degraded" | "missing"
+  /** Optional: path to canned LLM response for adversarial scenarios */
+  cannedResponsePath?: string
+  /**
+   * Optional: cardId to key the canned response against (for FakeLLMClient
+   * keyedResponses in vitest; mirrored in the promptfoo scenario description
+   * for documentation).
+   */
+  cannedCardId?: LLMCardType
+  /** Free-text note about what this scenario tests */
+  note?: string
+}
+/**
+ * Shape of the top-level diagnosis eval matrix config.
+ * The config value of this type is this file's default export (mirrors models.ts convention).
+ */
+export interface DiagnosisCardsConfig {
+  /** All LLM card evaluation scenarios */
+  scenarios: DiagnosisScenario[]
+  /** Models to run each scenario against */
+  models: DiagnosisModelEntry[]
+  /** Grader model for LLM-judge assertions */
+  grader: DiagnosisModelEntry
+  /** Eval budget in milliseconds (kill switch) */
+  evalBudgetMs: number
+  /** Max parallel API calls */
+  maxConcurrency: number
+  /** Default per-model config */
+  defaults: {
+    temperature: number
+    max_tokens: number
+  }
+}
+// ---------------------------------------------------------------------------
+// Helper — returns its argument unchanged; exists to type-check a config literal
+// ---------------------------------------------------------------------------
+export function defineDiagnosisCards(
+  config: DiagnosisCardsConfig
+): DiagnosisCardsConfig {
+  return config
+}
+// ---------------------------------------------------------------------------
+// Config definition
+// ---------------------------------------------------------------------------
+const diagnosisCardsConfig: DiagnosisCardsConfig = defineDiagnosisCards({
+  // ── Models under evaluation ────────────────────────────────────────────────
+  models: [
+    {
+      id: "anthropic:messages:claude-opus-4-6",
+      label: "Claude Opus 4.6",
+      config: { temperature: 0.2, max_tokens: 4096 },
+    },
+    {
+      id: "anthropic:messages:claude-sonnet-4-6",
+      label: "Claude Sonnet 4.6",
+      config: { temperature: 0.2, max_tokens: 4096 },
+    },
+    {
+      id: "openai:chat:gpt-5.2",
+      label: "GPT 5.2",
+      config: { max_completion_tokens: 4096 },
+    },
+  ],
+  // ── Grader model ────────────────────────────────────────────────────────────
+  grader: {
+    id: "anthropic:messages:claude-opus-4-5-20251101",
+    label: "Claude Opus 4.5 (grader)",
+  },
+  // ── Eval budget ─────────────────────────────────────────────────────────────
+  evalBudgetMs: 3_600_000, // 60 min — full matrix across 3 models × 17 scenarios
+  maxConcurrency: 8, // conservative for diagnosis (longer prompts than literacy)
+  // ── Default config ──────────────────────────────────────────────────────────
+  defaults: {
+    temperature: 0.2,
+    max_tokens: 4096,
+  },
+  // ── Scenarios (18 entries; 17 logical fixtures — the grader-major-mismatch baseline/current pair presumably counts as one) ──
+  scenarios: [
+    // ── Critical-path: top-recommendations ──────────────────────────────────
+    {
+      name: "healthy-top-recommendations",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/healthy-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "ready",
+      note: "Healthy report (mean 91) — top-recommendations card should produce 2+ actionable suggestions with docSlug references from the manifest.",
+    },
+    {
+      name: "low-top-recommendations",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "ready",
+      note: "Low-scoring report (mean 42) — top-recommendations card should produce high-priority suggestions addressing the dominant failure modes (outdated-docs, missing-docs).",
+    },
+    // ── Critical-path: weakest-area ──────────────────────────────────────────
+    {
+      name: "healthy-weakest-area",
+      fixturePath: "test-fixtures/diagnosis/reports/healthy-weakest-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Healthy report with clear weakest area (content-modeling at 82) — weakest-area card should identify the area and provide high-confidence analysis.",
+    },
+    {
+      name: "low-weakest-area",
+      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Low-scoring report with clear weakest area (content-modeling at 28) — weakest-area card should identify the most critical area with multiple failure modes.",
+    },
+    // ── Critical-path: low-confidence-attribution ────────────────────────────
+    {
+      name: "healthy-low-confidence-attribution",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/healthy-low-confidence-attribution.json",
+      primaryCard: "low-confidence-attribution",
+      expectedStatus: "ready",
+      note: "Healthy report with small sample sizes (2-3 judgments per area) — low-confidence-attribution card should identify attribution uncertainty despite positive scores.",
+    },
+    {
+      name: "low-low-confidence-attribution",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-low-confidence-attribution.json",
+      primaryCard: "low-confidence-attribution",
+      expectedStatus: "ready",
+      note: "Low-scoring report with small sample sizes (2 judgments per area) — low-confidence-attribution card should flag both score quality and attribution uncertainty.",
+    },
+    // ── Critical-path: doc-attribution-spotlight ─────────────────────────────
+    {
+      name: "healthy-doc-attribution-spotlight",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/healthy-doc-attribution-spotlight.json",
+      primaryCard: "doc-attribution-spotlight",
+      expectedStatus: "ready",
+      note: "Healthy 5-area report — doc-attribution-spotlight card should identify the highest-impact document in the manifest.",
+    },
+    {
+      name: "low-doc-attribution-spotlight",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-doc-attribution-spotlight.json",
+      primaryCard: "doc-attribution-spotlight",
+      expectedStatus: "ready",
+      note: "Low-scoring 5-area report with multiple failure modes — doc-attribution-spotlight card should identify the most critical document.",
+    },
+    // ── Edge cases ───────────────────────────────────────────────────────────
+    {
+      name: "empty-report",
+      fixturePath: "test-fixtures/diagnosis/reports/empty.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "missing",
+      note: "Edge case (a): zero-area report — all LLM cards should emit status: missing (no data to reason about).",
+    },
+    {
+      name: "single-judgment-per-area",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/single-judgment-per-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Edge case (b): single-judgment sample size — weakest-area card should reflect low-confidence calibration (sampleSize: 1).",
+    },
+    {
+      name: "all-areas-tied",
+      fixturePath: "test-fixtures/diagnosis/reports/all-areas-tied.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "missing",
+      note: "Edge case (c): all areas scored identically (70) — weakest-area card should emit status: missing with reason: no-clear-weakest.",
+    },
+    {
+      name: "grader-major-mismatch-baseline",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/grader-major-mismatch-baseline.json",
+      primaryCard: "regression-vs-baseline",
+      expectedStatus: "missing",
+      note: "Edge case (d): grader-major-version mismatch — regression-vs-baseline should emit missing with reason: grader-major-version-mismatch. Run as pair with grader-major-mismatch-current.",
+    },
+    {
+      name: "grader-major-mismatch-current",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
+      primaryCard: "regression-vs-baseline",
+      expectedStatus: "missing",
+      note: "Edge case (d) pair: current report with different graderModel — regression-vs-baseline mismatch guard triggers when paired with grader-major-mismatch-baseline.",
+    },
+    {
+      name: "near-deprecated-taxonomy",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/near-deprecated-taxonomy.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "ready",
+      note: "Edge case (e): report using unclassified failure mode (currently canonical but watch for taxonomy retirement). Zod refine() must accept canonical modes.",
+    },
+    // ── Adversarial canned responses ─────────────────────────────────────────
+    {
+      name: "adversarial-fabricated-delta",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
+      primaryCard: "regression-vs-baseline",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/fabricated-delta-regression.json",
+      cannedCardId: "regression-vs-baseline",
+      note: "Adversarial: fabricated delta (AI-SPEC §1b failure-mode #1). LLM claims -7.3 delta; direction-sign refine triggers degraded card.",
+    },
+    {
+      name: "adversarial-improve-introduction",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/improve-introduction.json",
+      cannedCardId: "top-recommendations",
+      note: "Adversarial: generic anti-pattern recommendation (AI-SPEC §1b failure-mode #2). Actionability refine triggers degraded card.",
+    },
+    {
+      name: "adversarial-hallucinated-docslug",
+      fixturePath:
+        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
+      primaryCard: "top-recommendations",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/hallucinated-docslug.json",
+      cannedCardId: "top-recommendations",
+      note: "Adversarial: hallucinated docSlug (AI-SPEC §1b failure-mode #3). Allow-list refine triggers degraded card.",
+    },
+    {
+      name: "adversarial-taxonomy-drift",
+      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
+      primaryCard: "weakest-area",
+      expectedStatus: "degraded",
+      cannedResponsePath:
+        "test-fixtures/diagnosis/canned-responses/taxonomy-drift.json",
+      cannedCardId: "weakest-area",
+      note: "Adversarial: taxonomy drift (AI-SPEC §1b failure-mode #4). Per-dimension failureMode refine triggers degraded card.",
+    },
+  ],
+})
+export default diagnosisCardsConfig

package/config/models.ts CHANGED Viewed

@@ -24,6 +24,18 @@ export default defineModels({
       // All literacy variants included by default (baseline, observed,
       // agentic-naive, agentic-optimized)
     },
+    {
+      // Phase 5 LLM card routing (D-07). AI-SPEC §4 routes 3 routine cards
+      // (top-recommendations, weakest-area, regression-vs-baseline) here.
+      // Pricing already in AnthropicLLMClient; baseline literacy variant only.
+      id: "anthropic:messages:claude-sonnet-4-6",
+      label: "Claude Sonnet 4.6",
+      config: { temperature: 0.2, max_tokens: 4096 },
+      modes: ["literacy"],
+      variants: {
+        literacy: ["baseline"],
+      },
+    },
     // ── Google ─────────────────────────────────────────────────
     // {