@sanity/ailf 5.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
  2. package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
  3. package/config/diagnosis-cards.ts +318 -0
  4. package/config/models.ts +12 -0
  5. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  6. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  7. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  8. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  9. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  10. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  11. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  12. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  17. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  18. package/dist/_vendor/ailf-core/index.js +4 -0
  19. package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
  22. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  23. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  24. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
  28. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  32. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  33. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
  34. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  35. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
  36. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
  37. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
  38. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  39. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
  40. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
  65. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
  66. package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
  67. package/dist/_vendor/ailf-core/services/index.js +15 -2
  68. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  69. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  70. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
  71. package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
  72. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  73. package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
  74. package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
  75. package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
  76. package/dist/adapters/config-sources/file-config-adapter.js +8 -6
  77. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  78. package/dist/adapters/llm/fake-llm-client.js +38 -1
  79. package/dist/adapters/llm/index.d.ts +1 -1
  80. package/dist/adapters/llm/index.js +1 -1
  81. package/dist/adapters/llm/openai-llm-client.js +59 -5
  82. package/dist/adapters/llm/retry.d.ts +18 -0
  83. package/dist/adapters/llm/retry.js +21 -0
  84. package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
  85. package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
  86. package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
  87. package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
  88. package/dist/adapters/task-sources/repo-schemas.js +10 -0
  89. package/dist/cli-program.js +3 -0
  90. package/dist/commands/interpret.d.ts +70 -0
  91. package/dist/commands/interpret.js +221 -0
  92. package/dist/commands/pipeline-action.d.ts +44 -0
  93. package/dist/commands/pipeline-action.js +193 -1
  94. package/dist/commands/run.d.ts +2 -0
  95. package/dist/commands/run.js +2 -0
  96. package/dist/composition-root.d.ts +21 -23
  97. package/dist/composition-root.js +107 -41
  98. package/dist/config/diagnosis-cards.ts +318 -0
  99. package/dist/config/models.ts +12 -0
  100. package/dist/grader/agent-harness.d.ts +5 -10
  101. package/dist/grader/agent-harness.js +5 -13
  102. package/dist/grader/common.d.ts +5 -13
  103. package/dist/grader/common.js +5 -17
  104. package/dist/grader/index.d.ts +15 -29
  105. package/dist/grader/index.js +15 -66
  106. package/dist/grader/knowledge-probe.d.ts +5 -10
  107. package/dist/grader/knowledge-probe.js +5 -14
  108. package/dist/grader/literacy.d.ts +5 -9
  109. package/dist/grader/literacy.js +5 -13
  110. package/dist/grader/mcp.d.ts +5 -10
  111. package/dist/grader/mcp.js +5 -14
  112. package/dist/orchestration/pipeline-orchestrator.js +3 -0
  113. package/dist/report-store.d.ts +26 -0
  114. package/dist/report-store.js +63 -0
  115. package/package.json +2 -2
@@ -15,7 +15,10 @@
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import { ARTIFACT_EXPORT_PHASE_ID, InMemoryPluginRegistry, NoOpArtifactWriter, NoOpProgressReporter, generateRunId, isArtifactType, } from "./_vendor/ailf-core/index.js";
18
+ import { promises as fs } from "node:fs";
19
+ import path from "node:path";
20
+ import { ARTIFACT_EXPORT_PHASE_ID, DIAGNOSIS_CARD_GENERATORS, InMemoryPluginRegistry, NoOpArtifactWriter, NoOpProgressReporter, createDiagnosisRunner, createLLMClient, generateRunId, isArtifactType, modelId, } from "./_vendor/ailf-core/index.js";
21
+ import { JudgmentAttributionSchema } from "./adapters/attribution/per-entry-attribution-writer.js";
19
22
  import { AccumulatingArtifactWriter } from "./artifact-capture/accumulating-artifact-writer.js";
20
23
  import { ApiGatewayArtifactWriter } from "./artifact-capture/api-gateway-artifact-writer.js";
21
24
  import { BatchingApiGatewayArtifactWriter } from "./artifact-capture/batching-api-gateway-artifact-writer.js";
@@ -96,10 +99,19 @@ export function createAppContext(config) {
96
99
  // LLM client (D0051) — wired when an API key is present. The grader path
97
100
  // does NOT consume this; D0051 defers grader migration as a follow-up.
98
101
  // Env mapping happens here so `createLLMClient` stays pure and testable.
99
- const llmClient = createLLMClient(config, {
102
+ // D-01: factory hoisted to @sanity/ailf-core; adapter ctors injected here.
103
+ const llmAdapters = {
104
+ newAnthropicClient: (opts) => new AnthropicLLMClient(opts),
105
+ newOpenAIClient: (opts) => new OpenAILLMClient(opts),
106
+ };
107
+ const llmKeys = {
100
108
  anthropicApiKey: process.env.ANTHROPIC_API_KEY,
101
109
  openaiApiKey: process.env.OPENAI_API_KEY,
102
- }, logger);
110
+ };
111
+ const llmClient = createLLMClient(config, llmKeys, {
112
+ logger,
113
+ adapters: llmAdapters,
114
+ });
103
115
  return {
104
116
  artifactWriter,
105
117
  cache,
@@ -117,44 +129,6 @@ export function createAppContext(config) {
117
129
  taskSource,
118
130
  };
119
131
  }
120
- /**
121
- * Select the LLMClient adapter based on `config.llmProvider` and the
122
- * supplied API keys. Returns `undefined` when no usable credential is
123
- * present — `AppContext.llmClient` stays unset and consumers handle that
124
- * explicitly.
125
- *
126
- * Adapters never read `process.env` themselves (per
127
- * `.claude/rules/typescript.md`); env mapping happens at the call site
128
- * (typically `createAppContext`).
129
- *
130
- * Exported for unit-test access; not part of the public package API.
131
- */
132
- export function createLLMClient(config, keys, logger) {
133
- const explicit = config.llmProvider;
134
- const anthropicKey = keys.anthropicApiKey;
135
- const openaiKey = keys.openaiApiKey;
136
- // Auto-select: prefer Anthropic when both are present (matches the
137
- // current grader's default model in `config/models.ts`).
138
- const provider = explicit ?? (anthropicKey ? "anthropic" : openaiKey ? "openai" : undefined);
139
- if (!provider) {
140
- logger.debug("LLM client: not wired — no Anthropic or OpenAI API key supplied");
141
- return undefined;
142
- }
143
- if (provider === "anthropic") {
144
- if (!anthropicKey) {
145
- logger.warn('llmProvider="anthropic" but no Anthropic API key supplied — LLMClient not wired');
146
- return undefined;
147
- }
148
- logger.debug("LLM client: AnthropicLLMClient");
149
- return new AnthropicLLMClient({ apiKey: anthropicKey, logger });
150
- }
151
- if (!openaiKey) {
152
- logger.warn('llmProvider="openai" but no OpenAI API key supplied — LLMClient not wired');
153
- return undefined;
154
- }
155
- logger.debug("LLM client: OpenAILLMClient");
156
- return new OpenAILLMClient({ apiKey: openaiKey, logger });
157
- }
158
132
  // ---------------------------------------------------------------------------
159
133
  // Sub-factories (extracted to keep createAppContext readable)
160
134
  // ---------------------------------------------------------------------------
@@ -541,3 +515,95 @@ export function createBorderlineConsensusRunner(opts) {
541
515
  thresholds: BORDERLINE_SEVERITY_THRESHOLDS,
542
516
  });
543
517
  }
518
+ // ---------------------------------------------------------------------------
519
+ // Diagnosis runner wiring (Plan 05-05 / D0048)
520
+ // ---------------------------------------------------------------------------
521
/**
 * Hand out the diagnosis card registry used when wiring the diagnosis runner.
 *
 * The registry is the `DIAGNOSIS_CARD_GENERATORS` export of
 * `@sanity/ailf-core` (documented upstream as the full 8-card `CardRegistry`).
 * It is deliberately reached through a function rather than a module-level
 * binding, so the composition root stays the single place where the registry
 * is chosen and tests can observe the call site (AI-SPEC §3 Pitfall 1 — no
 * module-scope mutables).
 *
 * @returns The shared `DIAGNOSIS_CARD_GENERATORS` object itself (not a copy).
 */
export function buildDiagnosisRegistry() {
    const registry = DIAGNOSIS_CARD_GENERATORS;
    return registry;
}
530
/**
 * Fallback artifacts root for diagnosis wiring, used when the app config does
 * not set `artifactsDir`. Kept in step with the default that
 * `createArtifactWriter` uses earlier in this module.
 */
const DIAGNOSIS_LOCAL_ARTIFACTS_DIR = ".ailf/results/captures";
534
+ /**
535
+ * Load all per-entry attribution objects for a given `runId` from the local
536
+ * filesystem at `{artifactsDir}/runs/{runId}/attribution/*.json`.
537
+ *
538
+ * Reads every `.json` file in the run's `attribution/` directory, JSON-parses
539
+ * it, and Zod-validates each entry through `JudgmentAttributionSchema` (Phase-4
540
+ * canonical schema — D0045). Malformed entries are skipped with a warning.
541
+ *
542
+ * Returns an empty array when the directory does not exist (expected on runs
543
+ * without Phase-4 attribution data — the runner treats this as Landmine-11
544
+ * "no data" and both attribution cards return `status: "missing"`).
545
+ */
546
+ async function loadAttributionsFromLocalFs(runId, artifactsDir, logger) {
547
+ const attrDir = path.join(artifactsDir, "runs", runId, "attribution");
548
+ let entries;
549
+ try {
550
+ entries = (await fs.readdir(attrDir)).filter((f) => f.endsWith(".json"));
551
+ }
552
+ catch {
553
+ // Directory missing — no attribution data for this run.
554
+ return [];
555
+ }
556
+ const results = [];
557
+ for (const filename of entries) {
558
+ const filePath = path.join(attrDir, filename);
559
+ let raw;
560
+ try {
561
+ const bytes = await fs.readFile(filePath, "utf8");
562
+ raw = JSON.parse(bytes);
563
+ }
564
+ catch (err) {
565
+ logger?.warn("loadAttributions: failed to read/parse attribution file", {
566
+ file: filePath,
567
+ error: err instanceof Error ? err.message : String(err),
568
+ });
569
+ continue;
570
+ }
571
+ const parsed = JudgmentAttributionSchema.safeParse(raw);
572
+ if (!parsed.success) {
573
+ logger?.warn("loadAttributions: attribution file failed schema validation", {
574
+ file: filePath,
575
+ errors: parsed.error.flatten(),
576
+ });
577
+ continue;
578
+ }
579
+ results.push(parsed.data);
580
+ }
581
+ return results;
582
+ }
583
+ /**
584
+ * Build a fully-wired `DiagnosisRunner` from an `AppContext`.
585
+ *
586
+ * Wires the full 8-card registry, `loadAttributions` bound to the local
587
+ * filesystem (Phase-4 per-entry attribution objects at
588
+ * `{artifactsDir}/runs/{runId}/attribution/*.json`), and no-op cache
589
+ * reader/writer (Plan-06 CLI command will wire the real cache seam).
590
+ *
591
+ * Plan-06 API/CLI consumers import this function from the composition root
592
+ * and pass `ctx` from `createAppContext(config)`.
593
+ */
594
+ export function getDiagnosisRunner(ctx) {
595
+ const artifactsDir = ctx.config.artifactsDir ?? DIAGNOSIS_LOCAL_ARTIFACTS_DIR;
596
+ // No-op cache shims — Plan 06 wires the real cache.
597
+ const diagnosisReader = async (_path) => null;
598
+ const diagnosisWriter = async (_path, _diagnosis) => { };
599
+ return createDiagnosisRunner({
600
+ llm: ctx.llmClient,
601
+ model: modelId("anthropic:claude-opus-4-6"),
602
+ logger: ctx.logger,
603
+ progress: ctx.progress,
604
+ registry: buildDiagnosisRegistry(),
605
+ diagnosisReader,
606
+ diagnosisWriter,
607
+ loadAttributions: (runId) => loadAttributionsFromLocalFs(runId, artifactsDir, ctx.logger),
608
+ });
609
+ }
@@ -0,0 +1,318 @@
1
+ /**
2
+ * diagnosis-cards.ts — Diagnosis eval matrix config.
3
+ *
4
+ * TS-first config (per .claude/rules/config.md) defining the 5 LLM card types
5
+ * × 3 first-class models eval matrix. Consumed by
6
+ * `scripts/generate-diagnosis-config.ts` to emit
7
+ * `promptfooconfig-diagnosis.yaml`. Never hand-edit the YAML — run
8
+ * `pnpm generate-configs` instead.
9
+ *
10
+ * Per AI-SPEC §5 and CONTEXT D-04 (path b: standalone generator entry point
11
+ * for the diagnosis config, additive — does not modify the existing literacy
12
+ * generate-configs pipeline).
13
+ *
14
+ * @see packages/eval/scripts/generate-diagnosis-config.ts — generator
15
+ * @see packages/eval/promptfooconfig-diagnosis.yaml — generated output
16
+ */
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Types
20
+ // ---------------------------------------------------------------------------
21
+
22
/**
 * One model row of the diagnosis eval matrix.
 *
 * Shaped like the model entries in `config/models.ts`, so the same provider
 * strings and labels can be used in both configs.
 */
export interface DiagnosisModelEntry {
  /** Promptfoo provider string, e.g. "anthropic:messages:claude-opus-4-6". */
  id: string
  /** Display name shown in reports. */
  label: string
  /** Optional per-model overrides such as temperature or max_tokens. */
  config?: Record<string, unknown>
}
34
+
35
/**
 * Identifiers of the five LLM-driven diagnosis cards covered by this matrix.
 *
 * The deterministic cards (area-summary, failure-mode-summary, no-issues) are
 * intentionally absent: they are exercised by `fixture-matrix.test.ts`
 * (vitest) rather than by the promptfoo matrix.
 */
export type LLMCardType =
  | "top-recommendations"
  | "weakest-area"
  | "low-confidence-attribution"
  | "doc-attribution-spotlight"
  | "regression-vs-baseline"
46
+
47
/**
 * One evaluation scenario: a single report fixture paired with the outcome
 * that is expected from it.
 *
 * `fixturePath` is written relative to `packages/eval/` so the generated
 * promptfoo config resolves it regardless of the working directory, and
 * `expectedStatus` is what the generated YAML asserts on for pass/fail.
 */
export interface DiagnosisScenario {
  /** Short slug; used in promptfoo `description` fields. */
  name: string
  /** Report JSON fixture location, relative to `packages/eval/`. */
  fixturePath: string
  /**
   * The card type the rubric focuses on for this scenario. The matrix still
   * runs every LLM card per scenario; this only marks the primary one.
   */
  primaryCard: LLMCardType
  /** Card status expected when every LLM call succeeds. */
  expectedStatus: "ready" | "degraded" | "missing"
  /** Optional canned LLM response file, used by adversarial scenarios. */
  cannedResponsePath?: string
  /**
   * Optional card id the canned response is keyed against (FakeLLMClient
   * `keyedResponses` in vitest). It is also echoed in the promptfoo scenario
   * description as documentation.
   */
  cannedCardId?: LLMCardType
  /** Optional free-text description of what the scenario covers. */
  note?: string
}
78
+
79
/**
 * Root shape of the diagnosis eval matrix config.
 *
 * A value of this type is the default export of this file, following the
 * same convention as `models.ts`.
 */
export interface DiagnosisCardsConfig {
  /** Every LLM card evaluation scenario in the matrix. */
  scenarios: DiagnosisScenario[]
  /** The models each scenario is run against. */
  models: DiagnosisModelEntry[]
  /** Model used as the grader for LLM-judge assertions. */
  grader: DiagnosisModelEntry
  /** Overall eval budget in milliseconds; acts as a kill switch. */
  evalBudgetMs: number
  /** Upper bound on API calls in flight at once. */
  maxConcurrency: number
  /** Per-model config applied by default. */
  defaults: {
    temperature: number
    max_tokens: number
  }
}
100
+
101
+ // ---------------------------------------------------------------------------
102
+ // Helper
103
+ // ---------------------------------------------------------------------------
104
+
105
/**
 * Identity helper for authoring the diagnosis eval matrix.
 *
 * It exists only so the object literal passed in is type-checked against
 * `DiagnosisCardsConfig`; the argument is handed back as-is (same reference,
 * no copy, no validation, no defaults applied).
 *
 * @param config - The diagnosis matrix definition.
 * @returns The very same `config` object.
 */
export function defineDiagnosisCards(config: DiagnosisCardsConfig): DiagnosisCardsConfig {
  const definition = config
  return definition
}
110
+
111
+ // ---------------------------------------------------------------------------
112
+ // Config definition
113
+ // ---------------------------------------------------------------------------
114
+
115
/**
 * The diagnosis eval matrix: which models are evaluated, which model grades
 * them, the run budget, and the list of fixture-backed scenarios.
 *
 * This object is the file's default export.
 */
const diagnosisCardsConfig: DiagnosisCardsConfig = defineDiagnosisCards({
  // ── Models under evaluation ────────────────────────────────────────────────
  // Three entries. The two Anthropic entries repeat the `defaults` values
  // below; the OpenAI entry sets only `max_completion_tokens` and carries no
  // temperature override.
  models: [
    {
      id: "anthropic:messages:claude-opus-4-6",
      label: "Claude Opus 4.6",
      config: { temperature: 0.2, max_tokens: 4096 },
    },
    {
      id: "anthropic:messages:claude-sonnet-4-6",
      label: "Claude Sonnet 4.6",
      config: { temperature: 0.2, max_tokens: 4096 },
    },
    {
      id: "openai:chat:gpt-5.2",
      label: "GPT 5.2",
      config: { max_completion_tokens: 4096 },
    },
  ],

  // ── Grader model ────────────────────────────────────────────────────────────
  // No `config` overrides are given for the grader.
  grader: {
    id: "anthropic:messages:claude-opus-4-5-20251101",
    label: "Claude Opus 4.5 (grader)",
  },

  // ── Eval budget ─────────────────────────────────────────────────────────────
  // 3_600_000 ms = 60 min for the full matrix (3 models × all scenarios).
  // NOTE(review): earlier wording said "17 scenarios", but the `scenarios`
  // array below has 18 entries — presumably the two grader-major-mismatch
  // entries were counted as one paired case; confirm.
  evalBudgetMs: 3_600_000,
  maxConcurrency: 8, // conservative for diagnosis (longer prompts than literacy)

  // ── Default config ──────────────────────────────────────────────────────────
  defaults: {
    temperature: 0.2,
    max_tokens: 4096,
  },

  // ── Scenarios ───────────────────────────────────────────────────────────────
  // 18 entries in four groups: 8 critical-path (healthy + low report for each
  // of four card types), 6 edge cases, and 4 adversarial canned responses.
  // Several entries deliberately reuse the same report fixture.
  scenarios: [
    // ── Critical-path: top-recommendations ──────────────────────────────────
    {
      name: "healthy-top-recommendations",
      fixturePath:
        "test-fixtures/diagnosis/reports/healthy-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "ready",
      note: "Healthy report (mean 91) — top-recommendations card should produce 2+ actionable suggestions with docSlug references from the manifest.",
    },
    {
      name: "low-top-recommendations",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "ready",
      note: "Low-scoring report (mean 42) — top-recommendations card should produce high-priority suggestions addressing the dominant failure modes (outdated-docs, missing-docs).",
    },

    // ── Critical-path: weakest-area ──────────────────────────────────────────
    {
      name: "healthy-weakest-area",
      fixturePath: "test-fixtures/diagnosis/reports/healthy-weakest-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Healthy report with clear weakest area (content-modeling at 82) — weakest-area card should identify the area and provide high-confidence analysis.",
    },
    {
      name: "low-weakest-area",
      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Low-scoring report with clear weakest area (content-modeling at 28) — weakest-area card should identify the most critical area with multiple failure modes.",
    },

    // ── Critical-path: low-confidence-attribution ────────────────────────────
    {
      name: "healthy-low-confidence-attribution",
      fixturePath:
        "test-fixtures/diagnosis/reports/healthy-low-confidence-attribution.json",
      primaryCard: "low-confidence-attribution",
      expectedStatus: "ready",
      note: "Healthy report with small sample sizes (2-3 judgments per area) — low-confidence-attribution card should identify attribution uncertainty despite positive scores.",
    },
    {
      name: "low-low-confidence-attribution",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-low-confidence-attribution.json",
      primaryCard: "low-confidence-attribution",
      expectedStatus: "ready",
      note: "Low-scoring report with small sample sizes (2 judgments per area) — low-confidence-attribution card should flag both score quality and attribution uncertainty.",
    },

    // ── Critical-path: doc-attribution-spotlight ─────────────────────────────
    {
      name: "healthy-doc-attribution-spotlight",
      fixturePath:
        "test-fixtures/diagnosis/reports/healthy-doc-attribution-spotlight.json",
      primaryCard: "doc-attribution-spotlight",
      expectedStatus: "ready",
      note: "Healthy 5-area report — doc-attribution-spotlight card should identify the highest-impact document in the manifest.",
    },
    {
      name: "low-doc-attribution-spotlight",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-doc-attribution-spotlight.json",
      primaryCard: "doc-attribution-spotlight",
      expectedStatus: "ready",
      note: "Low-scoring 5-area report with multiple failure modes — doc-attribution-spotlight card should identify the most critical document.",
    },

    // ── Edge cases ───────────────────────────────────────────────────────────
    // Labelled (a)–(e) in the notes; case (d) spans two entries that are meant
    // to be run as a baseline/current pair.
    {
      name: "empty-report",
      fixturePath: "test-fixtures/diagnosis/reports/empty.json",
      primaryCard: "top-recommendations",
      expectedStatus: "missing",
      note: "Edge case (a): zero-area report — all LLM cards should emit status: missing (no data to reason about).",
    },
    {
      name: "single-judgment-per-area",
      fixturePath:
        "test-fixtures/diagnosis/reports/single-judgment-per-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Edge case (b): single-judgment sample size — weakest-area card should reflect low-confidence calibration (sampleSize: 1).",
    },
    {
      name: "all-areas-tied",
      fixturePath: "test-fixtures/diagnosis/reports/all-areas-tied.json",
      primaryCard: "weakest-area",
      expectedStatus: "missing",
      note: "Edge case (c): all areas scored identically (70) — weakest-area card should emit status: missing with reason: no-clear-weakest.",
    },
    {
      name: "grader-major-mismatch-baseline",
      fixturePath:
        "test-fixtures/diagnosis/reports/grader-major-mismatch-baseline.json",
      primaryCard: "regression-vs-baseline",
      expectedStatus: "missing",
      note: "Edge case (d): grader-major-version mismatch — regression-vs-baseline should emit missing with reason: grader-major-version-mismatch. Run as pair with grader-major-mismatch-current.",
    },
    {
      name: "grader-major-mismatch-current",
      fixturePath:
        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
      primaryCard: "regression-vs-baseline",
      expectedStatus: "missing",
      note: "Edge case (d) pair: current report with different graderModel — regression-vs-baseline mismatch guard triggers when paired with grader-major-mismatch-baseline.",
    },
    {
      name: "near-deprecated-taxonomy",
      fixturePath:
        "test-fixtures/diagnosis/reports/near-deprecated-taxonomy.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Edge case (e): report using unclassified failure mode (currently canonical but watch for taxonomy retirement). Zod refine() must accept canonical modes.",
    },

    // ── Adversarial canned responses ─────────────────────────────────────────
    // Each entry pairs an existing report fixture with a canned LLM response
    // and expects the targeted card (`cannedCardId`) to come back "degraded".
    {
      name: "adversarial-fabricated-delta",
      fixturePath:
        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
      primaryCard: "regression-vs-baseline",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/fabricated-delta-regression.json",
      cannedCardId: "regression-vs-baseline",
      note: "Adversarial: fabricated delta (AI-SPEC §1b failure-mode #1). LLM claims -7.3 delta; direction-sign refine triggers degraded card.",
    },
    {
      name: "adversarial-improve-introduction",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/improve-introduction.json",
      cannedCardId: "top-recommendations",
      note: "Adversarial: generic anti-pattern recommendation (AI-SPEC §1b failure-mode #2). Actionability refine triggers degraded card.",
    },
    {
      name: "adversarial-hallucinated-docslug",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/hallucinated-docslug.json",
      cannedCardId: "top-recommendations",
      note: "Adversarial: hallucinated docSlug (AI-SPEC §1b failure-mode #3). Allow-list refine triggers degraded card.",
    },
    {
      name: "adversarial-taxonomy-drift",
      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/taxonomy-drift.json",
      cannedCardId: "weakest-area",
      note: "Adversarial: taxonomy drift (AI-SPEC §1b failure-mode #4). Per-dimension failureMode refine triggers degraded card.",
    },
  ],
})
317
+
318
+ export default diagnosisCardsConfig
@@ -24,6 +24,18 @@ export default defineModels({
24
24
  // All literacy variants included by default (baseline, observed,
25
25
  // agentic-naive, agentic-optimized)
26
26
  },
27
+ {
28
+ // Phase 5 LLM card routing (D-07). AI-SPEC §4 routes 3 routine cards
29
+ // (top-recommendations, weakest-area, regression-vs-baseline) here.
30
+ // Pricing already in AnthropicLLMClient; baseline literacy variant only.
31
+ id: "anthropic:messages:claude-sonnet-4-6",
32
+ label: "Claude Sonnet 4.6",
33
+ config: { temperature: 0.2, max_tokens: 4096 },
34
+ modes: ["literacy"],
35
+ variants: {
36
+ literacy: ["baseline"],
37
+ },
38
+ },
27
39
 
28
40
  // ── Google ─────────────────────────────────────────────────
29
41
  // {
@@ -1,14 +1,9 @@
1
1
  /**
2
- * Agent-harness failure modes — valid for the `agent-harness` dimension
3
- * family (process-quality, agent-output, tool-usage).
2
+ * Agent-harness failure modes — re-export shim (D-05).
4
3
  *
5
- * Phase 3 GRAD-03 (Plan 03-02). Agent-harness failures track how an agent
6
- * uses tools and handles multi-step processes; the v0 modes are tool-misuse,
7
- * chaotic-process (no plan), and missing-recovery (doesn't recover from
8
- * tool errors).
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
9
6
  *
10
- * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
11
- * §"Per-dimension failure-mode taxonomies" (lines 239-283).
7
+ * @see packages/core/src/grader/failure-modes/agent-harness.ts
12
8
  */
13
- export declare const AGENT_FAILURE_MODES: readonly ["tool-misuse", "chaotic-process", "missing-recovery"];
14
- export type AgentFailureMode = (typeof AGENT_FAILURE_MODES)[number];
9
+ export { AGENT_FAILURE_MODES, type AgentFailureMode } from "../_vendor/ailf-core/index.d.ts";
@@ -1,17 +1,9 @@
1
1
  /**
2
- * Agent-harness failure modes — valid for the `agent-harness` dimension
3
- * family (process-quality, agent-output, tool-usage).
2
+ * Agent-harness failure modes — re-export shim (D-05).
4
3
  *
5
- * Phase 3 GRAD-03 (Plan 03-02). Agent-harness failures track how an agent
6
- * uses tools and handles multi-step processes; the v0 modes are tool-misuse,
7
- * chaotic-process (no plan), and missing-recovery (doesn't recover from
8
- * tool errors).
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
9
6
  *
10
- * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
11
- * §"Per-dimension failure-mode taxonomies" (lines 239-283).
7
+ * @see packages/core/src/grader/failure-modes/agent-harness.ts
12
8
  */
13
- export const AGENT_FAILURE_MODES = [
14
- "tool-misuse", // assistant calls tools incorrectly or with wrong args
15
- "chaotic-process", // assistant flails — undirected exploration, no plan
16
- "missing-recovery", // assistant doesn't recover from a tool error
17
- ];
9
+ export { AGENT_FAILURE_MODES } from "../_vendor/ailf-core/index.js";
@@ -1,17 +1,9 @@
1
1
  /**
2
- * Cross-cutting failure modes — valid for any dimension family.
2
+ * Cross-cutting failure modes — re-export shim (D-05).
3
3
  *
4
- * Phase 3 GRAD-03 (Plan 03-02). The four cross-cutting modes capture failures
5
- * that aren't tied to a specific dimension family: infrastructure failures,
6
- * model ceiling effects, false-floor (model already knew the answer; docs
7
- * added no value), and the low-confidence fallback. The per-dimension
8
- * taxonomies (literacy, MCP, knowledge-probe, agent-harness) extend this
9
- * cross-cutting list.
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
10
6
  *
11
- * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
12
- * §"Per-dimension failure-mode taxonomies" (lines 239-283 — the v0 lists)
13
- * @see docs/decisions/D0005-grader-model-separation.md — single grader model;
14
- * taxonomies travel with the rubric prompt for reproducibility.
7
+ * @see packages/core/src/grader/failure-modes/common.ts
15
8
  */
16
- export declare const COMMON_FAILURE_MODES: readonly ["api-error", "model-limitation", "false-floor", "unclassified"];
17
- export type CommonFailureMode = (typeof COMMON_FAILURE_MODES)[number];
9
+ export { COMMON_FAILURE_MODES, type CommonFailureMode } from "../_vendor/ailf-core/index.d.ts";
@@ -1,21 +1,9 @@
1
1
  /**
2
- * Cross-cutting failure modes — valid for any dimension family.
2
+ * Cross-cutting failure modes — re-export shim (D-05).
3
3
  *
4
- * Phase 3 GRAD-03 (Plan 03-02). The four cross-cutting modes capture failures
5
- * that aren't tied to a specific dimension family: infrastructure failures,
6
- * model ceiling effects, false-floor (model already knew the answer; docs
7
- * added no value), and the low-confidence fallback. The per-dimension
8
- * taxonomies (literacy, MCP, knowledge-probe, agent-harness) extend this
9
- * cross-cutting list.
4
+ * Canonical data relocated to @sanity/ailf-core.
5
+ * Existing callers of this file continue to work unchanged.
10
6
  *
11
- * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
12
- * §"Per-dimension failure-mode taxonomies" (lines 239-283 — the v0 lists)
13
- * @see docs/decisions/D0005-grader-model-separation.md — single grader model;
14
- * taxonomies travel with the rubric prompt for reproducibility.
7
+ * @see packages/core/src/grader/failure-modes/common.ts
15
8
  */
16
- export const COMMON_FAILURE_MODES = [
17
- "api-error", // infrastructure failure, not a docs problem
18
- "model-limitation", // high ceiling, model can't reach it
19
- "false-floor", // model already knew the answer; docs added no value
20
- "unclassified", // grader could not pick a mode (low-confidence fallback)
21
- ];
9
+ export { COMMON_FAILURE_MODES } from "../_vendor/ailf-core/index.js";
@@ -1,38 +1,24 @@
1
1
  /**
2
2
  * Per-dimension failure-mode taxonomy barrel.
3
3
  *
4
+ * D-05: taxonomy data relocated to @sanity/ailf-core so card files in
5
+ * packages/core/src/services/diagnosis/cards/ can import without violating
6
+ * the core→eval import direction rule.
7
+ *
8
+ * This file is now a re-export shim — all behavior lives in
9
+ * packages/core/src/grader/failure-modes/. Existing eval-side callers
10
+ * (rubrics.ts, rubric-resolution.ts, calibration.test.ts) continue to
11
+ * work with zero source changes.
12
+ *
4
13
  * Named re-exports only (W0124 — never `export *`).
5
14
  *
6
15
  * Consumers:
7
- * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()` to
8
- * stamp a per-template legal-mode list onto every rubric template entry.
9
- * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts` — reads
10
- * `template.failureModes` at prompt-assembly time and announces the legal
11
- * modes to the grader before the structured-shape footer (Plan 03-01).
12
- * - `packages/eval/src/grader/__tests__/calibration.test.ts` — fixture-driven
13
- * ≥90% non-`unclassified` static calibration check (ROADMAP success
14
- * criterion 1).
16
+ * - `packages/eval/config/rubrics.ts` — calls `failureModesForDimension()`
17
+ * - `packages/eval/src/pipeline/compiler/rubric-resolution.ts`
18
+ * - `packages/eval/src/grader/__tests__/calibration.test.ts`
15
19
  *
20
+ * @see packages/core/src/grader/failure-modes/index.ts — canonical location
16
21
  * @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
17
- * §"Per-dimension failure-mode taxonomies" (lines 239-283).
18
- * @see docs/decisions/D0005-grader-model-separation.md — single grader model;
19
- * taxonomies travel with the rubric prompt for reproducibility.
20
- */
21
- export { COMMON_FAILURE_MODES, type CommonFailureMode } from "./common.js";
22
- export { LITERACY_FAILURE_MODES, type LiteracyFailureMode } from "./literacy.js";
23
- export { MCP_FAILURE_MODES, type MCPFailureMode } from "./mcp.js";
24
- export { KP_FAILURE_MODES, type KPFailureMode } from "./knowledge-probe.js";
25
- export { AGENT_FAILURE_MODES, type AgentFailureMode } from "./agent-harness.js";
26
- /**
27
- * Return the legal failure-mode list for a given rubric dimension.
28
- *
29
- * Accepts both family-level keys (`mcp-behavior`, `knowledge-probe`,
30
- * `agent-harness`) and the per-template `dimension` strings used in
31
- * `config/rubrics.ts` (`task-completion`, `input-validation`,
32
- * `factual-correctness`, `process-quality`, …). The cross-cutting
33
- * `COMMON_FAILURE_MODES` is always included.
34
- *
35
- * Unknown dimensions fall through to `COMMON_FAILURE_MODES` only — safe
36
- * default, the grader can still pick `unclassified`.
22
+ * @see docs/decisions/D0005-grader-model-separation.md
37
23
  */
38
- export declare function failureModesForDimension(dimension: string): readonly string[];
24
+ export { AGENT_FAILURE_MODES, CANONICAL_DIMENSIONS, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, failureModesForDimension, isCanonicalFailureMode, type AgentFailureMode, type CommonFailureMode, type KPFailureMode, type LiteracyFailureMode, type MCPFailureMode, } from "../_vendor/ailf-core/index.d.ts";