@sanity/ailf 5.0.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +112 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +1 -1
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/package.json +2 -2
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* diagnosis-cards.ts — Diagnosis eval matrix config.
|
|
3
|
+
*
|
|
4
|
+
* TS-first config (per .claude/rules/config.md) defining the 5 LLM card types
|
|
5
|
+
* × 3 first-class models eval matrix. Consumed by
|
|
6
|
+
* `scripts/generate-diagnosis-config.ts` to emit
|
|
7
|
+
* `promptfooconfig-diagnosis.yaml`. Never hand-edit the YAML — run
|
|
8
|
+
* `pnpm generate-configs` instead.
|
|
9
|
+
*
|
|
10
|
+
* Per AI-SPEC §5 and CONTEXT D-04 (path b: standalone generator entry point
|
|
11
|
+
* for the diagnosis config, additive — does not modify the existing literacy
|
|
12
|
+
* generate-configs pipeline).
|
|
13
|
+
*
|
|
14
|
+
* @see packages/eval/scripts/generate-diagnosis-config.ts — generator
|
|
15
|
+
* @see packages/eval/promptfooconfig-diagnosis.yaml — generated output
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Types
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
/**
 * One model column of the diagnosis eval matrix.
 *
 * Shaped like the model entries declared in `config/models.ts`.
 */
export interface DiagnosisModelEntry {
  /** Promptfoo provider identifier, e.g. "anthropic:messages:claude-opus-4-6". */
  id: string
  /** Display name used in reports. */
  label: string
  /** Optional settings applied to this model only (temperature, max_tokens, ...). */
  config?: Record<string, unknown>
}
|
|
34
|
+
|
|
35
|
+
/**
 * Identifiers of the five LLM-backed diagnosis cards covered by this matrix.
 *
 * The deterministic cards (area-summary, failure-mode-summary, no-issues) are
 * not listed here: they are exercised by `fixture-matrix.test.ts` under vitest
 * rather than through the promptfoo matrix.
 */
export type LLMCardType =
  | "top-recommendations"
  | "weakest-area"
  | "low-confidence-attribution"
  | "doc-attribution-spotlight"
  | "regression-vs-baseline"
|
|
46
|
+
|
|
47
|
+
/**
 * One row of the eval matrix: a report fixture paired with the outcome it is
 * expected to produce.
 *
 * `fixturePath` is expressed relative to `packages/eval/` so the promptfoo
 * config resolves it regardless of the working directory. `expectedStatus`
 * is what the pass/fail assertion in the generated YAML checks against.
 */
export interface DiagnosisScenario {
  /** Short slug that ends up in promptfoo `description` fields. */
  name: string
  /** Location of the Report JSON fixture, relative to `packages/eval/`. */
  fixturePath: string
  /**
   * The card type the rubric focuses on for this scenario. Every LLM card
   * still runs for every scenario; this only marks the primary one.
   */
  primaryCard: LLMCardType
  /** Card status expected when all LLM calls succeed. */
  expectedStatus: "ready" | "degraded" | "missing"
  /** Optional path to a canned LLM response, used by adversarial scenarios. */
  cannedResponsePath?: string
  /**
   * Optional cardId the canned response is keyed against (FakeLLMClient
   * keyedResponses in vitest). It is also mirrored into the promptfoo
   * scenario description for documentation purposes.
   */
  cannedCardId?: LLMCardType
  /** Free-form description of what the scenario is meant to test. */
  note?: string
}
|
|
78
|
+
|
|
79
|
+
/**
 * Shape of the whole diagnosis eval matrix configuration.
 *
 * A value of this shape is this file's default export, following the same
 * convention as models.ts.
 */
export interface DiagnosisCardsConfig {
  /** Every LLM card evaluation scenario in the matrix. */
  scenarios: DiagnosisScenario[]
  /** The models each scenario is run against. */
  models: DiagnosisModelEntry[]
  /** The model used for LLM-judge assertions. */
  grader: DiagnosisModelEntry
  /** Kill-switch budget for the eval, in milliseconds. */
  evalBudgetMs: number
  /** Upper bound on parallel API calls. */
  maxConcurrency: number
  /** Per-model settings used by default. */
  defaults: {
    temperature: number
    max_tokens: number
  }
}
|
|
100
|
+
|
|
101
|
+
// ---------------------------------------------------------------------------
|
|
102
|
+
// Helper
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
/**
 * Identity helper for authoring the diagnosis matrix config.
 *
 * Performs no validation or transformation at runtime: the argument is handed
 * back unchanged. Its only effect is that the literal passed in is
 * type-checked against `DiagnosisCardsConfig`.
 *
 * @param config - The matrix configuration.
 * @returns The same `config` object that was passed in.
 */
export function defineDiagnosisCards(config: DiagnosisCardsConfig): DiagnosisCardsConfig {
  return config
}
|
|
110
|
+
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
// Config definition
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
// The diagnosis eval matrix: models, grader, budget, defaults and scenarios.
// The type comes from defineDiagnosisCards' return type (DiagnosisCardsConfig).
const diagnosisCardsConfig = defineDiagnosisCards({
  // ── Models under evaluation ────────────────────────────────────────────────
  models: [
    {
      id: "anthropic:messages:claude-opus-4-6",
      label: "Claude Opus 4.6",
      config: { temperature: 0.2, max_tokens: 4096 },
    },
    {
      id: "anthropic:messages:claude-sonnet-4-6",
      label: "Claude Sonnet 4.6",
      config: { temperature: 0.2, max_tokens: 4096 },
    },
    {
      // This entry sets max_completion_tokens (not max_tokens) and no temperature.
      id: "openai:chat:gpt-5.2",
      label: "GPT 5.2",
      config: { max_completion_tokens: 4096 },
    },
  ],

  // ── Grader model ────────────────────────────────────────────────────────────
  grader: {
    id: "anthropic:messages:claude-opus-4-5-20251101",
    label: "Claude Opus 4.5 (grader)",
  },

  // ── Eval budget ─────────────────────────────────────────────────────────────
  // 60 min (3,600,000 ms) — full matrix across 3 models × 17 scenarios.
  evalBudgetMs: 60 * 60 * 1000,
  // Conservative for diagnosis (longer prompts than literacy).
  maxConcurrency: 8,

  // ── Default config ──────────────────────────────────────────────────────────
  defaults: {
    temperature: 0.2,
    max_tokens: 4096,
  },

  // ── Scenarios (17 logical fixtures) ─────────────────────────────────────────
  // NOTE(review): the array below holds 18 entries; "17 logical" presumably
  // counts the grader-major-mismatch baseline/current pair once — confirm.
  scenarios: [
    // ── Critical-path: top-recommendations ──────────────────────────────────
    {
      name: "healthy-top-recommendations",
      fixturePath:
        "test-fixtures/diagnosis/reports/healthy-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "ready",
      note: "Healthy report (mean 91) — top-recommendations card should produce 2+ actionable suggestions with docSlug references from the manifest.",
    },
    {
      name: "low-top-recommendations",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "ready",
      note: "Low-scoring report (mean 42) — top-recommendations card should produce high-priority suggestions addressing the dominant failure modes (outdated-docs, missing-docs).",
    },

    // ── Critical-path: weakest-area ──────────────────────────────────────────
    {
      name: "healthy-weakest-area",
      fixturePath: "test-fixtures/diagnosis/reports/healthy-weakest-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Healthy report with clear weakest area (content-modeling at 82) — weakest-area card should identify the area and provide high-confidence analysis.",
    },
    {
      name: "low-weakest-area",
      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Low-scoring report with clear weakest area (content-modeling at 28) — weakest-area card should identify the most critical area with multiple failure modes.",
    },

    // ── Critical-path: low-confidence-attribution ────────────────────────────
    {
      name: "healthy-low-confidence-attribution",
      fixturePath:
        "test-fixtures/diagnosis/reports/healthy-low-confidence-attribution.json",
      primaryCard: "low-confidence-attribution",
      expectedStatus: "ready",
      note: "Healthy report with small sample sizes (2-3 judgments per area) — low-confidence-attribution card should identify attribution uncertainty despite positive scores.",
    },
    {
      name: "low-low-confidence-attribution",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-low-confidence-attribution.json",
      primaryCard: "low-confidence-attribution",
      expectedStatus: "ready",
      note: "Low-scoring report with small sample sizes (2 judgments per area) — low-confidence-attribution card should flag both score quality and attribution uncertainty.",
    },

    // ── Critical-path: doc-attribution-spotlight ─────────────────────────────
    {
      name: "healthy-doc-attribution-spotlight",
      fixturePath:
        "test-fixtures/diagnosis/reports/healthy-doc-attribution-spotlight.json",
      primaryCard: "doc-attribution-spotlight",
      expectedStatus: "ready",
      note: "Healthy 5-area report — doc-attribution-spotlight card should identify the highest-impact document in the manifest.",
    },
    {
      name: "low-doc-attribution-spotlight",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-doc-attribution-spotlight.json",
      primaryCard: "doc-attribution-spotlight",
      expectedStatus: "ready",
      note: "Low-scoring 5-area report with multiple failure modes — doc-attribution-spotlight card should identify the most critical document.",
    },

    // ── Edge cases ───────────────────────────────────────────────────────────
    {
      name: "empty-report",
      fixturePath: "test-fixtures/diagnosis/reports/empty.json",
      primaryCard: "top-recommendations",
      expectedStatus: "missing",
      note: "Edge case (a): zero-area report — all LLM cards should emit status: missing (no data to reason about).",
    },
    {
      name: "single-judgment-per-area",
      fixturePath:
        "test-fixtures/diagnosis/reports/single-judgment-per-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Edge case (b): single-judgment sample size — weakest-area card should reflect low-confidence calibration (sampleSize: 1).",
    },
    {
      name: "all-areas-tied",
      fixturePath: "test-fixtures/diagnosis/reports/all-areas-tied.json",
      primaryCard: "weakest-area",
      expectedStatus: "missing",
      note: "Edge case (c): all areas scored identically (70) — weakest-area card should emit status: missing with reason: no-clear-weakest.",
    },
    {
      name: "grader-major-mismatch-baseline",
      fixturePath:
        "test-fixtures/diagnosis/reports/grader-major-mismatch-baseline.json",
      primaryCard: "regression-vs-baseline",
      expectedStatus: "missing",
      note: "Edge case (d): grader-major-version mismatch — regression-vs-baseline should emit missing with reason: grader-major-version-mismatch. Run as pair with grader-major-mismatch-current.",
    },
    {
      name: "grader-major-mismatch-current",
      fixturePath:
        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
      primaryCard: "regression-vs-baseline",
      expectedStatus: "missing",
      note: "Edge case (d) pair: current report with different graderModel — regression-vs-baseline mismatch guard triggers when paired with grader-major-mismatch-baseline.",
    },
    {
      name: "near-deprecated-taxonomy",
      fixturePath:
        "test-fixtures/diagnosis/reports/near-deprecated-taxonomy.json",
      primaryCard: "weakest-area",
      expectedStatus: "ready",
      note: "Edge case (e): report using unclassified failure mode (currently canonical but watch for taxonomy retirement). Zod refine() must accept canonical modes.",
    },

    // ── Adversarial canned responses ─────────────────────────────────────────
    // Each entry reuses a report fixture listed above and pairs it with a
    // canned response keyed to the card named in cannedCardId.
    {
      name: "adversarial-fabricated-delta",
      fixturePath:
        "test-fixtures/diagnosis/reports/grader-major-mismatch-current.json",
      primaryCard: "regression-vs-baseline",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/fabricated-delta-regression.json",
      cannedCardId: "regression-vs-baseline",
      note: "Adversarial: fabricated delta (AI-SPEC §1b failure-mode #1). LLM claims -7.3 delta; direction-sign refine triggers degraded card.",
    },
    {
      name: "adversarial-improve-introduction",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/improve-introduction.json",
      cannedCardId: "top-recommendations",
      note: "Adversarial: generic anti-pattern recommendation (AI-SPEC §1b failure-mode #2). Actionability refine triggers degraded card.",
    },
    {
      name: "adversarial-hallucinated-docslug",
      fixturePath:
        "test-fixtures/diagnosis/reports/low-top-recommendations.json",
      primaryCard: "top-recommendations",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/hallucinated-docslug.json",
      cannedCardId: "top-recommendations",
      note: "Adversarial: hallucinated docSlug (AI-SPEC §1b failure-mode #3). Allow-list refine triggers degraded card.",
    },
    {
      name: "adversarial-taxonomy-drift",
      fixturePath: "test-fixtures/diagnosis/reports/low-weakest-area.json",
      primaryCard: "weakest-area",
      expectedStatus: "degraded",
      cannedResponsePath:
        "test-fixtures/diagnosis/canned-responses/taxonomy-drift.json",
      cannedCardId: "weakest-area",
      note: "Adversarial: taxonomy drift (AI-SPEC §1b failure-mode #4). Per-dimension failureMode refine triggers degraded card.",
    },
  ],
})
|
|
317
|
+
|
|
318
|
+
export default diagnosisCardsConfig
|
package/config/models.ts
CHANGED
|
@@ -24,6 +24,18 @@ export default defineModels({
|
|
|
24
24
|
// All literacy variants included by default (baseline, observed,
|
|
25
25
|
// agentic-naive, agentic-optimized)
|
|
26
26
|
},
|
|
27
|
+
{
|
|
28
|
+
// Phase 5 LLM card routing (D-07). AI-SPEC §4 routes 3 routine cards
|
|
29
|
+
// (top-recommendations, weakest-area, regression-vs-baseline) here.
|
|
30
|
+
// Pricing already in AnthropicLLMClient; baseline literacy variant only.
|
|
31
|
+
id: "anthropic:messages:claude-sonnet-4-6",
|
|
32
|
+
label: "Claude Sonnet 4.6",
|
|
33
|
+
config: { temperature: 0.2, max_tokens: 4096 },
|
|
34
|
+
modes: ["literacy"],
|
|
35
|
+
variants: {
|
|
36
|
+
literacy: ["baseline"],
|
|
37
|
+
},
|
|
38
|
+
},
|
|
27
39
|
|
|
28
40
|
// ── Google ─────────────────────────────────────────────────
|
|
29
41
|
// {
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent-harness failure modes — valid for the `agent-harness` dimension
|
|
3
|
+
* family (process-quality, agent-output, tool-usage).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/agent-harness.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/** Ordered tuple of the failure-mode slugs specific to the agent-harness family. */
export declare const AGENT_FAILURE_MODES: readonly [
    "tool-misuse",
    "chaotic-process",
    "missing-recovery"
];
/** Union of the string literals contained in `AGENT_FAILURE_MODES`. */
export type AgentFailureMode = (typeof AGENT_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent-harness failure modes — valid for the `agent-harness` dimension
|
|
3
|
+
* family (process-quality, agent-output, tool-usage).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/agent-harness.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/** Failure-mode slugs specific to the agent-harness dimension family. */
export const AGENT_FAILURE_MODES = [
    // The assistant calls tools incorrectly or with the wrong arguments.
    "tool-misuse",
    // The assistant flails: undirected exploration with no plan.
    "chaotic-process",
    // The assistant does not recover after a tool error.
    "missing-recovery",
];
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-cutting failure modes — valid for any dimension family.
|
|
3
|
+
*
|
|
4
|
+
* Relocated from packages/eval/src/grader/common.ts to @sanity/ailf-core
|
|
5
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
6
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
7
|
+
*
|
|
8
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
9
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283 — the v0 lists)
|
|
10
|
+
* @see docs/decisions/D0005-grader-model-separation.md — single grader model;
|
|
11
|
+
* taxonomies travel with the rubric prompt for reproducibility.
|
|
12
|
+
*/
|
|
13
|
+
/** Ordered tuple of the cross-cutting failure-mode slugs. */
export declare const COMMON_FAILURE_MODES: readonly [
    "api-error",
    "model-limitation",
    "false-floor",
    "unclassified"
];
/** Union of the string literals contained in `COMMON_FAILURE_MODES`. */
export type CommonFailureMode = (typeof COMMON_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-cutting failure modes — valid for any dimension family.
|
|
3
|
+
*
|
|
4
|
+
* Relocated from packages/eval/src/grader/common.ts to @sanity/ailf-core
|
|
5
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
6
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
7
|
+
*
|
|
8
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
9
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283 — the v0 lists)
|
|
10
|
+
* @see docs/decisions/D0005-grader-model-separation.md — single grader model;
|
|
11
|
+
* taxonomies travel with the rubric prompt for reproducibility.
|
|
12
|
+
*/
|
|
13
|
+
/** Cross-cutting failure-mode slugs, valid for any dimension family. */
export const COMMON_FAILURE_MODES = [
    // Infrastructure failure, not a docs problem.
    "api-error",
    // High ceiling that the model cannot reach.
    "model-limitation",
    // The model already knew the answer; the docs added no value.
    "false-floor",
    // The grader could not pick a mode (low-confidence fallback).
    "unclassified",
];
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-dimension failure-mode taxonomy barrel — relocated to @sanity/ailf-core.
|
|
3
|
+
*
|
|
4
|
+
* Previously lived in packages/eval/src/grader/index.ts. Moved here (D-05)
|
|
5
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
6
|
+
* `failureModesForDimension` without violating the core→eval import rule.
|
|
7
|
+
*
|
|
8
|
+
* Named re-exports only (W0124 — never `export *`).
|
|
9
|
+
*
|
|
10
|
+
* The eval-side packages/eval/src/grader/index.ts now re-exports from here,
|
|
11
|
+
* keeping all existing eval-side callers working without source changes.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
14
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
15
|
+
* @see docs/decisions/D0005-grader-model-separation.md — single grader model;
|
|
16
|
+
* taxonomies travel with the rubric prompt for reproducibility.
|
|
17
|
+
*/
|
|
18
|
+
export { COMMON_FAILURE_MODES, type CommonFailureMode } from "./common.js";
|
|
19
|
+
export { LITERACY_FAILURE_MODES, type LiteracyFailureMode } from "./literacy.js";
|
|
20
|
+
export { MCP_FAILURE_MODES, type MCPFailureMode } from "./mcp.js";
|
|
21
|
+
export { KP_FAILURE_MODES, type KPFailureMode } from "./knowledge-probe.js";
|
|
22
|
+
export { AGENT_FAILURE_MODES, type AgentFailureMode } from "./agent-harness.js";
|
|
23
|
+
/**
 * Look up the failure modes that are legal for a rubric dimension.
 *
 * The argument may be a family-level key (`mcp-behavior`, `knowledge-probe`,
 * `agent-harness`) or one of the per-template `dimension` strings used in
 * `config/rubrics.ts` (`task-completion`, `input-validation`,
 * `factual-correctness`, `process-quality`, …). The cross-cutting
 * `COMMON_FAILURE_MODES` are part of every result.
 *
 * A dimension that is not recognized yields `COMMON_FAILURE_MODES` only,
 * which is a safe default because the grader can still pick `unclassified`.
 *
 * @param dimension - Family key or per-template dimension string.
 * @returns The failure-mode slugs valid for that dimension.
 */
export declare function failureModesForDimension(
    dimension: string
): readonly string[];
|
|
36
|
+
/**
 * Every dimension name that `failureModesForDimension` recognizes, as one
 * flat list. Consumed by card-validators and test calibration.
 */
export declare const CANONICAL_DIMENSIONS: ReadonlyArray<string>;
|
|
41
|
+
/**
 * Test whether a failure mode is canonical.
 *
 * @param mode - Failure-mode slug to check.
 * @returns `true` when `mode` appears in the failure-mode list of any
 *   canonical dimension family.
 */
export declare function isCanonicalFailureMode(
    mode: string
): boolean;
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-dimension failure-mode taxonomy barrel — relocated to @sanity/ailf-core.
|
|
3
|
+
*
|
|
4
|
+
* Previously lived in packages/eval/src/grader/index.ts. Moved here (D-05)
|
|
5
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
6
|
+
* `failureModesForDimension` without violating the core→eval import rule.
|
|
7
|
+
*
|
|
8
|
+
* Named re-exports only (W0124 — never `export *`).
|
|
9
|
+
*
|
|
10
|
+
* The eval-side packages/eval/src/grader/index.ts now re-exports from here,
|
|
11
|
+
* keeping all existing eval-side callers working without source changes.
|
|
12
|
+
*
|
|
13
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
14
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
15
|
+
* @see docs/decisions/D0005-grader-model-separation.md — single grader model;
|
|
16
|
+
* taxonomies travel with the rubric prompt for reproducibility.
|
|
17
|
+
*/
|
|
18
|
+
export { COMMON_FAILURE_MODES } from "./common.js";
|
|
19
|
+
export { LITERACY_FAILURE_MODES } from "./literacy.js";
|
|
20
|
+
export { MCP_FAILURE_MODES } from "./mcp.js";
|
|
21
|
+
export { KP_FAILURE_MODES } from "./knowledge-probe.js";
|
|
22
|
+
export { AGENT_FAILURE_MODES } from "./agent-harness.js";
|
|
23
|
+
import { COMMON_FAILURE_MODES } from "./common.js";
|
|
24
|
+
import { LITERACY_FAILURE_MODES } from "./literacy.js";
|
|
25
|
+
import { MCP_FAILURE_MODES } from "./mcp.js";
|
|
26
|
+
import { KP_FAILURE_MODES } from "./knowledge-probe.js";
|
|
27
|
+
import { AGENT_FAILURE_MODES } from "./agent-harness.js";
|
|
28
|
+
/**
 * Return the legal failure-mode list for a given rubric dimension.
 *
 * Accepts both family-level keys (`mcp-behavior`, `knowledge-probe`,
 * `agent-harness`) and the per-template `dimension` strings used in
 * `config/rubrics.ts` (`task-completion`, `input-validation`,
 * `factual-correctness`, `process-quality`, …). The cross-cutting
 * `COMMON_FAILURE_MODES` always come first in the result, followed by the
 * family-specific modes.
 *
 * Unknown dimensions fall through to the common modes only — safe default,
 * the grader can still pick `unclassified`.
 *
 * Every call returns a newly allocated array, so a caller that mutates the
 * result cannot alter the shared module-level taxonomy constants.
 *
 * @param {string} dimension Family key or per-template dimension string.
 * @returns {string[]} Failure-mode slugs valid for `dimension`.
 */
export function failureModesForDimension(dimension) {
    switch (dimension) {
        // ── Literacy family ──────────────────────────────────────
        case "task-completion":
        case "code-correctness":
        case "doc-coverage":
            return [...COMMON_FAILURE_MODES, ...LITERACY_FAILURE_MODES];
        // ── MCP family ───────────────────────────────────────────
        case "mcp-behavior":
        case "input-validation":
        case "output-correctness":
        case "error-handling":
        case "security":
            return [...COMMON_FAILURE_MODES, ...MCP_FAILURE_MODES];
        // ── Knowledge-probe family ───────────────────────────────
        case "knowledge-probe":
        case "factual-correctness":
        case "completeness":
        case "currency":
            return [...COMMON_FAILURE_MODES, ...KP_FAILURE_MODES];
        // ── Agent-harness family ─────────────────────────────────
        case "agent-harness":
        case "process-quality":
        case "agent-output":
        case "tool-usage":
            return [...COMMON_FAILURE_MODES, ...AGENT_FAILURE_MODES];
        default:
            // Copy instead of handing out the shared constant itself: the
            // other branches already return fresh arrays, and the constant is
            // a plain mutable array at runtime.
            return [...COMMON_FAILURE_MODES];
    }
}
|
|
70
|
+
/**
 * Flat list of all dimension names recognized by `failureModesForDimension`.
 * Used by card-validators and test calibration.
 *
 * The entries mirror the `case` labels of `failureModesForDimension`, grouped
 * by family; the two must be edited together. The array is frozen so that one
 * consumer cannot alter the list every other consumer sees — copy it
 * (`[...CANONICAL_DIMENSIONS]`) before sorting or extending.
 */
export const CANONICAL_DIMENSIONS = Object.freeze([
    // Literacy family
    "task-completion",
    "code-correctness",
    "doc-coverage",
    // MCP family
    "mcp-behavior",
    "input-validation",
    "output-correctness",
    "error-handling",
    "security",
    // Knowledge-probe family
    "knowledge-probe",
    "factual-correctness",
    "completeness",
    "currency",
    // Agent-harness family
    "agent-harness",
    "process-quality",
    "agent-output",
    "tool-usage",
]);
|
|
96
|
+
/**
 * Returns `true` when `mode` appears in the failure-mode list of any
 * canonical dimension family (common, literacy, MCP, knowledge-probe or
 * agent-harness).
 *
 * The family lists are scanned in place and the search stops at the first
 * hit, so no temporary collection holding every mode is built per call.
 * `Array.prototype.includes` uses the same SameValueZero comparison as
 * `Set.prototype.has`, so membership results are unchanged.
 *
 * @param mode - Candidate failure-mode identifier.
 * @returns Whether `mode` is listed in at least one family.
 */
export function isCanonicalFailureMode(mode) {
    const families = [
        COMMON_FAILURE_MODES,
        LITERACY_FAILURE_MODES,
        MCP_FAILURE_MODES,
        KP_FAILURE_MODES,
        AGENT_FAILURE_MODES,
    ];
    return families.some((modes) => modes.includes(mode));
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Knowledge-probe failure modes — valid for the `knowledge-probe` dimension
|
|
3
|
+
* family (factual-correctness, completeness, currency).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/knowledge-probe.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/** Knowledge-probe failure-mode identifiers, typed as a readonly tuple of four string literals in declaration order. */
export declare const KP_FAILURE_MODES: readonly ["factual-error", "incompleteness", "currency-violation", "hallucination"];
|
|
13
|
+
/** Union of the string literals that make up the `KP_FAILURE_MODES` tuple. */
export type KPFailureMode = (typeof KP_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Knowledge-probe failure modes — valid for the `knowledge-probe` dimension
|
|
3
|
+
* family (factual-correctness, completeness, currency).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/knowledge-probe.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Failure-mode identifiers for the knowledge-probe dimension family.
 *
 * Frozen so the runtime value honours the `readonly` tuple type published in
 * the accompanying declaration file; copy it (`[...KP_FAILURE_MODES]`) before
 * sorting or extending.
 */
export const KP_FAILURE_MODES = Object.freeze([
    "factual-error", // assistant asserts something demonstrably false
    "incompleteness", // assistant covers part of the answer; misses key piece
    "currency-violation", // assistant cites stale facts beyond doc currency horizon
    "hallucination", // assistant invents details not present in any doc
]);
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy failure modes — valid for `task-completion`, `code-correctness`,
|
|
3
|
+
* `doc-coverage` (the literacy dimension family).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/literacy.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/** Literacy failure-mode identifiers, typed as a readonly tuple of four string literals in declaration order. */
export declare const LITERACY_FAILURE_MODES: readonly ["missing-docs", "outdated-docs", "incorrect-docs", "poor-structure"];
|
|
13
|
+
/** Union of the string literals that make up the `LITERACY_FAILURE_MODES` tuple. */
export type LiteracyFailureMode = (typeof LITERACY_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Literacy failure modes — valid for `task-completion`, `code-correctness`,
|
|
3
|
+
* `doc-coverage` (the literacy dimension family).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/literacy.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Failure-mode identifiers for the literacy dimension family.
 *
 * Frozen so the runtime value honours the `readonly` tuple type published in
 * the accompanying declaration file; copy it (`[...LITERACY_FAILURE_MODES]`)
 * before sorting or extending.
 */
export const LITERACY_FAILURE_MODES = Object.freeze([
    "missing-docs", // relevant doc didn't exist
    "outdated-docs", // doc reflects an older API/version
    "incorrect-docs", // doc states something factually wrong
    "poor-structure", // doc exists but is hard to find or follow
]);
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP failure modes — valid for the `mcp-behavior` dimension family
|
|
3
|
+
* (input-validation, output-correctness, error-handling, security).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/mcp.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/** MCP failure-mode identifiers, typed as a readonly tuple of four string literals in declaration order. */
export declare const MCP_FAILURE_MODES: readonly ["spec-mismatch", "missing-error-handling", "over-privileged", "missing-docs"];
|
|
13
|
+
/** Union of the string literals that make up the `MCP_FAILURE_MODES` tuple. */
export type MCPFailureMode = (typeof MCP_FAILURE_MODES)[number];
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP failure modes — valid for the `mcp-behavior` dimension family
|
|
3
|
+
* (input-validation, output-correctness, error-handling, security).
|
|
4
|
+
*
|
|
5
|
+
* Relocated from packages/eval/src/grader/mcp.ts to @sanity/ailf-core
|
|
6
|
+
* so card files in packages/core/src/services/diagnosis/cards/ can import
|
|
7
|
+
* them without violating the core→eval import direction rule (D-05).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/design-docs/actionability-ladder/03-structured-grader-judgments.md
|
|
10
|
+
* §"Per-dimension failure-mode taxonomies" (lines 239-283).
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Failure-mode identifiers for the MCP dimension family.
 *
 * Frozen so the runtime value honours the `readonly` tuple type published in
 * the accompanying declaration file; copy it (`[...MCP_FAILURE_MODES]`)
 * before sorting or extending.
 */
export const MCP_FAILURE_MODES = Object.freeze([
    "spec-mismatch", // tool/server output doesn't match published MCP spec
    "missing-error-handling", // tool failure path under-documented or absent
    "over-privileged", // tool exposes operations the doc didn't sanction
    // Cross-cutting: the same identifier string the literacy family uses. It
    // is a duplicated literal here, not an import or re-export.
    "missing-docs",
]);
|