@sanity/ailf 5.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +59 -5
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +70 -0
- package/dist/commands/interpret.js +221 -0
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +2 -2
package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* failure-mode-summary.test.ts — Tests 5-9 for deterministic failure-mode-summary card.
|
|
3
|
+
*
|
|
4
|
+
* D0042: vitest only.
|
|
5
|
+
* D-05: schema must reject cross-dimension (dimension, failureMode) pairs.
|
|
6
|
+
*/
|
|
7
|
+
import { describe, expect, it } from "vitest";
|
|
8
|
+
import { FailureModeSummaryBodySchema, generateFailureModeSummary, } from "../failure-mode-summary.js";
|
|
9
|
+
import { failureModesForDimension } from "../../../../grader/failure-modes/index.js";
|
|
10
|
+
// Compile-time gate
|
|
11
|
+
// Module-level alias of the imported body schema; Test 5 asserts it is defined.
const _satisfiesCheck = FailureModeSummaryBodySchema;
|
|
12
|
+
const silentLogger = {
|
|
13
|
+
debug: () => { },
|
|
14
|
+
info: () => { },
|
|
15
|
+
warn: () => { },
|
|
16
|
+
error: () => { },
|
|
17
|
+
step: () => { },
|
|
18
|
+
section: () => { },
|
|
19
|
+
table: () => { },
|
|
20
|
+
};
|
|
21
|
+
const noopProgress = {
|
|
22
|
+
phaseStart: () => { },
|
|
23
|
+
phaseProgress: () => { },
|
|
24
|
+
phaseComplete: () => { },
|
|
25
|
+
};
|
|
26
|
+
/**
 * Builds the generator context used by every test in this file:
 * no LLM client, the silent logger, the no-op progress reporter,
 * and fixed version stamps and run/report ids.
 */
function makeCtx() {
    const versions = {
        graderJudgmentsVersion: "1.0.0",
        ensembleVersion: "1.0.0",
        diagnosisVersion: "0.1.0",
        cardVersion: "1.0.0",
    };
    const ctx = {
        llm: undefined,
        model: "anthropic:claude-sonnet-4-6",
        logger: silentLogger,
        progress: noopProgress,
        versions,
        runId: "run-001",
        reportId: "report-001",
    };
    return ctx;
}
|
|
42
|
+
function makeReport(failureModes) {
|
|
43
|
+
return {
|
|
44
|
+
id: "report-001",
|
|
45
|
+
completedAt: "2026-01-01T00:00:00Z",
|
|
46
|
+
durationMs: 1000,
|
|
47
|
+
provenance: {
|
|
48
|
+
runId: "run-001",
|
|
49
|
+
mode: "standard",
|
|
50
|
+
areas: [],
|
|
51
|
+
taskIds: [],
|
|
52
|
+
models: [],
|
|
53
|
+
graderModel: "gpt-4o",
|
|
54
|
+
source: "local",
|
|
55
|
+
evalFingerprint: "",
|
|
56
|
+
trigger: "manual",
|
|
57
|
+
git: undefined,
|
|
58
|
+
},
|
|
59
|
+
summary: {
|
|
60
|
+
belowCritical: [],
|
|
61
|
+
lowestArea: "groq",
|
|
62
|
+
lowestScore: 50,
|
|
63
|
+
overall: {
|
|
64
|
+
avgCeilingScore: 80,
|
|
65
|
+
avgScore: 70,
|
|
66
|
+
avgDocLift: 5,
|
|
67
|
+
avgDocQualityGap: 20,
|
|
68
|
+
avgFloorScore: 65,
|
|
69
|
+
negativeDocLiftCount: 0,
|
|
70
|
+
},
|
|
71
|
+
scores: [
|
|
72
|
+
{
|
|
73
|
+
feature: "groq",
|
|
74
|
+
ceilingScore: 80,
|
|
75
|
+
codeCorrectness: 70,
|
|
76
|
+
docCoverage: 65,
|
|
77
|
+
docLift: 5,
|
|
78
|
+
docQualityGap: 20,
|
|
79
|
+
floorScore: 65,
|
|
80
|
+
negativeDocLift: false,
|
|
81
|
+
taskCompletion: 70,
|
|
82
|
+
testCount: 10,
|
|
83
|
+
totalCost: 0.05,
|
|
84
|
+
totalScore: 70,
|
|
85
|
+
},
|
|
86
|
+
],
|
|
87
|
+
failureModes,
|
|
88
|
+
},
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
describe("FailureModeSummaryBodySchema — field structure (Test 5)", () => {
    it("includes dimension, failureMode, count, sampleSize, summary fields", () => {
        // First failure mode the taxonomy lists for the dimension under test.
        const firstTaskCompletionMode = failureModesForDimension("task-completion")[0];
        const candidateBody = {
            summary: "test summary",
            dimension: "task-completion",
            failureMode: firstTaskCompletionMode,
            count: 5,
            sampleSize: 20,
        };
        const parsed = FailureModeSummaryBodySchema.safeParse(candidateBody);
        expect(parsed.success).toBe(true);
        // The module-level schema alias must exist as well.
        expect(_satisfiesCheck).toBeDefined();
    });
});
|
|
106
|
+
describe("FailureModeSummaryBodySchema — D-05 refine rejects cross-dimension (Test 6)", () => {
    // The test title names the mode that is actually exercised ("spec-mismatch");
    // the earlier title referred to "mcp-tool-not-found", which is not in the taxonomy.
    it("rejects an mcp-behavior-only failure mode (spec-mismatch) for the task-completion dimension", () => {
        // Cross-dimension hallucination guard: pair a literacy dimension with a
        // mode that is valid for mcp-behavior but not for task-completion.
        const mcpOnlyMode = "spec-mismatch";
        const result = FailureModeSummaryBodySchema.safeParse({
            summary: "test",
            dimension: "task-completion",
            failureMode: mcpOnlyMode,
            count: 1,
            sampleSize: 5,
        });
        // Both fields are individually well-formed; only the pairing is wrong,
        // so the schema must reject the object.
        expect(result.success).toBe(false);
    });
});
|
|
121
|
+
describe("FailureModeSummaryBodySchema — accepts canonical pair (Test 7)", () => {
    it("accepts a valid (dimension, failureMode) pair from the taxonomy", () => {
        const dimension = "task-completion";
        // Take the mode straight from the taxonomy so the pair is canonical.
        const failureMode = failureModesForDimension("task-completion")[0];
        const { success } = FailureModeSummaryBodySchema.safeParse({
            summary: "test summary about missing docs",
            dimension,
            failureMode,
            count: 3,
            sampleSize: 15,
        });
        expect(success).toBe(true);
    });
});
|
|
134
|
+
describe("generateFailureModeSummary — happy path (Test 8)", () => {
    it("returns a ready card with the dominant failure mode for a report with failure modes", async () => {
        const dominantMode = failureModesForDimension("task-completion")[0];
        // Single top title carrying all seven failures for the dimension.
        const topTitle = {
            id: `task-completion::${dominantMode}`,
            category: dominantMode,
            severity: "high",
            title: dominantMode,
            count: 7,
        };
        const failureModes = {
            counts: { "task-completion": 7 },
            topTitles: [topTitle],
            totalJudgments: 20,
            classificationRate: 0.35,
        };
        const card = await generateFailureModeSummary(makeReport(failureModes), makeCtx());
        expect(card.status).toBe("ready");
        if (card.status !== "ready") {
            return;
        }
        expect(card.cardType).toBe("failure-mode-summary");
        expect(card.meta.cardVersion).toBe("failure-mode-summary@0.1.0");
        expect(card.body.count).toBeGreaterThan(0);
    });
});
|
|
161
|
+
describe("generateFailureModeSummary — empty failure modes (Test 9)", () => {
    it("returns missing card when report has no failure modes", async () => {
        // `summary.failureModes` is left undefined on purpose.
        const card = await generateFailureModeSummary(makeReport(undefined), makeCtx());
        expect(card.status).toBe("missing");
        if (card.status !== "missing") {
            return;
        }
        expect(card.cardType).toBe("failure-mode-summary");
        expect(card.reason).toContain("no failure modes");
    });
});
|
|
172
|
+
describe("generateFailureModeSummary — cross-cutting mode resolution", () => {
    // `missing-docs` is listed in both LITERACY_FAILURE_MODES and MCP_FAILURE_MODES.
    // Prior to the per-EvalMode preference table, a linear scan of CANONICAL_DIMENSIONS
    // always resolved it to task-completion, which mislabelled MCP-only runs.

    /** Copy of `report` whose provenance.mode is replaced by `mode`. */
    const withMode = (report, mode) => ({
        ...report,
        provenance: { ...report.provenance, mode },
    });

    /** Failure-mode summary with one `missing-docs` title under `dimension`. */
    const missingDocsFailureModes = (dimension) => ({
        counts: { [dimension]: 7 },
        topTitles: [
            {
                id: `${dimension}::missing-docs`,
                category: "missing-docs",
                severity: "high",
                title: "missing-docs",
                count: 7,
            },
        ],
        totalJudgments: 20,
        classificationRate: 0.35,
    });

    it("resolves 'missing-docs' to mcp-behavior on an mcp-server report", async () => {
        const base = makeReport(missingDocsFailureModes("mcp-behavior"));
        const card = await generateFailureModeSummary(withMode(base, "mcp-server"), makeCtx());
        expect(card.status).toBe("ready");
        if (card.status !== "ready") {
            return;
        }
        expect(card.body.dimension).toBe("mcp-behavior");
        expect(card.body.failureMode).toBe("missing-docs");
    });

    it("resolves 'missing-docs' to task-completion on a literacy report", async () => {
        const base = makeReport(missingDocsFailureModes("task-completion"));
        const card = await generateFailureModeSummary(withMode(base, "literacy"), makeCtx());
        expect(card.status).toBe("ready");
        if (card.status !== "ready") {
            return;
        }
        expect(card.body.dimension).toBe("task-completion");
        expect(card.body.failureMode).toBe("missing-docs");
    });
});
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* no-issues.test.ts — Tests 10-12 for the deterministic no-issues card.
|
|
3
|
+
*
|
|
4
|
+
* D0042: vitest only.
|
|
5
|
+
* AI-SPEC §1b failure-mode #7: sycophancy guard — firing rate ≤30%.
|
|
6
|
+
*/
|
|
7
|
+
import { describe, expect, it } from "vitest";
|
|
8
|
+
import { NO_ISSUES_THRESHOLD, NoIssuesBodySchema, generateNoIssues, } from "../no-issues.js";
|
|
9
|
+
// Compile-time gate
|
|
10
|
+
// Module-level alias of the imported body schema; Test 10 asserts it is defined.
const _satisfiesCheck = NoIssuesBodySchema;
|
|
11
|
+
const silentLogger = {
|
|
12
|
+
debug: () => { },
|
|
13
|
+
info: () => { },
|
|
14
|
+
warn: () => { },
|
|
15
|
+
error: () => { },
|
|
16
|
+
step: () => { },
|
|
17
|
+
section: () => { },
|
|
18
|
+
table: () => { },
|
|
19
|
+
};
|
|
20
|
+
const noopProgress = {
|
|
21
|
+
phaseStart: () => { },
|
|
22
|
+
phaseProgress: () => { },
|
|
23
|
+
phaseComplete: () => { },
|
|
24
|
+
};
|
|
25
|
+
/**
 * Builds the generator context shared by the no-issues tests:
 * no LLM client, the silent logger, the no-op progress reporter,
 * and fixed version stamps and run/report ids.
 */
function makeCtx() {
    const versions = {
        graderJudgmentsVersion: "1.0.0",
        ensembleVersion: "1.0.0",
        diagnosisVersion: "0.1.0",
        cardVersion: "1.0.0",
    };
    const ctx = {
        llm: undefined,
        model: "anthropic:claude-sonnet-4-6",
        logger: silentLogger,
        progress: noopProgress,
        versions,
        runId: "run-001",
        reportId: "report-001",
    };
    return ctx;
}
|
|
41
|
+
function makeReportWithScores(scores) {
|
|
42
|
+
return {
|
|
43
|
+
id: "report-001",
|
|
44
|
+
completedAt: "2026-01-01T00:00:00Z",
|
|
45
|
+
durationMs: 1000,
|
|
46
|
+
provenance: {
|
|
47
|
+
runId: "run-001",
|
|
48
|
+
mode: "standard",
|
|
49
|
+
areas: [],
|
|
50
|
+
taskIds: [],
|
|
51
|
+
models: [],
|
|
52
|
+
graderModel: "gpt-4o",
|
|
53
|
+
source: "local",
|
|
54
|
+
evalFingerprint: "",
|
|
55
|
+
trigger: "manual",
|
|
56
|
+
git: undefined,
|
|
57
|
+
},
|
|
58
|
+
summary: {
|
|
59
|
+
belowCritical: [],
|
|
60
|
+
lowestArea: scores.length ? `area-0` : "",
|
|
61
|
+
lowestScore: scores.length ? Math.min(...scores) : 0,
|
|
62
|
+
overall: {
|
|
63
|
+
avgCeilingScore: 90,
|
|
64
|
+
avgScore: scores.length
|
|
65
|
+
? scores.reduce((a, b) => a + b, 0) / scores.length
|
|
66
|
+
: 0,
|
|
67
|
+
avgDocLift: 5,
|
|
68
|
+
avgDocQualityGap: 10,
|
|
69
|
+
avgFloorScore: 80,
|
|
70
|
+
negativeDocLiftCount: 0,
|
|
71
|
+
},
|
|
72
|
+
scores: scores.map((s, i) => ({
|
|
73
|
+
feature: `area-${i}`,
|
|
74
|
+
ceilingScore: 95,
|
|
75
|
+
codeCorrectness: s,
|
|
76
|
+
docCoverage: s,
|
|
77
|
+
docLift: 5,
|
|
78
|
+
docQualityGap: 5,
|
|
79
|
+
floorScore: s - 5,
|
|
80
|
+
negativeDocLift: false,
|
|
81
|
+
taskCompletion: s,
|
|
82
|
+
testCount: 10,
|
|
83
|
+
totalCost: 0.05,
|
|
84
|
+
totalScore: s,
|
|
85
|
+
})),
|
|
86
|
+
},
|
|
87
|
+
};
|
|
88
|
+
}
|
|
89
|
+
describe("NO_ISSUES_THRESHOLD (Test 10)", () => {
    it("is exported as a const with a numeric value", () => {
        const threshold = NO_ISSUES_THRESHOLD;
        // Must be a number in the half-open range (0, 100].
        expect(typeof threshold).toBe("number");
        expect(threshold).toBeGreaterThan(0);
        expect(threshold).toBeLessThanOrEqual(100);
        // The module-level schema alias must exist as well.
        expect(_satisfiesCheck).toBeDefined();
    });
});
|
|
98
|
+
describe("generateNoIssues — ready when all areas ≥ threshold (Test 11a)", () => {
    it("returns ready card with thresholdScore when all areas score ≥ threshold", async () => {
        // Three areas: exactly at the threshold, then 5 and 10 points above it.
        const scores = [0, 5, 10].map((delta) => NO_ISSUES_THRESHOLD + delta);
        const card = await generateNoIssues(makeReportWithScores(scores), makeCtx());
        expect(card.status).toBe("ready");
        if (card.status !== "ready") {
            return;
        }
        expect(card.cardType).toBe("no-issues");
        expect(card.meta.cardVersion).toBe("no-issues@0.1.0");
        expect(card.body.thresholdScore).toBe(NO_ISSUES_THRESHOLD);
        expect(card.body.summary).toBeTruthy();
    });
});
|
|
117
|
+
describe("generateNoIssues — missing when any area below threshold (Test 11b)", () => {
    it("returns missing when at least one area scores below the threshold", async () => {
        // One area a single point under the threshold, one comfortably above.
        const justBelow = NO_ISSUES_THRESHOLD - 1;
        const wellAbove = NO_ISSUES_THRESHOLD + 10;
        const card = await generateNoIssues(makeReportWithScores([justBelow, wellAbove]), makeCtx());
        expect(card.status).toBe("missing");
        if (card.status !== "missing") {
            return;
        }
        expect(card.cardType).toBe("no-issues");
        expect(card.reason).toContain("below threshold");
    });
});
|
|
129
|
+
describe("generateNoIssues — calibration firing rate ≤30% (Test 12)", () => {
    it("fires ≤30% on a 10-report fixture set spanning low to high scores", async () => {
        // Ten two-area reports whose scores climb from 0 up to 99.
        // A report may only fire when ALL of its areas are ≥ NO_ISSUES_THRESHOLD,
        // so at most the last three pairs (all ≥ 90) are expected to be "ready".
        const scorePairs = [
            [0, 10],
            [20, 30],
            [40, 50],
            [60, 65],
            [70, 75],
            [75, 80],
            [80, 82],
            [90, 92],
            [95, 97],
            [98, 99],
        ];
        const fixtureSet = scorePairs.map((pair) => makeReportWithScores(pair));
        const ctx = makeCtx();
        let readyCards = 0;
        // Generate the cards one after another, counting the "ready" ones.
        for (let i = 0; i < fixtureSet.length; i += 1) {
            const { status } = await generateNoIssues(fixtureSet[i], ctx);
            if (status === "ready") {
                readyCards += 1;
            }
        }
        expect(readyCards / fixtureSet.length).toBeLessThanOrEqual(0.3);
    });
});
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* area-summary card — deterministic projection over Report.summary.scores.
|
|
3
|
+
*
|
|
4
|
+
* Pure computation, no LLM call. Produces a human-readable sentence
|
|
5
|
+
* summarizing the number of evaluated areas and their mean composite score.
|
|
6
|
+
*
|
|
7
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
8
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
9
|
+
*
|
|
10
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 569-587
|
|
11
|
+
*/
|
|
12
|
+
import { z } from "zod";
|
|
13
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
14
|
+
/** Zod object schema for the area-summary card body: a single string `summary` field. */
export declare const AreaSummaryBodySchema: z.ZodObject<{
|
|
15
|
+
summary: z.ZodString;
|
|
16
|
+
}, z.core.$strip>;
|
|
17
|
+
/** Card generator for the area-summary card (per the file header: pure computation, no LLM call). */
export declare const generateAreaSummary: CardGenerator;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* area-summary card — deterministic projection over Report.summary.scores.
|
|
3
|
+
*
|
|
4
|
+
* Pure computation, no LLM call. Produces a human-readable sentence
|
|
5
|
+
* summarizing the number of evaluated areas and their mean composite score.
|
|
6
|
+
*
|
|
7
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
8
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
9
|
+
*
|
|
10
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 569-587
|
|
11
|
+
*/
|
|
12
|
+
import { z } from "zod";
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
/** Body of the area-summary card: one `summary` string of 1 to 500 characters. */
export const AreaSummaryBodySchema = z.object({
|
|
17
|
+
summary: z.string().min(1).max(500),
|
|
18
|
+
});
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Generator
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
/**
 * Generates the area-summary card from `report.summary.scores`.
 *
 * Returns a "missing" card when the report has no area scores. Otherwise
 * averages each area's `totalScore`, renders a one-sentence summary with the
 * mean to one decimal place, validates it against AreaSummaryBodySchema and
 * returns a "ready" card stamped with the card version and generation time.
 *
 * @param report report whose `summary.scores` entries carry a `totalScore`.
 * @returns a promise of the "ready" or "missing" area-summary card.
 */
export const generateAreaSummary = async (report) => {
    const cardType = "area-summary";
    const areaScores = report.summary.scores;
    const areaCount = areaScores ? areaScores.length : 0;
    if (areaCount === 0) {
        return { status: "missing", cardType, reason: "report has no areas" };
    }
    let total = 0;
    areaScores.forEach((area) => {
        total += area.totalScore;
    });
    const mean = total / areaCount;
    // Parse the generated body too, so the schema bounds are enforced here as well.
    const body = AreaSummaryBodySchema.parse({
        summary: `Across ${areaCount} areas, mean score ${mean.toFixed(1)}.`,
    });
    const meta = {
        cardVersion: "area-summary@0.1.0",
        generatedAt: new Date().toISOString(),
    };
    return { status: "ready", cardType, body, meta };
};
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* doc-attribution-spotlight card — LLM-driven doc-citation spotlight.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: doc-attribution-spotlight@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Landmine 11: reads `ctx.judgmentAttributions` (NOT Report.summary).
|
|
8
|
+
* Returns `status: "missing"` when attributions are undefined or empty.
|
|
9
|
+
*
|
|
10
|
+
* Mitigations:
|
|
11
|
+
* - failure-mode #5: docCitations[].docSlug refined against the manifest
|
|
12
|
+
* allow-list so hallucinated slugs fail Zod parse
|
|
13
|
+
*
|
|
14
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
15
|
+
* mandatory.
|
|
16
|
+
*
|
|
17
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
18
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
19
|
+
*/
|
|
20
|
+
import { z } from "zod";
|
|
21
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
22
|
+
/**
|
|
23
|
+
* Module-level static shape. Per-call adds the allow-list refine on docSlug.
|
|
24
|
+
*/
|
|
25
|
+
// Shape: a `summary` string plus an array of doc citations. Each citation has
// a `docSlug`, a `confidence` object (level / signalsPresent / derivation) and
// a `role` enum.
export declare const DocAttributionSpotlightBodySchema: z.ZodObject<{
|
|
26
|
+
summary: z.ZodString;
|
|
27
|
+
docCitations: z.ZodArray<z.ZodObject<{
|
|
28
|
+
docSlug: z.ZodString;
|
|
29
|
+
confidence: z.ZodObject<{
|
|
30
|
+
level: z.ZodEnum<{
|
|
31
|
+
low: "low";
|
|
32
|
+
medium: "medium";
|
|
33
|
+
high: "high";
|
|
34
|
+
}>;
|
|
35
|
+
signalsPresent: z.ZodNumber;
|
|
36
|
+
derivation: z.ZodString;
|
|
37
|
+
}, z.core.$strip>;
|
|
38
|
+
role: z.ZodEnum<{
|
|
39
|
+
missing: "missing";
|
|
40
|
+
supports: "supports";
|
|
41
|
+
contradicts: "contradicts";
|
|
42
|
+
irrelevant: "irrelevant";
|
|
43
|
+
}>;
|
|
44
|
+
}, z.core.$strip>>;
|
|
45
|
+
}, z.core.$strip>;
|
|
46
|
+
/**
 * Card generator for the doc-attribution-spotlight card. Per the file header it
 * is LLM-driven and returns `status: "missing"` when attributions are undefined or empty.
 */
export declare const generateDocAttributionSpotlight: CardGenerator;
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* doc-attribution-spotlight card — LLM-driven doc-citation spotlight.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: doc-attribution-spotlight@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Landmine 11: reads `ctx.judgmentAttributions` (NOT Report.summary).
|
|
8
|
+
* Returns `status: "missing"` when attributions are undefined or empty.
|
|
9
|
+
*
|
|
10
|
+
* Mitigations:
|
|
11
|
+
* - failure-mode #5: docCitations[].docSlug refined against the manifest
|
|
12
|
+
* allow-list so hallucinated slugs fail Zod parse
|
|
13
|
+
*
|
|
14
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
15
|
+
* mandatory.
|
|
16
|
+
*
|
|
17
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
18
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
19
|
+
*/
|
|
20
|
+
import { z } from "zod";
|
|
21
|
+
import { ConfidenceSchema } from "../../../schemas/confidence-schema.js";
|
|
22
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
23
|
+
import { buildDocAttributionSpotlightPrompt, buildDocSlugAllowList, } from "../prompt-builders.js";
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
/**
|
|
28
|
+
* Module-level static shape. Per-call adds the allow-list refine on docSlug.
|
|
29
|
+
*/
|
|
30
|
+
// One cited document: its slug, the confidence attached to the citation,
// and the role the document plays.
const DocCitationShape = z.object({
    docSlug: z.string().min(1),
    confidence: ConfidenceSchema,
    role: z.enum(["supports", "contradicts", "missing", "irrelevant"]),
});
/**
 * Static body shape of the doc-attribution-spotlight card: a summary of
 * 1 to 800 characters and between one and five doc citations.
 */
export const DocAttributionSpotlightBodySchema = z.object({
    summary: z.string().min(1).max(800),
    docCitations: z.array(DocCitationShape).min(1).max(5),
});
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
// Generator
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Model id this card passes to `ctx.llm.completeStructured`.
const CARD_MODEL = mkModelId("anthropic:claude-sonnet-4-6");
|
|
45
|
+
/**
 * Generates the doc-attribution-spotlight card.
 *
 * Returns a "missing" card, without calling the LLM, when no LLM client is
 * wired or when `ctx.judgmentAttributions` is undefined or empty. Otherwise
 * it builds the doc-slug allow-list from the report, asks the LLM for a
 * structured response validated against a per-call schema (a cited `docSlug`
 * outside the allow-list fails validation), and returns a "ready" card with
 * the token usage, cost and model reported by the client.
 *
 * @param report report used to build the allow-list and the prompt.
 * @param ctx generator context; reads `llm`, `judgmentAttributions`, `runId`.
 * @returns a promise of the "ready" or "missing" card.
 */
export const generateDocAttributionSpotlight = async (report, ctx) => {
    const cardType = "doc-attribution-spotlight";
    const missing = (reason) => ({ status: "missing", cardType, reason });
    // C1: without an LLM client there is nothing to ask.
    if (!ctx.llm) {
        return missing("no LLMClient wired");
    }
    // D1 (Landmine 11): bail out before any LLM call when there is no attribution data.
    const attributions = ctx.judgmentAttributions;
    if (!attributions || attributions.length === 0) {
        return missing("no attribution data for this run");
    }
    // Slugs the runtime report allows the model to cite.
    const knownSlugs = buildDocSlugAllowList(report);
    // Per-call schema: static shape plus the allow-list refine on docSlug (AI-SPEC §3 Pitfall 1).
    const citationSchema = z.object({
        docSlug: z
            .string()
            .min(1)
            .refine((slug) => knownSlugs.has(slug), {
                message: "docCitations[].docSlug is not in the report document manifest allow-list",
            }),
        confidence: ConfidenceSchema,
        role: z.enum(["supports", "contradicts", "missing", "irrelevant"]),
    });
    const responseSchema = z.object({
        summary: z.string().min(1).max(800),
        docCitations: z.array(citationSchema).min(1).max(5),
    });
    const { system, user } = buildDocAttributionSpotlightPrompt(report, attributions);
    // The client result also carries `cost` and `model`; both are forwarded into the card meta.
    const completion = await ctx.llm.completeStructured({
        model: CARD_MODEL,
        prompt: `${system}\n\n${user}`,
        schema: responseSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "doc-attribution-spotlight",
        },
    });
    const tokenUsage = {
        input: completion.usage.promptTokens,
        output: completion.usage.completionTokens,
    };
    return {
        status: "ready",
        cardType,
        body: completion.value,
        meta: {
            cardVersion: "doc-attribution-spotlight@0.1.0",
            tokenUsage,
            generatedAt: new Date().toISOString(),
            cost: completion.cost,
            model: completion.model,
        },
    };
};
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* failure-mode-summary card — deterministic projection over Report.summary.failureModes.
|
|
3
|
+
*
|
|
4
|
+
* Pure computation, no LLM call. Identifies the dominant failure mode
|
|
5
|
+
* across all dimensions in the report's slim failure-mode summary.
|
|
6
|
+
*
|
|
7
|
+
* D-05: `.refine(buildFailureModeRefinement())` rejects cross-dimension
|
|
8
|
+
* (dimension, failureMode) pairs that the schema otherwise would accept —
|
|
9
|
+
* turning a "Zod-passes, semantically wrong" LLM output into a
|
|
10
|
+
* `parseFailed: true` degraded card. For this deterministic card, the
|
|
11
|
+
* refinement also defends against bad Report data.
|
|
12
|
+
*
|
|
13
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
14
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
15
|
+
*
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-05)
|
|
17
|
+
* @see packages/core/src/services/diagnosis/card-validators.ts
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
21
|
+
/**
 * Zod object schema for the failure-mode-summary card body: string `summary`,
 * `dimension` and `failureMode` fields plus numeric `count` and `sampleSize`.
 * Per the file header, a refinement also rejects cross-dimension
 * (dimension, failureMode) pairs; that check is not visible in this declared type.
 */
export declare const FailureModeSummaryBodySchema: z.ZodObject<{
|
|
22
|
+
summary: z.ZodString;
|
|
23
|
+
dimension: z.ZodString;
|
|
24
|
+
failureMode: z.ZodString;
|
|
25
|
+
count: z.ZodNumber;
|
|
26
|
+
sampleSize: z.ZodNumber;
|
|
27
|
+
}, z.core.$strip>;
|
|
28
|
+
/**
 * Card generator for the failure-mode-summary card. Per the file header it is a
 * deterministic projection over Report.summary.failureModes with no LLM call.
 */
export declare const generateFailureModeSummary: CardGenerator;
|