@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* doc-attribution-spotlight card — LLM-driven doc-citation spotlight.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: doc-attribution-spotlight@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Landmine 11: reads `ctx.judgmentAttributions` (NOT Report.summary).
|
|
8
|
+
* Returns `status: "missing"` when attributions are undefined or empty.
|
|
9
|
+
*
|
|
10
|
+
* Mitigations:
|
|
11
|
+
* - failure-mode #5: docCitations[].docSlug refined against the manifest
|
|
12
|
+
* allow-list so hallucinated slugs fail Zod parse
|
|
13
|
+
*
|
|
14
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
15
|
+
* mandatory.
|
|
16
|
+
*
|
|
17
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
18
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
19
|
+
*/
|
|
20
|
+
import { z } from "zod";
|
|
21
|
+
import { ConfidenceSchema } from "../../../schemas/confidence-schema.js";
|
|
22
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
23
|
+
import { buildDocAttributionSpotlightPrompt, buildDocSlugAllowList, } from "../prompt-builders.js";
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
/**
 * Schema for a single entry of `docCitations`: a non-empty `docSlug`, a
 * `confidence` validated by `ConfidenceSchema`, and one of four `role` values.
 */
const docCitationEntrySchema = z.object({
    docSlug: z.string().min(1),
    confidence: ConfidenceSchema,
    role: z.enum(["supports", "contradicts", "missing", "irrelevant"]),
});
/**
 * Module-level static body shape for the doc-attribution-spotlight card.
 *
 * Requires a `summary` of 1 to 800 characters and between one and five
 * `docCitations`. This static schema does not check `docSlug` against any
 * allow-list; the generator in this file builds a per-call schema that adds
 * that refine.
 */
export const DocAttributionSpotlightBodySchema = z.object({
    summary: z.string().min(1).max(800),
    docCitations: z.array(docCitationEntrySchema).min(1).max(5),
});
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
// Generator
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
/** Model identifier passed as `model` to `ctx.llm.completeStructured` for this card. */
const CARD_MODEL = mkModelId("anthropic:claude-sonnet-4-6");
/**
 * Generate the doc-attribution-spotlight card.
 *
 * Returns a `status: "missing"` card, without calling the LLM, when:
 *  - `ctx.llm` is not set;
 *  - `ctx.judgmentAttributions` is undefined or empty (Landmine 11); or
 *  - the doc-slug allow-list built from `report` is empty. The per-call
 *    schema requires at least one citation whose slug is in the allow-list,
 *    so with an empty allow-list no response could ever validate.
 *
 * Otherwise it requests a structured completion validated against a per-call
 * schema whose `docCitations[].docSlug` must be a member of the allow-list,
 * and returns a `status: "ready"` card carrying the validated value, the
 * card version, token usage and a generation timestamp.
 *
 * @param report - The run report; passed to `buildDocSlugAllowList` and
 *   `buildDocAttributionSpotlightPrompt`.
 * @param ctx - Generator context; reads `llm`, `judgmentAttributions` and `runId`.
 * @returns A card object with `status` of `"missing"` or `"ready"`.
 */
export const generateDocAttributionSpotlight = async (report, ctx) => {
    // All early exits share the same shape; only the reason differs.
    const missing = (reason) => ({
        status: "missing",
        cardType: "doc-attribution-spotlight",
        reason,
    });
    // No LLM client wired → nothing to generate.
    if (!ctx.llm) {
        return missing("no LLMClient wired");
    }
    // Landmine 11 — short-circuit BEFORE calling the LLM when there is no attribution data.
    if (!ctx.judgmentAttributions || ctx.judgmentAttributions.length === 0) {
        return missing("no attribution data for this run");
    }
    // Build the allow-list from the runtime report.
    const allowList = buildDocSlugAllowList(report);
    // An empty allow-list makes the schema below unsatisfiable (min(1) citation,
    // each slug must be allow-listed), so skip the guaranteed-to-fail LLM call.
    // NOTE(review): relies on the allow-list exposing `.size` (Set/Map-like);
    // if it does not, this check is a no-op and the original flow is kept.
    if (allowList.size === 0) {
        return missing("no documents in the report document manifest");
    }
    // Per-call schema with docSlug allow-list refine (AI-SPEC §3 Pitfall 1).
    const PerCallSchema = z.object({
        summary: z.string().min(1).max(800),
        docCitations: z
            .array(z.object({
            docSlug: z
                .string()
                .min(1)
                .refine((slug) => allowList.has(slug), {
                message: "docCitations[].docSlug is not in the report document manifest allow-list",
            }),
            confidence: ConfidenceSchema,
            role: z.enum(["supports", "contradicts", "missing", "irrelevant"]),
        }))
            .min(1)
            .max(5),
    });
    const prompt = buildDocAttributionSpotlightPrompt(report, ctx.judgmentAttributions);
    const { value, usage } = await ctx.llm.completeStructured({
        model: CARD_MODEL,
        // System and user parts are sent as one prompt string, separated by a blank line.
        prompt: `${prompt.system}\n\n${prompt.user}`,
        schema: PerCallSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "doc-attribution-spotlight",
        },
    });
    return {
        status: "ready",
        cardType: "doc-attribution-spotlight",
        body: value,
        meta: {
            cardVersion: "doc-attribution-spotlight@0.1.0",
            tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
            generatedAt: new Date().toISOString(),
        },
    };
};
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* failure-mode-summary card — deterministic projection over Report.summary.failureModes.
|
|
3
|
+
*
|
|
4
|
+
* Pure computation, no LLM call. Identifies the dominant failure mode
|
|
5
|
+
* across all dimensions in the report's slim failure-mode summary.
|
|
6
|
+
*
|
|
7
|
+
* D-05: `.refine(buildFailureModeRefinement())` rejects cross-dimension
|
|
8
|
+
* (dimension, failureMode) pairs that the schema otherwise would accept —
|
|
9
|
+
* turning a "Zod-passes, semantically wrong" LLM output into a
|
|
10
|
+
* `parseFailed: true` degraded card. For this deterministic card, the
|
|
11
|
+
* refinement also defends against bad Report data.
|
|
12
|
+
*
|
|
13
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
14
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
15
|
+
*
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-05)
|
|
17
|
+
* @see packages/core/src/services/diagnosis/card-validators.ts
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
21
|
+
/**
 * Declared type of the failure-mode-summary body schema: an object schema
 * with string fields `summary`, `dimension`, `failureMode` and numeric
 * fields `count`, `sampleSize`.
 */
export declare const FailureModeSummaryBodySchema: z.ZodObject<{
|
|
22
|
+
summary: z.ZodString;
|
|
23
|
+
dimension: z.ZodString;
|
|
24
|
+
failureMode: z.ZodString;
|
|
25
|
+
count: z.ZodNumber;
|
|
26
|
+
sampleSize: z.ZodNumber;
|
|
27
|
+
}, z.core.$strip>;
|
|
28
|
+
/** Card generator for the failure-mode-summary card, typed as a `CardGenerator`. */
export declare const generateFailureModeSummary: CardGenerator;
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* failure-mode-summary card — deterministic projection over Report.summary.failureModes.
|
|
3
|
+
*
|
|
4
|
+
* Pure computation, no LLM call. Identifies the dominant failure mode
|
|
5
|
+
* across all dimensions in the report's slim failure-mode summary.
|
|
6
|
+
*
|
|
7
|
+
* D-05: `.refine(buildFailureModeRefinement())` rejects cross-dimension
|
|
8
|
+
* (dimension, failureMode) pairs that the schema otherwise would accept —
|
|
9
|
+
* turning a "Zod-passes, semantically wrong" LLM output into a
|
|
10
|
+
* `parseFailed: true` degraded card. For this deterministic card, the
|
|
11
|
+
* refinement also defends against bad Report data.
|
|
12
|
+
*
|
|
13
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
14
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
15
|
+
*
|
|
16
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-05)
|
|
17
|
+
* @see packages/core/src/services/diagnosis/card-validators.ts
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import { CANONICAL_DIMENSIONS, failureModesForDimension, } from "../../../grader/failure-modes/index.js";
|
|
21
|
+
import { buildFailureModeRefinement } from "../card-validators.js";
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Body schema (D0045 trust boundary — satisfies required; D-05 refine)
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
/**
 * Plain field-level shape of the failure-mode-summary body: a 1 to 800
 * character `summary`, non-empty `dimension` and `failureMode` strings, and
 * non-negative integer `count` and `sampleSize`.
 */
const failureModeSummaryFields = z.object({
    summary: z.string().min(1).max(800),
    dimension: z.string().min(1),
    failureMode: z.string().min(1),
    count: z.number().int().nonnegative(),
    sampleSize: z.number().int().nonnegative(),
});
/**
 * Body schema for the failure-mode-summary card.
 *
 * Adds the predicate returned by `buildFailureModeRefinement()` on top of the
 * field-level shape; a failing predicate is reported at path `failureMode`.
 */
export const FailureModeSummaryBodySchema = failureModeSummaryFields.refine(buildFailureModeRefinement(), {
    message: "failureMode is not in the canonical taxonomy for this dimension",
    path: ["failureMode"],
});
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Private helper — find the dimension a failure mode belongs to
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
/**
 * Look up which canonical dimension a failure mode belongs to.
 *
 * Walks `CANONICAL_DIMENSIONS` in iteration order and yields the first
 * dimension for which `failureModesForDimension(dimension)` includes `mode`.
 * Yields `undefined` when no dimension's taxonomy contains the mode.
 */
function findDimensionForMode(mode) {
    return [...CANONICAL_DIMENSIONS].find((candidate) => failureModesForDimension(candidate).includes(mode));
}
|
|
52
|
+
// ---------------------------------------------------------------------------
|
|
53
|
+
// Generator
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
/**
 * Generate the failure-mode-summary card from `report.summary.failureModes`.
 *
 * No LLM call is made. Returns a `status: "missing"` card when the
 * failure-mode summary is absent, has no `topTitles`, reports zero
 * `totalJudgments`, or when the leading entry's `category` is not found in
 * any canonical dimension. Otherwise returns a `status: "ready"` card whose
 * body has been run through `FailureModeSummaryBodySchema.parse` (which
 * throws when validation fails).
 *
 * @param report - The run report; only `summary.failureModes` is read.
 * @returns A card object with `status` of `"missing"` or `"ready"`.
 */
export const generateFailureModeSummary = async (report) => {
    const cardType = "failure-mode-summary";
    const missing = (reason) => ({ status: "missing", cardType, reason });
    const failureModes = report.summary.failureModes;
    const hasEntries = Boolean(failureModes) &&
        Boolean(failureModes.topTitles) &&
        failureModes.topTitles.length !== 0 &&
        failureModes.totalJudgments !== 0;
    if (!hasEntries) {
        return missing("report has no failure modes");
    }
    // Scan for the highest count instead of relying on input order; the strict
    // `>` comparison keeps the earliest entry when counts tie.
    let leader = failureModes.topTitles[0];
    failureModes.topTitles.forEach((entry) => {
        if (entry.count > leader.count) {
            leader = entry;
        }
    });
    const failureMode = leader.category;
    const dimension = findDimensionForMode(failureMode);
    if (!dimension) {
        return missing(`failure mode "${failureMode}" is not in the canonical taxonomy`);
    }
    const sampleSize = failureModes.totalJudgments;
    // Validate the projected body before handing it out.
    const body = FailureModeSummaryBodySchema.parse({
        summary: `The most frequent failure mode is "${failureMode}" (${leader.count} of ${sampleSize} judgments in dimension "${dimension}").`,
        dimension,
        failureMode,
        count: leader.count,
        sampleSize,
    });
    const meta = {
        cardVersion: "failure-mode-summary@0.1.0",
        generatedAt: new Date().toISOString(),
    };
    return { status: "ready", cardType, body, meta };
};
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Card-generator barrel — exports all 8 generators + DIAGNOSIS_CARD_GENERATORS.
|
|
3
|
+
*
|
|
4
|
+
* This barrel lives in @sanity/ailf-core so the Plan-06 API route can import
|
|
5
|
+
* `DIAGNOSIS_CARD_GENERATORS` via `import { DIAGNOSIS_CARD_GENERATORS } from
|
|
6
|
+
* "@sanity/ailf-core"` without depending on `@sanity/ailf` (D-01 boundary).
|
|
7
|
+
*
|
|
8
|
+
* This is a CARD-GENERATOR barrel only — no vendor SDK classes, no adapter
|
|
9
|
+
* implementations. Cards speak the `LLMClient` port exclusively (D0051).
|
|
10
|
+
*
|
|
11
|
+
* TypeScript exhaustiveness: `CardType` is an 8-element literal union; the
|
|
12
|
+
* `Record<CardType, CardGenerator>` annotation causes a build error if any
|
|
13
|
+
* key is missing or extra.
|
|
14
|
+
*
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-01)
|
|
16
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
17
|
+
*/
|
|
18
|
+
import { generateAreaSummary } from "./area-summary.js";
|
|
19
|
+
import { generateFailureModeSummary } from "./failure-mode-summary.js";
|
|
20
|
+
import { generateNoIssues } from "./no-issues.js";
|
|
21
|
+
import { generateTopRecommendations } from "./top-recommendations.js";
|
|
22
|
+
import { generateWeakestArea } from "./weakest-area.js";
|
|
23
|
+
import { generateLowConfidenceAttribution } from "./low-confidence-attribution.js";
|
|
24
|
+
import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js";
|
|
25
|
+
import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
|
|
26
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
27
|
+
import type { CardType } from "../../../types/diagnosis.js";
|
|
28
|
+
/**
|
|
29
|
+
* The canonical card-generator registry for the diagnosis engine.
|
|
30
|
+
*
|
|
31
|
+
* `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
|
|
32
|
+
* ensures all 8 `CardType` strings appear (no rogue keys, no missing keys).
|
|
33
|
+
* The composition root (`packages/eval/src/composition-root.ts`) passes this
|
|
34
|
+
* directly into `createDiagnosisRunner(deps)`.
|
|
35
|
+
*
|
|
36
|
+
* Also consumed by the Plan-06 API route, which imports via `@sanity/ailf-core`.
|
|
37
|
+
*/
|
|
38
|
+
export declare const DIAGNOSIS_CARD_GENERATORS: Readonly<Record<CardType, CardGenerator>>;
|
|
39
|
+
/** Individual re-exports of the eight generators imported at the top of this file. */
export { generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, };
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Card-generator barrel — exports all 8 generators + DIAGNOSIS_CARD_GENERATORS.
|
|
3
|
+
*
|
|
4
|
+
* This barrel lives in @sanity/ailf-core so the Plan-06 API route can import
|
|
5
|
+
* `DIAGNOSIS_CARD_GENERATORS` via `import { DIAGNOSIS_CARD_GENERATORS } from
|
|
6
|
+
* "@sanity/ailf-core"` without depending on `@sanity/ailf` (D-01 boundary).
|
|
7
|
+
*
|
|
8
|
+
* This is a CARD-GENERATOR barrel only — no vendor SDK classes, no adapter
|
|
9
|
+
* implementations. Cards speak the `LLMClient` port exclusively (D0051).
|
|
10
|
+
*
|
|
11
|
+
* TypeScript exhaustiveness: `CardType` is an 8-element literal union; the
|
|
12
|
+
* `Record<CardType, CardGenerator>` annotation causes a build error if any
|
|
13
|
+
* key is missing or extra.
|
|
14
|
+
*
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-01)
|
|
16
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
17
|
+
*/
|
|
18
|
+
import { generateAreaSummary } from "./area-summary.js";
|
|
19
|
+
import { generateFailureModeSummary } from "./failure-mode-summary.js";
|
|
20
|
+
import { generateNoIssues } from "./no-issues.js";
|
|
21
|
+
import { generateTopRecommendations } from "./top-recommendations.js";
|
|
22
|
+
import { generateWeakestArea } from "./weakest-area.js";
|
|
23
|
+
import { generateLowConfidenceAttribution } from "./low-confidence-attribution.js";
|
|
24
|
+
import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js";
|
|
25
|
+
import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
|
|
26
|
+
// ---------------------------------------------------------------------------
|
|
27
|
+
// DIAGNOSIS_CARD_GENERATORS — full 8-card registry literal
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/**
|
|
30
|
+
* The canonical card-generator registry for the diagnosis engine.
|
|
31
|
+
*
|
|
32
|
+
* `Readonly<Record<CardType, CardGenerator>>` — TypeScript exhaustiveness
|
|
33
|
+
* ensures all 8 `CardType` strings appear (no rogue keys, no missing keys).
|
|
34
|
+
* The composition root (`packages/eval/src/composition-root.ts`) passes this
|
|
35
|
+
* directly into `createDiagnosisRunner(deps)`.
|
|
36
|
+
*
|
|
37
|
+
* Also consumed by the Plan-06 API route, which imports via `@sanity/ailf-core`.
|
|
38
|
+
*/
|
|
39
|
+
// Registry assembled from a [cardType, generator] table; entry order here is
// the key order of the resulting object.
export const DIAGNOSIS_CARD_GENERATORS = Object.fromEntries([
    ["area-summary", generateAreaSummary],
    ["failure-mode-summary", generateFailureModeSummary],
    ["no-issues", generateNoIssues],
    ["top-recommendations", generateTopRecommendations],
    ["weakest-area", generateWeakestArea],
    ["low-confidence-attribution", generateLowConfidenceAttribution],
    ["doc-attribution-spotlight", generateDocAttributionSpotlight],
    ["regression-vs-baseline", generateRegressionVsBaseline],
]);
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Individual re-exports (for callers that want a single generator)
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
export { generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, };
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* low-confidence-attribution card — LLM-driven uncertain-attribution finder.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: low-confidence-attribution@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Landmine 11: this card reads `ctx.judgmentAttributions` (NOT Report.summary).
|
|
8
|
+
* Returns `status: "missing"` when attributions are undefined or empty BEFORE
|
|
9
|
+
* calling the LLM — this is a structural seam, not an error path.
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory.
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
19
|
+
/**
 * Declared type of the low-confidence-attribution body schema: a string
 * `summary` plus an array `judgmentRefs` of objects with string fields
 * `taskId`, `modelId` and `dimension`.
 */
export declare const LowConfidenceAttributionBodySchema: z.ZodObject<{
|
|
20
|
+
summary: z.ZodString;
|
|
21
|
+
judgmentRefs: z.ZodArray<z.ZodObject<{
|
|
22
|
+
taskId: z.ZodString;
|
|
23
|
+
modelId: z.ZodString;
|
|
24
|
+
dimension: z.ZodString;
|
|
25
|
+
}, z.core.$strip>>;
|
|
26
|
+
}, z.core.$strip>;
|
|
27
|
+
/** Card generator for the low-confidence-attribution card, typed as a `CardGenerator`. */
export declare const generateLowConfidenceAttribution: CardGenerator;
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* low-confidence-attribution card — LLM-driven uncertain-attribution finder.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: low-confidence-attribution@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Landmine 11: this card reads `ctx.judgmentAttributions` (NOT Report.summary).
|
|
8
|
+
* Returns `status: "missing"` when attributions are undefined or empty BEFORE
|
|
9
|
+
* calling the LLM — this is a structural seam, not an error path.
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory.
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
15
|
+
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
19
|
+
import { buildLowConfidenceAttributionPrompt } from "../prompt-builders.js";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Factory for the "non-empty string" constraint shared by every ref field.
const nonEmptyString = () => z.string().min(1);
// One judgment reference: task, model, and dimension, all non-empty strings.
const judgmentRefSchema = z.object({
    taskId: nonEmptyString(),
    modelId: nonEmptyString(),
    dimension: nonEmptyString(),
});
/**
 * Runtime schema for the low-confidence-attribution card body.
 *
 * - `summary`: non-empty string, at most 800 characters.
 * - `judgmentRefs`: array with at least one judgment reference.
 */
export const LowConfidenceAttributionBodySchema = z.object({
    summary: z.string().min(1).max(800),
    judgmentRefs: z.array(judgmentRefSchema).min(1),
});
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
// Generator
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Model identifier sent as `model` on this card's structured-completion call
// (claude-sonnet-4-6, the "routine" routing tier named in the file header).
const CARD_MODEL = mkModelId("anthropic:claude-sonnet-4-6");
|
|
37
|
+
/**
 * Generates the `low-confidence-attribution` diagnosis card.
 *
 * Returns a `status: "missing"` result, without calling the LLM, when `ctx`
 * has no LLM client or when `ctx.judgmentAttributions` is absent or empty.
 * Otherwise it requests a structured completion validated against
 * `LowConfidenceAttributionBodySchema` and returns it as a `status: "ready"`
 * card with version, token-usage, and timestamp metadata.
 *
 * @param report - Report forwarded to the prompt builder.
 * @param ctx - Generator context; reads `llm`, `judgmentAttributions`, `runId`.
 * @returns The card result, either `"missing"` (with a reason) or `"ready"`.
 */
export const generateLowConfidenceAttribution = async (report, ctx) => {
    const cardType = "low-confidence-attribution";
    // Shared shape for both short-circuit exits.
    const missing = (reason) => ({ status: "missing", cardType, reason });

    // C1: no LLM → missing
    if (!ctx.llm) {
        return missing("no LLMClient wired");
    }

    // L1: Landmine 11 — bail out BEFORE the LLM call when there is no attribution data
    const attributions = ctx.judgmentAttributions;
    if (!attributions || attributions.length === 0) {
        return missing("no attribution data for this run");
    }

    const { system, user } = buildLowConfidenceAttributionPrompt(report, attributions);
    const request = {
        model: CARD_MODEL,
        // System and user parts are sent as one prompt, separated by a blank line.
        prompt: `${system}\n\n${user}`,
        schema: LowConfidenceAttributionBodySchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "low-confidence-attribution",
        },
    };
    const { value: body, usage } = await ctx.llm.completeStructured(request);

    return {
        status: "ready",
        cardType,
        body,
        meta: {
            cardVersion: "low-confidence-attribution@0.1.0",
            tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
            generatedAt: new Date().toISOString(),
        },
    };
};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* no-issues card — deterministic guard against sycophantic "all is well" reports.
|
|
3
|
+
*
|
|
4
|
+
* Only fires when ALL areas in the report scored at or above
|
|
5
|
+
* `NO_ISSUES_THRESHOLD`. The threshold is calibrated to keep the firing rate
|
|
6
|
+
* ≤30% per AI-SPEC §1b failure-mode #7 (sycophantic no-issues).
|
|
7
|
+
*
|
|
8
|
+
* Pure computation, no LLM call. Uses `report.summary.scores` — each
|
|
9
|
+
* `FeatureScore.totalScore` is the composite (0–100 scale).
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §1b failure-mode #7
|
|
15
|
+
*/
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
18
|
+
/**
|
|
19
|
+
* Threshold at or above which a Report area is considered "no issues" per the
|
|
20
|
+
* team's action threshold (AI-SPEC §1b failure-mode #7). Tuned against
|
|
21
|
+
* the fixture set in no-issues.test.ts to keep firing rate ≤30%.
|
|
22
|
+
*
|
|
23
|
+
* At 85, only reports where every area scores ≥85 trigger this card.
|
|
24
|
+
* On the 10-report fixture spanning 0–99, only reports at [90,92],
|
|
25
|
+
* [95,97], [98,99] qualify — a 3/10 = 30% firing rate (right at the cap).
|
|
26
|
+
*/
|
|
27
|
+
export declare const NO_ISSUES_THRESHOLD = 85; // compared against each area's composite score (0–100 scale)
|
|
28
|
+
/**
 * Static type of the no-issues card body schema: a `summary` string and the
 * numeric `thresholdScore`.
 */
export declare const NoIssuesBodySchema: z.ZodObject<
    {
        summary: z.ZodString;
        thresholdScore: z.ZodNumber;
    },
    z.core.$strip
>;
|
|
32
|
+
/**
 * Deterministic generator for the `no-issues` card (no LLM call).
 *
 * Yields `status: "ready"` only when every area score in the report is at or
 * above `NO_ISSUES_THRESHOLD`; otherwise `status: "missing"`.
 */
export declare const generateNoIssues: CardGenerator;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* no-issues card — deterministic guard against sycophantic "all is well" reports.
|
|
3
|
+
*
|
|
4
|
+
* Only fires when ALL areas in the report scored at or above
|
|
5
|
+
* `NO_ISSUES_THRESHOLD`. The threshold is calibrated to keep the firing rate
|
|
6
|
+
* ≤30% per AI-SPEC §1b failure-mode #7 (sycophantic no-issues).
|
|
7
|
+
*
|
|
8
|
+
* Pure computation, no LLM call. Uses `report.summary.scores` — each
|
|
9
|
+
* `FeatureScore.totalScore` is the composite (0–100 scale).
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §1b failure-mode #7
|
|
15
|
+
*/
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Threshold constant (calibration)
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
/**
|
|
21
|
+
* Threshold at or above which a Report area is considered "no issues" per the
|
|
22
|
+
* team's action threshold (AI-SPEC §1b failure-mode #7). Tuned against
|
|
23
|
+
* the fixture set in no-issues.test.ts to keep firing rate ≤30%.
|
|
24
|
+
*
|
|
25
|
+
* At 85, only reports where every area scores ≥85 trigger this card.
|
|
26
|
+
* On the 10-report fixture spanning 0–99, only reports at [90,92],
|
|
27
|
+
* [95,97], [98,99] qualify — a 3/10 = 30% firing rate (right at the cap).
|
|
28
|
+
*/
|
|
29
|
+
export const NO_ISSUES_THRESHOLD = 85; // compared with `>=` against each area's `totalScore` (0–100 scale)
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Field schemas for the no-issues body, named for readability.
const noIssuesSummaryField = z.string().min(1).max(500);
const noIssuesThresholdField = z.number();
/**
 * Runtime schema for the no-issues card body.
 *
 * - `summary`: non-empty string, at most 500 characters.
 * - `thresholdScore`: the numeric threshold reported with the card.
 */
export const NoIssuesBodySchema = z.object({
    summary: noIssuesSummaryField,
    thresholdScore: noIssuesThresholdField,
});
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Generator
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
/**
 * Generates the deterministic `no-issues` card (pure computation, no LLM call).
 *
 * The card is `"ready"` only when the report has at least one area score and
 * every `totalScore` is at or above `NO_ISSUES_THRESHOLD`. In every other case
 * a `status: "missing"` result is returned with a reason describing why.
 *
 * @param report - Report whose `summary.scores` entries are checked.
 * @returns The card result, either `"missing"` (with a reason) or `"ready"`.
 * @throws If the generated body fails `NoIssuesBodySchema` validation.
 */
export const generateNoIssues = async (report) => {
    const scores = report.summary.scores;
    // No scores at all is a different situation from "some area is below the
    // threshold", so it gets its own reason instead of the below-threshold one.
    if (!scores || scores.length === 0) {
        return {
            status: "missing",
            cardType: "no-issues",
            reason: "no area scores in report",
        };
    }
    const allAboveThreshold = scores.every((s) => s.totalScore >= NO_ISSUES_THRESHOLD);
    if (!allAboveThreshold) {
        return {
            status: "missing",
            cardType: "no-issues",
            reason: "at least one area scored below threshold",
        };
    }
    const summary = `All ${scores.length} areas scored ≥${NO_ISSUES_THRESHOLD} — no action required.`;
    // Validate our own output so the body always matches the published schema.
    const body = NoIssuesBodySchema.parse({
        summary,
        thresholdScore: NO_ISSUES_THRESHOLD,
    });
    return {
        status: "ready",
        cardType: "no-issues",
        body,
        meta: {
            cardVersion: "no-issues@0.1.0",
            generatedAt: new Date().toISOString(),
        },
    };
};
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* regression-vs-baseline card — LLM-driven run comparison card.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: regression-vs-baseline@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* DIAG-05: emits ONLY when `ctx.baseline` is set. When baseline is absent,
|
|
8
|
+
* returns `status: "missing", reason: "no --compare baseline supplied"`.
|
|
9
|
+
*
|
|
10
|
+
* Mitigations:
|
|
11
|
+
* - failure-mode #1: `buildRegressionVsBaselinePrompt` computes deltas in JS
|
|
12
|
+
* BEFORE the LLM call; schema refine asserts sign-consistency (R3)
|
|
13
|
+
*
|
|
14
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
15
|
+
* mandatory.
|
|
16
|
+
*
|
|
17
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
21
|
+
/**
 * Static type of the regression-vs-baseline card body schema.
 *
 * This is the module-level static shape only. A per-call refinement adds the
 * sign-consistency check (R3):
 * `Math.sign(pointsDelta) === directionSign(direction)`.
 *
 * The body carries a `summary`, one `deltas` entry per area (direction,
 * point delta, and driver strings), and an `overallTrend`.
 */
export declare const RegressionVsBaselineBodySchema: z.ZodObject<
    {
        summary: z.ZodString;
        deltas: z.ZodArray<
            z.ZodObject<
                {
                    area: z.ZodString;
                    direction: z.ZodEnum<{
                        improved: "improved";
                        regressed: "regressed";
                        unchanged: "unchanged";
                    }>;
                    pointsDelta: z.ZodNumber;
                    drivers: z.ZodArray<z.ZodString>;
                },
                z.core.$strip
            >
        >;
        overallTrend: z.ZodEnum<{
            "net-improved": "net-improved";
            "net-regressed": "net-regressed";
            mixed: "mixed";
            stable: "stable";
        }>;
    },
    z.core.$strip
>;
|
|
44
|
+
/**
 * Generator for the `regression-vs-baseline` card.
 *
 * Per the module header, the card is emitted only when `ctx.baseline` is set;
 * without a baseline the result is `status: "missing"`.
 * NOTE(review): implementation is not visible here — confirm against the .js file.
 */
export declare const generateRegressionVsBaseline: CardGenerator;
|