@sanity/ailf 4.6.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/canonical/grader-references/agent-harness-tools.yaml +42 -0
- package/canonical/grader-references/knowledge-probe-recall.yaml +36 -0
- package/canonical/grader-references/mcp-server-spec.yaml +51 -0
- package/canonical/grader-references/portable-text.yaml +48 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/config/rubrics.ts +38 -2
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +60 -2
- package/dist/_vendor/ailf-core/artifact-registry.js +288 -7
- package/dist/_vendor/ailf-core/examples/index.d.ts +125 -26
- package/dist/_vendor/ailf-core/examples/index.js +146 -47
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +8 -0
- package/dist/_vendor/ailf-core/ports/mode-handler.d.ts +15 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.d.ts +40 -0
- package/dist/_vendor/ailf-core/schemas/branded-string.js +45 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.d.ts +36 -0
- package/dist/_vendor/ailf-core/schemas/confidence-schema.js +32 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -4
- package/dist/_vendor/ailf-core/schemas/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/schemas/index.js +9 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline-request.js +1 -0
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +34 -8
- package/dist/_vendor/ailf-core/schemas/pipeline.js +23 -1
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +50 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +35 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +136 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +153 -0
- package/dist/_vendor/ailf-core/services/index.d.ts +6 -0
- package/dist/_vendor/ailf-core/services/index.js +18 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/services/report-to-markdown.js +3 -2
- package/dist/_vendor/ailf-core/types/attribution.d.ts +82 -0
- package/dist/_vendor/ailf-core/types/attribution.js +18 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +26 -1
- package/dist/_vendor/ailf-core/types/branded-ids.js +80 -4
- package/dist/_vendor/ailf-core/types/confidence.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/confidence.js +7 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +271 -0
- package/dist/_vendor/ailf-core/types/diagnosis.js +19 -0
- package/dist/_vendor/ailf-core/types/generalized-task.d.ts +16 -1
- package/dist/_vendor/ailf-core/types/grader-judgment.d.ts +125 -0
- package/dist/_vendor/ailf-core/types/grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +80 -29
- package/dist/_vendor/ailf-core/types/index.js +15 -1
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.d.ts +55 -0
- package/dist/_vendor/ailf-core/types/legacy-grader-judgment.js +30 -0
- package/dist/_vendor/ailf-core/types/pipeline-request.d.ts +1 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +8 -0
- package/dist/_vendor/ailf-shared/document-ref.d.ts +1 -1
- package/dist/adapters/api-client/build-request.d.ts +1 -0
- package/dist/adapters/api-client/build-request.js +3 -0
- package/dist/adapters/attribution/attribution-meta-writer.d.ts +35 -0
- package/dist/adapters/attribution/attribution-meta-writer.js +34 -0
- package/dist/adapters/attribution/index.d.ts +9 -0
- package/dist/adapters/attribution/index.js +8 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.d.ts +56 -0
- package/dist/adapters/attribution/per-entry-attribution-writer.js +49 -0
- package/dist/adapters/config-sources/file-config-adapter.js +1 -0
- package/dist/adapters/grader-outputs/index.d.ts +10 -0
- package/dist/adapters/grader-outputs/index.js +8 -0
- package/dist/adapters/grader-outputs/legacy/index.d.ts +11 -0
- package/dist/adapters/grader-outputs/legacy/index.js +10 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.d.ts +49 -0
- package/dist/adapters/grader-outputs/legacy/promptfoo-grader-output-legacy.js +48 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.d.ts +102 -0
- package/dist/adapters/grader-outputs/promptfoo-grader-output.js +93 -0
- package/dist/adapters/index.d.ts +3 -0
- package/dist/adapters/index.js +4 -0
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/openai-llm-client.js +52 -3
- package/dist/adapters/task-sources/content-lake-task-source.d.ts +5 -1
- package/dist/adapters/task-sources/content-lake-task-source.js +28 -2
- package/dist/adapters/task-sources/repo-schemas.d.ts +79 -11
- package/dist/adapters/task-sources/repo-schemas.js +19 -2
- package/dist/cli-program.js +3 -0
- package/dist/commands/calculate-scores.js +1 -1
- package/dist/commands/explain-handler.js +1 -1
- package/dist/commands/interpret.d.ts +50 -0
- package/dist/commands/interpret.js +212 -0
- package/dist/commands/lookup-doc.d.ts +1 -1
- package/dist/commands/lookup-doc.js +3 -3
- package/dist/commands/pipeline-action.d.ts +6 -0
- package/dist/commands/pipeline-action.js +2 -0
- package/dist/commands/remote-pipeline.js +1 -0
- package/dist/composition-root.d.ts +57 -23
- package/dist/composition-root.js +155 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/config/rubrics.ts +38 -2
- package/dist/grader/agent-harness.d.ts +9 -0
- package/dist/grader/agent-harness.js +9 -0
- package/dist/grader/common.d.ts +9 -0
- package/dist/grader/common.js +9 -0
- package/dist/grader/index.d.ts +24 -0
- package/dist/grader/index.js +24 -0
- package/dist/grader/knowledge-probe.d.ts +9 -0
- package/dist/grader/knowledge-probe.js +9 -0
- package/dist/grader/literacy.d.ts +9 -0
- package/dist/grader/literacy.js +9 -0
- package/dist/grader/mcp.d.ts +9 -0
- package/dist/grader/mcp.js +9 -0
- package/dist/orchestration/build-app-context.js +1 -0
- package/dist/orchestration/build-step-sequence.js +5 -0
- package/dist/orchestration/steps/calculate-scores-step.js +23 -1
- package/dist/orchestration/steps/compute-attribution-step.d.ts +44 -0
- package/dist/orchestration/steps/compute-attribution-step.js +279 -0
- package/dist/orchestration/steps/gap-analysis-step.js +35 -7
- package/dist/orchestration/steps/index.d.ts +1 -0
- package/dist/orchestration/steps/index.js +1 -0
- package/dist/pipeline/attribution.d.ts +15 -0
- package/dist/pipeline/attribution.js +18 -9
- package/dist/pipeline/borderline-consensus-runner.d.ts +63 -0
- package/dist/pipeline/borderline-consensus-runner.js +124 -0
- package/dist/pipeline/borderline-detector.d.ts +24 -0
- package/dist/pipeline/borderline-detector.js +26 -0
- package/dist/pipeline/calculate-scores.d.ts +114 -3
- package/dist/pipeline/calculate-scores.js +426 -24
- package/dist/pipeline/compiler/literacy-bridge.d.ts +1 -1
- package/dist/pipeline/compiler/literacy-bridge.js +35 -17
- package/dist/pipeline/compiler/rubric-resolution.d.ts +15 -0
- package/dist/pipeline/compiler/rubric-resolution.js +9 -1
- package/dist/pipeline/compute-attribution.d.ts +80 -0
- package/dist/pipeline/compute-attribution.js +196 -0
- package/dist/pipeline/failure-modes.d.ts +52 -17
- package/dist/pipeline/failure-modes.js +178 -117
- package/dist/pipeline/map-request-to-config.js +1 -0
- package/package.json +7 -5
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* regression-vs-baseline card — LLM-driven run comparison card.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: regression-vs-baseline@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* DIAG-05: emits ONLY when `ctx.baseline` is set. When baseline is absent,
|
|
8
|
+
* returns `status: "missing", reason: "no --compare baseline supplied"`.
|
|
9
|
+
*
|
|
10
|
+
* Mitigations:
|
|
11
|
+
* - failure-mode #1: `buildRegressionVsBaselinePrompt` computes deltas in JS
|
|
12
|
+
* BEFORE the LLM call; schema refine asserts sign-consistency (R3)
|
|
13
|
+
*
|
|
14
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
15
|
+
* mandatory.
|
|
16
|
+
*
|
|
17
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
|
|
18
|
+
*/
|
|
19
|
+
import { z } from "zod";
|
|
20
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
21
|
+
import { buildRegressionVsBaselinePrompt } from "../prompt-builders.js";
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// Helper: direction sign check
|
|
24
|
+
// ---------------------------------------------------------------------------
|
|
25
|
+
/**
 * Translate a delta direction label into the numeric sign it implies.
 *
 * @param direction - direction label; only `"improved"` and `"regressed"`
 *   produce a non-zero result.
 * @returns `1` for `"improved"`, `-1` for `"regressed"`, and `0` for every
 *   other value (including `"unchanged"`).
 */
function directionSign(direction) {
    switch (direction) {
        case "improved":
            return 1;
        case "regressed":
            return -1;
        default:
            // "unchanged" and any unrecognised label carry no sign.
            return 0;
    }
}
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
/**
 * Static shape of the regression-vs-baseline card body.
 *
 * Only structural constraints are expressed at module level. The generator
 * builds its own per-call schema that additionally enforces R3
 * sign-consistency: Math.sign(pointsDelta) === directionSign(direction).
 */
// One row of the comparison: an area, its labelled direction, the point
// delta, and the free-text drivers behind the change.
const regressionDeltaRow = z.object({
    area: z.string().min(1),
    direction: z.enum(["improved", "regressed", "unchanged"]),
    pointsDelta: z.number(),
    drivers: z.array(z.string()),
});
export const RegressionVsBaselineBodySchema = z.object({
    summary: z.string().min(1).max(800),
    // At most 10 rows are accepted.
    deltas: z.array(regressionDeltaRow).max(10),
    overallTrend: z.enum(["net-improved", "net-regressed", "mixed", "stable"]),
});
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// Generator
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
const CARD_MODEL = mkModelId("anthropic:claude-opus-4-6");
/**
 * Generate the regression-vs-baseline diagnosis card.
 *
 * No model call is made, and a `status: "missing"` card is returned, when
 * either no LLM client is wired (`ctx.llm`) or no baseline report was
 * supplied (`ctx.baseline`, DIAG-05). Otherwise the prompt builder computes
 * the per-area deltas, the model is asked for a structured body, and that
 * body is validated by a schema built for this call.
 *
 * Per-call validation layers two refinements on the module-level shape:
 *  - R3: each row's `direction` label must match the sign of its own
 *    `pointsDelta`.
 *  - Side-channel check: for a row whose `area` exactly matches an area in
 *    the precomputed `deltas`, the reported `pointsDelta` must not have the
 *    opposite sign of the precomputed one. Rows for unknown areas, and rows
 *    where either value is zero, are left to R3 alone so that rounding by
 *    the model does not fail the card.
 *
 * @param report - current run report, forwarded to the prompt builder.
 * @param ctx - generator context; reads `llm`, `baseline` and `runId`.
 * @returns a `missing` card with a `reason`, or a `ready` card carrying the
 *   validated body plus card version, token usage and generation timestamp.
 */
export const generateRegressionVsBaseline = async (report, ctx) => {
    // C1: no LLM → missing
    if (!ctx.llm) {
        return {
            status: "missing",
            cardType: "regression-vs-baseline",
            reason: "no LLMClient wired",
        };
    }
    // R1: DIAG-05 no-auto-comparison — only emits when baseline is supplied
    if (!ctx.baseline) {
        return {
            status: "missing",
            cardType: "regression-vs-baseline",
            reason: "no --compare baseline supplied",
        };
    }
    // Deltas are computed in JS BEFORE the LLM call (failure-mode #1
    // mitigation); the prompt embeds them and `deltas` carries the same
    // numbers back so the response can be checked against them.
    const { system, user, deltas } = buildRegressionVsBaselinePrompt(report, ctx.baseline);
    // Sign of each precomputed delta, keyed by area name. `?? []` keeps the
    // generator working with a builder that returns no side-channel.
    const expectedSignByArea = new Map((deltas ?? []).map((d) => [d.area, Math.sign(d.pointsDelta)]));
    // Per-call schema (AI-SPEC §3 Pitfall 1 — built inside the generator,
    // never hoisted to module scope). `.refine()` returns a new schema, so
    // the exported module-level shape is not modified.
    const PerCallSchema = RegressionVsBaselineBodySchema
        // R3: sign-consistency guardrail — LLM must not invert direction vs delta
        .refine((body) => body.deltas.every((d) => Math.sign(d.pointsDelta) === directionSign(d.direction)), {
            message: "direction label must match sign of pointsDelta (improved=positive, regressed=negative, unchanged=zero)",
            path: ["deltas"],
        })
        // Reject a reported delta that flips the sign of the precomputed one.
        .refine((body) => body.deltas.every((d) => {
            const expectedSign = expectedSignByArea.get(d.area);
            if (expectedSign === undefined)
                return true;
            // A negative product means one value is positive and the other
            // negative; zero or NaN on either side never trips the check.
            return !(expectedSign * Math.sign(d.pointsDelta) < 0);
        }), {
            message: "pointsDelta sign contradicts the delta precomputed from the report and baseline for that area",
            path: ["deltas"],
        });
    const { value, usage } = await ctx.llm.completeStructured({
        model: CARD_MODEL,
        prompt: `${system}\n\n${user}`,
        schema: PerCallSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "regression-vs-baseline",
        },
    });
    return {
        status: "ready",
        cardType: "regression-vs-baseline",
        body: value,
        meta: {
            cardVersion: "regression-vs-baseline@0.1.0",
            tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
            generatedAt: new Date().toISOString(),
        },
    };
};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* top-recommendations card — LLM-driven actionable suggestion generator.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: top-recommendations@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #2: few-shot examples in system prompt; body ≥40 chars
|
|
9
|
+
* - failure-mode #5: docSlug refined against report manifest allow-list
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
19
|
+
/**
|
|
20
|
+
* Module-level schema: static shape check only.
|
|
21
|
+
* Per-call: an additive `.refine()` over the allow-list (built inside the
|
|
22
|
+
* generator so it closes over the runtime `report`).
|
|
23
|
+
*
|
|
24
|
+
* AI-SPEC §3 Pitfall 1: per-call schemas are LOCAL to the generator,
|
|
25
|
+
* NOT at module scope.
|
|
26
|
+
*/
|
|
27
|
+
// Type of the static body schema. The length/format limits mentioned below
// come from the runtime definition in top-recommendations.js; this
// declaration itself only records the field types.
export declare const TopRecommendationsBodySchema: z.ZodObject<{
|
|
28
|
+
// Overall summary text (runtime: 1–500 characters).
summary: z.ZodString;
|
|
29
|
+
// List of suggestions (runtime: between 1 and 5 entries).
suggestions: z.ZodArray<z.ZodObject<{
|
|
30
|
+
// Suggestion title (runtime: 1–200 characters).
title: z.ZodString;
|
|
31
|
+
// Suggestion text (runtime: at least 40 characters and at least one
// backtick-delimited span).
body: z.ZodString;
|
|
32
|
+
// Priority label: one of "low", "medium", "high".
priority: z.ZodEnum<{
|
|
33
|
+
low: "low";
|
|
34
|
+
medium: "medium";
|
|
35
|
+
high: "high";
|
|
36
|
+
}>;
|
|
37
|
+
// Non-empty document slug; the generator's per-call schema additionally
// requires it to be a member of the report's doc-slug allow-list.
docSlug: z.ZodString;
|
|
38
|
+
// Section heading, or null when none applies.
sectionHeading: z.ZodNullable<z.ZodString>;
|
|
39
|
+
}, z.core.$strip>>;
|
|
40
|
+
}, z.core.$strip>;
|
|
41
|
+
/**
 * Card generator for the top-recommendations card. Per the implementation in
 * top-recommendations.js it resolves to a `status: "missing"` card when
 * `ctx.llm` is absent, and otherwise to a `status: "ready"` card whose body
 * comes from `ctx.llm.completeStructured`.
 */
export declare const generateTopRecommendations: CardGenerator;
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* top-recommendations card — LLM-driven actionable suggestion generator.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: top-recommendations@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #2: few-shot examples in system prompt; body ≥40 chars
|
|
9
|
+
* - failure-mode #5: docSlug refined against report manifest allow-list
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
19
|
+
import { buildTopRecommendationsPrompt, buildDocSlugAllowList, } from "../prompt-builders.js";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
/**
 * Static body shape for the top-recommendations card.
 *
 * This module-level schema checks structure only. The docSlug allow-list
 * check depends on the runtime `report`, so the generator builds a separate
 * per-call schema for it (AI-SPEC §3 Pitfall 1: per-call schemas stay LOCAL
 * to the generator, never at module scope).
 */
// True when the text contains at least one non-empty backtick-delimited span.
const containsBacktickArtifact = (text) => /`[^`]+`/.test(text);
// A single suggestion: title, body, priority and the document it targets.
const topRecommendationSuggestion = z.object({
    title: z.string().min(1).max(200),
    body: z
        .string()
        .min(40, "suggestion.body must be ≥40 characters")
        .refine(containsBacktickArtifact, "suggestion.body must contain at least one backtick-delimited artifact"),
    priority: z.enum(["high", "medium", "low"]),
    docSlug: z.string().min(1),
    sectionHeading: z.string().nullable(),
});
export const TopRecommendationsBodySchema = z.object({
    summary: z.string().min(1).max(500),
    // Between one and five suggestions are accepted.
    suggestions: z.array(topRecommendationSuggestion).min(1).max(5),
});
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
// Generator
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
const CARD_MODEL = mkModelId("anthropic:claude-opus-4-6");
/**
 * Generate the top-recommendations diagnosis card.
 *
 * Returns a `status: "missing"` card without calling the model when:
 *  - no LLM client is wired (`ctx.llm`), or
 *  - the doc-slug allow-list built from `report` is empty. The per-call
 *    schema requires at least one suggestion and every suggestion's
 *    `docSlug` must be in the allow-list, so with an empty list no response
 *    could ever validate and the call would be wasted.
 *
 * Otherwise the model is asked for a structured body that is validated by a
 * schema built for this call, which closes over the allow-list.
 *
 * @param report - current run report; used for the allow-list and the prompt.
 * @param ctx - generator context; reads `llm` and `runId`.
 * @returns a `missing` card with a `reason`, or a `ready` card carrying the
 *   validated body plus card version, token usage and generation timestamp.
 */
export const generateTopRecommendations = async (report, ctx) => {
    // C1: no LLM → missing
    if (!ctx.llm) {
        return {
            status: "missing",
            cardType: "top-recommendations",
            reason: "no LLMClient wired",
        };
    }
    // Build allow-list from the runtime report
    const allowList = buildDocSlugAllowList(report);
    // Nothing to recommend against: every possible docSlug would be rejected.
    if (allowList.size === 0) {
        return {
            status: "missing",
            cardType: "top-recommendations",
            reason: "report has no document slugs to recommend against",
        };
    }
    // Per-call schema: additive docSlug allow-list refine (AI-SPEC §3 Pitfall 1)
    const PerCallSchema = z.object({
        summary: z.string().min(1).max(500),
        suggestions: z
            .array(z.object({
            title: z.string().min(1).max(200),
            body: z
                .string()
                .min(40, "suggestion.body must be ≥40 characters")
                .refine((b) => /`[^`]+`/.test(b), "suggestion.body must contain at least one backtick-delimited artifact"),
            priority: z.enum(["high", "medium", "low"]),
            // failure-mode #5: the slug must be one the report actually knows.
            docSlug: z
                .string()
                .min(1)
                .refine((slug) => allowList.has(slug), {
                message: "suggestion.docSlug is not in the report document manifest allow-list",
            }),
            sectionHeading: z.string().nullable(),
        }))
            .min(1)
            .max(5),
    });
    const prompt = buildTopRecommendationsPrompt(report, allowList);
    const { value, usage } = await ctx.llm.completeStructured({
        model: CARD_MODEL,
        prompt: `${prompt.system}\n\n${prompt.user}`,
        schema: PerCallSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "top-recommendations",
        },
    });
    return {
        status: "ready",
        cardType: "top-recommendations",
        body: value,
        meta: {
            cardVersion: "top-recommendations@0.1.0",
            tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
            generatedAt: new Date().toISOString(),
        },
    };
};
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* weakest-area card — LLM-driven weakest area identification.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: weakest-area@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #3: sampleSize = report.areas[area].judgmentCount (per-call refine W2)
|
|
9
|
+
* - failure-mode #3: sampleSize < 10 → confidence.level must be "low" (per-call refine W3)
|
|
10
|
+
* - failure-mode #4: taxonomy drift → buildFailureModeRefinement() Zod predicate
|
|
11
|
+
*
|
|
12
|
+
* Per-call schemas live INSIDE the generator (AI-SPEC §3 Pitfall 1 — no
|
|
13
|
+
* module-scope mutables that change across calls).
|
|
14
|
+
*
|
|
15
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
16
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
17
|
+
*
|
|
18
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
19
|
+
*/
|
|
20
|
+
import { z } from "zod";
|
|
21
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
22
|
+
/**
|
|
23
|
+
* Module-level schema asserts the static shape + taxonomy constraint (D-05).
|
|
24
|
+
* Per-call schemas add the sampleSize (W2) and small-sample confidence (W3)
|
|
25
|
+
* refinements inside the generator, since they need runtime `report` data.
|
|
26
|
+
*/
|
|
27
|
+
// Type of the module-level body schema. The limits mentioned below come from
// the runtime definition in weakest-area.js; this declaration itself only
// records the field types.
export declare const WeakestAreaBodySchema: z.ZodObject<{
|
|
28
|
+
// Summary text (runtime: 1–800 characters).
summary: z.ZodString;
|
|
29
|
+
// Name of the area (runtime: non-empty).
area: z.ZodString;
|
|
30
|
+
// Dimension name (runtime: non-empty).
dimension: z.ZodString;
|
|
31
|
+
// Failure-mode label (runtime: non-empty and checked by the
// buildFailureModeRefinement() predicate).
failureMode: z.ZodString;
|
|
32
|
+
// Sample size (runtime: non-negative integer).
sampleSize: z.ZodNumber;
|
|
33
|
+
// Confidence block; at runtime this is the shared ConfidenceSchema.
confidence: z.ZodObject<{
|
|
34
|
+
// Confidence level: one of "low", "medium", "high".
level: z.ZodEnum<{
|
|
35
|
+
low: "low";
|
|
36
|
+
medium: "medium";
|
|
37
|
+
high: "high";
|
|
38
|
+
}>;
|
|
39
|
+
signalsPresent: z.ZodNumber;
|
|
40
|
+
derivation: z.ZodString;
|
|
41
|
+
}, z.core.$strip>;
|
|
42
|
+
}, z.core.$strip>;
|
|
43
|
+
/**
 * Card generator for the weakest-area card. Per the implementation in
 * weakest-area.js it resolves to a `status: "missing"` card when `ctx.llm`
 * is absent or `report.summary.scores` is missing/empty, and otherwise to a
 * `status: "ready"` card whose body comes from `ctx.llm.completeStructured`.
 */
export declare const generateWeakestArea: CardGenerator;
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* weakest-area card — LLM-driven weakest area identification.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: weakest-area@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #3: sampleSize = report.areas[area].judgmentCount (per-call refine W2)
|
|
9
|
+
* - failure-mode #3: sampleSize < 10 → confidence.level must be "low" (per-call refine W3)
|
|
10
|
+
* - failure-mode #4: taxonomy drift → buildFailureModeRefinement() Zod predicate
|
|
11
|
+
*
|
|
12
|
+
* Per-call schemas live INSIDE the generator (AI-SPEC §3 Pitfall 1 — no
|
|
13
|
+
* module-scope mutables that change across calls).
|
|
14
|
+
*
|
|
15
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
16
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
17
|
+
*
|
|
18
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
19
|
+
*/
|
|
20
|
+
import { z } from "zod";
|
|
21
|
+
import { ConfidenceSchema } from "../../../schemas/confidence-schema.js";
|
|
22
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
23
|
+
import { buildFailureModeRefinement } from "../card-validators.js";
|
|
24
|
+
import { buildWeakestAreaPrompt } from "../prompt-builders.js";
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Module-level schema: static shape + D-05 taxonomy refine
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
/**
 * Module-level body schema for the weakest-area card: the static field shape
 * plus the failure-mode taxonomy constraint (D-05).
 *
 * The generator builds its own per-call schema on top of the same fields to
 * add the small-sample confidence rule (W3).
 */
// Plain field shape, before any cross-field refinement is attached.
const weakestAreaFields = z.object({
    summary: z.string().min(1).max(800),
    area: z.string().min(1),
    dimension: z.string().min(1),
    failureMode: z.string().min(1),
    sampleSize: z.number().int().nonnegative(),
    confidence: ConfidenceSchema,
});
export const WeakestAreaBodySchema = weakestAreaFields.refine(buildFailureModeRefinement(), {
    message: "failureMode is not in the canonical taxonomy for this dimension",
    path: ["failureMode"],
});
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
// Generator
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
const CARD_MODEL = mkModelId("anthropic:claude-sonnet-4-6");
/**
 * Generate the weakest-area diagnosis card.
 *
 * Resolves to a `status: "missing"` card, without calling the model, when no
 * LLM client is wired (`ctx.llm`) or when `report.summary.scores` is absent
 * or empty. Otherwise the model is asked for a structured body validated by
 * a schema built for this call (AI-SPEC §3 Pitfall 1 — never hoisted to
 * module scope).
 *
 * The per-call schema applies the failure-mode taxonomy refinement and the
 * W3 rule (sampleSize < 10 forces confidence.level "low").
 *
 * NOTE(review): no refinement in this function ties `sampleSize` to a value
 * taken from `report` (the W2 mitigation named in the file header) — confirm
 * whether that is enforced elsewhere.
 *
 * @param report - current run report; read for `summary.scores` and the prompt.
 * @param ctx - generator context; reads `llm` and `runId`.
 * @returns a `missing` card with a `reason`, or a `ready` card carrying the
 *   validated body plus card version, token usage and generation timestamp.
 */
export const generateWeakestArea = async (report, ctx) => {
    // Shared factory for the two short-circuit results below.
    const missingCard = (reason) => ({
        status: "missing",
        cardType: "weakest-area",
        reason,
    });
    // C1: no LLM → missing
    if (!ctx.llm) {
        return missingCard("no LLMClient wired");
    }
    const areaScores = report.summary.scores ?? [];
    if (areaScores.length === 0) {
        return missingCard("report has no areas");
    }
    const bodyFields = z.object({
        summary: z.string().min(1).max(800),
        area: z.string().min(1),
        dimension: z.string().min(1),
        failureMode: z.string().min(1),
        sampleSize: z.number().int().nonnegative(),
        confidence: ConfidenceSchema,
    });
    // W3: a sample below 10 is only acceptable with confidence.level "low".
    const smallSampleIsLowConfidence = (body) => !(body.sampleSize < 10) || body.confidence.level === "low";
    const PerCallSchema = bodyFields
        .refine(buildFailureModeRefinement(), {
        message: "failureMode is not in the canonical taxonomy for this dimension",
        path: ["failureMode"],
    })
        .refine(smallSampleIsLowConfidence, {
        message: 'When sampleSize < 10, confidence.level must be "low"',
        path: ["confidence", "level"],
    });
    const prompt = buildWeakestAreaPrompt(report);
    const request = {
        model: CARD_MODEL,
        prompt: `${prompt.system}\n\n${prompt.user}`,
        schema: PerCallSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "weakest-area",
        },
    };
    const { value: cardBody, usage: tokenCounts } = await ctx.llm.completeStructured(request);
    return {
        status: "ready",
        cardType: "weakest-area",
        body: cardBody,
        meta: {
            cardVersion: "weakest-area@0.1.0",
            tokenUsage: { input: tokenCounts.promptTokens, output: tokenCounts.completionTokens },
            generatedAt: new Date().toISOString(),
        },
    };
};
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic prompt-builder helpers for the 5 LLM diagnosis cards.
|
|
3
|
+
*
|
|
4
|
+
* Each function takes typed inputs and returns `{ system, user }` strings
|
|
5
|
+
* (+ a `deltas` side-channel for regression-vs-baseline). The same inputs
|
|
6
|
+
* always produce the same output — no randomness, no side effects.
|
|
7
|
+
*
|
|
8
|
+
* Per AI-SPEC §4: input tokens are bounded by truncation; the user message
|
|
9
|
+
* projects only the fields each card needs.
|
|
10
|
+
*
|
|
11
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
|
|
13
|
+
*/
|
|
14
|
+
import type { JudgmentAttribution } from "../../types/attribution.js";
|
|
15
|
+
import type { Report } from "../../types/index.js";
|
|
16
|
+
/**
|
|
17
|
+
* Build the allow-list of doc slugs for a report. This is the union of:
|
|
18
|
+
* - report.summary.documentManifest[].slug
|
|
19
|
+
* - per-score documents[].slug
|
|
20
|
+
*
|
|
21
|
+
* Only slugs (not documentIds) appear in the allow-list because the cards
|
|
22
|
+
* reference human-readable slugs. DocumentId-only entries are skipped.
|
|
23
|
+
*/
|
|
24
|
+
// The returned set is queried with `.has(slug)` by the top-recommendations
// per-call docSlug refinement and is also passed to
// buildTopRecommendationsPrompt.
export declare function buildDocSlugAllowList(report: Report): Set<string>;
|
|
25
|
+
/**
|
|
26
|
+
* Projects the 3 weakest areas + top failure modes + doc-slug allow-list
|
|
27
|
+
* into the user message (~2500 input tokens).
|
|
28
|
+
*/
|
|
29
|
+
// `allowList` is the doc-slug set produced by buildDocSlugAllowList(report).
// The top-recommendations generator joins the two returned strings as
// `${system}\n\n${user}` to form the single prompt it sends.
export declare function buildTopRecommendationsPrompt(report: Report, allowList: Set<string>): {
|
|
30
|
+
// System-prompt text.
system: string;
|
|
31
|
+
// User-message text.
user: string;
|
|
32
|
+
};
|
|
33
|
+
/**
|
|
34
|
+
* Projects the single weakest area + full failure-mode breakdown.
|
|
35
|
+
*/
|
|
36
|
+
// The weakest-area generator joins the two returned strings as
// `${system}\n\n${user}` to form the single prompt it sends.
export declare function buildWeakestAreaPrompt(report: Report): {
|
|
37
|
+
// System-prompt text.
system: string;
|
|
38
|
+
// User-message text.
user: string;
|
|
39
|
+
};
|
|
40
|
+
/**
|
|
41
|
+
* Filters to low-confidence entries (or top-N if none low), produces a table
|
|
42
|
+
* for the LLM (~2000 input tokens).
|
|
43
|
+
*
|
|
44
|
+
* Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
|
|
45
|
+
*/
|
|
46
|
+
// Returns the prompt as a `{ system, user }` pair of strings.
// NOTE(review): the consuming generator is not visible here; presumably it
// joins the pair the same way the other card generators do — confirm.
export declare function buildLowConfidenceAttributionPrompt(report: Report, judgmentAttributions: JudgmentAttribution[]): {
|
|
47
|
+
// System-prompt text.
system: string;
|
|
48
|
+
// User-message text.
user: string;
|
|
49
|
+
};
|
|
50
|
+
/**
|
|
51
|
+
* Aggregates attributions by documentId, picks top-5 by aggregate score,
|
|
52
|
+
* emits a table for the LLM.
|
|
53
|
+
*
|
|
54
|
+
* Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
|
|
55
|
+
*/
|
|
56
|
+
// Returns the prompt as a `{ system, user }` pair of strings.
// NOTE(review): the consuming generator is not visible here; presumably it
// joins the pair the same way the other card generators do — confirm.
export declare function buildDocAttributionSpotlightPrompt(report: Report, judgmentAttributions: JudgmentAttribution[]): {
|
|
57
|
+
// System-prompt text.
system: string;
|
|
58
|
+
// User-message text.
user: string;
|
|
59
|
+
};
|
|
60
|
+
/**
|
|
61
|
+
* Computes per-area deltas in JS (failure-mode #1 mitigation), then builds
|
|
62
|
+
* the user message. Returns `deltas` as a side-channel for the per-call
|
|
63
|
+
* schema `.refine()`.
|
|
64
|
+
*/
|
|
65
|
+
// `report` is the current run and `baseline` the run it is compared against.
// The regression-vs-baseline generator joins `system` and `user` as
// `${system}\n\n${user}` to form the single prompt it sends.
export declare function buildRegressionVsBaselinePrompt(report: Report, baseline: Report): {
|
|
66
|
+
// System-prompt text.
system: string;
|
|
67
|
+
// User-message text.
user: string;
|
|
68
|
+
// Side-channel list of the computed deltas, one { area, pointsDelta } pair
// per entry.
deltas: {
|
|
69
|
+
// Area name the delta belongs to.
area: string;
|
|
70
|
+
// Computed point delta for that area.
// NOTE(review): presumably current minus baseline, so positive means
// improved — confirm against the builder implementation.
pointsDelta: number;
|
|
71
|
+
}[];
|
|
72
|
+
};
|