@sanity/ailf 5.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +59 -5
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +70 -0
- package/dist/commands/interpret.js +221 -0
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +2 -2
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* top-recommendations card — LLM-driven actionable suggestion generator.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: top-recommendations@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #2: few-shot examples in system prompt; body ≥40 chars
|
|
9
|
+
* - failure-mode #5: docSlug refined against report manifest allow-list
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
19
|
+
/**
|
|
20
|
+
* Module-level schema: static shape check only.
|
|
21
|
+
* Per-call: an additive `.refine()` over the allow-list (built inside the
|
|
22
|
+
* generator so it closes over the runtime `report`).
|
|
23
|
+
*
|
|
24
|
+
* AI-SPEC §3 Pitfall 1: per-call schemas are LOCAL to the generator,
|
|
25
|
+
* NOT at module scope.
|
|
26
|
+
*/
|
|
27
|
+
export declare const TopRecommendationsBodySchema: z.ZodObject<{
|
|
28
|
+
summary: z.ZodString;
|
|
29
|
+
suggestions: z.ZodArray<z.ZodObject<{
|
|
30
|
+
title: z.ZodString;
|
|
31
|
+
body: z.ZodString;
|
|
32
|
+
priority: z.ZodEnum<{
|
|
33
|
+
low: "low";
|
|
34
|
+
medium: "medium";
|
|
35
|
+
high: "high";
|
|
36
|
+
}>;
|
|
37
|
+
docSlug: z.ZodString;
|
|
38
|
+
sectionHeading: z.ZodNullable<z.ZodString>;
|
|
39
|
+
}, z.core.$strip>>;
|
|
40
|
+
}, z.core.$strip>;
|
|
41
|
+
export declare const generateTopRecommendations: CardGenerator;
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* top-recommendations card — LLM-driven actionable suggestion generator.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: top-recommendations@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #2: few-shot examples in system prompt; body ≥40 chars
|
|
9
|
+
* - failure-mode #5: docSlug refined against report manifest allow-list
|
|
10
|
+
*
|
|
11
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
12
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
13
|
+
*
|
|
14
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
15
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
|
|
16
|
+
*/
|
|
17
|
+
import { z } from "zod";
|
|
18
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
19
|
+
import { buildTopRecommendationsPrompt, buildDocSlugAllowList, } from "../prompt-builders.js";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Body schema (D0045 trust boundary — satisfies required)
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
// True when the text holds at least one non-empty backtick-delimited span.
const containsBacktickArtifact = (text) => /`[^`]+`/.test(text);
// Shape of a single suggestion entry. `docSlug` is only checked for
// non-emptiness here; no allow-list is consulted at module scope.
const TopRecommendationsSuggestionSchema = z.object({
    title: z.string().min(1).max(200),
    body: z
        .string()
        .min(40, "suggestion.body must be ≥40 characters")
        .refine(containsBacktickArtifact, "suggestion.body must contain at least one backtick-delimited artifact"),
    priority: z.enum(["high", "medium", "low"]),
    docSlug: z.string().min(1),
    sectionHeading: z.string().nullable(),
});
/**
 * Static shape of the `top-recommendations` card body.
 *
 * Checks structure only: a 1-500 character `summary` and between one and
 * five suggestions. The generator builds its own per-call schema that
 * additionally refines `docSlug` against the allow-list derived from the
 * runtime `report`.
 *
 * AI-SPEC §3 Pitfall 1: that per-call schema stays LOCAL to the generator
 * and is NOT hoisted to module scope.
 */
export const TopRecommendationsBodySchema = z.object({
    summary: z.string().min(1).max(500),
    suggestions: z.array(TopRecommendationsSuggestionSchema).min(1).max(5),
});
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
// Generator
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
const CARD_MODEL = mkModelId("anthropic:claude-opus-4-6");
/**
 * Generates the `top-recommendations` diagnosis card.
 *
 * Returns a `missing` card without calling the model when:
 *  - `ctx.llm` is not wired, or
 *  - the report yields an empty doc-slug allow-list. In that case every
 *    `docSlug` would be rejected by the per-call refine below, so a model
 *    call could not produce a body that satisfies the schema.
 *
 * Otherwise it requests a structured body and returns a `ready` card whose
 * `meta` carries the card version, token usage, timestamp, cost and model.
 *
 * @param report - evaluation report; passed to `buildDocSlugAllowList` and
 *   `buildTopRecommendationsPrompt`
 * @param ctx - generator context; reads `ctx.llm` and `ctx.runId`
 * @returns a `missing` or `ready` card result for `top-recommendations`
 */
export const generateTopRecommendations = async (report, ctx) => {
    // C1: no LLM → missing
    if (!ctx.llm) {
        return {
            status: "missing",
            cardType: "top-recommendations",
            reason: "no LLMClient wired",
        };
    }
    // Build allow-list from the runtime report
    const allowList = buildDocSlugAllowList(report);
    // An empty allow-list makes the docSlug refine unsatisfiable; skip the
    // (high-stakes model) call instead of requesting an unvalidatable body.
    if (allowList.size === 0) {
        return {
            status: "missing",
            cardType: "top-recommendations",
            reason: "report has no document slugs",
        };
    }
    // Per-call schema: additive docSlug allow-list refine (AI-SPEC §3 Pitfall 1).
    // Built here, not at module scope, because it closes over `allowList`.
    const PerCallSchema = z.object({
        summary: z.string().min(1).max(500),
        suggestions: z
            .array(z.object({
            title: z.string().min(1).max(200),
            body: z
                .string()
                .min(40, "suggestion.body must be ≥40 characters")
                .refine((b) => /`[^`]+`/.test(b), "suggestion.body must contain at least one backtick-delimited artifact"),
            priority: z.enum(["high", "medium", "low"]),
            docSlug: z
                .string()
                .min(1)
                .refine((slug) => allowList.has(slug), {
                message: "suggestion.docSlug is not in the report document manifest allow-list",
            }),
            sectionHeading: z.string().nullable(),
        }))
            .min(1)
            .max(5),
    });
    const prompt = buildTopRecommendationsPrompt(report, allowList);
    // `cost` and `model` come back from the LLM client alongside the parsed
    // value and usage; forward them into the card metadata.
    const { value, usage, cost, model } = await ctx.llm.completeStructured({
        model: CARD_MODEL,
        prompt: `${prompt.system}\n\n${prompt.user}`,
        schema: PerCallSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "top-recommendations",
        },
    });
    return {
        status: "ready",
        cardType: "top-recommendations",
        body: value,
        meta: {
            cardVersion: "top-recommendations@0.1.0",
            tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
            generatedAt: new Date().toISOString(),
            cost,
            model,
        },
    };
};
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* weakest-area card — LLM-driven weakest area identification.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: weakest-area@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #3: sampleSize = report.areas[area].judgmentCount (per-call refine W2)
|
|
9
|
+
* - failure-mode #3: sampleSize < 10 → confidence.level must be "low" (per-call refine W3)
|
|
10
|
+
* - failure-mode #4: taxonomy drift → buildFailureModeRefinement() Zod predicate
|
|
11
|
+
*
|
|
12
|
+
* Per-call schemas live INSIDE the generator (AI-SPEC §3 Pitfall 1 — no
|
|
13
|
+
* module-scope mutables that change across calls).
|
|
14
|
+
*
|
|
15
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
16
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
17
|
+
*
|
|
18
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
19
|
+
*/
|
|
20
|
+
import { z } from "zod";
|
|
21
|
+
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
22
|
+
/**
|
|
23
|
+
* Module-level schema asserts the static shape + taxonomy constraint (D-05).
|
|
24
|
+
* Per-call schemas add the sampleSize (W2) and small-sample confidence (W3)
|
|
25
|
+
* refinements inside the generator, since they need runtime `report` data.
|
|
26
|
+
*/
|
|
27
|
+
export declare const WeakestAreaBodySchema: z.ZodObject<{
|
|
28
|
+
summary: z.ZodString;
|
|
29
|
+
area: z.ZodString;
|
|
30
|
+
dimension: z.ZodString;
|
|
31
|
+
failureMode: z.ZodString;
|
|
32
|
+
sampleSize: z.ZodNumber;
|
|
33
|
+
confidence: z.ZodObject<{
|
|
34
|
+
level: z.ZodEnum<{
|
|
35
|
+
low: "low";
|
|
36
|
+
medium: "medium";
|
|
37
|
+
high: "high";
|
|
38
|
+
}>;
|
|
39
|
+
signalsPresent: z.ZodNumber;
|
|
40
|
+
derivation: z.ZodString;
|
|
41
|
+
}, z.core.$strip>;
|
|
42
|
+
}, z.core.$strip>;
|
|
43
|
+
export declare const generateWeakestArea: CardGenerator;
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* weakest-area card — LLM-driven weakest area identification.
|
|
3
|
+
*
|
|
4
|
+
* Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
|
|
5
|
+
* Version: weakest-area@0.1.0
|
|
6
|
+
*
|
|
7
|
+
* Mitigations:
|
|
8
|
+
* - failure-mode #3: sampleSize = report.areas[area].judgmentCount (per-call refine W2)
|
|
9
|
+
* - failure-mode #3: sampleSize < 10 → confidence.level must be "low" (per-call refine W3)
|
|
10
|
+
* - failure-mode #4: taxonomy drift → buildFailureModeRefinement() Zod predicate
|
|
11
|
+
*
|
|
12
|
+
* Per-call schemas live INSIDE the generator (AI-SPEC §3 Pitfall 1 — no
|
|
13
|
+
* module-scope mutables that change across calls).
|
|
14
|
+
*
|
|
15
|
+
* Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
|
|
16
|
+
* mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
|
|
17
|
+
*
|
|
18
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
|
|
19
|
+
*/
|
|
20
|
+
import { z } from "zod";
|
|
21
|
+
import { ConfidenceSchema } from "../../../schemas/confidence-schema.js";
|
|
22
|
+
import { modelId as mkModelId } from "../../../ports/llm-client.js";
|
|
23
|
+
import { buildFailureModeRefinement } from "../card-validators.js";
|
|
24
|
+
import { buildWeakestAreaPrompt } from "../prompt-builders.js";
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Module-level schema: static shape + D-05 taxonomy refine
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// Field-level shape of a weakest-area body; cross-field rules are layered on
// top of this object schema.
const WeakestAreaFieldsSchema = z.object({
    summary: z.string().min(1).max(800),
    area: z.string().min(1),
    dimension: z.string().min(1),
    failureMode: z.string().min(1),
    sampleSize: z.number().int().nonnegative(),
    confidence: ConfidenceSchema,
});
/**
 * Module-level schema for the `weakest-area` card body: the static shape
 * plus the taxonomy constraint (D-05) supplied by
 * `buildFailureModeRefinement()`, reported on the `failureMode` path.
 *
 * Refinements that depend on runtime `report` data (sampleSize W2 and
 * small-sample confidence W3) belong to the per-call schema built inside
 * the generator, not here.
 */
export const WeakestAreaBodySchema = WeakestAreaFieldsSchema.refine(buildFailureModeRefinement(), {
    message: "failureMode is not in the canonical taxonomy for this dimension",
    path: ["failureMode"],
});
|
|
46
|
+
// ---------------------------------------------------------------------------
|
|
47
|
+
// Generator
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
const CARD_MODEL = mkModelId("anthropic:claude-sonnet-4-6");
/**
 * Generates the `weakest-area` diagnosis card.
 *
 * Returns a `missing` card without calling the model when `ctx.llm` is not
 * wired or the report has no scores. Otherwise it requests a structured
 * body and returns a `ready` card whose `meta` carries the card version,
 * token usage, timestamp, cost and model.
 *
 * The per-call schema is built inside the generator because it closes over
 * values derived from the runtime `report` (AI-SPEC §3 Pitfall 1):
 *  - taxonomy: `failureMode` must pass `buildFailureModeRefinement()`
 *  - W2: `sampleSize` must equal the weakest area's judgment count
 *  - W3: `sampleSize < 10` forces `confidence.level === "low"`
 *
 * @param report - evaluation report; reads `report.summary.scores`
 * @param ctx - generator context; reads `ctx.llm` and `ctx.runId`
 * @returns a `missing` or `ready` card result for `weakest-area`
 */
export const generateWeakestArea = async (report, ctx) => {
    // C1: no LLM → missing
    if (!ctx.llm) {
        return {
            status: "missing",
            cardType: "weakest-area",
            reason: "no LLMClient wired",
        };
    }
    const scores = report.summary.scores ?? [];
    if (scores.length === 0) {
        return {
            status: "missing",
            cardType: "weakest-area",
            reason: "report has no areas",
        };
    }
    // W2 anchor: the judgment count of the lowest-scoring area. This is meant
    // to be the same selection rule (ascending totalScore, `testCount ?? 0`)
    // that buildWeakestAreaPrompt uses for the count it tells the model to echo.
    const weakest = [...scores].sort((a, b) => a.totalScore - b.totalScore)[0];
    const expectedSampleSize = weakest.testCount ?? 0;
    // Per-call schema: close over report-derived values for the sampleSize +
    // confidence constraints (AI-SPEC §3 Pitfall 1 — never hoist to module scope)
    const PerCallSchema = z
        .object({
        summary: z.string().min(1).max(800),
        area: z.string().min(1),
        dimension: z.string().min(1),
        failureMode: z.string().min(1),
        sampleSize: z.number().int().nonnegative(),
        confidence: ConfidenceSchema,
    })
        .refine(buildFailureModeRefinement(), {
        message: "failureMode is not in the canonical taxonomy for this dimension",
        path: ["failureMode"],
    })
        // W2: sampleSize must be the report's count, not a model-invented one.
        // Without this, W3 below could be sidestepped by reporting a larger sample.
        .refine((body) => body.sampleSize === expectedSampleSize, {
        message: `sampleSize must equal the weakest area's judgment count (${expectedSampleSize})`,
        path: ["sampleSize"],
    })
        // W3: small-sample forces confidence.level = "low"
        .refine((body) => {
        if (body.sampleSize < 10) {
            return body.confidence.level === "low";
        }
        return true;
    }, {
        message: 'When sampleSize < 10, confidence.level must be "low"',
        path: ["confidence", "level"],
    });
    const prompt = buildWeakestAreaPrompt(report);
    // `cost` and `model` come back from the LLM client alongside the parsed
    // value and usage; forward them into the card metadata.
    const { value, usage, cost, model } = await ctx.llm.completeStructured({
        model: CARD_MODEL,
        prompt: `${prompt.system}\n\n${prompt.user}`,
        schema: PerCallSchema,
        temperature: 0.1,
        maxTokens: 2000,
        context: {
            feature: "diagnosis",
            runId: ctx.runId,
            cardId: "weakest-area",
        },
    });
    return {
        status: "ready",
        cardType: "weakest-area",
        body: value,
        meta: {
            cardVersion: "weakest-area@0.1.0",
            tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
            generatedAt: new Date().toISOString(),
            cost,
            model,
        },
    };
};
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic prompt-builder helpers for the 5 LLM diagnosis cards.
|
|
3
|
+
*
|
|
4
|
+
* Each function takes typed inputs and returns `{ system, user }` strings
|
|
5
|
+
* (+ a `deltas` side-channel for regression-vs-baseline). The same inputs
|
|
6
|
+
* always produce the same output — no randomness, no side effects.
|
|
7
|
+
*
|
|
8
|
+
* Per AI-SPEC §4: input tokens are bounded by truncation; the user message
|
|
9
|
+
* projects only the fields each card needs.
|
|
10
|
+
*
|
|
11
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
|
|
13
|
+
*/
|
|
14
|
+
import type { JudgmentAttribution } from "../../types/attribution.js";
|
|
15
|
+
import type { Report } from "../../types/index.js";
|
|
16
|
+
/**
|
|
17
|
+
* Build the allow-list of doc slugs for a report. This is the union of:
|
|
18
|
+
* - report.summary.documentManifest[].slug
|
|
19
|
+
* - per-score documents[].slug
|
|
20
|
+
*
|
|
21
|
+
* Only slugs (not documentIds) appear in the allow-list because the cards
|
|
22
|
+
* reference human-readable slugs. DocumentId-only entries are skipped.
|
|
23
|
+
*/
|
|
24
|
+
export declare function buildDocSlugAllowList(report: Report): Set<string>;
|
|
25
|
+
/**
|
|
26
|
+
* Projects the 3 weakest areas + top failure modes + doc-slug allow-list
|
|
27
|
+
* into the user message (~2500 input tokens).
|
|
28
|
+
*/
|
|
29
|
+
export declare function buildTopRecommendationsPrompt(report: Report, allowList: Set<string>): {
|
|
30
|
+
system: string;
|
|
31
|
+
user: string;
|
|
32
|
+
};
|
|
33
|
+
/**
|
|
34
|
+
* Projects the single weakest area + full failure-mode breakdown.
|
|
35
|
+
*/
|
|
36
|
+
export declare function buildWeakestAreaPrompt(report: Report): {
|
|
37
|
+
system: string;
|
|
38
|
+
user: string;
|
|
39
|
+
};
|
|
40
|
+
/**
|
|
41
|
+
* Filters to low-confidence entries (or top-N if none low), produces a table
|
|
42
|
+
* for the LLM (~2000 input tokens).
|
|
43
|
+
*
|
|
44
|
+
* Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
|
|
45
|
+
*/
|
|
46
|
+
export declare function buildLowConfidenceAttributionPrompt(report: Report, judgmentAttributions: JudgmentAttribution[]): {
|
|
47
|
+
system: string;
|
|
48
|
+
user: string;
|
|
49
|
+
};
|
|
50
|
+
/**
|
|
51
|
+
* Aggregates attributions by documentId, picks top-5 by aggregate score,
|
|
52
|
+
* emits a table for the LLM.
|
|
53
|
+
*
|
|
54
|
+
* Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
|
|
55
|
+
*/
|
|
56
|
+
export declare function buildDocAttributionSpotlightPrompt(report: Report, judgmentAttributions: JudgmentAttribution[]): {
|
|
57
|
+
system: string;
|
|
58
|
+
user: string;
|
|
59
|
+
};
|
|
60
|
+
/**
|
|
61
|
+
* Computes per-area deltas in JS (failure-mode #1 mitigation), then builds
|
|
62
|
+
* the user message. Returns `deltas` as a side-channel for the per-call
|
|
63
|
+
* schema `.refine()`.
|
|
64
|
+
*/
|
|
65
|
+
export declare function buildRegressionVsBaselinePrompt(report: Report, baseline: Report): {
|
|
66
|
+
system: string;
|
|
67
|
+
user: string;
|
|
68
|
+
deltas: {
|
|
69
|
+
area: string;
|
|
70
|
+
pointsDelta: number;
|
|
71
|
+
}[];
|
|
72
|
+
};
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic prompt-builder helpers for the 5 LLM diagnosis cards.
|
|
3
|
+
*
|
|
4
|
+
* Each function takes typed inputs and returns `{ system, user }` strings
|
|
5
|
+
* (+ a `deltas` side-channel for regression-vs-baseline). The same inputs
|
|
6
|
+
* always produce the same output — no randomness, no side effects.
|
|
7
|
+
*
|
|
8
|
+
* Per AI-SPEC §4: input tokens are bounded by truncation; the user message
|
|
9
|
+
* projects only the fields each card needs.
|
|
10
|
+
*
|
|
11
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
|
|
13
|
+
*/
|
|
14
|
+
import { TOP_RECOMMENDATIONS_SYSTEM_PROMPT, WEAKEST_AREA_SYSTEM_PROMPT, LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT, DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT, REGRESSION_VS_BASELINE_SYSTEM_PROMPT, } from "./prompts/index.js";
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
// Shared helper: build the docSlug allow-list from a report
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
/**
 * Collects the doc slugs a card is allowed to reference for this report.
 *
 * Sources, visited in this order:
 *   1. `report.summary.documentManifest[]`
 *   2. `documents[]` of every entry in `report.summary.scores[]`
 *
 * An entry contributes only when it has a `slug` property holding a
 * non-empty string; entries without one (e.g. documentId-only entries) are
 * skipped. Missing `documentManifest`, `scores` or `documents` collections
 * are treated as empty.
 *
 * @param report - report whose `summary` is inspected
 * @returns a Set of slugs in first-seen order
 */
export function buildDocSlugAllowList(report) {
    const allowed = new Set();
    // Adds every usable slug from one collection of document entries.
    const collectSlugs = (entries) => {
        for (const entry of entries) {
            const hasUsableSlug = "slug" in entry &&
                typeof entry.slug === "string" &&
                entry.slug.length > 0;
            if (hasUsableSlug) {
                allowed.add(entry.slug);
            }
        }
    };
    collectSlugs(report.summary.documentManifest ?? []);
    for (const score of report.summary.scores ?? []) {
        collectSlugs(score.documents ?? []);
    }
    return allowed;
}
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
// top-recommendations prompt builder
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
/**
 * Builds the `{ system, user }` prompt pair for the top-recommendations card.
 *
 * The user message contains, in order:
 *   - the three lowest-`totalScore` entries of `report.summary.scores`
 *   - up to five `failureModes.topTitles` entries as "category (count)"
 *   - up to the first 50 slugs of `allowList`
 *   - a closing instruction line
 *
 * @param report - report whose `summary` is projected into the message
 * @param allowList - iterable of permitted doc slugs
 * @returns the system prompt constant and the assembled user message
 */
export function buildTopRecommendationsPrompt(report, allowList) {
    const byScoreAscending = (left, right) => left.totalScore - right.totalScore;
    const weakestAreaLines = [...(report.summary.scores ?? [])]
        .sort(byScoreAscending)
        .slice(0, 3)
        .map((area) => `- ${area.feature}: totalScore=${area.totalScore}, judgmentCount=${area.testCount}`);
    const modeLabels = (report.summary.failureModes?.topTitles?.slice(0, 5) ?? [])
        .map((title) => `${title.category} (${title.count})`);
    // Cap at 50 slugs.
    const slugLines = [...allowList]
        .slice(0, 50)
        .map((slug) => `- ${slug}`);
    const modesLine = modeLabels.length === 0
        ? "(none recorded)"
        : modeLabels.join(", ");
    const sections = [
        "## Weakest Areas",
        weakestAreaLines.join("\n"),
        "",
        "## Top Failure Modes",
        modesLine,
        "",
        "## Document Slug Allow-List",
        "Suggestions MUST use one of these slugs:",
        slugLines.join("\n"),
        "",
        "Generate 1-5 actionable recommendations targeting the weakest areas above.",
    ];
    return {
        system: TOP_RECOMMENDATIONS_SYSTEM_PROMPT,
        user: sections.join("\n"),
    };
}
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
// weakest-area prompt builder
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
/**
 * Builds the `{ system, user }` prompt pair for the weakest-area card.
 *
 * Picks the entry of `report.summary.scores` with the lowest `totalScore`
 * and projects its scores and judgment count (`testCount ?? 0`), followed by
 * the top failure-mode category and up to five `failureModes.topTitles`
 * entries. Sections are separated by blank lines.
 *
 * The message tells the model the exact `sampleSize` to echo and, when that
 * count is below 10, appends a warning that `confidence.level` must be
 * "low".
 *
 * @param report - report whose `summary` is projected into the message
 * @returns the system prompt constant and the assembled user message; when
 *   the report has no scores the user message is "No areas in report."
 */
export function buildWeakestAreaPrompt(report) {
    const scores = report.summary.scores ?? [];
    if (scores.length === 0) {
        return {
            system: WEAKEST_AREA_SYSTEM_PROMPT,
            user: "No areas in report.",
        };
    }
    const weakest = [...scores].sort((a, b) => a.totalScore - b.totalScore)[0];
    const judgmentCount = weakest.testCount ?? 0;
    const topTitles = report.summary.failureModes?.topTitles ?? [];
    const topMode = topTitles[0]?.category ?? "unclassified";
    const breakdown = topTitles
        .slice(0, 5)
        .map((t) => `- ${t.category}: ${t.count} judgments`)
        .join("\n");
    const lines = [
        "## Weakest Area",
        `Feature: ${weakest.feature}`,
        `Total Score: ${weakest.totalScore}`,
        `Ceiling Score: ${weakest.ceilingScore}`,
        `Floor Score: ${weakest.floorScore}`,
        `Judgment Count (sampleSize): ${judgmentCount}`,
        "",
        "## Top Failure Mode Observed",
        `Category: ${topMode}`,
        "",
        "## Failure Mode by Dimension",
        // An empty list joins to "", which `??` would not catch; check explicitly
        // so the section never renders as a bare header.
        breakdown.length > 0 ? breakdown : "(no data)",
        "",
        "Identify the area, its primary dimension, and most frequent failure mode.",
        `sampleSize in your response MUST equal exactly ${judgmentCount} (the judgment count above).`,
    ];
    // Append the warning conditionally rather than filtering falsy entries, so
    // the intentional blank separator lines above are kept.
    if (judgmentCount < 10) {
        lines.push(`WARNING: sampleSize=${judgmentCount} < 10 — you MUST set confidence.level = "low".`);
    }
    return { system: WEAKEST_AREA_SYSTEM_PROMPT, user: lines.join("\n") };
}
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
// low-confidence-attribution prompt builder
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
/**
 * Filters to low-confidence entries (or, if none are low, all entries ordered
 * by their lowest attribution score) and renders them as a table for the LLM
 * (~2000 input tokens).
 *
 * Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
 *
 * @param {object} report - Not read by this builder; kept so the signature
 *   stays parallel with the other prompt builders.
 * @param {Array<object>} judgmentAttributions - Entries exposing
 *   `judgmentRef`, `taskId`, `modelId`, `dimension` and an `attributions`
 *   array whose items carry `score` and `confidence.level`.
 * @returns {{system: string, user: string}} System prompt plus the rendered
 *   user message.
 */
export function buildLowConfidenceAttributionPrompt(report, judgmentAttributions) {
    // An entry counts as low-confidence when ANY of its attributions is "low".
    const lowConf = judgmentAttributions.filter((ja) => ja.attributions.some((a) => a.confidence.level === "low"));
    // Entries with an empty `attributions` array are excluded from the
    // fallback ordering: `Math.min(...[])` is Infinity, which would make all
    // of them compare equal. The card schema requires `judgmentRefs.min(1)`,
    // so emitting empty-attribution rows forces a degraded card downstream.
    // The caller should short-circuit before reaching here, but defend
    // against the seam regressing.
    const validAttrs = judgmentAttributions.filter((ja) => ja.attributions.length > 0);
    if (validAttrs.length === 0) {
        return {
            system: LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT,
            user: "(no attribution data with non-empty attributions)",
        };
    }
    // Fallback ordering: lowest score first (most uncertain first). Each
    // entry's minimum is computed once up front instead of inside the
    // comparator; the sort is stable, so ties keep their input order.
    const source = lowConf.length > 0
        ? lowConf
        : validAttrs
            .map((ja) => ({
                ja,
                minScore: Math.min(...ja.attributions.map((x) => x.score)),
            }))
            .sort((a, b) => a.minScore - b.minScore)
            .map(({ ja }) => ja);
    // Cap at 20 to stay within token budget
    const capped = source.slice(0, 20);
    const tableRows = capped
        .map((ja) => {
            // Worst level across the entry: "low" beats "medium" beats "high".
            // Any level other than "low"/"medium" is reported as "high".
            const levels = ja.attributions.map((a) => a.confidence.level);
            const minConf = levels.includes("low")
                ? "low"
                : levels.includes("medium")
                    ? "medium"
                    : "high";
            return `| ${ja.judgmentRef} | ${ja.taskId} | ${ja.modelId} | ${ja.dimension} | ${minConf} |`;
        })
        .join("\n");
    const user = [
        "## Per-Judgment Attribution Confidence",
        `Total entries: ${judgmentAttributions.length}; Low-confidence: ${lowConf.length}`,
        "",
        "| judgmentRef | taskId | modelId | dimension | minConfidence |",
        "|-------------|--------|---------|-----------|---------------|",
        tableRows,
        "",
        lowConf.length === 0
            ? "No low-confidence entries found. Return the highest-uncertainty entry and note that confidence is well-calibrated."
            : `Identify the ${Math.min(lowConf.length, 5)} most uncertain judgments.`,
    ].join("\n");
    return { system: LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT, user };
}
|
|
183
|
+
// ---------------------------------------------------------------------------
|
|
184
|
+
// doc-attribution-spotlight prompt builder
|
|
185
|
+
// ---------------------------------------------------------------------------
|
|
186
|
+
/**
 * Groups every attribution by its `documentId`, ranks the documents that have
 * a slug by their mean attribution score (highest first), and renders the top
 * five as a table for the LLM.
 *
 * Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
 *
 * @param {object} report - Not read by this builder.
 * @param {Array<object>} judgmentAttributions - Entries whose `attributions`
 *   items carry `documentId`, `slug` and `score`.
 * @returns {{system: string, user: string}} System prompt plus the rendered
 *   user message.
 */
export function buildDocAttributionSpotlightPrompt(report, judgmentAttributions) {
    // Running totals keyed by documentId (D0052 canonical ref).
    const statsByDocument = new Map();
    for (const judgment of judgmentAttributions) {
        for (const attribution of judgment.attributions) {
            const key = attribution.documentId;
            let stats = statsByDocument.get(key);
            if (stats == null) {
                stats = { slug: attribution.slug, scoreSum: 0, count: 0 };
                statsByDocument.set(key, stats);
            }
            stats.scoreSum += attribution.score;
            stats.count += 1;
            // Adopt the first slug seen for a document that started without one.
            if (!stats.slug && attribution.slug) {
                stats.slug = attribution.slug;
            }
        }
    }
    // Only documents with a slug are emitted (allow-list check in Zod).
    const ranked = [];
    for (const [documentId, stats] of statsByDocument) {
        if (!stats.slug) {
            continue;
        }
        ranked.push({
            documentId,
            slug: stats.slug,
            aggregateScore: stats.scoreSum / stats.count,
            signalCount: stats.count,
        });
    }
    ranked.sort((left, right) => right.aggregateScore - left.aggregateScore);
    const topFive = ranked.slice(0, 5);
    const renderRow = (doc) => {
        const score = doc.aggregateScore.toFixed(3);
        return `| ${doc.documentId} | ${doc.slug ?? "(no slug)"} | ${score} | ${doc.signalCount} |`;
    };
    const lines = [];
    lines.push("## Top Documents by Attribution Score");
    // The total counts every document seen, including those without a slug.
    lines.push(`Total unique documents: ${statsByDocument.size}`);
    lines.push("");
    lines.push("| documentId | slug | aggregateScore | signalCount |");
    lines.push("|------------|------|----------------|-------------|");
    lines.push(topFive.map(renderRow).join("\n"));
    lines.push("");
    lines.push("For each document in the table, determine its role (supports/contradicts/missing/irrelevant).");
    lines.push("docSlug in your output MUST exactly match the slug column — do not invent slugs.");
    return {
        system: DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT,
        user: lines.join("\n"),
    };
}
|
|
237
|
+
// ---------------------------------------------------------------------------
|
|
238
|
+
// regression-vs-baseline prompt builder
|
|
239
|
+
// ---------------------------------------------------------------------------
|
|
240
|
+
/**
 * Computes per-area deltas in JS (failure-mode #1 mitigation), then builds
 * the user message. Returns `deltas` as a side-channel for the per-call
 * schema `.refine()`.
 *
 * A delta is `current.totalScore - baseline.totalScore`, rounded to two
 * decimals, and is only produced for areas present in BOTH reports. The ten
 * largest deltas by absolute value are rendered and returned.
 *
 * @param {object} report - Current run; reads `summary.scores` and
 *   `provenance.runId`.
 * @param {object} baseline - Baseline run; reads the same fields.
 * @returns {{system: string, user: string, deltas: Array<{area: string, pointsDelta: number}>}}
 *   System prompt, rendered user message, and the deltas shown in the table.
 */
export function buildRegressionVsBaselinePrompt(report, baseline) {
    // Build area → score maps
    const currentByArea = new Map((report.summary.scores ?? []).map((s) => [s.feature, s.totalScore]));
    const baselineByArea = new Map((baseline.summary.scores ?? []).map((s) => [s.feature, s.totalScore]));
    // Only compute deltas for areas present in BOTH reports
    const deltas = [];
    for (const [area, current] of currentByArea) {
        const base = baselineByArea.get(area);
        if (base === undefined) {
            continue;
        }
        const rawDelta = current - base;
        // A missing or non-numeric score yields NaN. Skip it instead of
        // presenting "NaN" to the model as a fact and feeding NaN to the sort
        // comparator below.
        if (!Number.isFinite(rawDelta)) {
            continue;
        }
        // `toFixed(2)` renders tiny negative differences as "-0.00", which
        // parses to -0; adding 0 normalises that to +0.
        const pointsDelta = Number.parseFloat(rawDelta.toFixed(2)) + 0;
        deltas.push({ area, pointsDelta });
    }
    // Sort by absolute delta descending (most changed first), cap at 10
    const topDeltas = [...deltas]
        .sort((a, b) => Math.abs(b.pointsDelta) - Math.abs(a.pointsDelta))
        .slice(0, 10);
    const deltaRows = topDeltas
        .map((d) => `| ${d.area} | ${d.pointsDelta > 0 ? "+" : ""}${d.pointsDelta} | ${d.pointsDelta > 0 ? "improved" : d.pointsDelta < 0 ? "regressed" : "unchanged"} |`)
        .join("\n");
    const user = [
        "## Pre-Computed Score Deltas (current minus baseline)",
        "These values are FACTS — do not modify them.",
        "",
        "| area | pointsDelta | expectedDirection |",
        "|------|-------------|-------------------|",
        deltaRows,
        "",
        "For each row, echo the exact area + pointsDelta, assign the matching direction label, and add prose drivers.",
        "Do NOT round or change any numeric value.",
        "",
        `Current run: ${report.provenance.runId}`,
        `Baseline run: ${baseline.provenance.runId}`,
    ].join("\n");
    return {
        system: REGRESSION_VS_BASELINE_SYSTEM_PROMPT,
        user,
        deltas: topDeltas,
    };
}
|