@sanity/ailf 5.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
  2. package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
  3. package/config/diagnosis-cards.ts +318 -0
  4. package/config/models.ts +12 -0
  5. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  6. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  7. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  8. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  9. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  10. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  11. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  12. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  17. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  18. package/dist/_vendor/ailf-core/index.js +4 -0
  19. package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
  22. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  23. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  24. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
  28. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  32. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  33. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
  34. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  35. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
  36. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
  37. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
  38. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  39. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
  40. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
  65. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
  66. package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
  67. package/dist/_vendor/ailf-core/services/index.js +15 -2
  68. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  69. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  70. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
  71. package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
  72. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  73. package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
  74. package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
  75. package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
  76. package/dist/adapters/config-sources/file-config-adapter.js +8 -6
  77. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  78. package/dist/adapters/llm/fake-llm-client.js +38 -1
  79. package/dist/adapters/llm/index.d.ts +1 -1
  80. package/dist/adapters/llm/index.js +1 -1
  81. package/dist/adapters/llm/openai-llm-client.js +59 -5
  82. package/dist/adapters/llm/retry.d.ts +18 -0
  83. package/dist/adapters/llm/retry.js +21 -0
  84. package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
  85. package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
  86. package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
  87. package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
  88. package/dist/adapters/task-sources/repo-schemas.js +10 -0
  89. package/dist/cli-program.js +3 -0
  90. package/dist/commands/interpret.d.ts +70 -0
  91. package/dist/commands/interpret.js +221 -0
  92. package/dist/commands/pipeline-action.d.ts +44 -0
  93. package/dist/commands/pipeline-action.js +193 -1
  94. package/dist/commands/run.d.ts +2 -0
  95. package/dist/commands/run.js +2 -0
  96. package/dist/composition-root.d.ts +21 -23
  97. package/dist/composition-root.js +107 -41
  98. package/dist/config/diagnosis-cards.ts +318 -0
  99. package/dist/config/models.ts +12 -0
  100. package/dist/grader/agent-harness.d.ts +5 -10
  101. package/dist/grader/agent-harness.js +5 -13
  102. package/dist/grader/common.d.ts +5 -13
  103. package/dist/grader/common.js +5 -17
  104. package/dist/grader/index.d.ts +15 -29
  105. package/dist/grader/index.js +15 -66
  106. package/dist/grader/knowledge-probe.d.ts +5 -10
  107. package/dist/grader/knowledge-probe.js +5 -14
  108. package/dist/grader/literacy.d.ts +5 -9
  109. package/dist/grader/literacy.js +5 -13
  110. package/dist/grader/mcp.d.ts +5 -10
  111. package/dist/grader/mcp.js +5 -14
  112. package/dist/orchestration/pipeline-orchestrator.js +3 -0
  113. package/dist/report-store.d.ts +26 -0
  114. package/dist/report-store.js +63 -0
  115. package/package.json +2 -2
@@ -0,0 +1,41 @@
1
+ /**
2
+ * top-recommendations card — LLM-driven actionable suggestion generator.
3
+ *
4
+ * Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
5
+ * Version: top-recommendations@0.1.0
6
+ *
7
+ * Mitigations:
8
+ * - failure-mode #2: few-shot examples in system prompt; body ≥40 chars
9
+ * - failure-mode #5: docSlug refined against report manifest allow-list
10
+ *
11
+ * Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
12
+ * mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
13
+ *
14
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
15
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
16
+ */
17
+ import { z } from "zod";
18
+ import type { CardGenerator } from "../../diagnosis-runner.js";
19
+ /**
20
+ * Module-level schema: static shape check only.
21
+ * Per-call: an additive `.refine()` over the allow-list (built inside the
22
+ * generator so it closes over the runtime `report`).
23
+ *
24
+ * AI-SPEC §3 Pitfall 1: per-call schemas are LOCAL to the generator,
25
+ * NOT at module scope.
26
+ */
27
+ export declare const TopRecommendationsBodySchema: z.ZodObject<{
28
+ summary: z.ZodString;
29
+ suggestions: z.ZodArray<z.ZodObject<{
30
+ title: z.ZodString;
31
+ body: z.ZodString;
32
+ priority: z.ZodEnum<{
33
+ low: "low";
34
+ medium: "medium";
35
+ high: "high";
36
+ }>;
37
+ docSlug: z.ZodString;
38
+ sectionHeading: z.ZodNullable<z.ZodString>;
39
+ }, z.core.$strip>>;
40
+ }, z.core.$strip>;
41
+ export declare const generateTopRecommendations: CardGenerator;
@@ -0,0 +1,111 @@
1
+ /**
2
+ * top-recommendations card — LLM-driven actionable suggestion generator.
3
+ *
4
+ * Model: claude-opus-4-6 (high-stakes per AI-SPEC §4 model routing)
5
+ * Version: top-recommendations@0.1.0
6
+ *
7
+ * Mitigations:
8
+ * - failure-mode #2: few-shot examples in system prompt; body ≥40 chars
9
+ * - failure-mode #5: docSlug refined against report manifest allow-list
10
+ *
11
+ * Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
12
+ * mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
13
+ *
14
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
15
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
16
+ */
17
+ import { z } from "zod";
18
+ import { modelId as mkModelId } from "../../../ports/llm-client.js";
19
+ import { buildTopRecommendationsPrompt, buildDocSlugAllowList, } from "../prompt-builders.js";
20
+ // ---------------------------------------------------------------------------
21
+ // Body schema (D0045 trust boundary — satisfies required)
22
+ // ---------------------------------------------------------------------------
23
// Free-text body of one suggestion: at least 40 characters and at least one
// non-empty backtick-delimited span.
const suggestionBodyField = z
  .string()
  .min(40, "suggestion.body must be ≥40 characters")
  .refine(
    (text) => /`[^`]+`/.test(text),
    "suggestion.body must contain at least one backtick-delimited artifact",
  );

// One recommendation entry. `docSlug` is only checked for being non-empty
// here; membership in the report's allow-list is a per-call concern.
const suggestionEntry = z.object({
  title: z.string().min(1).max(200),
  body: suggestionBodyField,
  priority: z.enum(["high", "medium", "low"]),
  docSlug: z.string().min(1),
  sectionHeading: z.string().nullable(),
});

/**
 * Static shape of the top-recommendations card body.
 *
 * This module-level schema checks shape only: a 1-500 character summary and
 * between 1 and 5 suggestions. The allow-list check on `docSlug` needs the
 * runtime report, so it is added by a schema built inside the generator
 * (AI-SPEC §3 Pitfall 1: per-call schemas are local, never module scope).
 */
export const TopRecommendationsBodySchema = z.object({
  summary: z.string().min(1).max(500),
  suggestions: z.array(suggestionEntry).min(1).max(5),
});
47
+ // ---------------------------------------------------------------------------
48
+ // Generator
49
+ // ---------------------------------------------------------------------------
50
+ const CARD_MODEL = mkModelId("anthropic:claude-opus-4-6");
51
/**
 * Generate the `top-recommendations` diagnosis card.
 *
 * Returns a `missing` card without calling the model when no LLM client is
 * wired, or when the report yields an empty doc-slug allow-list. In the
 * latter case the per-call schema (>= 1 suggestion, each `docSlug` in the
 * allow-list) could never validate, so the call would only burn tokens.
 *
 * Otherwise asks `ctx.llm.completeStructured` for a body validated against a
 * per-call schema and returns a `ready` card carrying the validated body plus
 * token usage, cost and model metadata. Errors thrown by the LLM client are
 * not caught here and propagate to the caller.
 *
 * @param report Report the recommendations are derived from.
 * @param ctx Generator context; reads `ctx.llm` and `ctx.runId`.
 * @returns Promise resolving to a `missing` or `ready` card object.
 */
export const generateTopRecommendations = async (report, ctx) => {
  // C1: no LLM → missing
  if (!ctx.llm) {
    return {
      status: "missing",
      cardType: "top-recommendations",
      reason: "no LLMClient wired",
    };
  }
  // Build allow-list from the runtime report
  const allowList = buildDocSlugAllowList(report);
  // No slug can satisfy the docSlug refine below, so skip the LLM call.
  if (allowList.size === 0) {
    return {
      status: "missing",
      cardType: "top-recommendations",
      reason: "report has no document slugs in the allow-list",
    };
  }
  // Per-call schema: additive docSlug allow-list refine (AI-SPEC §3 Pitfall 1).
  // Built here, not at module scope, because it closes over `allowList`.
  const PerCallSchema = z.object({
    summary: z.string().min(1).max(500),
    suggestions: z
      .array(
        z.object({
          title: z.string().min(1).max(200),
          body: z
            .string()
            .min(40, "suggestion.body must be ≥40 characters")
            .refine(
              (b) => /`[^`]+`/.test(b),
              "suggestion.body must contain at least one backtick-delimited artifact",
            ),
          priority: z.enum(["high", "medium", "low"]),
          docSlug: z
            .string()
            .min(1)
            .refine((slug) => allowList.has(slug), {
              message:
                "suggestion.docSlug is not in the report document manifest allow-list",
            }),
          sectionHeading: z.string().nullable(),
        }),
      )
      .min(1)
      .max(5),
  });
  const prompt = buildTopRecommendationsPrompt(report, allowList);
  // `cost` and `model` come back from the LLM client and are surfaced in
  // the card metadata.
  const { value, usage, cost, model } = await ctx.llm.completeStructured({
    model: CARD_MODEL,
    prompt: `${prompt.system}\n\n${prompt.user}`,
    schema: PerCallSchema,
    temperature: 0.1,
    maxTokens: 2000,
    context: {
      feature: "diagnosis",
      runId: ctx.runId,
      cardId: "top-recommendations",
    },
  });
  return {
    status: "ready",
    cardType: "top-recommendations",
    body: value,
    meta: {
      cardVersion: "top-recommendations@0.1.0",
      tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
      generatedAt: new Date().toISOString(),
      cost,
      model,
    },
  };
};
@@ -0,0 +1,43 @@
1
+ /**
2
+ * weakest-area card — LLM-driven weakest area identification.
3
+ *
4
+ * Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
5
+ * Version: weakest-area@0.1.0
6
+ *
7
+ * Mitigations:
8
+ * - failure-mode #3: sampleSize = report.areas[area].judgmentCount (per-call refine W2)
9
+ * - failure-mode #3: sampleSize < 10 → confidence.level must be "low" (per-call refine W3)
10
+ * - failure-mode #4: taxonomy drift → buildFailureModeRefinement() Zod predicate
11
+ *
12
+ * Per-call schemas live INSIDE the generator (AI-SPEC §3 Pitfall 1 — no
13
+ * module-scope mutables that change across calls).
14
+ *
15
+ * Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
16
+ * mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
17
+ *
18
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
19
+ */
20
+ import { z } from "zod";
21
+ import type { CardGenerator } from "../../diagnosis-runner.js";
22
+ /**
23
+ * Module-level schema asserts the static shape + taxonomy constraint (D-05).
24
+ * Per-call schemas add the sampleSize (W2) and small-sample confidence (W3)
25
+ * refinements inside the generator, since they need runtime `report` data.
26
+ */
27
+ export declare const WeakestAreaBodySchema: z.ZodObject<{
28
+ summary: z.ZodString;
29
+ area: z.ZodString;
30
+ dimension: z.ZodString;
31
+ failureMode: z.ZodString;
32
+ sampleSize: z.ZodNumber;
33
+ confidence: z.ZodObject<{
34
+ level: z.ZodEnum<{
35
+ low: "low";
36
+ medium: "medium";
37
+ high: "high";
38
+ }>;
39
+ signalsPresent: z.ZodNumber;
40
+ derivation: z.ZodString;
41
+ }, z.core.$strip>;
42
+ }, z.core.$strip>;
43
+ export declare const generateWeakestArea: CardGenerator;
@@ -0,0 +1,118 @@
1
+ /**
2
+ * weakest-area card — LLM-driven weakest area identification.
3
+ *
4
+ * Model: claude-sonnet-4-6 (routine per AI-SPEC §4 model routing)
5
+ * Version: weakest-area@0.1.0
6
+ *
7
+ * Mitigations:
8
+ * - failure-mode #3: sampleSize = report.areas[area].judgmentCount (per-call refine W2)
9
+ * - failure-mode #3: sampleSize < 10 → confidence.level must be "low" (per-call refine W3)
10
+ * - failure-mode #4: taxonomy drift → buildFailureModeRefinement() Zod predicate
11
+ *
12
+ * Per-call schemas live INSIDE the generator (AI-SPEC §3 Pitfall 1 — no
13
+ * module-scope mutables that change across calls).
14
+ *
15
+ * Schema is in the D0045 trust-boundary scan root; `satisfies` clause is
16
+ * mandatory (cards/ is a SCAN_ROOT in check-trust-boundary-satisfies.ts).
17
+ *
18
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3
19
+ */
20
+ import { z } from "zod";
21
+ import { ConfidenceSchema } from "../../../schemas/confidence-schema.js";
22
+ import { modelId as mkModelId } from "../../../ports/llm-client.js";
23
+ import { buildFailureModeRefinement } from "../card-validators.js";
24
+ import { buildWeakestAreaPrompt } from "../prompt-builders.js";
25
+ // ---------------------------------------------------------------------------
26
+ // Module-level schema: static shape + D-05 taxonomy refine
27
+ // ---------------------------------------------------------------------------
28
// Field-level shape shared by every weakest-area body.
const weakestAreaFields = {
  summary: z.string().min(1).max(800),
  area: z.string().min(1),
  dimension: z.string().min(1),
  failureMode: z.string().min(1),
  sampleSize: z.number().int().nonnegative(),
  confidence: ConfidenceSchema,
};

// Issue reported when the taxonomy predicate rejects a body.
const weakestAreaTaxonomyIssue = {
  message: "failureMode is not in the canonical taxonomy for this dimension",
  path: ["failureMode"],
};

/**
 * Static shape of the weakest-area card body plus the taxonomy constraint
 * (D-05) supplied by `buildFailureModeRefinement()`.
 *
 * Constraints that need runtime `report` data are not part of this
 * module-level schema; they are added by a schema built inside the generator.
 */
export const WeakestAreaBodySchema = z
  .object(weakestAreaFields)
  .refine(buildFailureModeRefinement(), weakestAreaTaxonomyIssue);
46
+ // ---------------------------------------------------------------------------
47
+ // Generator
48
+ // ---------------------------------------------------------------------------
49
+ const CARD_MODEL = mkModelId("anthropic:claude-sonnet-4-6");
50
/**
 * Generate the `weakest-area` diagnosis card.
 *
 * Returns a `missing` card without calling the model when no LLM client is
 * wired or when `report.summary.scores` is absent/empty.
 *
 * Otherwise asks `ctx.llm.completeStructured` for a body validated against a
 * per-call schema that layers three object-level refinements on the static
 * shape:
 *   - taxonomy: predicate from `buildFailureModeRefinement()`
 *   - W2: `sampleSize` must equal the judgment count of the lowest-scoring
 *     area (`testCount ?? 0`), the same number the prompt tells the model to
 *     echo back
 *   - W3: `sampleSize < 10` forces `confidence.level === "low"`
 *
 * Errors thrown by the LLM client are not caught here and propagate.
 *
 * @param report Report to analyse; reads `report.summary.scores`.
 * @param ctx Generator context; reads `ctx.llm` and `ctx.runId`.
 * @returns Promise resolving to a `missing` or `ready` card object.
 */
export const generateWeakestArea = async (report, ctx) => {
  // C1: no LLM → missing
  if (!ctx.llm) {
    return {
      status: "missing",
      cardType: "weakest-area",
      reason: "no LLMClient wired",
    };
  }
  const scores = report.summary.scores ?? [];
  if (scores.length === 0) {
    return {
      status: "missing",
      cardType: "weakest-area",
      reason: "report has no areas",
    };
  }
  // Same selection as the weakest-area prompt builder: lowest totalScore wins
  // (sort is stable, so ties keep report order); a missing testCount counts as 0.
  const [weakest] = [...scores].sort((a, b) => a.totalScore - b.totalScore);
  const expectedSampleSize = weakest.testCount ?? 0;
  // Per-call schema: closes over the report-derived sample size
  // (AI-SPEC §3 Pitfall 1 — never hoist to module scope)
  const PerCallSchema = z
    .object({
      summary: z.string().min(1).max(800),
      area: z.string().min(1),
      dimension: z.string().min(1),
      failureMode: z.string().min(1),
      sampleSize: z.number().int().nonnegative(),
      confidence: ConfidenceSchema,
    })
    .refine(buildFailureModeRefinement(), {
      message: "failureMode is not in the canonical taxonomy for this dimension",
      path: ["failureMode"],
    })
    // W2: the model may not invent a sample size
    .refine((body) => body.sampleSize === expectedSampleSize, {
      message: `sampleSize must equal the weakest area's judgment count (${expectedSampleSize})`,
      path: ["sampleSize"],
    })
    // W3: small-sample forces confidence.level = "low"
    .refine((body) => body.sampleSize >= 10 || body.confidence.level === "low", {
      message: 'When sampleSize < 10, confidence.level must be "low"',
      path: ["confidence", "level"],
    });
  const prompt = buildWeakestAreaPrompt(report);
  // `cost` and `model` come back from the LLM client and are surfaced in
  // the card metadata.
  const { value, usage, cost, model } = await ctx.llm.completeStructured({
    model: CARD_MODEL,
    prompt: `${prompt.system}\n\n${prompt.user}`,
    schema: PerCallSchema,
    temperature: 0.1,
    maxTokens: 2000,
    context: {
      feature: "diagnosis",
      runId: ctx.runId,
      cardId: "weakest-area",
    },
  });
  return {
    status: "ready",
    cardType: "weakest-area",
    body: value,
    meta: {
      cardVersion: "weakest-area@0.1.0",
      tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
      generatedAt: new Date().toISOString(),
      cost,
      model,
    },
  };
};
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Deterministic prompt-builder helpers for the 5 LLM diagnosis cards.
3
+ *
4
+ * Each function takes typed inputs and returns `{ system, user }` strings
5
+ * (+ a `deltas` side-channel for regression-vs-baseline). The same inputs
6
+ * always produce the same output — no randomness, no side effects.
7
+ *
8
+ * Per AI-SPEC §4: input tokens are bounded by truncation; the user message
9
+ * projects only the fields each card needs.
10
+ *
11
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
13
+ */
14
+ import type { JudgmentAttribution } from "../../types/attribution.js";
15
+ import type { Report } from "../../types/index.js";
16
+ /**
17
+ * Build the allow-list of doc slugs for a report. This is the union of:
18
+ * - report.summary.documentManifest[].slug
19
+ * - per-score documents[].slug
20
+ *
21
+ * Only slugs (not documentIds) appear in the allow-list because the cards
22
+ * reference human-readable slugs. DocumentId-only entries are skipped.
23
+ */
24
+ export declare function buildDocSlugAllowList(report: Report): Set<string>;
25
+ /**
26
+ * Projects the 3 weakest areas + top failure modes + doc-slug allow-list
27
+ * into the user message (~2500 input tokens).
28
+ */
29
+ export declare function buildTopRecommendationsPrompt(report: Report, allowList: Set<string>): {
30
+ system: string;
31
+ user: string;
32
+ };
33
+ /**
34
+ * Projects the single weakest area + full failure-mode breakdown.
35
+ */
36
+ export declare function buildWeakestAreaPrompt(report: Report): {
37
+ system: string;
38
+ user: string;
39
+ };
40
+ /**
41
+ * Filters to low-confidence entries (or top-N if none low), produces a table
42
+ * for the LLM (~2000 input tokens).
43
+ *
44
+ * Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
45
+ */
46
+ export declare function buildLowConfidenceAttributionPrompt(report: Report, judgmentAttributions: JudgmentAttribution[]): {
47
+ system: string;
48
+ user: string;
49
+ };
50
+ /**
51
+ * Groups attributions by documentId, ranks the documents that have a slug by
52
+ * mean attribution score, and emits the top 5 as a table for the LLM.
53
+ *
54
+ * Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
55
+ */
56
+ export declare function buildDocAttributionSpotlightPrompt(report: Report, judgmentAttributions: JudgmentAttribution[]): {
57
+ system: string;
58
+ user: string;
59
+ };
60
+ /**
61
+ * Computes per-area deltas in JS (failure-mode #1 mitigation), then builds
62
+ * the user message. Returns `deltas` as a side-channel for the per-call
63
+ * schema `.refine()`.
64
+ */
65
+ export declare function buildRegressionVsBaselinePrompt(report: Report, baseline: Report): {
66
+ system: string;
67
+ user: string;
68
+ deltas: {
69
+ area: string;
70
+ pointsDelta: number;
71
+ }[];
72
+ };
@@ -0,0 +1,286 @@
1
+ /**
2
+ * Deterministic prompt-builder helpers for the 5 LLM diagnosis cards.
3
+ *
4
+ * Each function takes typed inputs and returns `{ system, user }` strings
5
+ * (+ a `deltas` side-channel for regression-vs-baseline). The same inputs
6
+ * always produce the same output — no randomness, no side effects.
7
+ *
8
+ * Per AI-SPEC §4: input tokens are bounded by truncation; the user message
9
+ * projects only the fields each card needs.
10
+ *
11
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §3 lines 603-664
13
+ */
14
+ import { TOP_RECOMMENDATIONS_SYSTEM_PROMPT, WEAKEST_AREA_SYSTEM_PROMPT, LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT, DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT, REGRESSION_VS_BASELINE_SYSTEM_PROMPT, } from "./prompts/index.js";
15
+ // ---------------------------------------------------------------------------
16
+ // Shared helper: build the docSlug allow-list from a report
17
+ // ---------------------------------------------------------------------------
18
/**
 * Build the allow-list of doc slugs for a report.
 *
 * The result is the union of the non-empty string `slug` values found on:
 *   - `report.summary.documentManifest[]`
 *   - `report.summary.scores[].documents[]`
 *
 * Only slugs (not documentIds) are collected because the cards reference
 * human-readable slugs. Entries without a usable slug — documentId-only
 * entries, empty or non-string slugs, and `null`/primitive entries — are
 * skipped rather than throwing.
 *
 * @param report Report to scan; `documentManifest`, `scores` and each score's
 *   `documents` are all optional.
 * @returns {Set<string>} Slugs in first-seen order (manifest, then scores).
 */
export function buildDocSlugAllowList(report) {
  const slugs = new Set();
  // Optional chaining tolerates null/undefined entries, on which the `in`
  // operator would raise a TypeError.
  const collect = (docs) => {
    for (const doc of docs ?? []) {
      const slug = doc?.slug;
      if (typeof slug === "string" && slug.length > 0) {
        slugs.add(slug);
      }
    }
  };
  collect(report.summary.documentManifest);
  for (const score of report.summary.scores ?? []) {
    collect(score?.documents);
  }
  return slugs;
}
45
+ // ---------------------------------------------------------------------------
46
+ // top-recommendations prompt builder
47
+ // ---------------------------------------------------------------------------
48
/**
 * Build the prompt for the top-recommendations card.
 *
 * The user message projects:
 *   - the 3 lowest-scoring areas (ascending `totalScore`; ties keep report
 *     order), each with its score and judgment count — a missing `testCount`
 *     is rendered as 0, matching the weakest-area builder
 *   - up to 5 entries of `failureModes.topTitles` as `category (count)`
 *   - up to 50 slugs from `allowList`, or a placeholder when it is empty
 *
 * @param report Report to project; reads `report.summary.scores` and
 *   `report.summary.failureModes`.
 * @param allowList Set of doc slugs the model is allowed to reference.
 * @returns {{ system: string, user: string }} Prompt parts.
 */
export function buildTopRecommendationsPrompt(report, allowList) {
  const scores = report.summary.scores ?? [];
  const weakest3 = [...scores]
    .sort((a, b) => a.totalScore - b.totalScore)
    .slice(0, 3);
  const topModes =
    report.summary.failureModes?.topTitles
      ?.slice(0, 5)
      .map((t) => `${t.category} (${t.count})`) ?? [];
  // Cap at 50 slugs to bound the prompt size.
  const shownSlugs = [...allowList].slice(0, 50);
  const user = [
    "## Weakest Areas",
    weakest3
      .map(
        (s) =>
          `- ${s.feature}: totalScore=${s.totalScore}, judgmentCount=${s.testCount ?? 0}`,
      )
      .join("\n"),
    "",
    "## Top Failure Modes",
    topModes.length > 0 ? topModes.join(", ") : "(none recorded)",
    "",
    "## Document Slug Allow-List",
    "Suggestions MUST use one of these slugs:",
    shownSlugs.length > 0
      ? shownSlugs.map((s) => `- ${s}`).join("\n")
      : "(none available)",
    "",
    "Generate 1-5 actionable recommendations targeting the weakest areas above.",
  ].join("\n");
  return { system: TOP_RECOMMENDATIONS_SYSTEM_PROMPT, user };
}
78
+ // ---------------------------------------------------------------------------
79
+ // weakest-area prompt builder
80
+ // ---------------------------------------------------------------------------
81
/**
 * Build the prompt for the weakest-area card.
 *
 * Picks the lowest-scoring entry of `report.summary.scores` (ties keep report
 * order) and projects its scores and judgment count, followed by up to 5
 * entries of `failureModes.topTitles`. The message pins the `sampleSize` the
 * model must echo back and, when that count is below 10, appends a warning
 * that confidence must be "low".
 *
 * Sections are separated by blank lines; the warning line is only present
 * when it applies.
 *
 * @param report Report to project; reads `report.summary.scores` and
 *   `report.summary.failureModes`.
 * @returns {{ system: string, user: string }} Prompt parts. With no scores
 *   the user message is the fixed string "No areas in report.".
 */
export function buildWeakestAreaPrompt(report) {
  const scores = report.summary.scores ?? [];
  if (scores.length === 0) {
    return {
      system: WEAKEST_AREA_SYSTEM_PROMPT,
      user: "No areas in report.",
    };
  }
  const [weakest] = [...scores].sort((a, b) => a.totalScore - b.totalScore);
  const judgmentCount = weakest.testCount ?? 0;
  const topTitles = report.summary.failureModes?.topTitles ?? [];
  const topMode = topTitles[0]?.category ?? "unclassified";
  // An empty list must fall back explicitly: joining [] yields "", which `??`
  // would not replace.
  const breakdown =
    topTitles.length > 0
      ? topTitles
          .slice(0, 5)
          .map((t) => `- ${t.category}: ${t.count} judgments`)
          .join("\n")
      : "(no data)";
  const lines = [
    "## Weakest Area",
    `Feature: ${weakest.feature}`,
    `Total Score: ${weakest.totalScore}`,
    `Ceiling Score: ${weakest.ceilingScore}`,
    `Floor Score: ${weakest.floorScore}`,
    `Judgment Count (sampleSize): ${judgmentCount}`,
    "",
    "## Top Failure Mode Observed",
    `Category: ${topMode}`,
    "",
    "## Failure Mode by Dimension",
    breakdown,
    "",
    "Identify the area, its primary dimension, and most frequent failure mode.",
    `sampleSize in your response MUST equal exactly ${judgmentCount} (the judgment count above).`,
  ];
  // Append the warning conditionally instead of filtering falsy entries,
  // which would also delete the blank separator lines above.
  if (judgmentCount < 10) {
    lines.push(
      `WARNING: sampleSize=${judgmentCount} < 10 — you MUST set confidence.level = "low".`,
    );
  }
  return { system: WEAKEST_AREA_SYSTEM_PROMPT, user: lines.join("\n") };
}
122
+ // ---------------------------------------------------------------------------
123
+ // low-confidence-attribution prompt builder
124
+ // ---------------------------------------------------------------------------
125
/**
 * Build the prompt for the low-confidence-attribution card.
 *
 * Selects the judgments that have at least one attribution with
 * `confidence.level === "low"`. If there are none, falls back to every
 * judgment with a non-empty `attributions` array, ordered by its lowest
 * attribution score ascending. The selection is capped at 20 rows and
 * rendered as a table whose last column is the worst confidence level seen
 * for that judgment.
 *
 * Callers are expected to short-circuit on empty input before calling this;
 * as a defensive fallback, input with no non-empty `attributions` yields a
 * fixed placeholder user message.
 *
 * @param report Unused by this builder; kept for a uniform builder signature.
 * @param judgmentAttributions Per-judgment attribution records.
 * @returns {{ system: string, user: string }} Prompt parts.
 */
export function buildLowConfidenceAttributionPrompt(report, judgmentAttributions) {
  const lowConf = judgmentAttributions.filter((ja) =>
    ja.attributions.some((a) => a.confidence.level === "low"),
  );
  // Entries with empty `attributions` have no minimum score (`Math.min()` of
  // nothing is Infinity) and cannot back a judgment reference, so they are
  // excluded from the fallback ordering.
  const validAttrs = judgmentAttributions.filter((ja) => ja.attributions.length > 0);
  if (validAttrs.length === 0) {
    return {
      system: LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT,
      user: "(no attribution data with non-empty attributions)",
    };
  }
  // Fallback ordering: compute each entry's minimum score once, then sort on
  // it (stable sort keeps input order for ties).
  const source =
    lowConf.length > 0
      ? lowConf
      : validAttrs
          .map((ja) => ({
            ja,
            minScore: Math.min(...ja.attributions.map((x) => x.score)),
          }))
          .sort((a, b) => a.minScore - b.minScore)
          .map(({ ja }) => ja);
  // Cap at 20 to stay within token budget
  const capped = source.slice(0, 20);
  // Lower rank = less confident. Levels outside this map never lower the result.
  const confidenceRank = new Map([
    ["low", 0],
    ["medium", 1],
    ["high", 2],
  ]);
  const tableRows = capped
    .map((ja) => {
      let minConf = "high";
      for (const { confidence } of ja.attributions) {
        const rank = confidenceRank.get(confidence.level);
        if (rank !== undefined && rank < confidenceRank.get(minConf)) {
          minConf = confidence.level;
        }
      }
      return `| ${ja.judgmentRef} | ${ja.taskId} | ${ja.modelId} | ${ja.dimension} | ${minConf} |`;
    })
    .join("\n");
  const user = [
    "## Per-Judgment Attribution Confidence",
    `Total entries: ${judgmentAttributions.length}; Low-confidence: ${lowConf.length}`,
    "",
    "| judgmentRef | taskId | modelId | dimension | minConfidence |",
    "|-------------|--------|---------|-----------|---------------|",
    tableRows,
    "",
    lowConf.length === 0
      ? "No low-confidence entries found. Return the highest-uncertainty entry and note that confidence is well-calibrated."
      : `Identify the ${Math.min(lowConf.length, 5)} most uncertain judgments.`,
  ].join("\n");
  return { system: LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT, user };
}
183
+ // ---------------------------------------------------------------------------
184
+ // doc-attribution-spotlight prompt builder
185
+ // ---------------------------------------------------------------------------
186
/**
 * Builds the prompt for the doc-attribution-spotlight card.
 *
 * Groups every attribution by `documentId`, averages the scores recorded for
 * each document, keeps only documents that carry a slug, and lists the five
 * with the highest average score in a markdown table for the LLM.
 *
 * Caller short-circuits on empty `judgmentAttributions` BEFORE calling this.
 * The builder still defends against inputs that leave nothing to show (every
 * entry has an empty `attributions` list, or no document has a slug) by
 * returning a placeholder user message instead of an empty table.
 *
 * @param {object} report - Current report. Not read by this builder; accepted
 *   so the existing call signature keeps working.
 * @param {Array<{attributions: Array<{documentId: string, slug?: string, score: number}>}>} judgmentAttributions
 *   Per-judgment attribution entries.
 * @returns {{system: string, user: string}} System and user prompt strings.
 */
export function buildDocAttributionSpotlightPrompt(report, judgmentAttributions) {
  // Aggregate by documentId (D0052 canonical ref)
  const byDoc = new Map();
  for (const ja of judgmentAttributions) {
    for (const a of ja.attributions) {
      // A non-finite score would turn the document's average into NaN, which
      // breaks the ranking comparator and prints "NaN" in the table.
      if (!Number.isFinite(a.score)) {
        continue;
      }
      let entry = byDoc.get(a.documentId);
      if (entry === undefined) {
        entry = { slug: a.slug, scoreSum: 0, count: 0 };
        byDoc.set(a.documentId, entry);
      }
      entry.scoreSum += a.score;
      entry.count += 1;
      // Backfill the slug when an earlier attribution for this doc lacked one.
      if (a.slug && !entry.slug) {
        entry.slug = a.slug;
      }
    }
  }

  // Rank by average score (descending) and keep the top 5. Slug-less docs are
  // dropped because the model must echo a slug from the table (the original
  // code notes an allow-list check in Zod downstream).
  const topDocs = [...byDoc.entries()]
    .map(([documentId, v]) => ({
      documentId,
      slug: v.slug,
      aggregateScore: v.scoreSum / v.count,
      signalCount: v.count,
    }))
    .filter((d) => d.slug)
    .sort((a, b) => b.aggregateScore - a.aggregateScore)
    .slice(0, 5);

  // With no rows the "for each document in the table" instruction has nothing
  // to act on and invites invented slugs; emit a placeholder instead.
  if (topDocs.length === 0) {
    return {
      system: DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT,
      user: "(no attributed documents with slugs)",
    };
  }

  const tableRows = topDocs
    .map((d) => `| ${d.documentId} | ${d.slug} | ${d.aggregateScore.toFixed(3)} | ${d.signalCount} |`)
    .join("\n");

  const user = [
    "## Top Documents by Attribution Score",
    `Total unique documents: ${byDoc.size}`,
    "",
    "| documentId | slug | aggregateScore | signalCount |",
    "|------------|------|----------------|-------------|",
    tableRows,
    "",
    "For each document in the table, determine its role (supports/contradicts/missing/irrelevant).",
    "docSlug in your output MUST exactly match the slug column — do not invent slugs.",
  ].join("\n");

  return { system: DOC_ATTRIBUTION_SPOTLIGHT_SYSTEM_PROMPT, user };
}
237
+ // ---------------------------------------------------------------------------
238
+ // regression-vs-baseline prompt builder
239
+ // ---------------------------------------------------------------------------
240
/**
 * Builds the prompt for the regression-vs-baseline card.
 *
 * Computes per-area deltas in JS (failure-mode #1 mitigation) so the LLM only
 * echoes numbers instead of doing arithmetic, then builds the user message.
 * Returns `deltas` as a side-channel for the per-call schema `.refine()`.
 *
 * Only areas present in BOTH reports with finite numeric scores get a delta.
 * Deltas are rounded to two decimals, ordered by absolute size (largest
 * first) and capped at 10 rows.
 *
 * @param {{summary: {scores?: Array<{feature: string, totalScore: number}>}, provenance: {runId: string}}} report
 *   Current run report.
 * @param {{summary: {scores?: Array<{feature: string, totalScore: number}>}, provenance: {runId: string}}} baseline
 *   Baseline run report to compare against.
 * @returns {{system: string, user: string, deltas: Array<{area: string, pointsDelta: number}>}}
 *   Prompt strings plus the exact deltas shown in the table.
 */
export function buildRegressionVsBaselinePrompt(report, baseline) {
  // Build area → score maps
  const toAreaMap = (scores) => new Map((scores ?? []).map((s) => [s.feature, s.totalScore]));
  const currentByArea = toAreaMap(report.summary.scores);
  const baselineByArea = toAreaMap(baseline.summary.scores);

  const deltas = [];
  for (const [area, current] of currentByArea) {
    const base = baselineByArea.get(area);
    // Skip areas missing from the baseline and any non-numeric score: a NaN
    // delta would be presented to the model as a "FACT" and would also make
    // the sort comparator below inconsistent.
    if (!Number.isFinite(current) || !Number.isFinite(base)) {
      continue;
    }
    const rounded = Number.parseFloat((current - base).toFixed(2));
    // A tiny negative difference rounds to "-0.00" and parses to -0; normalise
    // it so the side-channel value is a plain 0 for strict comparisons.
    deltas.push({ area, pointsDelta: rounded === 0 ? 0 : rounded });
  }

  // Sort by absolute delta descending (most changed first), cap at 10
  const topDeltas = [...deltas]
    .sort((a, b) => Math.abs(b.pointsDelta) - Math.abs(a.pointsDelta))
    .slice(0, 10);

  const describeDirection = (delta) => {
    if (delta > 0) {
      return "improved";
    }
    return delta < 0 ? "regressed" : "unchanged";
  };
  const deltaRows = topDeltas
    .map((d) => `| ${d.area} | ${d.pointsDelta > 0 ? "+" : ""}${d.pointsDelta} | ${describeDirection(d.pointsDelta)} |`)
    .join("\n");

  const user = [
    "## Pre-Computed Score Deltas (current minus baseline)",
    "These values are FACTS — do not modify them.",
    "",
    "| area | pointsDelta | expectedDirection |",
    "|------|-------------|-------------------|",
    deltaRows,
    "",
    "For each row, echo the exact area + pointsDelta, assign the matching direction label, and add prose drivers.",
    "Do NOT round or change any numeric value.",
    "",
    `Current run: ${report.provenance.runId}`,
    `Baseline run: ${baseline.provenance.runId}`,
  ].join("\n");

  return {
    system: REGRESSION_VS_BASELINE_SYSTEM_PROMPT,
    user,
    deltas: topDeltas,
  };
}