@sanity/ailf 5.0.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/config/diagnosis-cards.ts +318 -0
  2. package/config/models.ts +12 -0
  3. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  4. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  5. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  6. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  7. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  8. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  9. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  10. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  11. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  12. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  15. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  16. package/dist/_vendor/ailf-core/index.js +4 -0
  17. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  18. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  19. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  20. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  21. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  22. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +171 -0
  23. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  24. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  28. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +104 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +96 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +39 -0
  32. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +52 -0
  33. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  34. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +77 -0
  35. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  36. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  37. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  38. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +126 -0
  39. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  40. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +107 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +114 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +273 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +16 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +86 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
  60. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
  61. package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
  62. package/dist/_vendor/ailf-core/services/index.js +15 -2
  63. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  64. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  65. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +112 -10
  66. package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
  67. package/dist/_vendor/ailf-core/types/index.d.ts +1 -1
  68. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  69. package/dist/adapters/llm/fake-llm-client.js +38 -1
  70. package/dist/adapters/llm/openai-llm-client.js +52 -3
  71. package/dist/cli-program.js +3 -0
  72. package/dist/commands/interpret.d.ts +50 -0
  73. package/dist/commands/interpret.js +212 -0
  74. package/dist/composition-root.d.ts +21 -23
  75. package/dist/composition-root.js +107 -41
  76. package/dist/config/diagnosis-cards.ts +318 -0
  77. package/dist/config/models.ts +12 -0
  78. package/dist/grader/agent-harness.d.ts +5 -10
  79. package/dist/grader/agent-harness.js +5 -13
  80. package/dist/grader/common.d.ts +5 -13
  81. package/dist/grader/common.js +5 -17
  82. package/dist/grader/index.d.ts +15 -29
  83. package/dist/grader/index.js +15 -66
  84. package/dist/grader/knowledge-probe.d.ts +5 -10
  85. package/dist/grader/knowledge-probe.js +5 -14
  86. package/dist/grader/literacy.d.ts +5 -9
  87. package/dist/grader/literacy.js +5 -13
  88. package/dist/grader/mcp.d.ts +5 -10
  89. package/dist/grader/mcp.js +5 -14
  90. package/package.json +2 -2
@@ -8,12 +8,15 @@
8
8
  * discriminator inside the `ready` variant.
9
9
  *
10
10
  * Phase 1 lands placeholder body shapes; Phase 5 enriches each per
11
- * Doc 05 specs.
11
+ * AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
12
+ * union surface (arms + `cardType` literals) is stable — only the
13
+ * `body: <BodyInterface>` references resolve to richer shapes.
12
14
  *
13
15
  * @see docs/decisions/D0049-shared-confidence-contract.md
14
16
  * @see docs/decisions/D0052-judgment-ref-granularity.md
15
17
  * @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
16
18
  */
19
+ import type { Confidence } from "./confidence.js";
17
20
  import type { RunId } from "./branded-ids.js";
18
21
  import type { ReportId } from "./index.js";
19
22
  /**
@@ -54,44 +57,139 @@ export interface CardMeta {
54
57
  }
55
58
  /**
56
59
  * A single actionable suggestion surfaced by a recommendations card.
57
- * The full Phase 5 shape may add fields (per Doc 05 specs); Phase 1
58
- * locks the minimum required surface.
60
+ *
61
+ * Phase 5 adds `docSlug` (the canonical doc page to rewrite) and
62
+ * `sectionHeading` (null when the suggestion targets the whole page)
63
+ * per AI-SPEC actionability-specificity rubric + failure-mode #2
64
+ * mitigation.
59
65
  */
60
66
  export interface ActionSuggestion {
61
67
  title: string;
62
68
  body: string;
63
69
  priority: "high" | "medium" | "low";
70
+ /** Canonical slug of the documentation page this suggestion targets. */
71
+ docSlug: string;
72
+ /**
73
+ * Heading within `docSlug` that should be revised, or `null` when the
74
+ * suggestion targets the page as a whole.
75
+ */
76
+ sectionHeading: string | null;
77
+ }
78
+ /**
79
+ * Minimal judgment reference per D0052 (taskId × modelId × dimension).
80
+ * Used by `LowConfidenceAttributionBody.judgmentRefs` to cite the
81
+ * specific judgments that drove a low-confidence finding.
82
+ */
83
+ export interface JudgmentRef {
84
+ taskId: string;
85
+ modelId: string;
86
+ dimension: string;
64
87
  }
65
88
  /**
66
- * Phase 1 body placeholders. Each shape is intentionally minimal; Phase 5
67
- * card files enrich them per Doc 05 specs and assert
68
- * `satisfies z.ZodType<Extract<DiagnosisCard, { status: "ready"; cardType: "X" }>["body"]>`
69
- * against these declarations.
89
+ * Phase 5 enriched body shapes. Each keeps `summary: string` (load-bearing
90
+ * for CLI default render per AI-SPEC §6) and adds fields the corresponding
91
+ * Zod schema needs (asserting `satisfies z.ZodType<T>` in the card file).
70
92
  */
93
+ /** area-summary: deterministic — keep only summary (no behavioral claims). */
71
94
  export interface AreaSummaryBody {
72
95
  summary: string;
73
96
  }
97
+ /**
98
+ * failure-mode-summary: deterministic + D-05 dimension/failureMode gate.
99
+ * `count` = frequency in the report; `sampleSize` = judgment count for the
100
+ * dimension (per AI-SPEC failure-mode #3 mitigation).
101
+ */
74
102
  export interface FailureModeSummaryBody {
75
103
  summary: string;
104
+ /** Rubric dimension this summary targets (e.g. "task-completion"). */
105
+ dimension: string;
106
+ /** Canonical failure mode within this dimension. */
107
+ failureMode: string;
108
+ /** Number of judgments in this report with this failure mode. */
109
+ count: number;
110
+ /** Total judgments for this dimension — calibration denominator. */
111
+ sampleSize: number;
76
112
  }
113
+ /**
114
+ * no-issues: deterministic + AI-SPEC failure-mode #7 sycophancy guard.
115
+ * `thresholdScore` surfaces the threshold used to qualify as "no issues"
116
+ * so readers can see the criterion behind the positive assessment.
117
+ */
77
118
  export interface NoIssuesBody {
78
119
  summary: string;
120
+ /** Minimum composite score that qualified this area as "no issues". */
121
+ thresholdScore: number;
79
122
  }
123
+ /**
124
+ * top-recommendations: LLM-driven. `suggestions` reuses the enriched
125
+ * `ActionSuggestion` shape (docSlug + sectionHeading per AI-SPEC
126
+ * actionability-specificity rubric + failure-mode #2 mitigation).
127
+ */
80
128
  export interface TopRecommendationsBody {
81
129
  summary: string;
82
130
  suggestions: ActionSuggestion[];
83
131
  }
132
+ /**
133
+ * weakest-area: LLM-driven. Adds area identification, dimension/failureMode
134
+ * context, and a small-sample calibration guard (AI-SPEC failure-mode #3).
135
+ */
84
136
  export interface WeakestAreaBody {
85
137
  summary: string;
138
+ /** Documentation area with the lowest composite score. */
139
+ area: string;
140
+ /** Primary dimension driving the low score. */
141
+ dimension: string;
142
+ /** Dominant failure mode in this area. */
143
+ failureMode: string;
144
+ /** Number of judgments sampled for this area — calibration denominator. */
145
+ sampleSize: number;
146
+ /** Calibrated confidence per D0049 (ensemble-stdev derivation). */
147
+ confidence: Confidence;
86
148
  }
87
- export interface LowConfidenceAttributionBody {
149
+ /**
150
+ * regression-vs-baseline: LLM-driven. `deltas` is the per-area diff
151
+ * (JS-computed pre-call, max 10 entries); `drivers` is LLM prose;
152
+ * `overallTrend` is a 4-bucket summary per AI-SPEC §3 lines 605-613.
153
+ */
154
+ export interface RegressionVsBaselineBody {
88
155
  summary: string;
156
+ /**
157
+ * Per-area score deltas (max 10). `drivers` carries the LLM's prose
158
+ * reasoning about what caused the change.
159
+ */
160
+ deltas: {
161
+ area: string;
162
+ direction: "improved" | "regressed" | "unchanged";
163
+ pointsDelta: number;
164
+ drivers: string[];
165
+ }[];
166
+ /** 4-bucket aggregate trend across all deltas. */
167
+ overallTrend: "net-improved" | "net-regressed" | "mixed" | "stable";
89
168
  }
90
- export interface DocAttributionSpotlightBody {
169
+ /**
170
+ * low-confidence-attribution: LLM-driven. `judgmentRefs` cites the
171
+ * specific judgments (D0052 triple) that drove the low-confidence finding.
172
+ */
173
+ export interface LowConfidenceAttributionBody {
91
174
  summary: string;
175
+ /** Judgment references (D0052) driving this low-confidence finding. */
176
+ judgmentRefs: JudgmentRef[];
92
177
  }
93
- export interface RegressionVsBaselineBody {
178
+ /**
179
+ * doc-attribution-spotlight: LLM-driven. `docCitations` carries per-doc
180
+ * attribution roles and confidence calibration (AI-SPEC failure-mode #5).
181
+ */
182
+ export interface DocAttributionSpotlightBody {
94
183
  summary: string;
184
+ /**
185
+ * Per-doc attribution records. `role` classifies how the doc contributed;
186
+ * `confidence` calibrates the attribution certainty (D0049).
187
+ */
188
+ docCitations: {
189
+ docSlug: string;
190
+ confidence: Confidence;
191
+ role: "supports" | "contradicts" | "missing" | "irrelevant";
192
+ }[];
95
193
  }
96
194
  /**
97
195
  * Outer-`status` discriminated union: 8 ready variants (one per
@@ -102,6 +200,10 @@ export interface RegressionVsBaselineBody {
102
200
  * No `not-yet-generated` variant — old-report fallback is a Phase 7
103
201
  * concern at the slim-shape boundary, handled at fetch-time, not in
104
202
  * `DiagnosisCard` itself.
203
+ *
204
+ * D-07: only the `body: <BodyInterface>` references resolve to richer
205
+ * shapes. The union arms, status literals, and cardType literals are
206
+ * identical to Phase 1.
105
207
  */
106
208
  export type DiagnosisCard = {
107
209
  status: "ready";
@@ -8,7 +8,9 @@
8
8
  * discriminator inside the `ready` variant.
9
9
  *
10
10
  * Phase 1 lands placeholder body shapes; Phase 5 enriches each per
11
- * Doc 05 specs.
11
+ * AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
12
+ * union surface (arms + `cardType` literals) is stable — only the
13
+ * `body: <BodyInterface>` references resolve to richer shapes.
12
14
  *
13
15
  * @see docs/decisions/D0049-shared-confidence-contract.md
14
16
  * @see docs/decisions/D0052-judgment-ref-granularity.md
@@ -36,7 +36,7 @@ export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
36
36
  export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, JudgmentId, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
37
37
  export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
38
38
  export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
39
- export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
39
+ export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
40
40
  export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
41
41
  export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
42
42
  export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
@@ -40,9 +40,29 @@ export declare class FakeLLMClient implements LLMClient {
40
40
  readonly calls: FakeCallRecord[];
41
41
  private readonly completeQueue;
42
42
  private readonly structuredQueue;
43
+ /**
44
+ * Per-cardId keyed responses. A single-value entry is returned on every
45
+ * call for that cardId (repeated calls always get the same response). An
46
+ * array-value entry is consumed in order; once exhausted, calls for that
47
+ * cardId fall back to the FIFO structuredQueue.
48
+ *
49
+ * This is the substrate Plan 07's 17-fixture eval matrix uses to wire
50
+ * deterministic responses to specific LLM cards.
51
+ */
52
+ private readonly keyedResponses;
43
53
  constructor(args?: {
44
54
  completeResponses?: FakeCompletionResponse[];
45
55
  structuredResponses?: FakeStructuredResponse[];
56
+ /**
57
+ * Optional keyed-response map. Keys are `cardId` values from
58
+ * `args.context.cardId`. When a call matches a key the keyed entry is
59
+ * used instead of the FIFO queue.
60
+ *
61
+ * - Single-value entry: same response on every call for this cardId.
62
+ * - Array-value entry: entries consumed in insertion order; falls back
63
+ * to FIFO (or throws) when the array is exhausted.
64
+ */
65
+ keyedResponses?: Record<string, FakeStructuredResponse | FakeStructuredResponse[]>;
46
66
  });
47
67
  complete(args: LLMCompleteArgs): Promise<LLMCompletion>;
48
68
  completeStructured<T>(args: LLMCompleteStructuredArgs<T>): Promise<LLMStructuredCompletion<T>>;
@@ -11,9 +11,25 @@ export class FakeLLMClient {
11
11
  calls = [];
12
12
  completeQueue;
13
13
  structuredQueue;
14
+ /**
15
+ * Per-cardId keyed responses. A single-value entry is returned on every
16
+ * call for that cardId (repeated calls always get the same response). An
17
+ * array-value entry is consumed in order; once exhausted, calls for that
18
+ * cardId fall back to the FIFO structuredQueue.
19
+ *
20
+ * This is the substrate Plan 07's 17-fixture eval matrix uses to wire
21
+ * deterministic responses to specific LLM cards.
22
+ */
23
+ keyedResponses;
14
24
  constructor(args = {}) {
15
25
  this.completeQueue = [...(args.completeResponses ?? [])];
16
26
  this.structuredQueue = [...(args.structuredResponses ?? [])];
27
+ // Deep-copy arrays so the caller's fixture data is not mutated.
28
+ const keyed = {};
29
+ for (const [key, val] of Object.entries(args.keyedResponses ?? {})) {
30
+ keyed[key] = Array.isArray(val) ? [...val] : val;
31
+ }
32
+ this.keyedResponses = keyed;
17
33
  }
18
34
  async complete(args) {
19
35
  this.calls.push({
@@ -37,13 +53,34 @@ export class FakeLLMClient {
37
53
  };
38
54
  }
39
55
  async completeStructured(args) {
56
+ // Record every call first so test assertions on this.calls are never
57
+ // affected by which branch (keyed vs FIFO) handles the response.
40
58
  this.calls.push({
41
59
  kind: "completeStructured",
42
60
  model: args.model,
43
61
  prompt: args.prompt,
44
62
  ...(args.context ? { context: args.context } : {}),
45
63
  });
46
- const next = this.structuredQueue.shift();
64
+ let next;
65
+ const cardId = args.context?.cardId;
66
+ if (cardId !== undefined && cardId in this.keyedResponses) {
67
+ const entry = this.keyedResponses[cardId];
68
+ if (Array.isArray(entry)) {
69
+ // Array-value: consume one entry per call. When exhausted, fall
70
+ // through to the FIFO queue below.
71
+ if (entry.length > 0) {
72
+ next = entry.shift();
73
+ }
74
+ }
75
+ else {
76
+ // Single-value: return the same response on every call.
77
+ next = entry;
78
+ }
79
+ }
80
+ if (next === undefined) {
81
+ // FIFO fallback (existing behavior)
82
+ next = this.structuredQueue.shift();
83
+ }
47
84
  if (!next) {
48
85
  throw new Error("FakeLLMClient: no more queued structured responses (call exceeded queue)");
49
86
  }
@@ -10,6 +10,7 @@
10
10
  * the adapter never reads `process.env`. The composition root maps env vars
11
11
  * to typed constructor args.
12
12
  */
13
+ import { z } from "zod";
13
14
  import { OpenAIChatResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
14
15
  import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
15
16
  const DEFAULT_BASE_URL = "https://api.openai.com/v1/chat/completions";
@@ -67,10 +68,25 @@ export class OpenAILLMClient {
67
68
  }
68
69
  async completeStructured(args) {
69
70
  const { modelName } = splitModelId(args.model);
71
+ // Derive the JSON Schema from the caller's Zod schema. Zod v4 natively
72
+ // emits `additionalProperties: false` on every nested z.object node —
73
+ // this is required for OpenAI strict-mode.
74
+ const jsonSchema = z.toJSONSchema(args.schema, { target: "draft-2020-12" });
75
+ // OpenAI strict-mode requires the root to be a plain object schema (no
76
+ // anyOf/oneOf/allOf at the top level). Discriminated unions produce
77
+ // anyOf at the root — callers must wrap them in a discriminator object.
78
+ assertSchemaIsObjectRoot(jsonSchema, args.model);
70
79
  const body = buildBody(modelName, args.prompt, {
71
- temperature: args.temperature,
72
- maxTokens: args.maxTokens,
73
- responseFormat: { type: "json_object" },
80
+ temperature: args.temperature ?? 0.1,
81
+ maxTokens: args.maxTokens ?? 2000,
82
+ responseFormat: {
83
+ type: "json_schema",
84
+ json_schema: {
85
+ name: args.context?.cardId ?? "structured_output",
86
+ schema: jsonSchema,
87
+ strict: true,
88
+ },
89
+ },
74
90
  });
75
91
  const data = await this.callApi(body);
76
92
  const raw = data.choices?.[0]?.message?.content;
@@ -84,6 +100,9 @@ export class OpenAILLMClient {
84
100
  catch (err) {
85
101
  throw new Error(`OpenAI structured completion returned invalid JSON for model ${args.model}: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
86
102
  }
103
+ // strict:true guarantees a valid-against-the-schema JSON document, but
104
+ // the Zod parse is still load-bearing — it brands the result as T and is
105
+ // the only contract the engine trusts (D0045 parse-don't-validate).
87
106
  const value = args.schema.parse(parsed);
88
107
  const usage = extractUsage(data.usage);
89
108
  const cost = this.computeCost(modelName, usage);
@@ -145,6 +164,36 @@ export class OpenAILLMClient {
145
164
  `cost_usd=${cost.toFixed(6)}`);
146
165
  }
147
166
  }
167
+ /**
168
+ * Assert that the JSON Schema root is a plain object type.
169
+ *
170
+ * OpenAI strict-mode requires the root schema to be `{ type: "object" }`.
171
+ * A discriminated union (`z.union([...])`) produces `{ anyOf: [...] }` at
172
+ * the root — callers must wrap the union in a discriminator object before
173
+ * passing it to `completeStructured`.
174
+ *
175
+ * Per AI-SPEC §3 Pitfall 6 + T-05-03-01: caught at request-build time to
176
+ * avoid wasting API budget on a guaranteed 400.
177
+ */
178
+ function assertSchemaIsObjectRoot(schema, modelId) {
179
+ if (typeof schema !== "object" || schema === null) {
180
+ throw new Error(`OpenAILLMClient: OpenAI strict-mode requires a single z.object at the ` +
181
+ `schema root for model ${modelId}; got non-object JSON Schema root.`);
182
+ }
183
+ const node = schema;
184
+ if (node.type !== "object") {
185
+ // Identify the kind so the error message is actionable.
186
+ const kind = "anyOf" in node
187
+ ? "z.union"
188
+ : "oneOf" in node
189
+ ? "z.discriminatedUnion"
190
+ : "allOf" in node
191
+ ? "z.intersection"
192
+ : String(node.type ?? "unknown");
193
+ throw new Error(`OpenAILLMClient: OpenAI strict-mode requires a single z.object at the ` +
194
+ `schema root; got ${kind}. Wrap the union in a discriminator object.`);
195
+ }
196
+ }
148
197
  function buildBody(modelName, prompt, opts) {
149
198
  const body = {
150
199
  model: modelName,
@@ -32,6 +32,7 @@ import { createFetchDocsCommand } from "./commands/fetch-docs.js";
32
32
  import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
33
33
  import { createGraderCommand } from "./commands/grader/index.js";
34
34
  import { createInitCommand } from "./commands/init.js";
35
+ import { createInterpretCommand } from "./commands/interpret.js";
35
36
  import { createInteractiveCommand } from "./commands/interactive.js";
36
37
  import { createLookupDocCommand } from "./commands/lookup-doc.js";
37
38
  import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
@@ -110,6 +111,8 @@ export function buildCliProgram(opts) {
110
111
  .addCommand(createWeeklyDigestCommand())
111
112
  .addCommand(createCheckStalenessCommand());
112
113
  program.addCommand(reportCommand.helpGroup(CommandGroup.AnalysisReports));
114
+ // `ailf interpret <reportId>` — top-level (not nested under report) per AI-SPEC
115
+ program.addCommand(createInterpretCommand().helpGroup(CommandGroup.AnalysisReports));
113
116
  // ── Grader Reliability ────────────────────────────────────────────────
114
117
  program.addCommand(createGraderCommand().helpGroup(CommandGroup.GraderReliability));
115
118
  // ── Setup & Configuration ─────────────────────────────────────────────
@@ -0,0 +1,50 @@
1
+ /**
2
+ * interpret command — generate a Diagnosis for a Report.
3
+ *
4
+ * Wraps `getDiagnosisRunner(ctx)` from the composition root in a Commander
5
+ * command for consistent CLI integration. Closest analog: compare.ts.
6
+ *
7
+ * Entry points:
8
+ * ailf interpret <reportId> — one-line-per-card summary
9
+ * ailf interpret <reportId> --json — full Diagnosis JSON
10
+ * ailf interpret latest — most recent report
11
+ * ailf interpret <id> --compare <ref> — DIAG-05 regression comparison
12
+ * ailf interpret <id> --refresh — bypass version-keyed cache
13
+ *
14
+ * @see packages/eval/src/commands/compare.ts — CLI factory analog
15
+ * @see packages/eval/src/composition-root.ts — getDiagnosisRunner
16
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §6
17
+ */
18
+ import { Command } from "commander";
19
+ import type { DiagnosisRunner, VersionedInputs } from "../_vendor/ailf-core/index.d.ts";
20
+ interface MinimalReportStore {
21
+ read(id: string): Promise<unknown | null>;
22
+ latest(): Promise<unknown | null>;
23
+ }
24
+ export interface InterpretCommandOptions {
25
+ /**
26
+ * Override the runner factory for tests. When omitted, the command
27
+ * imports `getDiagnosisRunner` from the composition root at action time.
28
+ */
29
+ readonly runnerFactory?: (ctx: unknown) => DiagnosisRunner;
30
+ /**
31
+ * Override the store factory for tests. When omitted, the command
32
+ * creates the app context and uses `ctx.reportStore` at action time.
33
+ */
34
+ readonly storeFactory?: () => MinimalReportStore | null;
35
+ /**
36
+ * Override the versions resolver for tests. Receives the stored report
37
+ * record and returns the `VersionedInputs` needed by the runner.
38
+ * When omitted, the command derives versions from the report's metadata.
39
+ */
40
+ readonly versionsFromReport?: (report: unknown) => VersionedInputs;
41
+ }
42
+ /**
43
+ * Create the `ailf interpret <reportId>` Commander command.
44
+ *
45
+ * Accepts optional `InterpretCommandOptions` for testability — tests can
46
+ * inject a fake runner factory and store factory without touching module
47
+ * mocks (preferred per testing.md).
48
+ */
49
+ export declare function createInterpretCommand(options?: InterpretCommandOptions): Command;
50
+ export {};
@@ -0,0 +1,212 @@
1
+ /**
2
+ * interpret command — generate a Diagnosis for a Report.
3
+ *
4
+ * Wraps `getDiagnosisRunner(ctx)` from the composition root in a Commander
5
+ * command for consistent CLI integration. Closest analog: compare.ts.
6
+ *
7
+ * Entry points:
8
+ * ailf interpret <reportId> — one-line-per-card summary
9
+ * ailf interpret <reportId> --json — full Diagnosis JSON
10
+ * ailf interpret latest — most recent report
11
+ * ailf interpret <id> --compare <ref> — DIAG-05 regression comparison
12
+ * ailf interpret <id> --refresh — bypass version-keyed cache
13
+ *
14
+ * @see packages/eval/src/commands/compare.ts — CLI factory analog
15
+ * @see packages/eval/src/composition-root.ts — getDiagnosisRunner
16
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §6
17
+ */
18
+ import { dirname, resolve } from "path";
19
+ import { fileURLToPath } from "url";
20
+ import { Command } from "commander";
21
+ import { addOutputDirOption } from "./shared/options.js";
22
+ import { resolveOutputDir } from "./shared/resolve-output-dir.js";
23
+ // ---------------------------------------------------------------------------
24
+ // Module-level root constant (same pattern as compare.ts)
25
+ // ---------------------------------------------------------------------------
26
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Two directory levels above the directory containing this module.
const ROOT = resolve(__dirname, "../..");
28
+ // ---------------------------------------------------------------------------
29
+ // Card output formatting (AI-SPEC §6 graceful-degradation-visibility)
30
+ // ---------------------------------------------------------------------------
31
/**
 * Visual status markers — locked visual contract per plan Test 7:
 * ready: "✓", degraded: "⚠", missing: "—"
 *
 * Keyed by a card's `status`. Frozen so the contract cannot be altered at
 * runtime by an accidental write.
 */
const STATUS_ICONS = Object.freeze({
    ready: "✓",
    degraded: "⚠",
    missing: "—",
});
40
/**
 * Choose the text shown after the card type on a summary line.
 *
 * @param {object} card - Diagnosis card; its `status` selects the text source.
 * @returns {string} `card.body.summary` for a "ready" card, `card.reason` for
 *   every other status.
 */
function getCardSummaryText(card) {
    // "degraded" and "missing" cards both carry their explanation in `reason`.
    return card.status === "ready" ? card.body.summary : card.reason;
}
50
/**
 * Render one card as a single summary line.
 *
 * Output shape: `<icon> <cardType>: <summary>`, where the icon is looked up
 * in STATUS_ICONS by the card's status (AI-SPEC §6: distinct icons for
 * ready / degraded / missing).
 *
 * @param {object} card - Diagnosis card with `status` and `cardType`.
 * @returns {string} The formatted line, without a trailing newline.
 */
function formatCardSummaryLine(card) {
    const marker = STATUS_ICONS[card.status];
    const summary = getCardSummaryText(card);
    return `${marker} ${card.cardType}: ${summary}`;
}
61
+ // ---------------------------------------------------------------------------
62
+ // Default versions resolver
63
+ // ---------------------------------------------------------------------------
64
/**
 * Derive VersionedInputs from a stored report record.
 *
 * Reads the four-version chain from `report.summary.versions`. Each field
 * that is absent or not a string is replaced by its own fallback:
 *   - graderJudgmentsVersion, ensembleVersion → "unknown"
 *   - diagnosisVersion, cardVersion           → "0.1.0"
 *
 * A nullish `report`, or one without `summary` / `versions` (e.g. a legacy
 * report without version metadata), yields the full set of fallbacks instead
 * of throwing.
 *
 * @param {unknown} report - Stored report record.
 * @returns {{graderJudgmentsVersion: string, ensembleVersion: string,
 *   diagnosisVersion: string, cardVersion: string}} Versions for the runner.
 */
function defaultVersionsFromReport(report) {
    // Optional chaining from the root: the report is typed `unknown`, so it
    // may be nullish or lack any of the intermediate objects.
    const versions = report?.summary?.versions;
    // Accept a field only when it is actually a string; otherwise fall back.
    const pick = (field, fallback) => {
        const value = versions?.[field];
        return typeof value === "string" ? value : fallback;
    };
    return {
        graderJudgmentsVersion: pick("graderJudgmentsVersion", "unknown"),
        ensembleVersion: pick("ensembleVersion", "unknown"),
        diagnosisVersion: pick("diagnosisVersion", "0.1.0"),
        cardVersion: pick("cardVersion", "0.1.0"),
    };
}
91
+ // ---------------------------------------------------------------------------
92
+ // Command factory
93
+ // ---------------------------------------------------------------------------
94
/**
 * Create the `ailf interpret <reportId>` Commander command.
 *
 * Accepts optional `InterpretCommandOptions` for testability — tests can
 * inject a fake runner factory and store factory without touching module
 * mocks (preferred per testing.md).
 *
 * The action resolves a report store, loads the requested report (and an
 * optional baseline), resolves versions, runs the diagnosis runner, and
 * prints either the full Diagnosis as JSON or one summary line per card.
 * A missing store, report, or baseline writes an error to stderr and exits
 * with code 1.
 *
 * @param {object} [options] - Optional dependency overrides.
 * @param {Function} [options.runnerFactory] - Called with the app context
 *   (`null` when `storeFactory` is supplied) and returns the runner.
 * @param {Function} [options.storeFactory] - Returns the report store; when
 *   supplied, no app context is created.
 * @param {Function} [options.versionsFromReport] - Maps the loaded report to
 *   the versions passed to the runner.
 * @returns {Command} The configured `interpret` command.
 */
export function createInterpretCommand(options = {}) {
    const { runnerFactory, storeFactory, versionsFromReport } = options;
    // Report a fatal CLI error and terminate with exit code 1. Callers still
    // `return` afterwards so the action stops even when `process.exit` has
    // been stubbed and therefore returns normally.
    const fail = (message) => {
        process.stderr.write(`Error: ${message}\n`);
        process.exit(1);
    };
    const cmd = new Command("interpret")
        .description("Generate a Diagnosis for a Report — 8 typed cards explaining what's weak and what to do")
        .argument("<reportId>", "Report ID (or 'latest' for the most recent)")
        .option("-c, --compare <ref>", "Baseline report ID for regression-vs-baseline comparison")
        .option("--refresh", "Bypass the version-keyed cache and recompute")
        .option("--json", "Print full Diagnosis JSON instead of one-line-per-card summary")
        .action(async (reportId, opts) => {
            const outputDir = resolveOutputDir(opts.outputDir);
            // -----------------------------------------------------------------
            // Resolve store: injected factory (tests) or composition root (production)
            // -----------------------------------------------------------------
            let store;
            let ctx;
            if (storeFactory) {
                store = storeFactory();
                // No app context exists on the injected path.
                ctx = null;
            }
            else {
                // Production path — lazy import to keep the module fast in tests.
                // Minimal config: report-read-only, no eval/fetch/publish.
                const { createAppContext } = await import("../composition-root.js");
                ctx = createAppContext({
                    compareEnabled: false,
                    gapAnalysisEnabled: false,
                    mode: "literacy",
                    noAutoScope: false,
                    noCache: true,
                    noRemoteCache: true,
                    outputDir,
                    publishEnabled: false,
                    rootDir: ROOT,
                    searchMode: "open",
                    skipEval: true,
                    skipFetch: true,
                    remote: false,
                    apiUrl: "https://ailf-api.sanity.build",
                });
                store = ctx.reportStore;
            }
            if (!store) {
                fail("report store is not available");
                return;
            }
            // -----------------------------------------------------------------
            // Resolve main report
            // -----------------------------------------------------------------
            const report = reportId === "latest"
                ? await store.latest()
                : await store.read(reportId);
            if (!report) {
                fail(`report not found: ${reportId}`);
                return;
            }
            // -----------------------------------------------------------------
            // Optionally resolve baseline (DIAG-05)
            // -----------------------------------------------------------------
            let baseline;
            if (opts.compare) {
                baseline = await store.read(opts.compare);
                if (!baseline) {
                    fail(`baseline report not found: ${opts.compare}`);
                    return;
                }
            }
            // -----------------------------------------------------------------
            // Resolve versions
            // -----------------------------------------------------------------
            const versions = versionsFromReport
                ? versionsFromReport(report)
                : defaultVersionsFromReport(report);
            // -----------------------------------------------------------------
            // Build runner
            // -----------------------------------------------------------------
            let runner;
            if (runnerFactory) {
                runner = runnerFactory(ctx);
            }
            else {
                const { getDiagnosisRunner } = await import("../composition-root.js");
                runner = getDiagnosisRunner(ctx);
            }
            // -----------------------------------------------------------------
            // Run diagnosis
            // -----------------------------------------------------------------
            const diagnosis = await runner.run({
                // The store's record is passed through to the runner unchanged.
                report,
                versions,
                // Only include `baseline` when --compare resolved one.
                ...(baseline ? { baseline } : {}),
                refresh: opts.refresh ?? false,
            });
            // -----------------------------------------------------------------
            // Print output
            // -----------------------------------------------------------------
            if (opts.json) {
                process.stdout.write(`${JSON.stringify(diagnosis, null, 2)}\n`);
            }
            else {
                for (const card of diagnosis.cards) {
                    process.stdout.write(`${formatCardSummaryLine(card)}\n`);
                }
            }
        });
    addOutputDirOption(cmd);
    return cmd;
}