@sanity/ailf 5.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
  2. package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
  3. package/config/diagnosis-cards.ts +318 -0
  4. package/config/models.ts +12 -0
  5. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
  6. package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
  7. package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
  8. package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
  9. package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
  10. package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
  11. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
  12. package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
  13. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
  14. package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
  15. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
  16. package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
  17. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  18. package/dist/_vendor/ailf-core/index.js +4 -0
  19. package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
  20. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
  21. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
  22. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
  23. package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
  24. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
  25. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
  26. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
  27. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
  28. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
  29. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
  30. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
  31. package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
  32. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
  33. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
  34. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
  35. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
  36. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
  37. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
  38. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
  39. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
  40. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
  41. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
  42. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
  43. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
  44. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
  45. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
  46. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
  47. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
  48. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
  49. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
  50. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
  51. package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
  52. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
  53. package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
  54. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
  55. package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
  56. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
  57. package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
  58. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
  59. package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
  60. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
  61. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
  62. package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
  63. package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
  64. package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
  65. package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
  66. package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
  67. package/dist/_vendor/ailf-core/services/index.js +15 -2
  68. package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
  69. package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
  70. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
  71. package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
  72. package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
  73. package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
  74. package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
  75. package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
  76. package/dist/adapters/config-sources/file-config-adapter.js +8 -6
  77. package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
  78. package/dist/adapters/llm/fake-llm-client.js +38 -1
  79. package/dist/adapters/llm/index.d.ts +1 -1
  80. package/dist/adapters/llm/index.js +1 -1
  81. package/dist/adapters/llm/openai-llm-client.js +59 -5
  82. package/dist/adapters/llm/retry.d.ts +18 -0
  83. package/dist/adapters/llm/retry.js +21 -0
  84. package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
  85. package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
  86. package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
  87. package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
  88. package/dist/adapters/task-sources/repo-schemas.js +10 -0
  89. package/dist/cli-program.js +3 -0
  90. package/dist/commands/interpret.d.ts +70 -0
  91. package/dist/commands/interpret.js +221 -0
  92. package/dist/commands/pipeline-action.d.ts +44 -0
  93. package/dist/commands/pipeline-action.js +193 -1
  94. package/dist/commands/run.d.ts +2 -0
  95. package/dist/commands/run.js +2 -0
  96. package/dist/composition-root.d.ts +21 -23
  97. package/dist/composition-root.js +107 -41
  98. package/dist/config/diagnosis-cards.ts +318 -0
  99. package/dist/config/models.ts +12 -0
  100. package/dist/grader/agent-harness.d.ts +5 -10
  101. package/dist/grader/agent-harness.js +5 -13
  102. package/dist/grader/common.d.ts +5 -13
  103. package/dist/grader/common.js +5 -17
  104. package/dist/grader/index.d.ts +15 -29
  105. package/dist/grader/index.js +15 -66
  106. package/dist/grader/knowledge-probe.d.ts +5 -10
  107. package/dist/grader/knowledge-probe.js +5 -14
  108. package/dist/grader/literacy.d.ts +5 -9
  109. package/dist/grader/literacy.js +5 -13
  110. package/dist/grader/mcp.d.ts +5 -10
  111. package/dist/grader/mcp.js +5 -14
  112. package/dist/orchestration/pipeline-orchestrator.js +3 -0
  113. package/dist/report-store.d.ts +26 -0
  114. package/dist/report-store.js +63 -0
  115. package/package.json +2 -2
@@ -1,12 +1,22 @@
1
1
  /**
2
2
  * Diagnosis runner — engine entry point (D0048).
3
3
  *
4
- * Phase 1 lands the version constant only; the runner factory + cache
5
- * lookup land in Phase 5.
4
+ * Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
5
+ * `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
6
+ * `deps.loadAttributions(runId)` reading Phase 4's
7
+ * `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
8
+ * Landmine 11).
6
9
  *
7
10
  * @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
8
11
  * @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
12
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
13
+ * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
9
14
  */
15
+ import { z } from "zod";
16
+ import { ARTIFACT_REGISTRY, encodeDiagnosisPathVersion, } from "../artifact-registry.js";
17
+ // ---------------------------------------------------------------------------
18
+ // Version constant (Phase 1 / VER-01 / D-02)
19
+ // ---------------------------------------------------------------------------
10
20
  /**
11
21
  * Bumped when the runner's selection logic, prompt orchestration, or
12
22
  * card-set composition changes in a way that should invalidate cached
@@ -17,3 +27,127 @@
17
27
  * across vitest workers (cross-cutting hazard #2).
18
28
  */
19
29
  export const diagnosisVersion = "0.1.0";
30
+ // ---------------------------------------------------------------------------
31
+ // Private helpers
32
+ // ---------------------------------------------------------------------------
33
/**
 * Derive the deterministic cache key for a diagnosis.
 *
 * The key starts with the artifact path returned by
 * `ARTIFACT_REGISTRY.diagnosis.objectPath(...)`, which is built from the run
 * id, the report id, and the encoded `diagnosisVersion` + `cardVersion` pair.
 * The grader-judgments version, the ensemble version, and the model id are
 * then appended as `::name=value` segments, so a change to any one of the
 * four version segments or to the model yields a different key.
 *
 * @param report   Report being diagnosed; `provenance.runId` and `id` are read.
 * @param versions The four-segment version envelope.
 * @param model    Model id to embed in the key.
 * @returns The cache key string.
 */
function buildCacheKey(report, versions, model) {
    const pathVersion = encodeDiagnosisPathVersion(versions.diagnosisVersion, versions.cardVersion);
    const basePath = ARTIFACT_REGISTRY.diagnosis.objectPath(report.provenance.runId, report.id, pathVersion);
    // The base path covers diagnosisVersion + cardVersion; the remaining
    // axes are carried by the suffix segments below.
    const suffix = [
        `grader=${versions.graderJudgmentsVersion}`,
        `ensemble=${versions.ensembleVersion}`,
        `model=${model}`,
    ].join("::");
    return `${basePath}::${suffix}`;
}
49
+ /**
50
+ * Per-card invocation — never panics. ZodError or any other thrown value
51
+ * both translate to a degraded card (AI-SPEC §3 lines 530-552).
52
+ */
53
+ async function runOne(generator, report, ctx, cardType) {
54
+ try {
55
+ return await generator(report, ctx);
56
+ }
57
+ catch (err) {
58
+ const meta = {
59
+ cardVersion: `${cardType}@unknown`,
60
+ generatedAt: new Date().toISOString(),
61
+ };
62
+ const isZodErr = err instanceof z.ZodError;
63
+ return {
64
+ status: "degraded",
65
+ cardType,
66
+ reason: err instanceof Error ? err.message : String(err),
67
+ parseFailed: isZodErr,
68
+ meta,
69
+ };
70
+ }
71
+ }
72
+ // ---------------------------------------------------------------------------
73
+ // Factory (AI-SPEC §3 lines 458-523 + D-02 / Landmine-11 deltas)
74
+ // ---------------------------------------------------------------------------
75
/**
 * Build a `DiagnosisRunner` whose `.run({report, versions, baseline?, refresh?})`
 * produces a `Diagnosis` with one card per registry entry, in registry key
 * order.
 *
 * A single `run` proceeds as follows:
 *  1. Compute the cache key with `buildCacheKey(report, versions, deps.model)`.
 *  2. Unless `refresh` is truthy, ask `deps.diagnosisReader` for a cached
 *     diagnosis and return it when one exists.
 *  3. Load judgment attributions once via `deps.loadAttributions(runId)`. A
 *     failure is logged as a warning and the run continues with
 *     `judgmentAttributions` left `undefined`.
 *  4. Run every generator sequentially through `runOne`, which converts
 *     thrown errors into degraded cards.
 *  5. Apply the parse-failure budget: the first parse-failed degraded card
 *     is kept; each later one is replaced by a `"missing"` card.
 *  6. Persist the result with `deps.diagnosisWriter` and return it.
 *
 * No module-scope `let` — all state lives in the `deps` closure and per-run
 * local variables (AI-SPEC §3 Pitfall 1).
 *
 * @param deps Runner dependencies: `model`, `llm`, `logger`, `progress`,
 *   `registry`, `diagnosisReader`, `diagnosisWriter`, `loadAttributions`.
 * @returns An object exposing the async `run` method described above.
 */
export function createDiagnosisRunner(deps) {
    return {
        async run({ report, versions, baseline, refresh }) {
            // D-10: the run id comes from provenance.runId, NOT report.runId.
            const runId = report.provenance.runId;
            const cachePath = buildCacheKey(report, versions, deps.model);
            // Cache lookup (bypassed when --refresh).
            if (!refresh) {
                const cached = await deps.diagnosisReader(cachePath);
                // `!= null` treats both `null` and `undefined` as a miss, so a
                // reader that yields `undefined` for an absent entry can never
                // cause `undefined` to be returned as the diagnosis.
                if (cached != null)
                    return cached;
            }
            // One-shot attribution load (Landmine 11 — Phase 4 per-entry artifacts).
            // Best-effort: a load failure is logged and the cards run without
            // attributions rather than aborting the whole diagnosis.
            let judgmentAttributions;
            try {
                judgmentAttributions = await deps.loadAttributions(runId);
            }
            catch (err) {
                deps.logger.warn("diagnosis-runner: loadAttributions failed", {
                    runId,
                    error: err instanceof Error ? err.message : String(err),
                });
                judgmentAttributions = undefined;
            }
            const ctx = {
                llm: deps.llm,
                model: deps.model,
                logger: deps.logger,
                progress: deps.progress,
                versions,
                runId,
                reportId: report.id,
                judgmentAttributions, // Landmine 11
                // Only attach `baseline` when one was supplied.
                ...(baseline ? { baseline } : {}),
            };
            const cards = [];
            let parseFailures = 0;
            for (const [cardType, generator] of Object.entries(deps.registry)) {
                // The generator ALWAYS runs; the parse-failure budget is applied
                // to its result afterwards (AI-SPEC §3 lines 496-510 +
                // must-have #4). The budget allows one parse-failed degraded
                // card per run: the first is kept as "degraded", every
                // subsequent one is demoted to "missing".
                const card = await runOne(generator, report, ctx, cardType);
                if (card.status === "degraded" && card.parseFailed) {
                    if (parseFailures >= 1) {
                        // Budget exceeded — demote to missing.
                        deps.logger.warn(`diagnosis-runner: parse-failure budget exceeded for card "${cardType}"; demoting to missing`, { reportId: report.id });
                        cards.push({
                            status: "missing",
                            cardType,
                            reason: "degraded-budget-exceeded",
                        });
                        continue;
                    }
                    parseFailures++;
                }
                cards.push(card);
            }
            const diagnosis = {
                runId,
                reportId: report.id,
                inputs: versions,
                cards,
                generatedAt: new Date().toISOString(),
            };
            // Unconditional write — a refreshed call replaces the cached Diagnosis.
            await deps.diagnosisWriter(cachePath, diagnosis);
            return diagnosis;
        },
    };
}
@@ -13,5 +13,9 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
13
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
16
- export { diagnosisVersion } from "./diagnosis-runner.js";
16
+ export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardRegistry, type DiagnosisRunner, type DiagnosisRunnerDeps, type DiagnosisRunnerRunArgs, type GeneratorContext, } from "./diagnosis-runner.js";
17
17
  export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
18
+ export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
19
+ export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
20
+ export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
21
+ export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
@@ -14,7 +14,20 @@ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resol
14
14
  export { buildSlimReportSummary } from "./slim-report-summary.js";
15
15
  export { reportToMarkdown, } from "./report-to-markdown.js";
16
16
  // ---------------------------------------------------------------------------
17
- // Actionability ladder Phase 1 — diagnosis runner + card registry
17
+ // Actionability ladder Phase 1 + Phase 5 — diagnosis runner + card registry
18
18
  // ---------------------------------------------------------------------------
19
- export { diagnosisVersion } from "./diagnosis-runner.js";
19
+ export { createDiagnosisRunner, diagnosisVersion, } from "./diagnosis-runner.js";
20
20
  export { cardRegistry } from "./diagnosis/registry.js";
21
+ // ---------------------------------------------------------------------------
22
+ // Phase 5 — LLM client factory (D-01 hoist)
23
+ // ---------------------------------------------------------------------------
24
+ export { createLLMClient, } from "./llm-client-factory.js";
25
+ // ---------------------------------------------------------------------------
26
+ // Phase 5 — card validators (D-05 refine helpers)
27
+ // ---------------------------------------------------------------------------
28
+ export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
29
+ // ---------------------------------------------------------------------------
30
+ // Phase 5 Plan 05 — card generators barrel + prompt builders
31
+ // ---------------------------------------------------------------------------
32
+ export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
33
+ export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
@@ -0,0 +1,64 @@
1
+ /**
2
+ * LLM client factory — hoisted from packages/eval/src/composition-root.ts
3
+ * so packages/api can build a DiagnosisRunner without importing eval (D-01).
4
+ *
5
+ * Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
6
+ * function lives here. Adapter constructors are injected via `LLMClientAdapters`
7
+ * so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
8
+ *
9
+ * @see docs/decisions/D0051-llm-client-port.md
10
+ * @see packages/eval/src/composition-root.ts — call site (updated to use this)
11
+ */
12
+ import type { LLMClient } from "../ports/llm-client.js";
13
+ import type { Logger } from "../ports/logger.js";
14
/**
 * The only configuration the LLM client factory reads.
 *
 * Deliberately narrower than the eval package's `ResolvedConfig`: the
 * factory needs nothing beyond the optional provider choice.
 */
export interface LLMClientFactoryConfig {
    /** Explicit provider selection; when omitted the factory chooses based on the supplied keys. */
    readonly llmProvider?: "anthropic" | "openai";
}
22
/**
 * API keys handed to `createLLMClient`.
 *
 * The composition root reads the environment once and passes the values in
 * through this bag, which keeps the factory pure and lets tests avoid
 * mutating `process.env`.
 */
export interface LLMClientKeys {
    /** Anthropic API key, if one is available. */
    readonly anthropicApiKey?: string;
    /** OpenAI API key, if one is available. */
    readonly openaiApiKey?: string;
}
31
/**
 * Injected constructors for the adapter classes that live in packages/eval.
 *
 * The eval composition root supplies the real constructors and tests supply
 * spies. Because the adapters arrive as callbacks, core never
 * static-imports `openai` or `@anthropic-ai/sdk` (T-05-01-01); the vendor
 * code stays in eval.
 */
export interface LLMClientAdapters {
    /** Builds the Anthropic-backed client from an API key and a logger. */
    readonly newAnthropicClient: (opts: { apiKey: string; logger: Logger }) => LLMClient;
    /** Builds the OpenAI-backed client from an API key and a logger. */
    readonly newOpenAIClient: (opts: { apiKey: string; logger: Logger }) => LLMClient;
}
48
/**
 * Choose an LLMClient adapter from `config.llmProvider` and the supplied
 * API keys, and construct it through the injected adapter callbacks.
 *
 * Yields `undefined` when no usable credential is present; in that case
 * `AppContext.llmClient` stays unset and consumers handle the absence
 * explicitly.
 *
 * Adapters never read `process.env` themselves (per
 * `.claude/rules/typescript.md`); mapping env to keys is the caller's job
 * (typically `createAppContext`).
 *
 * The adapter classes remain in packages/eval and are supplied through
 * `deps.adapters`, so this factory has zero eval imports (D-01 / T-05-01-01).
 *
 * @param config Provider selection.
 * @param keys   API keys available to the caller.
 * @param deps   Logger plus the injected adapter constructors.
 * @returns The constructed client, or `undefined` when none can be wired.
 */
export declare function createLLMClient(
    config: LLMClientFactoryConfig,
    keys: LLMClientKeys,
    deps: {
        logger: Logger;
        adapters: LLMClientAdapters;
    },
): LLMClient | undefined;
@@ -0,0 +1,54 @@
1
+ /**
2
+ * LLM client factory — hoisted from packages/eval/src/composition-root.ts
3
+ * so packages/api can build a DiagnosisRunner without importing eval (D-01).
4
+ *
5
+ * Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
6
+ * function lives here. Adapter constructors are injected via `LLMClientAdapters`
7
+ * so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
8
+ *
9
+ * @see docs/decisions/D0051-llm-client-port.md
10
+ * @see packages/eval/src/composition-root.ts — call site (updated to use this)
11
+ */
12
+ // ---------------------------------------------------------------------------
13
+ // Factory function
14
+ // ---------------------------------------------------------------------------
15
+ /**
16
+ * Select the LLMClient adapter based on `config.llmProvider` and the
17
+ * supplied API keys. Returns `undefined` when no usable credential is
18
+ * present — `AppContext.llmClient` stays unset and consumers handle that
19
+ * explicitly.
20
+ *
21
+ * Adapters never read `process.env` themselves (per
22
+ * `.claude/rules/typescript.md`); env mapping happens at the call site
23
+ * (typically `createAppContext`).
24
+ *
25
+ * Adapter classes stay in packages/eval; they are passed in via `deps.adapters`
26
+ * so this factory has zero eval imports (D-01 / T-05-01-01).
27
+ */
28
+ export function createLLMClient(config, keys, deps) {
29
+ const { logger, adapters } = deps;
30
+ const explicit = config.llmProvider;
31
+ const anthropicKey = keys.anthropicApiKey;
32
+ const openaiKey = keys.openaiApiKey;
33
+ // Auto-select: prefer Anthropic when both are present (matches the
34
+ // current grader's default model in config/models.ts).
35
+ const provider = explicit ?? (anthropicKey ? "anthropic" : openaiKey ? "openai" : undefined);
36
+ if (!provider) {
37
+ logger.debug("LLM client: not wired — no Anthropic or OpenAI API key supplied");
38
+ return undefined;
39
+ }
40
+ if (provider === "anthropic") {
41
+ if (!anthropicKey) {
42
+ logger.warn('llmProvider="anthropic" but no Anthropic API key supplied — LLMClient not wired');
43
+ return undefined;
44
+ }
45
+ logger.debug("LLM client: AnthropicLLMClient");
46
+ return adapters.newAnthropicClient({ apiKey: anthropicKey, logger });
47
+ }
48
+ if (!openaiKey) {
49
+ logger.warn('llmProvider="openai" but no OpenAI API key supplied — LLMClient not wired');
50
+ return undefined;
51
+ }
52
+ logger.debug("LLM client: OpenAILLMClient");
53
+ return adapters.newOpenAIClient({ apiKey: openaiKey, logger });
54
+ }
@@ -8,14 +8,18 @@
8
8
  * discriminator inside the `ready` variant.
9
9
  *
10
10
  * Phase 1 lands placeholder body shapes; Phase 5 enriches each per
11
- * Doc 05 specs.
11
+ * AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
12
+ * union surface (arms + `cardType` literals) is stable — only the
13
+ * `body: <BodyInterface>` references resolve to richer shapes.
12
14
  *
13
15
  * @see docs/decisions/D0049-shared-confidence-contract.md
14
16
  * @see docs/decisions/D0052-judgment-ref-granularity.md
15
17
  * @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
16
18
  */
19
+ import type { Confidence } from "./confidence.js";
17
20
  import type { RunId } from "./branded-ids.js";
18
21
  import type { ReportId } from "./index.js";
22
+ import type { ModelId } from "../ports/llm-client.js";
19
23
  /**
20
24
  * The four-version cache envelope. Every cached `Diagnosis` carries the
21
25
  * versions of the inputs that produced it; any bump in any segment
@@ -51,47 +55,144 @@ export interface CardMeta {
51
55
  latencyMs?: number;
52
56
  /** ISO 8601 UTC timestamp. */
53
57
  generatedAt: string;
58
+ cost?: number;
59
+ model?: ModelId;
54
60
  }
55
61
  /**
56
62
  * A single actionable suggestion surfaced by a recommendations card.
57
- * The full Phase 5 shape may add fields (per Doc 05 specs); Phase 1
58
- * locks the minimum required surface.
63
+ *
64
+ * Phase 5 adds `docSlug` (the canonical doc page to rewrite) and
65
+ * `sectionHeading` (null when the suggestion targets the whole page)
66
+ * per AI-SPEC actionability-specificity rubric + failure-mode #2
67
+ * mitigation.
59
68
  */
60
69
  export interface ActionSuggestion {
61
70
  title: string;
62
71
  body: string;
63
72
  priority: "high" | "medium" | "low";
73
+ /** Canonical slug of the documentation page this suggestion targets. */
74
+ docSlug: string;
75
+ /**
76
+ * Heading within `docSlug` that should be revised, or `null` when the
77
+ * suggestion targets the page as a whole.
78
+ */
79
+ sectionHeading: string | null;
80
+ }
81
+ /**
82
+ * Minimal judgment reference per D0052 (taskId × modelId × dimension).
83
+ * Used by `LowConfidenceAttributionBody.judgmentRefs` to cite the
84
+ * specific judgments that drove a low-confidence finding.
85
+ */
86
+ export interface JudgmentRef {
87
+ taskId: string;
88
+ modelId: string;
89
+ dimension: string;
64
90
  }
65
91
  /**
66
- * Phase 1 body placeholders. Each shape is intentionally minimal; Phase 5
67
- * card files enrich them per Doc 05 specs and assert
68
- * `satisfies z.ZodType<Extract<DiagnosisCard, { status: "ready"; cardType: "X" }>["body"]>`
69
- * against these declarations.
92
+ * Phase 5 enriched body shapes. Each keeps `summary: string` (load-bearing
93
+ * for CLI default render per AI-SPEC §6) and adds fields the corresponding
94
+ * Zod schema needs (asserting `satisfies z.ZodType<T>` in the card file).
70
95
  */
96
+ /** area-summary: deterministic — keep only summary (no behavioral claims). */
71
97
  export interface AreaSummaryBody {
72
98
  summary: string;
73
99
  }
100
+ /**
101
+ * failure-mode-summary: deterministic + D-05 dimension/failureMode gate.
102
+ * `count` = frequency in the report; `sampleSize` = judgment count for the
103
+ * dimension (per AI-SPEC failure-mode #3 mitigation).
104
+ */
74
105
  export interface FailureModeSummaryBody {
75
106
  summary: string;
107
+ /** Rubric dimension this summary targets (e.g. "task-completion"). */
108
+ dimension: string;
109
+ /** Canonical failure mode within this dimension. */
110
+ failureMode: string;
111
+ /** Number of judgments in this report with this failure mode. */
112
+ count: number;
113
+ /** Total judgments for this dimension — calibration denominator. */
114
+ sampleSize: number;
76
115
  }
116
+ /**
117
+ * no-issues: deterministic + AI-SPEC failure-mode #7 sycophancy guard.
118
+ * `thresholdScore` surfaces the threshold used to qualify as "no issues"
119
+ * so readers can see the criterion behind the positive assessment.
120
+ */
77
121
  export interface NoIssuesBody {
78
122
  summary: string;
123
+ /** Minimum composite score that qualified this area as "no issues". */
124
+ thresholdScore: number;
79
125
  }
126
+ /**
127
+ * top-recommendations: LLM-driven. `suggestions` reuses the enriched
128
+ * `ActionSuggestion` shape (docSlug + sectionHeading per AI-SPEC
129
+ * actionability-specificity rubric + failure-mode #2 mitigation).
130
+ */
80
131
  export interface TopRecommendationsBody {
81
132
  summary: string;
82
133
  suggestions: ActionSuggestion[];
83
134
  }
135
+ /**
136
+ * weakest-area: LLM-driven. Adds area identification, dimension/failureMode
137
+ * context, and a small-sample calibration guard (AI-SPEC failure-mode #3).
138
+ */
84
139
  export interface WeakestAreaBody {
85
140
  summary: string;
141
+ /** Documentation area with the lowest composite score. */
142
+ area: string;
143
+ /** Primary dimension driving the low score. */
144
+ dimension: string;
145
+ /** Dominant failure mode in this area. */
146
+ failureMode: string;
147
+ /** Number of judgments sampled for this area — calibration denominator. */
148
+ sampleSize: number;
149
+ /** Calibrated confidence per D0049 (ensemble-stdev derivation). */
150
+ confidence: Confidence;
86
151
  }
87
- export interface LowConfidenceAttributionBody {
152
+ /**
153
+ * regression-vs-baseline: LLM-driven. `deltas` is the per-area diff
154
+ * (JS-computed pre-call, max 10 entries); `drivers` is LLM prose;
155
+ * `overallTrend` is a 4-bucket summary per AI-SPEC §3 lines 605-613.
156
+ */
157
+ export interface RegressionVsBaselineBody {
88
158
  summary: string;
159
+ /**
160
+ * Per-area score deltas (max 10). `drivers` carries the LLM's prose
161
+ * reasoning about what caused the change.
162
+ */
163
+ deltas: {
164
+ area: string;
165
+ direction: "improved" | "regressed" | "unchanged";
166
+ pointsDelta: number;
167
+ drivers: string[];
168
+ }[];
169
+ /** 4-bucket aggregate trend across all deltas. */
170
+ overallTrend: "net-improved" | "net-regressed" | "mixed" | "stable";
89
171
  }
90
- export interface DocAttributionSpotlightBody {
172
+ /**
173
+ * low-confidence-attribution: LLM-driven. `judgmentRefs` cites the
174
+ * specific judgments (D0052 triple) that drove the low-confidence finding.
175
+ */
176
+ export interface LowConfidenceAttributionBody {
91
177
  summary: string;
178
+ /** Judgment references (D0052) driving this low-confidence finding. */
179
+ judgmentRefs: JudgmentRef[];
92
180
  }
93
- export interface RegressionVsBaselineBody {
181
+ /**
182
+ * doc-attribution-spotlight: LLM-driven. `docCitations` carries per-doc
183
+ * attribution roles and confidence calibration (AI-SPEC failure-mode #5).
184
+ */
185
+ export interface DocAttributionSpotlightBody {
94
186
  summary: string;
187
+ /**
188
+ * Per-doc attribution records. `role` classifies how the doc contributed;
189
+ * `confidence` calibrates the attribution certainty (D0049).
190
+ */
191
+ docCitations: {
192
+ docSlug: string;
193
+ confidence: Confidence;
194
+ role: "supports" | "contradicts" | "missing" | "irrelevant";
195
+ }[];
95
196
  }
96
197
  /**
97
198
  * Outer-`status` discriminated union: 8 ready variants (one per
@@ -102,6 +203,10 @@ export interface RegressionVsBaselineBody {
102
203
  * No `not-yet-generated` variant — old-report fallback is a Phase 7
103
204
  * concern at the slim-shape boundary, handled at fetch-time, not in
104
205
  * `DiagnosisCard` itself.
206
+ *
207
+ * D-07: only the `body: <BodyInterface>` references resolve to richer
208
+ * shapes. The union arms, status literals, and cardType literals are
209
+ * identical to Phase 1.
105
210
  */
106
211
  export type DiagnosisCard = {
107
212
  status: "ready";
@@ -8,7 +8,9 @@
8
8
  * discriminator inside the `ready` variant.
9
9
  *
10
10
  * Phase 1 lands placeholder body shapes; Phase 5 enriches each per
11
- * Doc 05 specs.
11
+ * AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
12
+ * union surface (arms + `cardType` literals) is stable — only the
13
+ * `body: <BodyInterface>` references resolve to richer shapes.
12
14
  *
13
15
  * @see docs/decisions/D0049-shared-confidence-contract.md
14
16
  * @see docs/decisions/D0052-judgment-ref-granularity.md
@@ -36,7 +36,8 @@ export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
36
36
  export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, JudgmentId, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
37
37
  export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
38
38
  export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
39
- export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
39
+ export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
40
+ export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
40
41
  export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
41
42
  export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
42
43
  export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
@@ -754,6 +755,12 @@ export interface PipelineResult {
754
755
  promptfooUrls?: PromptfooUrlEntry[];
755
756
  /** Results per step */
756
757
  steps: Record<string, StepResult>;
758
+ /** Report ID produced by PublishReportStep (when publish was enabled). Used by
759
+ * post-run hooks (e.g. runPostPipelineHooks) to target diagnosis and telemetry
760
+ * writeback at the correct Content Lake document. Absent when publish was
761
+ * skipped or the publish step did not produce a report. (Phase 6 / DIAG-06)
762
+ */
763
+ reportId?: string;
757
764
  /** Overall success (all non-skipped steps succeeded) */
758
765
  success: boolean;
759
766
  /** Summary of test execution outcomes. */
@@ -106,6 +106,21 @@ export interface RepoTriggersConfig {
106
106
  "pr-task-change"?: TriggerConfig;
107
107
  schedule?: ScheduleTriggerConfig;
108
108
  }
109
+ /**
110
+ * Post-run diagnosis summary policy (Phase 6 / DIAG-06).
111
+ * Controls whether `ailf run` auto-fires the in-process diagnosis runner
112
+ * at the end of a published pipeline. Precedence is resolved at the CLI
113
+ * layer — see `shouldRunPostSummary()` in `pipeline-action.ts`.
114
+ */
115
+ export interface RepoSummaryConfig {
116
+ /**
117
+ * - `"auto"` — fire only when `process.stdout.isTTY === true` AND
118
+ * `CI !== "true"`.
119
+ * - `"always"` — fire unconditionally (bypasses both the TTY and CI checks).
120
+ * - `"never"` — never fire.
121
+ */
122
+ onRun?: "auto" | "always" | "never";
123
+ }
109
124
  /**
110
125
  * Parsed shape of `.ailf/config.yaml`.
111
126
  *
@@ -124,6 +139,7 @@ export interface RepoConfig {
124
139
  publish?: RepoPublishConfig;
125
140
  reportStore?: RepoReportStoreConfig;
126
141
  source?: RepoSourceConfig;
142
+ summary?: RepoSummaryConfig;
127
143
  taskSource?: RepoTaskSourceConfig;
128
144
  triggers?: RepoTriggersConfig;
129
145
  }
@@ -0,0 +1,101 @@
1
+ /**
2
+ * Synthesis cost telemetry types — canonical TS-first shapes for
3
+ * Phase 6 DIAG-06 cost and parse-failure observability.
4
+ *
5
+ * These interfaces are authored independently of their Zod adapter schema
6
+ * (Plan 06-02) per D0045: the Zod schema declares
7
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
8
+ * type so drift is a build error, not a runtime bug.
9
+ *
10
+ * The 12 attribute paths (4 top-level + 8 per-card) on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
11
+ * land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
12
+ * (D6-09). No new sibling doc type (D0033 / D6-09).
13
+ *
14
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
15
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
16
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
17
+ */
18
+ import type { CardType } from "./diagnosis.js";
19
+ /**
20
+ * Per-card telemetry row for the `synthesis_per_card` Airbyte stream
21
+ * (D6-11) and the `summary.synthesis.diagnosis.perCard[]` Sanity doc path
22
+ * (D6-09).
23
+ *
24
+ * Fields map directly to the 8 per-card attribute paths in D6-09:
25
+ * `…perCard[].cardType`, `…perCard[].cost`, `…perCard[].parseFailed`,
26
+ * `…perCard[].latencyMs`, `…perCard[].tokenInput`, `…perCard[].tokenOutput`,
27
+ * `…perCard[].cardVersion`, `…perCard[].generatedAt`.
28
+ *
29
+ * `cost` is undefined when the card did not make an LLM call (deterministic
30
+ * cards) and contributes 0 to the roll-up.
31
+ */
32
+ export interface SynthesisPerCardTelemetry {
33
+ /** Card archetype — reuses `CardType` from diagnosis.ts:55-63; not redeclared. */
34
+ cardType: CardType;
35
+ /**
36
+ * Per-call USD cost captured from `LLMStructuredCompletion.cost`.
37
+ * `undefined` for deterministic cards (area-summary, failure-mode-summary,
38
+ * no-issues) which make no LLM call.
39
+ */
40
+ cost?: number;
41
+ /**
42
+ * Whether the card's Zod schema parse failed (produces a degraded card).
43
+ * Used for the 7-day rolling parse-failure rate in BigQuery (D6-15).
44
+ */
45
+ parseFailed: boolean;
46
+ /**
47
+ * End-to-end latency for the LLM call in milliseconds.
48
+ * `undefined` for deterministic cards.
49
+ */
50
+ latencyMs?: number;
51
+ /**
52
+ * Prompt tokens consumed by the LLM call.
53
+ * `undefined` for deterministic cards.
54
+ */
55
+ tokenInput?: number;
56
+ /**
57
+ * Completion tokens produced by the LLM call.
58
+ * `undefined` for deterministic cards.
59
+ */
60
+ tokenOutput?: number;
61
+ /** Per-card version string (e.g. `"top-recommendations@0.1.0"`). */
62
+ cardVersion: string;
63
+ /** ISO 8601 UTC timestamp when this card was generated. */
64
+ generatedAt: string;
65
+ }
66
+ /**
67
+ * Aggregate synthesis cost telemetry for a single Diagnosis run.
68
+ * Lands on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
69
+ * (D6-09 / D6-10: parallel to `summary.overall.cost` — not additive).
70
+ *
71
+ * Written by the post-run hook (D6-08); not written by standalone
72
+ * `ailf interpret`.
73
+ *
74
+ * Field set matches the 4 top-level D6-09 attribute paths:
75
+ * `summary.synthesis.diagnosis.cost`,
76
+ * `summary.synthesis.diagnosis.parseFailureCount`,
77
+ * `summary.synthesis.diagnosis.parseFailureRate`,
78
+ * `summary.synthesis.diagnosis.perCard`.
79
+ */
80
+ export interface SynthesisCostTelemetry {
81
+ /**
82
+ * Total USD cost across all LLM cards in this Diagnosis run.
83
+ * Roll-up: `sum(perCard[].cost ?? 0)` for ready + degraded cards.
84
+ * Missing cards contribute 0.
85
+ */
86
+ cost: number;
87
+ /**
88
+ * Number of cards whose Zod parse failed in this Diagnosis run.
89
+ * Counted across all 8 card types (including deterministic cards;
90
+ * a deterministic-card parse failure indicates a code bug).
91
+ */
92
+ parseFailureCount: number;
93
+ /**
94
+ * Parse-failure rate: `parseFailureCount / 8` (8 = fixed card registry size).
95
+ * Range: 0–1. Feeds the D6-15 BigQuery 7-day
96
+ * rolling rate view (`synthesis_parse_failure_rate_7d.sql`).
97
+ */
98
+ parseFailureRate: number;
99
+ /** Per-card telemetry rows — one entry per card, in registry order. */
100
+ perCard: SynthesisPerCardTelemetry[];
101
+ }