@sanity/ailf 5.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/config/diagnosis-cards.ts +318 -0
- package/config/models.ts +12 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/agent-harness.js +16 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.d.ts +14 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/common.js +18 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.d.ts +45 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/index.js +109 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/knowledge-probe.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/literacy.js +17 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.d.ts +13 -0
- package/dist/_vendor/ailf-core/grader/failure-modes/mcp.js +17 -0
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +4 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/card-validators.js +40 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/area-summary.test.js +131 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +230 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.d.ts +7 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/no-issues.test.js +155 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/area-summary.js +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.d.ts +46 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +108 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.d.ts +28 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +140 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +49 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +65 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.d.ts +27 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +93 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.d.ts +32 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +71 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.d.ts +44 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +130 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.d.ts +41 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +111 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.d.ts +43 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +118 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.d.ts +72 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +286 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.d.ts +17 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/doc-attribution-spotlight.system.js +58 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/index.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.d.ts +15 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/low-confidence-attribution.system.js +53 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.d.ts +14 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/regression-vs-baseline.system.js +63 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.d.ts +16 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/top-recommendations.system.js +78 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +18 -0
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +74 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/registry.js +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis-runner.d.ts +119 -2
- package/dist/_vendor/ailf-core/services/diagnosis-runner.js +136 -2
- package/dist/_vendor/ailf-core/services/index.d.ts +5 -1
- package/dist/_vendor/ailf-core/services/index.js +15 -2
- package/dist/_vendor/ailf-core/services/llm-client-factory.d.ts +64 -0
- package/dist/_vendor/ailf-core/services/llm-client-factory.js +54 -0
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +115 -10
- package/dist/_vendor/ailf-core/types/diagnosis.js +3 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +8 -1
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/fake-llm-client.d.ts +20 -0
- package/dist/adapters/llm/fake-llm-client.js +38 -1
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +59 -5
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/cli-program.js +3 -0
- package/dist/commands/interpret.d.ts +70 -0
- package/dist/commands/interpret.js +221 -0
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +21 -23
- package/dist/composition-root.js +107 -41
- package/dist/config/diagnosis-cards.ts +318 -0
- package/dist/config/models.ts +12 -0
- package/dist/grader/agent-harness.d.ts +5 -10
- package/dist/grader/agent-harness.js +5 -13
- package/dist/grader/common.d.ts +5 -13
- package/dist/grader/common.js +5 -17
- package/dist/grader/index.d.ts +15 -29
- package/dist/grader/index.js +15 -66
- package/dist/grader/knowledge-probe.d.ts +5 -10
- package/dist/grader/knowledge-probe.js +5 -14
- package/dist/grader/literacy.d.ts +5 -9
- package/dist/grader/literacy.js +5 -13
- package/dist/grader/mcp.d.ts +5 -10
- package/dist/grader/mcp.js +5 -14
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +2 -2
|
@@ -1,12 +1,22 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Diagnosis runner — engine entry point (D0048).
|
|
3
3
|
*
|
|
4
|
-
* Phase
|
|
5
|
-
*
|
|
4
|
+
* Phase 5 implements the factory body; Phase 1 shipped `diagnosisVersion` only.
|
|
5
|
+
* `GeneratorContext.judgmentAttributions` is sourced once per `.run({...})` via
|
|
6
|
+
* `deps.loadAttributions(runId)` reading Phase 4's
|
|
7
|
+
* `runs/{runId}/attribution/{entryKey}.json` per-entry artifacts (RESEARCH
|
|
8
|
+
* Landmine 11).
|
|
6
9
|
*
|
|
7
10
|
* @see docs/decisions/D0048-engine-homes-for-cli-api-parity.md
|
|
8
11
|
* @see .planning/phases/01-foundation-contracts-cross-cutting-schemas/01-CONTEXT.md (D-02)
|
|
12
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-RESEARCH.md (Landmine 11)
|
|
13
|
+
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-CONTEXT.md (D-02, D-06, D-10)
|
|
9
14
|
*/
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
import { ARTIFACT_REGISTRY, encodeDiagnosisPathVersion, } from "../artifact-registry.js";
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Version constant (Phase 1 / VER-01 / D-02)
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
10
20
|
/**
|
|
11
21
|
* Bumped when the runner's selection logic, prompt orchestration, or
|
|
12
22
|
* card-set composition changes in a way that should invalidate cached
|
|
@@ -17,3 +27,127 @@
|
|
|
17
27
|
* across vitest workers (cross-cutting hazard #2).
|
|
18
28
|
*/
|
|
19
29
|
export const diagnosisVersion = "0.1.0";
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Private helpers
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
/**
 * Derive the key under which a report's `Diagnosis` is cached.
 *
 * The key begins with the artifact path returned by
 * `ARTIFACT_REGISTRY.diagnosis.objectPath(...)`, which is built from the run
 * id, the report id and the encoded `diagnosisVersion` + `cardVersion` pair.
 * The grader-judgments version, the ensemble version and the model id are
 * then appended as `::name=value` segments. A change to any one of the four
 * version segments, or to the model, therefore produces a different key,
 * while the artifact path itself keeps its shape
 * (AI-SPEC §3 lines 463-473 + D-02).
 *
 * @param report - Report being diagnosed; `provenance.runId` and `id` are read.
 * @param versions - Version envelope; `diagnosisVersion`, `cardVersion`,
 *   `graderJudgmentsVersion` and `ensembleVersion` are read.
 * @param model - Model id embedded as the last key segment.
 * @returns The cache key string.
 */
function buildCacheKey(report, versions, model) {
    const runId = report.provenance.runId;
    const reportId = report.id;
    // diagnosisVersion + cardVersion travel inside the artifact path itself.
    const pathVersion = encodeDiagnosisPathVersion(versions.diagnosisVersion, versions.cardVersion);
    const artifactPath = ARTIFACT_REGISTRY.diagnosis.objectPath(runId, reportId, pathVersion);
    // The two remaining version axes and the model ride along as suffix
    // segments so a bump of any single axis changes the key.
    const suffix = [
        `grader=${versions.graderJudgmentsVersion}`,
        `ensemble=${versions.ensembleVersion}`,
        `model=${model}`,
    ].join("::");
    return `${artifactPath}::${suffix}`;
}
|
|
49
|
+
/**
 * Invoke one card generator and never let its failure escape.
 *
 * On success the generator's result is returned untouched. If the generator
 * throws or rejects, the failure is converted into a `degraded` card
 * (AI-SPEC §3 lines 530-552):
 *  - `reason` is the error's `message`, or `String(value)` for non-Error throws;
 *  - `parseFailed` is `true` only when the thrown value is a `z.ZodError`;
 *  - `meta.cardVersion` is `"<cardType>@unknown"` and `meta.generatedAt` is the
 *    current time as an ISO 8601 string.
 *
 * @param generator - Card generator, called as `generator(report, ctx)`.
 * @param report - Report handed to the generator.
 * @param ctx - Generator context handed to the generator.
 * @param cardType - Card type recorded on the degraded card.
 * @returns The generator's card, or a degraded card describing the failure.
 */
async function runOne(generator, report, ctx, cardType) {
    try {
        // `await` is required here so that rejections land in the catch below.
        const card = await generator(report, ctx);
        return card;
    }
    catch (failure) {
        const reason = failure instanceof Error ? failure.message : String(failure);
        return {
            status: "degraded",
            cardType,
            reason,
            // Schema-validation failures are flagged separately from other errors.
            parseFailed: failure instanceof z.ZodError,
            meta: {
                cardVersion: `${cardType}@unknown`,
                generatedAt: new Date().toISOString(),
            },
        };
    }
}
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Factory (AI-SPEC §3 lines 458-523 + D-02 / Landmine-11 deltas)
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
/**
 * Create a `DiagnosisRunner`.
 *
 * The returned object exposes `run({ report, versions, baseline?, refresh? })`,
 * which performs these steps in order:
 *  1. Derive the cache key from the report, the versions and `deps.model`.
 *  2. Unless `refresh` is truthy, ask `deps.diagnosisReader` for that key and
 *     return its result when it is not `null`.
 *  3. Load judgment attributions once via `deps.loadAttributions(runId)`
 *     (Landmine 11). A failure is logged as a warning and the run continues
 *     with `judgmentAttributions` left undefined.
 *  4. Run every generator in `deps.registry`, one at a time, in
 *     `Object.keys` order, so cards come out in registry order.
 *  5. Keep the first degraded card whose `parseFailed` flag is set; replace
 *     each later one with a `missing` card (`degraded-budget-exceeded`).
 *  6. Write the assembled `Diagnosis` with `deps.diagnosisWriter` and return it.
 *
 * There is no module-scope mutable state: everything lives in the `deps`
 * closure or in per-run locals (AI-SPEC §3 Pitfall 1).
 *
 * @param deps - Collaborators read by the runner: `model`, `llm`, `logger`,
 *   `progress`, `registry`, `diagnosisReader`, `diagnosisWriter` and
 *   `loadAttributions`.
 * @returns An object with a single async `run` method.
 */
export function createDiagnosisRunner(deps) {
    // Number of parse-failed degraded cards that may be kept per run.
    const PARSE_FAILURE_BUDGET = 1;
    return {
        async run({ report, versions, baseline, refresh }) {
            const cacheKey = buildCacheKey(report, versions, deps.model);
            // A refresh request skips the lookup but still overwrites the
            // cached entry at the end of the run.
            if (!refresh) {
                const hit = await deps.diagnosisReader(cacheKey);
                if (hit !== null) {
                    return hit;
                }
            }
            // D-10: the run id comes from provenance.runId, NOT report.runId.
            const runId = report.provenance.runId;
            // One attribution load per run; stays undefined when loading fails.
            let judgmentAttributions;
            try {
                judgmentAttributions = await deps.loadAttributions(runId);
            }
            catch (loadError) {
                const error = loadError instanceof Error ? loadError.message : String(loadError);
                deps.logger.warn("diagnosis-runner: loadAttributions failed", { runId, error });
            }
            const ctx = {
                llm: deps.llm,
                model: deps.model,
                logger: deps.logger,
                progress: deps.progress,
                versions,
                runId,
                reportId: report.id,
                judgmentAttributions,
            };
            // `baseline` is only attached when the caller supplied one.
            if (baseline) {
                ctx.baseline = baseline;
            }
            const cards = [];
            let parseFailureCount = 0;
            for (const cardType of Object.keys(deps.registry)) {
                // The generator always runs; the budget is applied to its
                // result afterwards (AI-SPEC §3 lines 496-510 + must-have #4).
                const card = await runOne(deps.registry[cardType], report, ctx, cardType);
                const isParseFailure = card.status === "degraded" && card.parseFailed;
                if (!isParseFailure) {
                    cards.push(card);
                    continue;
                }
                if (parseFailureCount < PARSE_FAILURE_BUDGET) {
                    // Still within budget: keep the degraded card as-is.
                    parseFailureCount += 1;
                    cards.push(card);
                    continue;
                }
                // Budget already spent: record the card as missing instead.
                deps.logger.warn(`diagnosis-runner: parse-failure budget exceeded for card "${cardType}"; demoting to missing`, { reportId: report.id });
                cards.push({
                    status: "missing",
                    cardType,
                    reason: "degraded-budget-exceeded",
                });
            }
            const diagnosis = {
                runId,
                reportId: report.id,
                inputs: versions,
                cards,
                generatedAt: new Date().toISOString(),
            };
            // Always written, so a refreshed run replaces the cached Diagnosis.
            await deps.diagnosisWriter(cacheKey, diagnosis);
            return diagnosis;
        },
    };
}
|
|
@@ -13,5 +13,9 @@ export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskS
|
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, type RenderableReport, } from "./report-to-markdown.js";
|
|
16
|
-
export { diagnosisVersion } from "./diagnosis-runner.js";
|
|
16
|
+
export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardRegistry, type DiagnosisRunner, type DiagnosisRunnerDeps, type DiagnosisRunnerRunArgs, type GeneratorContext, } from "./diagnosis-runner.js";
|
|
17
17
|
export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
|
|
18
|
+
export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
|
|
19
|
+
export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
|
|
20
|
+
export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
21
|
+
export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
|
|
@@ -14,7 +14,20 @@ export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resol
|
|
|
14
14
|
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
15
15
|
export { reportToMarkdown, } from "./report-to-markdown.js";
|
|
16
16
|
// ---------------------------------------------------------------------------
|
|
17
|
-
// Actionability ladder Phase 1 — diagnosis runner + card registry
|
|
17
|
+
// Actionability ladder Phase 1 + Phase 5 — diagnosis runner + card registry
|
|
18
18
|
// ---------------------------------------------------------------------------
|
|
19
|
-
export { diagnosisVersion } from "./diagnosis-runner.js";
|
|
19
|
+
export { createDiagnosisRunner, diagnosisVersion, } from "./diagnosis-runner.js";
|
|
20
20
|
export { cardRegistry } from "./diagnosis/registry.js";
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Phase 5 — LLM client factory (D-01 hoist)
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
export { createLLMClient, } from "./llm-client-factory.js";
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
// Phase 5 — card validators (D-05 refine helpers)
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Phase 5 Plan 05 — card generators barrel + prompt builders
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
33
|
+
export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM client factory — hoisted from packages/eval/src/composition-root.ts
|
|
3
|
+
* so packages/api can build a DiagnosisRunner without importing eval (D-01).
|
|
4
|
+
*
|
|
5
|
+
* Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
|
|
6
|
+
* function lives here. Adapter constructors are injected via `LLMClientAdapters`
|
|
7
|
+
* so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
10
|
+
* @see packages/eval/src/composition-root.ts — call site (updated to use this)
|
|
11
|
+
*/
|
|
12
|
+
import type { LLMClient } from "../ports/llm-client.js";
|
|
13
|
+
import type { Logger } from "../ports/logger.js";
|
|
14
|
+
/**
|
|
15
|
+
* Narrow config slice consumed by the LLM client factory.
|
|
16
|
+
* Does NOT depend on `ResolvedConfig` from packages/eval — only the
|
|
17
|
+
* llmProvider field is needed here.
|
|
18
|
+
*/
|
|
19
|
+
export interface LLMClientFactoryConfig {
|
|
20
|
+
readonly llmProvider?: "anthropic" | "openai";
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Typed key bag passed to `createLLMClient`. The composition root reads
|
|
24
|
+
* env once and supplies values here; the factory stays pure so tests don't
|
|
25
|
+
* have to mutate `process.env`.
|
|
26
|
+
*/
|
|
27
|
+
export interface LLMClientKeys {
|
|
28
|
+
readonly anthropicApiKey?: string;
|
|
29
|
+
readonly openaiApiKey?: string;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Constructor callbacks for adapter classes that live in packages/eval.
|
|
33
|
+
* The eval composition root passes real constructors; tests pass spies.
|
|
34
|
+
*
|
|
35
|
+
* This pattern satisfies T-05-01-01: core never static-imports
|
|
36
|
+
* openai / @anthropic-ai/sdk. The vendor code stays in eval.
|
|
37
|
+
*/
|
|
38
|
+
export interface LLMClientAdapters {
|
|
39
|
+
readonly newAnthropicClient: (opts: {
|
|
40
|
+
apiKey: string;
|
|
41
|
+
logger: Logger;
|
|
42
|
+
}) => LLMClient;
|
|
43
|
+
readonly newOpenAIClient: (opts: {
|
|
44
|
+
apiKey: string;
|
|
45
|
+
logger: Logger;
|
|
46
|
+
}) => LLMClient;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Select the LLMClient adapter based on `config.llmProvider` and the
|
|
50
|
+
* supplied API keys. Returns `undefined` when no usable credential is
|
|
51
|
+
* present — `AppContext.llmClient` stays unset and consumers handle that
|
|
52
|
+
* explicitly.
|
|
53
|
+
*
|
|
54
|
+
* Adapters never read `process.env` themselves (per
|
|
55
|
+
* `.claude/rules/typescript.md`); env mapping happens at the call site
|
|
56
|
+
* (typically `createAppContext`).
|
|
57
|
+
*
|
|
58
|
+
* Adapter classes stay in packages/eval; they are passed in via `deps.adapters`
|
|
59
|
+
* so this factory has zero eval imports (D-01 / T-05-01-01).
|
|
60
|
+
*/
|
|
61
|
+
export declare function createLLMClient(config: LLMClientFactoryConfig, keys: LLMClientKeys, deps: {
|
|
62
|
+
logger: Logger;
|
|
63
|
+
adapters: LLMClientAdapters;
|
|
64
|
+
}): LLMClient | undefined;
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM client factory — hoisted from packages/eval/src/composition-root.ts
|
|
3
|
+
* so packages/api can build a DiagnosisRunner without importing eval (D-01).
|
|
4
|
+
*
|
|
5
|
+
* Adapter CLASSES stay in packages/eval/src/adapters/llm/. Only the factory
|
|
6
|
+
* function lives here. Adapter constructors are injected via `LLMClientAdapters`
|
|
7
|
+
* so core never static-imports vendor SDK code (D0051 invariant / T-05-01-01).
|
|
8
|
+
*
|
|
9
|
+
* @see docs/decisions/D0051-llm-client-port.md
|
|
10
|
+
* @see packages/eval/src/composition-root.ts — call site (updated to use this)
|
|
11
|
+
*/
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Factory function
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
/**
 * Select and construct the LLMClient adapter for the configured provider.
 *
 * Provider resolution:
 *  - `config.llmProvider`, when set, wins;
 *  - otherwise Anthropic is chosen when an Anthropic key is present, else
 *    OpenAI when an OpenAI key is present (Anthropic is preferred when both
 *    keys exist, matching the grader's default model in config/models.ts).
 *
 * Returns `undefined` — leaving `AppContext.llmClient` unset for consumers
 * to handle explicitly — when:
 *  - no provider can be resolved (logged at debug level);
 *  - the resolved provider's API key is missing (logged as a warning);
 *  - `config.llmProvider` holds a value other than `"anthropic"` or
 *    `"openai"` (logged as a warning). Such a value can only arrive from
 *    untyped input; it is rejected rather than being treated as OpenAI.
 *
 * The factory never reads `process.env`; the call site supplies the keys.
 * Adapter constructors are injected through `deps.adapters`, so this module
 * has no imports from packages/eval or vendor SDKs (D-01 / T-05-01-01).
 *
 * @param config - Config slice; only `llmProvider` is read.
 * @param keys - API keys; `anthropicApiKey` and `openaiApiKey` are read.
 * @param deps - `logger` plus the `adapters` constructor callbacks.
 * @returns The constructed client, or `undefined` when none can be wired.
 */
export function createLLMClient(config, keys, deps) {
    const { logger, adapters } = deps;
    const anthropicKey = keys.anthropicApiKey;
    const openaiKey = keys.openaiApiKey;
    // An explicit provider wins; otherwise auto-select, preferring Anthropic.
    const provider = config.llmProvider ?? (anthropicKey ? "anthropic" : openaiKey ? "openai" : undefined);
    if (!provider) {
        logger.debug("LLM client: not wired — no Anthropic or OpenAI API key supplied");
        return undefined;
    }
    switch (provider) {
        case "anthropic":
            if (!anthropicKey) {
                logger.warn('llmProvider="anthropic" but no Anthropic API key supplied — LLMClient not wired');
                return undefined;
            }
            logger.debug("LLM client: AnthropicLLMClient");
            return adapters.newAnthropicClient({ apiKey: anthropicKey, logger });
        case "openai":
            if (!openaiKey) {
                logger.warn('llmProvider="openai" but no OpenAI API key supplied — LLMClient not wired');
                return undefined;
            }
            logger.debug("LLM client: OpenAILLMClient");
            return adapters.newOpenAIClient({ apiKey: openaiKey, logger });
        default:
            // Unsupported provider value: refuse instead of silently
            // falling back to the OpenAI adapter.
            logger.warn(`llmProvider="${provider}" is not supported (expected "anthropic" or "openai") — LLMClient not wired`);
            return undefined;
    }
}
|
|
@@ -8,14 +8,18 @@
|
|
|
8
8
|
* discriminator inside the `ready` variant.
|
|
9
9
|
*
|
|
10
10
|
* Phase 1 lands placeholder body shapes; Phase 5 enriches each per
|
|
11
|
-
*
|
|
11
|
+
* AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
|
|
12
|
+
* union surface (arms + `cardType` literals) is stable — only the
|
|
13
|
+
* `body: <BodyInterface>` references resolve to richer shapes.
|
|
12
14
|
*
|
|
13
15
|
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
14
16
|
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
15
17
|
* @see docs/decisions/D0050-artifact-registry-post-hoc-versioned-extension.md
|
|
16
18
|
*/
|
|
19
|
+
import type { Confidence } from "./confidence.js";
|
|
17
20
|
import type { RunId } from "./branded-ids.js";
|
|
18
21
|
import type { ReportId } from "./index.js";
|
|
22
|
+
import type { ModelId } from "../ports/llm-client.js";
|
|
19
23
|
/**
|
|
20
24
|
* The four-version cache envelope. Every cached `Diagnosis` carries the
|
|
21
25
|
* versions of the inputs that produced it; any bump in any segment
|
|
@@ -51,47 +55,144 @@ export interface CardMeta {
|
|
|
51
55
|
latencyMs?: number;
|
|
52
56
|
/** ISO 8601 UTC timestamp. */
|
|
53
57
|
generatedAt: string;
|
|
58
|
+
cost?: number;
|
|
59
|
+
model?: ModelId;
|
|
54
60
|
}
|
|
55
61
|
/**
|
|
56
62
|
* A single actionable suggestion surfaced by a recommendations card.
|
|
57
|
-
*
|
|
58
|
-
*
|
|
63
|
+
*
|
|
64
|
+
* Phase 5 adds `docSlug` (the canonical doc page to rewrite) and
|
|
65
|
+
* `sectionHeading` (null when the suggestion targets the whole page)
|
|
66
|
+
* per AI-SPEC actionability-specificity rubric + failure-mode #2
|
|
67
|
+
* mitigation.
|
|
59
68
|
*/
|
|
60
69
|
export interface ActionSuggestion {
|
|
61
70
|
title: string;
|
|
62
71
|
body: string;
|
|
63
72
|
priority: "high" | "medium" | "low";
|
|
73
|
+
/** Canonical slug of the documentation page this suggestion targets. */
|
|
74
|
+
docSlug: string;
|
|
75
|
+
/**
|
|
76
|
+
* Heading within `docSlug` that should be revised, or `null` when the
|
|
77
|
+
* suggestion targets the page as a whole.
|
|
78
|
+
*/
|
|
79
|
+
sectionHeading: string | null;
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* Minimal judgment reference per D0052 (taskId × modelId × dimension).
|
|
83
|
+
* Used by `LowConfidenceAttributionBody.judgmentRefs` to cite the
|
|
84
|
+
* specific judgments that drove a low-confidence finding.
|
|
85
|
+
*/
|
|
86
|
+
export interface JudgmentRef {
|
|
87
|
+
taskId: string;
|
|
88
|
+
modelId: string;
|
|
89
|
+
dimension: string;
|
|
64
90
|
}
|
|
65
91
|
/**
|
|
66
|
-
* Phase
|
|
67
|
-
*
|
|
68
|
-
* `satisfies z.ZodType<
|
|
69
|
-
* against these declarations.
|
|
92
|
+
* Phase 5 enriched body shapes. Each keeps `summary: string` (load-bearing
|
|
93
|
+
* for CLI default render per AI-SPEC §6) and adds fields the corresponding
|
|
94
|
+
* Zod schema needs (asserting `satisfies z.ZodType<T>` in the card file).
|
|
70
95
|
*/
|
|
96
|
+
/** area-summary: deterministic — keep only summary (no behavioral claims). */
|
|
71
97
|
export interface AreaSummaryBody {
|
|
72
98
|
summary: string;
|
|
73
99
|
}
|
|
100
|
+
/**
|
|
101
|
+
* failure-mode-summary: deterministic + D-05 dimension/failureMode gate.
|
|
102
|
+
* `count` = frequency in the report; `sampleSize` = judgment count for the
|
|
103
|
+
* dimension (per AI-SPEC failure-mode #3 mitigation).
|
|
104
|
+
*/
|
|
74
105
|
export interface FailureModeSummaryBody {
|
|
75
106
|
summary: string;
|
|
107
|
+
/** Rubric dimension this summary targets (e.g. "task-completion"). */
|
|
108
|
+
dimension: string;
|
|
109
|
+
/** Canonical failure mode within this dimension. */
|
|
110
|
+
failureMode: string;
|
|
111
|
+
/** Number of judgments in this report with this failure mode. */
|
|
112
|
+
count: number;
|
|
113
|
+
/** Total judgments for this dimension — calibration denominator. */
|
|
114
|
+
sampleSize: number;
|
|
76
115
|
}
|
|
116
|
+
/**
|
|
117
|
+
* no-issues: deterministic + AI-SPEC failure-mode #7 sycophancy guard.
|
|
118
|
+
* `thresholdScore` surfaces the threshold used to qualify as "no issues"
|
|
119
|
+
* so readers can see the criterion behind the positive assessment.
|
|
120
|
+
*/
|
|
77
121
|
export interface NoIssuesBody {
|
|
78
122
|
summary: string;
|
|
123
|
+
/** Minimum composite score that qualified this area as "no issues". */
|
|
124
|
+
thresholdScore: number;
|
|
79
125
|
}
|
|
126
|
+
/**
|
|
127
|
+
* top-recommendations: LLM-driven. `suggestions` reuses the enriched
|
|
128
|
+
* `ActionSuggestion` shape (docSlug + sectionHeading per AI-SPEC
|
|
129
|
+
* actionability-specificity rubric + failure-mode #2 mitigation).
|
|
130
|
+
*/
|
|
80
131
|
export interface TopRecommendationsBody {
|
|
81
132
|
summary: string;
|
|
82
133
|
suggestions: ActionSuggestion[];
|
|
83
134
|
}
|
|
135
|
+
/**
|
|
136
|
+
* weakest-area: LLM-driven. Adds area identification, dimension/failureMode
|
|
137
|
+
* context, and a small-sample calibration guard (AI-SPEC failure-mode #3).
|
|
138
|
+
*/
|
|
84
139
|
export interface WeakestAreaBody {
|
|
85
140
|
summary: string;
|
|
141
|
+
/** Documentation area with the lowest composite score. */
|
|
142
|
+
area: string;
|
|
143
|
+
/** Primary dimension driving the low score. */
|
|
144
|
+
dimension: string;
|
|
145
|
+
/** Dominant failure mode in this area. */
|
|
146
|
+
failureMode: string;
|
|
147
|
+
/** Number of judgments sampled for this area — calibration denominator. */
|
|
148
|
+
sampleSize: number;
|
|
149
|
+
/** Calibrated confidence per D0049 (ensemble-stdev derivation). */
|
|
150
|
+
confidence: Confidence;
|
|
86
151
|
}
|
|
87
|
-
|
|
152
|
+
/**
|
|
153
|
+
* regression-vs-baseline: LLM-driven. `deltas` is the per-area diff
|
|
154
|
+
* (JS-computed pre-call, max 10 entries); `drivers` is LLM prose;
|
|
155
|
+
* `overallTrend` is a 4-bucket summary per AI-SPEC §3 lines 605-613.
|
|
156
|
+
*/
|
|
157
|
+
export interface RegressionVsBaselineBody {
|
|
88
158
|
summary: string;
|
|
159
|
+
/**
|
|
160
|
+
* Per-area score deltas (max 10). `drivers` carries the LLM's prose
|
|
161
|
+
* reasoning about what caused the change.
|
|
162
|
+
*/
|
|
163
|
+
deltas: {
|
|
164
|
+
area: string;
|
|
165
|
+
direction: "improved" | "regressed" | "unchanged";
|
|
166
|
+
pointsDelta: number;
|
|
167
|
+
drivers: string[];
|
|
168
|
+
}[];
|
|
169
|
+
/** 4-bucket aggregate trend across all deltas. */
|
|
170
|
+
overallTrend: "net-improved" | "net-regressed" | "mixed" | "stable";
|
|
89
171
|
}
|
|
90
|
-
|
|
172
|
+
/**
|
|
173
|
+
* low-confidence-attribution: LLM-driven. `judgmentRefs` cites the
|
|
174
|
+
* specific judgments (D0052 triple) that drove the low-confidence finding.
|
|
175
|
+
*/
|
|
176
|
+
export interface LowConfidenceAttributionBody {
|
|
91
177
|
summary: string;
|
|
178
|
+
/** Judgment references (D0052) driving this low-confidence finding. */
|
|
179
|
+
judgmentRefs: JudgmentRef[];
|
|
92
180
|
}
|
|
93
|
-
|
|
181
|
+
/**
|
|
182
|
+
* doc-attribution-spotlight: LLM-driven. `docCitations` carries per-doc
|
|
183
|
+
* attribution roles and confidence calibration (AI-SPEC failure-mode #5).
|
|
184
|
+
*/
|
|
185
|
+
export interface DocAttributionSpotlightBody {
|
|
94
186
|
summary: string;
|
|
187
|
+
/**
|
|
188
|
+
* Per-doc attribution records. `role` classifies how the doc contributed;
|
|
189
|
+
* `confidence` calibrates the attribution certainty (D0049).
|
|
190
|
+
*/
|
|
191
|
+
docCitations: {
|
|
192
|
+
docSlug: string;
|
|
193
|
+
confidence: Confidence;
|
|
194
|
+
role: "supports" | "contradicts" | "missing" | "irrelevant";
|
|
195
|
+
}[];
|
|
95
196
|
}
|
|
96
197
|
/**
|
|
97
198
|
* Outer-`status` discriminated union: 8 ready variants (one per
|
|
@@ -102,6 +203,10 @@ export interface RegressionVsBaselineBody {
|
|
|
102
203
|
* No `not-yet-generated` variant — old-report fallback is a Phase 7
|
|
103
204
|
* concern at the slim-shape boundary, handled at fetch-time, not in
|
|
104
205
|
* `DiagnosisCard` itself.
|
|
206
|
+
*
|
|
207
|
+
* D-07: only the `body: <BodyInterface>` references resolve to richer
|
|
208
|
+
* shapes. The union arms, status literals, and cardType literals are
|
|
209
|
+
* identical to Phase 1.
|
|
105
210
|
*/
|
|
106
211
|
export type DiagnosisCard = {
|
|
107
212
|
status: "ready";
|
|
@@ -8,7 +8,9 @@
|
|
|
8
8
|
* discriminator inside the `ready` variant.
|
|
9
9
|
*
|
|
10
10
|
* Phase 1 lands placeholder body shapes; Phase 5 enriches each per
|
|
11
|
-
*
|
|
11
|
+
* AI-SPEC §3 and CONTEXT D-05/D-07. The `DiagnosisCard` discriminated
|
|
12
|
+
* union surface (arms + `cardType` literals) is stable — only the
|
|
13
|
+
* `body: <BodyInterface>` references resolve to richer shapes.
|
|
12
14
|
*
|
|
13
15
|
* @see docs/decisions/D0049-shared-confidence-contract.md
|
|
14
16
|
* @see docs/decisions/D0052-judgment-ref-granularity.md
|
|
@@ -36,7 +36,8 @@ export { CONVENTIONAL_DERIVATIONS, isConfidence } from "./confidence.js";
|
|
|
36
36
|
export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, JudgmentId, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
37
37
|
export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
38
38
|
export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
39
|
-
export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
|
|
39
|
+
export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
|
|
40
|
+
export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
|
|
40
41
|
export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
|
|
41
42
|
export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
|
|
42
43
|
export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
|
|
@@ -754,6 +755,12 @@ export interface PipelineResult {
|
|
|
754
755
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
755
756
|
/** Results per step */
|
|
756
757
|
steps: Record<string, StepResult>;
|
|
758
|
+
/** Report ID produced by PublishReportStep (when publish was enabled). Used by
|
|
759
|
+
* post-run hooks (e.g. runPostPipelineHooks) to target diagnosis and telemetry
|
|
760
|
+
* writeback at the correct Content Lake document. Absent when publish was
|
|
761
|
+
* skipped or the publish step did not produce a report. (Phase 6 / DIAG-06)
|
|
762
|
+
*/
|
|
763
|
+
reportId?: string;
|
|
757
764
|
/** Overall success (all non-skipped steps succeeded) */
|
|
758
765
|
success: boolean;
|
|
759
766
|
/** Summary of test execution outcomes. */
|
|
@@ -106,6 +106,21 @@ export interface RepoTriggersConfig {
|
|
|
106
106
|
"pr-task-change"?: TriggerConfig;
|
|
107
107
|
schedule?: ScheduleTriggerConfig;
|
|
108
108
|
}
|
|
109
|
+
/**
|
|
110
|
+
* Post-run diagnosis summary policy (Phase 6 / DIAG-06).
|
|
111
|
+
* Controls whether `ailf run` auto-fires the in-process diagnosis runner
|
|
112
|
+
* at the end of a published pipeline. Precedence is resolved at the CLI
|
|
113
|
+
* layer — see `shouldRunPostSummary()` in `pipeline-action.ts`.
|
|
114
|
+
*/
|
|
115
|
+
export interface RepoSummaryConfig {
|
|
116
|
+
/**
|
|
117
|
+
* - `"auto"` — fire only when `process.stdout.isTTY === true` AND
|
|
118
|
+
* `CI !== "true"`.
|
|
119
|
+
* - `"always"` — fire unconditionally (bypasses both the TTY and CI checks).
|
|
120
|
+
* - `"never"` — never fire.
|
|
121
|
+
*/
|
|
122
|
+
onRun?: "auto" | "always" | "never";
|
|
123
|
+
}
|
|
109
124
|
/**
|
|
110
125
|
* Parsed shape of `.ailf/config.yaml`.
|
|
111
126
|
*
|
|
@@ -124,6 +139,7 @@ export interface RepoConfig {
|
|
|
124
139
|
publish?: RepoPublishConfig;
|
|
125
140
|
reportStore?: RepoReportStoreConfig;
|
|
126
141
|
source?: RepoSourceConfig;
|
|
142
|
+
summary?: RepoSummaryConfig;
|
|
127
143
|
taskSource?: RepoTaskSourceConfig;
|
|
128
144
|
triggers?: RepoTriggersConfig;
|
|
129
145
|
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synthesis cost telemetry types — canonical TS-first shapes for
|
|
3
|
+
* Phase 6 DIAG-06 cost and parse-failure observability.
|
|
4
|
+
*
|
|
5
|
+
* These interfaces are authored independently of their Zod adapter schema
|
|
6
|
+
* (Plan 06-02) per D0045: the Zod schema declares
|
|
7
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
|
|
8
|
+
* type so drift is a build error, not a runtime bug.
|
|
9
|
+
*
|
|
10
|
+
* The 12 attribute paths (4 top-level + 8 per-card) on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
|
|
11
|
+
* land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
|
|
12
|
+
* (D6-09). No new sibling doc type (D0033 / D6-09).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
15
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
16
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
|
|
17
|
+
*/
|
|
18
|
+
import type { CardType } from "./diagnosis.js";
|
|
19
|
+
/**
|
|
20
|
+
* Per-card telemetry row for the `synthesis_per_card` Airbyte stream
|
|
21
|
+
* (D6-11) and the `summary.synthesis.diagnosis.perCard[]` Sanity doc path
|
|
22
|
+
* (D6-09).
|
|
23
|
+
*
|
|
24
|
+
* Fields map directly to the 8 per-card attribute paths in D6-09:
|
|
25
|
+
* `…perCard[].cardType`, `…perCard[].cost`, `…perCard[].parseFailed`,
|
|
26
|
+
* `…perCard[].latencyMs`, `…perCard[].tokenInput`, `…perCard[].tokenOutput`,
|
|
27
|
+
* `…perCard[].cardVersion`, `…perCard[].generatedAt`.
|
|
28
|
+
*
|
|
29
|
+
* `cost` is undefined when the card did not make an LLM call (deterministic
|
|
30
|
+
* cards) and contributes 0 to the roll-up.
|
|
31
|
+
*/
|
|
32
|
+
export interface SynthesisPerCardTelemetry {
|
|
33
|
+
/** Card archetype — reuses `CardType` from diagnosis.ts:55-63; not redeclared. */
|
|
34
|
+
cardType: CardType;
|
|
35
|
+
/**
|
|
36
|
+
* Per-call USD cost captured from `LLMStructuredCompletion.cost`.
|
|
37
|
+
* `undefined` for deterministic cards (area-summary, failure-mode-summary,
|
|
38
|
+
* no-issues) which make no LLM call.
|
|
39
|
+
*/
|
|
40
|
+
cost?: number;
|
|
41
|
+
/**
|
|
42
|
+
* Whether the card's Zod schema parse failed (produces a degraded card).
|
|
43
|
+
* Used for the 7-day rolling parse-failure rate in BigQuery (D6-15).
|
|
44
|
+
*/
|
|
45
|
+
parseFailed: boolean;
|
|
46
|
+
/**
|
|
47
|
+
* End-to-end latency for the LLM call in milliseconds.
|
|
48
|
+
* `undefined` for deterministic cards.
|
|
49
|
+
*/
|
|
50
|
+
latencyMs?: number;
|
|
51
|
+
/**
|
|
52
|
+
* Prompt tokens consumed by the LLM call.
|
|
53
|
+
* `undefined` for deterministic cards.
|
|
54
|
+
*/
|
|
55
|
+
tokenInput?: number;
|
|
56
|
+
/**
|
|
57
|
+
* Completion tokens produced by the LLM call.
|
|
58
|
+
* `undefined` for deterministic cards.
|
|
59
|
+
*/
|
|
60
|
+
tokenOutput?: number;
|
|
61
|
+
/** Per-card version string (e.g. `"top-recommendations@0.1.0"`). */
|
|
62
|
+
cardVersion: string;
|
|
63
|
+
/** ISO 8601 UTC timestamp when this card was generated. */
|
|
64
|
+
generatedAt: string;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Aggregate synthesis cost telemetry for a single Diagnosis run.
|
|
68
|
+
* Lands on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
|
|
69
|
+
* (D6-09 / D6-10: parallel to `summary.overall.cost` — not additive).
|
|
70
|
+
*
|
|
71
|
+
* Written by the post-run hook (D6-08); not written by standalone
|
|
72
|
+
* `ailf interpret`.
|
|
73
|
+
*
|
|
74
|
+
* Field set matches the 4 top-level D6-09 attribute paths:
|
|
75
|
+
* `summary.synthesis.diagnosis.cost`,
|
|
76
|
+
* `summary.synthesis.diagnosis.parseFailureCount`,
|
|
77
|
+
* `summary.synthesis.diagnosis.parseFailureRate`,
|
|
78
|
+
* `summary.synthesis.diagnosis.perCard`.
|
|
79
|
+
*/
|
|
80
|
+
export interface SynthesisCostTelemetry {
|
|
81
|
+
/**
|
|
82
|
+
* Total USD cost across all LLM cards in this Diagnosis run.
|
|
83
|
+
* Roll-up: `sum(perCard[].cost ?? 0)` for ready + degraded cards.
|
|
84
|
+
* Missing cards contribute 0.
|
|
85
|
+
*/
|
|
86
|
+
cost: number;
|
|
87
|
+
/**
|
|
88
|
+
* Number of cards whose Zod parse failed in this Diagnosis run.
|
|
89
|
+
* Counted across all 8 card types (including deterministic cards;
|
|
90
|
+
* a deterministic-card parse failure indicates a code bug).
|
|
91
|
+
*/
|
|
92
|
+
parseFailureCount: number;
|
|
93
|
+
/**
|
|
94
|
+
* Parse-failure rate: `parseFailureCount / 8` (8 = fixed card registry size).
|
|
95
|
+
* Range: 0–1. Used as the denominator for the D6-15 BigQuery 7-day
|
|
96
|
+
* rolling rate view (`synthesis_parse_failure_rate_7d.sql`).
|
|
97
|
+
*/
|
|
98
|
+
parseFailureRate: number;
|
|
99
|
+
/** Per-card telemetry rows — one entry per card in registry-order. */
|
|
100
|
+
perCard: SynthesisPerCardTelemetry[];
|
|
101
|
+
}
|