@sanity/ailf 6.0.0 → 6.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +59 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +47 -3
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +13 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +17 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +1 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +15 -2
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +5 -3
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +19 -31
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +3 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +7 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +7 -2
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/commands/interpret.d.ts +21 -1
- package/dist/commands/interpret.js +13 -4
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/report-store.d.ts +26 -0
- package/dist/report-store.js +63 -0
- package/package.json +1 -1
|
@@ -89,7 +89,9 @@ export const generateWeakestArea = async (report, ctx) => {
|
|
|
89
89
|
path: ["confidence", "level"],
|
|
90
90
|
});
|
|
91
91
|
const prompt = buildWeakestAreaPrompt(report);
|
|
92
|
-
|
|
92
|
+
// Destructure `cost` and `model` from the LLMClient return —
|
|
93
|
+
// already provided per llm-client.ts:139-144, previously discarded.
|
|
94
|
+
const { value, usage, cost, model } = await ctx.llm.completeStructured({
|
|
93
95
|
model: CARD_MODEL,
|
|
94
96
|
prompt: `${prompt.system}\n\n${prompt.user}`,
|
|
95
97
|
schema: PerCallSchema,
|
|
@@ -109,6 +111,8 @@ export const generateWeakestArea = async (report, ctx) => {
|
|
|
109
111
|
cardVersion: "weakest-area@0.1.0",
|
|
110
112
|
tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
|
|
111
113
|
generatedAt: new Date().toISOString(),
|
|
114
|
+
cost,
|
|
115
|
+
model,
|
|
112
116
|
},
|
|
113
117
|
};
|
|
114
118
|
};
|
|
@@ -131,10 +131,23 @@ export function buildWeakestAreaPrompt(report) {
|
|
|
131
131
|
export function buildLowConfidenceAttributionPrompt(report, judgmentAttributions) {
|
|
132
132
|
// Filter to low-confidence entries (by any attribution in the set)
|
|
133
133
|
const lowConf = judgmentAttributions.filter((ja) => ja.attributions.some((a) => a.confidence.level === "low"));
|
|
134
|
-
// If no low-confidence entries, use all sorted by score ascending (most
|
|
134
|
+
// If no low-confidence entries, use all sorted by score ascending (most
|
|
135
|
+
// uncertain first). Guard against entries with empty `attributions` arrays:
|
|
136
|
+
// `Math.min(...[])` is Infinity and produces an unstable sort that ranks
|
|
137
|
+
// empty-attribution entries identically at the top of the prompt. The card
|
|
138
|
+
// schema requires `judgmentRefs.min(1)`, so emitting empty-attribution rows
|
|
139
|
+
// here forces a degraded card downstream. Caller should short-circuit to
|
|
140
|
+
// missing before reaching here, but defend against the seam regressing.
|
|
141
|
+
const validAttrs = judgmentAttributions.filter((ja) => ja.attributions.length > 0);
|
|
142
|
+
if (validAttrs.length === 0) {
|
|
143
|
+
return {
|
|
144
|
+
system: LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT,
|
|
145
|
+
user: "(no attribution data with non-empty attributions)",
|
|
146
|
+
};
|
|
147
|
+
}
|
|
135
148
|
const source = lowConf.length > 0
|
|
136
149
|
? lowConf
|
|
137
|
-
: [...
|
|
150
|
+
: [...validAttrs].sort((a, b) => {
|
|
138
151
|
const aMin = Math.min(...a.attributions.map((x) => x.score));
|
|
139
152
|
const bMin = Math.min(...b.attributions.map((x) => x.score));
|
|
140
153
|
return aMin - bMin;
|
|
@@ -8,9 +8,11 @@
|
|
|
8
8
|
* Mitigations embedded:
|
|
9
9
|
* - failure-mode #3: confidence inflation on small samples — prompt instructs
|
|
10
10
|
* to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
|
|
11
|
-
* - failure-mode #4: taxonomy drift —
|
|
12
|
-
*
|
|
11
|
+
* - failure-mode #4: taxonomy drift — failure-mode lists derived at build
|
|
12
|
+
* time from the canonical const arrays in
|
|
13
|
+
* `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
|
|
14
|
+
* Zod `.refine(buildFailureModeRefinement())` validator always agree.
|
|
13
15
|
*
|
|
14
16
|
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
15
17
|
*/
|
|
16
|
-
export declare const SYSTEM_PROMPT
|
|
18
|
+
export declare const SYSTEM_PROMPT: string;
|
|
@@ -8,11 +8,19 @@
|
|
|
8
8
|
* Mitigations embedded:
|
|
9
9
|
* - failure-mode #3: confidence inflation on small samples — prompt instructs
|
|
10
10
|
* to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
|
|
11
|
-
* - failure-mode #4: taxonomy drift —
|
|
12
|
-
*
|
|
11
|
+
* - failure-mode #4: taxonomy drift — failure-mode lists derived at build
|
|
12
|
+
* time from the canonical const arrays in
|
|
13
|
+
* `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
|
|
14
|
+
* Zod `.refine(buildFailureModeRefinement())` validator always agree.
|
|
13
15
|
*
|
|
14
16
|
* @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
|
|
15
17
|
*/
|
|
18
|
+
import { AGENT_FAILURE_MODES, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, } from "../../../grader/failure-modes/index.js";
|
|
19
|
+
const literacyList = LITERACY_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
20
|
+
const mcpList = MCP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
21
|
+
const kpList = KP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
22
|
+
const agentList = AGENT_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
|
|
23
|
+
const commonList = COMMON_FAILURE_MODES.join(", ");
|
|
16
24
|
export const SYSTEM_PROMPT = `You are an AILF evaluation analyst identifying the documentation area most in need of improvement.
|
|
17
25
|
|
|
18
26
|
## Your Output
|
|
@@ -33,47 +41,27 @@ Return a JSON object matching this exact shape:
|
|
|
33
41
|
|
|
34
42
|
## CANONICAL DIMENSIONS AND FAILURE MODES
|
|
35
43
|
|
|
36
|
-
You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with
|
|
44
|
+
You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with a literacy-only failure mode is rejected). The lists below are derived at build time from \`packages/core/src/grader/failure-modes/*.ts\` — the Zod validator on the card schema enforces the same taxonomy.
|
|
37
45
|
|
|
38
46
|
### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)
|
|
39
47
|
Failure modes:
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
- incorrect-docs — doc states something factually wrong
|
|
43
|
-
- poor-structure — doc exists but is hard to find or follow
|
|
44
|
-
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
48
|
+
${literacyList}
|
|
49
|
+
Plus cross-cutting: ${commonList}
|
|
45
50
|
|
|
46
51
|
### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)
|
|
47
52
|
Failure modes:
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
- extra-param — unexpected extra parameter sent
|
|
51
|
-
- wrong-tool-selected — chose wrong tool for task
|
|
52
|
-
- tool-call-order — tools called in wrong sequence
|
|
53
|
-
- no-tool-call — should have used a tool but didn't
|
|
54
|
-
- schema-mismatch — response did not match expected schema
|
|
55
|
-
- unsafe-operation — operation could cause data loss
|
|
56
|
-
- auth-bypass — security check skipped
|
|
57
|
-
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
53
|
+
${mcpList}
|
|
54
|
+
Plus cross-cutting: ${commonList}
|
|
58
55
|
|
|
59
56
|
### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)
|
|
60
57
|
Failure modes:
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
- missing-step — omitted a required step
|
|
64
|
-
- hallucinated-api — invented an API that does not exist
|
|
65
|
-
- wrong-version — used v1 API when v2 was required
|
|
66
|
-
- incomplete-coverage — missed important edge case
|
|
67
|
-
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
58
|
+
${kpList}
|
|
59
|
+
Plus cross-cutting: ${commonList}
|
|
68
60
|
|
|
69
61
|
### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)
|
|
70
62
|
Failure modes:
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
- incorrect-output — output was wrong or incomplete
|
|
74
|
-
- inefficient-path — completed task but via unnecessary steps
|
|
75
|
-
- assertion-failure — failed a structural assertion check
|
|
76
|
-
Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
|
|
63
|
+
${agentList}
|
|
64
|
+
Plus cross-cutting: ${commonList}
|
|
77
65
|
|
|
78
66
|
## Confidence Calibration Rules
|
|
79
67
|
|
|
@@ -17,5 +17,5 @@ export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardR
|
|
|
17
17
|
export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
|
|
18
18
|
export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
|
|
19
19
|
export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
|
|
20
|
-
export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
20
|
+
export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
21
21
|
export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
|
|
@@ -29,5 +29,5 @@ export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from ".
|
|
|
29
29
|
// ---------------------------------------------------------------------------
|
|
30
30
|
// Phase 5 Plan 05 — card generators barrel + prompt builders
|
|
31
31
|
// ---------------------------------------------------------------------------
|
|
32
|
-
export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
32
|
+
export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
|
|
33
33
|
export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
|
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
import type { Confidence } from "./confidence.js";
|
|
20
20
|
import type { RunId } from "./branded-ids.js";
|
|
21
21
|
import type { ReportId } from "./index.js";
|
|
22
|
+
import type { ModelId } from "../ports/llm-client.js";
|
|
22
23
|
/**
|
|
23
24
|
* The four-version cache envelope. Every cached `Diagnosis` carries the
|
|
24
25
|
* versions of the inputs that produced it; any bump in any segment
|
|
@@ -54,6 +55,8 @@ export interface CardMeta {
|
|
|
54
55
|
latencyMs?: number;
|
|
55
56
|
/** ISO 8601 UTC timestamp. */
|
|
56
57
|
generatedAt: string;
|
|
58
|
+
cost?: number;
|
|
59
|
+
model?: ModelId;
|
|
57
60
|
}
|
|
58
61
|
/**
|
|
59
62
|
* A single actionable suggestion surfaced by a recommendations card.
|
|
@@ -37,6 +37,7 @@ export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, E
|
|
|
37
37
|
export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
38
38
|
export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
39
39
|
export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
|
|
40
|
+
export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
|
|
40
41
|
export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
|
|
41
42
|
export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
|
|
42
43
|
export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
|
|
@@ -754,6 +755,12 @@ export interface PipelineResult {
|
|
|
754
755
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
755
756
|
/** Results per step */
|
|
756
757
|
steps: Record<string, StepResult>;
|
|
758
|
+
/** Report ID produced by PublishReportStep (when publish was enabled). Used by
|
|
759
|
+
* post-run hooks (e.g. runPostPipelineHooks) to target diagnosis and telemetry
|
|
760
|
+
* writeback at the correct Content Lake document. Absent when publish was
|
|
761
|
+
* skipped or the publish step did not produce a report. (Phase 6 / DIAG-06)
|
|
762
|
+
*/
|
|
763
|
+
reportId?: string;
|
|
757
764
|
/** Overall success (all non-skipped steps succeeded) */
|
|
758
765
|
success: boolean;
|
|
759
766
|
/** Summary of test execution outcomes. */
|
|
@@ -106,6 +106,21 @@ export interface RepoTriggersConfig {
|
|
|
106
106
|
"pr-task-change"?: TriggerConfig;
|
|
107
107
|
schedule?: ScheduleTriggerConfig;
|
|
108
108
|
}
|
|
109
|
+
/**
|
|
110
|
+
* Post-run diagnosis summary policy (Phase 6 / DIAG-06).
|
|
111
|
+
* Controls whether `ailf run` auto-fires the in-process diagnosis runner
|
|
112
|
+
* at the end of a published pipeline. Precedence is resolved at the CLI
|
|
113
|
+
* layer — see `shouldRunPostSummary()` in `pipeline-action.ts`.
|
|
114
|
+
*/
|
|
115
|
+
export interface RepoSummaryConfig {
|
|
116
|
+
/**
|
|
117
|
+
* - `"auto"` — fire only when `process.stdout.isTTY === true` AND
|
|
118
|
+
* `CI !== "true"`.
|
|
119
|
+
* - `"always"` — fire unconditionally (bypasses TTY check).
|
|
120
|
+
* - `"never"` — never fire.
|
|
121
|
+
*/
|
|
122
|
+
onRun?: "auto" | "always" | "never";
|
|
123
|
+
}
|
|
109
124
|
/**
|
|
110
125
|
* Parsed shape of `.ailf/config.yaml`.
|
|
111
126
|
*
|
|
@@ -124,6 +139,7 @@ export interface RepoConfig {
|
|
|
124
139
|
publish?: RepoPublishConfig;
|
|
125
140
|
reportStore?: RepoReportStoreConfig;
|
|
126
141
|
source?: RepoSourceConfig;
|
|
142
|
+
summary?: RepoSummaryConfig;
|
|
127
143
|
taskSource?: RepoTaskSourceConfig;
|
|
128
144
|
triggers?: RepoTriggersConfig;
|
|
129
145
|
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synthesis cost telemetry types — canonical TS-first shapes for
|
|
3
|
+
* Phase 6 DIAG-06 cost and parse-failure observability.
|
|
4
|
+
*
|
|
5
|
+
* These interfaces are authored independently of their Zod adapter schema
|
|
6
|
+
* (Plan 06-02) per D0045: the Zod schema declares
|
|
7
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
|
|
8
|
+
* type so drift is a build error, not a runtime bug.
|
|
9
|
+
*
|
|
10
|
+
* The 14 attribute paths on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
|
|
11
|
+
* land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
|
|
12
|
+
* (D6-09). No new sibling doc type (D0033 / D6-09).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
15
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
16
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
|
|
17
|
+
*/
|
|
18
|
+
import type { CardType } from "./diagnosis.js";
|
|
19
|
+
/**
|
|
20
|
+
* Per-card telemetry row for the `synthesis_per_card` Airbyte stream
|
|
21
|
+
* (D6-11) and the `summary.synthesis.diagnosis.perCard[]` Sanity doc path
|
|
22
|
+
* (D6-09).
|
|
23
|
+
*
|
|
24
|
+
* Fields map directly to the 8 per-card attribute paths in D6-09:
|
|
25
|
+
* `…perCard[].cardType`, `…perCard[].cost`, `…perCard[].parseFailed`,
|
|
26
|
+
* `…perCard[].latencyMs`, `…perCard[].tokenInput`, `…perCard[].tokenOutput`,
|
|
27
|
+
* `…perCard[].cardVersion`, `…perCard[].generatedAt`.
|
|
28
|
+
*
|
|
29
|
+
* `cost` is undefined when the card did not make an LLM call (deterministic
|
|
30
|
+
* cards) and contributes 0 to the roll-up.
|
|
31
|
+
*/
|
|
32
|
+
export interface SynthesisPerCardTelemetry {
|
|
33
|
+
/** Card archetype — reuses `CardType` from diagnosis.ts:55-63; not redeclared. */
|
|
34
|
+
cardType: CardType;
|
|
35
|
+
/**
|
|
36
|
+
* Per-call USD cost captured from `LLMStructuredCompletion.cost`.
|
|
37
|
+
* `undefined` for deterministic cards (area-summary, failure-mode-summary,
|
|
38
|
+
* no-issues) which make no LLM call.
|
|
39
|
+
*/
|
|
40
|
+
cost?: number;
|
|
41
|
+
/**
|
|
42
|
+
* Whether the card's Zod schema parse failed (produces a degraded card).
|
|
43
|
+
* Used for the 7-day rolling parse-failure rate in BigQuery (D6-15).
|
|
44
|
+
*/
|
|
45
|
+
parseFailed: boolean;
|
|
46
|
+
/**
|
|
47
|
+
* End-to-end latency for the LLM call in milliseconds.
|
|
48
|
+
* `undefined` for deterministic cards.
|
|
49
|
+
*/
|
|
50
|
+
latencyMs?: number;
|
|
51
|
+
/**
|
|
52
|
+
* Prompt tokens consumed by the LLM call.
|
|
53
|
+
* `undefined` for deterministic cards.
|
|
54
|
+
*/
|
|
55
|
+
tokenInput?: number;
|
|
56
|
+
/**
|
|
57
|
+
* Completion tokens produced by the LLM call.
|
|
58
|
+
* `undefined` for deterministic cards.
|
|
59
|
+
*/
|
|
60
|
+
tokenOutput?: number;
|
|
61
|
+
/** Per-card version string (e.g. `"top-recommendations@0.1.0"`). */
|
|
62
|
+
cardVersion: string;
|
|
63
|
+
/** ISO 8601 UTC timestamp when this card was generated. */
|
|
64
|
+
generatedAt: string;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Aggregate synthesis cost telemetry for a single Diagnosis run.
|
|
68
|
+
* Lands on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
|
|
69
|
+
* (D6-09 / D6-10: parallel to `summary.overall.cost` — not additive).
|
|
70
|
+
*
|
|
71
|
+
* Written by the post-run hook (D6-08); not written by standalone
|
|
72
|
+
* `ailf interpret`.
|
|
73
|
+
*
|
|
74
|
+
* Field set matches the 4 top-level D6-09 attribute paths:
|
|
75
|
+
* `summary.synthesis.diagnosis.cost`,
|
|
76
|
+
* `summary.synthesis.diagnosis.parseFailureCount`,
|
|
77
|
+
* `summary.synthesis.diagnosis.parseFailureRate`,
|
|
78
|
+
* `summary.synthesis.diagnosis.perCard`.
|
|
79
|
+
*/
|
|
80
|
+
export interface SynthesisCostTelemetry {
|
|
81
|
+
/**
|
|
82
|
+
* Total USD cost across all LLM cards in this Diagnosis run.
|
|
83
|
+
* Roll-up: `sum(perCard[].cost ?? 0)` for ready + degraded cards.
|
|
84
|
+
* Missing cards contribute 0.
|
|
85
|
+
*/
|
|
86
|
+
cost: number;
|
|
87
|
+
/**
|
|
88
|
+
* Number of cards whose Zod parse failed in this Diagnosis run.
|
|
89
|
+
* Counted across all 8 card types (including deterministic cards;
|
|
90
|
+
* a deterministic-card parse failure indicates a code bug).
|
|
91
|
+
*/
|
|
92
|
+
parseFailureCount: number;
|
|
93
|
+
/**
|
|
94
|
+
* Parse-failure rate: `parseFailureCount / 8` (8 = fixed card registry size).
|
|
95
|
+
* Range: 0–1. Used as the denominator for the D6-15 BigQuery 7-day
|
|
96
|
+
* rolling rate view (`synthesis_parse_failure_rate_7d.sql`).
|
|
97
|
+
*/
|
|
98
|
+
parseFailureRate: number;
|
|
99
|
+
/** Per-card telemetry rows — one entry per card in registry-order. */
|
|
100
|
+
perCard: SynthesisPerCardTelemetry[];
|
|
101
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Synthesis cost telemetry types — canonical TS-first shapes for
|
|
3
|
+
* Phase 6 DIAG-06 cost and parse-failure observability.
|
|
4
|
+
*
|
|
5
|
+
* These interfaces are authored independently of their Zod adapter schema
|
|
6
|
+
* (Plan 06-02) per D0045: the Zod schema declares
|
|
7
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
|
|
8
|
+
* type so drift is a build error, not a runtime bug.
|
|
9
|
+
*
|
|
10
|
+
* The 14 attribute paths on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
|
|
11
|
+
* land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
|
|
12
|
+
* (D6-09). No new sibling doc type (D0033 / D6-09).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
15
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
16
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
|
|
17
|
+
*/
|
|
18
|
+
export {};
|
|
@@ -115,12 +115,10 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
115
115
|
compareBaseline: config.compareBaseline,
|
|
116
116
|
gapAnalysisEnabled: config.execution?.gapAnalysis ?? true,
|
|
117
117
|
// W0077 Phase 4 — `publish` is now a policy object. Map the auto value
|
|
118
|
-
//
|
|
119
|
-
//
|
|
120
|
-
//
|
|
121
|
-
publishEnabled: config.publish?.auto
|
|
122
|
-
? false
|
|
123
|
-
: config.publish?.auto !== undefined,
|
|
118
|
+
// to a boolean for the file-config path. Absence of publish.auto mirrors
|
|
119
|
+
// the CLI's "full-runs" default (enable publish; composition root gates on
|
|
120
|
+
// token availability). Only "never" explicitly disables auto-publish.
|
|
121
|
+
publishEnabled: config.publish?.auto !== "never",
|
|
124
122
|
publishTag: config.publish?.tag,
|
|
125
123
|
noCache: config.noCache ?? false,
|
|
126
124
|
noRemoteCache: config.noRemoteCache ?? false,
|
|
@@ -150,5 +148,9 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
|
|
|
150
148
|
? resolve(rootDir, config.taskSource.repoTasksPath)
|
|
151
149
|
: undefined,
|
|
152
150
|
presets: config.presets,
|
|
151
|
+
// Phase 6 / DIAG-06 — thread summary.onRun into ResolvedConfig so the
|
|
152
|
+
// file-config exit branch in executePipeline can pass it to
|
|
153
|
+
// runPostPipelineHooks.
|
|
154
|
+
summaryOnRun: config.summary?.onRun,
|
|
153
155
|
};
|
|
154
156
|
}
|
|
@@ -5,5 +5,5 @@ export type { FakeCallRecord, FakeCompletionResponse, FakeStructuredResponse, }
|
|
|
5
5
|
export { OpenAILLMClient } from "./openai-llm-client.js";
|
|
6
6
|
export type { OpenAILLMClientOptions } from "./openai-llm-client.js";
|
|
7
7
|
export type { ModelPricing } from "./pricing.js";
|
|
8
|
-
export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
8
|
+
export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
9
9
|
export type { RetryPolicy } from "./retry.js";
|
|
@@ -1,4 +1,4 @@
|
|
|
1
1
|
export { AnthropicLLMClient } from "./anthropic-llm-client.js";
|
|
2
2
|
export { FakeLLMClient } from "./fake-llm-client.js";
|
|
3
3
|
export { OpenAILLMClient } from "./openai-llm-client.js";
|
|
4
|
-
export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
4
|
+
export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
import { z } from "zod";
|
|
14
14
|
import { OpenAIChatResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
|
|
15
|
-
import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
15
|
+
import { DEFAULT_RETRY_POLICY, LLMParseError, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
|
|
16
16
|
const DEFAULT_BASE_URL = "https://api.openai.com/v1/chat/completions";
|
|
17
17
|
/**
|
|
18
18
|
* Conservative defaults for the models in `packages/eval/config/models.ts`.
|
|
@@ -98,7 +98,12 @@ export class OpenAILLMClient {
|
|
|
98
98
|
parsed = JSON.parse(raw);
|
|
99
99
|
}
|
|
100
100
|
catch (err) {
|
|
101
|
-
|
|
101
|
+
// Sanitize: SyntaxError.message embeds a snippet at the failure offset,
|
|
102
|
+
// which can leak prompt text or user content echoed back by the model.
|
|
103
|
+
// Keep the raw body on the instance for callers that opt in via .raw,
|
|
104
|
+
// mirroring the LLMHttpError pattern (verified by the "does not leak
|
|
105
|
+
// the response body" test in openai-llm-client.test.ts).
|
|
106
|
+
throw new LLMParseError(`OpenAI structured completion returned invalid JSON for model ${args.model}`, raw, { cause: err });
|
|
102
107
|
}
|
|
103
108
|
// strict:true guarantees a valid-against-the-schema JSON document, but
|
|
104
109
|
// the Zod parse is still load-bearing — it brands the result as T and is
|
|
@@ -33,6 +33,24 @@ export declare class LLMHttpError extends Error {
|
|
|
33
33
|
readonly body: string;
|
|
34
34
|
constructor(status: number, body: string, attempts: number);
|
|
35
35
|
}
|
|
36
|
+
/**
|
|
37
|
+
* Sanitized error raised when an LLM adapter receives an HTTP-200 response
|
|
38
|
+
* whose body is not valid JSON. The raw response body (which may echo back
|
|
39
|
+
* user prompt content or even API-key fragments from prompts) is kept on the
|
|
40
|
+
* instance for callers that opt in via `.raw`, NOT in the message string.
|
|
41
|
+
*
|
|
42
|
+
* Mirrors the LLMHttpError pattern verified by the
|
|
43
|
+
* "does not leak the response body" test in openai-llm-client.test.ts.
|
|
44
|
+
*/
|
|
45
|
+
export declare class LLMParseError extends Error {
|
|
46
|
+
/** Full raw response body (kept on the instance, NOT in `message`). */
|
|
47
|
+
readonly raw: string;
|
|
48
|
+
/** Length of `raw` in UTF-16 code units (`raw.length`), not bytes — safe to include in the message. */
|
|
49
|
+
readonly rawLength: number;
|
|
50
|
+
constructor(message: string, raw: string, options?: {
|
|
51
|
+
cause?: unknown;
|
|
52
|
+
});
|
|
53
|
+
}
|
|
36
54
|
export declare function isRetryableStatus(status: number): boolean;
|
|
37
55
|
export interface RunWithRetryArgs<T> {
|
|
38
56
|
policy: RetryPolicy;
|
|
@@ -29,6 +29,27 @@ export class LLMHttpError extends Error {
|
|
|
29
29
|
this.body = body;
|
|
30
30
|
}
|
|
31
31
|
}
|
|
32
|
+
/**
|
|
33
|
+
* Sanitized error raised when an LLM adapter receives an HTTP-200 response
|
|
34
|
+
* whose body is not valid JSON. The raw response body (which may echo back
|
|
35
|
+
* user prompt content or even API-key fragments from prompts) is kept on the
|
|
36
|
+
* instance for callers that opt in via `.raw`, NOT in the message string.
|
|
37
|
+
*
|
|
38
|
+
* Mirrors the LLMHttpError pattern verified by the
|
|
39
|
+
* "does not leak the response body" test in openai-llm-client.test.ts.
|
|
40
|
+
*/
|
|
41
|
+
export class LLMParseError extends Error {
|
|
42
|
+
/** Full raw response body (kept on the instance, NOT in `message`). */
|
|
43
|
+
raw;
|
|
44
|
+
/** Length of `raw` in UTF-16 code units (`raw.length`), not bytes — safe to include in the message. */
|
|
45
|
+
rawLength;
|
|
46
|
+
constructor(message, raw, options) {
|
|
47
|
+
super(`${message} (raw=${raw.length}B)`, options);
|
|
48
|
+
this.name = "LLMParseError";
|
|
49
|
+
this.raw = raw;
|
|
50
|
+
this.rawLength = raw.length;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
32
53
|
export function isRetryableStatus(status) {
|
|
33
54
|
return status === 429 || (status >= 500 && status < 600);
|
|
34
55
|
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
|
|
3
|
+
*
|
|
4
|
+
* This schema sits at `packages/eval/src/adapters/**` and is therefore
|
|
5
|
+
* scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
|
|
6
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
|
|
7
|
+
* drift a build error, not a runtime bug.
|
|
8
|
+
*
|
|
9
|
+
* Used by:
|
|
10
|
+
* - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
|
|
11
|
+
* writing to Sanity (process memory → Sanity write boundary, T-06-04).
|
|
12
|
+
* - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
|
|
13
|
+
* (Sanity Content Lake → eval process boundary, T-06-04).
|
|
14
|
+
*
|
|
15
|
+
* Security constraints:
|
|
16
|
+
* - No `.passthrough()` — schema is closed to prevent PII leakage from
|
|
17
|
+
* card body text into the telemetry shape (T-06-05).
|
|
18
|
+
* - Satisfies clause is load-bearing (T-06-06); no exemption marker.
|
|
19
|
+
*
|
|
20
|
+
* @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
/**
 * Declared type of the Zod object schema for synthesis cost telemetry.
 *
 * Top level: `cost`, `parseFailureCount` and `parseFailureRate` are numbers;
 * `perCard` is an array of per-card entries. Each entry carries a `cardType`
 * drawn from the eight literal identifiers below, a required `parseFailed`
 * flag, required `cardVersion` / `generatedAt` strings, and optional numeric
 * `cost`, `latencyMs`, `tokenInput` and `tokenOutput`.
 *
 * Both object levels use the `z.core.$strip` config parameter.
 */
export declare const SynthesisCostTelemetrySchema: z.ZodObject<{
    cost: z.ZodNumber;
    parseFailureCount: z.ZodNumber;
    parseFailureRate: z.ZodNumber;
    perCard: z.ZodArray<z.ZodObject<{
        cardType: z.ZodEnum<{
            "area-summary": "area-summary";
            "failure-mode-summary": "failure-mode-summary";
            "no-issues": "no-issues";
            "top-recommendations": "top-recommendations";
            "weakest-area": "weakest-area";
            "low-confidence-attribution": "low-confidence-attribution";
            "doc-attribution-spotlight": "doc-attribution-spotlight";
            "regression-vs-baseline": "regression-vs-baseline";
        }>;
        cost: z.ZodOptional<z.ZodNumber>;
        parseFailed: z.ZodBoolean;
        latencyMs: z.ZodOptional<z.ZodNumber>;
        tokenInput: z.ZodOptional<z.ZodNumber>;
        tokenOutput: z.ZodOptional<z.ZodNumber>;
        cardVersion: z.ZodString;
        generatedAt: z.ZodString;
    }, z.core.$strip>>;
}, z.core.$strip>;
|
|
49
|
+
export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "../../_vendor/ailf-core/index.d.ts";
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
|
|
3
|
+
*
|
|
4
|
+
* This schema sits at `packages/eval/src/adapters/**` and is therefore
|
|
5
|
+
* scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
|
|
6
|
+
* `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
|
|
7
|
+
* drift a build error, not a runtime bug.
|
|
8
|
+
*
|
|
9
|
+
* Used by:
|
|
10
|
+
* - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
|
|
11
|
+
* writing to Sanity (process memory → Sanity write boundary, T-06-04).
|
|
12
|
+
* - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
|
|
13
|
+
* (Sanity Content Lake → eval process boundary, T-06-04).
|
|
14
|
+
*
|
|
15
|
+
* Security constraints:
|
|
16
|
+
* - No `.passthrough()` — schema is closed to prevent PII leakage from
|
|
17
|
+
* card body text into the telemetry shape (T-06-05).
|
|
18
|
+
* - Satisfies clause is load-bearing (T-06-06); no exemption marker.
|
|
19
|
+
*
|
|
20
|
+
* @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
|
|
21
|
+
* @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
|
|
22
|
+
* @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
|
|
23
|
+
*/
|
|
24
|
+
import { z } from "zod";
|
|
25
|
+
/**
 * Card-type identifiers accepted in per-card telemetry — mirrors `CardType`
 * from diagnosis.ts.
 *
 * Kept as an enum (rather than a free-form `z.string()`) so the resulting
 * schema satisfies `z.ZodType<SynthesisPerCardTelemetry>`, which requires
 * `cardType: CardType`.
 */
const CARD_TYPE_VALUES = [
    "area-summary",
    "failure-mode-summary",
    "no-issues",
    "top-recommendations",
    "weakest-area",
    "low-confidence-attribution",
    "doc-attribution-spotlight",
    "regression-vs-baseline",
];
const CardTypeSchema = z.enum(CARD_TYPE_VALUES);
|
|
40
|
+
// Field validators for a single card's telemetry entry. The numeric fields
// are all optional and must be non-negative when present; `latencyMs`,
// `tokenInput` and `tokenOutput` must additionally be integers.
const perCardShape = {
    cardType: CardTypeSchema,
    cost: z.number().nonnegative().optional(),
    parseFailed: z.boolean(),
    latencyMs: z.number().int().nonnegative().optional(),
    tokenInput: z.number().int().nonnegative().optional(),
    tokenOutput: z.number().int().nonnegative().optional(),
    cardVersion: z.string(),
    // ISO 8601 UTC required: timezone offsets are not accepted.
    generatedAt: z.string().datetime({ offset: false }),
};
const SynthesisPerCardSchema = z.object(perCardShape);
|
|
50
|
+
// One entry per card, each validated against the per-card schema.
const perCardEntries = z.array(SynthesisPerCardSchema);
/**
 * Runtime schema for synthesis cost telemetry.
 *
 * - `cost`: non-negative number.
 * - `parseFailureCount`: non-negative integer.
 * - `parseFailureRate`: number between 0 and 1 inclusive.
 * - `perCard`: array of per-card telemetry entries.
 */
export const SynthesisCostTelemetrySchema = z.object({
    cost: z.number().nonnegative(),
    parseFailureCount: z.number().int().nonnegative(),
    parseFailureRate: z.number().min(0).max(1),
    perCard: perCardEntries,
});
|
|
@@ -286,16 +286,21 @@ function mapAssertions(raw) {
|
|
|
286
286
|
.map((c) => ({ id: c.id, text: c.text })),
|
|
287
287
|
template: a.template,
|
|
288
288
|
type: "llm-rubric",
|
|
289
|
-
|
|
289
|
+
// Use `!= null` (loose) so we drop both `undefined` AND `null`.
|
|
290
|
+
// GROQ projects missing scalar fields as `null`, but the domain
|
|
291
|
+
// schema's `z.number().optional()` accepts `T | undefined`, not
|
|
292
|
+
// `T | null` — a strict `!== undefined` check would forward
|
|
293
|
+
// `weight: null` and trigger Zod's "Invalid input" on assertions.
|
|
294
|
+
...(a.weight != null ? { weight: a.weight } : {}),
|
|
290
295
|
};
|
|
291
296
|
}
|
|
292
|
-
// Value-based assertion
|
|
297
|
+
// Value-based assertion — same null-vs-undefined hazard as above.
|
|
293
298
|
const result = { type: a.type };
|
|
294
|
-
if (a.value
|
|
299
|
+
if (a.value != null)
|
|
295
300
|
result.value = a.value;
|
|
296
|
-
if (a.threshold
|
|
301
|
+
if (a.threshold != null)
|
|
297
302
|
result.threshold = a.threshold;
|
|
298
|
-
if (a.weight
|
|
303
|
+
if (a.weight != null)
|
|
299
304
|
result.weight = a.weight;
|
|
300
305
|
return result;
|
|
301
306
|
});
|
|
@@ -1561,6 +1561,13 @@ export declare const RepoConfigSchema: z.ZodObject<{
|
|
|
1561
1561
|
dir: z.ZodOptional<z.ZodString>;
|
|
1562
1562
|
exclude: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
1563
1563
|
}, z.core.$strip>>;
|
|
1564
|
+
summary: z.ZodOptional<z.ZodObject<{
|
|
1565
|
+
onRun: z.ZodOptional<z.ZodEnum<{
|
|
1566
|
+
never: "never";
|
|
1567
|
+
always: "always";
|
|
1568
|
+
auto: "auto";
|
|
1569
|
+
}>>;
|
|
1570
|
+
}, z.core.$strip>>;
|
|
1564
1571
|
taskSource: z.ZodOptional<z.ZodObject<{
|
|
1565
1572
|
type: z.ZodOptional<z.ZodEnum<{
|
|
1566
1573
|
"content-lake": "content-lake";
|