@sanity/ailf 6.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
  2. package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
  3. package/dist/_vendor/ailf-core/ports/context.d.ts +12 -0
  4. package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
  5. package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
  6. package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +59 -0
  7. package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +5 -1
  8. package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +47 -3
  9. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +10 -0
  10. package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +13 -0
  11. package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +17 -1
  12. package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +1 -1
  13. package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +5 -1
  14. package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +5 -1
  15. package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +5 -1
  16. package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +15 -2
  17. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +5 -3
  18. package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +19 -31
  19. package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
  20. package/dist/_vendor/ailf-core/services/index.js +1 -1
  21. package/dist/_vendor/ailf-core/types/diagnosis.d.ts +3 -0
  22. package/dist/_vendor/ailf-core/types/index.d.ts +7 -0
  23. package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
  24. package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
  25. package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
  26. package/dist/adapters/config-sources/file-config-adapter.js +8 -6
  27. package/dist/adapters/llm/index.d.ts +1 -1
  28. package/dist/adapters/llm/index.js +1 -1
  29. package/dist/adapters/llm/openai-llm-client.js +7 -2
  30. package/dist/adapters/llm/retry.d.ts +18 -0
  31. package/dist/adapters/llm/retry.js +21 -0
  32. package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
  33. package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
  34. package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
  35. package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
  36. package/dist/adapters/task-sources/repo-schemas.js +10 -0
  37. package/dist/commands/interpret.d.ts +21 -1
  38. package/dist/commands/interpret.js +13 -4
  39. package/dist/commands/pipeline-action.d.ts +44 -0
  40. package/dist/commands/pipeline-action.js +193 -1
  41. package/dist/commands/run.d.ts +2 -0
  42. package/dist/commands/run.js +2 -0
  43. package/dist/orchestration/pipeline-orchestrator.js +3 -0
  44. package/dist/report-store.d.ts +26 -0
  45. package/dist/report-store.js +63 -0
  46. package/package.json +1 -1
@@ -89,7 +89,9 @@ export const generateWeakestArea = async (report, ctx) => {
89
89
  path: ["confidence", "level"],
90
90
  });
91
91
  const prompt = buildWeakestAreaPrompt(report);
92
- const { value, usage } = await ctx.llm.completeStructured({
92
+ // Destructure `cost` and `model` from the LLMClient return —
93
+ // already provided per llm-client.ts:139-144, previously discarded.
94
+ const { value, usage, cost, model } = await ctx.llm.completeStructured({
93
95
  model: CARD_MODEL,
94
96
  prompt: `${prompt.system}\n\n${prompt.user}`,
95
97
  schema: PerCallSchema,
@@ -109,6 +111,8 @@ export const generateWeakestArea = async (report, ctx) => {
109
111
  cardVersion: "weakest-area@0.1.0",
110
112
  tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
111
113
  generatedAt: new Date().toISOString(),
114
+ cost,
115
+ model,
112
116
  },
113
117
  };
114
118
  };
@@ -131,10 +131,23 @@ export function buildWeakestAreaPrompt(report) {
131
131
  export function buildLowConfidenceAttributionPrompt(report, judgmentAttributions) {
132
132
  // Filter to low-confidence entries (by any attribution in the set)
133
133
  const lowConf = judgmentAttributions.filter((ja) => ja.attributions.some((a) => a.confidence.level === "low"));
134
- // If no low-confidence entries, use all sorted by score ascending (most uncertain first)
134
+ // If no low-confidence entries, use all sorted by score ascending (most
135
+ // uncertain first). Guard against entries with empty `attributions` arrays:
136
+ // `Math.min(...[])` is Infinity, so in this ascending sort empty-attribution
137
+ // entries tie with each other and land after every scored entry. The card
138
+ // schema requires `judgmentRefs.min(1)`, so emitting empty-attribution rows
139
+ // here forces a degraded card downstream. Caller should short-circuit to
140
+ // missing before reaching here, but defend against the seam regressing.
141
+ const validAttrs = judgmentAttributions.filter((ja) => ja.attributions.length > 0);
142
+ if (validAttrs.length === 0) {
143
+ return {
144
+ system: LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT,
145
+ user: "(no attribution data with non-empty attributions)",
146
+ };
147
+ }
135
148
  const source = lowConf.length > 0
136
149
  ? lowConf
137
- : [...judgmentAttributions].sort((a, b) => {
150
+ : [...validAttrs].sort((a, b) => {
138
151
  const aMin = Math.min(...a.attributions.map((x) => x.score));
139
152
  const bMin = Math.min(...b.attributions.map((x) => x.score));
140
153
  return aMin - bMin;
@@ -8,9 +8,11 @@
8
8
  * Mitigations embedded:
9
9
  * - failure-mode #3: confidence inflation on small samples — prompt instructs
10
10
  * to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
11
- * - failure-mode #4: taxonomy drift — full canonical taxonomy enumerated
12
- * verbatim in this prompt so the LLM picks from a known list
11
+ * - failure-mode #4: taxonomy drift — failure-mode lists derived at module
12
+ * load from the canonical const arrays in
13
+ * `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
14
+ * Zod `.refine(buildFailureModeRefinement())` validator always agree.
13
15
  *
14
16
  * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
15
17
  */
16
- export declare const SYSTEM_PROMPT = "You are an AILF evaluation analyst identifying the documentation area most in need of improvement.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n \"summary\": \"<1-2 sentence description of the weakest area and why>\",\n \"area\": \"<feature area name, e.g. 'schema-deploy'>\",\n \"dimension\": \"<MUST be one of the canonical dimensions listed below>\",\n \"failureMode\": \"<MUST be from the canonical taxonomy for the chosen dimension>\",\n \"sampleSize\": <number \u2014 MUST equal the judgmentCount provided for this area>,\n \"confidence\": {\n \"level\": \"high\" | \"medium\" | \"low\",\n \"signalsPresent\": <number of tasks backing this finding>,\n \"derivation\": \"card-type-specific\"\n }\n}\n\n## CANONICAL DIMENSIONS AND FAILURE MODES\n\nYou MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., \"security\" dimension with \"missing-docs\" failure mode is rejected).\n\n### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)\nFailure modes:\n- missing-docs \u2014 relevant doc didn't exist\n- outdated-docs \u2014 doc reflects an older API/version\n- incorrect-docs \u2014 doc states something factually wrong\n- poor-structure \u2014 doc exists but is hard to find or follow\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)\nFailure modes:\n- invalid-tool-call \u2014 model called tool with wrong args\n- missing-required-param \u2014 required parameter omitted\n- extra-param \u2014 unexpected extra parameter sent\n- wrong-tool-selected \u2014 chose wrong tool for task\n- tool-call-order \u2014 tools called in wrong sequence\n- no-tool-call \u2014 should have used a tool but didn't\n- schema-mismatch \u2014 response did not match expected schema\n- unsafe-operation \u2014 operation could 
cause data loss\n- auth-bypass \u2014 security check skipped\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)\nFailure modes:\n- factual-error \u2014 stated an incorrect fact\n- out-of-date \u2014 used deprecated API or old syntax\n- missing-step \u2014 omitted a required step\n- hallucinated-api \u2014 invented an API that does not exist\n- wrong-version \u2014 used v1 API when v2 was required\n- incomplete-coverage \u2014 missed important edge case\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)\nFailure modes:\n- excessive-loops \u2014 agent looped unnecessarily\n- premature-stop \u2014 stopped before completing the task\n- incorrect-output \u2014 output was wrong or incomplete\n- inefficient-path \u2014 completed task but via unnecessary steps\n- assertion-failure \u2014 failed a structural assertion check\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n## Confidence Calibration Rules\n\n**CRITICAL:** When sampleSize < 10, you MUST set confidence.level = \"low\".\n\n- sampleSize >= 30 \u2192 \"high\" is appropriate\n- sampleSize >= 10 \u2192 \"medium\" is appropriate\n- sampleSize < 10 \u2192 MUST use \"low\" (small-sample hedge required)\n\nIn your summary, reflect the confidence level: if \"low\", include language like \"small sample (N=X) \u2014 re-run with broader dataset before acting\".";
18
+ export declare const SYSTEM_PROMPT: string;
@@ -8,11 +8,19 @@
8
8
  * Mitigations embedded:
9
9
  * - failure-mode #3: confidence inflation on small samples — prompt instructs
10
10
  * to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
11
- * - failure-mode #4: taxonomy drift — full canonical taxonomy enumerated
12
- * verbatim in this prompt so the LLM picks from a known list
11
+ * - failure-mode #4: taxonomy drift — failure-mode lists derived at module
12
+ * load from the canonical const arrays in
13
+ * `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
14
+ * Zod `.refine(buildFailureModeRefinement())` validator always agree.
13
15
  *
14
16
  * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
15
17
  */
18
+ import { AGENT_FAILURE_MODES, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, } from "../../../grader/failure-modes/index.js";
19
+ const literacyList = LITERACY_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
20
+ const mcpList = MCP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
21
+ const kpList = KP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
22
+ const agentList = AGENT_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
23
+ const commonList = COMMON_FAILURE_MODES.join(", ");
16
24
  export const SYSTEM_PROMPT = `You are an AILF evaluation analyst identifying the documentation area most in need of improvement.
17
25
 
18
26
  ## Your Output
@@ -33,47 +41,27 @@ Return a JSON object matching this exact shape:
33
41
 
34
42
  ## CANONICAL DIMENSIONS AND FAILURE MODES
35
43
 
36
- You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with "missing-docs" failure mode is rejected).
44
+ You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with a literacy-only failure mode is rejected). The lists below are derived at build time from \`packages/core/src/grader/failure-modes/*.ts\` — the Zod validator on the card schema enforces the same taxonomy.
37
45
 
38
46
  ### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)
39
47
  Failure modes:
40
- - missing-docs — relevant doc didn't exist
41
- - outdated-docs — doc reflects an older API/version
42
- - incorrect-docs — doc states something factually wrong
43
- - poor-structure — doc exists but is hard to find or follow
44
- Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
48
+ ${literacyList}
49
+ Plus cross-cutting: ${commonList}
45
50
 
46
51
  ### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)
47
52
  Failure modes:
48
- - invalid-tool-call — model called tool with wrong args
49
- - missing-required-param — required parameter omitted
50
- - extra-param — unexpected extra parameter sent
51
- - wrong-tool-selected — chose wrong tool for task
52
- - tool-call-order — tools called in wrong sequence
53
- - no-tool-call — should have used a tool but didn't
54
- - schema-mismatch — response did not match expected schema
55
- - unsafe-operation — operation could cause data loss
56
- - auth-bypass — security check skipped
57
- Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
53
+ ${mcpList}
54
+ Plus cross-cutting: ${commonList}
58
55
 
59
56
  ### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)
60
57
  Failure modes:
61
- - factual-error — stated an incorrect fact
62
- - out-of-date — used deprecated API or old syntax
63
- - missing-step — omitted a required step
64
- - hallucinated-api — invented an API that does not exist
65
- - wrong-version — used v1 API when v2 was required
66
- - incomplete-coverage — missed important edge case
67
- Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
58
+ ${kpList}
59
+ Plus cross-cutting: ${commonList}
68
60
 
69
61
  ### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)
70
62
  Failure modes:
71
- - excessive-loops — agent looped unnecessarily
72
- - premature-stop — stopped before completing the task
73
- - incorrect-output — output was wrong or incomplete
74
- - inefficient-path — completed task but via unnecessary steps
75
- - assertion-failure — failed a structural assertion check
76
- Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
63
+ ${agentList}
64
+ Plus cross-cutting: ${commonList}
77
65
 
78
66
  ## Confidence Calibration Rules
79
67
 
@@ -17,5 +17,5 @@ export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardR
17
17
  export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
18
18
  export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
19
19
  export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
20
- export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
20
+ export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
21
21
  export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
@@ -29,5 +29,5 @@ export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from ".
29
29
  // ---------------------------------------------------------------------------
30
30
  // Phase 5 Plan 05 — card generators barrel + prompt builders
31
31
  // ---------------------------------------------------------------------------
32
- export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
32
+ export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
33
33
  export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";
@@ -19,6 +19,7 @@
19
19
  import type { Confidence } from "./confidence.js";
20
20
  import type { RunId } from "./branded-ids.js";
21
21
  import type { ReportId } from "./index.js";
22
+ import type { ModelId } from "../ports/llm-client.js";
22
23
  /**
23
24
  * The four-version cache envelope. Every cached `Diagnosis` carries the
24
25
  * versions of the inputs that produced it; any bump in any segment
@@ -54,6 +55,8 @@ export interface CardMeta {
54
55
  latencyMs?: number;
55
56
  /** ISO 8601 UTC timestamp. */
56
57
  generatedAt: string;
58
+ cost?: number;
59
+ model?: ModelId;
57
60
  }
58
61
  /**
59
62
  * A single actionable suggestion surfaced by a recommendations card.
@@ -37,6 +37,7 @@ export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, E
37
37
  export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
38
38
  export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
39
39
  export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
40
+ export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
40
41
  export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
41
42
  export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
42
43
  export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
@@ -754,6 +755,12 @@ export interface PipelineResult {
754
755
  promptfooUrls?: PromptfooUrlEntry[];
755
756
  /** Results per step */
756
757
  steps: Record<string, StepResult>;
758
+ /** Report ID produced by PublishReportStep (when publish was enabled). Used by
759
+ * post-run hooks (e.g. runPostPipelineHooks) to target diagnosis and telemetry
760
+ * writeback at the correct Content Lake document. Absent when publish was
761
+ * skipped or the publish step did not produce a report. (Phase 6 / DIAG-06)
762
+ */
763
+ reportId?: string;
757
764
  /** Overall success (all non-skipped steps succeeded) */
758
765
  success: boolean;
759
766
  /** Summary of test execution outcomes. */
@@ -106,6 +106,21 @@ export interface RepoTriggersConfig {
106
106
  "pr-task-change"?: TriggerConfig;
107
107
  schedule?: ScheduleTriggerConfig;
108
108
  }
109
+ /**
110
+ * Post-run diagnosis summary policy (Phase 6 / DIAG-06).
111
+ * Controls whether `ailf run` auto-fires the in-process diagnosis runner
112
+ * at the end of a published pipeline. Precedence is resolved at the CLI
113
+ * layer — see `shouldRunPostSummary()` in `pipeline-action.ts`.
114
+ */
115
+ export interface RepoSummaryConfig {
116
+ /**
117
+ * - `"auto"` — fire only when `process.stdout.isTTY === true` AND
118
+ * `CI !== "true"`.
119
+ * - `"always"` — fire unconditionally (bypasses TTY check).
120
+ * - `"never"` — never fire.
121
+ */
122
+ onRun?: "auto" | "always" | "never";
123
+ }
109
124
  /**
110
125
  * Parsed shape of `.ailf/config.yaml`.
111
126
  *
@@ -124,6 +139,7 @@ export interface RepoConfig {
124
139
  publish?: RepoPublishConfig;
125
140
  reportStore?: RepoReportStoreConfig;
126
141
  source?: RepoSourceConfig;
142
+ summary?: RepoSummaryConfig;
127
143
  taskSource?: RepoTaskSourceConfig;
128
144
  triggers?: RepoTriggersConfig;
129
145
  }
@@ -0,0 +1,101 @@
1
+ /**
2
+ * Synthesis cost telemetry types — canonical TS-first shapes for
3
+ * Phase 6 DIAG-06 cost and parse-failure observability.
4
+ *
5
+ * These interfaces are authored independently of their Zod adapter schema
6
+ * (Plan 06-02) per D0045: the Zod schema declares
7
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
8
+ * type so drift is a build error, not a runtime bug.
9
+ *
10
+ * The 12 attribute paths (4 top-level + 8 per-card) on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
11
+ * land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
12
+ * (D6-09). No new sibling doc type (D0033 / D6-09).
13
+ *
14
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
15
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
16
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
17
+ */
18
+ import type { CardType } from "./diagnosis.js";
19
+ /**
20
+ * Per-card telemetry row for the `synthesis_per_card` Airbyte stream
21
+ * (D6-11) and the `summary.synthesis.diagnosis.perCard[]` Sanity doc path
22
+ * (D6-09).
23
+ *
24
+ * Fields map directly to the 8 per-card attribute paths in D6-09:
25
+ * `…perCard[].cardType`, `…perCard[].cost`, `…perCard[].parseFailed`,
26
+ * `…perCard[].latencyMs`, `…perCard[].tokenInput`, `…perCard[].tokenOutput`,
27
+ * `…perCard[].cardVersion`, `…perCard[].generatedAt`.
28
+ *
29
+ * `cost` is undefined when the card did not make an LLM call (deterministic
30
+ * cards) and contributes 0 to the roll-up.
31
+ */
32
+ export interface SynthesisPerCardTelemetry {
33
+ /** Card archetype — reuses `CardType` from diagnosis.ts:55-63; not redeclared. */
34
+ cardType: CardType;
35
+ /**
36
+ * Per-call USD cost captured from `LLMStructuredCompletion.cost`.
37
+ * `undefined` for deterministic cards (area-summary, failure-mode-summary,
38
+ * no-issues) which make no LLM call.
39
+ */
40
+ cost?: number;
41
+ /**
42
+ * Whether the card's Zod schema parse failed (produces a degraded card).
43
+ * Used for the 7-day rolling parse-failure rate in BigQuery (D6-15).
44
+ */
45
+ parseFailed: boolean;
46
+ /**
47
+ * End-to-end latency for the LLM call in milliseconds.
48
+ * `undefined` for deterministic cards.
49
+ */
50
+ latencyMs?: number;
51
+ /**
52
+ * Prompt tokens consumed by the LLM call.
53
+ * `undefined` for deterministic cards.
54
+ */
55
+ tokenInput?: number;
56
+ /**
57
+ * Completion tokens produced by the LLM call.
58
+ * `undefined` for deterministic cards.
59
+ */
60
+ tokenOutput?: number;
61
+ /** Per-card version string (e.g. `"top-recommendations@0.1.0"`). */
62
+ cardVersion: string;
63
+ /** ISO 8601 UTC timestamp when this card was generated. */
64
+ generatedAt: string;
65
+ }
66
+ /**
67
+ * Aggregate synthesis cost telemetry for a single Diagnosis run.
68
+ * Lands on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
69
+ * (D6-09 / D6-10: parallel to `summary.overall.cost` — not additive).
70
+ *
71
+ * Written by the post-run hook (D6-08); not written by standalone
72
+ * `ailf interpret`.
73
+ *
74
+ * Field set matches the 4 top-level D6-09 attribute paths:
75
+ * `summary.synthesis.diagnosis.cost`,
76
+ * `summary.synthesis.diagnosis.parseFailureCount`,
77
+ * `summary.synthesis.diagnosis.parseFailureRate`,
78
+ * `summary.synthesis.diagnosis.perCard`.
79
+ */
80
+ export interface SynthesisCostTelemetry {
81
+ /**
82
+ * Total USD cost across all LLM cards in this Diagnosis run.
83
+ * Roll-up: `sum(perCard[].cost ?? 0)` for ready + degraded cards.
84
+ * Missing cards contribute 0.
85
+ */
86
+ cost: number;
87
+ /**
88
+ * Number of cards whose Zod parse failed in this Diagnosis run.
89
+ * Counted across all 8 card types (including deterministic cards;
90
+ * a deterministic-card parse failure indicates a code bug).
91
+ */
92
+ parseFailureCount: number;
93
+ /**
94
+ * Parse-failure rate: `parseFailureCount / 8` (8 = fixed card registry size).
95
+ * Range: 0–1. Used as the denominator for the D6-15 BigQuery 7-day
96
+ * rolling rate view (`synthesis_parse_failure_rate_7d.sql`).
97
+ */
98
+ parseFailureRate: number;
99
+ /** Per-card telemetry rows — one entry per card in registry-order. */
100
+ perCard: SynthesisPerCardTelemetry[];
101
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Synthesis cost telemetry types — canonical TS-first shapes for
3
+ * Phase 6 DIAG-06 cost and parse-failure observability.
4
+ *
5
+ * These interfaces are authored independently of their Zod adapter schema
6
+ * (Plan 06-02) per D0045: the Zod schema declares
7
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
8
+ * type so drift is a build error, not a runtime bug.
9
+ *
10
+ * The 12 attribute paths (4 top-level + 8 per-card) on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
11
+ * land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
12
+ * (D6-09). No new sibling doc type (D0033 / D6-09).
13
+ *
14
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
15
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
16
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
17
+ */
18
+ export {};
@@ -115,12 +115,10 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
115
115
  compareBaseline: config.compareBaseline,
116
116
  gapAnalysisEnabled: config.execution?.gapAnalysis ?? true,
117
117
  // W0077 Phase 4 — `publish` is now a policy object. Map the auto value
118
- // directly to a boolean for the file-config path; the runtime
119
- // smart-default logic in pipeline-action.ts isn't relevant here because
120
- // the user has explicitly handed us a config file.
121
- publishEnabled: config.publish?.auto === "never"
122
- ? false
123
- : config.publish?.auto !== undefined,
118
+ // to a boolean for the file-config path. Absence of publish.auto mirrors
119
+ // the CLI's "full-runs" default (enable publish; composition root gates on
120
+ // token availability). Only "never" explicitly disables auto-publish.
121
+ publishEnabled: config.publish?.auto !== "never",
124
122
  publishTag: config.publish?.tag,
125
123
  noCache: config.noCache ?? false,
126
124
  noRemoteCache: config.noRemoteCache ?? false,
@@ -150,5 +148,9 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
150
148
  ? resolve(rootDir, config.taskSource.repoTasksPath)
151
149
  : undefined,
152
150
  presets: config.presets,
151
+ // Phase 6 / DIAG-06 — thread summary.onRun into ResolvedConfig so the
152
+ // file-config exit branch in executePipeline can pass it to
153
+ // runPostPipelineHooks.
154
+ summaryOnRun: config.summary?.onRun,
153
155
  };
154
156
  }
@@ -5,5 +5,5 @@ export type { FakeCallRecord, FakeCompletionResponse, FakeStructuredResponse, }
5
5
  export { OpenAILLMClient } from "./openai-llm-client.js";
6
6
  export type { OpenAILLMClientOptions } from "./openai-llm-client.js";
7
7
  export type { ModelPricing } from "./pricing.js";
8
- export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
8
+ export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
9
9
  export type { RetryPolicy } from "./retry.js";
@@ -1,4 +1,4 @@
1
1
  export { AnthropicLLMClient } from "./anthropic-llm-client.js";
2
2
  export { FakeLLMClient } from "./fake-llm-client.js";
3
3
  export { OpenAILLMClient } from "./openai-llm-client.js";
4
- export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
4
+ export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
@@ -12,7 +12,7 @@
12
12
  */
13
13
  import { z } from "zod";
14
14
  import { OpenAIChatResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
15
- import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
15
+ import { DEFAULT_RETRY_POLICY, LLMParseError, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
16
16
  const DEFAULT_BASE_URL = "https://api.openai.com/v1/chat/completions";
17
17
  /**
18
18
  * Conservative defaults for the models in `packages/eval/config/models.ts`.
@@ -98,7 +98,12 @@ export class OpenAILLMClient {
98
98
  parsed = JSON.parse(raw);
99
99
  }
100
100
  catch (err) {
101
- throw new Error(`OpenAI structured completion returned invalid JSON for model ${args.model}: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
101
+ // Sanitize: SyntaxError.message embeds a snippet at the failure offset,
102
+ // which can leak prompt text or user content echoed back by the model.
103
+ // Keep the raw body on the instance for callers that opt in via .raw,
104
+ // mirroring the LLMHttpError pattern (verified by the "does not leak
105
+ // the response body" test in openai-llm-client.test.ts).
106
+ throw new LLMParseError(`OpenAI structured completion returned invalid JSON for model ${args.model}`, raw, { cause: err });
102
107
  }
103
108
  // strict:true guarantees a valid-against-the-schema JSON document, but
104
109
  // the Zod parse is still load-bearing — it brands the result as T and is
@@ -33,6 +33,24 @@ export declare class LLMHttpError extends Error {
33
33
  readonly body: string;
34
34
  constructor(status: number, body: string, attempts: number);
35
35
  }
36
+ /**
37
+ * Sanitized error raised when an LLM adapter receives an HTTP-200 response
38
+ * whose body is not valid JSON. The raw response body (which may echo back
39
+ * user prompt content or even API-key fragments from prompts) is kept on the
40
+ * instance for callers that opt in via `.raw`, NOT in the message string.
41
+ *
42
+ * Mirrors the LLMHttpError pattern verified by the
43
+ * "does not leak the response body" test in openai-llm-client.test.ts.
44
+ */
45
+ export declare class LLMParseError extends Error {
46
+ /** Full raw response body (kept on the instance, NOT in `message`). */
47
+ readonly raw: string;
48
+ /** Length of `raw` in UTF-16 code units (`raw.length`, not a byte count) — safe to include in the message. */
49
+ readonly rawLength: number;
50
+ constructor(message: string, raw: string, options?: {
51
+ cause?: unknown;
52
+ });
53
+ }
36
54
  export declare function isRetryableStatus(status: number): boolean;
37
55
  export interface RunWithRetryArgs<T> {
38
56
  policy: RetryPolicy;
@@ -29,6 +29,27 @@ export class LLMHttpError extends Error {
29
29
  this.body = body;
30
30
  }
31
31
  }
32
+ /**
33
+ * Sanitized error raised when an LLM adapter receives an HTTP-200 response
34
+ * whose body is not valid JSON. The raw response body (which may echo back
35
+ * user prompt content or even API-key fragments from prompts) is kept on the
36
+ * instance for callers that opt in via `.raw`, NOT in the message string.
37
+ *
38
+ * Mirrors the LLMHttpError pattern verified by the
39
+ * "does not leak the response body" test in openai-llm-client.test.ts.
40
+ */
41
+ export class LLMParseError extends Error {
42
+ /** Full raw response body (kept on the instance, NOT in `message`). */
43
+ raw;
44
+ /** `raw.length`, i.e. the length of `raw` in UTF-16 code units (equal to the byte count only for ASCII bodies) — safe to include in the message. */
45
+ rawLength;
46
+ constructor(message, raw, options) {
47
+ super(`${message} (raw=${raw.length}B)`, options); // only the length of `raw` reaches the message, never its content; NOTE: `raw.length` counts UTF-16 code units, so the `B` suffix is exact only for ASCII bodies
48
+ this.name = "LLMParseError";
49
+ this.raw = raw;
50
+ this.rawLength = raw.length;
51
+ }
52
+ }
32
53
  export function isRetryableStatus(status) {
33
54
  return status === 429 || (status >= 500 && status < 600);
34
55
  }
@@ -0,0 +1,49 @@
1
+ /**
2
+ * Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
3
+ *
4
+ * This schema sits at `packages/eval/src/adapters/**` and is therefore
5
+ * scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
6
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
7
+ * drift a build error, not a runtime bug.
8
+ *
9
+ * Used by:
10
+ * - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
11
+ * writing to Sanity (process memory → Sanity write boundary, T-06-04).
12
+ * - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
13
+ * (Sanity Content Lake → eval process boundary, T-06-04).
14
+ *
15
+ * Security constraints:
16
+ * - No `.passthrough()` — unknown keys are stripped on parse (Zod's default `$strip` mode), to prevent PII leakage from
17
+ * card body text into the telemetry shape (T-06-05).
18
+ * - Satisfies clause is load-bearing (T-06-06); no exemption marker.
19
+ *
20
+ * @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
21
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
22
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
23
+ */
24
+ import { z } from "zod";
25
+ export declare const SynthesisCostTelemetrySchema: z.ZodObject<{ // declared type of the runtime schema: `cost`, parse-failure count/rate, and a `perCard` array
26
+ cost: z.ZodNumber;
27
+ parseFailureCount: z.ZodNumber;
28
+ parseFailureRate: z.ZodNumber;
29
+ perCard: z.ZodArray<z.ZodObject<{
30
+ cardType: z.ZodEnum<{
31
+ "area-summary": "area-summary";
32
+ "failure-mode-summary": "failure-mode-summary";
33
+ "no-issues": "no-issues";
34
+ "top-recommendations": "top-recommendations";
35
+ "weakest-area": "weakest-area";
36
+ "low-confidence-attribution": "low-confidence-attribution";
37
+ "doc-attribution-spotlight": "doc-attribution-spotlight";
38
+ "regression-vs-baseline": "regression-vs-baseline";
39
+ }>;
40
+ cost: z.ZodOptional<z.ZodNumber>; // per-card cost is optional, unlike the required top-level `cost`
41
+ parseFailed: z.ZodBoolean;
42
+ latencyMs: z.ZodOptional<z.ZodNumber>;
43
+ tokenInput: z.ZodOptional<z.ZodNumber>;
44
+ tokenOutput: z.ZodOptional<z.ZodNumber>;
45
+ cardVersion: z.ZodString;
46
+ generatedAt: z.ZodString;
47
+ }, z.core.$strip>>;
48
+ }, z.core.$strip>; // `$strip` is Zod's default object mode: unknown keys are dropped on parse rather than rejected (per Zod docs)
49
+ export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "../../_vendor/ailf-core/index.d.ts";
@@ -0,0 +1,55 @@
1
+ /**
2
+ * Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
3
+ *
4
+ * This schema sits at `packages/eval/src/adapters/**` and is therefore
5
+ * scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
6
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
7
+ * drift a build error, not a runtime bug.
8
+ *
9
+ * Used by:
10
+ * - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
11
+ * writing to Sanity (process memory → Sanity write boundary, T-06-04).
12
+ * - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
13
+ * (Sanity Content Lake → eval process boundary, T-06-04).
14
+ *
15
+ * Security constraints:
16
+ * - No `.passthrough()` — unknown keys are stripped on parse (Zod's default for `z.object`), to prevent PII leakage from
17
+ * card body text into the telemetry shape (T-06-05).
18
+ * - Satisfies clause is load-bearing (T-06-06); no exemption marker.
19
+ *
20
+ * @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
21
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
22
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
23
+ */
24
+ import { z } from "zod";
25
+ /**
26
+ * Enum of all valid card types — mirrors `CardType` from diagnosis.ts.
27
+ * Using `z.enum()` (not `z.string()`) so the schema satisfies
28
+ * `z.ZodType<SynthesisPerCardTelemetry>` (which requires `cardType: CardType`).
29
+ */
30
+ const CardTypeSchema = z.enum([ // not exported here; consumed as the `cardType` field of the per-card schema
31
+ "area-summary",
32
+ "failure-mode-summary",
33
+ "no-issues",
34
+ "top-recommendations",
35
+ "weakest-area",
36
+ "low-confidence-attribution",
37
+ "doc-attribution-spotlight",
38
+ "regression-vs-baseline",
39
+ ]); // NOTE(review): this list is maintained by hand — confirm it stays in step with `CardType` when card types are added
40
+ const SynthesisPerCardSchema = z.object({ // telemetry entry for a single card; element schema of the `perCard` array
41
+ cardType: CardTypeSchema,
42
+ cost: z.number().nonnegative().optional(), // may be omitted; when present it must be >= 0
43
+ parseFailed: z.boolean(),
44
+ latencyMs: z.number().int().nonnegative().optional(), // optional; integer >= 0 when present
45
+ tokenInput: z.number().int().nonnegative().optional(), // optional; integer >= 0 when present
46
+ tokenOutput: z.number().int().nonnegative().optional(), // optional; integer >= 0 when present
47
+ cardVersion: z.string(), // any string is accepted — no version format is enforced here
48
+ generatedAt: z.string().datetime({ offset: false }), // ISO 8601 UTC required: `offset: false` rejects numeric timezone offsets, leaving only the `Z` form
49
+ });
50
+ export const SynthesisCostTelemetrySchema = z.object({ // top-level telemetry object: `cost`, parse-failure count/rate, and the per-card entries
51
+ cost: z.number().nonnegative(), // required here, unlike the optional per-card `cost`
52
+ parseFailureCount: z.number().int().nonnegative(),
53
+ parseFailureRate: z.number().min(0).max(1), // fraction in [0, 1]; not cross-checked against `parseFailureCount` or `perCard` by this schema
54
+ perCard: z.array(SynthesisPerCardSchema), // an empty array is accepted — no minimum length is set
55
+ });
@@ -286,16 +286,21 @@ function mapAssertions(raw) {
286
286
  .map((c) => ({ id: c.id, text: c.text })),
287
287
  template: a.template,
288
288
  type: "llm-rubric",
289
- ...(a.weight !== undefined ? { weight: a.weight } : {}),
289
+ // Use `!= null` (loose) so we drop both `undefined` AND `null`.
290
+ // GROQ projects missing scalar fields as `null`, but the domain
291
+ // schema's `z.number().optional()` accepts `T | undefined`, not
292
+ // `T | null` — a strict `!== undefined` check would forward
293
+ // `weight: null` and trigger Zod's "Invalid input" on assertions.
294
+ ...(a.weight != null ? { weight: a.weight } : {}),
290
295
  };
291
296
  }
292
- // Value-based assertion
297
+ // Value-based assertion — same null-vs-undefined hazard as above.
293
298
  const result = { type: a.type };
294
- if (a.value !== undefined)
299
+ if (a.value != null)
295
300
  result.value = a.value;
296
- if (a.threshold !== undefined)
301
+ if (a.threshold != null)
297
302
  result.threshold = a.threshold;
298
- if (a.weight !== undefined)
303
+ if (a.weight != null)
299
304
  result.weight = a.weight;
300
305
  return result;
301
306
  });
@@ -1561,6 +1561,13 @@ export declare const RepoConfigSchema: z.ZodObject<{
1561
1561
  dir: z.ZodOptional<z.ZodString>;
1562
1562
  exclude: z.ZodOptional<z.ZodArray<z.ZodString>>;
1563
1563
  }, z.core.$strip>>;
1564
+ summary: z.ZodOptional<z.ZodObject<{
1565
+ onRun: z.ZodOptional<z.ZodEnum<{
1566
+ never: "never";
1567
+ always: "always";
1568
+ auto: "auto";
1569
+ }>>;
1570
+ }, z.core.$strip>>;
1564
1571
  taskSource: z.ZodOptional<z.ZodObject<{
1565
1572
  type: z.ZodOptional<z.ZodEnum<{
1566
1573
  "content-lake": "content-lake";