npm - @sanity/ailf - Versions diffs - 6.0.0 → 6.1.0 - Mend

@sanity/ailf 6.0.0 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js CHANGED Viewed

@@ -89,7 +89,9 @@ export const generateWeakestArea = async (report, ctx) => {
         path: ["confidence", "level"],
     });
     const prompt = buildWeakestAreaPrompt(report);
-    const { value, usage } = await ctx.llm.completeStructured({
+    // Destructure `cost` and `model` from the LLMClient return —
+    // already provided per llm-client.ts:139-144, previously discarded.
+    const { value, usage, cost, model } = await ctx.llm.completeStructured({
         model: CARD_MODEL,
         prompt: `${prompt.system}\n\n${prompt.user}`,
         schema: PerCallSchema,
@@ -109,6 +111,8 @@ export const generateWeakestArea = async (report, ctx) => {
             cardVersion: "weakest-area@0.1.0",
             tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
             generatedAt: new Date().toISOString(),
+            cost,
+            model,
         },
     };
 };

package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js CHANGED Viewed

@@ -131,10 +131,23 @@ export function buildWeakestAreaPrompt(report) {
 export function buildLowConfidenceAttributionPrompt(report, judgmentAttributions) {
     // Filter to low-confidence entries (by any attribution in the set)
     const lowConf = judgmentAttributions.filter((ja) => ja.attributions.some((a) => a.confidence.level === "low"));
-    // If no low-confidence entries, use all sorted by score ascending (most uncertain first)
+    // If no low-confidence entries, use all sorted by score ascending (most
+    // uncertain first). Guard against entries with empty `attributions` arrays:
+    // `Math.min(...[])` is Infinity, so the ascending comparator returns NaN or
+    // ±Infinity and cannot rank empty-attribution entries by score. The card
+    // schema requires `judgmentRefs.min(1)`, so emitting empty-attribution rows
+    // here forces a degraded card downstream. Caller should short-circuit to
+    // missing before reaching here, but defend against the seam regressing.
+    const validAttrs = judgmentAttributions.filter((ja) => ja.attributions.length > 0);
+    if (validAttrs.length === 0) {
+        return {
+            system: LOW_CONFIDENCE_ATTRIBUTION_SYSTEM_PROMPT,
+            user: "(no attribution data with non-empty attributions)",
+        };
+    }
     const source = lowConf.length > 0
         ? lowConf
-        : [...judgmentAttributions].sort((a, b) => {
+        : [...validAttrs].sort((a, b) => {
             const aMin = Math.min(...a.attributions.map((x) => x.score));
             const bMin = Math.min(...b.attributions.map((x) => x.score));
             return aMin - bMin;

package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts CHANGED Viewed

@@ -8,9 +8,11 @@
  * Mitigations embedded:
  * - failure-mode #3: confidence inflation on small samples — prompt instructs
  *   to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
- * - failure-mode #4: taxonomy drift — full canonical taxonomy enumerated
- *   verbatim in this prompt so the LLM picks from a known list
+ * - failure-mode #4: taxonomy drift — failure-mode lists derived at build
+ *   time from the canonical const arrays in
+ *   `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
+ *   Zod `.refine(buildFailureModeRefinement())` validator always agree.
  *
  * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
  */
-export declare const SYSTEM_PROMPT = "You are an AILF evaluation analyst identifying the documentation area most in need of improvement.\n\n## Your Output\n\nReturn a JSON object matching this exact shape:\n{\n  \"summary\": \"<1-2 sentence description of the weakest area and why>\",\n  \"area\": \"<feature area name, e.g. 'schema-deploy'>\",\n  \"dimension\": \"<MUST be one of the canonical dimensions listed below>\",\n  \"failureMode\": \"<MUST be from the canonical taxonomy for the chosen dimension>\",\n  \"sampleSize\": <number \u2014 MUST equal the judgmentCount provided for this area>,\n  \"confidence\": {\n    \"level\": \"high\" | \"medium\" | \"low\",\n    \"signalsPresent\": <number of tasks backing this finding>,\n    \"derivation\": \"card-type-specific\"\n  }\n}\n\n## CANONICAL DIMENSIONS AND FAILURE MODES\n\nYou MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., \"security\" dimension with \"missing-docs\" failure mode is rejected).\n\n### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)\nFailure modes:\n- missing-docs \u2014 relevant doc didn't exist\n- outdated-docs \u2014 doc reflects an older API/version\n- incorrect-docs \u2014 doc states something factually wrong\n- poor-structure \u2014 doc exists but is hard to find or follow\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)\nFailure modes:\n- invalid-tool-call \u2014 model called tool with wrong args\n- missing-required-param \u2014 required parameter omitted\n- extra-param \u2014 unexpected extra parameter sent\n- wrong-tool-selected \u2014 chose wrong tool for task\n- tool-call-order \u2014 tools called in wrong sequence\n- no-tool-call \u2014 should have used a tool but didn't\n- schema-mismatch \u2014 response did not match expected schema\n- unsafe-operation \u2014 
operation could cause data loss\n- auth-bypass \u2014 security check skipped\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)\nFailure modes:\n- factual-error \u2014 stated an incorrect fact\n- out-of-date \u2014 used deprecated API or old syntax\n- missing-step \u2014 omitted a required step\n- hallucinated-api \u2014 invented an API that does not exist\n- wrong-version \u2014 used v1 API when v2 was required\n- incomplete-coverage \u2014 missed important edge case\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)\nFailure modes:\n- excessive-loops \u2014 agent looped unnecessarily\n- premature-stop \u2014 stopped before completing the task\n- incorrect-output \u2014 output was wrong or incomplete\n- inefficient-path \u2014 completed task but via unnecessary steps\n- assertion-failure \u2014 failed a structural assertion check\nPlus cross-cutting: api-error, model-limitation, false-floor, unclassified\n\n## Confidence Calibration Rules\n\n**CRITICAL:** When sampleSize < 10, you MUST set confidence.level = \"low\".\n\n- sampleSize >= 30 \u2192 \"high\" is appropriate\n- sampleSize >= 10 \u2192 \"medium\" is appropriate\n- sampleSize < 10 \u2192 MUST use \"low\" (small-sample hedge required)\n\nIn your summary, reflect the confidence level: if \"low\", include language like \"small sample (N=X) \u2014 re-run with broader dataset before acting\".";
+export declare const SYSTEM_PROMPT: string;

package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js CHANGED Viewed

@@ -8,11 +8,19 @@
  * Mitigations embedded:
  * - failure-mode #3: confidence inflation on small samples — prompt instructs
  *   to hedge when sampleSize < 10; Zod W3 refine enforces at parse time
- * - failure-mode #4: taxonomy drift — full canonical taxonomy enumerated
- *   verbatim in this prompt so the LLM picks from a known list
+ * - failure-mode #4: taxonomy drift — failure-mode lists derived at build
+ *   time from the canonical const arrays in
+ *   `packages/core/src/grader/failure-modes/*.ts`, so the prompt and the
+ *   Zod `.refine(buildFailureModeRefinement())` validator always agree.
  *
  * @see .planning/phases/05-diagnosis-engine-cli-llm-cards/05-AI-SPEC.md §4b
  */
+import { AGENT_FAILURE_MODES, COMMON_FAILURE_MODES, KP_FAILURE_MODES, LITERACY_FAILURE_MODES, MCP_FAILURE_MODES, } from "../../../grader/failure-modes/index.js";
+const literacyList = LITERACY_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
+const mcpList = MCP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
+const kpList = KP_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
+const agentList = AGENT_FAILURE_MODES.map((m) => `- ${m}`).join("\n");
+const commonList = COMMON_FAILURE_MODES.join(", ");
 export const SYSTEM_PROMPT = `You are an AILF evaluation analyst identifying the documentation area most in need of improvement.
 ## Your Output
@@ -33,47 +41,27 @@ Return a JSON object matching this exact shape:
 ## CANONICAL DIMENSIONS AND FAILURE MODES
-You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with "missing-docs" failure mode is rejected).
+You MUST pick dimension and failureMode from this exact taxonomy. Cross-dimension combinations are invalid (e.g., "security" dimension with a literacy-only failure mode is rejected). The lists below are derived at build time from \`packages/core/src/grader/failure-modes/*.ts\` — the Zod validator on the card schema enforces the same taxonomy.
 ### Literacy family (dimensions: task-completion, code-correctness, doc-coverage)
 Failure modes:
-- missing-docs — relevant doc didn't exist
-- outdated-docs — doc reflects an older API/version
-- incorrect-docs — doc states something factually wrong
-- poor-structure — doc exists but is hard to find or follow
-Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
+${literacyList}
+Plus cross-cutting: ${commonList}
 ### MCP family (dimensions: mcp-behavior, input-validation, output-correctness, error-handling, security)
 Failure modes:
-- invalid-tool-call — model called tool with wrong args
-- missing-required-param — required parameter omitted
-- extra-param — unexpected extra parameter sent
-- wrong-tool-selected — chose wrong tool for task
-- tool-call-order — tools called in wrong sequence
-- no-tool-call — should have used a tool but didn't
-- schema-mismatch — response did not match expected schema
-- unsafe-operation — operation could cause data loss
-- auth-bypass — security check skipped
-Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
+${mcpList}
+Plus cross-cutting: ${commonList}
 ### Knowledge-probe family (dimensions: knowledge-probe, factual-correctness, completeness, currency)
 Failure modes:
-- factual-error — stated an incorrect fact
-- out-of-date — used deprecated API or old syntax
-- missing-step — omitted a required step
-- hallucinated-api — invented an API that does not exist
-- wrong-version — used v1 API when v2 was required
-- incomplete-coverage — missed important edge case
-Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
+${kpList}
+Plus cross-cutting: ${commonList}
 ### Agent-harness family (dimensions: agent-harness, process-quality, agent-output, tool-usage)
 Failure modes:
-- excessive-loops — agent looped unnecessarily
-- premature-stop — stopped before completing the task
-- incorrect-output — output was wrong or incomplete
-- inefficient-path — completed task but via unnecessary steps
-- assertion-failure — failed a structural assertion check
-Plus cross-cutting: api-error, model-limitation, false-floor, unclassified
+${agentList}
+Plus cross-cutting: ${commonList}
 ## Confidence Calibration Rules

package/dist/_vendor/ailf-core/services/index.d.ts CHANGED Viewed

@@ -17,5 +17,5 @@ export { createDiagnosisRunner, diagnosisVersion, type CardGenerator, type CardR
 export { cardRegistry, type CardDefinition } from "./diagnosis/registry.js";
 export { createLLMClient, type LLMClientAdapters, type LLMClientFactoryConfig, type LLMClientKeys, } from "./llm-client-factory.js";
 export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from "./diagnosis/card-validators.js";
-export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
+export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
 export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";

package/dist/_vendor/ailf-core/services/index.js CHANGED Viewed

@@ -29,5 +29,5 @@ export { buildFailureModeRefinement, isFailureModeInDimensionTaxonomy, } from ".
 // ---------------------------------------------------------------------------
 // Phase 5 Plan 05 — card generators barrel + prompt builders
 // ---------------------------------------------------------------------------
-export { DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
+export { CARD_REGISTRY_VERSION, DIAGNOSIS_CARD_GENERATORS, generateAreaSummary, generateFailureModeSummary, generateNoIssues, generateTopRecommendations, generateWeakestArea, generateLowConfidenceAttribution, generateDocAttributionSpotlight, generateRegressionVsBaseline, } from "./diagnosis/cards/index.js";
 export { buildTopRecommendationsPrompt, buildWeakestAreaPrompt, buildLowConfidenceAttributionPrompt, buildDocAttributionSpotlightPrompt, buildRegressionVsBaselinePrompt, buildDocSlugAllowList, } from "./diagnosis/prompt-builders.js";

package/dist/_vendor/ailf-core/types/diagnosis.d.ts CHANGED Viewed

@@ -19,6 +19,7 @@
 import type { Confidence } from "./confidence.js";
 import type { RunId } from "./branded-ids.js";
 import type { ReportId } from "./index.js";
+import type { ModelId } from "../ports/llm-client.js";
 /**
  * The four-version cache envelope. Every cached `Diagnosis` carries the
  * versions of the inputs that produced it; any bump in any segment
@@ -54,6 +55,8 @@ export interface CardMeta {
     latencyMs?: number;
     /** ISO 8601 UTC timestamp. */
     generatedAt: string;
+    cost?: number;
+    model?: ModelId;
 }
 /**
  * A single actionable suggestion surfaced by a recommendations card.

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -37,6 +37,7 @@ export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, E
 export { err, fixtureId, generateJudgmentId, generateRunId, judgmentId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
 export type { AgentHarnessTaskDefinition, ContentLakeAuthorableMode, ContentLakeAuthorableTask, CriterionRef, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PromptVars, PathDocRef, PerspectiveDocRef, ReservedPromptVarKey, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
 export type { ActionSuggestion, AreaSummaryBody, CardMeta, CardType, Diagnosis, DiagnosisCard, DocAttributionSpotlightBody, FailureModeSummaryBody, JudgmentRef, LowConfidenceAttributionBody, NoIssuesBody, RegressionVsBaselineBody, TopRecommendationsBody, VersionedInputs, WeakestAreaBody, } from "./diagnosis.js";
+export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "./synthesis-telemetry.js";
 export type { AttributionMeta, DocAttribution, JudgmentAttribution, } from "./attribution.js";
 export type { CriterionSubJudgment, DocCitation, DocCitationRole, GraderJudgment, } from "./grader-judgment.js";
 export type { LegacyGraderJudgment } from "./legacy-grader-judgment.js";
@@ -754,6 +755,12 @@ export interface PipelineResult {
     promptfooUrls?: PromptfooUrlEntry[];
     /** Results per step */
     steps: Record<string, StepResult>;
+    /** Report ID produced by PublishReportStep (when publish was enabled). Used by
+     * post-run hooks (e.g. runPostPipelineHooks) to target diagnosis and telemetry
+     * writeback at the correct Content Lake document. Absent when publish was
+     * skipped or the publish step did not produce a report. (Phase 6 / DIAG-06)
+     */
+    reportId?: string;
     /** Overall success (all non-skipped steps succeeded) */
     success: boolean;
     /** Summary of test execution outcomes. */

package/dist/_vendor/ailf-core/types/repo-config.d.ts CHANGED Viewed

@@ -106,6 +106,21 @@ export interface RepoTriggersConfig {
     "pr-task-change"?: TriggerConfig;
     schedule?: ScheduleTriggerConfig;
 }
+/**
+ * Post-run diagnosis summary policy (Phase 6 / DIAG-06).
+ * Controls whether `ailf run` auto-fires the in-process diagnosis runner
+ * at the end of a published pipeline. Precedence is resolved at the CLI
+ * layer — see `shouldRunPostSummary()` in `pipeline-action.ts`.
+ */
+export interface RepoSummaryConfig {
+    /**
+     * - `"auto"`   — fire only when `process.stdout.isTTY === true` AND
+     *                `CI !== "true"`.
+     * - `"always"` — fire unconditionally (bypasses TTY check).
+     * - `"never"`  — never fire.
+     */
+    onRun?: "auto" | "always" | "never";
+}
 /**
  * Parsed shape of `.ailf/config.yaml`.
  *
@@ -124,6 +139,7 @@ export interface RepoConfig {
     publish?: RepoPublishConfig;
     reportStore?: RepoReportStoreConfig;
     source?: RepoSourceConfig;
+    summary?: RepoSummaryConfig;
     taskSource?: RepoTaskSourceConfig;
     triggers?: RepoTriggersConfig;
 }

package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts ADDED Viewed

@@ -0,0 +1,101 @@
+/**
+ * Synthesis cost telemetry types — canonical TS-first shapes for
+ * Phase 6 DIAG-06 cost and parse-failure observability.
+ *
+ * These interfaces are authored independently of their Zod adapter schema
+ * (Plan 06-02) per D0045: the Zod schema declares
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
+ * type so drift is a build error, not a runtime bug.
+ *
+ * The 14 attribute paths on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
+ * land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
+ * (D6-09). No new sibling doc type (D0033 / D6-09).
+ *
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
+ */
+import type { CardType } from "./diagnosis.js";
+/**
+ * Per-card telemetry row for the `synthesis_per_card` Airbyte stream
+ * (D6-11) and the `summary.synthesis.diagnosis.perCard[]` Sanity doc path
+ * (D6-09).
+ *
+ * Fields map directly to the 8 per-card attribute paths in D6-09:
+ * `…perCard[].cardType`, `…perCard[].cost`, `…perCard[].parseFailed`,
+ * `…perCard[].latencyMs`, `…perCard[].tokenInput`, `…perCard[].tokenOutput`,
+ * `…perCard[].cardVersion`, `…perCard[].generatedAt`.
+ *
+ * `cost` is undefined when the card did not make an LLM call (deterministic
+ * cards) and contributes 0 to the roll-up.
+ */
+export interface SynthesisPerCardTelemetry {
+    /** Card archetype — reuses `CardType` from diagnosis.ts:55-63; not redeclared. */
+    cardType: CardType;
+    /**
+     * Per-call USD cost captured from `LLMStructuredCompletion.cost`.
+     * `undefined` for deterministic cards (area-summary, failure-mode-summary,
+     * no-issues) which make no LLM call.
+     */
+    cost?: number;
+    /**
+     * Whether the card's Zod schema parse failed (produces a degraded card).
+     * Used for the 7-day rolling parse-failure rate in BigQuery (D6-15).
+     */
+    parseFailed: boolean;
+    /**
+     * End-to-end latency for the LLM call in milliseconds.
+     * `undefined` for deterministic cards.
+     */
+    latencyMs?: number;
+    /**
+     * Prompt tokens consumed by the LLM call.
+     * `undefined` for deterministic cards.
+     */
+    tokenInput?: number;
+    /**
+     * Completion tokens produced by the LLM call.
+     * `undefined` for deterministic cards.
+     */
+    tokenOutput?: number;
+    /** Per-card version string (e.g. `"top-recommendations@0.1.0"`). */
+    cardVersion: string;
+    /** ISO 8601 UTC timestamp when this card was generated. */
+    generatedAt: string;
+}
+/**
+ * Aggregate synthesis cost telemetry for a single Diagnosis run.
+ * Lands on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
+ * (D6-09 / D6-10: parallel to `summary.overall.cost` — not additive).
+ *
+ * Written by the post-run hook (D6-08); not written by standalone
+ * `ailf interpret`.
+ *
+ * Field set matches the 4 top-level D6-09 attribute paths:
+ * `summary.synthesis.diagnosis.cost`,
+ * `summary.synthesis.diagnosis.parseFailureCount`,
+ * `summary.synthesis.diagnosis.parseFailureRate`,
+ * `summary.synthesis.diagnosis.perCard`.
+ */
+export interface SynthesisCostTelemetry {
+    /**
+     * Total USD cost across all LLM cards in this Diagnosis run.
+     * Roll-up: `sum(perCard[].cost ?? 0)` for ready + degraded cards.
+     * Missing cards contribute 0.
+     */
+    cost: number;
+    /**
+     * Number of cards whose Zod parse failed in this Diagnosis run.
+     * Counted across all 8 card types (including deterministic cards;
+     * a deterministic-card parse failure indicates a code bug).
+     */
+    parseFailureCount: number;
+    /**
+     * Parse-failure rate: `parseFailureCount / 8` (8 = fixed card registry size).
+     * Range: 0–1. Used as the denominator for the D6-15 BigQuery 7-day
+     * rolling rate view (`synthesis_parse_failure_rate_7d.sql`).
+     */
+    parseFailureRate: number;
+    /** Per-card telemetry rows — one entry per card in registry-order. */
+    perCard: SynthesisPerCardTelemetry[];
+}

package/dist/_vendor/ailf-core/types/synthesis-telemetry.js ADDED Viewed

@@ -0,0 +1,18 @@
+/**
+ * Synthesis cost telemetry types — canonical TS-first shapes for
+ * Phase 6 DIAG-06 cost and parse-failure observability.
+ *
+ * These interfaces are authored independently of their Zod adapter schema
+ * (Plan 06-02) per D0045: the Zod schema declares
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` against this independent
+ * type so drift is a build error, not a runtime bug.
+ *
+ * The 14 attribute paths on `SynthesisCostTelemetry` + `SynthesisPerCardTelemetry`
+ * land on the `ailf.report` Sanity doc under `summary.synthesis.diagnosis.*`
+ * (D6-09). No new sibling doc type (D0033 / D6-09).
+ *
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-12
+ */
+export {};

package/dist/adapters/config-sources/file-config-adapter.js CHANGED Viewed

@@ -115,12 +115,10 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
         compareBaseline: config.compareBaseline,
         gapAnalysisEnabled: config.execution?.gapAnalysis ?? true,
         // W0077 Phase 4 — `publish` is now a policy object. Map the auto value
-        // directly to a boolean for the file-config path; the runtime
-        // smart-default logic in pipeline-action.ts isn't relevant here because
-        // the user has explicitly handed us a config file.
-        publishEnabled: config.publish?.auto === "never"
-            ? false
-            : config.publish?.auto !== undefined,
+        // to a boolean for the file-config path. Absence of publish.auto mirrors
+        // the CLI's "full-runs" default (enable publish; composition root gates on
+        // token availability). Only "never" explicitly disables auto-publish.
+        publishEnabled: config.publish?.auto !== "never",
         publishTag: config.publish?.tag,
         noCache: config.noCache ?? false,
         noRemoteCache: config.noRemoteCache ?? false,
@@ -150,5 +148,9 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
             ? resolve(rootDir, config.taskSource.repoTasksPath)
             : undefined,
         presets: config.presets,
+        // Phase 6 / DIAG-06 — thread summary.onRun into ResolvedConfig so the
+        // file-config exit branch in executePipeline can pass it to
+        // runPostPipelineHooks.
+        summaryOnRun: config.summary?.onRun,
     };
 }

package/dist/adapters/llm/index.d.ts CHANGED Viewed

@@ -5,5 +5,5 @@ export type { FakeCallRecord, FakeCompletionResponse, FakeStructuredResponse, }
 export { OpenAILLMClient } from "./openai-llm-client.js";
 export type { OpenAILLMClientOptions } from "./openai-llm-client.js";
 export type { ModelPricing } from "./pricing.js";
-export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
+export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
 export type { RetryPolicy } from "./retry.js";

package/dist/adapters/llm/index.js CHANGED Viewed

@@ -1,4 +1,4 @@
 export { AnthropicLLMClient } from "./anthropic-llm-client.js";
 export { FakeLLMClient } from "./fake-llm-client.js";
 export { OpenAILLMClient } from "./openai-llm-client.js";
-export { DEFAULT_RETRY_POLICY, LLMHttpError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
+export { DEFAULT_RETRY_POLICY, LLMHttpError, LLMParseError, isRetryableStatus, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";

package/dist/adapters/llm/openai-llm-client.js CHANGED Viewed

@@ -12,7 +12,7 @@
  */
 import { z } from "zod";
 import { OpenAIChatResponseSchema, splitModelId, } from "../../_vendor/ailf-core/index.js";
-import { DEFAULT_RETRY_POLICY, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
+import { DEFAULT_RETRY_POLICY, LLMParseError, parseRetryAfterSeconds, runWithRetry, } from "./retry.js";
 const DEFAULT_BASE_URL = "https://api.openai.com/v1/chat/completions";
 /**
  * Conservative defaults for the models in `packages/eval/config/models.ts`.
@@ -98,7 +98,12 @@ export class OpenAILLMClient {
             parsed = JSON.parse(raw);
         }
         catch (err) {
-            throw new Error(`OpenAI structured completion returned invalid JSON for model ${args.model}: ${err instanceof Error ? err.message : String(err)}`, { cause: err });
+            // Sanitize: SyntaxError.message embeds a snippet at the failure offset,
+            // which can leak prompt text or user content echoed back by the model.
+            // Keep the raw body on the instance for callers that opt in via .raw,
+            // mirroring the LLMHttpError pattern (verified by the "does not leak
+            // the response body" test in openai-llm-client.test.ts).
+            throw new LLMParseError(`OpenAI structured completion returned invalid JSON for model ${args.model}`, raw, { cause: err });
         }
         // strict:true guarantees a valid-against-the-schema JSON document, but
         // the Zod parse is still load-bearing — it brands the result as T and is

package/dist/adapters/llm/retry.d.ts CHANGED Viewed

@@ -33,6 +33,24 @@ export declare class LLMHttpError extends Error {
     readonly body: string;
     constructor(status: number, body: string, attempts: number);
 }
+/**
+ * Sanitized error raised when an LLM adapter receives an HTTP-200 response
+ * whose body is not valid JSON. The raw response body (which may echo back
+ * user prompt content or even API-key fragments from prompts) is kept on the
+ * instance for callers that opt in via `.raw`, NOT in the message string.
+ *
+ * Mirrors the LLMHttpError pattern verified by the
+ * "does not leak the response body" test in openai-llm-client.test.ts.
+ */
+export declare class LLMParseError extends Error {
+    /** Full raw response body (kept on the instance, NOT in `message`). */
+    readonly raw: string;
+    /** Length of `raw` in UTF-16 code units (`raw.length`; equals the byte count only for ASCII) — safe to include in the message. */
+    readonly rawLength: number;
+    constructor(message: string, raw: string, options?: {
+        cause?: unknown;
+    });
+}
 export declare function isRetryableStatus(status: number): boolean;
 export interface RunWithRetryArgs<T> {
     policy: RetryPolicy;

package/dist/adapters/llm/retry.js CHANGED Viewed

@@ -29,6 +29,27 @@ export class LLMHttpError extends Error {
         this.body = body;
     }
 }
+/**
+ * Sanitized error raised when an LLM adapter receives an HTTP-200 response
+ * whose body is not valid JSON. The raw response body (which may echo back
+ * user prompt content or even API-key fragments from prompts) is kept on the
+ * instance for callers that opt in via `.raw`, NOT in the message string.
+ *
+ * Mirrors the LLMHttpError pattern verified by the
+ * "does not leak the response body" test in openai-llm-client.test.ts.
+ */
+export class LLMParseError extends Error {
+    /** Full raw response body (kept on the instance, NOT in `message`). */
+    raw;
+    /** Length of `raw` in UTF-16 code units (`raw.length`; equals the byte count only for ASCII) — safe to include in the message. */
+    rawLength;
+    constructor(message, raw, options) {
+        super(`${message} (raw=${raw.length}B)`, options);
+        this.name = "LLMParseError";
+        this.raw = raw;
+        this.rawLength = raw.length;
+    }
+}
 export function isRetryableStatus(status) {
     return status === 429 || (status >= 500 && status < 600);
 }

package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts ADDED Viewed

@@ -0,0 +1,49 @@
+/**
+ * Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
+ *
+ * This schema sits at `packages/eval/src/adapters/**` and is therefore
+ * scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
+ * drift a build error, not a runtime bug.
+ *
+ * Used by:
+ * - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
+ *   writing to Sanity (process memory → Sanity write boundary, T-06-04).
+ * - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
+ *   (Sanity Content Lake → eval process boundary, T-06-04).
+ *
+ * Security constraints:
+ * - No `.passthrough()` — schema is closed to prevent PII leakage from
+ *   card body text into the telemetry shape (T-06-05).
+ * - Satisfies clause is load-bearing (T-06-06); no exemption marker.
+ *
+ * @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
+ */
+import { z } from "zod";
+export declare const SynthesisCostTelemetrySchema: z.ZodObject<{
+    cost: z.ZodNumber;
+    parseFailureCount: z.ZodNumber;
+    parseFailureRate: z.ZodNumber;
+    perCard: z.ZodArray<z.ZodObject<{
+        cardType: z.ZodEnum<{
+            "area-summary": "area-summary";
+            "failure-mode-summary": "failure-mode-summary";
+            "no-issues": "no-issues";
+            "top-recommendations": "top-recommendations";
+            "weakest-area": "weakest-area";
+            "low-confidence-attribution": "low-confidence-attribution";
+            "doc-attribution-spotlight": "doc-attribution-spotlight";
+            "regression-vs-baseline": "regression-vs-baseline";
+        }>;
+        cost: z.ZodOptional<z.ZodNumber>;
+        parseFailed: z.ZodBoolean;
+        latencyMs: z.ZodOptional<z.ZodNumber>;
+        tokenInput: z.ZodOptional<z.ZodNumber>;
+        tokenOutput: z.ZodOptional<z.ZodNumber>;
+        cardVersion: z.ZodString;
+        generatedAt: z.ZodString;
+    }, z.core.$strip>>;
+}, z.core.$strip>;
+export type { SynthesisCostTelemetry, SynthesisPerCardTelemetry, } from "../../_vendor/ailf-core/index.d.ts";

package/dist/adapters/synthesis/synthesis-telemetry-schema.js ADDED Viewed

@@ -0,0 +1,55 @@
+/**
+ * Zod adapter schema for SynthesisCostTelemetry at the trust boundary.
+ *
+ * This schema sits at `packages/eval/src/adapters/**` and is therefore
+ * scanned by `pnpm check-trust-boundary-satisfies` (D0045). The
+ * `satisfies z.ZodType<SynthesisCostTelemetry>` clause makes schema/type
+ * drift a build error, not a runtime bug.
+ *
+ * Used by:
+ * - Plan 06-04 `ReportStore.patchSynthesis` — validates telemetry before
+ *   writing to Sanity (process memory → Sanity write boundary, T-06-04).
+ * - Any future Sanity-side reader of `summary.synthesis.diagnosis.*`
+ *   (Sanity Content Lake → eval process boundary, T-06-04).
+ *
+ * Security constraints:
+ * - No `.passthrough()` — schema is closed to prevent PII leakage from
+ *   card body text into the telemetry shape (T-06-05).
+ * - Satisfies clause is load-bearing (T-06-06); no exemption marker.
+ *
+ * @see packages/core/src/types/synthesis-telemetry.ts — independently authored domain types
+ * @see docs/decisions/D0045-type-architecture-and-contract-enforcement.md
+ * @see .planning/phases/06-post-run-integration-cost-telemetry/06-CONTEXT.md §D6-09
+ */
+import { z } from "zod";
+/**
+ * Enum of all valid card types — mirrors `CardType` from diagnosis.ts.
+ * Using `z.enum()` (not `z.string()`) so the schema satisfies
+ * `z.ZodType<SynthesisPerCardTelemetry>` (which requires `cardType: CardType`).
+ *
+ * Not exported; used only as the validator for `cardType` in
+ * `SynthesisPerCardSchema`. Any string outside the eight literals below
+ * fails validation.
+ *
+ * NOTE(review): the literal list is maintained by hand. A card type added to
+ * `CardType` presumably has to be added here too — confirm whether the
+ * `satisfies` check in the TypeScript source flags a missing literal.
+ */
+const CardTypeSchema = z.enum([
+    "area-summary",
+    "failure-mode-summary",
+    "no-issues",
+    "top-recommendations",
+    "weakest-area",
+    "low-confidence-attribution",
+    "doc-attribution-spotlight",
+    "regression-vs-baseline",
+]);
+const SynthesisPerCardSchema = z.object({ // telemetry for a single card; plain `z.object`, so unrecognized keys are not kept
+    cardType: CardTypeSchema, // must be one of the CardTypeSchema literals
+    cost: z.number().nonnegative().optional(), // may be omitted; when present must be >= 0
+    parseFailed: z.boolean(), // required
+    latencyMs: z.number().int().nonnegative().optional(), // may be omitted; when present a non-negative integer
+    tokenInput: z.number().int().nonnegative().optional(), // may be omitted; when present a non-negative integer
+    tokenOutput: z.number().int().nonnegative().optional(), // may be omitted; when present a non-negative integer
+    cardVersion: z.string(), // required; any string is accepted (no format check)
+    generatedAt: z.string().datetime({ offset: false }), // ISO 8601 UTC required; `offset: false` rejects timezone offsets
+});
+export const SynthesisCostTelemetrySchema = z.object({ // run-level telemetry; plain `z.object`, so unrecognized keys are not kept
+    cost: z.number().nonnegative(), // required, >= 0
+    parseFailureCount: z.number().int().nonnegative(), // required non-negative integer
+    parseFailureRate: z.number().min(0).max(1), // required, within [0, 1] inclusive
+    perCard: z.array(SynthesisPerCardSchema), // each element validated by SynthesisPerCardSchema; an empty array is accepted
+}); // NOTE(review): no cross-field check ties parseFailureCount/parseFailureRate to the `parseFailed` flags in `perCard`

package/dist/adapters/task-sources/content-lake-task-source.js CHANGED Viewed

@@ -286,16 +286,21 @@ function mapAssertions(raw) {
                     .map((c) => ({ id: c.id, text: c.text })),
                 template: a.template,
                 type: "llm-rubric",
-                ...(a.weight !== undefined ? { weight: a.weight } : {}),
+                // Use `!= null` (loose) so we drop both `undefined` AND `null`.
+                // GROQ projects missing scalar fields as `null`, but the domain
+                // schema's `z.number().optional()` accepts `T | undefined`, not
+                // `T | null` — a strict `!== undefined` check would forward
+                // `weight: null` and trigger Zod's "Invalid input" on assertions.
+                ...(a.weight != null ? { weight: a.weight } : {}),
             };
         }
-        // Value-based assertion
+        // Value-based assertion — same null-vs-undefined hazard as above.
         const result = { type: a.type };
-        if (a.value !== undefined)
+        if (a.value != null)
             result.value = a.value;
-        if (a.threshold !== undefined)
+        if (a.threshold != null)
             result.threshold = a.threshold;
-        if (a.weight !== undefined)
+        if (a.weight != null)
             result.weight = a.weight;
         return result;
     });

package/dist/adapters/task-sources/repo-schemas.d.ts CHANGED Viewed

@@ -1561,6 +1561,13 @@ export declare const RepoConfigSchema: z.ZodObject<{
         dir: z.ZodOptional<z.ZodString>;
         exclude: z.ZodOptional<z.ZodArray<z.ZodString>>;
     }, z.core.$strip>>;
+    summary: z.ZodOptional<z.ZodObject<{
+        onRun: z.ZodOptional<z.ZodEnum<{
+            never: "never";
+            always: "always";
+            auto: "auto";
+        }>>;
+    }, z.core.$strip>>;
     taskSource: z.ZodOptional<z.ZodObject<{
         type: z.ZodOptional<z.ZodEnum<{
             "content-lake": "content-lake";