npm - @sanity/ailf - Versions diffs - 2.3.2 → 2.4.0 - Mend

@sanity/ailf 2.3.2 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/dist/_vendor/ailf-core/index.d.ts +1 -0
package/dist/_vendor/ailf-core/index.js +1 -0
package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +35 -0
package/dist/_vendor/ailf-core/ports/artifact-uploader.js +18 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +9 -0
package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
package/dist/_vendor/ailf-core/ports/index.js +1 -0
package/dist/_vendor/ailf-core/types/index.d.ts +77 -0
package/dist/_vendor/ailf-core/types/scoring-input.d.ts +2 -0
package/dist/artifact-capture/gcs-collector.d.ts +55 -0
package/dist/artifact-capture/gcs-collector.js +117 -0
package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +31 -0
package/dist/artifact-capture/gcs-report-artifact-uploader.js +66 -0
package/dist/cli.js +2 -0
package/dist/commands/pipeline-action.js +3 -0
package/dist/composition-root.js +21 -5
package/dist/orchestration/build-app-context.js +3 -0
package/dist/orchestration/steps/calculate-scores-step.js +5 -1
package/dist/orchestration/steps/gap-analysis-step.js +15 -0
package/dist/orchestration/steps/publish-report-step.js +31 -0
package/dist/pipeline/calculate-scores.d.ts +12 -2
package/dist/pipeline/calculate-scores.js +95 -0
package/dist/report-store.js +28 -2
package/package.json +2 -1

package/dist/_vendor/ailf-core/index.d.ts CHANGED Viewed

@@ -19,3 +19,4 @@ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePrici
 export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
 export { env } from "./env-helper.js";
 export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
+export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";

package/dist/_vendor/ailf-core/index.js CHANGED Viewed

@@ -21,3 +21,4 @@ export * from "./examples/index.js";
 export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
 export { env } from "./env-helper.js";
 export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
+export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";

package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * Port: ArtifactUploader — uploads report artifacts to external object storage.
+ *
+ * Separate from ArtifactCollector (which captures forensic archives).
+ * This port puts structured files at known paths so Studio can fetch
+ * them on demand via signed URLs.
+ *
+ * @see docs/design-docs/external-artifact-store.md
+ * @see docs/decisions/D0030-external-artifact-store.md
+ */
+import type { ArtifactRef } from "../types/index.js";
+/**
+ * Uploads report artifacts to external storage.
+ *
+ * Implementations:
+ * - GcsReportArtifactUploader (packages/eval) — uploads to GCS
+ * - NoOpArtifactUploader (below) — returns null (no-op when GCS is not configured)
+ */
+export interface ArtifactUploader {
+    /**
+     * Upload a JSON artifact for a report.
+     *
+     * @param reportId - Report identifier (used as the GCS path prefix)
+     * @param fileName - File name within the report prefix (e.g., "test-outputs.json")
+     * @param data     - Serializable data (will be JSON.stringify'd)
+     * @returns ArtifactRef on success, null if upload is skipped or fails
+     */
+    upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
+}
+/**
+ * No-op uploader — always returns null. Used when GCS is not configured.
+ */
+export declare class NoOpArtifactUploader implements ArtifactUploader {
+    upload(): Promise<null>;
+}

package/dist/_vendor/ailf-core/ports/artifact-uploader.js ADDED Viewed

@@ -0,0 +1,18 @@
+/**
+ * Port: ArtifactUploader — uploads report artifacts to external object storage.
+ *
+ * Separate from ArtifactCollector (which captures forensic archives).
+ * This port puts structured files at known paths so Studio can fetch
+ * them on demand via signed URLs.
+ *
+ * @see docs/design-docs/external-artifact-store.md
+ * @see docs/decisions/D0030-external-artifact-store.md
+ */
+/**
+ * No-op uploader for when GCS is not configured: no I/O, always resolves to null.
+ */
+export class NoOpArtifactUploader {
+    async upload() {
+        return null;
+    }
+}

package/dist/_vendor/ailf-core/ports/context.d.ts CHANGED Viewed

@@ -13,6 +13,7 @@
  */
 import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
 import type { ArtifactCollector } from "./artifact-collector.js";
+import type { ArtifactUploader } from "./artifact-uploader.js";
 import type { CacheStore } from "./cache-store.js";
 import type { DocFetcher } from "./doc-fetcher.js";
 import type { EvalRunner } from "./eval-runner.js";
@@ -159,6 +160,12 @@ export interface ResolvedConfig {
     captureCompress?: boolean;
     /** Whether to include mode-specific extra artifacts (default: true) */
     captureExtras?: boolean;
+    /** GCS bucket for capture upload (enables GCS decorator when set) */
+    captureGcsBucket?: string;
+    /** GCS object prefix for capture uploads (default: "captures/") */
+    captureGcsPrefix?: string;
+    /** GCS bucket for report artifact uploads — enables ArtifactUploader (D0030) */
+    artifactGcsBucket?: string;
 }
 /**
  * Application context — the complete dependency carrier.
@@ -173,6 +180,8 @@ export interface ResolvedConfig {
  * Created per-test by createTestContext().
  */
 export interface AppContext {
+    /** Report artifact uploader — uploads structured files to GCS for Studio (D0030) */
+    readonly artifactUploader?: ArtifactUploader;
     /** Evaluation caching (filesystem + optional Content Lake fallback) */
     readonly cache?: CacheStore;
     /** Artifact capture collector (no-op when --capture is not set) */

package/dist/_vendor/ailf-core/ports/index.d.ts CHANGED Viewed

@@ -5,6 +5,8 @@
  * Adapters (in packages/eval) implement these interfaces.
  */
 export type { ArtifactCollector, ArtifactManifest, ArtifactManifestEntry, CaptureFlushResult, } from "./artifact-collector.js";
+export type { ArtifactUploader } from "./artifact-uploader.js";
+export { NoOpArtifactUploader } from "./artifact-uploader.js";
 export type { ArtifactContentDiff, CaptureDiffReport, ComparisonMode, ComparisonOptions, InventoryDiff, JsonDiffEntry, MetadataComparison, ScoreComparison, SecurityScan, TimingComparison, } from "./capture-comparator.js";
 export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
 export type { ConfigSource } from "./config-source.js";

package/dist/_vendor/ailf-core/ports/index.js CHANGED Viewed

@@ -4,4 +4,5 @@
  * Ports define the contracts between the domain kernel and the outside world.
  * Adapters (in packages/eval) implement these interfaces.
  */
+export { NoOpArtifactUploader } from "./artifact-uploader.js";
 export { canonicalDocRefLabel, isIdRef, isPathRef, isPerspectiveRef, isSlugRef, isTemplatedAssertion, } from "./task-source.js";

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -256,6 +256,13 @@ export interface GraderJudgment {
     dimension: string;
     /** The model that produced the response being graded */
     modelId: string;
+    /**
+     * True when the model failed to produce meaningful output (empty response,
+     * API error, or refusal). Distinguishes infrastructure failures from
+     * genuinely incorrect responses — a score of 0 from no output is
+     * fundamentally different from a score of 0 from wrong output.
+     */
+    outputFailure?: boolean;
     /** The grader's natural language reasoning */
     reason: string;
     /** The numeric score (0–100) */
@@ -268,6 +275,55 @@ export interface StoredJudgment extends GraderJudgment {
     /** Canonical docs that the task expected the model to use */
     canonicalDocs?: DocumentRef[];
 }
+/**
+ * Per-test result stored in reports for drill-down and audit.
+ *
+ * Captures the model's response output, grader reasoning per dimension,
+ * and response metadata. One entry per test × model combination.
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
+ */
+export interface StoredTestResult {
+    /** Resolved feature area (from __featureArea or description) */
+    area: string;
+    /** Canonical docs the task expected the model to use */
+    canonicalDocs?: DocumentRef[];
+    /** Weighted composite score (gold variant only) */
+    compositeScore?: number;
+    /** Per-test cost (USD) */
+    cost?: number;
+    /** Per-dimension grader scores and reasoning */
+    dimensions: {
+        /** Rubric dimension: task-completion, code-correctness, doc-coverage */
+        dimension: string;
+        /** Grader's natural language reasoning */
+        reason: string;
+        /** Numeric score (0–100, normalized) */
+        score: number;
+    }[];
+    /** Response latency in milliseconds */
+    latencyMs?: number;
+    /** Model that produced the response */
+    modelId: string;
+    /**
+     * True when the model failed to produce meaningful output (empty response,
+     * API error, or refusal). Same semantics as GraderJudgment.outputFailure.
+     */
+    outputFailure?: boolean;
+    /** The model's generated code/response (truncated to 8000 chars) */
+    responseOutput: string;
+    /** True when responseOutput was truncated from a longer response */
+    responseOutputTruncated?: boolean;
+    /** Task description (e.g. "Functions - Webhook handler (gold)") */
+    taskId: string;
+    /** Token usage breakdown */
+    tokenUsage?: {
+        completion: number;
+        prompt: number;
+        total: number;
+    };
+    /** "gold" (with docs) or "baseline" (without docs) */
+    variant: "baseline" | "gold";
+}
 /** Grader consistency diagnostics — does not affect scores, reported alongside */
 export interface GraderReliability {
     /** Inter-grader agreement (from multi-grader comparison) — Phase 3 */
@@ -769,6 +825,12 @@ export interface ScoreSummary {
     lowScoringJudgments?: StoredJudgment[];
     /** Gap analysis recommendations (Phase 3b) — prioritized remediation plan */
     recommendations?: GapAnalysisReport;
+    /**
+     * Per-test results with model output, grader reasoning, and metadata.
+     * One entry per test × model combination. Populated during gap-analysis
+     * enrichment from test-results.json. See D0029.
+     */
+    testResults?: StoredTestResult[];
     /** Grader reliability diagnostics (does not affect scores) */
     graderReliability?: GraderReliability;
     lowestArea: string;
@@ -1095,8 +1157,23 @@ export interface PublishResult {
         result: SinkResult;
     }[];
 }
+/** Reference to an artifact in external object storage (GCS). See D0030. */
+export interface ArtifactRef {
+    store: "gcs"; // storage backend; "gcs" is the only value this type allows
+    bucket: string; // bucket that holds the object
+    path: string; // object path within the bucket
+    bytes?: number; // payload size in bytes (GcsReportArtifactUploader sets the UTF-8 length of the JSON)
+    entryCount?: number; // key count of the payload's `entries` object, when it has one (set by GcsReportArtifactUploader)
+}
 /** A published evaluation report — the atomic unit of the report store */
 export interface Report {
+    /** External artifact references — set by publish step when uploader is available (D0030) */
+    artifacts?: {
+        testOutputs?: ArtifactRef;
+        renderedPrompts?: ArtifactRef;
+        rawResults?: ArtifactRef;
+        traces?: ArtifactRef;
+    };
     /** Optional auto-comparison against the most recent comparable report */
     comparison?: ComparisonReport;
     /** When the evaluation completed */

package/dist/_vendor/ailf-core/types/scoring-input.d.ts CHANGED Viewed

@@ -26,6 +26,8 @@ export interface TestResult {
         componentResults: ComponentResult[];
         pass: boolean;
     };
+    /** Per-test latency in ms (propagated from Promptfoo when available) */
+    latencyMs?: number;
     metadata?: Record<string, unknown>;
     /** Provider identifier (e.g., "openai:gpt-4o") */
     providerId?: string;

package/dist/artifact-capture/gcs-collector.d.ts ADDED Viewed

@@ -0,0 +1,55 @@
+/**
+ * GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
+ *
+ * Wraps the FilesystemArtifactCollector: local flush first (preserving
+ * the existing manifest + redaction logic), then upload to a GCS bucket.
+ *
+ * Design principles:
+ * - P5: Non-blocking — GCS upload failure should not block the pipeline.
+ *   Local artifacts are always preserved.
+ * - Decorator pattern — delegates capture() and captureFile() to the inner
+ *   collector unchanged. Only flush() adds the GCS upload step.
+ * - Lazy client — GCS Storage client is created on first flush(), not at
+ *   construction (same pattern as BigQuerySink).
+ *
+ * @see docs/decisions/D0030-external-artifact-store.md
+ * @see docs/work-items/W0035-gcs-artifact-output.json
+ */
+import type { ArtifactCollector, CaptureFlushResult } from "../_vendor/ailf-core/index.d.ts";
+export interface GcsCollectorOptions {
+    /** GCS bucket name (e.g., "ailf-artifacts") */
+    bucket: string;
+    /** Object prefix in the bucket (e.g., "captures/") */
+    prefix?: string;
+    /** Path to service account credentials JSON (optional — falls back to ADC) */
+    credentials?: string;
+}
+export interface GcsFlushResult extends CaptureFlushResult {
+    /** GCS upload status */
+    gcs: {
+        status: "uploaded";
+        bucket: string;
+        path: string;
+    } | {
+        status: "skipped";
+        reason: string;
+    } | {
+        status: "failed";
+        error: string;
+    };
+}
+export declare class GcsArtifactCollector implements ArtifactCollector {
+    get enabled(): boolean;
+    get extrasEnabled(): boolean;
+    private client;
+    private readonly inner;
+    private readonly options;
+    constructor(inner: ArtifactCollector, options: GcsCollectorOptions);
+    capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
+    captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
+    flush(): Promise<GcsFlushResult>;
+    /** Lazily create the GCS Storage client. */
+    private getClient;
+    /** Upload the flushed artifact (tar.gz or directory) to GCS. */
+    private uploadToGcs;
+}

package/dist/artifact-capture/gcs-collector.js ADDED Viewed

@@ -0,0 +1,117 @@
+/**
+ * GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
+ *
+ * Wraps the FilesystemArtifactCollector: local flush first (preserving
+ * the existing manifest + redaction logic), then upload to a GCS bucket.
+ *
+ * Design principles:
+ * - P5: Non-blocking — GCS upload failure should not block the pipeline.
+ *   Local artifacts are always preserved.
+ * - Decorator pattern — delegates capture() and captureFile() to the inner
+ *   collector unchanged. Only flush() adds the GCS upload step.
+ * - Lazy client — GCS Storage client is created on first flush(), not at
+ *   construction (same pattern as BigQuerySink).
+ *
+ * @see docs/decisions/D0030-external-artifact-store.md
+ * @see docs/work-items/W0035-gcs-artifact-output.json
+ */
+import { readFileSync } from "node:fs";
+import { Storage } from "@google-cloud/storage";
+// ---------------------------------------------------------------------------
+// Collector
+// ---------------------------------------------------------------------------
+export class GcsArtifactCollector {
+    get enabled() {
+        return this.inner.enabled;
+    }
+    get extrasEnabled() {
+        return this.inner.extrasEnabled;
+    }
+    client = null;
+    inner;
+    options;
+    /** Wraps `inner`; `options` gives the bucket plus optional prefix and credentials. */
+    constructor(inner, options) {
+        this.inner = inner;
+        this.options = options;
+    }
+    /** Delegates unchanged to the wrapped collector. */
+    capture(step, type, data, meta) {
+        this.inner.capture(step, type, data, meta);
+    }
+    /** Delegates unchanged to the wrapped collector. */
+    captureFile(step, type, filePath, meta) {
+        this.inner.captureFile(step, type, filePath, meta);
+    }
+    /**
+     * Flush the wrapped collector, then upload its output to GCS.
+     * Errors from the inner flush propagate; upload errors do not (P5) —
+     * they are logged and reported as `gcs.status === "failed"`.
+     * @returns the inner flush result extended with a `gcs` status object
+     */
+    async flush() {
+        const localResult = await this.inner.flush();
+        // Nothing was captured locally, so there is nothing to upload.
+        if (localResult.artifactCount === 0) {
+            return {
+                ...localResult,
+                gcs: { status: "skipped", reason: "No artifacts to upload" },
+            };
+        }
+        try {
+            const gcsPath = await this.uploadToGcs(localResult);
+            return {
+                ...localResult,
+                gcs: { status: "uploaded", bucket: this.options.bucket, path: gcsPath },
+            };
+        }
+        catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            console.warn(`  ⚠️  GCS upload failed (non-blocking): ${message}`);
+            return {
+                ...localResult,
+                gcs: { status: "failed", error: message },
+            };
+        }
+    }
+    /** Lazily create the GCS Storage client (key file when configured, else ADC). */
+    getClient() {
+        this.client ??= this.options.credentials
+            ? new Storage({ keyFilename: this.options.credentials })
+            : new Storage();
+        return this.client;
+    }
+    /** Last non-empty segment of `path` (split on "/" or "\"), else `fallback`. */
+    lastSegment(path, fallback) {
+        return path.split(/[\\/]/).filter(Boolean).pop() ?? fallback;
+    }
+    /**
+     * Upload the flushed artifact to GCS and return the object path.
+     * Compressed captures upload the tar.gz; uncompressed captures upload
+     * only manifest.json as a reference to the local directory.
+     */
+    async uploadToGcs(result) {
+        const bucket = this.getClient().bucket(this.options.bucket);
+        const prefix = this.options.prefix ?? "captures/";
+        if (result.compressed) {
+            const fileName = this.lastSegment(result.destination, "capture.tar.gz");
+            const gcsPath = `${prefix}${fileName}`;
+            await bucket.file(gcsPath).save(readFileSync(result.destination), {
+                contentType: "application/gzip",
+                // Custom key/value pairs belong under the nested `metadata` key.
+                metadata: {
+                    metadata: {
+                        artifactCount: String(result.artifactCount),
+                        totalBytes: String(result.totalBytes),
+                    },
+                },
+            });
+            return gcsPath;
+        }
+        const dirName = this.lastSegment(result.destination, "capture");
+        const gcsPath = `${prefix}${dirName}/manifest.json`;
+        const manifest = readFileSync(`${result.destination}/manifest.json`, "utf-8");
+        await bucket.file(gcsPath).save(manifest, { contentType: "application/json" });
+        return gcsPath;
+    }
+}

package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts ADDED Viewed

@@ -0,0 +1,31 @@
+/**
+ * GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
+ *
+ * Separate from GcsArtifactCollector (which handles forensic capture archives).
+ * This uploader puts structured JSON files at predictable paths so the
+ * API Gateway can sign URLs and Studio can fetch them on demand.
+ *
+ * GCS path convention:  reports/{reportId}/{fileName}
+ * Example:              reports/01926abc.../test-outputs.json
+ *
+ * Design principles:
+ * - P5: Non-blocking — GCS upload failure returns null, never throws
+ * - Lazy client — Storage created on first upload, not at construction
+ * - Credentials come from ADC only; unlike GcsArtifactCollector, no key file can be configured
+ *
+ * @see docs/design-docs/external-artifact-store.md
+ * @see docs/decisions/D0030-external-artifact-store.md
+ */
+import type { ArtifactRef, ArtifactUploader } from "../_vendor/ailf-core/index.d.ts";
+export interface GcsUploaderOptions {
+    /** GCS bucket name (e.g., "ailf-artifacts") */
+    bucket: string;
+}
+export declare class GcsReportArtifactUploader implements ArtifactUploader {
+    private client;
+    private readonly options;
+    constructor(options: GcsUploaderOptions);
+    upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
+    /** Lazily create the GCS Storage client (ADC). */
+    private getClient;
+}

package/dist/artifact-capture/gcs-report-artifact-uploader.js ADDED Viewed

@@ -0,0 +1,66 @@
+/**
+ * GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
+ *
+ * Separate from GcsArtifactCollector (which handles forensic capture archives).
+ * This uploader puts structured JSON files at predictable paths so the
+ * API Gateway can sign URLs and Studio can fetch them on demand.
+ *
+ * GCS path convention:  reports/{reportId}/{fileName}
+ * Example:              reports/01926abc.../test-outputs.json
+ *
+ * Design principles:
+ * - P5: Non-blocking — GCS upload failure returns null, never throws
+ * - Lazy client — Storage created on first upload, not at construction
+ * - Credentials come from ADC only; unlike GcsArtifactCollector, no key file can be configured
+ *
+ * @see docs/design-docs/external-artifact-store.md
+ * @see docs/decisions/D0030-external-artifact-store.md
+ */
+import { Storage } from "@google-cloud/storage";
+export class GcsReportArtifactUploader {
+    client = null;
+    options;
+    constructor(options) {
+        this.options = options;
+    }
+    /**
+     * Save `data` as JSON at reports/{reportId}/{fileName}. Never throws (P5):
+     * serialization and upload errors are logged and reported as null.
+     * @returns ArtifactRef for the stored object, or null on failure
+     */
+    async upload(reportId, fileName, data) {
+        const objectPath = `reports/${reportId}/${fileName}`;
+        try {
+            // Kept inside try: stringify throws on cycles/BigInt, yields undefined for undefined.
+            const json = JSON.stringify(data);
+            if (json === undefined)
+                throw new TypeError("data is not JSON-serializable");
+            const file = this.getClient().bucket(this.options.bucket).file(objectPath);
+            // Custom key/value pairs belong under the nested `metadata` key.
+            await file.save(json, {
+                contentType: "application/json",
+                metadata: { metadata: { reportId } },
+            });
+            // typeof null === "object", so exclude null before Object.keys().
+            const entries = typeof data === "object" && data !== null ? data.entries : undefined;
+            return {
+                store: "gcs",
+                bucket: this.options.bucket,
+                path: objectPath,
+                bytes: Buffer.byteLength(json, "utf-8"),
+                entryCount: typeof entries === "object" && entries !== null
+                    ? Object.keys(entries).length
+                    : undefined,
+            };
+        }
+        catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            console.warn(`  ⚠️  Artifact upload failed (non-blocking): ${objectPath} — ${message}`);
+            return null;
+        }
+    }
+    /** Lazily create the GCS Storage client (ADC). */
+    getClient() {
+        return (this.client ??= new Storage());
+    }
+}

package/dist/cli.js CHANGED Viewed

@@ -168,6 +168,8 @@ import { createCalculateScoresCommand } from "./commands/calculate-scores.js";
 program.addCommand(createCalculateScoresCommand().helpGroup(CommandGroup.PipelineInternals));
 import { createPrCommentCommand } from "./commands/pr-comment.js";
 program.addCommand(createPrCommentCommand().helpGroup(CommandGroup.PipelineInternals));
+import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
+program.addCommand(createGenerateConfigsCommand().helpGroup(CommandGroup.PipelineInternals));
 import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
 program.addCommand(createMeasureRetrievalCommand().helpGroup(CommandGroup.PipelineInternals));
 import { createLookupDocCommand } from "./commands/lookup-doc.js";

package/dist/commands/pipeline-action.js CHANGED Viewed

@@ -326,6 +326,9 @@ export async function executePipeline(cliOpts) {
                 process.env.AILF_CAPTURE_COMPRESS !== "0";
         config.captureExtras =
             cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
+        config.captureGcsBucket ??= process.env.AILF_CAPTURE_GCS_BUCKET;
+        config.captureGcsPrefix ??= process.env.AILF_CAPTURE_GCS_PREFIX;
+        config.artifactGcsBucket ??= process.env.AILF_GCS_ARTIFACT_BUCKET;
         // Create AppContext directly from the merged config so adapters
         // (especially taskSource) are wired from the file config's
         // taskSourceType — not from CLI defaults.

package/dist/composition-root.js CHANGED Viewed

@@ -18,6 +18,8 @@
 import { join } from "node:path";
 import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
 import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
+import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
+import { GcsReportArtifactUploader } from "./artifact-capture/gcs-report-artifact-uploader.js";
 import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
 import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
 import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
@@ -57,9 +59,11 @@ export function createAppContext(config) {
     const reportStore = createReportStore(config);
     // Sinks — loaded from config/sinks
     const sinks = loadSinks();
-    // Artifact collector — no-op by default, filesystem when --capture is set
-    const collector = config.captureEnabled
-        ? new FilesystemArtifactCollector({
+    // Artifact collector — no-op by default, filesystem when --capture is set,
+    // GCS decorator when --capture-gcs-bucket is also provided (D0030/W0035)
+    let collector = new NoOpArtifactCollector();
+    if (config.captureEnabled) {
+        const fsCollector = new FilesystemArtifactCollector({
             captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
             mode: config.mode,
             compress: config.captureCompress ?? true,
@@ -69,9 +73,21 @@ export function createAppContext(config) {
                 source: config.source,
                 areas: config.areas,
             },
-        })
-        : new NoOpArtifactCollector();
+        });
+        collector = config.captureGcsBucket
+            ? new GcsArtifactCollector(fsCollector, {
+                bucket: config.captureGcsBucket,
+                prefix: config.captureGcsPrefix,
+            })
+            : fsCollector;
+    }
+    // Report artifact uploader — uploads structured files to GCS at known
+    // paths for Studio to fetch via signed URLs (D0030)
+    const artifactUploader = config.artifactGcsBucket
+        ? new GcsReportArtifactUploader({ bucket: config.artifactGcsBucket })
+        : undefined;
     return {
+        artifactUploader,
         cache,
         collector,
         config,

package/dist/orchestration/build-app-context.js CHANGED Viewed

@@ -82,6 +82,9 @@ export function mapToResolvedConfig(opts, rootDir) {
         captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
         captureCompress: opts.captureCompress ?? true,
         captureExtras: opts.captureExtras ?? true,
+        captureGcsBucket: process.env.AILF_CAPTURE_GCS_BUCKET,
+        captureGcsPrefix: process.env.AILF_CAPTURE_GCS_PREFIX,
+        artifactGcsBucket: process.env.AILF_GCS_ARTIFACT_BUCKET,
     };
 }
 /**

package/dist/orchestration/steps/calculate-scores-step.js CHANGED Viewed

@@ -122,7 +122,11 @@ export class CalculateScoresStep {
         }
         // Capture score artifacts
         const resultsDir = join(ctx.config.rootDir, "results", "latest");
-        for (const file of ["score-summary.json", "grader-judgments.json"]) {
+        for (const file of [
+            "score-summary.json",
+            "grader-judgments.json",
+            "test-results.json",
+        ]) {
             const filePath = join(resultsDir, file);
             if (existsSync(filePath)) {
                 ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);

package/dist/orchestration/steps/gap-analysis-step.js CHANGED Viewed

@@ -151,6 +151,20 @@ export class GapAnalysisStep {
                     documents: areaToDocRefs.get(s.feature),
                 }));
             }
+            // ── Per-test results (D0029: model output + metadata) ──────
+            const testResultsPath = resolve(root, "results", "latest", "test-results.json");
+            let testResults;
+            if (existsSync(testResultsPath)) {
+                const rawTestResults = JSON.parse(readFileSync(testResultsPath, "utf-8"));
+                // Enrich with canonical docs (literacy mode only)
+                testResults = rawTestResults.map((tr) => {
+                    if (!isLiteracyMode)
+                        return tr;
+                    const baseDesc = tr.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
+                    const canonicalDocs = descToDocRefs.get(baseDesc);
+                    return canonicalDocs ? { ...tr, canonicalDocs } : tr;
+                });
+            }
             // ── Low-scoring judgments ────────────────────────────────────
             const LOW_SCORE_THRESHOLD = 70;
             const MAX_STORED_JUDGMENTS = 50;
@@ -177,6 +191,7 @@ export class GapAnalysisStep {
                 lowScoringJudgments,
                 recommendations: gapReport,
                 scores: enrichedScores,
+                ...(testResults !== undefined && { testResults }),
             };
             writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
             // Capture gap analysis artifacts

package/dist/orchestration/steps/publish-report-step.js CHANGED Viewed

@@ -113,6 +113,13 @@ export class PublishReportStep {
             tag: this.options.publishTag ?? ctx.config.publishTag,
             title,
         };
+        // Upload test output artifacts to GCS (D0030 — non-blocking, P5)
+        if (ctx.artifactUploader && summary.testResults?.length) {
+            const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
+            if (artifactRef) {
+                report.artifacts = { testOutputs: artifactRef };
+            }
+        }
         // Share reportId with downstream steps (CallbackStep + orchestrator job update)
         state.reportId = reportId;
         // Capture report object (Tier 2)
@@ -211,6 +218,30 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
         taskIds,
     };
 }
+/**
+ * Build the test-outputs.json artifact from StoredTestResult[] and upload
+ * it as a single JSON document. Entries are keyed by `{taskId}::{modelId}`
+ * to match the lookup pattern in Studio's JudgmentList component.
+ *
+ * Resolves to the uploader's result: an ArtifactRef, or null on failure (P5).
+ */
+async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
+    // Results are visited in order, so a repeated key keeps the last
+    // result, exactly as plain property assignment would.
+    const entries = Object.fromEntries(testResults.map((result) => [
+        `${result.taskId}::${result.modelId}`,
+        {
+            responseOutput: result.responseOutput,
+            responseOutputTruncated: result.responseOutputTruncated ?? false,
+        },
+    ]));
+    return uploader.upload(reportId, "test-outputs.json", {
+        version: 1,
+        reportId,
+        createdAt,
+        entries,
+    });
+}
 /**
  * Fan out a report to all configured sinks.
  *

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
-import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
+import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
 import { type ResolvedSourceConfig } from "../sources.js";
 import type { GraderJudgment, PerModelEntry } from "./types.js";
-export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
+export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
 export interface PromptfooResultsWrapper {
     results: RawTestResult[];
     stats: {
@@ -75,6 +75,16 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
  */
 export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
+/**
+ * Extract per-test results with model output from evaluation results.
+ *
+ * Mirrors extractGraderJudgments() but captures the full StoredTestResult
+ * shape including response.output (truncated), latency, and cost.
+ * @param resultsPath - Path of the evaluation results file to read.
+ * @returns One StoredTestResult per test × model combination.
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
+ */
+export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
 /**
  * Score agentic evaluation results. In agentic mode, all test entries are
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -114,6 +114,10 @@ export function extractGraderJudgments(resultsPath) {
     for (const result of results) {
         const taskId = result.description;
         const modelId = result.providerId ?? result.providerLabel ?? "unknown";
+        // Detect output failures: empty/whitespace response means the model
+        // failed to produce output (API error, token exhaustion, refusal).
+        const output = result.response?.output ?? "";
+        const isOutputFailure = !output.trim();
         for (const comp of result.gradingResult.componentResults) {
             if (comp.assertion?.type !== "llm-rubric") {
                 continue;
@@ -139,9 +143,12 @@ export function extractGraderJudgments(resultsPath) {
                     // Not JSON — use raw reason string
                 }
             }
+            // Also flag synthesized api-error judgments as output failures
+            const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
             judgments.push({
                 dimension: kind,
                 modelId,
+                ...(outputFailure && { outputFailure: true }),
                 reason,
                 score,
                 taskId,
@@ -150,6 +157,76 @@ export function extractGraderJudgments(resultsPath) {
     }
     return judgments;
 }
+/** Maximum length (in String.prototype.slice units, i.e. UTF-16 code units) kept for model response output */
+const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
+/**
+ * Build one StoredTestResult per entry returned by readAndNormalizeResults(resultsPath).
+ *
+ * Each record carries area, variant, modelId, taskId, latencyMs, cost (a falsy
+ * cost such as 0 is stored as undefined), the response output capped at
+ * MAX_RESPONSE_OUTPUT_LENGTH, and `dimensions`: { dimension, reason, score } per
+ * classified llm-rubric component. `outputFailure` and `responseOutputTruncated` appear only when true.
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
+ */
+export function extractStoredTestResults(resultsPath) {
+    const results = readAndNormalizeResults(resultsPath);
+    const testResults = [];
+    for (const result of results) {
+        const taskId = result.description;
+        const modelId = result.providerId ?? result.providerLabel ?? "unknown";
+        const area = result.vars.__featureArea || detectFeatureArea(result.description);
+        // Non-blank `docs` var => "gold", otherwise "baseline" (same logic as scoreResults)
+        const docs = result.vars.docs ?? "";
+        const variant = docs.trim().length > 0 ? "gold" : "baseline";
+        // Empty/whitespace-only output is an output failure; unlike extractGraderJudgments, "[api-error]" reasons are not checked here
+        const output = result.response?.output ?? "";
+        const isOutputFailure = !output.trim();
+        // Truncate response output. NOTE(review): .trim()/.slice() assume output is a string — confirm
+        const responseOutput = output.slice(0, MAX_RESPONSE_OUTPUT_LENGTH);
+        const responseOutputTruncated = output.length > MAX_RESPONSE_OUTPUT_LENGTH;
+        // Collect score and reason for each llm-rubric component that classifyRubric maps to a dimension
+        const dimensions = [];
+        for (const comp of result.gradingResult.componentResults) {
+            if (comp.assertion?.type !== "llm-rubric") {
+                continue;
+            }
+            const dimension = classifyRubric(comp);
+            if (!dimension) {
+                continue;
+            }
+            const score = parseRubricScore(comp);
+            // If the reason parses as JSON with a string `reason` field, use that field (same parsing as extractGraderJudgments)
+            let reason = comp.reason ?? "";
+            if (reason) {
+                try {
+                    const parsed = JSON.parse(reason);
+                    const obj = parsed;
+                    if (typeof obj.reason === "string") {
+                        ;
+                        ({ reason } = obj);
+                    }
+                }
+                catch {
+                    // Not JSON (or JSON that parsed to null) — keep the raw reason string
+                }
+            }
+            dimensions.push({ dimension, reason, score });
+        }
+        testResults.push({
+            area,
+            cost: result.cost || undefined,
+            dimensions,
+            latencyMs: result.latencyMs,
+            modelId,
+            ...(isOutputFailure && { outputFailure: true }),
+            responseOutput,
+            ...(responseOutputTruncated && { responseOutputTruncated: true }),
+            taskId,
+            variant,
+        });
+    }
+    return testResults;
+}
 /**
  * Finds the URL-extraction assertion result in a test's componentResults
  * and parses the structured JSON from its `reason` field.
@@ -463,6 +540,7 @@ function readAndNormalizeResults(resultsPath, log) {
         const base = {
             cost: r.cost ?? 0,
             description: r.testCase?.description ?? "unknown",
+            latencyMs: r.latencyMs,
             metadata: r.metadata,
             providerId: r.provider?.id,
             providerLabel: r.provider?.label,
@@ -793,6 +871,12 @@ export function calculateAndWriteScores(options) {
             writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
+        // Extract and persist per-test results (D0029: model output + metadata)
+        const testResults = extractStoredTestResults(baselineResultsPath);
+        if (testResults.length > 0) {
+            writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
+            log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
+        }
         const testSummary = computeTestSummary(baselineResultsPath);
         return { belowCritical: summary.belowCritical, testSummary };
     }
@@ -897,6 +981,17 @@ export function calculateAndWriteScores(options) {
         writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
         log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
     }
+    // Extract and persist per-test results (D0029: model output + metadata)
+    const testResults = extractStoredTestResults(baselineResultsPath);
+    // In full mode, also extract test results from agentic results
+    if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
+        const agenticTestResults = extractStoredTestResults(agenticResultsPath);
+        testResults.push(...agenticTestResults);
+    }
+    if (testResults.length > 0) {
+        writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
+        log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
+    }
     // Compute test summary from the raw results file
     const testSummary = computeTestSummary(baselineResultsPath);
     return { belowCritical: summary.belowCritical, testSummary };

package/dist/report-store.js CHANGED Viewed

@@ -193,15 +193,27 @@ export class ReportStore {
      */
     async write(report) {
         try {
+            // Strip baseline and experiment ScoreSummary objects from comparison
+            // before persisting — they duplicate report.summary (experiment) and
+            // are fetchable by ID via provenance.lineage.comparedAgainst (baseline).
+            // This reduces document size by ~50-65% for full-mode reports.
+            const comparison = report.comparison
+                ? stripComparisonBulk(report.comparison)
+                : null;
             await this.client.create({
                 _id: `report-${report.id}`,
                 _type: REPORT_TYPE,
-                comparison: report.comparison ?? null,
+                comparison,
                 completedAt: report.completedAt,
                 durationMs: report.durationMs,
                 provenance: report.provenance,
                 reportId: report.id,
-                summary: report.summary,
+                summary: {
+                    ...report.summary,
+                    // Artifact references live inside summary in Sanity so they're
+                    // projected automatically by the reportDetailQuery (D0030)
+                    ...(report.artifacts ? { artifacts: report.artifacts } : {}),
+                },
                 tag: report.tag ?? null,
                 title: report.title ?? null,
             });
@@ -283,3 +295,17 @@ function toReport(doc) {
         title: doc.title,
     };
 }
+/**
+ * Return a shallow copy of a ComparisonReport without its `baseline` and
+ * `experiment` properties; the input object is not mutated.
+ *
+ * Per the write() call site these fields are redundant in the stored document:
+ * - `experiment` duplicates `report.summary`
+ * - `baseline` is fetchable via `provenance.lineage.comparedAgainst`
+ *
+ * Every other own enumerable property is carried over unchanged.
+ */
+function stripComparisonBulk(comparison) {
+    const { baseline: _, experiment: __, ...slim } = comparison;
+    return slim;
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "2.3.2",
+  "version": "2.4.0",
   "private": false,
   "publishConfig": {
     "access": "public"
@@ -33,6 +33,7 @@
   ],
   "dependencies": {
     "@google-cloud/bigquery": "^8.1.1",
+    "@google-cloud/storage": "^7.19.0",
     "@inquirer/prompts": "^8.3.0",
     "@modelcontextprotocol/sdk": "^1.29.0",
     "@portabletext/markdown": "^1.0.0",