npm - @sanity/ailf - Versions diffs - 2.7.0 → 2.8.0 - Mend

@sanity/ailf 2.7.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

package/dist/_vendor/ailf-core/artifact-registry.d.ts +72 -0
package/dist/_vendor/ailf-core/artifact-registry.js +150 -0
package/dist/_vendor/ailf-core/examples/index.d.ts +1 -1
package/dist/_vendor/ailf-core/examples/index.js +1 -1
package/dist/_vendor/ailf-core/index.d.ts +2 -1
package/dist/_vendor/ailf-core/index.js +2 -1
package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +56 -0
package/dist/_vendor/ailf-core/ports/artifact-writer.js +28 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +13 -3
package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
package/dist/_vendor/ailf-core/ports/index.js +1 -1
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +9 -0
package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
package/dist/_vendor/ailf-core/types/index.d.ts +117 -70
package/dist/_vendor/ailf-core/types/index.js +1 -1
package/dist/_vendor/ailf-shared/index.d.ts +2 -0
package/dist/_vendor/ailf-shared/index.js +2 -0
package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
package/dist/_vendor/ailf-shared/run-context.js +17 -0
package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +39 -0
package/dist/artifact-capture/api-gateway-artifact-writer.js +148 -0
package/dist/artifact-capture/gcs-artifact-writer.d.ts +30 -0
package/dist/artifact-capture/gcs-artifact-writer.js +119 -0
package/dist/commands/init.js +2 -6
package/dist/commands/publish.js +3 -2
package/dist/composition-root.d.ts +3 -3
package/dist/composition-root.js +20 -15
package/dist/orchestration/build-step-sequence.js +6 -1
package/dist/orchestration/steps/calculate-scores-step.js +42 -2
package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
package/dist/orchestration/steps/finalize-run-step.js +103 -0
package/dist/orchestration/steps/publish-report-step.js +25 -27
package/dist/pipeline/calculate-scores.js +13 -2
package/dist/pipeline/provenance.d.ts +24 -44
package/dist/pipeline/provenance.js +17 -165
package/dist/pipeline/report-title.d.ts +2 -2
package/dist/pipeline/run-context.d.ts +57 -0
package/dist/pipeline/run-context.js +156 -0
package/dist/pipeline/upload-test-outputs.d.ts +26 -0
package/dist/pipeline/upload-test-outputs.js +34 -0
package/dist/report-store.js +4 -2
package/package.json +1 -1
package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66

package/dist/artifact-capture/gcs-artifact-writer.js ADDED Viewed

@@ -0,0 +1,119 @@
+/**
+ * GcsArtifactWriter — writes AILF run artifacts + manifest directly to GCS.
+ *
+ * Uses Application Default Credentials (ADC). Used when the CLI runs in CI or
+ * anywhere ADC is configured — the client talks to GCS without the API Gateway
+ * acting as a middleman.
+ *
+ * Paths come from `ARTIFACT_REGISTRY` so writers, signers, and readers agree.
+ *
+ * Design principles:
+ * - P5: Non-blocking — upload failure returns null, never throws.
+ * - Lazy client — Storage created on first write.
+ *
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
+ */
+import { Storage } from "@google-cloud/storage";
+import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
+/**
+ * Artifact writer that stores run artifacts and the run manifest as JSON
+ * objects in a GCS bucket.
+ *
+ * Each write resolves to an artifact reference on success and to `null` on
+ * failure. Serialization, client-creation and upload errors are caught,
+ * reported with `console.warn`, and turned into a `null` result instead of
+ * being rethrown, so a failed upload does not abort the caller (P5).
+ */
+export class GcsArtifactWriter {
+    /** Storage client; stays `null` until the first write needs it. */
+    client = null;
+    /** Writer options; this class only reads `options.bucket`. */
+    options;
+    /**
+     * @param options - Writer options; `options.bucket` names the target bucket.
+     */
+    constructor(options) {
+        this.options = options;
+    }
+    /**
+     * Write a whole artifact as a single JSON object.
+     *
+     * @param type - Key into `ARTIFACT_REGISTRY` selecting the descriptor.
+     * @param runId - Run identifier handed to the descriptor's `objectPath`.
+     * @param data - Payload to serialize with `JSON.stringify`.
+     * @returns The artifact reference, or `null` when the write failed.
+     */
+    async writeBulk(type, runId, data) {
+        const descriptor = ARTIFACT_REGISTRY[type];
+        const path = descriptor.objectPath(runId);
+        return this.putJson(path, data, {
+            layout: "bulk",
+            entryCount: entryCountOf(data),
+        });
+    }
+    /**
+     * Write one JSON object per entry.
+     *
+     * Entries whose key is rejected by the descriptor's `parseEntryKey`, and
+     * entries whose serialization or upload fails, are skipped with a warning;
+     * the remaining entries are still written.
+     *
+     * @param type - Key into `ARTIFACT_REGISTRY` selecting the descriptor.
+     * @param runId - Run identifier handed to the descriptor's `objectPath`.
+     * @param entries - Iterable of `{ key, data }` records.
+     * @returns A per-entry artifact reference listing the uploaded keys and
+     *   their byte sizes, or `null` when the descriptor has no `parseEntryKey`
+     *   or no entry was uploaded.
+     */
+    async writePerEntry(type, runId, entries) {
+        const descriptor = ARTIFACT_REGISTRY[type];
+        if (!descriptor.parseEntryKey) {
+            console.warn(`  ⚠️  writePerEntry called for "${type}" but the registry has no parseEntryKey`);
+            return null;
+        }
+        const uploaded = [];
+        let totalBytes = 0;
+        for (const entry of entries) {
+            const parsed = descriptor.parseEntryKey(entry.key);
+            if (!parsed.ok) {
+                console.warn(`  ⚠️  Skipping entry with invalid key "${entry.key}": ${parsed.reason}`);
+                continue;
+            }
+            const path = descriptor.objectPath(runId, entry.key);
+            try {
+                // Serialization and client creation live inside the try so a
+                // payload JSON.stringify rejects (cycle, BigInt, undefined) or
+                // a Storage construction error only costs this entry.
+                const json = JSON.stringify(entry.data);
+                const bytes = Buffer.byteLength(json, "utf-8");
+                await this.getClient()
+                    .bucket(this.options.bucket)
+                    .file(path)
+                    .save(json, { contentType: "application/json" });
+                uploaded.push({ key: entry.key, bytes });
+                totalBytes += bytes;
+            }
+            catch (err) {
+                const message = err instanceof Error ? err.message : String(err);
+                console.warn(`  ⚠️  Artifact entry upload failed (non-blocking): ${path} — ${message}`);
+            }
+        }
+        if (uploaded.length === 0)
+            return null;
+        return {
+            store: "gcs",
+            bucket: this.options.bucket,
+            path: `runs/${runId}/${descriptor.slug}`,
+            bytes: totalBytes,
+            entryCount: uploaded.length,
+            layout: "per-entry",
+            entries: uploaded,
+        };
+    }
+    /**
+     * Write the run manifest to `runs/{runId}/manifest.json`.
+     *
+     * @param runId - Run identifier used to build the object path.
+     * @param manifest - Manifest payload to serialize.
+     * @returns The artifact reference, or `null` when the write failed.
+     */
+    async writeManifest(runId, manifest) {
+        const path = `runs/${runId}/manifest.json`;
+        return this.putJson(path, manifest, { layout: "bulk" });
+    }
+    /**
+     * Serialize `data` and save it at `path` in the configured bucket.
+     *
+     * @param path - Object path inside the bucket.
+     * @param data - Payload to serialize with `JSON.stringify`.
+     * @param meta - `layout` and optional `entryCount` copied into the reference.
+     * @returns The artifact reference, or `null` after logging a warning.
+     */
+    async putJson(path, data, meta) {
+        try {
+            // Stringify inside the try: an unserializable payload must end in
+            // the same warning + null result as a failed upload.
+            const json = JSON.stringify(data);
+            const bytes = Buffer.byteLength(json, "utf-8");
+            const storage = this.getClient();
+            await storage
+                .bucket(this.options.bucket)
+                .file(path)
+                .save(json, { contentType: "application/json" });
+            return {
+                store: "gcs",
+                bucket: this.options.bucket,
+                path,
+                bytes,
+                entryCount: meta.entryCount,
+                layout: meta.layout,
+            };
+        }
+        catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            console.warn(`  ⚠️  Artifact upload failed (non-blocking): ${path} — ${message}`);
+            return null;
+        }
+    }
+    /**
+     * Return the cached Storage client, creating it on first use.
+     */
+    getClient() {
+        if (this.client)
+            return this.client;
+        this.client = new Storage();
+        return this.client;
+    }
+}
+/**
+ * Count the keys of `data.entries` so bulk artifact references can report
+ * how many entries they hold.
+ *
+ * @param data - Arbitrary payload about to be written.
+ * @returns The number of own enumerable keys of `data.entries` when it is a
+ *   non-null object (arrays included), otherwise `undefined`.
+ */
+function entryCountOf(data) {
+    if (typeof data !== "object" || data === null)
+        return undefined;
+    const entries = data.entries;
+    // `typeof null` is "object", so null must be excluded explicitly —
+    // Object.keys(null) throws a TypeError.
+    if (typeof entries !== "object" || entries === null)
+        return undefined;
+    return Object.keys(entries).length;
+}

package/dist/commands/init.js CHANGED Viewed

@@ -250,10 +250,9 @@ async function runInit(opts) {
     console.log(`  1. Edit the example tasks in ${rel(targetDir, tasksDir)}/ — update`);
     console.log("     slugs and prompts for your documentation");
     console.log(`  2. Validate locally: npx @sanity/ailf@latest validate-tasks .ailf/tasks/`);
-    console.log("  3. Add two GitHub Actions secrets");
+    console.log("  3. Add a GitHub Actions secret");
     console.log("     (Settings → Secrets and variables → Actions):");
     console.log("     • AILF_API_KEY — your API key (starts with ailf_live_sk_)");
-    console.log("     • NPM_TOKEN   — npm token with read access to @sanity scope");
     console.log("  4. Push — the workflow at .github/workflows/ailf-eval.yml runs");
     console.log("     automatically on PRs");
     if (format === "ts") {
@@ -262,14 +261,11 @@ async function runInit(opts) {
         console.log("     via defineTask() from @sanity/ailf-core.");
     }
     console.log();
-    console.log("  🔑 Retrieve secrets from 1Password (Sanity employees):");
+    console.log("  🔑 Retrieve the API key from 1Password (Sanity employees):");
     console.log();
     console.log("     # Shared dev API key (for local testing and CI)");
     console.log('     op read "op://Shared/AI Literacy Framework - Shared API Tokens/AILF_API_KEY_DEV"');
     console.log();
-    console.log("     # npm token (read access to @sanity scope)");
-    console.log('     op read "op://Shared/AI Literacy Framework - Shared API Tokens/NPM_TOKEN"');
-    console.log();
     console.log("     Not a Sanity employee? Request an API key from the AILF team.");
     console.log();
     console.log("  💡 Test locally before pushing:");

package/dist/commands/publish.js CHANGED Viewed

@@ -55,7 +55,7 @@ export function createPublishCommand() {
  * the summary metadata and environment. Some fields (contextHash,
  * promptfooUrl) are not available for manual publishes.
  */
-function buildProvenanceFromSummary(summary) {
+function buildProvenanceFromSummary(summary, runId) {
     const areas = summary.scores.map((s) => s.feature);
     const mode = (process.env.EVAL_MODE ?? "literacy");
     const source = {
@@ -76,6 +76,7 @@ function buildProvenanceFromSummary(summary) {
         areas,
         mode,
         rootDir: ROOT,
+        runId,
         source,
     };
 }
@@ -145,7 +146,7 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
     // -----------------------------------------------------------------------
     // 2. Build provenance
     // -----------------------------------------------------------------------
-    const provenanceInput = buildProvenanceFromSummary(summary);
+    const provenanceInput = buildProvenanceFromSummary(summary, ctx.runId);
     const provenance = buildProvenance(provenanceInput);
     // -----------------------------------------------------------------------
     // 3. Create report

package/dist/composition-root.d.ts CHANGED Viewed

@@ -15,7 +15,7 @@
  * @see packages/core/src/ports/context.ts — AppContext interface
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
  */
-import { type AppContext, type ArtifactUploader, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
+import { type AppContext, type ArtifactWriter, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
 /**
  * Create a fully wired AppContext from resolved configuration.
  *
@@ -24,7 +24,7 @@ import { type AppContext, type ArtifactUploader, type AssertionRegistration, typ
  */
 export declare function createAppContext(config: ResolvedConfig): AppContext;
 /**
- * Selects an ArtifactUploader implementation based on available credentials.
+ * Selects an ArtifactWriter implementation based on available credentials.
  *
  * Selection order:
  *   1. config.artifactUpload === false → always skip (explicit opt-out)
@@ -38,7 +38,7 @@ export declare function createAppContext(config: ResolvedConfig): AppContext;
  *
  * Exported for unit-test access; not part of the public package API.
  */
-export declare function createArtifactUploader(config: ResolvedConfig, logger: Logger): ArtifactUploader | undefined;
+export declare function createArtifactWriter(config: ResolvedConfig, logger: Logger): ArtifactWriter | undefined;
 /**
  * Generic Promptfoo assertion types available to all evaluation modes.
  *

package/dist/composition-root.js CHANGED Viewed

@@ -16,11 +16,11 @@
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
  */
 import { join } from "node:path";
-import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
-import { ApiGatewayArtifactUploader } from "./artifact-capture/api-gateway-artifact-uploader.js";
+import { InMemoryPluginRegistry, NoOpArtifactCollector, generateRunId, } from "./_vendor/ailf-core/index.js";
+import { ApiGatewayArtifactWriter } from "./artifact-capture/api-gateway-artifact-writer.js";
 import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
 import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
-import { GcsReportArtifactUploader } from "./artifact-capture/gcs-report-artifact-uploader.js";
+import { GcsArtifactWriter } from "./artifact-capture/gcs-artifact-writer.js";
 import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
 import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
 import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
@@ -82,13 +82,17 @@ export function createAppContext(config) {
             })
             : fsCollector;
     }
-    // Report artifact uploader — uploads structured files to GCS at known
-    // paths for Studio to fetch via signed URLs (D0030). Auto-detects the
-    // right adapter from available credentials; defaults bucket to
-    // "ailf-artifacts". Set artifactUpload: false to opt out entirely.
-    const artifactUploader = createArtifactUploader(config, logger);
+    // Artifact writer — writes run artifacts + manifest to GCS at known
+    // `runs/{runId}/…` paths (D0032). Auto-detects the right adapter from
+    // available credentials; defaults bucket to "ailf-artifacts". Set
+    // artifactUpload: false to opt out entirely.
+    const artifactWriter = createArtifactWriter(config, logger);
+    // Generate the pipeline's RunId once; every downstream step reads it
+    // from the context (D0032).
+    const runId = generateRunId();
+    logger.debug(`Pipeline runId: ${runId}`);
     return {
-        artifactUploader,
+        artifactWriter,
         cache,
         collector,
         config,
@@ -97,6 +101,7 @@ export function createAppContext(config) {
         logger,
         registry,
         reportStore,
+        runId,
         sinks,
         taskSource,
     };
@@ -124,7 +129,7 @@ function createLogger() {
  */
 const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
 /**
- * Selects an ArtifactUploader implementation based on available credentials.
+ * Selects an ArtifactWriter implementation based on available credentials.
  *
  * Selection order:
  *   1. config.artifactUpload === false → always skip (explicit opt-out)
@@ -138,7 +143,7 @@ const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
  *
  * Exported for unit-test access; not part of the public package API.
  */
-export function createArtifactUploader(config, logger) {
+export function createArtifactWriter(config, logger) {
     if (config.artifactUpload === false) {
         logger.debug("Artifact upload explicitly disabled via artifactUpload=false");
         return undefined;
@@ -148,13 +153,13 @@ export function createArtifactUploader(config, logger) {
     // We treat the presence of either env var as the user opting in to ADC.
     const hasGcsCredentials = Boolean(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GCLOUD_PROJECT);
     if (hasGcsCredentials) {
-        logger.debug(`Artifact uploader: GcsReportArtifactUploader (direct GCS via ADC, bucket=${bucket})`);
-        return new GcsReportArtifactUploader({ bucket });
+        logger.debug(`Artifact writer: GcsArtifactWriter (direct GCS via ADC, bucket=${bucket})`);
+        return new GcsArtifactWriter({ bucket });
     }
     // Local dev — request signed PUT URLs from the API gateway, no GCS creds needed.
     if (config.apiKey && config.apiUrl) {
-        logger.debug(`Artifact uploader: ApiGatewayArtifactUploader (signed URL via ${config.apiUrl}, bucket=${bucket})`);
-        return new ApiGatewayArtifactUploader({
+        logger.debug(`Artifact writer: ApiGatewayArtifactWriter (signed URL via ${config.apiUrl}, bucket=${bucket})`);
+        return new ApiGatewayArtifactWriter({
             apiBaseUrl: config.apiUrl,
             apiKey: config.apiKey,
             bucket,

package/dist/orchestration/build-step-sequence.js CHANGED Viewed

@@ -11,6 +11,7 @@ import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
 import { CompareStep } from "./steps/compare-step.js";
 import { DiscoveryReportStep } from "./steps/discovery-report-step.js";
 import { FetchDocsStep } from "./steps/fetch-docs-step.js";
+import { FinalizeRunStep } from "./steps/finalize-run-step.js";
 import { GapAnalysisStep } from "./steps/gap-analysis-step.js";
 import { GenerateConfigsStep } from "./steps/generate-configs-step.js";
 import { GraderConsistencyStep } from "./steps/grader-consistency-step.js";
@@ -76,7 +77,11 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
     if (config.gapAnalysisEnabled) {
         steps.push(new GapAnalysisStep());
     }
-    // Step 4b: Publish report (optional, when token is configured)
+    // Step 4c: Finalize the run — write `runs/{runId}/manifest.json` with the
+    // catalog of artifacts produced so far. Skipped silently when no
+    // artifactWriter is wired (D0032).
+    steps.push(new FinalizeRunStep(pipelineStart));
+    // Step 4d: Publish report (optional, when token is configured)
     if (config.publishEnabled) {
         steps.push(new PublishReportStep(pipelineStart, {
             publishTag: config.publishTag,

package/dist/orchestration/steps/calculate-scores-step.js CHANGED Viewed

@@ -4,8 +4,8 @@
  * Calls calculateAndWriteScores() from pipeline/calculate-scores.ts with
  * typed options derived from AppContext. No env bridge needed.
  */
-import { existsSync } from "node:fs";
-import { join } from "path";
+import { existsSync, readFileSync } from "node:fs";
+import { join, resolve } from "path";
 import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
 import { getStepInputPaths } from "../../pipeline/cache.js";
 import { buildCacheContext } from "../cache-context.js";
@@ -13,6 +13,7 @@ import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
 import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
 import { resultsFileForMode } from "../../pipeline/eval-constants.js";
 import { loadSource } from "../../sources.js";
+import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
 import { configToSourceOverrides } from "../config-to-source-overrides.js";
 export class CalculateScoresStep {
     name = "calculate-scores";
@@ -132,6 +133,27 @@ export class CalculateScoresStep {
                 ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
             }
         }
+        // Upload testOutputs to GCS (D0032 — non-blocking, P5).
+        // Read from test-results.json rather than score-summary.json: the
+        // gap-analysis step (downstream) is the one that enriches score-summary
+        // with testResults, so at this point the summary still has an empty
+        // testResults[]. test-results.json is written by calculateAndWriteScores
+        // above and carries the full per-test shape we need for per-entry upload.
+        // The full responseOutput lives in the GCS artifact; PublishReportStep
+        // later strips it from the inline Content Lake document when this
+        // upload succeeds.
+        if (ctx.artifactWriter) {
+            const testResults = tryReadTestResults(ctx.config.rootDir);
+            if (testResults?.length) {
+                const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults);
+                if (artifactRef) {
+                    state.artifactRefs = {
+                        ...state.artifactRefs,
+                        testOutputs: artifactRef,
+                    };
+                }
+            }
+        }
         const criticalSuffix = belowCritical.length > 0
             ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
             : "";
@@ -148,3 +170,21 @@ export class CalculateScoresStep {
         return buildCacheContext(ctx.config);
     }
 }
+/**
+ * Read the per-test result set written by `calculateAndWriteScores`.
+ *
+ * This is the authoritative source for `uploadTestOutputs` at the time
+ * CalculateScoresStep runs — `score-summary.json` doesn't carry
+ * `testResults[]` until `gap-analysis-step` enriches it downstream.
+ *
+ * @param rootDir - Directory containing `results/latest/test-results.json`.
+ * @returns The parsed array, or `undefined` when the file is missing,
+ *   unreadable, not valid JSON, or does not contain a JSON array.
+ */
+function tryReadTestResults(rootDir) {
+    const path = resolve(rootDir, "results", "latest", "test-results.json");
+    if (!existsSync(path))
+        return undefined;
+    try {
+        const parsed = JSON.parse(readFileSync(path, "utf-8"));
+        // The caller only checks `.length` before uploading, so a JSON string
+        // (or any other non-array with a length) must not be passed through.
+        return Array.isArray(parsed) ? parsed : undefined;
+    }
+    catch {
+        return undefined;
+    }
+}

package/dist/orchestration/steps/finalize-run-step.d.ts ADDED Viewed

@@ -0,0 +1,29 @@
+/**
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
+ *
+ * Inserts between `GapAnalysis` and `PublishReport`. Assembles a
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
+ * of truth for artifact locations; `PublishReportStep` snapshots the
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
+ *
+ * Design principles:
+ * - Single writer — one `writeManifest()` call per pipeline run.
+ * - Repeatable — retries rebuild the manifest from the same inputs; only
+ *   `createdAt` and `durationMs` differ, because they are read from the clock.
+ * - Skipped when no writer is wired (local/air-gapped runs stay functional).
+ *
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
+ */
+import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+/**
+ * Type declaration for the `finalize-run` pipeline step.
+ */
+export declare class FinalizeRunStep implements PipelineStep {
+    /** Timestamp the implementation subtracts from `Date.now()` for the manifest's `durationMs`. */
+    private readonly pipelineStart;
+    /** Constructor options; `evalFingerprint` is the fallback when the pipeline state carries none. */
+    private readonly options;
+    /** Step identifier. */
+    readonly name = "finalize-run";
+    /** Declared optional. */
+    readonly optional = true;
+    /**
+     * @param pipelineStart - Pipeline start time in epoch milliseconds.
+     * @param options - Optional settings; defaults to an empty object in the implementation.
+     */
+    constructor(pipelineStart: number, options?: {
+        evalFingerprint?: string;
+    });
+    /** Pre-flight validation; the implementation returns an empty list. */
+    check(): ValidationIssue[];
+    /**
+     * Builds the run manifest and passes it to `ctx.artifactWriter.writeManifest`.
+     * The implementation resolves to a "skipped" result when no writer is present.
+     */
+    execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
+}

package/dist/orchestration/steps/finalize-run-step.js ADDED Viewed

@@ -0,0 +1,103 @@
+/**
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
+ *
+ * Inserts between `GapAnalysis` and `PublishReport`. Assembles a
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
+ * of truth for artifact locations; `PublishReportStep` snapshots the
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
+ *
+ * Design principles:
+ * - Single writer — one `writeManifest()` call per pipeline run.
+ * - Repeatable — retries rebuild the manifest from the same inputs; only
+ *   `createdAt` and `durationMs` differ, because they are read from the clock.
+ * - Skipped when no writer is wired (local/air-gapped runs stay functional).
+ *
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
+ */
+import { existsSync, readFileSync } from "node:fs";
+import { resolve } from "node:path";
+import { buildRunContext } from "../../pipeline/run-context.js";
+import { loadSource } from "../../sources.js";
+import { configToSourceOverrides } from "../config-to-source-overrides.js";
+/**
+ * Pipeline step that assembles the run manifest and hands it to the
+ * artifact writer found on the app context.
+ */
+export class FinalizeRunStep {
+    pipelineStart;
+    options;
+    name = "finalize-run";
+    optional = true;
+    /**
+     * @param pipelineStart - Epoch-millisecond timestamp used to compute the
+     *   manifest's `durationMs`.
+     * @param options - Optional settings; `evalFingerprint` is used when the
+     *   pipeline state does not provide one.
+     */
+    constructor(pipelineStart, options = {}) {
+        this.options = options;
+        this.pipelineStart = pipelineStart;
+    }
+    /** This step has no pre-flight validation. */
+    check() {
+        return [];
+    }
+    /**
+     * Build the manifest, write it through `ctx.artifactWriter`, and store it
+     * on `state.runManifest` regardless of whether the write succeeded.
+     *
+     * @param ctx - App context; reads `artifactWriter`, `config`, `logger`, `runId`.
+     * @param state - Pipeline state; reads `evalFingerprint`, `testSummary`,
+     *   `promptfooUrls`, `artifactRefs` and receives `runManifest`.
+     * @returns A "skipped" result without a writer, otherwise a "success"
+     *   result whose summary says whether the manifest was persisted.
+     */
+    async execute(ctx, state) {
+        const startedAt = Date.now();
+        const writer = ctx.artifactWriter;
+        if (!writer) {
+            return {
+                status: "skipped",
+                reason: "No artifactWriter wired — manifest is only written when a writer is available",
+            };
+        }
+        const { config } = ctx;
+        // Same source resolution buildProvenance relies on.
+        const sourceOverrides = configToSourceOverrides(config);
+        const source = loadSource(config.source, sourceOverrides);
+        // The on-disk summary is optional: when it is absent the areas fall
+        // back to the configured ones so the manifest can still be written.
+        const summary = tryReadScoreSummary(config.rootDir);
+        const summaryAreas = summary?.scores?.map((score) => score.feature);
+        const context = buildRunContext({
+            areas: summaryAreas ?? config.areas ?? [],
+            callerGit: config.callerGit,
+            evalFingerprint: state.evalFingerprint ?? this.options.evalFingerprint,
+            logger: ctx.logger,
+            mode: config.mode,
+            rootDir: config.rootDir,
+            source,
+            taskIds: config.tasks,
+        });
+        let outcomes;
+        if (state.testSummary) {
+            outcomes = { testSummary: state.testSummary };
+        }
+        const manifest = {
+            version: 1,
+            runId: ctx.runId,
+            createdAt: new Date().toISOString(),
+            durationMs: Date.now() - this.pipelineStart,
+            status: "completed",
+            context,
+            outcomes,
+            promptfooUrls: state.promptfooUrls,
+            artifacts: state.artifactRefs ?? {},
+        };
+        const ref = await writer.writeManifest(ctx.runId, manifest);
+        // Publish snapshots `artifacts` from state, so the manifest is kept
+        // even when the writer could not persist it.
+        state.runManifest = manifest;
+        if (!ref) {
+            // Non-blocking: the writer already logged the warning.
+            return {
+                durationMs: Date.now() - startedAt,
+                status: "success",
+                summary: "Run manifest computed (GCS write failed — non-blocking)",
+            };
+        }
+        const artifactCount = Object.keys(manifest.artifacts).length;
+        const plural = artifactCount === 1 ? "" : "s";
+        return {
+            durationMs: Date.now() - startedAt,
+            status: "success",
+            summary: `Run manifest written to ${ref.path} (${artifactCount} artifact ref${plural})`,
+        };
+    }
+}
+/**
+ * Best-effort read of `results/latest/score-summary.json` under `rootDir`.
+ *
+ * @param rootDir - Directory that contains the `results/latest` folder.
+ * @returns The parsed JSON value, or `undefined` when the file is missing,
+ *   unreadable, or not valid JSON.
+ */
+function tryReadScoreSummary(rootDir) {
+    const summaryFile = resolve(rootDir, "results", "latest", "score-summary.json");
+    let summary;
+    if (existsSync(summaryFile)) {
+        try {
+            summary = JSON.parse(readFileSync(summaryFile, "utf-8"));
+        }
+        catch {
+            // A broken summary is treated the same as a missing one.
+            summary = undefined;
+        }
+    }
+    return summary;
+}

package/dist/orchestration/steps/publish-report-step.js CHANGED Viewed

@@ -113,12 +113,24 @@ export class PublishReportStep {
             tag: this.options.publishTag ?? ctx.config.publishTag,
             title,
         };
-        // Upload test output artifacts to GCS (D0030 — non-blocking, P5)
-        if (ctx.artifactUploader && summary.testResults?.length) {
-            const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
-            if (artifactRef) {
-                report.artifacts = { testOutputs: artifactRef };
-            }
+        // Snapshot the artifact manifest from FinalizeRunStep's output (D0032).
+        // The source of truth is `runs/{runId}/manifest.json` in GCS; the report
+        // carries a denormalized copy so Studio can render drill-down state
+        // without an extra GCS fetch.
+        const artifactManifest = state.runManifest?.artifacts;
+        if (artifactManifest && Object.keys(artifactManifest).length > 0) {
+            report.artifactManifest = artifactManifest;
+        }
+        // When testOutputs was uploaded to GCS, strip responseOutput from the
+        // inline testResults[] so the Content Lake document stays slim — the
+        // full output lives in the GCS artifact. When no testOutputs artifact
+        // exists, leave the inline shape intact so Studio's drill-down UI
+        // falls back to it.
+        if (artifactManifest?.testOutputs && summary.testResults?.length) {
+            report.summary = {
+                ...summary,
+                testResults: summary.testResults.map(slimTestResult),
+            };
         }
         // Share reportId with downstream steps (CallbackStep + orchestrator job update)
         state.reportId = reportId;
@@ -212,6 +224,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
         mode,
         promptfooUrls: options.promptfooUrls,
         rootDir: ctx.config.rootDir,
+        runId: ctx.runId,
         sanityDocumentIds,
         source,
         sourceReportId: ctx.config.sourceReportId,
@@ -219,28 +232,13 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
     };
 }
 /**
- * Extract test outputs from StoredTestResult[] and upload as a single
- * JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
- * to match the lookup pattern in Studio's JudgmentList component.
- *
- * Non-blocking: returns null if upload fails (P5).
+ * Strip the large responseOutput fields from a StoredTestResult so the
+ * remaining object is safe to inline in the Content Lake document (D0030).
+ * The full output lives in the GCS artifact uploaded by uploadTestOutputs.
  */
-async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
-    const entries = {};
-    for (const tr of testResults) {
-        const key = `${tr.taskId}::${tr.modelId}`;
-        entries[key] = {
-            responseOutput: tr.responseOutput,
-            responseOutputTruncated: tr.responseOutputTruncated ?? false,
-        };
-    }
-    const artifact = {
-        version: 1,
-        reportId,
-        createdAt,
-        entries,
-    };
-    return uploader.upload(reportId, "test-outputs.json", artifact);
+function slimTestResult(tr) {
+    // Rest-destructuring copies every own enumerable property except the two
+    // response-output fields; the caller's object is left untouched.
+    const { responseOutput: _droppedOutput, responseOutputTruncated: _droppedFlag, ...slim } = tr;
+    return slim;
 }
 /**
  * Fan out a report to all configured sinks.

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -157,8 +157,19 @@ export function extractGraderJudgments(resultsPath) {
     }
     return judgments;
 }
-/** Maximum characters to store for model response output */
-const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
+/**
+ * Maximum characters (JS string length, not bytes) to store for model
+ * response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
+ * pathological multi-byte UTF-8 could encode to ~4 MB, still well within
+ * per-entry GCS object limits.
+ *
+ * Raised from 8 000 to 1 000 000 in W0048 because the per-entry artifact
+ * layout (D0032) makes the cap irrelevant to Studio's fetch cost — each
+ * entry is fetched independently on click, so a larger ceiling only costs
+ * GCS bytes, not main-thread blocking or baseline report payload.
+ * `responseOutputTruncated` still flips for the extreme tail.
+ */
+const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
 /**
  * Extract per-test results with model output from evaluation results.
  *