npm - @sanity/ailf - Versions diffs - 2.7.1 → 2.9.0 - Mend

@sanity/ailf 2.7.1 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (92) hide show

package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
package/dist/_vendor/ailf-core/artifact-registry.d.ts +173 -0
package/dist/_vendor/ailf-core/artifact-registry.js +811 -0
package/dist/_vendor/ailf-core/index.d.ts +3 -1
package/dist/_vendor/ailf-core/index.js +3 -1
package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +95 -0
package/dist/_vendor/ailf-core/ports/artifact-writer.js +51 -0
package/dist/_vendor/ailf-core/ports/context.d.ts +32 -3
package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
package/dist/_vendor/ailf-core/ports/index.js +1 -1
package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
package/dist/_vendor/ailf-core/services/index.js +1 -0
package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
package/dist/_vendor/ailf-core/types/branded-ids.d.ts +42 -0
package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
package/dist/_vendor/ailf-core/types/index.d.ts +298 -77
package/dist/_vendor/ailf-core/types/index.js +1 -1
package/dist/_vendor/ailf-shared/index.d.ts +2 -0
package/dist/_vendor/ailf-shared/index.js +2 -0
package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
package/dist/_vendor/ailf-shared/run-context.js +17 -0
package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +52 -0
package/dist/artifact-capture/api-gateway-artifact-writer.js +199 -0
package/dist/artifact-capture/emit-file.d.ts +28 -0
package/dist/artifact-capture/emit-file.js +56 -0
package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
package/dist/artifact-capture/filesystem-collector.js +48 -23
package/dist/artifact-capture/gcs-artifact-writer.d.ts +67 -0
package/dist/artifact-capture/gcs-artifact-writer.js +343 -0
package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
package/dist/commands/explain-handler.js +4 -0
package/dist/commands/pipeline-action.d.ts +5 -0
package/dist/commands/pipeline-action.js +56 -5
package/dist/commands/pipeline.d.ts +4 -0
package/dist/commands/pipeline.js +6 -2
package/dist/commands/publish.js +7 -3
package/dist/composition-root.d.ts +14 -11
package/dist/composition-root.js +90 -31
package/dist/orchestration/build-step-sequence.js +6 -1
package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
package/dist/orchestration/pipeline-orchestrator.js +41 -30
package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
package/dist/orchestration/steps/calculate-scores-step.js +50 -10
package/dist/orchestration/steps/callback-step.d.ts +1 -1
package/dist/orchestration/steps/callback-step.js +6 -4
package/dist/orchestration/steps/compare-step.d.ts +1 -1
package/dist/orchestration/steps/compare-step.js +4 -2
package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
package/dist/orchestration/steps/discovery-report-step.js +4 -1
package/dist/orchestration/steps/fetch-docs-step.js +9 -15
package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
package/dist/orchestration/steps/finalize-run-step.js +117 -0
package/dist/orchestration/steps/gap-analysis-step.js +34 -6
package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
package/dist/orchestration/steps/generate-configs-step.js +11 -11
package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
package/dist/orchestration/steps/publish-report-step.js +40 -55
package/dist/orchestration/steps/readiness-step.d.ts +1 -1
package/dist/orchestration/steps/readiness-step.js +4 -1
package/dist/orchestration/steps/report-step.d.ts +1 -1
package/dist/orchestration/steps/report-step.js +6 -3
package/dist/orchestration/steps/run-eval-step.js +14 -9
package/dist/pipeline/calculate-scores.js +13 -2
package/dist/pipeline/compare.d.ts +2 -2
package/dist/pipeline/emit-eval-results.d.ts +38 -0
package/dist/pipeline/emit-eval-results.js +100 -0
package/dist/pipeline/provenance.d.ts +24 -44
package/dist/pipeline/provenance.js +17 -165
package/dist/pipeline/report-title.d.ts +2 -2
package/dist/pipeline/run-context.d.ts +57 -0
package/dist/pipeline/run-context.js +156 -0
package/dist/pipeline/upload-test-outputs.d.ts +26 -0
package/dist/pipeline/upload-test-outputs.js +34 -0
package/dist/report-store.js +4 -2
package/package.json +3 -3
package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66

package/dist/orchestration/steps/finalize-run-step.js ADDED Viewed

@@ -0,0 +1,117 @@
+/**
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
+ *
+ * Inserts between `GapAnalysis` and `PublishReport`. Assembles a
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
+ * of truth for artifact locations; `PublishReportStep` snapshots the
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
+ *
+ * Design principles:
+ * - Single writer — one `writeManifest()` call per pipeline run.
+ * - Idempotent — retries produce the same manifest bytes for the same inputs.
+ * - Skipped when no writer is wired (local/air-gapped runs stay functional).
+ *
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
+ */
+import { existsSync, readFileSync } from "node:fs";
+import { resolve } from "node:path";
+import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
+import { buildRunContext } from "../../pipeline/run-context.js";
+import { loadSource } from "../../sources.js";
+import { configToSourceOverrides } from "../config-to-source-overrides.js";
+export class FinalizeRunStep {
+    pipelineStart;
+    options;
+    name = "finalize-run";
+    optional = true;
+    constructor(pipelineStart, options = {}) {
+        this.pipelineStart = pipelineStart;
+        this.options = options;
+    }
+    check() {
+        return [];
+    }
+    async execute(ctx, state) {
+        const start = Date.now();
+        // W0050 — `ctx.artifactWriter` is now required on AppContext
+        // (composition root always provides one; NoOpArtifactWriter when
+        // `--no-artifacts`). The pre-W0050 guard that returned "skipped" has
+        // been removed — a NoOp writer's writeManifest returns null and the
+        // code below already handles that as a non-blocking failure.
+        // Resolve the source (same input buildProvenance uses).
+        const overrides = configToSourceOverrides(ctx.config);
+        const resolvedSource = loadSource(ctx.config.source, overrides);
+        // Optional: try to read the on-disk summary for test mode inference,
+        // but don't fail finalize if it's missing — the manifest should still
+        // be written so artifacts have a catalog.
+        const maybeSummary = tryReadScoreSummary(ctx.config.rootDir);
+        const runContext = buildRunContext({
+            areas: maybeSummary?.scores?.map((s) => s.feature) ?? ctx.config.areas ?? [],
+            callerGit: ctx.config.callerGit,
+            evalFingerprint: state.evalFingerprint ?? this.options.evalFingerprint,
+            logger: ctx.logger,
+            mode: ctx.config.mode,
+            rootDir: ctx.config.rootDir,
+            source: resolvedSource,
+            taskIds: ctx.config.tasks,
+        });
+        // W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
+        // `AccumulatingArtifactWriter`, which keeps a map of every ref any
+        // producer emitted this run. Merge that into `state.artifactRefs` so
+        // the manifest reflects the FULL set — not just the subset producers
+        // happened to register manually. When the writer is a NoOp / plain
+        // decorator without accumulation, `aggregated` stays empty and the
+        // manifest falls back to the producer-side registration.
+        const aggregated = ctx.artifactWriter instanceof AccumulatingArtifactWriter
+            ? ctx.artifactWriter.getAccumulatedArtifactRefs()
+            : {};
+        const artifacts = {
+            ...aggregated,
+            ...(state.artifactRefs ?? {}),
+        };
+        const manifest = {
+            version: 1,
+            runId: ctx.runId,
+            createdAt: new Date().toISOString(),
+            durationMs: Date.now() - this.pipelineStart,
+            status: "completed",
+            context: runContext,
+            outcomes: state.testSummary
+                ? { testSummary: state.testSummary }
+                : undefined,
+            promptfooUrls: state.promptfooUrls,
+            artifacts,
+        };
+        const ref = await ctx.artifactWriter.writeManifest(ctx.runId, manifest);
+        if (!ref) {
+            // Non-blocking: writer logged the warning. Still populate state so
+            // publish can snapshot `artifacts` even without a persisted manifest.
+            state.runManifest = manifest;
+            return {
+                durationMs: Date.now() - start,
+                status: "success",
+                summary: "Run manifest computed (GCS write failed — non-blocking)",
+            };
+        }
+        state.runManifest = manifest;
+        const artifactCount = Object.keys(manifest.artifacts).length;
+        return {
+            durationMs: Date.now() - start,
+            status: "success",
+            summary: `Run manifest written to ${ref.path} (${artifactCount} artifact ref${artifactCount === 1 ? "" : "s"})`,
+        };
+    }
+}
+function tryReadScoreSummary(rootDir) {
+    const path = resolve(rootDir, "results", "latest", "score-summary.json");
+    if (!existsSync(path))
+        return undefined;
+    try {
+        return JSON.parse(readFileSync(path, "utf-8"));
+    }
+    catch {
+        return undefined;
+    }
+}

package/dist/orchestration/steps/gap-analysis-step.js CHANGED Viewed

@@ -16,7 +16,8 @@
  */
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { join, resolve } from "path";
-import { isSlugRef } from "../../_vendor/ailf-core/index.js";
+import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
+import { emitFileContents } from "../../artifact-capture/emit-file.js";
 export class GapAnalysisStep {
     name = "gap-analysis";
     optional = true;
@@ -194,14 +195,29 @@ export class GapAnalysisStep {
                 ...(testResults !== undefined && { testResults }),
             };
             writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
-            // Capture gap analysis artifacts
-            const failureModesPath = join(outDir, "failure-modes.json");
-            if (existsSync(failureModesPath)) {
-                ctx.collector.captureFile("gap-analysis", "failure-modes", failureModesPath);
+            // W0051 Slice 2 — failureModes is per-entry keyed by {mode, category};
+            // one entry per classified FailureModeType. Zero-count categories are
+            // skipped to keep the manifest honest about what the run surfaced.
+            const classifiedByCategory = new Map();
+            for (const cj of failureModeReport.classifiedJudgments) {
+                const cat = cj.classification.mode;
+                const bucket = classifiedByCategory.get(cat) ?? [];
+                bucket.push(cj);
+                classifiedByCategory.set(cat, bucket);
+            }
+            for (const [category, classified] of classifiedByCategory) {
+                if (classified.length === 0)
+                    continue;
+                await ctx.artifactWriter.emit("failureModes", assoc(ctx, { mode: ctx.config.mode, category }), {
+                    category,
+                    count: classified.length,
+                    title: toTitleCase(category),
+                    judgments: classified.map((c) => c.judgment),
+                });
             }
             const gapReportPath = join(outDir, "gap-analysis.json");
             if (existsSync(gapReportPath)) {
-                ctx.collector.captureFile("gap-analysis", "gap-report", gapReportPath);
+                await emitFileContents(ctx.artifactWriter, "gapReport", assoc(ctx), gapReportPath);
             }
             const gapCount = gapReport.gaps.length;
             const classRate = failureModeReport.classificationRate.toFixed(0);
@@ -223,6 +239,18 @@ export class GapAnalysisStep {
 // ---------------------------------------------------------------------------
 // Helpers
 // ---------------------------------------------------------------------------
+/**
+ * Render a kebab-case FailureModeType id as Title Case for the manifest
+ * entry's display title (e.g. `"missing-docs"` → `"Missing Docs"`). Kept
+ * local to the producer so the registry descriptor stays decoupled from
+ * eval-side types.
+ */
+function toTitleCase(id) {
+    return id
+        .split("-")
+        .map((w) => (w.length === 0 ? w : w[0].toUpperCase() + w.slice(1)))
+        .join(" ");
+}
 /**
  * Extract slug strings from polymorphic canonical doc refs.
  *

package/dist/orchestration/steps/generate-configs-step.d.ts CHANGED Viewed

@@ -8,7 +8,7 @@
  * When the variant is "full", the handler is called twice (baseline + agentic)
  * and three YAML files are written. Other modes produce one YAML file.
  */
-import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class GenerateConfigsStep implements PipelineStep {
     readonly name = "generate-configs";
     /** Task IDs from the last loadTasks call (pre-filter), for error messages. */

package/dist/orchestration/steps/generate-configs-step.js CHANGED Viewed

@@ -10,6 +10,8 @@
  */
 import { existsSync } from "node:fs";
 import { resolve } from "node:path";
+import { assoc, } from "../../_vendor/ailf-core/index.js";
+import { emitFileContents } from "../../artifact-capture/emit-file.js";
 import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
 import { modelMatchesLiteracyVariant } from "../../pipeline/compiler/mode-bases/literacy.js";
 import { getStepInputPaths } from "../../pipeline/cache.js";
@@ -136,12 +138,14 @@ export class GenerateConfigsStep {
             maxConcurrency: models.maxConcurrency,
             logger: ctx.logger,
         });
-        // Capture generated config files (use configFileForMode for legacy naming)
+        // W0050 — configSnapshot is per-entry keyed by mode. For literacy,
+        // each variant produces a distinct config, so the variant name is the
+        // mode-axis value here.
         const { configFileForMode } = await import("../../pipeline/eval-constants.js");
         for (const variant of ["baseline", "agentic", "observed"]) {
             const configPath = resolve(ctx.config.rootDir, configFileForMode(variant));
             if (existsSync(configPath)) {
-                ctx.collector.captureFile("generate-configs", `promptfoo-config-${variant}`, configPath, { mode: "literacy", variant });
+                await emitFileContents(ctx.artifactWriter, "configSnapshot", assoc(ctx, { mode: `literacy-${variant}` }), configPath);
             }
         }
         return this.checkLiteracyPostconditions(ctx, start);
@@ -187,18 +191,14 @@ export class GenerateConfigsStep {
             maxConcurrency: models.maxConcurrency,
             logger: ctx.logger,
         });
-        // Capture generated config file
+        // W0050 — configSnapshot for a single-mode compile.
         const configPath = resolve(ctx.config.rootDir, `promptfooconfig.${mode}.yaml`);
         if (existsSync(configPath)) {
-            ctx.collector.captureFile("generate-configs", "promptfoo-config", configPath, { mode });
-        }
-        // Capture mode-specific test artifacts (extras)
-        if (ctx.collector.extrasEnabled) {
-            const testsPath = resolve(ctx.config.rootDir, "results", "latest", `${mode}-tests.json`);
-            if (existsSync(testsPath)) {
-                ctx.collector.captureFile("generate-configs", `${mode}-tests`, testsPath, { mode });
-            }
+            await emitFileContents(ctx.artifactWriter, "configSnapshot", assoc(ctx, { mode }), configPath);
         }
+        // W0050 — the mode-specific `${mode}-tests.json` file was an
+        // extras-only capture with no registered descriptor. Dropped; the
+        // same information lives in the configSnapshot + rawResults chain.
         return {
             durationMs: Date.now() - start,
             status: "success",

package/dist/orchestration/steps/publish-report-step.d.ts CHANGED Viewed

@@ -10,7 +10,7 @@
  * - P5: Local-first (pipeline never fails because of a store write)
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
  */
-import type { AppContext, PipelineState, PipelineStep, PromptfooUrlEntry, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class PublishReportStep implements PipelineStep {
     private readonly pipelineStart;
     private readonly options;

package/dist/orchestration/steps/publish-report-step.js CHANGED Viewed

@@ -12,6 +12,7 @@
  */
 import { readFileSync } from "fs";
 import { resolve } from "path";
+import { assoc, buildSlimReportSummary, } from "../../_vendor/ailf-core/index.js";
 import { checkScoreSummaryValid } from "../../pipeline/checks.js";
 import { buildProvenance, } from "../../pipeline/provenance.js";
 import { generateReportTitle } from "../../pipeline/report-title.js";
@@ -103,39 +104,49 @@ export class PublishReportStep {
             };
         }
         const title = generateReportTitle({ provenance });
+        // W0051 Slice 3: transform the full pipeline-internal ScoreSummary into
+        // the slim ReportSummary that lives on the Content Lake document.
+        // Prose fields (grader reasons, failureModes full text, gap prose,
+        // agentBehavior arrays) point at their external artifacts via
+        // `id = manifestEntryKey`; Studio hydrates on drill-down.
+        const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
         const report = {
             comparison: comparison ?? undefined,
             completedAt: now,
             durationMs,
             id: reportId,
             provenance,
-            summary,
+            summary: slimSummary,
             tag: this.options.publishTag ?? ctx.config.publishTag,
             title,
         };
-        // Upload test output artifacts to GCS (D0030 — non-blocking, P5).
-        // When upload succeeds, strip responseOutput from the inline
-        // testResults[] so the Content Lake document carries only the slim
-        // shape; the full output lives in the GCS artifact. When upload
-        // fails, leave the inline shape intact so Studio's drill-down UI
-        // still works via the backward-compat fallback.
-        if (ctx.artifactUploader && summary.testResults?.length) {
-            const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
-            if (artifactRef) {
-                report.artifacts = { testOutputs: artifactRef };
-                report.summary = {
-                    ...summary,
-                    testResults: summary.testResults.map(slimTestResult),
-                };
-            }
+        // Snapshot the artifact manifest from FinalizeRunStep's output (D0032).
+        // The source of truth is `runs/{runId}/manifest.json` in GCS; the report
+        // carries a denormalized copy so Studio can render drill-down state
+        // without an extra GCS fetch.
+        const artifactManifest = state.runManifest?.artifacts;
+        if (artifactManifest && Object.keys(artifactManifest).length > 0) {
+            report.artifactManifest = artifactManifest;
+        }
+        // When testOutputs was uploaded to GCS, strip responseOutput from the
+        // inline testResults[] so the Content Lake document stays slim — the
+        // full output lives in the GCS artifact. When no testOutputs artifact
+        // exists, leave the inline shape intact so Studio's drill-down UI
+        // falls back to it.
+        if (artifactManifest?.testOutputs && slimSummary.testResults?.length) {
+            report.summary = {
+                ...slimSummary,
+                testResults: slimSummary.testResults.map(slimTestResult),
+            };
         }
         // Share reportId with downstream steps (CallbackStep + orchestrator job update)
         state.reportId = reportId;
-        // Capture report object (Tier 2)
-        ctx.collector.capture("publish-report", "report-object", report);
-        // Capture auto-comparison if present (Tier 2)
+        // W0050 — migrated from ctx.collector.capture to the unified writer.
+        // reportSnapshot: full Report JSON for replay (run-scoped, bulk).
+        await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx), report);
+        // autoComparison: delta vs baseline (run-scoped, bulk, optional).
         if (comparison) {
-            ctx.collector.capture("publish-report", "auto-comparison", comparison);
+            await ctx.artifactWriter.emit("autoComparison", assoc(ctx), comparison);
         }
         // Write to store (system of record — best-effort, P5)
         const sanityResult = ctx.reportStore
@@ -143,17 +154,14 @@ export class PublishReportStep {
             : null;
         // Run sinks (fire-and-forget, P6)
         const publishResult = await runSinks(report, ctx);
-        // Capture sink results (Tier 2)
-        if (publishResult.sinkResults.length > 0) {
-            ctx.collector.capture("publish-report", "sink-results", {
-                sinkCount: publishResult.sinkResults.length,
-                results: publishResult.sinkResults.map((r) => ({
-                    name: r.name,
-                    status: r.result.status,
-                    ...(r.result.status === "success" ? { detail: r.result.detail } : {}),
-                    ...(r.result.status === "failed" ? { error: r.result.error } : {}),
-                    ...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
-                })),
+        // sinkResults: per-sink outcome (run-scoped, per-entry keyed by sink name).
+        for (const r of publishResult.sinkResults) {
+            await ctx.artifactWriter.emit("sinkResults", assoc(ctx, { name: r.name }), {
+                name: r.name,
+                status: r.result.status,
+                ...(r.result.status === "success" ? { detail: r.result.detail } : {}),
+                ...(r.result.status === "failed" ? { error: r.result.error } : {}),
+                ...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
             });
         }
         // Build result summary
@@ -221,6 +229,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
         mode,
         promptfooUrls: options.promptfooUrls,
         rootDir: ctx.config.rootDir,
+        runId: ctx.runId,
         sanityDocumentIds,
         source,
         sourceReportId: ctx.config.sourceReportId,
@@ -236,30 +245,6 @@ function slimTestResult(tr) {
     const { responseOutput: _o, responseOutputTruncated: _t, ...rest } = tr;
     return rest;
 }
-/**
- * Extract test outputs from StoredTestResult[] and upload as a single
- * JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
- * to match the lookup pattern in Studio's JudgmentList component.
- *
- * Non-blocking: returns null if upload fails (P5).
- */
-async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
-    const entries = {};
-    for (const tr of testResults) {
-        const key = `${tr.taskId}::${tr.modelId}`;
-        entries[key] = {
-            responseOutput: tr.responseOutput ?? "",
-            responseOutputTruncated: tr.responseOutputTruncated ?? false,
-        };
-    }
-    const artifact = {
-        version: 1,
-        reportId,
-        createdAt,
-        entries,
-    };
-    return uploader.upload(reportId, "test-outputs.json", artifact);
-}
 /**
  * Fan out a report to all configured sinks.
  *

package/dist/orchestration/steps/readiness-step.d.ts CHANGED Viewed

@@ -4,7 +4,7 @@
  * Calls pure functions from pipeline/readiness-report.ts directly.
  * Optional step — failure doesn't stop the pipeline.
  */
-import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class ReadinessStep implements PipelineStep {
     readonly name = "readiness";
     readonly optional = true;

package/dist/orchestration/steps/readiness-step.js CHANGED Viewed

@@ -7,6 +7,8 @@
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
 import { resolve } from "path";
 import { tryLoadConfigFile } from "../../pipeline/compiler/config-loader.js";
+import { assoc, } from "../../_vendor/ailf-core/index.js";
+import { emitFileContents } from "../../artifact-capture/emit-file.js";
 import { formatReadinessMarkdown, generateReadinessReport, } from "../../pipeline/readiness-report.js";
 import { ThresholdConfigSchema } from "../../pipeline/schemas.js";
 export class ReadinessStep {
@@ -65,7 +67,8 @@ export class ReadinessStep {
                 mkdirSync(ctx.config.outputDir, { recursive: true });
                 const readinessPath = resolve(ctx.config.outputDir, "readiness-report.md");
                 writeFileSync(readinessPath, readinessLines.join("\n---\n\n"));
-                ctx.collector.captureFile("readiness", "readiness-report", readinessPath);
+                // W0050 — readinessReport is run-scoped bulk markdown.
+                await emitFileContents(ctx.artifactWriter, "readinessReport", assoc(ctx), readinessPath);
             }
             const passCount = readinessAreas.filter((area) => {
                 const areaScore = scoreSummary.scores.find((s) => s.feature === area);

package/dist/orchestration/steps/report-step.d.ts CHANGED Viewed

@@ -4,7 +4,7 @@
  * Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
  * No env bridge or process.argv manipulation needed.
  */
-import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
+import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class ReportStep implements PipelineStep {
     readonly name = "report";
     check(): ValidationIssue[];

package/dist/orchestration/steps/report-step.js CHANGED Viewed

@@ -6,6 +6,8 @@
  */
 import { existsSync, mkdirSync } from "node:fs";
 import { dirname, resolve } from "path";
+import { assoc, } from "../../_vendor/ailf-core/index.js";
+import { emitFileContents } from "../../artifact-capture/emit-file.js";
 import { checkScoreSummaryValid } from "../../pipeline/checks.js";
 import { generatePrComment } from "../../pipeline/pr-comment.js";
 export class ReportStep {
@@ -45,13 +47,14 @@ export class ReportStep {
                 status: "failed",
             };
         }
-        // Capture report artifacts
+        // W0050 — captureFile → emitFileContents. Both are run-scoped bulk
+        // artifacts; the writer handles redaction + excluded-types gating.
         if (existsSync(resolvedOutput)) {
-            ctx.collector.captureFile("report", "pr-comment", resolvedOutput);
+            await emitFileContents(ctx.artifactWriter, "prComment", assoc(ctx), resolvedOutput);
         }
         const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
         if (existsSync(pipelineResultPath)) {
-            ctx.collector.captureFile("report", "pipeline-result", pipelineResultPath);
+            await emitFileContents(ctx.artifactWriter, "pipelineResult", assoc(ctx), pipelineResultPath);
         }
         return {
             durationMs: Date.now() - start,

package/dist/orchestration/steps/run-eval-step.js CHANGED Viewed

@@ -7,6 +7,7 @@
  */
 import { existsSync, mkdirSync, writeFileSync } from "fs";
 import { resolve } from "path";
+import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
 import { getStepInputPaths } from "../../pipeline/cache.js";
 import { buildCacheContext } from "../cache-context.js";
 import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
@@ -118,11 +119,11 @@ export class RunEvalStep {
                     state.promptfooUrls ??= [];
                     state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
                 }
-                // Capture the restored score-summary from remote cache
-                const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
-                if (existsSync(cachedSummaryPath)) {
-                    ctx.collector.captureFile("run-eval", "score-summary-cached", cachedSummaryPath, { source: "remote-cache", mode: this.mode });
-                }
+                // W0050 — score-summary-cached was an unregistered capture;
+                // scoreSummary is already emitted by calculate-scores-step on the
+                // non-cached path, which also runs when we have a remote cache hit
+                // (populating state.remoteCacheHits → CalculateScoresStep still
+                // invokes for the score-summary emit). Dropped here.
                 return {
                     durationMs: Date.now() - start,
                     status: "success",
@@ -187,12 +188,16 @@ export class RunEvalStep {
             console.log();
             console.log(errorSummary);
         }
-        // Capture eval results
+        // W0050 — decompose the promptfoo aggregate into the per-entry
+        // descriptors the W0049 registry expects: rawResults / renderedPrompts
+        // per (run, mode, task, model); graderPrompts / graderJudgments per
+        // (run, mode, task, model, grader). See pipeline/emit-eval-results.ts.
+        // `testOutputs` still flows through uploadTestOutputs() in
+        // calculate-scores-step. `traces` ships via agent-observer (out of
+        // scope for the promptfoo shape parser — follow-up).
         const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
         if (existsSync(resultsPath)) {
-            ctx.collector.captureFile("run-eval", `eval-results-${this.mode}`, resultsPath, {
-                mode: this.mode,
-            });
+            await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
         }
         // Extract Promptfoo share URL from eval results (Step 3b)
         if (ctx.evalRunner.extractShareUrl) {

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -157,8 +157,19 @@ export function extractGraderJudgments(resultsPath) {
     }
     return judgments;
 }
-/** Maximum characters to store for model response output */
-const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
+/**
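+ * Maximum characters (JS string length, i.e. UTF-16 code units, not bytes)
+ * to store for model response output. ASCII-heavy responses at this cap
+ * JSON-encode to ~1 MB; multi-byte text encodes to at most ~3 MB of UTF-8
+ * (one UTF-16 code unit never exceeds 3 UTF-8 bytes), or ~6 MB in the
+ * pathological case where every character needs a \uXXXX JSON escape —
+ * still well within per-entry GCS object limits.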
+ *
+ * Raised from 8 000 to 1 000 000 in W0048 because the per-entry artifact
+ * layout (D0032) makes the cap irrelevant to Studio's fetch cost — each
+ * entry is fetched independently on click, so a larger ceiling only costs
+ * GCS bytes, not main-thread blocking or baseline report payload.
+ * `responseOutputTruncated` still flips for the extreme tail.
+ */
+const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
 /**
  * Extract per-test results with model output from evaluation results.
  *

package/dist/pipeline/compare.d.ts CHANGED Viewed

@@ -15,7 +15,7 @@
  * @see docs/ideas/evaluation-roadmap.md — BP5: Make comparison a primitive
  * @see docs/ideas/metrics-design.md — Tier 4: Comparison results
  */
-import { type ChangeClass, type CompareOptions, type ComparisonReport, type ScoreSummary } from "./types.js";
+import { type ChangeClass, type ComparableSummary, type CompareOptions, type ComparisonReport } from "./types.js";
 /** Classify a delta as improved, regressed, or unchanged given a threshold */
 export declare function classifyChange(delta: number, threshold: number): ChangeClass;
 /**
@@ -28,4 +28,4 @@ export declare function classifyChange(delta: number, threshold: number): Change
  * @param options  Optional configuration (noise threshold, etc.)
  * @returns A ComparisonReport with deltas, classifications, and breakdowns
  */
-export declare function compare(baseline: ScoreSummary, experiment: ScoreSummary, options?: CompareOptions): ComparisonReport;
+export declare function compare(baseline: ComparableSummary, experiment: ComparableSummary, options?: CompareOptions): ComparisonReport;

package/dist/pipeline/emit-eval-results.d.ts ADDED Viewed

@@ -0,0 +1,38 @@
+/**
+ * emit-eval-results.ts — decompose the promptfoo results file into the
+ * per-entry descriptors that W0049's registry expects.
+ *
+ * Replaces the Phase-B-stopgap "route the aggregated JSON through the
+ * deprecated `evalResults` bulk descriptor" path. For each test in the
+ * promptfoo output we emit:
+ *
+ *   - `rawResults`      per (run, mode, task, model)           — the full result
+ *   - `renderedPrompts` per (run, mode, task, model)           — prompt the model saw
+ *   - `graderPrompts`   per (run, mode, task, model, grader)   — rubric text
+ *   - `graderJudgments` per (run, mode, task, model, grader)   — {score, reason, pass}
+ *
+ * `testOutputs` is still emitted separately by `calculate-scores-step`
+ * via `uploadTestOutputs()` (carried forward from W0048 for byte-
+ * equivalence with the original rollout).
+ *
+ * `traces` is NOT produced here — agentic trace data flows through the
+ * agent-observer, not through the promptfoo result shape. Traces
+ * emission is out of scope for this helper and lands when the observer
+ * integration migrates (follow-up; not in W0050).
+ *
+ * The "grader" axis value is the rubric dimension string produced by
+ * `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
+ * LLM-rubric component assertions (javascript, contains, etc.) don't
+ * have a natural grader identifier and are skipped — their outcomes
+ * still live inside the full `rawResults` object.
+ */
+import { type ArtifactWriter, type RunId } from "../_vendor/ailf-core/index.d.ts";
+/**
+ * Parse a promptfoo results file and emit the per-entry artifacts.
+ *
+ * Best-effort (non-fatal): any individual emit failure warns but does not
+ * halt. File read/parse errors are caught and logged; the caller keeps going.
+ */
+export declare function emitPerEntryEvalResults(writer: ArtifactWriter, ctx: {
+    runId: RunId;
+}, mode: string, resultsPath: string): Promise<void>;