npm - @sanity/ailf - Versions diffs - 3.1.0 → 3.2.0 - Mend

@sanity/ailf 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/_vendor/ailf-core/artifact-registry.d.ts CHANGED Viewed

@@ -41,7 +41,7 @@ export type ArtifactMime = "application/json" | "application/x-ndjson" | "text/m
  */
 export type ArtifactTruncationPolicy = "reject" | "trailing-truncate" | "fielded-truncate" | "trial-oversize";
 /** The union of every artifact type known to AILF. */
-export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "taskDefinitions" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "traces";
+export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "traces";
 /**
  * Result of parsing a per-entry key into a sanitized filename component.
  * Success carries the sanitized value; failure carries a reason for 4xx responses.

package/dist/_vendor/ailf-core/artifact-registry.js CHANGED Viewed

@@ -255,6 +255,71 @@ const graderJudgmentPreviewSchema = z.object({
     reasonPreview: z.string().max(280),
     dimensionScores: z.record(z.string(), z.number()).optional(),
 });
+/**
+ * Preview shape for `rawResults` manifest entries. A per-call projection
+ * over the Promptfoo result payload — cost, latency, pass/fail, and token
+ * usage — so a Studio list row can render without signing the full
+ * artifact (which routinely runs to tens of KB of response text).
+ *
+ * Projected fields mirror top-level Promptfoo result keys (`success`,
+ * `score`, `cost`, `latencyMs`) and `response.finishReason` /
+ * `response.tokenUsage.total` for the two useful nested numbers.
+ */
+const rawResultPreviewSchema = z.object({
+    passed: z.boolean(), // the only required field; the rawResults extractor derives it from `success === true`
+    score: z.number().optional(), // copied as-is when the entry's `score` is a number
+    cost: z.number().nonnegative().optional(), // extractor drops negative or non-numeric `cost`
+    latencyMs: z.number().int().nonnegative().optional(), // integer >= 0; extractor truncates fractions with Math.trunc
+    totalTokens: z.number().int().nonnegative().optional(), // from `response.tokenUsage.total`, same integer handling as latencyMs
+    finishReason: z.string().max(40).optional(), // extractor passes it through truncateString(..., 40), presumably to fit this limit
+});
+/**
+ * Preview shape for the run-scoped `scoreSummary` bulk artifact. Projects
+ * the composite score, run-level cost/token totals, the weakest area, and
+ * the top 3 scoring areas — enough for a Studio rollup row without pulling
+ * the full summary (which carries enriched per-test data and can exceed
+ * 100 KB on multi-area runs).
+ */
+const scoreSummaryPreviewSchema = z.object({
+    composite: z.number(), // the only required field; extractor supplies Math.round(overall.avgScore), or 0 when avgScore is not a number
+    testCount: z.number().int().nonnegative().optional(), // extractor sums per-area testCount and omits the field when the sum is 0
+    lowestArea: z.string().max(60).optional(), // NOTE(review): extractor copies `lowestArea` without truncating to 60 chars — confirm producer-side length
+    lowestScore: z.number().optional(),
+    totalCost: z.number().nonnegative().optional(), // NOTE(review): extractor does not check >= 0 before emitting — confirm the producer never goes negative
+    totalTokens: z.number().int().nonnegative().optional(), // extractor keeps only finite, non-negative values, truncated to an integer
+    topAreas: z
+        .array(z.object({
+        feature: z.string().max(60), // NOTE(review): extractor forwards `feature` untruncated — confirm names stay within 60 chars
+        score: z.number(), // the area's `totalScore` in the full summary
+    }))
+        .max(3) // extractor emits at most the three highest-scoring areas
+        .optional(),
+});
+/**
+ * Preview shape for `renderedPrompts` manifest entries (W0061 / D0033 M7).
+ * Lets the Studio PromptReplayDrawer surface a quick-look pane — prompt
+ * char count, the variant label (e.g. "Baseline (No Docs)" / "Gold (with
+ * docs)") as a secondary signal, and a short leading snippet — without
+ * round-tripping the full rendered prompt (which can run 50+ KB on
+ * multi-turn conversations).
+ */
+const renderedPromptPreviewSchema = z.object({
+    promptCharCount: z.number().int().nonnegative(), // required; extractor uses `prompt.raw.length` (0 when raw is missing)
+    label: z.string().max(60).optional(), // extractor passes `prompt.label` through truncateString(..., 60)
+    snippet: z.string().max(120), // required; extractor passes `prompt.raw` through truncateString(..., 120)
+});
+/**
+ * Preview shape for `graderPrompts` manifest entries (W0061 / D0033 M7).
+ * `rubricName` maps to the classified dimension (e.g. "correctness",
+ * "completeness"); char count and snippet cover the rubric text itself so
+ * the Studio drawer can render a preview before the full assertion payload
+ * is signed and fetched.
+ */
+const graderPromptPreviewSchema = z.object({
+    rubricCharCount: z.number().int().nonnegative(), // required; extractor uses the length of `assertion.value` (0 when it is not a string)
+    rubricName: z.string().max(60).optional(), // extractor passes `dimension` through truncateString(..., 60)
+    snippet: z.string().max(120), // required; extractor passes the rubric text through truncateString(..., 120)
+});
 // Aspirational: most payload shapes are still loose. Tightening per-type as
 // consumers stabilize is explicitly a W0050/W0051 concern — W0049 fixes the
 // structural shape around them without changing the payload contracts.
@@ -361,6 +426,57 @@ export const ARTIFACT_REGISTRY = {
         entrySchema: unknownEntry,
         mime: "application/json",
         capBytes: 1_000_000,
+        manifestPreview: {
+            schema: scoreSummaryPreviewSchema,
+            extract: (entry) => {
+                const e = entry;
+                const avgScore = typeof e.overall?.avgScore === "number" ? e.overall.avgScore : 0; // NOTE(review): a missing avgScore collapses to 0, indistinguishable from a real zero score
+                const totalCost = typeof e.overall?.cost?.total === "number" // NOTE(review): no >= 0 guard here although the schema is nonnegative (the rawResults extractor does guard `cost`)
+                    ? e.overall.cost.total
+                    : undefined;
+                const rawTotalTokens = e.overall?.cost?.totalTokens;
+                const totalTokens = typeof rawTotalTokens === "number" && // only finite, non-negative token counts survive
+                    Number.isFinite(rawTotalTokens) &&
+                    rawTotalTokens >= 0
+                    ? Math.trunc(rawTotalTokens) // drop any fractional part so the schema's int() holds
+                    : undefined;
+                const scores = Array.isArray(e.scores) ? e.scores : []; // tolerate a missing or non-array `scores`
+                const areas = [];
+                for (const s of scores) {
+                    if (s === null || typeof s !== "object") // skip malformed rows instead of failing the whole preview
+                        continue;
+                    const rec = s;
+                    if (typeof rec.feature !== "string")
+                        continue;
+                    if (typeof rec.totalScore !== "number")
+                        continue;
+                    const testCount = typeof rec.testCount === "number" ? rec.testCount : undefined;
+                    areas.push({
+                        feature: rec.feature, // NOTE(review): not truncated, but the schema caps feature at 60 chars
+                        totalScore: rec.totalScore,
+                        ...(testCount !== undefined ? { testCount } : {}),
+                    });
+                }
+                const topAreas = areas
+                    .slice() // copy first so sorting leaves `areas` in its original order
+                    .sort((a, b) => b.totalScore - a.totalScore) // highest totalScore first
+                    .slice(0, 3)
+                    .map((a) => ({ feature: a.feature, score: a.totalScore }));
+                const testCount = areas.reduce((sum, a) => sum + (a.testCount ?? 0), 0); // NOTE(review): not forced to a non-negative integer although the schema requires one
+                const lowestArea = typeof e.lowestArea === "string" ? e.lowestArea : undefined; // read from the entry, not derived from `scores`; NOTE(review): not truncated to the schema's 60 chars
+                const lowestScore = typeof e.lowestScore === "number" ? e.lowestScore : undefined;
+                return {
+                    composite: Math.round(avgScore), // rounded to a whole number; the other scores keep their raw precision
+                    ...(testCount > 0 ? { testCount } : {}), // a zero total is omitted rather than reported
+                    ...(lowestArea === undefined ? {} : { lowestArea }),
+                    ...(lowestScore === undefined ? {} : { lowestScore }),
+                    ...(totalCost === undefined ? {} : { totalCost }),
+                    ...(totalTokens === undefined ? {} : { totalTokens }),
+                    ...(topAreas.length > 0 ? { topAreas } : {}),
+                };
+            },
+            capBytes: 512, // presumably the byte budget for the serialized preview — enforcement is outside this view
+        },
     }),
     pipelineResult: buildDescriptor({
         type: "pipelineResult",
@@ -534,15 +650,6 @@ export const ARTIFACT_REGISTRY = {
         },
     }),
     // -- Run × Mode × Task(+…) -------------------------------------------------
-    taskDefinitions: buildDescriptor({
-        type: "taskDefinitions",
-        slug: "task-definitions",
-        layout: "per-entry",
-        axes: ["run", "mode", "task"],
-        entrySchema: unknownEntry,
-        mime: "application/json",
-        capBytes: 256_000,
-    }),
     renderedPrompts: buildDescriptor({
         type: "renderedPrompts",
         slug: "rendered-prompts",
@@ -551,6 +658,29 @@ export const ARTIFACT_REGISTRY = {
         entrySchema: unknownEntry,
         mime: "application/json",
         capBytes: 1_000_000,
+        manifestPreview: {
+            schema: renderedPromptPreviewSchema,
+            extract: (entry) => {
+                // Producer shape from `emit-eval-results.ts`:
+                //   { prompt: { raw: string, label: string, config: {...} },
+                //     provider: { id: string, label: string } }
+                // The prompt text lives at `prompt.raw`; `prompt.label` carries the
+                // human-readable variant name ("Baseline (No Docs)" / "Gold (with
+                // docs)") and is a more useful secondary signal than a
+                // provider-level format hint (which isn't exposed here).
+                const e = entry;
+                const raw = typeof e.prompt?.raw === "string" ? e.prompt.raw : ""; // fall back to "" so the two required fields are always produced
+                const label = typeof e.prompt?.label === "string"
+                    ? truncateString(e.prompt.label, 60) // 60 matches the schema's label limit
+                    : undefined;
+                return {
+                    promptCharCount: raw.length, // String.length, i.e. UTF-16 code units of the full prompt
+                    ...(label === undefined ? {} : { label }),
+                    snippet: truncateString(raw, 120), // assumes truncateString returns at most 120 chars — defined outside this view
+                };
+            },
+            capBytes: 256, // presumably the byte budget for the serialized preview — enforcement is outside this view
+        },
     }),
     rawResults: buildDescriptor({
         type: "rawResults",
@@ -560,6 +690,39 @@ export const ARTIFACT_REGISTRY = {
         entrySchema: unknownEntry,
         mime: "application/json",
         capBytes: 1_000_000,
+        manifestPreview: {
+            schema: rawResultPreviewSchema,
+            extract: (entry) => {
+                const e = entry;
+                const passed = e.success === true; // strict check: anything other than boolean true counts as not passed
+                const score = typeof e.score === "number" ? e.score : undefined;
+                const cost = typeof e.cost === "number" && e.cost >= 0 ? e.cost : undefined; // negative and NaN costs are dropped
+                const rawLatency = e.latencyMs;
+                const latencyMs = typeof rawLatency === "number" && // only finite, non-negative latencies survive
+                    Number.isFinite(rawLatency) &&
+                    rawLatency >= 0
+                    ? Math.trunc(rawLatency) // drop any fractional part so the schema's int() holds
+                    : undefined;
+                const rawTotalTokens = e.response?.tokenUsage?.total;
+                const totalTokens = typeof rawTotalTokens === "number" && // same guard as latencyMs
+                    Number.isFinite(rawTotalTokens) &&
+                    rawTotalTokens >= 0
+                    ? Math.trunc(rawTotalTokens)
+                    : undefined;
+                const finishReason = typeof e.response?.finishReason === "string"
+                    ? truncateString(e.response.finishReason, 40) // 40 matches the schema's finishReason limit
+                    : undefined;
+                return {
+                    passed, // always present; every other key is added only when a usable value was found
+                    ...(score === undefined ? {} : { score }),
+                    ...(cost === undefined ? {} : { cost }),
+                    ...(latencyMs === undefined ? {} : { latencyMs }),
+                    ...(totalTokens === undefined ? {} : { totalTokens }),
+                    ...(finishReason === undefined ? {} : { finishReason }),
+                };
+            },
+            capBytes: 256, // presumably the byte budget for the serialized preview — enforcement is outside this view
+        },
     }),
     testOutputs: buildDescriptor({
         type: "testOutputs",
@@ -593,6 +756,22 @@ export const ARTIFACT_REGISTRY = {
         entrySchema: unknownEntry,
         mime: "application/json",
         capBytes: 512_000,
+        manifestPreview: {
+            schema: graderPromptPreviewSchema,
+            extract: (entry) => {
+                const e = entry;
+                const rubricText = typeof e.assertion?.value === "string" ? e.assertion.value : ""; // non-string assertion values yield an empty rubric
+                const rubricName = typeof e.dimension === "string"
+                    ? truncateString(e.dimension, 60) // 60 matches the schema's rubricName limit
+                    : undefined;
+                return {
+                    rubricCharCount: rubricText.length, // String.length, i.e. UTF-16 code units of the full rubric
+                    ...(rubricName === undefined ? {} : { rubricName }),
+                    snippet: truncateString(rubricText, 120), // assumes truncateString returns at most 120 chars — defined outside this view
+                };
+            },
+            capBytes: 256, // presumably the byte budget for the serialized preview — enforcement is outside this view
+        },
     }),
     graderJudgments: buildDescriptor({
         type: "graderJudgments",

package/dist/_vendor/ailf-core/types/branded-ids.d.ts CHANGED Viewed

@@ -134,16 +134,18 @@ export declare function taskId(raw: string): Result<TaskId, IdValidationError>;
 /**
  * Parse a raw string into a `RunId`.
  *
- * Valid format: `run_` prefix followed by alphanumeric characters.
+ * Accepts the canonical `run_YYYYMMDDTHHMMSSZ_<8 base36>` shape (D0035)
+ * and the legacy `run_<32 hex>` UUIDv7 shape for back-compat with runs
+ * generated before the scheme change.
  */
 export declare function runId(raw: string): Result<RunId, IdValidationError>;
 /**
- * Generate a new `RunId` using a time-sortable UUIDv7 payload.
+ * Generate a new `RunId` with a human-readable UTC timestamp plus a
+ * random suffix (D0035). Shape: `run_YYYYMMDDTHHMMSSZ_<8 base36>`.
  *
- * The `run_` prefix plus the hyphen-stripped UUIDv7 yields 36 characters
- * that sort lexicographically by creation time — same pattern used for
- * `ReportId`. One generator call per pipeline invocation; every step
- * reads the resulting id from `AppContext.runId`.
+ * Lexicographic ordering matches creation time at one-second resolution;
+ * ids minted within the same second order by their random suffix. One generator
+ * call per pipeline invocation; every step reads the id from `AppContext.runId`.
  */
 export declare function generateRunId(): RunId;
 /**

package/dist/_vendor/ailf-core/types/branded-ids.js CHANGED Viewed

@@ -44,41 +44,52 @@ export function taskId(raw) {
     }
     return ok(raw);
 }
+/**
+ * Canonical shape emitted by `generateRunId` (D0035).
+ * `run_YYYYMMDDTHHMMSSZ_<8 lowercase base36>` — sortable + readable.
+ */
+const RUN_ID_RE = /^run_\d{8}T\d{6}Z_[0-9a-z]{8}$/; // NOTE(review): checks digit counts only, not calendar validity (e.g. month 13 passes)
+/**
+ * Legacy UUIDv7-hex shape. Accepted so existing runs in GCS and the
+ * Content Lake remain valid; never emitted by `generateRunId`.
+ */
+const LEGACY_RUN_ID_RE = /^run_[0-9a-f]{32}$/; // exactly 32 lowercase hex characters after the prefix
 /**
  * Parse a raw string into a `RunId`.
  *
- * Valid format: `run_` prefix followed by alphanumeric characters.
+ * Accepts the canonical `run_YYYYMMDDTHHMMSSZ_<8 base36>` shape (D0035)
+ * and the legacy `run_<32 hex>` UUIDv7 shape for back-compat with runs
+ * generated before the scheme change.
  */
 export function runId(raw) {
-    if (!raw.match(/^run_[a-zA-Z0-9]{8,}$/)) {
-        return err({
-            code: "INVALID_RUN_ID",
-            raw,
-            message: `Invalid RunId "${raw}": must match run_[a-zA-Z0-9]{8,}`,
-        });
+    if (RUN_ID_RE.test(raw) || LEGACY_RUN_ID_RE.test(raw)) { // NOTE(review): stricter than the removed run_[a-zA-Z0-9]{8,} check — ids that match neither shape are now rejected
+        return ok(raw);
     }
-    return ok(raw);
+    return err({ // failure is returned as a value, not thrown
+        code: "INVALID_RUN_ID",
+        raw,
+        message: `Invalid RunId "${raw}": must match run_YYYYMMDDTHHMMSSZ_<8 base36> or legacy run_<32 hex>`,
+    });
 }
 /**
- * Generate a new `RunId` using a time-sortable UUIDv7 payload.
+ * Generate a new `RunId` with a human-readable UTC timestamp plus a
+ * random suffix (D0035). Shape: `run_YYYYMMDDTHHMMSSZ_<8 base36>`.
  *
- * The `run_` prefix plus the hyphen-stripped UUIDv7 yields 36 characters
- * that sort lexicographically by creation time — same pattern used for
- * `ReportId`. One generator call per pipeline invocation; every step
- * reads the resulting id from `AppContext.runId`.
+ * Lexicographic ordering matches creation time at one-second resolution;
+ * ids minted within the same second order by their random suffix. One generator
+ * call per pipeline invocation; every step reads the id from `AppContext.runId`.
  */
 export function generateRunId() {
-    const now = Date.now();
-    const uuid = crypto.randomUUID();
-    // UUID v7: encode 48-bit timestamp in the first 12 hex chars
-    const hex = now.toString(16).padStart(12, "0");
-    const v7 = hex.slice(0, 8) +
-        hex.slice(8, 12) +
-        "7" +
-        uuid.slice(15, 18) +
-        uuid.slice(19, 23) +
-        uuid.slice(24);
-    return `run_${v7}`;
+    const ts = new Date()
+        .toISOString() // UTC, e.g. 2024-01-02T03:04:05.678Z
+        .replace(/[-:]/g, "") // strip date and time separators
+        .replace(/\.\d{3}Z$/, "Z"); // drop milliseconds, leaving 20240102T030405Z
+    const bytes = crypto.getRandomValues(new Uint8Array(8)); // one random byte per suffix character
+    let suffix = "";
+    for (const b of bytes) {
+        suffix += (b % 36).toString(36); // NOTE(review): 256 is not a multiple of 36, so digits 0-3 are slightly over-represented (modulo bias)
+    }
+    return `run_${ts}_${suffix}`;
 }
 /**
  * Parse a raw string into a `SuiteId`.

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -322,11 +322,18 @@ export interface StoredTestResult {
     responseOutputTruncated?: boolean;
     /** Task description (e.g. "Functions - Webhook handler (gold)") */
     taskId: string;
-    /** Token usage breakdown */
+    /**
+     * Token usage breakdown. All fields are optional because Promptfoo's
+     * row-level shape varies — cached results typically carry only
+     * `{ cached, total }` while fresh results carry `{ prompt, completion,
+     * total }`. Populated from `response.tokenUsage` on the raw result; see
+     * `extractStoredTestResults` in packages/eval/src/pipeline/calculate-scores.ts.
+     */
     tokenUsage?: {
-        completion: number;
-        prompt: number;
-        total: number;
+        cached?: number;
+        completion?: number;
+        prompt?: number;
+        total?: number;
     };
     /** "gold" (with docs) or "baseline" (without docs) */
     variant: "baseline" | "gold";

package/dist/_vendor/ailf-core/types/scoring-input.d.ts CHANGED Viewed

@@ -35,6 +35,12 @@ export interface TestResult {
     providerLabel?: string;
     response: {
         output: string;
+        tokenUsage?: {
+            cached?: number;
+            completion?: number;
+            prompt?: number;
+            total?: number;
+        };
     };
     vars: Record<string, string>;
 }

package/dist/artifact-capture/api-gateway-artifact-writer.js CHANGED Viewed

@@ -28,6 +28,7 @@
  * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
  */
 import { ARTIFACT_REGISTRY, NotImplementedError, } from "../_vendor/ailf-core/index.js";
+import { prepareUploadBody } from "./prepare-upload-body.js";
 import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
 export class ApiGatewayArtifactWriter {
     options;
@@ -143,8 +144,9 @@ export class ApiGatewayArtifactWriter {
         };
     }
     async putJsonRaw(uploadUrlPath, data, type) {
-        const json = JSON.stringify(data);
-        const bytes = Buffer.byteLength(json, "utf-8");
+        // W0059 — redact + serialize through the shared helper so the gateway
+        // path strips secrets before they reach the signed GCS URL.
+        const { body, bytes } = prepareUploadBody(data, "application/json");
         try {
             const signed = await this.fetchSignedUrl(uploadUrlPath, type);
             if (!signed)
@@ -153,7 +155,7 @@ export class ApiGatewayArtifactWriter {
             let putSuccess = false;
             try {
                 const putRes = await fetch(signed.url, {
-                    body: json,
+                    body,
                     headers: signed.requiredHeaders,
                     method: "PUT",
                 });

package/dist/artifact-capture/batching-api-gateway-artifact-writer.js CHANGED Viewed

@@ -26,6 +26,7 @@
  * credentials are present.
  */
 import { ARTIFACT_REGISTRY, BULK_ENTRY_KEY, NotImplementedError, } from "../_vendor/ailf-core/index.js";
+import { prepareUploadBody } from "./prepare-upload-body.js";
 import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
 /**
  * How many entries to bundle into a single `/batch/upload-urls` request.
@@ -307,16 +308,18 @@ export class BatchingApiGatewayArtifactWriter {
         const start = Date.now();
         let ok = false;
         // Tracked outside the try so the `finally` metrics event still gets a
-        // bytes figure when JSON.stringify itself throws (circular payload,
+        // bytes figure when serialization itself throws (circular payload,
         // bigint, etc.) — P5 requires we never hang the producer on a
         // pathological payload.
         let bytes = 0;
         try {
-            const json = JSON.stringify(pending.payload);
-            bytes = Buffer.byteLength(json, "utf-8");
+            // W0059 — redact + serialize through the shared helper so the
+            // batching path strips secrets before they reach the signed GCS URL.
+            const prepared = prepareUploadBody(pending.payload, "application/json");
+            bytes = prepared.bytes;
             const res = await fetch(signed.url, {
                 method: "PUT",
-                body: json,
+                body: prepared.body,
                 headers: signed.requiredHeaders,
             });
             if (!res.ok) {
@@ -414,14 +417,15 @@ export class BatchingApiGatewayArtifactWriter {
         }
         if (!signed)
             return null;
-        const json = JSON.stringify(payload);
-        const bytes = Buffer.byteLength(json, "utf-8");
+        // W0059 — manifest upload also flows through the shared helper so the
+        // redaction policy covers every PUT the batching writer performs.
+        const { body, bytes } = prepareUploadBody(payload, "application/json");
         const putStart = Date.now();
         let putOk = false;
         try {
             const res = await fetch(signed.url, {
                 method: "PUT",
-                body: json,
+                body,
                 headers: signed.requiredHeaders,
             });
             if (!res.ok) {

package/dist/artifact-capture/fanout-artifact-writer.d.ts CHANGED Viewed

@@ -2,15 +2,17 @@
  * FanoutArtifactWriter — layers multiple writers so each `emit()` fans out
  * to every configured backend.
  *
- * D0033 M4 default wiring:
- *   `FanoutArtifactWriter([ LocalFilesystemArtifactWriter, GcsArtifactWriter ])`
+ * Default wiring (W0064 reorder of D0033 M4):
+ *   `FanoutArtifactWriter([ GcsArtifactWriter, LocalFilesystemArtifactWriter ])`
  *
  * Semantics:
  * - Fan out in declaration order. Every writer runs, even if earlier ones fail.
- * - Return the **first non-null ArtifactRef**. Local is listed first, so a
- *   local success + GCS failure still produces a non-null ref pointing at
- *   local — the pipeline succeeds and Studio retrieval works against the
- *   local tree with a warning logged for the GCS leg.
+ * - Return the **first non-null ArtifactRef**. The remote backend is listed
+ *   first so its ref wins when both succeed — the published manifest points
+ *   at a cross-machine-readable store. A remote failure falls through to
+ *   local, so a local success + remote failure still produces a non-null ref
+ *   pointing at local and the pipeline succeeds with a warning on the remote
+ *   leg.
  * - Failures on individual writers warn (via their own P5 paths) but do
  *   not propagate. The fanout never throws.
  *
@@ -19,6 +21,7 @@
  * plus a recording test double.
  *
  * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M4)
+ * @see docs/design-docs/canonical-artifact-ref-selection.md — W0064 ordering
  */
 import type { ArtifactEntry, ArtifactRef, ArtifactType, ArtifactWriter, ArtifactWriterProgressOptions, AssociationValues, RunId, RunManifest } from "../_vendor/ailf-core/index.d.ts";
 export interface FanoutArtifactWriterOptions {

package/dist/artifact-capture/fanout-artifact-writer.js CHANGED Viewed

@@ -2,15 +2,17 @@
  * FanoutArtifactWriter — layers multiple writers so each `emit()` fans out
  * to every configured backend.
  *
- * D0033 M4 default wiring:
- *   `FanoutArtifactWriter([ LocalFilesystemArtifactWriter, GcsArtifactWriter ])`
+ * Default wiring (W0064 reorder of D0033 M4):
+ *   `FanoutArtifactWriter([ GcsArtifactWriter, LocalFilesystemArtifactWriter ])`
  *
  * Semantics:
  * - Fan out in declaration order. Every writer runs, even if earlier ones fail.
- * - Return the **first non-null ArtifactRef**. Local is listed first, so a
- *   local success + GCS failure still produces a non-null ref pointing at
- *   local — the pipeline succeeds and Studio retrieval works against the
- *   local tree with a warning logged for the GCS leg.
+ * - Return the **first non-null ArtifactRef**. The remote backend is listed
+ *   first so its ref wins when both succeed — the published manifest points
+ *   at a cross-machine-readable store. A remote failure falls through to
+ *   local, so a local success + remote failure still produces a non-null ref
+ *   pointing at local and the pipeline succeeds with a warning on the remote
+ *   leg.
  * - Failures on individual writers warn (via their own P5 paths) but do
  *   not propagate. The fanout never throws.
  *
@@ -19,6 +21,7 @@
  * plus a recording test double.
  *
  * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M4)
+ * @see docs/design-docs/canonical-artifact-ref-selection.md — W0064 ordering
  */
 export class FanoutArtifactWriter {
     writers;

package/dist/artifact-capture/gcs-artifact-writer.js CHANGED Viewed

@@ -30,6 +30,7 @@
 import { Storage } from "@google-cloud/storage";
 import { ARTIFACT_REGISTRY, buildManifestPreview, } from "../_vendor/ailf-core/index.js";
 import { resolveUploadConcurrency } from "./parallel-emit.js";
+import { prepareUploadBody } from "./prepare-upload-body.js";
 import { redactArtifactData } from "./redact-artifact.js";
 import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
 /**
@@ -83,18 +84,18 @@ export class GcsArtifactWriter {
             console.warn(`  ⚠️  emit("${type}"): association.run is required, skipping`);
             return null;
         }
-        // AC10 — redact at the writer boundary so secrets never reach GCS.
-        const redacted = redactArtifactData(payload);
-        // Preview reads the pre-redaction payload (same as local writer — the
-        // preview carries a descriptor-controlled summary bounded by capBytes,
-        // not the raw entry bytes).
+        // W0059 — redact + serialize through the shared helper so the three
+        // writer implementations can't drift. Preview reads the pre-redaction
+        // payload (same as local writer — the preview carries a descriptor-
+        // controlled summary bounded by capBytes, not the raw entry bytes).
+        const { body } = prepareUploadBody(payload, descriptor.mime);
         const preview = buildManifestPreview(descriptor, payload);
         if (descriptor.layout === "bulk") {
             const path = descriptor.objectPath(runId);
-            const ref = await this.putBody(path, serializeForMime(redacted, descriptor.mime), {
+            const ref = await this.putBody(path, body, {
                 layout: "bulk",
                 mime: descriptor.mime,
-                entryCount: entryCountOf(redacted),
+                entryCount: entryCountOf(payload),
                 type,
             });
             if (!ref)
@@ -106,7 +107,11 @@ export class GcsArtifactWriter {
         // per-entry
         const entryKey = descriptor.formatEntryKey(association);
         const path = descriptor.objectPath(runId, entryKey);
-        const ref = await this.putBody(path, serializeForMime(redacted, descriptor.mime), { layout: "per-entry", mime: descriptor.mime, type });
+        const ref = await this.putBody(path, body, {
+            layout: "per-entry",
+            mime: descriptor.mime,
+            type,
+        });
         if (!ref)
             return null;
         const finalRef = {
@@ -222,11 +227,11 @@ export class GcsArtifactWriter {
             return null;
         }
         const path = descriptor.objectPath(runId);
-        const redacted = redactArtifactData(data);
-        return this.putBody(path, serializeForMime(redacted, descriptor.mime), {
+        const { body } = prepareUploadBody(data, descriptor.mime);
+        return this.putBody(path, body, {
             layout: "bulk",
             mime: descriptor.mime,
-            entryCount: entryCountOf(redacted),
+            entryCount: entryCountOf(data),
             type,
         });
     }
@@ -251,9 +256,7 @@ export class GcsArtifactWriter {
                 continue;
             }
             const path = descriptor.objectPath(runId, entry.key);
-            const redacted = redactArtifactData(entry.data);
-            const body = serializeForMime(redacted, descriptor.mime);
-            const bytes = Buffer.byteLength(body, "utf-8");
+            const { body, bytes } = prepareUploadBody(entry.data, descriptor.mime);
             const start = Date.now();
             let success = false;
             try {
@@ -416,14 +419,6 @@ class ConcurrencyLimiter {
         }
     }
 }
-function serializeForMime(payload, mime) {
-    if (mime === "text/markdown" || mime === "application/yaml") {
-        if (typeof payload === "string")
-            return payload;
-        return String(payload ?? "");
-    }
-    return JSON.stringify(payload);
-}
 function entryCountOf(data) {
     if (typeof data === "object" &&
         data !== null &&

package/dist/artifact-capture/prepare-upload-body.d.ts ADDED Viewed

@@ -0,0 +1,27 @@
+/**
+ * Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
+ *
+ * All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
+ * API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
+ * secrets are stripped before leaving the process. Routing each writer through
+ * this helper prevents drift — any future writer that skips the helper would
+ * fail the contract test in
+ * `src/__tests__/artifact-upload-redaction.test.ts`.
+ *
+ * NDJSON streaming is **not** handled here — each row is redacted independently
+ * by the NDJSON writer path before being concatenated into a part body.
+ */
+import type { ArtifactMime } from "../_vendor/ailf-core/index.d.ts";
+export interface PreparedUploadBody {
+    readonly body: string; // the redacted payload, serialized according to the mime type
+    readonly bytes: number; // UTF-8 byte length of `body`
+}
+/**
+ * Redact, serialize, and size `payload` for upload.
+ *
+ * Serialization branches on `mime`:
+ * - `application/json` (and anything else JSON-shaped, including the
+ *   single-shot side of `application/x-ndjson`) → `JSON.stringify`.
+ * - `text/markdown` / `application/yaml` → strings pass through unchanged; other values are coerced via `String()` (null/undefined become "").
+ */
+export declare function prepareUploadBody(payload: unknown, mime: ArtifactMime): PreparedUploadBody;

package/dist/artifact-capture/prepare-upload-body.js ADDED Viewed

@@ -0,0 +1,36 @@
+/**
+ * Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
+ *
+ * All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
+ * API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
+ * secrets are stripped before leaving the process. Routing each writer through
+ * this helper prevents drift — any future writer that skips the helper would
+ * fail the contract test in
+ * `src/__tests__/artifact-upload-redaction.test.ts`.
+ *
+ * NDJSON streaming is **not** handled here — each row is redacted independently
+ * by the NDJSON writer path before being concatenated into a part body.
+ */
+import { redactArtifactData } from "./redact-artifact.js";
+/**
+ * Redact, serialize, and size `payload` for upload.
+ *
+ * Serialization branches on `mime`:
+ * - `application/json` (and anything else JSON-shaped, including the
+ *   single-shot side of `application/x-ndjson`) → `JSON.stringify`.
+ * - `text/markdown` / `application/yaml` → strings pass through unchanged; other values are coerced via `String()` (null/undefined become "").
+ */
+export function prepareUploadBody(payload, mime) {
+    const redacted = redactArtifactData(payload); // redaction always runs before serialization, so `body` never holds the raw payload
+    const body = serializeForMime(redacted, mime);
+    const bytes = Buffer.byteLength(body, "utf-8"); // NOTE(review): JSON.stringify yields undefined for an undefined payload and byteLength would then throw — TODO confirm callers rule that out
+    return { body, bytes };
+}
+function serializeForMime(payload, mime) { // turns an already-redacted payload into the upload body string for `mime`
+    if (mime === "text/markdown" || mime === "application/yaml") { // the two text mimes are uploaded verbatim, not JSON-encoded
+        if (typeof payload === "string")
+            return payload;
+        return String(payload ?? ""); // NOTE(review): a non-string object becomes "[object Object]" here — confirm text mimes only ever receive strings
+    }
+    return JSON.stringify(payload); // every other mime falls through to JSON
+}

package/dist/composition-root.js CHANGED Viewed

@@ -188,14 +188,19 @@ export function createArtifactWriter(config, logger, progress) {
         exclude,
         ...(remote ? {} : { progress }),
     });
+    // W0064 — when a remote backend is wired, list it first so its ArtifactRef
+    // wins the fanout's firstNonNull() selection and the published manifest
+    // points at a cross-machine-readable store. Local stays attached as the
+    // resilience tier: if the remote leg fails, firstNonNull falls through to
+    // local and the pipeline still produces a non-null ref.
     const base = remote
-        ? new FanoutArtifactWriter([local, remote], { progress })
+        ? new FanoutArtifactWriter([remote, local], { progress })
         : local;
     if (!remote) {
         logger.debug(`Artifact writer: LocalFilesystemArtifactWriter only (rootDir=${rootDir})`);
     }
     else {
-        logger.debug(`Artifact writer: FanoutArtifactWriter([local=${rootDir}, ${remote.constructor.name}])`);
+        logger.debug(`Artifact writer: FanoutArtifactWriter([${remote.constructor.name}, local=${rootDir}])`);
     }
     // Wrap in the accumulator so FinalizeRunStep can build a populated
     // RunManifest without each producer bookkeeping its own ArtifactRefs

package/dist/orchestration/pipeline-orchestrator.js CHANGED Viewed

@@ -130,6 +130,12 @@ export async function orchestratePipeline(ctx, steps) {
     const pipelineStart = Date.now();
     const hasJob = !!ctx.config.jobId;
     const jobUpdates = [];
+    // DOC-2064 — tracks whether the pre-finalize pipelineContext emit fired so
+    // the post-loop fallback can skip redundant writes. A second emit to the
+    // same GCS path produces a 412 Precondition Failed from the signed-URL
+    // writer (which enforces no-overwrite), logging spurious warnings on every
+    // successful run.
+    let pipelineContextEmitted = false;
     ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
     ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
         steps: steps.map((s) => s.name),
@@ -152,6 +158,16 @@ export async function orchestratePipeline(ctx, steps) {
         ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
         ctx.logger.section(step.name);
         exportPhase.maybeOpen(step.name);
+        // DOC-2064 — emit pipelineContext BEFORE finalize-run so the artifact
+        // ref registers with the accumulator and lands in RunManifest.artifacts,
+        // which PublishReportStep then snapshots into Report.artifactManifest.
+        // The previous post-loop emit ran after publish and was invisible to
+        // Content Lake readers. The failure-path capture below still fires on
+        // pre-finalize aborts so aborted runs retain the on-disk artifact.
+        if (step.name === "finalize-run") {
+            await capturePipelineContext(ctx, state, results);
+            pipelineContextEmitted = true;
+        }
         // Report current step progress
         if (hasJob) {
             await reportJobProgress(ctx, step.name, i, steps.length, "running", undefined, jobUpdates);
@@ -175,8 +191,12 @@ export async function orchestratePipeline(ctx, steps) {
             }
             // Capture pipeline context before exiting. `job-updates` was an
             // observability-only capture not tied to a registered artifact type;
-            // dropped in W0050. Use the JobStore path for job telemetry.
-            await capturePipelineContext(ctx, state, results);
+            // dropped in W0050. Use the JobStore path for job telemetry. Skip
+            // when the pre-finalize emit already fired to avoid a 412 overwrite
+            // warning (DOC-2064).
+            if (!pipelineContextEmitted) {
+                await capturePipelineContext(ctx, state, results);
+            }
             exportPhase.close();
             return {
                 belowCritical: state.belowCritical,
@@ -231,9 +251,18 @@ export async function orchestratePipeline(ctx, steps) {
             ctx.logger.warn("Failed to report job completion — continuing");
         }
     }
-    // Capture pipeline context. `job-updates` observability captures were
-    // dropped in Slice 6.1 — JobStore is the supported telemetry path.
-    await capturePipelineContext(ctx, state, results);
+    // DOC-2064 — post-loop fallback. Only fires when the pre-finalize emit
+    // inside the step loop didn't run — typically because the pipeline has no
+    // finalize-run step (test harnesses, air-gapped runs). Skipping this when
+    // the pre-finalize emit already fired avoids a 412 Precondition Failed
+    // from the signed-URL writer, which refuses to overwrite the existing
+    // path. The tradeoff is that pipelineContext captures pipeline state as
+    // of finalize-run, not post-publish — reportId is absent. Acceptable
+    // because runId is the primary join key and reportId is trivially
+    // looked up from Content Lake via runId.
+    if (!pipelineContextEmitted) {
+        await capturePipelineContext(ctx, state, results);
+    }
     exportPhase.close();
     return {
         belowCritical: state.belowCritical,

package/dist/orchestration/steps/calculate-scores-step.js CHANGED Viewed

@@ -143,16 +143,18 @@ export class CalculateScoresStep {
         // The full responseOutput lives in the GCS artifact; PublishReportStep
         // later strips it from the inline Content Lake document when this
         // upload succeeds.
+        //
+        // The emits flow through `ctx.artifactWriter`, which the composition
+        // root wraps in `AccumulatingArtifactWriter`. That's where the
+        // authoritative merged ref is built; `FinalizeRunStep` reads it
+        // straight from the accumulator, so producer-side registration on
+        // `state.artifactRefs` would only clobber the accumulator's full set
+        // with a partial single-entry ref.
+        //
         // W0050 — ctx.artifactWriter is always present; no guard needed.
         const testResults = tryReadTestResults(ctx.config.rootDir);
         if (testResults?.length) {
-            const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults, ctx.config.mode);
-            if (artifactRef) {
-                state.artifactRefs = {
-                    ...state.artifactRefs,
-                    testOutputs: artifactRef,
-                };
-            }
+            await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults, ctx.config.mode);
         }
         const criticalSuffix = belowCritical.length > 0
             ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`

package/dist/orchestration/steps/finalize-run-step.js CHANGED Viewed

@@ -86,11 +86,12 @@ export class FinalizeRunStep {
         });
         // W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
         // `AccumulatingArtifactWriter`, which keeps a map of every ref any
-        // producer emitted this run. Merge that into `state.artifactRefs` so
-        // the manifest reflects the FULL set — not just the subset producers
-        // happened to register manually. When the writer is a NoOp / plain
-        // decorator without accumulation, `aggregated` stays empty and the
-        // manifest falls back to the producer-side registration.
+        // producer emitted this run — the authoritative FULL set, merged by
+        // entry key. `state.artifactRefs` is a producer-side fallback for
+        // writers that don't accumulate (NoOp / plain decorators). The
+        // accumulator wins per type when both exist, because producer-side
+        // registrations tend to capture only the last ref from a parallel
+        // batch and would otherwise clobber the merged entries list.
         //
         // W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
         // so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
@@ -99,8 +100,8 @@ export class FinalizeRunStep {
             ? accumulator.getAccumulatedArtifactRefs()
             : {};
         const artifacts = {
-            ...aggregated,
             ...(state.artifactRefs ?? {}),
+            ...aggregated,
         };
         const manifest = {
             version: 1,

package/dist/pipeline/calculate-scores.d.ts CHANGED Viewed

@@ -47,6 +47,12 @@ export interface RawTestResult {
     };
     response: {
         output: string;
+        tokenUsage?: {
+            cached?: number;
+            completion?: number;
+            prompt?: number;
+            total?: number;
+        };
     };
     testCase?: {
         description?: string;

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -223,6 +223,7 @@ export function extractStoredTestResults(resultsPath) {
             }
             dimensions.push({ dimension, reason, score });
         }
+        const tokenUsage = result.response?.tokenUsage;
         testResults.push({
             area,
             cost: result.cost || undefined,
@@ -233,6 +234,7 @@ export function extractStoredTestResults(resultsPath) {
             responseOutput,
             ...(responseOutputTruncated && { responseOutputTruncated: true }),
             taskId,
+            ...(tokenUsage && { tokenUsage }),
             variant,
         });
     }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "3.1.0",
+  "version": "3.2.0",
   "private": false,
   "publishConfig": {
     "access": "public"
@@ -52,8 +52,8 @@
     "@types/node": "^22.13.1",
     "tsx": "^4.19.2",
     "typescript": "^5.7.3",
-    "@sanity/ailf-shared": "0.1.0",
-    "@sanity/ailf-core": "0.1.0"
+    "@sanity/ailf-core": "0.1.0",
+    "@sanity/ailf-shared": "0.1.0"
   },
   "scripts": {
     "build": "tsc && tsx scripts/bundle-workspace-deps.ts",