@sanity/ailf 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,7 +41,7 @@ export type ArtifactMime = "application/json" | "application/x-ndjson" | "text/m
41
41
  */
42
42
  export type ArtifactTruncationPolicy = "reject" | "trailing-truncate" | "fielded-truncate" | "trial-oversize";
43
43
  /** The union of every artifact type known to AILF. */
44
- export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "taskDefinitions" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "traces";
44
+ export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "traces";
45
45
  /**
46
46
  * Result of parsing a per-entry key into a sanitized filename component.
47
47
  * Success carries the sanitized value; failure carries a reason for 4xx responses.
@@ -255,6 +255,71 @@ const graderJudgmentPreviewSchema = z.object({
255
255
  reasonPreview: z.string().max(280),
256
256
  dimensionScores: z.record(z.string(), z.number()).optional(),
257
257
  });
258
+ /**
259
+ * Preview shape for `rawResults` manifest entries. A per-call projection
260
+ * over the Promptfoo result payload — cost, latency, pass/fail, and token
261
+ * usage — so a Studio list row can render without signing the full
262
+ * artifact (which routinely runs to tens of KB of response text).
263
+ *
264
+ * Projected fields mirror top-level Promptfoo result keys (`success`,
265
+ * `score`, `cost`, `latencyMs`) and `response.finishReason` /
266
+ * `response.tokenUsage.total` for the two useful nested fields.
267
+ */
268
+ const rawResultPreviewSchema = z.object({
269
+ passed: z.boolean(),
270
+ score: z.number().optional(),
271
+ cost: z.number().nonnegative().optional(),
272
+ latencyMs: z.number().int().nonnegative().optional(),
273
+ totalTokens: z.number().int().nonnegative().optional(),
274
+ finishReason: z.string().max(40).optional(),
275
+ });
276
+ /**
277
+ * Preview shape for the run-scoped `scoreSummary` bulk artifact. Projects
278
+ * the composite score, run-level cost/token totals, the weakest area, and
279
+ * the top 3 scoring areas — enough for a Studio rollup row without pulling
280
+ * the full summary (which carries enriched per-test data and can exceed
281
+ * 100 KB on multi-area runs).
282
+ */
283
+ const scoreSummaryPreviewSchema = z.object({
284
+ composite: z.number(),
285
+ testCount: z.number().int().nonnegative().optional(),
286
+ lowestArea: z.string().max(60).optional(),
287
+ lowestScore: z.number().optional(),
288
+ totalCost: z.number().nonnegative().optional(),
289
+ totalTokens: z.number().int().nonnegative().optional(),
290
+ topAreas: z
291
+ .array(z.object({
292
+ feature: z.string().max(60),
293
+ score: z.number(),
294
+ }))
295
+ .max(3)
296
+ .optional(),
297
+ });
298
+ /**
299
+ * Preview shape for `renderedPrompts` manifest entries (W0061 / D0033 M7).
300
+ * Lets the Studio PromptReplayDrawer surface a quick-look pane — prompt
301
+ * char count, the variant label (e.g. "Baseline (No Docs)" / "Gold (with
302
+ * docs)") as a secondary signal, and a short leading snippet — without
303
+ * round-tripping the full rendered prompt (which can run 50+ KB on
304
+ * multi-turn conversations).
305
+ */
306
+ const renderedPromptPreviewSchema = z.object({
307
+ promptCharCount: z.number().int().nonnegative(),
308
+ label: z.string().max(60).optional(),
309
+ snippet: z.string().max(120),
310
+ });
311
+ /**
312
+ * Preview shape for `graderPrompts` manifest entries (W0061 / D0033 M7).
313
+ * `rubricName` maps to the classified dimension (e.g. "correctness",
314
+ * "completeness"); char count and snippet cover the rubric text itself so
315
+ * the Studio drawer can render a preview before the full assertion payload
316
+ * is signed and fetched.
317
+ */
318
+ const graderPromptPreviewSchema = z.object({
319
+ rubricCharCount: z.number().int().nonnegative(),
320
+ rubricName: z.string().max(60).optional(),
321
+ snippet: z.string().max(120),
322
+ });
258
323
  // Aspirational: most payload shapes are still loose. Tightening per-type as
259
324
  // consumers stabilize is explicitly a W0050/W0051 concern — W0049 fixes the
260
325
  // structural shape around them without changing the payload contracts.
@@ -361,6 +426,57 @@ export const ARTIFACT_REGISTRY = {
361
426
  entrySchema: unknownEntry,
362
427
  mime: "application/json",
363
428
  capBytes: 1_000_000,
429
+ manifestPreview: {
430
+ schema: scoreSummaryPreviewSchema,
431
+ extract: (entry) => {
432
+ const e = entry;
433
+ const avgScore = typeof e.overall?.avgScore === "number" ? e.overall.avgScore : 0;
434
+ const totalCost = typeof e.overall?.cost?.total === "number"
435
+ ? e.overall.cost.total
436
+ : undefined;
437
+ const rawTotalTokens = e.overall?.cost?.totalTokens;
438
+ const totalTokens = typeof rawTotalTokens === "number" &&
439
+ Number.isFinite(rawTotalTokens) &&
440
+ rawTotalTokens >= 0
441
+ ? Math.trunc(rawTotalTokens)
442
+ : undefined;
443
+ const scores = Array.isArray(e.scores) ? e.scores : [];
444
+ const areas = [];
445
+ for (const s of scores) {
446
+ if (s === null || typeof s !== "object")
447
+ continue;
448
+ const rec = s;
449
+ if (typeof rec.feature !== "string")
450
+ continue;
451
+ if (typeof rec.totalScore !== "number")
452
+ continue;
453
+ const testCount = typeof rec.testCount === "number" ? rec.testCount : undefined;
454
+ areas.push({
455
+ feature: rec.feature,
456
+ totalScore: rec.totalScore,
457
+ ...(testCount !== undefined ? { testCount } : {}),
458
+ });
459
+ }
460
+ const topAreas = areas
461
+ .slice()
462
+ .sort((a, b) => b.totalScore - a.totalScore)
463
+ .slice(0, 3)
464
+ .map((a) => ({ feature: a.feature, score: a.totalScore }));
465
+ const testCount = areas.reduce((sum, a) => sum + (a.testCount ?? 0), 0);
466
+ const lowestArea = typeof e.lowestArea === "string" ? e.lowestArea : undefined;
467
+ const lowestScore = typeof e.lowestScore === "number" ? e.lowestScore : undefined;
468
+ return {
469
+ composite: Math.round(avgScore),
470
+ ...(testCount > 0 ? { testCount } : {}),
471
+ ...(lowestArea === undefined ? {} : { lowestArea }),
472
+ ...(lowestScore === undefined ? {} : { lowestScore }),
473
+ ...(totalCost === undefined ? {} : { totalCost }),
474
+ ...(totalTokens === undefined ? {} : { totalTokens }),
475
+ ...(topAreas.length > 0 ? { topAreas } : {}),
476
+ };
477
+ },
478
+ capBytes: 512,
479
+ },
364
480
  }),
365
481
  pipelineResult: buildDescriptor({
366
482
  type: "pipelineResult",
@@ -534,15 +650,6 @@ export const ARTIFACT_REGISTRY = {
534
650
  },
535
651
  }),
536
652
  // -- Run × Mode × Task(+…) -------------------------------------------------
537
- taskDefinitions: buildDescriptor({
538
- type: "taskDefinitions",
539
- slug: "task-definitions",
540
- layout: "per-entry",
541
- axes: ["run", "mode", "task"],
542
- entrySchema: unknownEntry,
543
- mime: "application/json",
544
- capBytes: 256_000,
545
- }),
546
653
  renderedPrompts: buildDescriptor({
547
654
  type: "renderedPrompts",
548
655
  slug: "rendered-prompts",
@@ -551,6 +658,29 @@ export const ARTIFACT_REGISTRY = {
551
658
  entrySchema: unknownEntry,
552
659
  mime: "application/json",
553
660
  capBytes: 1_000_000,
661
+ manifestPreview: {
662
+ schema: renderedPromptPreviewSchema,
663
+ extract: (entry) => {
664
+ // Producer shape from `emit-eval-results.ts`:
665
+ // { prompt: { raw: string, label: string, config: {...} },
666
+ // provider: { id: string, label: string } }
667
+ // The prompt text lives at `prompt.raw`; `prompt.label` carries the
668
+ // human-readable variant name ("Baseline (No Docs)" / "Gold (with
669
+ // docs)") and is a more useful secondary signal than a
670
+ // provider-level format hint (which isn't exposed here).
671
+ const e = entry;
672
+ const raw = typeof e.prompt?.raw === "string" ? e.prompt.raw : "";
673
+ const label = typeof e.prompt?.label === "string"
674
+ ? truncateString(e.prompt.label, 60)
675
+ : undefined;
676
+ return {
677
+ promptCharCount: raw.length,
678
+ ...(label === undefined ? {} : { label }),
679
+ snippet: truncateString(raw, 120),
680
+ };
681
+ },
682
+ capBytes: 256,
683
+ },
554
684
  }),
555
685
  rawResults: buildDescriptor({
556
686
  type: "rawResults",
@@ -560,6 +690,39 @@ export const ARTIFACT_REGISTRY = {
560
690
  entrySchema: unknownEntry,
561
691
  mime: "application/json",
562
692
  capBytes: 1_000_000,
693
+ manifestPreview: {
694
+ schema: rawResultPreviewSchema,
695
+ extract: (entry) => {
696
+ const e = entry;
697
+ const passed = e.success === true;
698
+ const score = typeof e.score === "number" ? e.score : undefined;
699
+ const cost = typeof e.cost === "number" && e.cost >= 0 ? e.cost : undefined;
700
+ const rawLatency = e.latencyMs;
701
+ const latencyMs = typeof rawLatency === "number" &&
702
+ Number.isFinite(rawLatency) &&
703
+ rawLatency >= 0
704
+ ? Math.trunc(rawLatency)
705
+ : undefined;
706
+ const rawTotalTokens = e.response?.tokenUsage?.total;
707
+ const totalTokens = typeof rawTotalTokens === "number" &&
708
+ Number.isFinite(rawTotalTokens) &&
709
+ rawTotalTokens >= 0
710
+ ? Math.trunc(rawTotalTokens)
711
+ : undefined;
712
+ const finishReason = typeof e.response?.finishReason === "string"
713
+ ? truncateString(e.response.finishReason, 40)
714
+ : undefined;
715
+ return {
716
+ passed,
717
+ ...(score === undefined ? {} : { score }),
718
+ ...(cost === undefined ? {} : { cost }),
719
+ ...(latencyMs === undefined ? {} : { latencyMs }),
720
+ ...(totalTokens === undefined ? {} : { totalTokens }),
721
+ ...(finishReason === undefined ? {} : { finishReason }),
722
+ };
723
+ },
724
+ capBytes: 256,
725
+ },
563
726
  }),
564
727
  testOutputs: buildDescriptor({
565
728
  type: "testOutputs",
@@ -593,6 +756,22 @@ export const ARTIFACT_REGISTRY = {
593
756
  entrySchema: unknownEntry,
594
757
  mime: "application/json",
595
758
  capBytes: 512_000,
759
+ manifestPreview: {
760
+ schema: graderPromptPreviewSchema,
761
+ extract: (entry) => {
762
+ const e = entry;
763
+ const rubricText = typeof e.assertion?.value === "string" ? e.assertion.value : "";
764
+ const rubricName = typeof e.dimension === "string"
765
+ ? truncateString(e.dimension, 60)
766
+ : undefined;
767
+ return {
768
+ rubricCharCount: rubricText.length,
769
+ ...(rubricName === undefined ? {} : { rubricName }),
770
+ snippet: truncateString(rubricText, 120),
771
+ };
772
+ },
773
+ capBytes: 256,
774
+ },
596
775
  }),
597
776
  graderJudgments: buildDescriptor({
598
777
  type: "graderJudgments",
@@ -134,16 +134,18 @@ export declare function taskId(raw: string): Result<TaskId, IdValidationError>;
134
134
  /**
135
135
  * Parse a raw string into a `RunId`.
136
136
  *
137
- * Valid format: `run_` prefix followed by alphanumeric characters.
137
+ * Accepts the canonical `run_YYYYMMDDTHHMMSSZ_<8 base36>` shape (D0035)
138
+ * and the legacy `run_<32 hex>` UUIDv7 shape for back-compat with runs
139
+ * generated before the scheme change.
138
140
  */
139
141
  export declare function runId(raw: string): Result<RunId, IdValidationError>;
140
142
  /**
141
- * Generate a new `RunId` using a time-sortable UUIDv7 payload.
143
+ * Generate a new `RunId` with a human-readable UTC timestamp plus a
144
+ * random suffix (D0035). Shape: `run_YYYYMMDDTHHMMSSZ_<8 base36>`.
142
145
  *
143
- * The `run_` prefix plus the hyphen-stripped UUIDv7 yields 36 characters
144
- * that sort lexicographically by creation time — same pattern used for
145
- * `ReportId`. One generator call per pipeline invocation; every step
146
- * reads the resulting id from `AppContext.runId`.
146
+ * Lexicographic ordering matches creation time. One generator call per
147
+ * pipeline invocation; every step reads the resulting id from
148
+ * `AppContext.runId`.
147
149
  */
148
150
  export declare function generateRunId(): RunId;
149
151
  /**
@@ -44,41 +44,52 @@ export function taskId(raw) {
44
44
  }
45
45
  return ok(raw);
46
46
  }
47
+ /**
48
+ * Canonical shape emitted by `generateRunId` (D0035).
49
+ * `run_YYYYMMDDTHHMMSSZ_<8 lowercase base36>` — sortable + readable.
50
+ */
51
+ const RUN_ID_RE = /^run_\d{8}T\d{6}Z_[0-9a-z]{8}$/;
52
+ /**
53
+ * Legacy UUIDv7-hex shape. Accepted so existing runs in GCS and the
54
+ * Content Lake remain valid; never emitted by `generateRunId`.
55
+ */
56
+ const LEGACY_RUN_ID_RE = /^run_[0-9a-f]{32}$/;
47
57
  /**
48
58
  * Parse a raw string into a `RunId`.
49
59
  *
50
- * Valid format: `run_` prefix followed by alphanumeric characters.
60
+ * Accepts the canonical `run_YYYYMMDDTHHMMSSZ_<8 base36>` shape (D0035)
61
+ * and the legacy `run_<32 hex>` UUIDv7 shape for back-compat with runs
62
+ * generated before the scheme change.
51
63
  */
52
64
  export function runId(raw) {
53
- if (!raw.match(/^run_[a-zA-Z0-9]{8,}$/)) {
54
- return err({
55
- code: "INVALID_RUN_ID",
56
- raw,
57
- message: `Invalid RunId "${raw}": must match run_[a-zA-Z0-9]{8,}`,
58
- });
65
+ if (RUN_ID_RE.test(raw) || LEGACY_RUN_ID_RE.test(raw)) {
66
+ return ok(raw);
59
67
  }
60
- return ok(raw);
68
+ return err({
69
+ code: "INVALID_RUN_ID",
70
+ raw,
71
+ message: `Invalid RunId "${raw}": must match run_YYYYMMDDTHHMMSSZ_<8 base36> or legacy run_<32 hex>`,
72
+ });
61
73
  }
62
74
  /**
63
- * Generate a new `RunId` using a time-sortable UUIDv7 payload.
75
+ * Generate a new `RunId` with a human-readable UTC timestamp plus a
76
+ * random suffix (D0035). Shape: `run_YYYYMMDDTHHMMSSZ_<8 base36>`.
64
77
  *
65
- * The `run_` prefix plus the hyphen-stripped UUIDv7 yields 36 characters
66
- * that sort lexicographically by creation time — same pattern used for
67
- * `ReportId`. One generator call per pipeline invocation; every step
68
- * reads the resulting id from `AppContext.runId`.
78
+ * Lexicographic ordering matches creation time. One generator call per
79
+ * pipeline invocation; every step reads the resulting id from
80
+ * `AppContext.runId`.
69
81
  */
70
82
  export function generateRunId() {
71
- const now = Date.now();
72
- const uuid = crypto.randomUUID();
73
- // UUID v7: encode 48-bit timestamp in the first 12 hex chars
74
- const hex = now.toString(16).padStart(12, "0");
75
- const v7 = hex.slice(0, 8) +
76
- hex.slice(8, 12) +
77
- "7" +
78
- uuid.slice(15, 18) +
79
- uuid.slice(19, 23) +
80
- uuid.slice(24);
81
- return `run_${v7}`;
83
+ const ts = new Date()
84
+ .toISOString()
85
+ .replace(/[-:]/g, "")
86
+ .replace(/\.\d{3}Z$/, "Z");
87
+ const bytes = crypto.getRandomValues(new Uint8Array(8));
88
+ let suffix = "";
89
+ for (const b of bytes) {
90
+ suffix += (b % 36).toString(36);
91
+ }
92
+ return `run_${ts}_${suffix}`;
82
93
  }
83
94
  /**
84
95
  * Parse a raw string into a `SuiteId`.
@@ -322,11 +322,18 @@ export interface StoredTestResult {
322
322
  responseOutputTruncated?: boolean;
323
323
  /** Task description (e.g. "Functions - Webhook handler (gold)") */
324
324
  taskId: string;
325
- /** Token usage breakdown */
325
+ /**
326
+ * Token usage breakdown. All fields are optional because Promptfoo's
327
+ * row-level shape varies — cached results typically carry only
328
+ * `{ cached, total }` while fresh results carry `{ prompt, completion,
329
+ * total }`. Populated from `response.tokenUsage` on the raw result; see
330
+ * `extractStoredTestResults` in packages/eval/src/pipeline/calculate-scores.ts.
331
+ */
326
332
  tokenUsage?: {
327
- completion: number;
328
- prompt: number;
329
- total: number;
333
+ cached?: number;
334
+ completion?: number;
335
+ prompt?: number;
336
+ total?: number;
330
337
  };
331
338
  /** "gold" (with docs) or "baseline" (without docs) */
332
339
  variant: "baseline" | "gold";
@@ -35,6 +35,12 @@ export interface TestResult {
35
35
  providerLabel?: string;
36
36
  response: {
37
37
  output: string;
38
+ tokenUsage?: {
39
+ cached?: number;
40
+ completion?: number;
41
+ prompt?: number;
42
+ total?: number;
43
+ };
38
44
  };
39
45
  vars: Record<string, string>;
40
46
  }
@@ -28,6 +28,7 @@
28
28
  * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
29
29
  */
30
30
  import { ARTIFACT_REGISTRY, NotImplementedError, } from "../_vendor/ailf-core/index.js";
31
+ import { prepareUploadBody } from "./prepare-upload-body.js";
31
32
  import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
32
33
  export class ApiGatewayArtifactWriter {
33
34
  options;
@@ -143,8 +144,9 @@ export class ApiGatewayArtifactWriter {
143
144
  };
144
145
  }
145
146
  async putJsonRaw(uploadUrlPath, data, type) {
146
- const json = JSON.stringify(data);
147
- const bytes = Buffer.byteLength(json, "utf-8");
147
+ // W0059 — redact + serialize through the shared helper so the gateway
148
+ // path strips secrets before they reach the signed GCS URL.
149
+ const { body, bytes } = prepareUploadBody(data, "application/json");
148
150
  try {
149
151
  const signed = await this.fetchSignedUrl(uploadUrlPath, type);
150
152
  if (!signed)
@@ -153,7 +155,7 @@ export class ApiGatewayArtifactWriter {
153
155
  let putSuccess = false;
154
156
  try {
155
157
  const putRes = await fetch(signed.url, {
156
- body: json,
158
+ body,
157
159
  headers: signed.requiredHeaders,
158
160
  method: "PUT",
159
161
  });
@@ -26,6 +26,7 @@
26
26
  * credentials are present.
27
27
  */
28
28
  import { ARTIFACT_REGISTRY, BULK_ENTRY_KEY, NotImplementedError, } from "../_vendor/ailf-core/index.js";
29
+ import { prepareUploadBody } from "./prepare-upload-body.js";
29
30
  import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
30
31
  /**
31
32
  * How many entries to bundle into a single `/batch/upload-urls` request.
@@ -307,16 +308,18 @@ export class BatchingApiGatewayArtifactWriter {
307
308
  const start = Date.now();
308
309
  let ok = false;
309
310
  // Tracked outside the try so the `finally` metrics event still gets a
310
- // bytes figure when JSON.stringify itself throws (circular payload,
311
+ // bytes figure when serialization itself throws (circular payload,
311
312
  // bigint, etc.) — P5 requires we never hang the producer on a
312
313
  // pathological payload.
313
314
  let bytes = 0;
314
315
  try {
315
- const json = JSON.stringify(pending.payload);
316
- bytes = Buffer.byteLength(json, "utf-8");
316
+ // W0059 — redact + serialize through the shared helper so the
317
+ // batching path strips secrets before they reach the signed GCS URL.
318
+ const prepared = prepareUploadBody(pending.payload, "application/json");
319
+ bytes = prepared.bytes;
317
320
  const res = await fetch(signed.url, {
318
321
  method: "PUT",
319
- body: json,
322
+ body: prepared.body,
320
323
  headers: signed.requiredHeaders,
321
324
  });
322
325
  if (!res.ok) {
@@ -414,14 +417,15 @@ export class BatchingApiGatewayArtifactWriter {
414
417
  }
415
418
  if (!signed)
416
419
  return null;
417
- const json = JSON.stringify(payload);
418
- const bytes = Buffer.byteLength(json, "utf-8");
420
+ // W0059 — manifest upload also flows through the shared helper so the
421
+ // redaction policy covers every PUT the batching writer performs.
422
+ const { body, bytes } = prepareUploadBody(payload, "application/json");
419
423
  const putStart = Date.now();
420
424
  let putOk = false;
421
425
  try {
422
426
  const res = await fetch(signed.url, {
423
427
  method: "PUT",
424
- body: json,
428
+ body,
425
429
  headers: signed.requiredHeaders,
426
430
  });
427
431
  if (!res.ok) {
@@ -2,15 +2,17 @@
2
2
  * FanoutArtifactWriter — layers multiple writers so each `emit()` fans out
3
3
  * to every configured backend.
4
4
  *
5
- * D0033 M4 default wiring:
6
- * `FanoutArtifactWriter([ LocalFilesystemArtifactWriter, GcsArtifactWriter ])`
5
+ * Default wiring (W0064 reorder of D0033 M4):
6
+ * `FanoutArtifactWriter([ GcsArtifactWriter, LocalFilesystemArtifactWriter ])`
7
7
  *
8
8
  * Semantics:
9
9
  * - Fan out in declaration order. Every writer runs, even if earlier ones fail.
10
- * - Return the **first non-null ArtifactRef**. Local is listed first, so a
11
- * local success + GCS failure still produces a non-null ref pointing at
12
- * local — the pipeline succeeds and Studio retrieval works against the
13
- * local tree with a warning logged for the GCS leg.
10
+ * - Return the **first non-null ArtifactRef**. The remote backend is listed
11
+ * first so its ref wins when both succeed — the published manifest points
12
+ * at a cross-machine-readable store. A remote failure falls through to
13
+ * local, so a local success + remote failure still produces a non-null ref
14
+ * pointing at local and the pipeline succeeds with a warning on the remote
15
+ * leg.
14
16
  * - Failures on individual writers warn (via their own P5 paths) but do
15
17
  * not propagate. The fanout never throws.
16
18
  *
@@ -19,6 +21,7 @@
19
21
  * plus a recording test double.
20
22
  *
21
23
  * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M4)
24
+ * @see docs/design-docs/canonical-artifact-ref-selection.md — W0064 ordering
22
25
  */
23
26
  import type { ArtifactEntry, ArtifactRef, ArtifactType, ArtifactWriter, ArtifactWriterProgressOptions, AssociationValues, RunId, RunManifest } from "../_vendor/ailf-core/index.d.ts";
24
27
  export interface FanoutArtifactWriterOptions {
@@ -2,15 +2,17 @@
2
2
  * FanoutArtifactWriter — layers multiple writers so each `emit()` fans out
3
3
  * to every configured backend.
4
4
  *
5
- * D0033 M4 default wiring:
6
- * `FanoutArtifactWriter([ LocalFilesystemArtifactWriter, GcsArtifactWriter ])`
5
+ * Default wiring (W0064 reorder of D0033 M4):
6
+ * `FanoutArtifactWriter([ GcsArtifactWriter, LocalFilesystemArtifactWriter ])`
7
7
  *
8
8
  * Semantics:
9
9
  * - Fan out in declaration order. Every writer runs, even if earlier ones fail.
10
- * - Return the **first non-null ArtifactRef**. Local is listed first, so a
11
- * local success + GCS failure still produces a non-null ref pointing at
12
- * local — the pipeline succeeds and Studio retrieval works against the
13
- * local tree with a warning logged for the GCS leg.
10
+ * - Return the **first non-null ArtifactRef**. The remote backend is listed
11
+ * first so its ref wins when both succeed — the published manifest points
12
+ * at a cross-machine-readable store. A remote failure falls through to
13
+ * local, so a local success + remote failure still produces a non-null ref
14
+ * pointing at local and the pipeline succeeds with a warning on the remote
15
+ * leg.
14
16
  * - Failures on individual writers warn (via their own P5 paths) but do
15
17
  * not propagate. The fanout never throws.
16
18
  *
@@ -19,6 +21,7 @@
19
21
  * plus a recording test double.
20
22
  *
21
23
  * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M4)
24
+ * @see docs/design-docs/canonical-artifact-ref-selection.md — W0064 ordering
22
25
  */
23
26
  export class FanoutArtifactWriter {
24
27
  writers;
@@ -30,6 +30,7 @@
30
30
  import { Storage } from "@google-cloud/storage";
31
31
  import { ARTIFACT_REGISTRY, buildManifestPreview, } from "../_vendor/ailf-core/index.js";
32
32
  import { resolveUploadConcurrency } from "./parallel-emit.js";
33
+ import { prepareUploadBody } from "./prepare-upload-body.js";
33
34
  import { redactArtifactData } from "./redact-artifact.js";
34
35
  import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
35
36
  /**
@@ -83,18 +84,18 @@ export class GcsArtifactWriter {
83
84
  console.warn(` ⚠️ emit("${type}"): association.run is required, skipping`);
84
85
  return null;
85
86
  }
86
- // AC10 — redact at the writer boundary so secrets never reach GCS.
87
- const redacted = redactArtifactData(payload);
88
- // Preview reads the pre-redaction payload (same as local writer — the
89
- // preview carries a descriptor-controlled summary bounded by capBytes,
90
- // not the raw entry bytes).
87
+ // W0059 — redact + serialize through the shared helper so the three
88
+ // writer implementations can't drift. Preview reads the pre-redaction
89
+ // payload (same as local writer — the preview carries a descriptor-
90
+ // controlled summary bounded by capBytes, not the raw entry bytes).
91
+ const { body } = prepareUploadBody(payload, descriptor.mime);
91
92
  const preview = buildManifestPreview(descriptor, payload);
92
93
  if (descriptor.layout === "bulk") {
93
94
  const path = descriptor.objectPath(runId);
94
- const ref = await this.putBody(path, serializeForMime(redacted, descriptor.mime), {
95
+ const ref = await this.putBody(path, body, {
95
96
  layout: "bulk",
96
97
  mime: descriptor.mime,
97
- entryCount: entryCountOf(redacted),
98
+ entryCount: entryCountOf(payload),
98
99
  type,
99
100
  });
100
101
  if (!ref)
@@ -106,7 +107,11 @@ export class GcsArtifactWriter {
106
107
  // per-entry
107
108
  const entryKey = descriptor.formatEntryKey(association);
108
109
  const path = descriptor.objectPath(runId, entryKey);
109
- const ref = await this.putBody(path, serializeForMime(redacted, descriptor.mime), { layout: "per-entry", mime: descriptor.mime, type });
110
+ const ref = await this.putBody(path, body, {
111
+ layout: "per-entry",
112
+ mime: descriptor.mime,
113
+ type,
114
+ });
110
115
  if (!ref)
111
116
  return null;
112
117
  const finalRef = {
@@ -222,11 +227,11 @@ export class GcsArtifactWriter {
222
227
  return null;
223
228
  }
224
229
  const path = descriptor.objectPath(runId);
225
- const redacted = redactArtifactData(data);
226
- return this.putBody(path, serializeForMime(redacted, descriptor.mime), {
230
+ const { body } = prepareUploadBody(data, descriptor.mime);
231
+ return this.putBody(path, body, {
227
232
  layout: "bulk",
228
233
  mime: descriptor.mime,
229
- entryCount: entryCountOf(redacted),
234
+ entryCount: entryCountOf(data),
230
235
  type,
231
236
  });
232
237
  }
@@ -251,9 +256,7 @@ export class GcsArtifactWriter {
251
256
  continue;
252
257
  }
253
258
  const path = descriptor.objectPath(runId, entry.key);
254
- const redacted = redactArtifactData(entry.data);
255
- const body = serializeForMime(redacted, descriptor.mime);
256
- const bytes = Buffer.byteLength(body, "utf-8");
259
+ const { body, bytes } = prepareUploadBody(entry.data, descriptor.mime);
257
260
  const start = Date.now();
258
261
  let success = false;
259
262
  try {
@@ -416,14 +419,6 @@ class ConcurrencyLimiter {
416
419
  }
417
420
  }
418
421
  }
419
- function serializeForMime(payload, mime) {
420
- if (mime === "text/markdown" || mime === "application/yaml") {
421
- if (typeof payload === "string")
422
- return payload;
423
- return String(payload ?? "");
424
- }
425
- return JSON.stringify(payload);
426
- }
427
422
  function entryCountOf(data) {
428
423
  if (typeof data === "object" &&
429
424
  data !== null &&
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
3
+ *
4
+ * All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
5
+ * API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
6
+ * secrets are stripped before leaving the process. Routing each writer through
7
+ * this helper prevents drift — any future writer that skips the helper would
8
+ * fail the contract test in
9
+ * `src/__tests__/artifact-upload-redaction.test.ts`.
10
+ *
11
+ * NDJSON streaming is **not** handled here — each row is redacted independently
12
+ * by the NDJSON writer path before being concatenated into a part body.
13
+ */
14
+ import type { ArtifactMime } from "../_vendor/ailf-core/index.d.ts";
15
+ export interface PreparedUploadBody {
16
+ readonly body: string;
17
+ readonly bytes: number;
18
+ }
19
+ /**
20
+ * Redact, serialize, and size `payload` for upload.
21
+ *
22
+ * Serialization branches on `mime`:
23
+ * - `application/json` (and anything else JSON-shaped, including the
24
+ * single-shot side of `application/x-ndjson`) → `JSON.stringify`.
25
+ * - `text/markdown` / `application/yaml` → coerce to string via `String()`.
26
+ */
27
+ export declare function prepareUploadBody(payload: unknown, mime: ArtifactMime): PreparedUploadBody;
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
3
+ *
4
+ * All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
5
+ * API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
6
+ * secrets are stripped before leaving the process. Routing each writer through
7
+ * this helper prevents drift — any future writer that skips the helper would
8
+ * fail the contract test in
9
+ * `src/__tests__/artifact-upload-redaction.test.ts`.
10
+ *
11
+ * NDJSON streaming is **not** handled here — each row is redacted independently
12
+ * by the NDJSON writer path before being concatenated into a part body.
13
+ */
14
+ import { redactArtifactData } from "./redact-artifact.js";
15
+ /**
16
+ * Redact, serialize, and size `payload` for upload.
17
+ *
18
+ * Serialization branches on `mime`:
19
+ * - `application/json` (and anything else JSON-shaped, including the
20
+ * single-shot side of `application/x-ndjson`) → `JSON.stringify`.
21
+ * - `text/markdown` / `application/yaml` → coerce to string via `String()`.
22
+ */
23
+ export function prepareUploadBody(payload, mime) {
24
+ const redacted = redactArtifactData(payload);
25
+ const body = serializeForMime(redacted, mime);
26
+ const bytes = Buffer.byteLength(body, "utf-8");
27
+ return { body, bytes };
28
+ }
29
+ function serializeForMime(payload, mime) {
30
+ if (mime === "text/markdown" || mime === "application/yaml") {
31
+ if (typeof payload === "string")
32
+ return payload;
33
+ return String(payload ?? "");
34
+ }
35
+ return JSON.stringify(payload);
36
+ }
@@ -188,14 +188,19 @@ export function createArtifactWriter(config, logger, progress) {
188
188
  exclude,
189
189
  ...(remote ? {} : { progress }),
190
190
  });
191
+ // W0064 — when a remote backend is wired, list it first so its ArtifactRef
192
+ // wins the fanout's firstNonNull() selection and the published manifest
193
+ // points at a cross-machine-readable store. Local stays attached as the
194
+ // resilience tier: if the remote leg fails, firstNonNull falls through to
195
+ // local and the pipeline still produces a non-null ref.
191
196
  const base = remote
192
- ? new FanoutArtifactWriter([local, remote], { progress })
197
+ ? new FanoutArtifactWriter([remote, local], { progress })
193
198
  : local;
194
199
  if (!remote) {
195
200
  logger.debug(`Artifact writer: LocalFilesystemArtifactWriter only (rootDir=${rootDir})`);
196
201
  }
197
202
  else {
198
- logger.debug(`Artifact writer: FanoutArtifactWriter([local=${rootDir}, ${remote.constructor.name}])`);
203
+ logger.debug(`Artifact writer: FanoutArtifactWriter([${remote.constructor.name}, local=${rootDir}])`);
199
204
  }
200
205
  // Wrap in the accumulator so FinalizeRunStep can build a populated
201
206
  // RunManifest without each producer bookkeeping its own ArtifactRefs
@@ -130,6 +130,12 @@ export async function orchestratePipeline(ctx, steps) {
130
130
  const pipelineStart = Date.now();
131
131
  const hasJob = !!ctx.config.jobId;
132
132
  const jobUpdates = [];
133
+ // DOC-2064 — tracks whether the pre-finalize pipelineContext emit fired so
134
+ // the post-loop fallback can skip redundant writes. A second emit to the
135
+ // same GCS path produces a 412 Precondition Failed from the signed-URL
136
+ // writer (which enforces no-overwrite), logging spurious warnings on every
137
+ // successful run.
138
+ let pipelineContextEmitted = false;
133
139
  ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
134
140
  ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
135
141
  steps: steps.map((s) => s.name),
@@ -152,6 +158,16 @@ export async function orchestratePipeline(ctx, steps) {
152
158
  ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
153
159
  ctx.logger.section(step.name);
154
160
  exportPhase.maybeOpen(step.name);
161
+ // DOC-2064 — emit pipelineContext BEFORE finalize-run so the artifact
162
+ // ref registers with the accumulator and lands in RunManifest.artifacts,
163
+ // which PublishReportStep then snapshots into Report.artifactManifest.
164
+ // The previous post-loop emit ran after publish and was invisible to
165
+ // Content Lake readers. The failure-path capture below still fires on
166
+ // pre-finalize aborts so aborted runs retain the on-disk artifact.
167
+ if (step.name === "finalize-run") {
168
+ await capturePipelineContext(ctx, state, results);
169
+ pipelineContextEmitted = true;
170
+ }
155
171
  // Report current step progress
156
172
  if (hasJob) {
157
173
  await reportJobProgress(ctx, step.name, i, steps.length, "running", undefined, jobUpdates);
@@ -175,8 +191,12 @@ export async function orchestratePipeline(ctx, steps) {
175
191
  }
176
192
  // Capture pipeline context before exiting. `job-updates` was an
177
193
  // observability-only capture not tied to a registered artifact type;
178
- // dropped in W0050. Use the JobStore path for job telemetry.
179
- await capturePipelineContext(ctx, state, results);
194
+ // dropped in W0050. Use the JobStore path for job telemetry. Skip
195
+ // when the pre-finalize emit already fired to avoid a 412 overwrite
196
+ // warning (DOC-2064).
197
+ if (!pipelineContextEmitted) {
198
+ await capturePipelineContext(ctx, state, results);
199
+ }
180
200
  exportPhase.close();
181
201
  return {
182
202
  belowCritical: state.belowCritical,
@@ -231,9 +251,18 @@ export async function orchestratePipeline(ctx, steps) {
231
251
  ctx.logger.warn("Failed to report job completion — continuing");
232
252
  }
233
253
  }
234
- // Capture pipeline context. `job-updates` observability captures were
235
- // dropped in Slice 6.1 — JobStore is the supported telemetry path.
236
- await capturePipelineContext(ctx, state, results);
254
+ // DOC-2064 — post-loop fallback. Only fires when the pre-finalize emit
255
+ // inside the step loop didn't run — typically because the pipeline has no
256
+ // finalize-run step (test harnesses, air-gapped runs). Skipping this when
257
+ // the pre-finalize emit already fired avoids a 412 Precondition Failed
258
+ // from the signed-URL writer, which refuses to overwrite the existing
259
+ // path. The tradeoff is that pipelineContext captures pipeline state as
260
+ // of finalize-run, not post-publish — reportId is absent. Acceptable
261
+ // because runId is the primary join key and reportId is trivially
262
+ // looked up from Content Lake via runId.
263
+ if (!pipelineContextEmitted) {
264
+ await capturePipelineContext(ctx, state, results);
265
+ }
237
266
  exportPhase.close();
238
267
  return {
239
268
  belowCritical: state.belowCritical,
@@ -143,16 +143,18 @@ export class CalculateScoresStep {
143
143
  // The full responseOutput lives in the GCS artifact; PublishReportStep
144
144
  // later strips it from the inline Content Lake document when this
145
145
  // upload succeeds.
146
+ //
147
+ // The emits flow through `ctx.artifactWriter`, which the composition
148
+ // root wraps in `AccumulatingArtifactWriter`. That's where the
149
+ // authoritative merged ref is built; `FinalizeRunStep` reads it
150
+ // straight from the accumulator, so producer-side registration on
151
+ // `state.artifactRefs` would only clobber the accumulator's full set
152
+ // with a partial single-entry ref.
153
+ //
146
154
  // W0050 — ctx.artifactWriter is always present; no guard needed.
147
155
  const testResults = tryReadTestResults(ctx.config.rootDir);
148
156
  if (testResults?.length) {
149
- const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults, ctx.config.mode);
150
- if (artifactRef) {
151
- state.artifactRefs = {
152
- ...state.artifactRefs,
153
- testOutputs: artifactRef,
154
- };
155
- }
157
+ await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults, ctx.config.mode);
156
158
  }
157
159
  const criticalSuffix = belowCritical.length > 0
158
160
  ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
@@ -86,11 +86,12 @@ export class FinalizeRunStep {
86
86
  });
87
87
  // W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
88
88
  // `AccumulatingArtifactWriter`, which keeps a map of every ref any
89
- // producer emitted this run. Merge that into `state.artifactRefs` so
90
- // the manifest reflects the FULL set — not just the subset producers
91
- // happened to register manually. When the writer is a NoOp / plain
92
- // decorator without accumulation, `aggregated` stays empty and the
93
- // manifest falls back to the producer-side registration.
89
+ // producer emitted this run — the authoritative FULL set, merged by
90
+ // entry key. `state.artifactRefs` is a producer-side fallback for
91
+ // writers that don't accumulate (NoOp / plain decorators). The
92
+ // accumulator wins per type when both exist, because producer-side
93
+ // registrations tend to capture only the last ref from a parallel
94
+ // batch and would otherwise clobber the merged entries list.
94
95
  //
95
96
  // W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
96
97
  // so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
@@ -99,8 +100,8 @@ export class FinalizeRunStep {
99
100
  ? accumulator.getAccumulatedArtifactRefs()
100
101
  : {};
101
102
  const artifacts = {
102
- ...aggregated,
103
103
  ...(state.artifactRefs ?? {}),
104
+ ...aggregated,
104
105
  };
105
106
  const manifest = {
106
107
  version: 1,
@@ -47,6 +47,12 @@ export interface RawTestResult {
47
47
  };
48
48
  response: {
49
49
  output: string;
50
+ tokenUsage?: {
51
+ cached?: number;
52
+ completion?: number;
53
+ prompt?: number;
54
+ total?: number;
55
+ };
50
56
  };
51
57
  testCase?: {
52
58
  description?: string;
@@ -223,6 +223,7 @@ export function extractStoredTestResults(resultsPath) {
223
223
  }
224
224
  dimensions.push({ dimension, reason, score });
225
225
  }
226
+ const tokenUsage = result.response?.tokenUsage;
226
227
  testResults.push({
227
228
  area,
228
229
  cost: result.cost || undefined,
@@ -233,6 +234,7 @@ export function extractStoredTestResults(resultsPath) {
233
234
  responseOutput,
234
235
  ...(responseOutputTruncated && { responseOutputTruncated: true }),
235
236
  taskId,
237
+ ...(tokenUsage && { tokenUsage }),
236
238
  variant,
237
239
  });
238
240
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "3.1.0",
3
+ "version": "3.2.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -52,8 +52,8 @@
52
52
  "@types/node": "^22.13.1",
53
53
  "tsx": "^4.19.2",
54
54
  "typescript": "^5.7.3",
55
- "@sanity/ailf-shared": "0.1.0",
56
- "@sanity/ailf-core": "0.1.0"
55
+ "@sanity/ailf-core": "0.1.0",
56
+ "@sanity/ailf-shared": "0.1.0"
57
57
  },
58
58
  "scripts": {
59
59
  "build": "tsc && tsx scripts/bundle-workspace-deps.ts",