@sanity/ailf 2.7.1 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48):
  1. package/dist/_vendor/ailf-core/artifact-registry.d.ts +72 -0
  2. package/dist/_vendor/ailf-core/artifact-registry.js +150 -0
  3. package/dist/_vendor/ailf-core/index.d.ts +2 -1
  4. package/dist/_vendor/ailf-core/index.js +2 -1
  5. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
  6. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +56 -0
  7. package/dist/_vendor/ailf-core/ports/artifact-writer.js +28 -0
  8. package/dist/_vendor/ailf-core/ports/context.d.ts +13 -3
  9. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
  10. package/dist/_vendor/ailf-core/ports/index.js +1 -1
  11. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +9 -0
  12. package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
  13. package/dist/_vendor/ailf-core/types/index.d.ts +110 -68
  14. package/dist/_vendor/ailf-core/types/index.js +1 -1
  15. package/dist/_vendor/ailf-shared/index.d.ts +2 -0
  16. package/dist/_vendor/ailf-shared/index.js +2 -0
  17. package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
  18. package/dist/_vendor/ailf-shared/run-context.js +17 -0
  19. package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
  20. package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
  21. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +39 -0
  22. package/dist/artifact-capture/api-gateway-artifact-writer.js +148 -0
  23. package/dist/artifact-capture/gcs-artifact-writer.d.ts +30 -0
  24. package/dist/artifact-capture/gcs-artifact-writer.js +119 -0
  25. package/dist/commands/publish.js +3 -2
  26. package/dist/composition-root.d.ts +3 -3
  27. package/dist/composition-root.js +20 -15
  28. package/dist/orchestration/build-step-sequence.js +6 -1
  29. package/dist/orchestration/steps/calculate-scores-step.js +42 -2
  30. package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
  31. package/dist/orchestration/steps/finalize-run-step.js +103 -0
  32. package/dist/orchestration/steps/publish-report-step.js +19 -39
  33. package/dist/pipeline/calculate-scores.js +13 -2
  34. package/dist/pipeline/provenance.d.ts +24 -44
  35. package/dist/pipeline/provenance.js +17 -165
  36. package/dist/pipeline/report-title.d.ts +2 -2
  37. package/dist/pipeline/run-context.d.ts +57 -0
  38. package/dist/pipeline/run-context.js +156 -0
  39. package/dist/pipeline/upload-test-outputs.d.ts +26 -0
  40. package/dist/pipeline/upload-test-outputs.js +34 -0
  41. package/dist/report-store.js +4 -2
  42. package/package.json +3 -3
  43. package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
  44. package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
  45. package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
  46. package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
  47. package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
  48. package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
@@ -0,0 +1,119 @@
1
+ /**
2
+ * GcsArtifactWriter — writes AILF run artifacts + manifest directly to GCS.
3
+ *
4
+ * Uses Application Default Credentials (ADC). Used when the CLI runs in CI or
5
+ * anywhere ADC is configured — the client talks to GCS without the API Gateway
6
+ * acting as a middleman.
7
+ *
8
+ * Paths come from `ARTIFACT_REGISTRY` so writers, signers, and readers agree.
9
+ *
10
+ * Design principles:
11
+ * - P5: Non-blocking — upload failure returns null, never throws.
12
+ * - Lazy client — Storage created on first write.
13
+ *
14
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
15
+ */
16
+ import { Storage } from "@google-cloud/storage";
17
+ import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
18
+ export class GcsArtifactWriter {
19
+ client = null;
20
+ options;
21
+ constructor(options) {
22
+ this.options = options;
23
+ }
24
+ async writeBulk(type, runId, data) {
25
+ const descriptor = ARTIFACT_REGISTRY[type];
26
+ const path = descriptor.objectPath(runId);
27
+ return this.putJson(path, data, {
28
+ layout: "bulk",
29
+ entryCount: entryCountOf(data),
30
+ });
31
+ }
32
+ async writePerEntry(type, runId, entries) {
33
+ const descriptor = ARTIFACT_REGISTRY[type];
34
+ if (!descriptor.parseEntryKey) {
35
+ console.warn(` ⚠️ writePerEntry called for "${type}" but the registry has no parseEntryKey`);
36
+ return null;
37
+ }
38
+ const storage = this.getClient();
39
+ const uploaded = [];
40
+ let totalBytes = 0;
41
+ for (const entry of entries) {
42
+ const parsed = descriptor.parseEntryKey(entry.key);
43
+ if (!parsed.ok) {
44
+ console.warn(` ⚠️ Skipping entry with invalid key "${entry.key}": ${parsed.reason}`);
45
+ continue;
46
+ }
47
+ const path = descriptor.objectPath(runId, entry.key);
48
+ const json = JSON.stringify(entry.data);
49
+ const bytes = Buffer.byteLength(json, "utf-8");
50
+ try {
51
+ await storage
52
+ .bucket(this.options.bucket)
53
+ .file(path)
54
+ .save(json, { contentType: "application/json" });
55
+ uploaded.push({ key: entry.key, bytes });
56
+ totalBytes += bytes;
57
+ }
58
+ catch (err) {
59
+ const message = err instanceof Error ? err.message : String(err);
60
+ console.warn(` ⚠️ Artifact entry upload failed (non-blocking): ${path} — ${message}`);
61
+ }
62
+ }
63
+ if (uploaded.length === 0)
64
+ return null;
65
+ return {
66
+ store: "gcs",
67
+ bucket: this.options.bucket,
68
+ path: `runs/${runId}/${descriptor.slug}`,
69
+ bytes: totalBytes,
70
+ entryCount: uploaded.length,
71
+ layout: "per-entry",
72
+ entries: uploaded,
73
+ };
74
+ }
75
+ async writeManifest(runId, manifest) {
76
+ const path = `runs/${runId}/manifest.json`;
77
+ return this.putJson(path, manifest, { layout: "bulk" });
78
+ }
79
+ async putJson(path, data, meta) {
80
+ const json = JSON.stringify(data);
81
+ const bytes = Buffer.byteLength(json, "utf-8");
82
+ try {
83
+ const storage = this.getClient();
84
+ await storage
85
+ .bucket(this.options.bucket)
86
+ .file(path)
87
+ .save(json, { contentType: "application/json" });
88
+ return {
89
+ store: "gcs",
90
+ bucket: this.options.bucket,
91
+ path,
92
+ bytes,
93
+ entryCount: meta.entryCount,
94
+ layout: meta.layout,
95
+ };
96
+ }
97
+ catch (err) {
98
+ const message = err instanceof Error ? err.message : String(err);
99
+ console.warn(` ⚠️ Artifact upload failed (non-blocking): ${path} — ${message}`);
100
+ return null;
101
+ }
102
+ }
103
+ getClient() {
104
+ if (this.client)
105
+ return this.client;
106
+ this.client = new Storage();
107
+ return this.client;
108
+ }
109
+ }
110
+ function entryCountOf(data) {
111
+ if (typeof data === "object" &&
112
+ data !== null &&
113
+ "entries" in data &&
114
+ typeof data.entries === "object") {
115
+ return Object.keys(data.entries)
116
+ .length;
117
+ }
118
+ return undefined;
119
+ }
@@ -55,7 +55,7 @@ export function createPublishCommand() {
55
55
  * the summary metadata and environment. Some fields (contextHash,
56
56
  * promptfooUrl) are not available for manual publishes.
57
57
  */
58
- function buildProvenanceFromSummary(summary) {
58
+ function buildProvenanceFromSummary(summary, runId) {
59
59
  const areas = summary.scores.map((s) => s.feature);
60
60
  const mode = (process.env.EVAL_MODE ?? "literacy");
61
61
  const source = {
@@ -76,6 +76,7 @@ function buildProvenanceFromSummary(summary) {
76
76
  areas,
77
77
  mode,
78
78
  rootDir: ROOT,
79
+ runId,
79
80
  source,
80
81
  };
81
82
  }
@@ -145,7 +146,7 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
145
146
  // -----------------------------------------------------------------------
146
147
  // 2. Build provenance
147
148
  // -----------------------------------------------------------------------
148
- const provenanceInput = buildProvenanceFromSummary(summary);
149
+ const provenanceInput = buildProvenanceFromSummary(summary, ctx.runId);
149
150
  const provenance = buildProvenance(provenanceInput);
150
151
  // -----------------------------------------------------------------------
151
152
  // 3. Create report
@@ -15,7 +15,7 @@
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import { type AppContext, type ArtifactUploader, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
18
+ import { type AppContext, type ArtifactWriter, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Create a fully wired AppContext from resolved configuration.
21
21
  *
@@ -24,7 +24,7 @@ import { type AppContext, type ArtifactUploader, type AssertionRegistration, typ
24
24
  */
25
25
  export declare function createAppContext(config: ResolvedConfig): AppContext;
26
26
  /**
27
- * Selects an ArtifactUploader implementation based on available credentials.
27
+ * Selects an ArtifactWriter implementation based on available credentials.
28
28
  *
29
29
  * Selection order:
30
30
  * 1. config.artifactUpload === false → always skip (explicit opt-out)
@@ -38,7 +38,7 @@ export declare function createAppContext(config: ResolvedConfig): AppContext;
38
38
  *
39
39
  * Exported for unit-test access; not part of the public package API.
40
40
  */
41
- export declare function createArtifactUploader(config: ResolvedConfig, logger: Logger): ArtifactUploader | undefined;
41
+ export declare function createArtifactWriter(config: ResolvedConfig, logger: Logger): ArtifactWriter | undefined;
42
42
  /**
43
43
  * Generic Promptfoo assertion types available to all evaluation modes.
44
44
  *
@@ -16,11 +16,11 @@
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
18
  import { join } from "node:path";
19
- import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
20
- import { ApiGatewayArtifactUploader } from "./artifact-capture/api-gateway-artifact-uploader.js";
19
+ import { InMemoryPluginRegistry, NoOpArtifactCollector, generateRunId, } from "./_vendor/ailf-core/index.js";
20
+ import { ApiGatewayArtifactWriter } from "./artifact-capture/api-gateway-artifact-writer.js";
21
21
  import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
22
22
  import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
23
- import { GcsReportArtifactUploader } from "./artifact-capture/gcs-report-artifact-uploader.js";
23
+ import { GcsArtifactWriter } from "./artifact-capture/gcs-artifact-writer.js";
24
24
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
25
25
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
26
26
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
@@ -82,13 +82,17 @@ export function createAppContext(config) {
82
82
  })
83
83
  : fsCollector;
84
84
  }
85
- // Report artifact uploader uploads structured files to GCS at known
86
- // paths for Studio to fetch via signed URLs (D0030). Auto-detects the
87
- // right adapter from available credentials; defaults bucket to
88
- // "ailf-artifacts". Set artifactUpload: false to opt out entirely.
89
- const artifactUploader = createArtifactUploader(config, logger);
85
+ // Artifact writer — writes run artifacts + manifest to GCS at known
86
+ // `runs/{runId}/…` paths (D0032). Auto-detects the right adapter from
87
+ // available credentials; defaults bucket to "ailf-artifacts". Set
88
+ // artifactUpload: false to opt out entirely.
89
+ const artifactWriter = createArtifactWriter(config, logger);
90
+ // Generate the pipeline's RunId once; every downstream step reads it
91
+ // from the context (D0032).
92
+ const runId = generateRunId();
93
+ logger.debug(`Pipeline runId: ${runId}`);
90
94
  return {
91
- artifactUploader,
95
+ artifactWriter,
92
96
  cache,
93
97
  collector,
94
98
  config,
@@ -97,6 +101,7 @@ export function createAppContext(config) {
97
101
  logger,
98
102
  registry,
99
103
  reportStore,
104
+ runId,
100
105
  sinks,
101
106
  taskSource,
102
107
  };
@@ -124,7 +129,7 @@ function createLogger() {
124
129
  */
125
130
  const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
126
131
  /**
127
- * Selects an ArtifactUploader implementation based on available credentials.
132
+ * Selects an ArtifactWriter implementation based on available credentials.
128
133
  *
129
134
  * Selection order:
130
135
  * 1. config.artifactUpload === false → always skip (explicit opt-out)
@@ -138,7 +143,7 @@ const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
138
143
  *
139
144
  * Exported for unit-test access; not part of the public package API.
140
145
  */
141
- export function createArtifactUploader(config, logger) {
146
+ export function createArtifactWriter(config, logger) {
142
147
  if (config.artifactUpload === false) {
143
148
  logger.debug("Artifact upload explicitly disabled via artifactUpload=false");
144
149
  return undefined;
@@ -148,13 +153,13 @@ export function createArtifactUploader(config, logger) {
148
153
  // We treat the presence of either env var as the user opting in to ADC.
149
154
  const hasGcsCredentials = Boolean(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GCLOUD_PROJECT);
150
155
  if (hasGcsCredentials) {
151
- logger.debug(`Artifact uploader: GcsReportArtifactUploader (direct GCS via ADC, bucket=${bucket})`);
152
- return new GcsReportArtifactUploader({ bucket });
156
+ logger.debug(`Artifact writer: GcsArtifactWriter (direct GCS via ADC, bucket=${bucket})`);
157
+ return new GcsArtifactWriter({ bucket });
153
158
  }
154
159
  // Local dev — request signed PUT URLs from the API gateway, no GCS creds needed.
155
160
  if (config.apiKey && config.apiUrl) {
156
- logger.debug(`Artifact uploader: ApiGatewayArtifactUploader (signed URL via ${config.apiUrl}, bucket=${bucket})`);
157
- return new ApiGatewayArtifactUploader({
161
+ logger.debug(`Artifact writer: ApiGatewayArtifactWriter (signed URL via ${config.apiUrl}, bucket=${bucket})`);
162
+ return new ApiGatewayArtifactWriter({
158
163
  apiBaseUrl: config.apiUrl,
159
164
  apiKey: config.apiKey,
160
165
  bucket,
@@ -11,6 +11,7 @@ import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
11
11
  import { CompareStep } from "./steps/compare-step.js";
12
12
  import { DiscoveryReportStep } from "./steps/discovery-report-step.js";
13
13
  import { FetchDocsStep } from "./steps/fetch-docs-step.js";
14
+ import { FinalizeRunStep } from "./steps/finalize-run-step.js";
14
15
  import { GapAnalysisStep } from "./steps/gap-analysis-step.js";
15
16
  import { GenerateConfigsStep } from "./steps/generate-configs-step.js";
16
17
  import { GraderConsistencyStep } from "./steps/grader-consistency-step.js";
@@ -76,7 +77,11 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
76
77
  if (config.gapAnalysisEnabled) {
77
78
  steps.push(new GapAnalysisStep());
78
79
  }
79
- // Step 4b: Publish report (optional, when token is configured)
80
+ // Step 4c: Finalize the run — write `runs/{runId}/manifest.json` with the
81
+ // catalog of artifacts produced so far. Skipped silently when no
82
+ // artifactWriter is wired (D0032).
83
+ steps.push(new FinalizeRunStep(pipelineStart));
84
+ // Step 4d: Publish report (optional, when token is configured)
80
85
  if (config.publishEnabled) {
81
86
  steps.push(new PublishReportStep(pipelineStart, {
82
87
  publishTag: config.publishTag,
@@ -4,8 +4,8 @@
4
4
  * Calls calculateAndWriteScores() from pipeline/calculate-scores.ts with
5
5
  * typed options derived from AppContext. No env bridge needed.
6
6
  */
7
- import { existsSync } from "node:fs";
8
- import { join } from "path";
7
+ import { existsSync, readFileSync } from "node:fs";
8
+ import { join, resolve } from "path";
9
9
  import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
10
10
  import { getStepInputPaths } from "../../pipeline/cache.js";
11
11
  import { buildCacheContext } from "../cache-context.js";
@@ -13,6 +13,7 @@ import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
13
13
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
14
14
  import { resultsFileForMode } from "../../pipeline/eval-constants.js";
15
15
  import { loadSource } from "../../sources.js";
16
+ import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
16
17
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
17
18
  export class CalculateScoresStep {
18
19
  name = "calculate-scores";
@@ -132,6 +133,27 @@ export class CalculateScoresStep {
132
133
  ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
133
134
  }
134
135
  }
136
+ // Upload testOutputs to GCS (D0032 — non-blocking, P5).
137
+ // Read from test-results.json rather than score-summary.json: the
138
+ // gap-analysis step (downstream) is the one that enriches score-summary
139
+ // with testResults, so at this point the summary still has an empty
140
+ // testResults[]. test-results.json is written by calculateAndWriteScores
141
+ // above and carries the full per-test shape we need for per-entry upload.
142
+ // The full responseOutput lives in the GCS artifact; PublishReportStep
143
+ // later strips it from the inline Content Lake document when this
144
+ // upload succeeds.
145
+ if (ctx.artifactWriter) {
146
+ const testResults = tryReadTestResults(ctx.config.rootDir);
147
+ if (testResults?.length) {
148
+ const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults);
149
+ if (artifactRef) {
150
+ state.artifactRefs = {
151
+ ...state.artifactRefs,
152
+ testOutputs: artifactRef,
153
+ };
154
+ }
155
+ }
156
+ }
135
157
  const criticalSuffix = belowCritical.length > 0
136
158
  ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
137
159
  : "";
@@ -148,3 +170,21 @@ export class CalculateScoresStep {
148
170
  return buildCacheContext(ctx.config);
149
171
  }
150
172
  }
173
+ /**
174
+ * Read the per-test result set written by `calculateAndWriteScores`.
175
+ *
176
+ * This is the authoritative source for `uploadTestOutputs` at the time
177
+ * CalculateScoresStep runs — `score-summary.json` doesn't carry
178
+ * `testResults[]` until `gap-analysis-step` enriches it downstream.
179
+ */
180
+ function tryReadTestResults(rootDir) {
181
+ const path = resolve(rootDir, "results", "latest", "test-results.json");
182
+ if (!existsSync(path))
183
+ return undefined;
184
+ try {
185
+ return JSON.parse(readFileSync(path, "utf-8"));
186
+ }
187
+ catch {
188
+ return undefined;
189
+ }
190
+ }
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
3
+ *
4
+ * Inserts between `GapAnalysis` and `PublishReport`. Assembles a
5
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
6
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
7
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
8
+ * of truth for artifact locations; `PublishReportStep` snapshots the
9
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
10
+ *
11
+ * Design principles:
12
+ * - Single writer — one `writeManifest()` call per pipeline run.
13
+ * - Idempotent — retries produce the same manifest bytes for the same inputs.
14
+ * - Skipped when no writer is wired (local/air-gapped runs stay functional).
15
+ *
16
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
+ */
18
+ import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
19
+ export declare class FinalizeRunStep implements PipelineStep {
20
+ private readonly pipelineStart;
21
+ private readonly options;
22
+ readonly name = "finalize-run";
23
+ readonly optional = true;
24
+ constructor(pipelineStart: number, options?: {
25
+ evalFingerprint?: string;
26
+ });
27
+ check(): ValidationIssue[];
28
+ execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
29
+ }
@@ -0,0 +1,103 @@
1
+ /**
2
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
3
+ *
4
+ * Inserts between `GapAnalysis` and `PublishReport`. Assembles a
5
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
6
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
7
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
8
+ * of truth for artifact locations; `PublishReportStep` snapshots the
9
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
10
+ *
11
+ * Design principles:
12
+ * - Single writer — one `writeManifest()` call per pipeline run.
13
+ * - Idempotent — retries produce the same manifest bytes for the same inputs.
14
+ * - Skipped when no writer is wired (local/air-gapped runs stay functional).
15
+ *
16
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
+ */
18
+ import { existsSync, readFileSync } from "node:fs";
19
+ import { resolve } from "node:path";
20
+ import { buildRunContext } from "../../pipeline/run-context.js";
21
+ import { loadSource } from "../../sources.js";
22
+ import { configToSourceOverrides } from "../config-to-source-overrides.js";
23
+ export class FinalizeRunStep {
24
+ pipelineStart;
25
+ options;
26
+ name = "finalize-run";
27
+ optional = true;
28
+ constructor(pipelineStart, options = {}) {
29
+ this.pipelineStart = pipelineStart;
30
+ this.options = options;
31
+ }
32
+ check() {
33
+ return [];
34
+ }
35
+ async execute(ctx, state) {
36
+ const start = Date.now();
37
+ if (!ctx.artifactWriter) {
38
+ return {
39
+ status: "skipped",
40
+ reason: "No artifactWriter wired — manifest is only written when a writer is available",
41
+ };
42
+ }
43
+ // Resolve the source (same input buildProvenance uses).
44
+ const overrides = configToSourceOverrides(ctx.config);
45
+ const resolvedSource = loadSource(ctx.config.source, overrides);
46
+ // Optional: try to read the on-disk summary for test mode inference,
47
+ // but don't fail finalize if it's missing — the manifest should still
48
+ // be written so artifacts have a catalog.
49
+ const maybeSummary = tryReadScoreSummary(ctx.config.rootDir);
50
+ const runContext = buildRunContext({
51
+ areas: maybeSummary?.scores?.map((s) => s.feature) ?? ctx.config.areas ?? [],
52
+ callerGit: ctx.config.callerGit,
53
+ evalFingerprint: state.evalFingerprint ?? this.options.evalFingerprint,
54
+ logger: ctx.logger,
55
+ mode: ctx.config.mode,
56
+ rootDir: ctx.config.rootDir,
57
+ source: resolvedSource,
58
+ taskIds: ctx.config.tasks,
59
+ });
60
+ const manifest = {
61
+ version: 1,
62
+ runId: ctx.runId,
63
+ createdAt: new Date().toISOString(),
64
+ durationMs: Date.now() - this.pipelineStart,
65
+ status: "completed",
66
+ context: runContext,
67
+ outcomes: state.testSummary
68
+ ? { testSummary: state.testSummary }
69
+ : undefined,
70
+ promptfooUrls: state.promptfooUrls,
71
+ artifacts: state.artifactRefs ?? {},
72
+ };
73
+ const ref = await ctx.artifactWriter.writeManifest(ctx.runId, manifest);
74
+ if (!ref) {
75
+ // Non-blocking: writer logged the warning. Still populate state so
76
+ // publish can snapshot `artifacts` even without a persisted manifest.
77
+ state.runManifest = manifest;
78
+ return {
79
+ durationMs: Date.now() - start,
80
+ status: "success",
81
+ summary: "Run manifest computed (GCS write failed — non-blocking)",
82
+ };
83
+ }
84
+ state.runManifest = manifest;
85
+ const artifactCount = Object.keys(manifest.artifacts).length;
86
+ return {
87
+ durationMs: Date.now() - start,
88
+ status: "success",
89
+ summary: `Run manifest written to ${ref.path} (${artifactCount} artifact ref${artifactCount === 1 ? "" : "s"})`,
90
+ };
91
+ }
92
+ }
93
+ function tryReadScoreSummary(rootDir) {
94
+ const path = resolve(rootDir, "results", "latest", "score-summary.json");
95
+ if (!existsSync(path))
96
+ return undefined;
97
+ try {
98
+ return JSON.parse(readFileSync(path, "utf-8"));
99
+ }
100
+ catch {
101
+ return undefined;
102
+ }
103
+ }
@@ -113,21 +113,24 @@ export class PublishReportStep {
113
113
  tag: this.options.publishTag ?? ctx.config.publishTag,
114
114
  title,
115
115
  };
116
- // Upload test output artifacts to GCS (D0030 — non-blocking, P5).
117
- // When upload succeeds, strip responseOutput from the inline
118
- // testResults[] so the Content Lake document carries only the slim
119
- // shape; the full output lives in the GCS artifact. When upload
120
- // fails, leave the inline shape intact so Studio's drill-down UI
121
- // still works via the backward-compat fallback.
122
- if (ctx.artifactUploader && summary.testResults?.length) {
123
- const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
124
- if (artifactRef) {
125
- report.artifacts = { testOutputs: artifactRef };
126
- report.summary = {
127
- ...summary,
128
- testResults: summary.testResults.map(slimTestResult),
129
- };
130
- }
116
+ // Snapshot the artifact manifest from FinalizeRunStep's output (D0032).
117
+ // The source of truth is `runs/{runId}/manifest.json` in GCS; the report
118
+ // carries a denormalized copy so Studio can render drill-down state
119
+ // without an extra GCS fetch.
120
+ const artifactManifest = state.runManifest?.artifacts;
121
+ if (artifactManifest && Object.keys(artifactManifest).length > 0) {
122
+ report.artifactManifest = artifactManifest;
123
+ }
124
+ // When testOutputs was uploaded to GCS, strip responseOutput from the
125
+ // inline testResults[] so the Content Lake document stays slim — the
126
+ // full output lives in the GCS artifact. When no testOutputs artifact
127
+ // exists, leave the inline shape intact so Studio's drill-down UI
128
+ // falls back to it.
129
+ if (artifactManifest?.testOutputs && summary.testResults?.length) {
130
+ report.summary = {
131
+ ...summary,
132
+ testResults: summary.testResults.map(slimTestResult),
133
+ };
131
134
  }
132
135
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
133
136
  state.reportId = reportId;
@@ -221,6 +224,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
221
224
  mode,
222
225
  promptfooUrls: options.promptfooUrls,
223
226
  rootDir: ctx.config.rootDir,
227
+ runId: ctx.runId,
224
228
  sanityDocumentIds,
225
229
  source,
226
230
  sourceReportId: ctx.config.sourceReportId,
@@ -236,30 +240,6 @@ function slimTestResult(tr) {
236
240
  const { responseOutput: _o, responseOutputTruncated: _t, ...rest } = tr;
237
241
  return rest;
238
242
  }
239
- /**
240
- * Extract test outputs from StoredTestResult[] and upload as a single
241
- * JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
242
- * to match the lookup pattern in Studio's JudgmentList component.
243
- *
244
- * Non-blocking: returns null if upload fails (P5).
245
- */
246
- async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
247
- const entries = {};
248
- for (const tr of testResults) {
249
- const key = `${tr.taskId}::${tr.modelId}`;
250
- entries[key] = {
251
- responseOutput: tr.responseOutput ?? "",
252
- responseOutputTruncated: tr.responseOutputTruncated ?? false,
253
- };
254
- }
255
- const artifact = {
256
- version: 1,
257
- reportId,
258
- createdAt,
259
- entries,
260
- };
261
- return uploader.upload(reportId, "test-outputs.json", artifact);
262
- }
263
243
  /**
264
244
  * Fan out a report to all configured sinks.
265
245
  *
@@ -157,8 +157,19 @@ export function extractGraderJudgments(resultsPath) {
157
157
  }
158
158
  return judgments;
159
159
  }
160
- /** Maximum characters to store for model response output */
161
- const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
160
+ /**
161
+ * Maximum characters (JS string length, not bytes) to store for model
162
+ * response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
163
+ * pathological multi-byte UTF-8 could encode to ~4 MB, still well within
164
+ * per-entry GCS object limits.
165
+ *
166
+ * Raised from 8 000 to 1 000 000 in W0048 because the per-entry artifact
167
+ * layout (D0032) makes the cap irrelevant to Studio's fetch cost — each
168
+ * entry is fetched independently on click, so a larger ceiling only costs
169
+ * GCS bytes, not main-thread blocking or baseline report payload.
170
+ * `responseOutputTruncated` still flips for the extreme tail.
171
+ */
172
+ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
162
173
  /**
163
174
  * Extract per-test results with model output from evaluation results.
164
175
  *