@sanity/ailf 2.7.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/dist/_vendor/ailf-core/artifact-registry.d.ts +72 -0
  2. package/dist/_vendor/ailf-core/artifact-registry.js +150 -0
  3. package/dist/_vendor/ailf-core/examples/index.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/examples/index.js +1 -1
  5. package/dist/_vendor/ailf-core/index.d.ts +2 -1
  6. package/dist/_vendor/ailf-core/index.js +2 -1
  7. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
  8. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +56 -0
  9. package/dist/_vendor/ailf-core/ports/artifact-writer.js +28 -0
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +13 -3
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -1
  13. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +9 -0
  14. package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
  15. package/dist/_vendor/ailf-core/types/index.d.ts +117 -70
  16. package/dist/_vendor/ailf-core/types/index.js +1 -1
  17. package/dist/_vendor/ailf-shared/index.d.ts +2 -0
  18. package/dist/_vendor/ailf-shared/index.js +2 -0
  19. package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
  20. package/dist/_vendor/ailf-shared/run-context.js +17 -0
  21. package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
  22. package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
  23. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +39 -0
  24. package/dist/artifact-capture/api-gateway-artifact-writer.js +148 -0
  25. package/dist/artifact-capture/gcs-artifact-writer.d.ts +30 -0
  26. package/dist/artifact-capture/gcs-artifact-writer.js +119 -0
  27. package/dist/commands/init.js +2 -6
  28. package/dist/commands/publish.js +3 -2
  29. package/dist/composition-root.d.ts +3 -3
  30. package/dist/composition-root.js +20 -15
  31. package/dist/orchestration/build-step-sequence.js +6 -1
  32. package/dist/orchestration/steps/calculate-scores-step.js +42 -2
  33. package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
  34. package/dist/orchestration/steps/finalize-run-step.js +103 -0
  35. package/dist/orchestration/steps/publish-report-step.js +25 -27
  36. package/dist/pipeline/calculate-scores.js +13 -2
  37. package/dist/pipeline/provenance.d.ts +24 -44
  38. package/dist/pipeline/provenance.js +17 -165
  39. package/dist/pipeline/report-title.d.ts +2 -2
  40. package/dist/pipeline/run-context.d.ts +57 -0
  41. package/dist/pipeline/run-context.js +156 -0
  42. package/dist/pipeline/upload-test-outputs.d.ts +26 -0
  43. package/dist/pipeline/upload-test-outputs.js +34 -0
  44. package/dist/report-store.js +4 -2
  45. package/package.json +1 -1
  46. package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
  47. package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
  48. package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
  49. package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
  50. package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
  51. package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
@@ -0,0 +1,119 @@
1
+ /**
2
+ * GcsArtifactWriter — writes AILF run artifacts + manifest directly to GCS.
3
+ *
4
+ * Uses Application Default Credentials (ADC). Used when the CLI runs in CI or
5
+ * anywhere ADC is configured — the client talks to GCS without the API Gateway
6
+ * acting as a middleman.
7
+ *
8
+ * Paths come from `ARTIFACT_REGISTRY` so writers, signers, and readers agree.
9
+ *
10
+ * Design principles:
11
+ * - P5: Non-blocking — upload failure returns null, never throws.
12
+ * - Lazy client — Storage created on first write.
13
+ *
14
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
15
+ */
16
+ import { Storage } from "@google-cloud/storage";
17
+ import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
18
/**
 * Writes AILF run artifacts and the run manifest to a GCS bucket as JSON.
 *
 * Serialization and upload failures are non-blocking: they are logged with
 * `console.warn` and surfaced as `null` (or, for per-entry writes, as a
 * skipped entry) instead of rejecting. The `Storage` client is created
 * lazily on the first write.
 */
export class GcsArtifactWriter {
    /** Lazily created Storage client; stays `null` until the first write. */
    client = null;
    /** Writer options; `options.bucket` names the destination bucket. */
    options;
    /**
     * @param options - Writer options. Only `options.bucket` is read here.
     */
    constructor(options) {
        this.options = options;
    }
    /**
     * Write one artifact as a single JSON object.
     *
     * @param type - Key into `ARTIFACT_REGISTRY` selecting the descriptor.
     * @param runId - Run identifier passed to the descriptor's `objectPath`.
     * @param data - JSON-serializable payload.
     * @returns The artifact reference, or `null` when the write failed.
     */
    async writeBulk(type, runId, data) {
        const descriptor = ARTIFACT_REGISTRY[type];
        const path = descriptor.objectPath(runId);
        return this.putJson(path, data, {
            layout: "bulk",
            entryCount: entryCountOf(data),
        });
    }
    /**
     * Write one JSON object per entry, sequentially.
     *
     * Entries whose key is rejected by the descriptor's `parseEntryKey`, and
     * entries that fail to serialize or upload, are logged and skipped.
     *
     * @param type - Key into `ARTIFACT_REGISTRY` selecting the descriptor.
     * @param runId - Run identifier passed to the descriptor's `objectPath`.
     * @param entries - Iterable of `{ key, data }` records.
     * @returns A per-entry artifact reference listing the uploaded entries,
     *   or `null` when the descriptor has no `parseEntryKey` or nothing was
     *   uploaded.
     */
    async writePerEntry(type, runId, entries) {
        const descriptor = ARTIFACT_REGISTRY[type];
        if (!descriptor.parseEntryKey) {
            console.warn(` ⚠️ writePerEntry called for "${type}" but the registry has no parseEntryKey`);
            return null;
        }
        const uploaded = [];
        let totalBytes = 0;
        for (const entry of entries) {
            const parsed = descriptor.parseEntryKey(entry.key);
            if (!parsed.ok) {
                console.warn(` ⚠️ Skipping entry with invalid key "${entry.key}": ${parsed.reason}`);
                continue;
            }
            const path = descriptor.objectPath(runId, entry.key);
            try {
                // Serialization and client creation live inside the try so
                // that an unserializable entry (cycle, BigInt, undefined) or
                // a client construction error skips this entry instead of
                // rejecting the whole call.
                const json = JSON.stringify(entry.data);
                const bytes = Buffer.byteLength(json, "utf-8");
                await this.getClient()
                    .bucket(this.options.bucket)
                    .file(path)
                    .save(json, { contentType: "application/json" });
                uploaded.push({ key: entry.key, bytes });
                totalBytes += bytes;
            }
            catch (err) {
                const message = err instanceof Error ? err.message : String(err);
                console.warn(` ⚠️ Artifact entry upload failed (non-blocking): ${path} — ${message}`);
            }
        }
        if (uploaded.length === 0) {
            return null;
        }
        return {
            store: "gcs",
            bucket: this.options.bucket,
            path: `runs/${runId}/${descriptor.slug}`,
            bytes: totalBytes,
            entryCount: uploaded.length,
            layout: "per-entry",
            entries: uploaded,
        };
    }
    /**
     * Write the run manifest to `runs/{runId}/manifest.json`.
     *
     * @param runId - Run identifier used in the object path.
     * @param manifest - JSON-serializable manifest.
     * @returns The artifact reference, or `null` when the write failed.
     */
    async writeManifest(runId, manifest) {
        const path = `runs/${runId}/manifest.json`;
        return this.putJson(path, manifest, { layout: "bulk" });
    }
    /**
     * Serialize `data` and save it at `path` in the configured bucket.
     *
     * @param path - Object path inside the bucket.
     * @param data - JSON-serializable payload.
     * @param meta - `{ layout, entryCount? }` copied into the returned reference.
     * @returns The artifact reference, or `null` on any serialization or
     *   upload error (the error is logged, never thrown).
     */
    async putJson(path, data, meta) {
        try {
            // Kept inside the try: JSON.stringify throws on cycles/BigInt and
            // returns undefined for undefined input, which makes
            // Buffer.byteLength throw.
            const json = JSON.stringify(data);
            const bytes = Buffer.byteLength(json, "utf-8");
            await this.getClient()
                .bucket(this.options.bucket)
                .file(path)
                .save(json, { contentType: "application/json" });
            return {
                store: "gcs",
                bucket: this.options.bucket,
                path,
                bytes,
                entryCount: meta.entryCount,
                layout: meta.layout,
            };
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ Artifact upload failed (non-blocking): ${path} — ${message}`);
            return null;
        }
    }
    /** Return the cached Storage client, creating it on first use. */
    getClient() {
        if (this.client) {
            return this.client;
        }
        this.client = new Storage();
        return this.client;
    }
}
110
/**
 * Count the keys of `data.entries` when `data` is an object carrying an
 * object-valued `entries` property.
 *
 * @param data - Arbitrary artifact payload.
 * @returns The number of own enumerable keys of `data.entries`, or
 *   `undefined` when `data` or `data.entries` is not a non-null object.
 */
function entryCountOf(data) {
    if (typeof data !== "object" || data === null) {
        return undefined;
    }
    const { entries } = data;
    // `typeof null === "object"`, so null must be excluded explicitly;
    // otherwise Object.keys(null) throws a TypeError.
    if (typeof entries !== "object" || entries === null) {
        return undefined;
    }
    return Object.keys(entries).length;
}
@@ -250,10 +250,9 @@ async function runInit(opts) {
250
250
  console.log(` 1. Edit the example tasks in ${rel(targetDir, tasksDir)}/ — update`);
251
251
  console.log(" slugs and prompts for your documentation");
252
252
  console.log(` 2. Validate locally: npx @sanity/ailf@latest validate-tasks .ailf/tasks/`);
253
- console.log(" 3. Add two GitHub Actions secrets");
253
+ console.log(" 3. Add a GitHub Actions secret");
254
254
  console.log(" (Settings → Secrets and variables → Actions):");
255
255
  console.log(" • AILF_API_KEY — your API key (starts with ailf_live_sk_)");
256
- console.log(" • NPM_TOKEN — npm token with read access to @sanity scope");
257
256
  console.log(" 4. Push — the workflow at .github/workflows/ailf-eval.yml runs");
258
257
  console.log(" automatically on PRs");
259
258
  if (format === "ts") {
@@ -262,14 +261,11 @@ async function runInit(opts) {
262
261
  console.log(" via defineTask() from @sanity/ailf-core.");
263
262
  }
264
263
  console.log();
265
- console.log(" 🔑 Retrieve secrets from 1Password (Sanity employees):");
264
+ console.log(" 🔑 Retrieve the API key from 1Password (Sanity employees):");
266
265
  console.log();
267
266
  console.log(" # Shared dev API key (for local testing and CI)");
268
267
  console.log(' op read "op://Shared/AI Literacy Framework - Shared API Tokens/AILF_API_KEY_DEV"');
269
268
  console.log();
270
- console.log(" # npm token (read access to @sanity scope)");
271
- console.log(' op read "op://Shared/AI Literacy Framework - Shared API Tokens/NPM_TOKEN"');
272
- console.log();
273
269
  console.log(" Not a Sanity employee? Request an API key from the AILF team.");
274
270
  console.log();
275
271
  console.log(" 💡 Test locally before pushing:");
@@ -55,7 +55,7 @@ export function createPublishCommand() {
55
55
  * the summary metadata and environment. Some fields (contextHash,
56
56
  * promptfooUrl) are not available for manual publishes.
57
57
  */
58
- function buildProvenanceFromSummary(summary) {
58
+ function buildProvenanceFromSummary(summary, runId) {
59
59
  const areas = summary.scores.map((s) => s.feature);
60
60
  const mode = (process.env.EVAL_MODE ?? "literacy");
61
61
  const source = {
@@ -76,6 +76,7 @@ function buildProvenanceFromSummary(summary) {
76
76
  areas,
77
77
  mode,
78
78
  rootDir: ROOT,
79
+ runId,
79
80
  source,
80
81
  };
81
82
  }
@@ -145,7 +146,7 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
145
146
  // -----------------------------------------------------------------------
146
147
  // 2. Build provenance
147
148
  // -----------------------------------------------------------------------
148
- const provenanceInput = buildProvenanceFromSummary(summary);
149
+ const provenanceInput = buildProvenanceFromSummary(summary, ctx.runId);
149
150
  const provenance = buildProvenance(provenanceInput);
150
151
  // -----------------------------------------------------------------------
151
152
  // 3. Create report
@@ -15,7 +15,7 @@
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import { type AppContext, type ArtifactUploader, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
18
+ import { type AppContext, type ArtifactWriter, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Create a fully wired AppContext from resolved configuration.
21
21
  *
@@ -24,7 +24,7 @@ import { type AppContext, type ArtifactUploader, type AssertionRegistration, typ
24
24
  */
25
25
  export declare function createAppContext(config: ResolvedConfig): AppContext;
26
26
  /**
27
- * Selects an ArtifactUploader implementation based on available credentials.
27
+ * Selects an ArtifactWriter implementation based on available credentials.
28
28
  *
29
29
  * Selection order:
30
30
  * 1. config.artifactUpload === false → always skip (explicit opt-out)
@@ -38,7 +38,7 @@ export declare function createAppContext(config: ResolvedConfig): AppContext;
38
38
  *
39
39
  * Exported for unit-test access; not part of the public package API.
40
40
  */
41
- export declare function createArtifactUploader(config: ResolvedConfig, logger: Logger): ArtifactUploader | undefined;
41
+ export declare function createArtifactWriter(config: ResolvedConfig, logger: Logger): ArtifactWriter | undefined;
42
42
  /**
43
43
  * Generic Promptfoo assertion types available to all evaluation modes.
44
44
  *
@@ -16,11 +16,11 @@
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
18
  import { join } from "node:path";
19
- import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
20
- import { ApiGatewayArtifactUploader } from "./artifact-capture/api-gateway-artifact-uploader.js";
19
+ import { InMemoryPluginRegistry, NoOpArtifactCollector, generateRunId, } from "./_vendor/ailf-core/index.js";
20
+ import { ApiGatewayArtifactWriter } from "./artifact-capture/api-gateway-artifact-writer.js";
21
21
  import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
22
22
  import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
23
- import { GcsReportArtifactUploader } from "./artifact-capture/gcs-report-artifact-uploader.js";
23
+ import { GcsArtifactWriter } from "./artifact-capture/gcs-artifact-writer.js";
24
24
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
25
25
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
26
26
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
@@ -82,13 +82,17 @@ export function createAppContext(config) {
82
82
  })
83
83
  : fsCollector;
84
84
  }
85
- // Report artifact uploader uploads structured files to GCS at known
86
- // paths for Studio to fetch via signed URLs (D0030). Auto-detects the
87
- // right adapter from available credentials; defaults bucket to
88
- // "ailf-artifacts". Set artifactUpload: false to opt out entirely.
89
- const artifactUploader = createArtifactUploader(config, logger);
85
+ // Artifact writer — writes run artifacts + manifest to GCS at known
86
+ // `runs/{runId}/…` paths (D0032). Auto-detects the right adapter from
87
+ // available credentials; defaults bucket to "ailf-artifacts". Set
88
+ // artifactUpload: false to opt out entirely.
89
+ const artifactWriter = createArtifactWriter(config, logger);
90
+ // Generate the pipeline's RunId once; every downstream step reads it
91
+ // from the context (D0032).
92
+ const runId = generateRunId();
93
+ logger.debug(`Pipeline runId: ${runId}`);
90
94
  return {
91
- artifactUploader,
95
+ artifactWriter,
92
96
  cache,
93
97
  collector,
94
98
  config,
@@ -97,6 +101,7 @@ export function createAppContext(config) {
97
101
  logger,
98
102
  registry,
99
103
  reportStore,
104
+ runId,
100
105
  sinks,
101
106
  taskSource,
102
107
  };
@@ -124,7 +129,7 @@ function createLogger() {
124
129
  */
125
130
  const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
126
131
  /**
127
- * Selects an ArtifactUploader implementation based on available credentials.
132
+ * Selects an ArtifactWriter implementation based on available credentials.
128
133
  *
129
134
  * Selection order:
130
135
  * 1. config.artifactUpload === false → always skip (explicit opt-out)
@@ -138,7 +143,7 @@ const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
138
143
  *
139
144
  * Exported for unit-test access; not part of the public package API.
140
145
  */
141
- export function createArtifactUploader(config, logger) {
146
+ export function createArtifactWriter(config, logger) {
142
147
  if (config.artifactUpload === false) {
143
148
  logger.debug("Artifact upload explicitly disabled via artifactUpload=false");
144
149
  return undefined;
@@ -148,13 +153,13 @@ export function createArtifactUploader(config, logger) {
148
153
  // We treat the presence of either env var as the user opting in to ADC.
149
154
  const hasGcsCredentials = Boolean(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GCLOUD_PROJECT);
150
155
  if (hasGcsCredentials) {
151
- logger.debug(`Artifact uploader: GcsReportArtifactUploader (direct GCS via ADC, bucket=${bucket})`);
152
- return new GcsReportArtifactUploader({ bucket });
156
+ logger.debug(`Artifact writer: GcsArtifactWriter (direct GCS via ADC, bucket=${bucket})`);
157
+ return new GcsArtifactWriter({ bucket });
153
158
  }
154
159
  // Local dev — request signed PUT URLs from the API gateway, no GCS creds needed.
155
160
  if (config.apiKey && config.apiUrl) {
156
- logger.debug(`Artifact uploader: ApiGatewayArtifactUploader (signed URL via ${config.apiUrl}, bucket=${bucket})`);
157
- return new ApiGatewayArtifactUploader({
161
+ logger.debug(`Artifact writer: ApiGatewayArtifactWriter (signed URL via ${config.apiUrl}, bucket=${bucket})`);
162
+ return new ApiGatewayArtifactWriter({
158
163
  apiBaseUrl: config.apiUrl,
159
164
  apiKey: config.apiKey,
160
165
  bucket,
@@ -11,6 +11,7 @@ import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
11
11
  import { CompareStep } from "./steps/compare-step.js";
12
12
  import { DiscoveryReportStep } from "./steps/discovery-report-step.js";
13
13
  import { FetchDocsStep } from "./steps/fetch-docs-step.js";
14
+ import { FinalizeRunStep } from "./steps/finalize-run-step.js";
14
15
  import { GapAnalysisStep } from "./steps/gap-analysis-step.js";
15
16
  import { GenerateConfigsStep } from "./steps/generate-configs-step.js";
16
17
  import { GraderConsistencyStep } from "./steps/grader-consistency-step.js";
@@ -76,7 +77,11 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
76
77
  if (config.gapAnalysisEnabled) {
77
78
  steps.push(new GapAnalysisStep());
78
79
  }
79
- // Step 4b: Publish report (optional, when token is configured)
80
+ // Step 4c: Finalize the run — write `runs/{runId}/manifest.json` with the
81
+ // catalog of artifacts produced so far. Skipped silently when no
82
+ // artifactWriter is wired (D0032).
83
+ steps.push(new FinalizeRunStep(pipelineStart));
84
+ // Step 4d: Publish report (optional, when token is configured)
80
85
  if (config.publishEnabled) {
81
86
  steps.push(new PublishReportStep(pipelineStart, {
82
87
  publishTag: config.publishTag,
@@ -4,8 +4,8 @@
4
4
  * Calls calculateAndWriteScores() from pipeline/calculate-scores.ts with
5
5
  * typed options derived from AppContext. No env bridge needed.
6
6
  */
7
- import { existsSync } from "node:fs";
8
- import { join } from "path";
7
+ import { existsSync, readFileSync } from "node:fs";
8
+ import { join, resolve } from "path";
9
9
  import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
10
10
  import { getStepInputPaths } from "../../pipeline/cache.js";
11
11
  import { buildCacheContext } from "../cache-context.js";
@@ -13,6 +13,7 @@ import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
13
13
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
14
14
  import { resultsFileForMode } from "../../pipeline/eval-constants.js";
15
15
  import { loadSource } from "../../sources.js";
16
+ import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
16
17
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
17
18
  export class CalculateScoresStep {
18
19
  name = "calculate-scores";
@@ -132,6 +133,27 @@ export class CalculateScoresStep {
132
133
  ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
133
134
  }
134
135
  }
136
+ // Upload testOutputs to GCS (D0032 — non-blocking, P5).
137
+ // Read from test-results.json rather than score-summary.json: the
138
+ // gap-analysis step (downstream) is the one that enriches score-summary
139
+ // with testResults, so at this point the summary still has an empty
140
+ // testResults[]. test-results.json is written by calculateAndWriteScores
141
+ // above and carries the full per-test shape we need for per-entry upload.
142
+ // The full responseOutput lives in the GCS artifact; PublishReportStep
143
+ // later strips it from the inline Content Lake document when this
144
+ // upload succeeds.
145
+ if (ctx.artifactWriter) {
146
+ const testResults = tryReadTestResults(ctx.config.rootDir);
147
+ if (testResults?.length) {
148
+ const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults);
149
+ if (artifactRef) {
150
+ state.artifactRefs = {
151
+ ...state.artifactRefs,
152
+ testOutputs: artifactRef,
153
+ };
154
+ }
155
+ }
156
+ }
135
157
  const criticalSuffix = belowCritical.length > 0
136
158
  ? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
137
159
  : "";
@@ -148,3 +170,21 @@ export class CalculateScoresStep {
148
170
  return buildCacheContext(ctx.config);
149
171
  }
150
172
  }
173
/**
 * Read the per-test result set from
 * `<rootDir>/results/latest/test-results.json`.
 *
 * This is the source used for `uploadTestOutputs` at the time
 * CalculateScoresStep runs — `score-summary.json` doesn't carry
 * `testResults[]` until `gap-analysis-step` enriches it downstream.
 *
 * @param rootDir - Directory that contains `results/latest/`.
 * @returns The parsed array, or `undefined` when the file is missing,
 *   unreadable, not valid JSON, or does not contain a JSON array.
 */
function tryReadTestResults(rootDir) {
    const path = resolve(rootDir, "results", "latest", "test-results.json");
    try {
        // A missing file surfaces as ENOENT from readFileSync and is handled
        // by the catch below, so no separate existence check is needed.
        const parsed = JSON.parse(readFileSync(path, "utf-8"));
        // Callers gate on `.length` and then iterate; reject anything that is
        // not an array (e.g. a JSON string also has a truthy `length`).
        return Array.isArray(parsed) ? parsed : undefined;
    }
    catch {
        return undefined;
    }
}
@@ -0,0 +1,29 @@
1
+ /**
2
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
3
+ *
4
+ * Inserts between `GapAnalysis` and `PublishReport`. Assembles a
5
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
6
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
7
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
8
+ * of truth for artifact locations; `PublishReportStep` snapshots the
9
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
10
+ *
11
+ * Design principles:
12
+ * - Single writer — one `writeManifest()` call per pipeline run.
13
+ * - Idempotent — retries produce the same manifest bytes for the same inputs.
14
+ * - Skipped when no writer is wired (local/air-gapped runs stay functional).
15
+ *
16
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
+ */
18
+ import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
19
+ export declare class FinalizeRunStep implements PipelineStep {
20
+ private readonly pipelineStart;
21
+ private readonly options;
22
+ readonly name = "finalize-run";
23
+ readonly optional = true;
24
+ constructor(pipelineStart: number, options?: {
25
+ evalFingerprint?: string;
26
+ });
27
+ check(): ValidationIssue[];
28
+ execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
29
+ }
@@ -0,0 +1,103 @@
1
+ /**
2
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
3
+ *
4
+ * Inserts between `GapAnalysis` and `PublishReport`. Assembles a
5
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
6
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
7
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
8
+ * of truth for artifact locations; `PublishReportStep` snapshots the
9
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
10
+ *
11
+ * Design principles:
12
+ * - Single writer — one `writeManifest()` call per pipeline run.
13
+ * - Idempotent — retries produce the same manifest bytes for the same inputs.
14
+ * - Skipped when no writer is wired (local/air-gapped runs stay functional).
15
+ *
16
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
+ */
18
+ import { existsSync, readFileSync } from "node:fs";
19
+ import { resolve } from "node:path";
20
+ import { buildRunContext } from "../../pipeline/run-context.js";
21
+ import { loadSource } from "../../sources.js";
22
+ import { configToSourceOverrides } from "../config-to-source-overrides.js";
23
/**
 * Pipeline step that assembles the run manifest and hands it to
 * `ctx.artifactWriter.writeManifest`.
 *
 * The manifest is stored on `state.runManifest` whether or not the write
 * returned a reference, so later steps can read `artifacts` from it.
 */
export class FinalizeRunStep {
    /** Timestamp (ms) the pipeline started; used for `manifest.durationMs`. */
    pipelineStart;
    /** Optional settings; `evalFingerprint` is a fallback for the state value. */
    options;
    name = "finalize-run";
    optional = true;
    /**
     * @param pipelineStart - Pipeline start time in epoch milliseconds.
     * @param options - Optional `{ evalFingerprint }` fallback.
     */
    constructor(pipelineStart, options = {}) {
        this.pipelineStart = pipelineStart;
        this.options = options;
    }
    /** This step has no preconditions to validate. */
    check() {
        return [];
    }
    /**
     * Build the manifest, attempt to write it, and record it on `state`.
     *
     * @param ctx - App context; reads `artifactWriter`, `config`, `logger`, `runId`.
     * @param state - Pipeline state; reads `evalFingerprint`, `testSummary`,
     *   `promptfooUrls`, `artifactRefs` and writes `runManifest`.
     * @returns A `skipped` result when no writer is wired, otherwise a
     *   `success` result whose summary says whether the write returned a ref.
     */
    async execute(ctx, state) {
        const startedAt = Date.now();
        const writer = ctx.artifactWriter;
        if (!writer) {
            return {
                status: "skipped",
                reason: "No artifactWriter wired — manifest is only written when a writer is available",
            };
        }
        const { config } = ctx;
        // Resolve the source from config plus config-derived overrides.
        const resolvedSource = loadSource(config.source, configToSourceOverrides(config));
        // The on-disk summary is optional: when it is absent the areas fall
        // back to the configured list (or an empty one).
        const summaryOnDisk = tryReadScoreSummary(config.rootDir);
        const areas = summaryOnDisk?.scores?.map((score) => score.feature) ?? config.areas ?? [];
        const runContext = buildRunContext({
            areas,
            callerGit: config.callerGit,
            evalFingerprint: state.evalFingerprint ?? this.options.evalFingerprint,
            logger: ctx.logger,
            mode: config.mode,
            rootDir: config.rootDir,
            source: resolvedSource,
            taskIds: config.tasks,
        });
        const outcomes = state.testSummary
            ? { testSummary: state.testSummary }
            : undefined;
        const manifest = {
            version: 1,
            runId: ctx.runId,
            createdAt: new Date().toISOString(),
            durationMs: Date.now() - this.pipelineStart,
            status: "completed",
            context: runContext,
            outcomes,
            promptfooUrls: state.promptfooUrls,
            artifacts: state.artifactRefs ?? {},
        };
        const ref = await writer.writeManifest(ctx.runId, manifest);
        // Recorded in both outcomes: a failed write is non-blocking and the
        // in-memory manifest is still exposed to downstream steps.
        state.runManifest = manifest;
        if (ref) {
            const artifactCount = Object.keys(manifest.artifacts).length;
            return {
                durationMs: Date.now() - startedAt,
                status: "success",
                summary: `Run manifest written to ${ref.path} (${artifactCount} artifact ref${artifactCount === 1 ? "" : "s"})`,
            };
        }
        return {
            durationMs: Date.now() - startedAt,
            status: "success",
            summary: "Run manifest computed (GCS write failed — non-blocking)",
        };
    }
}
93
/**
 * Best-effort read of `<rootDir>/results/latest/score-summary.json`.
 *
 * @param rootDir - Directory that contains `results/latest/`.
 * @returns The parsed JSON value, or `undefined` when the file is missing,
 *   unreadable, or not valid JSON.
 */
function tryReadScoreSummary(rootDir) {
    const summaryFile = resolve(rootDir, "results", "latest", "score-summary.json");
    let parsed;
    try {
        // A missing file makes readFileSync throw, which lands in the same
        // "no summary" outcome as a parse failure.
        const raw = readFileSync(summaryFile, "utf-8");
        parsed = JSON.parse(raw);
    }
    catch {
        return undefined;
    }
    return parsed;
}
@@ -113,12 +113,24 @@ export class PublishReportStep {
113
113
  tag: this.options.publishTag ?? ctx.config.publishTag,
114
114
  title,
115
115
  };
116
- // Upload test output artifacts to GCS (D0030 — non-blocking, P5)
117
- if (ctx.artifactUploader && summary.testResults?.length) {
118
- const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
119
- if (artifactRef) {
120
- report.artifacts = { testOutputs: artifactRef };
121
- }
116
+ // Snapshot the artifact manifest from FinalizeRunStep's output (D0032).
117
+ // The source of truth is `runs/{runId}/manifest.json` in GCS; the report
118
+ // carries a denormalized copy so Studio can render drill-down state
119
+ // without an extra GCS fetch.
120
+ const artifactManifest = state.runManifest?.artifacts;
121
+ if (artifactManifest && Object.keys(artifactManifest).length > 0) {
122
+ report.artifactManifest = artifactManifest;
123
+ }
124
+ // When testOutputs was uploaded to GCS, strip responseOutput from the
125
+ // inline testResults[] so the Content Lake document stays slim — the
126
+ // full output lives in the GCS artifact. When no testOutputs artifact
127
+ // exists, leave the inline shape intact so Studio's drill-down UI
128
+ // falls back to it.
129
+ if (artifactManifest?.testOutputs && summary.testResults?.length) {
130
+ report.summary = {
131
+ ...summary,
132
+ testResults: summary.testResults.map(slimTestResult),
133
+ };
122
134
  }
123
135
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
124
136
  state.reportId = reportId;
@@ -212,6 +224,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
212
224
  mode,
213
225
  promptfooUrls: options.promptfooUrls,
214
226
  rootDir: ctx.config.rootDir,
227
+ runId: ctx.runId,
215
228
  sanityDocumentIds,
216
229
  source,
217
230
  sourceReportId: ctx.config.sourceReportId,
@@ -219,28 +232,13 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
219
232
  };
220
233
  }
221
234
  /**
222
- * Extract test outputs from StoredTestResult[] and upload as a single
223
- * JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
224
- * to match the lookup pattern in Studio's JudgmentList component.
225
- *
226
- * Non-blocking: returns null if upload fails (P5).
235
+ * Strip the large responseOutput fields from a StoredTestResult so the
236
+ * remaining object is safe to inline in the Content Lake document (D0030).
237
+ * The full output lives in the GCS artifact uploaded by uploadTestOutputs.
227
238
  */
228
- async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
229
- const entries = {};
230
- for (const tr of testResults) {
231
- const key = `${tr.taskId}::${tr.modelId}`;
232
- entries[key] = {
233
- responseOutput: tr.responseOutput,
234
- responseOutputTruncated: tr.responseOutputTruncated ?? false,
235
- };
236
- }
237
- const artifact = {
238
- version: 1,
239
- reportId,
240
- createdAt,
241
- entries,
242
- };
243
- return uploader.upload(reportId, "test-outputs.json", artifact);
239
function slimTestResult(tr) {
    // Peel off the two response-output fields and return a shallow copy of
    // everything else; the input object is left untouched.
    const { responseOutput: _droppedOutput, responseOutputTruncated: _droppedFlag, ...slim } = tr;
    return slim;
}
245
243
  /**
246
244
  * Fan out a report to all configured sinks.
@@ -157,8 +157,19 @@ export function extractGraderJudgments(resultsPath) {
157
157
  }
158
158
  return judgments;
159
159
  }
160
- /** Maximum characters to store for model response output */
161
- const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
160
+ /**
161
+ * Maximum characters (JS string length, not bytes) to store for model
162
+ * response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
163
+ * pathological multi-byte UTF-8 could encode to ~4 MB, still well within
164
+ * per-entry GCS object limits.
165
+ *
166
+ * Raised from 8 000 to 1 000 000 in W0048 because the per-entry artifact
167
+ * layout (D0032) makes the cap irrelevant to Studio's fetch cost — each
168
+ * entry is fetched independently on click, so a larger ceiling only costs
169
+ * GCS bytes, not main-thread blocking or baseline report payload.
170
+ * `responseOutputTruncated` still flips for the extreme tail.
171
+ */
172
+ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
162
173
  /**
163
174
  * Extract per-test results with model output from evaluation results.
164
175
  *