@sanity/ailf 2.9.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72):
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +37 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +19 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
  4. package/dist/_vendor/ailf-core/artifact-registry.js +1 -18
  5. package/dist/_vendor/ailf-core/batch-signing.d.ts +64 -0
  6. package/dist/_vendor/ailf-core/batch-signing.js +23 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +2 -2
  8. package/dist/_vendor/ailf-core/index.js +2 -2
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +12 -20
  10. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -2
  11. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  12. package/dist/_vendor/ailf-core/ports/progress-reporter.d.ts +74 -0
  13. package/dist/_vendor/ailf-core/ports/progress-reporter.js +26 -0
  14. package/dist/_vendor/ailf-core/services/slim-report-summary.js +1 -16
  15. package/dist/adapters/config-sources/file-config-adapter.js +0 -4
  16. package/dist/adapters/progress/console-progress-reporter.d.ts +35 -0
  17. package/dist/adapters/progress/console-progress-reporter.js +110 -0
  18. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +8 -1
  19. package/dist/artifact-capture/api-gateway-artifact-writer.js +79 -42
  20. package/dist/artifact-capture/batching-api-gateway-artifact-writer.d.ts +108 -0
  21. package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +492 -0
  22. package/dist/artifact-capture/fanout-artifact-writer.d.ts +14 -2
  23. package/dist/artifact-capture/fanout-artifact-writer.js +25 -4
  24. package/dist/artifact-capture/gcs-artifact-writer.d.ts +27 -1
  25. package/dist/artifact-capture/gcs-artifact-writer.js +168 -38
  26. package/dist/artifact-capture/instrumented-artifact-writer.d.ts +32 -0
  27. package/dist/artifact-capture/instrumented-artifact-writer.js +151 -0
  28. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +8 -1
  29. package/dist/artifact-capture/local-fs-artifact-writer.js +23 -4
  30. package/dist/artifact-capture/parallel-emit.d.ts +43 -0
  31. package/dist/artifact-capture/parallel-emit.js +84 -0
  32. package/dist/artifact-capture/redact-artifact.d.ts +3 -5
  33. package/dist/artifact-capture/redact-artifact.js +3 -5
  34. package/dist/artifact-capture/upload-metrics.d.ts +62 -0
  35. package/dist/artifact-capture/upload-metrics.js +125 -0
  36. package/dist/cli.js +56 -2
  37. package/dist/commands/explain-handler.js +1 -5
  38. package/dist/commands/pipeline-action.d.ts +0 -4
  39. package/dist/commands/pipeline-action.js +11 -45
  40. package/dist/commands/pipeline.d.ts +1 -5
  41. package/dist/commands/pipeline.js +1 -5
  42. package/dist/commands/runs.d.ts +18 -0
  43. package/dist/commands/runs.js +71 -0
  44. package/dist/composition-root.d.ts +2 -2
  45. package/dist/composition-root.js +98 -38
  46. package/dist/orchestration/build-app-context.js +4 -7
  47. package/dist/orchestration/pipeline-orchestrator.js +100 -24
  48. package/dist/orchestration/steps/calculate-scores-step.js +1 -1
  49. package/dist/orchestration/steps/finalize-run-step.js +33 -2
  50. package/dist/pipeline/emit-eval-results.js +29 -11
  51. package/dist/pipeline/map-request-to-config.js +0 -4
  52. package/dist/pipeline/upload-test-outputs.d.ts +12 -5
  53. package/dist/pipeline/upload-test-outputs.js +27 -10
  54. package/package.json +3 -3
  55. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +0 -14
  56. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +0 -25
  57. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +0 -94
  58. package/dist/_vendor/ailf-core/ports/artifact-collector.js +0 -13
  59. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +0 -138
  60. package/dist/_vendor/ailf-core/ports/capture-comparator.js +0 -10
  61. package/dist/artifact-capture/comparator.d.ts +0 -22
  62. package/dist/artifact-capture/comparator.js +0 -493
  63. package/dist/artifact-capture/filesystem-collector.d.ts +0 -60
  64. package/dist/artifact-capture/filesystem-collector.js +0 -262
  65. package/dist/artifact-capture/gcs-collector.d.ts +0 -55
  66. package/dist/artifact-capture/gcs-collector.js +0 -117
  67. package/dist/commands/capture-compare.d.ts +0 -15
  68. package/dist/commands/capture-compare.js +0 -253
  69. package/dist/commands/capture-list.d.ts +0 -12
  70. package/dist/commands/capture-list.js +0 -150
  71. package/dist/commands/capture.d.ts +0 -9
  72. package/dist/commands/capture.js +0 -16
@@ -15,20 +15,22 @@
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import { join } from "node:path";
19
- import { InMemoryPluginRegistry, NoOpArtifactCollector, NoOpArtifactWriter, generateRunId, isArtifactType, } from "./_vendor/ailf-core/index.js";
18
+ import { ARTIFACT_EXPORT_PHASE_ID, InMemoryPluginRegistry, NoOpArtifactWriter, NoOpProgressReporter, generateRunId, isArtifactType, } from "./_vendor/ailf-core/index.js";
20
19
  import { AccumulatingArtifactWriter } from "./artifact-capture/accumulating-artifact-writer.js";
21
20
  import { ApiGatewayArtifactWriter } from "./artifact-capture/api-gateway-artifact-writer.js";
21
+ import { BatchingApiGatewayArtifactWriter } from "./artifact-capture/batching-api-gateway-artifact-writer.js";
22
22
  import { FanoutArtifactWriter } from "./artifact-capture/fanout-artifact-writer.js";
23
- import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
24
- import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
25
23
  import { GcsArtifactWriter } from "./artifact-capture/gcs-artifact-writer.js";
24
+ import { InstrumentedArtifactWriter } from "./artifact-capture/instrumented-artifact-writer.js";
26
25
  import { LocalFilesystemArtifactWriter } from "./artifact-capture/local-fs-artifact-writer.js";
26
+ import { resolveUploadConcurrency, setDefaultUploadConcurrency, } from "./artifact-capture/parallel-emit.js";
27
+ import { UploadMetrics } from "./artifact-capture/upload-metrics.js";
27
28
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
28
29
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
29
30
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
30
31
  import { PromptfooEvalAdapter } from "./adapters/eval-runners/promptfoo-eval-adapter.js";
31
32
  import { ConsoleLogger, JsonLogger, QuietLogger, } from "./adapters/loggers/index.js";
33
+ import { ConsoleProgressReporter } from "./adapters/progress/console-progress-reporter.js";
32
34
  import { CompositeTaskSource, ContentLakeTaskSource, RepoTaskSource, } from "./adapters/task-sources/index.js";
33
35
  import { createAgentHarnessBase, createKnowledgeProbeBase, createLiteracyModeBase, createMcpServerModeBase, } from "./pipeline/compiler/mode-bases/index.js";
34
36
  import { createSanityLiteracyPreset } from "./pipeline/compiler/presets/index.js";
@@ -44,6 +46,9 @@ import { loadSinks } from "./sinks/index.js";
44
46
  export function createAppContext(config) {
45
47
  // Logger — selected by env var preferences
46
48
  const logger = createLogger();
49
+ // Progress reporter — console-backed for the default logger; no-op for
50
+ // JSON/quiet modes and tests where interactive output is inappropriate.
51
+ const progress = createProgressReporter();
47
52
  // Cache — filesystem, optionally decorated with Content Lake fallback
48
53
  const cache = config.noCache ? undefined : createCache(config);
49
54
  // Task source — selected by config.taskSourceType
@@ -63,33 +68,16 @@ export function createAppContext(config) {
63
68
  const reportStore = createReportStore(config);
64
69
  // Sinks — loaded from config/sinks
65
70
  const sinks = loadSinks();
66
- // Artifact collector — no-op by default, filesystem when --capture is set,
67
- // GCS decorator when --capture-gcs-bucket is also provided (D0030/W0035)
68
- let collector = new NoOpArtifactCollector();
69
- if (config.captureEnabled) {
70
- const fsCollector = new FilesystemArtifactCollector({
71
- captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
72
- mode: config.mode,
73
- compress: config.captureCompress ?? true,
74
- extras: config.captureExtras ?? true,
75
- pipeline: {
76
- variant: config.variant,
77
- source: config.source,
78
- areas: config.areas,
79
- },
80
- });
81
- collector = config.captureGcsBucket
82
- ? new GcsArtifactCollector(fsCollector, {
83
- bucket: config.captureGcsBucket,
84
- prefix: config.captureGcsPrefix,
85
- })
86
- : fsCollector;
87
- }
88
71
  // Artifact writer — writes run artifacts + manifest to GCS at known
89
72
  // `runs/{runId}/…` paths (D0032). Auto-detects the right adapter from
90
73
  // available credentials; defaults bucket to "ailf-artifacts". Set
91
74
  // artifactUpload: false to opt out entirely.
92
- const artifactWriter = createArtifactWriter(config, logger);
75
+ // W0053 writers receive a progress reporter scoped to a single
76
+ // `artifact-export` phase so the CLI can render per-batch updates.
77
+ const artifactWriter = createArtifactWriter(config, logger, {
78
+ reporter: progress,
79
+ phaseId: ARTIFACT_EXPORT_PHASE_ID,
80
+ });
93
81
  // Generate the pipeline's RunId once; every downstream step reads it
94
82
  // from the context (D0032).
95
83
  const runId = generateRunId();
@@ -97,11 +85,11 @@ export function createAppContext(config) {
97
85
  return {
98
86
  artifactWriter,
99
87
  cache,
100
- collector,
101
88
  config,
102
89
  docFetcher,
103
90
  evalRunner,
104
91
  logger,
92
+ progress,
105
93
  registry,
106
94
  reportStore,
107
95
  runId,
@@ -124,6 +112,23 @@ function createLogger() {
124
112
  process.env.AILF_VERBOSE === "1",
125
113
  });
126
114
  }
115
+ /**
116
+ * Select a ProgressReporter adapter. Matches the logger environment — JSON
117
+ * and quiet loggers get a no-op reporter so machine-readable output stays
118
+ * clean; interactive sessions get the console adapter with verbose mirroring.
119
+ */
120
+ function createProgressReporter() {
121
+ if (process.env.AILF_LOG_FORMAT === "json")
122
+ return new NoOpProgressReporter();
123
+ if (process.env.AILF_LOG_LEVEL === "quiet" ||
124
+ process.env.AILF_QUIET === "1") {
125
+ return new NoOpProgressReporter();
126
+ }
127
+ return new ConsoleProgressReporter({
128
+ verbose: process.env.AILF_LOG_LEVEL === "verbose" ||
129
+ process.env.AILF_VERBOSE === "1",
130
+ });
131
+ }
127
132
  /**
128
133
  * Shared GCS bucket for report artifacts. Matches the gateway default at
129
134
  * packages/api/src/routes/artifacts.ts — both sides assume ailf-artifacts
@@ -155,7 +160,7 @@ const DEFAULT_LOCAL_ARTIFACTS_DIR = ".ailf/results/captures";
155
160
  *
156
161
  * Exported for unit-test access; not part of the public package API.
157
162
  */
158
- export function createArtifactWriter(config, logger) {
163
+ export function createArtifactWriter(config, logger, progress) {
159
164
  // Legacy `artifactUpload: false` still disables — treat as an alias for
160
165
  // the canonical `artifactsDisabled: true` until W0052 removes it.
161
166
  if (config.artifactsDisabled === true || config.artifactUpload === false) {
@@ -164,10 +169,27 @@ export function createArtifactWriter(config, logger) {
164
169
  }
165
170
  const exclude = resolveExcludeList(config.artifactsExclude, logger);
166
171
  const rootDir = config.artifactsDir ?? DEFAULT_LOCAL_ARTIFACTS_DIR;
167
- const local = new LocalFilesystemArtifactWriter({ rootDir, exclude });
168
- const remote = createRemoteArtifactWriter(config, logger);
172
+ // W0056 opt-in measurement of the upload path. The collector is passed
173
+ // to the remote writer (where sign/PUT/compose phases live) AND wraps the
174
+ // final writer to record caller-observed `emit`/`writeManifest` totals.
175
+ // `summarize()` fires from the decorator's `writeManifest` hook.
176
+ const metrics = process.env.AILF_UPLOAD_METRICS === "1"
177
+ ? new UploadMetrics({
178
+ logger,
179
+ detailFile: `${rootDir}/upload-metrics/run-${Date.now()}.ndjson`,
180
+ })
181
+ : null;
182
+ // W0053: progress attaches to the OUTERMOST of (local-only | fanout). When
183
+ // fanout is wired, the delegates stay silent so we don't double-count the
184
+ // same caller-visible write across two backends.
185
+ const remote = createRemoteArtifactWriter(config, logger, metrics);
186
+ const local = new LocalFilesystemArtifactWriter({
187
+ rootDir,
188
+ exclude,
189
+ ...(remote ? {} : { progress }),
190
+ });
169
191
  const base = remote
170
- ? new FanoutArtifactWriter([local, remote])
192
+ ? new FanoutArtifactWriter([local, remote], { progress })
171
193
  : local;
172
194
  if (!remote) {
173
195
  logger.debug(`Artifact writer: LocalFilesystemArtifactWriter only (rootDir=${rootDir})`);
@@ -179,7 +201,10 @@ export function createArtifactWriter(config, logger) {
179
201
  // RunManifest without each producer bookkeeping its own ArtifactRefs
180
202
  // (W0051 Slice 3 revisit — Option B of the "manifest empty on real runs"
181
203
  // fix).
182
- return new AccumulatingArtifactWriter(base);
204
+ const accumulating = new AccumulatingArtifactWriter(base);
205
+ return metrics
206
+ ? new InstrumentedArtifactWriter(accumulating, metrics)
207
+ : accumulating;
183
208
  }
184
209
  /**
185
210
  * Validate the exclude list against the registry. Unknown types are dropped
@@ -194,7 +219,7 @@ function resolveExcludeList(raw, logger) {
194
219
  valid.push(name);
195
220
  }
196
221
  else {
197
- logger.warn(`--capture-exclude: "${name}" is not a known artifact type — ignored`);
222
+ logger.warn(`--artifacts-exclude: "${name}" is not a known artifact type — ignored`);
198
223
  }
199
224
  }
200
225
  return valid;
@@ -205,19 +230,54 @@ function resolveExcludeList(raw, logger) {
205
230
  * the sole backend for that run, which is the D0033 M4 default for laptops
206
231
  * and CI without GCS creds.
207
232
  */
208
- function createRemoteArtifactWriter(config, logger) {
233
+ function createRemoteArtifactWriter(config, logger, metrics) {
209
234
  const bucket = config.artifactGcsBucket ?? DEFAULT_ARTIFACT_BUCKET;
210
235
  const hasGcsCredentials = Boolean(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GCLOUD_PROJECT);
211
236
  if (hasGcsCredentials) {
212
- logger.debug(`Artifact remote backend: GcsArtifactWriter (ADC, bucket=${bucket})`);
213
- return new GcsArtifactWriter({ bucket });
237
+ // W0056 Phase 1: the GCS-direct path measured 0 failures at
238
+ // concurrency 8 with a 60 % pipeline-time reduction. Flip parallelism
239
+ // on by default on this path. `AILF_PARALLEL_UPLOAD=0` still forces
240
+ // serial as an escape hatch.
241
+ setDefaultUploadConcurrency(8);
242
+ logger.debug(`Artifact remote backend: GcsArtifactWriter (ADC, bucket=${bucket}, defaultConcurrency=8)`);
243
+ return new GcsArtifactWriter({
244
+ bucket,
245
+ ...(metrics ? { metrics } : {}),
246
+ });
214
247
  }
215
248
  if (config.apiKey && config.apiUrl) {
216
- logger.debug(`Artifact remote backend: ApiGatewayArtifactWriter (via ${config.apiUrl}, bucket=${bucket})`);
249
+ // W0058 Phase 2: batching writer is the default on the API Gateway path.
250
+ // Prototype B (W0056) showed batch signing + client-side parallelism
251
+ // eliminates the 429 storm that single-URL parallelism triggered on the
252
+ // Vercel signing endpoint, at parity with the GCS-direct parallel path
253
+ // once the sign+PUT overlap optimization lands. Flip the default to 8
254
+ // concurrency; `AILF_PARALLEL_UPLOAD=0` forces serial as a rollback
255
+ // escape hatch and auto-selects the legacy single-URL writer.
256
+ setDefaultUploadConcurrency(8);
257
+ const concurrency = resolveUploadConcurrency();
258
+ if (concurrency > 1) {
259
+ logger.debug(`Artifact remote backend: BatchingApiGatewayArtifactWriter (via ${config.apiUrl}, bucket=${bucket}, putConcurrency=${concurrency})`);
260
+ // D0034: neither API Gateway writer supports NDJSON `appendNdjson`.
261
+ // Traces that flow through `appendNdjson` are dropped on this path.
262
+ // Surface the gap once at startup instead of ambushing users with a
263
+ // silent null ref at emit time.
264
+ logger.warn("Artifacts: API Gateway path selected without GCS ADC — " +
265
+ "trace (NDJSON) artifacts will be skipped (D0034). Set " +
266
+ "GOOGLE_APPLICATION_CREDENTIALS or GCLOUD_PROJECT to capture traces.");
267
+ return new BatchingApiGatewayArtifactWriter({
268
+ apiBaseUrl: config.apiUrl,
269
+ apiKey: config.apiKey,
270
+ bucket,
271
+ putConcurrency: concurrency,
272
+ ...(metrics ? { metrics } : {}),
273
+ });
274
+ }
275
+ logger.debug(`Artifact remote backend: ApiGatewayArtifactWriter (via ${config.apiUrl}, bucket=${bucket}, serial — AILF_PARALLEL_UPLOAD=0 override)`);
217
276
  return new ApiGatewayArtifactWriter({
218
277
  apiBaseUrl: config.apiUrl,
219
278
  apiKey: config.apiKey,
220
279
  bucket,
280
+ ...(metrics ? { metrics } : {}),
221
281
  });
222
282
  }
223
283
  return null;
@@ -8,7 +8,6 @@
8
8
  * Once all commands construct ResolvedConfig directly (or use --config),
9
9
  * this bridge can be deleted.
10
10
  */
11
- import { join } from "node:path";
12
11
  import { createAppContext } from "../composition-root.js";
13
12
  import { tryLoadConfigFile } from "../pipeline/compiler/config-loader.js";
14
13
  /**
@@ -78,12 +77,10 @@ export function mapToResolvedConfig(opts, rootDir) {
78
77
  remote: opts.remote ?? false,
79
78
  apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
80
79
  apiKey: opts.apiKey,
81
- captureEnabled: opts.captureEnabled ?? false,
82
- captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
83
- captureCompress: opts.captureCompress ?? true,
84
- captureExtras: opts.captureExtras ?? true,
85
- captureGcsBucket: process.env.AILF_CAPTURE_GCS_BUCKET,
86
- captureGcsPrefix: process.env.AILF_CAPTURE_GCS_PREFIX,
80
+ artifactsDisabled: opts.artifactsDisabled,
81
+ artifactsDir: opts.artifactsDir,
82
+ artifactsDryRun: opts.artifactsDryRun,
83
+ artifactsExclude: opts.artifactsExclude,
87
84
  artifactGcsBucket: process.env.AILF_GCS_ARTIFACT_BUCKET,
88
85
  artifactUpload: parseArtifactUploadEnv(process.env.AILF_ARTIFACT_UPLOAD),
89
86
  };
@@ -11,7 +11,7 @@
11
11
  * each step completes. This enables the GET /v1/jobs/:jobId polling
12
12
  * endpoint to show real-time progress.
13
13
  */
14
- import { assoc, } from "../_vendor/ailf-core/index.js";
14
+ import { ARTIFACT_EXPORT_PHASE_ID, assoc, } from "../_vendor/ailf-core/index.js";
15
15
  import { runStep } from "./step-runner.js";
16
16
  // ---------------------------------------------------------------------------
17
17
  // Job progress reporter
@@ -111,21 +111,6 @@ async function capturePipelineContext(ctx, state, results) {
111
111
  ctx.logger.debug(`pipelineContext emit rejected: ${err instanceof Error ? err.message : String(err)}`);
112
112
  }
113
113
  }
114
- /**
115
- * Flush captured artifacts to disk. Non-blocking — failures are logged
116
- * but never affect the pipeline result.
117
- */
118
- async function flushArtifacts(ctx) {
119
- if (!ctx.collector.enabled)
120
- return;
121
- try {
122
- const result = await ctx.collector.flush();
123
- ctx.logger.info(`Captured ${result.artifactCount} artifacts → ${result.destination}`);
124
- }
125
- catch (err) {
126
- ctx.logger.warn(`Artifact capture flush failed: ${err instanceof Error ? err.message : err}`);
127
- }
128
- }
129
114
  // ---------------------------------------------------------------------------
130
115
  // Orchestrator
131
116
  // ---------------------------------------------------------------------------
@@ -157,10 +142,16 @@ export async function orchestratePipeline(ctx, steps) {
157
142
  if (hasJob) {
158
143
  await reportJobProgress(ctx, steps[0]?.name ?? "init", 0, steps.length, "running", undefined, jobUpdates);
159
144
  }
145
+ // W0053 — artifact export phase. Opens the first time a non-`run-eval`
146
+ // step starts, signalling the user that promptfoo's progress bar is done
147
+ // and the (previously silent) GCS export/upload window is now active.
148
+ // Closed in a finally after the step loop, regardless of pipeline outcome.
149
+ const exportPhase = createExportPhaseGate(ctx);
160
150
  for (let i = 0; i < steps.length; i++) {
161
151
  const step = steps[i];
162
152
  ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
163
153
  ctx.logger.section(step.name);
154
+ exportPhase.maybeOpen(step.name);
164
155
  // Report current step progress
165
156
  if (hasJob) {
166
157
  await reportJobProgress(ctx, step.name, i, steps.length, "running", undefined, jobUpdates);
@@ -182,13 +173,11 @@ export async function orchestratePipeline(ctx, steps) {
182
173
  step: step.name,
183
174
  }, jobUpdates);
184
175
  }
185
- // Capture pipeline context and job updates before flushing
176
+ // Capture pipeline context before exiting. `job-updates` was an
177
+ // observability-only capture not tied to a registered artifact type;
178
+ // dropped in W0050. Use the JobStore path for job telemetry.
186
179
  await capturePipelineContext(ctx, state, results);
187
- // W0050 — `job-updates` was an observability-only capture not tied
188
- // to a registered artifact type; dropped here. Use the JobStore
189
- // path if job telemetry is needed.
190
- // Flush captured artifacts even on failure (partial capture is useful)
191
- await flushArtifacts(ctx);
180
+ exportPhase.close();
192
181
  return {
193
182
  belowCritical: state.belowCritical,
194
183
  durationMs: Date.now() - pipelineStart,
@@ -245,8 +234,7 @@ export async function orchestratePipeline(ctx, steps) {
245
234
  // Capture pipeline context. `job-updates` observability captures were
246
235
  // dropped in Slice 6.1 — JobStore is the supported telemetry path.
247
236
  await capturePipelineContext(ctx, state, results);
248
- // Flush captured artifacts (non-blocking — failures never affect pipeline result)
249
- await flushArtifacts(ctx);
237
+ exportPhase.close();
250
238
  return {
251
239
  belowCritical: state.belowCritical,
252
240
  durationMs,
@@ -257,3 +245,91 @@ export async function orchestratePipeline(ctx, steps) {
257
245
  validation,
258
246
  };
259
247
  }
248
+ // ---------------------------------------------------------------------------
249
+ // Artifact export phase gate (W0053)
250
+ // ---------------------------------------------------------------------------
251
+ /**
252
+ * Returns a lazy gate that opens the `artifact-export` progress phase on the
253
+ * first step after `run-eval` and closes it on pipeline completion. The gate
254
+ * tolerates repeated opens / closes — each is a no-op after the first.
255
+ *
256
+ * The phase is keyed on step names rather than timestamps so the header lands
257
+ * exactly when the user sees promptfoo's `Evaluating` bar hit 100% and the
258
+ * next pipeline step takes over. `run-eval` produces artifacts too, but its
259
+ * own progress is owned by promptfoo; opening the phase before run-eval would
260
+ * produce a duplicate progress channel for the same window.
261
+ */
262
+ function createExportPhaseGate(ctx) {
263
+ let opened = false;
264
+ let closed = false;
265
+ let startedAt = 0;
266
+ // Step names that run BEFORE the artifact-heavy post-eval section. The
267
+ // phase opens on the first step whose name is not in this set — typically
268
+ // `calculate-scores` once promptfoo has handed back control.
269
+ const preExportSteps = new Set([
270
+ "validate",
271
+ "mirror-repo-tasks",
272
+ "fetch-docs",
273
+ "generate-configs",
274
+ "grader-consistency",
275
+ ]);
276
+ const { label, detail } = describeExportPhase(ctx);
277
+ return {
278
+ maybeOpen(stepName) {
279
+ if (opened)
280
+ return;
281
+ if (stepName.startsWith("run-eval"))
282
+ return;
283
+ if (preExportSteps.has(stepName))
284
+ return;
285
+ opened = true;
286
+ startedAt = Date.now();
287
+ ctx.progress.phaseStart({
288
+ phaseId: ARTIFACT_EXPORT_PHASE_ID,
289
+ label,
290
+ detail,
291
+ startedAt,
292
+ });
293
+ },
294
+ close() {
295
+ if (!opened || closed)
296
+ return;
297
+ closed = true;
298
+ // Cumulative counts live inside the reporter adapter (it accumulates
299
+ // each phaseProgress event). The orchestrator does not track the
300
+ // running total — it only knows when the phase is over. Adapters that
301
+ // render a final summary use their own state; NoOp / JSON adapters
302
+ // ignore the event.
303
+ ctx.progress.phaseComplete({
304
+ phaseId: ARTIFACT_EXPORT_PHASE_ID,
305
+ itemsCompleted: 0,
306
+ bytesCompleted: 0,
307
+ durationMs: Date.now() - startedAt,
308
+ });
309
+ },
310
+ };
311
+ }
312
+ /**
313
+ * Build the user-facing phase label by peeking at the wired writer chain.
314
+ * `AccumulatingArtifactWriter` wraps a `FanoutArtifactWriter([local, remote])`
315
+ * when remote credentials are present, or a bare `LocalFilesystemArtifactWriter`
316
+ * otherwise — naming the destination in the label keeps every progress line
317
+ * self-describing.
318
+ */
319
+ function describeExportPhase(ctx) {
320
+ const writer = ctx.artifactWriter;
321
+ const inner = writer.inner?.constructor.name ?? writer.constructor.name;
322
+ if (inner === "FanoutArtifactWriter") {
323
+ return { label: "Exporting run artifacts", detail: "local + GCS" };
324
+ }
325
+ if (inner === "GcsArtifactWriter") {
326
+ return { label: "Exporting run artifacts", detail: "GCS" };
327
+ }
328
+ if (inner === "ApiGatewayArtifactWriter") {
329
+ return { label: "Exporting run artifacts", detail: "API gateway" };
330
+ }
331
+ if (inner === "NoOpArtifactWriter") {
332
+ return { label: "Finalizing run" };
333
+ }
334
+ return { label: "Exporting run artifacts", detail: "local" };
335
+ }
@@ -146,7 +146,7 @@ export class CalculateScoresStep {
146
146
  // W0050 — ctx.artifactWriter is always present; no guard needed.
147
147
  const testResults = tryReadTestResults(ctx.config.rootDir);
148
148
  if (testResults?.length) {
149
- const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults);
149
+ const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults, ctx.config.mode);
150
150
  if (artifactRef) {
151
151
  state.artifactRefs = {
152
152
  ...state.artifactRefs,
@@ -18,9 +18,36 @@
18
18
  import { existsSync, readFileSync } from "node:fs";
19
19
  import { resolve } from "node:path";
20
20
  import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
21
+ import { InstrumentedArtifactWriter } from "../../artifact-capture/instrumented-artifact-writer.js";
21
22
  import { buildRunContext } from "../../pipeline/run-context.js";
22
23
  import { loadSource } from "../../sources.js";
23
24
  import { configToSourceOverrides } from "../config-to-source-overrides.js";
25
+ /**
26
+ * Walk a writer's `inner` decorator chain looking for an
27
+ * `AccumulatingArtifactWriter`. Composition root wraps the accumulator
28
+ * in `InstrumentedArtifactWriter` when `AILF_UPLOAD_METRICS=1`, so a
29
+ * naive `instanceof AccumulatingArtifactWriter` check misses it and the
30
+ * manifest comes out empty. Rather than teaching every caller about the
31
+ * instrumentation wrapper, unwrap once here.
32
+ *
33
+ * `MAX_DEPTH` is a safety belt against a future decorator chain
34
+ * accidentally introducing a cycle — the current writers can't, but one
35
+ * `inner` self-reference would otherwise spin forever.
36
+ */
37
+ const FIND_ACCUMULATOR_MAX_DEPTH = 8;
38
+ function findAccumulator(writer) {
39
+ let cursor = writer;
40
+ for (let depth = 0; cursor && depth < FIND_ACCUMULATOR_MAX_DEPTH; depth++) {
41
+ if (cursor instanceof AccumulatingArtifactWriter)
42
+ return cursor;
43
+ if (cursor instanceof InstrumentedArtifactWriter) {
44
+ cursor = cursor.inner;
45
+ continue;
46
+ }
47
+ return null;
48
+ }
49
+ return null;
50
+ }
24
51
  export class FinalizeRunStep {
25
52
  pipelineStart;
26
53
  options;
@@ -64,8 +91,12 @@ export class FinalizeRunStep {
64
91
  // happened to register manually. When the writer is a NoOp / plain
65
92
  // decorator without accumulation, `aggregated` stays empty and the
66
93
  // manifest falls back to the producer-side registration.
67
- const aggregated = ctx.artifactWriter instanceof AccumulatingArtifactWriter
68
- ? ctx.artifactWriter.getAccumulatedArtifactRefs()
94
+ //
95
+ // W0058: `findAccumulator` unwraps `InstrumentedArtifactWriter` too
96
+ // so the manifest stays fully populated when `AILF_UPLOAD_METRICS=1`.
97
+ const accumulator = findAccumulator(ctx.artifactWriter);
98
+ const aggregated = accumulator
99
+ ? accumulator.getAccumulatedArtifactRefs()
69
100
  : {};
70
101
  const artifacts = {
71
102
  ...aggregated,
@@ -27,7 +27,7 @@
27
27
  * still live inside the full `rawResults` object.
28
28
  */
29
29
  import { readFileSync } from "node:fs";
30
- import { classifyRubric, parseRubricScore, } from "../_vendor/ailf-core/index.js";
30
+ import { classifyRubric, parseRubricScore, resolveVariantMode, } from "../_vendor/ailf-core/index.js";
31
31
  // ---------------------------------------------------------------------------
32
32
  // Public entry point
33
33
  // ---------------------------------------------------------------------------
@@ -57,23 +57,40 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
57
57
  console.warn(` ⚠️ emitPerEntryEvalResults: ${resultsPath} has no results[]`);
58
58
  return;
59
59
  }
60
+ // W0058: fire every emit synchronously and `Promise.all` once at the
61
+ // end. The previous `parallelMap` pattern created producer-side
62
+ // backpressure (each worker was blocked on its own `await emit(...)`),
63
+ // which kept the batching writer's queue shallow (≤ producer
64
+ // concurrency). With queueing delegated entirely to the writer, the
65
+ // batching API-Gateway writer gets a fully-populated pending queue
66
+ // and can pack ~hundreds of entries into a single batch-sign RTT;
67
+ // the GCS-direct writer's own `ConcurrencyLimiter` caps the PUT
68
+ // fan-out so the 1 500-concurrent-PUT scenario cannot happen.
69
+ const emits = [];
60
70
  for (const result of rows) {
61
- const taskId = result.testCase?.description ?? "unknown-task";
71
+ const rawTaskId = result.testCase?.description ?? "unknown-task";
62
72
  const modelId = result.provider?.id ?? result.provider?.label ?? "unknown-model";
73
+ // D0033 axis convention: literacy-mode task descriptions carry a
74
+ // `(gold)` / `(baseline)` suffix; that variant IS the `mode` axis
75
+ // value. Stripping the suffix here keeps the writer's key aligned
76
+ // with what `slim-report-summary#slimJudgments` and the Studio
77
+ // `testOutputsKeyFor` hook compute on the read side. Without this,
78
+ // signed-URL lookups for grader/judgment artifacts 404.
79
+ const { mode: axisMode, task: axisTask } = resolveVariantMode(rawTaskId, mode);
63
80
  const baseAssoc = {
64
81
  run: ctx.runId,
65
- mode,
66
- task: taskId,
82
+ mode: axisMode,
83
+ task: axisTask,
67
84
  model: modelId,
68
85
  };
69
86
  // rawResults — full raw entry (bounded by descriptor capBytes: 1 MB)
70
- await writer.emit("rawResults", baseAssoc, result);
87
+ emits.push(writer.emit("rawResults", baseAssoc, result));
71
88
  // renderedPrompts — what the model saw + which provider it went to
72
89
  if (result.prompt !== undefined) {
73
- await writer.emit("renderedPrompts", baseAssoc, {
90
+ emits.push(writer.emit("renderedPrompts", baseAssoc, {
74
91
  prompt: result.prompt,
75
92
  provider: result.provider,
76
- });
93
+ }));
77
94
  }
78
95
  // Per-grader decomposition — only LLM-rubric assertions have a
79
96
  // natural grader identity. Code assertions (javascript/contains/…)
@@ -86,15 +103,16 @@ export async function emitPerEntryEvalResults(writer, ctx, mode, resultsPath) {
86
103
  if (!dimension)
87
104
  continue;
88
105
  const graderAssoc = { ...baseAssoc, grader: dimension };
89
- await writer.emit("graderPrompts", graderAssoc, {
106
+ emits.push(writer.emit("graderPrompts", graderAssoc, {
90
107
  dimension,
91
108
  assertion: comp.assertion,
92
- });
93
- await writer.emit("graderJudgments", graderAssoc, {
109
+ }));
110
+ emits.push(writer.emit("graderJudgments", graderAssoc, {
94
111
  score: parseRubricScore(comp) ?? 0,
95
112
  reason: comp.reason ?? "",
96
113
  pass: comp.pass,
97
- });
114
+ }));
98
115
  }
99
116
  }
117
+ await Promise.all(emits);
100
118
  }
@@ -74,10 +74,6 @@ export function mapRequestToConfig(request, rootDir) {
74
74
  callerGit: request.callerGit,
75
75
  callback: request.callback,
76
76
  jobId: request.jobId,
77
- captureEnabled: false,
78
- captureDir: undefined,
79
- captureCompress: true,
80
- captureExtras: true,
81
77
  remote: false,
82
78
  apiUrl: "https://ailf-api.sanity.build",
83
79
  presets: request.presets,
@@ -15,12 +15,19 @@
15
15
  *
16
16
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
17
  */
18
- import type { ArtifactRef, ArtifactWriter, RunId, StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
18
+ import { type ArtifactRef, type ArtifactWriter, type RunId, type StoredTestResult } from "../_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Upload testOutputs as per-entry GCS objects under
21
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
21
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
22
22
  *
23
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
24
- * fails (P5: non-blocking).
23
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
24
+ * suffix whose value is the `mode` axis on the artifact — stripped via
25
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
26
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
27
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
28
+ * 3-segment read path.
29
+ *
30
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
31
+ * skipped / every emit fails (P5: non-blocking).
25
32
  */
26
- export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[]): Promise<ArtifactRef | null>;
33
+ export declare function uploadTestOutputs(writer: ArtifactWriter, runId: RunId, testResults: StoredTestResult[], defaultMode: string): Promise<ArtifactRef | null>;
@@ -15,20 +15,37 @@
15
15
  *
16
16
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
17
  */
18
+ import { resolveVariantMode, } from "../_vendor/ailf-core/index.js";
18
19
  /**
19
20
  * Upload testOutputs as per-entry GCS objects under
20
- * `runs/{runId}/test-outputs/`, one per `{taskId}::{modelId}` pair.
21
+ * `runs/{runId}/test-outputs/`, one per `(mode, task, model)` triple.
21
22
  *
22
- * Returns the `ArtifactRef` on success, or `null` when upload is skipped or
23
- * fails (P5: non-blocking).
23
+ * D0033 axis convention: literacy-mode taskIds carry a `(gold)` / `(baseline)`
24
+ * suffix whose value is the `mode` axis on the artifact — stripped via
25
+ * `resolveVariantMode`. This aligns the on-disk key with what the Studio
26
+ * hover-prefetch (`testOutputsKeyFor`) and slim-report readers compute; the
27
+ * legacy 2-segment form produced by the pre-D0033 writer 404'd on the new
28
+ * 3-segment read path.
29
+ *
30
+ * Returns the last non-null `ArtifactRef` emitted, or `null` when upload is
31
+ * skipped / every emit fails (P5: non-blocking).
24
32
  */
25
- export async function uploadTestOutputs(writer, runId, testResults) {
26
- const entries = testResults.map((tr) => ({
27
- key: `${tr.taskId}::${tr.modelId}`,
28
- data: {
33
+ export async function uploadTestOutputs(writer, runId, testResults, defaultMode) {
34
+ // W0058: fire every emit synchronously and `Promise.all` once — the
35
+ // writer (batching or GCS-direct) owns concurrency bounds. See the
36
+ // equivalent rationale in `emit-eval-results.ts`.
37
+ const emits = testResults.map((tr) => {
38
+ const { mode: axisMode, task: axisTask } = resolveVariantMode(tr.taskId, defaultMode);
39
+ return writer.emit("testOutputs", { run: runId, mode: axisMode, task: axisTask, model: tr.modelId }, {
29
40
  responseOutput: tr.responseOutput ?? "",
30
41
  responseOutputTruncated: tr.responseOutputTruncated ?? false,
31
- },
32
- }));
33
- return writer.writePerEntry("testOutputs", runId, entries);
42
+ });
43
+ });
44
+ const refs = await Promise.all(emits);
45
+ let lastRef = null;
46
+ for (const ref of refs) {
47
+ if (ref)
48
+ lastRef = ref;
49
+ }
50
+ return lastRef;
34
51
  }