@sanity/ailf 2.7.1 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +173 -0
  4. package/dist/_vendor/ailf-core/artifact-registry.js +811 -0
  5. package/dist/_vendor/ailf-core/index.d.ts +3 -1
  6. package/dist/_vendor/ailf-core/index.js +3 -1
  7. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
  8. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +95 -0
  9. package/dist/_vendor/ailf-core/ports/artifact-writer.js +51 -0
  10. package/dist/_vendor/ailf-core/ports/context.d.ts +32 -3
  11. package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
  12. package/dist/_vendor/ailf-core/ports/index.js +1 -1
  13. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
  14. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  15. package/dist/_vendor/ailf-core/services/index.js +1 -0
  16. package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
  17. package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
  18. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +42 -0
  19. package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
  20. package/dist/_vendor/ailf-core/types/index.d.ts +298 -77
  21. package/dist/_vendor/ailf-core/types/index.js +1 -1
  22. package/dist/_vendor/ailf-shared/index.d.ts +2 -0
  23. package/dist/_vendor/ailf-shared/index.js +2 -0
  24. package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
  25. package/dist/_vendor/ailf-shared/run-context.js +17 -0
  26. package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
  27. package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
  28. package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
  29. package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
  30. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +52 -0
  31. package/dist/artifact-capture/api-gateway-artifact-writer.js +199 -0
  32. package/dist/artifact-capture/emit-file.d.ts +28 -0
  33. package/dist/artifact-capture/emit-file.js +56 -0
  34. package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
  35. package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
  36. package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
  37. package/dist/artifact-capture/filesystem-collector.js +48 -23
  38. package/dist/artifact-capture/gcs-artifact-writer.d.ts +67 -0
  39. package/dist/artifact-capture/gcs-artifact-writer.js +343 -0
  40. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
  41. package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
  42. package/dist/commands/explain-handler.js +4 -0
  43. package/dist/commands/pipeline-action.d.ts +5 -0
  44. package/dist/commands/pipeline-action.js +56 -5
  45. package/dist/commands/pipeline.d.ts +4 -0
  46. package/dist/commands/pipeline.js +6 -2
  47. package/dist/commands/publish.js +7 -3
  48. package/dist/composition-root.d.ts +14 -11
  49. package/dist/composition-root.js +90 -31
  50. package/dist/orchestration/build-step-sequence.js +6 -1
  51. package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
  52. package/dist/orchestration/pipeline-orchestrator.js +41 -30
  53. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
  54. package/dist/orchestration/steps/calculate-scores-step.js +50 -10
  55. package/dist/orchestration/steps/callback-step.d.ts +1 -1
  56. package/dist/orchestration/steps/callback-step.js +6 -4
  57. package/dist/orchestration/steps/compare-step.d.ts +1 -1
  58. package/dist/orchestration/steps/compare-step.js +4 -2
  59. package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
  60. package/dist/orchestration/steps/discovery-report-step.js +4 -1
  61. package/dist/orchestration/steps/fetch-docs-step.js +9 -15
  62. package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
  63. package/dist/orchestration/steps/finalize-run-step.js +117 -0
  64. package/dist/orchestration/steps/gap-analysis-step.js +34 -6
  65. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
  66. package/dist/orchestration/steps/generate-configs-step.js +11 -11
  67. package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
  68. package/dist/orchestration/steps/publish-report-step.js +40 -55
  69. package/dist/orchestration/steps/readiness-step.d.ts +1 -1
  70. package/dist/orchestration/steps/readiness-step.js +4 -1
  71. package/dist/orchestration/steps/report-step.d.ts +1 -1
  72. package/dist/orchestration/steps/report-step.js +6 -3
  73. package/dist/orchestration/steps/run-eval-step.js +14 -9
  74. package/dist/pipeline/calculate-scores.js +13 -2
  75. package/dist/pipeline/compare.d.ts +2 -2
  76. package/dist/pipeline/emit-eval-results.d.ts +38 -0
  77. package/dist/pipeline/emit-eval-results.js +100 -0
  78. package/dist/pipeline/provenance.d.ts +24 -44
  79. package/dist/pipeline/provenance.js +17 -165
  80. package/dist/pipeline/report-title.d.ts +2 -2
  81. package/dist/pipeline/run-context.d.ts +57 -0
  82. package/dist/pipeline/run-context.js +156 -0
  83. package/dist/pipeline/upload-test-outputs.d.ts +26 -0
  84. package/dist/pipeline/upload-test-outputs.js +34 -0
  85. package/dist/report-store.js +4 -2
  86. package/package.json +3 -3
  87. package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
  88. package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
  89. package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
  90. package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
  91. package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
  92. package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
@@ -0,0 +1,117 @@
1
+ /**
2
+ * Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
3
+ *
4
+ * Inserted between `GapAnalysis` and `PublishReport`. Assembles a
5
+ * `RunManifest` from `state.artifactRefs` (populated by producer steps)
6
+ * and the shared `RunContext` (via `buildRunContext`), then writes it to
7
+ * `runs/{runId}/manifest.json`. The written manifest becomes the source
8
+ * of truth for artifact locations; `PublishReportStep` snapshots the
9
+ * `artifacts` slice into `Report.artifactManifest` (D0032).
10
+ *
11
+ * Design principles:
12
+ * - Single writer — one `writeManifest()` call per pipeline run.
13
+ * - Idempotent — retries produce the same manifest for the same inputs, apart from the freshly stamped `createdAt` and `durationMs`.
14
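+ * - Never skipped — `ctx.artifactWriter` is always wired; when the write yields no ref the manifest is still computed and stored on `state.runManifest` (local/air-gapped runs stay functional).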
15
+ *
16
+ * @see docs/decisions/D0032-run-anchored-artifact-store.md
17
+ */
18
+ import { existsSync, readFileSync } from "node:fs";
19
+ import { resolve } from "node:path";
20
+ import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
21
+ import { buildRunContext } from "../../pipeline/run-context.js";
22
+ import { loadSource } from "../../sources.js";
23
+ import { configToSourceOverrides } from "../config-to-source-overrides.js";
24
+ export class FinalizeRunStep {
25
+ pipelineStart;
26
+ options;
27
+ name = "finalize-run";
28
+ optional = true;
29
+ constructor(pipelineStart, options = {}) {
30
+ this.pipelineStart = pipelineStart;
31
+ this.options = options;
32
+ }
33
+ check() {
34
+ return [];
35
+ }
36
+ async execute(ctx, state) {
37
+ const start = Date.now();
38
+ // W0050 — `ctx.artifactWriter` is now required on AppContext
39
+ // (composition root always provides one; NoOpArtifactWriter when
40
+ // `--no-artifacts`). The pre-W0050 guard that returned "skipped" has
41
+ // been removed — a NoOp writer's writeManifest returns null and the
42
+ // code below already handles that as a non-blocking failure.
43
+ // Resolve the source (same input buildProvenance uses).
44
+ const overrides = configToSourceOverrides(ctx.config);
45
+ const resolvedSource = loadSource(ctx.config.source, overrides);
46
+ // Optional: try to read the on-disk score summary to derive `areas`,
47
+ // but don't fail finalize if it's missing — the manifest should still
48
+ // be written so artifacts have a catalog.
49
+ const maybeSummary = tryReadScoreSummary(ctx.config.rootDir);
50
+ const runContext = buildRunContext({
51
+ areas: maybeSummary?.scores?.map((s) => s.feature) ?? ctx.config.areas ?? [],
52
+ callerGit: ctx.config.callerGit,
53
+ evalFingerprint: state.evalFingerprint ?? this.options.evalFingerprint,
54
+ logger: ctx.logger,
55
+ mode: ctx.config.mode,
56
+ rootDir: ctx.config.rootDir,
57
+ source: resolvedSource,
58
+ taskIds: ctx.config.tasks,
59
+ });
60
+ // W0051 revisit: the composition-root wraps `ctx.artifactWriter` in
61
+ // `AccumulatingArtifactWriter`, which keeps a map of every ref any
62
+ // producer emitted this run. Merge that into `state.artifactRefs` so
63
+ // the manifest reflects the FULL set — not just the subset producers
64
+ // happened to register manually. When the writer is a NoOp / plain
65
+ // decorator without accumulation, `aggregated` stays empty and the
66
+ // manifest falls back to the producer-side registration. On a key collision `state.artifactRefs` wins because it is spread last.
67
+ const aggregated = ctx.artifactWriter instanceof AccumulatingArtifactWriter
68
+ ? ctx.artifactWriter.getAccumulatedArtifactRefs()
69
+ : {};
70
+ const artifacts = {
71
+ ...aggregated,
72
+ ...(state.artifactRefs ?? {}),
73
+ };
74
+ const manifest = {
75
+ version: 1,
76
+ runId: ctx.runId,
77
+ createdAt: new Date().toISOString(),
78
+ durationMs: Date.now() - this.pipelineStart,
79
+ status: "completed",
80
+ context: runContext,
81
+ outcomes: state.testSummary
82
+ ? { testSummary: state.testSummary }
83
+ : undefined,
84
+ promptfooUrls: state.promptfooUrls,
85
+ artifacts,
86
+ };
87
+ const ref = await ctx.artifactWriter.writeManifest(ctx.runId, manifest);
88
+ if (!ref) {
89
+ // Non-blocking: no ref came back from the writer. Still populate state so
90
+ // publish can snapshot `artifacts` even without a persisted manifest.
91
+ state.runManifest = manifest;
92
+ return {
93
+ durationMs: Date.now() - start,
94
+ status: "success",
95
+ summary: "Run manifest computed (GCS write failed — non-blocking)",
96
+ };
97
+ }
98
+ state.runManifest = manifest;
99
+ const artifactCount = Object.keys(manifest.artifacts).length;
100
+ return {
101
+ durationMs: Date.now() - start,
102
+ status: "success",
103
+ summary: `Run manifest written to ${ref.path} (${artifactCount} artifact ref${artifactCount === 1 ? "" : "s"})`,
104
+ };
105
+ }
106
+ }
107
+ function tryReadScoreSummary(rootDir) { // Best-effort load of <rootDir>/results/latest/score-summary.json; returns undefined instead of throwing.
108
+ const path = resolve(rootDir, "results", "latest", "score-summary.json");
109
+ if (!existsSync(path))
110
+ return undefined; // No summary on disk — the caller falls back to the configured areas.
111
+ try {
112
+ return JSON.parse(readFileSync(path, "utf-8")); // Parsed JSON is returned as-is; its shape is not validated here.
113
+ }
114
+ catch {
115
+ return undefined; // A read or parse failure is treated the same as a missing file.
116
+ }
117
+ }
@@ -16,7 +16,8 @@
16
16
  */
17
17
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
18
18
  import { join, resolve } from "path";
19
- import { isSlugRef } from "../../_vendor/ailf-core/index.js";
19
+ import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
20
+ import { emitFileContents } from "../../artifact-capture/emit-file.js";
20
21
  export class GapAnalysisStep {
21
22
  name = "gap-analysis";
22
23
  optional = true;
@@ -194,14 +195,29 @@ export class GapAnalysisStep {
194
195
  ...(testResults !== undefined && { testResults }),
195
196
  };
196
197
  writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
197
- // Capture gap analysis artifacts
198
- const failureModesPath = join(outDir, "failure-modes.json");
199
- if (existsSync(failureModesPath)) {
200
- ctx.collector.captureFile("gap-analysis", "failure-modes", failureModesPath);
198
+ // W0051 Slice 2 — failureModes is per-entry keyed by {mode, category};
199
+ // one entry per classified FailureModeType. Zero-count categories are
200
+ // skipped to keep the manifest honest about what the run surfaced.
201
+ const classifiedByCategory = new Map();
202
+ for (const cj of failureModeReport.classifiedJudgments) {
203
+ const cat = cj.classification.mode;
204
+ const bucket = classifiedByCategory.get(cat) ?? [];
205
+ bucket.push(cj);
206
+ classifiedByCategory.set(cat, bucket);
207
+ }
208
+ for (const [category, classified] of classifiedByCategory) {
209
+ if (classified.length === 0)
210
+ continue;
211
+ await ctx.artifactWriter.emit("failureModes", assoc(ctx, { mode: ctx.config.mode, category }), {
212
+ category,
213
+ count: classified.length,
214
+ title: toTitleCase(category),
215
+ judgments: classified.map((c) => c.judgment),
216
+ });
201
217
  }
202
218
  const gapReportPath = join(outDir, "gap-analysis.json");
203
219
  if (existsSync(gapReportPath)) {
204
- ctx.collector.captureFile("gap-analysis", "gap-report", gapReportPath);
220
+ await emitFileContents(ctx.artifactWriter, "gapReport", assoc(ctx), gapReportPath);
205
221
  }
206
222
  const gapCount = gapReport.gaps.length;
207
223
  const classRate = failureModeReport.classificationRate.toFixed(0);
@@ -223,6 +239,18 @@ export class GapAnalysisStep {
223
239
  // ---------------------------------------------------------------------------
224
240
  // Helpers
225
241
  // ---------------------------------------------------------------------------
242
+ /**
243
+ * Render a kebab-case FailureModeType id as Title Case for the manifest
244
+ * entry's display title (e.g. `"missing-docs"` → `"Missing Docs"`). Kept
245
+ * local to the producer so the registry descriptor stays decoupled from
246
+ * eval-side types. Only the first character of each `-`-separated word is upper-cased; the rest of each word, and any empty segment, is left as-is.
247
+ */
248
+ function toTitleCase(id) {
249
+ return id
250
+ .split("-")
251
+ .map((w) => (w.length === 0 ? w : w[0].toUpperCase() + w.slice(1)))
252
+ .join(" ");
253
+ }
226
254
  /**
227
255
  * Extract slug strings from polymorphic canonical doc refs.
228
256
  *
@@ -8,7 +8,7 @@
8
8
  * When the variant is "full", the handler is called twice (baseline + agentic)
9
9
  * and three YAML files are written. Other modes produce one YAML file.
10
10
  */
11
- import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
11
+ import { type AppContext, type PipelineState, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
12
12
  export declare class GenerateConfigsStep implements PipelineStep {
13
13
  readonly name = "generate-configs";
14
14
  /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
@@ -10,6 +10,8 @@
10
10
  */
11
11
  import { existsSync } from "node:fs";
12
12
  import { resolve } from "node:path";
13
+ import { assoc, } from "../../_vendor/ailf-core/index.js";
14
+ import { emitFileContents } from "../../artifact-capture/emit-file.js";
13
15
  import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
14
16
  import { modelMatchesLiteracyVariant } from "../../pipeline/compiler/mode-bases/literacy.js";
15
17
  import { getStepInputPaths } from "../../pipeline/cache.js";
@@ -136,12 +138,14 @@ export class GenerateConfigsStep {
136
138
  maxConcurrency: models.maxConcurrency,
137
139
  logger: ctx.logger,
138
140
  });
139
- // Capture generated config files (use configFileForMode for legacy naming)
141
+ // W0050 configSnapshot is per-entry keyed by mode. For literacy,
142
+ // each variant produces a distinct config, so `literacy-<variant>` is the
143
+ // mode-axis value here.
140
144
  const { configFileForMode } = await import("../../pipeline/eval-constants.js");
141
145
  for (const variant of ["baseline", "agentic", "observed"]) {
142
146
  const configPath = resolve(ctx.config.rootDir, configFileForMode(variant));
143
147
  if (existsSync(configPath)) {
144
- ctx.collector.captureFile("generate-configs", `promptfoo-config-${variant}`, configPath, { mode: "literacy", variant });
148
+ await emitFileContents(ctx.artifactWriter, "configSnapshot", assoc(ctx, { mode: `literacy-${variant}` }), configPath);
145
149
  }
146
150
  }
147
151
  return this.checkLiteracyPostconditions(ctx, start);
@@ -187,18 +191,14 @@ export class GenerateConfigsStep {
187
191
  maxConcurrency: models.maxConcurrency,
188
192
  logger: ctx.logger,
189
193
  });
190
- // Capture generated config file
194
+ // W0050 configSnapshot for a single-mode compile.
191
195
  const configPath = resolve(ctx.config.rootDir, `promptfooconfig.${mode}.yaml`);
192
196
  if (existsSync(configPath)) {
193
- ctx.collector.captureFile("generate-configs", "promptfoo-config", configPath, { mode });
194
- }
195
- // Capture mode-specific test artifacts (extras)
196
- if (ctx.collector.extrasEnabled) {
197
- const testsPath = resolve(ctx.config.rootDir, "results", "latest", `${mode}-tests.json`);
198
- if (existsSync(testsPath)) {
199
- ctx.collector.captureFile("generate-configs", `${mode}-tests`, testsPath, { mode });
200
- }
197
+ await emitFileContents(ctx.artifactWriter, "configSnapshot", assoc(ctx, { mode }), configPath);
201
198
  }
199
+ // W0050 — the mode-specific `${mode}-tests.json` file was an
200
+ // extras-only capture with no registered descriptor. Dropped; the
201
+ // same information lives in the configSnapshot + rawResults chain.
202
202
  return {
203
203
  durationMs: Date.now() - start,
204
204
  status: "success",
@@ -10,7 +10,7 @@
10
10
  * - P5: Local-first (pipeline never fails because of a store write)
11
11
  * - P6: Sinks are fire-and-forget (failures logged, not thrown)
12
12
  */
13
- import type { AppContext, PipelineState, PipelineStep, PromptfooUrlEntry, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
13
+ import { type AppContext, type PipelineState, type PipelineStep, type PromptfooUrlEntry, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
14
14
  export declare class PublishReportStep implements PipelineStep {
15
15
  private readonly pipelineStart;
16
16
  private readonly options;
@@ -12,6 +12,7 @@
12
12
  */
13
13
  import { readFileSync } from "fs";
14
14
  import { resolve } from "path";
15
+ import { assoc, buildSlimReportSummary, } from "../../_vendor/ailf-core/index.js";
15
16
  import { checkScoreSummaryValid } from "../../pipeline/checks.js";
16
17
  import { buildProvenance, } from "../../pipeline/provenance.js";
17
18
  import { generateReportTitle } from "../../pipeline/report-title.js";
@@ -103,39 +104,49 @@ export class PublishReportStep {
103
104
  };
104
105
  }
105
106
  const title = generateReportTitle({ provenance });
107
+ // W0051 Slice 3: transform the full pipeline-internal ScoreSummary into
108
+ // the slim ReportSummary that lives on the Content Lake document.
109
+ // Prose fields (grader reasons, failureModes full text, gap prose,
110
+ // agentBehavior arrays) point at their external artifacts via
111
+ // `id = manifestEntryKey`; Studio hydrates on drill-down.
112
+ const slimSummary = buildSlimReportSummary(summary, ctx.config.mode);
106
113
  const report = {
107
114
  comparison: comparison ?? undefined,
108
115
  completedAt: now,
109
116
  durationMs,
110
117
  id: reportId,
111
118
  provenance,
112
- summary,
119
+ summary: slimSummary,
113
120
  tag: this.options.publishTag ?? ctx.config.publishTag,
114
121
  title,
115
122
  };
116
- // Upload test output artifacts to GCS (D0030 — non-blocking, P5).
117
- // When upload succeeds, strip responseOutput from the inline
118
- // testResults[] so the Content Lake document carries only the slim
119
- // shape; the full output lives in the GCS artifact. When upload
120
- // fails, leave the inline shape intact so Studio's drill-down UI
121
- // still works via the backward-compat fallback.
122
- if (ctx.artifactUploader && summary.testResults?.length) {
123
- const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
124
- if (artifactRef) {
125
- report.artifacts = { testOutputs: artifactRef };
126
- report.summary = {
127
- ...summary,
128
- testResults: summary.testResults.map(slimTestResult),
129
- };
130
- }
123
+ // Snapshot the artifact manifest from FinalizeRunStep's output (D0032).
124
+ // The source of truth is `runs/{runId}/manifest.json` in GCS; the report
125
+ // carries a denormalized copy so Studio can render drill-down state
126
+ // without an extra GCS fetch.
127
+ const artifactManifest = state.runManifest?.artifacts;
128
+ if (artifactManifest && Object.keys(artifactManifest).length > 0) {
129
+ report.artifactManifest = artifactManifest;
130
+ }
131
+ // When testOutputs was uploaded to GCS, strip responseOutput from the
132
+ // inline testResults[] so the Content Lake document stays slim — the
133
+ // full output lives in the GCS artifact. When no testOutputs artifact
134
+ // exists, leave the inline shape intact so Studio's drill-down UI
135
+ // falls back to it.
136
+ if (artifactManifest?.testOutputs && slimSummary.testResults?.length) {
137
+ report.summary = {
138
+ ...slimSummary,
139
+ testResults: slimSummary.testResults.map(slimTestResult),
140
+ };
131
141
  }
132
142
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
133
143
  state.reportId = reportId;
134
- // Capture report object (Tier 2)
135
- ctx.collector.capture("publish-report", "report-object", report);
136
- // Capture auto-comparison if present (Tier 2)
144
+ // W0050 migrated from ctx.collector.capture to the unified writer.
145
+ // reportSnapshot: full Report JSON for replay (run-scoped, bulk).
146
+ await ctx.artifactWriter.emit("reportSnapshot", assoc(ctx), report);
147
+ // autoComparison: delta vs baseline (run-scoped, bulk, optional).
137
148
  if (comparison) {
138
- ctx.collector.capture("publish-report", "auto-comparison", comparison);
149
+ await ctx.artifactWriter.emit("autoComparison", assoc(ctx), comparison);
139
150
  }
140
151
  // Write to store (system of record — best-effort, P5)
141
152
  const sanityResult = ctx.reportStore
@@ -143,17 +154,14 @@ export class PublishReportStep {
143
154
  : null;
144
155
  // Run sinks (fire-and-forget, P6)
145
156
  const publishResult = await runSinks(report, ctx);
146
- // Capture sink results (Tier 2)
147
- if (publishResult.sinkResults.length > 0) {
148
- ctx.collector.capture("publish-report", "sink-results", {
149
- sinkCount: publishResult.sinkResults.length,
150
- results: publishResult.sinkResults.map((r) => ({
151
- name: r.name,
152
- status: r.result.status,
153
- ...(r.result.status === "success" ? { detail: r.result.detail } : {}),
154
- ...(r.result.status === "failed" ? { error: r.result.error } : {}),
155
- ...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
156
- })),
157
+ // sinkResults: per-sink outcome (run-scoped, per-entry keyed by sink name).
158
+ for (const r of publishResult.sinkResults) {
159
+ await ctx.artifactWriter.emit("sinkResults", assoc(ctx, { name: r.name }), {
160
+ name: r.name,
161
+ status: r.result.status,
162
+ ...(r.result.status === "success" ? { detail: r.result.detail } : {}),
163
+ ...(r.result.status === "failed" ? { error: r.result.error } : {}),
164
+ ...(r.result.status === "skipped" ? { reason: r.result.reason } : {}),
157
165
  });
158
166
  }
159
167
  // Build result summary
@@ -221,6 +229,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
221
229
  mode,
222
230
  promptfooUrls: options.promptfooUrls,
223
231
  rootDir: ctx.config.rootDir,
232
+ runId: ctx.runId,
224
233
  sanityDocumentIds,
225
234
  source,
226
235
  sourceReportId: ctx.config.sourceReportId,
@@ -236,30 +245,6 @@ function slimTestResult(tr) {
236
245
  const { responseOutput: _o, responseOutputTruncated: _t, ...rest } = tr;
237
246
  return rest;
238
247
  }
239
- /**
240
- * Extract test outputs from StoredTestResult[] and upload as a single
241
- * JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
242
- * to match the lookup pattern in Studio's JudgmentList component.
243
- *
244
- * Non-blocking: returns null if upload fails (P5).
245
- */
246
- async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
247
- const entries = {};
248
- for (const tr of testResults) {
249
- const key = `${tr.taskId}::${tr.modelId}`;
250
- entries[key] = {
251
- responseOutput: tr.responseOutput ?? "",
252
- responseOutputTruncated: tr.responseOutputTruncated ?? false,
253
- };
254
- }
255
- const artifact = {
256
- version: 1,
257
- reportId,
258
- createdAt,
259
- entries,
260
- };
261
- return uploader.upload(reportId, "test-outputs.json", artifact);
262
- }
263
248
  /**
264
249
  * Fan out a report to all configured sinks.
265
250
  *
@@ -4,7 +4,7 @@
4
4
  * Calls pure functions from pipeline/readiness-report.ts directly.
5
5
  * Optional step — failure doesn't stop the pipeline.
6
6
  */
7
- import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
7
+ import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
8
8
  export declare class ReadinessStep implements PipelineStep {
9
9
  readonly name = "readiness";
10
10
  readonly optional = true;
@@ -7,6 +7,8 @@
7
7
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
8
8
  import { resolve } from "path";
9
9
  import { tryLoadConfigFile } from "../../pipeline/compiler/config-loader.js";
10
+ import { assoc, } from "../../_vendor/ailf-core/index.js";
11
+ import { emitFileContents } from "../../artifact-capture/emit-file.js";
10
12
  import { formatReadinessMarkdown, generateReadinessReport, } from "../../pipeline/readiness-report.js";
11
13
  import { ThresholdConfigSchema } from "../../pipeline/schemas.js";
12
14
  export class ReadinessStep {
@@ -65,7 +67,8 @@ export class ReadinessStep {
65
67
  mkdirSync(ctx.config.outputDir, { recursive: true });
66
68
  const readinessPath = resolve(ctx.config.outputDir, "readiness-report.md");
67
69
  writeFileSync(readinessPath, readinessLines.join("\n---\n\n"));
68
- ctx.collector.captureFile("readiness", "readiness-report", readinessPath);
70
+ // W0050 — readinessReport is run-scoped bulk markdown.
71
+ await emitFileContents(ctx.artifactWriter, "readinessReport", assoc(ctx), readinessPath);
69
72
  }
70
73
  const passCount = readinessAreas.filter((area) => {
71
74
  const areaScore = scoreSummary.scores.find((s) => s.feature === area);
@@ -4,7 +4,7 @@
4
4
  * Calls generatePrComment() from pipeline/pr-comment.ts with typed options.
5
5
  * No env bridge or process.argv manipulation needed.
6
6
  */
7
- import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
7
+ import { type AppContext, type PipelineStep, type StepResult, type ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
8
8
  export declare class ReportStep implements PipelineStep {
9
9
  readonly name = "report";
10
10
  check(): ValidationIssue[];
@@ -6,6 +6,8 @@
6
6
  */
7
7
  import { existsSync, mkdirSync } from "node:fs";
8
8
  import { dirname, resolve } from "path";
9
+ import { assoc, } from "../../_vendor/ailf-core/index.js";
10
+ import { emitFileContents } from "../../artifact-capture/emit-file.js";
9
11
  import { checkScoreSummaryValid } from "../../pipeline/checks.js";
10
12
  import { generatePrComment } from "../../pipeline/pr-comment.js";
11
13
  export class ReportStep {
@@ -45,13 +47,14 @@ export class ReportStep {
45
47
  status: "failed",
46
48
  };
47
49
  }
48
- // Capture report artifacts
50
+ // W0050 captureFile → emitFileContents. Both are run-scoped bulk
51
+ // artifacts; the writer handles redaction + excluded-types gating.
49
52
  if (existsSync(resolvedOutput)) {
50
- ctx.collector.captureFile("report", "pr-comment", resolvedOutput);
53
+ await emitFileContents(ctx.artifactWriter, "prComment", assoc(ctx), resolvedOutput);
51
54
  }
52
55
  const pipelineResultPath = resolve(ctx.config.outputDir, "pipeline-result.json");
53
56
  if (existsSync(pipelineResultPath)) {
54
- ctx.collector.captureFile("report", "pipeline-result", pipelineResultPath);
57
+ await emitFileContents(ctx.artifactWriter, "pipelineResult", assoc(ctx), pipelineResultPath);
55
58
  }
56
59
  return {
57
60
  durationMs: Date.now() - start,
@@ -7,6 +7,7 @@
7
7
  */
8
8
  import { existsSync, mkdirSync, writeFileSync } from "fs";
9
9
  import { resolve } from "path";
10
+ import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
10
11
  import { getStepInputPaths } from "../../pipeline/cache.js";
11
12
  import { buildCacheContext } from "../cache-context.js";
12
13
  import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
@@ -118,11 +119,11 @@ export class RunEvalStep {
118
119
  state.promptfooUrls ??= [];
119
120
  state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
120
121
  }
121
- // Capture the restored score-summary from remote cache
122
- const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
123
- if (existsSync(cachedSummaryPath)) {
124
- ctx.collector.captureFile("run-eval", "score-summary-cached", cachedSummaryPath, { source: "remote-cache", mode: this.mode });
125
- }
122
+ // W0050 score-summary-cached was an unregistered capture;
123
+ // scoreSummary is already emitted by calculate-scores-step on the
124
+ // non-cached path, which also runs when we have a remote cache hit
125
+ // (state.remoteCacheHits is populated, yet CalculateScoresStep is still
126
+ // invoked for the score-summary emit). Dropped here.
126
127
  return {
127
128
  durationMs: Date.now() - start,
128
129
  status: "success",
@@ -187,12 +188,16 @@ export class RunEvalStep {
187
188
  console.log();
188
189
  console.log(errorSummary);
189
190
  }
190
- // Capture eval results
191
+ // W0050 decompose the promptfoo aggregate into the per-entry
192
+ // descriptors the W0049 registry expects: rawResults / renderedPrompts
193
+ // per (run, mode, task, model); graderPrompts / graderJudgments per
194
+ // (run, mode, task, model, grader). See pipeline/emit-eval-results.ts.
195
+ // `testOutputs` still flows through uploadTestOutputs() in
196
+ // calculate-scores-step. `traces` ships via agent-observer (out of
197
+ // scope for the promptfoo shape parser — follow-up).
191
198
  const resultsPath = resolve(rootDir, resultsFileForMode(this.mode));
192
199
  if (existsSync(resultsPath)) {
193
- ctx.collector.captureFile("run-eval", `eval-results-${this.mode}`, resultsPath, {
194
- mode: this.mode,
195
- });
200
+ await emitPerEntryEvalResults(ctx.artifactWriter, ctx, this.mode, resultsPath);
196
201
  }
197
202
  // Extract Promptfoo share URL from eval results (Step 3b)
198
203
  if (ctx.evalRunner.extractShareUrl) {
@@ -157,8 +157,19 @@ export function extractGraderJudgments(resultsPath) {
157
157
  }
158
158
  return judgments;
159
159
  }
160
- /** Maximum characters to store for model response output */
161
- const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
160
+ /**
161
+ * Maximum characters (JS string length, not bytes) to store for model
162
+ * response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
163
+ * pathological multi-byte UTF-8 could encode to ~4 MB, still well within
164
+ * per-entry GCS object limits.
165
+ *
166
+ * Raised from 8 000 to 1 000 000 in W0048 because the per-entry artifact
167
+ * layout (D0032) makes the cap irrelevant to Studio's fetch cost — each
168
+ * entry is fetched independently on click, so a larger ceiling only costs
169
+ * GCS bytes, not main-thread blocking or baseline report payload.
170
+ * `responseOutputTruncated` still flips for the extreme tail.
171
+ */
172
+ const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
162
173
  /**
163
174
  * Extract per-test results with model output from evaluation results.
164
175
  *
@@ -15,7 +15,7 @@
15
15
  * @see docs/ideas/evaluation-roadmap.md — BP5: Make comparison a primitive
16
16
  * @see docs/ideas/metrics-design.md — Tier 4: Comparison results
17
17
  */
18
- import { type ChangeClass, type CompareOptions, type ComparisonReport, type ScoreSummary } from "./types.js";
18
+ import { type ChangeClass, type ComparableSummary, type CompareOptions, type ComparisonReport } from "./types.js";
19
19
  /** Classify a delta as improved, regressed, or unchanged given a threshold */
20
20
  export declare function classifyChange(delta: number, threshold: number): ChangeClass;
21
21
  /**
@@ -28,4 +28,4 @@ export declare function classifyChange(delta: number, threshold: number): Change
28
28
  * @param options Optional configuration (noise threshold, etc.)
29
29
  * @returns A ComparisonReport with deltas, classifications, and breakdowns
30
30
  */
31
- export declare function compare(baseline: ScoreSummary, experiment: ScoreSummary, options?: CompareOptions): ComparisonReport;
31
+ export declare function compare(baseline: ComparableSummary, experiment: ComparableSummary, options?: CompareOptions): ComparisonReport;
@@ -0,0 +1,38 @@
1
+ /**
2
+ * emit-eval-results.ts — decompose the promptfoo results file into the
3
+ * per-entry descriptors that W0049's registry expects.
4
+ *
5
+ * Replaces the Phase-B-stopgap "route the aggregated JSON through the
6
+ * deprecated `evalResults` bulk descriptor" path. For each test in the
7
+ * promptfoo output we emit:
8
+ *
9
+ * - `rawResults` per (run, mode, task, model) — the full result
10
+ * - `renderedPrompts` per (run, mode, task, model) — prompt the model saw
11
+ * - `graderPrompts` per (run, mode, task, model, grader) — rubric text
12
+ * - `graderJudgments` per (run, mode, task, model, grader) — {score, reason, pass}
13
+ *
14
+ * `testOutputs` is still emitted separately by `calculate-scores-step`
15
+ * via `uploadTestOutputs()` (carried forward from W0048 for byte-
16
+ * equivalence with the original rollout).
17
+ *
18
+ * `traces` is NOT produced here — agentic trace data flows through the
19
+ * agent-observer, not through the promptfoo result shape. Traces
20
+ * emission is out of scope for this helper and lands when the observer
21
+ * integration migrates (follow-up; not in W0050).
22
+ *
23
+ * The "grader" axis value is the rubric dimension string produced by
24
+ * `classifyRubric` (e.g. "task-completion", "code-correctness"). Non-
25
+ * LLM-rubric component assertions (javascript, contains, etc.) don't
26
+ * have a natural grader identifier and are skipped — their outcomes
27
+ * still live inside the full `rawResults` object.
28
+ */
29
+ import { type ArtifactWriter, type RunId } from "../_vendor/ailf-core/index.d.ts";
30
+ /**
31
+ * Parse a promptfoo results file and emit the per-entry artifacts.
32
+ *
33
+ * Non-blocking: any individual emit failure warns but does not halt.
34
+ * File read/parse errors are caught and logged; the caller keeps going.
35
+ */
36
+ export declare function emitPerEntryEvalResults(writer: ArtifactWriter, ctx: {
37
+ runId: RunId;
38
+ }, mode: string, resultsPath: string): Promise<void>;