@sanity/ailf 2.3.3 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. package/dist/_vendor/ailf-core/index.d.ts +1 -0
  2. package/dist/_vendor/ailf-core/index.js +1 -0
  3. package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +35 -0
  4. package/dist/_vendor/ailf-core/ports/artifact-uploader.js +18 -0
  5. package/dist/_vendor/ailf-core/ports/context.d.ts +9 -0
  6. package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
  7. package/dist/_vendor/ailf-core/ports/index.js +1 -0
  8. package/dist/_vendor/ailf-core/types/index.d.ts +70 -0
  9. package/dist/_vendor/ailf-core/types/scoring-input.d.ts +2 -0
  10. package/dist/adapters/api-client/build-request.js +33 -11
  11. package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +41 -0
  12. package/dist/artifact-capture/api-gateway-artifact-uploader.js +123 -0
  13. package/dist/artifact-capture/gcs-collector.d.ts +55 -0
  14. package/dist/artifact-capture/gcs-collector.js +117 -0
  15. package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +31 -0
  16. package/dist/artifact-capture/gcs-report-artifact-uploader.js +66 -0
  17. package/dist/cli.js +2 -0
  18. package/dist/commands/pipeline-action.js +3 -0
  19. package/dist/composition-root.d.ts +10 -1
  20. package/dist/composition-root.js +56 -5
  21. package/dist/orchestration/build-app-context.js +3 -0
  22. package/dist/orchestration/steps/calculate-scores-step.js +5 -1
  23. package/dist/orchestration/steps/gap-analysis-step.js +15 -0
  24. package/dist/orchestration/steps/publish-report-step.js +31 -0
  25. package/dist/pipeline/calculate-scores.d.ts +12 -2
  26. package/dist/pipeline/calculate-scores.js +88 -0
  27. package/dist/report-store.js +28 -2
  28. package/package.json +2 -1
@@ -19,3 +19,4 @@ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePrici
19
19
  export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
20
20
  export { env } from "./env-helper.js";
21
21
  export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
22
+ export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";
@@ -21,3 +21,4 @@ export * from "./examples/index.js";
21
21
  export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
22
22
  export { env } from "./env-helper.js";
23
23
  export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
24
+ export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Port: ArtifactUploader — uploads report artifacts to external object storage.
3
+ *
4
+ * Separate from ArtifactCollector (which captures forensic archives).
5
+ * This port puts structured files at known paths so Studio can fetch
6
+ * them on demand via signed URLs.
7
+ *
8
+ * @see docs/design-docs/external-artifact-store.md
9
+ * @see docs/decisions/D0030-external-artifact-store.md
10
+ */
11
+ import type { ArtifactRef } from "../types/index.js";
12
+ /**
13
+ * Uploads report artifacts to external storage.
14
+ *
15
+ * Implementations:
16
+ * - GcsReportArtifactUploader (packages/eval) — uploads to GCS
17
+ * - NoOpArtifactUploader (below) — returns null (no-op when GCS is not configured)
18
+ */
19
+ export interface ArtifactUploader {
20
+ /**
21
+ * Upload a JSON artifact for a report.
22
+ *
23
+ * @param reportId - Report identifier (used as the GCS path prefix)
24
+ * @param fileName - File name within the report prefix (e.g., "test-outputs.json")
25
+ * @param data - Serializable data (will be JSON.stringify'd)
26
+ * @returns ArtifactRef on success, null if upload is skipped or fails
27
+ */
28
+ upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
29
+ }
30
+ /**
31
+ * No-op uploader — always returns null. Used when GCS is not configured.
32
+ */
33
+ export declare class NoOpArtifactUploader implements ArtifactUploader {
34
+ upload(): Promise<null>;
35
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Port: ArtifactUploader — uploads report artifacts to external object storage.
3
+ *
4
+ * Separate from ArtifactCollector (which captures forensic archives).
5
+ * This port puts structured files at known paths so Studio can fetch
6
+ * them on demand via signed URLs.
7
+ *
8
+ * @see docs/design-docs/external-artifact-store.md
9
+ * @see docs/decisions/D0030-external-artifact-store.md
10
+ */
11
+ /**
12
+ * No-op uploader — always returns null. Used when GCS is not configured.
13
+ */
14
+ export class NoOpArtifactUploader {
15
+ async upload() {
16
+ return null;
17
+ }
18
+ }
@@ -13,6 +13,7 @@
13
13
  */
14
14
  import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
15
15
  import type { ArtifactCollector } from "./artifact-collector.js";
16
+ import type { ArtifactUploader } from "./artifact-uploader.js";
16
17
  import type { CacheStore } from "./cache-store.js";
17
18
  import type { DocFetcher } from "./doc-fetcher.js";
18
19
  import type { EvalRunner } from "./eval-runner.js";
@@ -159,6 +160,12 @@ export interface ResolvedConfig {
159
160
  captureCompress?: boolean;
160
161
  /** Whether to include mode-specific extra artifacts (default: true) */
161
162
  captureExtras?: boolean;
163
+ /** GCS bucket for capture upload (enables GCS decorator when set) */
164
+ captureGcsBucket?: string;
165
+ /** GCS object prefix for capture uploads (default: "captures/") */
166
+ captureGcsPrefix?: string;
167
+ /** GCS bucket for report artifact uploads — enables ArtifactUploader (D0030) */
168
+ artifactGcsBucket?: string;
162
169
  }
163
170
  /**
164
171
  * Application context — the complete dependency carrier.
@@ -173,6 +180,8 @@ export interface ResolvedConfig {
173
180
  * Created per-test by createTestContext().
174
181
  */
175
182
  export interface AppContext {
183
+ /** Report artifact uploader — uploads structured files to GCS for Studio (D0030) */
184
+ readonly artifactUploader?: ArtifactUploader;
176
185
  /** Evaluation caching (filesystem + optional Content Lake fallback) */
177
186
  readonly cache?: CacheStore;
178
187
  /** Artifact capture collector (no-op when --capture is not set) */
@@ -5,6 +5,8 @@
5
5
  * Adapters (in packages/eval) implement these interfaces.
6
6
  */
7
7
  export type { ArtifactCollector, ArtifactManifest, ArtifactManifestEntry, CaptureFlushResult, } from "./artifact-collector.js";
8
+ export type { ArtifactUploader } from "./artifact-uploader.js";
9
+ export { NoOpArtifactUploader } from "./artifact-uploader.js";
8
10
  export type { ArtifactContentDiff, CaptureDiffReport, ComparisonMode, ComparisonOptions, InventoryDiff, JsonDiffEntry, MetadataComparison, ScoreComparison, SecurityScan, TimingComparison, } from "./capture-comparator.js";
9
11
  export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
10
12
  export type { ConfigSource } from "./config-source.js";
@@ -4,4 +4,5 @@
4
4
  * Ports define the contracts between the domain kernel and the outside world.
5
5
  * Adapters (in packages/eval) implement these interfaces.
6
6
  */
7
+ export { NoOpArtifactUploader } from "./artifact-uploader.js";
7
8
  export { canonicalDocRefLabel, isIdRef, isPathRef, isPerspectiveRef, isSlugRef, isTemplatedAssertion, } from "./task-source.js";
@@ -275,6 +275,55 @@ export interface StoredJudgment extends GraderJudgment {
275
275
  /** Canonical docs that the task expected the model to use */
276
276
  canonicalDocs?: DocumentRef[];
277
277
  }
278
+ /**
279
+ * Per-test result stored in reports for drill-down and audit.
280
+ *
281
+ * Captures the model's response output, grader reasoning per dimension,
282
+ * and response metadata. One entry per test × model combination.
283
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
284
+ */
285
+ export interface StoredTestResult {
286
+ /** Resolved feature area (from __featureArea or description) */
287
+ area: string;
288
+ /** Canonical docs the task expected the model to use */
289
+ canonicalDocs?: DocumentRef[];
290
+ /** Weighted composite score (gold variant only) */
291
+ compositeScore?: number;
292
+ /** Per-test cost (USD) */
293
+ cost?: number;
294
+ /** Per-dimension grader scores and reasoning */
295
+ dimensions: {
296
+ /** Rubric dimension: task-completion, code-correctness, doc-coverage */
297
+ dimension: string;
298
+ /** Grader's natural language reasoning */
299
+ reason: string;
300
+ /** Numeric score (0–100, normalized) */
301
+ score: number;
302
+ }[];
303
+ /** Response latency in milliseconds */
304
+ latencyMs?: number;
305
+ /** Model that produced the response */
306
+ modelId: string;
307
+ /**
308
+ * True when the model failed to produce meaningful output (empty response,
309
+ * API error, or refusal). Same semantics as GraderJudgment.outputFailure.
310
+ */
311
+ outputFailure?: boolean;
312
+ /** The model's generated code/response (truncated to 8000 chars) */
313
+ responseOutput: string;
314
+ /** True when responseOutput was truncated from a longer response */
315
+ responseOutputTruncated?: boolean;
316
+ /** Task description (e.g. "Functions - Webhook handler (gold)") */
317
+ taskId: string;
318
+ /** Token usage breakdown */
319
+ tokenUsage?: {
320
+ completion: number;
321
+ prompt: number;
322
+ total: number;
323
+ };
324
+ /** "gold" (with docs) or "baseline" (without docs) */
325
+ variant: "baseline" | "gold";
326
+ }
278
327
  /** Grader consistency diagnostics — does not affect scores, reported alongside */
279
328
  export interface GraderReliability {
280
329
  /** Inter-grader agreement (from multi-grader comparison) — Phase 3 */
@@ -776,6 +825,12 @@ export interface ScoreSummary {
776
825
  lowScoringJudgments?: StoredJudgment[];
777
826
  /** Gap analysis recommendations (Phase 3b) — prioritized remediation plan */
778
827
  recommendations?: GapAnalysisReport;
828
+ /**
829
+ * Per-test results with model output, grader reasoning, and metadata.
830
+ * One entry per test × model combination. Populated during gap-analysis
831
+ * enrichment from test-results.json. See D0029.
832
+ */
833
+ testResults?: StoredTestResult[];
779
834
  /** Grader reliability diagnostics (does not affect scores) */
780
835
  graderReliability?: GraderReliability;
781
836
  lowestArea: string;
@@ -1102,8 +1157,23 @@ export interface PublishResult {
1102
1157
  result: SinkResult;
1103
1158
  }[];
1104
1159
  }
1160
+ /** Reference to an artifact in external object storage (GCS). See D0030. */
1161
+ export interface ArtifactRef {
1162
+ store: "gcs";
1163
+ bucket: string;
1164
+ path: string;
1165
+ bytes?: number;
1166
+ entryCount?: number;
1167
+ }
1105
1168
  /** A published evaluation report — the atomic unit of the report store */
1106
1169
  export interface Report {
1170
+ /** External artifact references — set by publish step when uploader is available (D0030) */
1171
+ artifacts?: {
1172
+ testOutputs?: ArtifactRef;
1173
+ renderedPrompts?: ArtifactRef;
1174
+ rawResults?: ArtifactRef;
1175
+ traces?: ArtifactRef;
1176
+ };
1107
1177
  /** Optional auto-comparison against the most recent comparable report */
1108
1178
  comparison?: ComparisonReport;
1109
1179
  /** When the evaluation completed */
@@ -26,6 +26,8 @@ export interface TestResult {
26
26
  componentResults: ComponentResult[];
27
27
  pass: boolean;
28
28
  };
29
+ /** Per-test latency in ms (propagated from Promptfoo when available) */
30
+ latencyMs?: number;
29
31
  metadata?: Record<string, unknown>;
30
32
  /** Provider identifier (e.g., "openai:gpt-4o") */
31
33
  providerId?: string;
@@ -15,8 +15,18 @@
15
15
  import { existsSync } from "fs";
16
16
  import { resolve } from "path";
17
17
  import { PipelineRequestSchema, } from "../../_vendor/ailf-core/index.js";
18
+ import { LEGACY_EVAL_MODE_ALIASES } from "../../_vendor/ailf-shared/index.js";
18
19
  import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
19
20
  import { RepoTaskSource } from "../task-sources/repo-task-source.js";
21
+ const LEGACY_LITERACY_VARIANT_SET = new Set(LEGACY_EVAL_MODE_ALIASES);
22
+ /**
23
+ * Resolve a raw `config.mode` (which may be a CLI literacy variant such as
24
+ * `"baseline"` or `"full"`) to the canonical task-level mode that appears on
25
+ * `GeneralizedTaskDefinition.mode`. Literacy variants all map to `"literacy"`.
26
+ */
27
+ function resolveCanonicalTaskMode(configMode) {
28
+ return LEGACY_LITERACY_VARIANT_SET.has(configMode) ? "literacy" : configMode;
29
+ }
20
30
  // ---------------------------------------------------------------------------
21
31
  // Public API
22
32
  // ---------------------------------------------------------------------------
@@ -33,10 +43,18 @@ import { RepoTaskSource } from "../task-sources/repo-task-source.js";
33
43
  */
34
44
  export async function buildRemoteRequest(options) {
35
45
  const { tasksDir, config } = options;
36
- // 1. Load and validate local tasks
46
+ // 1. Load and validate local tasks, filtered to the requested mode.
47
+ // `config.mode` may be a literacy variant (baseline/agentic/full/observed)
48
+ // — those all map to task mode "literacy". Other modes match 1:1.
37
49
  const taskSource = new RepoTaskSource(tasksDir);
38
50
  const filterOptions = buildFilterOptions(config);
39
- const tasks = (await taskSource.loadTasks(filterOptions)).filter((t) => t.mode === "literacy");
51
+ const allTasks = await taskSource.loadTasks(filterOptions);
52
+ const taskModeFilter = config.mode
53
+ ? resolveCanonicalTaskMode(config.mode)
54
+ : undefined;
55
+ const tasks = taskModeFilter
56
+ ? allTasks.filter((t) => t.mode === taskModeFilter)
57
+ : allTasks;
40
58
  if (tasks.length === 0) {
41
59
  throw new Error("No tasks found after applying filters.\n" +
42
60
  ` Tasks directory: ${tasksDir}\n` +
@@ -145,12 +163,13 @@ export function resolveTasksDir(rootDir, explicitPath) {
145
163
  // Helpers
146
164
  // ---------------------------------------------------------------------------
147
165
  /**
148
- * Convert a LiteracyTaskDefinition to the camelCase inline format expected
166
+ * Convert a GeneralizedTaskDefinition to the camelCase inline format expected
149
167
  * by the API.
150
168
  */
151
169
  function taskToInlineFormat(task) {
152
170
  const inline = {
153
171
  id: task.id,
172
+ mode: task.mode,
154
173
  description: task.title,
155
174
  featureArea: task.area ?? "",
156
175
  assert: task.assertions ?? [],
@@ -166,14 +185,17 @@ function taskToInlineFormat(task) {
166
185
  ...(task.prompt?.vars ?? {}),
167
186
  };
168
187
  }
169
- if (task.docCoverage) {
170
- inline.docCoverage = true;
171
- }
172
- if (task.referenceSolution) {
173
- inline.referenceSolution = task.referenceSolution;
174
- }
175
- if (task.baseline) {
176
- inline.baseline = task.baseline;
188
+ // Literacy-specific fields
189
+ if (task.mode === "literacy") {
190
+ if (task.docCoverage) {
191
+ inline.docCoverage = true;
192
+ }
193
+ if (task.referenceSolution) {
194
+ inline.referenceSolution = task.referenceSolution;
195
+ }
196
+ if (task.baseline) {
197
+ inline.baseline = task.baseline;
198
+ }
177
199
  }
178
200
  if (task.tags?.length) {
179
201
  inline.tags = task.tags;
@@ -0,0 +1,41 @@
1
+ /**
2
+ * ApiGatewayArtifactUploader — uploads report artifacts via the API Gateway.
3
+ *
4
+ * Counterpart to GcsReportArtifactUploader. Used when the CLI runs locally
5
+ * without GCS credentials. Two-step flow:
6
+ *
7
+ * 1. GET {apiBaseUrl}/v1/artifacts/{reportId}/upload-url?type={artifactType}
8
+ * with Authorization: Bearer {apiKey} — returns a signed PUT URL.
9
+ * 2. PUT the JSON to that URL with Content-Type: application/json and
10
+ * x-goog-if-generation-match: 0 (overwrite-protection contract from
11
+ * the gateway's signed URL).
12
+ *
13
+ * The gateway stays out of the data path — Vercel only signs the URL,
14
+ * the artifact bytes go directly to GCS.
15
+ *
16
+ * Design principles:
17
+ * - P5: Non-blocking — any failure returns null and warns, never throws.
18
+ * - Stateless — no client to keep around between calls.
19
+ *
20
+ * @see docs/design-docs/external-artifact-store.md
21
+ * @see docs/decisions/D0030-external-artifact-store.md
22
+ */
23
+ import type { ArtifactRef, ArtifactUploader } from "../_vendor/ailf-core/index.d.ts";
24
+ export interface ApiGatewayUploaderOptions {
25
+ /** Base URL of the API gateway (e.g., "https://api.ailf.sanity.io"). */
26
+ apiBaseUrl: string;
27
+ /** AILF API key with the `artifact:write` scope. */
28
+ apiKey: string;
29
+ /** GCS bucket name — included in the returned ArtifactRef. */
30
+ bucket: string;
31
+ }
32
+ export declare class ApiGatewayArtifactUploader implements ArtifactUploader {
33
+ private readonly options;
34
+ constructor(options: ApiGatewayUploaderOptions);
35
+ upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
36
+ /**
37
+ * Fetch a signed upload URL from the gateway. Returns null on any non-2xx
38
+ * response or malformed body so the caller can stay non-blocking.
39
+ */
40
+ private fetchSignedUrl;
41
+ }
@@ -0,0 +1,123 @@
1
+ /**
2
+ * ApiGatewayArtifactUploader — uploads report artifacts via the API Gateway.
3
+ *
4
+ * Counterpart to GcsReportArtifactUploader. Used when the CLI runs locally
5
+ * without GCS credentials. Two-step flow:
6
+ *
7
+ * 1. GET {apiBaseUrl}/v1/artifacts/{reportId}/upload-url?type={artifactType}
8
+ * with Authorization: Bearer {apiKey} — returns a signed PUT URL.
9
+ * 2. PUT the JSON to that URL with Content-Type: application/json and
10
+ * x-goog-if-generation-match: 0 (overwrite-protection contract from
11
+ * the gateway's signed URL).
12
+ *
13
+ * The gateway stays out of the data path — Vercel only signs the URL,
14
+ * the artifact bytes go directly to GCS.
15
+ *
16
+ * Design principles:
17
+ * - P5: Non-blocking — any failure returns null and warns, never throws.
18
+ * - Stateless — no client to keep around between calls.
19
+ *
20
+ * @see docs/design-docs/external-artifact-store.md
21
+ * @see docs/decisions/D0030-external-artifact-store.md
22
+ */
23
+ // ---------------------------------------------------------------------------
24
+ // File-name → artifact-type mapping (mirrors packages/api ARTIFACT_FILES)
25
+ // ---------------------------------------------------------------------------
26
+ /**
27
+ * Reverse map of the API gateway's ARTIFACT_FILES. The uploader port speaks
28
+ * file names; the gateway endpoint speaks artifact types. Keep these in sync
29
+ * with packages/api/src/routes/artifacts.ts.
30
+ */
31
+ const FILE_TO_TYPE = {
32
+ "eval-results.json": "evalResults",
33
+ "grader-prompts.json": "graderPrompts",
34
+ "rendered-prompts.json": "renderedPrompts",
35
+ "task-definitions.json": "taskDefinitions",
36
+ "test-outputs.json": "testOutputs",
37
+ };
38
+ export class ApiGatewayArtifactUploader {
39
+ options;
40
+ constructor(options) {
41
+ this.options = options;
42
+ }
43
+ async upload(reportId, fileName, data) {
44
+ const artifactType = FILE_TO_TYPE[fileName];
45
+ if (!artifactType) {
46
+ console.warn(` ⚠️ Artifact upload skipped (unknown fileName): ${fileName}`);
47
+ return null;
48
+ }
49
+ const objectPath = `reports/${reportId}/${fileName}`;
50
+ const json = JSON.stringify(data);
51
+ const bytes = Buffer.byteLength(json, "utf-8");
52
+ try {
53
+ const signed = await this.fetchSignedUrl(reportId, artifactType);
54
+ if (!signed)
55
+ return null;
56
+ const putRes = await fetch(signed.url, {
57
+ body: json,
58
+ headers: signed.requiredHeaders,
59
+ method: "PUT",
60
+ });
61
+ if (!putRes.ok) {
62
+ console.warn(` ⚠️ Artifact upload failed (non-blocking): ${objectPath} — GCS PUT ${putRes.status} ${putRes.statusText}`);
63
+ return null;
64
+ }
65
+ return {
66
+ bucket: signed.bucket,
67
+ bytes,
68
+ entryCount: extractEntryCount(data),
69
+ path: signed.path,
70
+ store: "gcs",
71
+ };
72
+ }
73
+ catch (err) {
74
+ const message = err instanceof Error ? err.message : String(err);
75
+ console.warn(` ⚠️ Artifact upload failed (non-blocking): ${objectPath} — ${message}`);
76
+ return null;
77
+ }
78
+ }
79
+ /**
80
+ * Fetch a signed upload URL from the gateway. Returns null on any non-2xx
81
+ * response or malformed body so the caller can stay non-blocking.
82
+ */
83
+ async fetchSignedUrl(reportId, artifactType) {
84
+ const url = `${this.options.apiBaseUrl.replace(/\/$/, "")}/v1/artifacts/${encodeURIComponent(reportId)}/upload-url?type=${encodeURIComponent(artifactType)}`;
85
+ const res = await fetch(url, {
86
+ headers: {
87
+ Authorization: `Bearer ${this.options.apiKey}`,
88
+ },
89
+ method: "GET",
90
+ });
91
+ if (!res.ok) {
92
+ console.warn(` ⚠️ Signed-URL request failed: ${res.status} ${res.statusText}`);
93
+ return null;
94
+ }
95
+ const body = (await res.json());
96
+ if (body.object !== "signed_upload_url" ||
97
+ typeof body.url !== "string" ||
98
+ typeof body.path !== "string" ||
99
+ typeof body.bucket !== "string" ||
100
+ !body.requiredHeaders) {
101
+ console.warn(` ⚠️ Signed-URL response was malformed`);
102
+ return null;
103
+ }
104
+ return {
105
+ bucket: body.bucket,
106
+ method: "PUT",
107
+ object: "signed_upload_url",
108
+ path: body.path,
109
+ requiredHeaders: body.requiredHeaders,
110
+ url: body.url,
111
+ };
112
+ }
113
+ }
114
+ function extractEntryCount(data) {
115
+ if (typeof data === "object" &&
116
+ data !== null &&
117
+ "entries" in data &&
118
+ typeof data.entries === "object") {
119
+ return Object.keys(data.entries)
120
+ .length;
121
+ }
122
+ return undefined;
123
+ }
@@ -0,0 +1,55 @@
1
+ /**
2
+ * GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
3
+ *
4
+ * Wraps the FilesystemArtifactCollector: local flush first (preserving
5
+ * the existing manifest + redaction logic), then upload to a GCS bucket.
6
+ *
7
+ * Design principles:
8
+ * - P5: Non-blocking — GCS upload failure should not block the pipeline.
9
+ * Local artifacts are always preserved.
10
+ * - Decorator pattern — delegates capture() and captureFile() to the inner
11
+ * collector unchanged. Only flush() adds the GCS upload step.
12
+ * - Lazy client — GCS Storage client is created on first flush(), not at
13
+ * construction (same pattern as BigQuerySink).
14
+ *
15
+ * @see docs/decisions/D0030-external-artifact-store.md
16
+ * @see docs/work-items/W0035-gcs-artifact-output.json
17
+ */
18
+ import type { ArtifactCollector, CaptureFlushResult } from "../_vendor/ailf-core/index.d.ts";
19
+ export interface GcsCollectorOptions {
20
+ /** GCS bucket name (e.g., "ailf-artifacts") */
21
+ bucket: string;
22
+ /** Object prefix in the bucket (e.g., "captures/") */
23
+ prefix?: string;
24
+ /** Path to service account credentials JSON (optional — falls back to ADC) */
25
+ credentials?: string;
26
+ }
27
+ export interface GcsFlushResult extends CaptureFlushResult {
28
+ /** GCS upload status */
29
+ gcs: {
30
+ status: "uploaded";
31
+ bucket: string;
32
+ path: string;
33
+ } | {
34
+ status: "skipped";
35
+ reason: string;
36
+ } | {
37
+ status: "failed";
38
+ error: string;
39
+ };
40
+ }
41
+ export declare class GcsArtifactCollector implements ArtifactCollector {
42
+ get enabled(): boolean;
43
+ get extrasEnabled(): boolean;
44
+ private client;
45
+ private readonly inner;
46
+ private readonly options;
47
+ constructor(inner: ArtifactCollector, options: GcsCollectorOptions);
48
+ capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
49
+ captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
50
+ flush(): Promise<GcsFlushResult>;
51
+ /** Lazily create the GCS Storage client. */
52
+ private getClient;
53
+ /** Upload the flushed artifact (tar.gz or directory) to GCS. */
54
+ private uploadToGcs;
55
+ }
@@ -0,0 +1,117 @@
1
+ /**
2
+ * GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
3
+ *
4
+ * Wraps the FilesystemArtifactCollector: local flush first (preserving
5
+ * the existing manifest + redaction logic), then upload to a GCS bucket.
6
+ *
7
+ * Design principles:
8
+ * - P5: Non-blocking — GCS upload failure should not block the pipeline.
9
+ * Local artifacts are always preserved.
10
+ * - Decorator pattern — delegates capture() and captureFile() to the inner
11
+ * collector unchanged. Only flush() adds the GCS upload step.
12
+ * - Lazy client — GCS Storage client is created on first flush(), not at
13
+ * construction (same pattern as BigQuerySink).
14
+ *
15
+ * @see docs/decisions/D0030-external-artifact-store.md
16
+ * @see docs/work-items/W0035-gcs-artifact-output.json
17
+ */
18
+ import { readFileSync } from "node:fs";
19
+ import { Storage } from "@google-cloud/storage";
20
+ // ---------------------------------------------------------------------------
21
+ // Collector
22
+ // ---------------------------------------------------------------------------
23
+ export class GcsArtifactCollector {
24
+ get enabled() {
25
+ return this.inner.enabled;
26
+ }
27
+ get extrasEnabled() {
28
+ return this.inner.extrasEnabled;
29
+ }
30
+ client = null;
31
+ inner;
32
+ options;
33
+ constructor(inner, options) {
34
+ this.inner = inner;
35
+ this.options = options;
36
+ }
37
+ capture(step, type, data, meta) {
38
+ this.inner.capture(step, type, data, meta);
39
+ }
40
+ captureFile(step, type, filePath, meta) {
41
+ this.inner.captureFile(step, type, filePath, meta);
42
+ }
43
+ async flush() {
44
+ // Step 1: Flush to local filesystem first (always succeeds or throws)
45
+ const localResult = await this.inner.flush();
46
+ // Step 2: Upload to GCS (non-blocking — P5)
47
+ if (localResult.artifactCount === 0) {
48
+ return {
49
+ ...localResult,
50
+ gcs: { status: "skipped", reason: "No artifacts to upload" },
51
+ };
52
+ }
53
+ try {
54
+ const gcsPath = await this.uploadToGcs(localResult);
55
+ return {
56
+ ...localResult,
57
+ gcs: {
58
+ status: "uploaded",
59
+ bucket: this.options.bucket,
60
+ path: gcsPath,
61
+ },
62
+ };
63
+ }
64
+ catch (err) {
65
+ const message = err instanceof Error ? err.message : String(err);
66
+ console.warn(` ⚠️ GCS upload failed (non-blocking): ${message}`);
67
+ return {
68
+ ...localResult,
69
+ gcs: { status: "failed", error: message },
70
+ };
71
+ }
72
+ }
73
+ // -----------------------------------------------------------------------
74
+ // Private helpers
75
+ // -----------------------------------------------------------------------
76
+ /** Lazily create the GCS Storage client. */
77
+ getClient() {
78
+ if (this.client)
79
+ return this.client;
80
+ this.client = this.options.credentials
81
+ ? new Storage({ keyFilename: this.options.credentials })
82
+ : new Storage();
83
+ return this.client;
84
+ }
85
+ /** Upload the flushed artifact (tar.gz or directory) to GCS. */
86
+ async uploadToGcs(result) {
87
+ const storage = this.getClient();
88
+ const bucket = storage.bucket(this.options.bucket);
89
+ const prefix = this.options.prefix ?? "captures/";
90
+ if (result.compressed) {
91
+ // Upload the tar.gz directly
92
+ const fileName = result.destination.split("/").pop() ?? "capture.tar.gz";
93
+ const gcsPath = `${prefix}${fileName}`;
94
+ const fileContent = readFileSync(result.destination);
95
+ await bucket.file(gcsPath).save(fileContent, {
96
+ contentType: "application/gzip",
97
+ metadata: {
98
+ artifactCount: String(result.artifactCount),
99
+ totalBytes: String(result.totalBytes),
100
+ },
101
+ });
102
+ return gcsPath;
103
+ }
104
+ // Uncompressed: upload the manifest.json as the representative file.
105
+ // The full directory could be uploaded file-by-file, but for the
106
+ // capture use case (forensic archive), the compressed bundle is the
107
+ // expected path. Upload just the manifest as a reference.
108
+ const manifestPath = `${result.destination}/manifest.json`;
109
+ const dirName = result.destination.split("/").pop() ?? "capture";
110
+ const gcsPath = `${prefix}${dirName}/manifest.json`;
111
+ const manifestContent = readFileSync(manifestPath, "utf-8");
112
+ await bucket.file(gcsPath).save(manifestContent, {
113
+ contentType: "application/json",
114
+ });
115
+ return gcsPath;
116
+ }
117
+ }
@@ -0,0 +1,31 @@
1
+ /**
2
+ * GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
3
+ *
4
+ * Separate from GcsArtifactCollector (which handles forensic capture archives).
5
+ * This uploader puts structured JSON files at predictable paths so the
6
+ * API Gateway can sign URLs and Studio can fetch them on demand.
7
+ *
8
+ * GCS path convention: reports/{reportId}/{fileName}
9
+ * Example: reports/01926abc.../test-outputs.json
10
+ *
11
+ * Design principles:
12
+ * - P5: Non-blocking — GCS upload failure returns null, never throws
13
+ * - Lazy client — Storage created on first upload, not at construction
14
+ * - Same credentials path as GcsArtifactCollector (ADC or key file)
15
+ *
16
+ * @see docs/design-docs/external-artifact-store.md
17
+ * @see docs/decisions/D0030-external-artifact-store.md
18
+ */
19
+ import type { ArtifactRef, ArtifactUploader } from "../_vendor/ailf-core/index.d.ts";
20
+ export interface GcsUploaderOptions {
21
+ /** GCS bucket name (e.g., "ailf-artifacts") */
22
+ bucket: string;
23
+ }
24
+ export declare class GcsReportArtifactUploader implements ArtifactUploader {
25
+ private client;
26
+ private readonly options;
27
+ constructor(options: GcsUploaderOptions);
28
+ upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
29
+ /** Lazily create the GCS Storage client (ADC). */
30
+ private getClient;
31
+ }
@@ -0,0 +1,66 @@
1
+ /**
2
+ * GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
3
+ *
4
+ * Separate from GcsArtifactCollector (which handles forensic capture archives).
5
+ * This uploader puts structured JSON files at predictable paths so the
6
+ * API Gateway can sign URLs and Studio can fetch them on demand.
7
+ *
8
+ * GCS path convention: reports/{reportId}/{fileName}
9
+ * Example: reports/01926abc.../test-outputs.json
10
+ *
11
+ * Design principles:
12
+ * - P5: Non-blocking — GCS upload failure returns null, never throws
13
+ * - Lazy client — Storage created on first upload, not at construction
14
+ * - Same credentials path as GcsArtifactCollector (ADC or key file)
15
+ *
16
+ * @see docs/design-docs/external-artifact-store.md
17
+ * @see docs/decisions/D0030-external-artifact-store.md
18
+ */
19
+ import { Storage } from "@google-cloud/storage";
20
+ export class GcsReportArtifactUploader {
21
+ client = null;
22
+ options;
23
+ constructor(options) {
24
+ this.options = options;
25
+ }
26
+ async upload(reportId, fileName, data) {
27
+ const objectPath = `reports/${reportId}/${fileName}`;
28
+ const json = JSON.stringify(data);
29
+ const bytes = Buffer.byteLength(json, "utf-8");
30
+ try {
31
+ const storage = this.getClient();
32
+ const file = storage.bucket(this.options.bucket).file(objectPath);
33
+ await file.save(json, {
34
+ contentType: "application/json",
35
+ metadata: {
36
+ reportId,
37
+ },
38
+ });
39
+ return {
40
+ store: "gcs",
41
+ bucket: this.options.bucket,
42
+ path: objectPath,
43
+ bytes,
44
+ entryCount: typeof data === "object" &&
45
+ data !== null &&
46
+ "entries" in data &&
47
+ typeof data.entries === "object"
48
+ ? Object.keys(data.entries)
49
+ .length
50
+ : undefined,
51
+ };
52
+ }
53
+ catch (err) {
54
+ const message = err instanceof Error ? err.message : String(err);
55
+ console.warn(` ⚠️ Artifact upload failed (non-blocking): ${objectPath} — ${message}`);
56
+ return null;
57
+ }
58
+ }
59
+ /** Lazily create the GCS Storage client (ADC). */
60
+ getClient() {
61
+ if (this.client)
62
+ return this.client;
63
+ this.client = new Storage();
64
+ return this.client;
65
+ }
66
+ }
package/dist/cli.js CHANGED
@@ -168,6 +168,8 @@ import { createCalculateScoresCommand } from "./commands/calculate-scores.js";
168
168
  program.addCommand(createCalculateScoresCommand().helpGroup(CommandGroup.PipelineInternals));
169
169
  import { createPrCommentCommand } from "./commands/pr-comment.js";
170
170
  program.addCommand(createPrCommentCommand().helpGroup(CommandGroup.PipelineInternals));
171
+ import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
172
+ program.addCommand(createGenerateConfigsCommand().helpGroup(CommandGroup.PipelineInternals));
171
173
  import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
172
174
  program.addCommand(createMeasureRetrievalCommand().helpGroup(CommandGroup.PipelineInternals));
173
175
  import { createLookupDocCommand } from "./commands/lookup-doc.js";
@@ -326,6 +326,9 @@ export async function executePipeline(cliOpts) {
326
326
  process.env.AILF_CAPTURE_COMPRESS !== "0";
327
327
  config.captureExtras =
328
328
  cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
329
+ config.captureGcsBucket ??= process.env.AILF_CAPTURE_GCS_BUCKET;
330
+ config.captureGcsPrefix ??= process.env.AILF_CAPTURE_GCS_PREFIX;
331
+ config.artifactGcsBucket ??= process.env.AILF_GCS_ARTIFACT_BUCKET;
329
332
  // Create AppContext directly from the merged config so adapters
330
333
  // (especially taskSource) are wired from the file config's
331
334
  // taskSourceType — not from CLI defaults.
@@ -15,7 +15,7 @@
15
15
  * @see packages/core/src/ports/context.ts — AppContext interface
16
16
  * @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
17
17
  */
18
- import { type AppContext, type AssertionRegistration, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
18
+ import { type AppContext, type ArtifactUploader, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
19
19
  /**
20
20
  * Create a fully wired AppContext from resolved configuration.
21
21
  *
@@ -23,6 +23,15 @@ import { type AppContext, type AssertionRegistration, type ResolvedConfig } from
23
23
  * Swapping an adapter is a one-line change in this function.
24
24
  */
25
25
  export declare function createAppContext(config: ResolvedConfig): AppContext;
26
+ /**
27
+ * Selects an ArtifactUploader implementation based on available credentials.
28
+ *
29
+ * Returns undefined when artifact upload is not configured — the publish
30
+ * step skips silently in that case (P5).
31
+ *
32
+ * Exported for unit-test access; not part of the public package API.
33
+ */
34
+ export declare function createArtifactUploader(config: ResolvedConfig, logger: Logger): ArtifactUploader | undefined;
26
35
  /**
27
36
  * Generic Promptfoo assertion types available to all evaluation modes.
28
37
  *
@@ -17,7 +17,10 @@
17
17
  */
18
18
  import { join } from "node:path";
19
19
  import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
20
+ import { ApiGatewayArtifactUploader } from "./artifact-capture/api-gateway-artifact-uploader.js";
20
21
  import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
22
+ import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
23
+ import { GcsReportArtifactUploader } from "./artifact-capture/gcs-report-artifact-uploader.js";
21
24
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
22
25
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
23
26
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
@@ -57,9 +60,11 @@ export function createAppContext(config) {
57
60
  const reportStore = createReportStore(config);
58
61
  // Sinks — loaded from config/sinks
59
62
  const sinks = loadSinks();
60
- // Artifact collector — no-op by default, filesystem when --capture is set
61
- const collector = config.captureEnabled
62
- ? new FilesystemArtifactCollector({
63
+ // Artifact collector — no-op by default, filesystem when --capture is set,
64
+ // GCS decorator when --capture-gcs-bucket is also provided (D0030/W0035)
65
+ let collector = new NoOpArtifactCollector();
66
+ if (config.captureEnabled) {
67
+ const fsCollector = new FilesystemArtifactCollector({
63
68
  captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
64
69
  mode: config.mode,
65
70
  compress: config.captureCompress ?? true,
@@ -69,9 +74,25 @@ export function createAppContext(config) {
69
74
  source: config.source,
70
75
  areas: config.areas,
71
76
  },
72
- })
73
- : new NoOpArtifactCollector();
77
+ });
78
+ collector = config.captureGcsBucket
79
+ ? new GcsArtifactCollector(fsCollector, {
80
+ bucket: config.captureGcsBucket,
81
+ prefix: config.captureGcsPrefix,
82
+ })
83
+ : fsCollector;
84
+ }
85
+ // Report artifact uploader — uploads structured files to GCS at known
86
+ // paths for Studio to fetch via signed URLs (D0030).
87
+ //
88
+ // Selection (W0042):
89
+ // 1. Direct GCS upload when ADC env vars are present (CI / GCP runtime)
90
+ // 2. API Gateway signed-URL upload when only an AILF API key is present
91
+ // (local dev — no GCS credentials needed)
92
+ // 3. Skipped silently when neither is configured
93
+ const artifactUploader = createArtifactUploader(config, logger);
74
94
  return {
95
+ artifactUploader,
75
96
  cache,
76
97
  collector,
77
98
  config,
@@ -99,6 +120,36 @@ function createLogger() {
99
120
  process.env.AILF_VERBOSE === "1",
100
121
  });
101
122
  }
123
/**
 * Pick the ArtifactUploader implementation that matches the credentials
 * available to this process.
 *
 * Resolution order:
 *   1. No `config.artifactGcsBucket`: returns undefined without logging.
 *   2. `GOOGLE_APPLICATION_CREDENTIALS` or `GCLOUD_PROJECT` set in the
 *      environment: direct GCS uploader.
 *   3. Both `config.apiKey` and `config.apiUrl` present: API gateway
 *      (signed URL) uploader.
 *   4. Otherwise: logs the reason at debug level and returns undefined.
 *
 * Exported for unit-test access; not part of the public package API.
 */
export function createArtifactUploader(config, logger) {
    const bucket = config.artifactGcsBucket;
    if (!bucket) {
        return undefined;
    }
    // Either variable being set is treated as the user opting in to ADC.
    const { GOOGLE_APPLICATION_CREDENTIALS, GCLOUD_PROJECT } = process.env;
    if (GOOGLE_APPLICATION_CREDENTIALS || GCLOUD_PROJECT) {
        logger.debug("Artifact uploader: GcsReportArtifactUploader (direct GCS via ADC)");
        return new GcsReportArtifactUploader({ bucket });
    }
    const { apiKey, apiUrl } = config;
    if (!apiKey || !apiUrl) {
        logger.debug("Artifact upload skipped: AILF_GCS_ARTIFACT_BUCKET set but no GCS credentials or AILF_API_KEY available");
        return undefined;
    }
    // No GCS credentials needed on this path: the gateway issues signed PUT URLs.
    logger.debug(`Artifact uploader: ApiGatewayArtifactUploader (signed URL via ${apiUrl})`);
    return new ApiGatewayArtifactUploader({
        apiBaseUrl: apiUrl,
        apiKey,
        bucket,
    });
}
102
153
  function createCache(config) {
103
154
  const local = new FilesystemCache(config.rootDir);
104
155
  if (config.noRemoteCache)
@@ -82,6 +82,9 @@ export function mapToResolvedConfig(opts, rootDir) {
82
82
  captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
83
83
  captureCompress: opts.captureCompress ?? true,
84
84
  captureExtras: opts.captureExtras ?? true,
85
+ captureGcsBucket: process.env.AILF_CAPTURE_GCS_BUCKET,
86
+ captureGcsPrefix: process.env.AILF_CAPTURE_GCS_PREFIX,
87
+ artifactGcsBucket: process.env.AILF_GCS_ARTIFACT_BUCKET,
85
88
  };
86
89
  }
87
90
  /**
@@ -122,7 +122,11 @@ export class CalculateScoresStep {
122
122
  }
123
123
  // Capture score artifacts
124
124
  const resultsDir = join(ctx.config.rootDir, "results", "latest");
125
- for (const file of ["score-summary.json", "grader-judgments.json"]) {
125
+ for (const file of [
126
+ "score-summary.json",
127
+ "grader-judgments.json",
128
+ "test-results.json",
129
+ ]) {
126
130
  const filePath = join(resultsDir, file);
127
131
  if (existsSync(filePath)) {
128
132
  ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
@@ -151,6 +151,20 @@ export class GapAnalysisStep {
151
151
  documents: areaToDocRefs.get(s.feature),
152
152
  }));
153
153
  }
154
+ // ── Per-test results (D0029: model output + metadata) ──────
155
+ const testResultsPath = resolve(root, "results", "latest", "test-results.json");
156
+ let testResults;
157
+ if (existsSync(testResultsPath)) {
158
+ const rawTestResults = JSON.parse(readFileSync(testResultsPath, "utf-8"));
159
+ // Enrich with canonical docs (literacy mode only)
160
+ testResults = rawTestResults.map((tr) => {
161
+ if (!isLiteracyMode)
162
+ return tr;
163
+ const baseDesc = tr.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
164
+ const canonicalDocs = descToDocRefs.get(baseDesc);
165
+ return canonicalDocs ? { ...tr, canonicalDocs } : tr;
166
+ });
167
+ }
154
168
  // ── Low-scoring judgments ────────────────────────────────────
155
169
  const LOW_SCORE_THRESHOLD = 70;
156
170
  const MAX_STORED_JUDGMENTS = 50;
@@ -177,6 +191,7 @@ export class GapAnalysisStep {
177
191
  lowScoringJudgments,
178
192
  recommendations: gapReport,
179
193
  scores: enrichedScores,
194
+ ...(testResults !== undefined && { testResults }),
180
195
  };
181
196
  writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
182
197
  // Capture gap analysis artifacts
@@ -113,6 +113,13 @@ export class PublishReportStep {
113
113
  tag: this.options.publishTag ?? ctx.config.publishTag,
114
114
  title,
115
115
  };
116
+ // Upload test output artifacts to GCS (D0030 — non-blocking, P5)
117
+ if (ctx.artifactUploader && summary.testResults?.length) {
118
+ const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
119
+ if (artifactRef) {
120
+ report.artifacts = { testOutputs: artifactRef };
121
+ }
122
+ }
116
123
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
117
124
  state.reportId = reportId;
118
125
  // Capture report object (Tier 2)
@@ -211,6 +218,30 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
211
218
  taskIds,
212
219
  };
213
220
  }
221
+ /**
222
+ * Extract test outputs from StoredTestResult[] and upload as a single
223
+ * JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
224
+ * to match the lookup pattern in Studio's JudgmentList component.
225
+ *
226
+ * Non-blocking: returns null if upload fails (P5).
227
+ */
228
+ async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
229
+ const entries = {};
230
+ for (const tr of testResults) {
231
+ const key = `${tr.taskId}::${tr.modelId}`;
232
+ entries[key] = {
233
+ responseOutput: tr.responseOutput,
234
+ responseOutputTruncated: tr.responseOutputTruncated ?? false,
235
+ };
236
+ }
237
+ const artifact = {
238
+ version: 1,
239
+ reportId,
240
+ createdAt,
241
+ entries,
242
+ };
243
+ return uploader.upload(reportId, "test-outputs.json", artifact);
244
+ }
214
245
  /**
215
246
  * Fan out a report to all configured sinks.
216
247
  *
@@ -1,7 +1,7 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
3
  import type { GraderJudgment, PerModelEntry } from "./types.js";
4
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
4
+ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
5
5
  export interface PromptfooResultsWrapper {
6
6
  results: RawTestResult[];
7
7
  stats: {
@@ -75,6 +75,16 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
75
75
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
76
76
  */
77
77
  export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
78
+ /**
79
+ * Extract per-test results with model output from evaluation results.
80
+ *
81
+ * Mirrors extractGraderJudgments() but captures the full StoredTestResult
82
+ * shape including response.output (truncated), latency, and cost.
83
+ * One StoredTestResult per test × model combination.
84
+ *
85
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
86
+ */
87
+ export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
78
88
  /**
79
89
  * Score agentic evaluation results. In agentic mode, all test entries are
80
90
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
@@ -157,6 +157,76 @@ export function extractGraderJudgments(resultsPath) {
157
157
  }
158
158
  return judgments;
159
159
  }
160
/** Maximum characters to store for model response output */
const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
/**
 * Extract per-test results with model output from evaluation results.
 *
 * Produces one record per entry returned by `readAndNormalizeResults`,
 * containing the feature area, cost, per-dimension rubric scores with their
 * reasons, latency, model id, the (possibly truncated) response output,
 * task id, and the gold/baseline variant.
 *
 * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
 *
 * @param resultsPath - Path to the evaluation results file.
 * @returns One stored test result per normalized result entry.
 */
export function extractStoredTestResults(resultsPath) {
    const results = readAndNormalizeResults(resultsPath);
    const testResults = [];
    for (const result of results) {
        const taskId = result.description;
        const modelId = result.providerId ?? result.providerLabel ?? "unknown";
        const area = result.vars.__featureArea || detectFeatureArea(result.description);
        // A non-blank `docs` variable marks the gold variant; anything else is baseline.
        const docs = result.vars.docs ?? "";
        const variant = docs.trim().length > 0 ? "gold" : "baseline";
        // The response output is not guaranteed to be a string (a provider
        // may return structured data); serialize it so the string operations
        // below cannot throw.
        // NOTE(review): JSON is assumed to be an acceptable stored form for
        // structured output — confirm with consumers of test-results.json.
        const rawOutput = result.response?.output ?? "";
        const output = typeof rawOutput === "string"
            ? rawOutput
            : (JSON.stringify(rawOutput) ?? "");
        // Blank output counts as an output failure.
        const isOutputFailure = !output.trim();
        // Truncate response output
        const responseOutput = output.slice(0, MAX_RESPONSE_OUTPUT_LENGTH);
        const responseOutputTruncated = output.length > MAX_RESPONSE_OUTPUT_LENGTH;
        // Extract per-dimension scores and reasons. A result without grading
        // data contributes no dimensions instead of aborting the extraction.
        const dimensions = [];
        for (const comp of result.gradingResult?.componentResults ?? []) {
            // Only llm-rubric assertions that map to a known dimension are kept.
            if (comp.assertion?.type !== "llm-rubric") {
                continue;
            }
            const dimension = classifyRubric(comp);
            if (!dimension) {
                continue;
            }
            const score = parseRubricScore(comp);
            // The reason may be a JSON document wrapping the actual text in a
            // `reason` field; unwrap it when it is, otherwise keep it raw.
            let reason = comp.reason ?? "";
            if (reason) {
                try {
                    const parsed = JSON.parse(reason);
                    if (parsed && typeof parsed.reason === "string") {
                        reason = parsed.reason;
                    }
                }
                catch {
                    // Not JSON — use raw reason string
                }
            }
            dimensions.push({ dimension, reason, score });
        }
        // Optional flags are only emitted when true; a falsy cost is stored as undefined.
        testResults.push({
            area,
            cost: result.cost || undefined,
            dimensions,
            latencyMs: result.latencyMs,
            modelId,
            ...(isOutputFailure && { outputFailure: true }),
            responseOutput,
            ...(responseOutputTruncated && { responseOutputTruncated: true }),
            taskId,
            variant,
        });
    }
    return testResults;
}
160
230
  /**
161
231
  * Finds the URL-extraction assertion result in a test's componentResults
162
232
  * and parses the structured JSON from its `reason` field.
@@ -470,6 +540,7 @@ function readAndNormalizeResults(resultsPath, log) {
470
540
  const base = {
471
541
  cost: r.cost ?? 0,
472
542
  description: r.testCase?.description ?? "unknown",
543
+ latencyMs: r.latencyMs,
473
544
  metadata: r.metadata,
474
545
  providerId: r.provider?.id,
475
546
  providerLabel: r.provider?.label,
@@ -800,6 +871,12 @@ export function calculateAndWriteScores(options) {
800
871
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
801
872
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
802
873
  }
874
+ // Extract and persist per-test results (D0029: model output + metadata)
875
+ const testResults = extractStoredTestResults(baselineResultsPath);
876
+ if (testResults.length > 0) {
877
+ writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
878
+ log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
879
+ }
803
880
  const testSummary = computeTestSummary(baselineResultsPath);
804
881
  return { belowCritical: summary.belowCritical, testSummary };
805
882
  }
@@ -904,6 +981,17 @@ export function calculateAndWriteScores(options) {
904
981
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
905
982
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
906
983
  }
984
+ // Extract and persist per-test results (D0029: model output + metadata)
985
+ const testResults = extractStoredTestResults(baselineResultsPath);
986
+ // In full mode, also extract test results from agentic results
987
+ if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
988
+ const agenticTestResults = extractStoredTestResults(agenticResultsPath);
989
+ testResults.push(...agenticTestResults);
990
+ }
991
+ if (testResults.length > 0) {
992
+ writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
993
+ log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
994
+ }
907
995
  // Compute test summary from the raw results file
908
996
  const testSummary = computeTestSummary(baselineResultsPath);
909
997
  return { belowCritical: summary.belowCritical, testSummary };
@@ -193,15 +193,27 @@ export class ReportStore {
193
193
  */
194
194
  async write(report) {
195
195
  try {
196
+ // Strip baseline and experiment ScoreSummary objects from comparison
197
+ // before persisting — they duplicate report.summary (experiment) and
198
+ // are fetchable by ID via provenance.lineage.comparedAgainst (baseline).
199
+ // This reduces document size by ~50-65% for full-mode reports.
200
+ const comparison = report.comparison
201
+ ? stripComparisonBulk(report.comparison)
202
+ : null;
196
203
  await this.client.create({
197
204
  _id: `report-${report.id}`,
198
205
  _type: REPORT_TYPE,
199
- comparison: report.comparison ?? null,
206
+ comparison,
200
207
  completedAt: report.completedAt,
201
208
  durationMs: report.durationMs,
202
209
  provenance: report.provenance,
203
210
  reportId: report.id,
204
- summary: report.summary,
211
+ summary: {
212
+ ...report.summary,
213
+ // Artifact references live inside summary in Sanity so they're
214
+ // projected automatically by the reportDetailQuery (D0030)
215
+ ...(report.artifacts ? { artifacts: report.artifacts } : {}),
216
+ },
205
217
  tag: report.tag ?? null,
206
218
  title: report.title ?? null,
207
219
  });
@@ -283,3 +295,17 @@ function toReport(doc) {
283
295
  title: doc.title,
284
296
  };
285
297
  }
298
/**
 * Produce a slim, shallow copy of a ComparisonReport without its
 * `baseline` and `experiment` ScoreSummary objects.
 *
 * Both are redundant in the persisted document:
 *   - `experiment` duplicates `report.summary`
 *   - `baseline` can be fetched via `provenance.lineage.comparedAgainst`
 *
 * All other fields (deltas, areas, classifications) are carried over
 * unchanged, and the input object is not modified.
 */
function stripComparisonBulk(comparison) {
    const slim = { ...comparison };
    delete slim.baseline;
    delete slim.experiment;
    return slim;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "2.3.3",
3
+ "version": "2.5.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -33,6 +33,7 @@
33
33
  ],
34
34
  "dependencies": {
35
35
  "@google-cloud/bigquery": "^8.1.1",
36
+ "@google-cloud/storage": "^7.19.0",
36
37
  "@inquirer/prompts": "^8.3.0",
37
38
  "@modelcontextprotocol/sdk": "^1.29.0",
38
39
  "@portabletext/markdown": "^1.0.0",