@sanity/ailf 2.3.2 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,3 +19,4 @@ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePrici
19
19
  export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
20
20
  export { env } from "./env-helper.js";
21
21
  export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
22
+ export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";
@@ -21,3 +21,4 @@ export * from "./examples/index.js";
21
21
  export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
22
22
  export { env } from "./env-helper.js";
23
23
  export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
24
+ export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Port: ArtifactUploader — uploads report artifacts to external object storage.
3
+ *
4
+ * Separate from ArtifactCollector (which captures forensic archives).
5
+ * This port puts structured files at known paths so Studio can fetch
6
+ * them on demand via signed URLs.
7
+ *
8
+ * @see docs/design-docs/external-artifact-store.md
9
+ * @see docs/decisions/D0030-external-artifact-store.md
10
+ */
11
+ import type { ArtifactRef } from "../types/index.js";
12
+ /**
13
+ * Uploads report artifacts to external storage.
14
+ *
15
+ * Implementations:
16
+ * - GcsReportArtifactUploader (packages/eval) — uploads to GCS
17
+ * - NoOpArtifactUploader (below) — returns null (no-op when GCS is not configured)
18
+ */
19
+ export interface ArtifactUploader {
20
+ /**
21
+ * Upload a JSON artifact for a report.
22
+ *
23
+ * @param reportId - Report identifier (used as the GCS path prefix)
24
+ * @param fileName - File name within the report prefix (e.g., "test-outputs.json")
25
+ * @param data - Serializable data (will be JSON.stringify'd)
26
+ * @returns ArtifactRef on success, null if upload is skipped or fails
27
+ */
28
+ upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
29
+ }
30
+ /**
31
+ * No-op uploader — always returns null. Used when GCS is not configured.
32
+ */
33
+ export declare class NoOpArtifactUploader implements ArtifactUploader {
34
+ upload(): Promise<null>;
35
+ }
@@ -0,0 +1,18 @@
1
+ /**
2
+ * Port: ArtifactUploader — uploads report artifacts to external object storage.
3
+ *
4
+ * Separate from ArtifactCollector (which captures forensic archives).
5
+ * This port puts structured files at known paths so Studio can fetch
6
+ * them on demand via signed URLs.
7
+ *
8
+ * @see docs/design-docs/external-artifact-store.md
9
+ * @see docs/decisions/D0030-external-artifact-store.md
10
+ */
11
/**
 * Null-object implementation of the artifact uploader port.
 *
 * Performs no upload at all and always yields `null`. Used when GCS is not
 * configured.
 */
export class NoOpArtifactUploader {
    /** Ignores its inputs and resolves to `null`. */
    upload() {
        return Promise.resolve(null);
    }
}
@@ -13,6 +13,7 @@
13
13
  */
14
14
  import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
15
15
  import type { ArtifactCollector } from "./artifact-collector.js";
16
+ import type { ArtifactUploader } from "./artifact-uploader.js";
16
17
  import type { CacheStore } from "./cache-store.js";
17
18
  import type { DocFetcher } from "./doc-fetcher.js";
18
19
  import type { EvalRunner } from "./eval-runner.js";
@@ -159,6 +160,12 @@ export interface ResolvedConfig {
159
160
  captureCompress?: boolean;
160
161
  /** Whether to include mode-specific extra artifacts (default: true) */
161
162
  captureExtras?: boolean;
163
+ /** GCS bucket for capture upload (enables GCS decorator when set) */
164
+ captureGcsBucket?: string;
165
+ /** GCS object prefix for capture uploads (default: "captures/") */
166
+ captureGcsPrefix?: string;
167
+ /** GCS bucket for report artifact uploads — enables ArtifactUploader (D0030) */
168
+ artifactGcsBucket?: string;
162
169
  }
163
170
  /**
164
171
  * Application context — the complete dependency carrier.
@@ -173,6 +180,8 @@ export interface ResolvedConfig {
173
180
  * Created per-test by createTestContext().
174
181
  */
175
182
  export interface AppContext {
183
+ /** Report artifact uploader — uploads structured files to GCS for Studio (D0030) */
184
+ readonly artifactUploader?: ArtifactUploader;
176
185
  /** Evaluation caching (filesystem + optional Content Lake fallback) */
177
186
  readonly cache?: CacheStore;
178
187
  /** Artifact capture collector (no-op when --capture is not set) */
@@ -5,6 +5,8 @@
5
5
  * Adapters (in packages/eval) implement these interfaces.
6
6
  */
7
7
  export type { ArtifactCollector, ArtifactManifest, ArtifactManifestEntry, CaptureFlushResult, } from "./artifact-collector.js";
8
+ export type { ArtifactUploader } from "./artifact-uploader.js";
9
+ export { NoOpArtifactUploader } from "./artifact-uploader.js";
8
10
  export type { ArtifactContentDiff, CaptureDiffReport, ComparisonMode, ComparisonOptions, InventoryDiff, JsonDiffEntry, MetadataComparison, ScoreComparison, SecurityScan, TimingComparison, } from "./capture-comparator.js";
9
11
  export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
10
12
  export type { ConfigSource } from "./config-source.js";
@@ -4,4 +4,5 @@
4
4
  * Ports define the contracts between the domain kernel and the outside world.
5
5
  * Adapters (in packages/eval) implement these interfaces.
6
6
  */
7
+ export { NoOpArtifactUploader } from "./artifact-uploader.js";
7
8
  export { canonicalDocRefLabel, isIdRef, isPathRef, isPerspectiveRef, isSlugRef, isTemplatedAssertion, } from "./task-source.js";
@@ -256,6 +256,13 @@ export interface GraderJudgment {
256
256
  dimension: string;
257
257
  /** The model that produced the response being graded */
258
258
  modelId: string;
259
+ /**
260
+ * True when the model failed to produce meaningful output (empty response,
261
+ * API error, or refusal). Distinguishes infrastructure failures from
262
+ * genuinely incorrect responses — a score of 0 from no output is
263
+ * fundamentally different from a score of 0 from wrong output.
264
+ */
265
+ outputFailure?: boolean;
259
266
  /** The grader's natural language reasoning */
260
267
  reason: string;
261
268
  /** The numeric score (0–100) */
@@ -268,6 +275,55 @@ export interface StoredJudgment extends GraderJudgment {
268
275
  /** Canonical docs that the task expected the model to use */
269
276
  canonicalDocs?: DocumentRef[];
270
277
  }
278
+ /**
279
+ * Per-test result stored in reports for drill-down and audit.
280
+ *
281
+ * Captures the model's response output, grader reasoning per dimension,
282
+ * and response metadata. One entry per test × model combination.
283
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
284
+ */
285
+ export interface StoredTestResult {
286
+ /** Resolved feature area (from __featureArea or description) */
287
+ area: string;
288
+ /** Canonical docs the task expected the model to use */
289
+ canonicalDocs?: DocumentRef[];
290
+ /** Weighted composite score (gold variant only) */
291
+ compositeScore?: number;
292
+ /** Per-test cost (USD) */
293
+ cost?: number;
294
+ /** Per-dimension grader scores and reasoning */
295
+ dimensions: {
296
+ /** Rubric dimension: task-completion, code-correctness, doc-coverage */
297
+ dimension: string;
298
+ /** Grader's natural language reasoning */
299
+ reason: string;
300
+ /** Numeric score (0–100, normalized) */
301
+ score: number;
302
+ }[];
303
+ /** Response latency in milliseconds */
304
+ latencyMs?: number;
305
+ /** Model that produced the response */
306
+ modelId: string;
307
+ /**
308
+ * True when the model failed to produce meaningful output (empty response,
309
+ * API error, or refusal). Same semantics as GraderJudgment.outputFailure.
310
+ */
311
+ outputFailure?: boolean;
312
+ /** The model's generated code/response (truncated to 8000 chars) */
313
+ responseOutput: string;
314
+ /** True when responseOutput was truncated from a longer response */
315
+ responseOutputTruncated?: boolean;
316
+ /** Task description (e.g. "Functions - Webhook handler (gold)") */
317
+ taskId: string;
318
+ /** Token usage breakdown */
319
+ tokenUsage?: {
320
+ completion: number;
321
+ prompt: number;
322
+ total: number;
323
+ };
324
+ /** "gold" (with docs) or "baseline" (without docs) */
325
+ variant: "baseline" | "gold";
326
+ }
271
327
  /** Grader consistency diagnostics — does not affect scores, reported alongside */
272
328
  export interface GraderReliability {
273
329
  /** Inter-grader agreement (from multi-grader comparison) — Phase 3 */
@@ -769,6 +825,12 @@ export interface ScoreSummary {
769
825
  lowScoringJudgments?: StoredJudgment[];
770
826
  /** Gap analysis recommendations (Phase 3b) — prioritized remediation plan */
771
827
  recommendations?: GapAnalysisReport;
828
+ /**
829
+ * Per-test results with model output, grader reasoning, and metadata.
830
+ * One entry per test × model combination. Populated during gap-analysis
831
+ * enrichment from test-results.json. See D0029.
832
+ */
833
+ testResults?: StoredTestResult[];
772
834
  /** Grader reliability diagnostics (does not affect scores) */
773
835
  graderReliability?: GraderReliability;
774
836
  lowestArea: string;
@@ -1095,8 +1157,23 @@ export interface PublishResult {
1095
1157
  result: SinkResult;
1096
1158
  }[];
1097
1159
  }
1160
+ /** Reference to an artifact in external object storage (GCS). See D0030. */
1161
+ export interface ArtifactRef { // Pointer to one object held in external storage
1162
+ store: "gcs"; // Storage backend discriminator; "gcs" is the only value declared
1163
+ bucket: string; // Name of the bucket that holds the object
1164
+ path: string; // Object path inside the bucket (e.g. "reports/{reportId}/{fileName}")
1165
+ bytes?: number; // UTF-8 byte length of the serialized payload, when the uploader recorded it
1166
+ entryCount?: number; // Number of keys in the payload's `entries` object, when it has one
1167
+ }
1098
1168
  /** A published evaluation report — the atomic unit of the report store */
1099
1169
  export interface Report {
1170
+ /** External artifact references — set by publish step when uploader is available (D0030) */
1171
+ artifacts?: {
1172
+ testOutputs?: ArtifactRef;
1173
+ renderedPrompts?: ArtifactRef;
1174
+ rawResults?: ArtifactRef;
1175
+ traces?: ArtifactRef;
1176
+ };
1100
1177
  /** Optional auto-comparison against the most recent comparable report */
1101
1178
  comparison?: ComparisonReport;
1102
1179
  /** When the evaluation completed */
@@ -26,6 +26,8 @@ export interface TestResult {
26
26
  componentResults: ComponentResult[];
27
27
  pass: boolean;
28
28
  };
29
+ /** Per-test latency in ms (propagated from Promptfoo when available) */
30
+ latencyMs?: number;
29
31
  metadata?: Record<string, unknown>;
30
32
  /** Provider identifier (e.g., "openai:gpt-4o") */
31
33
  providerId?: string;
@@ -0,0 +1,55 @@
1
+ /**
2
+ * GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
3
+ *
4
+ * Wraps the FilesystemArtifactCollector: local flush first (preserving
5
+ * the existing manifest + redaction logic), then upload to a GCS bucket.
6
+ *
7
+ * Design principles:
8
+ * - P5: Non-blocking — GCS upload failure should not block the pipeline.
9
+ * Local artifacts are always preserved.
10
+ * - Decorator pattern — delegates capture() and captureFile() to the inner
11
+ * collector unchanged. Only flush() adds the GCS upload step.
12
+ * - Lazy client — GCS Storage client is created on first flush(), not at
13
+ * construction (same pattern as BigQuerySink).
14
+ *
15
+ * @see docs/decisions/D0030-external-artifact-store.md
16
+ * @see docs/work-items/W0035-gcs-artifact-output.json
17
+ */
18
+ import type { ArtifactCollector, CaptureFlushResult } from "../_vendor/ailf-core/index.d.ts";
19
+ export interface GcsCollectorOptions {
20
+ /** GCS bucket name (e.g., "ailf-artifacts") */
21
+ bucket: string;
22
+ /** Object prefix in the bucket (e.g., "captures/") */
23
+ prefix?: string;
24
+ /** Path to service account credentials JSON (optional — falls back to ADC) */
25
+ credentials?: string;
26
+ }
27
+ export interface GcsFlushResult extends CaptureFlushResult {
28
+ /** GCS upload status */
29
+ gcs: {
30
+ status: "uploaded";
31
+ bucket: string;
32
+ path: string;
33
+ } | {
34
+ status: "skipped";
35
+ reason: string;
36
+ } | {
37
+ status: "failed";
38
+ error: string;
39
+ };
40
+ }
41
+ export declare class GcsArtifactCollector implements ArtifactCollector { // Decorator: wraps another collector and adds a GCS upload to flush()
42
+ get enabled(): boolean; // Mirrors the wrapped collector's `enabled`
43
+ get extrasEnabled(): boolean; // Mirrors the wrapped collector's `extrasEnabled`
44
+ private client; // Cached GCS Storage client; created on first use
45
+ private readonly inner; // The wrapped collector that does the local capture and flush
46
+ private readonly options; // Bucket, optional prefix, optional credentials file
47
+ constructor(inner: ArtifactCollector, options: GcsCollectorOptions);
48
+ capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void; // Delegated unchanged to the wrapped collector
49
+ captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void; // Delegated unchanged to the wrapped collector
50
+ flush(): Promise<GcsFlushResult>; // Local flush first, then upload; an upload error is reported as gcs.status "failed" rather than thrown
51
+ /** Lazily create the GCS Storage client. */
52
+ private getClient;
53
+ /** Upload the flushed artifact to GCS: the tar.gz when compressed, otherwise only the directory's manifest.json. */
54
+ private uploadToGcs;
55
+ }
@@ -0,0 +1,117 @@
1
+ /**
2
+ * GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
3
+ *
4
+ * Wraps the FilesystemArtifactCollector: local flush first (preserving
5
+ * the existing manifest + redaction logic), then upload to a GCS bucket.
6
+ *
7
+ * Design principles:
8
+ * - P5: Non-blocking — GCS upload failure should not block the pipeline.
9
+ * Local artifacts are always preserved.
10
+ * - Decorator pattern — delegates capture() and captureFile() to the inner
11
+ * collector unchanged. Only flush() adds the GCS upload step.
12
+ * - Lazy client — GCS Storage client is created on first flush(), not at
13
+ * construction (same pattern as BigQuerySink).
14
+ *
15
+ * @see docs/decisions/D0030-external-artifact-store.md
16
+ * @see docs/work-items/W0035-gcs-artifact-output.json
17
+ */
18
+ import { readFileSync } from "node:fs";
19
+ import { Storage } from "@google-cloud/storage";
20
+ // ---------------------------------------------------------------------------
21
+ // Collector
22
+ // ---------------------------------------------------------------------------
23
/**
 * Return the final segment of a filesystem path, ignoring trailing
 * separators. Both "/" and "\" count as separators so that destinations
 * produced on Windows also yield a clean object name.
 */
function lastPathSegment(fsPath) {
    const trimmed = fsPath.replace(/[\\/]+$/, "");
    return trimmed.split(/[\\/]/).pop() ?? "";
}
/**
 * Decorator around another artifact collector that additionally uploads the
 * flushed capture to a GCS bucket.
 *
 * capture() and captureFile() are forwarded unchanged. flush() first flushes
 * the wrapped collector, then attempts the upload; an upload error is logged
 * and reported in the result instead of being thrown.
 */
export class GcsArtifactCollector {
    /** Mirrors the wrapped collector's `enabled` flag. */
    get enabled() {
        return this.inner.enabled;
    }
    /** Mirrors the wrapped collector's `extrasEnabled` flag. */
    get extrasEnabled() {
        return this.inner.extrasEnabled;
    }
    client = null;
    inner;
    options;
    /**
     * @param inner - Collector that performs the local capture and flush
     * @param options - Target bucket, optional object prefix, optional key file
     */
    constructor(inner, options) {
        this.inner = inner;
        this.options = options;
    }
    /** Forward an in-memory artifact to the wrapped collector. */
    capture(step, type, data, meta) {
        this.inner.capture(step, type, data, meta);
    }
    /** Forward a file artifact to the wrapped collector. */
    captureFile(step, type, filePath, meta) {
        this.inner.captureFile(step, type, filePath, meta);
    }
    /**
     * Flush the wrapped collector, then upload the result to GCS.
     *
     * @returns The local flush result extended with a `gcs` status:
     *   "skipped" when nothing was captured, "uploaded" with bucket and
     *   path on success, or "failed" with the error message.
     * @throws Only if the wrapped collector's flush() throws.
     */
    async flush() {
        // Step 1: local flush — errors here propagate to the caller.
        const localResult = await this.inner.flush();
        // Step 2: upload (non-blocking — P5).
        if (localResult.artifactCount === 0) {
            return {
                ...localResult,
                gcs: { status: "skipped", reason: "No artifacts to upload" },
            };
        }
        try {
            const gcsPath = await this.uploadToGcs(localResult);
            return {
                ...localResult,
                gcs: {
                    status: "uploaded",
                    bucket: this.options.bucket,
                    path: gcsPath,
                },
            };
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ GCS upload failed (non-blocking): ${message}`);
            return {
                ...localResult,
                gcs: { status: "failed", error: message },
            };
        }
    }
    // -----------------------------------------------------------------------
    // Private helpers
    // -----------------------------------------------------------------------
    /** Lazily create the GCS Storage client (key file when configured, otherwise library defaults). */
    getClient() {
        if (this.client)
            return this.client;
        this.client = this.options.credentials
            ? new Storage({ keyFilename: this.options.credentials })
            : new Storage();
        return this.client;
    }
    /**
     * Upload the flushed capture and return the object path that was written.
     *
     * Compressed captures upload the archive itself. Uncompressed captures
     * upload only `manifest.json` as a reference to the local directory.
     */
    async uploadToGcs(result) {
        const storage = this.getClient();
        const bucket = storage.bucket(this.options.bucket);
        const prefix = this.options.prefix ?? "captures/";
        if (result.compressed) {
            // `||` (not `??`): an empty segment must also fall back to the default name.
            const fileName = lastPathSegment(result.destination) || "capture.tar.gz";
            const gcsPath = `${prefix}${fileName}`;
            const fileContent = readFileSync(result.destination);
            await bucket.file(gcsPath).save(fileContent, {
                contentType: "application/gzip",
                metadata: {
                    artifactCount: String(result.artifactCount),
                    totalBytes: String(result.totalBytes),
                },
            });
            return gcsPath;
        }
        // Uncompressed: the compressed bundle is the expected path for the
        // capture use case, so only the manifest is uploaded as a reference.
        const manifestPath = `${result.destination}/manifest.json`;
        const dirName = lastPathSegment(result.destination) || "capture";
        const gcsPath = `${prefix}${dirName}/manifest.json`;
        const manifestContent = readFileSync(manifestPath, "utf-8");
        await bucket.file(gcsPath).save(manifestContent, {
            contentType: "application/json",
        });
        return gcsPath;
    }
}
@@ -0,0 +1,31 @@
1
+ /**
2
+ * GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
3
+ *
4
+ * Separate from GcsArtifactCollector (which handles forensic capture archives).
5
+ * This uploader puts structured JSON files at predictable paths so the
6
+ * API Gateway can sign URLs and Studio can fetch them on demand.
7
+ *
8
+ * GCS path convention: reports/{reportId}/{fileName}
9
+ * Example: reports/01926abc.../test-outputs.json
10
+ *
11
+ * Design principles:
12
+ * - P5: Non-blocking — GCS upload failure returns null, never throws
13
+ * - Lazy client — Storage created on first upload, not at construction
14
+ * - Credentials: always the Storage client's default credentials (ADC); unlike GcsArtifactCollector, no key-file option is accepted
15
+ *
16
+ * @see docs/design-docs/external-artifact-store.md
17
+ * @see docs/decisions/D0030-external-artifact-store.md
18
+ */
19
+ import type { ArtifactRef, ArtifactUploader } from "../_vendor/ailf-core/index.d.ts";
20
+ export interface GcsUploaderOptions {
21
+ /** GCS bucket name (e.g., "ailf-artifacts") */
22
+ bucket: string;
23
+ }
24
+ export declare class GcsReportArtifactUploader implements ArtifactUploader {
25
+ private client;
26
+ private readonly options;
27
+ constructor(options: GcsUploaderOptions);
28
+ upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
29
+ /** Lazily create the GCS Storage client (ADC). */
30
+ private getClient;
31
+ }
@@ -0,0 +1,66 @@
1
+ /**
2
+ * GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
3
+ *
4
+ * Separate from GcsArtifactCollector (which handles forensic capture archives).
5
+ * This uploader puts structured JSON files at predictable paths so the
6
+ * API Gateway can sign URLs and Studio can fetch them on demand.
7
+ *
8
+ * GCS path convention: reports/{reportId}/{fileName}
9
+ * Example: reports/01926abc.../test-outputs.json
10
+ *
11
+ * Design principles:
12
+ * - P5: Non-blocking — GCS upload failure returns null, never throws
13
+ * - Lazy client — Storage created on first upload, not at construction
14
+ * - Credentials: always the Storage client's default credentials (ADC); unlike GcsArtifactCollector, no key-file option is accepted
15
+ *
16
+ * @see docs/design-docs/external-artifact-store.md
17
+ * @see docs/decisions/D0030-external-artifact-store.md
18
+ */
19
+ import { Storage } from "@google-cloud/storage";
20
/**
 * Uploads report artifacts as JSON objects to
 * `reports/{reportId}/{fileName}` in the configured GCS bucket.
 *
 * Non-blocking by contract: any failure — serialization or upload — is
 * logged with console.warn and reported to the caller as `null`.
 */
export class GcsReportArtifactUploader {
    client = null;
    options;
    /**
     * @param options - Uploader options; `bucket` is the target GCS bucket name
     */
    constructor(options) {
        this.options = options;
    }
    /**
     * Serialize `data` to JSON and save it at `reports/{reportId}/{fileName}`.
     *
     * @param reportId - Report identifier, used as the object path prefix
     * @param fileName - Object name under the report prefix
     * @param data - Value to serialize with JSON.stringify
     * @returns A reference to the stored object, or `null` when the data
     *   cannot be serialized or the upload fails. Never rejects.
     */
    async upload(reportId, fileName, data) {
        const objectPath = `reports/${reportId}/${fileName}`;
        try {
            // Serialization sits inside the try block: JSON.stringify throws on
            // circular structures and BigInt, and must not escape as a rejection.
            const json = JSON.stringify(data);
            if (typeof json !== "string") {
                // JSON.stringify returns undefined for undefined/functions/symbols.
                throw new TypeError("data is not JSON-serializable");
            }
            const bytes = Buffer.byteLength(json, "utf-8");
            const storage = this.getClient();
            const file = storage.bucket(this.options.bucket).file(objectPath);
            await file.save(json, {
                contentType: "application/json",
                metadata: {
                    reportId,
                },
            });
            // `typeof null === "object"`, so null is excluded explicitly;
            // otherwise a successful upload would be reported as a failure.
            const entries = typeof data === "object" && data !== null && "entries" in data
                ? data.entries
                : undefined;
            const entryCount = typeof entries === "object" && entries !== null
                ? Object.keys(entries).length
                : undefined;
            return {
                store: "gcs",
                bucket: this.options.bucket,
                path: objectPath,
                bytes,
                entryCount,
            };
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ Artifact upload failed (non-blocking): ${objectPath} — ${message}`);
            return null;
        }
    }
    /** Lazily create the GCS Storage client (ADC). */
    getClient() {
        if (this.client)
            return this.client;
        this.client = new Storage();
        return this.client;
    }
}
package/dist/cli.js CHANGED
@@ -168,6 +168,8 @@ import { createCalculateScoresCommand } from "./commands/calculate-scores.js";
168
168
  program.addCommand(createCalculateScoresCommand().helpGroup(CommandGroup.PipelineInternals));
169
169
  import { createPrCommentCommand } from "./commands/pr-comment.js";
170
170
  program.addCommand(createPrCommentCommand().helpGroup(CommandGroup.PipelineInternals));
171
+ import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
172
+ program.addCommand(createGenerateConfigsCommand().helpGroup(CommandGroup.PipelineInternals));
171
173
  import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
172
174
  program.addCommand(createMeasureRetrievalCommand().helpGroup(CommandGroup.PipelineInternals));
173
175
  import { createLookupDocCommand } from "./commands/lookup-doc.js";
@@ -326,6 +326,9 @@ export async function executePipeline(cliOpts) {
326
326
  process.env.AILF_CAPTURE_COMPRESS !== "0";
327
327
  config.captureExtras =
328
328
  cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
329
+ config.captureGcsBucket ??= process.env.AILF_CAPTURE_GCS_BUCKET;
330
+ config.captureGcsPrefix ??= process.env.AILF_CAPTURE_GCS_PREFIX;
331
+ config.artifactGcsBucket ??= process.env.AILF_GCS_ARTIFACT_BUCKET;
329
332
  // Create AppContext directly from the merged config so adapters
330
333
  // (especially taskSource) are wired from the file config's
331
334
  // taskSourceType — not from CLI defaults.
@@ -18,6 +18,8 @@
18
18
  import { join } from "node:path";
19
19
  import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
20
20
  import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
21
+ import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
22
+ import { GcsReportArtifactUploader } from "./artifact-capture/gcs-report-artifact-uploader.js";
21
23
  import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
22
24
  import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
23
25
  import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
@@ -57,9 +59,11 @@ export function createAppContext(config) {
57
59
  const reportStore = createReportStore(config);
58
60
  // Sinks — loaded from config/sinks
59
61
  const sinks = loadSinks();
60
- // Artifact collector — no-op by default, filesystem when --capture is set
61
- const collector = config.captureEnabled
62
- ? new FilesystemArtifactCollector({
62
+ // Artifact collector — no-op by default, filesystem when --capture is set,
63
+ // GCS decorator when --capture-gcs-bucket is also provided (D0030/W0035)
64
+ let collector = new NoOpArtifactCollector();
65
+ if (config.captureEnabled) {
66
+ const fsCollector = new FilesystemArtifactCollector({
63
67
  captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
64
68
  mode: config.mode,
65
69
  compress: config.captureCompress ?? true,
@@ -69,9 +73,21 @@ export function createAppContext(config) {
69
73
  source: config.source,
70
74
  areas: config.areas,
71
75
  },
72
- })
73
- : new NoOpArtifactCollector();
76
+ });
77
+ collector = config.captureGcsBucket
78
+ ? new GcsArtifactCollector(fsCollector, {
79
+ bucket: config.captureGcsBucket,
80
+ prefix: config.captureGcsPrefix,
81
+ })
82
+ : fsCollector;
83
+ }
84
+ // Report artifact uploader — uploads structured files to GCS at known
85
+ // paths for Studio to fetch via signed URLs (D0030)
86
+ const artifactUploader = config.artifactGcsBucket
87
+ ? new GcsReportArtifactUploader({ bucket: config.artifactGcsBucket })
88
+ : undefined;
74
89
  return {
90
+ artifactUploader,
75
91
  cache,
76
92
  collector,
77
93
  config,
@@ -82,6 +82,9 @@ export function mapToResolvedConfig(opts, rootDir) {
82
82
  captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
83
83
  captureCompress: opts.captureCompress ?? true,
84
84
  captureExtras: opts.captureExtras ?? true,
85
+ captureGcsBucket: process.env.AILF_CAPTURE_GCS_BUCKET,
86
+ captureGcsPrefix: process.env.AILF_CAPTURE_GCS_PREFIX,
87
+ artifactGcsBucket: process.env.AILF_GCS_ARTIFACT_BUCKET,
85
88
  };
86
89
  }
87
90
  /**
@@ -122,7 +122,11 @@ export class CalculateScoresStep {
122
122
  }
123
123
  // Capture score artifacts
124
124
  const resultsDir = join(ctx.config.rootDir, "results", "latest");
125
- for (const file of ["score-summary.json", "grader-judgments.json"]) {
125
+ for (const file of [
126
+ "score-summary.json",
127
+ "grader-judgments.json",
128
+ "test-results.json",
129
+ ]) {
126
130
  const filePath = join(resultsDir, file);
127
131
  if (existsSync(filePath)) {
128
132
  ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
@@ -151,6 +151,20 @@ export class GapAnalysisStep {
151
151
  documents: areaToDocRefs.get(s.feature),
152
152
  }));
153
153
  }
154
+ // ── Per-test results (D0029: model output + metadata) ──────
155
+ const testResultsPath = resolve(root, "results", "latest", "test-results.json");
156
+ let testResults;
157
+ if (existsSync(testResultsPath)) {
158
+ const rawTestResults = JSON.parse(readFileSync(testResultsPath, "utf-8"));
159
+ // Enrich with canonical docs (literacy mode only)
160
+ testResults = rawTestResults.map((tr) => {
161
+ if (!isLiteracyMode)
162
+ return tr;
163
+ const baseDesc = tr.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
164
+ const canonicalDocs = descToDocRefs.get(baseDesc);
165
+ return canonicalDocs ? { ...tr, canonicalDocs } : tr;
166
+ });
167
+ }
154
168
  // ── Low-scoring judgments ────────────────────────────────────
155
169
  const LOW_SCORE_THRESHOLD = 70;
156
170
  const MAX_STORED_JUDGMENTS = 50;
@@ -177,6 +191,7 @@ export class GapAnalysisStep {
177
191
  lowScoringJudgments,
178
192
  recommendations: gapReport,
179
193
  scores: enrichedScores,
194
+ ...(testResults !== undefined && { testResults }),
180
195
  };
181
196
  writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
182
197
  // Capture gap analysis artifacts
@@ -113,6 +113,13 @@ export class PublishReportStep {
113
113
  tag: this.options.publishTag ?? ctx.config.publishTag,
114
114
  title,
115
115
  };
116
+ // Upload test output artifacts to GCS (D0030 — non-blocking, P5)
117
+ if (ctx.artifactUploader && summary.testResults?.length) {
118
+ const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
119
+ if (artifactRef) {
120
+ report.artifacts = { testOutputs: artifactRef };
121
+ }
122
+ }
116
123
  // Share reportId with downstream steps (CallbackStep + orchestrator job update)
117
124
  state.reportId = reportId;
118
125
  // Capture report object (Tier 2)
@@ -211,6 +218,30 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
211
218
  taskIds,
212
219
  };
213
220
  }
221
+ /**
222
+ * Extract test outputs from StoredTestResult[] and upload as a single
223
+ * JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
224
+ * to match the lookup pattern in Studio's JudgmentList component.
225
+ *
226
+ * Non-blocking: returns null if upload fails (P5).
227
+ */
228
+ async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
229
+ const entries = {};
230
+ for (const tr of testResults) {
231
+ const key = `${tr.taskId}::${tr.modelId}`;
232
+ entries[key] = {
233
+ responseOutput: tr.responseOutput,
234
+ responseOutputTruncated: tr.responseOutputTruncated ?? false,
235
+ };
236
+ }
237
+ const artifact = {
238
+ version: 1,
239
+ reportId,
240
+ createdAt,
241
+ entries,
242
+ };
243
+ return uploader.upload(reportId, "test-outputs.json", artifact);
244
+ }
214
245
  /**
215
246
  * Fan out a report to all configured sinks.
216
247
  *
@@ -1,7 +1,7 @@
1
- import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
1
+ import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
2
2
  import { type ResolvedSourceConfig } from "../sources.js";
3
3
  import type { GraderJudgment, PerModelEntry } from "./types.js";
4
- export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
4
+ export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
5
5
  export interface PromptfooResultsWrapper {
6
6
  results: RawTestResult[];
7
7
  stats: {
@@ -75,6 +75,16 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
75
75
  * Phase 3a prerequisite: structured judgment data for failure mode extraction.
76
76
  */
77
77
  export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
78
+ /**
79
+ * Extract per-test results with model output from evaluation results.
80
+ *
81
+ * Mirrors extractGraderJudgments() but captures the full StoredTestResult
82
+ * shape including response.output (truncated), latency, and cost.
83
+ * One StoredTestResult per test × model combination.
84
+ *
85
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
86
+ */
87
+ export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
78
88
  /**
79
89
  * Score agentic evaluation results. In agentic mode, all test entries are
80
90
  * gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
@@ -114,6 +114,10 @@ export function extractGraderJudgments(resultsPath) {
114
114
  for (const result of results) {
115
115
  const taskId = result.description;
116
116
  const modelId = result.providerId ?? result.providerLabel ?? "unknown";
117
+ // Detect output failures: empty/whitespace response means the model
118
+ // failed to produce output (API error, token exhaustion, refusal).
119
+ const output = result.response?.output ?? "";
120
+ const isOutputFailure = !output.trim();
117
121
  for (const comp of result.gradingResult.componentResults) {
118
122
  if (comp.assertion?.type !== "llm-rubric") {
119
123
  continue;
@@ -139,9 +143,12 @@ export function extractGraderJudgments(resultsPath) {
139
143
  // Not JSON — use raw reason string
140
144
  }
141
145
  }
146
+ // Also flag synthesized api-error judgments as output failures
147
+ const outputFailure = isOutputFailure || reason.startsWith("[api-error]");
142
148
  judgments.push({
143
149
  dimension: kind,
144
150
  modelId,
151
+ ...(outputFailure && { outputFailure: true }),
145
152
  reason,
146
153
  score,
147
154
  taskId,
@@ -150,6 +157,76 @@ export function extractGraderJudgments(resultsPath) {
150
157
  }
151
158
  return judgments;
152
159
  }
160
+ /** Maximum characters to store for model response output */
161
+ const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
162
+ /**
163
+ * Extract per-test results with model output from evaluation results.
164
+ *
165
+ * Mirrors extractGraderJudgments() but captures the full StoredTestResult
166
+ * shape including response.output (truncated), latency, and cost.
167
+ * One StoredTestResult per test × model combination.
168
+ *
169
+ * See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
170
+ */
171
+ export function extractStoredTestResults(resultsPath) {
172
+ const results = readAndNormalizeResults(resultsPath);
173
+ const testResults = [];
174
+ for (const result of results) {
175
+ const taskId = result.description;
176
+ const modelId = result.providerId ?? result.providerLabel ?? "unknown";
177
+ const area = result.vars.__featureArea || detectFeatureArea(result.description);
178
+ // Determine variant from docs variable presence (same logic as scoreResults)
179
+ const docs = result.vars.docs ?? "";
180
+ const variant = docs.trim().length > 0 ? "gold" : "baseline";
181
+ // Detect output failure (same logic as extractGraderJudgments)
182
+ const output = result.response?.output ?? "";
183
+ const isOutputFailure = !output.trim();
184
+ // Truncate response output
185
+ const responseOutput = output.slice(0, MAX_RESPONSE_OUTPUT_LENGTH);
186
+ const responseOutputTruncated = output.length > MAX_RESPONSE_OUTPUT_LENGTH;
187
+ // Extract per-dimension scores and reasons
188
+ const dimensions = [];
189
+ for (const comp of result.gradingResult.componentResults) {
190
+ if (comp.assertion?.type !== "llm-rubric") {
191
+ continue;
192
+ }
193
+ const dimension = classifyRubric(comp);
194
+ if (!dimension) {
195
+ continue;
196
+ }
197
+ const score = parseRubricScore(comp);
198
+ // Extract reason text (same JSON parsing as extractGraderJudgments)
199
+ let reason = comp.reason ?? "";
200
+ if (reason) {
201
+ try {
202
+ const parsed = JSON.parse(reason);
203
+ const obj = parsed;
204
+ if (typeof obj.reason === "string") {
205
+ ;
206
+ ({ reason } = obj);
207
+ }
208
+ }
209
+ catch {
210
+ // Not JSON — use raw reason string
211
+ }
212
+ }
213
+ dimensions.push({ dimension, reason, score });
214
+ }
215
+ testResults.push({
216
+ area,
217
+ cost: result.cost || undefined,
218
+ dimensions,
219
+ latencyMs: result.latencyMs,
220
+ modelId,
221
+ ...(isOutputFailure && { outputFailure: true }),
222
+ responseOutput,
223
+ ...(responseOutputTruncated && { responseOutputTruncated: true }),
224
+ taskId,
225
+ variant,
226
+ });
227
+ }
228
+ return testResults;
229
+ }
153
230
  /**
154
231
  * Finds the URL-extraction assertion result in a test's componentResults
155
232
  * and parses the structured JSON from its `reason` field.
@@ -463,6 +540,7 @@ function readAndNormalizeResults(resultsPath, log) {
463
540
  const base = {
464
541
  cost: r.cost ?? 0,
465
542
  description: r.testCase?.description ?? "unknown",
543
+ latencyMs: r.latencyMs,
466
544
  metadata: r.metadata,
467
545
  providerId: r.provider?.id,
468
546
  providerLabel: r.provider?.label,
@@ -793,6 +871,12 @@ export function calculateAndWriteScores(options) {
793
871
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
794
872
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
795
873
  }
874
+ // Extract and persist per-test results (D0029: model output + metadata)
875
+ const testResults = extractStoredTestResults(baselineResultsPath);
876
+ if (testResults.length > 0) {
877
+ writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
878
+ log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
879
+ }
796
880
  const testSummary = computeTestSummary(baselineResultsPath);
797
881
  return { belowCritical: summary.belowCritical, testSummary };
798
882
  }
@@ -897,6 +981,17 @@ export function calculateAndWriteScores(options) {
897
981
  writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
898
982
  log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
899
983
  }
984
+ // Extract and persist per-test results (D0029: model output + metadata)
985
+ const testResults = extractStoredTestResults(baselineResultsPath);
986
+ // In full mode, also extract test results from agentic results
987
+ if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
988
+ const agenticTestResults = extractStoredTestResults(agenticResultsPath);
989
+ testResults.push(...agenticTestResults);
990
+ }
991
+ if (testResults.length > 0) {
992
+ writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
993
+ log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
994
+ }
900
995
  // Compute test summary from the raw results file
901
996
  const testSummary = computeTestSummary(baselineResultsPath);
902
997
  return { belowCritical: summary.belowCritical, testSummary };
@@ -193,15 +193,27 @@ export class ReportStore {
193
193
  */
194
194
  async write(report) {
195
195
  try {
196
+ // Strip baseline and experiment ScoreSummary objects from comparison
197
+ // before persisting — they duplicate report.summary (experiment) and
198
+ // are fetchable by ID via provenance.lineage.comparedAgainst (baseline).
199
+ // This reduces document size by ~50-65% for full-mode reports.
200
+ const comparison = report.comparison
201
+ ? stripComparisonBulk(report.comparison)
202
+ : null;
196
203
  await this.client.create({
197
204
  _id: `report-${report.id}`,
198
205
  _type: REPORT_TYPE,
199
- comparison: report.comparison ?? null,
206
+ comparison,
200
207
  completedAt: report.completedAt,
201
208
  durationMs: report.durationMs,
202
209
  provenance: report.provenance,
203
210
  reportId: report.id,
204
- summary: report.summary,
211
+ summary: {
212
+ ...report.summary,
213
+ // Artifact references live inside summary in Sanity so they're
214
+ // projected automatically by the reportDetailQuery (D0030)
215
+ ...(report.artifacts ? { artifacts: report.artifacts } : {}),
216
+ },
205
217
  tag: report.tag ?? null,
206
218
  title: report.title ?? null,
207
219
  });
@@ -283,3 +295,17 @@ function toReport(doc) {
283
295
  title: doc.title,
284
296
  };
285
297
  }
298
+ /**
299
+ * Remove the `baseline` and `experiment` ScoreSummary objects from a
300
+ * ComparisonReport, producing a slim copy suitable for persistence.
301
+ *
302
+ * These fields are redundant in the stored document:
303
+ * - `experiment` is byte-for-byte identical to `report.summary`
304
+ * - `baseline` is fetchable via `provenance.lineage.comparedAgainst`
305
+ *
306
+ * Everything else (deltas, areas, classifications) is preserved.
307
+ */
308
+ function stripComparisonBulk(comparison) {
309
+ const { baseline: _, experiment: __, ...slim } = comparison;
310
+ return slim;
311
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "2.3.2",
3
+ "version": "2.4.0",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -33,6 +33,7 @@
33
33
  ],
34
34
  "dependencies": {
35
35
  "@google-cloud/bigquery": "^8.1.1",
36
+ "@google-cloud/storage": "^7.19.0",
36
37
  "@inquirer/prompts": "^8.3.0",
37
38
  "@modelcontextprotocol/sdk": "^1.29.0",
38
39
  "@portabletext/markdown": "^1.0.0",