@sanity/ailf 2.3.3 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/index.js +1 -0
- package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +35 -0
- package/dist/_vendor/ailf-core/ports/artifact-uploader.js +18 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +9 -0
- package/dist/_vendor/ailf-core/ports/index.d.ts +2 -0
- package/dist/_vendor/ailf-core/ports/index.js +1 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +70 -0
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +2 -0
- package/dist/artifact-capture/gcs-collector.d.ts +55 -0
- package/dist/artifact-capture/gcs-collector.js +117 -0
- package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +31 -0
- package/dist/artifact-capture/gcs-report-artifact-uploader.js +66 -0
- package/dist/cli.js +2 -0
- package/dist/commands/pipeline-action.js +3 -0
- package/dist/composition-root.js +21 -5
- package/dist/orchestration/build-app-context.js +3 -0
- package/dist/orchestration/steps/calculate-scores-step.js +5 -1
- package/dist/orchestration/steps/gap-analysis-step.js +15 -0
- package/dist/orchestration/steps/publish-report-step.js +31 -0
- package/dist/pipeline/calculate-scores.d.ts +12 -2
- package/dist/pipeline/calculate-scores.js +88 -0
- package/dist/report-store.js +28 -2
- package/package.json +2 -1
|
@@ -19,3 +19,4 @@ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePrici
|
|
|
19
19
|
export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
|
|
20
20
|
export { env } from "./env-helper.js";
|
|
21
21
|
export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
|
|
22
|
+
export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";
|
|
@@ -21,3 +21,4 @@ export * from "./examples/index.js";
|
|
|
21
21
|
export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
|
|
22
22
|
export { env } from "./env-helper.js";
|
|
23
23
|
export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
|
|
24
|
+
export { NoOpArtifactUploader } from "./ports/artifact-uploader.js";
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Port: ArtifactUploader — uploads report artifacts to external object storage.
|
|
3
|
+
*
|
|
4
|
+
* Separate from ArtifactCollector (which captures forensic archives).
|
|
5
|
+
* This port puts structured files at known paths so Studio can fetch
|
|
6
|
+
* them on demand via signed URLs.
|
|
7
|
+
*
|
|
8
|
+
* @see docs/design-docs/external-artifact-store.md
|
|
9
|
+
* @see docs/decisions/D0030-external-artifact-store.md
|
|
10
|
+
*/
|
|
11
|
+
import type { ArtifactRef } from "../types/index.js";
|
|
12
|
+
/**
|
|
13
|
+
* Uploads report artifacts to external storage.
|
|
14
|
+
*
|
|
15
|
+
* Implementations:
|
|
16
|
+
* - GcsReportArtifactUploader (packages/eval) — uploads to GCS
|
|
17
|
+
* - NoOpArtifactUploader (below) — returns null (no-op when GCS is not configured)
|
|
18
|
+
*/
|
|
19
|
+
export interface ArtifactUploader {
|
|
20
|
+
/**
|
|
21
|
+
* Upload a JSON artifact for a report.
|
|
22
|
+
*
|
|
23
|
+
* @param reportId - Report identifier (used as the GCS path prefix)
|
|
24
|
+
* @param fileName - File name within the report prefix (e.g., "test-outputs.json")
|
|
25
|
+
* @param data - Serializable data (will be JSON.stringify'd)
|
|
26
|
+
* @returns ArtifactRef on success, null if upload is skipped or fails
|
|
27
|
+
*/
|
|
28
|
+
upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* No-op uploader — always returns null. Used when GCS is not configured.
|
|
32
|
+
*/
|
|
33
|
+
export declare class NoOpArtifactUploader implements ArtifactUploader {
|
|
34
|
+
upload(): Promise<null>;
|
|
35
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Port: ArtifactUploader — uploads report artifacts to external object storage.
|
|
3
|
+
*
|
|
4
|
+
* Separate from ArtifactCollector (which captures forensic archives).
|
|
5
|
+
* This port puts structured files at known paths so Studio can fetch
|
|
6
|
+
* them on demand via signed URLs.
|
|
7
|
+
*
|
|
8
|
+
* @see docs/design-docs/external-artifact-store.md
|
|
9
|
+
* @see docs/decisions/D0030-external-artifact-store.md
|
|
10
|
+
*/
|
|
11
|
+
/**
|
|
12
|
+
* No-op uploader — always returns null. Used when GCS is not configured.
|
|
13
|
+
*/
|
|
14
|
+
export class NoOpArtifactUploader {
|
|
15
|
+
async upload() {
|
|
16
|
+
return null;
|
|
17
|
+
}
|
|
18
|
+
}
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
*/
|
|
14
14
|
import type { DebugOptions, EvalMode, PluginRegistry } from "../types/index.js";
|
|
15
15
|
import type { ArtifactCollector } from "./artifact-collector.js";
|
|
16
|
+
import type { ArtifactUploader } from "./artifact-uploader.js";
|
|
16
17
|
import type { CacheStore } from "./cache-store.js";
|
|
17
18
|
import type { DocFetcher } from "./doc-fetcher.js";
|
|
18
19
|
import type { EvalRunner } from "./eval-runner.js";
|
|
@@ -159,6 +160,12 @@ export interface ResolvedConfig {
|
|
|
159
160
|
captureCompress?: boolean;
|
|
160
161
|
/** Whether to include mode-specific extra artifacts (default: true) */
|
|
161
162
|
captureExtras?: boolean;
|
|
163
|
+
/** GCS bucket for capture upload (enables GCS decorator when set) */
|
|
164
|
+
captureGcsBucket?: string;
|
|
165
|
+
/** GCS object prefix for capture uploads (default: "captures/"); prepended to the object name as-is, so include the trailing "/" */
|
|
166
|
+
captureGcsPrefix?: string;
|
|
167
|
+
/** GCS bucket for report artifact uploads — enables ArtifactUploader (D0030) */
|
|
168
|
+
artifactGcsBucket?: string;
|
|
162
169
|
}
|
|
163
170
|
/**
|
|
164
171
|
* Application context — the complete dependency carrier.
|
|
@@ -173,6 +180,8 @@ export interface ResolvedConfig {
|
|
|
173
180
|
* Created per-test by createTestContext().
|
|
174
181
|
*/
|
|
175
182
|
export interface AppContext {
|
|
183
|
+
/** Report artifact uploader — uploads structured files to GCS for Studio (D0030) */
|
|
184
|
+
readonly artifactUploader?: ArtifactUploader;
|
|
176
185
|
/** Evaluation caching (filesystem + optional Content Lake fallback) */
|
|
177
186
|
readonly cache?: CacheStore;
|
|
178
187
|
/** Artifact capture collector (no-op when --capture is not set) */
|
|
@@ -5,6 +5,8 @@
|
|
|
5
5
|
* Adapters (in packages/eval) implement these interfaces.
|
|
6
6
|
*/
|
|
7
7
|
export type { ArtifactCollector, ArtifactManifest, ArtifactManifestEntry, CaptureFlushResult, } from "./artifact-collector.js";
|
|
8
|
+
export type { ArtifactUploader } from "./artifact-uploader.js";
|
|
9
|
+
export { NoOpArtifactUploader } from "./artifact-uploader.js";
|
|
8
10
|
export type { ArtifactContentDiff, CaptureDiffReport, ComparisonMode, ComparisonOptions, InventoryDiff, JsonDiffEntry, MetadataComparison, ScoreComparison, SecurityScan, TimingComparison, } from "./capture-comparator.js";
|
|
9
11
|
export type { CacheEntryMetadata, CacheKey, CacheLookupResult, CacheRecordInput, CacheStore, } from "./cache-store.js";
|
|
10
12
|
export type { ConfigSource } from "./config-source.js";
|
|
@@ -4,4 +4,5 @@
|
|
|
4
4
|
* Ports define the contracts between the domain kernel and the outside world.
|
|
5
5
|
* Adapters (in packages/eval) implement these interfaces.
|
|
6
6
|
*/
|
|
7
|
+
export { NoOpArtifactUploader } from "./artifact-uploader.js";
|
|
7
8
|
export { canonicalDocRefLabel, isIdRef, isPathRef, isPerspectiveRef, isSlugRef, isTemplatedAssertion, } from "./task-source.js";
|
|
@@ -275,6 +275,55 @@ export interface StoredJudgment extends GraderJudgment {
|
|
|
275
275
|
/** Canonical docs that the task expected the model to use */
|
|
276
276
|
canonicalDocs?: DocumentRef[];
|
|
277
277
|
}
|
|
278
|
+
/**
|
|
279
|
+
* Per-test result stored in reports for drill-down and audit.
|
|
280
|
+
*
|
|
281
|
+
* Captures the model's response output, grader reasoning per dimension,
|
|
282
|
+
* and response metadata. One entry per test × model combination.
|
|
283
|
+
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
284
|
+
*/
|
|
285
|
+
export interface StoredTestResult {
|
|
286
|
+
/** Resolved feature area (from __featureArea or description) */
|
|
287
|
+
area: string;
|
|
288
|
+
/** Canonical docs the task expected the model to use */
|
|
289
|
+
canonicalDocs?: DocumentRef[];
|
|
290
|
+
/** Weighted composite score (gold variant only) */
|
|
291
|
+
compositeScore?: number;
|
|
292
|
+
/** Per-test cost (USD) */
|
|
293
|
+
cost?: number;
|
|
294
|
+
/** Per-dimension grader scores and reasoning */
|
|
295
|
+
dimensions: {
|
|
296
|
+
/** Rubric dimension: task-completion, code-correctness, doc-coverage */
|
|
297
|
+
dimension: string;
|
|
298
|
+
/** Grader's natural language reasoning */
|
|
299
|
+
reason: string;
|
|
300
|
+
/** Numeric score (0–100, normalized) */
|
|
301
|
+
score: number;
|
|
302
|
+
}[];
|
|
303
|
+
/** Response latency in milliseconds */
|
|
304
|
+
latencyMs?: number;
|
|
305
|
+
/** Model that produced the response */
|
|
306
|
+
modelId: string;
|
|
307
|
+
/**
|
|
308
|
+
* True when the model failed to produce meaningful output (empty response,
|
|
309
|
+
* API error, or refusal). Same semantics as GraderJudgment.outputFailure.
|
|
310
|
+
*/
|
|
311
|
+
outputFailure?: boolean;
|
|
312
|
+
/** The model's generated code/response (truncated to 8000 chars) */
|
|
313
|
+
responseOutput: string;
|
|
314
|
+
/** True when responseOutput was truncated from a longer response */
|
|
315
|
+
responseOutputTruncated?: boolean;
|
|
316
|
+
/** Task description (e.g. "Functions - Webhook handler (gold)") */
|
|
317
|
+
taskId: string;
|
|
318
|
+
/** Token usage breakdown */
|
|
319
|
+
tokenUsage?: {
|
|
320
|
+
completion: number;
|
|
321
|
+
prompt: number;
|
|
322
|
+
total: number;
|
|
323
|
+
};
|
|
324
|
+
/** "gold" (with docs) or "baseline" (without docs) */
|
|
325
|
+
variant: "baseline" | "gold";
|
|
326
|
+
}
|
|
278
327
|
/** Grader consistency diagnostics — does not affect scores, reported alongside */
|
|
279
328
|
export interface GraderReliability {
|
|
280
329
|
/** Inter-grader agreement (from multi-grader comparison) — Phase 3 */
|
|
@@ -776,6 +825,12 @@ export interface ScoreSummary {
|
|
|
776
825
|
lowScoringJudgments?: StoredJudgment[];
|
|
777
826
|
/** Gap analysis recommendations (Phase 3b) — prioritized remediation plan */
|
|
778
827
|
recommendations?: GapAnalysisReport;
|
|
828
|
+
/**
|
|
829
|
+
* Per-test results with model output, grader reasoning, and metadata.
|
|
830
|
+
* One entry per test × model combination. Populated during gap-analysis
|
|
831
|
+
* enrichment from test-results.json. See D0029.
|
|
832
|
+
*/
|
|
833
|
+
testResults?: StoredTestResult[];
|
|
779
834
|
/** Grader reliability diagnostics (does not affect scores) */
|
|
780
835
|
graderReliability?: GraderReliability;
|
|
781
836
|
lowestArea: string;
|
|
@@ -1102,8 +1157,23 @@ export interface PublishResult {
|
|
|
1102
1157
|
result: SinkResult;
|
|
1103
1158
|
}[];
|
|
1104
1159
|
}
|
|
1160
|
+
/** Reference to an artifact in external object storage (GCS). See D0030. */
|
|
1161
|
+
export interface ArtifactRef {
|
|
1162
|
+
store: "gcs";
|
|
1163
|
+
bucket: string;
|
|
1164
|
+
path: string;
|
|
1165
|
+
bytes?: number;
|
|
1166
|
+
entryCount?: number;
|
|
1167
|
+
}
|
|
1105
1168
|
/** A published evaluation report — the atomic unit of the report store */
|
|
1106
1169
|
export interface Report {
|
|
1170
|
+
/** External artifact references — set by publish step when uploader is available (D0030) */
|
|
1171
|
+
artifacts?: {
|
|
1172
|
+
testOutputs?: ArtifactRef;
|
|
1173
|
+
renderedPrompts?: ArtifactRef;
|
|
1174
|
+
rawResults?: ArtifactRef;
|
|
1175
|
+
traces?: ArtifactRef;
|
|
1176
|
+
};
|
|
1107
1177
|
/** Optional auto-comparison against the most recent comparable report */
|
|
1108
1178
|
comparison?: ComparisonReport;
|
|
1109
1179
|
/** When the evaluation completed */
|
|
@@ -26,6 +26,8 @@ export interface TestResult {
|
|
|
26
26
|
componentResults: ComponentResult[];
|
|
27
27
|
pass: boolean;
|
|
28
28
|
};
|
|
29
|
+
/** Per-test latency in ms (propagated from Promptfoo when available) */
|
|
30
|
+
latencyMs?: number;
|
|
29
31
|
metadata?: Record<string, unknown>;
|
|
30
32
|
/** Provider identifier (e.g., "openai:gpt-4o") */
|
|
31
33
|
providerId?: string;
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
|
|
3
|
+
*
|
|
4
|
+
* Wraps the FilesystemArtifactCollector: local flush first (preserving
|
|
5
|
+
* the existing manifest + redaction logic), then upload to a GCS bucket.
|
|
6
|
+
*
|
|
7
|
+
* Design principles:
|
|
8
|
+
* - P5: Non-blocking — GCS upload failure should not block the pipeline.
|
|
9
|
+
* Local artifacts are always preserved.
|
|
10
|
+
* - Decorator pattern — delegates capture() and captureFile() to the inner
|
|
11
|
+
* collector unchanged. Only flush() adds the GCS upload step.
|
|
12
|
+
* - Lazy client — GCS Storage client is created on first flush(), not at
|
|
13
|
+
* construction (same pattern as BigQuerySink).
|
|
14
|
+
*
|
|
15
|
+
* @see docs/decisions/D0030-external-artifact-store.md
|
|
16
|
+
* @see docs/work-items/W0035-gcs-artifact-output.json
|
|
17
|
+
*/
|
|
18
|
+
import type { ArtifactCollector, CaptureFlushResult } from "../_vendor/ailf-core/index.d.ts";
|
|
19
|
+
export interface GcsCollectorOptions {
|
|
20
|
+
/** GCS bucket name (e.g., "ailf-artifacts") */
|
|
21
|
+
bucket: string;
|
|
22
|
+
/** Object prefix in the bucket (default: "captures/"). Prepended to the object name as-is, so include the trailing "/". */
|
|
23
|
+
prefix?: string;
|
|
24
|
+
/** Path to service account credentials JSON (optional — falls back to ADC) */
|
|
25
|
+
credentials?: string;
|
|
26
|
+
}
|
|
27
|
+
export interface GcsFlushResult extends CaptureFlushResult {
|
|
28
|
+
/** GCS upload status */
|
|
29
|
+
gcs: {
|
|
30
|
+
status: "uploaded";
|
|
31
|
+
bucket: string;
|
|
32
|
+
path: string;
|
|
33
|
+
} | {
|
|
34
|
+
status: "skipped";
|
|
35
|
+
reason: string;
|
|
36
|
+
} | {
|
|
37
|
+
status: "failed";
|
|
38
|
+
error: string;
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
export declare class GcsArtifactCollector implements ArtifactCollector {
|
|
42
|
+
get enabled(): boolean;
|
|
43
|
+
get extrasEnabled(): boolean;
|
|
44
|
+
private client;
|
|
45
|
+
private readonly inner;
|
|
46
|
+
private readonly options;
|
|
47
|
+
constructor(inner: ArtifactCollector, options: GcsCollectorOptions);
|
|
48
|
+
capture(step: string, type: string, data: unknown, meta?: Record<string, unknown>): void;
|
|
49
|
+
captureFile(step: string, type: string, filePath: string, meta?: Record<string, unknown>): void;
|
|
50
|
+
flush(): Promise<GcsFlushResult>;
|
|
51
|
+
/** Lazily create the GCS Storage client. */
|
|
52
|
+
private getClient;
|
|
53
|
+
/** Upload the flushed artifact to GCS: the tar.gz bundle when compressed, otherwise only its manifest.json. */
|
|
54
|
+
private uploadToGcs;
|
|
55
|
+
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GcsArtifactCollector — decorator that uploads capture artifacts to GCS.
|
|
3
|
+
*
|
|
4
|
+
* Wraps the FilesystemArtifactCollector: local flush first (preserving
|
|
5
|
+
* the existing manifest + redaction logic), then upload to a GCS bucket.
|
|
6
|
+
*
|
|
7
|
+
* Design principles:
|
|
8
|
+
* - P5: Non-blocking — GCS upload failure should not block the pipeline.
|
|
9
|
+
* Local artifacts are always preserved.
|
|
10
|
+
* - Decorator pattern — delegates capture() and captureFile() to the inner
|
|
11
|
+
* collector unchanged. Only flush() adds the GCS upload step.
|
|
12
|
+
* - Lazy client — GCS Storage client is created on first flush(), not at
|
|
13
|
+
* construction (same pattern as BigQuerySink).
|
|
14
|
+
*
|
|
15
|
+
* @see docs/decisions/D0030-external-artifact-store.md
|
|
16
|
+
* @see docs/work-items/W0035-gcs-artifact-output.json
|
|
17
|
+
*/
|
|
18
|
+
import { readFileSync } from "node:fs";
|
|
19
|
+
import { Storage } from "@google-cloud/storage";
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Collector
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
export class GcsArtifactCollector {
|
|
24
|
+
get enabled() {
|
|
25
|
+
return this.inner.enabled;
|
|
26
|
+
}
|
|
27
|
+
get extrasEnabled() {
|
|
28
|
+
return this.inner.extrasEnabled;
|
|
29
|
+
}
|
|
30
|
+
client = null;
|
|
31
|
+
inner;
|
|
32
|
+
options;
|
|
33
|
+
constructor(inner, options) {
|
|
34
|
+
this.inner = inner;
|
|
35
|
+
this.options = options;
|
|
36
|
+
}
|
|
37
|
+
capture(step, type, data, meta) {
|
|
38
|
+
this.inner.capture(step, type, data, meta);
|
|
39
|
+
}
|
|
40
|
+
captureFile(step, type, filePath, meta) {
|
|
41
|
+
this.inner.captureFile(step, type, filePath, meta);
|
|
42
|
+
}
|
|
43
|
+
async flush() {
|
|
44
|
+
// Step 1: Flush via the inner collector first; an error here propagates to the caller (only the GCS step below is best-effort)
|
|
45
|
+
const localResult = await this.inner.flush();
|
|
46
|
+
// Step 2: Upload to GCS (non-blocking — P5)
|
|
47
|
+
if (localResult.artifactCount === 0) {
|
|
48
|
+
return {
|
|
49
|
+
...localResult,
|
|
50
|
+
gcs: { status: "skipped", reason: "No artifacts to upload" },
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
try {
|
|
54
|
+
const gcsPath = await this.uploadToGcs(localResult);
|
|
55
|
+
return {
|
|
56
|
+
...localResult,
|
|
57
|
+
gcs: {
|
|
58
|
+
status: "uploaded",
|
|
59
|
+
bucket: this.options.bucket,
|
|
60
|
+
path: gcsPath,
|
|
61
|
+
},
|
|
62
|
+
};
|
|
63
|
+
}
|
|
64
|
+
catch (err) {
|
|
65
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
66
|
+
console.warn(` ⚠️ GCS upload failed (non-blocking): ${message}`);
|
|
67
|
+
return {
|
|
68
|
+
...localResult,
|
|
69
|
+
gcs: { status: "failed", error: message },
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
// -----------------------------------------------------------------------
|
|
74
|
+
// Private helpers
|
|
75
|
+
// -----------------------------------------------------------------------
|
|
76
|
+
/** Lazily create the GCS Storage client. */
|
|
77
|
+
getClient() {
|
|
78
|
+
if (this.client)
|
|
79
|
+
return this.client;
|
|
80
|
+
this.client = this.options.credentials
|
|
81
|
+
? new Storage({ keyFilename: this.options.credentials })
|
|
82
|
+
: new Storage();
|
|
83
|
+
return this.client;
|
|
84
|
+
}
|
|
85
|
+
/** Upload the flushed artifact to GCS: the tar.gz bundle when compressed, otherwise only its manifest.json. Resolves to the GCS object path. */
|
|
86
|
+
async uploadToGcs(result) {
|
|
87
|
+
const storage = this.getClient();
|
|
88
|
+
const bucket = storage.bucket(this.options.bucket);
|
|
89
|
+
const prefix = this.options.prefix ?? "captures/";
|
|
90
|
+
if (result.compressed) {
|
|
91
|
+
// Upload the tar.gz directly
|
|
92
|
+
const fileName = result.destination.split("/").pop() ?? "capture.tar.gz";
|
|
93
|
+
const gcsPath = `${prefix}${fileName}`;
|
|
94
|
+
const fileContent = readFileSync(result.destination);
|
|
95
|
+
await bucket.file(gcsPath).save(fileContent, {
|
|
96
|
+
contentType: "application/gzip",
|
|
97
|
+
metadata: {
|
|
98
|
+
artifactCount: String(result.artifactCount),
|
|
99
|
+
totalBytes: String(result.totalBytes),
|
|
100
|
+
},
|
|
101
|
+
});
|
|
102
|
+
return gcsPath;
|
|
103
|
+
}
|
|
104
|
+
// Uncompressed: upload the manifest.json as the representative file.
|
|
105
|
+
// The full directory could be uploaded file-by-file, but for the
|
|
106
|
+
// capture use case (forensic archive), the compressed bundle is the
|
|
107
|
+
// expected path. Upload just the manifest as a reference.
|
|
108
|
+
const manifestPath = `${result.destination}/manifest.json`;
|
|
109
|
+
const dirName = result.destination.split("/").pop() ?? "capture";
|
|
110
|
+
const gcsPath = `${prefix}${dirName}/manifest.json`;
|
|
111
|
+
const manifestContent = readFileSync(manifestPath, "utf-8");
|
|
112
|
+
await bucket.file(gcsPath).save(manifestContent, {
|
|
113
|
+
contentType: "application/json",
|
|
114
|
+
});
|
|
115
|
+
return gcsPath;
|
|
116
|
+
}
|
|
117
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
|
|
3
|
+
*
|
|
4
|
+
* Separate from GcsArtifactCollector (which handles forensic capture archives).
|
|
5
|
+
* This uploader puts structured JSON files at predictable paths so the
|
|
6
|
+
* API Gateway can sign URLs and Studio can fetch them on demand.
|
|
7
|
+
*
|
|
8
|
+
* GCS path convention: reports/{reportId}/{fileName}
|
|
9
|
+
* Example: reports/01926abc.../test-outputs.json
|
|
10
|
+
*
|
|
11
|
+
* Design principles:
|
|
12
|
+
* - P5: Non-blocking — GCS upload failure returns null, never throws
|
|
13
|
+
* - Lazy client — Storage created on first upload, not at construction
|
|
14
|
+
* - Credentials: Application Default Credentials only (no key-file option, unlike GcsArtifactCollector)
|
|
15
|
+
*
|
|
16
|
+
* @see docs/design-docs/external-artifact-store.md
|
|
17
|
+
* @see docs/decisions/D0030-external-artifact-store.md
|
|
18
|
+
*/
|
|
19
|
+
import type { ArtifactRef, ArtifactUploader } from "../_vendor/ailf-core/index.d.ts";
|
|
20
|
+
export interface GcsUploaderOptions {
|
|
21
|
+
/** GCS bucket name (e.g., "ailf-artifacts") */
|
|
22
|
+
bucket: string;
|
|
23
|
+
}
|
|
24
|
+
export declare class GcsReportArtifactUploader implements ArtifactUploader {
|
|
25
|
+
private client;
|
|
26
|
+
private readonly options;
|
|
27
|
+
constructor(options: GcsUploaderOptions);
|
|
28
|
+
upload(reportId: string, fileName: string, data: unknown): Promise<ArtifactRef | null>;
|
|
29
|
+
/** Lazily create the GCS Storage client (ADC). */
|
|
30
|
+
private getClient;
|
|
31
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GcsReportArtifactUploader — uploads report artifacts to known GCS paths.
|
|
3
|
+
*
|
|
4
|
+
* Separate from GcsArtifactCollector (which handles forensic capture archives).
|
|
5
|
+
* This uploader puts structured JSON files at predictable paths so the
|
|
6
|
+
* API Gateway can sign URLs and Studio can fetch them on demand.
|
|
7
|
+
*
|
|
8
|
+
* GCS path convention: reports/{reportId}/{fileName}
|
|
9
|
+
* Example: reports/01926abc.../test-outputs.json
|
|
10
|
+
*
|
|
11
|
+
* Design principles:
|
|
12
|
+
* - P5: Non-blocking — GCS upload failure returns null, never throws (JSON serialization of `data` runs before the try block and can still throw)
|
|
13
|
+
* - Lazy client — Storage created on first upload, not at construction
|
|
14
|
+
* - Credentials: Application Default Credentials only (no key-file option, unlike GcsArtifactCollector)
|
|
15
|
+
*
|
|
16
|
+
* @see docs/design-docs/external-artifact-store.md
|
|
17
|
+
* @see docs/decisions/D0030-external-artifact-store.md
|
|
18
|
+
*/
|
|
19
|
+
import { Storage } from "@google-cloud/storage";
|
|
20
|
+
export class GcsReportArtifactUploader {
|
|
21
|
+
client = null;
|
|
22
|
+
options;
|
|
23
|
+
constructor(options) {
|
|
24
|
+
this.options = options;
|
|
25
|
+
}
|
|
26
|
+
async upload(reportId, fileName, data) {
|
|
27
|
+
const objectPath = `reports/${reportId}/${fileName}`;
|
|
28
|
+
const json = JSON.stringify(data);
|
|
29
|
+
const bytes = Buffer.byteLength(json, "utf-8");
|
|
30
|
+
try {
|
|
31
|
+
const storage = this.getClient();
|
|
32
|
+
const file = storage.bucket(this.options.bucket).file(objectPath);
|
|
33
|
+
await file.save(json, {
|
|
34
|
+
contentType: "application/json",
|
|
35
|
+
metadata: {
|
|
36
|
+
reportId,
|
|
37
|
+
},
|
|
38
|
+
});
|
|
39
|
+
return {
|
|
40
|
+
store: "gcs",
|
|
41
|
+
bucket: this.options.bucket,
|
|
42
|
+
path: objectPath,
|
|
43
|
+
bytes,
|
|
44
|
+
entryCount: typeof data === "object" &&
|
|
45
|
+
data !== null &&
|
|
46
|
+
"entries" in data &&
|
|
47
|
+
typeof data.entries === "object"
|
|
48
|
+
? Object.keys(data.entries)
|
|
49
|
+
.length
|
|
50
|
+
: undefined,
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
catch (err) {
|
|
54
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
55
|
+
console.warn(` ⚠️ Artifact upload failed (non-blocking): ${objectPath} — ${message}`);
|
|
56
|
+
return null;
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
/** Lazily create the GCS Storage client (ADC). */
|
|
60
|
+
getClient() {
|
|
61
|
+
if (this.client)
|
|
62
|
+
return this.client;
|
|
63
|
+
this.client = new Storage();
|
|
64
|
+
return this.client;
|
|
65
|
+
}
|
|
66
|
+
}
|
package/dist/cli.js
CHANGED
|
@@ -168,6 +168,8 @@ import { createCalculateScoresCommand } from "./commands/calculate-scores.js";
|
|
|
168
168
|
program.addCommand(createCalculateScoresCommand().helpGroup(CommandGroup.PipelineInternals));
|
|
169
169
|
import { createPrCommentCommand } from "./commands/pr-comment.js";
|
|
170
170
|
program.addCommand(createPrCommentCommand().helpGroup(CommandGroup.PipelineInternals));
|
|
171
|
+
import { createGenerateConfigsCommand } from "./commands/generate-configs.js";
|
|
172
|
+
program.addCommand(createGenerateConfigsCommand().helpGroup(CommandGroup.PipelineInternals));
|
|
171
173
|
import { createMeasureRetrievalCommand } from "./commands/measure-retrieval.js";
|
|
172
174
|
program.addCommand(createMeasureRetrievalCommand().helpGroup(CommandGroup.PipelineInternals));
|
|
173
175
|
import { createLookupDocCommand } from "./commands/lookup-doc.js";
|
|
@@ -326,6 +326,9 @@ export async function executePipeline(cliOpts) {
|
|
|
326
326
|
process.env.AILF_CAPTURE_COMPRESS !== "0";
|
|
327
327
|
config.captureExtras =
|
|
328
328
|
cliOpts.captureExtras !== false && process.env.AILF_CAPTURE_EXTRAS !== "0";
|
|
329
|
+
config.captureGcsBucket ??= process.env.AILF_CAPTURE_GCS_BUCKET;
|
|
330
|
+
config.captureGcsPrefix ??= process.env.AILF_CAPTURE_GCS_PREFIX;
|
|
331
|
+
config.artifactGcsBucket ??= process.env.AILF_GCS_ARTIFACT_BUCKET;
|
|
329
332
|
// Create AppContext directly from the merged config so adapters
|
|
330
333
|
// (especially taskSource) are wired from the file config's
|
|
331
334
|
// taskSourceType — not from CLI defaults.
|
package/dist/composition-root.js
CHANGED
|
@@ -18,6 +18,8 @@
|
|
|
18
18
|
import { join } from "node:path";
|
|
19
19
|
import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
|
|
20
20
|
import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
|
|
21
|
+
import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
|
|
22
|
+
import { GcsReportArtifactUploader } from "./artifact-capture/gcs-report-artifact-uploader.js";
|
|
21
23
|
import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
|
|
22
24
|
import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
|
|
23
25
|
import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
|
|
@@ -57,9 +59,11 @@ export function createAppContext(config) {
|
|
|
57
59
|
const reportStore = createReportStore(config);
|
|
58
60
|
// Sinks — loaded from config/sinks
|
|
59
61
|
const sinks = loadSinks();
|
|
60
|
-
// Artifact collector — no-op by default, filesystem when --capture is set
|
|
61
|
-
|
|
62
|
-
|
|
62
|
+
// Artifact collector — no-op by default, filesystem when --capture is set,
|
|
63
|
+
// GCS decorator when config.captureGcsBucket is also set (D0030/W0035)
|
|
64
|
+
let collector = new NoOpArtifactCollector();
|
|
65
|
+
if (config.captureEnabled) {
|
|
66
|
+
const fsCollector = new FilesystemArtifactCollector({
|
|
63
67
|
captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
|
|
64
68
|
mode: config.mode,
|
|
65
69
|
compress: config.captureCompress ?? true,
|
|
@@ -69,9 +73,21 @@ export function createAppContext(config) {
|
|
|
69
73
|
source: config.source,
|
|
70
74
|
areas: config.areas,
|
|
71
75
|
},
|
|
72
|
-
})
|
|
73
|
-
|
|
76
|
+
});
|
|
77
|
+
collector = config.captureGcsBucket
|
|
78
|
+
? new GcsArtifactCollector(fsCollector, {
|
|
79
|
+
bucket: config.captureGcsBucket,
|
|
80
|
+
prefix: config.captureGcsPrefix,
|
|
81
|
+
})
|
|
82
|
+
: fsCollector;
|
|
83
|
+
}
|
|
84
|
+
// Report artifact uploader — uploads structured files to GCS at known
|
|
85
|
+
// paths for Studio to fetch via signed URLs (D0030)
|
|
86
|
+
const artifactUploader = config.artifactGcsBucket
|
|
87
|
+
? new GcsReportArtifactUploader({ bucket: config.artifactGcsBucket })
|
|
88
|
+
: undefined;
|
|
74
89
|
return {
|
|
90
|
+
artifactUploader,
|
|
75
91
|
cache,
|
|
76
92
|
collector,
|
|
77
93
|
config,
|
|
@@ -82,6 +82,9 @@ export function mapToResolvedConfig(opts, rootDir) {
|
|
|
82
82
|
captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
|
|
83
83
|
captureCompress: opts.captureCompress ?? true,
|
|
84
84
|
captureExtras: opts.captureExtras ?? true,
|
|
85
|
+
captureGcsBucket: process.env.AILF_CAPTURE_GCS_BUCKET,
|
|
86
|
+
captureGcsPrefix: process.env.AILF_CAPTURE_GCS_PREFIX,
|
|
87
|
+
artifactGcsBucket: process.env.AILF_GCS_ARTIFACT_BUCKET,
|
|
85
88
|
};
|
|
86
89
|
}
|
|
87
90
|
/**
|
|
@@ -122,7 +122,11 @@ export class CalculateScoresStep {
|
|
|
122
122
|
}
|
|
123
123
|
// Capture score artifacts
|
|
124
124
|
const resultsDir = join(ctx.config.rootDir, "results", "latest");
|
|
125
|
-
for (const file of [
|
|
125
|
+
for (const file of [
|
|
126
|
+
"score-summary.json",
|
|
127
|
+
"grader-judgments.json",
|
|
128
|
+
"test-results.json",
|
|
129
|
+
]) {
|
|
126
130
|
const filePath = join(resultsDir, file);
|
|
127
131
|
if (existsSync(filePath)) {
|
|
128
132
|
ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
|
|
@@ -151,6 +151,20 @@ export class GapAnalysisStep {
|
|
|
151
151
|
documents: areaToDocRefs.get(s.feature),
|
|
152
152
|
}));
|
|
153
153
|
}
|
|
154
|
+
// ── Per-test results (D0029: model output + metadata) ──────
|
|
155
|
+
const testResultsPath = resolve(root, "results", "latest", "test-results.json");
|
|
156
|
+
let testResults;
|
|
157
|
+
if (existsSync(testResultsPath)) {
|
|
158
|
+
const rawTestResults = JSON.parse(readFileSync(testResultsPath, "utf-8"));
|
|
159
|
+
// Enrich with canonical docs (literacy mode only)
|
|
160
|
+
testResults = rawTestResults.map((tr) => {
|
|
161
|
+
if (!isLiteracyMode)
|
|
162
|
+
return tr;
|
|
163
|
+
const baseDesc = tr.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
|
|
164
|
+
const canonicalDocs = descToDocRefs.get(baseDesc);
|
|
165
|
+
return canonicalDocs ? { ...tr, canonicalDocs } : tr;
|
|
166
|
+
});
|
|
167
|
+
}
|
|
154
168
|
// ── Low-scoring judgments ────────────────────────────────────
|
|
155
169
|
const LOW_SCORE_THRESHOLD = 70;
|
|
156
170
|
const MAX_STORED_JUDGMENTS = 50;
|
|
@@ -177,6 +191,7 @@ export class GapAnalysisStep {
|
|
|
177
191
|
lowScoringJudgments,
|
|
178
192
|
recommendations: gapReport,
|
|
179
193
|
scores: enrichedScores,
|
|
194
|
+
...(testResults !== undefined && { testResults }),
|
|
180
195
|
};
|
|
181
196
|
writeFileSync(scoreSummaryPath, JSON.stringify(enrichedSummary, null, 2));
|
|
182
197
|
// Capture gap analysis artifacts
|
|
@@ -113,6 +113,13 @@ export class PublishReportStep {
|
|
|
113
113
|
tag: this.options.publishTag ?? ctx.config.publishTag,
|
|
114
114
|
title,
|
|
115
115
|
};
|
|
116
|
+
// Upload test output artifacts to GCS (D0030 — non-blocking, P5)
|
|
117
|
+
if (ctx.artifactUploader && summary.testResults?.length) {
|
|
118
|
+
const artifactRef = await uploadTestOutputs(ctx.artifactUploader, reportId, now, summary.testResults);
|
|
119
|
+
if (artifactRef) {
|
|
120
|
+
report.artifacts = { testOutputs: artifactRef };
|
|
121
|
+
}
|
|
122
|
+
}
|
|
116
123
|
// Share reportId with downstream steps (CallbackStep + orchestrator job update)
|
|
117
124
|
state.reportId = reportId;
|
|
118
125
|
// Capture report object (Tier 2)
|
|
@@ -211,6 +218,30 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
|
211
218
|
taskIds,
|
|
212
219
|
};
|
|
213
220
|
}
|
|
221
|
+
/**
|
|
222
|
+
* Extract test outputs from StoredTestResult[] and upload as a single
|
|
223
|
+
* JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
|
|
224
|
+
* to match the lookup pattern in Studio's JudgmentList component.
|
|
225
|
+
*
|
|
226
|
+
* Non-blocking: returns null if upload fails (P5).
|
|
227
|
+
*/
|
|
228
|
+
async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
|
|
229
|
+
const entries = {};
|
|
230
|
+
for (const tr of testResults) {
|
|
231
|
+
const key = `${tr.taskId}::${tr.modelId}`;
|
|
232
|
+
entries[key] = {
|
|
233
|
+
responseOutput: tr.responseOutput,
|
|
234
|
+
responseOutputTruncated: tr.responseOutputTruncated ?? false,
|
|
235
|
+
};
|
|
236
|
+
}
|
|
237
|
+
const artifact = {
|
|
238
|
+
version: 1,
|
|
239
|
+
reportId,
|
|
240
|
+
createdAt,
|
|
241
|
+
entries,
|
|
242
|
+
};
|
|
243
|
+
return uploader.upload(reportId, "test-outputs.json", artifact);
|
|
244
|
+
}
|
|
214
245
|
/**
|
|
215
246
|
* Fan out a report to all configured sinks.
|
|
216
247
|
*
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { type ActualScoreEntry, type ComponentResult, type Logger, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
1
|
+
import { type ActualScoreEntry, type ComponentResult, type Logger, type StoredTestResult, type TestSummary } from "../_vendor/ailf-core/index.d.ts";
|
|
2
2
|
import { type ResolvedSourceConfig } from "../sources.js";
|
|
3
3
|
import type { GraderJudgment, PerModelEntry } from "./types.js";
|
|
4
|
-
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
4
|
+
export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, type ActualScoreEntry, type ComponentResult, type StoredTestResult, type TestResult, type UrlMetadata, } from "../_vendor/ailf-core/index.d.ts";
|
|
5
5
|
export interface PromptfooResultsWrapper {
|
|
6
6
|
results: RawTestResult[];
|
|
7
7
|
stats: {
|
|
@@ -75,6 +75,16 @@ export declare function calculateScoresPerModel(resultsPath: string, goldProfile
|
|
|
75
75
|
* Phase 3a prerequisite: structured judgment data for failure mode extraction.
|
|
76
76
|
*/
|
|
77
77
|
export declare function extractGraderJudgments(resultsPath: string): GraderJudgment[];
|
|
78
|
+
/**
|
|
79
|
+
* Extract per-test results with model output from evaluation results.
|
|
80
|
+
*
|
|
81
|
+
* Mirrors extractGraderJudgments() but captures the full StoredTestResult
|
|
82
|
+
* shape including response.output (truncated), latency, and cost.
|
|
83
|
+
* One StoredTestResult per test × model combination.
|
|
84
|
+
*
|
|
85
|
+
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
86
|
+
*/
|
|
87
|
+
export declare function extractStoredTestResults(resultsPath: string): StoredTestResult[];
|
|
78
88
|
/**
|
|
79
89
|
* Score agentic evaluation results. In agentic mode, all test entries are
|
|
80
90
|
* gold-only (no baseline entries — the .expanded.agentic.yaml fix ensures this).
|
|
@@ -157,6 +157,76 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
157
157
|
}
|
|
158
158
|
return judgments;
|
|
159
159
|
}
|
|
160
|
+
/** Maximum characters to store for model response output */
|
|
161
|
+
const MAX_RESPONSE_OUTPUT_LENGTH = 8000;
|
|
162
|
+
/**
|
|
163
|
+
* Extract per-test results with model output from evaluation results.
|
|
164
|
+
*
|
|
165
|
+
* Mirrors extractGraderJudgments() but captures the full StoredTestResult
|
|
166
|
+
* shape including response.output (truncated), latency, and cost.
|
|
167
|
+
* One StoredTestResult per test × model combination.
|
|
168
|
+
*
|
|
169
|
+
* See D0029 and docs/design-docs/score-drill-down.md (Phase 1).
|
|
170
|
+
*/
|
|
171
|
+
export function extractStoredTestResults(resultsPath) {
|
|
172
|
+
const results = readAndNormalizeResults(resultsPath);
|
|
173
|
+
const testResults = [];
|
|
174
|
+
for (const result of results) {
|
|
175
|
+
const taskId = result.description;
|
|
176
|
+
const modelId = result.providerId ?? result.providerLabel ?? "unknown";
|
|
177
|
+
const area = result.vars.__featureArea || detectFeatureArea(result.description);
|
|
178
|
+
// Determine variant from docs variable presence (same logic as scoreResults)
|
|
179
|
+
const docs = result.vars.docs ?? "";
|
|
180
|
+
const variant = docs.trim().length > 0 ? "gold" : "baseline";
|
|
181
|
+
// Detect output failure (same logic as extractGraderJudgments)
|
|
182
|
+
const output = result.response?.output ?? "";
|
|
183
|
+
const isOutputFailure = !output.trim();
|
|
184
|
+
// Truncate response output
|
|
185
|
+
const responseOutput = output.slice(0, MAX_RESPONSE_OUTPUT_LENGTH);
|
|
186
|
+
const responseOutputTruncated = output.length > MAX_RESPONSE_OUTPUT_LENGTH;
|
|
187
|
+
// Extract per-dimension scores and reasons
|
|
188
|
+
const dimensions = [];
|
|
189
|
+
for (const comp of result.gradingResult.componentResults) {
|
|
190
|
+
if (comp.assertion?.type !== "llm-rubric") {
|
|
191
|
+
continue;
|
|
192
|
+
}
|
|
193
|
+
const dimension = classifyRubric(comp);
|
|
194
|
+
if (!dimension) {
|
|
195
|
+
continue;
|
|
196
|
+
}
|
|
197
|
+
const score = parseRubricScore(comp);
|
|
198
|
+
// Extract reason text (same JSON parsing as extractGraderJudgments)
|
|
199
|
+
let reason = comp.reason ?? "";
|
|
200
|
+
if (reason) {
|
|
201
|
+
try {
|
|
202
|
+
const parsed = JSON.parse(reason);
|
|
203
|
+
const obj = parsed;
|
|
204
|
+
if (typeof obj.reason === "string") {
|
|
205
|
+
;
|
|
206
|
+
({ reason } = obj);
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
catch {
|
|
210
|
+
// Not JSON — use raw reason string
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
dimensions.push({ dimension, reason, score });
|
|
214
|
+
}
|
|
215
|
+
testResults.push({
|
|
216
|
+
area,
|
|
217
|
+
cost: result.cost || undefined,
|
|
218
|
+
dimensions,
|
|
219
|
+
latencyMs: result.latencyMs,
|
|
220
|
+
modelId,
|
|
221
|
+
...(isOutputFailure && { outputFailure: true }),
|
|
222
|
+
responseOutput,
|
|
223
|
+
...(responseOutputTruncated && { responseOutputTruncated: true }),
|
|
224
|
+
taskId,
|
|
225
|
+
variant,
|
|
226
|
+
});
|
|
227
|
+
}
|
|
228
|
+
return testResults;
|
|
229
|
+
}
|
|
160
230
|
/**
|
|
161
231
|
* Finds the URL-extraction assertion result in a test's componentResults
|
|
162
232
|
* and parses the structured JSON from its `reason` field.
|
|
@@ -470,6 +540,7 @@ function readAndNormalizeResults(resultsPath, log) {
|
|
|
470
540
|
const base = {
|
|
471
541
|
cost: r.cost ?? 0,
|
|
472
542
|
description: r.testCase?.description ?? "unknown",
|
|
543
|
+
latencyMs: r.latencyMs,
|
|
473
544
|
metadata: r.metadata,
|
|
474
545
|
providerId: r.provider?.id,
|
|
475
546
|
providerLabel: r.provider?.label,
|
|
@@ -800,6 +871,12 @@ export function calculateAndWriteScores(options) {
|
|
|
800
871
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
801
872
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
802
873
|
}
|
|
874
|
+
// Extract and persist per-test results (D0029: model output + metadata)
|
|
875
|
+
const testResults = extractStoredTestResults(baselineResultsPath);
|
|
876
|
+
if (testResults.length > 0) {
|
|
877
|
+
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
878
|
+
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
879
|
+
}
|
|
803
880
|
const testSummary = computeTestSummary(baselineResultsPath);
|
|
804
881
|
return { belowCritical: summary.belowCritical, testSummary };
|
|
805
882
|
}
|
|
@@ -904,6 +981,17 @@ export function calculateAndWriteScores(options) {
|
|
|
904
981
|
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
905
982
|
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
906
983
|
}
|
|
984
|
+
// Extract and persist per-test results (D0029: model output + metadata)
|
|
985
|
+
const testResults = extractStoredTestResults(baselineResultsPath);
|
|
986
|
+
// In full mode, also extract test results from agentic results
|
|
987
|
+
if (mode === LiteracyVariant.FULL && existsSync(agenticResultsPath)) {
|
|
988
|
+
const agenticTestResults = extractStoredTestResults(agenticResultsPath);
|
|
989
|
+
testResults.push(...agenticTestResults);
|
|
990
|
+
}
|
|
991
|
+
if (testResults.length > 0) {
|
|
992
|
+
writeFileSync(join(outDir, "test-results.json"), JSON.stringify(testResults, null, 2));
|
|
993
|
+
log.info(`Test results written to results/latest/test-results.json (${testResults.length} results)`);
|
|
994
|
+
}
|
|
907
995
|
// Compute test summary from the raw results file
|
|
908
996
|
const testSummary = computeTestSummary(baselineResultsPath);
|
|
909
997
|
return { belowCritical: summary.belowCritical, testSummary };
|
package/dist/report-store.js
CHANGED
|
@@ -193,15 +193,27 @@ export class ReportStore {
|
|
|
193
193
|
*/
|
|
194
194
|
async write(report) {
|
|
195
195
|
try {
|
|
196
|
+
// Strip baseline and experiment ScoreSummary objects from comparison
|
|
197
|
+
// before persisting — they duplicate report.summary (experiment) and
|
|
198
|
+
// are fetchable by ID via provenance.lineage.comparedAgainst (baseline).
|
|
199
|
+
// This reduces document size by ~50-65% for full-mode reports.
|
|
200
|
+
const comparison = report.comparison
|
|
201
|
+
? stripComparisonBulk(report.comparison)
|
|
202
|
+
: null;
|
|
196
203
|
await this.client.create({
|
|
197
204
|
_id: `report-${report.id}`,
|
|
198
205
|
_type: REPORT_TYPE,
|
|
199
|
-
comparison
|
|
206
|
+
comparison,
|
|
200
207
|
completedAt: report.completedAt,
|
|
201
208
|
durationMs: report.durationMs,
|
|
202
209
|
provenance: report.provenance,
|
|
203
210
|
reportId: report.id,
|
|
204
|
-
summary: report.summary,
|
|
211
|
+
summary: {
|
|
212
|
+
...report.summary,
|
|
213
|
+
// Artifact references live inside summary in Sanity so they're
|
|
214
|
+
// projected automatically by the reportDetailQuery (D0030)
|
|
215
|
+
...(report.artifacts ? { artifacts: report.artifacts } : {}),
|
|
216
|
+
},
|
|
205
217
|
tag: report.tag ?? null,
|
|
206
218
|
title: report.title ?? null,
|
|
207
219
|
});
|
|
@@ -283,3 +295,17 @@ function toReport(doc) {
|
|
|
283
295
|
title: doc.title,
|
|
284
296
|
};
|
|
285
297
|
}
|
|
298
|
+
/**
|
|
299
|
+
* Remove the `baseline` and `experiment` ScoreSummary objects from a
|
|
300
|
+
* ComparisonReport, producing a slim copy suitable for persistence.
|
|
301
|
+
*
|
|
302
|
+
* These fields are redundant in the stored document:
|
|
303
|
+
* - `experiment` is byte-for-byte identical to `report.summary`
|
|
304
|
+
* - `baseline` is fetchable via `provenance.lineage.comparedAgainst`
|
|
305
|
+
*
|
|
306
|
+
* Everything else (deltas, areas, classifications) is preserved.
|
|
307
|
+
*/
|
|
308
|
+
function stripComparisonBulk(comparison) {
|
|
309
|
+
const { baseline: _, experiment: __, ...slim } = comparison;
|
|
310
|
+
return slim;
|
|
311
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sanity/ailf",
|
|
3
|
-
"version": "2.3.3",
|
|
3
|
+
"version": "2.4.0",
|
|
4
4
|
"private": false,
|
|
5
5
|
"publishConfig": {
|
|
6
6
|
"access": "public"
|
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
],
|
|
34
34
|
"dependencies": {
|
|
35
35
|
"@google-cloud/bigquery": "^8.1.1",
|
|
36
|
+
"@google-cloud/storage": "^7.19.0",
|
|
36
37
|
"@inquirer/prompts": "^8.3.0",
|
|
37
38
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
38
39
|
"@portabletext/markdown": "^1.0.0",
|