@sanity/ailf 2.7.1 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +72 -0
- package/dist/_vendor/ailf-core/artifact-registry.js +150 -0
- package/dist/_vendor/ailf-core/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +56 -0
- package/dist/_vendor/ailf-core/ports/artifact-writer.js +28 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +13 -3
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/index.js +1 -1
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +9 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +110 -68
- package/dist/_vendor/ailf-core/types/index.js +1 -1
- package/dist/_vendor/ailf-shared/index.d.ts +2 -0
- package/dist/_vendor/ailf-shared/index.js +2 -0
- package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
- package/dist/_vendor/ailf-shared/run-context.js +17 -0
- package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
- package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +39 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.js +148 -0
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +30 -0
- package/dist/artifact-capture/gcs-artifact-writer.js +119 -0
- package/dist/commands/publish.js +3 -2
- package/dist/composition-root.d.ts +3 -3
- package/dist/composition-root.js +20 -15
- package/dist/orchestration/build-step-sequence.js +6 -1
- package/dist/orchestration/steps/calculate-scores-step.js +42 -2
- package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
- package/dist/orchestration/steps/finalize-run-step.js +103 -0
- package/dist/orchestration/steps/publish-report-step.js +19 -39
- package/dist/pipeline/calculate-scores.js +13 -2
- package/dist/pipeline/provenance.d.ts +24 -44
- package/dist/pipeline/provenance.js +17 -165
- package/dist/pipeline/report-title.d.ts +2 -2
- package/dist/pipeline/run-context.d.ts +57 -0
- package/dist/pipeline/run-context.js +156 -0
- package/dist/pipeline/upload-test-outputs.d.ts +26 -0
- package/dist/pipeline/upload-test-outputs.js +34 -0
- package/dist/report-store.js +4 -2
- package/package.json +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
- package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
- package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
- package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
- package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
- package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GcsArtifactWriter — writes AILF run artifacts + manifest directly to GCS.
|
|
3
|
+
*
|
|
4
|
+
* Uses Application Default Credentials (ADC). Used when the CLI runs in CI or
|
|
5
|
+
* anywhere ADC is configured — the client talks to GCS without the API Gateway
|
|
6
|
+
* acting as a middleman.
|
|
7
|
+
*
|
|
8
|
+
* Paths come from `ARTIFACT_REGISTRY` so writers, signers, and readers agree.
|
|
9
|
+
*
|
|
10
|
+
* Design principles:
|
|
11
|
+
* - P5: Non-blocking — upload failure returns null, never throws.
|
|
12
|
+
* - Lazy client — Storage created on first write.
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
15
|
+
*/
|
|
16
|
+
import { Storage } from "@google-cloud/storage";
|
|
17
|
+
import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
|
|
18
|
+
/**
 * Writes run artifacts and the run manifest to a GCS bucket as JSON objects.
 *
 * Object paths for registered artifact types come from `ARTIFACT_REGISTRY`;
 * the manifest is always written to `runs/{runId}/manifest.json`.
 *
 * Non-blocking contract (P5): every write method resolves to `null` and logs
 * a `console.warn` on failure instead of throwing. That covers serialization
 * errors, unknown artifact types and client construction, not only the
 * network call itself.
 */
export class GcsArtifactWriter {
    /** Storage client, created on first use by `getClient()`. */
    client = null;
    /** Writer options; `options.bucket` is the destination bucket name. */
    options;
    /**
     * @param options - Writer options. Only `bucket` is read by this class.
     */
    constructor(options) {
        this.options = options;
    }
    /**
     * Write one artifact as a single JSON object.
     *
     * @param type - Key into `ARTIFACT_REGISTRY`.
     * @param runId - Run identifier used to build the object path.
     * @param data - JSON-serializable payload.
     * @returns The artifact ref, or `null` when the write could not be completed.
     */
    async writeBulk(type, runId, data) {
        try {
            // Registry lookup and entry counting can throw (unknown type,
            // malformed payload); keep them inside the non-blocking boundary.
            const descriptor = ARTIFACT_REGISTRY[type];
            const path = descriptor.objectPath(runId);
            return await this.putJson(path, data, {
                layout: "bulk",
                entryCount: entryCountOf(data),
            });
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ Artifact upload failed (non-blocking): ${type} — ${message}`);
            return null;
        }
    }
    /**
     * Write one JSON object per entry, sequentially.
     *
     * Entries whose key is rejected by the registry's `parseEntryKey`, that
     * cannot be serialized, or whose upload fails are skipped with a warning;
     * the remaining entries are still attempted.
     *
     * @param type - Key into `ARTIFACT_REGISTRY`; its descriptor must define `parseEntryKey`.
     * @param runId - Run identifier used to build the object paths.
     * @param entries - Iterable of `{ key, data }` records.
     * @returns A per-entry artifact ref listing the uploaded keys and byte
     *   sizes, or `null` when nothing was uploaded.
     */
    async writePerEntry(type, runId, entries) {
        const descriptor = ARTIFACT_REGISTRY[type];
        if (!descriptor?.parseEntryKey) {
            console.warn(` ⚠️ writePerEntry called for "${type}" but the registry has no parseEntryKey`);
            return null;
        }
        const uploaded = [];
        let totalBytes = 0;
        try {
            const storage = this.getClient();
            for (const entry of entries) {
                const parsed = descriptor.parseEntryKey(entry.key);
                if (!parsed.ok) {
                    console.warn(` ⚠️ Skipping entry with invalid key "${entry.key}": ${parsed.reason}`);
                    continue;
                }
                const path = descriptor.objectPath(runId, entry.key);
                try {
                    // Serialize inside the try so one unserializable entry
                    // (cycle, BigInt, undefined) cannot abort the whole batch.
                    const json = JSON.stringify(entry.data);
                    const bytes = Buffer.byteLength(json, "utf-8");
                    await storage
                        .bucket(this.options.bucket)
                        .file(path)
                        .save(json, { contentType: "application/json" });
                    uploaded.push({ key: entry.key, bytes });
                    totalBytes += bytes;
                }
                catch (err) {
                    const message = err instanceof Error ? err.message : String(err);
                    console.warn(` ⚠️ Artifact entry upload failed (non-blocking): ${path} — ${message}`);
                }
            }
        }
        catch (err) {
            // Client construction or iteration failed; report whatever was
            // uploaded before the failure rather than throwing.
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ Artifact upload failed (non-blocking): runs/${runId}/${descriptor.slug} — ${message}`);
        }
        if (uploaded.length === 0)
            return null;
        return {
            store: "gcs",
            bucket: this.options.bucket,
            path: `runs/${runId}/${descriptor.slug}`,
            bytes: totalBytes,
            entryCount: uploaded.length,
            layout: "per-entry",
            entries: uploaded,
        };
    }
    /**
     * Write the run manifest to `runs/{runId}/manifest.json`.
     *
     * @returns The artifact ref, or `null` when the write could not be completed.
     */
    async writeManifest(runId, manifest) {
        const path = `runs/${runId}/manifest.json`;
        return this.putJson(path, manifest, { layout: "bulk" });
    }
    /**
     * Serialize `data` and save it at `path` in the configured bucket.
     *
     * @param path - Object path inside the bucket.
     * @param data - JSON-serializable payload.
     * @param meta - `{ layout, entryCount? }` copied onto the returned ref.
     * @returns The artifact ref, or `null` on any failure (logged as a warning).
     */
    async putJson(path, data, meta) {
        try {
            // Serialization is part of the guarded section: JSON.stringify
            // throws on cycles/BigInt and yields undefined for undefined input.
            const json = JSON.stringify(data);
            const bytes = Buffer.byteLength(json, "utf-8");
            const storage = this.getClient();
            await storage
                .bucket(this.options.bucket)
                .file(path)
                .save(json, { contentType: "application/json" });
            return {
                store: "gcs",
                bucket: this.options.bucket,
                path,
                bytes,
                entryCount: meta.entryCount,
                layout: meta.layout,
            };
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            console.warn(` ⚠️ Artifact upload failed (non-blocking): ${path} — ${message}`);
            return null;
        }
    }
    /** Return the cached Storage client, creating it on first call. */
    getClient() {
        if (this.client)
            return this.client;
        this.client = new Storage();
        return this.client;
    }
}
|
|
110
|
+
/**
 * Count the keys of `data.entries` when the payload carries an entries map.
 *
 * @param data - Arbitrary artifact payload.
 * @returns The number of own enumerable keys of `data.entries`, or
 *   `undefined` when `data` is not an object or `entries` is missing, null,
 *   or not an object.
 */
function entryCountOf(data) {
    if (typeof data !== "object" || data === null) {
        return undefined;
    }
    const { entries } = data;
    // `typeof null === "object"`, so null must be excluded explicitly —
    // otherwise Object.keys(null) throws a TypeError.
    if (typeof entries !== "object" || entries === null) {
        return undefined;
    }
    return Object.keys(entries).length;
}
|
package/dist/commands/publish.js
CHANGED
|
@@ -55,7 +55,7 @@ export function createPublishCommand() {
|
|
|
55
55
|
* the summary metadata and environment. Some fields (contextHash,
|
|
56
56
|
* promptfooUrl) are not available for manual publishes.
|
|
57
57
|
*/
|
|
58
|
-
function buildProvenanceFromSummary(summary) {
|
|
58
|
+
function buildProvenanceFromSummary(summary, runId) {
|
|
59
59
|
const areas = summary.scores.map((s) => s.feature);
|
|
60
60
|
const mode = (process.env.EVAL_MODE ?? "literacy");
|
|
61
61
|
const source = {
|
|
@@ -76,6 +76,7 @@ function buildProvenanceFromSummary(summary) {
|
|
|
76
76
|
areas,
|
|
77
77
|
mode,
|
|
78
78
|
rootDir: ROOT,
|
|
79
|
+
runId,
|
|
79
80
|
source,
|
|
80
81
|
};
|
|
81
82
|
}
|
|
@@ -145,7 +146,7 @@ async function runPublishCommand(summaryPath, outputDir, opts) {
|
|
|
145
146
|
// -----------------------------------------------------------------------
|
|
146
147
|
// 2. Build provenance
|
|
147
148
|
// -----------------------------------------------------------------------
|
|
148
|
-
const provenanceInput = buildProvenanceFromSummary(summary);
|
|
149
|
+
const provenanceInput = buildProvenanceFromSummary(summary, ctx.runId);
|
|
149
150
|
const provenance = buildProvenance(provenanceInput);
|
|
150
151
|
// -----------------------------------------------------------------------
|
|
151
152
|
// 3. Create report
|
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
* @see packages/core/src/ports/context.ts — AppContext interface
|
|
16
16
|
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
|
-
import { type AppContext, type
|
|
18
|
+
import { type AppContext, type ArtifactWriter, type AssertionRegistration, type Logger, type ResolvedConfig } from "./_vendor/ailf-core/index.d.ts";
|
|
19
19
|
/**
|
|
20
20
|
* Create a fully wired AppContext from resolved configuration.
|
|
21
21
|
*
|
|
@@ -24,7 +24,7 @@ import { type AppContext, type ArtifactUploader, type AssertionRegistration, typ
|
|
|
24
24
|
*/
|
|
25
25
|
export declare function createAppContext(config: ResolvedConfig): AppContext;
|
|
26
26
|
/**
|
|
27
|
-
* Selects an
|
|
27
|
+
* Selects an ArtifactWriter implementation based on available credentials.
|
|
28
28
|
*
|
|
29
29
|
* Selection order:
|
|
30
30
|
* 1. config.artifactUpload === false → always skip (explicit opt-out)
|
|
@@ -38,7 +38,7 @@ export declare function createAppContext(config: ResolvedConfig): AppContext;
|
|
|
38
38
|
*
|
|
39
39
|
* Exported for unit-test access; not part of the public package API.
|
|
40
40
|
*/
|
|
41
|
-
export declare function
|
|
41
|
+
export declare function createArtifactWriter(config: ResolvedConfig, logger: Logger): ArtifactWriter | undefined;
|
|
42
42
|
/**
|
|
43
43
|
* Generic Promptfoo assertion types available to all evaluation modes.
|
|
44
44
|
*
|
package/dist/composition-root.js
CHANGED
|
@@ -16,11 +16,11 @@
|
|
|
16
16
|
* @see docs/archive/exec-plans/ports-and-adapters/phase-7-composition-root.md
|
|
17
17
|
*/
|
|
18
18
|
import { join } from "node:path";
|
|
19
|
-
import { InMemoryPluginRegistry, NoOpArtifactCollector, } from "./_vendor/ailf-core/index.js";
|
|
20
|
-
import {
|
|
19
|
+
import { InMemoryPluginRegistry, NoOpArtifactCollector, generateRunId, } from "./_vendor/ailf-core/index.js";
|
|
20
|
+
import { ApiGatewayArtifactWriter } from "./artifact-capture/api-gateway-artifact-writer.js";
|
|
21
21
|
import { FilesystemArtifactCollector } from "./artifact-capture/filesystem-collector.js";
|
|
22
22
|
import { GcsArtifactCollector } from "./artifact-capture/gcs-collector.js";
|
|
23
|
-
import {
|
|
23
|
+
import { GcsArtifactWriter } from "./artifact-capture/gcs-artifact-writer.js";
|
|
24
24
|
import { ContentLakeCacheAdapter } from "./adapters/cache/content-lake-cache.js";
|
|
25
25
|
import { loadExternalPresets } from "./pipeline/compiler/preset-loader.js";
|
|
26
26
|
import { FilesystemCache } from "./adapters/cache/filesystem-cache.js";
|
|
@@ -82,13 +82,17 @@ export function createAppContext(config) {
|
|
|
82
82
|
})
|
|
83
83
|
: fsCollector;
|
|
84
84
|
}
|
|
85
|
-
//
|
|
86
|
-
// paths
|
|
87
|
-
//
|
|
88
|
-
//
|
|
89
|
-
const
|
|
85
|
+
// Artifact writer — writes run artifacts + manifest to GCS at known
|
|
86
|
+
// `runs/{runId}/…` paths (D0032). Auto-detects the right adapter from
|
|
87
|
+
// available credentials; defaults bucket to "ailf-artifacts". Set
|
|
88
|
+
// artifactUpload: false to opt out entirely.
|
|
89
|
+
const artifactWriter = createArtifactWriter(config, logger);
|
|
90
|
+
// Generate the pipeline's RunId once; every downstream step reads it
|
|
91
|
+
// from the context (D0032).
|
|
92
|
+
const runId = generateRunId();
|
|
93
|
+
logger.debug(`Pipeline runId: ${runId}`);
|
|
90
94
|
return {
|
|
91
|
-
|
|
95
|
+
artifactWriter,
|
|
92
96
|
cache,
|
|
93
97
|
collector,
|
|
94
98
|
config,
|
|
@@ -97,6 +101,7 @@ export function createAppContext(config) {
|
|
|
97
101
|
logger,
|
|
98
102
|
registry,
|
|
99
103
|
reportStore,
|
|
104
|
+
runId,
|
|
100
105
|
sinks,
|
|
101
106
|
taskSource,
|
|
102
107
|
};
|
|
@@ -124,7 +129,7 @@ function createLogger() {
|
|
|
124
129
|
*/
|
|
125
130
|
const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
|
|
126
131
|
/**
|
|
127
|
-
* Selects an
|
|
132
|
+
* Selects an ArtifactWriter implementation based on available credentials.
|
|
128
133
|
*
|
|
129
134
|
* Selection order:
|
|
130
135
|
* 1. config.artifactUpload === false → always skip (explicit opt-out)
|
|
@@ -138,7 +143,7 @@ const DEFAULT_ARTIFACT_BUCKET = "ailf-artifacts";
|
|
|
138
143
|
*
|
|
139
144
|
* Exported for unit-test access; not part of the public package API.
|
|
140
145
|
*/
|
|
141
|
-
export function
|
|
146
|
+
export function createArtifactWriter(config, logger) {
|
|
142
147
|
if (config.artifactUpload === false) {
|
|
143
148
|
logger.debug("Artifact upload explicitly disabled via artifactUpload=false");
|
|
144
149
|
return undefined;
|
|
@@ -148,13 +153,13 @@ export function createArtifactUploader(config, logger) {
|
|
|
148
153
|
// We treat the presence of either env var as the user opting in to ADC.
|
|
149
154
|
const hasGcsCredentials = Boolean(process.env.GOOGLE_APPLICATION_CREDENTIALS || process.env.GCLOUD_PROJECT);
|
|
150
155
|
if (hasGcsCredentials) {
|
|
151
|
-
logger.debug(`Artifact
|
|
152
|
-
return new
|
|
156
|
+
logger.debug(`Artifact writer: GcsArtifactWriter (direct GCS via ADC, bucket=${bucket})`);
|
|
157
|
+
return new GcsArtifactWriter({ bucket });
|
|
153
158
|
}
|
|
154
159
|
// Local dev — request signed PUT URLs from the API gateway, no GCS creds needed.
|
|
155
160
|
if (config.apiKey && config.apiUrl) {
|
|
156
|
-
logger.debug(`Artifact
|
|
157
|
-
return new
|
|
161
|
+
logger.debug(`Artifact writer: ApiGatewayArtifactWriter (signed URL via ${config.apiUrl}, bucket=${bucket})`);
|
|
162
|
+
return new ApiGatewayArtifactWriter({
|
|
158
163
|
apiBaseUrl: config.apiUrl,
|
|
159
164
|
apiKey: config.apiKey,
|
|
160
165
|
bucket,
|
|
@@ -11,6 +11,7 @@ import { CalculateScoresStep } from "./steps/calculate-scores-step.js";
|
|
|
11
11
|
import { CompareStep } from "./steps/compare-step.js";
|
|
12
12
|
import { DiscoveryReportStep } from "./steps/discovery-report-step.js";
|
|
13
13
|
import { FetchDocsStep } from "./steps/fetch-docs-step.js";
|
|
14
|
+
import { FinalizeRunStep } from "./steps/finalize-run-step.js";
|
|
14
15
|
import { GapAnalysisStep } from "./steps/gap-analysis-step.js";
|
|
15
16
|
import { GenerateConfigsStep } from "./steps/generate-configs-step.js";
|
|
16
17
|
import { GraderConsistencyStep } from "./steps/grader-consistency-step.js";
|
|
@@ -76,7 +77,11 @@ export function buildStepSequence(ctx, pipelineStart = Date.now()) {
|
|
|
76
77
|
if (config.gapAnalysisEnabled) {
|
|
77
78
|
steps.push(new GapAnalysisStep());
|
|
78
79
|
}
|
|
79
|
-
// Step
|
|
80
|
+
// Step 4c: Finalize the run — write `runs/{runId}/manifest.json` with the
|
|
81
|
+
// catalog of artifacts produced so far. Skipped silently when no
|
|
82
|
+
// artifactWriter is wired (D0032).
|
|
83
|
+
steps.push(new FinalizeRunStep(pipelineStart));
|
|
84
|
+
// Step 4d: Publish report (optional, when token is configured)
|
|
80
85
|
if (config.publishEnabled) {
|
|
81
86
|
steps.push(new PublishReportStep(pipelineStart, {
|
|
82
87
|
publishTag: config.publishTag,
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
* Calls calculateAndWriteScores() from pipeline/calculate-scores.ts with
|
|
5
5
|
* typed options derived from AppContext. No env bridge needed.
|
|
6
6
|
*/
|
|
7
|
-
import { existsSync } from "node:fs";
|
|
8
|
-
import { join } from "path";
|
|
7
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
8
|
+
import { join, resolve } from "path";
|
|
9
9
|
import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
|
|
10
10
|
import { getStepInputPaths } from "../../pipeline/cache.js";
|
|
11
11
|
import { buildCacheContext } from "../cache-context.js";
|
|
@@ -13,6 +13,7 @@ import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
|
|
|
13
13
|
import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
|
|
14
14
|
import { resultsFileForMode } from "../../pipeline/eval-constants.js";
|
|
15
15
|
import { loadSource } from "../../sources.js";
|
|
16
|
+
import { uploadTestOutputs } from "../../pipeline/upload-test-outputs.js";
|
|
16
17
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
17
18
|
export class CalculateScoresStep {
|
|
18
19
|
name = "calculate-scores";
|
|
@@ -132,6 +133,27 @@ export class CalculateScoresStep {
|
|
|
132
133
|
ctx.collector.captureFile("calculate-scores", file.replace(".json", ""), filePath);
|
|
133
134
|
}
|
|
134
135
|
}
|
|
136
|
+
// Upload testOutputs to GCS (D0032 — non-blocking, P5).
|
|
137
|
+
// Read from test-results.json rather than score-summary.json: the
|
|
138
|
+
// gap-analysis step (downstream) is the one that enriches score-summary
|
|
139
|
+
// with testResults, so at this point the summary still has an empty
|
|
140
|
+
// testResults[]. test-results.json is written by calculateAndWriteScores
|
|
141
|
+
// above and carries the full per-test shape we need for per-entry upload.
|
|
142
|
+
// The full responseOutput lives in the GCS artifact; PublishReportStep
|
|
143
|
+
// later strips it from the inline Content Lake document when this
|
|
144
|
+
// upload succeeds.
|
|
145
|
+
if (ctx.artifactWriter) {
|
|
146
|
+
const testResults = tryReadTestResults(ctx.config.rootDir);
|
|
147
|
+
if (testResults?.length) {
|
|
148
|
+
const artifactRef = await uploadTestOutputs(ctx.artifactWriter, ctx.runId, testResults);
|
|
149
|
+
if (artifactRef) {
|
|
150
|
+
state.artifactRefs = {
|
|
151
|
+
...state.artifactRefs,
|
|
152
|
+
testOutputs: artifactRef,
|
|
153
|
+
};
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
}
|
|
135
157
|
const criticalSuffix = belowCritical.length > 0
|
|
136
158
|
? ` (${belowCritical.length} area(s) below critical threshold: ${belowCritical.join(", ")})`
|
|
137
159
|
: "";
|
|
@@ -148,3 +170,21 @@ export class CalculateScoresStep {
|
|
|
148
170
|
return buildCacheContext(ctx.config);
|
|
149
171
|
}
|
|
150
172
|
}
|
|
173
|
+
/**
 * Read the per-test result set written by `calculateAndWriteScores`.
 *
 * This is the authoritative source for `uploadTestOutputs` at the time
 * CalculateScoresStep runs — `score-summary.json` doesn't carry
 * `testResults[]` until `gap-analysis-step` enriches it downstream.
 *
 * Best-effort: a missing file, unreadable file, invalid JSON, or a JSON
 * value that is not an array all yield `undefined` so the caller simply
 * skips the upload.
 *
 * @param rootDir - Project root; the file is read from
 *   `<rootDir>/results/latest/test-results.json`.
 * @returns The parsed array of test results, or `undefined`.
 */
function tryReadTestResults(rootDir) {
    const path = resolve(rootDir, "results", "latest", "test-results.json");
    if (!existsSync(path)) {
        return undefined;
    }
    try {
        const parsed = JSON.parse(readFileSync(path, "utf-8"));
        // The caller gates on `.length` and then iterates; a JSON string
        // would pass that gate, so only hand back real arrays.
        return Array.isArray(parsed) ? parsed : undefined;
    }
    catch {
        return undefined;
    }
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
|
|
3
|
+
*
|
|
4
|
+
* Inserts between `GapAnalysis` and `PublishReport`. Assembles a
|
|
5
|
+
* `RunManifest` from `state.artifactRefs` (populated by producer steps)
|
|
6
|
+
* and the shared `RunContext` (via `buildRunContext`), then writes it to
|
|
7
|
+
* `runs/{runId}/manifest.json`. The written manifest becomes the source
|
|
8
|
+
* of truth for artifact locations; `PublishReportStep` snapshots the
|
|
9
|
+
* `artifacts` slice into `Report.artifactManifest` (D0032).
|
|
10
|
+
*
|
|
11
|
+
* Design principles:
|
|
12
|
+
* - Single writer — one `writeManifest()` call per pipeline run.
|
|
13
|
+
* - Idempotent — retries produce the same manifest bytes for the same inputs.
|
|
14
|
+
* - Skipped when no writer is wired (local/air-gapped runs stay functional).
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
|
+
*/
|
|
18
|
+
import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
19
|
+
export declare class FinalizeRunStep implements PipelineStep {
|
|
20
|
+
private readonly pipelineStart;
|
|
21
|
+
private readonly options;
|
|
22
|
+
readonly name = "finalize-run";
|
|
23
|
+
readonly optional = true;
|
|
24
|
+
constructor(pipelineStart: number, options?: {
|
|
25
|
+
evalFingerprint?: string;
|
|
26
|
+
});
|
|
27
|
+
check(): ValidationIssue[];
|
|
28
|
+
execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
|
|
29
|
+
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline step: FinalizeRunStep — writes the run manifest at pipeline end.
|
|
3
|
+
*
|
|
4
|
+
* Inserts between `GapAnalysis` and `PublishReport`. Assembles a
|
|
5
|
+
* `RunManifest` from `state.artifactRefs` (populated by producer steps)
|
|
6
|
+
* and the shared `RunContext` (via `buildRunContext`), then writes it to
|
|
7
|
+
* `runs/{runId}/manifest.json`. The written manifest becomes the source
|
|
8
|
+
* of truth for artifact locations; `PublishReportStep` snapshots the
|
|
9
|
+
* `artifacts` slice into `Report.artifactManifest` (D0032).
|
|
10
|
+
*
|
|
11
|
+
* Design principles:
|
|
12
|
+
* - Single writer — one `writeManifest()` call per pipeline run.
|
|
13
|
+
* - Idempotent — retries produce the same manifest bytes for the same inputs.
|
|
14
|
+
* - Skipped when no writer is wired (local/air-gapped runs stay functional).
|
|
15
|
+
*
|
|
16
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
17
|
+
*/
|
|
18
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
19
|
+
import { resolve } from "node:path";
|
|
20
|
+
import { buildRunContext } from "../../pipeline/run-context.js";
|
|
21
|
+
import { loadSource } from "../../sources.js";
|
|
22
|
+
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
23
|
+
/**
 * Optional pipeline step that assembles the run manifest from pipeline state
 * and hands it to the context's artifact writer.
 */
export class FinalizeRunStep {
    pipelineStart;
    options;
    name = "finalize-run";
    optional = true;
    /**
     * @param pipelineStart - Epoch milliseconds at which the pipeline started;
     *   used to compute the manifest's `durationMs`.
     * @param options - Optional `{ evalFingerprint }` fallback used when the
     *   pipeline state does not carry one.
     */
    constructor(pipelineStart, options = {}) {
        this.pipelineStart = pipelineStart;
        this.options = options;
    }
    /** This step has no preconditions to validate. */
    check() {
        return [];
    }
    /**
     * Build the manifest, write it via `ctx.artifactWriter`, and store it on
     * `state.runManifest` whether or not the write produced a ref.
     *
     * @returns A "skipped" result when no writer is wired, otherwise a
     *   "success" result whose summary says whether the write produced a ref.
     */
    async execute(ctx, state) {
        const startedAt = Date.now();
        const writer = ctx.artifactWriter;
        if (!writer) {
            return {
                status: "skipped",
                reason: "No artifactWriter wired — manifest is only written when a writer is available",
            };
        }
        const { config } = ctx;
        // Resolve the source from config plus its overrides.
        const source = loadSource(config.source, configToSourceOverrides(config));
        // The on-disk summary is optional: when it is absent the areas fall
        // back to the configured ones so the manifest is still produced.
        const scoreSummary = tryReadScoreSummary(config.rootDir);
        const areas = scoreSummary?.scores?.map((score) => score.feature) ?? config.areas ?? [];
        const context = buildRunContext({
            areas,
            callerGit: config.callerGit,
            evalFingerprint: state.evalFingerprint ?? this.options.evalFingerprint,
            logger: ctx.logger,
            mode: config.mode,
            rootDir: config.rootDir,
            source,
            taskIds: config.tasks,
        });
        // Key order is kept stable because it determines the serialized bytes.
        const manifest = {
            version: 1,
            runId: ctx.runId,
            createdAt: new Date().toISOString(),
            durationMs: Date.now() - this.pipelineStart,
            status: "completed",
            context,
            outcomes: state.testSummary
                ? { testSummary: state.testSummary }
                : undefined,
            promptfooUrls: state.promptfooUrls,
            artifacts: state.artifactRefs ?? {},
        };
        const manifestRef = await writer.writeManifest(ctx.runId, manifest);
        // Populate state in both outcomes so publish can snapshot `artifacts`
        // even when the manifest was not persisted (the writer already warned).
        state.runManifest = manifest;
        let summary;
        if (manifestRef) {
            const artifactCount = Object.keys(manifest.artifacts).length;
            const plural = artifactCount === 1 ? "" : "s";
            summary = `Run manifest written to ${manifestRef.path} (${artifactCount} artifact ref${plural})`;
        }
        else {
            summary = "Run manifest computed (GCS write failed — non-blocking)";
        }
        return {
            durationMs: Date.now() - startedAt,
            status: "success",
            summary,
        };
    }
}
|
|
93
|
+
/**
 * Best-effort read of `<rootDir>/results/latest/score-summary.json`.
 *
 * @param rootDir - Project root directory.
 * @returns The parsed JSON value, or `undefined` when the file does not
 *   exist, cannot be read, or does not contain valid JSON.
 */
function tryReadScoreSummary(rootDir) {
    const summaryFile = resolve(rootDir, "results", "latest", "score-summary.json");
    let parsed;
    if (existsSync(summaryFile)) {
        try {
            const raw = readFileSync(summaryFile, "utf-8");
            parsed = JSON.parse(raw);
        }
        catch {
            // Unreadable or malformed summary is treated the same as absent.
            parsed = undefined;
        }
    }
    return parsed;
}
|
|
@@ -113,21 +113,24 @@ export class PublishReportStep {
|
|
|
113
113
|
tag: this.options.publishTag ?? ctx.config.publishTag,
|
|
114
114
|
title,
|
|
115
115
|
};
|
|
116
|
-
//
|
|
117
|
-
//
|
|
118
|
-
//
|
|
119
|
-
//
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
116
|
+
// Snapshot the artifact manifest from FinalizeRunStep's output (D0032).
|
|
117
|
+
// The source of truth is `runs/{runId}/manifest.json` in GCS; the report
|
|
118
|
+
// carries a denormalized copy so Studio can render drill-down state
|
|
119
|
+
// without an extra GCS fetch.
|
|
120
|
+
const artifactManifest = state.runManifest?.artifacts;
|
|
121
|
+
if (artifactManifest && Object.keys(artifactManifest).length > 0) {
|
|
122
|
+
report.artifactManifest = artifactManifest;
|
|
123
|
+
}
|
|
124
|
+
// When testOutputs was uploaded to GCS, strip responseOutput from the
|
|
125
|
+
// inline testResults[] so the Content Lake document stays slim — the
|
|
126
|
+
// full output lives in the GCS artifact. When no testOutputs artifact
|
|
127
|
+
// exists, leave the inline shape intact so Studio's drill-down UI
|
|
128
|
+
// falls back to it.
|
|
129
|
+
if (artifactManifest?.testOutputs && summary.testResults?.length) {
|
|
130
|
+
report.summary = {
|
|
131
|
+
...summary,
|
|
132
|
+
testResults: summary.testResults.map(slimTestResult),
|
|
133
|
+
};
|
|
131
134
|
}
|
|
132
135
|
// Share reportId with downstream steps (CallbackStep + orchestrator job update)
|
|
133
136
|
state.reportId = reportId;
|
|
@@ -221,6 +224,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
|
|
|
221
224
|
mode,
|
|
222
225
|
promptfooUrls: options.promptfooUrls,
|
|
223
226
|
rootDir: ctx.config.rootDir,
|
|
227
|
+
runId: ctx.runId,
|
|
224
228
|
sanityDocumentIds,
|
|
225
229
|
source,
|
|
226
230
|
sourceReportId: ctx.config.sourceReportId,
|
|
@@ -236,30 +240,6 @@ function slimTestResult(tr) {
|
|
|
236
240
|
const { responseOutput: _o, responseOutputTruncated: _t, ...rest } = tr;
|
|
237
241
|
return rest;
|
|
238
242
|
}
|
|
239
|
-
/**
|
|
240
|
-
* Extract test outputs from StoredTestResult[] and upload as a single
|
|
241
|
-
* JSON artifact to GCS. The artifact is keyed by `{taskId}::{modelId}`
|
|
242
|
-
* to match the lookup pattern in Studio's JudgmentList component.
|
|
243
|
-
*
|
|
244
|
-
* Non-blocking: returns null if upload fails (P5).
|
|
245
|
-
*/
|
|
246
|
-
async function uploadTestOutputs(uploader, reportId, createdAt, testResults) {
|
|
247
|
-
const entries = {};
|
|
248
|
-
for (const tr of testResults) {
|
|
249
|
-
const key = `${tr.taskId}::${tr.modelId}`;
|
|
250
|
-
entries[key] = {
|
|
251
|
-
responseOutput: tr.responseOutput ?? "",
|
|
252
|
-
responseOutputTruncated: tr.responseOutputTruncated ?? false,
|
|
253
|
-
};
|
|
254
|
-
}
|
|
255
|
-
const artifact = {
|
|
256
|
-
version: 1,
|
|
257
|
-
reportId,
|
|
258
|
-
createdAt,
|
|
259
|
-
entries,
|
|
260
|
-
};
|
|
261
|
-
return uploader.upload(reportId, "test-outputs.json", artifact);
|
|
262
|
-
}
|
|
263
243
|
/**
|
|
264
244
|
* Fan out a report to all configured sinks.
|
|
265
245
|
*
|
|
@@ -157,8 +157,19 @@ export function extractGraderJudgments(resultsPath) {
|
|
|
157
157
|
}
|
|
158
158
|
return judgments;
|
|
159
159
|
}
|
|
160
|
-
/**
|
|
161
|
-
|
|
160
|
+
/**
|
|
161
|
+
* Maximum characters (JS string length, not bytes) to store for model
|
|
162
|
+
* response output. ASCII-heavy responses at this cap JSON-encode to ~1 MB;
|
|
163
|
+
* pathological multi-byte UTF-8 could encode to ~4 MB, still well within
|
|
164
|
+
* per-entry GCS object limits.
|
|
165
|
+
*
|
|
166
|
+
* Raised from 8 000 to 1 000 000 in W0048 because the per-entry artifact
|
|
167
|
+
* layout (D0032) makes the cap irrelevant to Studio's fetch cost — each
|
|
168
|
+
* entry is fetched independently on click, so a larger ceiling only costs
|
|
169
|
+
* GCS bytes, not main-thread blocking or baseline report payload.
|
|
170
|
+
* `responseOutputTruncated` still flips for the extreme tail.
|
|
171
|
+
*/
|
|
172
|
+
const MAX_RESPONSE_OUTPUT_LENGTH = 1_000_000;
|
|
162
173
|
/**
|
|
163
174
|
* Extract per-test results with model output from evaluation results.
|
|
164
175
|
*
|