@sanity/ailf 3.1.1 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +1 -1
- package/dist/_vendor/ailf-core/artifact-registry.js +188 -9
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +8 -6
- package/dist/_vendor/ailf-core/types/branded-ids.js +35 -24
- package/dist/_vendor/ailf-core/types/index.d.ts +11 -4
- package/dist/_vendor/ailf-core/types/scoring-input.d.ts +6 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.js +5 -3
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +11 -7
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +9 -6
- package/dist/artifact-capture/fanout-artifact-writer.js +9 -6
- package/dist/artifact-capture/gcs-artifact-writer.js +17 -22
- package/dist/artifact-capture/prepare-upload-body.d.ts +27 -0
- package/dist/artifact-capture/prepare-upload-body.js +36 -0
- package/dist/composition-root.js +7 -2
- package/dist/orchestration/pipeline-orchestrator.js +34 -5
- package/dist/pipeline/calculate-scores.d.ts +6 -0
- package/dist/pipeline/calculate-scores.js +2 -0
- package/package.json +1 -1
|
@@ -41,7 +41,7 @@ export type ArtifactMime = "application/json" | "application/x-ndjson" | "text/m
|
|
|
41
41
|
*/
|
|
42
42
|
export type ArtifactTruncationPolicy = "reject" | "trailing-truncate" | "fielded-truncate" | "trial-oversize";
|
|
43
43
|
/** The union of every artifact type known to AILF. */
|
|
44
|
-
export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "
|
|
44
|
+
export type ArtifactType = "runManifest" | "scoreSummary" | "pipelineResult" | "pipelineContext" | "documentManifest" | "prComment" | "readinessReport" | "reportSnapshot" | "autoComparison" | "gapReport" | "sinkResults" | "callbackRequest" | "callbackResponse" | "configSnapshot" | "evalConfigGenerated" | "comparisonReport" | "discoveryReport" | "failureModes" | "renderedPrompts" | "rawResults" | "testOutputs" | "graderPrompts" | "graderJudgments" | "traces";
|
|
45
45
|
/**
|
|
46
46
|
* Result of parsing a per-entry key into a sanitized filename component.
|
|
47
47
|
* Success carries the sanitized value; failure carries a reason for 4xx responses.
|
|
@@ -255,6 +255,71 @@ const graderJudgmentPreviewSchema = z.object({
|
|
|
255
255
|
reasonPreview: z.string().max(280),
|
|
256
256
|
dimensionScores: z.record(z.string(), z.number()).optional(),
|
|
257
257
|
});
|
|
258
|
+
/**
|
|
259
|
+
* Preview shape for `rawResults` manifest entries. A per-call projection
|
|
260
|
+
* over the Promptfoo result payload — cost, latency, pass/fail, and token
|
|
261
|
+
* usage — so a Studio list row can render without signing the full
|
|
262
|
+
* artifact (which routinely runs to tens of KB of response text).
|
|
263
|
+
*
|
|
264
|
+
* Projected fields mirror top-level Promptfoo result keys (`success`,
|
|
265
|
+
* `score`, `cost`, `latencyMs`) and `response.finishReason` /
|
|
266
|
+
* `response.tokenUsage.total` for the two useful nested numbers.
|
|
267
|
+
*/
|
|
268
|
+
const rawResultPreviewSchema = z.object({
|
|
269
|
+
passed: z.boolean(),
|
|
270
|
+
score: z.number().optional(),
|
|
271
|
+
cost: z.number().nonnegative().optional(),
|
|
272
|
+
latencyMs: z.number().int().nonnegative().optional(),
|
|
273
|
+
totalTokens: z.number().int().nonnegative().optional(),
|
|
274
|
+
finishReason: z.string().max(40).optional(),
|
|
275
|
+
});
|
|
276
|
+
/**
|
|
277
|
+
* Preview shape for the run-scoped `scoreSummary` bulk artifact. Projects
|
|
278
|
+
* the composite score, run-level cost/token totals, the weakest area, and
|
|
279
|
+
* the top 3 scoring areas — enough for a Studio rollup row without pulling
|
|
280
|
+
* the full summary (which carries enriched per-test data and can exceed
|
|
281
|
+
* 100 KB on multi-area runs).
|
|
282
|
+
*/
|
|
283
|
+
const scoreSummaryPreviewSchema = z.object({
|
|
284
|
+
composite: z.number(),
|
|
285
|
+
testCount: z.number().int().nonnegative().optional(),
|
|
286
|
+
lowestArea: z.string().max(60).optional(),
|
|
287
|
+
lowestScore: z.number().optional(),
|
|
288
|
+
totalCost: z.number().nonnegative().optional(),
|
|
289
|
+
totalTokens: z.number().int().nonnegative().optional(),
|
|
290
|
+
topAreas: z
|
|
291
|
+
.array(z.object({
|
|
292
|
+
feature: z.string().max(60),
|
|
293
|
+
score: z.number(),
|
|
294
|
+
}))
|
|
295
|
+
.max(3)
|
|
296
|
+
.optional(),
|
|
297
|
+
});
|
|
298
|
+
/**
|
|
299
|
+
* Preview shape for `renderedPrompts` manifest entries (W0061 / D0033 M7).
|
|
300
|
+
* Lets the Studio PromptReplayDrawer surface a quick-look pane — prompt
|
|
301
|
+
* char count, the variant label (e.g. "Baseline (No Docs)" / "Gold (with
|
|
302
|
+
* docs)") as a secondary signal, and a short leading snippet — without
|
|
303
|
+
* round-tripping the full rendered prompt (which can run 50+ KB on
|
|
304
|
+
* multi-turn conversations).
|
|
305
|
+
*/
|
|
306
|
+
const renderedPromptPreviewSchema = z.object({
|
|
307
|
+
promptCharCount: z.number().int().nonnegative(),
|
|
308
|
+
label: z.string().max(60).optional(),
|
|
309
|
+
snippet: z.string().max(120),
|
|
310
|
+
});
|
|
311
|
+
/**
|
|
312
|
+
* Preview shape for `graderPrompts` manifest entries (W0061 / D0033 M7).
|
|
313
|
+
* `rubricName` maps to the classified dimension (e.g. "correctness",
|
|
314
|
+
* "completeness"); char count and snippet cover the rubric text itself so
|
|
315
|
+
* the Studio drawer can render a preview before the full assertion payload
|
|
316
|
+
* is signed and fetched.
|
|
317
|
+
*/
|
|
318
|
+
const graderPromptPreviewSchema = z.object({
|
|
319
|
+
rubricCharCount: z.number().int().nonnegative(),
|
|
320
|
+
rubricName: z.string().max(60).optional(),
|
|
321
|
+
snippet: z.string().max(120),
|
|
322
|
+
});
|
|
258
323
|
// Aspirational: most payload shapes are still loose. Tightening per-type as
|
|
259
324
|
// consumers stabilize is explicitly a W0050/W0051 concern — W0049 fixes the
|
|
260
325
|
// structural shape around them without changing the payload contracts.
|
|
@@ -361,6 +426,57 @@ export const ARTIFACT_REGISTRY = {
|
|
|
361
426
|
entrySchema: unknownEntry,
|
|
362
427
|
mime: "application/json",
|
|
363
428
|
capBytes: 1_000_000,
|
|
429
|
+
manifestPreview: {
|
|
430
|
+
schema: scoreSummaryPreviewSchema,
|
|
431
|
+
extract: (entry) => {
|
|
432
|
+
const e = entry;
|
|
433
|
+
const avgScore = typeof e.overall?.avgScore === "number" ? e.overall.avgScore : 0;
|
|
434
|
+
const totalCost = typeof e.overall?.cost?.total === "number"
|
|
435
|
+
? e.overall.cost.total
|
|
436
|
+
: undefined;
|
|
437
|
+
const rawTotalTokens = e.overall?.cost?.totalTokens;
|
|
438
|
+
const totalTokens = typeof rawTotalTokens === "number" &&
|
|
439
|
+
Number.isFinite(rawTotalTokens) &&
|
|
440
|
+
rawTotalTokens >= 0
|
|
441
|
+
? Math.trunc(rawTotalTokens)
|
|
442
|
+
: undefined;
|
|
443
|
+
const scores = Array.isArray(e.scores) ? e.scores : [];
|
|
444
|
+
const areas = [];
|
|
445
|
+
for (const s of scores) {
|
|
446
|
+
if (s === null || typeof s !== "object")
|
|
447
|
+
continue;
|
|
448
|
+
const rec = s;
|
|
449
|
+
if (typeof rec.feature !== "string")
|
|
450
|
+
continue;
|
|
451
|
+
if (typeof rec.totalScore !== "number")
|
|
452
|
+
continue;
|
|
453
|
+
const testCount = typeof rec.testCount === "number" ? rec.testCount : undefined;
|
|
454
|
+
areas.push({
|
|
455
|
+
feature: rec.feature,
|
|
456
|
+
totalScore: rec.totalScore,
|
|
457
|
+
...(testCount !== undefined ? { testCount } : {}),
|
|
458
|
+
});
|
|
459
|
+
}
|
|
460
|
+
const topAreas = areas
|
|
461
|
+
.slice()
|
|
462
|
+
.sort((a, b) => b.totalScore - a.totalScore)
|
|
463
|
+
.slice(0, 3)
|
|
464
|
+
.map((a) => ({ feature: a.feature, score: a.totalScore }));
|
|
465
|
+
const testCount = areas.reduce((sum, a) => sum + (a.testCount ?? 0), 0);
|
|
466
|
+
const lowestArea = typeof e.lowestArea === "string" ? e.lowestArea : undefined;
|
|
467
|
+
const lowestScore = typeof e.lowestScore === "number" ? e.lowestScore : undefined;
|
|
468
|
+
return {
|
|
469
|
+
composite: Math.round(avgScore),
|
|
470
|
+
...(testCount > 0 ? { testCount } : {}),
|
|
471
|
+
...(lowestArea === undefined ? {} : { lowestArea }),
|
|
472
|
+
...(lowestScore === undefined ? {} : { lowestScore }),
|
|
473
|
+
...(totalCost === undefined ? {} : { totalCost }),
|
|
474
|
+
...(totalTokens === undefined ? {} : { totalTokens }),
|
|
475
|
+
...(topAreas.length > 0 ? { topAreas } : {}),
|
|
476
|
+
};
|
|
477
|
+
},
|
|
478
|
+
capBytes: 512,
|
|
479
|
+
},
|
|
364
480
|
}),
|
|
365
481
|
pipelineResult: buildDescriptor({
|
|
366
482
|
type: "pipelineResult",
|
|
@@ -534,15 +650,6 @@ export const ARTIFACT_REGISTRY = {
|
|
|
534
650
|
},
|
|
535
651
|
}),
|
|
536
652
|
// -- Run × Mode × Task(+…) -------------------------------------------------
|
|
537
|
-
taskDefinitions: buildDescriptor({
|
|
538
|
-
type: "taskDefinitions",
|
|
539
|
-
slug: "task-definitions",
|
|
540
|
-
layout: "per-entry",
|
|
541
|
-
axes: ["run", "mode", "task"],
|
|
542
|
-
entrySchema: unknownEntry,
|
|
543
|
-
mime: "application/json",
|
|
544
|
-
capBytes: 256_000,
|
|
545
|
-
}),
|
|
546
653
|
renderedPrompts: buildDescriptor({
|
|
547
654
|
type: "renderedPrompts",
|
|
548
655
|
slug: "rendered-prompts",
|
|
@@ -551,6 +658,29 @@ export const ARTIFACT_REGISTRY = {
|
|
|
551
658
|
entrySchema: unknownEntry,
|
|
552
659
|
mime: "application/json",
|
|
553
660
|
capBytes: 1_000_000,
|
|
661
|
+
manifestPreview: {
|
|
662
|
+
schema: renderedPromptPreviewSchema,
|
|
663
|
+
extract: (entry) => {
|
|
664
|
+
// Producer shape from `emit-eval-results.ts`:
|
|
665
|
+
// { prompt: { raw: string, label: string, config: {...} },
|
|
666
|
+
// provider: { id: string, label: string } }
|
|
667
|
+
// The prompt text lives at `prompt.raw`; `prompt.label` carries the
|
|
668
|
+
// human-readable variant name ("Baseline (No Docs)" / "Gold (with
|
|
669
|
+
// docs)") and is a more useful secondary signal than a
|
|
670
|
+
// provider-level format hint (which isn't exposed here).
|
|
671
|
+
const e = entry;
|
|
672
|
+
const raw = typeof e.prompt?.raw === "string" ? e.prompt.raw : "";
|
|
673
|
+
const label = typeof e.prompt?.label === "string"
|
|
674
|
+
? truncateString(e.prompt.label, 60)
|
|
675
|
+
: undefined;
|
|
676
|
+
return {
|
|
677
|
+
promptCharCount: raw.length,
|
|
678
|
+
...(label === undefined ? {} : { label }),
|
|
679
|
+
snippet: truncateString(raw, 120),
|
|
680
|
+
};
|
|
681
|
+
},
|
|
682
|
+
capBytes: 256,
|
|
683
|
+
},
|
|
554
684
|
}),
|
|
555
685
|
rawResults: buildDescriptor({
|
|
556
686
|
type: "rawResults",
|
|
@@ -560,6 +690,39 @@ export const ARTIFACT_REGISTRY = {
|
|
|
560
690
|
entrySchema: unknownEntry,
|
|
561
691
|
mime: "application/json",
|
|
562
692
|
capBytes: 1_000_000,
|
|
693
|
+
manifestPreview: {
|
|
694
|
+
schema: rawResultPreviewSchema,
|
|
695
|
+
extract: (entry) => {
|
|
696
|
+
const e = entry;
|
|
697
|
+
const passed = e.success === true;
|
|
698
|
+
const score = typeof e.score === "number" ? e.score : undefined;
|
|
699
|
+
const cost = typeof e.cost === "number" && e.cost >= 0 ? e.cost : undefined;
|
|
700
|
+
const rawLatency = e.latencyMs;
|
|
701
|
+
const latencyMs = typeof rawLatency === "number" &&
|
|
702
|
+
Number.isFinite(rawLatency) &&
|
|
703
|
+
rawLatency >= 0
|
|
704
|
+
? Math.trunc(rawLatency)
|
|
705
|
+
: undefined;
|
|
706
|
+
const rawTotalTokens = e.response?.tokenUsage?.total;
|
|
707
|
+
const totalTokens = typeof rawTotalTokens === "number" &&
|
|
708
|
+
Number.isFinite(rawTotalTokens) &&
|
|
709
|
+
rawTotalTokens >= 0
|
|
710
|
+
? Math.trunc(rawTotalTokens)
|
|
711
|
+
: undefined;
|
|
712
|
+
const finishReason = typeof e.response?.finishReason === "string"
|
|
713
|
+
? truncateString(e.response.finishReason, 40)
|
|
714
|
+
: undefined;
|
|
715
|
+
return {
|
|
716
|
+
passed,
|
|
717
|
+
...(score === undefined ? {} : { score }),
|
|
718
|
+
...(cost === undefined ? {} : { cost }),
|
|
719
|
+
...(latencyMs === undefined ? {} : { latencyMs }),
|
|
720
|
+
...(totalTokens === undefined ? {} : { totalTokens }),
|
|
721
|
+
...(finishReason === undefined ? {} : { finishReason }),
|
|
722
|
+
};
|
|
723
|
+
},
|
|
724
|
+
capBytes: 256,
|
|
725
|
+
},
|
|
563
726
|
}),
|
|
564
727
|
testOutputs: buildDescriptor({
|
|
565
728
|
type: "testOutputs",
|
|
@@ -593,6 +756,22 @@ export const ARTIFACT_REGISTRY = {
|
|
|
593
756
|
entrySchema: unknownEntry,
|
|
594
757
|
mime: "application/json",
|
|
595
758
|
capBytes: 512_000,
|
|
759
|
+
manifestPreview: {
|
|
760
|
+
schema: graderPromptPreviewSchema,
|
|
761
|
+
extract: (entry) => {
|
|
762
|
+
const e = entry;
|
|
763
|
+
const rubricText = typeof e.assertion?.value === "string" ? e.assertion.value : "";
|
|
764
|
+
const rubricName = typeof e.dimension === "string"
|
|
765
|
+
? truncateString(e.dimension, 60)
|
|
766
|
+
: undefined;
|
|
767
|
+
return {
|
|
768
|
+
rubricCharCount: rubricText.length,
|
|
769
|
+
...(rubricName === undefined ? {} : { rubricName }),
|
|
770
|
+
snippet: truncateString(rubricText, 120),
|
|
771
|
+
};
|
|
772
|
+
},
|
|
773
|
+
capBytes: 256,
|
|
774
|
+
},
|
|
596
775
|
}),
|
|
597
776
|
graderJudgments: buildDescriptor({
|
|
598
777
|
type: "graderJudgments",
|
|
@@ -134,16 +134,18 @@ export declare function taskId(raw: string): Result<TaskId, IdValidationError>;
|
|
|
134
134
|
/**
|
|
135
135
|
* Parse a raw string into a `RunId`.
|
|
136
136
|
*
|
|
137
|
-
*
|
|
137
|
+
* Accepts the canonical `run_YYYYMMDDTHHMMSSZ_<8 base36>` shape (D0035)
|
|
138
|
+
* and the legacy `run_<32 hex>` UUIDv7 shape for back-compat with runs
|
|
139
|
+
* generated before the scheme change.
|
|
138
140
|
*/
|
|
139
141
|
export declare function runId(raw: string): Result<RunId, IdValidationError>;
|
|
140
142
|
/**
|
|
141
|
-
* Generate a new `RunId`
|
|
143
|
+
* Generate a new `RunId` with a human-readable UTC timestamp plus a
|
|
144
|
+
* random suffix (D0035). Shape: `run_YYYYMMDDTHHMMSSZ_<8 base36>`.
|
|
142
145
|
*
|
|
143
|
-
*
|
|
144
|
-
*
|
|
145
|
-
* `
|
|
146
|
-
* reads the resulting id from `AppContext.runId`.
|
|
146
|
+
* Lexicographic ordering matches creation time. One generator call per
|
|
147
|
+
* pipeline invocation; every step reads the resulting id from
|
|
148
|
+
* `AppContext.runId`.
|
|
147
149
|
*/
|
|
148
150
|
export declare function generateRunId(): RunId;
|
|
149
151
|
/**
|
|
@@ -44,41 +44,52 @@ export function taskId(raw) {
|
|
|
44
44
|
}
|
|
45
45
|
return ok(raw);
|
|
46
46
|
}
|
|
47
|
+
/**
|
|
48
|
+
* Canonical shape emitted by `generateRunId` (D0035).
|
|
49
|
+
* `run_YYYYMMDDTHHMMSSZ_<8 lowercase base36>` — sortable + readable.
|
|
50
|
+
*/
|
|
51
|
+
const RUN_ID_RE = /^run_\d{8}T\d{6}Z_[0-9a-z]{8}$/;
|
|
52
|
+
/**
|
|
53
|
+
* Legacy UUIDv7-hex shape. Accepted so existing runs in GCS and the
|
|
54
|
+
* Content Lake remain valid; never emitted by `generateRunId`.
|
|
55
|
+
*/
|
|
56
|
+
const LEGACY_RUN_ID_RE = /^run_[0-9a-f]{32}$/;
|
|
47
57
|
/**
|
|
48
58
|
* Parse a raw string into a `RunId`.
|
|
49
59
|
*
|
|
50
|
-
*
|
|
60
|
+
* Accepts the canonical `run_YYYYMMDDTHHMMSSZ_<8 base36>` shape (D0035)
|
|
61
|
+
* and the legacy `run_<32 hex>` UUIDv7 shape for back-compat with runs
|
|
62
|
+
* generated before the scheme change.
|
|
51
63
|
*/
|
|
52
64
|
export function runId(raw) {
|
|
53
|
-
if (
|
|
54
|
-
return
|
|
55
|
-
code: "INVALID_RUN_ID",
|
|
56
|
-
raw,
|
|
57
|
-
message: `Invalid RunId "${raw}": must match run_[a-zA-Z0-9]{8,}`,
|
|
58
|
-
});
|
|
65
|
+
if (RUN_ID_RE.test(raw) || LEGACY_RUN_ID_RE.test(raw)) {
|
|
66
|
+
return ok(raw);
|
|
59
67
|
}
|
|
60
|
-
return
|
|
68
|
+
return err({
|
|
69
|
+
code: "INVALID_RUN_ID",
|
|
70
|
+
raw,
|
|
71
|
+
message: `Invalid RunId "${raw}": must match run_YYYYMMDDTHHMMSSZ_<8 base36> or legacy run_<32 hex>`,
|
|
72
|
+
});
|
|
61
73
|
}
|
|
62
74
|
/**
|
|
63
|
-
* Generate a new `RunId`
|
|
75
|
+
* Generate a new `RunId` with a human-readable UTC timestamp plus a
|
|
76
|
+
* random suffix (D0035). Shape: `run_YYYYMMDDTHHMMSSZ_<8 base36>`.
|
|
64
77
|
*
|
|
65
|
-
*
|
|
66
|
-
*
|
|
67
|
-
* `
|
|
68
|
-
* reads the resulting id from `AppContext.runId`.
|
|
78
|
+
* Lexicographic ordering matches creation time. One generator call per
|
|
79
|
+
* pipeline invocation; every step reads the resulting id from
|
|
80
|
+
* `AppContext.runId`.
|
|
69
81
|
*/
|
|
70
82
|
export function generateRunId() {
|
|
71
|
-
const
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
const
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
return `run_${v7}`;
|
|
83
|
+
const ts = new Date()
|
|
84
|
+
.toISOString()
|
|
85
|
+
.replace(/[-:]/g, "")
|
|
86
|
+
.replace(/\.\d{3}Z$/, "Z");
|
|
87
|
+
const bytes = crypto.getRandomValues(new Uint8Array(8));
|
|
88
|
+
let suffix = "";
|
|
89
|
+
for (const b of bytes) {
|
|
90
|
+
suffix += (b % 36).toString(36);
|
|
91
|
+
}
|
|
92
|
+
return `run_${ts}_${suffix}`;
|
|
82
93
|
}
|
|
83
94
|
/**
|
|
84
95
|
* Parse a raw string into a `SuiteId`.
|
|
@@ -322,11 +322,18 @@ export interface StoredTestResult {
|
|
|
322
322
|
responseOutputTruncated?: boolean;
|
|
323
323
|
/** Task description (e.g. "Functions - Webhook handler (gold)") */
|
|
324
324
|
taskId: string;
|
|
325
|
-
/**
|
|
325
|
+
/**
|
|
326
|
+
* Token usage breakdown. All fields are optional because Promptfoo's
|
|
327
|
+
* row-level shape varies — cached results typically carry only
|
|
328
|
+
* `{ cached, total }` while fresh results carry `{ prompt, completion,
|
|
329
|
+
* total }`. Populated from `response.tokenUsage` on the raw result; see
|
|
330
|
+
* `extractStoredTestResults` in packages/eval/src/pipeline/calculate-scores.ts.
|
|
331
|
+
*/
|
|
326
332
|
tokenUsage?: {
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
333
|
+
cached?: number;
|
|
334
|
+
completion?: number;
|
|
335
|
+
prompt?: number;
|
|
336
|
+
total?: number;
|
|
330
337
|
};
|
|
331
338
|
/** "gold" (with docs) or "baseline" (without docs) */
|
|
332
339
|
variant: "baseline" | "gold";
|
|
@@ -28,6 +28,7 @@
|
|
|
28
28
|
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
|
|
29
29
|
*/
|
|
30
30
|
import { ARTIFACT_REGISTRY, NotImplementedError, } from "../_vendor/ailf-core/index.js";
|
|
31
|
+
import { prepareUploadBody } from "./prepare-upload-body.js";
|
|
31
32
|
import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
|
|
32
33
|
export class ApiGatewayArtifactWriter {
|
|
33
34
|
options;
|
|
@@ -143,8 +144,9 @@ export class ApiGatewayArtifactWriter {
|
|
|
143
144
|
};
|
|
144
145
|
}
|
|
145
146
|
async putJsonRaw(uploadUrlPath, data, type) {
|
|
146
|
-
|
|
147
|
-
|
|
147
|
+
// W0059 — redact + serialize through the shared helper so the gateway
|
|
148
|
+
// path strips secrets before they reach the signed GCS URL.
|
|
149
|
+
const { body, bytes } = prepareUploadBody(data, "application/json");
|
|
148
150
|
try {
|
|
149
151
|
const signed = await this.fetchSignedUrl(uploadUrlPath, type);
|
|
150
152
|
if (!signed)
|
|
@@ -153,7 +155,7 @@ export class ApiGatewayArtifactWriter {
|
|
|
153
155
|
let putSuccess = false;
|
|
154
156
|
try {
|
|
155
157
|
const putRes = await fetch(signed.url, {
|
|
156
|
-
body
|
|
158
|
+
body,
|
|
157
159
|
headers: signed.requiredHeaders,
|
|
158
160
|
method: "PUT",
|
|
159
161
|
});
|
|
@@ -26,6 +26,7 @@
|
|
|
26
26
|
* credentials are present.
|
|
27
27
|
*/
|
|
28
28
|
import { ARTIFACT_REGISTRY, BULK_ENTRY_KEY, NotImplementedError, } from "../_vendor/ailf-core/index.js";
|
|
29
|
+
import { prepareUploadBody } from "./prepare-upload-body.js";
|
|
29
30
|
import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
|
|
30
31
|
/**
|
|
31
32
|
* How many entries to bundle into a single `/batch/upload-urls` request.
|
|
@@ -307,16 +308,18 @@ export class BatchingApiGatewayArtifactWriter {
|
|
|
307
308
|
const start = Date.now();
|
|
308
309
|
let ok = false;
|
|
309
310
|
// Tracked outside the try so the `finally` metrics event still gets a
|
|
310
|
-
// bytes figure when
|
|
311
|
+
// bytes figure when serialization itself throws (circular payload,
|
|
311
312
|
// bigint, etc.) — P5 requires we never hang the producer on a
|
|
312
313
|
// pathological payload.
|
|
313
314
|
let bytes = 0;
|
|
314
315
|
try {
|
|
315
|
-
|
|
316
|
-
|
|
316
|
+
// W0059 — redact + serialize through the shared helper so the
|
|
317
|
+
// batching path strips secrets before they reach the signed GCS URL.
|
|
318
|
+
const prepared = prepareUploadBody(pending.payload, "application/json");
|
|
319
|
+
bytes = prepared.bytes;
|
|
317
320
|
const res = await fetch(signed.url, {
|
|
318
321
|
method: "PUT",
|
|
319
|
-
body:
|
|
322
|
+
body: prepared.body,
|
|
320
323
|
headers: signed.requiredHeaders,
|
|
321
324
|
});
|
|
322
325
|
if (!res.ok) {
|
|
@@ -414,14 +417,15 @@ export class BatchingApiGatewayArtifactWriter {
|
|
|
414
417
|
}
|
|
415
418
|
if (!signed)
|
|
416
419
|
return null;
|
|
417
|
-
|
|
418
|
-
|
|
420
|
+
// W0059 — manifest upload also flows through the shared helper so the
|
|
421
|
+
// redaction policy covers every PUT the batching writer performs.
|
|
422
|
+
const { body, bytes } = prepareUploadBody(payload, "application/json");
|
|
419
423
|
const putStart = Date.now();
|
|
420
424
|
let putOk = false;
|
|
421
425
|
try {
|
|
422
426
|
const res = await fetch(signed.url, {
|
|
423
427
|
method: "PUT",
|
|
424
|
-
body
|
|
428
|
+
body,
|
|
425
429
|
headers: signed.requiredHeaders,
|
|
426
430
|
});
|
|
427
431
|
if (!res.ok) {
|
|
@@ -2,15 +2,17 @@
|
|
|
2
2
|
* FanoutArtifactWriter — layers multiple writers so each `emit()` fans out
|
|
3
3
|
* to every configured backend.
|
|
4
4
|
*
|
|
5
|
-
* D0033 M4
|
|
6
|
-
* `FanoutArtifactWriter([
|
|
5
|
+
* Default wiring (W0064 reorder of D0033 M4):
|
|
6
|
+
* `FanoutArtifactWriter([ GcsArtifactWriter, LocalFilesystemArtifactWriter ])`
|
|
7
7
|
*
|
|
8
8
|
* Semantics:
|
|
9
9
|
* - Fan out in declaration order. Every writer runs, even if earlier ones fail.
|
|
10
|
-
* - Return the **first non-null ArtifactRef**.
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
* local
|
|
10
|
+
* - Return the **first non-null ArtifactRef**. The remote backend is listed
|
|
11
|
+
* first so its ref wins when both succeed — the published manifest points
|
|
12
|
+
* at a cross-machine-readable store. A remote failure falls through to
|
|
13
|
+
* local, so a local success + remote failure still produces a non-null ref
|
|
14
|
+
* pointing at local and the pipeline succeeds with a warning on the remote
|
|
15
|
+
* leg.
|
|
14
16
|
* - Failures on individual writers warn (via their own P5 paths) but do
|
|
15
17
|
* not propagate. The fanout never throws.
|
|
16
18
|
*
|
|
@@ -19,6 +21,7 @@
|
|
|
19
21
|
* plus a recording test double.
|
|
20
22
|
*
|
|
21
23
|
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M4)
|
|
24
|
+
* @see docs/design-docs/canonical-artifact-ref-selection.md — W0064 ordering
|
|
22
25
|
*/
|
|
23
26
|
import type { ArtifactEntry, ArtifactRef, ArtifactType, ArtifactWriter, ArtifactWriterProgressOptions, AssociationValues, RunId, RunManifest } from "../_vendor/ailf-core/index.d.ts";
|
|
24
27
|
export interface FanoutArtifactWriterOptions {
|
|
@@ -2,15 +2,17 @@
|
|
|
2
2
|
* FanoutArtifactWriter — layers multiple writers so each `emit()` fans out
|
|
3
3
|
* to every configured backend.
|
|
4
4
|
*
|
|
5
|
-
* D0033 M4
|
|
6
|
-
* `FanoutArtifactWriter([
|
|
5
|
+
* Default wiring (W0064 reorder of D0033 M4):
|
|
6
|
+
* `FanoutArtifactWriter([ GcsArtifactWriter, LocalFilesystemArtifactWriter ])`
|
|
7
7
|
*
|
|
8
8
|
* Semantics:
|
|
9
9
|
* - Fan out in declaration order. Every writer runs, even if earlier ones fail.
|
|
10
|
-
* - Return the **first non-null ArtifactRef**.
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
* local
|
|
10
|
+
* - Return the **first non-null ArtifactRef**. The remote backend is listed
|
|
11
|
+
* first so its ref wins when both succeed — the published manifest points
|
|
12
|
+
* at a cross-machine-readable store. A remote failure falls through to
|
|
13
|
+
* local, so a local success + remote failure still produces a non-null ref
|
|
14
|
+
* pointing at local and the pipeline succeeds with a warning on the remote
|
|
15
|
+
* leg.
|
|
14
16
|
* - Failures on individual writers warn (via their own P5 paths) but do
|
|
15
17
|
* not propagate. The fanout never throws.
|
|
16
18
|
*
|
|
@@ -19,6 +21,7 @@
|
|
|
19
21
|
* plus a recording test double.
|
|
20
22
|
*
|
|
21
23
|
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M4)
|
|
24
|
+
* @see docs/design-docs/canonical-artifact-ref-selection.md — W0064 ordering
|
|
22
25
|
*/
|
|
23
26
|
export class FanoutArtifactWriter {
|
|
24
27
|
writers;
|
|
@@ -30,6 +30,7 @@
|
|
|
30
30
|
import { Storage } from "@google-cloud/storage";
|
|
31
31
|
import { ARTIFACT_REGISTRY, buildManifestPreview, } from "../_vendor/ailf-core/index.js";
|
|
32
32
|
import { resolveUploadConcurrency } from "./parallel-emit.js";
|
|
33
|
+
import { prepareUploadBody } from "./prepare-upload-body.js";
|
|
33
34
|
import { redactArtifactData } from "./redact-artifact.js";
|
|
34
35
|
import { NO_OP_UPLOAD_METRICS, } from "./upload-metrics.js";
|
|
35
36
|
/**
|
|
@@ -83,18 +84,18 @@ export class GcsArtifactWriter {
|
|
|
83
84
|
console.warn(` ⚠️ emit("${type}"): association.run is required, skipping`);
|
|
84
85
|
return null;
|
|
85
86
|
}
|
|
86
|
-
//
|
|
87
|
-
|
|
88
|
-
//
|
|
89
|
-
//
|
|
90
|
-
|
|
87
|
+
// W0059 — redact + serialize through the shared helper so the three
|
|
88
|
+
// writer implementations can't drift. Preview reads the pre-redaction
|
|
89
|
+
// payload (same as local writer — the preview carries a descriptor-
|
|
90
|
+
// controlled summary bounded by capBytes, not the raw entry bytes).
|
|
91
|
+
const { body } = prepareUploadBody(payload, descriptor.mime);
|
|
91
92
|
const preview = buildManifestPreview(descriptor, payload);
|
|
92
93
|
if (descriptor.layout === "bulk") {
|
|
93
94
|
const path = descriptor.objectPath(runId);
|
|
94
|
-
const ref = await this.putBody(path,
|
|
95
|
+
const ref = await this.putBody(path, body, {
|
|
95
96
|
layout: "bulk",
|
|
96
97
|
mime: descriptor.mime,
|
|
97
|
-
entryCount: entryCountOf(
|
|
98
|
+
entryCount: entryCountOf(payload),
|
|
98
99
|
type,
|
|
99
100
|
});
|
|
100
101
|
if (!ref)
|
|
@@ -106,7 +107,11 @@ export class GcsArtifactWriter {
|
|
|
106
107
|
// per-entry
|
|
107
108
|
const entryKey = descriptor.formatEntryKey(association);
|
|
108
109
|
const path = descriptor.objectPath(runId, entryKey);
|
|
109
|
-
const ref = await this.putBody(path,
|
|
110
|
+
const ref = await this.putBody(path, body, {
|
|
111
|
+
layout: "per-entry",
|
|
112
|
+
mime: descriptor.mime,
|
|
113
|
+
type,
|
|
114
|
+
});
|
|
110
115
|
if (!ref)
|
|
111
116
|
return null;
|
|
112
117
|
const finalRef = {
|
|
@@ -222,11 +227,11 @@ export class GcsArtifactWriter {
|
|
|
222
227
|
return null;
|
|
223
228
|
}
|
|
224
229
|
const path = descriptor.objectPath(runId);
|
|
225
|
-
const
|
|
226
|
-
return this.putBody(path,
|
|
230
|
+
const { body } = prepareUploadBody(data, descriptor.mime);
|
|
231
|
+
return this.putBody(path, body, {
|
|
227
232
|
layout: "bulk",
|
|
228
233
|
mime: descriptor.mime,
|
|
229
|
-
entryCount: entryCountOf(
|
|
234
|
+
entryCount: entryCountOf(data),
|
|
230
235
|
type,
|
|
231
236
|
});
|
|
232
237
|
}
|
|
@@ -251,9 +256,7 @@ export class GcsArtifactWriter {
|
|
|
251
256
|
continue;
|
|
252
257
|
}
|
|
253
258
|
const path = descriptor.objectPath(runId, entry.key);
|
|
254
|
-
const
|
|
255
|
-
const body = serializeForMime(redacted, descriptor.mime);
|
|
256
|
-
const bytes = Buffer.byteLength(body, "utf-8");
|
|
259
|
+
const { body, bytes } = prepareUploadBody(entry.data, descriptor.mime);
|
|
257
260
|
const start = Date.now();
|
|
258
261
|
let success = false;
|
|
259
262
|
try {
|
|
@@ -416,14 +419,6 @@ class ConcurrencyLimiter {
|
|
|
416
419
|
}
|
|
417
420
|
}
|
|
418
421
|
}
|
|
419
|
-
function serializeForMime(payload, mime) {
|
|
420
|
-
if (mime === "text/markdown" || mime === "application/yaml") {
|
|
421
|
-
if (typeof payload === "string")
|
|
422
|
-
return payload;
|
|
423
|
-
return String(payload ?? "");
|
|
424
|
-
}
|
|
425
|
-
return JSON.stringify(payload);
|
|
426
|
-
}
|
|
427
422
|
function entryCountOf(data) {
|
|
428
423
|
if (typeof data === "object" &&
|
|
429
424
|
data !== null &&
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
|
|
3
|
+
*
|
|
4
|
+
* All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
|
|
5
|
+
* API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
|
|
6
|
+
* secrets are stripped before leaving the process. Routing each writer through
|
|
7
|
+
* this helper prevents drift — any future writer that skips the helper would
|
|
8
|
+
* fail the contract test in
|
|
9
|
+
* `src/__tests__/artifact-upload-redaction.test.ts`.
|
|
10
|
+
*
|
|
11
|
+
* NDJSON streaming is **not** handled here — each row is redacted independently
|
|
12
|
+
* by the NDJSON writer path before being concatenated into a part body.
|
|
13
|
+
*/
|
|
14
|
+
import type { ArtifactMime } from "../_vendor/ailf-core/index.d.ts";
|
|
15
|
+
export interface PreparedUploadBody {
|
|
16
|
+
readonly body: string;
|
|
17
|
+
readonly bytes: number;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Redact, serialize, and size `payload` for upload.
|
|
21
|
+
*
|
|
22
|
+
* Serialization branches on `mime`:
|
|
23
|
+
* - `application/json` (and anything else JSON-shaped, including the
|
|
24
|
+
* single-shot side of `application/x-ndjson`) → `JSON.stringify`.
|
|
25
|
+
* - `text/markdown` / `application/yaml` → coerce to string via `String()`.
|
|
26
|
+
*/
|
|
27
|
+
export declare function prepareUploadBody(payload: unknown, mime: ArtifactMime): PreparedUploadBody;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared preamble for uploading an artifact payload from an `ArtifactWriter`.
|
|
3
|
+
*
|
|
4
|
+
* All three `ArtifactWriter` implementations (direct GCS, API Gateway, batching
|
|
5
|
+
* API Gateway) must apply the same `redact → serialize → bytecount` pipeline so
|
|
6
|
+
* secrets are stripped before leaving the process. Routing each writer through
|
|
7
|
+
* this helper prevents drift — any future writer that skips the helper would
|
|
8
|
+
* fail the contract test in
|
|
9
|
+
* `src/__tests__/artifact-upload-redaction.test.ts`.
|
|
10
|
+
*
|
|
11
|
+
* NDJSON streaming is **not** handled here — each row is redacted independently
|
|
12
|
+
* by the NDJSON writer path before being concatenated into a part body.
|
|
13
|
+
*/
|
|
14
|
+
import { redactArtifactData } from "./redact-artifact.js";
|
|
15
|
+
/**
 * Prepare an artifact payload for upload: redact it, serialize it for the
 * given MIME type, and measure the UTF-8 size of the serialized text.
 *
 * Serializer selection by `mime`:
 * - `text/markdown` / `application/yaml` → string pass-through, with
 *   non-string values coerced via `String()`.
 * - everything else (`application/json`, and the single-shot side of
 *   `application/x-ndjson`) → `JSON.stringify`.
 *
 * @param {unknown} payload - Raw artifact data; redacted before serialization.
 * @param {string} mime - Artifact MIME type that selects the serializer.
 * @returns {{ body: string, bytes: number }} The serialized body and its
 *   UTF-8 byte length.
 */
export function prepareUploadBody(payload, mime) {
    // Redaction must run first so nothing unredacted is ever serialized.
    const body = serializeForMime(redactArtifactData(payload), mime);
    return { body, bytes: Buffer.byteLength(body, "utf-8") };
}
|
|
29
|
+
/**
 * Serialize an (already redacted) payload into the text body for `mime`.
 *
 * - `text/markdown` / `application/yaml`: strings pass through unchanged;
 *   `null`/`undefined` become the empty string; any other value is coerced
 *   with `String()`.
 * - Any other MIME type: `JSON.stringify`.
 *
 * Always returns a string, so callers can size the result with
 * `Buffer.byteLength` without guarding against `undefined`.
 *
 * @param {unknown} payload - Value to serialize.
 * @param {string} mime - Artifact MIME type that selects the serializer.
 * @returns {string} The serialized body.
 */
function serializeForMime(payload, mime) {
    if (mime === "text/markdown" || mime === "application/yaml") {
        return typeof payload === "string" ? payload : String(payload ?? "");
    }
    // JSON.stringify yields `undefined` (not a string) for a top-level
    // undefined, function, or symbol. Fall back to the JSON literal `null` —
    // the same value JSON.stringify substitutes for such entries inside
    // arrays — so the return type stays `string`.
    return JSON.stringify(payload) ?? "null";
}
|
package/dist/composition-root.js
CHANGED
|
@@ -188,14 +188,19 @@ export function createArtifactWriter(config, logger, progress) {
|
|
|
188
188
|
exclude,
|
|
189
189
|
...(remote ? {} : { progress }),
|
|
190
190
|
});
|
|
191
|
+
// W0064 — when a remote backend is wired, list it first so its ArtifactRef
|
|
192
|
+
// wins the fanout's firstNonNull() selection and the published manifest
|
|
193
|
+
// points at a cross-machine-readable store. Local stays attached as the
|
|
194
|
+
// resilience tier: if the remote leg fails, firstNonNull falls through to
|
|
195
|
+
// local and the pipeline still produces a non-null ref.
|
|
191
196
|
const base = remote
|
|
192
|
-
? new FanoutArtifactWriter([
|
|
197
|
+
? new FanoutArtifactWriter([remote, local], { progress })
|
|
193
198
|
: local;
|
|
194
199
|
if (!remote) {
|
|
195
200
|
logger.debug(`Artifact writer: LocalFilesystemArtifactWriter only (rootDir=${rootDir})`);
|
|
196
201
|
}
|
|
197
202
|
else {
|
|
198
|
-
logger.debug(`Artifact writer: FanoutArtifactWriter([
|
|
203
|
+
logger.debug(`Artifact writer: FanoutArtifactWriter([${remote.constructor.name}, local=${rootDir}])`);
|
|
199
204
|
}
|
|
200
205
|
// Wrap in the accumulator so FinalizeRunStep can build a populated
|
|
201
206
|
// RunManifest without each producer bookkeeping its own ArtifactRefs
|
|
@@ -130,6 +130,12 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
130
130
|
const pipelineStart = Date.now();
|
|
131
131
|
const hasJob = !!ctx.config.jobId;
|
|
132
132
|
const jobUpdates = [];
|
|
133
|
+
// DOC-2064 — tracks whether the pre-finalize pipelineContext emit fired so
|
|
134
|
+
// the post-loop fallback can skip redundant writes. A second emit to the
|
|
135
|
+
// same GCS path produces a 412 Precondition Failed from the signed-URL
|
|
136
|
+
// writer (which enforces no-overwrite), logging spurious warnings on every
|
|
137
|
+
// successful run.
|
|
138
|
+
let pipelineContextEmitted = false;
|
|
133
139
|
ctx.logger.section("ai-literacy-framework — Evaluation Pipeline");
|
|
134
140
|
ctx.logger.debug(`Pipeline starting with ${steps.length} steps`, {
|
|
135
141
|
steps: steps.map((s) => s.name),
|
|
@@ -152,6 +158,16 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
152
158
|
ctx.logger.debug(`Starting step ${i + 1}/${steps.length}: ${step.name}`);
|
|
153
159
|
ctx.logger.section(step.name);
|
|
154
160
|
exportPhase.maybeOpen(step.name);
|
|
161
|
+
// DOC-2064 — emit pipelineContext BEFORE finalize-run so the artifact
|
|
162
|
+
// ref registers with the accumulator and lands in RunManifest.artifacts,
|
|
163
|
+
// which PublishReportStep then snapshots into Report.artifactManifest.
|
|
164
|
+
// The previous post-loop emit ran after publish and was invisible to
|
|
165
|
+
// Content Lake readers. The failure-path capture below still fires on
|
|
166
|
+
// pre-finalize aborts so aborted runs retain the on-disk artifact.
|
|
167
|
+
if (step.name === "finalize-run") {
|
|
168
|
+
await capturePipelineContext(ctx, state, results);
|
|
169
|
+
pipelineContextEmitted = true;
|
|
170
|
+
}
|
|
155
171
|
// Report current step progress
|
|
156
172
|
if (hasJob) {
|
|
157
173
|
await reportJobProgress(ctx, step.name, i, steps.length, "running", undefined, jobUpdates);
|
|
@@ -175,8 +191,12 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
175
191
|
}
|
|
176
192
|
// Capture pipeline context before exiting. `job-updates` was an
|
|
177
193
|
// observability-only capture not tied to a registered artifact type;
|
|
178
|
-
// dropped in W0050. Use the JobStore path for job telemetry.
|
|
179
|
-
|
|
194
|
+
// dropped in W0050. Use the JobStore path for job telemetry. Skip
|
|
195
|
+
// when the pre-finalize emit already fired to avoid a 412 overwrite
|
|
196
|
+
// warning (DOC-2064).
|
|
197
|
+
if (!pipelineContextEmitted) {
|
|
198
|
+
await capturePipelineContext(ctx, state, results);
|
|
199
|
+
}
|
|
180
200
|
exportPhase.close();
|
|
181
201
|
return {
|
|
182
202
|
belowCritical: state.belowCritical,
|
|
@@ -231,9 +251,18 @@ export async function orchestratePipeline(ctx, steps) {
|
|
|
231
251
|
ctx.logger.warn("Failed to report job completion — continuing");
|
|
232
252
|
}
|
|
233
253
|
}
|
|
234
|
-
//
|
|
235
|
-
//
|
|
236
|
-
|
|
254
|
+
// DOC-2064 — post-loop fallback. Only fires when the pre-finalize emit
|
|
255
|
+
// inside the step loop didn't run — typically because the pipeline has no
|
|
256
|
+
// finalize-run step (test harnesses, air-gapped runs). Skipping this when
|
|
257
|
+
// the pre-finalize emit already fired avoids a 412 Precondition Failed
|
|
258
|
+
// from the signed-URL writer, which refuses to overwrite the existing
|
|
259
|
+
// path. The tradeoff is that pipelineContext captures pipeline state as
|
|
260
|
+
// of finalize-run, not post-publish — reportId is absent. Acceptable
|
|
261
|
+
// because runId is the primary join key and reportId is trivially
|
|
262
|
+
// looked up from Content Lake via runId.
|
|
263
|
+
if (!pipelineContextEmitted) {
|
|
264
|
+
await capturePipelineContext(ctx, state, results);
|
|
265
|
+
}
|
|
237
266
|
exportPhase.close();
|
|
238
267
|
return {
|
|
239
268
|
belowCritical: state.belowCritical,
|
|
@@ -223,6 +223,7 @@ export function extractStoredTestResults(resultsPath) {
|
|
|
223
223
|
}
|
|
224
224
|
dimensions.push({ dimension, reason, score });
|
|
225
225
|
}
|
|
226
|
+
const tokenUsage = result.response?.tokenUsage;
|
|
226
227
|
testResults.push({
|
|
227
228
|
area,
|
|
228
229
|
cost: result.cost || undefined,
|
|
@@ -233,6 +234,7 @@ export function extractStoredTestResults(resultsPath) {
|
|
|
233
234
|
responseOutput,
|
|
234
235
|
...(responseOutputTruncated && { responseOutputTruncated: true }),
|
|
235
236
|
taskId,
|
|
237
|
+
...(tokenUsage && { tokenUsage }),
|
|
236
238
|
variant,
|
|
237
239
|
});
|
|
238
240
|
}
|