@sanity/ailf 2.8.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +124 -23
  4. package/dist/_vendor/ailf-core/artifact-registry.js +724 -63
  5. package/dist/_vendor/ailf-core/index.d.ts +2 -1
  6. package/dist/_vendor/ailf-core/index.js +2 -1
  7. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +59 -20
  8. package/dist/_vendor/ailf-core/ports/artifact-writer.js +33 -10
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +21 -2
  10. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
  11. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  12. package/dist/_vendor/ailf-core/services/index.js +1 -0
  13. package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
  14. package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
  15. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +33 -0
  16. package/dist/_vendor/ailf-core/types/index.d.ts +202 -23
  17. package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
  18. package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
  19. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +17 -4
  20. package/dist/artifact-capture/api-gateway-artifact-writer.js +58 -7
  21. package/dist/artifact-capture/emit-file.d.ts +28 -0
  22. package/dist/artifact-capture/emit-file.js +56 -0
  23. package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
  24. package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
  25. package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
  26. package/dist/artifact-capture/filesystem-collector.js +48 -23
  27. package/dist/artifact-capture/gcs-artifact-writer.d.ts +40 -3
  28. package/dist/artifact-capture/gcs-artifact-writer.js +238 -14
  29. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
  30. package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
  31. package/dist/commands/explain-handler.js +4 -0
  32. package/dist/commands/pipeline-action.d.ts +5 -0
  33. package/dist/commands/pipeline-action.js +56 -5
  34. package/dist/commands/pipeline.d.ts +4 -0
  35. package/dist/commands/pipeline.js +6 -2
  36. package/dist/commands/publish.js +4 -1
  37. package/dist/composition-root.d.ts +13 -10
  38. package/dist/composition-root.js +74 -20
  39. package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
  40. package/dist/orchestration/pipeline-orchestrator.js +41 -30
  41. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
  42. package/dist/orchestration/steps/calculate-scores-step.js +19 -19
  43. package/dist/orchestration/steps/callback-step.d.ts +1 -1
  44. package/dist/orchestration/steps/callback-step.js +6 -4
  45. package/dist/orchestration/steps/compare-step.d.ts +1 -1
  46. package/dist/orchestration/steps/compare-step.js +4 -2
  47. package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
  48. package/dist/orchestration/steps/discovery-report-step.js +4 -1
  49. package/dist/orchestration/steps/fetch-docs-step.js +9 -15
  50. package/dist/orchestration/steps/finalize-run-step.js +21 -7
  51. package/dist/orchestration/steps/gap-analysis-step.js +34 -6
  52. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
  53. package/dist/orchestration/steps/generate-configs-step.js +11 -11
  54. package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
  55. package/dist/orchestration/steps/publish-report-step.js +24 -19
  56. package/dist/orchestration/steps/readiness-step.d.ts +1 -1
  57. package/dist/orchestration/steps/readiness-step.js +4 -1
  58. package/dist/orchestration/steps/report-step.d.ts +1 -1
  59. package/dist/orchestration/steps/report-step.js +6 -3
  60. package/dist/orchestration/steps/run-eval-step.js +14 -9
  61. package/dist/pipeline/compare.d.ts +2 -2
  62. package/dist/pipeline/emit-eval-results.d.ts +38 -0
  63. package/dist/pipeline/emit-eval-results.js +100 -0
  64. package/package.json +1 -1
@@ -1,31 +1,62 @@
1
1
  /**
2
2
  * Artifact registry — single source of truth for AILF's external artifact types.
3
3
  *
4
- * Every artifact that lives in GCS declares itself here exactly once:
5
- * layout, path builder, entry schema, and (for per-entry layouts) key parser.
4
+ * Every artifact that lives in GCS (or on the local filesystem after W0050)
5
+ * declares itself here exactly once: association axes, layout, path builder,
6
+ * entry schema, mime, cap, and (for per-entry layouts) format/parse helpers.
6
7
  * Eval writers, the API Gateway's signing endpoint, and the Studio hook all
7
8
  * consume this same record.
8
9
  *
9
10
  * Adding a new artifact type = one entry here. No call-site changes needed in
10
11
  * the generic writer / signer / hook — they all iterate the registry.
11
12
  *
13
+ * ## Association axes (D0033 / W0049)
14
+ *
15
+ * Each descriptor declares the pipeline dimensions it is evidence about. At
16
+ * module load a structural invariant rejects descriptors that declare an
17
+ * unbounded axis (`task`, `model`, `trial`) but a `"bulk"` layout — such a
18
+ * shape would serialize as a single JSON array that blows past the object-
19
+ * size cap at scale. The invariant converts that class of mistake into a
20
+ * process-won't-start error rather than a silent data bug.
21
+ *
12
22
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
13
- * @see docs/design-docs/run-artifact-store.md (§ Move 4 — Artifact Registry)
23
+ * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
24
+ * @see docs/design-docs/unified-run-artifacts.md (§ M1, § M5)
14
25
  */
15
26
  import { z } from "zod";
16
27
  // ---------------------------------------------------------------------------
17
28
  // Path + key helpers
18
29
  // ---------------------------------------------------------------------------
19
- function bulkPath(slug) {
20
- return (runId) => `runs/${runId}/${slug}.json`;
30
+ /**
31
+ * File extension per MIME. Kept in sync with the `ArtifactMime` union at the
32
+ * type level — adding a new mime requires adding a case here, and the L1
33
+ * contract test in slice 4 verifies every descriptor's path ends with the
34
+ * correct extension for its mime.
35
+ */
36
+ function mimeExtension(mime) {
37
+ switch (mime) {
38
+ case "application/json":
39
+ return "json";
40
+ case "application/x-ndjson":
41
+ return "ndjson";
42
+ case "text/markdown":
43
+ return "md";
44
+ case "application/yaml":
45
+ return "yaml";
46
+ }
21
47
  }
22
- function perEntryPath(slug) {
48
+ function bulkPathBuilder(slug, mime) {
49
+ const ext = mimeExtension(mime);
50
+ return (runId) => `runs/${runId}/${slug}.${ext}`;
51
+ }
52
+ function perEntryPathBuilder(slug, mime) {
53
+ const ext = mimeExtension(mime);
23
54
  return (runId, entryKey) => {
24
55
  if (entryKey === undefined) {
25
56
  throw new Error(`Artifact "${slug}" uses per-entry layout; an entry key is required`);
26
57
  }
27
58
  const sanitized = sanitizeEntryKey(entryKey);
28
- return `runs/${runId}/${slug}/${sanitized}.json`;
59
+ return `runs/${runId}/${slug}/${sanitized}.${ext}`;
29
60
  };
30
61
  }
31
62
  /**
@@ -34,41 +65,150 @@ function perEntryPath(slug) {
34
65
  *
35
66
  * - `::` → `--` so the wire separator doesn't show up in the filename.
36
67
  * - `/` → `_` so task names like "Content Lake with @sanity/client" don't
37
- * create unintended GCS subdirectories (`.../test-outputs/@sanity/client…`)
38
- * and so `ls` against the per-entry directory shows one row per entry.
68
+ * create unintended GCS subdirectories and so `ls` against the per-entry
69
+ * directory shows one row per entry.
39
70
  *
40
71
  * Single colons (`:`) are preserved — modelIds like
41
72
  * `anthropic:messages:claude-opus-4-6` are valid GCS object names.
42
73
  *
43
74
  * NOTE: this mapping is not bijective. A taskId containing literal `--`
44
75
  * combined with a modelId could in theory collide with one whose taskId
45
- * contains `::`, and `_` collides with `/`. In practice, production
46
- * taskIds don't exercise these combinations. If collision-safety becomes a
47
- * concern (e.g., user-provided free-form task names), switch to
48
- * percent-encoding or a hash-based scheme at the key boundary.
76
+ * contains `::`. In practice, production taskIds don't exercise these
77
+ * combinations.
49
78
  */
50
79
  function sanitizeEntryKey(key) {
51
80
  return key.replace(/::/g, "--").replace(/\//g, "_");
52
81
  }
53
82
  /**
54
- * Entry-key parser for artifacts keyed by `{taskId}::{modelId}` testOutputs
55
- * today, other per-entry types in future.
83
+ * Reject ASCII control characters and DEL in raw entry keys.
84
+ *
85
+ * Used at the top of every `parseEntryKey`. An authenticated caller can
86
+ * otherwise embed CRLF in a URL-path entryKey, reach `console.warn` via
87
+ * the non-blocking upload-failure path, and forge lines in shared log
88
+ * aggregators (W0049 review finding I4 / security-audit LOW).
89
+ *
90
+ * GCS also rejects most control characters in object names, so this
91
+ * catches the class at the boundary rather than waiting for a 400 from
92
+ * the signed-URL PUT.
93
+ */
94
+ // eslint-disable-next-line no-control-regex
95
+ const CONTROL_CHAR_PATTERN = /[\x00-\x1f\x7f]/;
96
+ function hasControlChars(raw) {
97
+ return CONTROL_CHAR_PATTERN.test(raw);
98
+ }
99
+ /**
100
+ * Order of axes in the canonical key representation. `run` is never included
101
+ * in the entry key because the run dimension lives in the path prefix
102
+ * (`runs/{runId}/…`).
103
+ */
104
+ const KEY_AXIS_ORDER = [
105
+ "mode",
106
+ "task",
107
+ "model",
108
+ "grader",
109
+ "trial",
110
+ "category",
111
+ ];
112
+ /**
113
+ * Build a filename-safe entry key from association values by concatenating
114
+ * the axis values in `KEY_AXIS_ORDER` with `--`. Axes not listed in the
115
+ * descriptor's `association.axes` are skipped.
56
116
  *
57
- * The separator is `::` (double colon). Either segment may contain single
58
- * colons: production model ids commonly look like
59
- * `anthropic:messages:claude-opus-4-6`. The constraint is that `::` must
60
- * appear exactly once and neither segment is empty, so the API Gateway can
61
- * return 400 on malformed input.
117
+ * For descriptors with bounded-only axes (only `run`) that are per-entry
118
+ * (e.g. `sinkResults`), `assoc.name` is used as the key.
119
+ */
120
+ function formatKeyFromAxes(axes) {
121
+ const keyAxes = KEY_AXIS_ORDER.filter((a) => axes.includes(a));
122
+ return (assoc) => {
123
+ if (keyAxes.length === 0) {
124
+ if (!assoc.name) {
125
+ throw new Error(`formatEntryKey: descriptor with axes [${axes.join(", ")}] requires assoc.name`);
126
+ }
127
+ return sanitizeEntryKey(assoc.name);
128
+ }
129
+ const parts = keyAxes.map((axis) => {
130
+ const raw = assoc[axis];
131
+ if (raw === undefined || raw === null || raw === "") {
132
+ throw new Error(`formatEntryKey: missing required axis "${axis}" in association`);
133
+ }
134
+ return String(raw);
135
+ });
136
+ return sanitizeEntryKey(parts.join("--"));
137
+ };
138
+ }
139
+ /**
140
+ * Strict parser that accepts only the canonical N-segment key matching the
141
+ * descriptor's axis count (minus `run`). Used by most per-entry descriptors.
62
142
  */
63
- function parseTaskModelKey(key) {
64
- const parts = key.split("::");
65
- if (parts.length !== 2 || !parts[0] || !parts[1]) {
143
+ function parseKeyByAxes(type, axes) {
144
+ const expected = KEY_AXIS_ORDER.filter((a) => axes.includes(a)).length;
145
+ return (raw) => {
146
+ if (hasControlChars(raw)) {
147
+ return {
148
+ ok: false,
149
+ reason: `Entry key for "${type}" must not contain control characters`,
150
+ };
151
+ }
152
+ const sanitized = sanitizeEntryKey(raw);
153
+ if (expected === 0) {
154
+ if (!sanitized) {
155
+ return {
156
+ ok: false,
157
+ reason: `Entry key for "${type}" must be a non-empty identifier`,
158
+ };
159
+ }
160
+ return { ok: true, sanitized: sanitized };
161
+ }
162
+ const parts = sanitized.split("--");
163
+ if (parts.length !== expected || parts.some((p) => !p)) {
164
+ return {
165
+ ok: false,
166
+ reason: `Entry key "${raw}" for "${type}" must match ${expected}-segment form (${KEY_AXIS_ORDER.filter((a) => axes.includes(a)).join("--")}) with non-empty segments`,
167
+ };
168
+ }
169
+ return { ok: true, sanitized: sanitized };
170
+ };
171
+ }
172
+ /**
173
+ * testOutputs parser. Accepts both the new 3-segment form
174
+ * (`mode--task--model`) and the legacy 2-segment form (`task--model`); the
175
+ * latter emits a one-time-per-process deprecation warning so noisy logs do
176
+ * not mask the signal.
177
+ *
178
+ * The separator is `--` (post-sanitization). The legacy wire format
179
+ * (`task::model`) still works because `sanitizeEntryKey` maps `::` → `--`
180
+ * before the split — producers that haven't migrated keep functioning.
181
+ *
182
+ * Deprecation scheduled for W0052.
183
+ */
184
+ let warnedLegacyTestOutputsKey = false;
185
+ function parseTestOutputsEntryKey(raw) {
186
+ if (hasControlChars(raw)) {
66
187
  return {
67
188
  ok: false,
68
- reason: `Entry key "${key}" must match {taskId}::{modelId} with exactly one "::" separator and non-empty segments`,
189
+ reason: `Entry key for "testOutputs" must not contain control characters`,
69
190
  };
70
191
  }
71
- return { ok: true, sanitized: sanitizeEntryKey(key) };
192
+ const sanitized = sanitizeEntryKey(raw);
193
+ const parts = sanitized.split("--");
194
+ if (parts.length === 3 && parts.every((p) => p.length > 0)) {
195
+ return { ok: true, sanitized: sanitized };
196
+ }
197
+ if (parts.length === 2 && parts.every((p) => p.length > 0)) {
198
+ if (!warnedLegacyTestOutputsKey) {
199
+ warnedLegacyTestOutputsKey = true;
200
+ console.warn("legacy testOutputs entry key (2-segment) is deprecated");
201
+ }
202
+ return { ok: true, sanitized: sanitized };
203
+ }
204
+ return {
205
+ ok: false,
206
+ reason: `Entry key "${raw}" for "testOutputs" must match {mode}--{task}--{model} (3-segment) or legacy {task}--{model} (2-segment) with non-empty segments`,
207
+ };
208
+ }
209
+ /** Test-only reset for the legacy-key warning flag. Not exported publicly. */
210
+ export function __resetLegacyTestOutputsWarning() {
211
+ warnedLegacyTestOutputsKey = false;
72
212
  }
73
213
  // ---------------------------------------------------------------------------
74
214
  // Entry schemas
@@ -77,67 +217,436 @@ const testOutputEntrySchema = z.object({
77
217
  responseOutput: z.string(),
78
218
  responseOutputTruncated: z.boolean(),
79
219
  });
80
- // Aspirational: renderedPrompts / rawResults / traces / etc. currently have
81
- // loose shapes. Tighten per-type as consumers stabilize.
220
+ /**
221
+ * Preview shape for `testOutputs` manifest entries (W0051 / D0033 M7). A
222
+ * Studio list view can render the truncated response + truncation flag
223
+ * without a signed-URL round trip.
224
+ */
225
+ const testOutputPreviewSchema = z.object({
226
+ responsePreview: z.string().max(280),
227
+ truncated: z.boolean(),
228
+ });
229
+ /**
230
+ * Preview shape for `failureModes` per-category manifest entries (W0051
231
+ * Slice 2 / D0033 M7). `failureModes` is keyed by `{mode, category}` after
232
+ * Slice 2 — one entry per classified FailureModeType inside each mode.
233
+ *
234
+ * Severity is derived at extract time from the entry's `count` using the
235
+ * bucketing in `severityForCount`. The bucket thresholds are a first-pass
236
+ * heuristic; callers can override by supplying an explicit `severity` on
237
+ * the entry payload if more nuanced signals become available.
238
+ */
239
+ const failureModePreviewSchema = z.object({
240
+ category: z.string(),
241
+ severity: z.enum(["low", "medium", "high", "critical"]),
242
+ titlePreview: z.string().max(120),
243
+ });
244
+ /**
245
+ * Preview shape for `graderJudgments` manifest entries (W0051 / D0033 M7).
246
+ * List views render score + a short reason excerpt; drill-down hydrates
247
+ * the full `{ reason, pass, dimensionScores }` from the external artifact.
248
+ *
249
+ * `score` is kept as `number` rather than `int(0..100)` so legacy judgments
250
+ * with fractional scores (pre-W0048 rubrics produced 0–1 values) don't
251
+ * reject at read time; the score-normalization step is upstream of emit.
252
+ */
253
+ const graderJudgmentPreviewSchema = z.object({
254
+ score: z.number(),
255
+ reasonPreview: z.string().max(280),
256
+ dimensionScores: z.record(z.string(), z.number()).optional(),
257
+ });
258
+ // Aspirational: most payload shapes are still loose. Tightening per-type as
259
+ // consumers stabilize is explicitly a W0050/W0051 concern — W0049 fixes the
260
+ // structural shape around them without changing the payload contracts.
82
261
  const unknownEntry = z.unknown();
262
+ /**
263
+ * Truncate a string to at most `maxChars` code units. Code-unit-based
264
+ * truncation is safe for the preview use-case — previews are display-bound
265
+ * approximations and a rare mid-surrogate cut manifests as a replacement
266
+ * glyph, not data corruption. Pure function.
267
+ */
268
+ function truncateString(s, maxChars) {
269
+ return s.length <= maxChars ? s : s.slice(0, maxChars);
270
+ }
271
+ /**
272
+ * Narrow check that a value is a plain object whose entries are all number-
273
+ * valued strings. Used by preview extractors to safely include optional
274
+ * per-dimension score fields without trusting the caller's runtime shape.
275
+ */
276
+ function isStringNumberRecord(value) {
277
+ if (value === null || typeof value !== "object" || Array.isArray(value))
278
+ return false;
279
+ for (const v of Object.values(value)) {
280
+ if (typeof v !== "number")
281
+ return false;
282
+ }
283
+ return true;
284
+ }
285
+ /**
286
+ * Bucket a classified-judgment count into a severity tier. Thresholds are
287
+ * count-based: a category with 10+ judgments is critical triage; 5–9 is
288
+ * high; 2–4 medium; 0–1 low. Exposed so producers can mirror this when
289
+ * supplying explicit severity values on entries that have richer signal
290
+ * (e.g. per-dimension aggregate).
291
+ */
292
+ function severityForCount(count) {
293
+ if (count >= 10)
294
+ return "critical";
295
+ if (count >= 5)
296
+ return "high";
297
+ if (count >= 2)
298
+ return "medium";
299
+ return "low";
300
+ }
301
+ /**
302
+ * Render a FailureModeType-style kebab-case category id as Title Case for
303
+ * preview display (e.g. `"missing-docs"` → `"Missing Docs"`).
304
+ */
305
+ function titleCaseCategory(id) {
306
+ return id
307
+ .split("-")
308
+ .map((w) => (w.length === 0 ? w : w[0].toUpperCase() + w.slice(1)))
309
+ .join(" ");
310
+ }
311
+ function buildDescriptor(input) {
312
+ const objectPath = input.layout === "bulk"
313
+ ? bulkPathBuilder(input.slug, input.mime)
314
+ : perEntryPathBuilder(input.slug, input.mime);
315
+ const formatEntryKey = input.layout === "per-entry" ? formatKeyFromAxes(input.axes) : undefined;
316
+ const parseEntryKey = input.layout === "per-entry"
317
+ ? (input.parseEntryKey ?? parseKeyByAxes(input.type, input.axes))
318
+ : undefined;
319
+ return {
320
+ type: input.type,
321
+ layout: input.layout,
322
+ slug: input.slug,
323
+ association: { axes: input.axes },
324
+ entrySchema: input.entrySchema,
325
+ mime: input.mime,
326
+ capBytes: input.capBytes,
327
+ truncation: input.truncation,
328
+ optional: input.optional,
329
+ objectPath,
330
+ formatEntryKey,
331
+ parseEntryKey,
332
+ manifestPreview: input.manifestPreview,
333
+ };
334
+ }
83
335
  // ---------------------------------------------------------------------------
84
- // The registry
336
+ // The registry — 21 live descriptors + 1 deprecated (evalResults)
85
337
  // ---------------------------------------------------------------------------
86
338
  /**
87
339
  * The canonical artifact descriptor for every artifact type. Iterate with
88
340
  * `Object.values(ARTIFACT_REGISTRY)` or look up by `ARTIFACT_REGISTRY[type]`.
341
+ *
342
+ * Axes, layout, and caps come from docs/design-docs/unified-run-artifacts.md
343
+ * § M5. The mapping is verified by the L1 contract tests.
89
344
  */
90
345
  export const ARTIFACT_REGISTRY = {
91
- testOutputs: {
92
- type: "testOutputs",
346
+ // -- Run-scoped bulk artifacts (one per run) ------------------------------
347
+ runManifest: buildDescriptor({
348
+ type: "runManifest",
349
+ slug: "run-manifest",
350
+ layout: "bulk",
351
+ axes: ["run"],
352
+ entrySchema: unknownEntry,
353
+ mime: "application/json",
354
+ capBytes: 1_000_000,
355
+ }),
356
+ scoreSummary: buildDescriptor({
357
+ type: "scoreSummary",
358
+ slug: "score-summary",
359
+ layout: "bulk",
360
+ axes: ["run"],
361
+ entrySchema: unknownEntry,
362
+ mime: "application/json",
363
+ capBytes: 1_000_000,
364
+ }),
365
+ pipelineResult: buildDescriptor({
366
+ type: "pipelineResult",
367
+ slug: "pipeline-result",
368
+ layout: "bulk",
369
+ axes: ["run"],
370
+ entrySchema: unknownEntry,
371
+ mime: "application/json",
372
+ capBytes: 1_000_000,
373
+ }),
374
+ pipelineContext: buildDescriptor({
375
+ type: "pipelineContext",
376
+ slug: "pipeline-context",
377
+ layout: "bulk",
378
+ axes: ["run"],
379
+ entrySchema: unknownEntry,
380
+ mime: "application/json",
381
+ capBytes: 64_000,
382
+ }),
383
+ documentManifest: buildDescriptor({
384
+ type: "documentManifest",
385
+ slug: "document-manifest",
386
+ layout: "bulk",
387
+ axes: ["run"],
388
+ entrySchema: unknownEntry,
389
+ mime: "application/json",
390
+ capBytes: 256_000,
391
+ }),
392
+ prComment: buildDescriptor({
393
+ type: "prComment",
394
+ slug: "pr-comment",
395
+ layout: "bulk",
396
+ axes: ["run"],
397
+ entrySchema: z.string(),
398
+ mime: "text/markdown",
399
+ capBytes: 256_000,
400
+ optional: true,
401
+ }),
402
+ readinessReport: buildDescriptor({
403
+ type: "readinessReport",
404
+ slug: "readiness-report",
405
+ layout: "bulk",
406
+ axes: ["run"],
407
+ entrySchema: z.string(),
408
+ mime: "text/markdown",
409
+ capBytes: 256_000,
410
+ optional: true,
411
+ }),
412
+ reportSnapshot: buildDescriptor({
413
+ type: "reportSnapshot",
414
+ slug: "report-snapshot",
415
+ layout: "bulk",
416
+ axes: ["run"],
417
+ entrySchema: unknownEntry,
418
+ mime: "application/json",
419
+ capBytes: 10_000_000,
420
+ }),
421
+ autoComparison: buildDescriptor({
422
+ type: "autoComparison",
423
+ slug: "auto-comparison",
424
+ layout: "bulk",
425
+ axes: ["run"],
426
+ entrySchema: unknownEntry,
427
+ mime: "application/json",
428
+ capBytes: 4_000_000,
429
+ optional: true,
430
+ }),
431
+ gapReport: buildDescriptor({
432
+ type: "gapReport",
433
+ slug: "gap-report",
434
+ layout: "bulk",
435
+ axes: ["run"],
436
+ entrySchema: unknownEntry,
437
+ mime: "application/json",
438
+ capBytes: 1_000_000,
439
+ optional: true,
440
+ }),
441
+ // -- Run-scoped per-entry artifacts (keyed by assoc.name) -----------------
442
+ sinkResults: buildDescriptor({
443
+ type: "sinkResults",
444
+ slug: "sink-results",
93
445
  layout: "per-entry",
94
- slug: "test-outputs",
95
- entrySchema: testOutputEntrySchema,
96
- objectPath: perEntryPath("test-outputs"),
97
- parseEntryKey: parseTaskModelKey,
98
- },
99
- renderedPrompts: {
446
+ axes: ["run"],
447
+ entrySchema: unknownEntry,
448
+ mime: "application/json",
449
+ capBytes: 64_000,
450
+ }),
451
+ callbackRequest: buildDescriptor({
452
+ type: "callbackRequest",
453
+ slug: "callback-request",
454
+ layout: "per-entry",
455
+ axes: ["run"],
456
+ entrySchema: unknownEntry,
457
+ mime: "application/json",
458
+ capBytes: 64_000,
459
+ optional: true,
460
+ }),
461
+ callbackResponse: buildDescriptor({
462
+ type: "callbackResponse",
463
+ slug: "callback-response",
464
+ layout: "per-entry",
465
+ axes: ["run"],
466
+ entrySchema: unknownEntry,
467
+ mime: "application/json",
468
+ capBytes: 64_000,
469
+ optional: true,
470
+ }),
471
+ // -- Run × Mode ------------------------------------------------------------
472
+ configSnapshot: buildDescriptor({
473
+ type: "configSnapshot",
474
+ slug: "config-snapshot",
475
+ layout: "per-entry",
476
+ axes: ["run", "mode"],
477
+ entrySchema: z.string(),
478
+ mime: "application/yaml",
479
+ capBytes: 256_000,
480
+ }),
481
+ evalConfigGenerated: buildDescriptor({
482
+ type: "evalConfigGenerated",
483
+ slug: "eval-config-generated",
484
+ layout: "per-entry",
485
+ axes: ["run", "mode"],
486
+ entrySchema: z.string(),
487
+ mime: "application/yaml",
488
+ capBytes: 256_000,
489
+ optional: true,
490
+ }),
491
+ comparisonReport: buildDescriptor({
492
+ type: "comparisonReport",
493
+ slug: "comparison-report",
494
+ layout: "per-entry",
495
+ axes: ["run", "mode"],
496
+ entrySchema: unknownEntry,
497
+ mime: "application/json",
498
+ capBytes: 1_000_000,
499
+ optional: true,
500
+ }),
501
+ discoveryReport: buildDescriptor({
502
+ type: "discoveryReport",
503
+ slug: "discovery-report",
504
+ layout: "per-entry",
505
+ axes: ["run", "mode"],
506
+ entrySchema: z.string(),
507
+ mime: "text/markdown",
508
+ capBytes: 1_000_000,
509
+ optional: true,
510
+ }),
511
+ failureModes: buildDescriptor({
512
+ type: "failureModes",
513
+ slug: "failure-modes",
514
+ layout: "per-entry",
515
+ axes: ["run", "mode", "category"],
516
+ entrySchema: unknownEntry,
517
+ mime: "application/json",
518
+ capBytes: 1_000_000,
519
+ optional: true,
520
+ manifestPreview: {
521
+ schema: failureModePreviewSchema,
522
+ extract: (entry) => {
523
+ const e = entry;
524
+ const category = typeof e.category === "string" ? e.category : "unknown";
525
+ const count = typeof e.count === "number" ? e.count : 0;
526
+ const title = typeof e.title === "string" ? e.title : titleCaseCategory(category);
527
+ return {
528
+ category,
529
+ severity: severityForCount(count),
530
+ titlePreview: truncateString(title, 120),
531
+ };
532
+ },
533
+ capBytes: 256,
534
+ },
535
+ }),
536
+ // -- Run × Mode × Task(+…) -------------------------------------------------
537
+ taskDefinitions: buildDescriptor({
538
+ type: "taskDefinitions",
539
+ slug: "task-definitions",
540
+ layout: "per-entry",
541
+ axes: ["run", "mode", "task"],
542
+ entrySchema: unknownEntry,
543
+ mime: "application/json",
544
+ capBytes: 256_000,
545
+ }),
546
+ renderedPrompts: buildDescriptor({
100
547
  type: "renderedPrompts",
101
- layout: "bulk",
102
548
  slug: "rendered-prompts",
549
+ layout: "per-entry",
550
+ axes: ["run", "mode", "task", "model"],
103
551
  entrySchema: unknownEntry,
104
- objectPath: bulkPath("rendered-prompts"),
105
- },
106
- rawResults: {
552
+ mime: "application/json",
553
+ capBytes: 1_000_000,
554
+ }),
555
+ rawResults: buildDescriptor({
107
556
  type: "rawResults",
108
- layout: "bulk",
109
557
  slug: "raw-results",
558
+ layout: "per-entry",
559
+ axes: ["run", "mode", "task", "model"],
110
560
  entrySchema: unknownEntry,
111
- objectPath: bulkPath("raw-results"),
112
- },
113
- graderPrompts: {
561
+ mime: "application/json",
562
+ capBytes: 1_000_000,
563
+ }),
564
+ testOutputs: buildDescriptor({
565
+ type: "testOutputs",
566
+ slug: "test-outputs",
567
+ layout: "per-entry",
568
+ axes: ["run", "mode", "task", "model"],
569
+ entrySchema: testOutputEntrySchema,
570
+ mime: "application/json",
571
+ capBytes: 1_000_000,
572
+ parseEntryKey: parseTestOutputsEntryKey,
573
+ manifestPreview: {
574
+ schema: testOutputPreviewSchema,
575
+ extract: (entry) => {
576
+ const e = entry;
577
+ const raw = typeof e.responseOutput === "string" ? e.responseOutput : "";
578
+ return {
579
+ responsePreview: truncateString(raw, 280),
580
+ truncated: typeof e.responseOutputTruncated === "boolean"
581
+ ? e.responseOutputTruncated
582
+ : false,
583
+ };
584
+ },
585
+ capBytes: 320,
586
+ },
587
+ }),
588
+ graderPrompts: buildDescriptor({
114
589
  type: "graderPrompts",
115
- layout: "bulk",
116
590
  slug: "grader-prompts",
591
+ layout: "per-entry",
592
+ axes: ["run", "mode", "task", "model", "grader"],
117
593
  entrySchema: unknownEntry,
118
- objectPath: bulkPath("grader-prompts"),
119
- },
120
- taskDefinitions: {
121
- type: "taskDefinitions",
122
- layout: "bulk",
123
- slug: "task-definitions",
594
+ mime: "application/json",
595
+ capBytes: 512_000,
596
+ }),
597
+ graderJudgments: buildDescriptor({
598
+ type: "graderJudgments",
599
+ slug: "grader-judgments",
600
+ layout: "per-entry",
601
+ axes: ["run", "mode", "task", "model", "grader"],
124
602
  entrySchema: unknownEntry,
125
- objectPath: bulkPath("task-definitions"),
126
- },
127
- evalResults: {
603
+ mime: "application/json",
604
+ capBytes: 512_000,
605
+ manifestPreview: {
606
+ schema: graderJudgmentPreviewSchema,
607
+ extract: (entry) => {
608
+ const e = entry;
609
+ const score = typeof e.score === "number" ? e.score : 0;
610
+ const reasonText = typeof e.reason === "string" ? e.reason : "";
611
+ const dimensionScores = isStringNumberRecord(e.dimensionScores)
612
+ ? e.dimensionScores
613
+ : undefined;
614
+ return {
615
+ score,
616
+ reasonPreview: truncateString(reasonText, 280),
617
+ ...(dimensionScores === undefined ? {} : { dimensionScores }),
618
+ };
619
+ },
620
+ capBytes: 512,
621
+ },
622
+ }),
623
+ traces: buildDescriptor({
624
+ type: "traces",
625
+ slug: "traces",
626
+ layout: "per-entry",
627
+ axes: ["run", "mode", "task", "model", "trial"],
628
+ entrySchema: unknownEntry,
629
+ mime: "application/x-ndjson",
630
+ capBytes: 10_000_000,
631
+ truncation: "trial-oversize",
632
+ }),
633
+ /**
634
+ * @deprecated Emit removed in W0050 (no producer calls `emit("evalResults")`
635
+ * any more — `emit-eval-results.ts` decomposes the promptfoo aggregate into
636
+ * per-entry rawResults / renderedPrompts / graderPrompts / graderJudgments
637
+ * instead). Descriptor retained for read-compat on pre-W0050 reports until
638
+ * W0052 removes it entirely. No code path should re-introduce emission.
639
+ */
640
+ evalResults: buildDescriptor({
128
641
  type: "evalResults",
129
- layout: "bulk",
130
642
  slug: "eval-results",
131
- entrySchema: unknownEntry,
132
- objectPath: bulkPath("eval-results"),
133
- },
134
- traces: {
135
- type: "traces",
136
643
  layout: "bulk",
137
- slug: "traces",
644
+ axes: ["run"],
138
645
  entrySchema: unknownEntry,
139
- objectPath: bulkPath("traces"),
140
- },
646
+ mime: "application/json",
647
+ capBytes: 10_000_000,
648
+ optional: true,
649
+ }),
141
650
  };
142
651
  /** All artifact types in declaration order. */
143
652
  export const ARTIFACT_TYPES = Object.keys(ARTIFACT_REGISTRY);
@@ -148,3 +657,155 @@ export const ARTIFACT_TYPES = Object.keys(ARTIFACT_REGISTRY);
148
657
  export function isArtifactType(value) {
149
658
  return value in ARTIFACT_REGISTRY;
150
659
  }
660
+ // ---------------------------------------------------------------------------
661
+ // Module-load invariant (D0033 / W0049)
662
+ // ---------------------------------------------------------------------------
663
/**
 * Unbounded axes — dimensions whose cardinality grows with a run. A bulk
 * artifact fanning across these cannot bound its payload; the registry
 * forbids that shape at import time.
 */
const UNBOUNDED_AXES = [
    "task",
    "model",
    "trial",
];
/**
 * Structural check run against a single descriptor. Exported so L1 contract
 * tests can construct an invalid descriptor inline and assert the throw.
 *
 * Rules enforced, in order:
 *  1. A descriptor whose `association.axes` contains any unbounded axis must
 *     use layout "per-entry".
 *  2. `capBytes` must be a number strictly greater than zero.
 *  3. A "per-entry" descriptor must declare `formatEntryKey`.
 *
 * @param {object} desc - Descriptor to validate; reads `type`, `layout`,
 *   `association.axes`, `capBytes` and `formatEntryKey`.
 * @returns {void}
 * @throws {Error} When any of the rules above is violated.
 */
export function assertValidArtifactDescriptor(desc) {
    // Collect the offending axes once; the list doubles as the error detail.
    const unboundedAxes = desc.association.axes.filter((axis) => UNBOUNDED_AXES.includes(axis));
    if (unboundedAxes.length > 0 && desc.layout !== "per-entry") {
        throw new Error(`Artifact ${desc.type}: association contains unbounded axis (${unboundedAxes.join(", ")}) but layout is "${desc.layout}". Unbounded axes require layout "per-entry".`);
    }
    // Phrased as a negated `> 0` so that NaN, undefined and non-numeric values
    // fail the check too — a plain `<= 0` comparison is false for all of them
    // and would let a descriptor without a usable cap through.
    if (typeof desc.capBytes !== "number" || !(desc.capBytes > 0)) {
        throw new Error(`Artifact ${desc.type}: capBytes must be > 0 (got ${desc.capBytes})`);
    }
    if (desc.layout === "per-entry" && !desc.formatEntryKey) {
        throw new Error(`Artifact ${desc.type}: per-entry descriptors must declare formatEntryKey`);
    }
}
691
// Import-time enforcement: every registered descriptor is validated as soon
// as this module loads, so a bad descriptor kills the process before any
// producer can silently serialize an oversized JSON array.
Object.values(ARTIFACT_REGISTRY).forEach((descriptor) => {
    assertValidArtifactDescriptor(descriptor);
});
696
+ // ---------------------------------------------------------------------------
697
+ // Manifest preview helper (W0051 / D0033 M7)
698
+ // ---------------------------------------------------------------------------
699
/**
 * Produce the inline preview stored on a manifest entry at write time.
 *
 * The preview is best-effort triage metadata, never critical-path data, so
 * every failure mode is reported with `console.warn` and yields `undefined`
 * instead of throwing:
 *  - the descriptor declares no `manifestPreview`;
 *  - `manifestPreview.extract` throws;
 *  - `manifestPreview.schema.safeParse` rejects the extracted value;
 *  - the validated preview cannot be brought under `manifestPreview.capBytes`
 *    by `fitPreviewToCap`.
 *
 * @param {object} descriptor - Artifact descriptor; reads `type` and
 *   `manifestPreview` (`extract`, `schema`, `capBytes`).
 * @param {unknown} payload - Entry payload handed to `extract`.
 * @returns {unknown} The cap-fitted preview, or `undefined` when no preview
 *   is produced.
 */
export function buildManifestPreview(descriptor, payload) {
    const previewSpec = descriptor.manifestPreview;
    if (!previewSpec) {
        return undefined;
    }
    let candidate;
    try {
        candidate = previewSpec.extract(payload);
    }
    catch (failure) {
        console.warn(` ⚠️ manifestPreview.extract("${descriptor.type}") threw: ${errMessage(failure)} — dropping preview`);
        return undefined;
    }
    const validation = previewSpec.schema.safeParse(candidate);
    if (validation.success) {
        const withinCap = fitPreviewToCap(validation.data, previewSpec.capBytes);
        if (withinCap) {
            return withinCap;
        }
        console.warn(` ⚠️ manifestPreview for "${descriptor.type}" exceeds capBytes=${previewSpec.capBytes} and cannot be truncated — dropping preview`);
        return undefined;
    }
    console.warn(` ⚠️ manifestPreview schema rejected "${descriptor.type}" preview — dropping preview`);
    return undefined;
}
739
/**
 * Turn a caught value into a printable message.
 *
 * @param {unknown} err - Whatever was thrown.
 * @returns {string} `err.message` for `Error` instances, otherwise
 *   `String(err)`.
 */
function errMessage(err) {
    if (err instanceof Error) {
        return err.message;
    }
    return String(err);
}
742
/**
 * Hard-truncate string fields of `preview` until `JSON.stringify(preview)`
 * fits within `capBytes` (measured with `byteLengthUtf8`).
 *
 * Works on a shallow clone, so the caller's object is never mutated; when
 * the preview already fits, the original object itself is returned. Only
 * top-level string fields are shortened — nested values are not visited.
 *
 * @param {unknown} preview - Validated preview value.
 * @param {number} capBytes - Maximum serialized size in UTF-8 bytes.
 * @returns {unknown} The fitted preview, or `undefined` if it cannot be
 *   serialized or no amount of shortening brings it under the cap (e.g. the
 *   non-string fields alone exceed it).
 */
function fitPreviewToCap(preview, capBytes) {
    if (preview === null || typeof preview !== "object") {
        const serialized = JSON.stringify(preview);
        // JSON.stringify yields undefined for undefined, functions and
        // symbols; there is nothing to measure, so report "cannot fit"
        // instead of crashing in the byte counter.
        if (serialized === undefined) {
            return undefined;
        }
        return byteLengthUtf8(serialized) <= capBytes ? preview : undefined;
    }
    const clone = { ...preview };
    if (byteLengthUtf8(JSON.stringify(clone)) <= capBytes) {
        return preview; // already fits; original returned
    }
    // Repeatedly trim the longest string field by ~10% of its length (min 4)
    // until we fit or no trimmable string is left.
    // Bound the loop to guard against pathological schemas.
    for (let pass = 0; pass < 256; pass++) {
        const longestKey = findLongestStringKey(clone);
        if (longestKey === null) {
            return undefined;
        }
        const current = clone[longestKey];
        if (current.length <= 1) {
            // Drop this field entirely — it can't be shortened further.
            delete clone[longestKey];
        }
        else {
            const trimBy = Math.max(4, Math.ceil(current.length * 0.1));
            // Clamp at zero: a negative end index would be read by `slice`
            // as an offset from the end of the string.
            let keep = Math.max(0, current.length - trimBy);
            if (keep > 0) {
                const before = current.charCodeAt(keep - 1);
                const after = current.charCodeAt(keep);
                const splitsPair = before >= 0xd800 && before < 0xdc00 &&
                    after >= 0xdc00 && after < 0xe000;
                // Never cut between the two halves of a surrogate pair —
                // that would leave a lone high surrogate in the preview.
                if (splitsPair) {
                    keep -= 1;
                }
            }
            clone[longestKey] = current.slice(0, keep);
        }
        if (byteLengthUtf8(JSON.stringify(clone)) <= capBytes) {
            return clone;
        }
    }
    return undefined;
}
780
/**
 * UTF-8 byte length of a string. Implemented without Node's `Buffer` so
 * `@sanity/ailf-core` stays dependency-free of `@types/node` — the kernel
 * runs in both Node and the browser (Studio).
 *
 * Well-formed surrogate pairs count as one 4-byte sequence. An unpaired
 * surrogate (high or low) counts as 3 bytes — the size of U+FFFD, which is
 * what standard UTF-8 encoders substitute for it — and never swallows the
 * code unit that follows it.
 *
 * @param {string} s - String to measure.
 * @returns {number} Number of bytes `s` occupies when encoded as UTF-8.
 */
function byteLengthUtf8(s) {
    let bytes = 0;
    // String iteration walks code points: a valid surrogate pair arrives as
    // a single two-unit chunk, a lone surrogate as a single one-unit chunk.
    for (const symbol of s) {
        const codePoint = symbol.codePointAt(0);
        if (codePoint < 0x80) {
            bytes += 1;
        }
        else if (codePoint < 0x800) {
            bytes += 2;
        }
        else if (codePoint < 0x10000) {
            // Rest of the BMP, including lone surrogates (encoded as U+FFFD).
            bytes += 3;
        }
        else {
            bytes += 4;
        }
    }
    return bytes;
}
803
/**
 * Find the own enumerable key of `obj` whose value is the longest string
 * (by `length`). Ties go to the key encountered first.
 *
 * @param {object} obj - Object to scan; non-string values are ignored.
 * @returns {string | null} The winning key, or `null` when `obj` has no
 *   string-valued field.
 */
function findLongestStringKey(obj) {
    let winner = null;
    let winnerLength = -1;
    for (const name of Object.keys(obj)) {
        const candidate = obj[name];
        if (typeof candidate !== "string") {
            continue;
        }
        // Strictly greater, so the earliest of several equally long strings wins.
        if (candidate.length > winnerLength) {
            winner = name;
            winnerLength = candidate.length;
        }
    }
    return winner;
}