@sanity/ailf 2.8.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +124 -23
  4. package/dist/_vendor/ailf-core/artifact-registry.js +708 -64
  5. package/dist/_vendor/ailf-core/batch-signing.d.ts +64 -0
  6. package/dist/_vendor/ailf-core/batch-signing.js +23 -0
  7. package/dist/_vendor/ailf-core/index.d.ts +3 -2
  8. package/dist/_vendor/ailf-core/index.js +3 -2
  9. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +59 -20
  10. package/dist/_vendor/ailf-core/ports/artifact-writer.js +33 -10
  11. package/dist/_vendor/ailf-core/ports/context.d.ts +20 -17
  12. package/dist/_vendor/ailf-core/ports/index.d.ts +0 -2
  13. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
  14. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  15. package/dist/_vendor/ailf-core/services/index.js +1 -0
  16. package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
  17. package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
  18. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +33 -0
  19. package/dist/_vendor/ailf-core/types/index.d.ts +202 -23
  20. package/dist/adapters/config-sources/file-config-adapter.js +0 -4
  21. package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
  22. package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
  23. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +17 -4
  24. package/dist/artifact-capture/api-gateway-artifact-writer.js +58 -7
  25. package/dist/artifact-capture/emit-file.d.ts +28 -0
  26. package/dist/artifact-capture/emit-file.js +56 -0
  27. package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
  28. package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
  29. package/dist/artifact-capture/gcs-artifact-writer.d.ts +40 -3
  30. package/dist/artifact-capture/gcs-artifact-writer.js +238 -14
  31. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
  32. package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
  33. package/dist/artifact-capture/redact-artifact.d.ts +3 -5
  34. package/dist/artifact-capture/redact-artifact.js +3 -5
  35. package/dist/cli.js +56 -2
  36. package/dist/commands/explain-handler.js +4 -4
  37. package/dist/commands/pipeline-action.d.ts +5 -4
  38. package/dist/commands/pipeline-action.js +33 -16
  39. package/dist/commands/pipeline.d.ts +4 -4
  40. package/dist/commands/pipeline.js +4 -4
  41. package/dist/commands/publish.js +4 -1
  42. package/dist/commands/runs.d.ts +18 -0
  43. package/dist/commands/runs.js +71 -0
  44. package/dist/composition-root.d.ts +13 -10
  45. package/dist/composition-root.js +74 -46
  46. package/dist/orchestration/build-app-context.js +4 -7
  47. package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
  48. package/dist/orchestration/pipeline-orchestrator.js +37 -46
  49. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
  50. package/dist/orchestration/steps/calculate-scores-step.js +19 -19
  51. package/dist/orchestration/steps/callback-step.d.ts +1 -1
  52. package/dist/orchestration/steps/callback-step.js +6 -4
  53. package/dist/orchestration/steps/compare-step.d.ts +1 -1
  54. package/dist/orchestration/steps/compare-step.js +4 -2
  55. package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
  56. package/dist/orchestration/steps/discovery-report-step.js +4 -1
  57. package/dist/orchestration/steps/fetch-docs-step.js +9 -15
  58. package/dist/orchestration/steps/finalize-run-step.js +21 -7
  59. package/dist/orchestration/steps/gap-analysis-step.js +34 -6
  60. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
  61. package/dist/orchestration/steps/generate-configs-step.js +11 -11
  62. package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
  63. package/dist/orchestration/steps/publish-report-step.js +24 -19
  64. package/dist/orchestration/steps/readiness-step.d.ts +1 -1
  65. package/dist/orchestration/steps/readiness-step.js +4 -1
  66. package/dist/orchestration/steps/report-step.d.ts +1 -1
  67. package/dist/orchestration/steps/report-step.js +6 -3
  68. package/dist/orchestration/steps/run-eval-step.js +14 -9
  69. package/dist/pipeline/compare.d.ts +2 -2
  70. package/dist/pipeline/emit-eval-results.d.ts +38 -0
  71. package/dist/pipeline/emit-eval-results.js +100 -0
  72. package/dist/pipeline/map-request-to-config.js +0 -4
  73. package/package.json +1 -1
  74. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.d.ts +0 -14
  75. package/dist/_vendor/ailf-core/artifact-capture/noop-collector.js +0 -25
  76. package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +0 -94
  77. package/dist/_vendor/ailf-core/ports/artifact-collector.js +0 -13
  78. package/dist/_vendor/ailf-core/ports/capture-comparator.d.ts +0 -138
  79. package/dist/_vendor/ailf-core/ports/capture-comparator.js +0 -10
  80. package/dist/artifact-capture/comparator.d.ts +0 -22
  81. package/dist/artifact-capture/comparator.js +0 -493
  82. package/dist/artifact-capture/filesystem-collector.d.ts +0 -42
  83. package/dist/artifact-capture/filesystem-collector.js +0 -237
  84. package/dist/artifact-capture/gcs-collector.d.ts +0 -55
  85. package/dist/artifact-capture/gcs-collector.js +0 -117
  86. package/dist/commands/capture-compare.d.ts +0 -15
  87. package/dist/commands/capture-compare.js +0 -253
  88. package/dist/commands/capture-list.d.ts +0 -12
  89. package/dist/commands/capture-list.js +0 -150
  90. package/dist/commands/capture.d.ts +0 -9
  91. package/dist/commands/capture.js +0 -16
@@ -0,0 +1,217 @@
1
+ /**
2
+ * slim-report-summary.ts
3
+ *
4
+ * Pure transformer: turns a full `ScoreSummary` (the shape of `score-summary.json`)
5
+ * into the slim `ReportSummary` that Phase C of D0033 publishes on the
6
+ * Report Content-Lake document. Inlined prose and long arrays are replaced
7
+ * with `id` references to external artifacts (graderJudgments, failureModes,
8
+ * gapReport, traces).
9
+ *
10
+ * **The `id` principle**: every slim reference carries the manifest entry
11
+ * key of the external artifact it points at, produced by the descriptor's
12
+ * own `formatEntryKey(axes)`. Studio looks the id up in `Report.artifactManifest`
13
+ * to get the preview, and hydrates the full payload on drill-down.
14
+ *
15
+ * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M7)
16
+ * @see docs/work-items/W0051-report-slim-down-manifest-preview-hooks.json
17
+ */
18
+ import { ARTIFACT_REGISTRY } from "../artifact-registry.js";
19
+ /**
20
+ * Transform a full pipeline `ScoreSummary` into its slim Report counterpart.
21
+ *
22
+ * Each of the four heavy fields is reshaped independently; everything else
23
+ * flows through untouched via structural spread. Pure function — the input
24
+ * summary is not mutated.
25
+ *
26
+ * @param mode The evaluation mode (used to populate the `mode` axis on slim
27
+ * judgment / failure-mode ids). `score-summary.json` carries
28
+ * the mode in `evaluationMode` but the publisher supplies it
29
+ * explicitly to keep the helper independent of that field.
30
+ */
31
+ export function buildSlimReportSummary(summary, mode) {
32
+ const { agentBehavior: fullAgentBehavior, failureModes: fullFailureModes, lowScoringJudgments: fullJudgments, recommendations: fullRecommendations, ...rest } = summary;
33
+ return {
34
+ ...rest,
35
+ ...(fullJudgments
36
+ ? { lowScoringJudgments: slimJudgments(fullJudgments, mode) }
37
+ : {}),
38
+ ...(fullFailureModes
39
+ ? { failureModes: slimFailureModes(fullFailureModes, mode) }
40
+ : {}),
41
+ ...(fullRecommendations
42
+ ? { recommendations: slimRecommendations(fullRecommendations) }
43
+ : {}),
44
+ ...(fullAgentBehavior
45
+ ? { agentBehavior: slimAgentBehavior(fullAgentBehavior) }
46
+ : {}),
47
+ };
48
+ }
49
+ // ---------------------------------------------------------------------------
50
+ // Judgments — axes {mode, task, model, grader}
51
+ // ---------------------------------------------------------------------------
52
+ /**
53
+ * Variant-suffix stripper. Judgments' `taskId` today carries `(gold)` /
54
+ * `(baseline)` suffixes that encode the pipeline mode. We strip the suffix
55
+ * to build the canonical `task` axis value; the `mode` axis takes the stripped
56
+ * variant, or the caller-supplied `mode` when no suffix is present — matching how `formatEntryKey` is computed
57
+ * at producer emit time.
58
+ */
59
+ function splitTaskVariant(taskId) {
60
+ const match = /\s*\((gold|baseline)\)\s*$/i.exec(taskId);
61
+ if (!match)
62
+ return { task: taskId, variant: null };
63
+ return {
64
+ task: taskId.slice(0, match.index).trim(),
65
+ variant: match[1].toLowerCase(),
66
+ };
67
+ }
68
+ function slimJudgments(full, defaultMode) {
69
+ const descriptor = ARTIFACT_REGISTRY.graderJudgments;
70
+ const formatKey = descriptor.formatEntryKey;
71
+ if (!formatKey) {
72
+ throw new Error("slimJudgments: graderJudgments descriptor is missing formatEntryKey");
73
+ }
74
+ return full.map((j) => {
75
+ const { task, variant } = splitTaskVariant(j.taskId);
76
+ // The judgment's task variant overrides the caller-supplied mode when
77
+ // present — a single summary run may carry judgments from both the
78
+ // gold and baseline halves of a literacy run.
79
+ const mode = variant ?? defaultMode;
80
+ const graderId = j.dimension;
81
+ const id = formatKey({
82
+ mode,
83
+ task,
84
+ model: j.modelId,
85
+ grader: graderId,
86
+ });
87
+ const reason = typeof j.reason === "string" ? j.reason : "";
88
+ const reasonPreview = reason.length > 280 ? reason.slice(0, 280) : reason;
89
+ return {
90
+ id,
91
+ taskId: j.taskId,
92
+ modelId: j.modelId,
93
+ graderId,
94
+ // Legacy alias — pre-W0051 Studio readers access `.dimension`.
95
+ // Remove in Slice 6 once those consumers migrate.
96
+ dimension: graderId,
97
+ score: j.score,
98
+ // Inline fallback for offline/cache-skipped runs whose manifest is
99
+ // empty. Truncated to match the graderJudgments descriptor preview
100
+ // schema so either source renders identically at the same cap.
101
+ reasonPreview,
102
+ // Legacy alias — same truncated text under the old name.
103
+ reason: reasonPreview,
104
+ };
105
+ });
106
+ }
107
+ // ---------------------------------------------------------------------------
108
+ // Failure modes — axes {mode, category}, top-N by count
109
+ // ---------------------------------------------------------------------------
110
+ const FAILURE_MODE_TOP_N = 5;
111
+ function slimFailureModes(full, defaultMode) {
112
+ const descriptor = ARTIFACT_REGISTRY.failureModes;
113
+ const formatKey = descriptor.formatEntryKey;
114
+ if (!formatKey) {
115
+ throw new Error("slimFailureModes: failureModes descriptor is missing formatEntryKey");
116
+ }
117
+ const counts = { ...full.summary };
118
+ const nonZero = Object.entries(counts).filter(([, n]) => n > 0);
119
+ nonZero.sort((a, b) => b[1] - a[1]);
120
+ const topTitles = nonZero
121
+ .slice(0, FAILURE_MODE_TOP_N)
122
+ .map(([category, count]) => ({
123
+ id: formatKey({ mode: defaultMode, category }),
124
+ category,
125
+ severity: severityForCount(count),
126
+ title: toTitleCase(category),
127
+ count,
128
+ }));
129
+ return {
130
+ counts,
131
+ topTitles,
132
+ totalJudgments: full.totalJudgments,
133
+ classificationRate: full.classificationRate,
134
+ };
135
+ }
136
+ function severityForCount(count) {
137
+ if (count >= 10)
138
+ return "critical";
139
+ if (count >= 5)
140
+ return "high";
141
+ if (count >= 2)
142
+ return "medium";
143
+ return "low";
144
+ }
145
+ function toTitleCase(id) {
146
+ return id
147
+ .split("-")
148
+ .map((w) => (w.length === 0 ? w : w[0].toUpperCase() + w.slice(1)))
149
+ .join(" ");
150
+ }
151
+ // ---------------------------------------------------------------------------
152
+ // Recommendations — bulk artifact; id is a synthetic area--failureMode composite
153
+ // ---------------------------------------------------------------------------
154
+ const RECOMMENDATION_TOP_N = 3;
155
+ function slimRecommendations(full) {
156
+ const counts = {};
157
+ for (const gap of full.gaps) {
158
+ counts[gap.area] = (counts[gap.area] ?? 0) + 1;
159
+ }
160
+ // Sort by priority descending, break ties by estimatedLift.
161
+ const sorted = [...full.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
162
+ (b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
163
+ const top3 = sorted
164
+ .slice(0, RECOMMENDATION_TOP_N)
165
+ .map((g) => ({
166
+ id: gapId(g),
167
+ area: g.area,
168
+ title: toTitleCase(g.failureMode),
169
+ priority: g.priority,
170
+ }));
171
+ return {
172
+ counts,
173
+ top3,
174
+ totalGaps: full.gaps.length,
175
+ totalPotentialLift: full.totalPotentialLift,
176
+ };
177
+ }
178
+ /**
179
+ * Synthetic composite id for a gap — the gapReport artifact is bulk, so a
180
+ * per-gap manifest entry does not exist. This id lets Studio deduplicate
181
+ * gaps and deep-link within the bulk artifact.
182
+ */
183
+ function gapId(g) {
184
+ return `${sanitizeIdSegment(g.area)}--${sanitizeIdSegment(g.failureMode)}`;
185
+ }
186
+ function sanitizeIdSegment(s) {
187
+ return s
188
+ .trim()
189
+ .toLowerCase()
190
+ .replace(/[^a-z0-9]+/g, "-")
191
+ .replace(/^-+|-+$/g, "");
192
+ }
193
+ // ---------------------------------------------------------------------------
194
+ // Agent behavior — counts + first-N samples per feature
195
+ // ---------------------------------------------------------------------------
196
+ const BEHAVIOR_SAMPLE_N = 5;
197
+ function slimAgentBehavior(full) {
198
+ return full.map((f) => {
199
+ const uniqueQueries = dedupe(f.searchQueries);
200
+ const uniqueSlugs = dedupe(f.docSlugsVisited);
201
+ return {
202
+ feature: f.feature,
203
+ avgDocPagesVisited: f.avgDocPagesVisited,
204
+ avgNetworkTimeMs: f.avgNetworkTimeMs,
205
+ avgSearchesPerformed: f.avgSearchesPerformed,
206
+ tasksWithBehaviorData: f.tasksWithBehaviorData,
207
+ externalDomains: f.externalDomains,
208
+ searchQueriesCount: uniqueQueries.length,
209
+ searchQueriesSample: uniqueQueries.slice(0, BEHAVIOR_SAMPLE_N),
210
+ docSlugsVisitedCount: uniqueSlugs.length,
211
+ docSlugsVisitedSample: uniqueSlugs.slice(0, BEHAVIOR_SAMPLE_N),
212
+ };
213
+ });
214
+ }
215
+ function dedupe(items) {
216
+ return [...new Set(items)];
217
+ }
@@ -62,6 +62,39 @@ export type RubricId = Brand<string, "RubricId">;
62
62
  export type FixtureId = Brand<string, "FixtureId">;
63
63
  /** Unique identifier for a build artifact */
64
64
  export type ArtifactId = Brand<string, "ArtifactId">;
65
+ /**
66
+ * The dimensions an artifact is evidence about. Axes determine where the
67
+ * artifact lives, how its entry key is shaped, and — at module load — whether
68
+ * its declared layout is allowed.
69
+ *
70
+ * Unbounded axes (`task`, `model`, `trial`) force `layout: "per-entry"` at the
71
+ * registry invariant; the other axes are finite enough that bulk is acceptable.
72
+ *
73
+ * `category` identifies a classification bucket for artifacts that partition
74
+ * per-mode (e.g. `failureModes`, one entry per classified failure category —
75
+ * D0033 M7, W0051 Slice 2).
76
+ */
77
+ export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category";
78
+ /**
79
+ * The sanitized, filename-safe identifier for a single per-entry artifact
80
+ * object. Produced by `ArtifactDescriptor.formatEntryKey` and parsed by
81
+ * `parseEntryKey`; not a free-form string at runtime.
82
+ */
83
+ export type EntryKey = Brand<string, "EntryKey">;
84
+ /**
85
+ * The subset of `AssociationAxis` values an artifact carries at write time.
86
+ * Used on the writer API (`emit(type, association, payload)`) and on manifest
87
+ * entries so the axes that identify the entry are self-describing in storage.
88
+ *
89
+ * Canonical axes are strongly-keyed. Descriptors whose per-entry layout is
90
+ * discriminated by a name that isn't a pipeline axis (e.g. `sinkResults` by
91
+ * sink name, `callbackRequest` by callback target) read that identifier from
92
+ * the `name` slot. Keep ad-hoc keys out of the canonical axis list so the
93
+ * module-load invariant (unbounded-axis ⇒ per-entry) stays crisp.
94
+ */
95
+ export type AssociationValues = Partial<Record<AssociationAxis, string | number>> & {
96
+ readonly name?: string;
97
+ };
65
98
  /**
66
99
  * A success result containing a value.
67
100
  */
@@ -10,7 +10,8 @@
10
10
  * re-export barrel that preserves backward compatibility.
11
11
  */
12
12
  import type { DocumentRef as _DocumentRef, EvalMode, RunContext } from "../../ailf-shared/index.d.ts";
13
- import type { RunId } from "./branded-ids.js";
13
+ import type { ArtifactType } from "../artifact-registry.js";
14
+ import type { AssociationValues, RunId } from "./branded-ids.js";
14
15
  export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
15
16
  export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
16
17
  export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
@@ -22,7 +23,7 @@ export { evalModeType } from "./eval-mode-config.js";
22
23
  export type { DependencyEdge, ResolvedFixture, TaskGraph, TaskNode, } from "./task-graph.js";
23
24
  export type { VariableDeclaration, VariableEnvelope, VariableProvenance, VariableSource, } from "./variable-envelope.js";
24
25
  export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan, TraceTokenUsage, } from "./trace.js";
25
- export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
26
+ export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
26
27
  export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
27
28
  export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
28
29
  type DocumentRef = _DocumentRef;
@@ -901,6 +902,142 @@ export interface ScoreSummary {
901
902
  };
902
903
  timestamp: string;
903
904
  }
905
+ /**
906
+ * The subset of `ScoreSummary` the `compare()` primitive reads and the
907
+ * only fields consumers of `ComparisonReport.baseline` / `.experiment`
908
+ * access at runtime. Both the full pipeline `ScoreSummary` and the slim
909
+ * `ReportSummary` (W0051) carry these fields unchanged, so stored Reports
910
+ * can participate in auto-compare without re-hydrating prose fields.
911
+ */
912
+ export type ComparableSummary = Pick<ScoreSummary, "overall" | "perModel" | "scores">;
913
+ /**
914
+ * Slim pointer to a single low-scoring grader judgment. Replaces the full
915
+ * `StoredJudgment` inlined on pre-W0051 reports. The `id` field IS the
916
+ * `graderJudgments` manifest entry key — Studio looks up `reasonPreview`
917
+ * there for list rendering and hydrates the full reason on drill-down.
918
+ *
919
+ * `reasonPreview` is ALSO carried inline as a graceful-degradation fallback
920
+ * for runs whose `artifactManifest` is empty (offline dev, failed GCS
921
+ * upload, cache-skip before manifest aggregation lands). At ~280 chars per
922
+ * judgment × the 50-judgment cap ≈ 14 KB on the Report — tiny against the
923
+ * 500 KB budget. Studio's dispatch prefers the manifest-entry preview when
924
+ * present so GCS drill-down still renders the hydrated-on-demand copy.
925
+ *
926
+ * `graderId` replaces the historical `dimension` field at the slim-shape
927
+ * boundary to match D0033 axis naming; runtime value is identical.
928
+ */
929
+ export interface SlimJudgmentRef {
930
+ /** Manifest entry key = `formatEntryKey({mode, task, model, grader})` for `graderJudgments`. */
931
+ id: string;
932
+ taskId: string;
933
+ modelId: string;
934
+ /** Rubric dimension name at runtime (what `GraderJudgment.dimension` carries). */
935
+ graderId: string;
936
+ /**
937
+ * Alias of `graderId` — carried for pre-W0051 Studio readers that still
938
+ * access `.dimension` (JudgmentList, judgment-formatting, et al). Remove
939
+ * in Slice 6 when those consumers migrate to `graderId`. Identical
940
+ * runtime value; no additional ambiguity.
941
+ */
942
+ dimension: string;
943
+ /** Normalized 0–100 score. */
944
+ score: number;
945
+ /**
946
+ * Truncated grader reason (≤280 chars). Inline fallback used when the
947
+ * manifest entry's preview is unavailable; the authoritative full reason
948
+ * lives in the `graderJudgments` external artifact.
949
+ */
950
+ reasonPreview?: string;
951
+ /**
952
+ * Alias of `reasonPreview` — legacy Studio renderers read `.reason`;
953
+ * expose the same truncated text under both names for the compat window.
954
+ * Remove alongside `dimension` in Slice 6.
955
+ */
956
+ reason?: string;
957
+ }
958
+ /**
959
+ * Slim failure-mode entry on the Report summary. One per classified
960
+ * `FailureModeType` bucket; `id` is the `failureModes` manifest entry key
961
+ * so drill-down can fetch the full category payload.
962
+ */
963
+ export interface SlimFailureModeTopTitle {
964
+ /** Manifest entry key = `formatEntryKey({mode, category})` for `failureModes`. */
965
+ id: string;
966
+ category: string;
967
+ severity: "low" | "medium" | "high" | "critical";
968
+ title: string;
969
+ count: number;
970
+ }
971
+ /** Counts + top-N per-category summary on the Report. */
972
+ export interface SlimFailureModesSummary {
973
+ /** Count by FailureModeType-ish id. */
974
+ counts: Record<string, number>;
975
+ /** Top-N categories by count, descending. */
976
+ topTitles: SlimFailureModeTopTitle[];
977
+ /** Total classified judgments across all categories. */
978
+ totalJudgments: number;
979
+ /** Percentage of judgments that landed in a non-unclassified bucket. */
980
+ classificationRate: number;
981
+ }
982
+ /**
983
+ * Slim gap pointer. The `gapReport` artifact is bulk (axes: `{run}`) so
984
+ * there is no per-gap manifest entry to point at — `id` is a stable
985
+ * synthetic composite so the UI can deduplicate and deep-link. Drill-down
986
+ * reads the full `gapReport` artifact and filters by id.
987
+ */
988
+ export interface SlimRecommendationGap {
989
+ /** Synthetic id: `${area}--${failureMode}`, kebab-safe. */
990
+ id: string;
991
+ area: string;
992
+ title: string;
993
+ /** Priority bucketing for triage UI ordering. */
994
+ priority: number;
995
+ }
996
+ /** Counts + top-3 summary on the Report. */
997
+ export interface SlimRecommendations {
998
+ /** Count of gaps by area. */
999
+ counts: Record<string, number>;
1000
+ /** Top-3 gaps by priority, descending. */
1001
+ top3: SlimRecommendationGap[];
1002
+ /** Total actionable gaps identified (sum of counts). */
1003
+ totalGaps: number;
1004
+ /** Aggregate estimated lift (matches `GapAnalysisReport.totalPotentialLift`). */
1005
+ totalPotentialLift: number;
1006
+ }
1007
+ /**
1008
+ * Slim per-feature agent-behavior summary. Full `searchQueries` and
1009
+ * `docSlugsVisited` arrays move to `traces` NDJSON; the Report keeps only
1010
+ * counts + first-N samples for triage preview. The `firstN` cap is the
1011
+ * producer's choice (we default to 5).
1012
+ */
1013
+ export interface SlimAgentBehaviorFeature {
1014
+ feature: string;
1015
+ avgDocPagesVisited: number;
1016
+ avgNetworkTimeMs: number;
1017
+ avgSearchesPerformed: number;
1018
+ tasksWithBehaviorData: number;
1019
+ externalDomains: string[];
1020
+ /** Count of distinct search queries across tasks in this feature. */
1021
+ searchQueriesCount: number;
1022
+ /** First-N unique search queries (bounded samples). */
1023
+ searchQueriesSample: string[];
1024
+ /** Count of distinct doc slugs visited. */
1025
+ docSlugsVisitedCount: number;
1026
+ /** First-N unique doc slugs (bounded samples). */
1027
+ docSlugsVisitedSample: string[];
1028
+ }
1029
+ /**
1030
+ * Slim `summary` field on a published `Report`. Structurally
1031
+ * `Omit<ScoreSummary, slimmed-fields> & slim-replacements` — every
1032
+ * pipeline-produced field survives except the four prose/array fields
1033
+ * W0051 moves to external artifacts.
1034
+ */
1035
+ export type ReportSummary = Omit<ScoreSummary, "agentBehavior" | "failureModes" | "lowScoringJudgments" | "recommendations"> & {
1036
+ agentBehavior?: SlimAgentBehaviorFeature[];
1037
+ failureModes?: SlimFailureModesSummary;
1038
+ lowScoringJudgments?: SlimJudgmentRef[];
1039
+ recommendations?: SlimRecommendations;
1040
+ };
904
1041
  /** Result of a single pipeline step */
905
1042
  export type StepResult = {
906
1043
  status: "failed";
@@ -1074,8 +1211,8 @@ export interface ComparisonReport {
1074
1211
  areas: AreaDelta[];
1075
1212
  /** Per-document attribution (when changed docs are known) */
1076
1213
  attribution?: AttributionReport;
1077
- /** The "before" or "control" summary */
1078
- baseline: ScoreSummary;
1214
+ /** The "before" or "control" summary (narrowed in W0051 so slim Reports compare) */
1215
+ baseline: ComparableSummary;
1079
1216
  /** Aggregate deltas */
1080
1217
  deltas: {
1081
1218
  /** Overall score delta (experiment.avgScore − baseline.avgScore) */
@@ -1094,8 +1231,8 @@ export interface ComparisonReport {
1094
1231
  modelId: string;
1095
1232
  }[];
1096
1233
  };
1097
- /** The "after" or "treatment" summary */
1098
- experiment: ScoreSummary;
1234
+ /** The "after" or "treatment" summary (narrowed in W0051 so slim Reports compare) */
1235
+ experiment: ComparableSummary;
1099
1236
  /** When this comparison was generated */
1100
1237
  generatedAt: string;
1101
1238
  /** Areas that improved beyond the noise threshold */
@@ -1177,7 +1314,37 @@ export interface PublishResult {
1177
1314
  }[];
1178
1315
  }
1179
1316
  /**
1180
- * Reference to an artifact in external object storage (GCS). See D0032.
1317
+ * A single per-entry row in `ArtifactRef.entries`. Carries enough metadata for
1318
+ * Studio list/triage views to render without fetching the external payload.
1319
+ *
1320
+ * D0033/W0049 extensions:
1321
+ * - `association` — the axis values that identify this entry (`{task, model}`
1322
+ * for testOutputs, etc.). Present only on manifests written by
1323
+ * `emit()`; legacy manifests omit it and readers treat absence as `{}`.
1324
+ * - `truncated` — whether the entry payload was capped by the descriptor's
1325
+ * `capBytes`. Readers treat absence as `false` (pre-W0049 manifests were
1326
+ * never truncated because no caps were enforced).
1327
+ * - `preview` — an inline summary produced by the descriptor's
1328
+ * `manifestPreview.extract()`. Typed via the descriptor's preview schema;
1329
+ * omitted when the descriptor has no `manifestPreview`. Wiring lands in
1330
+ * W0051; the field is reserved here so manifests written now parse there.
1331
+ */
1332
+ export interface ArtifactRefEntry {
1333
+ key: string;
1334
+ bytes: number;
1335
+ association?: AssociationValues;
1336
+ truncated?: boolean;
1337
+ preview?: unknown;
1338
+ }
1339
+ /**
1340
+ * Reference to an artifact in external object storage.
1341
+ *
1342
+ * `store` discriminates the backend: `"gcs"` uses `bucket` as the bucket
1343
+ * name (D0032); `"local"` uses `bucket` as the absolute rootDir path
1344
+ * under which `path` resolves to a file on disk (D0033 / W0050 M4).
1345
+ * Consumers (Studio retrieval, contract tests) branch on `store` only
1346
+ * when constructing the fetch URL — the `path` is store-relative and
1347
+ * identical across backends for the same logical artifact.
1181
1348
  *
1182
1349
  * `layout` determines the on-disk shape:
1183
1350
  * - `"bulk"` — a single object at `path`. `entries` is absent.
@@ -1185,34 +1352,40 @@ export interface PublishResult {
1185
1352
  * separate object at `{path}/{sanitizedKey}.json`. `entries` inlines
1186
1353
  * the catalog so consumers can render drill-down states without a
1187
1354
  * second list call.
1355
+ *
1356
+ * D0033/W0049 extensions (optional — legacy manifests parse without them):
1357
+ * - `truncated` on the bulk row indicates the single-object body was capped.
1358
+ * - `preview` on the bulk row carries a descriptor-typed summary for list
1359
+ * views; wiring lands in W0051.
1188
1360
  */
1189
1361
  export interface ArtifactRef {
1190
- store: "gcs";
1362
+ store: "gcs" | "local";
1363
+ /**
1364
+ * GCS bucket name for `store: "gcs"`; absolute rootDir path for
1365
+ * `store: "local"`. Kept as a single field so callers iterating
1366
+ * manifest entries don't need to branch on `store` just to read the
1367
+ * storage container.
1368
+ */
1191
1369
  bucket: string;
1192
1370
  path: string;
1193
1371
  bytes?: number;
1194
1372
  entryCount?: number;
1195
1373
  layout: "bulk" | "per-entry";
1196
- entries?: {
1197
- key: string;
1198
- bytes: number;
1199
- }[];
1374
+ entries?: ArtifactRefEntry[];
1375
+ truncated?: boolean;
1376
+ preview?: unknown;
1200
1377
  }
1201
1378
  /**
1202
1379
  * Catalog of artifact refs produced by a single pipeline run.
1203
1380
  *
1204
1381
  * Lives on `RunManifest.artifacts` (source of truth in GCS) and is
1205
1382
  * snapshotted onto `Report.artifactManifest` at publish time.
1383
+ *
1384
+ * Derived from `ArtifactType` so adding a descriptor to the registry
1385
+ * automatically admits it to the manifest catalog — drift between the
1386
+ * two becomes a compile error (W0049 review finding C1).
1206
1387
  */
1207
- export interface ArtifactManifest {
1208
- testOutputs?: ArtifactRef;
1209
- renderedPrompts?: ArtifactRef;
1210
- rawResults?: ArtifactRef;
1211
- graderPrompts?: ArtifactRef;
1212
- taskDefinitions?: ArtifactRef;
1213
- evalResults?: ArtifactRef;
1214
- traces?: ArtifactRef;
1215
- }
1388
+ export type ArtifactManifest = Partial<Record<ArtifactType, ArtifactRef>>;
1216
1389
  /** A published evaluation report — the atomic unit of the report store */
1217
1390
  export interface Report {
1218
1391
  /**
@@ -1232,8 +1405,14 @@ export interface Report {
1232
1405
  id: ReportId;
1233
1406
  /** What produced this report */
1234
1407
  provenance: ReportProvenance;
1235
- /** The full score summary */
1236
- summary: ScoreSummary;
1408
+ /**
1409
+ * The slim published summary. Inlined prose fields (grader reasons, full
1410
+ * failure-mode text, gap prose, agent-behavior arrays) moved to external
1411
+ * artifacts in W0051; see `ReportSummary` for the retained shape and
1412
+ * `docs/decisions/D0033-unified-run-anchored-artifact-capture.md` § M7 for
1413
+ * the full migration table.
1414
+ */
1415
+ summary: ReportSummary;
1237
1416
  /** Optional human-supplied label */
1238
1417
  tag?: string;
1239
1418
  /** Auto-generated descriptive title for discoverability and sharing */
@@ -120,10 +120,6 @@ function mapEvalConfigToResolvedConfig(config, rootDir) {
120
120
  allowedOrigins: config.allowedOrigins,
121
121
  searchMode: config.searchMode ?? "open",
122
122
  concurrency: config.concurrency,
123
- captureEnabled: false,
124
- captureDir: undefined,
125
- captureCompress: true,
126
- captureExtras: true,
127
123
  remote: false,
128
124
  apiUrl: "https://ailf-api.sanity.build",
129
125
  presets: config.presets,
@@ -0,0 +1,50 @@
1
+ /**
2
+ * accumulating-artifact-writer.ts
3
+ *
4
+ * Decorator that wraps any `ArtifactWriter` and accumulates every
5
+ * successful `emit()` / `appendNdjson()` return into a run-scoped
6
+ * manifest slice. FinalizeRunStep reads the accumulator at the end of
7
+ * a pipeline and writes a `runs/{runId}/manifest.json` populated with
8
+ * one entry per produced artifact type.
9
+ *
10
+ * Before W0051 revisit: only `calculate-scores-step` registered its
11
+ * ref (for `testOutputs`) into `state.artifactRefs`; every other
12
+ * producer discarded the returned ref. The result was empty
13
+ * `Report.artifactManifest` fields and no per-entry preview lookup
14
+ * for Studio hooks. Wrapping the writer at the composition-root level
15
+ * closes that gap without per-producer bookkeeping.
16
+ *
17
+ * Merging rules (per type):
18
+ * - `bulk`: last-writer-wins. A pipeline that emits the same bulk
19
+ * artifact twice overwrites — matches GCS semantics.
20
+ * - `per-entry`: entries accumulate into a keyed map. A later emit
21
+ * at the same `entries[].key` replaces the earlier one.
22
+ *
23
+ * The decorator holds no disk state; the `_resetAccumulated()` hook is
24
+ * for unit tests that rerun emit sequences within a single writer.
25
+ *
26
+ * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M5)
27
+ */
28
+ import type { ArtifactEntry, ArtifactManifest, ArtifactRef, ArtifactType, ArtifactWriter, AssociationValues, RunId, RunManifest } from "../_vendor/ailf-core/index.d.ts";
29
+ export declare class AccumulatingArtifactWriter implements ArtifactWriter {
30
+ /**
31
+ * Exposed so composition-root tests can assert on the underlying backend
32
+ * (LocalFilesystemArtifactWriter, FanoutArtifactWriter, etc.) without
33
+ * plumbing a separate accessor. Treat as read-only.
34
+ */
35
+ readonly inner: ArtifactWriter;
36
+ private readonly accumulated;
37
+ constructor(inner: ArtifactWriter);
38
+ /** Snapshot of every ref produced thus far, keyed by artifact type. */
39
+ getAccumulatedArtifactRefs(): ArtifactManifest;
40
+ /** Test-only. Clears accumulated refs without touching the inner writer. */
41
+ _resetAccumulated(): void;
42
+ emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
43
+ appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
44
+ writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
45
+ /** @deprecated — forwarded to the inner writer without accumulation. */
46
+ writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
47
+ /** @deprecated — forwarded to the inner writer without accumulation. */
48
+ writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
49
+ private mergeRef;
50
+ }