@sanity/ailf 2.8.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
  2. package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
  3. package/dist/_vendor/ailf-core/artifact-registry.d.ts +124 -23
  4. package/dist/_vendor/ailf-core/artifact-registry.js +724 -63
  5. package/dist/_vendor/ailf-core/index.d.ts +2 -1
  6. package/dist/_vendor/ailf-core/index.js +2 -1
  7. package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +59 -20
  8. package/dist/_vendor/ailf-core/ports/artifact-writer.js +33 -10
  9. package/dist/_vendor/ailf-core/ports/context.d.ts +21 -2
  10. package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
  11. package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
  12. package/dist/_vendor/ailf-core/services/index.js +1 -0
  13. package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
  14. package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
  15. package/dist/_vendor/ailf-core/types/branded-ids.d.ts +33 -0
  16. package/dist/_vendor/ailf-core/types/index.d.ts +202 -23
  17. package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
  18. package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
  19. package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +17 -4
  20. package/dist/artifact-capture/api-gateway-artifact-writer.js +58 -7
  21. package/dist/artifact-capture/emit-file.d.ts +28 -0
  22. package/dist/artifact-capture/emit-file.js +56 -0
  23. package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
  24. package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
  25. package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
  26. package/dist/artifact-capture/filesystem-collector.js +48 -23
  27. package/dist/artifact-capture/gcs-artifact-writer.d.ts +40 -3
  28. package/dist/artifact-capture/gcs-artifact-writer.js +238 -14
  29. package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
  30. package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
  31. package/dist/commands/explain-handler.js +4 -0
  32. package/dist/commands/pipeline-action.d.ts +5 -0
  33. package/dist/commands/pipeline-action.js +56 -5
  34. package/dist/commands/pipeline.d.ts +4 -0
  35. package/dist/commands/pipeline.js +6 -2
  36. package/dist/commands/publish.js +4 -1
  37. package/dist/composition-root.d.ts +13 -10
  38. package/dist/composition-root.js +74 -20
  39. package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
  40. package/dist/orchestration/pipeline-orchestrator.js +41 -30
  41. package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
  42. package/dist/orchestration/steps/calculate-scores-step.js +19 -19
  43. package/dist/orchestration/steps/callback-step.d.ts +1 -1
  44. package/dist/orchestration/steps/callback-step.js +6 -4
  45. package/dist/orchestration/steps/compare-step.d.ts +1 -1
  46. package/dist/orchestration/steps/compare-step.js +4 -2
  47. package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
  48. package/dist/orchestration/steps/discovery-report-step.js +4 -1
  49. package/dist/orchestration/steps/fetch-docs-step.js +9 -15
  50. package/dist/orchestration/steps/finalize-run-step.js +21 -7
  51. package/dist/orchestration/steps/gap-analysis-step.js +34 -6
  52. package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
  53. package/dist/orchestration/steps/generate-configs-step.js +11 -11
  54. package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
  55. package/dist/orchestration/steps/publish-report-step.js +24 -19
  56. package/dist/orchestration/steps/readiness-step.d.ts +1 -1
  57. package/dist/orchestration/steps/readiness-step.js +4 -1
  58. package/dist/orchestration/steps/report-step.d.ts +1 -1
  59. package/dist/orchestration/steps/report-step.js +6 -3
  60. package/dist/orchestration/steps/run-eval-step.js +14 -9
  61. package/dist/pipeline/compare.d.ts +2 -2
  62. package/dist/pipeline/emit-eval-results.d.ts +38 -0
  63. package/dist/pipeline/emit-eval-results.js +100 -0
  64. package/package.json +1 -1
@@ -20,4 +20,5 @@ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePrici
20
20
  export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
21
21
  export { env } from "./env-helper.js";
22
22
  export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
23
- export { NoOpArtifactWriter } from "./ports/artifact-writer.js";
23
+ export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
24
+ export { assoc, type AssocContext } from "./artifact-capture/association.js";
@@ -22,4 +22,5 @@ export * from "./artifact-registry.js";
22
22
  export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
23
23
  export { env } from "./env-helper.js";
24
24
  export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
25
- export { NoOpArtifactWriter } from "./ports/artifact-writer.js";
25
+ export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
26
+ export { assoc } from "./artifact-capture/association.js";
@@ -1,26 +1,35 @@
1
1
  /**
2
2
  * Port: ArtifactWriter — writes run artifacts + the run manifest to external storage.
3
3
  *
4
- * Replaces the older `ArtifactUploader` port from D0030. Differences:
5
- * - Paths anchor to `RunId` (not `ReportId`) via the registry's `objectPath`.
6
- * - Supports both `"bulk"` and `"per-entry"` layouts.
7
- * - A dedicated `writeManifest()` method for the run manifest at
8
- * `runs/{runId}/manifest.json`.
4
+ * D0033 / W0049 unifies the writer API around a single caller-facing method:
9
5
  *
10
- * Producer steps call writer methods directly with the artifact type from
11
- * `ARTIFACT_REGISTRY`. Path construction, schema validation, and entry-key
12
- * sanitization live in the registry, not the call site.
6
+ * - `emit(type, association, payload)` — the canonical write. Dispatch on
7
+ * `descriptor.layout` is internal; callers never pick a shape.
8
+ * - `appendNdjson(type, association, rows)` — streaming-append variant used
9
+ * only by the `traces` artifact. Semantics differ from `emit` (repeated
10
+ * append vs. single-shot write) so it gets its own method rather than an
11
+ * overload.
12
+ * - `writeManifest(runId, manifest)` — writes the run manifest at
13
+ * `runs/{runId}/manifest.json`.
14
+ * - `writeBulk` / `writePerEntry` — @deprecated legacy surface retained for
15
+ * producer code that has not migrated to `emit`. Removal scheduled for
16
+ * W0052 (see `docs/decisions/D0033-unified-run-anchored-artifact-capture.md`).
13
17
  *
14
18
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
19
+ * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
15
20
  * @see packages/core/src/artifact-registry.ts
16
21
  */
17
22
  import type { ArtifactType } from "../artifact-registry.js";
18
- import type { RunId } from "../types/branded-ids.js";
23
+ import type { AssociationValues, RunId } from "../types/branded-ids.js";
19
24
  import type { ArtifactRef, RunManifest } from "../types/index.js";
20
25
  /**
21
26
  * An entry in a per-entry upload. The `key` is the wire-format identifier
22
27
  * (e.g. `{taskId}::{modelId}` for testOutputs); the writer sanitizes it into
23
- * the filename using the registry's `parseEntryKey`.
28
+ * the filename using the registry's `parseEntryKey` or direct path building.
29
+ *
30
+ * @deprecated Use `ArtifactWriter.emit()` with `AssociationValues` instead.
31
+ * This type is retained for producers still on the legacy
32
+ * `writeBulk`/`writePerEntry` path; removal scheduled for W0052.
24
33
  */
25
34
  export interface ArtifactEntry<TData = unknown> {
26
35
  key: string;
@@ -28,29 +37,59 @@ export interface ArtifactEntry<TData = unknown> {
28
37
  }
29
38
  export interface ArtifactWriter {
30
39
  /**
31
- * Write a bulk artifact one JSON object per (runId, type).
40
+ * Write a single artifact. The descriptor's `layout` determines whether
41
+ * this produces a bulk object (`runs/{runId}/{slug}.{ext}`) or a per-entry
42
+ * object (`runs/{runId}/{slug}/{entryKey}.{ext}`).
32
43
  *
33
- * @returns An `ArtifactRef` pointing at `runs/{runId}/{slug}.json`, or
34
- * `null` when upload is skipped or fails (P5: non-blocking).
44
+ * For per-entry descriptors, `association` must carry the axis values the
45
+ * descriptor's `formatEntryKey` consumes. For bulk descriptors, only `run`
46
+ * is required.
47
+ *
48
+ * @returns `ArtifactRef` on success, or `null` when upload is skipped or
49
+ * fails (P5: non-blocking).
35
50
  */
36
- writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
51
+ emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
37
52
  /**
38
- * Write a per-entry artifact one JSON object per entry, all under
39
- * `runs/{runId}/{slug}/`.
53
+ * Append NDJSON rows to a per-entry artifact. Used only by `traces`, whose
54
+ * per-entry payload is itself unbounded during production. The writer
55
+ * buffers rows keyed on (type, entryKey) and flushes to numbered part
56
+ * objects; the backend composes parts into the final object at trial
57
+ * completion.
40
58
  *
41
- * The returned `ArtifactRef.entries` inlines the catalog so consumers
42
- * can render drill-down state without a second listing call.
59
+ * @throws NotImplementedError on writers that don't support streaming
60
+ * appends (e.g. the API-gateway-backed writer; see W0052).
43
61
  */
44
- writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
62
+ appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
45
63
  /**
46
64
  * Write the run manifest to `runs/{runId}/manifest.json`. Single-writer
47
65
  * per run; subsequent publishes may rewrite to append `reportIds[]`.
48
66
  */
49
67
  writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
68
+ /**
69
+ * @deprecated Use `emit()` with `AssociationValues` instead. Retained for
70
+ * producers still on the legacy path; removal scheduled for W0052.
71
+ */
72
+ writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
73
+ /**
74
+ * @deprecated Use `emit()` per entry instead. Retained for producers still
75
+ * on the legacy path; removal scheduled for W0052.
76
+ */
77
+ writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
78
+ }
79
+ /**
80
+ * Thrown by writers that can't satisfy a method — e.g. an
81
+ * `ApiGatewayArtifactWriter` cannot implement `appendNdjson` until the batch
82
+ * signing endpoint (W0052) lands. Callers should treat this as an explicit
83
+ * failure rather than a silent no-op so the gap surfaces in logs.
84
+ */
85
+ export declare class NotImplementedError extends Error {
86
+ constructor(message: string);
50
87
  }
51
88
  /** No-op writer — every method returns null. Used when no storage is configured. */
52
89
  export declare class NoOpArtifactWriter implements ArtifactWriter {
90
+ emit(): Promise<null>;
91
+ appendNdjson(): Promise<null>;
92
+ writeManifest(): Promise<null>;
53
93
  writeBulk(): Promise<null>;
54
94
  writePerEntry(): Promise<null>;
55
- writeManifest(): Promise<null>;
56
95
  }
@@ -1,28 +1,51 @@
1
1
  /**
2
2
  * Port: ArtifactWriter — writes run artifacts + the run manifest to external storage.
3
3
  *
4
- * Replaces the older `ArtifactUploader` port from D0030. Differences:
5
- * - Paths anchor to `RunId` (not `ReportId`) via the registry's `objectPath`.
6
- * - Supports both `"bulk"` and `"per-entry"` layouts.
7
- * - A dedicated `writeManifest()` method for the run manifest at
8
- * `runs/{runId}/manifest.json`.
4
+ * D0033 / W0049 unifies the writer API around a single caller-facing method:
9
5
  *
10
- * Producer steps call writer methods directly with the artifact type from
11
- * `ARTIFACT_REGISTRY`. Path construction, schema validation, and entry-key
12
- * sanitization live in the registry, not the call site.
6
+ * - `emit(type, association, payload)` — the canonical write. Dispatch on
7
+ * `descriptor.layout` is internal; callers never pick a shape.
8
+ * - `appendNdjson(type, association, rows)` — streaming-append variant used
9
+ * only by the `traces` artifact. Semantics differ from `emit` (repeated
10
+ * append vs. single-shot write) so it gets its own method rather than an
11
+ * overload.
12
+ * - `writeManifest(runId, manifest)` — writes the run manifest at
13
+ * `runs/{runId}/manifest.json`.
14
+ * - `writeBulk` / `writePerEntry` — @deprecated legacy surface retained for
15
+ * producer code that has not migrated to `emit`. Removal scheduled for
16
+ * W0052 (see `docs/decisions/D0033-unified-run-anchored-artifact-capture.md`).
13
17
  *
14
18
  * @see docs/decisions/D0032-run-anchored-artifact-store.md
19
+ * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
15
20
  * @see packages/core/src/artifact-registry.ts
16
21
  */
22
+ /**
23
+ * Thrown by writers that can't satisfy a method — e.g. an
24
+ * `ApiGatewayArtifactWriter` cannot implement `appendNdjson` until the batch
25
+ * signing endpoint (W0052) lands. Callers should treat this as an explicit
26
+ * failure rather than a silent no-op so the gap surfaces in logs.
27
+ */
28
+ export class NotImplementedError extends Error {
29
+ constructor(message) {
30
+ super(message);
31
+ this.name = "NotImplementedError";
32
+ }
33
+ }
17
34
  /** No-op writer — every method returns null. Used when no storage is configured. */
18
35
  export class NoOpArtifactWriter {
19
- async writeBulk() {
36
+ async emit() {
20
37
  return null;
21
38
  }
22
- async writePerEntry() {
39
+ async appendNdjson() {
23
40
  return null;
24
41
  }
25
42
  async writeManifest() {
26
43
  return null;
27
44
  }
45
+ async writeBulk() {
46
+ return null;
47
+ }
48
+ async writePerEntry() {
49
+ return null;
50
+ }
28
51
  }
@@ -161,6 +161,20 @@ export interface ResolvedConfig {
161
161
  captureCompress?: boolean;
162
162
  /** Whether to include mode-specific extra artifacts (default: true) */
163
163
  captureExtras?: boolean;
164
+ /**
165
+ * D0033 / W0049 — the unified artifact surface. Wired into the writer in
166
+ * W0050; consumed by the writer factory to decide whether to attach a
167
+ * writer at all, where it writes to, and what to skip. These fields are
168
+ * additive and do not replace the legacy `capture*` fields until W0052.
169
+ */
170
+ /** Disables all artifact writers — `--no-artifacts`. */
171
+ artifactsDisabled?: boolean;
172
+ /** Root directory for local artifact output — `--artifacts-dir`. */
173
+ artifactsDir?: string;
174
+ /** Run writers in dry-run mode — `--artifacts-dry-run`. */
175
+ artifactsDryRun?: boolean;
176
+ /** Artifact types to skip — `--capture-exclude` (comma-separated on the command line). */
177
+ artifactsExclude?: readonly string[];
164
178
  /** GCS bucket for capture upload (enables GCS decorator when set) */
165
179
  captureGcsBucket?: string;
166
180
  /** GCS object prefix for capture uploads (default: "captures/") */
@@ -198,8 +212,13 @@ export interface ResolvedConfig {
198
212
  * Created per-test by createTestContext().
199
213
  */
200
214
  export interface AppContext {
201
- /** Artifact writer — writes run artifacts + manifest to GCS (D0032) */
202
- readonly artifactWriter?: ArtifactWriter;
215
+ /**
216
+ * Artifact writer — writes run artifacts + manifest to local fs (D0033
217
+ * M4: always on) and optionally to GCS (D0032, layered via
218
+ * FanoutArtifactWriter). Required post-W0050 — the composition root
219
+ * always provides a writer (NoOpArtifactWriter when `--no-artifacts`).
220
+ */
221
+ readonly artifactWriter: ArtifactWriter;
203
222
  /** Evaluation caching (filesystem + optional Content Lake fallback) */
204
223
  readonly cache?: CacheStore;
205
224
  /** Artifact capture collector (no-op when --capture is not set) */
@@ -67,10 +67,10 @@ export declare const FeatureSchema: z.ZodObject<{
67
67
  id: z.ZodString;
68
68
  name: z.ZodString;
69
69
  priority: z.ZodEnum<{
70
- critical: "critical";
71
- high: "high";
72
- medium: "medium";
73
70
  low: "low";
71
+ medium: "medium";
72
+ high: "high";
73
+ critical: "critical";
74
74
  }>;
75
75
  sections: z.ZodArray<z.ZodString>;
76
76
  status: z.ZodEnum<{
@@ -91,10 +91,10 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
91
91
  id: z.ZodString;
92
92
  name: z.ZodString;
93
93
  priority: z.ZodEnum<{
94
- critical: "critical";
95
- high: "high";
96
- medium: "medium";
97
94
  low: "low";
95
+ medium: "medium";
96
+ high: "high";
97
+ critical: "critical";
98
98
  }>;
99
99
  sections: z.ZodArray<z.ZodString>;
100
100
  status: z.ZodEnum<{
@@ -11,3 +11,4 @@ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadat
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
12
  export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
13
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
14
+ export { buildSlimReportSummary } from "./slim-report-summary.js";
@@ -11,3 +11,4 @@ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadat
11
11
  export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
12
12
  export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
13
13
  export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
14
+ export { buildSlimReportSummary } from "./slim-report-summary.js";
@@ -0,0 +1,31 @@
1
+ /**
2
+ * slim-report-summary.ts
3
+ *
4
+ * Pure transformer: a full `ScoreSummary` (the shape of `score-summary.json`)
5
+ * into the slim `ReportSummary` that Phase C of D0033 publishes on the
6
+ * Report Content-Lake document. Inlined prose and long arrays are replaced
7
+ * with `id` references to external artifacts (graderJudgments, failureModes,
8
+ * gapReport, traces).
9
+ *
10
+ * **The `id` principle**: every slim reference carries the manifest entry
11
+ * key of the external artifact it points at, produced by the descriptor's
12
+ * own `formatEntryKey(axes)`. Studio looks the id up in `Report.artifactManifest`
13
+ * to get the preview, and hydrates the full payload on drill-down.
14
+ *
15
+ * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M7)
16
+ * @see docs/work-items/W0051-report-slim-down-manifest-preview-hooks.json
17
+ */
18
+ import type { ReportSummary, ScoreSummary } from "../types/index.js";
19
+ /**
20
+ * Transform a full pipeline `ScoreSummary` into its slim Report counterpart.
21
+ *
22
+ * Each of the four heavy fields is reshaped independently; everything else
23
+ * flows through untouched via structural spread. Pure function — the input
24
+ * summary is not mutated.
25
+ *
26
+ * @param mode The evaluation mode (used to populate the `mode` axis on slim
27
+ * judgment / failure-mode ids). `score-summary.json` carries
28
+ * the mode in `evaluationMode` but the publisher supplies it
29
+ * explicitly to keep the helper independent of that field.
30
+ */
31
+ export declare function buildSlimReportSummary(summary: ScoreSummary, mode: string): ReportSummary;
@@ -0,0 +1,217 @@
1
+ /**
2
+ * slim-report-summary.ts
3
+ *
4
+ * Pure transformer: a full `ScoreSummary` (the shape of `score-summary.json`)
5
+ * into the slim `ReportSummary` that Phase C of D0033 publishes on the
6
+ * Report Content-Lake document. Inlined prose and long arrays are replaced
7
+ * with `id` references to external artifacts (graderJudgments, failureModes,
8
+ * gapReport, traces).
9
+ *
10
+ * **The `id` principle**: every slim reference carries the manifest entry
11
+ * key of the external artifact it points at, produced by the descriptor's
12
+ * own `formatEntryKey(axes)`. Studio looks the id up in `Report.artifactManifest`
13
+ * to get the preview, and hydrates the full payload on drill-down.
14
+ *
15
+ * @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M7)
16
+ * @see docs/work-items/W0051-report-slim-down-manifest-preview-hooks.json
17
+ */
18
+ import { ARTIFACT_REGISTRY } from "../artifact-registry.js";
19
+ /**
20
+ * Transform a full pipeline `ScoreSummary` into its slim Report counterpart.
21
+ *
22
+ * Each of the four heavy fields is reshaped independently; everything else
23
+ * flows through untouched via structural spread. Pure function — the input
24
+ * summary is not mutated.
25
+ *
26
+ * @param mode The evaluation mode (used to populate the `mode` axis on slim
27
+ * judgment / failure-mode ids). `score-summary.json` carries
28
+ * the mode in `evaluationMode` but the publisher supplies it
29
+ * explicitly to keep the helper independent of that field.
30
+ */
31
+ export function buildSlimReportSummary(summary, mode) {
32
+ const { agentBehavior: fullAgentBehavior, failureModes: fullFailureModes, lowScoringJudgments: fullJudgments, recommendations: fullRecommendations, ...rest } = summary;
33
+ return {
34
+ ...rest,
35
+ ...(fullJudgments
36
+ ? { lowScoringJudgments: slimJudgments(fullJudgments, mode) }
37
+ : {}),
38
+ ...(fullFailureModes
39
+ ? { failureModes: slimFailureModes(fullFailureModes, mode) }
40
+ : {}),
41
+ ...(fullRecommendations
42
+ ? { recommendations: slimRecommendations(fullRecommendations) }
43
+ : {}),
44
+ ...(fullAgentBehavior
45
+ ? { agentBehavior: slimAgentBehavior(fullAgentBehavior) }
46
+ : {}),
47
+ };
48
+ }
49
+ // ---------------------------------------------------------------------------
50
+ // Judgments — axes {mode, task, model, grader}
51
+ // ---------------------------------------------------------------------------
52
+ /**
53
+ * Variant-suffix stripper. Judgments' `taskId` today carries `(gold)` /
54
+ * `(baseline)` suffixes that encode the pipeline mode. We strip the suffix
55
+ * to build the canonical `task` axis value and use the caller-supplied
56
+ * `mode` for the `mode` axis — matching how `formatEntryKey` is computed
57
+ * at producer emit time.
58
+ */
59
+ function splitTaskVariant(taskId) {
60
+ const match = /\s*\((gold|baseline)\)\s*$/i.exec(taskId);
61
+ if (!match)
62
+ return { task: taskId, variant: null };
63
+ return {
64
+ task: taskId.slice(0, match.index).trim(),
65
+ variant: match[1].toLowerCase(),
66
+ };
67
+ }
68
+ function slimJudgments(full, defaultMode) {
69
+ const descriptor = ARTIFACT_REGISTRY.graderJudgments;
70
+ const formatKey = descriptor.formatEntryKey;
71
+ if (!formatKey) {
72
+ throw new Error("slimJudgments: graderJudgments descriptor is missing formatEntryKey");
73
+ }
74
+ return full.map((j) => {
75
+ const { task, variant } = splitTaskVariant(j.taskId);
76
+ // The judgment's task variant overrides the caller-supplied mode when
77
+ // present — a single summary run may carry judgments from both the
78
+ // gold and baseline halves of a literacy run.
79
+ const mode = variant ?? defaultMode;
80
+ const graderId = j.dimension;
81
+ const id = formatKey({
82
+ mode,
83
+ task,
84
+ model: j.modelId,
85
+ grader: graderId,
86
+ });
87
+ const reason = typeof j.reason === "string" ? j.reason : "";
88
+ const reasonPreview = reason.length > 280 ? reason.slice(0, 280) : reason;
89
+ return {
90
+ id,
91
+ taskId: j.taskId,
92
+ modelId: j.modelId,
93
+ graderId,
94
+ // Legacy alias — pre-W0051 Studio readers access `.dimension`.
95
+ // Removed in Slice 6 when those consumers migrate.
96
+ dimension: graderId,
97
+ score: j.score,
98
+ // Inline fallback for offline/cache-skipped runs whose manifest is
99
+ // empty. Truncated to match the graderJudgments descriptor preview
100
+ // schema so either source renders identically at the same cap.
101
+ reasonPreview,
102
+ // Legacy alias — same truncated text under the old name.
103
+ reason: reasonPreview,
104
+ };
105
+ });
106
+ }
107
+ // ---------------------------------------------------------------------------
108
+ // Failure modes — axes {mode, category}, top-N by count
109
+ // ---------------------------------------------------------------------------
110
+ const FAILURE_MODE_TOP_N = 5;
111
+ function slimFailureModes(full, defaultMode) {
112
+ const descriptor = ARTIFACT_REGISTRY.failureModes;
113
+ const formatKey = descriptor.formatEntryKey;
114
+ if (!formatKey) {
115
+ throw new Error("slimFailureModes: failureModes descriptor is missing formatEntryKey");
116
+ }
117
+ const counts = { ...full.summary };
118
+ const nonZero = Object.entries(counts).filter(([, n]) => n > 0);
119
+ nonZero.sort((a, b) => b[1] - a[1]);
120
+ const topTitles = nonZero
121
+ .slice(0, FAILURE_MODE_TOP_N)
122
+ .map(([category, count]) => ({
123
+ id: formatKey({ mode: defaultMode, category }),
124
+ category,
125
+ severity: severityForCount(count),
126
+ title: toTitleCase(category),
127
+ count,
128
+ }));
129
+ return {
130
+ counts,
131
+ topTitles,
132
+ totalJudgments: full.totalJudgments,
133
+ classificationRate: full.classificationRate,
134
+ };
135
+ }
136
+ function severityForCount(count) {
137
+ if (count >= 10)
138
+ return "critical";
139
+ if (count >= 5)
140
+ return "high";
141
+ if (count >= 2)
142
+ return "medium";
143
+ return "low";
144
+ }
145
+ function toTitleCase(id) {
146
+ return id
147
+ .split("-")
148
+ .map((w) => (w.length === 0 ? w : w[0].toUpperCase() + w.slice(1)))
149
+ .join(" ");
150
+ }
151
+ // ---------------------------------------------------------------------------
152
+ // Recommendations — bulk artifact; id is a synthetic area--failureMode composite
153
+ // ---------------------------------------------------------------------------
154
+ const RECOMMENDATION_TOP_N = 3;
155
+ function slimRecommendations(full) {
156
+ const counts = {};
157
+ for (const gap of full.gaps) {
158
+ counts[gap.area] = (counts[gap.area] ?? 0) + 1;
159
+ }
160
+ // Sort by priority descending, break ties by estimatedLift.
161
+ const sorted = [...full.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
162
+ (b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
163
+ const top3 = sorted
164
+ .slice(0, RECOMMENDATION_TOP_N)
165
+ .map((g) => ({
166
+ id: gapId(g),
167
+ area: g.area,
168
+ title: toTitleCase(g.failureMode),
169
+ priority: g.priority,
170
+ }));
171
+ return {
172
+ counts,
173
+ top3,
174
+ totalGaps: full.gaps.length,
175
+ totalPotentialLift: full.totalPotentialLift,
176
+ };
177
+ }
178
+ /**
179
+ * Synthetic composite id for a gap — the gapReport artifact is bulk, so a
180
+ * per-gap manifest entry does not exist. This id lets Studio deduplicate
181
+ * gaps and deep-link within the bulk artifact.
182
+ */
183
+ function gapId(g) {
184
+ return `${sanitizeIdSegment(g.area)}--${sanitizeIdSegment(g.failureMode)}`;
185
+ }
186
+ function sanitizeIdSegment(s) {
187
+ return s
188
+ .trim()
189
+ .toLowerCase()
190
+ .replace(/[^a-z0-9]+/g, "-")
191
+ .replace(/^-+|-+$/g, "");
192
+ }
193
+ // ---------------------------------------------------------------------------
194
+ // Agent behavior — counts + first-N samples per feature
195
+ // ---------------------------------------------------------------------------
196
+ const BEHAVIOR_SAMPLE_N = 5;
197
+ function slimAgentBehavior(full) {
198
+ return full.map((f) => {
199
+ const uniqueQueries = dedupe(f.searchQueries);
200
+ const uniqueSlugs = dedupe(f.docSlugsVisited);
201
+ return {
202
+ feature: f.feature,
203
+ avgDocPagesVisited: f.avgDocPagesVisited,
204
+ avgNetworkTimeMs: f.avgNetworkTimeMs,
205
+ avgSearchesPerformed: f.avgSearchesPerformed,
206
+ tasksWithBehaviorData: f.tasksWithBehaviorData,
207
+ externalDomains: f.externalDomains,
208
+ searchQueriesCount: uniqueQueries.length,
209
+ searchQueriesSample: uniqueQueries.slice(0, BEHAVIOR_SAMPLE_N),
210
+ docSlugsVisitedCount: uniqueSlugs.length,
211
+ docSlugsVisitedSample: uniqueSlugs.slice(0, BEHAVIOR_SAMPLE_N),
212
+ };
213
+ });
214
+ }
215
+ function dedupe(items) {
216
+ return [...new Set(items)];
217
+ }
@@ -62,6 +62,39 @@ export type RubricId = Brand<string, "RubricId">;
62
62
  export type FixtureId = Brand<string, "FixtureId">;
63
63
  /** Unique identifier for a build artifact */
64
64
  export type ArtifactId = Brand<string, "ArtifactId">;
65
+ /**
66
+ * The dimensions an artifact is evidence about. Axes determine where the
67
+ * artifact lives, how its entry key is shaped, and — at module load — whether
68
+ * its declared layout is allowed.
69
+ *
70
+ * Unbounded axes (`task`, `model`, `trial`) force `layout: "per-entry"` at the
71
+ * registry invariant; the other axes are finite enough that bulk is acceptable.
72
+ *
73
+ * `category` identifies a classification bucket for artifacts that partition
74
+ * per-mode (e.g. `failureModes`, one entry per classified failure category —
75
+ * D0033 M7, W0051 Slice 2).
76
+ */
77
+ export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category";
78
+ /**
79
+ * The sanitized, filename-safe identifier for a single per-entry artifact
80
+ * object. Produced by `ArtifactDescriptor.formatEntryKey` and parsed by
81
+ * `parseEntryKey`; not a free-form string at runtime.
82
+ */
83
+ export type EntryKey = Brand<string, "EntryKey">;
84
+ /**
85
+ * The subset of `AssociationAxis` values an artifact carries at write time.
86
+ * Used on the writer API (`emit(type, association, payload)`) and on manifest
87
+ * entries so the axes that identify the entry are self-describing in storage.
88
+ *
89
+ * Canonical axes are strongly-keyed. Descriptors whose per-entry layout is
90
+ * discriminated by a name that isn't a pipeline axis (e.g. `sinkResults` by
91
+ * sink name, `callbackRequest` by callback target) read that identifier from
92
+ * the `name` slot. Keep ad-hoc keys out of the canonical axis list so the
93
+ * module-load invariant (unbounded-axis ⇒ per-entry) stays crisp.
94
+ */
95
+ export type AssociationValues = Partial<Record<AssociationAxis, string | number>> & {
96
+ readonly name?: string;
97
+ };
65
98
  /**
66
99
  * A success result containing a value.
67
100
  */