@sanity/ailf 2.8.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
- package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +124 -23
- package/dist/_vendor/ailf-core/artifact-registry.js +724 -63
- package/dist/_vendor/ailf-core/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +59 -20
- package/dist/_vendor/ailf-core/ports/artifact-writer.js +33 -10
- package/dist/_vendor/ailf-core/ports/context.d.ts +21 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +33 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +202 -23
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +17 -4
- package/dist/artifact-capture/api-gateway-artifact-writer.js +58 -7
- package/dist/artifact-capture/emit-file.d.ts +28 -0
- package/dist/artifact-capture/emit-file.js +56 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
- package/dist/artifact-capture/filesystem-collector.js +48 -23
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +40 -3
- package/dist/artifact-capture/gcs-artifact-writer.js +238 -14
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
- package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
- package/dist/commands/explain-handler.js +4 -0
- package/dist/commands/pipeline-action.d.ts +5 -0
- package/dist/commands/pipeline-action.js +56 -5
- package/dist/commands/pipeline.d.ts +4 -0
- package/dist/commands/pipeline.js +6 -2
- package/dist/commands/publish.js +4 -1
- package/dist/composition-root.d.ts +13 -10
- package/dist/composition-root.js +74 -20
- package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
- package/dist/orchestration/pipeline-orchestrator.js +41 -30
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
- package/dist/orchestration/steps/calculate-scores-step.js +19 -19
- package/dist/orchestration/steps/callback-step.d.ts +1 -1
- package/dist/orchestration/steps/callback-step.js +6 -4
- package/dist/orchestration/steps/compare-step.d.ts +1 -1
- package/dist/orchestration/steps/compare-step.js +4 -2
- package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
- package/dist/orchestration/steps/discovery-report-step.js +4 -1
- package/dist/orchestration/steps/fetch-docs-step.js +9 -15
- package/dist/orchestration/steps/finalize-run-step.js +21 -7
- package/dist/orchestration/steps/gap-analysis-step.js +34 -6
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
- package/dist/orchestration/steps/generate-configs-step.js +11 -11
- package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
- package/dist/orchestration/steps/publish-report-step.js +24 -19
- package/dist/orchestration/steps/readiness-step.d.ts +1 -1
- package/dist/orchestration/steps/readiness-step.js +4 -1
- package/dist/orchestration/steps/report-step.d.ts +1 -1
- package/dist/orchestration/steps/report-step.js +6 -3
- package/dist/orchestration/steps/run-eval-step.js +14 -9
- package/dist/pipeline/compare.d.ts +2 -2
- package/dist/pipeline/emit-eval-results.d.ts +38 -0
- package/dist/pipeline/emit-eval-results.js +100 -0
- package/package.json +1 -1
|
@@ -20,4 +20,5 @@ export { defineConfig, defineFeatures, defineModeBase, defineModels, definePrici
|
|
|
20
20
|
export type { PricingEntry, PromptEntry, SourceEntry, } from "./config-helpers.js";
|
|
21
21
|
export { env } from "./env-helper.js";
|
|
22
22
|
export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
|
|
23
|
-
export { NoOpArtifactWriter } from "./ports/artifact-writer.js";
|
|
23
|
+
export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
|
|
24
|
+
export { assoc, type AssocContext } from "./artifact-capture/association.js";
|
|
@@ -22,4 +22,5 @@ export * from "./artifact-registry.js";
|
|
|
22
22
|
export { defineConfig, defineFeatures, defineModeBase, defineModels, definePricingTable, definePreset, definePrompts, defineRubrics, defineSchedules, defineSinks, defineSources, defineTask, defineThresholds, } from "./config-helpers.js";
|
|
23
23
|
export { env } from "./env-helper.js";
|
|
24
24
|
export { NoOpArtifactCollector } from "./artifact-capture/noop-collector.js";
|
|
25
|
-
export { NoOpArtifactWriter } from "./ports/artifact-writer.js";
|
|
25
|
+
export { NoOpArtifactWriter, NotImplementedError, } from "./ports/artifact-writer.js";
|
|
26
|
+
export { assoc } from "./artifact-capture/association.js";
|
|
@@ -1,26 +1,35 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Port: ArtifactWriter — writes run artifacts + the run manifest to external storage.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* - Paths anchor to `RunId` (not `ReportId`) via the registry's `objectPath`.
|
|
6
|
-
* - Supports both `"bulk"` and `"per-entry"` layouts.
|
|
7
|
-
* - A dedicated `writeManifest()` method for the run manifest at
|
|
8
|
-
* `runs/{runId}/manifest.json`.
|
|
4
|
+
* D0033 / W0049 unifies the writer API around a single canonical write method (`emit`). The caller-facing surface is:
|
|
9
5
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
6
|
+
* - `emit(type, association, payload)` — the canonical write. Dispatch on
|
|
7
|
+
* `descriptor.layout` is internal; callers never pick a shape.
|
|
8
|
+
* - `appendNdjson(type, association, rows)` — streaming-append variant used
|
|
9
|
+
* only by the `traces` artifact. Semantics differ from `emit` (repeated
|
|
10
|
+
* append vs. single-shot write) so it gets its own method rather than an
|
|
11
|
+
* overload.
|
|
12
|
+
* - `writeManifest(runId, manifest)` — writes the run manifest at
|
|
13
|
+
* `runs/{runId}/manifest.json`.
|
|
14
|
+
* - `writeBulk` / `writePerEntry` — @deprecated legacy surface retained for
|
|
15
|
+
* producer code that has not migrated to `emit`. Removal scheduled for
|
|
16
|
+
* W0052 (see `docs/decisions/D0033-unified-run-anchored-artifact-capture.md`).
|
|
13
17
|
*
|
|
14
18
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
19
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
|
|
15
20
|
* @see packages/core/src/artifact-registry.ts
|
|
16
21
|
*/
|
|
17
22
|
import type { ArtifactType } from "../artifact-registry.js";
|
|
18
|
-
import type { RunId } from "../types/branded-ids.js";
|
|
23
|
+
import type { AssociationValues, RunId } from "../types/branded-ids.js";
|
|
19
24
|
import type { ArtifactRef, RunManifest } from "../types/index.js";
|
|
20
25
|
/**
|
|
21
26
|
* An entry in a per-entry upload. The `key` is the wire-format identifier
|
|
22
27
|
* (e.g. `{taskId}::{modelId}` for testOutputs); the writer sanitizes it into
|
|
23
|
-
* the filename using the registry's `parseEntryKey
|
|
28
|
+
* the filename using the registry's `parseEntryKey` or direct path building.
|
|
29
|
+
*
|
|
30
|
+
* @deprecated Use `ArtifactWriter.emit()` with `AssociationValues` instead.
|
|
31
|
+
* This type is retained for producers still on the legacy
|
|
32
|
+
* `writeBulk`/`writePerEntry` path; removal scheduled for W0052.
|
|
24
33
|
*/
|
|
25
34
|
export interface ArtifactEntry<TData = unknown> {
|
|
26
35
|
key: string;
|
|
@@ -28,29 +37,59 @@ export interface ArtifactEntry<TData = unknown> {
|
|
|
28
37
|
}
|
|
29
38
|
export interface ArtifactWriter {
|
|
30
39
|
/**
|
|
31
|
-
* Write a
|
|
40
|
+
* Write a single artifact. The descriptor's `layout` determines whether
|
|
41
|
+
* this produces a bulk object (`runs/{runId}/{slug}.{ext}`) or a per-entry
|
|
42
|
+
* object (`runs/{runId}/{slug}/{entryKey}.{ext}`).
|
|
32
43
|
*
|
|
33
|
-
*
|
|
34
|
-
*
|
|
44
|
+
* For per-entry descriptors, `association` must carry the axis values the
|
|
45
|
+
* descriptor's `formatEntryKey` consumes. For bulk descriptors, only `run`
|
|
46
|
+
* is required.
|
|
47
|
+
*
|
|
48
|
+
* @returns `ArtifactRef` on success, or `null` when upload is skipped or
|
|
49
|
+
* fails (P5: non-blocking).
|
|
35
50
|
*/
|
|
36
|
-
|
|
51
|
+
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
37
52
|
/**
|
|
38
|
-
*
|
|
39
|
-
*
|
|
53
|
+
* Append NDJSON rows to a per-entry artifact. Used only by `traces`, whose
|
|
54
|
+
* per-entry payload is itself unbounded during production. The writer
|
|
55
|
+
* buffers rows keyed on (type, entryKey) and flushes to numbered part
|
|
56
|
+
* objects; the backend composes parts into the final object at trial
|
|
57
|
+
* completion.
|
|
40
58
|
*
|
|
41
|
-
*
|
|
42
|
-
*
|
|
59
|
+
* @throws NotImplementedError on writers that don't support streaming
|
|
60
|
+
* appends (e.g. the API-gateway-backed writer; see W0052).
|
|
43
61
|
*/
|
|
44
|
-
|
|
62
|
+
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
45
63
|
/**
|
|
46
64
|
* Write the run manifest to `runs/{runId}/manifest.json`. Single-writer
|
|
47
65
|
* per run; subsequent publishes may rewrite to append `reportIds[]`.
|
|
48
66
|
*/
|
|
49
67
|
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
68
|
+
/**
|
|
69
|
+
* @deprecated Use `emit()` with `AssociationValues` instead. Retained for
|
|
70
|
+
* producers still on the legacy path; removal scheduled for W0052.
|
|
71
|
+
*/
|
|
72
|
+
writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
|
|
73
|
+
/**
|
|
74
|
+
* @deprecated Use `emit()` per entry instead. Retained for producers still
|
|
75
|
+
* on the legacy path; removal scheduled for W0052.
|
|
76
|
+
*/
|
|
77
|
+
writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
|
|
78
|
+
}
|
|
79
|
+
/**
|
|
80
|
+
* Thrown by writers that can't satisfy a method — e.g. an
|
|
81
|
+
* `ApiGatewayArtifactWriter` cannot implement `appendNdjson` until the batch
|
|
82
|
+
* signing endpoint (W0052) lands. Callers should treat this as an explicit
|
|
83
|
+
* failure rather than a silent no-op so the gap surfaces in logs.
|
|
84
|
+
*/
|
|
85
|
+
export declare class NotImplementedError extends Error {
|
|
86
|
+
constructor(message: string);
|
|
50
87
|
}
|
|
51
88
|
/** No-op writer — every method returns null. Used when no storage is configured. */
|
|
52
89
|
export declare class NoOpArtifactWriter implements ArtifactWriter {
|
|
90
|
+
emit(): Promise<null>;
|
|
91
|
+
appendNdjson(): Promise<null>;
|
|
92
|
+
writeManifest(): Promise<null>;
|
|
53
93
|
writeBulk(): Promise<null>;
|
|
54
94
|
writePerEntry(): Promise<null>;
|
|
55
|
-
writeManifest(): Promise<null>;
|
|
56
95
|
}
|
|
@@ -1,28 +1,51 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Port: ArtifactWriter — writes run artifacts + the run manifest to external storage.
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* - Paths anchor to `RunId` (not `ReportId`) via the registry's `objectPath`.
|
|
6
|
-
* - Supports both `"bulk"` and `"per-entry"` layouts.
|
|
7
|
-
* - A dedicated `writeManifest()` method for the run manifest at
|
|
8
|
-
* `runs/{runId}/manifest.json`.
|
|
4
|
+
* D0033 / W0049 unifies the writer API around a single canonical write method (`emit`). The caller-facing surface is:
|
|
9
5
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
6
|
+
* - `emit(type, association, payload)` — the canonical write. Dispatch on
|
|
7
|
+
* `descriptor.layout` is internal; callers never pick a shape.
|
|
8
|
+
* - `appendNdjson(type, association, rows)` — streaming-append variant used
|
|
9
|
+
* only by the `traces` artifact. Semantics differ from `emit` (repeated
|
|
10
|
+
* append vs. single-shot write) so it gets its own method rather than an
|
|
11
|
+
* overload.
|
|
12
|
+
* - `writeManifest(runId, manifest)` — writes the run manifest at
|
|
13
|
+
* `runs/{runId}/manifest.json`.
|
|
14
|
+
* - `writeBulk` / `writePerEntry` — @deprecated legacy surface retained for
|
|
15
|
+
* producer code that has not migrated to `emit`. Removal scheduled for
|
|
16
|
+
* W0052 (see `docs/decisions/D0033-unified-run-anchored-artifact-capture.md`).
|
|
13
17
|
*
|
|
14
18
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
19
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
|
|
15
20
|
* @see packages/core/src/artifact-registry.ts
|
|
16
21
|
*/
|
|
22
|
+
/**
 * Error type for writer methods that a given writer does not support.
 *
 * The instance's `name` is set to `"NotImplementedError"`, so it can be told
 * apart from a plain `Error` by name as well as by `instanceof`.
 * The message is passed through to `Error` unchanged.
 */
export class NotImplementedError extends Error {
    /**
     * @param message Human-readable description of the unsupported operation.
     */
    constructor(message) {
        super(message);
        // Error's inherited name is "Error"; replace it with this class's own label.
        this.name = "NotImplementedError";
    }
}
|
|
17
34
|
/** No-op writer — every method returns null. Used when no storage is configured. */
|
|
18
35
|
export class NoOpArtifactWriter {
|
|
19
|
-
async
|
|
36
|
+
async emit() {
|
|
20
37
|
return null;
|
|
21
38
|
}
|
|
22
|
-
async
|
|
39
|
+
async appendNdjson() {
|
|
23
40
|
return null;
|
|
24
41
|
}
|
|
25
42
|
async writeManifest() {
|
|
26
43
|
return null;
|
|
27
44
|
}
|
|
45
|
+
async writeBulk() {
|
|
46
|
+
return null;
|
|
47
|
+
}
|
|
48
|
+
async writePerEntry() {
|
|
49
|
+
return null;
|
|
50
|
+
}
|
|
28
51
|
}
|
|
@@ -161,6 +161,20 @@ export interface ResolvedConfig {
|
|
|
161
161
|
captureCompress?: boolean;
|
|
162
162
|
/** Whether to include mode-specific extra artifacts (default: true) */
|
|
163
163
|
captureExtras?: boolean;
|
|
164
|
+
/**
|
|
165
|
+
* D0033 / W0049 — the unified artifact surface. Wired into the writer in
|
|
166
|
+
* W0050; consumed by the writer factory to decide whether to attach a
|
|
167
|
+
* writer at all, where it writes to, and what to skip. These fields are
|
|
168
|
+
* additive and do not replace the legacy `capture*` fields until W0052.
|
|
169
|
+
*/
|
|
170
|
+
/** Disables all artifact writers — `--no-artifacts`. */
|
|
171
|
+
artifactsDisabled?: boolean;
|
|
172
|
+
/** Root directory for local artifact output — `--artifacts-dir`. */
|
|
173
|
+
artifactsDir?: string;
|
|
174
|
+
/** Run writers in dry-run mode — `--artifacts-dry-run`. */
|
|
175
|
+
artifactsDryRun?: boolean;
|
|
176
|
+
/** Artifact types to skip — the comma-separated `--capture-exclude` list, held here as an array. */
|
|
177
|
+
artifactsExclude?: readonly string[];
|
|
164
178
|
/** GCS bucket for capture upload (enables GCS decorator when set) */
|
|
165
179
|
captureGcsBucket?: string;
|
|
166
180
|
/** GCS object prefix for capture uploads (default: "captures/") */
|
|
@@ -198,8 +212,13 @@ export interface ResolvedConfig {
|
|
|
198
212
|
* Created per-test by createTestContext().
|
|
199
213
|
*/
|
|
200
214
|
export interface AppContext {
|
|
201
|
-
/**
|
|
202
|
-
|
|
215
|
+
/**
|
|
216
|
+
* Artifact writer — writes run artifacts + manifest to local fs (D0033
|
|
217
|
+
* M4: always on) and optionally to GCS (D0032, layered via
|
|
218
|
+
* FanoutArtifactWriter). Required post-W0050 — the composition root
|
|
219
|
+
* always provides a writer (NoOpArtifactWriter when `--no-artifacts`).
|
|
220
|
+
*/
|
|
221
|
+
readonly artifactWriter: ArtifactWriter;
|
|
203
222
|
/** Evaluation caching (filesystem + optional Content Lake fallback) */
|
|
204
223
|
readonly cache?: CacheStore;
|
|
205
224
|
/** Artifact capture collector (no-op when --capture is not set) */
|
|
@@ -67,10 +67,10 @@ export declare const FeatureSchema: z.ZodObject<{
|
|
|
67
67
|
id: z.ZodString;
|
|
68
68
|
name: z.ZodString;
|
|
69
69
|
priority: z.ZodEnum<{
|
|
70
|
-
critical: "critical";
|
|
71
|
-
high: "high";
|
|
72
|
-
medium: "medium";
|
|
73
70
|
low: "low";
|
|
71
|
+
medium: "medium";
|
|
72
|
+
high: "high";
|
|
73
|
+
critical: "critical";
|
|
74
74
|
}>;
|
|
75
75
|
sections: z.ZodArray<z.ZodString>;
|
|
76
76
|
status: z.ZodEnum<{
|
|
@@ -91,10 +91,10 @@ export declare const FeatureRegistrySchema: z.ZodObject<{
|
|
|
91
91
|
id: z.ZodString;
|
|
92
92
|
name: z.ZodString;
|
|
93
93
|
priority: z.ZodEnum<{
|
|
94
|
-
critical: "critical";
|
|
95
|
-
high: "high";
|
|
96
|
-
medium: "medium";
|
|
97
94
|
low: "low";
|
|
95
|
+
medium: "medium";
|
|
96
|
+
high: "high";
|
|
97
|
+
critical: "critical";
|
|
98
98
|
}>;
|
|
99
99
|
sections: z.ZodArray<z.ZodString>;
|
|
100
100
|
status: z.ZodEnum<{
|
|
@@ -11,3 +11,4 @@ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadat
|
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
12
|
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, type AggregationStrategy, type AreaScore, type AssertionScore, type DimensionScore, type EnsembleGradingConfig, type GraderTransitionConfig, type TaskScore, type TaskScoreOptions, } from "./scoring-engine.js";
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
|
+
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
@@ -11,3 +11,4 @@ export { classifyRubric, detectFeatureArea, extractDimensions, extractUrlMetadat
|
|
|
11
11
|
export { formatComparisonMarkdown, formatComparisonTable, } from "./comparison-formatters.js";
|
|
12
12
|
export { aggregateAreas, aggregateDimensions, computeEnsembleScore, computeTaskScore, normalizeScore, } from "./scoring-engine.js";
|
|
13
13
|
export { extractModelName, extractProvider, mergeConfig, modelMatchesMode, resolveModelVariants, } from "./config-helpers.js";
|
|
14
|
+
export { buildSlimReportSummary } from "./slim-report-summary.js";
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* slim-report-summary.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure transformer: converts a full `ScoreSummary` (the shape of `score-summary.json`)
|
|
5
|
+
* into the slim `ReportSummary` that Phase C of D0033 publishes on the
|
|
6
|
+
* Report Content-Lake document. Inlined prose and long arrays are replaced
|
|
7
|
+
* with `id` references to external artifacts (graderJudgments, failureModes,
|
|
8
|
+
* gapReport, traces).
|
|
9
|
+
*
|
|
10
|
+
* **The `id` principle**: every slim reference carries the manifest entry
|
|
11
|
+
* key of the external artifact it points at, produced by the descriptor's
|
|
12
|
+
* own `formatEntryKey(axes)`. Studio looks the id up in `Report.artifactManifest`
|
|
13
|
+
* to get the preview, and hydrates the full payload on drill-down.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M7)
|
|
16
|
+
* @see docs/work-items/W0051-report-slim-down-manifest-preview-hooks.json
|
|
17
|
+
*/
|
|
18
|
+
import type { ReportSummary, ScoreSummary } from "../types/index.js";
|
|
19
|
+
/**
|
|
20
|
+
* Transform a full pipeline `ScoreSummary` into its slim Report counterpart.
|
|
21
|
+
*
|
|
22
|
+
* Each of the four heavy fields is reshaped independently; everything else
|
|
23
|
+
* flows through untouched via structural spread. Pure function — the input
|
|
24
|
+
* summary is not mutated.
|
|
25
|
+
*
|
|
26
|
+
* @param mode The evaluation mode (used to populate the `mode` axis on slim
|
|
27
|
+
* judgment / failure-mode ids). `score-summary.json` carries
|
|
28
|
+
* the mode in `evaluationMode` but the publisher supplies it
|
|
29
|
+
* explicitly to keep the helper independent of that field.
|
|
30
|
+
*/
|
|
31
|
+
export declare function buildSlimReportSummary(summary: ScoreSummary, mode: string): ReportSummary;
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* slim-report-summary.ts
|
|
3
|
+
*
|
|
4
|
+
* Pure transformer: converts a full `ScoreSummary` (the shape of `score-summary.json`)
|
|
5
|
+
* into the slim `ReportSummary` that Phase C of D0033 publishes on the
|
|
6
|
+
* Report Content-Lake document. Inlined prose and long arrays are replaced
|
|
7
|
+
* with `id` references to external artifacts (graderJudgments, failureModes,
|
|
8
|
+
* gapReport, traces).
|
|
9
|
+
*
|
|
10
|
+
* **The `id` principle**: every slim reference carries the manifest entry
|
|
11
|
+
* key of the external artifact it points at, produced by the descriptor's
|
|
12
|
+
* own `formatEntryKey(axes)`. Studio looks the id up in `Report.artifactManifest`
|
|
13
|
+
* to get the preview, and hydrates the full payload on drill-down.
|
|
14
|
+
*
|
|
15
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M7)
|
|
16
|
+
* @see docs/work-items/W0051-report-slim-down-manifest-preview-hooks.json
|
|
17
|
+
*/
|
|
18
|
+
import { ARTIFACT_REGISTRY } from "../artifact-registry.js";
|
|
19
|
+
/**
 * Build the slim `ReportSummary` from a full pipeline `ScoreSummary`.
 *
 * Four fields are pulled out of the input and replaced by slimmed versions:
 * `lowScoringJudgments`, `failureModes`, `recommendations` and
 * `agentBehavior`. Each appears on the result only when it was truthy on the
 * input. Every other own enumerable property is copied across as-is. This
 * function never assigns to `summary`.
 *
 * @param summary Full score summary to slim down.
 * @param mode    Evaluation mode handed to the judgment and failure-mode
 *                slimmers, which use it when building entry ids.
 * @returns A newly created summary object.
 */
export function buildSlimReportSummary(summary, mode) {
    const { agentBehavior, failureModes, lowScoringJudgments, recommendations, ...passthrough } = summary;
    const slim = { ...passthrough };
    // Assignment order fixes the key order of the result: judgments,
    // failure modes, recommendations, agent behavior.
    if (lowScoringJudgments) {
        slim.lowScoringJudgments = slimJudgments(lowScoringJudgments, mode);
    }
    if (failureModes) {
        slim.failureModes = slimFailureModes(failureModes, mode);
    }
    if (recommendations) {
        slim.recommendations = slimRecommendations(recommendations);
    }
    if (agentBehavior) {
        slim.agentBehavior = slimAgentBehavior(agentBehavior);
    }
    return slim;
}
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Judgments — axes {mode, task, model, grader}
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
/**
 * Split a judgment `taskId` into its base task name and variant suffix.
 *
 * A trailing `(gold)` or `(baseline)` marker is recognised in any letter
 * case, with optional whitespace before and after it. When the marker is
 * present, the text before it is trimmed and returned as `task`, and the
 * marker word is returned lower-cased as `variant`.
 *
 * @param taskId Task identifier, possibly carrying a variant suffix.
 * @returns `{ task, variant }`. Without a suffix, `task` is `taskId`
 *          unchanged (not trimmed) and `variant` is `null`.
 */
function splitTaskVariant(taskId) {
    const suffix = /\s*\((gold|baseline)\)\s*$/i.exec(taskId);
    if (suffix === null) {
        return { task: taskId, variant: null };
    }
    // suffix.index is where the marker (including its leading whitespace) starts.
    const base = taskId.slice(0, suffix.index).trim();
    const variant = suffix[1].toLowerCase();
    return { task: base, variant };
}
|
|
68
|
+
/**
 * Reduce full low-scoring judgments to slim entries that carry an entry id.
 *
 * For each judgment the `taskId` is split into base task and variant. The id
 * is built by the `graderJudgments` descriptor's `formatEntryKey` from
 * `{ mode, task, model, grader }`, where `mode` is the judgment's own variant
 * when it has one and `defaultMode` otherwise, and `grader` is the judgment's
 * `dimension`. The reason text is cut to at most 280 characters; a non-string
 * reason becomes the empty string.
 *
 * @param full        Array of full judgments.
 * @param defaultMode Mode used for judgments whose task id has no variant suffix.
 * @returns One slim entry per judgment, in input order.
 * @throws Error when the descriptor has no `formatEntryKey`.
 */
function slimJudgments(full, defaultMode) {
    const { formatEntryKey } = ARTIFACT_REGISTRY.graderJudgments;
    if (!formatEntryKey) {
        throw new Error("slimJudgments: graderJudgments descriptor is missing formatEntryKey");
    }
    const toSlim = (judgment) => {
        const { task, variant } = splitTaskVariant(judgment.taskId);
        const grader = judgment.dimension;
        const id = formatEntryKey({
            // A variant suffix on the task id wins over the caller-supplied mode.
            mode: variant ?? defaultMode,
            task,
            model: judgment.modelId,
            grader,
        });
        const fullReason = typeof judgment.reason === "string" ? judgment.reason : "";
        // slice() returns the string unchanged when it is already within the cap.
        const preview = fullReason.slice(0, 280);
        return {
            id,
            taskId: judgment.taskId,
            modelId: judgment.modelId,
            graderId: grader,
            // Legacy alias of graderId.
            dimension: grader,
            score: judgment.score,
            reasonPreview: preview,
            // Legacy alias: the same truncated text under the old name.
            reason: preview,
        };
    };
    return full.map((judgment) => toSlim(judgment));
}
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
// Failure modes — axes {mode, category}, top-N by count
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
/** Maximum number of failure categories listed in `topTitles`. */
const FAILURE_MODE_TOP_N = 5;
/**
 * Reduce the full failure-mode report to counts plus a short ranked list.
 *
 * `counts` is a shallow copy of `full.summary`. `topTitles` holds up to
 * `FAILURE_MODE_TOP_N` categories whose count is greater than zero, highest
 * count first. Each carries an id built by the `failureModes` descriptor's
 * `formatEntryKey` from `{ mode, category }`, a severity derived from the
 * count, and a title-cased label.
 *
 * @param full        Full failure-mode report (`summary`, `totalJudgments`,
 *                    `classificationRate` are read).
 * @param defaultMode Mode used for every generated id.
 * @returns `{ counts, topTitles, totalJudgments, classificationRate }`.
 * @throws Error when the descriptor has no `formatEntryKey`.
 */
function slimFailureModes(full, defaultMode) {
    const { formatEntryKey } = ARTIFACT_REGISTRY.failureModes;
    if (!formatEntryKey) {
        throw new Error("slimFailureModes: failureModes descriptor is missing formatEntryKey");
    }
    const counts = { ...full.summary };
    const ranked = Object.entries(counts)
        .filter((entry) => entry[1] > 0)
        .sort((left, right) => right[1] - left[1])
        .slice(0, FAILURE_MODE_TOP_N);
    const topTitles = [];
    for (const [category, count] of ranked) {
        topTitles.push({
            id: formatEntryKey({ mode: defaultMode, category }),
            category,
            severity: severityForCount(count),
            title: toTitleCase(category),
            count,
        });
    }
    return {
        counts,
        topTitles,
        totalJudgments: full.totalJudgments,
        classificationRate: full.classificationRate,
    };
}
|
|
136
|
+
/**
 * Map an occurrence count to a severity label.
 *
 * Thresholds are checked from highest to lowest: 10 or more is `"critical"`,
 * 5 or more is `"high"`, 2 or more is `"medium"`. Anything else is `"low"`,
 * including values that compare false against every threshold.
 *
 * @param count Number of occurrences.
 * @returns One of `"critical"`, `"high"`, `"medium"`, `"low"`.
 */
function severityForCount(count) {
    const tiers = [
        { floor: 10, label: "critical" },
        { floor: 5, label: "high" },
        { floor: 2, label: "medium" },
    ];
    for (const tier of tiers) {
        if (count >= tier.floor) {
            return tier.label;
        }
    }
    return "low";
}
|
|
145
|
+
/**
 * Turn a dash-separated identifier into space-separated words, upper-casing
 * the first character of each non-empty word (`"missing-docs"` becomes
 * `"Missing Docs"`). The remaining characters are left as they are.
 *
 * @param id Dash-separated identifier.
 * @returns The title-cased label.
 */
function toTitleCase(id) {
    return id
        .split("-")
        .map((w) => (w.length === 0 ? w : w[0].toUpperCase() + w.slice(1)))
        .join(" ");
}
// ---------------------------------------------------------------------------
// Recommendations — bulk artifact; id is a synthetic area--failureMode composite
// ---------------------------------------------------------------------------
/** Maximum number of gaps listed in `top3`. */
const RECOMMENDATION_TOP_N = 3;
/**
 * Reduce the full recommendations report to per-area counts plus the
 * highest-ranked gaps.
 *
 * `counts` maps each gap `area` to the number of gaps in it. `top3` holds up
 * to `RECOMMENDATION_TOP_N` gaps ordered by `priority` descending, with ties
 * broken by `estimatedLift` descending. Both are compared numerically and a
 * missing value counts as 0. The input `gaps` array is copied before sorting,
 * so it is not reordered.
 *
 * @param full Full recommendations report (`gaps` and `totalPotentialLift`
 *             are read).
 * @returns `{ counts, top3, totalGaps, totalPotentialLift }`.
 */
function slimRecommendations(full) {
    // Tally in a Map instead of a plain object. Area names are arbitrary
    // strings, and on a plain object a name matching an inherited member
    // ("constructor", "toString") reads back a non-nullish inherited value,
    // which defeats the `?? 0` default. "__proto__" would be dropped entirely.
    const tally = new Map();
    for (const gap of full.gaps) {
        const area = String(gap.area);
        tally.set(area, (tally.get(area) ?? 0) + 1);
    }
    // Object.fromEntries defines own data properties, so every area name
    // survives as a real key on the returned plain object.
    const counts = Object.fromEntries(tally);
    // Sort by priority descending, break ties by estimatedLift.
    const sorted = [...full.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
        (b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
    const top3 = sorted
        .slice(0, RECOMMENDATION_TOP_N)
        .map((g) => ({
        id: gapId(g),
        area: g.area,
        title: toTitleCase(g.failureMode),
        priority: g.priority,
    }));
    return {
        counts,
        top3,
        totalGaps: full.gaps.length,
        totalPotentialLift: full.totalPotentialLift,
    };
}
/**
 * Synthetic composite id for a gap — the gapReport artifact is bulk, so a
 * per-gap manifest entry does not exist. This id lets Studio deduplicate
 * gaps and deep-link within the bulk artifact.
 *
 * The id is the sanitized `area` and the sanitized `failureMode` joined by
 * a double dash.
 *
 * @param g Gap whose `area` and `failureMode` are read.
 * @returns The composite id string.
 */
function gapId(g) {
    return `${sanitizeIdSegment(g.area)}--${sanitizeIdSegment(g.failureMode)}`;
}
/**
 * Normalise a string for use as one segment of an id: trim, lower-case,
 * collapse every run of characters outside `a-z0-9` into a single dash, and
 * strip leading and trailing dashes.
 *
 * @param s Raw segment text.
 * @returns The sanitized segment (may be empty).
 */
function sanitizeIdSegment(s) {
    return s
        .trim()
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-")
        .replace(/^-+|-+$/g, "");
}
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
// Agent behavior — counts + first-N samples per feature
|
|
195
|
+
// ---------------------------------------------------------------------------
|
|
196
|
+
/** Maximum number of sample values kept per feature for queries and slugs. */
const BEHAVIOR_SAMPLE_N = 5;
/**
 * Reduce per-feature agent-behavior records to scalar metrics, counts and
 * short samples.
 *
 * The averaged metrics, `tasksWithBehaviorData` and `externalDomains` are
 * copied through. `searchQueries` and `docSlugsVisited` are de-duplicated and
 * then replaced by the unique count plus the first `BEHAVIOR_SAMPLE_N` unique
 * values.
 *
 * @param full Array of full per-feature behavior records.
 * @returns One slim record per input record, in input order.
 */
function slimAgentBehavior(full) {
    return full.map((entry) => {
        const queries = dedupe(entry.searchQueries);
        const slugs = dedupe(entry.docSlugsVisited);
        const { feature, avgDocPagesVisited, avgNetworkTimeMs, avgSearchesPerformed, tasksWithBehaviorData, externalDomains, } = entry;
        return {
            feature,
            avgDocPagesVisited,
            avgNetworkTimeMs,
            avgSearchesPerformed,
            tasksWithBehaviorData,
            externalDomains,
            searchQueriesCount: queries.length,
            searchQueriesSample: queries.slice(0, BEHAVIOR_SAMPLE_N),
            docSlugsVisitedCount: slugs.length,
            docSlugsVisitedSample: slugs.slice(0, BEHAVIOR_SAMPLE_N),
        };
    });
}
|
|
215
|
+
/**
 * Return the distinct values of `items` as a new array, keeping the order in
 * which each value first appears. A missing (`undefined`/`null`) input yields
 * an empty array.
 *
 * @param items Iterable of values to de-duplicate.
 * @returns A new array of unique values.
 */
function dedupe(items) {
    const seen = new Set(items);
    return Array.from(seen);
}
|
|
@@ -62,6 +62,39 @@ export type RubricId = Brand<string, "RubricId">;
|
|
|
62
62
|
export type FixtureId = Brand<string, "FixtureId">;
|
|
63
63
|
/** Unique identifier for a build artifact */
|
|
64
64
|
export type ArtifactId = Brand<string, "ArtifactId">;
|
|
65
|
+
/**
|
|
66
|
+
* The dimensions an artifact is evidence about. Axes determine where the
|
|
67
|
+
* artifact lives, how its entry key is shaped, and — at module load — whether
|
|
68
|
+
* its declared layout is allowed.
|
|
69
|
+
*
|
|
70
|
+
* Unbounded axes (`task`, `model`, `trial`) force `layout: "per-entry"` at the
|
|
71
|
+
* registry invariant; the other axes are finite enough that bulk is acceptable.
|
|
72
|
+
*
|
|
73
|
+
* `category` identifies a classification bucket for artifacts that partition
|
|
74
|
+
* per-mode (e.g. `failureModes`, one entry per classified failure category —
|
|
75
|
+
* D0033 M7, W0051 Slice 2).
|
|
76
|
+
*/
|
|
77
|
+
export type AssociationAxis = "run" | "mode" | "task" | "model" | "grader" | "trial" | "category";
|
|
78
|
+
/**
|
|
79
|
+
* The sanitized, filename-safe identifier for a single per-entry artifact
|
|
80
|
+
* object. Produced by `ArtifactDescriptor.formatEntryKey` and parsed by
|
|
81
|
+
* `parseEntryKey`; not a free-form string at runtime.
|
|
82
|
+
*/
|
|
83
|
+
export type EntryKey = Brand<string, "EntryKey">;
|
|
84
|
+
/**
|
|
85
|
+
* The subset of `AssociationAxis` values an artifact carries at write time.
|
|
86
|
+
* Used on the writer API (`emit(type, association, payload)`) and on manifest
|
|
87
|
+
* entries so the axes that identify the entry are self-describing in storage.
|
|
88
|
+
*
|
|
89
|
+
* Canonical axes are strongly-keyed. Descriptors whose per-entry layout is
|
|
90
|
+
* discriminated by a name that isn't a pipeline axis (e.g. `sinkResults` by
|
|
91
|
+
* sink name, `callbackRequest` by callback target) read that identifier from
|
|
92
|
+
* the `name` slot. Keep ad-hoc keys out of the canonical axis list so the
|
|
93
|
+
* module-load invariant (unbounded-axis ⇒ per-entry) stays crisp.
|
|
94
|
+
*/
|
|
95
|
+
export type AssociationValues = Partial<Record<AssociationAxis, string | number>> & {
|
|
96
|
+
readonly name?: string;
|
|
97
|
+
};
|
|
65
98
|
/**
|
|
66
99
|
* A success result containing a value.
|
|
67
100
|
*/
|