@sanity/ailf 2.8.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
- package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +124 -23
- package/dist/_vendor/ailf-core/artifact-registry.js +724 -63
- package/dist/_vendor/ailf-core/index.d.ts +2 -1
- package/dist/_vendor/ailf-core/index.js +2 -1
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +59 -20
- package/dist/_vendor/ailf-core/ports/artifact-writer.js +33 -10
- package/dist/_vendor/ailf-core/ports/context.d.ts +21 -2
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +33 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +202 -23
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +17 -4
- package/dist/artifact-capture/api-gateway-artifact-writer.js +58 -7
- package/dist/artifact-capture/emit-file.d.ts +28 -0
- package/dist/artifact-capture/emit-file.js +56 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
- package/dist/artifact-capture/filesystem-collector.js +48 -23
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +40 -3
- package/dist/artifact-capture/gcs-artifact-writer.js +238 -14
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
- package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
- package/dist/commands/explain-handler.js +4 -0
- package/dist/commands/pipeline-action.d.ts +5 -0
- package/dist/commands/pipeline-action.js +56 -5
- package/dist/commands/pipeline.d.ts +4 -0
- package/dist/commands/pipeline.js +6 -2
- package/dist/commands/publish.js +4 -1
- package/dist/composition-root.d.ts +13 -10
- package/dist/composition-root.js +74 -20
- package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
- package/dist/orchestration/pipeline-orchestrator.js +41 -30
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
- package/dist/orchestration/steps/calculate-scores-step.js +19 -19
- package/dist/orchestration/steps/callback-step.d.ts +1 -1
- package/dist/orchestration/steps/callback-step.js +6 -4
- package/dist/orchestration/steps/compare-step.d.ts +1 -1
- package/dist/orchestration/steps/compare-step.js +4 -2
- package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
- package/dist/orchestration/steps/discovery-report-step.js +4 -1
- package/dist/orchestration/steps/fetch-docs-step.js +9 -15
- package/dist/orchestration/steps/finalize-run-step.js +21 -7
- package/dist/orchestration/steps/gap-analysis-step.js +34 -6
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
- package/dist/orchestration/steps/generate-configs-step.js +11 -11
- package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
- package/dist/orchestration/steps/publish-report-step.js +24 -19
- package/dist/orchestration/steps/readiness-step.d.ts +1 -1
- package/dist/orchestration/steps/readiness-step.js +4 -1
- package/dist/orchestration/steps/report-step.d.ts +1 -1
- package/dist/orchestration/steps/report-step.js +6 -3
- package/dist/orchestration/steps/run-eval-step.js +14 -9
- package/dist/pipeline/compare.d.ts +2 -2
- package/dist/pipeline/emit-eval-results.d.ts +38 -0
- package/dist/pipeline/emit-eval-results.js +100 -0
- package/package.json +1 -1
|
@@ -10,7 +10,8 @@
|
|
|
10
10
|
* re-export barrel that preserves backward compatibility.
|
|
11
11
|
*/
|
|
12
12
|
import type { DocumentRef as _DocumentRef, EvalMode, RunContext } from "../../ailf-shared/index.d.ts";
|
|
13
|
-
import type {
|
|
13
|
+
import type { ArtifactType } from "../artifact-registry.js";
|
|
14
|
+
import type { AssociationValues, RunId } from "./branded-ids.js";
|
|
14
15
|
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
|
|
15
16
|
export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
|
|
16
17
|
export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
|
|
@@ -22,7 +23,7 @@ export { evalModeType } from "./eval-mode-config.js";
|
|
|
22
23
|
export type { DependencyEdge, ResolvedFixture, TaskGraph, TaskNode, } from "./task-graph.js";
|
|
23
24
|
export type { VariableDeclaration, VariableEnvelope, VariableProvenance, VariableSource, } from "./variable-envelope.js";
|
|
24
25
|
export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan, TraceTokenUsage, } from "./trace.js";
|
|
25
|
-
export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
26
|
+
export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
26
27
|
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
27
28
|
export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
28
29
|
type DocumentRef = _DocumentRef;
|
|
@@ -901,6 +902,142 @@ export interface ScoreSummary {
|
|
|
901
902
|
};
|
|
902
903
|
timestamp: string;
|
|
903
904
|
}
|
|
905
|
+
/**
|
|
906
|
+
* The subset of `ScoreSummary` the `compare()` primitive reads and the
|
|
907
|
+
* only fields consumers of `ComparisonReport.baseline` / `.experiment`
|
|
908
|
+
* access at runtime. Both the full pipeline `ScoreSummary` and the slim
|
|
909
|
+
* `ReportSummary` (W0051) carry these fields unchanged, so stored Reports
|
|
910
|
+
* can participate in auto-compare without re-hydrating prose fields.
|
|
911
|
+
*/
|
|
912
|
+
export type ComparableSummary = Pick<ScoreSummary, "overall" | "perModel" | "scores">;
|
|
913
|
+
/**
|
|
914
|
+
* Slim pointer to a single low-scoring grader judgment. Replaces the full
|
|
915
|
+
* `StoredJudgment` inlined on pre-W0051 reports. The `id` field IS the
|
|
916
|
+
* `graderJudgments` manifest entry key — Studio looks up `reasonPreview`
|
|
917
|
+
* there for list rendering and hydrates the full reason on drill-down.
|
|
918
|
+
*
|
|
919
|
+
* `reasonPreview` is ALSO carried inline as a graceful-degradation fallback
|
|
920
|
+
* for runs whose `artifactManifest` is empty (offline dev, failed GCS
|
|
921
|
+
* upload, cache-skip before manifest aggregation lands). At ~280 chars per
|
|
922
|
+
* judgment × the 50-judgment cap ≈ 14 KB on the Report — tiny against the
|
|
923
|
+
* 500 KB budget. Studio's dispatch prefers the manifest-entry preview when
|
|
924
|
+
* present so GCS drill-down still renders the hydrated-on-demand copy.
|
|
925
|
+
*
|
|
926
|
+
* `graderId` replaces the historical `dimension` field at the slim-shape
|
|
927
|
+
* boundary to match D0033 axis naming; runtime value is identical.
|
|
928
|
+
*/
|
|
929
|
+
export interface SlimJudgmentRef {
|
|
930
|
+
/** Manifest entry key = `formatEntryKey({mode, task, model, grader})` for `graderJudgments`. */
|
|
931
|
+
id: string;
|
|
932
|
+
taskId: string;
|
|
933
|
+
modelId: string;
|
|
934
|
+
/** Rubric dimension name at runtime (what `GraderJudgment.dimension` carries). */
|
|
935
|
+
graderId: string;
|
|
936
|
+
/**
|
|
937
|
+
* Alias of `graderId` — carried for pre-W0051 Studio readers that still
|
|
938
|
+
* access `.dimension` (JudgmentList, judgment-formatting, et al). Remove
|
|
939
|
+
* in Slice 6 when those consumers migrate to `graderId`. Identical
|
|
940
|
+
* runtime value; no additional ambiguity.
|
|
941
|
+
*/
|
|
942
|
+
dimension: string;
|
|
943
|
+
/** Normalized 0–100 score. */
|
|
944
|
+
score: number;
|
|
945
|
+
/**
|
|
946
|
+
* Truncated grader reason (≤280 chars). Inline fallback used when the
|
|
947
|
+
* manifest entry's preview is unavailable; the authoritative full reason
|
|
948
|
+
* lives in the `graderJudgments` external artifact.
|
|
949
|
+
*/
|
|
950
|
+
reasonPreview?: string;
|
|
951
|
+
/**
|
|
952
|
+
* Alias of `reasonPreview` — legacy Studio renderers read `.reason`;
|
|
953
|
+
* expose the same truncated text under both names for the compat window.
|
|
954
|
+
* Remove alongside `dimension` in Slice 6.
|
|
955
|
+
*/
|
|
956
|
+
reason?: string;
|
|
957
|
+
}
|
|
958
|
+
/**
|
|
959
|
+
* Slim failure-mode entry on the Report summary. One per classified
|
|
960
|
+
* `FailureModeType` bucket; `id` is the `failureModes` manifest entry key
|
|
961
|
+
* so drill-down can fetch the full category payload.
|
|
962
|
+
*/
|
|
963
|
+
export interface SlimFailureModeTopTitle {
|
|
964
|
+
/** Manifest entry key = `formatEntryKey({mode, category})` for `failureModes`. */
|
|
965
|
+
id: string;
|
|
966
|
+
category: string;
|
|
967
|
+
severity: "low" | "medium" | "high" | "critical";
|
|
968
|
+
title: string;
|
|
969
|
+
count: number;
|
|
970
|
+
}
|
|
971
|
+
/** Counts + top-N per-category summary on the Report. */
|
|
972
|
+
export interface SlimFailureModesSummary {
|
|
973
|
+
/** Count by FailureModeType-ish id. */
|
|
974
|
+
counts: Record<string, number>;
|
|
975
|
+
/** Top-N categories by count, descending. */
|
|
976
|
+
topTitles: SlimFailureModeTopTitle[];
|
|
977
|
+
/** Total classified judgments across all categories. */
|
|
978
|
+
totalJudgments: number;
|
|
979
|
+
/** Percentage of judgments that landed in a non-unclassified bucket. */
|
|
980
|
+
classificationRate: number;
|
|
981
|
+
}
|
|
982
|
+
/**
|
|
983
|
+
* Slim gap pointer. The `gapReport` artifact is bulk (axes: `{run}`) so
|
|
984
|
+
* there is no per-gap manifest entry to point at — `id` is a stable
|
|
985
|
+
* synthetic composite so the UI can deduplicate and deep-link. Drill-down
|
|
986
|
+
* reads the full `gapReport` artifact and filters by id.
|
|
987
|
+
*/
|
|
988
|
+
export interface SlimRecommendationGap {
|
|
989
|
+
/** Synthetic id: `${area}--${failureMode}`, kebab-safe. */
|
|
990
|
+
id: string;
|
|
991
|
+
area: string;
|
|
992
|
+
title: string;
|
|
993
|
+
/** Priority bucketing for triage UI ordering. */
|
|
994
|
+
priority: number;
|
|
995
|
+
}
|
|
996
|
+
/** Counts + top-3 summary on the Report. */
|
|
997
|
+
export interface SlimRecommendations {
|
|
998
|
+
/** Count of gaps by area. */
|
|
999
|
+
counts: Record<string, number>;
|
|
1000
|
+
/** Top-3 gaps by priority, descending. */
|
|
1001
|
+
top3: SlimRecommendationGap[];
|
|
1002
|
+
/** Total actionable gaps identified (sum of counts). */
|
|
1003
|
+
totalGaps: number;
|
|
1004
|
+
/** Aggregate estimated lift (matches `GapAnalysisReport.totalPotentialLift`). */
|
|
1005
|
+
totalPotentialLift: number;
|
|
1006
|
+
}
|
|
1007
|
+
/**
|
|
1008
|
+
* Slim per-feature agent-behavior summary. Full `searchQueries` and
|
|
1009
|
+
* `docSlugsVisited` arrays move to `traces` NDJSON; the Report keeps only
|
|
1010
|
+
* counts + first-N samples for triage preview. The `firstN` cap is the
|
|
1011
|
+
* producer's choice (we default to 5).
|
|
1012
|
+
*/
|
|
1013
|
+
export interface SlimAgentBehaviorFeature {
|
|
1014
|
+
feature: string;
|
|
1015
|
+
avgDocPagesVisited: number;
|
|
1016
|
+
avgNetworkTimeMs: number;
|
|
1017
|
+
avgSearchesPerformed: number;
|
|
1018
|
+
tasksWithBehaviorData: number;
|
|
1019
|
+
externalDomains: string[];
|
|
1020
|
+
/** Number of distinct search queries across tasks in this feature. */
|
|
1021
|
+
searchQueriesCount: number;
|
|
1022
|
+
/** First-N unique search queries (bounded samples). */
|
|
1023
|
+
searchQueriesSample: string[];
|
|
1024
|
+
/** Number of distinct doc slugs visited. */
|
|
1025
|
+
docSlugsVisitedCount: number;
|
|
1026
|
+
/** First-N unique doc slugs (bounded samples). */
|
|
1027
|
+
docSlugsVisitedSample: string[];
|
|
1028
|
+
}
|
|
1029
|
+
/**
|
|
1030
|
+
* Slim `summary` field on a published `Report`. Structurally
|
|
1031
|
+
* `Omit<ScoreSummary, slimmed-fields> & slim-replacements` — every
|
|
1032
|
+
* pipeline-produced field survives except the four prose/array fields
|
|
1033
|
+
* W0051 moves to external artifacts.
|
|
1034
|
+
*/
|
|
1035
|
+
export type ReportSummary = Omit<ScoreSummary, "agentBehavior" | "failureModes" | "lowScoringJudgments" | "recommendations"> & {
|
|
1036
|
+
agentBehavior?: SlimAgentBehaviorFeature[];
|
|
1037
|
+
failureModes?: SlimFailureModesSummary;
|
|
1038
|
+
lowScoringJudgments?: SlimJudgmentRef[];
|
|
1039
|
+
recommendations?: SlimRecommendations;
|
|
1040
|
+
};
|
|
904
1041
|
/** Result of a single pipeline step */
|
|
905
1042
|
export type StepResult = {
|
|
906
1043
|
status: "failed";
|
|
@@ -1074,8 +1211,8 @@ export interface ComparisonReport {
|
|
|
1074
1211
|
areas: AreaDelta[];
|
|
1075
1212
|
/** Per-document attribution (when changed docs are known) */
|
|
1076
1213
|
attribution?: AttributionReport;
|
|
1077
|
-
/** The "before" or "control" summary */
|
|
1078
|
-
baseline:
|
|
1214
|
+
/** The "before" or "control" summary (narrowed in W0051 so slim Reports compare) */
|
|
1215
|
+
baseline: ComparableSummary;
|
|
1079
1216
|
/** Aggregate deltas */
|
|
1080
1217
|
deltas: {
|
|
1081
1218
|
/** Overall score delta (experiment.avgScore − baseline.avgScore) */
|
|
@@ -1094,8 +1231,8 @@ export interface ComparisonReport {
|
|
|
1094
1231
|
modelId: string;
|
|
1095
1232
|
}[];
|
|
1096
1233
|
};
|
|
1097
|
-
/** The "after" or "treatment" summary */
|
|
1098
|
-
experiment:
|
|
1234
|
+
/** The "after" or "treatment" summary (narrowed in W0051 so slim Reports compare) */
|
|
1235
|
+
experiment: ComparableSummary;
|
|
1099
1236
|
/** When this comparison was generated */
|
|
1100
1237
|
generatedAt: string;
|
|
1101
1238
|
/** Areas that improved beyond the noise threshold */
|
|
@@ -1177,7 +1314,37 @@ export interface PublishResult {
|
|
|
1177
1314
|
}[];
|
|
1178
1315
|
}
|
|
1179
1316
|
/**
|
|
1180
|
-
*
|
|
1317
|
+
* A single per-entry row in `ArtifactRef.entries`. Carries enough metadata for
|
|
1318
|
+
* Studio list/triage views to render without fetching the external payload.
|
|
1319
|
+
*
|
|
1320
|
+
* D0033/W0049 extensions:
|
|
1321
|
+
* - `association` — the axis values that identify this entry (`{task, model}`
|
|
1322
|
+
* for testOutputs, etc.). Present only on manifests written by
|
|
1323
|
+
* `emit()`; legacy manifests omit it and readers treat absence as `{}`.
|
|
1324
|
+
* - `truncated` — whether the entry payload was capped by the descriptor's
|
|
1325
|
+
* `capBytes`. Readers treat absence as `false` (pre-W0049 manifests were
|
|
1326
|
+
* never truncated because no caps were enforced).
|
|
1327
|
+
* - `preview` — an inline summary produced by the descriptor's
|
|
1328
|
+
* `manifestPreview.extract()`. Typed via the descriptor's preview schema;
|
|
1329
|
+
* omitted when the descriptor has no `manifestPreview`. Wiring lands in
|
|
1330
|
+
* W0051; the field is reserved here so manifests written now parse there.
|
|
1331
|
+
*/
|
|
1332
|
+
export interface ArtifactRefEntry {
|
|
1333
|
+
key: string;
|
|
1334
|
+
bytes: number;
|
|
1335
|
+
association?: AssociationValues;
|
|
1336
|
+
truncated?: boolean;
|
|
1337
|
+
preview?: unknown;
|
|
1338
|
+
}
|
|
1339
|
+
/**
|
|
1340
|
+
* Reference to an artifact in external object storage.
|
|
1341
|
+
*
|
|
1342
|
+
* `store` discriminates the backend: `"gcs"` uses `bucket` as the bucket
|
|
1343
|
+
* name (D0032); `"local"` uses `bucket` as the absolute rootDir path
|
|
1344
|
+
* under which `path` resolves to a file on disk (D0033 / W0050 M4).
|
|
1345
|
+
* Consumers (Studio retrieval, contract tests) branch on `store` only
|
|
1346
|
+
* when constructing the fetch URL — the `path` is store-relative and
|
|
1347
|
+
* identical across backends for the same logical artifact.
|
|
1181
1348
|
*
|
|
1182
1349
|
* `layout` determines the on-disk shape:
|
|
1183
1350
|
* - `"bulk"` — a single object at `path`. `entries` is absent.
|
|
@@ -1185,34 +1352,40 @@ export interface PublishResult {
|
|
|
1185
1352
|
* separate object at `{path}/{sanitizedKey}.json`. `entries` inlines
|
|
1186
1353
|
* the catalog so consumers can render drill-down states without a
|
|
1187
1354
|
* second list call.
|
|
1355
|
+
*
|
|
1356
|
+
* D0033/W0049 extensions (optional — legacy manifests parse without them):
|
|
1357
|
+
* - `truncated` on the bulk row indicates the single-object body was capped.
|
|
1358
|
+
* - `preview` on the bulk row carries a descriptor-typed summary for list
|
|
1359
|
+
* views; wiring lands in W0051.
|
|
1188
1360
|
*/
|
|
1189
1361
|
export interface ArtifactRef {
|
|
1190
|
-
store: "gcs";
|
|
1362
|
+
store: "gcs" | "local";
|
|
1363
|
+
/**
|
|
1364
|
+
* GCS bucket name for `store: "gcs"`; absolute rootDir path for
|
|
1365
|
+
* `store: "local"`. Kept as a single field so callers iterating
|
|
1366
|
+
* manifest entries don't need to branch on `store` just to read the
|
|
1367
|
+
* storage container.
|
|
1368
|
+
*/
|
|
1191
1369
|
bucket: string;
|
|
1192
1370
|
path: string;
|
|
1193
1371
|
bytes?: number;
|
|
1194
1372
|
entryCount?: number;
|
|
1195
1373
|
layout: "bulk" | "per-entry";
|
|
1196
|
-
entries?:
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
}[];
|
|
1374
|
+
entries?: ArtifactRefEntry[];
|
|
1375
|
+
truncated?: boolean;
|
|
1376
|
+
preview?: unknown;
|
|
1200
1377
|
}
|
|
1201
1378
|
/**
|
|
1202
1379
|
* Catalog of artifact refs produced by a single pipeline run.
|
|
1203
1380
|
*
|
|
1204
1381
|
* Lives on `RunManifest.artifacts` (source of truth in GCS) and is
|
|
1205
1382
|
* snapshotted onto `Report.artifactManifest` at publish time.
|
|
1383
|
+
*
|
|
1384
|
+
* Derived from `ArtifactType` so adding a descriptor to the registry
|
|
1385
|
+
* automatically admits it to the manifest catalog — drift between the
|
|
1386
|
+
* two becomes a compile error (W0049 review finding C1).
|
|
1206
1387
|
*/
|
|
1207
|
-
export
|
|
1208
|
-
testOutputs?: ArtifactRef;
|
|
1209
|
-
renderedPrompts?: ArtifactRef;
|
|
1210
|
-
rawResults?: ArtifactRef;
|
|
1211
|
-
graderPrompts?: ArtifactRef;
|
|
1212
|
-
taskDefinitions?: ArtifactRef;
|
|
1213
|
-
evalResults?: ArtifactRef;
|
|
1214
|
-
traces?: ArtifactRef;
|
|
1215
|
-
}
|
|
1388
|
+
export type ArtifactManifest = Partial<Record<ArtifactType, ArtifactRef>>;
|
|
1216
1389
|
/** A published evaluation report — the atomic unit of the report store */
|
|
1217
1390
|
export interface Report {
|
|
1218
1391
|
/**
|
|
@@ -1232,8 +1405,14 @@ export interface Report {
|
|
|
1232
1405
|
id: ReportId;
|
|
1233
1406
|
/** What produced this report */
|
|
1234
1407
|
provenance: ReportProvenance;
|
|
1235
|
-
/**
|
|
1236
|
-
|
|
1408
|
+
/**
|
|
1409
|
+
* The slim published summary. Inlined prose fields (grader reasons, full
|
|
1410
|
+
* failure-mode text, gap prose, agent-behavior arrays) moved to external
|
|
1411
|
+
* artifacts in W0051; see `ReportSummary` for the retained shape and
|
|
1412
|
+
* `docs/decisions/D0033-unified-run-anchored-artifact-capture.md` § M7 for
|
|
1413
|
+
* the full migration table.
|
|
1414
|
+
*/
|
|
1415
|
+
summary: ReportSummary;
|
|
1237
1416
|
/** Optional human-supplied label */
|
|
1238
1417
|
tag?: string;
|
|
1239
1418
|
/** Auto-generated descriptive title for discoverability and sharing */
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* accumulating-artifact-writer.ts
|
|
3
|
+
*
|
|
4
|
+
* Decorator that wraps any `ArtifactWriter` and accumulates every
|
|
5
|
+
* successful `emit()` / `appendNdjson()` return into a run-scoped
|
|
6
|
+
* manifest slice. FinalizeRunStep reads the accumulator at the end of
|
|
7
|
+
* a pipeline and writes a `runs/{runId}/manifest.json` populated with
|
|
8
|
+
* one entry per produced artifact type.
|
|
9
|
+
*
|
|
10
|
+
* Before W0051 revisit: only `calculate-scores-step` registered its
|
|
11
|
+
* ref (for `testOutputs`) into `state.artifactRefs`; every other
|
|
12
|
+
* producer discarded the returned ref. The result was empty
|
|
13
|
+
* `Report.artifactManifest` fields and no per-entry preview lookup
|
|
14
|
+
* for Studio hooks. Wrapping the writer at the composition-root level
|
|
15
|
+
* closes that gap without per-producer bookkeeping.
|
|
16
|
+
*
|
|
17
|
+
* Merging rules (per type):
|
|
18
|
+
* - `bulk`: last-writer-wins. A pipeline that emits the same bulk
|
|
19
|
+
* artifact twice overwrites — matches GCS semantics.
|
|
20
|
+
* - `per-entry`: entries accumulate into a keyed map. A later emit
|
|
21
|
+
* at the same `entries[].key` replaces the earlier one.
|
|
22
|
+
*
|
|
23
|
+
* The decorator holds no disk state; the `_resetAccumulated()` hook is
|
|
24
|
+
* for unit tests that rerun emit sequences within a single writer.
|
|
25
|
+
*
|
|
26
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M5)
|
|
27
|
+
*/
|
|
28
|
+
import type { ArtifactEntry, ArtifactManifest, ArtifactRef, ArtifactType, ArtifactWriter, AssociationValues, RunId, RunManifest } from "../_vendor/ailf-core/index.d.ts";
|
|
29
|
+
export declare class AccumulatingArtifactWriter implements ArtifactWriter {
|
|
30
|
+
/**
|
|
31
|
+
* Exposed so composition-root tests can assert on the underlying backend
|
|
32
|
+
* (LocalFilesystemArtifactWriter, FanoutArtifactWriter, etc.) without
|
|
33
|
+
* plumbing a separate accessor. Treat as read-only.
|
|
34
|
+
*/
|
|
35
|
+
readonly inner: ArtifactWriter;
|
|
36
|
+
private readonly accumulated;
|
|
37
|
+
constructor(inner: ArtifactWriter);
|
|
38
|
+
/** Snapshot of every ref produced this far, keyed by artifact type. */
|
|
39
|
+
getAccumulatedArtifactRefs(): ArtifactManifest;
|
|
40
|
+
/** Test-only. Clears accumulated refs without touching the inner writer. */
|
|
41
|
+
_resetAccumulated(): void;
|
|
42
|
+
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
43
|
+
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
44
|
+
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
45
|
+
/** @deprecated — forwarded to the inner writer; a non-null ref is still merged into the accumulated manifest. */
|
|
46
|
+
writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
|
|
47
|
+
/** @deprecated — forwarded to the inner writer; a non-null ref is still merged into the accumulated manifest. */
|
|
48
|
+
writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
|
|
49
|
+
private mergeRef;
|
|
50
|
+
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* accumulating-artifact-writer.ts
|
|
3
|
+
*
|
|
4
|
+
* Decorator that wraps any `ArtifactWriter` and accumulates every
|
|
5
|
+
* successful `emit()` / `appendNdjson()` return into a run-scoped
|
|
6
|
+
* manifest slice. FinalizeRunStep reads the accumulator at the end of
|
|
7
|
+
* a pipeline and writes a `runs/{runId}/manifest.json` populated with
|
|
8
|
+
* one entry per produced artifact type.
|
|
9
|
+
*
|
|
10
|
+
* Before W0051 revisit: only `calculate-scores-step` registered its
|
|
11
|
+
* ref (for `testOutputs`) into `state.artifactRefs`; every other
|
|
12
|
+
* producer discarded the returned ref. The result was empty
|
|
13
|
+
* `Report.artifactManifest` fields and no per-entry preview lookup
|
|
14
|
+
* for Studio hooks. Wrapping the writer at the composition-root level
|
|
15
|
+
* closes that gap without per-producer bookkeeping.
|
|
16
|
+
*
|
|
17
|
+
* Merging rules (per type):
|
|
18
|
+
* - `bulk`: last-writer-wins. A pipeline that emits the same bulk
|
|
19
|
+
* artifact twice overwrites — matches GCS semantics.
|
|
20
|
+
* - `per-entry`: entries accumulate into a keyed map. A later emit
|
|
21
|
+
* at the same `entries[].key` replaces the earlier one.
|
|
22
|
+
*
|
|
23
|
+
* The decorator holds no disk state; the `_resetAccumulated()` hook is
|
|
24
|
+
* for unit tests that rerun emit sequences within a single writer.
|
|
25
|
+
*
|
|
26
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M5)
|
|
27
|
+
*/
|
|
28
|
+
export class AccumulatingArtifactWriter {
|
|
29
|
+
/**
|
|
30
|
+
* Exposed so composition-root tests can assert on the underlying backend
|
|
31
|
+
* (LocalFilesystemArtifactWriter, FanoutArtifactWriter, etc.) without
|
|
32
|
+
* plumbing a separate accessor. Treat as read-only.
|
|
33
|
+
*/
|
|
34
|
+
inner;
|
|
35
|
+
accumulated = {};
|
|
36
|
+
constructor(inner) {
|
|
37
|
+
this.inner = inner;
|
|
38
|
+
}
|
|
39
|
+
/** Shallow-copy snapshot of every ref produced so far, keyed by artifact type. */
|
|
40
|
+
getAccumulatedArtifactRefs() {
|
|
41
|
+
return { ...this.accumulated };
|
|
42
|
+
}
|
|
43
|
+
/** Test-only. Clears accumulated refs without touching the inner writer. */
|
|
44
|
+
_resetAccumulated() {
|
|
45
|
+
for (const k of Object.keys(this.accumulated)) {
|
|
46
|
+
delete this.accumulated[k];
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
// ---- ArtifactWriter surface --------------------------------------------
|
|
50
|
+
async emit(type, association, payload) {
|
|
51
|
+
const ref = await this.inner.emit(type, association, payload);
|
|
52
|
+
if (ref)
|
|
53
|
+
this.mergeRef(type, ref);
|
|
54
|
+
return ref;
|
|
55
|
+
}
|
|
56
|
+
async appendNdjson(type, association, rows) {
|
|
57
|
+
const ref = await this.inner.appendNdjson(type, association, rows);
|
|
58
|
+
if (ref)
|
|
59
|
+
this.mergeRef(type, ref);
|
|
60
|
+
return ref;
|
|
61
|
+
}
|
|
62
|
+
async writeManifest(runId, manifest) {
|
|
63
|
+
return this.inner.writeManifest(runId, manifest);
|
|
64
|
+
}
|
|
65
|
+
/** @deprecated — forwarded to the inner writer; a non-null ref is still merged into the accumulated manifest. */
|
|
66
|
+
async writeBulk(type, runId, data) {
|
|
67
|
+
const ref = await this.inner.writeBulk(type, runId, data);
|
|
68
|
+
if (ref)
|
|
69
|
+
this.mergeRef(type, ref);
|
|
70
|
+
return ref;
|
|
71
|
+
}
|
|
72
|
+
/** @deprecated — forwarded to the inner writer; a non-null ref is still merged into the accumulated manifest. */
|
|
73
|
+
async writePerEntry(type, runId, entries) {
|
|
74
|
+
const ref = await this.inner.writePerEntry(type, runId, entries);
|
|
75
|
+
if (ref)
|
|
76
|
+
this.mergeRef(type, ref);
|
|
77
|
+
return ref;
|
|
78
|
+
}
|
|
79
|
+
// ---- Merge rules --------------------------------------------------------
|
|
80
|
+
mergeRef(type, ref) {
|
|
81
|
+
const existing = this.accumulated[type];
|
|
82
|
+
if (!existing) {
|
|
83
|
+
this.accumulated[type] = ref;
|
|
84
|
+
return;
|
|
85
|
+
}
|
|
86
|
+
// Bulk: last-writer-wins. A step that re-emits a bulk artifact (e.g.
|
|
87
|
+
// a rerun of calculate-scores) overwrites the earlier body on disk,
|
|
88
|
+
// so the manifest reflects the latest.
|
|
89
|
+
if (ref.layout === "bulk") {
|
|
90
|
+
this.accumulated[type] = ref;
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
// Per-entry: merge entries by key. Duplicate keys replace (last write
|
|
94
|
+
// wins at that key — matches local/gcs overwrite semantics).
|
|
95
|
+
const merged = new Map();
|
|
96
|
+
for (const e of existing.entries ?? [])
|
|
97
|
+
merged.set(e.key, e);
|
|
98
|
+
for (const e of ref.entries ?? [])
|
|
99
|
+
merged.set(e.key, e);
|
|
100
|
+
const entries = Array.from(merged.values());
|
|
101
|
+
this.accumulated[type] = {
|
|
102
|
+
...existing,
|
|
103
|
+
layout: "per-entry",
|
|
104
|
+
entries,
|
|
105
|
+
entryCount: entries.length,
|
|
106
|
+
bytes: entries.reduce((sum, e) => sum + (e.bytes ?? 0), 0),
|
|
107
|
+
// `store`, `bucket`, `path` stay from the first ref — per-entry
|
|
108
|
+
// paths are descriptor-derived and stable across calls.
|
|
109
|
+
};
|
|
110
|
+
}
|
|
111
|
+
}
|
|
@@ -10,15 +10,24 @@
|
|
|
10
10
|
* - Per-entry: GET {apiBaseUrl}/v1/runs/{runId}/artifacts/{type}/{entryKey}/upload-url
|
|
11
11
|
* - Manifest: GET {apiBaseUrl}/v1/runs/{runId}/artifacts/manifest/upload-url
|
|
12
12
|
*
|
|
13
|
-
*
|
|
13
|
+
* ## W0049 API surface
|
|
14
|
+
*
|
|
15
|
+
* - `emit(type, association, payload)` — canonical single-shot write. Uses
|
|
16
|
+
* the registry to resolve `layout` and the signing endpoint.
|
|
17
|
+
* - `appendNdjson` — NOT IMPLEMENTED. The API Gateway has no batch-signing
|
|
18
|
+
* endpoint yet (W0052), and per-object signing would issue one sign call
|
|
19
|
+
* per row, which blows the Vercel Function budget. Throws
|
|
20
|
+
* `NotImplementedError` so the gap is explicit rather than a silent no-op.
|
|
21
|
+
* - `writeBulk` / `writePerEntry` — @deprecated legacy surface; removal in W0052.
|
|
14
22
|
*
|
|
15
23
|
* Design principles:
|
|
16
|
-
* - P5: Non-blocking — any failure returns null and warns
|
|
24
|
+
* - P5: Non-blocking — any non-structural failure returns null and warns.
|
|
17
25
|
* - Stateless — no client state between calls.
|
|
18
26
|
*
|
|
19
27
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
28
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
|
|
20
29
|
*/
|
|
21
|
-
import { type ArtifactEntry, type ArtifactRef, type ArtifactType, type ArtifactWriter, type RunId, type RunManifest } from "../_vendor/ailf-core/index.d.ts";
|
|
30
|
+
import { type ArtifactEntry, type ArtifactRef, type ArtifactType, type ArtifactWriter, type AssociationValues, type RunId, type RunManifest } from "../_vendor/ailf-core/index.d.ts";
|
|
22
31
|
export interface ApiGatewayArtifactWriterOptions {
|
|
23
32
|
/** Base URL of the API gateway (e.g., "https://ailf-api.sanity.build"). */
|
|
24
33
|
apiBaseUrl: string;
|
|
@@ -30,9 +39,13 @@ export interface ApiGatewayArtifactWriterOptions {
|
|
|
30
39
|
export declare class ApiGatewayArtifactWriter implements ArtifactWriter {
|
|
31
40
|
private readonly options;
|
|
32
41
|
constructor(options: ApiGatewayArtifactWriterOptions);
|
|
42
|
+
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
43
|
+
appendNdjson(): Promise<ArtifactRef | null>;
|
|
44
|
+
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
45
|
+
/** @deprecated Use `emit()` instead. */
|
|
33
46
|
writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
|
|
47
|
+
/** @deprecated Use `emit()` per entry instead. */
|
|
34
48
|
writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
|
|
35
|
-
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
36
49
|
private putJson;
|
|
37
50
|
private putJsonRaw;
|
|
38
51
|
private fetchSignedUrl;
|
|
@@ -10,20 +10,73 @@
|
|
|
10
10
|
* - Per-entry: GET {apiBaseUrl}/v1/runs/{runId}/artifacts/{type}/{entryKey}/upload-url
|
|
11
11
|
* - Manifest: GET {apiBaseUrl}/v1/runs/{runId}/artifacts/manifest/upload-url
|
|
12
12
|
*
|
|
13
|
-
*
|
|
13
|
+
* ## W0049 API surface
|
|
14
|
+
*
|
|
15
|
+
* - `emit(type, association, payload)` — canonical single-shot write. Uses
|
|
16
|
+
* the registry to resolve `layout` and the signing endpoint.
|
|
17
|
+
* - `appendNdjson` — NOT IMPLEMENTED. The API Gateway has no batch-signing
|
|
18
|
+
* endpoint yet (W0052), and per-object signing would issue one sign call
|
|
19
|
+
* per row, which would exceed the Vercel Function budget. Throws
|
|
20
|
+
* `NotImplementedError` so the gap is explicit rather than a silent no-op.
|
|
21
|
+
* - `writeBulk` / `writePerEntry` — @deprecated legacy surface; removal in W0052.
|
|
14
22
|
*
|
|
15
23
|
* Design principles:
|
|
16
|
-
* - P5: Non-blocking — any failure returns null and warns
|
|
24
|
+
* - P5: Non-blocking — any non-structural failure returns null and warns.
|
|
17
25
|
* - Stateless — no client state between calls.
|
|
18
26
|
*
|
|
19
27
|
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
28
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md
|
|
20
29
|
*/
|
|
21
|
-
import { ARTIFACT_REGISTRY, } from "../_vendor/ailf-core/index.js";
|
|
30
|
+
import { ARTIFACT_REGISTRY, NotImplementedError, } from "../_vendor/ailf-core/index.js";
|
|
22
31
|
export class ApiGatewayArtifactWriter {
|
|
23
32
|
options;
|
|
24
33
|
constructor(options) {
|
|
25
34
|
this.options = options;
|
|
26
35
|
}
|
|
36
|
+
// ---- Canonical W0049 API ------------------------------------------------
|
|
37
|
+
async emit(type, association, payload) {
|
|
38
|
+
const descriptor = ARTIFACT_REGISTRY[type];
|
|
39
|
+
const runId = association.run;
|
|
40
|
+
if (!runId) {
|
|
41
|
+
console.warn(` ⚠️ emit("${type}"): association.run is required, skipping`);
|
|
42
|
+
return null;
|
|
43
|
+
}
|
|
44
|
+
if (descriptor.layout === "bulk") {
|
|
45
|
+
const uploadUrlPath = `/v1/runs/${encodeURIComponent(runId)}/artifacts/${encodeURIComponent(type)}/upload-url`;
|
|
46
|
+
return this.putJson(uploadUrlPath, payload, {
|
|
47
|
+
layout: "bulk",
|
|
48
|
+
entryCount: entryCountOf(payload),
|
|
49
|
+
});
|
|
50
|
+
}
|
|
51
|
+
// per-entry
|
|
52
|
+
const entryKey = descriptor.formatEntryKey(association);
|
|
53
|
+
const uploadUrlPath = `/v1/runs/${encodeURIComponent(runId)}/artifacts/${encodeURIComponent(type)}/${encodeURIComponent(entryKey)}/upload-url`;
|
|
54
|
+
const result = await this.putJsonRaw(uploadUrlPath, payload);
|
|
55
|
+
if (!result)
|
|
56
|
+
return null;
|
|
57
|
+
return {
|
|
58
|
+
store: "gcs",
|
|
59
|
+
bucket: result.bucket,
|
|
60
|
+
path: `runs/${runId}/${descriptor.slug}`,
|
|
61
|
+
bytes: result.bytes,
|
|
62
|
+
entryCount: 1,
|
|
63
|
+
layout: "per-entry",
|
|
64
|
+
entries: [{ key: entryKey, bytes: result.bytes, association }],
|
|
65
|
+
};
|
|
66
|
+
}
|
|
67
|
+
async appendNdjson() {
|
|
68
|
+
throw new NotImplementedError("ApiGatewayArtifactWriter.appendNdjson is not supported. " +
|
|
69
|
+
"NDJSON streaming for traces requires the batch signing endpoint " +
|
|
70
|
+
"(W0052). Producers should use GcsArtifactWriter directly when " +
|
|
71
|
+
"running locally, or defer traces emission until the gateway lands " +
|
|
72
|
+
"the batch route.");
|
|
73
|
+
}
|
|
74
|
+
async writeManifest(runId, manifest) {
|
|
75
|
+
const uploadUrlPath = `/v1/runs/${encodeURIComponent(runId)}/artifacts/manifest/upload-url`;
|
|
76
|
+
return this.putJson(uploadUrlPath, manifest, { layout: "bulk" });
|
|
77
|
+
}
|
|
78
|
+
// ---- Deprecated legacy surface (W0052) ----------------------------------
|
|
79
|
+
/** @deprecated Use `emit()` instead. */
|
|
27
80
|
async writeBulk(type, runId, data) {
|
|
28
81
|
const uploadUrlPath = `/v1/runs/${encodeURIComponent(runId)}/artifacts/${encodeURIComponent(type)}/upload-url`;
|
|
29
82
|
return this.putJson(uploadUrlPath, data, {
|
|
@@ -31,6 +84,7 @@ export class ApiGatewayArtifactWriter {
|
|
|
31
84
|
entryCount: entryCountOf(data),
|
|
32
85
|
});
|
|
33
86
|
}
|
|
87
|
+
/** @deprecated Use `emit()` per entry instead. */
|
|
34
88
|
async writePerEntry(type, runId, entries) {
|
|
35
89
|
const descriptor = ARTIFACT_REGISTRY[type];
|
|
36
90
|
if (!descriptor.parseEntryKey) {
|
|
@@ -66,10 +120,7 @@ export class ApiGatewayArtifactWriter {
|
|
|
66
120
|
entries: uploaded,
|
|
67
121
|
};
|
|
68
122
|
}
|
|
69
|
-
|
|
70
|
-
const uploadUrlPath = `/v1/runs/${encodeURIComponent(runId)}/artifacts/manifest/upload-url`;
|
|
71
|
-
return this.putJson(uploadUrlPath, manifest, { layout: "bulk" });
|
|
72
|
-
}
|
|
123
|
+
// ---- Internals ----------------------------------------------------------
|
|
73
124
|
async putJson(uploadUrlPath, data, meta) {
|
|
74
125
|
const result = await this.putJsonRaw(uploadUrlPath, data);
|
|
75
126
|
if (!result)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* emitFileContents() — reads a file from disk and hands its contents to
|
|
3
|
+
* the writer's `emit()`. Shim for the legacy `captureFile(path)` pattern
|
|
4
|
+
* now that the port takes in-memory payloads.
|
|
5
|
+
*
|
|
6
|
+
* Covers the ~13 producer sites that write a file for user-facing output
|
|
7
|
+
* (e.g. promptfoo writes YAML configs and JSON results) and then need to
|
|
8
|
+
* also capture them as artifacts. Reading at emit-time is uniform, keeps
|
|
9
|
+
* the port narrow, and costs nothing at the sizes in play (all descriptors
|
|
10
|
+
* cap ≤10 MB; most are ≤256 KB).
|
|
11
|
+
*
|
|
12
|
+
* Failures are non-blocking per P5 — a missing file or unparseable JSON
|
|
13
|
+
* returns null and logs a warning rather than throwing, so the pipeline keeps moving
|
|
14
|
+
* even if the user-facing file wasn't produced.
|
|
15
|
+
*
|
|
16
|
+
* See `tasks/plan.md § Q2` for the design rationale.
|
|
17
|
+
*/
|
|
18
|
+
import { type ArtifactRef, type ArtifactType, type ArtifactWriter, type AssociationValues } from "../_vendor/ailf-core/index.d.ts";
|
|
19
|
+
/**
|
|
20
|
+
* Read a file from disk, parse it per the descriptor's mime, and emit it.
|
|
21
|
+
*
|
|
22
|
+
* - JSON mime: file contents are `JSON.parse`d into an object before `emit()`.
|
|
23
|
+
* - Markdown / YAML mime: file contents are passed to `emit()` as a string.
|
|
24
|
+
* - NDJSON: not supported by `emit()` — use `appendNdjson()` directly instead.
|
|
25
|
+
*
|
|
26
|
+
* Returns null (with a warning) on any error. Never throws.
|
|
27
|
+
*/
|
|
28
|
+
export declare function emitFileContents<T extends ArtifactType>(writer: ArtifactWriter, type: T, association: AssociationValues, filePath: string): Promise<ArtifactRef | null>;
|