@sanity/ailf 2.7.1 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/artifact-capture/association.d.ts +35 -0
- package/dist/_vendor/ailf-core/artifact-capture/association.js +28 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +173 -0
- package/dist/_vendor/ailf-core/artifact-registry.js +811 -0
- package/dist/_vendor/ailf-core/index.d.ts +3 -1
- package/dist/_vendor/ailf-core/index.js +3 -1
- package/dist/_vendor/ailf-core/ports/artifact-collector.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-writer.d.ts +95 -0
- package/dist/_vendor/ailf-core/ports/artifact-writer.js +51 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +32 -3
- package/dist/_vendor/ailf-core/ports/index.d.ts +3 -3
- package/dist/_vendor/ailf-core/ports/index.js +1 -1
- package/dist/_vendor/ailf-core/schemas/pipeline.d.ts +6 -6
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -0
- package/dist/_vendor/ailf-core/services/index.js +1 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.d.ts +31 -0
- package/dist/_vendor/ailf-core/services/slim-report-summary.js +217 -0
- package/dist/_vendor/ailf-core/types/branded-ids.d.ts +42 -0
- package/dist/_vendor/ailf-core/types/branded-ids.js +21 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +298 -77
- package/dist/_vendor/ailf-core/types/index.js +1 -1
- package/dist/_vendor/ailf-shared/index.d.ts +2 -0
- package/dist/_vendor/ailf-shared/index.js +2 -0
- package/dist/_vendor/ailf-shared/run-context.d.ts +55 -0
- package/dist/_vendor/ailf-shared/run-context.js +17 -0
- package/dist/_vendor/ailf-shared/run-trigger.d.ts +30 -0
- package/dist/_vendor/ailf-shared/run-trigger.js +13 -0
- package/dist/artifact-capture/accumulating-artifact-writer.d.ts +50 -0
- package/dist/artifact-capture/accumulating-artifact-writer.js +111 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +52 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.js +199 -0
- package/dist/artifact-capture/emit-file.d.ts +28 -0
- package/dist/artifact-capture/emit-file.js +56 -0
- package/dist/artifact-capture/fanout-artifact-writer.d.ts +39 -0
- package/dist/artifact-capture/fanout-artifact-writer.js +76 -0
- package/dist/artifact-capture/filesystem-collector.d.ts +22 -4
- package/dist/artifact-capture/filesystem-collector.js +48 -23
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +67 -0
- package/dist/artifact-capture/gcs-artifact-writer.js +343 -0
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +71 -0
- package/dist/artifact-capture/local-fs-artifact-writer.js +273 -0
- package/dist/commands/explain-handler.js +4 -0
- package/dist/commands/pipeline-action.d.ts +5 -0
- package/dist/commands/pipeline-action.js +56 -5
- package/dist/commands/pipeline.d.ts +4 -0
- package/dist/commands/pipeline.js +6 -2
- package/dist/commands/publish.js +7 -3
- package/dist/composition-root.d.ts +14 -11
- package/dist/composition-root.js +90 -31
- package/dist/orchestration/build-step-sequence.js +6 -1
- package/dist/orchestration/pipeline-orchestrator.d.ts +1 -1
- package/dist/orchestration/pipeline-orchestrator.js +41 -30
- package/dist/orchestration/steps/calculate-scores-step.d.ts +1 -1
- package/dist/orchestration/steps/calculate-scores-step.js +50 -10
- package/dist/orchestration/steps/callback-step.d.ts +1 -1
- package/dist/orchestration/steps/callback-step.js +6 -4
- package/dist/orchestration/steps/compare-step.d.ts +1 -1
- package/dist/orchestration/steps/compare-step.js +4 -2
- package/dist/orchestration/steps/discovery-report-step.d.ts +1 -1
- package/dist/orchestration/steps/discovery-report-step.js +4 -1
- package/dist/orchestration/steps/fetch-docs-step.js +9 -15
- package/dist/orchestration/steps/finalize-run-step.d.ts +29 -0
- package/dist/orchestration/steps/finalize-run-step.js +117 -0
- package/dist/orchestration/steps/gap-analysis-step.js +34 -6
- package/dist/orchestration/steps/generate-configs-step.d.ts +1 -1
- package/dist/orchestration/steps/generate-configs-step.js +11 -11
- package/dist/orchestration/steps/publish-report-step.d.ts +1 -1
- package/dist/orchestration/steps/publish-report-step.js +40 -55
- package/dist/orchestration/steps/readiness-step.d.ts +1 -1
- package/dist/orchestration/steps/readiness-step.js +4 -1
- package/dist/orchestration/steps/report-step.d.ts +1 -1
- package/dist/orchestration/steps/report-step.js +6 -3
- package/dist/orchestration/steps/run-eval-step.js +14 -9
- package/dist/pipeline/calculate-scores.js +13 -2
- package/dist/pipeline/compare.d.ts +2 -2
- package/dist/pipeline/emit-eval-results.d.ts +38 -0
- package/dist/pipeline/emit-eval-results.js +100 -0
- package/dist/pipeline/provenance.d.ts +24 -44
- package/dist/pipeline/provenance.js +17 -165
- package/dist/pipeline/report-title.d.ts +2 -2
- package/dist/pipeline/run-context.d.ts +57 -0
- package/dist/pipeline/run-context.js +156 -0
- package/dist/pipeline/upload-test-outputs.d.ts +26 -0
- package/dist/pipeline/upload-test-outputs.js +34 -0
- package/dist/report-store.js +4 -2
- package/package.json +3 -3
- package/dist/_vendor/ailf-core/ports/artifact-uploader.d.ts +0 -35
- package/dist/_vendor/ailf-core/ports/artifact-uploader.js +0 -18
- package/dist/artifact-capture/api-gateway-artifact-uploader.d.ts +0 -41
- package/dist/artifact-capture/api-gateway-artifact-uploader.js +0 -123
- package/dist/artifact-capture/gcs-report-artifact-uploader.d.ts +0 -31
- package/dist/artifact-capture/gcs-report-artifact-uploader.js +0 -66
|
@@ -9,9 +9,11 @@
|
|
|
9
9
|
* Ports & Adapters migration (Phase 0c). The original file is now a
|
|
10
10
|
* re-export barrel that preserves backward compatibility.
|
|
11
11
|
*/
|
|
12
|
-
import type { DocumentRef as _DocumentRef, EvalMode } from "../../ailf-shared/index.d.ts";
|
|
12
|
+
import type { DocumentRef as _DocumentRef, EvalMode, RunContext } from "../../ailf-shared/index.d.ts";
|
|
13
|
+
import type { ArtifactType } from "../artifact-registry.js";
|
|
14
|
+
import type { AssociationValues, RunId } from "./branded-ids.js";
|
|
13
15
|
export type { ActualScoreEntry, ComponentResult, TestResult, UrlMetadata, } from "./scoring-input.js";
|
|
14
|
-
export type { DocumentRef } from "../../ailf-shared/index.d.ts";
|
|
16
|
+
export type { DocumentRef, RunContext, RunTrigger } from "../../ailf-shared/index.d.ts";
|
|
15
17
|
export type { StoredBaseline, StoredReport, StoredRun, StoredTaskResult, StoredTrace, SchemaVersioned, } from "./storage-schema.js";
|
|
16
18
|
export { CURRENT_SCHEMA_VERSION, isSchemaVersioned, migrateDocument, } from "./storage-schema.js";
|
|
17
19
|
export type { AssertionRegistration, FixtureResolverRegistration, ModeBase, ModeRegistration, PluginManifest, PluginRegistry, PresetDefinition, ReportSinkRegistration, RubricTemplateRegistration, } from "./plugin-registry.js";
|
|
@@ -21,8 +23,8 @@ export { evalModeType } from "./eval-mode-config.js";
|
|
|
21
23
|
export type { DependencyEdge, ResolvedFixture, TaskGraph, TaskNode, } from "./task-graph.js";
|
|
22
24
|
export type { VariableDeclaration, VariableEnvelope, VariableProvenance, VariableSource, } from "./variable-envelope.js";
|
|
23
25
|
export type { EvalTrace, ToolCallCategory, ToolCallRecord, TraceEvent, TraceSpan, TraceTokenUsage, } from "./trace.js";
|
|
24
|
-
export type { ArtifactId, Brand, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
25
|
-
export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
26
|
+
export type { ArtifactId, AssociationAxis, AssociationValues, Brand, EntryKey, Err, FixtureId, IdValidationError, NewReportId, Ok, ProviderId, PromptId, Result, ResultId, RubricId, RunFingerprint, RunId, SuiteId, TaskId, TaskSlug, TraceId, } from "./branded-ids.js";
|
|
27
|
+
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
26
28
|
export type { AgentHarnessTaskDefinition, CustomTaskDefinition, GeneralizedAssertionDefinition, GeneralizedDocRef, GeneralizedTaskDefinition, GeneralizedTemplatedAssertion, GeneralizedValueAssertion, IdDocRef, KnowledgeProbeTaskDefinition, LiteracyTaskDefinition, MCPServerTaskDefinition, PathDocRef, PerspectiveDocRef, RubricRef, SlugDocRef, TaskCommonFields, TaskDifficulty, TaskOptions, TaskProviderConfig, TaskStatus, } from "./generalized-task.js";
|
|
27
29
|
type DocumentRef = _DocumentRef;
|
|
28
30
|
/** Aggregated retrieval metrics for a feature area */
|
|
@@ -536,6 +538,19 @@ export interface PromptfooUrlEntry {
|
|
|
536
538
|
* only matter during a single pipeline execution.
|
|
537
539
|
*/
|
|
538
540
|
export interface PipelineState {
|
|
541
|
+
/**
|
|
542
|
+
* Artifact refs produced by upstream steps during the run.
|
|
543
|
+
* Populated incrementally (CalculateScoresStep writes testOutputs; future
|
|
544
|
+
* steps will write renderedPrompts, traces, etc.). Read by FinalizeRunStep
|
|
545
|
+
* to build the `RunManifest.artifacts` catalog.
|
|
546
|
+
*/
|
|
547
|
+
artifactRefs?: Partial<ArtifactManifest>;
|
|
548
|
+
/**
|
|
549
|
+
* The run manifest, finalized and written to `runs/{runId}/manifest.json`.
|
|
550
|
+
* Populated by FinalizeRunStep. Consumed by PublishReportStep to snapshot
|
|
551
|
+
* the `artifacts` slice into `Report.artifactManifest` (D0032).
|
|
552
|
+
*/
|
|
553
|
+
runManifest?: RunManifest;
|
|
539
554
|
/** Report ID generated by PublishReportStep, consumed by CallbackStep + orchestrator job update */
|
|
540
555
|
reportId?: string;
|
|
541
556
|
/** Eval fingerprint computed by RunEvalStep, consumed by PublishReportStep */
|
|
@@ -887,6 +902,142 @@ export interface ScoreSummary {
|
|
|
887
902
|
};
|
|
888
903
|
timestamp: string;
|
|
889
904
|
}
|
|
905
|
+
/**
|
|
906
|
+
* The subset of `ScoreSummary` the `compare()` primitive reads and the
|
|
907
|
+
* only fields consumers of `ComparisonReport.baseline` / `.experiment`
|
|
908
|
+
* access at runtime. Both the full pipeline `ScoreSummary` and the slim
|
|
909
|
+
* `ReportSummary` (W0051) carry these fields unchanged, so stored Reports
|
|
910
|
+
* can participate in auto-compare without re-hydrating prose fields.
|
|
911
|
+
*/
|
|
912
|
+
export type ComparableSummary = Pick<ScoreSummary, "overall" | "perModel" | "scores">;
|
|
913
|
+
/**
|
|
914
|
+
* Slim pointer to a single low-scoring grader judgment. Replaces the full
|
|
915
|
+
* `StoredJudgment` inlined on pre-W0051 reports. The `id` field IS the
|
|
916
|
+
* `graderJudgments` manifest entry key — Studio looks up `reasonPreview`
|
|
917
|
+
* there for list rendering and hydrates the full reason on drill-down.
|
|
918
|
+
*
|
|
919
|
+
* `reasonPreview` is ALSO carried inline as a graceful-degradation fallback
|
|
920
|
+
* for runs whose `artifactManifest` is empty (offline dev, failed GCS
|
|
921
|
+
* upload, cache-skip before manifest aggregation lands). At ~280 chars per
|
|
922
|
+
* judgment × the 50-judgment cap ≈ 14 KB on the Report — tiny against the
|
|
923
|
+
* 500 KB budget. Studio's dispatch prefers the manifest-entry preview when
|
|
924
|
+
* present so GCS drill-down still renders the hydrated-on-demand copy.
|
|
925
|
+
*
|
|
926
|
+
* `graderId` replaces the historical `dimension` field at the slim-shape
|
|
927
|
+
* boundary to match D0033 axis naming; runtime value is identical.
|
|
928
|
+
*/
|
|
929
|
+
export interface SlimJudgmentRef {
|
|
930
|
+
/** Manifest entry key = `formatEntryKey({mode, task, model, grader})` for `graderJudgments`. */
|
|
931
|
+
id: string;
|
|
932
|
+
taskId: string;
|
|
933
|
+
modelId: string;
|
|
934
|
+
/** Rubric dimension name at runtime (what `GraderJudgment.dimension` carries). */
|
|
935
|
+
graderId: string;
|
|
936
|
+
/**
|
|
937
|
+
* Alias of `graderId` — carried for pre-W0051 Studio readers that still
|
|
938
|
+
* access `.dimension` (JudgmentList, judgment-formatting, et al). Remove
|
|
939
|
+
* in Slice 6 when those consumers migrate to `graderId`. Identical
|
|
940
|
+
* runtime value; no additional ambiguity.
|
|
941
|
+
*/
|
|
942
|
+
dimension: string;
|
|
943
|
+
/** Normalized 0–100 score. */
|
|
944
|
+
score: number;
|
|
945
|
+
/**
|
|
946
|
+
* Truncated grader reason (≤280 chars). Inline fallback used when the
|
|
947
|
+
* manifest entry's preview is unavailable; the authoritative full reason
|
|
948
|
+
* lives in the `graderJudgments` external artifact.
|
|
949
|
+
*/
|
|
950
|
+
reasonPreview?: string;
|
|
951
|
+
/**
|
|
952
|
+
* Alias of `reasonPreview` — legacy Studio renderers read `.reason`;
|
|
953
|
+
* expose the same truncated text under both names for the compat window.
|
|
954
|
+
* Remove alongside `dimension` in Slice 6.
|
|
955
|
+
*/
|
|
956
|
+
reason?: string;
|
|
957
|
+
}
|
|
958
|
+
/**
|
|
959
|
+
* Slim failure-mode entry on the Report summary. One per classified
|
|
960
|
+
* `FailureModeType` bucket; `id` is the `failureModes` manifest entry key
|
|
961
|
+
* so drill-down can fetch the full category payload.
|
|
962
|
+
*/
|
|
963
|
+
export interface SlimFailureModeTopTitle {
|
|
964
|
+
/** Manifest entry key = `formatEntryKey({mode, category})` for `failureModes`. */
|
|
965
|
+
id: string;
|
|
966
|
+
category: string;
|
|
967
|
+
severity: "low" | "medium" | "high" | "critical";
|
|
968
|
+
title: string;
|
|
969
|
+
count: number;
|
|
970
|
+
}
|
|
971
|
+
/** Counts + top-N per-category summary on the Report. */
|
|
972
|
+
export interface SlimFailureModesSummary {
|
|
973
|
+
/** Count by FailureModeType-ish id. */
|
|
974
|
+
counts: Record<string, number>;
|
|
975
|
+
/** Top-N categories by count, descending. */
|
|
976
|
+
topTitles: SlimFailureModeTopTitle[];
|
|
977
|
+
/** Total classified judgments across all categories. */
|
|
978
|
+
totalJudgments: number;
|
|
979
|
+
/** Percentage of judgments that landed in a bucket other than unclassified. */
|
|
980
|
+
classificationRate: number;
|
|
981
|
+
}
|
|
982
|
+
/**
|
|
983
|
+
* Slim gap pointer. The `gapReport` artifact is bulk (axes: `{run}`) so
|
|
984
|
+
* there is no per-gap manifest entry to point at — `id` is a stable
|
|
985
|
+
* synthetic composite so the UI can deduplicate and deep-link. Drill-down
|
|
986
|
+
* reads the full `gapReport` artifact and filters by id.
|
|
987
|
+
*/
|
|
988
|
+
export interface SlimRecommendationGap {
|
|
989
|
+
/** Synthetic id: `${area}--${failureMode}`, kebab-safe. */
|
|
990
|
+
id: string;
|
|
991
|
+
area: string;
|
|
992
|
+
title: string;
|
|
993
|
+
/** Priority bucketing for triage UI ordering. */
|
|
994
|
+
priority: number;
|
|
995
|
+
}
|
|
996
|
+
/** Counts + top-3 summary on the Report. */
|
|
997
|
+
export interface SlimRecommendations {
|
|
998
|
+
/** Count of gaps by area. */
|
|
999
|
+
counts: Record<string, number>;
|
|
1000
|
+
/** Top-3 gaps by priority, descending. */
|
|
1001
|
+
top3: SlimRecommendationGap[];
|
|
1002
|
+
/** Total actionable gaps identified (sum of counts). */
|
|
1003
|
+
totalGaps: number;
|
|
1004
|
+
/** Aggregate estimated lift (matches `GapAnalysisReport.totalPotentialLift`). */
|
|
1005
|
+
totalPotentialLift: number;
|
|
1006
|
+
}
|
|
1007
|
+
/**
|
|
1008
|
+
* Slim per-feature agent-behavior summary. Full `searchQueries` and
|
|
1009
|
+
* `docSlugsVisited` arrays move to `traces` NDJSON; the Report keeps only
|
|
1010
|
+
* counts + first-N samples for triage preview. The `firstN` cap is the
|
|
1011
|
+
* producer's choice (we default to 5).
|
|
1012
|
+
*/
|
|
1013
|
+
export interface SlimAgentBehaviorFeature {
|
|
1014
|
+
feature: string;
|
|
1015
|
+
avgDocPagesVisited: number;
|
|
1016
|
+
avgNetworkTimeMs: number;
|
|
1017
|
+
avgSearchesPerformed: number;
|
|
1018
|
+
tasksWithBehaviorData: number;
|
|
1019
|
+
externalDomains: string[];
|
|
1020
|
+
/** Distinct count of unique search queries across tasks in this feature. */
|
|
1021
|
+
searchQueriesCount: number;
|
|
1022
|
+
/** First-N unique search queries (bounded samples). */
|
|
1023
|
+
searchQueriesSample: string[];
|
|
1024
|
+
/** Distinct count of unique doc slugs visited. */
|
|
1025
|
+
docSlugsVisitedCount: number;
|
|
1026
|
+
/** First-N unique doc slugs (bounded samples). */
|
|
1027
|
+
docSlugsVisitedSample: string[];
|
|
1028
|
+
}
|
|
1029
|
+
/**
|
|
1030
|
+
* Slim `summary` field on a published `Report`. Structurally
|
|
1031
|
+
* `Omit<ScoreSummary, slimmed-fields> & slim-replacements` — every
|
|
1032
|
+
* pipeline-produced field survives except the four prose/array fields
|
|
1033
|
+
* W0051 moves to external artifacts.
|
|
1034
|
+
*/
|
|
1035
|
+
export type ReportSummary = Omit<ScoreSummary, "agentBehavior" | "failureModes" | "lowScoringJudgments" | "recommendations"> & {
|
|
1036
|
+
agentBehavior?: SlimAgentBehaviorFeature[];
|
|
1037
|
+
failureModes?: SlimFailureModesSummary;
|
|
1038
|
+
lowScoringJudgments?: SlimJudgmentRef[];
|
|
1039
|
+
recommendations?: SlimRecommendations;
|
|
1040
|
+
};
|
|
890
1041
|
/** Result of a single pipeline step */
|
|
891
1042
|
export type StepResult = {
|
|
892
1043
|
status: "failed";
|
|
@@ -1060,8 +1211,8 @@ export interface ComparisonReport {
|
|
|
1060
1211
|
areas: AreaDelta[];
|
|
1061
1212
|
/** Per-document attribution (when changed docs are known) */
|
|
1062
1213
|
attribution?: AttributionReport;
|
|
1063
|
-
/** The "before" or "control" summary */
|
|
1064
|
-
baseline:
|
|
1214
|
+
/** The "before" or "control" summary (narrowed in W0051 so slim Reports compare) */
|
|
1215
|
+
baseline: ComparableSummary;
|
|
1065
1216
|
/** Aggregate deltas */
|
|
1066
1217
|
deltas: {
|
|
1067
1218
|
/** Overall score delta (experiment.avgScore − baseline.avgScore) */
|
|
@@ -1080,8 +1231,8 @@ export interface ComparisonReport {
|
|
|
1080
1231
|
modelId: string;
|
|
1081
1232
|
}[];
|
|
1082
1233
|
};
|
|
1083
|
-
/** The "after" or "treatment" summary */
|
|
1084
|
-
experiment:
|
|
1234
|
+
/** The "after" or "treatment" summary (narrowed in W0051 so slim Reports compare) */
|
|
1235
|
+
experiment: ComparableSummary;
|
|
1085
1236
|
/** When this comparison was generated */
|
|
1086
1237
|
generatedAt: string;
|
|
1087
1238
|
/** Areas that improved beyond the noise threshold */
|
|
@@ -1162,23 +1313,88 @@ export interface PublishResult {
|
|
|
1162
1313
|
result: SinkResult;
|
|
1163
1314
|
}[];
|
|
1164
1315
|
}
|
|
1165
|
-
/**
|
|
1316
|
+
/**
|
|
1317
|
+
* A single per-entry row in `ArtifactRef.entries`. Carries enough metadata for
|
|
1318
|
+
* Studio list/triage views to render without fetching the external payload.
|
|
1319
|
+
*
|
|
1320
|
+
* D0033/W0049 extensions:
|
|
1321
|
+
* - `association` — the axis values that identify this entry (`{task, model}`
|
|
1322
|
+
* for testOutputs, etc.). Present only on manifests written by
|
|
1323
|
+
* `emit()`; legacy manifests omit it and readers treat absence as `{}`.
|
|
1324
|
+
* - `truncated` — whether the entry payload was capped by the descriptor's
|
|
1325
|
+
* `capBytes`. Readers treat absence as `false` (pre-W0049 manifests were
|
|
1326
|
+
* never truncated because no caps were enforced).
|
|
1327
|
+
* - `preview` — an inline summary produced by the descriptor's
|
|
1328
|
+
* `manifestPreview.extract()`. Typed via the descriptor's preview schema;
|
|
1329
|
+
* omitted when the descriptor has no `manifestPreview`. Wiring lands in
|
|
1330
|
+
* W0051; the field is reserved here so manifests written now parse there.
|
|
1331
|
+
*/
|
|
1332
|
+
export interface ArtifactRefEntry {
|
|
1333
|
+
key: string;
|
|
1334
|
+
bytes: number;
|
|
1335
|
+
association?: AssociationValues;
|
|
1336
|
+
truncated?: boolean;
|
|
1337
|
+
preview?: unknown;
|
|
1338
|
+
}
|
|
1339
|
+
/**
|
|
1340
|
+
* Reference to an artifact in external object storage.
|
|
1341
|
+
*
|
|
1342
|
+
* `store` discriminates the backend: `"gcs"` uses `bucket` as the bucket
|
|
1343
|
+
* name (D0032); `"local"` uses `bucket` as the absolute rootDir path
|
|
1344
|
+
* under which `path` resolves to a file on disk (D0033 / W0050 M4).
|
|
1345
|
+
* Consumers (Studio retrieval, contract tests) branch on `store` only
|
|
1346
|
+
* when constructing the fetch URL — the `path` is store-relative and
|
|
1347
|
+
* identical across backends for the same logical artifact.
|
|
1348
|
+
*
|
|
1349
|
+
* `layout` determines the on-disk shape:
|
|
1350
|
+
* - `"bulk"` — a single object at `path`. `entries` is absent.
|
|
1351
|
+
* - `"per-entry"` — `path` is a directory prefix. Each entry is a
|
|
1352
|
+
* separate object at `{path}/{sanitizedKey}.json`. `entries` inlines
|
|
1353
|
+
* the catalog so consumers can render drill-down states without a
|
|
1354
|
+
* second list call.
|
|
1355
|
+
*
|
|
1356
|
+
* D0033/W0049 extensions (optional — legacy manifests parse without them):
|
|
1357
|
+
* - `truncated` on the bulk row indicates the single-object body was capped.
|
|
1358
|
+
* - `preview` on the bulk row carries a descriptor-typed summary for list
|
|
1359
|
+
* views; wiring lands in W0051.
|
|
1360
|
+
*/
|
|
1166
1361
|
export interface ArtifactRef {
|
|
1167
|
-
store: "gcs";
|
|
1362
|
+
store: "gcs" | "local";
|
|
1363
|
+
/**
|
|
1364
|
+
* GCS bucket name for `store: "gcs"`; absolute rootDir path for
|
|
1365
|
+
* `store: "local"`. Kept as a single field so callers iterating
|
|
1366
|
+
* manifest entries don't need to branch on `store` just to read the
|
|
1367
|
+
* storage container.
|
|
1368
|
+
*/
|
|
1168
1369
|
bucket: string;
|
|
1169
1370
|
path: string;
|
|
1170
1371
|
bytes?: number;
|
|
1171
1372
|
entryCount?: number;
|
|
1373
|
+
layout: "bulk" | "per-entry";
|
|
1374
|
+
entries?: ArtifactRefEntry[];
|
|
1375
|
+
truncated?: boolean;
|
|
1376
|
+
preview?: unknown;
|
|
1172
1377
|
}
|
|
1378
|
+
/**
|
|
1379
|
+
* Catalog of artifact refs produced by a single pipeline run.
|
|
1380
|
+
*
|
|
1381
|
+
* Lives on `RunManifest.artifacts` (source of truth in GCS) and is
|
|
1382
|
+
* snapshotted onto `Report.artifactManifest` at publish time.
|
|
1383
|
+
*
|
|
1384
|
+
* Derived from `ArtifactType` so adding a descriptor to the registry
|
|
1385
|
+
* automatically admits it to the manifest catalog — drift between the
|
|
1386
|
+
* two becomes a compile error (W0049 review finding C1).
|
|
1387
|
+
*/
|
|
1388
|
+
export type ArtifactManifest = Partial<Record<ArtifactType, ArtifactRef>>;
|
|
1173
1389
|
/** A published evaluation report — the atomic unit of the report store */
|
|
1174
1390
|
export interface Report {
|
|
1175
|
-
/**
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1391
|
+
/**
|
|
1392
|
+
* Snapshot of the run manifest's `artifacts` slice at publish time (D0032).
|
|
1393
|
+
* The source of truth lives in `gs://…/runs/{runId}/manifest.json`; this
|
|
1394
|
+
* field denormalizes enough information for Studio to render drill-down
|
|
1395
|
+
* states without an extra manifest fetch.
|
|
1396
|
+
*/
|
|
1397
|
+
artifactManifest?: ArtifactManifest;
|
|
1182
1398
|
/** Optional auto-comparison against the most recent comparable report */
|
|
1183
1399
|
comparison?: ComparisonReport;
|
|
1184
1400
|
/** When the evaluation completed */
|
|
@@ -1189,8 +1405,14 @@ export interface Report {
|
|
|
1189
1405
|
id: ReportId;
|
|
1190
1406
|
/** What produced this report */
|
|
1191
1407
|
provenance: ReportProvenance;
|
|
1192
|
-
/**
|
|
1193
|
-
|
|
1408
|
+
/**
|
|
1409
|
+
* The slim published summary. Inlined prose fields (grader reasons, full
|
|
1410
|
+
* failure-mode text, gap prose, agent-behavior arrays) moved to external
|
|
1411
|
+
* artifacts in W0051; see `ReportSummary` for the retained shape and
|
|
1412
|
+
* `docs/decisions/D0033-unified-run-anchored-artifact-capture.md` § M7 for
|
|
1413
|
+
* the full migration table.
|
|
1414
|
+
*/
|
|
1415
|
+
summary: ReportSummary;
|
|
1194
1416
|
/** Optional human-supplied label */
|
|
1195
1417
|
tag?: string;
|
|
1196
1418
|
/** Auto-generated descriptive title for discoverability and sharing */
|
|
@@ -1239,76 +1461,75 @@ export interface ReportLineage {
|
|
|
1239
1461
|
*/
|
|
1240
1462
|
rerunOf?: ReportId;
|
|
1241
1463
|
}
|
|
1242
|
-
/**
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1464
|
+
/**
|
|
1465
|
+
* Full provenance metadata for an evaluation report.
|
|
1466
|
+
*
|
|
1467
|
+
* Inherits the 9 run-description fields (mode, areas, taskIds, models,
|
|
1468
|
+
* graderModel, source, evalFingerprint, trigger, git) from `RunContext`.
|
|
1469
|
+
* Adding a field to `RunContext` makes it available here automatically —
|
|
1470
|
+
* the structural extension is the drift-prevention mechanism described in
|
|
1471
|
+
* D0032 § "Drift Prevention".
|
|
1472
|
+
*/
|
|
1473
|
+
export interface ReportProvenance extends RunContext {
|
|
1246
1474
|
/** Release auto-scope metadata (when perspective evaluation was scoped to affected tasks) */
|
|
1247
1475
|
autoScope?: ReportAutoScope;
|
|
1248
1476
|
/** Content hash of the documentation context at eval time */
|
|
1249
1477
|
contextHash?: string;
|
|
1250
|
-
/**
|
|
1251
|
-
* Evaluation fingerprint — SHA-256 of all inputs that affect eval output.
|
|
1252
|
-
* Used for cross-environment cache lookup (CI → Content Lake).
|
|
1253
|
-
* @see docs/design-docs/content-lake-eval-caching.md
|
|
1254
|
-
*/
|
|
1255
|
-
evalFingerprint?: string;
|
|
1256
|
-
/** Git metadata (when run from CI) */
|
|
1257
|
-
git?: {
|
|
1258
|
-
branch: string;
|
|
1259
|
-
prNumber?: number;
|
|
1260
|
-
repo: string;
|
|
1261
|
-
sha: string;
|
|
1262
|
-
};
|
|
1263
|
-
/** Grader model used for scoring */
|
|
1264
|
-
graderModel: string;
|
|
1265
1478
|
/** Typed relationships with other reports (re-run, comparison) */
|
|
1266
1479
|
lineage?: ReportLineage;
|
|
1267
|
-
/** Evaluation mode */
|
|
1268
|
-
mode: EvalMode;
|
|
1269
|
-
/** Models under evaluation */
|
|
1270
|
-
models: {
|
|
1271
|
-
id: string;
|
|
1272
|
-
label: string;
|
|
1273
|
-
}[];
|
|
1274
1480
|
/** @deprecated Use `promptfooUrls` — kept for backward compatibility */
|
|
1275
1481
|
promptfooUrl?: string;
|
|
1276
1482
|
/** Per-mode Promptfoo share URLs (one per sub-eval that produced a shareable link) */
|
|
1277
1483
|
promptfooUrls?: PromptfooUrlEntry[];
|
|
1278
|
-
/**
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
projectId?: string;
|
|
1285
|
-
};
|
|
1484
|
+
/**
|
|
1485
|
+
* Identity of the pipeline run that produced this report. Links the
|
|
1486
|
+
* Content Lake document back to the GCS run manifest and its artifacts.
|
|
1487
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
1488
|
+
*/
|
|
1489
|
+
runId: RunId;
|
|
1286
1490
|
/** Sanity document IDs that were targeted (if using --sanity-document) */
|
|
1287
1491
|
targetDocuments?: string[];
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1492
|
+
}
|
|
1493
|
+
/**
|
|
1494
|
+
* A run's manifest in GCS (`runs/{runId}/manifest.json`). Source of truth
|
|
1495
|
+
* for artifact locations, run identity, and outcome. Reports snapshot the
|
|
1496
|
+
* `artifacts` slice into `Report.artifactManifest` at publish time.
|
|
1497
|
+
*
|
|
1498
|
+
* Written once by `FinalizeRunStep`; `reportIds` may be appended by
|
|
1499
|
+
* `PublishReportStep` via strongly-consistent GCS overwrite.
|
|
1500
|
+
*/
|
|
1501
|
+
export interface RunManifest {
|
|
1502
|
+
/** Schema version — bumped when the manifest shape changes */
|
|
1503
|
+
version: 1;
|
|
1504
|
+
/** Identity for this pipeline run */
|
|
1505
|
+
runId: RunId;
|
|
1506
|
+
/** When the manifest was written (pipeline finalization time) */
|
|
1507
|
+
createdAt: ISOTimestamp;
|
|
1508
|
+
/** Total pipeline duration */
|
|
1509
|
+
durationMs: number;
|
|
1510
|
+
/** Outcome of the run */
|
|
1511
|
+
status: "completed" | "failed" | "partial";
|
|
1512
|
+
/** Failure classification when status is not "completed" */
|
|
1513
|
+
failureReason?: PipelineFailureReason;
|
|
1514
|
+
/** What ran — shared shape with ReportProvenance */
|
|
1515
|
+
context: RunContext;
|
|
1516
|
+
/** Run-level aggregates (self-describing without a report) */
|
|
1517
|
+
outcomes?: {
|
|
1518
|
+
testSummary?: TestSummary;
|
|
1519
|
+
usage?: PipelineUsage;
|
|
1520
|
+
cache?: {
|
|
1521
|
+
hits: number;
|
|
1522
|
+
misses: number;
|
|
1523
|
+
skipped: number;
|
|
1524
|
+
};
|
|
1525
|
+
};
|
|
1526
|
+
/** Reports published from this run (0..N). Authoritative link is Report.provenance.runId. */
|
|
1527
|
+
reportIds?: ReportId[];
|
|
1528
|
+
/** Promptfoo share URLs collected during the run */
|
|
1529
|
+
promptfooUrls?: PromptfooUrlEntry[];
|
|
1530
|
+
/** Artifact catalog — per-type refs with inline per-entry indexes */
|
|
1531
|
+
artifacts: ArtifactManifest;
|
|
1532
|
+
}
|
|
1312
1533
|
/** Health check result for a sink */
|
|
1313
1534
|
export type SinkHealthStatus = {
|
|
1314
1535
|
healthy: false;
|
|
@@ -16,7 +16,7 @@ export { InMemoryPluginRegistry } from "./plugin-registry.js";
|
|
|
16
16
|
// version is used internally by LiteracyModeConfig. If consumers need
|
|
17
17
|
// the mode-specific version, they import from "./eval-mode-config.js".
|
|
18
18
|
export { evalModeType } from "./eval-mode-config.js";
|
|
19
|
-
export { err, fixtureId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
19
|
+
export { err, fixtureId, generateRunId, ok, providerId, resultId, runId, suiteId, taskId, traceId, } from "./branded-ids.js";
|
|
20
20
|
// ---------------------------------------------------------------------------
|
|
21
21
|
// Comparison (Approach 2: structured comparison output)
|
|
22
22
|
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RunContext — the set of fields that describe an AILF pipeline run.
|
|
3
|
+
*
|
|
4
|
+
* This is the single source of truth for run-description data. Both
|
|
5
|
+
* `RunManifest.context` (in GCS, written by FinalizeRunStep) and
|
|
6
|
+
* `ReportProvenance` (in Content Lake, built by PublishReportStep)
|
|
7
|
+
* carry this shape. `ReportProvenance extends RunContext` to
|
|
8
|
+
* structurally enforce parity — adding a field here becomes a
|
|
9
|
+
* compile-time failure until every consumer threads it through.
|
|
10
|
+
*
|
|
11
|
+
* Fields are alphabetized to match the surrounding codebase convention
|
|
12
|
+
* (see `ReportProvenance` in `@sanity/ailf-core`).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
15
|
+
* @see docs/design-docs/run-artifact-store.md (§ Drift Prevention)
|
|
16
|
+
*/
|
|
17
|
+
import type { EvalMode } from "./eval-modes.js";
|
|
18
|
+
import type { RunTrigger } from "./run-trigger.js";
|
|
19
|
+
export interface RunContext {
|
|
20
|
+
/** Which feature areas were evaluated */
|
|
21
|
+
areas: string[];
|
|
22
|
+
/**
|
|
23
|
+
* Evaluation fingerprint — SHA-256 of all inputs that affect eval output.
|
|
24
|
+
* Used for cross-environment cache lookup (CI → Content Lake).
|
|
25
|
+
*/
|
|
26
|
+
evalFingerprint?: string;
|
|
27
|
+
/** Git metadata (when run from CI) */
|
|
28
|
+
git?: {
|
|
29
|
+
branch: string;
|
|
30
|
+
prNumber?: number;
|
|
31
|
+
repo: string;
|
|
32
|
+
sha: string;
|
|
33
|
+
};
|
|
34
|
+
/** Grader model used for scoring */
|
|
35
|
+
graderModel: string;
|
|
36
|
+
/** Evaluation mode */
|
|
37
|
+
mode: EvalMode;
|
|
38
|
+
/** Models under evaluation */
|
|
39
|
+
models: {
|
|
40
|
+
id: string;
|
|
41
|
+
label: string;
|
|
42
|
+
}[];
|
|
43
|
+
/** Documentation source configuration */
|
|
44
|
+
source: {
|
|
45
|
+
baseUrl: string;
|
|
46
|
+
dataset?: string;
|
|
47
|
+
name: string;
|
|
48
|
+
perspective?: string;
|
|
49
|
+
projectId?: string;
|
|
50
|
+
};
|
|
51
|
+
/** Specific task IDs evaluated when scoped to a subset */
|
|
52
|
+
taskIds?: string[];
|
|
53
|
+
/** What initiated this run */
|
|
54
|
+
trigger: RunTrigger;
|
|
55
|
+
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RunContext — the set of fields that describe an AILF pipeline run.
|
|
3
|
+
*
|
|
4
|
+
* This is the single source of truth for run-description data. Both
|
|
5
|
+
* `RunManifest.context` (in GCS, written by FinalizeRunStep) and
|
|
6
|
+
* `ReportProvenance` (in Content Lake, built by PublishReportStep)
|
|
7
|
+
* carry this shape. `ReportProvenance extends RunContext` to
|
|
8
|
+
* structurally enforce parity — adding a field here becomes a
|
|
9
|
+
* compile-time failure until every consumer threads it through.
|
|
10
|
+
*
|
|
11
|
+
* Fields are alphabetized to match the surrounding codebase convention
|
|
12
|
+
* (see `ReportProvenance` in `@sanity/ailf-core`).
|
|
13
|
+
*
|
|
14
|
+
* @see docs/decisions/D0032-run-anchored-artifact-store.md
|
|
15
|
+
* @see docs/design-docs/run-artifact-store.md (§ Drift Prevention)
|
|
16
|
+
*/
|
|
17
|
+
export {};
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RunTrigger — what initiated a pipeline run.
|
|
3
|
+
*
|
|
4
|
+
* Lives in shared so it can be referenced from RunContext (the single
|
|
5
|
+
* source of truth for run-description fields) and consumed by both
|
|
6
|
+
* RunManifest (GCS) and ReportProvenance (Content Lake) without creating
|
|
7
|
+
* a core → core cycle.
|
|
8
|
+
*
|
|
9
|
+
* The `ci.runId` field is the external workflow run identifier (e.g.
|
|
10
|
+
* GitHub Actions run ID). It is unrelated to the pipeline-level `RunId`
|
|
11
|
+
* brand in `@sanity/ailf-core`, which identifies an AILF pipeline run.
|
|
12
|
+
*/
|
|
13
|
+
export type RunTrigger = {
|
|
14
|
+
type: "ci";
|
|
15
|
+
runId: string;
|
|
16
|
+
workflow: string;
|
|
17
|
+
} | {
|
|
18
|
+
type: "cross-repo";
|
|
19
|
+
callerRef?: string;
|
|
20
|
+
callerRepo: string;
|
|
21
|
+
} | {
|
|
22
|
+
type: "manual";
|
|
23
|
+
} | {
|
|
24
|
+
type: "scheduled";
|
|
25
|
+
schedule: string;
|
|
26
|
+
} | {
|
|
27
|
+
type: "webhook";
|
|
28
|
+
documentId?: string;
|
|
29
|
+
source: string;
|
|
30
|
+
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RunTrigger — what initiated a pipeline run.
|
|
3
|
+
*
|
|
4
|
+
* Lives in shared so it can be referenced from RunContext (the single
|
|
5
|
+
* source of truth for run-description fields) and consumed by both
|
|
6
|
+
* RunManifest (GCS) and ReportProvenance (Content Lake) without creating
|
|
7
|
+
* a core → core cycle.
|
|
8
|
+
*
|
|
9
|
+
* The `ci.runId` field is the external workflow run identifier (e.g.
|
|
10
|
+
* GitHub Actions run ID). It is unrelated to the pipeline-level `RunId`
|
|
11
|
+
* brand in `@sanity/ailf-core`, which identifies an AILF pipeline run.
|
|
12
|
+
*/
|
|
13
|
+
export {};
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* accumulating-artifact-writer.ts
|
|
3
|
+
*
|
|
4
|
+
* Decorator that wraps any `ArtifactWriter` and accumulates every
|
|
5
|
+
* successful `emit()` / `appendNdjson()` return into a run-scoped
|
|
6
|
+
* manifest slice. FinalizeRunStep reads the accumulator at the end of
|
|
7
|
+
* a pipeline and writes a `runs/{runId}/manifest.json` populated with
|
|
8
|
+
* one entry per produced artifact type.
|
|
9
|
+
*
|
|
10
|
+
* Before W0051 revisit: only `calculate-scores-step` registered its
|
|
11
|
+
* ref (for `testOutputs`) into `state.artifactRefs`; every other
|
|
12
|
+
* producer discarded the returned ref. The result was empty
|
|
13
|
+
* `Report.artifactManifest` fields and no per-entry preview lookup
|
|
14
|
+
* for Studio hooks. Wrapping the writer at the composition-root level
|
|
15
|
+
* closes that gap without per-producer bookkeeping.
|
|
16
|
+
*
|
|
17
|
+
* Merging rules (per type):
|
|
18
|
+
* - `bulk`: last-writer-wins. A pipeline that emits the same bulk
|
|
19
|
+
* artifact twice overwrites — matches GCS semantics.
|
|
20
|
+
* - `per-entry`: entries accumulate into a keyed map. A later emit
|
|
21
|
+
* at the same `entries[].key` replaces the earlier one.
|
|
22
|
+
*
|
|
23
|
+
* The decorator holds no disk state; the `_resetAccumulated()` hook is
|
|
24
|
+
* for unit tests that rerun emit sequences within a single writer.
|
|
25
|
+
*
|
|
26
|
+
* @see docs/decisions/D0033-unified-run-anchored-artifact-capture.md (§ M5)
|
|
27
|
+
*/
|
|
28
|
+
import type { ArtifactEntry, ArtifactManifest, ArtifactRef, ArtifactType, ArtifactWriter, AssociationValues, RunId, RunManifest } from "../_vendor/ailf-core/index.d.ts";
|
|
29
|
+
export declare class AccumulatingArtifactWriter implements ArtifactWriter {
|
|
30
|
+
/**
|
|
31
|
+
* Exposed so composition-root tests can assert on the underlying backend
|
|
32
|
+
* (LocalFilesystemArtifactWriter, FanoutArtifactWriter, etc.) without
|
|
33
|
+
* plumbing a separate accessor. Treat as read-only.
|
|
34
|
+
*/
|
|
35
|
+
readonly inner: ArtifactWriter;
|
|
36
|
+
private readonly accumulated;
|
|
37
|
+
constructor(inner: ArtifactWriter);
|
|
38
|
+
/** Snapshot of every ref produced thus far, keyed by artifact type. */
|
|
39
|
+
getAccumulatedArtifactRefs(): ArtifactManifest;
|
|
40
|
+
/** Test-only. Clears accumulated refs without touching the inner writer. */
|
|
41
|
+
_resetAccumulated(): void;
|
|
42
|
+
emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
|
|
43
|
+
appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
|
|
44
|
+
writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;
|
|
45
|
+
/** @deprecated — forwarded to the inner writer without accumulation. */
|
|
46
|
+
writeBulk(type: ArtifactType, runId: RunId, data: unknown): Promise<ArtifactRef | null>;
|
|
47
|
+
/** @deprecated — forwarded to the inner writer without accumulation. */
|
|
48
|
+
writePerEntry(type: ArtifactType, runId: RunId, entries: readonly ArtifactEntry[]): Promise<ArtifactRef | null>;
|
|
49
|
+
private mergeRef;
|
|
50
|
+
}
|