@sanity/ailf 0.1.26 → 0.1.28

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -89,6 +89,8 @@ export interface ResolvedConfig {
89
89
  studioOriginOverride?: string;
90
90
  /** Sanity document filter args */
91
91
  sanityDocumentArgs?: string[];
92
+ /** Report ID that triggered this re-run (flows to provenance.lineage.rerunOf) */
93
+ sourceReportId?: string;
92
94
  /** Disable release-aware auto-scoping (evaluate all tasks even when perspective is set) */
93
95
  noAutoScope: boolean;
94
96
  /** Before option for comparison */
@@ -68,6 +68,7 @@ export declare const PipelineRequestSchema: z.ZodObject<{
68
68
  "origin-only": "origin-only";
69
69
  }>>;
70
70
  source: z.ZodOptional<z.ZodString>;
71
+ sourceReportId: z.ZodOptional<z.ZodString>;
71
72
  taskMode: z.ZodOptional<z.ZodEnum<{
72
73
  "content-lake": "content-lake";
73
74
  yaml: "yaml";
@@ -80,6 +80,7 @@ export const PipelineRequestSchema = z.object({
80
80
  readiness: z.boolean().optional(),
81
81
  searchMode: z.enum(["off", "open", "origin-only"]).optional(),
82
82
  source: z.string().optional(),
83
+ sourceReportId: z.string().optional(),
83
84
  taskMode: z.enum(["content-lake", "yaml", "inline"]).optional(),
84
85
  tasks: z.array(z.string()).optional(),
85
86
  urls: z.array(z.string().url()).optional(),
@@ -438,6 +438,12 @@ export interface PipelineState {
438
438
  evalFingerprint?: string;
439
439
  /** Promptfoo share URLs produced by RunEvalStep, consumed by PublishReportStep */
440
440
  promptfooUrls?: PromptfooUrlEntry[];
441
+ /**
442
+ * Eval modes that were satisfied by a remote cache hit (score-summary.json
443
+ * was restored from the Content Lake). Produced by RunEvalStep, consumed by
444
+ * CalculateScoresStep to skip re-calculation when all required modes are cached.
445
+ */
446
+ remoteCacheHits?: Set<string>;
441
447
  /**
442
448
  * Release auto-scope metadata. Set by FetchDocsStep when a perspective
443
449
  * is active and release impact identifies affected documents.
@@ -1026,6 +1032,28 @@ export interface ReportAutoScope {
1026
1032
  removed: number;
1027
1033
  };
1028
1034
  }
1035
+ /**
1036
+ * Typed relationships between reports. Each field is optional and
1037
+ * independent — populated only when that relationship exists.
1038
+ *
1039
+ * Stored at `provenance.lineage` in the report document.
1040
+ *
1041
+ * @see docs/design-docs/report-store/domain-model.md
1042
+ */
1043
+ export interface ReportLineage {
1044
+ /**
1045
+ * This report was explicitly compared against another report.
1046
+ * Set when auto-compare selects a specific baseline or when the user
1047
+ * requests comparison against a named report.
1048
+ */
1049
+ comparedAgainst?: ReportId;
1050
+ /**
1051
+ * This report was explicitly re-run from another report.
1052
+ * The re-run has the same EvalScope (mode, areas, perspective, etc.)
1053
+ * but measures the current state of docs/models/tasks.
1054
+ */
1055
+ rerunOf?: ReportId;
1056
+ }
1029
1057
  /** Full provenance metadata for an evaluation report */
1030
1058
  export interface ReportProvenance {
1031
1059
  /** Which feature areas were evaluated */
@@ -1049,6 +1077,8 @@ export interface ReportProvenance {
1049
1077
  };
1050
1078
  /** Grader model used for scoring */
1051
1079
  graderModel: string;
1080
+ /** Typed relationships with other reports (re-run, comparison) */
1081
+ lineage?: ReportLineage;
1052
1082
  /** Evaluation mode */
1053
1083
  mode: EvalMode;
1054
1084
  /** Models under evaluation */
@@ -36,7 +36,7 @@ export function createProgressDisplay() {
36
36
  line = `⏳ [queued] Waiting for runner... (${elapsed})`;
37
37
  break;
38
38
  case "running": {
39
- if (job.progress) {
39
+ if (job.progress?.step && job.progress.current && job.progress.total) {
40
40
  const { step, current, total } = job.progress;
41
41
  line = `⏳ [running] Step ${current}/${total}: ${step} (${elapsed})`;
42
42
  }
@@ -24,7 +24,7 @@ import { fileURLToPath } from "url";
24
24
  import { Command } from "commander";
25
25
  import { createAppContext } from "../composition-root.js";
26
26
  import { buildProvenance, } from "../pipeline/provenance.js";
27
- import { generateReportId } from "../report-store.js";
27
+ import { generateReportId, } from "../report-store.js";
28
28
  import { withRetry } from "../sinks/retry.js";
29
29
  const __dirname = dirname(fileURLToPath(import.meta.url));
30
30
  const ROOT = resolve(__dirname, "..", "..");
@@ -154,9 +154,17 @@ async function runPublishCommand(summaryPath, opts) {
154
154
  }
155
155
  }
156
156
  // Auto-compare against most recent comparable baseline
157
- const comparison = opts.dryRun || !store
157
+ const autoCompareResult = opts.dryRun || !store
158
158
  ? null
159
159
  : await store.autoCompare(summary, provenance, now);
160
+ const comparison = autoCompareResult?.comparison ?? null;
161
+ // Record which report we compared against in lineage
162
+ if (autoCompareResult) {
163
+ provenance.lineage = {
164
+ ...provenance.lineage,
165
+ comparedAgainst: autoCompareResult.baselineReportId,
166
+ };
167
+ }
160
168
  const reportId = generateReportId();
161
169
  const report = {
162
170
  comparison: comparison ?? undefined,
@@ -4,10 +4,10 @@
4
4
  * Calls calculateAndWriteScores() from pipeline/calculate-scores.ts with
5
5
  * typed options derived from AppContext. No env bridge needed.
6
6
  */
7
- import type { AppContext, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
7
+ import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
8
8
  export declare class CalculateScoresStep implements PipelineStep {
9
9
  readonly name = "calculate-scores";
10
10
  check(): ValidationIssue[];
11
- execute(ctx: AppContext): Promise<StepResult>;
11
+ execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
12
12
  cacheInputs(ctx: AppContext): string[];
13
13
  }
@@ -5,6 +5,7 @@
5
5
  * typed options derived from AppContext. No env bridge needed.
6
6
  */
7
7
  import { join } from "path";
8
+ import { FULL_MODE_SUBMODES } from "../../_vendor/ailf-shared/index.js";
8
9
  import { getStepInputPaths } from "../../pipeline/cache.js";
9
10
  import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
10
11
  import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
@@ -16,8 +17,29 @@ export class CalculateScoresStep {
16
17
  check() {
17
18
  return [];
18
19
  }
19
- async execute(ctx) {
20
+ async execute(ctx, state) {
20
21
  const start = Date.now();
22
+ // When all required eval modes were satisfied by remote cache hits,
23
+ // score-summary.json was already restored from the cached report.
24
+ // Skip re-calculation — the raw eval-results files don't exist.
25
+ if (state.remoteCacheHits?.size) {
26
+ const requiredModes = ctx.config.mode === "full"
27
+ ? [...FULL_MODE_SUBMODES]
28
+ : [ctx.config.mode];
29
+ const allCached = requiredModes.every((m) => state.remoteCacheHits.has(m));
30
+ if (allCached) {
31
+ // Verify the restored score-summary.json is valid
32
+ const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
33
+ const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
34
+ if (summaryErrors.length === 0) {
35
+ return {
36
+ reason: "Remote cache hit — score-summary.json restored from cached report",
37
+ status: "skipped",
38
+ };
39
+ }
40
+ // If the summary is invalid, fall through to normal calculation
41
+ }
42
+ }
21
43
  const primaryMode = ctx.config.mode === "full"
22
44
  ? "baseline"
23
45
  : ctx.config.mode;
@@ -80,10 +80,19 @@ export class PublishReportStep {
80
80
  const now = new Date().toISOString();
81
81
  const reportId = generateReportId();
82
82
  const durationMs = Date.now() - this.pipelineStart;
83
- // Auto-compare against most recent comparable baseline
84
- const comparison = ctx.reportStore
83
+ // Auto-compare against most recent comparable baseline.
84
+ // Returns the comparison + baseline report ID for lineage tracking.
85
+ const autoCompareResult = ctx.reportStore
85
86
  ? (await ctx.reportStore.autoCompare(summary, provenance, now))
86
87
  : null;
88
+ const comparison = autoCompareResult?.comparison ?? null;
89
+ // Record which report we compared against in lineage
90
+ if (autoCompareResult) {
91
+ provenance.lineage = {
92
+ ...provenance.lineage,
93
+ comparedAgainst: autoCompareResult.baselineReportId,
94
+ };
95
+ }
87
96
  const report = {
88
97
  comparison: comparison ?? undefined,
89
98
  completedAt: now,
@@ -168,6 +177,7 @@ function buildProvenanceInput(summary, ctx, options, autoScope) {
168
177
  rootDir: ctx.config.rootDir,
169
178
  sanityDocumentIds,
170
179
  source,
180
+ sourceReportId: ctx.config.sourceReportId,
171
181
  taskIds,
172
182
  };
173
183
  }
@@ -102,6 +102,10 @@ export class RunEvalStep {
102
102
  ctx.reportStore) {
103
103
  const remoteCacheResult = await checkRemoteCache(evalFingerprint, ctx.reportStore, rootDir);
104
104
  if (remoteCacheResult) {
105
+ // Record the cache hit so CalculateScoresStep can skip when all
106
+ // required eval modes were satisfied from the remote cache.
107
+ state.remoteCacheHits ??= new Set();
108
+ state.remoteCacheHits.add(this.mode);
105
109
  return {
106
110
  durationMs: Date.now() - start,
107
111
  status: "success",
@@ -55,6 +55,7 @@ export function mapRequestToConfig(request, rootDir) {
55
55
  promptfooUrl: undefined,
56
56
  studioOriginOverride: undefined,
57
57
  sanityDocumentArgs: undefined,
58
+ sourceReportId: request.sourceReportId,
58
59
  beforeOption: undefined,
59
60
  repoTasksPath: undefined,
60
61
  callerGit: request.callerGit,
@@ -41,6 +41,8 @@ export interface ProvenanceInput {
41
41
  promptfooUrls?: PromptfooUrlEntry[];
42
42
  /** Path to the package root (for reading models.yaml) */
43
43
  rootDir: string;
44
+ /** Report ID that triggered this re-run (becomes lineage.rerunOf) */
45
+ sourceReportId?: string;
44
46
  /** Sanity document IDs targeted */
45
47
  sanityDocumentIds?: string[];
46
48
  /** Resolved documentation source */
@@ -35,6 +35,10 @@ export function buildProvenance(input) {
35
35
  sha: input.callerGit.sha ?? "unknown",
36
36
  }
37
37
  : detectGitMetadata();
38
+ // Build lineage from explicit relationships
39
+ const lineage = input.sourceReportId
40
+ ? { rerunOf: input.sourceReportId }
41
+ : undefined;
38
42
  return {
39
43
  areas: input.areas,
40
44
  autoScope: input.autoScope,
@@ -42,6 +46,7 @@ export function buildProvenance(input) {
42
46
  evalFingerprint: input.evalFingerprint,
43
47
  git,
44
48
  graderModel: models.grader.id,
49
+ lineage,
45
50
  mode: input.mode,
46
51
  models: models.models.map((m) => ({ id: m.id, label: m.label })),
47
52
  promptfooUrl: input.promptfooUrl,
@@ -16,6 +16,16 @@
16
16
  */
17
17
  import type { SanityClient } from "@sanity/client";
18
18
  import type { ComparisonReport, ISOTimestamp, LineageQuery, Report, ReportId, ReportProvenance, ScoreSummary } from "./pipeline/types.js";
19
+ /**
20
+ * Result of an auto-comparison, bundling the ComparisonReport with the
21
+ * baseline report ID so the caller can record lineage (comparedAgainst).
22
+ */
23
+ export interface AutoCompareResult {
24
+ /** The report ID of the baseline used for comparison */
25
+ baselineReportId: ReportId;
26
+ /** The computed comparison report */
27
+ comparison: ComparisonReport;
28
+ }
19
29
  export interface ReportStoreOptions {
20
30
  /** Override the Sanity client (for testing) */
21
31
  client?: SanityClient;
@@ -33,9 +43,17 @@ export declare class ReportStore {
33
43
  * Auto-compare: find the most recent comparable report and compute
34
44
  * a ComparisonReport using the existing compare() primitive.
35
45
  *
36
- * @returns The comparison report, or null if no baseline found or on error
46
+ * Baseline selection order:
47
+ * 1. Explicit lineage — if `provenance.lineage.rerunOf` is set,
48
+ * compare against that specific report (deterministic re-run comparison)
49
+ * 2. Fuzzy matching — most recent report with same mode + source name
50
+ *
51
+ * Returns the comparison plus the baseline report ID so the caller
52
+ * can record `provenance.lineage.comparedAgainst`.
53
+ *
54
+ * @returns The comparison result with baseline ID, or null if no baseline found
37
55
  */
38
- autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<ComparisonReport | null>;
56
+ autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<AutoCompareResult | null>;
39
57
  /**
40
58
  * Find a report by its evaluation fingerprint (cross-environment cache lookup).
41
59
  *
@@ -41,19 +41,43 @@ export class ReportStore {
41
41
  * Auto-compare: find the most recent comparable report and compute
42
42
  * a ComparisonReport using the existing compare() primitive.
43
43
  *
44
- * @returns The comparison report, or null if no baseline found or on error
44
+ * Baseline selection order:
45
+ * 1. Explicit lineage — if `provenance.lineage.rerunOf` is set,
46
+ * compare against that specific report (deterministic re-run comparison)
47
+ * 2. Fuzzy matching — most recent report with same mode + source name
48
+ *
49
+ * Returns the comparison plus the baseline report ID so the caller
50
+ * can record `provenance.lineage.comparedAgainst`.
51
+ *
52
+ * @returns The comparison result with baseline ID, or null if no baseline found
45
53
  */
46
54
  async autoCompare(currentSummary, provenance, completedAt) {
47
- const baseline = await this.findComparableBaseline({
48
- before: completedAt,
49
- mode: provenance.mode,
50
- source: { name: provenance.source.name },
51
- });
55
+ // 1. Prefer explicit lineage source (deterministic re-run comparison)
56
+ const rerunSourceId = provenance.lineage?.rerunOf;
57
+ let baseline = null;
58
+ if (rerunSourceId) {
59
+ baseline = await this.read(rerunSourceId);
60
+ if (baseline) {
61
+ console.log(` 🔗 Comparing against lineage source: ${rerunSourceId}`);
62
+ }
63
+ else {
64
+ console.warn(` ⚠️ Lineage source ${rerunSourceId} not found, falling back to fuzzy match`);
65
+ }
66
+ }
67
+ // 2. Fall back to fuzzy matching
68
+ if (!baseline) {
69
+ baseline = await this.findComparableBaseline({
70
+ before: completedAt,
71
+ mode: provenance.mode,
72
+ source: { name: provenance.source.name },
73
+ });
74
+ }
52
75
  if (!baseline) {
53
76
  return null;
54
77
  }
55
78
  try {
56
- return compare(baseline.summary, currentSummary);
79
+ const comparison = compare(baseline.summary, currentSummary);
80
+ return { baselineReportId: baseline.id, comparison };
57
81
  }
58
82
  catch (error) {
59
83
  console.warn(` ⚠️ Auto-comparison failed: ${error instanceof Error ? error.message : String(error)}`);
@@ -68,6 +68,8 @@ export interface EvalRequestPayload {
68
68
  requestedAt: string;
69
69
  /** User ID who requested */
70
70
  requestedBy?: string;
71
+ /** Report ID that triggered this re-run (if any) */
72
+ sourceReportId?: string;
71
73
  /** Request status */
72
74
  status: string;
73
75
  /** Publish tag */
@@ -180,6 +180,9 @@ async function dispatchGitHubEval(repo, payload, config) {
180
180
  ...(hasAreas ? { areas: payload.areas } : {}),
181
181
  ...(payload.debug ? { debug: true } : {}),
182
182
  ...(payload.tag ? { publishTag: payload.tag } : {}),
183
+ ...(payload.sourceReportId
184
+ ? { sourceReportId: payload.sourceReportId }
185
+ : {}),
183
186
  },
184
187
  event_type: "external-eval",
185
188
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "0.1.26",
3
+ "version": "0.1.28",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "restricted"