@sanity/ailf 0.1.32 → 0.1.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -177,7 +177,7 @@ export interface AppContext {
177
177
  */
178
178
  export interface ReportStorePort {
179
179
  /** Auto-compare against the most recent comparable baseline */
180
- autoCompare(currentSummary: unknown, provenance: unknown, completedAt: unknown): Promise<unknown>;
180
+ autoCompare(currentSummary: unknown, provenance: unknown, completedAt: unknown, scopedAreas?: Set<string>): Promise<unknown>;
181
181
  /** Find a report by its eval fingerprint (for cache lookup) */
182
182
  findByFingerprint(fingerprint: string): Promise<null | unknown>;
183
183
  /** Find the most recent comparable baseline for auto-comparison */
@@ -853,7 +853,7 @@ export interface AttributionReport {
853
853
  untrackedDocs: string[];
854
854
  }
855
855
  /** Classification of a feature area's score change */
856
- export type ChangeClass = "improved" | "regressed" | "unchanged";
856
+ export type ChangeClass = "improved" | "not-evaluated" | "regressed" | "unchanged";
857
857
  /** Options for the compare function */
858
858
  export interface CompareOptions {
859
859
  /** Grader consistency data — if provided, used to compute empirical noise threshold */
@@ -927,6 +927,12 @@ export interface ComparisonReport {
927
927
  noiseThreshold: number;
928
928
  /** Whether the noise threshold was derived from empirical grader consistency data */
929
929
  noiseThresholdEmpirical: boolean;
930
+ /**
931
+ * Areas present in only one summary (not evaluated in both runs).
932
+ * These are excluded from improved/regressed/unchanged classification
933
+ * because comparing against a missing score is meaningless.
934
+ */
935
+ notEvaluated: string[];
930
936
  /** Areas that regressed beyond the noise threshold */
931
937
  regressed: string[];
932
938
  /** Areas within the noise threshold */
@@ -101,6 +101,13 @@ export async function buildRemoteRequest(options) {
101
101
  raw.discoveryReport = true;
102
102
  if (config.noRemoteCache)
103
103
  raw.noRemoteCache = true;
104
+ // Caller git metadata — auto-detect from CI environment variables.
105
+ // When running via `ailf pipeline --remote` in a GitHub Actions workflow,
106
+ // the GITHUB_* env vars identify the *calling* repo (not the AILF core
107
+ // repo). This ensures report provenance attributes to the right repo.
108
+ const callerGit = detectCallerGit();
109
+ if (callerGit)
110
+ raw.callerGit = callerGit;
104
111
  // 4. Validate the assembled request
105
112
  const parsed = PipelineRequestSchema.parse(raw);
106
113
  return { request: parsed, taskCount: tasks.length };
@@ -180,3 +187,32 @@ function buildFilterOptions(config) {
180
187
  return undefined;
181
188
  return { areas, taskIds, tags };
182
189
  }
190
+ /**
191
+ * Auto-detect caller git metadata from GitHub Actions environment variables.
192
+ *
193
+ * When the CLI runs in a calling repo's CI (via `npx @sanity/ailf pipeline
194
+ * --remote`), the GITHUB_* env vars reflect that repo — not the AILF core
195
+ * repo. We capture them here so the API can carry them through to report
196
+ * provenance.
197
+ *
198
+ * Returns undefined when not running in GitHub Actions.
199
+ */
200
+ function detectCallerGit() {
201
+ const repo = process.env.GITHUB_REPOSITORY;
202
+ if (!repo)
203
+ return undefined;
204
+ const sha = process.env.GITHUB_SHA;
205
+ const ref = process.env.GITHUB_REF ?? "";
206
+ // For PRs, GITHUB_HEAD_REF is the source branch name (e.g., "fix/docs").
207
+ // For pushes, GITHUB_REF_NAME is the branch (e.g., "main").
208
+ const branch = process.env.GITHUB_HEAD_REF || process.env.GITHUB_REF_NAME;
209
+ // Extract PR number from GITHUB_REF (refs/pull/123/merge)
210
+ const prMatch = ref.match(/^refs\/pull\/(\d+)\//);
211
+ const prNumber = prMatch ? parseInt(prMatch[1], 10) : undefined;
212
+ return {
213
+ repo,
214
+ ...(sha ? { sha } : {}),
215
+ ...(branch ? { branch } : {}),
216
+ ...(prNumber ? { prNumber } : {}),
217
+ };
218
+ }
@@ -82,8 +82,16 @@ export class PublishReportStep {
82
82
  const durationMs = Date.now() - this.pipelineStart;
83
83
  // Auto-compare against most recent comparable baseline.
84
84
  // Returns the comparison + baseline report ID for lineage tracking.
85
+ //
86
+ // When release auto-scope is active, the current experiment only covers
87
+ // a subset of areas. We pass the evaluated area set so autoCompare can
88
+ // scope the baseline to match — preventing mismatched areas from
89
+ // polluting the overall delta.
90
+ const evaluatedAreas = state.releaseAutoScope
91
+ ? new Set(summary.scores.map((s) => s.feature))
92
+ : undefined;
85
93
  const autoCompareResult = ctx.reportStore
86
- ? (await ctx.reportStore.autoCompare(summary, provenance, now))
94
+ ? (await ctx.reportStore.autoCompare(summary, provenance, now, evaluatedAreas))
87
95
  : null;
88
96
  const comparison = autoCompareResult?.comparison ?? null;
89
97
  // Record which report we compared against in lineage
@@ -51,11 +51,14 @@ export function compare(baseline, experiment, options) {
51
51
  // Identify mismatched areas
52
52
  const onlyInBaseline = [...baselineAreas].filter((a) => !experimentAreas.has(a));
53
53
  const onlyInExperiment = [...experimentAreas].filter((a) => !baselineAreas.has(a));
54
- // Build per-area deltas
54
+ // Build per-area deltas.
55
+ // Areas present in only one summary get change: "not-evaluated" — comparing
56
+ // against a missing score is meaningless (it would produce false ±100 deltas).
57
+ const mismatchedSet = new Set([...onlyInBaseline, ...onlyInExperiment]);
55
58
  const areas = [...allAreas]
56
59
  .sort()
57
- .map((area) => buildAreaDelta(area, findScore(baseline.scores, area), findScore(experiment.scores, area), threshold));
58
- // Classify areas
60
+ .map((area) => buildAreaDelta(area, findScore(baseline.scores, area), findScore(experiment.scores, area), threshold, mismatchedSet.has(area)));
61
+ // Classify areas — mismatched areas are excluded from all three buckets
59
62
  const improved = areas
60
63
  .filter((a) => a.change === "improved")
61
64
  .map((a) => a.area);
@@ -65,6 +68,9 @@ export function compare(baseline, experiment, options) {
65
68
  const unchanged = areas
66
69
  .filter((a) => a.change === "unchanged")
67
70
  .map((a) => a.area);
71
+ const notEvaluated = areas
72
+ .filter((a) => a.change === "not-evaluated")
73
+ .map((a) => a.area);
68
74
  // Per-area deltas as a record
69
75
  const perArea = {};
70
76
  for (const a of areas) {
@@ -128,12 +134,13 @@ export function compare(baseline, experiment, options) {
128
134
  },
129
135
  noiseThreshold: threshold,
130
136
  noiseThresholdEmpirical: empirical,
137
+ notEvaluated,
131
138
  regressed,
132
139
  unchanged,
133
140
  };
134
141
  }
135
142
  /** Build an AreaDelta from baseline and experiment scores for a single area */
136
- function buildAreaDelta(area, baselineScore, experimentScore, threshold) {
143
+ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMismatched = false) {
137
144
  const b = baselineScore;
138
145
  const e = experimentScore;
139
146
  const bTotal = b?.totalScore ?? 0;
@@ -174,7 +181,7 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold) {
174
181
  area,
175
182
  baseline: bTotal,
176
183
  ceilingDelta: eCeiling - bCeiling,
177
- change: classifyChange(delta, threshold),
184
+ change: isMismatched ? "not-evaluated" : classifyChange(delta, threshold),
178
185
  delta,
179
186
  dimensions: {
180
187
  codeCorrectness: {
@@ -295,7 +295,7 @@ function generateComment(summary, options = {}) {
295
295
  if (hasActualDeltas) {
296
296
  lines.push("| Feature | Baseline | Current | Delta | Actual Δ | Ret. Gap Δ | Infra Δ |");
297
297
  lines.push("|---------|----------|---------|-------|----------|------------|---------|");
298
- for (const a of report.areas) {
298
+ for (const a of report.areas.filter((a) => a.change !== "not-evaluated")) {
299
299
  const icon = a.change === "improved"
300
300
  ? "📈"
301
301
  : a.change === "regressed"
@@ -313,7 +313,7 @@ function generateComment(summary, options = {}) {
313
313
  else {
314
314
  lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
315
315
  lines.push("|---------|----------|---------|-------|------|------|------|");
316
- for (const a of report.areas) {
316
+ for (const a of report.areas.filter((a) => a.change !== "not-evaluated")) {
317
317
  const icon = a.change === "improved"
318
318
  ? "📈"
319
319
  : a.change === "regressed"
@@ -334,6 +334,9 @@ function generateComment(summary, options = {}) {
334
334
  if (report.unchanged.length > 0) {
335
335
  parts.push(`➡️ ${report.unchanged.length} unchanged`);
336
336
  }
337
+ if (report.notEvaluated?.length > 0) {
338
+ parts.push(`⏭️ ${report.notEvaluated.length} not evaluated`);
339
+ }
337
340
  if (parts.length > 0) {
338
341
  const isEmpirical = "noiseThresholdEmpirical" in report &&
339
342
  report.noiseThresholdEmpirical === true;
@@ -36,6 +36,10 @@ export function buildReleaseImpactReport(classification, comparison, attribution
36
36
  const confirmedUnchanged = [];
37
37
  if (comparison) {
38
38
  for (const areaDelta of comparison.areas) {
39
+ // Skip areas that weren't evaluated in both runs — these are
40
+ // mismatched areas (e.g., auto-scoped release eval vs full baseline).
41
+ if (areaDelta.change === "not-evaluated")
42
+ continue;
39
43
  const regressed = areaDelta.delta < -threshold;
40
44
  // Find tasks and their attributed documents for this area
41
45
  const areaTasks = [];
@@ -51,9 +51,13 @@ export declare class ReportStore {
51
51
  * Returns the comparison plus the baseline report ID so the caller
52
52
  * can record `provenance.lineage.comparedAgainst`.
53
53
  *
54
+ * @param scopedAreas When provided, the baseline's scores are filtered to
55
+ * only include these areas before comparison. This prevents mismatched
56
+ * areas from polluting the overall delta (e.g., release auto-scope
57
+ * evaluates only GROQ but the baseline has all areas).
54
58
  * @returns The comparison result with baseline ID, or null if no baseline found
55
59
  */
56
- autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<AutoCompareResult | null>;
60
+ autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp, scopedAreas?: Set<string>): Promise<AutoCompareResult | null>;
57
61
  /**
58
62
  * Find a report by its evaluation fingerprint (cross-environment cache lookup).
59
63
  *
@@ -49,9 +49,13 @@ export class ReportStore {
49
49
  * Returns the comparison plus the baseline report ID so the caller
50
50
  * can record `provenance.lineage.comparedAgainst`.
51
51
  *
52
+ * @param scopedAreas When provided, the baseline's scores are filtered to
53
+ * only include these areas before comparison. This prevents mismatched
54
+ * areas from polluting the overall delta (e.g., release auto-scope
55
+ * evaluates only GROQ but the baseline has all areas).
52
56
  * @returns The comparison result with baseline ID, or null if no baseline found
53
57
  */
54
- async autoCompare(currentSummary, provenance, completedAt) {
58
+ async autoCompare(currentSummary, provenance, completedAt, scopedAreas) {
55
59
  // 1. Prefer explicit lineage source (deterministic re-run comparison)
56
60
  const rerunSourceId = provenance.lineage?.rerunOf;
57
61
  let baseline = null;
@@ -76,7 +80,30 @@ export class ReportStore {
76
80
  return null;
77
81
  }
78
82
  try {
79
- const comparison = compare(baseline.summary, currentSummary);
83
+ // When auto-scope is active, filter the baseline to only include
84
+ // areas that were actually evaluated. This produces a fair
85
+ // comparison where the overall delta reflects only tested areas.
86
+ let baselineSummary = baseline.summary;
87
+ if (scopedAreas && scopedAreas.size > 0) {
88
+ const filteredScores = baselineSummary.scores.filter((s) => scopedAreas.has(s.feature));
89
+ if (filteredScores.length > 0 &&
90
+ filteredScores.length < baselineSummary.scores.length) {
91
+ const len = filteredScores.length;
92
+ const avgScore = filteredScores.reduce((s, sc) => s + sc.totalScore, 0) / len;
93
+ const avgDocLift = filteredScores.reduce((s, sc) => s + sc.docLift, 0) / len;
94
+ baselineSummary = {
95
+ ...baselineSummary,
96
+ overall: {
97
+ ...baselineSummary.overall,
98
+ avgScore,
99
+ avgDocLift,
100
+ },
101
+ scores: filteredScores,
102
+ };
103
+ console.log(` 🎯 Scoped baseline to ${filteredScores.length} of ${baseline.summary.scores.length} areas for comparison`);
104
+ }
105
+ }
106
+ const comparison = compare(baselineSummary, currentSummary);
80
107
  return { baselineReportId: baseline.id, comparison };
81
108
  }
82
109
  catch (error) {
@@ -110,6 +110,16 @@ export function formatRegressionAlert(report) {
110
110
  type: "section",
111
111
  });
112
112
  }
113
+ // Not-evaluated areas — informational mention
114
+ if (comparison.notEvaluated?.length > 0) {
115
+ blocks.push({
116
+ text: {
117
+ text: `⏭️ ${comparison.notEvaluated.length} area${comparison.notEvaluated.length === 1 ? "" : "s"} not evaluated: ${comparison.notEvaluated.join(", ")}`,
118
+ type: "mrkdwn",
119
+ },
120
+ type: "section",
121
+ });
122
+ }
113
123
  return {
114
124
  blocks,
115
125
  text: `📉 AI Literacy Score Regression: ${baselineScore} → ${experimentScore} (${formatDelta(delta)})`,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sanity/ailf",
3
- "version": "0.1.32",
3
+ "version": "0.1.34",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "restricted"