@sanity/ailf 0.1.33 → 0.1.34
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- package/dist/_vendor/ailf-core/ports/context.d.ts +1 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +7 -1
- package/dist/orchestration/steps/publish-report-step.js +9 -1
- package/dist/pipeline/compare.js +12 -5
- package/dist/pipeline/pr-comment.js +5 -2
- package/dist/pipeline/release-report.js +4 -0
- package/dist/report-store.d.ts +5 -1
- package/dist/report-store.js +29 -2
- package/dist/sinks/slack/format.js +10 -0
- package/package.json +1 -1
|
@@ -177,7 +177,7 @@ export interface AppContext {
|
|
|
177
177
|
*/
|
|
178
178
|
export interface ReportStorePort {
|
|
179
179
|
/** Auto-compare against the most recent comparable baseline */
|
|
180
|
-
autoCompare(currentSummary: unknown, provenance: unknown, completedAt: unknown): Promise<unknown>;
|
|
180
|
+
autoCompare(currentSummary: unknown, provenance: unknown, completedAt: unknown, scopedAreas?: Set<string>): Promise<unknown>;
|
|
181
181
|
/** Find a report by its eval fingerprint (for cache lookup) */
|
|
182
182
|
findByFingerprint(fingerprint: string): Promise<null | unknown>;
|
|
183
183
|
/** Find the most recent comparable baseline for auto-comparison */
|
|
@@ -853,7 +853,7 @@ export interface AttributionReport {
|
|
|
853
853
|
untrackedDocs: string[];
|
|
854
854
|
}
|
|
855
855
|
/** Classification of a feature area's score change */
|
|
856
|
-
export type ChangeClass = "improved" | "regressed" | "unchanged";
|
|
856
|
+
export type ChangeClass = "improved" | "not-evaluated" | "regressed" | "unchanged";
|
|
857
857
|
/** Options for the compare function */
|
|
858
858
|
export interface CompareOptions {
|
|
859
859
|
/** Grader consistency data — if provided, used to compute empirical noise threshold */
|
|
@@ -927,6 +927,12 @@ export interface ComparisonReport {
|
|
|
927
927
|
noiseThreshold: number;
|
|
928
928
|
/** Whether the noise threshold was derived from empirical grader consistency data */
|
|
929
929
|
noiseThresholdEmpirical: boolean;
|
|
930
|
+
/**
|
|
931
|
+
* Areas present in only one summary (not evaluated in both runs).
|
|
932
|
+
* These are excluded from improved/regressed/unchanged classification
|
|
933
|
+
* because comparing against a missing score is meaningless.
|
|
934
|
+
*/
|
|
935
|
+
notEvaluated: string[];
|
|
930
936
|
/** Areas that regressed beyond the noise threshold */
|
|
931
937
|
regressed: string[];
|
|
932
938
|
/** Areas within the noise threshold */
|
|
@@ -82,8 +82,16 @@ export class PublishReportStep {
|
|
|
82
82
|
const durationMs = Date.now() - this.pipelineStart;
|
|
83
83
|
// Auto-compare against most recent comparable baseline.
|
|
84
84
|
// Returns the comparison + baseline report ID for lineage tracking.
|
|
85
|
+
//
|
|
86
|
+
// When release auto-scope is active, the current experiment only covers
|
|
87
|
+
// a subset of areas. We pass the evaluated area set so autoCompare can
|
|
88
|
+
// scope the baseline to match — preventing mismatched areas from
|
|
89
|
+
// polluting the overall delta.
|
|
90
|
+
const evaluatedAreas = state.releaseAutoScope
|
|
91
|
+
? new Set(summary.scores.map((s) => s.feature))
|
|
92
|
+
: undefined;
|
|
85
93
|
const autoCompareResult = ctx.reportStore
|
|
86
|
-
? (await ctx.reportStore.autoCompare(summary, provenance, now))
|
|
94
|
+
? (await ctx.reportStore.autoCompare(summary, provenance, now, evaluatedAreas))
|
|
87
95
|
: null;
|
|
88
96
|
const comparison = autoCompareResult?.comparison ?? null;
|
|
89
97
|
// Record which report we compared against in lineage
|
package/dist/pipeline/compare.js
CHANGED
|
@@ -51,11 +51,14 @@ export function compare(baseline, experiment, options) {
|
|
|
51
51
|
// Identify mismatched areas
|
|
52
52
|
const onlyInBaseline = [...baselineAreas].filter((a) => !experimentAreas.has(a));
|
|
53
53
|
const onlyInExperiment = [...experimentAreas].filter((a) => !baselineAreas.has(a));
|
|
54
|
-
// Build per-area deltas
|
|
54
|
+
// Build per-area deltas.
|
|
55
|
+
// Areas present in only one summary get change: "not-evaluated" — comparing
|
|
56
|
+
// against a missing score is meaningless (it would produce false ±100 deltas).
|
|
57
|
+
const mismatchedSet = new Set([...onlyInBaseline, ...onlyInExperiment]);
|
|
55
58
|
const areas = [...allAreas]
|
|
56
59
|
.sort()
|
|
57
|
-
.map((area) => buildAreaDelta(area, findScore(baseline.scores, area), findScore(experiment.scores, area), threshold));
|
|
58
|
-
// Classify areas
|
|
60
|
+
.map((area) => buildAreaDelta(area, findScore(baseline.scores, area), findScore(experiment.scores, area), threshold, mismatchedSet.has(area)));
|
|
61
|
+
// Classify areas — mismatched areas are excluded from all three buckets
|
|
59
62
|
const improved = areas
|
|
60
63
|
.filter((a) => a.change === "improved")
|
|
61
64
|
.map((a) => a.area);
|
|
@@ -65,6 +68,9 @@ export function compare(baseline, experiment, options) {
|
|
|
65
68
|
const unchanged = areas
|
|
66
69
|
.filter((a) => a.change === "unchanged")
|
|
67
70
|
.map((a) => a.area);
|
|
71
|
+
const notEvaluated = areas
|
|
72
|
+
.filter((a) => a.change === "not-evaluated")
|
|
73
|
+
.map((a) => a.area);
|
|
68
74
|
// Per-area deltas as a record
|
|
69
75
|
const perArea = {};
|
|
70
76
|
for (const a of areas) {
|
|
@@ -128,12 +134,13 @@ export function compare(baseline, experiment, options) {
|
|
|
128
134
|
},
|
|
129
135
|
noiseThreshold: threshold,
|
|
130
136
|
noiseThresholdEmpirical: empirical,
|
|
137
|
+
notEvaluated,
|
|
131
138
|
regressed,
|
|
132
139
|
unchanged,
|
|
133
140
|
};
|
|
134
141
|
}
|
|
135
142
|
/** Build an AreaDelta from baseline and experiment scores for a single area */
|
|
136
|
-
function buildAreaDelta(area, baselineScore, experimentScore, threshold) {
|
|
143
|
+
function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMismatched = false) {
|
|
137
144
|
const b = baselineScore;
|
|
138
145
|
const e = experimentScore;
|
|
139
146
|
const bTotal = b?.totalScore ?? 0;
|
|
@@ -174,7 +181,7 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold) {
|
|
|
174
181
|
area,
|
|
175
182
|
baseline: bTotal,
|
|
176
183
|
ceilingDelta: eCeiling - bCeiling,
|
|
177
|
-
change: classifyChange(delta, threshold),
|
|
184
|
+
change: isMismatched ? "not-evaluated" : classifyChange(delta, threshold),
|
|
178
185
|
delta,
|
|
179
186
|
dimensions: {
|
|
180
187
|
codeCorrectness: {
|
|
@@ -295,7 +295,7 @@ function generateComment(summary, options = {}) {
|
|
|
295
295
|
if (hasActualDeltas) {
|
|
296
296
|
lines.push("| Feature | Baseline | Current | Delta | Actual Δ | Ret. Gap Δ | Infra Δ |");
|
|
297
297
|
lines.push("|---------|----------|---------|-------|----------|------------|---------|");
|
|
298
|
-
for (const a of report.areas) {
|
|
298
|
+
for (const a of report.areas.filter((a) => a.change !== "not-evaluated")) {
|
|
299
299
|
const icon = a.change === "improved"
|
|
300
300
|
? "📈"
|
|
301
301
|
: a.change === "regressed"
|
|
@@ -313,7 +313,7 @@ function generateComment(summary, options = {}) {
|
|
|
313
313
|
else {
|
|
314
314
|
lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
|
|
315
315
|
lines.push("|---------|----------|---------|-------|------|------|------|");
|
|
316
|
-
for (const a of report.areas) {
|
|
316
|
+
for (const a of report.areas.filter((a) => a.change !== "not-evaluated")) {
|
|
317
317
|
const icon = a.change === "improved"
|
|
318
318
|
? "📈"
|
|
319
319
|
: a.change === "regressed"
|
|
@@ -334,6 +334,9 @@ function generateComment(summary, options = {}) {
|
|
|
334
334
|
if (report.unchanged.length > 0) {
|
|
335
335
|
parts.push(`➡️ ${report.unchanged.length} unchanged`);
|
|
336
336
|
}
|
|
337
|
+
if (report.notEvaluated?.length > 0) {
|
|
338
|
+
parts.push(`⏭️ ${report.notEvaluated.length} not evaluated`);
|
|
339
|
+
}
|
|
337
340
|
if (parts.length > 0) {
|
|
338
341
|
const isEmpirical = "noiseThresholdEmpirical" in report &&
|
|
339
342
|
report.noiseThresholdEmpirical === true;
|
|
@@ -36,6 +36,10 @@ export function buildReleaseImpactReport(classification, comparison, attribution
|
|
|
36
36
|
const confirmedUnchanged = [];
|
|
37
37
|
if (comparison) {
|
|
38
38
|
for (const areaDelta of comparison.areas) {
|
|
39
|
+
// Skip areas that weren't evaluated in both runs — these are
|
|
40
|
+
// mismatched areas (e.g., auto-scoped release eval vs full baseline).
|
|
41
|
+
if (areaDelta.change === "not-evaluated")
|
|
42
|
+
continue;
|
|
39
43
|
const regressed = areaDelta.delta < -threshold;
|
|
40
44
|
// Find tasks and their attributed documents for this area
|
|
41
45
|
const areaTasks = [];
|
package/dist/report-store.d.ts
CHANGED
|
@@ -51,9 +51,13 @@ export declare class ReportStore {
|
|
|
51
51
|
* Returns the comparison plus the baseline report ID so the caller
|
|
52
52
|
* can record `provenance.lineage.comparedAgainst`.
|
|
53
53
|
*
|
|
54
|
+
* @param scopedAreas When provided, the baseline's scores are filtered to
|
|
55
|
+
* only include these areas before comparison. This prevents mismatched
|
|
56
|
+
* areas from polluting the overall delta (e.g., release auto-scope
|
|
57
|
+
* evaluates only GROQ but the baseline has all areas).
|
|
54
58
|
* @returns The comparison result with baseline ID, or null if no baseline found
|
|
55
59
|
*/
|
|
56
|
-
autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<AutoCompareResult | null>;
|
|
60
|
+
autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp, scopedAreas?: Set<string>): Promise<AutoCompareResult | null>;
|
|
57
61
|
/**
|
|
58
62
|
* Find a report by its evaluation fingerprint (cross-environment cache lookup).
|
|
59
63
|
*
|
package/dist/report-store.js
CHANGED
|
@@ -49,9 +49,13 @@ export class ReportStore {
|
|
|
49
49
|
* Returns the comparison plus the baseline report ID so the caller
|
|
50
50
|
* can record `provenance.lineage.comparedAgainst`.
|
|
51
51
|
*
|
|
52
|
+
* @param scopedAreas When provided, the baseline's scores are filtered to
|
|
53
|
+
* only include these areas before comparison. This prevents mismatched
|
|
54
|
+
* areas from polluting the overall delta (e.g., release auto-scope
|
|
55
|
+
* evaluates only GROQ but the baseline has all areas).
|
|
52
56
|
* @returns The comparison result with baseline ID, or null if no baseline found
|
|
53
57
|
*/
|
|
54
|
-
async autoCompare(currentSummary, provenance, completedAt) {
|
|
58
|
+
async autoCompare(currentSummary, provenance, completedAt, scopedAreas) {
|
|
55
59
|
// 1. Prefer explicit lineage source (deterministic re-run comparison)
|
|
56
60
|
const rerunSourceId = provenance.lineage?.rerunOf;
|
|
57
61
|
let baseline = null;
|
|
@@ -76,7 +80,30 @@ export class ReportStore {
|
|
|
76
80
|
return null;
|
|
77
81
|
}
|
|
78
82
|
try {
|
|
79
|
-
|
|
83
|
+
// When auto-scope is active, filter the baseline to only include
|
|
84
|
+
// areas that were actually evaluated. This produces a fair
|
|
85
|
+
// comparison where the overall delta reflects only tested areas.
|
|
86
|
+
let baselineSummary = baseline.summary;
|
|
87
|
+
if (scopedAreas && scopedAreas.size > 0) {
|
|
88
|
+
const filteredScores = baselineSummary.scores.filter((s) => scopedAreas.has(s.feature));
|
|
89
|
+
if (filteredScores.length > 0 &&
|
|
90
|
+
filteredScores.length < baselineSummary.scores.length) {
|
|
91
|
+
const len = filteredScores.length;
|
|
92
|
+
const avgScore = filteredScores.reduce((s, sc) => s + sc.totalScore, 0) / len;
|
|
93
|
+
const avgDocLift = filteredScores.reduce((s, sc) => s + sc.docLift, 0) / len;
|
|
94
|
+
baselineSummary = {
|
|
95
|
+
...baselineSummary,
|
|
96
|
+
overall: {
|
|
97
|
+
...baselineSummary.overall,
|
|
98
|
+
avgScore,
|
|
99
|
+
avgDocLift,
|
|
100
|
+
},
|
|
101
|
+
scores: filteredScores,
|
|
102
|
+
};
|
|
103
|
+
console.log(` 🎯 Scoped baseline to ${filteredScores.length} of ${baseline.summary.scores.length} areas for comparison`);
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
const comparison = compare(baselineSummary, currentSummary);
|
|
80
107
|
return { baselineReportId: baseline.id, comparison };
|
|
81
108
|
}
|
|
82
109
|
catch (error) {
|
|
@@ -110,6 +110,16 @@ export function formatRegressionAlert(report) {
|
|
|
110
110
|
type: "section",
|
|
111
111
|
});
|
|
112
112
|
}
|
|
113
|
+
// Not-evaluated areas — informational mention
|
|
114
|
+
if (comparison.notEvaluated?.length > 0) {
|
|
115
|
+
blocks.push({
|
|
116
|
+
text: {
|
|
117
|
+
text: `⏭️ ${comparison.notEvaluated.length} area${comparison.notEvaluated.length === 1 ? "" : "s"} not evaluated: ${comparison.notEvaluated.join(", ")}`,
|
|
118
|
+
type: "mrkdwn",
|
|
119
|
+
},
|
|
120
|
+
type: "section",
|
|
121
|
+
});
|
|
122
|
+
}
|
|
113
123
|
return {
|
|
114
124
|
blocks,
|
|
115
125
|
text: `📉 AI Literacy Score Regression: ${baselineScore} → ${experimentScore} (${formatDelta(delta)})`,
|