npm - @sanity/ailf - Versions diffs - 0.1.33 → 0.1.34 - Mend

@sanity/ailf 0.1.33 → 0.1.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist/_vendor/ailf-core/ports/context.d.ts +1 -1
package/dist/_vendor/ailf-core/types/index.d.ts +7 -1
package/dist/orchestration/steps/publish-report-step.js +9 -1
package/dist/pipeline/compare.js +12 -5
package/dist/pipeline/pr-comment.js +5 -2
package/dist/pipeline/release-report.js +4 -0
package/dist/report-store.d.ts +5 -1
package/dist/report-store.js +29 -2
package/dist/sinks/slack/format.js +10 -0
package/package.json +1 -1

package/dist/_vendor/ailf-core/ports/context.d.ts CHANGED Viewed

@@ -177,7 +177,7 @@ export interface AppContext {
  */
 export interface ReportStorePort {
     /** Auto-compare against the most recent comparable baseline */
-    autoCompare(currentSummary: unknown, provenance: unknown, completedAt: unknown): Promise<unknown>;
+    autoCompare(currentSummary: unknown, provenance: unknown, completedAt: unknown, scopedAreas?: Set<string>): Promise<unknown>;
     /** Find a report by its eval fingerprint (for cache lookup) */
     findByFingerprint(fingerprint: string): Promise<null | unknown>;
     /** Find the most recent comparable baseline for auto-comparison */

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED Viewed

@@ -853,7 +853,7 @@ export interface AttributionReport {
     untrackedDocs: string[];
 }
 /** Classification of a feature area's score change */
-export type ChangeClass = "improved" | "regressed" | "unchanged";
+export type ChangeClass = "improved" | "not-evaluated" | "regressed" | "unchanged";
 /** Options for the compare function */
 export interface CompareOptions {
     /** Grader consistency data — if provided, used to compute empirical noise threshold */
@@ -927,6 +927,12 @@ export interface ComparisonReport {
     noiseThreshold: number;
     /** Whether the noise threshold was derived from empirical grader consistency data */
     noiseThresholdEmpirical: boolean;
+    /**
+     * Areas present in only one summary (not evaluated in both runs).
+     * These are excluded from improved/regressed/unchanged classification
+     * because comparing against a missing score is meaningless.
+     */
+    notEvaluated: string[];
     /** Areas that regressed beyond the noise threshold */
     regressed: string[];
     /** Areas within the noise threshold */

package/dist/orchestration/steps/publish-report-step.js CHANGED Viewed

@@ -82,8 +82,16 @@ export class PublishReportStep {
         const durationMs = Date.now() - this.pipelineStart;
         // Auto-compare against most recent comparable baseline.
         // Returns the comparison + baseline report ID for lineage tracking.
+        //
+        // When release auto-scope is active, the current experiment only covers
+        // a subset of areas. We pass the evaluated area set so autoCompare can
+        // scope the baseline to match — preventing mismatched areas from
+        // polluting the overall delta.
+        const evaluatedAreas = state.releaseAutoScope
+            ? new Set(summary.scores.map((s) => s.feature))
+            : undefined;
         const autoCompareResult = ctx.reportStore
-            ? (await ctx.reportStore.autoCompare(summary, provenance, now))
+            ? (await ctx.reportStore.autoCompare(summary, provenance, now, evaluatedAreas))
             : null;
         const comparison = autoCompareResult?.comparison ?? null;
         // Record which report we compared against in lineage

package/dist/pipeline/compare.js CHANGED Viewed

@@ -51,11 +51,14 @@ export function compare(baseline, experiment, options) {
     // Identify mismatched areas
     const onlyInBaseline = [...baselineAreas].filter((a) => !experimentAreas.has(a));
     const onlyInExperiment = [...experimentAreas].filter((a) => !baselineAreas.has(a));
-    // Build per-area deltas
+    // Build per-area deltas.
+    // Areas present in only one summary get change: "not-evaluated" — comparing
+    // against a missing score is meaningless (it would produce false ±100 deltas).
+    const mismatchedSet = new Set([...onlyInBaseline, ...onlyInExperiment]);
     const areas = [...allAreas]
         .sort()
-        .map((area) => buildAreaDelta(area, findScore(baseline.scores, area), findScore(experiment.scores, area), threshold));
-    // Classify areas
+        .map((area) => buildAreaDelta(area, findScore(baseline.scores, area), findScore(experiment.scores, area), threshold, mismatchedSet.has(area)));
+    // Classify areas — mismatched areas are excluded from all three buckets
     const improved = areas
         .filter((a) => a.change === "improved")
         .map((a) => a.area);
@@ -65,6 +68,9 @@ export function compare(baseline, experiment, options) {
     const unchanged = areas
         .filter((a) => a.change === "unchanged")
         .map((a) => a.area);
+    const notEvaluated = areas
+        .filter((a) => a.change === "not-evaluated")
+        .map((a) => a.area);
     // Per-area deltas as a record
     const perArea = {};
     for (const a of areas) {
@@ -128,12 +134,13 @@ export function compare(baseline, experiment, options) {
         },
         noiseThreshold: threshold,
         noiseThresholdEmpirical: empirical,
+        notEvaluated,
         regressed,
         unchanged,
     };
 }
 /** Build an AreaDelta from baseline and experiment scores for a single area */
-function buildAreaDelta(area, baselineScore, experimentScore, threshold) {
+function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMismatched = false) {
     const b = baselineScore;
     const e = experimentScore;
     const bTotal = b?.totalScore ?? 0;
@@ -174,7 +181,7 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold) {
         area,
         baseline: bTotal,
         ceilingDelta: eCeiling - bCeiling,
-        change: classifyChange(delta, threshold),
+        change: isMismatched ? "not-evaluated" : classifyChange(delta, threshold),
         delta,
         dimensions: {
             codeCorrectness: {

package/dist/pipeline/pr-comment.js CHANGED Viewed

@@ -295,7 +295,7 @@ function generateComment(summary, options = {}) {
         if (hasActualDeltas) {
             lines.push("| Feature | Baseline | Current | Delta | Actual Δ | Ret. Gap Δ | Infra Δ |");
             lines.push("|---------|----------|---------|-------|----------|------------|---------|");
-            for (const a of report.areas) {
+            for (const a of report.areas.filter((a) => a.change !== "not-evaluated")) {
                 const icon = a.change === "improved"
                     ? "📈"
                     : a.change === "regressed"
@@ -313,7 +313,7 @@ function generateComment(summary, options = {}) {
         else {
             lines.push("| Feature | Baseline | Current | Delta | Task | Code | Docs |");
             lines.push("|---------|----------|---------|-------|------|------|------|");
-            for (const a of report.areas) {
+            for (const a of report.areas.filter((a) => a.change !== "not-evaluated")) {
                 const icon = a.change === "improved"
                     ? "📈"
                     : a.change === "regressed"
@@ -334,6 +334,9 @@ function generateComment(summary, options = {}) {
         if (report.unchanged.length > 0) {
             parts.push(`➡️ ${report.unchanged.length} unchanged`);
         }
+        if (report.notEvaluated?.length > 0) {
+            parts.push(`⏭️ ${report.notEvaluated.length} not evaluated`);
+        }
         if (parts.length > 0) {
             const isEmpirical = "noiseThresholdEmpirical" in report &&
                 report.noiseThresholdEmpirical === true;

package/dist/pipeline/release-report.js CHANGED Viewed

@@ -36,6 +36,10 @@ export function buildReleaseImpactReport(classification, comparison, attribution
     const confirmedUnchanged = [];
     if (comparison) {
         for (const areaDelta of comparison.areas) {
+            // Skip areas that weren't evaluated in both runs — these are
+            // mismatched areas (e.g., auto-scoped release eval vs full baseline).
+            if (areaDelta.change === "not-evaluated")
+                continue;
             const regressed = areaDelta.delta < -threshold;
             // Find tasks and their attributed documents for this area
             const areaTasks = [];

package/dist/report-store.d.ts CHANGED Viewed

@@ -51,9 +51,13 @@ export declare class ReportStore {
      * Returns the comparison plus the baseline report ID so the caller
      * can record `provenance.lineage.comparedAgainst`.
      *
+     * @param scopedAreas When provided, the baseline's scores are filtered to
+     *   only include these areas before comparison. This prevents mismatched
+     *   areas from polluting the overall delta (e.g., release auto-scope
+     *   evaluates only GROQ but the baseline has all areas).
      * @returns The comparison result with baseline ID, or null if no baseline found
      */
-    autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp): Promise<AutoCompareResult | null>;
+    autoCompare(currentSummary: ScoreSummary, provenance: ReportProvenance, completedAt: ISOTimestamp, scopedAreas?: Set<string>): Promise<AutoCompareResult | null>;
     /**
      * Find a report by its evaluation fingerprint (cross-environment cache lookup).
      *

package/dist/report-store.js CHANGED Viewed

@@ -49,9 +49,13 @@ export class ReportStore {
      * Returns the comparison plus the baseline report ID so the caller
      * can record `provenance.lineage.comparedAgainst`.
      *
+     * @param scopedAreas When provided, the baseline's scores are filtered to
+     *   only include these areas before comparison. This prevents mismatched
+     *   areas from polluting the overall delta (e.g., release auto-scope
+     *   evaluates only GROQ but the baseline has all areas).
      * @returns The comparison result with baseline ID, or null if no baseline found
      */
-    async autoCompare(currentSummary, provenance, completedAt) {
+    async autoCompare(currentSummary, provenance, completedAt, scopedAreas) {
         // 1. Prefer explicit lineage source (deterministic re-run comparison)
         const rerunSourceId = provenance.lineage?.rerunOf;
         let baseline = null;
@@ -76,7 +80,30 @@ export class ReportStore {
             return null;
         }
         try {
-            const comparison = compare(baseline.summary, currentSummary);
+            // When auto-scope is active, filter the baseline to only include
+            // areas that were actually evaluated. This produces a fair
+            // comparison where the overall delta reflects only tested areas.
+            let baselineSummary = baseline.summary;
+            if (scopedAreas && scopedAreas.size > 0) {
+                const filteredScores = baselineSummary.scores.filter((s) => scopedAreas.has(s.feature));
+                if (filteredScores.length > 0 &&
+                    filteredScores.length < baselineSummary.scores.length) {
+                    const len = filteredScores.length;
+                    const avgScore = filteredScores.reduce((s, sc) => s + sc.totalScore, 0) / len;
+                    const avgDocLift = filteredScores.reduce((s, sc) => s + sc.docLift, 0) / len;
+                    baselineSummary = {
+                        ...baselineSummary,
+                        overall: {
+                            ...baselineSummary.overall,
+                            avgScore,
+                            avgDocLift,
+                        },
+                        scores: filteredScores,
+                    };
+                    console.log(`  🎯 Scoped baseline to ${filteredScores.length} of ${baseline.summary.scores.length} areas for comparison`);
+                }
+            }
+            const comparison = compare(baselineSummary, currentSummary);
             return { baselineReportId: baseline.id, comparison };
         }
         catch (error) {

package/dist/sinks/slack/format.js CHANGED Viewed

@@ -110,6 +110,16 @@ export function formatRegressionAlert(report) {
             type: "section",
         });
     }
+    // Not-evaluated areas — informational mention
+    if (comparison.notEvaluated?.length > 0) {
+        blocks.push({
+            text: {
+                text: `⏭️ ${comparison.notEvaluated.length} area${comparison.notEvaluated.length === 1 ? "" : "s"} not evaluated: ${comparison.notEvaluated.join(", ")}`,
+                type: "mrkdwn",
+            },
+            type: "section",
+        });
+    }
     return {
         blocks,
         text: `📉 AI Literacy Score Regression: ${baselineScore} → ${experimentScore} (${formatDelta(delta)})`,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "0.1.33",
+  "version": "0.1.34",
   "private": false,
   "publishConfig": {
     "access": "restricted"