npm - @sanity/ailf - Versions diffs - 7.2.1 → 7.2.2 - Mend

@sanity/ailf 7.2.1 → 7.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/orchestration/required-eval-runs.d.ts ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * Which eval sub-runs a pipeline configuration requires, and whether every one
+ * of them was satisfied by a remote-cache hit.
+ *
+ * The post-scoring enrichment steps (gap-analysis, compute-attribution) may
+ * skip benignly when grader judgments are absent because ALL required runs came
+ * from the remote cache — a cache hit restores `score-summary.json` but never
+ * writes `grader-judgments.json`. They must NOT skip when at least one required
+ * run was evaluated fresh this pipeline: a fresh run that scored tests yet
+ * persisted no judgments is a degraded outcome that has to fail loud.
+ *
+ * Mirrors the required-run derivation in `calculate-scores-step` so the
+ * "all required runs cached" judgement is defined in exactly one place.
+ */
+interface EvalRunSelector { // The only config fields the required-run derivation takes as input.
+    mode: string; // Eval mode name; for non-literacy modes this bare name is itself the run key.
+    variant?: string | null; // Optional variant; may be omitted or null.
+}
+interface RemoteCacheState { // The only pipeline-state field the cache check takes as input.
+    remoteCacheHits?: ReadonlySet<string>; // Run keys recorded as remote-cache hits; may be absent.
+}
+/**
+ * The eval sub-runs a configuration requires, keyed by the same strings
+ * `RunEvalStep` records in `state.remoteCacheHits` (`"baseline"`, `"agentic"`,
+ * or the bare mode name for non-literacy modes).
+ */
+export declare function requiredEvalRuns(config: EvalRunSelector): string[];
+/**
+ * True only when every eval sub-run the configuration requires was satisfied by
+ * a remote-cache hit. A cache hit on a subset of required runs (e.g. agentic
+ * cached, baseline fresh) returns false — the fresh run's outputs are still the
+ * pipeline's responsibility.
+ */
+export declare function allRequiredEvalRunsCached(config: EvalRunSelector, state: RemoteCacheState | undefined): boolean;
+export {};

package/dist/orchestration/required-eval-runs.js ADDED Viewed

@@ -0,0 +1,41 @@
+/**
+ * Which eval sub-runs a pipeline configuration requires, and whether every one
+ * of them was satisfied by a remote-cache hit.
+ *
+ * The post-scoring enrichment steps (gap-analysis, compute-attribution) may
+ * skip benignly when grader judgments are absent because ALL required runs came
+ * from the remote cache — a cache hit restores `score-summary.json` but never
+ * writes `grader-judgments.json`. They must NOT skip when at least one required
+ * run was evaluated fresh this pipeline: a fresh run that scored tests yet
+ * persisted no judgments is a degraded outcome that has to fail loud.
+ *
+ * Mirrors the required-run derivation in `calculate-scores-step` so the
+ * "all required runs cached" judgement is defined in exactly one place.
+ */
+import { LiteracyVariant } from "../pipeline/normalize-mode.js";
+/**
+ * The eval sub-runs a configuration requires, keyed by the same strings
+ * `RunEvalStep` records in `state.remoteCacheHits` (`"baseline"`, `"agentic"`,
+ * or the bare mode name for non-literacy modes).
+ */
+export function requiredEvalRuns(config) { // Always returns a non-empty array of run keys.
+    if (config.mode === "literacy") { // Only literacy mode consults the variant.
+        const variant = config.variant ?? LiteracyVariant.STANDARD; // null/undefined variant defaults to STANDARD
+        return variant === LiteracyVariant.FULL
+            ? [LiteracyVariant.STANDARD, LiteracyVariant.AGENTIC] // FULL expands to both sub-runs
+            : [variant]; // any other variant is its own single required run
+    }
+    return [config.mode]; // non-literacy: the bare mode name is the run key
+}
+/**
+ * True only when every eval sub-run the configuration requires was satisfied by
+ * a remote-cache hit. A cache hit on a subset of required runs (e.g. agentic
+ * cached, baseline fresh) returns false — the fresh run's outputs are still the
+ * pipeline's responsibility.
+ */
+export function allRequiredEvalRunsCached(config, state) { // Pure check; reads only state.remoteCacheHits.
+    const hits = state?.remoteCacheHits; // state itself may be undefined
+    if (!hits || hits.size === 0) // no recorded hits at all: nothing was cached
+        return false;
+    return requiredEvalRuns(config).every((run) => hits.has(run)); // one uncached required run yields false
+}

package/dist/orchestration/steps/calculate-scores-step.js CHANGED Viewed

@@ -11,6 +11,7 @@ import { emitFileContents } from "../../artifact-capture/emit-file.js";
 import { LiteracyVariant } from "../../pipeline/normalize-mode.js";
 import { getStepInputPaths } from "../../pipeline/cache.js";
 import { buildCacheContext } from "../cache-context.js";
+import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
 import { calculateAndWriteScores } from "../../pipeline/calculate-scores.js";
 import { checkResultsExist, checkScoreSummaryValid, } from "../../pipeline/checks.js";
 import { resultsFileForMode } from "../../pipeline/eval-constants.js";
@@ -27,30 +28,22 @@ export class CalculateScoresStep {
     }
     async execute(ctx, state) {
         const start = Date.now();
-        // When all required eval modes were satisfied by remote cache hits,
+        // When all required eval runs were satisfied by remote cache hits,
         // score-summary.json was already restored from the cached report.
-        // Skip re-calculation — the raw eval-results files don't exist.
-        if (state.remoteCacheHits?.size) {
-            // For literacy mode, determine required eval runs from variant
-            const variant = ctx.config.variant ?? LiteracyVariant.STANDARD;
-            const requiredRuns = ctx.config.mode === "literacy" && variant === LiteracyVariant.FULL
-                ? [LiteracyVariant.STANDARD, LiteracyVariant.AGENTIC]
-                : ctx.config.mode === "literacy"
-                    ? [variant]
-                    : [ctx.config.mode];
-            const allCached = requiredRuns.every((m) => state.remoteCacheHits.has(m));
-            if (allCached) {
-                // Verify the restored score-summary.json is valid
-                const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
-                const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
-                if (summaryErrors.length === 0) {
-                    return {
-                        reason: "Remote cache hit — score-summary.json restored from cached report",
-                        status: "skipped",
-                    };
-                }
-                // If the summary is invalid, fall through to normal calculation
+        // Skip re-calculation — the raw eval-results files don't exist. A partial
+        // cache hit (only some required runs cached) falls through to normal
+        // calculation: the freshly-run sub-evals produced raw results to score.
+        if (allRequiredEvalRunsCached(ctx.config, state)) {
+            // Verify the restored score-summary.json is valid
+            const summaryIssues = checkScoreSummaryValid(ctx.config.rootDir);
+            const summaryErrors = summaryIssues.filter((i) => i.severity === "error");
+            if (summaryErrors.length === 0) {
+                return {
+                    reason: "Remote cache hit — score-summary.json restored from cached report",
+                    status: "skipped",
+                };
             }
+            // If the summary is invalid, fall through to normal calculation
         }
         // Primary results file to score.
         // For literacy: "full" variant uses baseline as primary; others use variant directly.

package/dist/orchestration/steps/compute-attribution-step.js CHANGED Viewed

@@ -41,6 +41,7 @@ import { isSlugRef } from "../../_vendor/ailf-core/index.js";
 import { calibrationSetVersion, embeddingModel, ensembleVersion, } from "../../pipeline/attribution.js";
 import { V0_WEIGHTS, computeJudgmentAttribution, } from "../../pipeline/compute-attribution.js";
 import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
+import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
 // ---------------------------------------------------------------------------
 // Step implementation
 // ---------------------------------------------------------------------------
@@ -89,10 +90,12 @@ export class ComputeAttributionStep {
         // grader judgments is a degraded run, not a benign skip. Fail loud so the
         // outcome surfaces in pipeline-result and on the job document. A remote
         // cache hit restores score-summary.json without grader-judgments.json, so
-        // its missing judgments are legitimate — never fail loud on a cache hit.
-        const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
+        // missing judgments are legitimate ONLY when every required sub-eval came
+        // from the cache — a hybrid full run with a freshly-evaluated sub-eval that
+        // persisted no judgments is still degraded.
+        const allCached = allRequiredEvalRunsCached(ctx.config, state);
         const inputs = classifyEnrichmentInputs(root);
-        if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
+        if (inputs.kind === "judgments-missing-after-eval" && !allCached) {
             return {
                 durationMs: Date.now() - start,
                 status: "failed",

package/dist/orchestration/steps/gap-analysis-step.js CHANGED Viewed

@@ -19,6 +19,7 @@ import { join, resolve } from "path";
 import { assoc, isSlugRef } from "../../_vendor/ailf-core/index.js";
 import { emitFileContents } from "../../artifact-capture/emit-file.js";
 import { classifyEnrichmentInputs, degradedEnrichmentError, } from "../../pipeline/enrichment-preconditions.js";
+import { allRequiredEvalRunsCached } from "../required-eval-runs.js";
 export class GapAnalysisStep {
     name = "gap-analysis";
     optional = true;
@@ -46,11 +47,14 @@ export class GapAnalysisStep {
         // reports publish with a score but no test details.
         //
         // A remote cache hit restores score-summary.json (with testCount) from a
-        // prior report but never writes grader-judgments.json, so judgments are
-        // legitimately absent — that is a benign skip, not a degraded full eval.
-        const fromRemoteCache = (state?.remoteCacheHits?.size ?? 0) > 0;
+        // prior report but never writes grader-judgments.json, so absent judgments
+        // are legitimate ONLY when every required sub-eval came from the cache. In a
+        // hybrid full run (e.g. agentic cached, baseline evaluated fresh) the fresh
+        // run's missing judgments are still degraded — gate the skip on ALL required
+        // runs being cached, not merely any.
+        const allCached = allRequiredEvalRunsCached(ctx.config, state);
         const inputs = classifyEnrichmentInputs(root);
-        if (inputs.kind === "judgments-missing-after-eval" && !fromRemoteCache) {
+        if (inputs.kind === "judgments-missing-after-eval" && !allCached) {
             return {
                 durationMs: Date.now() - start,
                 status: "failed",

package/dist/pipeline/assert-grader-judgments-persisted.d.ts ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * pipeline/assert-grader-judgments-persisted.ts
+ *
+ * Post-persist guard for the grader-judgments write junction in
+ * `calculateAndWriteScores`.
+ *
+ * `extractGraderJudgmentsResilient` returns N judgments in memory, after which
+ * `runBorderlinePass` may mutate the array in place and a `judgments.length > 0`
+ * guard decides whether `grader-judgments.json` is written. A transient read
+ * anomaly or an unexpected in-place emptying can leave the file absent or empty
+ * even though extraction yielded judgments. Silently skipping the write strands
+ * gap-analysis and ships a scored report with no test details.
+ *
+ * This guard re-reads the file from disk — the same read gap-analysis performs
+ * — and fails loud when a non-empty extraction did not round-trip. The check is
+ * deliberately narrow: it fires only on the catastrophic "extracted N>0,
+ * persisted 0" divergence, never on a genuinely judgment-free run.
+ */
+/** Injectable seam — counts the grader judgments actually on disk. */
+export interface PersistVerificationDeps { // Lets callers substitute the disk read, e.g. in tests.
+    countPersisted: (path: string) => number; // Takes the judgments file path, returns the number of judgments found there.
+}
+/**
+ * Fail loud when a non-empty grader-judgment extraction did not round-trip to
+ * disk. No-ops when nothing was extracted — a judgment-free run (all api-errors
+ * / no llm-rubric) is valid and persists nothing by design.
+ *
+ * @param extractedCount  Judgments returned by extraction, captured BEFORE any
+ *   in-place mutation (e.g. the borderline-consensus pass) so the count
+ *   reflects what extraction actually produced.
+ * @param judgmentsPath  Absolute path to `grader-judgments.json`.
+ * @param deps  Injectable disk reader; defaults to the real filesystem.
+ * @throws {Error} when `extractedCount > 0` but the persisted file holds 0.
+ */
+export declare function assertGraderJudgmentsPersisted(extractedCount: number, judgmentsPath: string, deps?: PersistVerificationDeps): void;

package/dist/pipeline/assert-grader-judgments-persisted.js ADDED Viewed

@@ -0,0 +1,58 @@
+/**
+ * pipeline/assert-grader-judgments-persisted.ts
+ *
+ * Post-persist guard for the grader-judgments write junction in
+ * `calculateAndWriteScores`.
+ *
+ * `extractGraderJudgmentsResilient` returns N judgments in memory, after which
+ * `runBorderlinePass` may mutate the array in place and a `judgments.length > 0`
+ * guard decides whether `grader-judgments.json` is written. A transient read
+ * anomaly or an unexpected in-place emptying can leave the file absent or empty
+ * even though extraction yielded judgments. Silently skipping the write strands
+ * gap-analysis and ships a scored report with no test details.
+ *
+ * This guard re-reads the file from disk — the same read gap-analysis performs
+ * — and fails loud when a non-empty extraction did not round-trip. The check is
+ * deliberately narrow: it fires only on the catastrophic "extracted N>0,
+ * persisted 0" divergence, never on a genuinely judgment-free run.
+ */
+import { existsSync, readFileSync } from "node:fs";
+/**
+ * Parse `grader-judgments.json` and return its array length. Every "no usable
+ * judgments" shape (missing, unreadable, invalid JSON, non-array) collapses to
+ * 0 — mirroring how the downstream enrichment precondition reads the same file.
+ */
+function defaultCountPersisted(path) { // Default `countPersisted` used when the caller injects no deps.
+    if (!existsSync(path)) // a missing file counts as zero judgments
+        return 0;
+    try {
+        const parsed = JSON.parse(readFileSync(path, "utf-8"));
+        return Array.isArray(parsed) ? parsed.length : 0; // valid JSON that is not an array counts as zero
+    }
+    catch { // read or parse failure is deliberately folded into 0 rather than rethrown
+        return 0;
+    }
+}
+/**
+ * Fail loud when a non-empty grader-judgment extraction did not round-trip to
+ * disk. No-ops when nothing was extracted — a judgment-free run (all api-errors
+ * / no llm-rubric) is valid and persists nothing by design.
+ *
+ * @param extractedCount  Judgments returned by extraction, captured BEFORE any
+ *   in-place mutation (e.g. the borderline-consensus pass) so the count
+ *   reflects what extraction actually produced.
+ * @param judgmentsPath  Absolute path to `grader-judgments.json`.
+ * @param deps  Injectable disk reader; defaults to the real filesystem.
+ * @throws {Error} when `extractedCount > 0` but the persisted file holds 0.
+ */
+export function assertGraderJudgmentsPersisted(extractedCount, judgmentsPath, deps = { countPersisted: defaultCountPersisted }) {
+    if (extractedCount <= 0) // nothing was extracted: return before touching the disk
+        return;
+    const persisted = deps.countPersisted(judgmentsPath); // count what is actually on disk via the injected reader
+    if (persisted <= 0) { // throws only when zero persisted; the two counts are not required to match
+        throw new Error(`Grader judgments extract/persist divergence: extracted ${extractedCount} ` +
+            `judgment(s) but grader-judgments.json persisted 0. Refusing to finish ` +
+            `scoring — a scored report with no grader judgments would strand ` +
+            `gap-analysis and ship with no test details.`);
+    }
+}

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -42,6 +42,7 @@ import { loadSource } from "../sources.js";
 import { LiteracyVariant } from "./normalize-mode.js";
 import { scoreTestGroup, } from "./compiler/scoring-bridge.js";
 import { extractGraderJudgmentsResilient, } from "./extract-grader-judgments-resilient.js";
+import { assertGraderJudgmentsPersisted } from "./assert-grader-judgments-persisted.js";
 // Re-export from core for backward compatibility.
 // Existing imports from this file continue to work unchanged.
 export { classifyRubric, detectFeatureArea, extractUrlMetadata, mergeScores, parseRubricScore, } from "../_vendor/ailf-core/index.js";
@@ -1544,6 +1545,7 @@ export async function calculateAndWriteScores(options) {
         log.info("Score summary written to results/latest/score-summary.json");
         // Extract and persist grader judgments
         const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
+        const extractedJudgmentCount = judgments.length;
         const borderlineConsistency = await runBorderlinePass(judgments, [
             baselineResultsPath,
         ]);
@@ -1555,6 +1557,10 @@ export async function calculateAndWriteScores(options) {
             writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
+        // Fail loud if a non-empty extraction did not round-trip to disk (a
+        // transient divergence at the persist junction): otherwise gap-analysis
+        // skips and the report ships a score with no test details.
+        assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
         // Extract and persist per-test results (D0029: model output + metadata)
         // Agent-harness produces a single profile shared across detected variants
         // (the docs/no-docs split doesn't apply — there is no gold/baseline pair).
@@ -1607,6 +1613,7 @@ export async function calculateAndWriteScores(options) {
         writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
         log.info("Score summary written to results/latest/score-summary.json");
         const judgments = await extractGraderJudgmentsResilient([baselineResultsPath], undefined, log, { deps: resilientJudgmentDeps });
+        const extractedJudgmentCount = judgments.length;
         const borderlineConsistency = await runBorderlinePass(judgments, [
             baselineResultsPath,
         ]);
@@ -1618,6 +1625,10 @@ export async function calculateAndWriteScores(options) {
             writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
             log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
         }
+        // Fail loud if a non-empty extraction did not round-trip to disk (a
+        // transient divergence at the persist junction): otherwise gap-analysis
+        // skips and the report ships a score with no test details.
+        assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
         // Knowledge-probe deletes vars.docs in the compiler, so every entry's
         // detected variant is "baseline" — supply the probe profile under both
         // keys so the composite is populated regardless of detection.
@@ -1744,6 +1755,9 @@ export async function calculateAndWriteScores(options) {
         ? [baselineResultsPath, agenticResultsPath]
         : [baselineResultsPath];
     const judgments = await extractGraderJudgmentsResilient(judgmentResultPaths, { reliability, ...(options.runId ? { runId: options.runId } : {}) }, log, { deps: resilientJudgmentDeps });
+    // Capture the extracted count before the borderline pass mutates the array
+    // in place — the persist guard below compares it against what lands on disk.
+    const extractedJudgmentCount = judgments.length;
     // Borderline-consensus pass — re-grade the ±5 borderline subset N times
     // and merge medians back into the canonical judgments BEFORE
     // `validateGraderJudgmentsCalibration` runs, so the calibration counter
@@ -1774,6 +1788,10 @@ export async function calculateAndWriteScores(options) {
             });
         }
     }
+    // Fail loud if a non-empty extraction did not round-trip to disk (a transient
+    // divergence at the persist junction): otherwise gap-analysis skips and the
+    // report ships a score with no test details.
+    assertGraderJudgmentsPersisted(extractedJudgmentCount, join(outDir, "grader-judgments.json"));
     // Extract and persist per-test results (D0029: model output + metadata).
     // Literacy gold (with-docs) entries score against the default profile;
     // baseline (without-docs) entries score against the output-only profile.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "7.2.1",
+  "version": "7.2.2",
   "private": false,
   "publishConfig": {
     "access": "public"