npm - @sanity/ailf - Versions diffs - 4.0.0 → 4.0.1 - Mend

@sanity/ailf 4.0.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
package/dist/commands/check-staleness.js +12 -4
package/dist/orchestration/steps/run-eval-step.js +39 -29
package/dist/pipeline/cache-hit-restore.d.ts +24 -0
package/dist/pipeline/cache-hit-restore.js +32 -0
package/dist/pipeline/eval-fingerprint.d.ts +33 -35
package/dist/pipeline/eval-fingerprint.js +124 -106
package/dist/report-store.js +3 -0
package/package.json +1 -1

package/dist/_vendor/ailf-core/services/slim-report-summary.js CHANGED

@@ -138,12 +138,21 @@ function toTitleCase(id) {
 // ---------------------------------------------------------------------------
 const RECOMMENDATION_TOP_N = 3;
 function slimRecommendations(full) {
+    // Cache-hit pass-through: when the pipeline restores a previously
+    // published report on a remote cache hit, `score-summary.json` carries
+    // recommendations in their already-slim shape (no `.gaps` field).
+    // Re-slimming would crash on `for (gap of undefined)`; the slim shape
+    // has no full-fidelity data to recover, so we return it verbatim.
+    if (!Array.isArray(full.gaps)) {
+        return full;
+    }
+    const fullReport = full;
     const counts = {};
-    for (const gap of full.gaps) {
+    for (const gap of fullReport.gaps) {
         counts[gap.area] = (counts[gap.area] ?? 0) + 1;
     }
     // Sort by priority descending, break ties by estimatedLift.
-    const sorted = [...full.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
+    const sorted = [...fullReport.gaps].sort((a, b) => (b.priority ?? 0) - (a.priority ?? 0) ||
         (b.estimatedLift ?? 0) - (a.estimatedLift ?? 0));
     const top3 = sorted
         .slice(0, RECOMMENDATION_TOP_N)
@@ -156,8 +165,8 @@ function slimRecommendations(full) {
     return {
         counts,
         top3,
-        totalGaps: full.gaps.length,
-        totalPotentialLift: full.totalPotentialLift,
+        totalGaps: fullReport.gaps.length,
+        totalPotentialLift: fullReport.totalPotentialLift,
     };
 }
 /**

package/dist/_vendor/ailf-core/types/index.d.ts CHANGED

@@ -1364,6 +1364,15 @@ export interface ArtifactRefEntry {
  *   - `truncated` on the bulk row indicates the single-object body was capped.
  *   - `preview` on the bulk row carries a descriptor-typed summary for list
  *     views; wiring lands in W0051.
+ *
+ * D0040/W0135 extension:
+ *   - `sourceRunId` declares that this ref's bytes physically live under a
+ *     different run's storage prefix than the manifest containing it.
+ *     `path` is already self-contained and authoritative for resolution;
+ *     `sourceRunId` is purely a lineage marker for retention, GC,
+ *     observability, and BigQuery joins. Set by the cache-hit branch in
+ *     `RunEvalStep` when a new run reuses a prior report's artifacts;
+ *     unset on cold-path producers.
  */
 export interface ArtifactRef {
     store: "gcs" | "local";
@@ -1381,6 +1390,7 @@ export interface ArtifactRef {
     entries?: ArtifactRefEntry[];
     truncated?: boolean;
     preview?: unknown;
+    sourceRunId?: RunId;
 }
 /**
  * Catalog of artifact refs produced by a single pipeline run.

package/dist/artifact-capture/accumulating-artifact-writer.d.ts CHANGED

@@ -39,6 +39,19 @@ export declare class AccumulatingArtifactWriter implements ArtifactWriter {
     getAccumulatedArtifactRefs(): ArtifactManifest;
     /** Test-only. Clears accumulated refs without touching the inner writer. */
     _resetAccumulated(): void;
+    /**
+     * Merge externally-supplied refs into the accumulator without touching
+     * the inner backend. Used by `RunEvalStep`'s cache-hit branch (D0040 /
+     * W0135) to restore a cached report's `Report.artifactManifest` so the
+     * new run's `RunManifest` advertises the cached artifacts via cross-run
+     * paths instead of skipping them entirely.
+     *
+     * The injected refs already carry `path`, `bucket`, `entries`, etc. as
+     * the source run wrote them — we don't synthesize new paths, we copy.
+     * Refs typically carry `sourceRunId` (set by `remapToCacheHitRefs`) so
+     * downstream tooling can follow the lineage.
+     */
+    injectAccumulated(refs: ArtifactManifest): void;
     emit<T extends ArtifactType>(type: T, association: AssociationValues, payload: unknown): Promise<ArtifactRef | null>;
     appendNdjson<T extends ArtifactType>(type: T, association: AssociationValues, rows: readonly unknown[]): Promise<ArtifactRef | null>;
     writeManifest(runId: RunId, manifest: RunManifest): Promise<ArtifactRef | null>;

package/dist/artifact-capture/accumulating-artifact-writer.js CHANGED

@@ -46,6 +46,25 @@ export class AccumulatingArtifactWriter {
             delete this.accumulated[k];
         }
     }
+    /**
+     * Merge externally-supplied refs into the accumulator without touching
+     * the inner backend. Used by `RunEvalStep`'s cache-hit branch (D0040 /
+     * W0135) to restore a cached report's `Report.artifactManifest` so the
+     * new run's `RunManifest` advertises the cached artifacts via cross-run
+     * paths instead of skipping them entirely.
+     *
+     * The injected refs already carry `path`, `bucket`, `entries`, etc. as
+     * the source run wrote them — we don't synthesize new paths, we copy.
+     * Refs typically carry `sourceRunId` (set by `remapToCacheHitRefs`) so
+     * downstream tooling can follow the lineage.
+     */
+    injectAccumulated(refs) {
+        for (const [type, ref] of Object.entries(refs)) {
+            if (!ref)
+                continue;
+            this.mergeRef(type, ref);
+        }
+    }
     // ---- ArtifactWriter surface --------------------------------------------
     async emit(type, association, payload) {
         const ref = await this.inner.emit(type, association, payload);

package/dist/commands/check-staleness.js CHANGED

@@ -21,11 +21,19 @@ export function createCheckStalenessCommand() {
         // weekly-digest.ts and composition-root.ts — AILF_REPORT_* wins over
         // the evaluated-source SANITY_* defaults so the staleness probe tracks
         // the actual report dataset even when it diverges from the eval source.
+        //
+        // The `??` fallbacks matter: passing `{ projectId: undefined }` would
+        // clobber `getSanityClient`'s built-in default via spread. The
+        // staleness workflow only sets SANITY_PROJECT_ID, so without the
+        // fallback the probe crashes with "Configuration must contain
+        // `projectId`" instead of doing its job (issue #272).
+        const projectId = process.env.AILF_REPORT_PROJECT_ID ?? process.env.SANITY_PROJECT_ID;
+        const dataset = process.env.AILF_REPORT_DATASET ?? process.env.SANITY_DATASET;
+        const token = process.env.AILF_REPORT_SANITY_API_TOKEN ?? process.env.SANITY_API_TOKEN;
         const client = getSanityClient({
-            dataset: process.env.AILF_REPORT_DATASET,
-            projectId: process.env.AILF_REPORT_PROJECT_ID,
-            token: process.env.AILF_REPORT_SANITY_API_TOKEN ??
-                process.env.SANITY_API_TOKEN,
+            ...(projectId ? { projectId } : {}),
+            ...(dataset ? { dataset } : {}),
+            ...(token ? { token } : {}),
         });
         const maxAgeDays = opts.maxAge;
         // Bound the GROQ sort with a `completedAt > $floor` filter. Beyond

package/dist/orchestration/steps/run-eval-step.js CHANGED

@@ -8,10 +8,13 @@
 import { existsSync, mkdirSync, writeFileSync } from "fs";
 import { resolve } from "path";
 import { emitPerEntryEvalResults } from "../../pipeline/emit-eval-results.js";
+import { AccumulatingArtifactWriter } from "../../artifact-capture/accumulating-artifact-writer.js";
 import { getStepInputPaths } from "../../pipeline/cache.js";
 import { buildCacheContext } from "../cache-context.js";
+import { remapToCacheHitRefs } from "../../pipeline/cache-hit-restore.js";
 import { checkCanonicalContextsExist, checkGeneratedConfigsExist, checkResultsExist, } from "../../pipeline/checks.js";
 import { computeEvalFingerprint } from "../../pipeline/eval-fingerprint.js";
+import { loadGraderModel } from "../../pipeline/grader-api.js";
 import { buildFilterFlags, configFileForMode, resultsFileForMode, scanResultsForErrors, } from "../../pipeline/eval-constants.js";
 export class RunEvalStep {
     mode;
@@ -39,31 +42,31 @@ export class RunEvalStep {
                 status: "failed",
             };
         }
+        // Load the task set once and reuse it for both the literacy precondition
+        // check and the fingerprint. Mirrors the area/task filter applied by
+        // fetch-docs so we only see tasks that were actually fetched.
+        const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
+            ? {
+                ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
+                ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
+                ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
+            }
+            : undefined;
+        let tasks = await ctx.taskSource.loadTasks(filter);
+        // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
+        if (state.releaseAutoScope && !ctx.config.noAutoScope) {
+            const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
+            tasks = tasks.filter((t) => scopedIds.has(t.id));
+        }
         // Precondition: canonical context files exist for filtered tasks.
         // Only applies to literacy mode — other modes don't use canonical doc contexts.
         if (this.mode === "literacy") {
-            // Must apply the same area/task filter as fetch-docs so we only
-            // check contexts that were actually fetched.
-            const filter = ctx.config.areas || ctx.config.tasks || ctx.config.tags
-                ? {
-                    ...(ctx.config.areas ? { areas: ctx.config.areas } : {}),
-                    ...(ctx.config.tasks ? { taskIds: ctx.config.tasks } : {}),
-                    ...(ctx.config.tags ? { tags: ctx.config.tags } : {}),
-                }
-                : undefined;
-            let tasks = await ctx.taskSource.loadTasks(filter);
-            // Release auto-scope: narrow to affected tasks (mirrors GenerateConfigsStep)
-            if (state.releaseAutoScope && !ctx.config.noAutoScope) {
-                const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
-                tasks = tasks.filter((t) => scopedIds.has(t.id));
-            }
             // Only check context files for tasks that have canonical docs.
             // Tasks without canonical docs are skipped by FetchDocsStep (they
             // have no docs to fetch), so no context file is written for them.
             // The generated Promptfoo config still includes their "without-docs"
             // variant (testing model knowledge alone), which doesn't need a
             // context file.
-            // Bridge: narrow to literacy tasks with docs
             const tasksWithDocs = tasks.filter((t) => t.mode === "literacy" && (t.context?.docs?.length ?? 0) > 0);
             const taskIds = tasksWithDocs.map((t) => t.id);
             const contextIssues = checkCanonicalContextsExist(rootDir, taskIds);
@@ -83,14 +86,8 @@ export class RunEvalStep {
         if (!debug?.enabled) {
             try {
                 evalFingerprint = computeEvalFingerprint({
-                    filter: ctx.config.areas || ctx.config.tasks || ctx.config.tags
-                        ? {
-                            areas: ctx.config.areas,
-                            taskIds: ctx.config.tasks,
-                            tags: ctx.config.tags,
-                        }
-                        : undefined,
-                    graderModel: "default",
+                    tasks,
+                    graderModel: loadGraderModel(rootDir).id,
                     mode: this.mode,
                     rootDir,
                 });
@@ -119,11 +116,22 @@ export class RunEvalStep {
                     state.promptfooUrls ??= [];
                     state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
                 }
-                // W0050 — score-summary-cached was an unregistered capture;
-                // scoreSummary is already emitted by calculate-scores-step on the
-                // non-cached path, which also runs when we have a remote cache hit
-                // (populating state.remoteCacheHits → CalculateScoresStep still
-                // invokes for the score-summary emit). Dropped here.
+                // D0040 / W0135 — restore the cached report's artifact manifest into
+                // the accumulator so the new run's RunManifest advertises the cached
+                // artifacts via cross-run lineage (`sourceRunId`) instead of skipping
+                // them entirely. Without this, Studio drill-downs on the new report
+                // 404 because per-entry GCS objects were never written under the new
+                // runId. Bytes are not duplicated; the original prefix is untouched.
+                if (remoteCacheResult.artifactManifest &&
+                    remoteCacheResult.sourceRunId &&
+                    ctx.artifactWriter instanceof AccumulatingArtifactWriter) {
+                    const restored = remapToCacheHitRefs(remoteCacheResult.artifactManifest, { sourceRunId: remoteCacheResult.sourceRunId });
+                    ctx.artifactWriter.injectAccumulated(restored);
+                    const count = Object.keys(restored).length;
+                    if (count > 0) {
+                        console.log(`  ↪ Restored ${count} artifact ref${count === 1 ? "" : "s"} from run ${remoteCacheResult.sourceRunId}`);
+                    }
+                }
                 return {
                     durationMs: Date.now() - start,
                     status: "success",
@@ -241,9 +249,11 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
         console.log(`  ✅ Remote cache hit — reusing report ${cachedReport.id} from ${cachedReport.completedAt}`);
         console.log(`  ℹ️  Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
         return {
+            artifactManifest: cachedReport.artifactManifest,
             completedAt: cachedReport.completedAt,
             promptfooUrls: cachedReport.provenance?.promptfooUrls,
             reportId: cachedReport.id,
+            sourceRunId: cachedReport.provenance?.runId,
         };
     }
     catch (err) {

package/dist/pipeline/cache-hit-restore.d.ts ADDED

@@ -0,0 +1,24 @@
+/**
+ * cache-hit-restore.ts — helpers for the eval cache-hit branch in
+ * `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
+ * refs so the new run's manifest advertises the cached artifacts via
+ * cross-run lineage instead of pointing at GCS objects that were never
+ * written under the new runId.
+ *
+ * @see docs/decisions/D0040-artifact-ref-source-run-id.md
+ * @see docs/design-docs/cache-hit-artifact-restoration.md
+ */
+import type { ArtifactManifest, RunId } from "../_vendor/ailf-core/index.d.ts";
+/**
+ * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref.
+ *
+ * The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
+ * unchanged — they already point at the source run's storage. Only
+ * `sourceRunId` is added so retention/GC and observability tooling can
+ * follow the cross-run dependency.
+ *
+ * Pure function; safe to call without side effects.
+ */
+export declare function remapToCacheHitRefs(source: ArtifactManifest, opts: {
+    sourceRunId: RunId;
+}): ArtifactManifest;

package/dist/pipeline/cache-hit-restore.js ADDED

@@ -0,0 +1,32 @@
+/**
+ * cache-hit-restore.ts — helpers for the eval cache-hit branch in
+ * `RunEvalStep`. Stamps `sourceRunId` onto a cached report's artifact
+ * refs so the new run's manifest advertises the cached artifacts via
+ * cross-run lineage instead of pointing at GCS objects that were never
+ * written under the new runId.
+ *
+ * @see docs/decisions/D0040-artifact-ref-source-run-id.md
+ * @see docs/design-docs/cache-hit-artifact-restoration.md
+ */
+/**
+ * Copy an artifact manifest verbatim and stamp `sourceRunId` on every ref.
+ *
+ * The ref's `path`, `bucket`, `entries`, `bytes`, `preview`, etc. travel
+ * unchanged — they already point at the source run's storage. Only
+ * `sourceRunId` is added so retention/GC and observability tooling can
+ * follow the cross-run dependency.
+ *
+ * Pure function; safe to call without side effects.
+ */
+export function remapToCacheHitRefs(source, opts) {
+    const out = {};
+    for (const [type, ref] of Object.entries(source)) {
+        if (!ref)
+            continue;
+        out[type] = {
+            ...ref,
+            sourceRunId: opts.sourceRunId,
+        };
+    }
+    return out;
+}

package/dist/pipeline/eval-fingerprint.d.ts CHANGED

@@ -6,30 +6,35 @@
  * pipeline can query the Sanity Content Lake for a previous report with an
  * identical fingerprint and skip the expensive eval step.
  *
- * The fingerprint captures everything that would change evaluation results:
+ * The fingerprint captures:
  * - Evaluation mode (baseline, observed, agentic)
- * - Model configuration (which models, their settings)
  * - Grader model identity (different graders score differently)
- * - Prompt templates (different instructions → different outputs)
- * - Rubric templates (different criteria → different scores)
- * - Task definitions (what's being evaluated)
- * - Reference solutions (used by grader assertions)
- * - Documentation content (the docs being evaluated — the primary variable)
- * - Filter flags (which subset of tasks is included)
+ * - The task set that was actually loaded for this run, in its canonical
+ *   shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
+ *   Studio-authored task edits in the Content Lake are picked up — pre-v2
+ *   the fingerprint walked `tasks/` on disk and missed them entirely).
+ * - Repo-tracked config (models, prompts, rubrics) and reference solutions.
+ * - Fetched canonical doc content (contexts/canonical/*.md).
  *
  * The fingerprint intentionally EXCLUDES:
- * - Source name/URL (content matters, not origin)
- * - Git metadata (informational, not eval-affecting)
- * - Trigger type (manual vs CI → same inputs → same results)
- * - Report tags (human labels)
+ * - Source name/URL (content matters, not origin).
+ * - Git metadata (informational, not eval-affecting).
+ * - Trigger type (manual vs CI → same inputs → same results).
+ * - Report tags (human labels).
  *
  * @see docs/design-docs/content-lake-eval-caching.md
  */
-import type { EvalMode, FilterOptions } from "./types.js";
+import type { GeneralizedTaskDefinition } from "../_vendor/ailf-core/index.d.ts";
+import type { EvalMode } from "../_vendor/ailf-shared/index.d.ts";
 /** Inputs needed to compute an evaluation fingerprint. */
 export interface FingerprintInput {
-    /** Filter options (areas, taskIds) — determines which tasks are included */
-    filter?: FilterOptions;
+    /**
+     * Task definitions returned by `ctx.taskSource.loadTasks(filter)` after
+     * any release-auto-scope narrowing has been applied. The fingerprint
+     * captures whatever set the pipeline is actually about to evaluate, so
+     * filter changes are reflected implicitly.
+     */
+    tasks: readonly GeneralizedTaskDefinition[];
     /** Grader model identifier (e.g., "anthropic:messages:claude-opus-4-5-20251101") */
     graderModel: string;
     /** Evaluation mode */
@@ -37,30 +42,23 @@ export interface FingerprintInput {
     /** Path to the packages/eval root directory */
     rootDir: string;
 }
-/**
- * Collect all file paths that contribute to the evaluation fingerprint.
- *
- * This is similar to `getStepInputPaths()` in `cache.ts` but is more
- * comprehensive and explicitly designed for cross-environment cache keys:
- *
- * - Includes `config/prompts` and `config/rubrics` directly
- *   (the local cache only includes them indirectly via generated configs)
- * - Includes `config/models` (model configuration)
- * - Includes task definitions and reference solutions
- * - Includes the actual documentation content (contexts/canonical/*.md)
- * - Respects filter flags to only include relevant files
- */
-export declare function collectFingerprintInputPaths(rootDir: string, filter?: FilterOptions): string[];
 /**
  * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
  *
- * The fingerprint is content-addressed: identical inputs always produce
- * the same fingerprint, regardless of the environment (local, CI, etc.).
- *
- * Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
- * and adds non-file context (mode, grader model, filter flags) as
- * additional context strings.
+ * Identical inputs always produce the same fingerprint, regardless of the
+ * environment (local, CI, etc.). Cross-environment portability relies on
+ * (a) tasks coming from the same Content Lake source and (b) file paths
+ * being hashed as rootDir-relative.
  *
  * @returns SHA-256 hex string (64 characters)
  */
 export declare function computeEvalFingerprint(input: FingerprintInput): string;
+/**
+ * Collect repo-tracked + fetched file paths that contribute to the
+ * fingerprint. Tasks are NOT collected here — they come from
+ * `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
+ * input on `computeEvalFingerprint`.
+ *
+ * Exported for the debug-fingerprint diagnostic script.
+ */
+export declare function collectFingerprintFilePaths(rootDir: string): string[];

package/dist/pipeline/eval-fingerprint.js CHANGED

@@ -6,28 +6,27 @@
  * pipeline can query the Sanity Content Lake for a previous report with an
  * identical fingerprint and skip the expensive eval step.
  *
- * The fingerprint captures everything that would change evaluation results:
+ * The fingerprint captures:
  * - Evaluation mode (baseline, observed, agentic)
- * - Model configuration (which models, their settings)
  * - Grader model identity (different graders score differently)
- * - Prompt templates (different instructions → different outputs)
- * - Rubric templates (different criteria → different scores)
- * - Task definitions (what's being evaluated)
- * - Reference solutions (used by grader assertions)
- * - Documentation content (the docs being evaluated — the primary variable)
- * - Filter flags (which subset of tasks is included)
+ * - The task set that was actually loaded for this run, in its canonical
+ *   shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
+ *   Studio-authored task edits in the Content Lake are picked up — pre-v2
+ *   the fingerprint walked `tasks/` on disk and missed them entirely).
+ * - Repo-tracked config (models, prompts, rubrics) and reference solutions.
+ * - Fetched canonical doc content (contexts/canonical/*.md).
  *
  * The fingerprint intentionally EXCLUDES:
- * - Source name/URL (content matters, not origin)
- * - Git metadata (informational, not eval-affecting)
- * - Trigger type (manual vs CI → same inputs → same results)
- * - Report tags (human labels)
+ * - Source name/URL (content matters, not origin).
+ * - Git metadata (informational, not eval-affecting).
+ * - Trigger type (manual vs CI → same inputs → same results).
+ * - Report tags (human labels).
  *
  * @see docs/design-docs/content-lake-eval-caching.md
  */
-import { existsSync, readdirSync, statSync } from "fs";
-import { join, resolve } from "path";
-import { hashFiles } from "./cache.js";
+import { createHash } from "crypto";
+import { existsSync, readdirSync, readFileSync, statSync } from "fs";
+import { join, relative, resolve } from "path";
 // ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
@@ -35,130 +34,149 @@ import { hashFiles } from "./cache.js";
  * Version prefix for the fingerprint hash. Bumping this invalidates all
  * existing fingerprints in the Content Lake without needing to clear the
  * store. Change this when adding new inputs to the hash.
+ *
+ * v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
+ * files), file paths normalized to rootDir-relative, grader passed
+ * through verbatim instead of the literal string "default".
  */
-const FINGERPRINT_VERSION = "eval-fingerprint-v1";
+const FINGERPRINT_VERSION = "eval-fingerprint-v2";
 /**
- * Collect all file paths that contribute to the evaluation fingerprint.
+ * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
  *
- * This is similar to `getStepInputPaths()` in `cache.ts` but is more
- * comprehensive and explicitly designed for cross-environment cache keys:
+ * Identical inputs always produce the same fingerprint, regardless of the
+ * environment (local, CI, etc.). Cross-environment portability relies on
+ * (a) tasks coming from the same Content Lake source and (b) file paths
+ * being hashed as rootDir-relative.
  *
- * - Includes `config/prompts` and `config/rubrics` directly
- *   (the local cache only includes them indirectly via generated configs)
- * - Includes `config/models` (model configuration)
- * - Includes task definitions and reference solutions
- * - Includes the actual documentation content (contexts/canonical/*.md)
- * - Respects filter flags to only include relevant files
+ * @returns SHA-256 hex string (64 characters)
  */
-export function collectFingerprintInputPaths(rootDir, filter) {
+export function computeEvalFingerprint(input) {
+    const { graderModel, mode, rootDir, tasks } = input;
+    const hash = createHash("sha256");
+    hash.update(`version:${FINGERPRINT_VERSION}\n`);
+    hash.update(`mode:${mode}\n`);
+    hash.update(`grader:${graderModel}\n`);
+    hash.update(`tasks:${hashTaskSet(tasks)}\n`);
+    // Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
+    // so a CI runner at /home/runner/... and a laptop at /Users/... produce
+    // the same hash for byte-identical content.
+    const filePaths = collectFingerprintFilePaths(rootDir);
+    for (const p of [...filePaths].sort(byteCompare)) {
+        hash.update(`path:${relative(rootDir, p)}\n`);
+        if (existsSync(p)) {
+            hash.update(readFileSync(p));
+        }
+        else {
+            hash.update("__missing__\n");
+        }
+        hash.update("\n---\n");
+    }
+    return hash.digest("hex");
+}
+/**
+ * Collect repo-tracked + fetched file paths that contribute to the
+ * fingerprint. Tasks are NOT collected here — they come from
+ * `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
+ * input on `computeEvalFingerprint`.
+ *
+ * Exported for the debug-fingerprint diagnostic script.
+ */
+export function collectFingerprintFilePaths(rootDir) {
     const r = (rel) => resolve(rootDir, rel);
     const paths = [];
-    // -----------------------------------------------------------------------
-    // Config files — always included
-    // -----------------------------------------------------------------------
-    // Check all supported extensions in priority order
+    // Config files (any of the supported extensions)
     const configNames = ["models", "prompts", "rubrics"];
     const configExts = [".ts", ".js", ".yaml", ".yml", ".json"];
-    const configFiles = configNames.flatMap((name) => configExts.map((ext) => `config/${name}${ext}`));
-    for (const f of configFiles) {
-        const p = r(f);
-        if (existsSync(p))
-            paths.push(p);
-    }
-    // -----------------------------------------------------------------------
-    // Task files — filtered if --area is set
-    // -----------------------------------------------------------------------
-    const tasksDir = r("tasks");
-    if (existsSync(tasksDir)) {
-        const taskFiles = readdirSync(tasksDir)
-            .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
-            .filter((f) => !f.startsWith(".")); // exclude .expanded.yaml
-        for (const f of taskFiles) {
-            // If area filter is set, only include matching task files
-            if (filter?.areas && filter.areas.length > 0) {
-                const stem = f.replace(/\.(yaml|yml|task\.ts|task\.js)$/, "");
-                if (!filter.areas.includes(stem))
-                    continue;
-            }
-            paths.push(join(tasksDir, f));
+    for (const name of configNames) {
+        for (const ext of configExts) {
+            const p = r(`config/${name}${ext}`);
+            if (existsSync(p))
+                paths.push(p);
         }
     }
-    // -----------------------------------------------------------------------
-    // Reference solutions — all included (they're referenced by tasks)
-    // -----------------------------------------------------------------------
+    // Reference solutions — recursive (mixed languages, nested by area)
     const refDir = r("canonical/reference-solutions");
-    if (existsSync(refDir)) {
+    if (existsSync(refDir))
         collectFilesRecursive(refDir, paths);
-    }
-    // -----------------------------------------------------------------------
-    // Canonical context files — the documentation content being evaluated
-    // This is the KEY differentiator from the local cache (which doesn't
-    // include Sanity document content in the fetch-docs cache key).
-    // -----------------------------------------------------------------------
+    // Canonical context files — the fetched documentation content. These
+    // change whenever the Content Lake source shifts, so they capture
+    // doc-level edits that the task set itself wouldn't reflect.
     const canonicalDir = r("contexts/canonical");
     if (existsSync(canonicalDir)) {
         const contextFiles = readdirSync(canonicalDir)
             .filter((f) => f.endsWith(".md"))
-            .sort();
-        for (const f of contextFiles) {
-            // If area or task filter is set, we include all context files anyway
-            // because context filenames map to task IDs, and task-to-area mapping
-            // requires reading the YAML. It's safer to include all — a superset
-            // doesn't cause false cache hits, only potential false misses when
-            // a non-matching context changes. This is acceptable: the filter
-            // flags in the context strings differentiate the fingerprints.
+            .sort(byteCompare);
+        for (const f of contextFiles)
             paths.push(join(canonicalDir, f));
-        }
     }
     return paths;
 }
+// ---------------------------------------------------------------------------
+// Canonical serialization — byte-stable across runtimes
+// ---------------------------------------------------------------------------
 /**
- * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
- *
- * The fingerprint is content-addressed: identical inputs always produce
- * the same fingerprint, regardless of the environment (local, CI, etc.).
- *
- * Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
- * and adds non-file context (mode, grader model, filter flags) as
- * additional context strings.
+ * Compare two strings by their UTF-8 byte representation.
  *
- * @returns SHA-256 hex string (64 characters)
+ * Why this and not `localeCompare` or default `<`: `localeCompare` calls
+ * into ICU, whose tables can vary between Node builds (small-icu vs
+ * full-icu) and platforms. The default JS string comparison uses UTF-16
+ * code units, which diverges from UTF-8 byte order for surrogate pairs.
+ * `Buffer.compare` on UTF-8 is well-defined and runtime-independent —
+ * the right primitive when the result feeds a hash.
  */
-export function computeEvalFingerprint(input) {
-    const { filter, graderModel, mode, rootDir } = input;
-    // -----------------------------------------------------------------------
-    // 1. Collect context strings (non-file inputs)
-    // -----------------------------------------------------------------------
-    const context = [
-        FINGERPRINT_VERSION,
-        `mode:${mode}`,
-        `grader:${graderModel}`,
-    ];
-    // Include filter flags so that scoped runs produce different fingerprints
-    if (filter?.areas && filter.areas.length > 0) {
-        context.push(`areas:${[...filter.areas].sort().join(",")}`);
-    }
-    if (filter?.taskIds && filter.taskIds.length > 0) {
-        context.push(`tasks:${[...filter.taskIds].sort().join(",")}`);
+function byteCompare(a, b) {
+    return Buffer.compare(Buffer.from(a, "utf8"), Buffer.from(b, "utf8"));
+}
+/**
+ * Recursively normalize `value` for stable serialization: object keys
+ * sorted by UTF-8 byte order; arrays preserve order (the caller decides
+ * whether to pre-sort).
+ */
+function canonicalize(value) {
+    if (Array.isArray(value))
+        return value.map(canonicalize);
+    if (value !== null && typeof value === "object") {
+        const out = {};
+        const obj = value;
+        for (const k of Object.keys(obj).sort(byteCompare)) {
+            out[k] = canonicalize(obj[k]);
+        }
+        return out;
     }
-    // -----------------------------------------------------------------------
-    // 2. Collect input file paths (all files that affect eval output)
-    // -----------------------------------------------------------------------
-    const paths = collectFingerprintInputPaths(rootDir, filter);
-    // -----------------------------------------------------------------------
-    // 3. Hash everything together
-    // -----------------------------------------------------------------------
-    return hashFiles(paths, context);
+    return value;
+}
+/**
+ * Hash a task set in a way that's invariant under source ordering and
+ * optional-field-spread reorder.
+ *
+ * Each task is canonicalized once, then the array is sorted by
+ * `(id, canonical-json)`. The secondary sort key matters: the Content
+ * Lake currently has duplicate `ailf.task` documents that share the
+ * same `id.current` but differ in body (DOC-2096). With only the id as
+ * the sort key, two such duplicates compare equal and their relative
+ * order falls back to GROQ's input order — which is undefined for
+ * equal `(area, id)` rows, so the hash could shift between runs over
+ * the same dataset. Tiebreaking on the serialized content makes the
+ * hash deterministic even in the presence of dup-id rows.
+ */
+function hashTaskSet(tasks) {
+    const serialized = tasks.map((t) => ({
+        id: t.id,
+        json: JSON.stringify(canonicalize(t)),
+    }));
+    serialized.sort((a, b) => byteCompare(a.id, b.id) || byteCompare(a.json, b.json));
+    const arrayJson = "[" + serialized.map((e) => e.json).join(",") + "]";
+    return createHash("sha256").update(arrayJson).digest("hex");
 }
 // ---------------------------------------------------------------------------
-// Helpers
+// File walk
 // ---------------------------------------------------------------------------
 /**
  * Recursively collect all file paths under a directory.
  * Skips hidden files and directories (starting with '.').
  */
 function collectFilesRecursive(dir, paths) {
-    const entries = readdirSync(dir);
+    const entries = readdirSync(dir).sort(byteCompare);
     for (const entry of entries) {
         if (entry.startsWith("."))
             continue;

package/dist/report-store.js CHANGED Viewed

@@ -286,7 +286,10 @@ export function generateReportId() {
  * metadata (_id, _type, _rev, etc.) that we strip.
  */
 function toReport(doc) {
+    const summary = doc.summary;
+    const artifactManifest = summary?.artifactManifest;
     return {
+        artifactManifest,
         comparison: doc.comparison,
         completedAt: doc.completedAt,
         durationMs: doc.durationMs,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "4.0.0",
+  "version": "4.0.1",
   "private": false,
   "publishConfig": {
     "access": "public"