npm - @sanity/ailf - Versions diffs - 3.9.0 → 4.0.1 - Mend

@sanity/ailf 3.9.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

package/config/models.ts +32 -4
package/dist/_vendor/ailf-core/config-helpers.d.ts +8 -2
package/dist/_vendor/ailf-core/config-helpers.js +54 -1
package/dist/_vendor/ailf-core/services/slim-report-summary.js +13 -4
package/dist/_vendor/ailf-core/types/index.d.ts +10 -0
package/dist/_vendor/ailf-shared/index.d.ts +16 -10
package/dist/_vendor/ailf-shared/index.js +13 -10
package/dist/adapters/task-sources/repo-schemas.d.ts +3 -3
package/dist/agent-observer/agentic-provider.js +28 -23
package/dist/agent-observer/classifier.js +7 -2
package/dist/agent-observer/proxy.d.ts +88 -3
package/dist/agent-observer/proxy.js +174 -16
package/dist/agent-observer/types.d.ts +23 -5
package/dist/artifact-capture/accumulating-artifact-writer.d.ts +13 -0
package/dist/artifact-capture/accumulating-artifact-writer.js +19 -0
package/dist/cli-program.js +1 -1
package/dist/commands/baseline.d.ts +3 -1
package/dist/commands/baseline.js +29 -9
package/dist/commands/cache.d.ts +5 -1
package/dist/commands/cache.js +31 -15
package/dist/commands/check-staleness.js +12 -4
package/dist/commands/compare.js +11 -4
package/dist/commands/explain-handler.js +2 -2
package/dist/config/models.ts +32 -4
package/dist/orchestration/steps/run-eval-step.js +39 -29
package/dist/pipeline/baseline.d.ts +14 -3
package/dist/pipeline/baseline.js +7 -13
package/dist/pipeline/cache-hit-restore.d.ts +24 -0
package/dist/pipeline/cache-hit-restore.js +32 -0
package/dist/pipeline/calculate-scores.js +40 -1
package/dist/pipeline/compiler/provider-assembler.d.ts +23 -0
package/dist/pipeline/compiler/provider-assembler.js +37 -2
package/dist/pipeline/eval-fingerprint.d.ts +33 -35
package/dist/pipeline/eval-fingerprint.js +124 -106
package/dist/pipeline/mirror-repo-tasks.d.ts +1 -1
package/dist/report-store.js +3 -0
package/package.json +2 -2

package/dist/pipeline/eval-fingerprint.js CHANGED Viewed

@@ -6,28 +6,27 @@
  * pipeline can query the Sanity Content Lake for a previous report with an
  * identical fingerprint and skip the expensive eval step.
  *
- * The fingerprint captures everything that would change evaluation results:
+ * The fingerprint captures:
  * - Evaluation mode (baseline, observed, agentic)
- * - Model configuration (which models, their settings)
  * - Grader model identity (different graders score differently)
- * - Prompt templates (different instructions → different outputs)
- * - Rubric templates (different criteria → different scores)
- * - Task definitions (what's being evaluated)
- * - Reference solutions (used by grader assertions)
- * - Documentation content (the docs being evaluated — the primary variable)
- * - Filter flags (which subset of tasks is included)
+ * - The task set that was actually loaded for this run, in its canonical
+ *   shape (taken straight from `ctx.taskSource.loadTasks(filter)` so that
+ *   Studio-authored task edits in the Content Lake are picked up — pre-v2
+ *   the fingerprint walked `tasks/` on disk and missed them entirely).
+ * - Repo-tracked config (models, prompts, rubrics) and reference solutions.
+ * - Fetched canonical doc content (contexts/canonical/*.md).
  *
  * The fingerprint intentionally EXCLUDES:
- * - Source name/URL (content matters, not origin)
- * - Git metadata (informational, not eval-affecting)
- * - Trigger type (manual vs CI → same inputs → same results)
- * - Report tags (human labels)
+ * - Source name/URL (content matters, not origin).
+ * - Git metadata (informational, not eval-affecting).
+ * - Trigger type (manual vs CI → same inputs → same results).
+ * - Report tags (human labels).
  *
  * @see docs/design-docs/content-lake-eval-caching.md
  */
-import { existsSync, readdirSync, statSync } from "fs";
-import { join, resolve } from "path";
-import { hashFiles } from "./cache.js";
+import { createHash } from "crypto";
+import { existsSync, readdirSync, readFileSync, statSync } from "fs";
+import { join, relative, resolve } from "path";
 // ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
@@ -35,130 +34,149 @@ import { hashFiles } from "./cache.js";
  * Version prefix for the fingerprint hash. Bumping this invalidates all
  * existing fingerprints in the Content Lake without needing to clear the
  * store. Change this when adding new inputs to the hash.
+ *
+ * v2 (2026-04-29): tasks now sourced from ctx.taskSource (not on-disk
+ * files), file paths normalized to rootDir-relative, grader passed
+ * through verbatim instead of the literal string "default".
  */
-const FINGERPRINT_VERSION = "eval-fingerprint-v1";
+const FINGERPRINT_VERSION = "eval-fingerprint-v2";
 /**
- * Collect all file paths that contribute to the evaluation fingerprint.
+ * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
  *
- * This is similar to `getStepInputPaths()` in `cache.ts` but is more
- * comprehensive and explicitly designed for cross-environment cache keys:
+ * Identical inputs always produce the same fingerprint, regardless of the
+ * environment (local, CI, etc.). Cross-environment portability relies on
+ * (a) tasks coming from the same Content Lake source and (b) file paths
+ * being hashed as rootDir-relative.
  *
- * - Includes `config/prompts` and `config/rubrics` directly
- *   (the local cache only includes them indirectly via generated configs)
- * - Includes `config/models` (model configuration)
- * - Includes task definitions and reference solutions
- * - Includes the actual documentation content (contexts/canonical/*.md)
- * - Respects filter flags to only include relevant files
+ * @returns SHA-256 hex string (64 characters)
  */
-export function collectFingerprintInputPaths(rootDir, filter) {
+export function computeEvalFingerprint(input) {
+    const { graderModel, mode, rootDir, tasks } = input;
+    const hash = createHash("sha256");
+    hash.update(`version:${FINGERPRINT_VERSION}\n`);
+    hash.update(`mode:${mode}\n`);
+    hash.update(`grader:${graderModel}\n`);
+    hash.update(`tasks:${hashTaskSet(tasks)}\n`);
+    // Hash repo-tracked + fetched files. Paths are stored as rootDir-relative
+    // so a CI runner at /home/runner/... and a laptop at /Users/... produce
+    // the same hash for byte-identical content.
+    const filePaths = collectFingerprintFilePaths(rootDir);
+    for (const p of [...filePaths].sort(byteCompare)) {
+        hash.update(`path:${relative(rootDir, p)}\n`);
+        if (existsSync(p)) {
+            hash.update(readFileSync(p));
+        }
+        else {
+            hash.update("__missing__\n");
+        }
+        hash.update("\n---\n");
+    }
+    return hash.digest("hex");
+}
+/**
+ * Collect repo-tracked + fetched file paths that contribute to the
+ * fingerprint. Tasks are NOT collected here — they come from
+ * `ctx.taskSource.loadTasks()` and flow into the hash via the `tasks`
+ * input on `computeEvalFingerprint`.
+ *
+ * Exported for the debug-fingerprint diagnostic script.
+ */
+export function collectFingerprintFilePaths(rootDir) {
     const r = (rel) => resolve(rootDir, rel);
     const paths = [];
-    // -----------------------------------------------------------------------
-    // Config files — always included
-    // -----------------------------------------------------------------------
-    // Check all supported extensions in priority order
+    // Config files (any of the supported extensions)
     const configNames = ["models", "prompts", "rubrics"];
     const configExts = [".ts", ".js", ".yaml", ".yml", ".json"];
-    const configFiles = configNames.flatMap((name) => configExts.map((ext) => `config/${name}${ext}`));
-    for (const f of configFiles) {
-        const p = r(f);
-        if (existsSync(p))
-            paths.push(p);
-    }
-    // -----------------------------------------------------------------------
-    // Task files — filtered if --area is set
-    // -----------------------------------------------------------------------
-    const tasksDir = r("tasks");
-    if (existsSync(tasksDir)) {
-        const taskFiles = readdirSync(tasksDir)
-            .filter((f) => /\.(yaml|yml|task\.ts|task\.js)$/.test(f))
-            .filter((f) => !f.startsWith(".")); // exclude .expanded.yaml
-        for (const f of taskFiles) {
-            // If area filter is set, only include matching task files
-            if (filter?.areas && filter.areas.length > 0) {
-                const stem = f.replace(/\.(yaml|yml|task\.ts|task\.js)$/, "");
-                if (!filter.areas.includes(stem))
-                    continue;
-            }
-            paths.push(join(tasksDir, f));
+    for (const name of configNames) {
+        for (const ext of configExts) {
+            const p = r(`config/${name}${ext}`);
+            if (existsSync(p))
+                paths.push(p);
         }
     }
-    // -----------------------------------------------------------------------
-    // Reference solutions — all included (they're referenced by tasks)
-    // -----------------------------------------------------------------------
+    // Reference solutions — recursive (mixed languages, nested by area)
     const refDir = r("canonical/reference-solutions");
-    if (existsSync(refDir)) {
+    if (existsSync(refDir))
         collectFilesRecursive(refDir, paths);
-    }
-    // -----------------------------------------------------------------------
-    // Canonical context files — the documentation content being evaluated
-    // This is the KEY differentiator from the local cache (which doesn't
-    // include Sanity document content in the fetch-docs cache key).
-    // -----------------------------------------------------------------------
+    // Canonical context files — the fetched documentation content. These
+    // change whenever the Content Lake source shifts, so they capture
+    // doc-level edits that the task set itself wouldn't reflect.
     const canonicalDir = r("contexts/canonical");
     if (existsSync(canonicalDir)) {
         const contextFiles = readdirSync(canonicalDir)
             .filter((f) => f.endsWith(".md"))
-            .sort();
-        for (const f of contextFiles) {
-            // If area or task filter is set, we include all context files anyway
-            // because context filenames map to task IDs, and task-to-area mapping
-            // requires reading the YAML. It's safer to include all — a superset
-            // doesn't cause false cache hits, only potential false misses when
-            // a non-matching context changes. This is acceptable: the filter
-            // flags in the context strings differentiate the fingerprints.
+            .sort(byteCompare);
+        for (const f of contextFiles)
             paths.push(join(canonicalDir, f));
-        }
     }
     return paths;
 }
+// ---------------------------------------------------------------------------
+// Canonical serialization — byte-stable across runtimes
+// ---------------------------------------------------------------------------
 /**
- * Compute a deterministic SHA-256 fingerprint of all evaluation inputs.
- *
- * The fingerprint is content-addressed: identical inputs always produce
- * the same fingerprint, regardless of the environment (local, CI, etc.).
- *
- * Reuses the existing `hashFiles()` from `cache.ts` to hash file content,
- * and adds non-file context (mode, grader model, filter flags) as
- * additional context strings.
+ * Compare two strings by their UTF-8 byte representation.
  *
- * @returns SHA-256 hex string (64 characters)
+ * Why this and not `localeCompare` or default `<`: `localeCompare` calls
+ * into ICU, whose tables can vary between Node builds (small-icu vs
+ * full-icu) and platforms. The default JS string comparison uses UTF-16
+ * code units, which diverges from UTF-8 byte order for surrogate pairs.
+ * `Buffer.compare` on UTF-8 is well-defined and runtime-independent —
+ * the right primitive when the result feeds a hash.
  */
-export function computeEvalFingerprint(input) {
-    const { filter, graderModel, mode, rootDir } = input;
-    // -----------------------------------------------------------------------
-    // 1. Collect context strings (non-file inputs)
-    // -----------------------------------------------------------------------
-    const context = [
-        FINGERPRINT_VERSION,
-        `mode:${mode}`,
-        `grader:${graderModel}`,
-    ];
-    // Include filter flags so that scoped runs produce different fingerprints
-    if (filter?.areas && filter.areas.length > 0) {
-        context.push(`areas:${[...filter.areas].sort().join(",")}`);
-    }
-    if (filter?.taskIds && filter.taskIds.length > 0) {
-        context.push(`tasks:${[...filter.taskIds].sort().join(",")}`);
+function byteCompare(a, b) {
+    return Buffer.compare(Buffer.from(a, "utf8"), Buffer.from(b, "utf8"));
+}
+/**
+ * Recursively normalize `value` for stable serialization: object keys
+ * sorted by UTF-8 byte order; arrays preserve order (the caller decides
+ * whether to pre-sort).
+ */
+function canonicalize(value) {
+    if (Array.isArray(value))
+        return value.map(canonicalize);
+    if (value !== null && typeof value === "object") {
+        const out = {};
+        const obj = value;
+        for (const k of Object.keys(obj).sort(byteCompare)) {
+            out[k] = canonicalize(obj[k]);
+        }
+        return out;
     }
-    // -----------------------------------------------------------------------
-    // 2. Collect input file paths (all files that affect eval output)
-    // -----------------------------------------------------------------------
-    const paths = collectFingerprintInputPaths(rootDir, filter);
-    // -----------------------------------------------------------------------
-    // 3. Hash everything together
-    // -----------------------------------------------------------------------
-    return hashFiles(paths, context);
+    return value;
+}
+/**
+ * Hash a task set in a way that's invariant under source ordering and
+ * optional-field-spread reorder.
+ *
+ * Each task is canonicalized once, then the array is sorted by
+ * `(id, canonical-json)`. The secondary sort key matters: the Content
+ * Lake currently has duplicate `ailf.task` documents that share the
+ * same `id.current` but differ in body (DOC-2096). With only the id as
+ * the sort key, two such duplicates compare equal and their relative
+ * order falls back to GROQ's input order — which is undefined for
+ * equal `(area, id)` rows, so the hash could shift between runs over
+ * the same dataset. Tiebreaking on the serialized content makes the
+ * hash deterministic even in the presence of dup-id rows.
+ */
+function hashTaskSet(tasks) {
+    const serialized = tasks.map((t) => ({
+        id: t.id,
+        json: JSON.stringify(canonicalize(t)),
+    }));
+    serialized.sort((a, b) => byteCompare(a.id, b.id) || byteCompare(a.json, b.json));
+    const arrayJson = "[" + serialized.map((e) => e.json).join(",") + "]";
+    return createHash("sha256").update(arrayJson).digest("hex");
 }
 // ---------------------------------------------------------------------------
-// Helpers
+// File walk
 // ---------------------------------------------------------------------------
 /**
  * Recursively collect all file paths under a directory.
  * Skips hidden files and directories (starting with '.').
  */
 function collectFilesRecursive(dir, paths) {
-    const entries = readdirSync(dir);
+    const entries = readdirSync(dir).sort(byteCompare);
     for (const entry of entries) {
         if (entry.startsWith("."))
             continue;

package/dist/pipeline/mirror-repo-tasks.d.ts CHANGED Viewed

@@ -107,7 +107,7 @@ export declare function buildMirrorDocument(task: LiteracyTaskDefinition, opts:
     slugToDocId: Map<string, string>;
 }): {
     baseline?: {
-        rubric?: "abbreviated" | "full" | "none" | undefined;
+        rubric?: "full" | "abbreviated" | "none" | undefined;
         enabled?: boolean | undefined;
     } | undefined;
     _id: string;

package/dist/report-store.js CHANGED Viewed

@@ -286,7 +286,10 @@ export function generateReportId() {
  * metadata (_id, _type, _rev, etc.) that we strip.
  */
 function toReport(doc) {
+    const summary = doc.summary;
+    const artifactManifest = summary?.artifactManifest;
     return {
+        artifactManifest,
         comparison: doc.comparison,
         completedAt: doc.completedAt,
         durationMs: doc.durationMs,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@sanity/ailf",
-  "version": "3.9.0",
+  "version": "4.0.1",
   "private": false,
   "publishConfig": {
     "access": "public"
@@ -77,7 +77,7 @@
     "test": "tsx --test src/__tests__/*.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
     "test:e2e": "AILF_E2E=1 tsx --test src/__tests__/e2e/*.e2e.test.ts",
     "test:e2e:adapters": "AILF_E2E=1 tsx --test src/adapters/**/__tests__/*.adapter.test.ts",
-    "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts src/__tests__/run-remote-tier2.test.ts",
+    "test:e2e:api": "AILF_E2E_API=1 tsx --test src/__tests__/api-tier2-tenant-integration.test.ts",
     "test:all": "AILF_E2E=1 tsx --test src/__tests__/*.test.ts src/pipeline/compiler/__tests__/*.test.ts src/__tests__/e2e/*.e2e.test.ts src/adapters/**/__tests__/*.adapter.test.ts",
     "pr-comment": "tsx src/cli.ts pr-comment",
     "coverage-audit": "tsx src/cli.ts report coverage",