npm - @sanity/ailf - Versions diffs - 2.1.0 → 2.3.0 - Mend

@sanity/ailf 2.1.0 → 2.3.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (54) hide show

package/config/rubrics.ts +3 -3
package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
package/dist/_vendor/ailf-core/examples/index.js +66 -1
package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
package/dist/agent-harness/assertions-runtime.d.ts +49 -0
package/dist/agent-harness/assertions-runtime.js +138 -0
package/dist/agent-harness/provider.d.ts +58 -0
package/dist/agent-harness/provider.js +104 -0
package/dist/commands/calculate-scores.js +7 -2
package/dist/commands/capture-list.d.ts +1 -1
package/dist/commands/capture-list.js +6 -3
package/dist/commands/compare.js +11 -7
package/dist/commands/explain-handler.js +22 -24
package/dist/commands/fetch-docs.js +4 -2
package/dist/commands/generate-configs.js +6 -2
package/dist/commands/init.js +3 -0
package/dist/commands/pipeline-action.js +8 -24
package/dist/commands/pipeline.js +1 -1
package/dist/commands/pr-comment.js +6 -2
package/dist/commands/publish.d.ts +1 -0
package/dist/commands/publish.js +12 -8
package/dist/commands/remote-pipeline.js +1 -1
package/dist/commands/remote-results.d.ts +8 -8
package/dist/commands/remote-results.js +7 -7
package/dist/commands/shared/options.d.ts +8 -0
package/dist/commands/shared/options.js +10 -0
package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
package/dist/commands/shared/resolve-output-dir.js +36 -0
package/dist/composition-root.js +1 -1
package/dist/config/rubrics.ts +3 -3
package/dist/orchestration/build-app-context.js +1 -1
package/dist/orchestration/steps/gap-analysis-step.js +86 -75
package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
package/dist/orchestration/steps/generate-configs-step.js +47 -2
package/dist/pipeline/calculate-scores.js +113 -2
package/dist/pipeline/compare.js +50 -19
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +103 -25
package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +15 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +42 -85
package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
package/dist/pipeline/compiler/rubric-resolution.js +52 -0
package/dist/pipeline/compiler/scoring-bridge.js +59 -7
package/dist/pipeline/provenance.js +7 -1
package/dist/pipeline/validate.d.ts +5 -4
package/dist/pipeline/validate.js +34 -113
package/package.json +2 -1

package/dist/commands/shared/options.js CHANGED Viewed

@@ -36,6 +36,16 @@ export function addOutputOptions(cmd) {
         .option("-o, --output <path>", "Write output to a specific file path")
         .option("-f, --format <fmt>", "Output format (e.g., table, json, md)");
 }
+/**
+ * Add --output-dir option for commands that write pipeline artifacts.
+ *
+ * Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
+ * the value. When omitted, `resolveOutputDir()` defaults to
+ * `$CWD/.ailf/results/latest/`.
+ */
+export function addOutputDirOption(cmd) {
+    return cmd.option("--output-dir <path>", "Base directory for output artifacts (default: .ailf/results/latest/)");
+}
 /**
  * Add Sanity source options: --sanity-dataset, --sanity-project, etc.
  */

package/dist/commands/shared/resolve-output-dir.d.ts ADDED Viewed

@@ -0,0 +1,27 @@
+/**
+ * Shared output directory resolution for all CLI commands.
+ *
+ * Resolution order (2-tier):
+ *   1. Explicit `--output-dir <path>` — resolved relative to callerCwd
+ *   2. Default — `$callerCwd/.ailf/results/latest/`
+ *
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
+ * working directory, not the eval package root.
+ *
+ * @see docs/design-docs/output-dir-routing.md
+ * @see docs/work-items/W0030.json
+ */
+/**
+ * Get the caller's working directory.
+ *
+ * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
+ * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
+ */
+export declare function getCallerCwd(): string;
+/**
+ * Resolve the output directory for pipeline artifacts.
+ *
+ * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
+ * @returns Absolute path to the output directory
+ */
+export declare function resolveOutputDir(outputDir?: string): string;

package/dist/commands/shared/resolve-output-dir.js ADDED Viewed

@@ -0,0 +1,36 @@
+/**
+ * Shared output directory resolution for all CLI commands.
+ *
+ * Resolution order (2-tier):
+ *   1. Explicit `--output-dir <path>` — resolved relative to callerCwd
+ *   2. Default — `$callerCwd/.ailf/results/latest/`
+ *
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
+ * working directory, not the eval package root.
+ *
+ * @see docs/design-docs/output-dir-routing.md
+ * @see docs/work-items/W0030.json
+ */
+import { resolve } from "path";
+/**
+ * Get the caller's working directory.
+ *
+ * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
+ * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
+ */
+export function getCallerCwd() {
+    return process.env.AILF_CALLER_CWD ?? process.cwd();
+}
+/**
+ * Resolve the output directory for pipeline artifacts.
+ *
+ * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
+ * @returns Absolute path to the output directory
+ */
+export function resolveOutputDir(outputDir) {
+    const callerCwd = getCallerCwd();
+    if (outputDir) {
+        return resolve(callerCwd, outputDir);
+    }
+    return resolve(callerCwd, ".ailf", "results", "latest");
+}

package/dist/composition-root.js CHANGED Viewed

@@ -60,7 +60,7 @@ export function createAppContext(config) {
     // Artifact collector — no-op by default, filesystem when --capture is set
     const collector = config.captureEnabled
         ? new FilesystemArtifactCollector({
-            captureDir: config.captureDir ?? join(config.rootDir, "results", "captures"),
+            captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
             mode: config.mode,
             compress: config.captureCompress ?? true,
             extras: config.captureExtras ?? true,

package/dist/config/rubrics.ts CHANGED Viewed

@@ -201,9 +201,9 @@ export default defineRubrics({
       currency: 0.2,
     },
     "agent-harness": {
-      "agent-output": 0.45,
-      "tool-usage": 0.4,
-      "process-quality": 0.15,
+      "assertion-pass-rate": 0.35,
+      "agent-output": 0.35,
+      "tool-usage": 0.3,
     },
   },

package/dist/orchestration/build-app-context.js CHANGED Viewed

@@ -79,7 +79,7 @@ export function mapToResolvedConfig(opts, rootDir) {
         apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
         apiKey: opts.apiKey,
         captureEnabled: opts.captureEnabled ?? false,
-        captureDir: opts.captureDir ?? join(rootDir, "results", "captures"),
+        captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
         captureCompress: opts.captureCompress ?? true,
         captureExtras: opts.captureExtras ?? true,
     };

package/dist/orchestration/steps/gap-analysis-step.js CHANGED Viewed

@@ -61,87 +61,96 @@ export class GapAnalysisStep {
             mkdirSync(outDir, { recursive: true });
             writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
             writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
-            const manifestPath = resolve(root, "contexts", "document-manifest.json");
-            const manifestEntries = existsSync(manifestPath)
-                ? JSON.parse(readFileSync(manifestPath, "utf-8"))
-                : [];
-            const refBySlug = new Map();
-            for (const entry of manifestEntries) {
-                refBySlug.set(entry.slug, entry);
-            }
-            const resolveRefs = (slugs) => slugs
-                .map((slug) => {
-                const m = refBySlug.get(slug);
-                return m
-                    ? {
-                        documentId: m._id,
-                        revision: m._rev,
-                        slug: m.slug,
-                        title: m.title,
-                    }
-                    : { documentId: "", slug, title: slug };
-            })
-                .filter((r) => r.documentId !== "");
-            // ── Build description→docs mapping from TaskSource ─────────
-            // Primary source: use the TaskSource adapter from AppContext.
-            // This works with Content Lake, repo-based, and YAML tasks.
-            // Judgments use task description as their taskId, so we build
-            // maps keyed by both description and task ID for robust matching.
+            // ── Document enrichment (literacy mode only) ──────────────
+            // Non-literacy modes don't use canonical docs. Skip manifest
+            // loading, doc-reference enrichment, and canonical doc mapping
+            // entirely — those fields are only meaningful for literacy evals.
+            const isLiteracyMode = ctx.config.mode === "literacy";
+            let documentManifest;
+            let enrichedScores = scoreSummary.scores;
             const descToDocRefs = new Map();
-            const areaToDocRefs = new Map();
-            let tasks = [];
-            try {
-                tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
-            }
-            catch {
-                // TaskSource may not be available in all contexts (e.g., standalone
-                // gap analysis on cached results). Fall through to legacy fallback.
-            }
-            if (tasks.length > 0) {
-                // Group tasks by feature area and build slug maps
-                const byArea = new Map();
-                for (const task of tasks) {
-                    const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
-                    const refs = resolveRefs(slugs);
-                    // Map by title (what judgments use as taskId)
-                    descToDocRefs.set(task.title, refs);
-                    // Also map by task ID for prefix-based matching
-                    descToDocRefs.set(task.id, refs);
-                    // Group slugs by feature area
-                    const area = task.area ?? "";
-                    if (!byArea.has(area))
-                        byArea.set(area, new Set());
-                    for (const s of slugs)
-                        byArea.get(area).add(s);
+            if (isLiteracyMode) {
+                const manifestPath = resolve(root, "contexts", "document-manifest.json");
+                const manifestEntries = existsSync(manifestPath)
+                    ? JSON.parse(readFileSync(manifestPath, "utf-8"))
+                    : [];
+                const refBySlug = new Map();
+                for (const entry of manifestEntries) {
+                    refBySlug.set(entry.slug, entry);
                 }
-                for (const [area, slugs] of byArea) {
-                    areaToDocRefs.set(area, resolveRefs([...slugs]));
+                const resolveRefs = (slugs) => slugs
+                    .map((slug) => {
+                    const m = refBySlug.get(slug);
+                    return m
+                        ? {
+                            documentId: m._id,
+                            revision: m._rev,
+                            slug: m.slug,
+                            title: m.title,
+                        }
+                        : { documentId: "", slug, title: slug };
+                })
+                    .filter((r) => r.documentId !== "");
+                // ── Build description→docs mapping from TaskSource ─────────
+                // Primary source: use the TaskSource adapter from AppContext.
+                // This works with Content Lake, repo-based, and YAML tasks.
+                // Judgments use task description as their taskId, so we build
+                // maps keyed by both description and task ID for robust matching.
+                const areaToDocRefs = new Map();
+                let tasks = [];
+                try {
+                    tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
                 }
-            }
-            // Legacy fallback: merge in any tasks from local YAML that weren't
-            // already covered by the TaskSource adapter.
-            const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
-            const mappings = resolveMappings(root);
-            for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
-                const areaSlugs = new Set();
-                for (const task of areaData.tasks) {
-                    const taskSlugs = task.canonical_docs.map((d) => d.slug);
-                    // Only add if not already mapped by the primary source
-                    if (!descToDocRefs.has(task.description)) {
-                        descToDocRefs.set(task.description, resolveRefs(taskSlugs));
+                catch {
+                    // TaskSource may not be available in all contexts (e.g., standalone
+                    // gap analysis on cached results). Fall through to legacy fallback.
+                }
+                if (tasks.length > 0) {
+                    // Group tasks by feature area and build slug maps
+                    const byArea = new Map();
+                    for (const task of tasks) {
+                        const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
+                        const refs = resolveRefs(slugs);
+                        // Map by title (what judgments use as taskId)
+                        descToDocRefs.set(task.title, refs);
+                        // Also map by task ID for prefix-based matching
+                        descToDocRefs.set(task.id, refs);
+                        // Group slugs by feature area
+                        const area = task.area ?? "";
+                        if (!byArea.has(area))
+                            byArea.set(area, new Set());
+                        for (const s of slugs)
+                            byArea.get(area).add(s);
+                    }
+                    for (const [area, slugs] of byArea) {
+                        areaToDocRefs.set(area, resolveRefs([...slugs]));
                     }
-                    for (const s of taskSlugs)
-                        areaSlugs.add(s);
                 }
-                if (!areaToDocRefs.has(area)) {
-                    areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
+                // Legacy fallback: merge in any tasks from local YAML that weren't
+                // already covered by the TaskSource adapter.
+                const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
+                const mappings = resolveMappings(root);
+                for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
+                    const areaSlugs = new Set();
+                    for (const task of areaData.tasks) {
+                        const taskSlugs = task.canonical_docs.map((d) => d.slug);
+                        // Only add if not already mapped by the primary source
+                        if (!descToDocRefs.has(task.description)) {
+                            descToDocRefs.set(task.description, resolveRefs(taskSlugs));
+                        }
+                        for (const s of taskSlugs)
+                            areaSlugs.add(s);
+                    }
+                    if (!areaToDocRefs.has(area)) {
+                        areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
+                    }
                 }
+                documentManifest = resolveRefs([...refBySlug.keys()]);
+                enrichedScores = scoreSummary.scores.map((s) => ({
+                    ...s,
+                    documents: areaToDocRefs.get(s.feature),
+                }));
             }
-            const documentManifest = resolveRefs([...refBySlug.keys()]);
-            const enrichedScores = scoreSummary.scores.map((s) => ({
-                ...s,
-                documents: areaToDocRefs.get(s.feature),
-            }));
             // ── Low-scoring judgments ────────────────────────────────────
             const LOW_SCORE_THRESHOLD = 70;
             const MAX_STORED_JUDGMENTS = 50;
@@ -154,6 +163,8 @@ export class GapAnalysisStep {
                 .sort((a, b) => a.score - b.score)
                 .slice(0, MAX_STORED_JUDGMENTS)
                 .map((j) => {
+                if (!isLiteracyMode)
+                    return j;
                 // Judgment taskId is the description with "(gold)" or "(baseline)" suffix
                 const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
                 const canonicalDocs = descToDocRefs.get(baseDesc);
@@ -161,7 +172,7 @@ export class GapAnalysisStep {
             });
             const enrichedSummary = {
                 ...scoreSummary,
-                documentManifest,
+                ...(documentManifest !== undefined && { documentManifest }),
                 failureModes: failureModeReport,
                 lowScoringJudgments,
                 recommendations: gapReport,

package/dist/orchestration/steps/generate-configs-step.d.ts CHANGED Viewed

@@ -11,12 +11,19 @@
 import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
 export declare class GenerateConfigsStep implements PipelineStep {
     readonly name = "generate-configs";
+    /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
+    private lastLoadedTaskIds;
     check(ctx: AppContext): ValidationIssue[];
     execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
     private compileLiteracyVariants;
     private compileSingleMode;
     private loadTasks;
     private applyFilters;
+    /**
+     * Build a descriptive error message when no tasks match the current filters.
+     * Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
+     */
+    private buildNoTasksError;
     /**
      * Compile all tasks through a handler, merging results.
      * For literacy mode, ctx can carry evalMode as an extension.

package/dist/orchestration/steps/generate-configs-step.js CHANGED Viewed

@@ -20,6 +20,8 @@ import { loadSource } from "../../sources.js";
 import { configToSourceOverrides } from "../config-to-source-overrides.js";
 export class GenerateConfigsStep {
     name = "generate-configs";
+    /** Task IDs from the last loadTasks call (pre-filter), for error messages. */
+    lastLoadedTaskIds = [];
     check(ctx) {
         const issues = validateModelsYaml(ctx.config.rootDir);
         return issues.filter((i) => i.severity === "error");
@@ -54,10 +56,10 @@ export class GenerateConfigsStep {
             // Load tasks
             const tasks = await this.loadTasks(ctx, mode, state);
             if (tasks.length === 0) {
+                const error = this.buildNoTasksError(ctx, mode);
                 return {
                     durationMs: Date.now() - start,
-                    error: `No ${mode} tasks found. Create *.task.ts files in ` +
-                        `packages/eval/tasks/${mode}/`,
+                    error,
                     status: "failed",
                 };
             }
@@ -157,10 +159,22 @@ export class GenerateConfigsStep {
             label: m.label,
             config: m.config,
         }));
+        // Load rubric config for template resolution (needed by modes that use
+        // templated LLM-rubric assertions, e.g., agent-harness with agent-output
+        // and agent-tool-usage templates)
+        let rubricConfig;
+        try {
+            const { loadRubricTemplates } = await import("../../pipeline/rubric-loader.js");
+            rubricConfig = loadRubricTemplates(ctx.config.rootDir);
+        }
+        catch {
+            ctx.logger.warn("  ⚠ Could not load rubric config — templates will not resolve");
+        }
         const merged = this.compileAll(handler, tasks, {
             rootDir: ctx.config.rootDir,
             graderProvider: models.grader.id,
             models: modeModels,
+            rubricConfig,
         });
         for (const w of merged.warnings) {
             ctx.logger.warn(`  ⚠ ${w}`);
@@ -249,6 +263,10 @@ export class GenerateConfigsStep {
         return filtered;
     }
     applyFilters(ctx, tasks) {
+        // Capture pre-filter IDs for diagnostic messages
+        this.lastLoadedTaskIds = tasks
+            .map((t) => t.id)
+            .filter((id) => !!id);
         let result = tasks;
         if (ctx.config.areas?.length) {
             const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
@@ -273,6 +291,33 @@ export class GenerateConfigsStep {
         }
         return result;
     }
+    /**
+     * Build a descriptive error message when no tasks match the current filters.
+     * Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
+     */
+    buildNoTasksError(ctx, mode) {
+        const filters = [];
+        if (ctx.config.tasks?.length) {
+            filters.push(`--task ${ctx.config.tasks.join(", ")}`);
+        }
+        if (ctx.config.areas?.length) {
+            filters.push(`--area ${ctx.config.areas.join(", ")}`);
+        }
+        if (ctx.config.tags?.length) {
+            filters.push(`--tag ${ctx.config.tags.join(", ")}`);
+        }
+        if (filters.length > 0) {
+            // Collect available task IDs for the hint
+            const availableIds = this.lastLoadedTaskIds ?? [];
+            const hint = availableIds.length > 0
+                ? `\n  Available ${mode} task IDs: ${availableIds.join(", ")}`
+                : "";
+            return (`No ${mode} tasks match the current filters (${filters.join("; ")}).` +
+                hint);
+        }
+        return (`No ${mode} tasks found. Create *.task.ts files in ` +
+            `packages/eval/tasks/${mode}/`);
+    }
     // ---------------------------------------------------------------------------
     // Compilation helpers
     // ---------------------------------------------------------------------------

package/dist/pipeline/calculate-scores.js CHANGED Viewed

@@ -546,11 +546,13 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
         const featureScore = {
             ceilingScore,
             codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
+            dimensions: gold.dimensions,
             docCoverage: gold.dimensions.docCoverage ?? 0,
             docLift,
             docQualityGap: 100 - ceilingScore,
             feature,
             floorScore,
+            groupType: "feature",
             ...(modelId && { modelId }),
             negativeDocLift: docLift < 0,
             taskCompletion: gold.dimensions.taskCompletion ?? 0,
@@ -563,6 +565,69 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
     return scores.sort((a, b) => a.feature.localeCompare(b.feature));
 }
 // ---------------------------------------------------------------------------
+// Agent-harness scoring — groups by task ID, single variant
+// ---------------------------------------------------------------------------
+/**
+ * Score agent-harness evaluation results. Unlike literacy mode, agent-harness
+ * tasks don't have a with-docs/without-docs split. All results are scored
+ * as a single "actual" variant using the agent-harness profile.
+ *
+ * Groups results by task ID (extracted from the test description prefix)
+ * rather than by feature area. Each group produces a FeatureScore with
+ * groupType: "task".
+ *
+ * Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
+ * are set to 0 for backward compatibility with downstream consumers.
+ */
+function scoreAgentHarnessResults(results, profile) {
+    // Group by task ID (extracted from description: "task-id — Title")
+    const byTask = {};
+    for (const result of results) {
+        const taskId = extractTaskId(result.description);
+        if (!byTask[taskId]) {
+            byTask[taskId] = [];
+        }
+        byTask[taskId].push(result);
+    }
+    const scores = [];
+    for (const [taskId, taskResults] of Object.entries(byTask)) {
+        const scored = scoreTestGroup(taskResults, profile, taskId);
+        const totalCost = scored.totalCost;
+        // Detect feature area for backward compat (used by report grouping)
+        const feature = taskResults[0]?.vars.__featureArea ??
+            detectFeatureArea(taskResults[0]?.description ?? taskId);
+        scores.push({
+            assertionPassRate: scored.dimensions.assertionPassRate,
+            ceilingScore: 0,
+            codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
+            dimensions: scored.dimensions,
+            docCoverage: scored.dimensions.docCoverage ?? 0,
+            docLift: 0,
+            docQualityGap: 0,
+            feature,
+            floorScore: 0,
+            groupType: "task",
+            negativeDocLift: false,
+            taskCompletion: scored.dimensions.taskCompletion ?? 0,
+            testCount: taskResults.length,
+            totalCost,
+            totalScore: scored.composite,
+        });
+    }
+    return scores.sort((a, b) => a.feature.localeCompare(b.feature));
+}
+/**
+ * Extract task ID from a test description string.
+ * Descriptions follow the pattern: "task-id — Title"
+ */
+function extractTaskId(description) {
+    const dashIndex = description.indexOf(" — ");
+    if (dashIndex > 0) {
+        return description.slice(0, dashIndex).trim();
+    }
+    return description.trim() || "unknown";
+}
+// ---------------------------------------------------------------------------
 // Agentic scoring — all results are "actual" (agent retrieves docs via tools)
 // ---------------------------------------------------------------------------
 /**
@@ -684,11 +749,57 @@ export function calculateAndWriteScores(options) {
     if (source) {
         log.info(`Source: ${sourceName} (${source.baseUrl})`);
     }
-    // Load rubric config and resolve scoring profiles per variant.
+    // Load rubric config — shared across all modes
+    const rubricConfig = loadRubricTemplates(ROOT);
+    // ── Agent-harness scoring path ──────────────────────────────
+    // Agent-harness mode uses its own scoring path because:
+    // 1. No with-docs/without-docs split — all results are "actual"
+    // 2. Groups by task ID, not feature area
+    // 3. Uses the agent-harness profile (assertion-pass-rate, agent-output, tool-usage)
+    // See docs/design-docs/mode-agnostic-scoring.md
+    if (mode === "agent-harness") {
+        const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
+        log.debug("Agent-harness scoring profile", agentProfile);
+        const results = readAndNormalizeResults(baselineResultsPath);
+        const scores = scoreAgentHarnessResults(results, agentProfile);
+        log.debug("Agent-harness scores calculated", {
+            taskCount: scores.length,
+            tasks: scores.map((s) => ({
+                feature: s.feature,
+                totalScore: s.totalScore,
+                testCount: s.testCount,
+                dimensions: s.dimensions,
+            })),
+        });
+        const urlRefs = aggregateUrlReferences(baselineResultsPath);
+        const sourceVerification = buildSourceVerification(ROOT, source, {
+            allowedOrigins: options.allowedOrigins,
+            mode,
+            searchMode: options.searchMode,
+        });
+        const graderCost = extractGraderCost(baselineResultsPath);
+        const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
+        graderCost, null, // no per-model breakdown
+        null, // no source isolation
+        sourceVerification, "agent-harness", log);
+        // Persist
+        const outDir = join(ROOT, "results", "latest");
+        mkdirSync(outDir, { recursive: true });
+        writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
+        log.info("Score summary written to results/latest/score-summary.json");
+        // Extract and persist grader judgments
+        const judgments = extractGraderJudgments(baselineResultsPath);
+        if (judgments.length > 0) {
+            writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
+            log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
+        }
+        const testSummary = computeTestSummary(baselineResultsPath);
+        return { belowCritical: summary.belowCritical, testSummary };
+    }
+    // ── Literacy scoring path ───────────────────────────────────
     // Gold (with-docs) entries use the "default" profile (3 dimensions).
     // Baseline (without-docs) entries use "output-only" (2 dimensions,
     // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
-    const rubricConfig = loadRubricTemplates(ROOT);
     const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
     const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
     log.debug("Loaded scoring profiles", {

package/dist/pipeline/compare.js CHANGED Viewed

@@ -146,12 +146,6 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
     const bTotal = b?.totalScore ?? 0;
     const eTotal = e?.totalScore ?? 0;
     const delta = eTotal - bTotal;
-    const bTask = b?.taskCompletion ?? 0;
-    const eTask = e?.taskCompletion ?? 0;
-    const bCode = b?.codeCorrectness ?? 0;
-    const eCode = e?.codeCorrectness ?? 0;
-    const bDoc = b?.docCoverage ?? 0;
-    const eDoc = e?.docCoverage ?? 0;
     // Support both new field names and legacy data (old baselines/Sanity docs)
     const bRaw = b;
     const eRaw = e;
@@ -183,19 +177,7 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
         ceilingDelta: eCeiling - bCeiling,
         change: isMismatched ? "not-evaluated" : classifyChange(delta, threshold),
         delta,
-        dimensions: {
-            codeCorrectness: {
-                baseline: bCode,
-                delta: eCode - bCode,
-                experiment: eCode,
-            },
-            docCoverage: { baseline: bDoc, delta: eDoc - bDoc, experiment: eDoc },
-            taskCompletion: {
-                baseline: bTask,
-                delta: eTask - bTask,
-                experiment: eTask,
-            },
-        },
+        dimensions: buildDimensionDeltas(b, e),
         docLiftDelta: eLift - bLift,
         experiment: eTotal,
         floorDelta: eFloor - bFloor,
@@ -206,6 +188,55 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
         ...(hasCost && { costDelta: eCost - bCost }),
     };
 }
+/**
+ * Build per-dimension deltas from the generic dimensions map when available,
+ * falling back to the three legacy literacy fields for backward compatibility.
+ *
+ * This ensures non-literacy modes (agent-harness, mcp-server, etc.) get their
+ * actual dimensions (e.g., agentOutput, toolUsage) in comparison reports
+ * instead of hardcoded zeros for codeCorrectness/docCoverage/taskCompletion.
+ */
+function buildDimensionDeltas(b, e) {
+    const bDims = b?.dimensions;
+    const eDims = e?.dimensions;
+    // When the generic dimensions map is populated, use it — this covers
+    // agent-harness (agentOutput, toolUsage), literacy (taskCompletion,
+    // codeCorrectness, docCoverage), and any future mode dimensions.
+    if (bDims || eDims) {
+        const allKeys = new Set([
+            ...Object.keys(bDims ?? {}),
+            ...Object.keys(eDims ?? {}),
+        ]);
+        const result = {};
+        for (const key of allKeys) {
+            const bVal = bDims?.[key] ?? 0;
+            const eVal = eDims?.[key] ?? 0;
+            result[key] = { baseline: bVal, delta: eVal - bVal, experiment: eVal };
+        }
+        return result;
+    }
+    // Legacy fallback: older reports may lack the dimensions map entirely.
+    // Read from the three named FeatureScore fields instead.
+    const bTask = b?.taskCompletion ?? 0;
+    const eTask = e?.taskCompletion ?? 0;
+    const bCode = b?.codeCorrectness ?? 0;
+    const eCode = e?.codeCorrectness ?? 0;
+    const bDoc = b?.docCoverage ?? 0;
+    const eDoc = e?.docCoverage ?? 0;
+    return {
+        codeCorrectness: {
+            baseline: bCode,
+            delta: eCode - bCode,
+            experiment: eCode,
+        },
+        docCoverage: { baseline: bDoc, delta: eDoc - bDoc, experiment: eDoc },
+        taskCompletion: {
+            baseline: bTask,
+            delta: eTask - bTask,
+            experiment: eTask,
+        },
+    };
+}
 // ---------------------------------------------------------------------------
 // Main compare function
 // ---------------------------------------------------------------------------