@sanity/ailf 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/config/rubrics.ts +3 -3
  2. package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
  3. package/dist/commands/calculate-scores.js +7 -2
  4. package/dist/commands/capture-list.d.ts +1 -1
  5. package/dist/commands/capture-list.js +6 -3
  6. package/dist/commands/compare.js +11 -7
  7. package/dist/commands/explain-handler.js +22 -24
  8. package/dist/commands/fetch-docs.js +4 -2
  9. package/dist/commands/generate-configs.js +6 -2
  10. package/dist/commands/pipeline-action.js +8 -24
  11. package/dist/commands/pipeline.js +1 -1
  12. package/dist/commands/pr-comment.js +6 -2
  13. package/dist/commands/publish.d.ts +1 -0
  14. package/dist/commands/publish.js +12 -8
  15. package/dist/commands/remote-pipeline.js +1 -1
  16. package/dist/commands/remote-results.d.ts +8 -8
  17. package/dist/commands/remote-results.js +7 -7
  18. package/dist/commands/shared/options.d.ts +8 -0
  19. package/dist/commands/shared/options.js +10 -0
  20. package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
  21. package/dist/commands/shared/resolve-output-dir.js +36 -0
  22. package/dist/composition-root.js +1 -1
  23. package/dist/config/rubrics.ts +3 -3
  24. package/dist/orchestration/build-app-context.js +1 -1
  25. package/dist/orchestration/steps/gap-analysis-step.js +86 -75
  26. package/dist/orchestration/steps/generate-configs-step.js +12 -0
  27. package/dist/pipeline/calculate-scores.js +113 -2
  28. package/dist/pipeline/compare.js +50 -19
  29. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +64 -0
  30. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +6 -0
  31. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +14 -0
  32. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
  33. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
  34. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
  35. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
  36. package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
  37. package/dist/pipeline/compiler/rubric-resolution.js +52 -0
  38. package/dist/pipeline/compiler/scoring-bridge.js +59 -7
  39. package/dist/pipeline/provenance.js +7 -1
  40. package/dist/pipeline/validate.d.ts +5 -4
  41. package/dist/pipeline/validate.js +34 -113
  42. package/package.json +1 -1
@@ -4,9 +4,9 @@
4
4
  * Produces the same file layout as local mode so downstream tools
5
5
  * (workflow PR comments, score comparison, baseline save) work unchanged:
6
6
  *
7
- * results/latest/score-summary.json — scores by area + overall
8
- * results/latest/report.md — rendered markdown report
9
- * results/latest/job-metadata.json — job ID, timing, API URL
7
+ * <outputDir>/score-summary.json — scores by area + overall
8
+ * <outputDir>/report.md — rendered markdown report
9
+ * <outputDir>/job-metadata.json — job ID, timing, API URL
10
10
  *
11
11
  * @see packages/eval/src/commands/remote-pipeline.ts — caller
12
12
  */
@@ -19,13 +19,13 @@ import { resolve } from "path";
19
19
  * Fetch report artifacts from the API and write them to disk.
20
20
  *
21
21
  * Writes:
22
- * - `results/latest/score-summary.json` — score data from job response
23
- * - `results/latest/report.md` — full markdown report (if reportId present)
24
- * - `results/latest/job-metadata.json` — job tracking info
22
+ * - `<outputDir>/score-summary.json` — score data from job response
23
+ * - `<outputDir>/report.md` — full markdown report (if reportId present)
24
+ * - `<outputDir>/job-metadata.json` — job tracking info
25
25
  * - `--output` path — markdown report (if specified)
26
26
  */
27
27
  export async function writeRemoteResults(client, job, options) {
28
- const resultsDir = resolve(options.rootDir, "results", "latest");
28
+ const resultsDir = options.outputDir;
29
29
  mkdirSync(resultsDir, { recursive: true });
30
30
  // 1. Write score summary
31
31
  const scoreSummary = buildScoreSummary(job);
@@ -18,6 +18,14 @@ export declare function addDebugOptions(cmd: Command): Command;
18
18
  * Add output options: --output, --format
19
19
  */
20
20
  export declare function addOutputOptions(cmd: Command): Command;
21
+ /**
22
+ * Add --output-dir option for commands that write pipeline artifacts.
23
+ *
24
+ * Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
25
+ * the value. When omitted, `resolveOutputDir()` defaults to
26
+ * `$CWD/.ailf/results/latest/`.
27
+ */
28
+ export declare function addOutputDirOption(cmd: Command): Command;
21
29
  /**
22
30
  * Add Sanity source options: --sanity-dataset, --sanity-project, etc.
23
31
  */
@@ -36,6 +36,16 @@ export function addOutputOptions(cmd) {
36
36
  .option("-o, --output <path>", "Write output to a specific file path")
37
37
  .option("-f, --format <fmt>", "Output format (e.g., table, json, md)");
38
38
  }
39
/**
 * Register the `--output-dir <path>` flag on a command that writes pipeline
 * artifacts.
 *
 * The raw flag value is meant to be handed to `resolveOutputDir()` from
 * `./resolve-output-dir.js`; when the flag is omitted, that helper falls back
 * to `$CWD/.ailf/results/latest/`.
 *
 * @param cmd - Command to extend with the option.
 * @returns The value returned by `cmd.option()`.
 */
export function addOutputDirOption(cmd) {
    const flags = "--output-dir <path>";
    const description = "Base directory for output artifacts (default: .ailf/results/latest/)";
    return cmd.option(flags, description);
}
39
49
  /**
40
50
  * Add Sanity source options: --sanity-dataset, --sanity-project, etc.
41
51
  */
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Shared output directory resolution for all CLI commands.
3
+ *
4
+ * Resolution order (2-tier):
5
+ * 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
6
+ * 2. Default — `$callerCwd/.ailf/results/latest/`
7
+ *
8
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
9
+ * working directory, not the eval package root.
10
+ *
11
+ * @see docs/design-docs/output-dir-routing.md
12
+ * @see docs/work-items/W0030.json
13
+ */
14
+ /**
15
+ * Get the caller's working directory.
16
+ *
17
+ * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
18
+ * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
19
+ */
20
+ export declare function getCallerCwd(): string;
21
+ /**
22
+ * Resolve the output directory for pipeline artifacts.
23
+ *
24
+ * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
25
+ * @returns Absolute path to the output directory
26
+ */
27
+ export declare function resolveOutputDir(outputDir?: string): string;
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Shared output directory resolution for all CLI commands.
3
+ *
4
+ * Resolution order (2-tier):
5
+ * 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
6
+ * 2. Default — `$callerCwd/.ailf/results/latest/`
7
+ *
8
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
9
+ * working directory, not the eval package root.
10
+ *
11
+ * @see docs/design-docs/output-dir-routing.md
12
+ * @see docs/work-items/W0030.json
13
+ */
14
+ import { resolve } from "path";
15
/**
 * Get the caller's working directory.
 *
 * Reads `AILF_CALLER_CWD` from the environment and falls back to
 * `process.cwd()`. An empty `AILF_CALLER_CWD` is treated the same as an unset
 * one, so this function never returns an empty string in place of a directory.
 *
 * @returns The value of `AILF_CALLER_CWD` when non-empty, otherwise `process.cwd()`.
 */
export function getCallerCwd() {
    const fromEnv = process.env.AILF_CALLER_CWD;
    // `??` would let an empty string through; a truthiness check rejects it.
    return fromEnv ? fromEnv : process.cwd();
}
24
/**
 * Resolve the directory that pipeline artifacts are written to.
 *
 * Both the explicit value and the default are resolved against the caller's
 * working directory as reported by `getCallerCwd()`.
 *
 * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative).
 *   A missing or empty value selects the default `.ailf/results/latest`.
 * @returns Absolute path to the output directory
 */
export function resolveOutputDir(outputDir) {
    const segments = outputDir ? [outputDir] : [".ailf", "results", "latest"];
    return resolve(getCallerCwd(), ...segments);
}
@@ -60,7 +60,7 @@ export function createAppContext(config) {
60
60
  // Artifact collector — no-op by default, filesystem when --capture is set
61
61
  const collector = config.captureEnabled
62
62
  ? new FilesystemArtifactCollector({
63
- captureDir: config.captureDir ?? join(config.rootDir, "results", "captures"),
63
+ captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
64
64
  mode: config.mode,
65
65
  compress: config.captureCompress ?? true,
66
66
  extras: config.captureExtras ?? true,
@@ -201,9 +201,9 @@ export default defineRubrics({
201
201
  currency: 0.2,
202
202
  },
203
203
  "agent-harness": {
204
- "agent-output": 0.45,
205
- "tool-usage": 0.4,
206
- "process-quality": 0.15,
204
+ "assertion-pass-rate": 0.35,
205
+ "agent-output": 0.35,
206
+ "tool-usage": 0.3,
207
207
  },
208
208
  },
209
209
 
@@ -79,7 +79,7 @@ export function mapToResolvedConfig(opts, rootDir) {
79
79
  apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
80
80
  apiKey: opts.apiKey,
81
81
  captureEnabled: opts.captureEnabled ?? false,
82
- captureDir: opts.captureDir ?? join(rootDir, "results", "captures"),
82
+ captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
83
83
  captureCompress: opts.captureCompress ?? true,
84
84
  captureExtras: opts.captureExtras ?? true,
85
85
  };
@@ -61,87 +61,96 @@ export class GapAnalysisStep {
61
61
  mkdirSync(outDir, { recursive: true });
62
62
  writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
63
63
  writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
64
- const manifestPath = resolve(root, "contexts", "document-manifest.json");
65
- const manifestEntries = existsSync(manifestPath)
66
- ? JSON.parse(readFileSync(manifestPath, "utf-8"))
67
- : [];
68
- const refBySlug = new Map();
69
- for (const entry of manifestEntries) {
70
- refBySlug.set(entry.slug, entry);
71
- }
72
- const resolveRefs = (slugs) => slugs
73
- .map((slug) => {
74
- const m = refBySlug.get(slug);
75
- return m
76
- ? {
77
- documentId: m._id,
78
- revision: m._rev,
79
- slug: m.slug,
80
- title: m.title,
81
- }
82
- : { documentId: "", slug, title: slug };
83
- })
84
- .filter((r) => r.documentId !== "");
85
- // ── Build description→docs mapping from TaskSource ─────────
86
- // Primary source: use the TaskSource adapter from AppContext.
87
- // This works with Content Lake, repo-based, and YAML tasks.
88
- // Judgments use task description as their taskId, so we build
89
- // maps keyed by both description and task ID for robust matching.
64
+ // ── Document enrichment (literacy mode only) ──────────────
65
+ // Non-literacy modes don't use canonical docs. Skip manifest
66
+ // loading, doc-reference enrichment, and canonical doc mapping
67
+ // entirely — those fields are only meaningful for literacy evals.
68
+ const isLiteracyMode = ctx.config.mode === "literacy";
69
+ let documentManifest;
70
+ let enrichedScores = scoreSummary.scores;
90
71
  const descToDocRefs = new Map();
91
- const areaToDocRefs = new Map();
92
- let tasks = [];
93
- try {
94
- tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
95
- }
96
- catch {
97
- // TaskSource may not be available in all contexts (e.g., standalone
98
- // gap analysis on cached results). Fall through to legacy fallback.
99
- }
100
- if (tasks.length > 0) {
101
- // Group tasks by feature area and build slug maps
102
- const byArea = new Map();
103
- for (const task of tasks) {
104
- const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
105
- const refs = resolveRefs(slugs);
106
- // Map by title (what judgments use as taskId)
107
- descToDocRefs.set(task.title, refs);
108
- // Also map by task ID for prefix-based matching
109
- descToDocRefs.set(task.id, refs);
110
- // Group slugs by feature area
111
- const area = task.area ?? "";
112
- if (!byArea.has(area))
113
- byArea.set(area, new Set());
114
- for (const s of slugs)
115
- byArea.get(area).add(s);
72
+ if (isLiteracyMode) {
73
+ const manifestPath = resolve(root, "contexts", "document-manifest.json");
74
+ const manifestEntries = existsSync(manifestPath)
75
+ ? JSON.parse(readFileSync(manifestPath, "utf-8"))
76
+ : [];
77
+ const refBySlug = new Map();
78
+ for (const entry of manifestEntries) {
79
+ refBySlug.set(entry.slug, entry);
116
80
  }
117
- for (const [area, slugs] of byArea) {
118
- areaToDocRefs.set(area, resolveRefs([...slugs]));
81
+ const resolveRefs = (slugs) => slugs
82
+ .map((slug) => {
83
+ const m = refBySlug.get(slug);
84
+ return m
85
+ ? {
86
+ documentId: m._id,
87
+ revision: m._rev,
88
+ slug: m.slug,
89
+ title: m.title,
90
+ }
91
+ : { documentId: "", slug, title: slug };
92
+ })
93
+ .filter((r) => r.documentId !== "");
94
+ // ── Build description→docs mapping from TaskSource ─────────
95
+ // Primary source: use the TaskSource adapter from AppContext.
96
+ // This works with Content Lake, repo-based, and YAML tasks.
97
+ // Judgments use task description as their taskId, so we build
98
+ // maps keyed by both description and task ID for robust matching.
99
+ const areaToDocRefs = new Map();
100
+ let tasks = [];
101
+ try {
102
+ tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
119
103
  }
120
- }
121
- // Legacy fallback: merge in any tasks from local YAML that weren't
122
- // already covered by the TaskSource adapter.
123
- const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
124
- const mappings = resolveMappings(root);
125
- for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
126
- const areaSlugs = new Set();
127
- for (const task of areaData.tasks) {
128
- const taskSlugs = task.canonical_docs.map((d) => d.slug);
129
- // Only add if not already mapped by the primary source
130
- if (!descToDocRefs.has(task.description)) {
131
- descToDocRefs.set(task.description, resolveRefs(taskSlugs));
104
+ catch {
105
+ // TaskSource may not be available in all contexts (e.g., standalone
106
+ // gap analysis on cached results). Fall through to legacy fallback.
107
+ }
108
+ if (tasks.length > 0) {
109
+ // Group tasks by feature area and build slug maps
110
+ const byArea = new Map();
111
+ for (const task of tasks) {
112
+ const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
113
+ const refs = resolveRefs(slugs);
114
+ // Map by title (what judgments use as taskId)
115
+ descToDocRefs.set(task.title, refs);
116
+ // Also map by task ID for prefix-based matching
117
+ descToDocRefs.set(task.id, refs);
118
+ // Group slugs by feature area
119
+ const area = task.area ?? "";
120
+ if (!byArea.has(area))
121
+ byArea.set(area, new Set());
122
+ for (const s of slugs)
123
+ byArea.get(area).add(s);
124
+ }
125
+ for (const [area, slugs] of byArea) {
126
+ areaToDocRefs.set(area, resolveRefs([...slugs]));
132
127
  }
133
- for (const s of taskSlugs)
134
- areaSlugs.add(s);
135
128
  }
136
- if (!areaToDocRefs.has(area)) {
137
- areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
129
+ // Legacy fallback: merge in any tasks from local YAML that weren't
130
+ // already covered by the TaskSource adapter.
131
+ const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
132
+ const mappings = resolveMappings(root);
133
+ for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
134
+ const areaSlugs = new Set();
135
+ for (const task of areaData.tasks) {
136
+ const taskSlugs = task.canonical_docs.map((d) => d.slug);
137
+ // Only add if not already mapped by the primary source
138
+ if (!descToDocRefs.has(task.description)) {
139
+ descToDocRefs.set(task.description, resolveRefs(taskSlugs));
140
+ }
141
+ for (const s of taskSlugs)
142
+ areaSlugs.add(s);
143
+ }
144
+ if (!areaToDocRefs.has(area)) {
145
+ areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
146
+ }
138
147
  }
148
+ documentManifest = resolveRefs([...refBySlug.keys()]);
149
+ enrichedScores = scoreSummary.scores.map((s) => ({
150
+ ...s,
151
+ documents: areaToDocRefs.get(s.feature),
152
+ }));
139
153
  }
140
- const documentManifest = resolveRefs([...refBySlug.keys()]);
141
- const enrichedScores = scoreSummary.scores.map((s) => ({
142
- ...s,
143
- documents: areaToDocRefs.get(s.feature),
144
- }));
145
154
  // ── Low-scoring judgments ────────────────────────────────────
146
155
  const LOW_SCORE_THRESHOLD = 70;
147
156
  const MAX_STORED_JUDGMENTS = 50;
@@ -154,6 +163,8 @@ export class GapAnalysisStep {
154
163
  .sort((a, b) => a.score - b.score)
155
164
  .slice(0, MAX_STORED_JUDGMENTS)
156
165
  .map((j) => {
166
+ if (!isLiteracyMode)
167
+ return j;
157
168
  // Judgment taskId is the description with "(gold)" or "(baseline)" suffix
158
169
  const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
159
170
  const canonicalDocs = descToDocRefs.get(baseDesc);
@@ -161,7 +172,7 @@ export class GapAnalysisStep {
161
172
  });
162
173
  const enrichedSummary = {
163
174
  ...scoreSummary,
164
- documentManifest,
175
+ ...(documentManifest !== undefined && { documentManifest }),
165
176
  failureModes: failureModeReport,
166
177
  lowScoringJudgments,
167
178
  recommendations: gapReport,
@@ -159,10 +159,22 @@ export class GenerateConfigsStep {
159
159
  label: m.label,
160
160
  config: m.config,
161
161
  }));
162
+ // Load rubric config for template resolution (needed by modes that use
163
+ // templated LLM-rubric assertions, e.g., agent-harness with agent-output
164
+ // and agent-tool-usage templates)
165
+ let rubricConfig;
166
+ try {
167
+ const { loadRubricTemplates } = await import("../../pipeline/rubric-loader.js");
168
+ rubricConfig = loadRubricTemplates(ctx.config.rootDir);
169
+ }
170
+ catch {
171
+ ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
172
+ }
162
173
  const merged = this.compileAll(handler, tasks, {
163
174
  rootDir: ctx.config.rootDir,
164
175
  graderProvider: models.grader.id,
165
176
  models: modeModels,
177
+ rubricConfig,
166
178
  });
167
179
  for (const w of merged.warnings) {
168
180
  ctx.logger.warn(` ⚠ ${w}`);
@@ -546,11 +546,13 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
546
546
  const featureScore = {
547
547
  ceilingScore,
548
548
  codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
549
+ dimensions: gold.dimensions,
549
550
  docCoverage: gold.dimensions.docCoverage ?? 0,
550
551
  docLift,
551
552
  docQualityGap: 100 - ceilingScore,
552
553
  feature,
553
554
  floorScore,
555
+ groupType: "feature",
554
556
  ...(modelId && { modelId }),
555
557
  negativeDocLift: docLift < 0,
556
558
  taskCompletion: gold.dimensions.taskCompletion ?? 0,
@@ -563,6 +565,69 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
563
565
  return scores.sort((a, b) => a.feature.localeCompare(b.feature));
564
566
  }
565
567
  // ---------------------------------------------------------------------------
568
+ // Agent-harness scoring — groups by task ID, single variant
569
+ // ---------------------------------------------------------------------------
570
/**
 * Score agent-harness evaluation results.
 *
 * Every result is scored with the single profile passed in; there is no
 * with-docs/without-docs split as in the literacy path. Results are grouped
 * by the task ID that `extractTaskId()` derives from each test description,
 * and each group yields one score entry with `groupType: "task"`.
 *
 * Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
 * are set to 0, and negativeDocLift to false, for backward compatibility with
 * downstream consumers.
 *
 * @param results - Normalized test results; each carries a `description` and
 *   optionally `vars.__featureArea`.
 * @param profile - Scoring profile forwarded unchanged to `scoreTestGroup()`.
 * @returns One score entry per task ID, sorted by `feature`.
 */
function scoreAgentHarnessResults(results, profile) {
    // A Map (not a plain object) keeps task IDs such as "constructor" or
    // "toString" from colliding with inherited Object.prototype members.
    const byTask = new Map();
    for (const result of results) {
        const taskId = extractTaskId(result.description);
        const group = byTask.get(taskId);
        if (group) {
            group.push(result);
        }
        else {
            byTask.set(taskId, [result]);
        }
    }
    const scores = [];
    for (const [taskId, taskResults] of byTask) {
        const scored = scoreTestGroup(taskResults, profile, taskId);
        const first = taskResults[0];
        // Detect feature area for backward compat (used by report grouping).
        // `vars` is optional-chained so a result without vars falls through to
        // detection instead of throwing.
        const feature = first?.vars?.__featureArea ??
            detectFeatureArea(first?.description ?? taskId);
        scores.push({
            assertionPassRate: scored.dimensions.assertionPassRate,
            ceilingScore: 0,
            codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
            dimensions: scored.dimensions,
            docCoverage: scored.dimensions.docCoverage ?? 0,
            docLift: 0,
            docQualityGap: 0,
            feature,
            floorScore: 0,
            groupType: "task",
            negativeDocLift: false,
            taskCompletion: scored.dimensions.taskCompletion ?? 0,
            testCount: taskResults.length,
            totalCost: scored.totalCost,
            totalScore: scored.composite,
        });
    }
    return scores.sort((a, b) => a.feature.localeCompare(b.feature));
}
619
/**
 * Extract the task ID from a test description string.
 *
 * Descriptions follow the pattern "task-id — Title": the text before the
 * first " — " separator is the task ID. When there is no separator, or the
 * text before it is blank, the whole trimmed description is used instead.
 *
 * @param description - Test description; a missing or non-string value is tolerated.
 * @returns The task ID, or "unknown" when no non-empty ID can be derived.
 */
function extractTaskId(description) {
    // Callers treat description as optional, so guard before using string methods.
    if (typeof description !== "string") {
        return "unknown";
    }
    const separatorAt = description.indexOf(" — ");
    if (separatorAt > 0) {
        const prefix = description.slice(0, separatorAt).trim();
        // A whitespace-only prefix would otherwise become an empty task ID.
        if (prefix) {
            return prefix;
        }
    }
    return description.trim() || "unknown";
}
630
+ // ---------------------------------------------------------------------------
566
631
  // Agentic scoring — all results are "actual" (agent retrieves docs via tools)
567
632
  // ---------------------------------------------------------------------------
568
633
  /**
@@ -684,11 +749,57 @@ export function calculateAndWriteScores(options) {
684
749
  if (source) {
685
750
  log.info(`Source: ${sourceName} (${source.baseUrl})`);
686
751
  }
687
- // Load rubric config and resolve scoring profiles per variant.
752
+ // Load rubric config shared across all modes
753
+ const rubricConfig = loadRubricTemplates(ROOT);
754
+ // ── Agent-harness scoring path ──────────────────────────────
755
+ // Agent-harness mode uses its own scoring path because:
756
+ // 1. No with-docs/without-docs split — all results are "actual"
757
+ // 2. Groups by task ID, not feature area
758
+ // 3. Uses the agent-harness profile (assertion-pass-rate, agent-output, tool-usage)
759
+ // See docs/design-docs/mode-agnostic-scoring.md
760
+ if (mode === "agent-harness") {
761
+ const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
762
+ log.debug("Agent-harness scoring profile", agentProfile);
763
+ const results = readAndNormalizeResults(baselineResultsPath);
764
+ const scores = scoreAgentHarnessResults(results, agentProfile);
765
+ log.debug("Agent-harness scores calculated", {
766
+ taskCount: scores.length,
767
+ tasks: scores.map((s) => ({
768
+ feature: s.feature,
769
+ totalScore: s.totalScore,
770
+ testCount: s.testCount,
771
+ dimensions: s.dimensions,
772
+ })),
773
+ });
774
+ const urlRefs = aggregateUrlReferences(baselineResultsPath);
775
+ const sourceVerification = buildSourceVerification(ROOT, source, {
776
+ allowedOrigins: options.allowedOrigins,
777
+ mode,
778
+ searchMode: options.searchMode,
779
+ });
780
+ const graderCost = extractGraderCost(baselineResultsPath);
781
+ const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
782
+ graderCost, null, // no per-model breakdown
783
+ null, // no source isolation
784
+ sourceVerification, "agent-harness", log);
785
+ // Persist
786
+ const outDir = join(ROOT, "results", "latest");
787
+ mkdirSync(outDir, { recursive: true });
788
+ writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
789
+ log.info("Score summary written to results/latest/score-summary.json");
790
+ // Extract and persist grader judgments
791
+ const judgments = extractGraderJudgments(baselineResultsPath);
792
+ if (judgments.length > 0) {
793
+ writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
794
+ log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
795
+ }
796
+ const testSummary = computeTestSummary(baselineResultsPath);
797
+ return { belowCritical: summary.belowCritical, testSummary };
798
+ }
799
+ // ── Literacy scoring path ───────────────────────────────────
688
800
  // Gold (with-docs) entries use the "default" profile (3 dimensions).
689
801
  // Baseline (without-docs) entries use "output-only" (2 dimensions,
690
802
  // doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
691
- const rubricConfig = loadRubricTemplates(ROOT);
692
803
  const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
693
804
  const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
694
805
  log.debug("Loaded scoring profiles", {
@@ -146,12 +146,6 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
146
146
  const bTotal = b?.totalScore ?? 0;
147
147
  const eTotal = e?.totalScore ?? 0;
148
148
  const delta = eTotal - bTotal;
149
- const bTask = b?.taskCompletion ?? 0;
150
- const eTask = e?.taskCompletion ?? 0;
151
- const bCode = b?.codeCorrectness ?? 0;
152
- const eCode = e?.codeCorrectness ?? 0;
153
- const bDoc = b?.docCoverage ?? 0;
154
- const eDoc = e?.docCoverage ?? 0;
155
149
  // Support both new field names and legacy data (old baselines/Sanity docs)
156
150
  const bRaw = b;
157
151
  const eRaw = e;
@@ -183,19 +177,7 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
183
177
  ceilingDelta: eCeiling - bCeiling,
184
178
  change: isMismatched ? "not-evaluated" : classifyChange(delta, threshold),
185
179
  delta,
186
- dimensions: {
187
- codeCorrectness: {
188
- baseline: bCode,
189
- delta: eCode - bCode,
190
- experiment: eCode,
191
- },
192
- docCoverage: { baseline: bDoc, delta: eDoc - bDoc, experiment: eDoc },
193
- taskCompletion: {
194
- baseline: bTask,
195
- delta: eTask - bTask,
196
- experiment: eTask,
197
- },
198
- },
180
+ dimensions: buildDimensionDeltas(b, e),
199
181
  docLiftDelta: eLift - bLift,
200
182
  experiment: eTotal,
201
183
  floorDelta: eFloor - bFloor,
@@ -206,6 +188,55 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
206
188
  ...(hasCost && { costDelta: eCost - bCost }),
207
189
  };
208
190
  }
191
/**
 * Build per-dimension deltas between a baseline and an experiment score.
 *
 * When at least one side carries the generic `dimensions` map, the result has
 * one entry per key found in either map. A side that has no `dimensions` map
 * at all (e.g. a report written before the map existed) contributes its
 * legacy named fields (codeCorrectness, docCoverage, taskCompletion) for those
 * keys rather than 0, so comparing an old report with a new one does not
 * produce artificial deltas. Any other missing value counts as 0.
 *
 * When neither side has a `dimensions` map, the result contains exactly the
 * three legacy dimensions read from the named fields.
 *
 * @param b - Baseline score entry (may be undefined).
 * @param e - Experiment score entry (may be undefined).
 * @returns Map of dimension name to `{ baseline, delta, experiment }`.
 */
function buildDimensionDeltas(b, e) {
    const legacyFields = ["codeCorrectness", "docCoverage", "taskCompletion"];
    const bDims = b?.dimensions;
    const eDims = e?.dimensions;
    // Read one dimension from one side: prefer its dimensions map; fall back to
    // the legacy named field only when that side has no map at all.
    const readDimension = (score, dims, key) => {
        if (dims) {
            return dims[key] ?? 0;
        }
        return legacyFields.includes(key) ? (score?.[key] ?? 0) : 0;
    };
    const keys = bDims || eDims
        ? new Set([...Object.keys(bDims ?? {}), ...Object.keys(eDims ?? {})])
        : legacyFields;
    const result = {};
    for (const key of keys) {
        const baseline = readDimension(b, bDims, key);
        const experiment = readDimension(e, eDims, key);
        result[key] = { baseline, delta: experiment - baseline, experiment };
    }
    return result;
}
209
240
  // ---------------------------------------------------------------------------
210
241
  // Main compare function
211
242
  // ---------------------------------------------------------------------------