@sanity/ailf 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +3 -3
- package/dist/_vendor/ailf-core/examples/index.d.ts +50 -1
- package/dist/_vendor/ailf-core/examples/index.js +66 -1
- package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
- package/dist/agent-harness/assertions-runtime.d.ts +49 -0
- package/dist/agent-harness/assertions-runtime.js +138 -0
- package/dist/agent-harness/provider.d.ts +58 -0
- package/dist/agent-harness/provider.js +104 -0
- package/dist/commands/calculate-scores.js +7 -2
- package/dist/commands/capture-list.d.ts +1 -1
- package/dist/commands/capture-list.js +6 -3
- package/dist/commands/compare.js +11 -7
- package/dist/commands/explain-handler.js +22 -24
- package/dist/commands/fetch-docs.js +4 -2
- package/dist/commands/generate-configs.js +6 -2
- package/dist/commands/init.js +3 -0
- package/dist/commands/pipeline-action.js +8 -24
- package/dist/commands/pipeline.js +1 -1
- package/dist/commands/pr-comment.js +6 -2
- package/dist/commands/publish.d.ts +1 -0
- package/dist/commands/publish.js +12 -8
- package/dist/commands/remote-pipeline.js +1 -1
- package/dist/commands/remote-results.d.ts +8 -8
- package/dist/commands/remote-results.js +7 -7
- package/dist/commands/shared/options.d.ts +8 -0
- package/dist/commands/shared/options.js +10 -0
- package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
- package/dist/commands/shared/resolve-output-dir.js +36 -0
- package/dist/composition-root.js +1 -1
- package/dist/config/rubrics.ts +3 -3
- package/dist/orchestration/build-app-context.js +1 -1
- package/dist/orchestration/steps/gap-analysis-step.js +86 -75
- package/dist/orchestration/steps/generate-configs-step.d.ts +7 -0
- package/dist/orchestration/steps/generate-configs-step.js +47 -2
- package/dist/pipeline/calculate-scores.js +113 -2
- package/dist/pipeline/compare.js +50 -19
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +103 -25
- package/dist/pipeline/compiler/compiler-to-yaml.js +78 -7
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +15 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +42 -85
- package/dist/pipeline/compiler/mode-handlers/agent-harness/compiler.js +22 -15
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.d.ts +8 -1
- package/dist/pipeline/compiler/mode-handlers/agent-harness/sandbox.js +42 -12
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
- package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
- package/dist/pipeline/compiler/rubric-resolution.js +52 -0
- package/dist/pipeline/compiler/scoring-bridge.js +59 -7
- package/dist/pipeline/provenance.js +7 -1
- package/dist/pipeline/validate.d.ts +5 -4
- package/dist/pipeline/validate.js +34 -113
- package/package.json +2 -1
|
@@ -36,6 +36,16 @@ export function addOutputOptions(cmd) {
|
|
|
36
36
|
.option("-o, --output <path>", "Write output to a specific file path")
|
|
37
37
|
.option("-f, --format <fmt>", "Output format (e.g., table, json, md)");
|
|
38
38
|
}
|
|
39
|
+
/**
|
|
40
|
+
* Add --output-dir option for commands that write pipeline artifacts.
|
|
41
|
+
*
|
|
42
|
+
* Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
|
|
43
|
+
* the value. When omitted, `resolveOutputDir()` defaults to
|
|
44
|
+
* `$CWD/.ailf/results/latest/`.
|
|
45
|
+
*/
|
|
46
|
+
export function addOutputDirOption(cmd) {
|
|
47
|
+
return cmd.option("--output-dir <path>", "Base directory for output artifacts (default: .ailf/results/latest/)");
|
|
48
|
+
}
|
|
39
49
|
/**
|
|
40
50
|
* Add Sanity source options: --sanity-dataset, --sanity-project, etc.
|
|
41
51
|
*/
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared output directory resolution for all CLI commands.
|
|
3
|
+
*
|
|
4
|
+
* Resolution order (2-tier):
|
|
5
|
+
* 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
|
|
6
|
+
* 2. Default — `$callerCwd/.ailf/results/latest/`
|
|
7
|
+
*
|
|
8
|
+
* callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
|
|
9
|
+
* working directory, not the eval package root.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/output-dir-routing.md
|
|
12
|
+
* @see docs/work-items/W0030.json
|
|
13
|
+
*/
|
|
14
|
+
/**
|
|
15
|
+
* Get the caller's working directory.
|
|
16
|
+
*
|
|
17
|
+
* When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
|
|
18
|
+
* AILF_CALLER_CWD to preserve the real CWD before Node changes it.
|
|
19
|
+
*/
|
|
20
|
+
export declare function getCallerCwd(): string;
|
|
21
|
+
/**
|
|
22
|
+
* Resolve the output directory for pipeline artifacts.
|
|
23
|
+
*
|
|
24
|
+
* @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
|
|
25
|
+
* @returns Absolute path to the output directory
|
|
26
|
+
*/
|
|
27
|
+
export declare function resolveOutputDir(outputDir?: string): string;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared output directory resolution for all CLI commands.
|
|
3
|
+
*
|
|
4
|
+
* Resolution order (2-tier):
|
|
5
|
+
* 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
|
|
6
|
+
* 2. Default — `$callerCwd/.ailf/results/latest/`
|
|
7
|
+
*
|
|
8
|
+
* callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
|
|
9
|
+
* working directory, not the eval package root.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/output-dir-routing.md
|
|
12
|
+
* @see docs/work-items/W0030.json
|
|
13
|
+
*/
|
|
14
|
+
import { resolve } from "path";
|
|
15
|
+
/**
 * Get the caller's working directory.
 *
 * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
 * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
 *
 * An unset, empty, or whitespace-only AILF_CALLER_CWD is ignored and
 * `process.cwd()` is returned instead, so callers never receive a blank path.
 *
 * @returns The AILF_CALLER_CWD override when it is non-blank, otherwise `process.cwd()`
 */
export function getCallerCwd() {
    const override = process.env.AILF_CALLER_CWD;
    // `??` alone would accept an exported-but-empty variable (`AILF_CALLER_CWD=`)
    // and hand back "", which is not a usable directory.
    if (override !== undefined && override.trim() !== "") {
        return override;
    }
    return process.cwd();
}
|
|
24
|
+
/**
 * Resolve the directory that pipeline artifacts should be written to.
 *
 * The base is always the caller's working directory (see `getCallerCwd()`).
 * A truthy `outputDir` is resolved against that base; any falsy value
 * (omitted, `undefined`, empty string) selects `.ailf/results/latest`
 * underneath it.
 *
 * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
 * @returns Absolute path to the output directory
 */
export function resolveOutputDir(outputDir) {
    const base = getCallerCwd();
    const segments = outputDir ? [outputDir] : [".ailf", "results", "latest"];
    return resolve(base, ...segments);
}
|
package/dist/composition-root.js
CHANGED
|
@@ -60,7 +60,7 @@ export function createAppContext(config) {
|
|
|
60
60
|
// Artifact collector — no-op by default, filesystem when --capture is set
|
|
61
61
|
const collector = config.captureEnabled
|
|
62
62
|
? new FilesystemArtifactCollector({
|
|
63
|
-
captureDir: config.captureDir ?? join(config.
|
|
63
|
+
captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
|
|
64
64
|
mode: config.mode,
|
|
65
65
|
compress: config.captureCompress ?? true,
|
|
66
66
|
extras: config.captureExtras ?? true,
|
package/dist/config/rubrics.ts
CHANGED
|
@@ -201,9 +201,9 @@ export default defineRubrics({
|
|
|
201
201
|
currency: 0.2,
|
|
202
202
|
},
|
|
203
203
|
"agent-harness": {
|
|
204
|
-
"
|
|
205
|
-
"
|
|
206
|
-
"
|
|
204
|
+
"assertion-pass-rate": 0.35,
|
|
205
|
+
"agent-output": 0.35,
|
|
206
|
+
"tool-usage": 0.3,
|
|
207
207
|
},
|
|
208
208
|
},
|
|
209
209
|
|
|
@@ -79,7 +79,7 @@ export function mapToResolvedConfig(opts, rootDir) {
|
|
|
79
79
|
apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
|
|
80
80
|
apiKey: opts.apiKey,
|
|
81
81
|
captureEnabled: opts.captureEnabled ?? false,
|
|
82
|
-
captureDir: opts.captureDir ?? join(
|
|
82
|
+
captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
|
|
83
83
|
captureCompress: opts.captureCompress ?? true,
|
|
84
84
|
captureExtras: opts.captureExtras ?? true,
|
|
85
85
|
};
|
|
@@ -61,87 +61,96 @@ export class GapAnalysisStep {
|
|
|
61
61
|
mkdirSync(outDir, { recursive: true });
|
|
62
62
|
writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
|
|
63
63
|
writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
const
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
}
|
|
72
|
-
const resolveRefs = (slugs) => slugs
|
|
73
|
-
.map((slug) => {
|
|
74
|
-
const m = refBySlug.get(slug);
|
|
75
|
-
return m
|
|
76
|
-
? {
|
|
77
|
-
documentId: m._id,
|
|
78
|
-
revision: m._rev,
|
|
79
|
-
slug: m.slug,
|
|
80
|
-
title: m.title,
|
|
81
|
-
}
|
|
82
|
-
: { documentId: "", slug, title: slug };
|
|
83
|
-
})
|
|
84
|
-
.filter((r) => r.documentId !== "");
|
|
85
|
-
// ── Build description→docs mapping from TaskSource ─────────
|
|
86
|
-
// Primary source: use the TaskSource adapter from AppContext.
|
|
87
|
-
// This works with Content Lake, repo-based, and YAML tasks.
|
|
88
|
-
// Judgments use task description as their taskId, so we build
|
|
89
|
-
// maps keyed by both description and task ID for robust matching.
|
|
64
|
+
// ── Document enrichment (literacy mode only) ──────────────
|
|
65
|
+
// Non-literacy modes don't use canonical docs. Skip manifest
|
|
66
|
+
// loading, doc-reference enrichment, and canonical doc mapping
|
|
67
|
+
// entirely — those fields are only meaningful for literacy evals.
|
|
68
|
+
const isLiteracyMode = ctx.config.mode === "literacy";
|
|
69
|
+
let documentManifest;
|
|
70
|
+
let enrichedScores = scoreSummary.scores;
|
|
90
71
|
const descToDocRefs = new Map();
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
}
|
|
100
|
-
if (tasks.length > 0) {
|
|
101
|
-
// Group tasks by feature area and build slug maps
|
|
102
|
-
const byArea = new Map();
|
|
103
|
-
for (const task of tasks) {
|
|
104
|
-
const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
|
|
105
|
-
const refs = resolveRefs(slugs);
|
|
106
|
-
// Map by title (what judgments use as taskId)
|
|
107
|
-
descToDocRefs.set(task.title, refs);
|
|
108
|
-
// Also map by task ID for prefix-based matching
|
|
109
|
-
descToDocRefs.set(task.id, refs);
|
|
110
|
-
// Group slugs by feature area
|
|
111
|
-
const area = task.area ?? "";
|
|
112
|
-
if (!byArea.has(area))
|
|
113
|
-
byArea.set(area, new Set());
|
|
114
|
-
for (const s of slugs)
|
|
115
|
-
byArea.get(area).add(s);
|
|
72
|
+
if (isLiteracyMode) {
|
|
73
|
+
const manifestPath = resolve(root, "contexts", "document-manifest.json");
|
|
74
|
+
const manifestEntries = existsSync(manifestPath)
|
|
75
|
+
? JSON.parse(readFileSync(manifestPath, "utf-8"))
|
|
76
|
+
: [];
|
|
77
|
+
const refBySlug = new Map();
|
|
78
|
+
for (const entry of manifestEntries) {
|
|
79
|
+
refBySlug.set(entry.slug, entry);
|
|
116
80
|
}
|
|
117
|
-
|
|
118
|
-
|
|
81
|
+
const resolveRefs = (slugs) => slugs
|
|
82
|
+
.map((slug) => {
|
|
83
|
+
const m = refBySlug.get(slug);
|
|
84
|
+
return m
|
|
85
|
+
? {
|
|
86
|
+
documentId: m._id,
|
|
87
|
+
revision: m._rev,
|
|
88
|
+
slug: m.slug,
|
|
89
|
+
title: m.title,
|
|
90
|
+
}
|
|
91
|
+
: { documentId: "", slug, title: slug };
|
|
92
|
+
})
|
|
93
|
+
.filter((r) => r.documentId !== "");
|
|
94
|
+
// ── Build description→docs mapping from TaskSource ─────────
|
|
95
|
+
// Primary source: use the TaskSource adapter from AppContext.
|
|
96
|
+
// This works with Content Lake, repo-based, and YAML tasks.
|
|
97
|
+
// Judgments use task description as their taskId, so we build
|
|
98
|
+
// maps keyed by both description and task ID for robust matching.
|
|
99
|
+
const areaToDocRefs = new Map();
|
|
100
|
+
let tasks = [];
|
|
101
|
+
try {
|
|
102
|
+
tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
|
|
119
103
|
}
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
descToDocRefs.set(task.
|
|
104
|
+
catch {
|
|
105
|
+
// TaskSource may not be available in all contexts (e.g., standalone
|
|
106
|
+
// gap analysis on cached results). Fall through to legacy fallback.
|
|
107
|
+
}
|
|
108
|
+
if (tasks.length > 0) {
|
|
109
|
+
// Group tasks by feature area and build slug maps
|
|
110
|
+
const byArea = new Map();
|
|
111
|
+
for (const task of tasks) {
|
|
112
|
+
const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
|
|
113
|
+
const refs = resolveRefs(slugs);
|
|
114
|
+
// Map by title (what judgments use as taskId)
|
|
115
|
+
descToDocRefs.set(task.title, refs);
|
|
116
|
+
// Also map by task ID for prefix-based matching
|
|
117
|
+
descToDocRefs.set(task.id, refs);
|
|
118
|
+
// Group slugs by feature area
|
|
119
|
+
const area = task.area ?? "";
|
|
120
|
+
if (!byArea.has(area))
|
|
121
|
+
byArea.set(area, new Set());
|
|
122
|
+
for (const s of slugs)
|
|
123
|
+
byArea.get(area).add(s);
|
|
124
|
+
}
|
|
125
|
+
for (const [area, slugs] of byArea) {
|
|
126
|
+
areaToDocRefs.set(area, resolveRefs([...slugs]));
|
|
132
127
|
}
|
|
133
|
-
for (const s of taskSlugs)
|
|
134
|
-
areaSlugs.add(s);
|
|
135
128
|
}
|
|
136
|
-
|
|
137
|
-
|
|
129
|
+
// Legacy fallback: merge in any tasks from local YAML that weren't
|
|
130
|
+
// already covered by the TaskSource adapter.
|
|
131
|
+
const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
|
|
132
|
+
const mappings = resolveMappings(root);
|
|
133
|
+
for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
|
|
134
|
+
const areaSlugs = new Set();
|
|
135
|
+
for (const task of areaData.tasks) {
|
|
136
|
+
const taskSlugs = task.canonical_docs.map((d) => d.slug);
|
|
137
|
+
// Only add if not already mapped by the primary source
|
|
138
|
+
if (!descToDocRefs.has(task.description)) {
|
|
139
|
+
descToDocRefs.set(task.description, resolveRefs(taskSlugs));
|
|
140
|
+
}
|
|
141
|
+
for (const s of taskSlugs)
|
|
142
|
+
areaSlugs.add(s);
|
|
143
|
+
}
|
|
144
|
+
if (!areaToDocRefs.has(area)) {
|
|
145
|
+
areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
|
|
146
|
+
}
|
|
138
147
|
}
|
|
148
|
+
documentManifest = resolveRefs([...refBySlug.keys()]);
|
|
149
|
+
enrichedScores = scoreSummary.scores.map((s) => ({
|
|
150
|
+
...s,
|
|
151
|
+
documents: areaToDocRefs.get(s.feature),
|
|
152
|
+
}));
|
|
139
153
|
}
|
|
140
|
-
const documentManifest = resolveRefs([...refBySlug.keys()]);
|
|
141
|
-
const enrichedScores = scoreSummary.scores.map((s) => ({
|
|
142
|
-
...s,
|
|
143
|
-
documents: areaToDocRefs.get(s.feature),
|
|
144
|
-
}));
|
|
145
154
|
// ── Low-scoring judgments ────────────────────────────────────
|
|
146
155
|
const LOW_SCORE_THRESHOLD = 70;
|
|
147
156
|
const MAX_STORED_JUDGMENTS = 50;
|
|
@@ -154,6 +163,8 @@ export class GapAnalysisStep {
|
|
|
154
163
|
.sort((a, b) => a.score - b.score)
|
|
155
164
|
.slice(0, MAX_STORED_JUDGMENTS)
|
|
156
165
|
.map((j) => {
|
|
166
|
+
if (!isLiteracyMode)
|
|
167
|
+
return j;
|
|
157
168
|
// Judgment taskId is the description with "(gold)" or "(baseline)" suffix
|
|
158
169
|
const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
|
|
159
170
|
const canonicalDocs = descToDocRefs.get(baseDesc);
|
|
@@ -161,7 +172,7 @@ export class GapAnalysisStep {
|
|
|
161
172
|
});
|
|
162
173
|
const enrichedSummary = {
|
|
163
174
|
...scoreSummary,
|
|
164
|
-
documentManifest,
|
|
175
|
+
...(documentManifest !== undefined && { documentManifest }),
|
|
165
176
|
failureModes: failureModeReport,
|
|
166
177
|
lowScoringJudgments,
|
|
167
178
|
recommendations: gapReport,
|
|
@@ -11,12 +11,19 @@
|
|
|
11
11
|
import type { AppContext, PipelineState, PipelineStep, StepResult, ValidationIssue } from "../../_vendor/ailf-core/index.d.ts";
|
|
12
12
|
export declare class GenerateConfigsStep implements PipelineStep {
|
|
13
13
|
readonly name = "generate-configs";
|
|
14
|
+
/** Task IDs from the last loadTasks call (pre-filter), for error messages. */
|
|
15
|
+
private lastLoadedTaskIds;
|
|
14
16
|
check(ctx: AppContext): ValidationIssue[];
|
|
15
17
|
execute(ctx: AppContext, state: PipelineState): Promise<StepResult>;
|
|
16
18
|
private compileLiteracyVariants;
|
|
17
19
|
private compileSingleMode;
|
|
18
20
|
private loadTasks;
|
|
19
21
|
private applyFilters;
|
|
22
|
+
/**
|
|
23
|
+
* Build a descriptive error message when no tasks match the current filters.
|
|
24
|
+
* Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
|
|
25
|
+
*/
|
|
26
|
+
private buildNoTasksError;
|
|
20
27
|
/**
|
|
21
28
|
* Compile all tasks through a handler, merging results.
|
|
22
29
|
* For literacy mode, ctx can carry evalMode as an extension.
|
|
@@ -20,6 +20,8 @@ import { loadSource } from "../../sources.js";
|
|
|
20
20
|
import { configToSourceOverrides } from "../config-to-source-overrides.js";
|
|
21
21
|
export class GenerateConfigsStep {
|
|
22
22
|
name = "generate-configs";
|
|
23
|
+
/** Task IDs from the last loadTasks call (pre-filter), for error messages. */
|
|
24
|
+
lastLoadedTaskIds = [];
|
|
23
25
|
check(ctx) {
|
|
24
26
|
const issues = validateModelsYaml(ctx.config.rootDir);
|
|
25
27
|
return issues.filter((i) => i.severity === "error");
|
|
@@ -54,10 +56,10 @@ export class GenerateConfigsStep {
|
|
|
54
56
|
// Load tasks
|
|
55
57
|
const tasks = await this.loadTasks(ctx, mode, state);
|
|
56
58
|
if (tasks.length === 0) {
|
|
59
|
+
const error = this.buildNoTasksError(ctx, mode);
|
|
57
60
|
return {
|
|
58
61
|
durationMs: Date.now() - start,
|
|
59
|
-
error
|
|
60
|
-
`packages/eval/tasks/${mode}/`,
|
|
62
|
+
error,
|
|
61
63
|
status: "failed",
|
|
62
64
|
};
|
|
63
65
|
}
|
|
@@ -157,10 +159,22 @@ export class GenerateConfigsStep {
|
|
|
157
159
|
label: m.label,
|
|
158
160
|
config: m.config,
|
|
159
161
|
}));
|
|
162
|
+
// Load rubric config for template resolution (needed by modes that use
|
|
163
|
+
// templated LLM-rubric assertions, e.g., agent-harness with agent-output
|
|
164
|
+
// and agent-tool-usage templates)
|
|
165
|
+
let rubricConfig;
|
|
166
|
+
try {
|
|
167
|
+
const { loadRubricTemplates } = await import("../../pipeline/rubric-loader.js");
|
|
168
|
+
rubricConfig = loadRubricTemplates(ctx.config.rootDir);
|
|
169
|
+
}
|
|
170
|
+
catch {
|
|
171
|
+
ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
|
|
172
|
+
}
|
|
160
173
|
const merged = this.compileAll(handler, tasks, {
|
|
161
174
|
rootDir: ctx.config.rootDir,
|
|
162
175
|
graderProvider: models.grader.id,
|
|
163
176
|
models: modeModels,
|
|
177
|
+
rubricConfig,
|
|
164
178
|
});
|
|
165
179
|
for (const w of merged.warnings) {
|
|
166
180
|
ctx.logger.warn(` ⚠ ${w}`);
|
|
@@ -249,6 +263,10 @@ export class GenerateConfigsStep {
|
|
|
249
263
|
return filtered;
|
|
250
264
|
}
|
|
251
265
|
applyFilters(ctx, tasks) {
|
|
266
|
+
// Capture pre-filter IDs for diagnostic messages
|
|
267
|
+
this.lastLoadedTaskIds = tasks
|
|
268
|
+
.map((t) => t.id)
|
|
269
|
+
.filter((id) => !!id);
|
|
252
270
|
let result = tasks;
|
|
253
271
|
if (ctx.config.areas?.length) {
|
|
254
272
|
const allowed = new Set(ctx.config.areas.map((a) => a.toLowerCase()));
|
|
@@ -273,6 +291,33 @@ export class GenerateConfigsStep {
|
|
|
273
291
|
}
|
|
274
292
|
return result;
|
|
275
293
|
}
|
|
294
|
+
/**
|
|
295
|
+
* Build a descriptive error message when no tasks match the current filters.
|
|
296
|
+
* Distinguishes between "no tasks exist" and "tasks exist but filters exclude them".
|
|
297
|
+
*/
|
|
298
|
+
buildNoTasksError(ctx, mode) {
|
|
299
|
+
const filters = [];
|
|
300
|
+
if (ctx.config.tasks?.length) {
|
|
301
|
+
filters.push(`--task ${ctx.config.tasks.join(", ")}`);
|
|
302
|
+
}
|
|
303
|
+
if (ctx.config.areas?.length) {
|
|
304
|
+
filters.push(`--area ${ctx.config.areas.join(", ")}`);
|
|
305
|
+
}
|
|
306
|
+
if (ctx.config.tags?.length) {
|
|
307
|
+
filters.push(`--tag ${ctx.config.tags.join(", ")}`);
|
|
308
|
+
}
|
|
309
|
+
if (filters.length > 0) {
|
|
310
|
+
// Collect available task IDs for the hint
|
|
311
|
+
const availableIds = this.lastLoadedTaskIds ?? [];
|
|
312
|
+
const hint = availableIds.length > 0
|
|
313
|
+
? `\n Available ${mode} task IDs: ${availableIds.join(", ")}`
|
|
314
|
+
: "";
|
|
315
|
+
return (`No ${mode} tasks match the current filters (${filters.join("; ")}).` +
|
|
316
|
+
hint);
|
|
317
|
+
}
|
|
318
|
+
return (`No ${mode} tasks found. Create *.task.ts files in ` +
|
|
319
|
+
`packages/eval/tasks/${mode}/`);
|
|
320
|
+
}
|
|
276
321
|
// ---------------------------------------------------------------------------
|
|
277
322
|
// Compilation helpers
|
|
278
323
|
// ---------------------------------------------------------------------------
|
|
@@ -546,11 +546,13 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
|
546
546
|
const featureScore = {
|
|
547
547
|
ceilingScore,
|
|
548
548
|
codeCorrectness: gold.dimensions.codeCorrectness ?? 0,
|
|
549
|
+
dimensions: gold.dimensions,
|
|
549
550
|
docCoverage: gold.dimensions.docCoverage ?? 0,
|
|
550
551
|
docLift,
|
|
551
552
|
docQualityGap: 100 - ceilingScore,
|
|
552
553
|
feature,
|
|
553
554
|
floorScore,
|
|
555
|
+
groupType: "feature",
|
|
554
556
|
...(modelId && { modelId }),
|
|
555
557
|
negativeDocLift: docLift < 0,
|
|
556
558
|
taskCompletion: gold.dimensions.taskCompletion ?? 0,
|
|
@@ -563,6 +565,69 @@ function scoreResults(results, goldProfile, baselineProfile, modelId) {
|
|
|
563
565
|
return scores.sort((a, b) => a.feature.localeCompare(b.feature));
|
|
564
566
|
}
|
|
565
567
|
// ---------------------------------------------------------------------------
|
|
568
|
+
// Agent-harness scoring — groups by task ID, single variant
|
|
569
|
+
// ---------------------------------------------------------------------------
|
|
570
|
+
/**
 * Score agent-harness evaluation results. Unlike literacy mode, agent-harness
 * tasks don't have a with-docs/without-docs split. All results are scored
 * as a single variant using the supplied profile.
 *
 * Results are grouped by task ID (extracted from the test description prefix
 * via `extractTaskId`) rather than by feature area. Each group is passed to
 * `scoreTestGroup` and produces one score entry with `groupType: "task"`.
 *
 * Literacy-specific fields (ceilingScore, floorScore, docLift, docQualityGap)
 * are set to 0 and negativeDocLift to false for backward compatibility with
 * downstream consumers.
 *
 * @param results - Normalized test results; each is expected to carry a
 *   `description` and, optionally, `vars.__featureArea`
 * @param profile - Scoring profile forwarded unchanged to `scoreTestGroup`
 * @returns One score entry per task ID, sorted by `feature`
 */
function scoreAgentHarnessResults(results, profile) {
    // Null-prototype dictionary: task IDs are arbitrary strings taken from test
    // descriptions, so an ID such as "constructor" or "toString" must not
    // resolve to an inherited Object.prototype member (which would make the
    // `.push` below throw).
    const byTask = Object.create(null);
    for (const result of results) {
        // A result without a description is bucketed under extractTaskId's
        // fallback ID instead of crashing the whole scoring run.
        const taskId = extractTaskId(result.description ?? "");
        if (!byTask[taskId]) {
            byTask[taskId] = [];
        }
        byTask[taskId].push(result);
    }
    const scores = [];
    for (const [taskId, taskResults] of Object.entries(byTask)) {
        const scored = scoreTestGroup(taskResults, profile, taskId);
        const first = taskResults[0];
        // Detect feature area for backward compat (used by report grouping).
        // `vars` is guarded so a result without variables falls through to
        // description-based detection.
        const feature = first?.vars?.__featureArea ??
            detectFeatureArea(first?.description ?? taskId);
        scores.push({
            assertionPassRate: scored.dimensions.assertionPassRate,
            ceilingScore: 0,
            codeCorrectness: scored.dimensions.codeCorrectness ?? 0,
            dimensions: scored.dimensions,
            docCoverage: scored.dimensions.docCoverage ?? 0,
            docLift: 0,
            docQualityGap: 0,
            feature,
            floorScore: 0,
            groupType: "task",
            negativeDocLift: false,
            taskCompletion: scored.dimensions.taskCompletion ?? 0,
            testCount: taskResults.length,
            totalCost: scored.totalCost,
            totalScore: scored.composite,
        });
    }
    return scores.sort((a, b) => a.feature.localeCompare(b.feature));
}
|
|
619
|
+
/**
 * Extract task ID from a test description string.
 * Descriptions follow the pattern: "task-id — Title"
 *
 * Fallbacks, in order:
 *  - no separator, or nothing but whitespace before it: the whole trimmed
 *    description is used as the ID
 *  - blank or non-string description: "unknown"
 *
 * @param description - Test description, normally "task-id — Title"
 * @returns A non-empty task ID
 */
function extractTaskId(description) {
    // Callers treat the description as optional; never throw on a missing one.
    if (typeof description !== "string") {
        return "unknown";
    }
    const separatorIndex = description.indexOf(" — ");
    if (separatorIndex > 0) {
        const prefix = description.slice(0, separatorIndex).trim();
        // Only whitespace before the separator would give an empty ID —
        // fall through to the whole-description fallback instead.
        if (prefix) {
            return prefix;
        }
    }
    return description.trim() || "unknown";
}
|
|
630
|
+
// ---------------------------------------------------------------------------
|
|
566
631
|
// Agentic scoring — all results are "actual" (agent retrieves docs via tools)
|
|
567
632
|
// ---------------------------------------------------------------------------
|
|
568
633
|
/**
|
|
@@ -684,11 +749,57 @@ export function calculateAndWriteScores(options) {
|
|
|
684
749
|
if (source) {
|
|
685
750
|
log.info(`Source: ${sourceName} (${source.baseUrl})`);
|
|
686
751
|
}
|
|
687
|
-
// Load rubric config
|
|
752
|
+
// Load rubric config — shared across all modes
|
|
753
|
+
const rubricConfig = loadRubricTemplates(ROOT);
|
|
754
|
+
// ── Agent-harness scoring path ──────────────────────────────
|
|
755
|
+
// Agent-harness mode uses its own scoring path because:
|
|
756
|
+
// 1. No with-docs/without-docs split — all results are "actual"
|
|
757
|
+
// 2. Groups by task ID, not feature area
|
|
758
|
+
// 3. Uses the agent-harness profile (assertion-pass-rate, agent-output, tool-usage)
|
|
759
|
+
// See docs/design-docs/mode-agnostic-scoring.md
|
|
760
|
+
if (mode === "agent-harness") {
|
|
761
|
+
const agentProfile = resolveProfile("agent-harness", "gold", rubricConfig);
|
|
762
|
+
log.debug("Agent-harness scoring profile", agentProfile);
|
|
763
|
+
const results = readAndNormalizeResults(baselineResultsPath);
|
|
764
|
+
const scores = scoreAgentHarnessResults(results, agentProfile);
|
|
765
|
+
log.debug("Agent-harness scores calculated", {
|
|
766
|
+
taskCount: scores.length,
|
|
767
|
+
tasks: scores.map((s) => ({
|
|
768
|
+
feature: s.feature,
|
|
769
|
+
totalScore: s.totalScore,
|
|
770
|
+
testCount: s.testCount,
|
|
771
|
+
dimensions: s.dimensions,
|
|
772
|
+
})),
|
|
773
|
+
});
|
|
774
|
+
const urlRefs = aggregateUrlReferences(baselineResultsPath);
|
|
775
|
+
const sourceVerification = buildSourceVerification(ROOT, source, {
|
|
776
|
+
allowedOrigins: options.allowedOrigins,
|
|
777
|
+
mode,
|
|
778
|
+
searchMode: options.searchMode,
|
|
779
|
+
});
|
|
780
|
+
const graderCost = extractGraderCost(baselineResultsPath);
|
|
781
|
+
const summary = printReport(scores, urlRefs, source, null, // no agent behavior (that's for literacy agentic mode)
|
|
782
|
+
graderCost, null, // no per-model breakdown
|
|
783
|
+
null, // no source isolation
|
|
784
|
+
sourceVerification, "agent-harness", log);
|
|
785
|
+
// Persist
|
|
786
|
+
const outDir = join(ROOT, "results", "latest");
|
|
787
|
+
mkdirSync(outDir, { recursive: true });
|
|
788
|
+
writeFileSync(join(outDir, "score-summary.json"), JSON.stringify(summary, null, 2));
|
|
789
|
+
log.info("Score summary written to results/latest/score-summary.json");
|
|
790
|
+
// Extract and persist grader judgments
|
|
791
|
+
const judgments = extractGraderJudgments(baselineResultsPath);
|
|
792
|
+
if (judgments.length > 0) {
|
|
793
|
+
writeFileSync(join(outDir, "grader-judgments.json"), JSON.stringify(judgments, null, 2));
|
|
794
|
+
log.info(`Grader judgments written to results/latest/grader-judgments.json (${judgments.length} judgments)`);
|
|
795
|
+
}
|
|
796
|
+
const testSummary = computeTestSummary(baselineResultsPath);
|
|
797
|
+
return { belowCritical: summary.belowCritical, testSummary };
|
|
798
|
+
}
|
|
799
|
+
// ── Literacy scoring path ───────────────────────────────────
|
|
688
800
|
// Gold (with-docs) entries use the "default" profile (3 dimensions).
|
|
689
801
|
// Baseline (without-docs) entries use "output-only" (2 dimensions,
|
|
690
802
|
// doc-coverage excluded). See docs/design-docs/named-scoring-profiles.md.
|
|
691
|
-
const rubricConfig = loadRubricTemplates(ROOT);
|
|
692
803
|
const goldProfile = resolveProfile("literacy", "gold", rubricConfig, LiteracyVariant.STANDARD);
|
|
693
804
|
const baselineProfileWeights = resolveProfile("literacy", LiteracyVariant.STANDARD, rubricConfig, LiteracyVariant.STANDARD);
|
|
694
805
|
log.debug("Loaded scoring profiles", {
|
package/dist/pipeline/compare.js
CHANGED
|
@@ -146,12 +146,6 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
|
|
|
146
146
|
const bTotal = b?.totalScore ?? 0;
|
|
147
147
|
const eTotal = e?.totalScore ?? 0;
|
|
148
148
|
const delta = eTotal - bTotal;
|
|
149
|
-
const bTask = b?.taskCompletion ?? 0;
|
|
150
|
-
const eTask = e?.taskCompletion ?? 0;
|
|
151
|
-
const bCode = b?.codeCorrectness ?? 0;
|
|
152
|
-
const eCode = e?.codeCorrectness ?? 0;
|
|
153
|
-
const bDoc = b?.docCoverage ?? 0;
|
|
154
|
-
const eDoc = e?.docCoverage ?? 0;
|
|
155
149
|
// Support both new field names and legacy data (old baselines/Sanity docs)
|
|
156
150
|
const bRaw = b;
|
|
157
151
|
const eRaw = e;
|
|
@@ -183,19 +177,7 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
|
|
|
183
177
|
ceilingDelta: eCeiling - bCeiling,
|
|
184
178
|
change: isMismatched ? "not-evaluated" : classifyChange(delta, threshold),
|
|
185
179
|
delta,
|
|
186
|
-
dimensions:
|
|
187
|
-
codeCorrectness: {
|
|
188
|
-
baseline: bCode,
|
|
189
|
-
delta: eCode - bCode,
|
|
190
|
-
experiment: eCode,
|
|
191
|
-
},
|
|
192
|
-
docCoverage: { baseline: bDoc, delta: eDoc - bDoc, experiment: eDoc },
|
|
193
|
-
taskCompletion: {
|
|
194
|
-
baseline: bTask,
|
|
195
|
-
delta: eTask - bTask,
|
|
196
|
-
experiment: eTask,
|
|
197
|
-
},
|
|
198
|
-
},
|
|
180
|
+
dimensions: buildDimensionDeltas(b, e),
|
|
199
181
|
docLiftDelta: eLift - bLift,
|
|
200
182
|
experiment: eTotal,
|
|
201
183
|
floorDelta: eFloor - bFloor,
|
|
@@ -206,6 +188,55 @@ function buildAreaDelta(area, baselineScore, experimentScore, threshold, isMisma
|
|
|
206
188
|
...(hasCost && { costDelta: eCost - bCost }),
|
|
207
189
|
};
|
|
208
190
|
}
|
|
191
|
+
/**
 * Build per-dimension deltas from the generic dimensions map when available,
 * falling back to the three legacy literacy fields for backward compatibility.
 *
 * This ensures non-literacy modes (agent-harness, mcp-server, etc.) get their
 * actual dimensions (e.g., agentOutput, toolUsage) in comparison reports
 * instead of hardcoded zeros for codeCorrectness/docCoverage/taskCompletion.
 *
 * Mixed case: when only one side carries a `dimensions` map (e.g. an old
 * baseline compared against a new experiment), the side without the map is
 * read from its same-named top-level numeric fields, so a legacy score of 80
 * is compared as 80 rather than as 0.
 *
 * @param b - Baseline score entry (may be undefined)
 * @param e - Experiment score entry (may be undefined)
 * @returns Map of dimension name to `{ baseline, delta, experiment }`
 */
function buildDimensionDeltas(b, e) {
    const toDelta = (baseline, experiment) => ({
        baseline,
        delta: experiment - baseline,
        experiment,
    });
    const bDims = b?.dimensions;
    const eDims = e?.dimensions;
    // When the generic dimensions map is populated on either side, use it — this
    // covers agent-harness (agentOutput, toolUsage), literacy (taskCompletion,
    // codeCorrectness, docCoverage), and any future mode dimensions.
    if (bDims || eDims) {
        // Read one dimension for one side: from its map when present, otherwise
        // from the legacy top-level field of the same name (numbers only).
        const readDimension = (dims, score, key) => {
            if (dims) {
                return dims[key] ?? 0;
            }
            const legacyValue = score?.[key];
            return typeof legacyValue === "number" ? legacyValue : 0;
        };
        const allKeys = new Set([
            ...Object.keys(bDims ?? {}),
            ...Object.keys(eDims ?? {}),
        ]);
        const result = {};
        for (const key of allKeys) {
            result[key] = toDelta(readDimension(bDims, b, key), readDimension(eDims, e, key));
        }
        return result;
    }
    // Legacy fallback: older reports may lack the dimensions map entirely.
    // Read from the three named FeatureScore fields instead.
    return {
        codeCorrectness: toDelta(b?.codeCorrectness ?? 0, e?.codeCorrectness ?? 0),
        docCoverage: toDelta(b?.docCoverage ?? 0, e?.docCoverage ?? 0),
        taskCompletion: toDelta(b?.taskCompletion ?? 0, e?.taskCompletion ?? 0),
    };
}
|
|
209
240
|
// ---------------------------------------------------------------------------
|
|
210
241
|
// Main compare function
|
|
211
242
|
// ---------------------------------------------------------------------------
|