@sanity/ailf 2.2.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/rubrics.ts +3 -3
- package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +15 -7
- package/dist/commands/calculate-scores.js +7 -2
- package/dist/commands/capture-list.d.ts +1 -1
- package/dist/commands/capture-list.js +6 -3
- package/dist/commands/compare.js +11 -7
- package/dist/commands/explain-handler.js +22 -24
- package/dist/commands/fetch-docs.js +4 -2
- package/dist/commands/generate-configs.js +6 -2
- package/dist/commands/pipeline-action.js +8 -24
- package/dist/commands/pipeline.js +1 -1
- package/dist/commands/pr-comment.js +6 -2
- package/dist/commands/publish.d.ts +1 -0
- package/dist/commands/publish.js +12 -8
- package/dist/commands/remote-pipeline.js +1 -1
- package/dist/commands/remote-results.d.ts +8 -8
- package/dist/commands/remote-results.js +7 -7
- package/dist/commands/shared/options.d.ts +8 -0
- package/dist/commands/shared/options.js +10 -0
- package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
- package/dist/commands/shared/resolve-output-dir.js +36 -0
- package/dist/composition-root.js +1 -1
- package/dist/config/rubrics.ts +3 -3
- package/dist/orchestration/build-app-context.js +1 -1
- package/dist/orchestration/steps/fetch-docs-step.js +23 -9
- package/dist/orchestration/steps/gap-analysis-step.js +86 -75
- package/dist/orchestration/steps/generate-configs-step.d.ts +15 -0
- package/dist/orchestration/steps/generate-configs-step.js +56 -0
- package/dist/orchestration/steps/run-eval-step.js +14 -0
- package/dist/pipeline/calculate-scores.js +113 -2
- package/dist/pipeline/compare.js +50 -19
- package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +64 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +6 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +14 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
- package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
- package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
- package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
- package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
- package/dist/pipeline/compiler/rubric-resolution.js +52 -0
- package/dist/pipeline/compiler/scoring-bridge.js +59 -7
- package/dist/pipeline/provenance.js +7 -1
- package/dist/pipeline/validate.d.ts +5 -4
- package/dist/pipeline/validate.js +34 -113
- package/dist/webhook/eval-request-handler.js +4 -0
- package/package.json +1 -1
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
* Produces the same file layout as local mode so downstream tools
|
|
5
5
|
* (workflow PR comments, score comparison, baseline save) work unchanged:
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
7
|
+
* <outputDir>/score-summary.json — scores by area + overall
|
|
8
|
+
* <outputDir>/report.md — rendered markdown report
|
|
9
|
+
* <outputDir>/job-metadata.json — job ID, timing, API URL
|
|
10
10
|
*
|
|
11
11
|
* @see packages/eval/src/commands/remote-pipeline.ts — caller
|
|
12
12
|
*/
|
|
@@ -14,8 +14,8 @@ import type { ApiClient } from "../adapters/api-client/api-client.js";
|
|
|
14
14
|
import type { JobResponse } from "../adapters/api-client/types.js";
|
|
15
15
|
/** Options for writing remote results. */
|
|
16
16
|
export interface WriteResultsOptions {
|
|
17
|
-
/**
|
|
18
|
-
|
|
17
|
+
/** Base directory for output artifacts. */
|
|
18
|
+
outputDir: string;
|
|
19
19
|
/** Optional output path override (--output flag). */
|
|
20
20
|
outputPath?: string;
|
|
21
21
|
/** API base URL (for metadata). */
|
|
@@ -25,9 +25,9 @@ export interface WriteResultsOptions {
|
|
|
25
25
|
* Fetch report artifacts from the API and write them to disk.
|
|
26
26
|
*
|
|
27
27
|
* Writes:
|
|
28
|
-
* -
|
|
29
|
-
* -
|
|
30
|
-
* -
|
|
28
|
+
* - `<outputDir>/score-summary.json` — score data from job response
|
|
29
|
+
* - `<outputDir>/report.md` — full markdown report (if reportId present)
|
|
30
|
+
* - `<outputDir>/job-metadata.json` — job tracking info
|
|
31
31
|
* - `--output` path — markdown report (if specified)
|
|
32
32
|
*/
|
|
33
33
|
export declare function writeRemoteResults(client: ApiClient, job: JobResponse, options: WriteResultsOptions): Promise<void>;
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
* Produces the same file layout as local mode so downstream tools
|
|
5
5
|
* (workflow PR comments, score comparison, baseline save) work unchanged:
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
7
|
+
* <outputDir>/score-summary.json — scores by area + overall
|
|
8
|
+
* <outputDir>/report.md — rendered markdown report
|
|
9
|
+
* <outputDir>/job-metadata.json — job ID, timing, API URL
|
|
10
10
|
*
|
|
11
11
|
* @see packages/eval/src/commands/remote-pipeline.ts — caller
|
|
12
12
|
*/
|
|
@@ -19,13 +19,13 @@ import { resolve } from "path";
|
|
|
19
19
|
* Fetch report artifacts from the API and write them to disk.
|
|
20
20
|
*
|
|
21
21
|
* Writes:
|
|
22
|
-
* -
|
|
23
|
-
* -
|
|
24
|
-
* -
|
|
22
|
+
* - `<outputDir>/score-summary.json` — score data from job response
|
|
23
|
+
* - `<outputDir>/report.md` — full markdown report (if reportId present)
|
|
24
|
+
* - `<outputDir>/job-metadata.json` — job tracking info
|
|
25
25
|
* - `--output` path — markdown report (if specified)
|
|
26
26
|
*/
|
|
27
27
|
export async function writeRemoteResults(client, job, options) {
|
|
28
|
-
const resultsDir =
|
|
28
|
+
const resultsDir = options.outputDir;
|
|
29
29
|
mkdirSync(resultsDir, { recursive: true });
|
|
30
30
|
// 1. Write score summary
|
|
31
31
|
const scoreSummary = buildScoreSummary(job);
|
|
@@ -18,6 +18,14 @@ export declare function addDebugOptions(cmd: Command): Command;
|
|
|
18
18
|
* Add output options: --output, --format
|
|
19
19
|
*/
|
|
20
20
|
export declare function addOutputOptions(cmd: Command): Command;
|
|
21
|
+
/**
|
|
22
|
+
* Add --output-dir option for commands that write pipeline artifacts.
|
|
23
|
+
*
|
|
24
|
+
* Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
|
|
25
|
+
* the value. When omitted, `resolveOutputDir()` defaults to
|
|
26
|
+
* `$CWD/.ailf/results/latest/`.
|
|
27
|
+
*/
|
|
28
|
+
export declare function addOutputDirOption(cmd: Command): Command;
|
|
21
29
|
/**
|
|
22
30
|
* Add Sanity source options: --sanity-dataset, --sanity-project, etc.
|
|
23
31
|
*/
|
|
@@ -36,6 +36,16 @@ export function addOutputOptions(cmd) {
|
|
|
36
36
|
.option("-o, --output <path>", "Write output to a specific file path")
|
|
37
37
|
.option("-f, --format <fmt>", "Output format (e.g., table, json, md)");
|
|
38
38
|
}
|
|
39
|
+
/**
|
|
40
|
+
* Add --output-dir option for commands that write pipeline artifacts.
|
|
41
|
+
*
|
|
42
|
+
* Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
|
|
43
|
+
* the value. When omitted, `resolveOutputDir()` defaults to
|
|
44
|
+
* `$CWD/.ailf/results/latest/`.
|
|
45
|
+
*/
|
|
46
|
+
export function addOutputDirOption(cmd) {
|
|
47
|
+
return cmd.option("--output-dir <path>", "Base directory for output artifacts (default: .ailf/results/latest/)");
|
|
48
|
+
}
|
|
39
49
|
/**
|
|
40
50
|
* Add Sanity source options: --sanity-dataset, --sanity-project, etc.
|
|
41
51
|
*/
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared output directory resolution for all CLI commands.
|
|
3
|
+
*
|
|
4
|
+
* Resolution order (2-tier):
|
|
5
|
+
* 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
|
|
6
|
+
* 2. Default — `$callerCwd/.ailf/results/latest/`
|
|
7
|
+
*
|
|
8
|
+
* callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
|
|
9
|
+
* working directory, not the eval package root.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/output-dir-routing.md
|
|
12
|
+
* @see docs/work-items/W0030.json
|
|
13
|
+
*/
|
|
14
|
+
/**
|
|
15
|
+
* Get the caller's working directory.
|
|
16
|
+
*
|
|
17
|
+
* When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
|
|
18
|
+
* AILF_CALLER_CWD to preserve the real CWD before Node changes it.
|
|
19
|
+
*/
|
|
20
|
+
export declare function getCallerCwd(): string;
|
|
21
|
+
/**
|
|
22
|
+
* Resolve the output directory for pipeline artifacts.
|
|
23
|
+
*
|
|
24
|
+
* @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
|
|
25
|
+
* @returns Absolute path to the output directory
|
|
26
|
+
*/
|
|
27
|
+
export declare function resolveOutputDir(outputDir?: string): string;
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared output directory resolution for all CLI commands.
|
|
3
|
+
*
|
|
4
|
+
* Resolution order (2-tier):
|
|
5
|
+
* 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
|
|
6
|
+
* 2. Default — `$callerCwd/.ailf/results/latest/`
|
|
7
|
+
*
|
|
8
|
+
* callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
|
|
9
|
+
* working directory, not the eval package root.
|
|
10
|
+
*
|
|
11
|
+
* @see docs/design-docs/output-dir-routing.md
|
|
12
|
+
* @see docs/work-items/W0030.json
|
|
13
|
+
*/
|
|
14
|
+
import { resolve } from "path";
|
|
15
|
+
/**
|
|
16
|
+
* Get the caller's working directory.
|
|
17
|
+
*
|
|
18
|
+
* When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
|
|
19
|
+
* AILF_CALLER_CWD to preserve the real CWD before Node changes it.
|
|
20
|
+
*/
|
|
21
|
+
export function getCallerCwd() {
|
|
22
|
+
return process.env.AILF_CALLER_CWD ?? process.cwd();
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Resolve the output directory for pipeline artifacts.
|
|
26
|
+
*
|
|
27
|
+
* @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
|
|
28
|
+
* @returns Absolute path to the output directory
|
|
29
|
+
*/
|
|
30
|
+
export function resolveOutputDir(outputDir) {
|
|
31
|
+
const callerCwd = getCallerCwd();
|
|
32
|
+
if (outputDir) {
|
|
33
|
+
return resolve(callerCwd, outputDir);
|
|
34
|
+
}
|
|
35
|
+
return resolve(callerCwd, ".ailf", "results", "latest");
|
|
36
|
+
}
|
package/dist/composition-root.js
CHANGED
|
@@ -60,7 +60,7 @@ export function createAppContext(config) {
|
|
|
60
60
|
// Artifact collector — no-op by default, filesystem when --capture is set
|
|
61
61
|
const collector = config.captureEnabled
|
|
62
62
|
? new FilesystemArtifactCollector({
|
|
63
|
-
captureDir: config.captureDir ?? join(config.
|
|
63
|
+
captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
|
|
64
64
|
mode: config.mode,
|
|
65
65
|
compress: config.captureCompress ?? true,
|
|
66
66
|
extras: config.captureExtras ?? true,
|
package/dist/config/rubrics.ts
CHANGED
|
@@ -201,9 +201,9 @@ export default defineRubrics({
|
|
|
201
201
|
currency: 0.2,
|
|
202
202
|
},
|
|
203
203
|
"agent-harness": {
|
|
204
|
-
"
|
|
205
|
-
"
|
|
206
|
-
"
|
|
204
|
+
"assertion-pass-rate": 0.35,
|
|
205
|
+
"agent-output": 0.35,
|
|
206
|
+
"tool-usage": 0.3,
|
|
207
207
|
},
|
|
208
208
|
},
|
|
209
209
|
|
|
@@ -79,7 +79,7 @@ export function mapToResolvedConfig(opts, rootDir) {
|
|
|
79
79
|
apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
|
|
80
80
|
apiKey: opts.apiKey,
|
|
81
81
|
captureEnabled: opts.captureEnabled ?? false,
|
|
82
|
-
captureDir: opts.captureDir ?? join(
|
|
82
|
+
captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
|
|
83
83
|
captureCompress: opts.captureCompress ?? true,
|
|
84
84
|
captureExtras: opts.captureExtras ?? true,
|
|
85
85
|
};
|
|
@@ -29,15 +29,29 @@ export class FetchDocsStep {
|
|
|
29
29
|
return { status: "skipped", reason: "--skip-fetch" };
|
|
30
30
|
}
|
|
31
31
|
const start = Date.now();
|
|
32
|
-
// Load tasks
|
|
33
|
-
//
|
|
34
|
-
//
|
|
35
|
-
//
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
32
|
+
// Load tasks — use the same source as GenerateConfigsStep to avoid
|
|
33
|
+
// a mismatch where configs reference context files that were never
|
|
34
|
+
// fetched.
|
|
35
|
+
//
|
|
36
|
+
// Content Lake path: use ctx.taskSource (ContentLakeTaskSource) which
|
|
37
|
+
// loads Studio-owned ailf.task documents via GROQ.
|
|
38
|
+
// Filesystem path: load from .task.ts files (repo/inline tasks).
|
|
39
|
+
let allTasks;
|
|
40
|
+
if (ctx.config.taskSourceType === "content-lake") {
|
|
41
|
+
const filter = {
|
|
42
|
+
...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
|
|
43
|
+
...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
|
|
44
|
+
...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
|
|
45
|
+
};
|
|
46
|
+
allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
|
|
47
|
+
}
|
|
48
|
+
else {
|
|
49
|
+
allTasks = await loadPipelineTasks({
|
|
50
|
+
rootDir: ctx.config.rootDir,
|
|
51
|
+
mode: ctx.config.mode,
|
|
52
|
+
repoTasksPath: ctx.config.repoTasksPath,
|
|
53
|
+
});
|
|
54
|
+
}
|
|
41
55
|
// Bridge: narrow to literacy tasks for canonical doc access
|
|
42
56
|
const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
|
|
43
57
|
const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);
|
|
@@ -61,87 +61,96 @@ export class GapAnalysisStep {
|
|
|
61
61
|
mkdirSync(outDir, { recursive: true });
|
|
62
62
|
writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
|
|
63
63
|
writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
const
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
}
|
|
72
|
-
const resolveRefs = (slugs) => slugs
|
|
73
|
-
.map((slug) => {
|
|
74
|
-
const m = refBySlug.get(slug);
|
|
75
|
-
return m
|
|
76
|
-
? {
|
|
77
|
-
documentId: m._id,
|
|
78
|
-
revision: m._rev,
|
|
79
|
-
slug: m.slug,
|
|
80
|
-
title: m.title,
|
|
81
|
-
}
|
|
82
|
-
: { documentId: "", slug, title: slug };
|
|
83
|
-
})
|
|
84
|
-
.filter((r) => r.documentId !== "");
|
|
85
|
-
// ── Build description→docs mapping from TaskSource ─────────
|
|
86
|
-
// Primary source: use the TaskSource adapter from AppContext.
|
|
87
|
-
// This works with Content Lake, repo-based, and YAML tasks.
|
|
88
|
-
// Judgments use task description as their taskId, so we build
|
|
89
|
-
// maps keyed by both description and task ID for robust matching.
|
|
64
|
+
// ── Document enrichment (literacy mode only) ──────────────
|
|
65
|
+
// Non-literacy modes don't use canonical docs. Skip manifest
|
|
66
|
+
// loading, doc-reference enrichment, and canonical doc mapping
|
|
67
|
+
// entirely — those fields are only meaningful for literacy evals.
|
|
68
|
+
const isLiteracyMode = ctx.config.mode === "literacy";
|
|
69
|
+
let documentManifest;
|
|
70
|
+
let enrichedScores = scoreSummary.scores;
|
|
90
71
|
const descToDocRefs = new Map();
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
}
|
|
100
|
-
if (tasks.length > 0) {
|
|
101
|
-
// Group tasks by feature area and build slug maps
|
|
102
|
-
const byArea = new Map();
|
|
103
|
-
for (const task of tasks) {
|
|
104
|
-
const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
|
|
105
|
-
const refs = resolveRefs(slugs);
|
|
106
|
-
// Map by title (what judgments use as taskId)
|
|
107
|
-
descToDocRefs.set(task.title, refs);
|
|
108
|
-
// Also map by task ID for prefix-based matching
|
|
109
|
-
descToDocRefs.set(task.id, refs);
|
|
110
|
-
// Group slugs by feature area
|
|
111
|
-
const area = task.area ?? "";
|
|
112
|
-
if (!byArea.has(area))
|
|
113
|
-
byArea.set(area, new Set());
|
|
114
|
-
for (const s of slugs)
|
|
115
|
-
byArea.get(area).add(s);
|
|
72
|
+
if (isLiteracyMode) {
|
|
73
|
+
const manifestPath = resolve(root, "contexts", "document-manifest.json");
|
|
74
|
+
const manifestEntries = existsSync(manifestPath)
|
|
75
|
+
? JSON.parse(readFileSync(manifestPath, "utf-8"))
|
|
76
|
+
: [];
|
|
77
|
+
const refBySlug = new Map();
|
|
78
|
+
for (const entry of manifestEntries) {
|
|
79
|
+
refBySlug.set(entry.slug, entry);
|
|
116
80
|
}
|
|
117
|
-
|
|
118
|
-
|
|
81
|
+
const resolveRefs = (slugs) => slugs
|
|
82
|
+
.map((slug) => {
|
|
83
|
+
const m = refBySlug.get(slug);
|
|
84
|
+
return m
|
|
85
|
+
? {
|
|
86
|
+
documentId: m._id,
|
|
87
|
+
revision: m._rev,
|
|
88
|
+
slug: m.slug,
|
|
89
|
+
title: m.title,
|
|
90
|
+
}
|
|
91
|
+
: { documentId: "", slug, title: slug };
|
|
92
|
+
})
|
|
93
|
+
.filter((r) => r.documentId !== "");
|
|
94
|
+
// ── Build description→docs mapping from TaskSource ─────────
|
|
95
|
+
// Primary source: use the TaskSource adapter from AppContext.
|
|
96
|
+
// This works with Content Lake, repo-based, and YAML tasks.
|
|
97
|
+
// Judgments use task description as their taskId, so we build
|
|
98
|
+
// maps keyed by both description and task ID for robust matching.
|
|
99
|
+
const areaToDocRefs = new Map();
|
|
100
|
+
let tasks = [];
|
|
101
|
+
try {
|
|
102
|
+
tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
|
|
119
103
|
}
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
descToDocRefs.set(task.
|
|
104
|
+
catch {
|
|
105
|
+
// TaskSource may not be available in all contexts (e.g., standalone
|
|
106
|
+
// gap analysis on cached results). Fall through to legacy fallback.
|
|
107
|
+
}
|
|
108
|
+
if (tasks.length > 0) {
|
|
109
|
+
// Group tasks by feature area and build slug maps
|
|
110
|
+
const byArea = new Map();
|
|
111
|
+
for (const task of tasks) {
|
|
112
|
+
const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
|
|
113
|
+
const refs = resolveRefs(slugs);
|
|
114
|
+
// Map by title (what judgments use as taskId)
|
|
115
|
+
descToDocRefs.set(task.title, refs);
|
|
116
|
+
// Also map by task ID for prefix-based matching
|
|
117
|
+
descToDocRefs.set(task.id, refs);
|
|
118
|
+
// Group slugs by feature area
|
|
119
|
+
const area = task.area ?? "";
|
|
120
|
+
if (!byArea.has(area))
|
|
121
|
+
byArea.set(area, new Set());
|
|
122
|
+
for (const s of slugs)
|
|
123
|
+
byArea.get(area).add(s);
|
|
124
|
+
}
|
|
125
|
+
for (const [area, slugs] of byArea) {
|
|
126
|
+
areaToDocRefs.set(area, resolveRefs([...slugs]));
|
|
132
127
|
}
|
|
133
|
-
for (const s of taskSlugs)
|
|
134
|
-
areaSlugs.add(s);
|
|
135
128
|
}
|
|
136
|
-
|
|
137
|
-
|
|
129
|
+
// Legacy fallback: merge in any tasks from local YAML that weren't
|
|
130
|
+
// already covered by the TaskSource adapter.
|
|
131
|
+
const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
|
|
132
|
+
const mappings = resolveMappings(root);
|
|
133
|
+
for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
|
|
134
|
+
const areaSlugs = new Set();
|
|
135
|
+
for (const task of areaData.tasks) {
|
|
136
|
+
const taskSlugs = task.canonical_docs.map((d) => d.slug);
|
|
137
|
+
// Only add if not already mapped by the primary source
|
|
138
|
+
if (!descToDocRefs.has(task.description)) {
|
|
139
|
+
descToDocRefs.set(task.description, resolveRefs(taskSlugs));
|
|
140
|
+
}
|
|
141
|
+
for (const s of taskSlugs)
|
|
142
|
+
areaSlugs.add(s);
|
|
143
|
+
}
|
|
144
|
+
if (!areaToDocRefs.has(area)) {
|
|
145
|
+
areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
|
|
146
|
+
}
|
|
138
147
|
}
|
|
148
|
+
documentManifest = resolveRefs([...refBySlug.keys()]);
|
|
149
|
+
enrichedScores = scoreSummary.scores.map((s) => ({
|
|
150
|
+
...s,
|
|
151
|
+
documents: areaToDocRefs.get(s.feature),
|
|
152
|
+
}));
|
|
139
153
|
}
|
|
140
|
-
const documentManifest = resolveRefs([...refBySlug.keys()]);
|
|
141
|
-
const enrichedScores = scoreSummary.scores.map((s) => ({
|
|
142
|
-
...s,
|
|
143
|
-
documents: areaToDocRefs.get(s.feature),
|
|
144
|
-
}));
|
|
145
154
|
// ── Low-scoring judgments ────────────────────────────────────
|
|
146
155
|
const LOW_SCORE_THRESHOLD = 70;
|
|
147
156
|
const MAX_STORED_JUDGMENTS = 50;
|
|
@@ -154,6 +163,8 @@ export class GapAnalysisStep {
|
|
|
154
163
|
.sort((a, b) => a.score - b.score)
|
|
155
164
|
.slice(0, MAX_STORED_JUDGMENTS)
|
|
156
165
|
.map((j) => {
|
|
166
|
+
if (!isLiteracyMode)
|
|
167
|
+
return j;
|
|
157
168
|
// Judgment taskId is the description with "(gold)" or "(baseline)" suffix
|
|
158
169
|
const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
|
|
159
170
|
const canonicalDocs = descToDocRefs.get(baseDesc);
|
|
@@ -161,7 +172,7 @@ export class GapAnalysisStep {
|
|
|
161
172
|
});
|
|
162
173
|
const enrichedSummary = {
|
|
163
174
|
...scoreSummary,
|
|
164
|
-
documentManifest,
|
|
175
|
+
...(documentManifest !== undefined && { documentManifest }),
|
|
165
176
|
failureModes: failureModeReport,
|
|
166
177
|
lowScoringJudgments,
|
|
167
178
|
recommendations: gapReport,
|
|
@@ -18,6 +18,21 @@ export declare class GenerateConfigsStep implements PipelineStep {
|
|
|
18
18
|
private compileLiteracyVariants;
|
|
19
19
|
private compileSingleMode;
|
|
20
20
|
private loadTasks;
|
|
21
|
+
/**
|
|
22
|
+
* Load tasks from the Content Lake via ctx.taskSource.
|
|
23
|
+
*
|
|
24
|
+
* The ContentLakeTaskSource adapter handles area/task/tag filtering
|
|
25
|
+
* in the GROQ query itself, so we build a FilterOptions and pass it
|
|
26
|
+
* through rather than filtering in-memory after loading.
|
|
27
|
+
*/
|
|
28
|
+
private loadTasksFromContentLake;
|
|
29
|
+
/**
|
|
30
|
+
* Load tasks from filesystem .task.ts files.
|
|
31
|
+
*
|
|
32
|
+
* This is the original path used for repo-based and inline tasks.
|
|
33
|
+
* It scans tasks/{mode}/ and optionally --repo-tasks-path.
|
|
34
|
+
*/
|
|
35
|
+
private loadTasksFromFilesystem;
|
|
21
36
|
private applyFilters;
|
|
22
37
|
/**
|
|
23
38
|
* Build a descriptive error message when no tasks match the current filters.
|
|
@@ -159,10 +159,22 @@ export class GenerateConfigsStep {
|
|
|
159
159
|
label: m.label,
|
|
160
160
|
config: m.config,
|
|
161
161
|
}));
|
|
162
|
+
// Load rubric config for template resolution (needed by modes that use
|
|
163
|
+
// templated LLM-rubric assertions, e.g., agent-harness with agent-output
|
|
164
|
+
// and agent-tool-usage templates)
|
|
165
|
+
let rubricConfig;
|
|
166
|
+
try {
|
|
167
|
+
const { loadRubricTemplates } = await import("../../pipeline/rubric-loader.js");
|
|
168
|
+
rubricConfig = loadRubricTemplates(ctx.config.rootDir);
|
|
169
|
+
}
|
|
170
|
+
catch {
|
|
171
|
+
ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
|
|
172
|
+
}
|
|
162
173
|
const merged = this.compileAll(handler, tasks, {
|
|
163
174
|
rootDir: ctx.config.rootDir,
|
|
164
175
|
graderProvider: models.grader.id,
|
|
165
176
|
models: modeModels,
|
|
177
|
+
rubricConfig,
|
|
166
178
|
});
|
|
167
179
|
for (const w of merged.warnings) {
|
|
168
180
|
ctx.logger.warn(` ⚠ ${w}`);
|
|
@@ -197,6 +209,50 @@ export class GenerateConfigsStep {
|
|
|
197
209
|
// Task loading — unified for all modes
|
|
198
210
|
// ---------------------------------------------------------------------------
|
|
199
211
|
async loadTasks(ctx, mode, state) {
|
|
212
|
+
// Content Lake path — use ctx.taskSource (ContentLakeTaskSource) which
|
|
213
|
+
// loads ailf.task documents via GROQ. This is the only path that sees
|
|
214
|
+
// Studio-owned tasks (ownership: "studio").
|
|
215
|
+
if (ctx.config.taskSourceType === "content-lake") {
|
|
216
|
+
return this.loadTasksFromContentLake(ctx, state);
|
|
217
|
+
}
|
|
218
|
+
// Filesystem path — load from .task.ts files (repo tasks, inline tasks).
|
|
219
|
+
return this.loadTasksFromFilesystem(ctx, mode, state);
|
|
220
|
+
}
|
|
221
|
+
/**
|
|
222
|
+
* Load tasks from the Content Lake via ctx.taskSource.
|
|
223
|
+
*
|
|
224
|
+
* The ContentLakeTaskSource adapter handles area/task/tag filtering
|
|
225
|
+
* in the GROQ query itself, so we build a FilterOptions and pass it
|
|
226
|
+
* through rather than filtering in-memory after loading.
|
|
227
|
+
*/
|
|
228
|
+
async loadTasksFromContentLake(ctx, state) {
|
|
229
|
+
const filter = {
|
|
230
|
+
...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
|
|
231
|
+
...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
|
|
232
|
+
...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
|
|
233
|
+
};
|
|
234
|
+
const tasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
|
|
235
|
+
// Capture loaded IDs for error messages (same as filesystem path)
|
|
236
|
+
this.lastLoadedTaskIds = tasks
|
|
237
|
+
.map((t) => t.id)
|
|
238
|
+
.filter((id) => !!id);
|
|
239
|
+
// Release auto-scope
|
|
240
|
+
if (state.releaseAutoScope && !ctx.config.noAutoScope) {
|
|
241
|
+
const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
|
|
242
|
+
const beforeCount = tasks.length;
|
|
243
|
+
const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
|
|
244
|
+
ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
|
|
245
|
+
return scoped;
|
|
246
|
+
}
|
|
247
|
+
return tasks;
|
|
248
|
+
}
|
|
249
|
+
/**
|
|
250
|
+
* Load tasks from filesystem .task.ts files.
|
|
251
|
+
*
|
|
252
|
+
* This is the original path used for repo-based and inline tasks.
|
|
253
|
+
* It scans tasks/{mode}/ and optionally --repo-tasks-path.
|
|
254
|
+
*/
|
|
255
|
+
async loadTasksFromFilesystem(ctx, mode, state) {
|
|
200
256
|
const { resolve } = await import("path");
|
|
201
257
|
const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
|
|
202
258
|
const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
|
|
@@ -113,6 +113,11 @@ export class RunEvalStep {
|
|
|
113
113
|
// required eval modes were satisfied from the remote cache.
|
|
114
114
|
state.remoteCacheHits ??= new Set();
|
|
115
115
|
state.remoteCacheHits.add(this.mode);
|
|
116
|
+
// Carry forward Promptfoo share URLs from the cached report
|
|
117
|
+
if (remoteCacheResult.promptfooUrls?.length) {
|
|
118
|
+
state.promptfooUrls ??= [];
|
|
119
|
+
state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
|
|
120
|
+
}
|
|
116
121
|
// Capture the restored score-summary from remote cache
|
|
117
122
|
const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
|
|
118
123
|
if (existsSync(cachedSummaryPath)) {
|
|
@@ -189,6 +194,14 @@ export class RunEvalStep {
|
|
|
189
194
|
mode: this.mode,
|
|
190
195
|
});
|
|
191
196
|
}
|
|
197
|
+
// Extract Promptfoo share URL from eval results (Step 3b)
|
|
198
|
+
if (ctx.evalRunner.extractShareUrl) {
|
|
199
|
+
const shareUrl = ctx.evalRunner.extractShareUrl(resolve(rootDir, resultsFileForMode(this.mode)));
|
|
200
|
+
if (shareUrl) {
|
|
201
|
+
state.promptfooUrls ??= [];
|
|
202
|
+
state.promptfooUrls.push({ mode: this.mode, url: shareUrl });
|
|
203
|
+
}
|
|
204
|
+
}
|
|
192
205
|
const durationMs = Date.now() - start;
|
|
193
206
|
return {
|
|
194
207
|
durationMs,
|
|
@@ -224,6 +237,7 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
|
|
|
224
237
|
console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
|
|
225
238
|
return {
|
|
226
239
|
completedAt: cachedReport.completedAt,
|
|
240
|
+
promptfooUrls: cachedReport.provenance?.promptfooUrls,
|
|
227
241
|
reportId: cachedReport.id,
|
|
228
242
|
};
|
|
229
243
|
}
|