@sanity/ailf 2.2.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/config/rubrics.ts +3 -3
  2. package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
  3. package/dist/adapters/task-sources/content-lake-task-source.js +15 -7
  4. package/dist/commands/calculate-scores.js +7 -2
  5. package/dist/commands/capture-list.d.ts +1 -1
  6. package/dist/commands/capture-list.js +6 -3
  7. package/dist/commands/compare.js +11 -7
  8. package/dist/commands/explain-handler.js +22 -24
  9. package/dist/commands/fetch-docs.js +4 -2
  10. package/dist/commands/generate-configs.js +6 -2
  11. package/dist/commands/pipeline-action.js +8 -24
  12. package/dist/commands/pipeline.js +1 -1
  13. package/dist/commands/pr-comment.js +6 -2
  14. package/dist/commands/publish.d.ts +1 -0
  15. package/dist/commands/publish.js +12 -8
  16. package/dist/commands/remote-pipeline.js +1 -1
  17. package/dist/commands/remote-results.d.ts +8 -8
  18. package/dist/commands/remote-results.js +7 -7
  19. package/dist/commands/shared/options.d.ts +8 -0
  20. package/dist/commands/shared/options.js +10 -0
  21. package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
  22. package/dist/commands/shared/resolve-output-dir.js +36 -0
  23. package/dist/composition-root.js +1 -1
  24. package/dist/config/rubrics.ts +3 -3
  25. package/dist/orchestration/build-app-context.js +1 -1
  26. package/dist/orchestration/steps/fetch-docs-step.js +23 -9
  27. package/dist/orchestration/steps/gap-analysis-step.js +86 -75
  28. package/dist/orchestration/steps/generate-configs-step.d.ts +15 -0
  29. package/dist/orchestration/steps/generate-configs-step.js +56 -0
  30. package/dist/orchestration/steps/run-eval-step.js +14 -0
  31. package/dist/pipeline/calculate-scores.js +113 -2
  32. package/dist/pipeline/compare.js +50 -19
  33. package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +64 -0
  34. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +6 -0
  35. package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +14 -0
  36. package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
  37. package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
  38. package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
  39. package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
  40. package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
  41. package/dist/pipeline/compiler/rubric-resolution.js +52 -0
  42. package/dist/pipeline/compiler/scoring-bridge.js +59 -7
  43. package/dist/pipeline/provenance.js +7 -1
  44. package/dist/pipeline/validate.d.ts +5 -4
  45. package/dist/pipeline/validate.js +34 -113
  46. package/dist/webhook/eval-request-handler.js +4 -0
  47. package/package.json +1 -1
@@ -4,9 +4,9 @@
4
4
  * Produces the same file layout as local mode so downstream tools
5
5
  * (workflow PR comments, score comparison, baseline save) work unchanged:
6
6
  *
7
- * results/latest/score-summary.json — scores by area + overall
8
- * results/latest/report.md — rendered markdown report
9
- * results/latest/job-metadata.json — job ID, timing, API URL
7
+ * <outputDir>/score-summary.json — scores by area + overall
8
+ * <outputDir>/report.md — rendered markdown report
9
+ * <outputDir>/job-metadata.json — job ID, timing, API URL
10
10
  *
11
11
  * @see packages/eval/src/commands/remote-pipeline.ts — caller
12
12
  */
@@ -14,8 +14,8 @@ import type { ApiClient } from "../adapters/api-client/api-client.js";
14
14
  import type { JobResponse } from "../adapters/api-client/types.js";
15
15
  /** Options for writing remote results. */
16
16
  export interface WriteResultsOptions {
17
- /** Eval package root directory (for results/latest/ path). */
18
- rootDir: string;
17
+ /** Base directory for output artifacts. */
18
+ outputDir: string;
19
19
  /** Optional output path override (--output flag). */
20
20
  outputPath?: string;
21
21
  /** API base URL (for metadata). */
@@ -25,9 +25,9 @@ export interface WriteResultsOptions {
25
25
  * Fetch report artifacts from the API and write them to disk.
26
26
  *
27
27
  * Writes:
28
- * - `results/latest/score-summary.json` — score data from job response
29
- * - `results/latest/report.md` — full markdown report (if reportId present)
30
- * - `results/latest/job-metadata.json` — job tracking info
28
+ * - `<outputDir>/score-summary.json` — score data from job response
29
+ * - `<outputDir>/report.md` — full markdown report (if reportId present)
30
+ * - `<outputDir>/job-metadata.json` — job tracking info
31
31
  * - `--output` path — markdown report (if specified)
32
32
  */
33
33
  export declare function writeRemoteResults(client: ApiClient, job: JobResponse, options: WriteResultsOptions): Promise<void>;
@@ -4,9 +4,9 @@
4
4
  * Produces the same file layout as local mode so downstream tools
5
5
  * (workflow PR comments, score comparison, baseline save) work unchanged:
6
6
  *
7
- * results/latest/score-summary.json — scores by area + overall
8
- * results/latest/report.md — rendered markdown report
9
- * results/latest/job-metadata.json — job ID, timing, API URL
7
+ * <outputDir>/score-summary.json — scores by area + overall
8
+ * <outputDir>/report.md — rendered markdown report
9
+ * <outputDir>/job-metadata.json — job ID, timing, API URL
10
10
  *
11
11
  * @see packages/eval/src/commands/remote-pipeline.ts — caller
12
12
  */
@@ -19,13 +19,13 @@ import { resolve } from "path";
19
19
  * Fetch report artifacts from the API and write them to disk.
20
20
  *
21
21
  * Writes:
22
- * - `results/latest/score-summary.json` — score data from job response
23
- * - `results/latest/report.md` — full markdown report (if reportId present)
24
- * - `results/latest/job-metadata.json` — job tracking info
22
+ * - `<outputDir>/score-summary.json` — score data from job response
23
+ * - `<outputDir>/report.md` — full markdown report (if reportId present)
24
+ * - `<outputDir>/job-metadata.json` — job tracking info
25
25
  * - `--output` path — markdown report (if specified)
26
26
  */
27
27
  export async function writeRemoteResults(client, job, options) {
28
- const resultsDir = resolve(options.rootDir, "results", "latest");
28
+ const resultsDir = options.outputDir;
29
29
  mkdirSync(resultsDir, { recursive: true });
30
30
  // 1. Write score summary
31
31
  const scoreSummary = buildScoreSummary(job);
@@ -18,6 +18,14 @@ export declare function addDebugOptions(cmd: Command): Command;
18
18
  * Add output options: --output, --format
19
19
  */
20
20
  export declare function addOutputOptions(cmd: Command): Command;
21
+ /**
22
+ * Add --output-dir option for commands that write pipeline artifacts.
23
+ *
24
+ * Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
25
+ * the value. When omitted, `resolveOutputDir()` defaults to
26
+ * `$CWD/.ailf/results/latest/`.
27
+ */
28
+ export declare function addOutputDirOption(cmd: Command): Command;
21
29
  /**
22
30
  * Add Sanity source options: --sanity-dataset, --sanity-project, etc.
23
31
  */
@@ -36,6 +36,16 @@ export function addOutputOptions(cmd) {
36
36
  .option("-o, --output <path>", "Write output to a specific file path")
37
37
  .option("-f, --format <fmt>", "Output format (e.g., table, json, md)");
38
38
  }
39
+ /**
40
+ * Add --output-dir option for commands that write pipeline artifacts.
41
+ *
42
+ * Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
43
+ * the value. When omitted, `resolveOutputDir()` defaults to
44
+ * `$callerCwd/.ailf/results/latest/` (callerCwd is `AILF_CALLER_CWD ?? process.cwd()`).
45
+ */
46
+ export function addOutputDirOption(cmd) {
47
+ return cmd.option("--output-dir <path>", "Base directory for output artifacts (default: .ailf/results/latest/)");
48
+ }
39
49
  /**
40
50
  * Add Sanity source options: --sanity-dataset, --sanity-project, etc.
41
51
  */
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Shared output directory resolution for all CLI commands.
3
+ *
4
+ * Resolution order (2-tier):
5
+ * 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
6
+ * 2. Default — `$callerCwd/.ailf/results/latest/`
7
+ *
8
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
9
+ * working directory, not the eval package root.
10
+ *
11
+ * @see docs/design-docs/output-dir-routing.md
12
+ * @see docs/work-items/W0030.json
13
+ */
14
+ /**
15
+ * Get the caller's working directory.
16
+ *
17
+ * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
18
+ * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
19
+ */
20
+ export declare function getCallerCwd(): string;
21
+ /**
22
+ * Resolve the output directory for pipeline artifacts.
23
+ *
24
+ * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
25
+ * @returns Absolute path to the output directory
26
+ */
27
+ export declare function resolveOutputDir(outputDir?: string): string;
@@ -0,0 +1,36 @@
1
+ /**
2
+ * Shared output directory resolution for all CLI commands.
3
+ *
4
+ * Resolution order (2-tier):
5
+ * 1. Explicit `--output-dir <path>` — resolved relative to callerCwd
6
+ * 2. Default — `$callerCwd/.ailf/results/latest/`
7
+ *
8
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
9
+ * working directory, not the eval package root.
10
+ *
11
+ * @see docs/design-docs/output-dir-routing.md
12
+ * @see docs/work-items/W0030.json
13
+ */
14
+ import { resolve } from "path";
15
+ /**
16
+ * Get the caller's working directory.
17
+ *
18
+ * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
19
+ * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
20
+ */
21
+ export function getCallerCwd() {
22
+ return process.env.AILF_CALLER_CWD ?? process.cwd();
23
+ }
24
+ /**
25
+ * Resolve the output directory for pipeline artifacts.
26
+ *
27
+ * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
28
+ * @returns Absolute path to the output directory
29
+ */
30
+ export function resolveOutputDir(outputDir) {
31
+ const callerCwd = getCallerCwd();
32
+ if (outputDir) {
33
+ return resolve(callerCwd, outputDir);
34
+ }
35
+ return resolve(callerCwd, ".ailf", "results", "latest");
36
+ }
@@ -60,7 +60,7 @@ export function createAppContext(config) {
60
60
  // Artifact collector — no-op by default, filesystem when --capture is set
61
61
  const collector = config.captureEnabled
62
62
  ? new FilesystemArtifactCollector({
63
- captureDir: config.captureDir ?? join(config.rootDir, "results", "captures"),
63
+ captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
64
64
  mode: config.mode,
65
65
  compress: config.captureCompress ?? true,
66
66
  extras: config.captureExtras ?? true,
@@ -201,9 +201,9 @@ export default defineRubrics({
201
201
  currency: 0.2,
202
202
  },
203
203
  "agent-harness": {
204
- "agent-output": 0.45,
205
- "tool-usage": 0.4,
206
- "process-quality": 0.15,
204
+ "assertion-pass-rate": 0.35,
205
+ "agent-output": 0.35,
206
+ "tool-usage": 0.3,
207
207
  },
208
208
  },
209
209
 
@@ -79,7 +79,7 @@ export function mapToResolvedConfig(opts, rootDir) {
79
79
  apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
80
80
  apiKey: opts.apiKey,
81
81
  captureEnabled: opts.captureEnabled ?? false,
82
- captureDir: opts.captureDir ?? join(rootDir, "results", "captures"),
82
+ captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
83
83
  captureCompress: opts.captureCompress ?? true,
84
84
  captureExtras: opts.captureExtras ?? true,
85
85
  };
@@ -29,15 +29,29 @@ export class FetchDocsStep {
29
29
  return { status: "skipped", reason: "--skip-fetch" };
30
30
  }
31
31
  const start = Date.now();
32
- // Load tasks from the filesystem — the same source GenerateConfigsStep
33
- // uses. This replaces ctx.taskSource (ContentLakeTaskSource) which may
34
- // have no ailf.task documents, causing a mismatch where generated
35
- // configs reference context files that were never fetched.
36
- const allTasks = await loadPipelineTasks({
37
- rootDir: ctx.config.rootDir,
38
- mode: ctx.config.mode,
39
- repoTasksPath: ctx.config.repoTasksPath,
40
- });
32
+ // Load tasks from the same source as GenerateConfigsStep to avoid
33
+ // a mismatch where configs reference context files that were never
34
+ // fetched.
35
+ //
36
+ // Content Lake path: use ctx.taskSource (ContentLakeTaskSource) which
37
+ // loads Studio-owned ailf.task documents via GROQ.
38
+ // Filesystem path: load from .task.ts files (repo/inline tasks).
39
+ let allTasks;
40
+ if (ctx.config.taskSourceType === "content-lake") {
41
+ const filter = {
42
+ ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
43
+ ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
44
+ ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
45
+ };
46
+ allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
47
+ }
48
+ else {
49
+ allTasks = await loadPipelineTasks({
50
+ rootDir: ctx.config.rootDir,
51
+ mode: ctx.config.mode,
52
+ repoTasksPath: ctx.config.repoTasksPath,
53
+ });
54
+ }
41
55
  // Bridge: narrow to literacy tasks for canonical doc access
42
56
  const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
43
57
  const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);
@@ -61,87 +61,96 @@ export class GapAnalysisStep {
61
61
  mkdirSync(outDir, { recursive: true });
62
62
  writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
63
63
  writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
64
- const manifestPath = resolve(root, "contexts", "document-manifest.json");
65
- const manifestEntries = existsSync(manifestPath)
66
- ? JSON.parse(readFileSync(manifestPath, "utf-8"))
67
- : [];
68
- const refBySlug = new Map();
69
- for (const entry of manifestEntries) {
70
- refBySlug.set(entry.slug, entry);
71
- }
72
- const resolveRefs = (slugs) => slugs
73
- .map((slug) => {
74
- const m = refBySlug.get(slug);
75
- return m
76
- ? {
77
- documentId: m._id,
78
- revision: m._rev,
79
- slug: m.slug,
80
- title: m.title,
81
- }
82
- : { documentId: "", slug, title: slug };
83
- })
84
- .filter((r) => r.documentId !== "");
85
- // ── Build description→docs mapping from TaskSource ─────────
86
- // Primary source: use the TaskSource adapter from AppContext.
87
- // This works with Content Lake, repo-based, and YAML tasks.
88
- // Judgments use task description as their taskId, so we build
89
- // maps keyed by both description and task ID for robust matching.
64
+ // ── Document enrichment (literacy mode only) ──────────────
65
+ // Non-literacy modes don't use canonical docs. Skip manifest
66
+ // loading, doc-reference enrichment, and canonical doc mapping
67
+ // entirely — those fields are only meaningful for literacy evals.
68
+ const isLiteracyMode = ctx.config.mode === "literacy";
69
+ let documentManifest;
70
+ let enrichedScores = scoreSummary.scores;
90
71
  const descToDocRefs = new Map();
91
- const areaToDocRefs = new Map();
92
- let tasks = [];
93
- try {
94
- tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
95
- }
96
- catch {
97
- // TaskSource may not be available in all contexts (e.g., standalone
98
- // gap analysis on cached results). Fall through to legacy fallback.
99
- }
100
- if (tasks.length > 0) {
101
- // Group tasks by feature area and build slug maps
102
- const byArea = new Map();
103
- for (const task of tasks) {
104
- const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
105
- const refs = resolveRefs(slugs);
106
- // Map by title (what judgments use as taskId)
107
- descToDocRefs.set(task.title, refs);
108
- // Also map by task ID for prefix-based matching
109
- descToDocRefs.set(task.id, refs);
110
- // Group slugs by feature area
111
- const area = task.area ?? "";
112
- if (!byArea.has(area))
113
- byArea.set(area, new Set());
114
- for (const s of slugs)
115
- byArea.get(area).add(s);
72
+ if (isLiteracyMode) {
73
+ const manifestPath = resolve(root, "contexts", "document-manifest.json");
74
+ const manifestEntries = existsSync(manifestPath)
75
+ ? JSON.parse(readFileSync(manifestPath, "utf-8"))
76
+ : [];
77
+ const refBySlug = new Map();
78
+ for (const entry of manifestEntries) {
79
+ refBySlug.set(entry.slug, entry);
116
80
  }
117
- for (const [area, slugs] of byArea) {
118
- areaToDocRefs.set(area, resolveRefs([...slugs]));
81
+ const resolveRefs = (slugs) => slugs
82
+ .map((slug) => {
83
+ const m = refBySlug.get(slug);
84
+ return m
85
+ ? {
86
+ documentId: m._id,
87
+ revision: m._rev,
88
+ slug: m.slug,
89
+ title: m.title,
90
+ }
91
+ : { documentId: "", slug, title: slug };
92
+ })
93
+ .filter((r) => r.documentId !== "");
94
+ // ── Build description→docs mapping from TaskSource ─────────
95
+ // Primary source: use the TaskSource adapter from AppContext.
96
+ // This works with Content Lake, repo-based, and YAML tasks.
97
+ // Judgments use task description as their taskId, so we build
98
+ // maps keyed by both description and task ID for robust matching.
99
+ const areaToDocRefs = new Map();
100
+ let tasks = [];
101
+ try {
102
+ tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
119
103
  }
120
- }
121
- // Legacy fallback: merge in any tasks from local YAML that weren't
122
- // already covered by the TaskSource adapter.
123
- const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
124
- const mappings = resolveMappings(root);
125
- for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
126
- const areaSlugs = new Set();
127
- for (const task of areaData.tasks) {
128
- const taskSlugs = task.canonical_docs.map((d) => d.slug);
129
- // Only add if not already mapped by the primary source
130
- if (!descToDocRefs.has(task.description)) {
131
- descToDocRefs.set(task.description, resolveRefs(taskSlugs));
104
+ catch {
105
+ // TaskSource may not be available in all contexts (e.g., standalone
106
+ // gap analysis on cached results). Fall through to legacy fallback.
107
+ }
108
+ if (tasks.length > 0) {
109
+ // Group tasks by feature area and build slug maps
110
+ const byArea = new Map();
111
+ for (const task of tasks) {
112
+ const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
113
+ const refs = resolveRefs(slugs);
114
+ // Map by title (what judgments use as taskId)
115
+ descToDocRefs.set(task.title, refs);
116
+ // Also map by task ID for prefix-based matching
117
+ descToDocRefs.set(task.id, refs);
118
+ // Group slugs by feature area
119
+ const area = task.area ?? "";
120
+ if (!byArea.has(area))
121
+ byArea.set(area, new Set());
122
+ for (const s of slugs)
123
+ byArea.get(area).add(s);
124
+ }
125
+ for (const [area, slugs] of byArea) {
126
+ areaToDocRefs.set(area, resolveRefs([...slugs]));
132
127
  }
133
- for (const s of taskSlugs)
134
- areaSlugs.add(s);
135
128
  }
136
- if (!areaToDocRefs.has(area)) {
137
- areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
129
+ // Legacy fallback: merge in any tasks from local YAML that weren't
130
+ // already covered by the TaskSource adapter.
131
+ const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
132
+ const mappings = resolveMappings(root);
133
+ for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
134
+ const areaSlugs = new Set();
135
+ for (const task of areaData.tasks) {
136
+ const taskSlugs = task.canonical_docs.map((d) => d.slug);
137
+ // Only add if not already mapped by the primary source
138
+ if (!descToDocRefs.has(task.description)) {
139
+ descToDocRefs.set(task.description, resolveRefs(taskSlugs));
140
+ }
141
+ for (const s of taskSlugs)
142
+ areaSlugs.add(s);
143
+ }
144
+ if (!areaToDocRefs.has(area)) {
145
+ areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
146
+ }
138
147
  }
148
+ documentManifest = resolveRefs([...refBySlug.keys()]);
149
+ enrichedScores = scoreSummary.scores.map((s) => ({
150
+ ...s,
151
+ documents: areaToDocRefs.get(s.feature),
152
+ }));
139
153
  }
140
- const documentManifest = resolveRefs([...refBySlug.keys()]);
141
- const enrichedScores = scoreSummary.scores.map((s) => ({
142
- ...s,
143
- documents: areaToDocRefs.get(s.feature),
144
- }));
145
154
  // ── Low-scoring judgments ────────────────────────────────────
146
155
  const LOW_SCORE_THRESHOLD = 70;
147
156
  const MAX_STORED_JUDGMENTS = 50;
@@ -154,6 +163,8 @@ export class GapAnalysisStep {
154
163
  .sort((a, b) => a.score - b.score)
155
164
  .slice(0, MAX_STORED_JUDGMENTS)
156
165
  .map((j) => {
166
+ if (!isLiteracyMode)
167
+ return j;
157
168
  // Judgment taskId is the description with "(gold)" or "(baseline)" suffix
158
169
  const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
159
170
  const canonicalDocs = descToDocRefs.get(baseDesc);
@@ -161,7 +172,7 @@ export class GapAnalysisStep {
161
172
  });
162
173
  const enrichedSummary = {
163
174
  ...scoreSummary,
164
- documentManifest,
175
+ ...(documentManifest !== undefined && { documentManifest }),
165
176
  failureModes: failureModeReport,
166
177
  lowScoringJudgments,
167
178
  recommendations: gapReport,
@@ -18,6 +18,21 @@ export declare class GenerateConfigsStep implements PipelineStep {
18
18
  private compileLiteracyVariants;
19
19
  private compileSingleMode;
20
20
  private loadTasks;
21
+ /**
22
+ * Load tasks from the Content Lake via ctx.taskSource.
23
+ *
24
+ * The ContentLakeTaskSource adapter handles area/task/tag filtering
25
+ * in the GROQ query itself, so we build a FilterOptions and pass it
26
+ * through rather than filtering in-memory after loading.
27
+ */
28
+ private loadTasksFromContentLake;
29
+ /**
30
+ * Load tasks from filesystem .task.ts files.
31
+ *
32
+ * This is the original path used for repo-based and inline tasks.
33
+ * It scans tasks/{mode}/ and optionally --repo-tasks-path.
34
+ */
35
+ private loadTasksFromFilesystem;
21
36
  private applyFilters;
22
37
  /**
23
38
  * Build a descriptive error message when no tasks match the current filters.
@@ -159,10 +159,22 @@ export class GenerateConfigsStep {
159
159
  label: m.label,
160
160
  config: m.config,
161
161
  }));
162
+ // Load rubric config for template resolution (needed by modes that use
163
+ // templated LLM-rubric assertions, e.g., agent-harness with agent-output
164
+ // and agent-tool-usage templates)
165
+ let rubricConfig;
166
+ try {
167
+ const { loadRubricTemplates } = await import("../../pipeline/rubric-loader.js");
168
+ rubricConfig = loadRubricTemplates(ctx.config.rootDir);
169
+ }
170
+ catch {
171
+ ctx.logger.warn(" ⚠ Could not load rubric config — templates will not resolve");
172
+ }
162
173
  const merged = this.compileAll(handler, tasks, {
163
174
  rootDir: ctx.config.rootDir,
164
175
  graderProvider: models.grader.id,
165
176
  models: modeModels,
177
+ rubricConfig,
166
178
  });
167
179
  for (const w of merged.warnings) {
168
180
  ctx.logger.warn(` ⚠ ${w}`);
@@ -197,6 +209,50 @@ export class GenerateConfigsStep {
197
209
  // Task loading — unified for all modes
198
210
  // ---------------------------------------------------------------------------
199
211
  async loadTasks(ctx, mode, state) {
212
+ // Content Lake path — use ctx.taskSource (ContentLakeTaskSource) which
213
+ // loads ailf.task documents via GROQ. This is the only path that sees
214
+ // Studio-owned tasks (ownership: "studio").
215
+ if (ctx.config.taskSourceType === "content-lake") {
216
+ return this.loadTasksFromContentLake(ctx, state);
217
+ }
218
+ // Filesystem path — load from .task.ts files (repo tasks, inline tasks).
219
+ return this.loadTasksFromFilesystem(ctx, mode, state);
220
+ }
221
+ /**
222
+ * Load tasks from the Content Lake via ctx.taskSource.
223
+ *
224
+ * The ContentLakeTaskSource adapter handles area/task/tag filtering
225
+ * in the GROQ query itself, so we build a FilterOptions and pass it
226
+ * through rather than filtering in-memory after loading.
227
+ */
228
+ async loadTasksFromContentLake(ctx, state) {
229
+ const filter = {
230
+ ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
231
+ ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
232
+ ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
233
+ };
234
+ const tasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
235
+ // Capture loaded IDs for error messages (same as filesystem path)
236
+ this.lastLoadedTaskIds = tasks
237
+ .map((t) => t.id)
238
+ .filter((id) => !!id);
239
+ // Release auto-scope
240
+ if (state.releaseAutoScope && !ctx.config.noAutoScope) {
241
+ const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
242
+ const beforeCount = tasks.length;
243
+ const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
244
+ ctx.logger.info(` 🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
245
+ return scoped;
246
+ }
247
+ return tasks;
248
+ }
249
+ /**
250
+ * Load tasks from filesystem .task.ts files.
251
+ *
252
+ * This is the original path used for repo-based and inline tasks.
253
+ * It scans tasks/{mode}/ and optionally --repo-tasks-path.
254
+ */
255
+ async loadTasksFromFilesystem(ctx, mode, state) {
200
256
  const { resolve } = await import("path");
201
257
  const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
202
258
  const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");
@@ -113,6 +113,11 @@ export class RunEvalStep {
113
113
  // required eval modes were satisfied from the remote cache.
114
114
  state.remoteCacheHits ??= new Set();
115
115
  state.remoteCacheHits.add(this.mode);
116
+ // Carry forward Promptfoo share URLs from the cached report
117
+ if (remoteCacheResult.promptfooUrls?.length) {
118
+ state.promptfooUrls ??= [];
119
+ state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
120
+ }
116
121
  // Capture the restored score-summary from remote cache
117
122
  const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
118
123
  if (existsSync(cachedSummaryPath)) {
@@ -189,6 +194,14 @@ export class RunEvalStep {
189
194
  mode: this.mode,
190
195
  });
191
196
  }
197
+ // Extract Promptfoo share URL from eval results (Step 3b)
198
+ if (ctx.evalRunner.extractShareUrl) {
199
+ const shareUrl = ctx.evalRunner.extractShareUrl(resolve(rootDir, resultsFileForMode(this.mode)));
200
+ if (shareUrl) {
201
+ state.promptfooUrls ??= [];
202
+ state.promptfooUrls.push({ mode: this.mode, url: shareUrl });
203
+ }
204
+ }
192
205
  const durationMs = Date.now() - start;
193
206
  return {
194
207
  durationMs,
@@ -224,6 +237,7 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
224
237
  console.log(` ℹ️ Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
225
238
  return {
226
239
  completedAt: cachedReport.completedAt,
240
+ promptfooUrls: cachedReport.provenance?.promptfooUrls,
227
241
  reportId: cachedReport.id,
228
242
  };
229
243
  }