npm - @sanity/ailf - Versions diffs - 2.2.0 → 2.3.1 - Mend

@sanity/ailf 2.2.0 → 2.3.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (47)

package/config/rubrics.ts +3 -3
package/dist/_vendor/ailf-core/types/index.d.ts +25 -0
package/dist/adapters/task-sources/content-lake-task-source.js +15 -7
package/dist/commands/calculate-scores.js +7 -2
package/dist/commands/capture-list.d.ts +1 -1
package/dist/commands/capture-list.js +6 -3
package/dist/commands/compare.js +11 -7
package/dist/commands/explain-handler.js +22 -24
package/dist/commands/fetch-docs.js +4 -2
package/dist/commands/generate-configs.js +6 -2
package/dist/commands/pipeline-action.js +8 -24
package/dist/commands/pipeline.js +1 -1
package/dist/commands/pr-comment.js +6 -2
package/dist/commands/publish.d.ts +1 -0
package/dist/commands/publish.js +12 -8
package/dist/commands/remote-pipeline.js +1 -1
package/dist/commands/remote-results.d.ts +8 -8
package/dist/commands/remote-results.js +7 -7
package/dist/commands/shared/options.d.ts +8 -0
package/dist/commands/shared/options.js +10 -0
package/dist/commands/shared/resolve-output-dir.d.ts +27 -0
package/dist/commands/shared/resolve-output-dir.js +36 -0
package/dist/composition-root.js +1 -1
package/dist/config/rubrics.ts +3 -3
package/dist/orchestration/build-app-context.js +1 -1
package/dist/orchestration/steps/fetch-docs-step.js +23 -9
package/dist/orchestration/steps/gap-analysis-step.js +86 -75
package/dist/orchestration/steps/generate-configs-step.d.ts +15 -0
package/dist/orchestration/steps/generate-configs-step.js +56 -0
package/dist/orchestration/steps/run-eval-step.js +14 -0
package/dist/pipeline/calculate-scores.js +113 -2
package/dist/pipeline/compare.js +50 -19
package/dist/pipeline/compiler/__tests__/agent-harness-handler.test.js +64 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.d.ts +6 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/assertions.js +14 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/index.js +1 -0
package/dist/pipeline/compiler/mode-handlers/agent-harness/types.d.ts +3 -0
package/dist/pipeline/compiler/mode-handlers/literacy/assertions.js +1 -27
package/dist/pipeline/compiler/mode-handlers/literacy/types.d.ts +2 -9
package/dist/pipeline/compiler/rubric-resolution.d.ts +40 -0
package/dist/pipeline/compiler/rubric-resolution.js +52 -0
package/dist/pipeline/compiler/scoring-bridge.js +59 -7
package/dist/pipeline/provenance.js +7 -1
package/dist/pipeline/validate.d.ts +5 -4
package/dist/pipeline/validate.js +34 -113
package/dist/webhook/eval-request-handler.js +4 -0
package/package.json +1 -1

package/dist/commands/remote-results.d.ts CHANGED Viewed

@@ -4,9 +4,9 @@
  * Produces the same file layout as local mode so downstream tools
  * (workflow PR comments, score comparison, baseline save) work unchanged:
  *
- *   results/latest/score-summary.json  — scores by area + overall
- *   results/latest/report.md           — rendered markdown report
- *   results/latest/job-metadata.json   — job ID, timing, API URL
+ *   <outputDir>/score-summary.json  — scores by area + overall
+ *   <outputDir>/report.md           — rendered markdown report
+ *   <outputDir>/job-metadata.json   — job ID, timing, API URL
  *
  * @see packages/eval/src/commands/remote-pipeline.ts — caller
  */
@@ -14,8 +14,8 @@ import type { ApiClient } from "../adapters/api-client/api-client.js";
 import type { JobResponse } from "../adapters/api-client/types.js";
 /** Options for writing remote results. */
 export interface WriteResultsOptions {
-    /** Eval package root directory (for results/latest/ path). */
-    rootDir: string;
+    /** Base directory for output artifacts. */
+    outputDir: string;
     /** Optional output path override (--output flag). */
     outputPath?: string;
     /** API base URL (for metadata). */
@@ -25,9 +25,9 @@ export interface WriteResultsOptions {
  * Fetch report artifacts from the API and write them to disk.
  *
  * Writes:
- * - `results/latest/score-summary.json` — score data from job response
- * - `results/latest/report.md` — full markdown report (if reportId present)
- * - `results/latest/job-metadata.json` — job tracking info
+ * - `<outputDir>/score-summary.json` — score data from job response
+ * - `<outputDir>/report.md` — full markdown report (if reportId present)
+ * - `<outputDir>/job-metadata.json` — job tracking info
  * - `--output` path — markdown report (if specified)
  */
 export declare function writeRemoteResults(client: ApiClient, job: JobResponse, options: WriteResultsOptions): Promise<void>;

package/dist/commands/remote-results.js CHANGED Viewed

@@ -4,9 +4,9 @@
  * Produces the same file layout as local mode so downstream tools
  * (workflow PR comments, score comparison, baseline save) work unchanged:
  *
- *   results/latest/score-summary.json  — scores by area + overall
- *   results/latest/report.md           — rendered markdown report
- *   results/latest/job-metadata.json   — job ID, timing, API URL
+ *   <outputDir>/score-summary.json  — scores by area + overall
+ *   <outputDir>/report.md           — rendered markdown report
+ *   <outputDir>/job-metadata.json   — job ID, timing, API URL
  *
  * @see packages/eval/src/commands/remote-pipeline.ts — caller
  */
@@ -19,13 +19,13 @@ import { resolve } from "path";
  * Fetch report artifacts from the API and write them to disk.
  *
  * Writes:
- * - `results/latest/score-summary.json` — score data from job response
- * - `results/latest/report.md` — full markdown report (if reportId present)
- * - `results/latest/job-metadata.json` — job tracking info
+ * - `<outputDir>/score-summary.json` — score data from job response
+ * - `<outputDir>/report.md` — full markdown report (if reportId present)
+ * - `<outputDir>/job-metadata.json` — job tracking info
  * - `--output` path — markdown report (if specified)
  */
 export async function writeRemoteResults(client, job, options) {
-    const resultsDir = resolve(options.rootDir, "results", "latest");
+    const resultsDir = options.outputDir;
     mkdirSync(resultsDir, { recursive: true });
     // 1. Write score summary
     const scoreSummary = buildScoreSummary(job);

package/dist/commands/shared/options.d.ts CHANGED Viewed

@@ -18,6 +18,14 @@ export declare function addDebugOptions(cmd: Command): Command;
  * Add output options: --output, --format
  */
 export declare function addOutputOptions(cmd: Command): Command;
+/**
+ * Add --output-dir option for commands that write pipeline artifacts.
+ *
+ * Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
+ * the value. When omitted, `resolveOutputDir()` defaults to
+ * `$CWD/.ailf/results/latest/`.
+ */
+export declare function addOutputDirOption(cmd: Command): Command;
 /**
  * Add Sanity source options: --sanity-dataset, --sanity-project, etc.
  */

package/dist/commands/shared/options.js CHANGED Viewed

@@ -36,6 +36,16 @@ export function addOutputOptions(cmd) {
         .option("-o, --output <path>", "Write output to a specific file path")
         .option("-f, --format <fmt>", "Output format (e.g., table, json, md)");
 }
+/**
+ * Add --output-dir option for commands that write pipeline artifacts.
+ *
+ * Pair with `resolveOutputDir()` from `./resolve-output-dir.js` to resolve
+ * the value. When omitted, `resolveOutputDir()` defaults to
+ * `$CWD/.ailf/results/latest/`.
+ */
+export function addOutputDirOption(cmd) {
+    return cmd.option("--output-dir <path>", "Base directory for output artifacts (default: .ailf/results/latest/)");
+}
 /**
  * Add Sanity source options: --sanity-dataset, --sanity-project, etc.
  */

package/dist/commands/shared/resolve-output-dir.d.ts ADDED Viewed

@@ -0,0 +1,27 @@
+/**
+ * Shared output directory resolution for all CLI commands.
+ *
+ * Resolution order (2-tier):
+ *   1. Explicit `--output-dir <path>` — resolved relative to callerCwd
+ *   2. Default — `$callerCwd/.ailf/results/latest/`
+ *
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
+ * working directory, not the eval package root.
+ *
+ * @see docs/design-docs/output-dir-routing.md
+ * @see docs/work-items/W0030.json
+ */
+/**
+ * Get the caller's working directory.
+ *
+ * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
+ * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
+ */
+export declare function getCallerCwd(): string;
+/**
+ * Resolve the output directory for pipeline artifacts.
+ *
+ * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
+ * @returns Absolute path to the output directory
+ */
+export declare function resolveOutputDir(outputDir?: string): string;

package/dist/commands/shared/resolve-output-dir.js ADDED Viewed

@@ -0,0 +1,36 @@
+/**
+ * Shared output directory resolution for all CLI commands.
+ *
+ * Resolution order (2-tier):
+ *   1. Explicit `--output-dir <path>` — resolved relative to callerCwd
+ *   2. Default — `$callerCwd/.ailf/results/latest/`
+ *
+ * callerCwd is `AILF_CALLER_CWD ?? process.cwd()` — the user's actual
+ * working directory, not the eval package root.
+ *
+ * @see docs/design-docs/output-dir-routing.md
+ * @see docs/work-items/W0030.json
+ */
+import { resolve } from "path";
+/**
+ * Get the caller's working directory.
+ *
+ * When the CLI is invoked via `npx @sanity/ailf`, the wrapper may set
+ * AILF_CALLER_CWD to preserve the real CWD before Node changes it.
+ */
+export function getCallerCwd() {
+    return process.env.AILF_CALLER_CWD ?? process.cwd();
+}
+/**
+ * Resolve the output directory for pipeline artifacts.
+ *
+ * @param outputDir - Explicit `--output-dir` value from the CLI (may be relative)
+ * @returns Absolute path to the output directory
+ */
+export function resolveOutputDir(outputDir) {
+    const callerCwd = getCallerCwd();
+    if (outputDir) {
+        return resolve(callerCwd, outputDir);
+    }
+    return resolve(callerCwd, ".ailf", "results", "latest");
+}

package/dist/composition-root.js CHANGED Viewed

@@ -60,7 +60,7 @@ export function createAppContext(config) {
     // Artifact collector — no-op by default, filesystem when --capture is set
     const collector = config.captureEnabled
         ? new FilesystemArtifactCollector({
-            captureDir: config.captureDir ?? join(config.rootDir, "results", "captures"),
+            captureDir: config.captureDir ?? join(config.outputDir, "..", "captures"),
             mode: config.mode,
             compress: config.captureCompress ?? true,
             extras: config.captureExtras ?? true,

package/dist/config/rubrics.ts CHANGED Viewed

@@ -201,9 +201,9 @@ export default defineRubrics({
       currency: 0.2,
     },
     "agent-harness": {
-      "agent-output": 0.45,
-      "tool-usage": 0.4,
-      "process-quality": 0.15,
+      "assertion-pass-rate": 0.35,
+      "agent-output": 0.35,
+      "tool-usage": 0.3,
     },
   },

package/dist/orchestration/build-app-context.js CHANGED Viewed

@@ -79,7 +79,7 @@ export function mapToResolvedConfig(opts, rootDir) {
         apiUrl: opts.apiUrl ?? "https://ailf-api.sanity.build",
         apiKey: opts.apiKey,
         captureEnabled: opts.captureEnabled ?? false,
-        captureDir: opts.captureDir ?? join(rootDir, "results", "captures"),
+        captureDir: opts.captureDir ?? join(opts.outputDir, "..", "captures"),
         captureCompress: opts.captureCompress ?? true,
         captureExtras: opts.captureExtras ?? true,
     };

package/dist/orchestration/steps/fetch-docs-step.js CHANGED Viewed

@@ -29,15 +29,29 @@ export class FetchDocsStep {
             return { status: "skipped", reason: "--skip-fetch" };
         }
         const start = Date.now();
-        // Load tasks from the filesystem — the same source GenerateConfigsStep
-        // uses. This replaces ctx.taskSource (ContentLakeTaskSource) which may
-        // have no ailf.task documents, causing a mismatch where generated
-        // configs reference context files that were never fetched.
-        const allTasks = await loadPipelineTasks({
-            rootDir: ctx.config.rootDir,
-            mode: ctx.config.mode,
-            repoTasksPath: ctx.config.repoTasksPath,
-        });
+        // Load tasks — use the same source as GenerateConfigsStep to avoid
+        // a mismatch where configs reference context files that were never
+        // fetched.
+        //
+        // Content Lake path: use ctx.taskSource (ContentLakeTaskSource) which
+        // loads Studio-owned ailf.task documents via GROQ.
+        // Filesystem path: load from .task.ts files (repo/inline tasks).
+        let allTasks;
+        if (ctx.config.taskSourceType === "content-lake") {
+            const filter = {
+                ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
+                ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
+                ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
+            };
+            allTasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
+        }
+        else {
+            allTasks = await loadPipelineTasks({
+                rootDir: ctx.config.rootDir,
+                mode: ctx.config.mode,
+                repoTasksPath: ctx.config.repoTasksPath,
+            });
+        }
         // Bridge: narrow to literacy tasks for canonical doc access
         const literacyTasks = allTasks.filter((t) => t.mode === "literacy");
         const tasksWithDocs = literacyTasks.filter((t) => (t.context?.docs?.length ?? 0) > 0);

package/dist/orchestration/steps/gap-analysis-step.js CHANGED Viewed

@@ -61,87 +61,96 @@ export class GapAnalysisStep {
             mkdirSync(outDir, { recursive: true });
             writeFileSync(join(outDir, "failure-modes.json"), JSON.stringify(failureModeReport, null, 2));
             writeFileSync(join(outDir, "gap-analysis.json"), JSON.stringify(gapReport, null, 2));
-            const manifestPath = resolve(root, "contexts", "document-manifest.json");
-            const manifestEntries = existsSync(manifestPath)
-                ? JSON.parse(readFileSync(manifestPath, "utf-8"))
-                : [];
-            const refBySlug = new Map();
-            for (const entry of manifestEntries) {
-                refBySlug.set(entry.slug, entry);
-            }
-            const resolveRefs = (slugs) => slugs
-                .map((slug) => {
-                const m = refBySlug.get(slug);
-                return m
-                    ? {
-                        documentId: m._id,
-                        revision: m._rev,
-                        slug: m.slug,
-                        title: m.title,
-                    }
-                    : { documentId: "", slug, title: slug };
-            })
-                .filter((r) => r.documentId !== "");
-            // ── Build description→docs mapping from TaskSource ─────────
-            // Primary source: use the TaskSource adapter from AppContext.
-            // This works with Content Lake, repo-based, and YAML tasks.
-            // Judgments use task description as their taskId, so we build
-            // maps keyed by both description and task ID for robust matching.
+            // ── Document enrichment (literacy mode only) ──────────────
+            // Non-literacy modes don't use canonical docs. Skip manifest
+            // loading, doc-reference enrichment, and canonical doc mapping
+            // entirely — those fields are only meaningful for literacy evals.
+            const isLiteracyMode = ctx.config.mode === "literacy";
+            let documentManifest;
+            let enrichedScores = scoreSummary.scores;
             const descToDocRefs = new Map();
-            const areaToDocRefs = new Map();
-            let tasks = [];
-            try {
-                tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
-            }
-            catch {
-                // TaskSource may not be available in all contexts (e.g., standalone
-                // gap analysis on cached results). Fall through to legacy fallback.
-            }
-            if (tasks.length > 0) {
-                // Group tasks by feature area and build slug maps
-                const byArea = new Map();
-                for (const task of tasks) {
-                    const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
-                    const refs = resolveRefs(slugs);
-                    // Map by title (what judgments use as taskId)
-                    descToDocRefs.set(task.title, refs);
-                    // Also map by task ID for prefix-based matching
-                    descToDocRefs.set(task.id, refs);
-                    // Group slugs by feature area
-                    const area = task.area ?? "";
-                    if (!byArea.has(area))
-                        byArea.set(area, new Set());
-                    for (const s of slugs)
-                        byArea.get(area).add(s);
+            if (isLiteracyMode) {
+                const manifestPath = resolve(root, "contexts", "document-manifest.json");
+                const manifestEntries = existsSync(manifestPath)
+                    ? JSON.parse(readFileSync(manifestPath, "utf-8"))
+                    : [];
+                const refBySlug = new Map();
+                for (const entry of manifestEntries) {
+                    refBySlug.set(entry.slug, entry);
                 }
-                for (const [area, slugs] of byArea) {
-                    areaToDocRefs.set(area, resolveRefs([...slugs]));
+                const resolveRefs = (slugs) => slugs
+                    .map((slug) => {
+                    const m = refBySlug.get(slug);
+                    return m
+                        ? {
+                            documentId: m._id,
+                            revision: m._rev,
+                            slug: m.slug,
+                            title: m.title,
+                        }
+                        : { documentId: "", slug, title: slug };
+                })
+                    .filter((r) => r.documentId !== "");
+                // ── Build description→docs mapping from TaskSource ─────────
+                // Primary source: use the TaskSource adapter from AppContext.
+                // This works with Content Lake, repo-based, and YAML tasks.
+                // Judgments use task description as their taskId, so we build
+                // maps keyed by both description and task ID for robust matching.
+                const areaToDocRefs = new Map();
+                let tasks = [];
+                try {
+                    tasks = (await ctx.taskSource.loadTasks()).filter((t) => t.mode === "literacy");
                 }
-            }
-            // Legacy fallback: merge in any tasks from local YAML that weren't
-            // already covered by the TaskSource adapter.
-            const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
-            const mappings = resolveMappings(root);
-            for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
-                const areaSlugs = new Set();
-                for (const task of areaData.tasks) {
-                    const taskSlugs = task.canonical_docs.map((d) => d.slug);
-                    // Only add if not already mapped by the primary source
-                    if (!descToDocRefs.has(task.description)) {
-                        descToDocRefs.set(task.description, resolveRefs(taskSlugs));
+                catch {
+                    // TaskSource may not be available in all contexts (e.g., standalone
+                    // gap analysis on cached results). Fall through to legacy fallback.
+                }
+                if (tasks.length > 0) {
+                    // Group tasks by feature area and build slug maps
+                    const byArea = new Map();
+                    for (const task of tasks) {
+                        const slugs = extractSlugsFromRefs(task.context?.docs ?? []);
+                        const refs = resolveRefs(slugs);
+                        // Map by title (what judgments use as taskId)
+                        descToDocRefs.set(task.title, refs);
+                        // Also map by task ID for prefix-based matching
+                        descToDocRefs.set(task.id, refs);
+                        // Group slugs by feature area
+                        const area = task.area ?? "";
+                        if (!byArea.has(area))
+                            byArea.set(area, new Set());
+                        for (const s of slugs)
+                            byArea.get(area).add(s);
+                    }
+                    for (const [area, slugs] of byArea) {
+                        areaToDocRefs.set(area, resolveRefs([...slugs]));
                     }
-                    for (const s of taskSlugs)
-                        areaSlugs.add(s);
                 }
-                if (!areaToDocRefs.has(area)) {
-                    areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
+                // Legacy fallback: merge in any tasks from local YAML that weren't
+                // already covered by the TaskSource adapter.
+                const { resolveMappings } = await import("../../pipeline/resolve-mappings.js");
+                const mappings = resolveMappings(root);
+                for (const [area, areaData] of Object.entries(mappings.feature_areas)) {
+                    const areaSlugs = new Set();
+                    for (const task of areaData.tasks) {
+                        const taskSlugs = task.canonical_docs.map((d) => d.slug);
+                        // Only add if not already mapped by the primary source
+                        if (!descToDocRefs.has(task.description)) {
+                            descToDocRefs.set(task.description, resolveRefs(taskSlugs));
+                        }
+                        for (const s of taskSlugs)
+                            areaSlugs.add(s);
+                    }
+                    if (!areaToDocRefs.has(area)) {
+                        areaToDocRefs.set(area, resolveRefs([...areaSlugs]));
+                    }
                 }
+                documentManifest = resolveRefs([...refBySlug.keys()]);
+                enrichedScores = scoreSummary.scores.map((s) => ({
+                    ...s,
+                    documents: areaToDocRefs.get(s.feature),
+                }));
             }
-            const documentManifest = resolveRefs([...refBySlug.keys()]);
-            const enrichedScores = scoreSummary.scores.map((s) => ({
-                ...s,
-                documents: areaToDocRefs.get(s.feature),
-            }));
             // ── Low-scoring judgments ────────────────────────────────────
             const LOW_SCORE_THRESHOLD = 70;
             const MAX_STORED_JUDGMENTS = 50;
@@ -154,6 +163,8 @@ export class GapAnalysisStep {
                 .sort((a, b) => a.score - b.score)
                 .slice(0, MAX_STORED_JUDGMENTS)
                 .map((j) => {
+                if (!isLiteracyMode)
+                    return j;
                 // Judgment taskId is the description with "(gold)" or "(baseline)" suffix
                 const baseDesc = j.taskId.replace(/\s*\((gold|baseline)\)\s*$/, "");
                 const canonicalDocs = descToDocRefs.get(baseDesc);
@@ -161,7 +172,7 @@ export class GapAnalysisStep {
             });
             const enrichedSummary = {
                 ...scoreSummary,
-                documentManifest,
+                ...(documentManifest !== undefined && { documentManifest }),
                 failureModes: failureModeReport,
                 lowScoringJudgments,
                 recommendations: gapReport,

package/dist/orchestration/steps/generate-configs-step.d.ts CHANGED Viewed

@@ -18,6 +18,21 @@ export declare class GenerateConfigsStep implements PipelineStep {
     private compileLiteracyVariants;
     private compileSingleMode;
     private loadTasks;
+    /**
+     * Load tasks from the Content Lake via ctx.taskSource.
+     *
+     * The ContentLakeTaskSource adapter handles area/task/tag filtering
+     * in the GROQ query itself, so we build a FilterOptions and pass it
+     * through rather than filtering in-memory after loading.
+     */
+    private loadTasksFromContentLake;
+    /**
+     * Load tasks from filesystem .task.ts files.
+     *
+     * This is the original path used for repo-based and inline tasks.
+     * It scans tasks/{mode}/ and optionally --repo-tasks-path.
+     */
+    private loadTasksFromFilesystem;
     private applyFilters;
     /**
      * Build a descriptive error message when no tasks match the current filters.

package/dist/orchestration/steps/generate-configs-step.js CHANGED Viewed

@@ -159,10 +159,22 @@ export class GenerateConfigsStep {
             label: m.label,
             config: m.config,
         }));
+        // Load rubric config for template resolution (needed by modes that use
+        // templated LLM-rubric assertions, e.g., agent-harness with agent-output
+        // and agent-tool-usage templates)
+        let rubricConfig;
+        try {
+            const { loadRubricTemplates } = await import("../../pipeline/rubric-loader.js");
+            rubricConfig = loadRubricTemplates(ctx.config.rootDir);
+        }
+        catch {
+            ctx.logger.warn("  ⚠ Could not load rubric config — templates will not resolve");
+        }
         const merged = this.compileAll(handler, tasks, {
             rootDir: ctx.config.rootDir,
             graderProvider: models.grader.id,
             models: modeModels,
+            rubricConfig,
         });
         for (const w of merged.warnings) {
             ctx.logger.warn(`  ⚠ ${w}`);
@@ -197,6 +209,50 @@ export class GenerateConfigsStep {
     // Task loading — unified for all modes
     // ---------------------------------------------------------------------------
     async loadTasks(ctx, mode, state) {
+        // Content Lake path — use ctx.taskSource (ContentLakeTaskSource) which
+        // loads ailf.task documents via GROQ. This is the only path that sees
+        // Studio-owned tasks (ownership: "studio").
+        if (ctx.config.taskSourceType === "content-lake") {
+            return this.loadTasksFromContentLake(ctx, state);
+        }
+        // Filesystem path — load from .task.ts files (repo tasks, inline tasks).
+        return this.loadTasksFromFilesystem(ctx, mode, state);
+    }
+    /**
+     * Load tasks from the Content Lake via ctx.taskSource.
+     *
+     * The ContentLakeTaskSource adapter handles area/task/tag filtering
+     * in the GROQ query itself, so we build a FilterOptions and pass it
+     * through rather than filtering in-memory after loading.
+     */
+    async loadTasksFromContentLake(ctx, state) {
+        const filter = {
+            ...(ctx.config.areas?.length ? { areas: ctx.config.areas } : {}),
+            ...(ctx.config.tasks?.length ? { taskIds: ctx.config.tasks } : {}),
+            ...(ctx.config.tags?.length ? { tags: ctx.config.tags } : {}),
+        };
+        const tasks = await ctx.taskSource.loadTasks(Object.keys(filter).length > 0 ? filter : undefined);
+        // Capture loaded IDs for error messages (same as filesystem path)
+        this.lastLoadedTaskIds = tasks
+            .map((t) => t.id)
+            .filter((id) => !!id);
+        // Release auto-scope
+        if (state.releaseAutoScope && !ctx.config.noAutoScope) {
+            const scopedIds = new Set(state.releaseAutoScope.affectedTaskIds);
+            const beforeCount = tasks.length;
+            const scoped = tasks.filter((t) => "id" in t && scopedIds.has(t.id));
+            ctx.logger.info(`  🎯 Auto-scoped to ${scoped.length} of ${beforeCount} task(s) affected by release`);
+            return scoped;
+        }
+        return tasks;
+    }
+    /**
+     * Load tasks from filesystem .task.ts files.
+     *
+     * This is the original path used for repo-based and inline tasks.
+     * It scans tasks/{mode}/ and optionally --repo-tasks-path.
+     */
+    async loadTasksFromFilesystem(ctx, mode, state) {
         const { resolve } = await import("path");
         const { discoverTsTaskFiles, loadTsTaskFile } = await import("../../adapters/task-sources/task-file-loader.js");
         const { resolveVendoredSubdir } = await import("../../pipeline/compiler/config-loader.js");

package/dist/orchestration/steps/run-eval-step.js CHANGED Viewed

@@ -113,6 +113,11 @@ export class RunEvalStep {
                 // required eval modes were satisfied from the remote cache.
                 state.remoteCacheHits ??= new Set();
                 state.remoteCacheHits.add(this.mode);
+                // Carry forward Promptfoo share URLs from the cached report
+                if (remoteCacheResult.promptfooUrls?.length) {
+                    state.promptfooUrls ??= [];
+                    state.promptfooUrls.push(...remoteCacheResult.promptfooUrls);
+                }
                 // Capture the restored score-summary from remote cache
                 const cachedSummaryPath = resolve(rootDir, "results", "latest", "score-summary.json");
                 if (existsSync(cachedSummaryPath)) {
@@ -189,6 +194,14 @@ export class RunEvalStep {
                 mode: this.mode,
             });
         }
+        // Extract Promptfoo share URL from eval results (Step 3b)
+        if (ctx.evalRunner.extractShareUrl) {
+            const shareUrl = ctx.evalRunner.extractShareUrl(resolve(rootDir, resultsFileForMode(this.mode)));
+            if (shareUrl) {
+                state.promptfooUrls ??= [];
+                state.promptfooUrls.push({ mode: this.mode, url: shareUrl });
+            }
+        }
         const durationMs = Date.now() - start;
         return {
             durationMs,
@@ -224,6 +237,7 @@ async function checkRemoteCache(fingerprint, reportStore, rootDir) {
         console.log(`  ℹ️  Fingerprint: ${fingerprint.slice(0, 16)}... (${queryMs}ms)`);
         return {
             completedAt: cachedReport.completedAt,
+            promptfooUrls: cachedReport.provenance?.promptfooUrls,
             reportId: cachedReport.id,
         };
     }